Add elevenlabs-stt skill and documentation

2026-03-08 21:11:09 -05:00
parent 6c12b74cca
commit 976888f002
5 changed files with 232 additions and 0 deletions
@@ -14,6 +14,7 @@ This repository is intended to be a simple skill source: install the repo (or a

 | Skill | What it does | Path |
 |---|---|---|
+| `elevenlabs-stt` | Transcribe local audio files with ElevenLabs Speech-to-Text, with diarization, language hints, event tags, and JSON output. | `skills/elevenlabs-stt` |
 | `gitea-api` | Interact with Gitea via REST API (repos, issues, PRs, releases, branches, user info). | `skills/gitea-api` |
 | `portainer` | Manage Portainer stacks via API (list, start/stop/restart, update, prune images). | `skills/portainer` |
 | `searxng` | Search through a local or self-hosted SearXNG instance for web, news, images, and more. | `skills/searxng` |
@@ -4,6 +4,7 @@ This folder contains detailed docs for each skill in this repository.

 ## Skills

+- [`elevenlabs-stt`](elevenlabs-stt.md) — Local audio transcription through ElevenLabs Speech-to-Text
 - [`gitea-api`](gitea-api.md) — REST-based Gitea automation (no `tea` CLI required)
 - [`portainer`](portainer.md) — Portainer stack management (list, lifecycle, updates, image pruning)
 - [`searxng`](searxng.md) — Privacy-respecting metasearch via a local or self-hosted SearXNG instance
@@ -0,0 +1,41 @@
+# elevenlabs-stt
+
+Transcribe local audio files with ElevenLabs Speech-to-Text.
+
+## What this skill is for
+
+- Local audio transcription
+- Voice note transcription
+- Optional speaker diarization
+- Language hints and event tagging
+- JSON output for programmatic use
+
+## Requirements
+
+Required binaries:
+- `curl`
+- `jq`
+- `python3`
+
+Preferred auth:
+- `ELEVENLABS_API_KEY` in the environment
+
+Fallback auth:
+- local OpenClaw config lookup, checking `~/.openclaw/secrets.json` first and then `~/.openclaw/openclaw.json`
+
+## Wrapper
+
+Use the bundled script directly:
+
+```bash
+bash skills/elevenlabs-stt/scripts/transcribe.sh /path/to/audio.mp3
+bash skills/elevenlabs-stt/scripts/transcribe.sh /path/to/audio.mp3 --diarize --lang en
+bash skills/elevenlabs-stt/scripts/transcribe.sh /path/to/audio.mp3 --json
+bash skills/elevenlabs-stt/scripts/transcribe.sh /path/to/audio.mp3 --events
+```
+
+## Notes
+
+- Uses ElevenLabs STT model `scribe_v2`.
+- Uploads a local file directly to ElevenLabs.
+- If `ELEVENLABS_API_KEY` is not exported, the script tries local OpenClaw config/secrets automatically.
@@ -0,0 +1,46 @@
+---
+name: elevenlabs-stt
+description: Transcribe audio files with ElevenLabs Speech-to-Text (Scribe v2) from the local CLI. Use when you need local audio transcription with optional speaker diarization, language hints, event tagging, or JSON output via scripts/transcribe.sh.
+---
+
+# ElevenLabs Speech-to-Text
+
+Use `scripts/transcribe.sh` to transcribe a local audio file with ElevenLabs STT.
+
+## Requirements
+
+Preferred: set `ELEVENLABS_API_KEY` in the environment before running the script.
+
+Fallback: if the environment variable is not set, the script will try to read the key from local OpenClaw config files in `~/.openclaw/`.
+
+Required binaries:
+- `curl`
+- `jq`
+- `python3`
+
+## Usage
+
+Run from the skill directory or call the script by full path.
+
+Examples:
+
+```bash
+scripts/transcribe.sh /path/to/audio.mp3
+scripts/transcribe.sh /path/to/audio.mp3 --diarize --lang en
+scripts/transcribe.sh /path/to/audio.mp3 --json
+scripts/transcribe.sh /path/to/audio.mp3 --events
+```
+
+## Options
+
+- `--diarize` — enable speaker diarization
+- `--lang CODE` — pass an ISO language code hint such as `en`, `es`, or `fr`
+- `--json` — print the full JSON response instead of only transcript text
+- `--events` — include audio event tagging when supported
+
+## Notes
+
+- The script uploads a local file directly to ElevenLabs.
+- The model is fixed to `scribe_v2` in the current script.
+- The script returns plain transcript text by default, or pretty-printed JSON with `--json`.
+- If the API returns an error payload, the script prints the error and exits non-zero.
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# ElevenLabs Speech-to-Text transcription script
+# Usage: transcribe.sh <audio_file> [options]
+
+#######################################
+# Print usage information to stdout and terminate the script.
+# Arguments: none
+# Outputs:   usage text on stdout
+# Returns:   does not return; exits the shell with status 0
+#######################################
+show_help() {
+    local prog
+    prog=$(basename "$0")
+    cat << EOF
+Usage: ${prog} <audio_file> [options]
+
+Options:
+  --diarize     Enable speaker diarization
+  --lang CODE   ISO language code (e.g., en, pt, es, fr)
+  --json        Output full JSON response
+  --events      Tag audio events (laughter, music, etc.)
+  -h, --help    Show this help
+
+Environment:
+  ELEVENLABS_API_KEY  API key. If unset, the key is looked up in
+                      ~/.openclaw/secrets.json, then ~/.openclaw/openclaw.json
+
+Examples:
+  ${prog} voice_note.ogg
+  ${prog} meeting.mp3 --diarize --lang en
+  ${prog} podcast.mp3 --json > transcript.json
+  ${prog} interview.mp3 --events
+EOF
+    exit 0
+}
+
+# Defaults: every flag starts disabled and no audio file is selected.
+DIARIZE="false"
+LANG_CODE=""
+JSON_OUTPUT="false"
+TAG_EVENTS="false"
+FILE=""
+
+# Parse arguments. Options may appear before or after the audio file; if
+# several positional arguments are given, the last one is used as the file.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -h|--help) show_help ;;
+        --diarize) DIARIZE="true"; shift ;;
+        --lang)
+            # Under `set -u` a bare "$2" aborts with "unbound variable" when
+            # --lang is the last argument, so check for the value explicitly.
+            # A value starting with "-" is another option, not a language code.
+            if [[ -z "${2:-}" || "${2:-}" == -* ]]; then
+                echo "Error: --lang requires a language code (e.g., en)" >&2
+                exit 1
+            fi
+            LANG_CODE="$2"
+            shift 2
+            ;;
+        --json) JSON_OUTPUT="true"; shift ;;
+        --events) TAG_EVENTS="true"; shift ;;
+        -*) echo "Unknown option: $1" >&2; exit 1 ;;
+        *) FILE="$1"; shift ;;
+    esac
+done
+
+# Validate: an audio file must be named and must exist as a regular file.
+if [[ -z "$FILE" ]]; then
+    echo "Error: No audio file specified" >&2
+    # show_help ends with `exit 0`, which would report success for a usage
+    # error. Run it in a subshell so only the subshell exits, send the usage
+    # text to stderr with the error, and then fail with a non-zero status.
+    ( show_help ) >&2
+    exit 1
+fi
+
+if [[ ! -f "$FILE" ]]; then
+    echo "Error: File not found: $FILE" >&2
+    exit 1
+fi
+
+# Resolve the API key. The ELEVENLABS_API_KEY environment variable wins;
+# otherwise probe the local OpenClaw JSON files in order (secrets.json first,
+# then openclaw.json) and keep the first non-empty key that turns up.
+API_KEY="${ELEVENLABS_API_KEY:-}"
+if [[ -z "$API_KEY" ]]; then
+    OPENCLAW_DIR="${HOME}/.openclaw"
+    for CANDIDATE in "$OPENCLAW_DIR/secrets.json" "$OPENCLAW_DIR/openclaw.json"; do
+        # Skip files that are not present.
+        [[ -f "$CANDIDATE" ]] || continue
+        # The helper prints the key, or an empty line when the file cannot be
+        # read/parsed or holds no key at any of the known locations.
+        API_KEY=$(python3 - "$CANDIDATE" <<'PY'
+import json
+import sys
+
+
+def dig(doc, trail):
+    """Walk nested dicts along trail; return None when a step is missing."""
+    node = doc
+    for part in trail:
+        if not (isinstance(node, dict) and part in node):
+            return None
+        node = node[part]
+    return node
+
+
+try:
+    with open(sys.argv[1]) as handle:
+        config = json.load(handle)
+except Exception:
+    # An unreadable or malformed file is treated as containing no key.
+    config = None
+
+found = ""
+for trail in (
+    ("elevenlabs", "apiKey"),
+    ("messages", "tts", "elevenlabs", "apiKey"),
+):
+    value = dig(config, trail)
+    if isinstance(value, str) and value:
+        found = value
+        break
+print(found)
+PY
+)
+        # Stop at the first file that yields a key.
+        [[ -z "$API_KEY" ]] || break
+    done
+fi
+if [[ -z "$API_KEY" ]]; then
+    echo "Error: ELEVENLABS_API_KEY not set and no local OpenClaw ElevenLabs key was found" >&2
+    exit 1
+fi
+
+# Build curl command
+CURL_ARGS=(
+    -s
+    -X POST
+    "https://api.elevenlabs.io/v1/speech-to-text"
+    -H "xi-api-key: $API_KEY"
+    -F "file=@$FILE"
+    -F "model_id=scribe_v2"
+    -F "diarize=$DIARIZE"
+    -F "tag_audio_events=$TAG_EVENTS"
+)
+
+if [[ -n "$LANG_CODE" ]]; then
+    CURL_ARGS+=(-F "language_code=$LANG_CODE")
+fi
+
+# Make request
+RESPONSE=$(curl "${CURL_ARGS[@]}")
+
+# Check for errors
+if echo "$RESPONSE" | grep -q '"detail"'; then
+    echo "Error from API:" >&2
+    echo "$RESPONSE" | jq -r '.detail.message // .detail' >&2
+    exit 1
+fi
+
+# Output: pretty-printed JSON with --json, otherwise the transcript text.
+if [[ "$JSON_OUTPUT" == "true" ]]; then
+    jq . <<<"$RESPONSE"
+else
+    # `strings` emits .text only when it is a string, and -e then fails when
+    # nothing was emitted. An empty transcript (e.g. silent audio) therefore
+    # still counts as text instead of falling through to the raw dump.
+    if TEXT=$(jq -er '.text | strings' <<<"$RESPONSE" 2>/dev/null); then
+        # printf, not echo: a transcript such as "-n" must be printed as-is.
+        printf '%s\n' "$TEXT"
+    else
+        # No usable text field (or the body is not JSON): show the raw response.
+        printf '%s\n' "$RESPONSE"
+    fi
+fi