From 976888f002e8a0f7e5b1d2e7cb3672c08da627bc Mon Sep 17 00:00:00 2001 From: Stefano Fiorini Date: Sun, 8 Mar 2026 21:11:09 -0500 Subject: [PATCH] Add elevenlabs-stt skill and documentation --- README.md | 1 + docs/README.md | 1 + docs/elevenlabs-stt.md | 41 ++++++ skills/elevenlabs-stt/SKILL.md | 46 +++++++ skills/elevenlabs-stt/scripts/transcribe.sh | 143 ++++++++++++++++++++ 5 files changed, 232 insertions(+) create mode 100644 docs/elevenlabs-stt.md create mode 100644 skills/elevenlabs-stt/SKILL.md create mode 100755 skills/elevenlabs-stt/scripts/transcribe.sh diff --git a/README.md b/README.md index af07729..8465d79 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ This repository is intended to be a simple skill source: install the repo (or a | Skill | What it does | Path | |---|---|---| +| `elevenlabs-stt` | Transcribe local audio files with ElevenLabs Speech-to-Text, with diarization, language hints, event tags, and JSON output. | `skills/elevenlabs-stt` | | `gitea-api` | Interact with Gitea via REST API (repos, issues, PRs, releases, branches, user info). | `skills/gitea-api` | | `portainer` | Manage Portainer stacks via API (list, start/stop/restart, update, prune images). | `skills/portainer` | | `searxng` | Search through a local or self-hosted SearXNG instance for web, news, images, and more. | `skills/searxng` | diff --git a/docs/README.md b/docs/README.md index b438efb..5da9453 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,6 +4,7 @@ This folder contains detailed docs for each skill in this repository. 
## Skills +- [`elevenlabs-stt`](elevenlabs-stt.md) — Local audio transcription through ElevenLabs Speech-to-Text - [`gitea-api`](gitea-api.md) — REST-based Gitea automation (no `tea` CLI required) - [`portainer`](portainer.md) — Portainer stack management (list, lifecycle, updates, image pruning) - [`searxng`](searxng.md) — Privacy-respecting metasearch via a local or self-hosted SearXNG instance diff --git a/docs/elevenlabs-stt.md b/docs/elevenlabs-stt.md new file mode 100644 index 0000000..461584d --- /dev/null +++ b/docs/elevenlabs-stt.md @@ -0,0 +1,41 @@ +# elevenlabs-stt + +Transcribe local audio files with ElevenLabs Speech-to-Text. + +## What this skill is for + +- Local audio transcription +- Voice note transcription +- Optional speaker diarization +- Language hints and event tagging +- JSON output for programmatic use + +## Requirements + +Required binaries: +- `curl` +- `jq` +- `python3` + +Preferred auth: +- `ELEVENLABS_API_KEY` in the environment + +Fallback auth: +- local OpenClaw config lookup from `~/.openclaw/openclaw.json` or `~/.openclaw/secrets.json` + +## Wrapper + +Use the bundled script directly: + +```bash +bash skills/elevenlabs-stt/scripts/transcribe.sh /path/to/audio.mp3 +bash skills/elevenlabs-stt/scripts/transcribe.sh /path/to/audio.mp3 --diarize --lang en +bash skills/elevenlabs-stt/scripts/transcribe.sh /path/to/audio.mp3 --json +bash skills/elevenlabs-stt/scripts/transcribe.sh /path/to/audio.mp3 --events +``` + +## Notes + +- Uses ElevenLabs STT model `scribe_v2`. +- Uploads a local file directly to ElevenLabs. +- If `ELEVENLABS_API_KEY` is not exported, the script tries local OpenClaw config/secrets automatically. diff --git a/skills/elevenlabs-stt/SKILL.md b/skills/elevenlabs-stt/SKILL.md new file mode 100644 index 0000000..29111e1 --- /dev/null +++ b/skills/elevenlabs-stt/SKILL.md @@ -0,0 +1,46 @@ +--- +name: elevenlabs-stt +description: Transcribe audio files with ElevenLabs Speech-to-Text (Scribe v2) from the local CLI. 
Use when you need local audio transcription with optional speaker diarization, language hints, event tagging, or JSON output via scripts/transcribe.sh. +--- + +# ElevenLabs Speech-to-Text + +Use `scripts/transcribe.sh` to transcribe a local audio file with ElevenLabs STT. + +## Requirements + +Preferred: set `ELEVENLABS_API_KEY` in the environment before running the script. + +Fallback: if the environment variable is not set, the script will try to read the key from local OpenClaw config files in `~/.openclaw/`. + +Required binaries: +- `curl` +- `jq` +- `python3` + +## Usage + +Run from the skill directory or call the script by full path. + +Examples: + +```bash +scripts/transcribe.sh /path/to/audio.mp3 +scripts/transcribe.sh /path/to/audio.mp3 --diarize --lang en +scripts/transcribe.sh /path/to/audio.mp3 --json +scripts/transcribe.sh /path/to/audio.mp3 --events +``` + +## Options + +- `--diarize` — enable speaker diarization +- `--lang CODE` — pass an ISO language code hint such as `en`, `es`, or `fr` +- `--json` — print the full JSON response instead of only transcript text +- `--events` — include audio event tagging when supported + +## Notes + +- The script uploads a local file directly to ElevenLabs. +- The model is fixed to `scribe_v2` in the current script. +- The script returns plain transcript text by default, or pretty-printed JSON with `--json`. +- If the API returns an error payload, the script prints the error and exits non-zero. 
#!/usr/bin/env bash
# skills/elevenlabs-stt/scripts/transcribe.sh
#
# ElevenLabs Speech-to-Text transcription script.
#
# Uploads one local audio file to the ElevenLabs speech-to-text endpoint
# (model scribe_v2) and prints the transcript text, or the pretty-printed
# JSON response when --json is given.
#
# Usage: transcribe.sh <audio_file> [options]
#
# Requires:    curl (>= 7.55 for "-H @-"), jq; python3 only for the
#              OpenClaw config fallback.
# Environment: ELEVENLABS_API_KEY (preferred). When it is empty or unset the
#              key is looked up in ~/.openclaw/secrets.json and then
#              ~/.openclaw/openclaw.json.
# Exit status: 0 on success or --help, 1 on any error.
set -euo pipefail

readonly API_URL="https://api.elevenlabs.io/v1/speech-to-text"
readonly MODEL_ID="scribe_v2"

# Print a message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Write the usage text to stdout (callers redirect it when needed).
print_usage() {
  local prog=${0##*/}
  cat <<EOF
Usage: ${prog} <audio_file> [options]

Options:
  --diarize     Enable speaker diarization
  --lang CODE   ISO language code (e.g., en, pt, es, fr)
  --json        Output full JSON response
  --events      Tag audio events (laughter, music, etc.)
  -h, --help    Show this help

Environment:
  ELEVENLABS_API_KEY  API key (falls back to ~/.openclaw/secrets.json
                      or ~/.openclaw/openclaw.json when unset)

Examples:
  ${prog} voice_note.ogg
  ${prog} meeting.mp3 --diarize --lang en
  ${prog} podcast.mp3 --json > transcript.json
EOF
}

# Explicit help request: usage on stdout, successful exit.
show_help() {
  print_usage
  exit 0
}

# Print the ElevenLabs API key stored in one OpenClaw JSON file, or an empty
# line when the file is unreadable, is not valid JSON, or holds no key.
# Arguments: $1 - path to the JSON file
read_openclaw_key() {
  local config_path=$1
  python3 - "$config_path" <<'PY'
import json, sys
path = sys.argv[1]
try:
    with open(path) as f:
        data = json.load(f)
except Exception:
    print("")
    raise SystemExit(0)

candidates = [
    ("elevenlabs", "apiKey"),
    ("messages", "tts", "elevenlabs", "apiKey"),
]
for cand in candidates:
    cur = data
    ok = True
    for key in cand:
        if isinstance(cur, dict) and key in cur:
            cur = cur[key]
        else:
            ok = False
            break
    if ok and isinstance(cur, str) and cur:
        print(cur)
        raise SystemExit(0)
print("")
PY
}

# Print the API key to use: ELEVENLABS_API_KEY when set, otherwise the first
# key found in the local OpenClaw files. Prints nothing when no key exists.
resolve_api_key() {
  local key=${ELEVENLABS_API_KEY:-}
  local openclaw_dir="${HOME:-}/.openclaw"
  local candidate

  if [[ -z "$key" ]]; then
    if command -v python3 >/dev/null 2>&1; then
      for candidate in "$openclaw_dir/secrets.json" "$openclaw_dir/openclaw.json"; do
        if [[ -f "$candidate" ]]; then
          # A broken python3 must not abort the script; treat it as "no key".
          key=$(read_openclaw_key "$candidate") || key=""
          if [[ -n "$key" ]]; then
            break
          fi
        fi
      done
    else
      echo "Warning: python3 not found; skipping OpenClaw config lookup" >&2
    fi
  fi

  printf '%s' "$key"
}

# Parse the command line, upload the file, and print the result.
main() {
  local diarize="false"
  local lang_code=""
  local json_output="false"
  local tag_events="false"
  local file=""

  while (( $# > 0 )); do
    case "$1" in
      -h|--help)
        show_help
        ;;
      --diarize)
        diarize="true"
        shift
        ;;
      --lang)
        # Without this check "set -u" aborts with an opaque
        # "unbound variable" message when the value is missing.
        if (( $# < 2 )); then
          die "Error: --lang requires a language code (e.g., en)"
        fi
        lang_code=$2
        shift 2
        ;;
      --json)
        json_output="true"
        shift
        ;;
      --events)
        tag_events="true"
        shift
        ;;
      -*)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
      *)
        file=$1
        shift
        ;;
    esac
  done

  if [[ -z "$file" ]]; then
    echo "Error: No audio file specified" >&2
    print_usage >&2
    exit 1 # a usage error must not report success
  fi

  if [[ ! -f "$file" ]]; then
    echo "Error: File not found: $file" >&2
    exit 1
  fi

  local tool
  for tool in curl jq; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      die "Error: required command not found: $tool"
    fi
  done

  local api_key
  api_key=$(resolve_api_key)
  if [[ -z "$api_key" ]]; then
    echo "Error: ELEVENLABS_API_KEY not set and no local OpenClaw ElevenLabs key was found" >&2
    exit 1
  fi

  # curl -F treats ';' and ',' in a bare path as field separators, so the
  # path is passed double-quoted with embedded '\' and '"' escaped.
  local form_path=${file//\\/\\\\}
  form_path=${form_path//\"/\\\"}

  local -a curl_args=(
    -sS # quiet, but still report transport errors
    -X POST
    "$API_URL"
    -H @- # header is read from stdin so the key never appears in argv
    -F "file=@\"${form_path}\""
    -F "model_id=${MODEL_ID}"
    -F "diarize=${diarize}"
    -F "tag_audio_events=${tag_events}"
  )

  if [[ -n "$lang_code" ]]; then
    curl_args+=(-F "language_code=${lang_code}")
  fi

  # Append the HTTP status on its own final line so it can be split off.
  curl_args+=(-w '\n%{http_code}')

  local raw
  raw=$(curl "${curl_args[@]}" <<<"xi-api-key: ${api_key}") \
    || die "Error: request to ElevenLabs failed"

  local http_code=${raw##*$'\n'}
  local response=${raw%$'\n'*}

  # An error is a non-2xx status or a top-level "detail" key in the body.
  if [[ "$http_code" != 2?? ]] \
    || jq -e 'type == "object" and has("detail")' <<<"$response" >/dev/null 2>&1; then
    echo "Error from API (HTTP ${http_code}):" >&2
    # Fall back to the raw body when it is not JSON (e.g. a proxy error page).
    jq -r '(.detail.message)? // (.detail)? // .' <<<"$response" >&2 2>/dev/null \
      || printf '%s\n' "$response" >&2
    exit 1
  fi

  if [[ "$json_output" == "true" ]]; then
    jq . <<<"$response"
  else
    # Print only the transcript; fall back to the raw body when there is none.
    local text
    text=$(jq -r '.text // empty' <<<"$response" 2>/dev/null) || text=""
    if [[ -n "$text" ]]; then
      printf '%s\n' "$text"
    else
      printf '%s\n' "$response"
    fi
  fi
}

main "$@"