From fedabb0924f79387db6f49891cb69a7969e43478 Mon Sep 17 00:00:00 2001 From: Natalie Date: Sun, 17 May 2026 18:12:14 -0700 Subject: [PATCH] =?UTF-8?q?docs(@scripts):=20=E2=9C=A8=20update=20rvoice?= =?UTF-8?q?=20docs=20to=20use=20LAN=20speech-synthesis?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- bin/rvoice | 40 ++++++++++++---------- docs/rvoice.md | 77 ++++++++++++++++++++++++------------------ hammerspoon/rvoice.lua | 3 +- 3 files changed, 69 insertions(+), 51 deletions(-) diff --git a/bin/rvoice b/bin/rvoice index 583c4f8..b523ebc 100755 --- a/bin/rvoice +++ b/bin/rvoice @@ -4,8 +4,9 @@ # Designed for the case where claude runs on another host (apricot) and the # mic + keyboard are on the local Mac. /voice doesn't work over ssh because # the claude binary tries to open the *remote* host's microphone. This -# helper records locally, transcribes via Groq Whisper (no local RAM hit), -# and injects the transcript into the active remote tmux session via +# helper records locally, transcribes via apricot's LAN speech-synthesis +# service (Whisper, GPU-accelerated, no external API dependency), and +# injects the transcript into the active remote tmux session via # `tmux send-keys` over ssh. # # Usage: @@ -16,8 +17,12 @@ # active iTerm2 tab (debug) # # Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude). -# Required env: GROQ_API_KEY. Optional: -# RVOICE_MODEL=whisper-large-v3-turbo (default; very fast) +# Transcription uses the LAN-resident speech-synthesis service on apricot +# (Whisper, GPU-accelerated, no external API dependency). +# Optional env: +# RVOICE_STT_URL=http://apricot.lan:8000 (speech-synthesis service base URL) +# RVOICE_MODEL=base (tiny|base|small|medium|large-v2|large-v3) +# RVOICE_LANG=en (language hint; set to empty for auto-detect) # RVOICE_HOST=apricot.lan (overrides iTerm2 detection) # RVOICE_SESSION=claude-natalie-... 
(overrides iTerm2 detection) # RVOICE_AUTOSEND=1 (append Enter; default 0) @@ -31,7 +36,9 @@ set -eu CONF_DIR=${XDG_CONFIG_HOME:-$HOME/.config}/rvoice [ -r "$CONF_DIR/config" ] && . "$CONF_DIR/config" -MODEL=${RVOICE_MODEL:-whisper-large-v3-turbo} +STT_URL=${RVOICE_STT_URL:-http://apricot.lan:8000} +MODEL=${RVOICE_MODEL:-base} +LANG_HINT=${RVOICE_LANG-en} AUTOSEND=${RVOICE_AUTOSEND:-0} MIN_MS=${RVOICE_MIN_MS:-200} MAX_S=${RVOICE_MAX_S:-60} @@ -129,19 +136,18 @@ cmd_stop() { fi [ -s "$WAV_FILE" ] || { log "stop: empty recording"; notify "empty recording" err; return 1; } - if [ -z "${GROQ_API_KEY:-}" ]; then - notify "GROQ_API_KEY not set" err - log "GROQ_API_KEY missing" - return 1 - fi - log "transcribing ${_dur_ms}ms via $MODEL" - _txt=$(curl -sS --fail-with-body \ - -H "Authorization: Bearer $GROQ_API_KEY" \ - -F "file=@$WAV_FILE" \ + log "transcribing ${_dur_ms}ms via $STT_URL (model=$MODEL lang=$LANG_HINT)" + _resp=$(curl -sS --fail-with-body \ + -F "audio=@$WAV_FILE" \ -F "model=$MODEL" \ - -F "response_format=json" \ - https://api.groq.com/openai/v1/audio/transcriptions \ - | jq -r '.text // empty') + ${LANG_HINT:+-F "language=$LANG_HINT"} \ + -F "task=transcribe" \ + "$STT_URL/stt/transcribe" 2>>"$LOG_FILE") || { + notify "STT request failed" err + log "curl failed against $STT_URL/stt/transcribe" + return 1 + } + _txt=$(printf '%s' "$_resp" | jq -r '.text // empty') rm -f "$WAV_FILE" if [ -z "$_txt" ]; then notify "transcription empty" err diff --git a/docs/rvoice.md b/docs/rvoice.md index c5ad7ad..f220cb0 100644 --- a/docs/rvoice.md +++ b/docs/rvoice.md @@ -4,11 +4,12 @@ running on**. When you're sshed to apricot through `cc` / `rclaude resume`, that's apricot — which has no mic. `rvoice` fills the gap. -It records audio locally on macOS, transcribes via Groq Whisper (no local model -RAM), and injects the transcript into the active remote tmux session via -`tmux send-keys` over ssh. 
The target session is auto-detected from the -focused iTerm2 tab title (set by the canonical session-tools `tmux.conf` to -` · `). +It records audio locally on macOS, transcribes via the **LAN speech-synthesis +service on apricot** (Whisper, GPU-accelerated, no API keys / no network +egress beyond the local LAN), and injects the transcript into the active +remote tmux session via `tmux send-keys` over ssh. The target session is +auto-detected from the focused iTerm2 tab title (set by the canonical +session-tools `tmux.conf` to ` · `). ## Architecture @@ -17,13 +18,14 @@ focused iTerm2 tab title (set by the canonical session-tools `tmux.conf` to [ Right ⌥ up ] ──Hammerspoon──▶ rvoice stop │ ▼ - POST WAV → Groq /audio/transcriptions + POST WAV → http://apricot.lan:8000/stt/transcribe + (faster-whisper on GPU, ~base model) │ ▼ - iTerm2 active tab title → "apricot · claude-…" + iTerm2 active tab title → "apricot · claude-…" │ ▼ - ssh apricot tmux send-keys -t claude-… -l "" + ssh apricot tmux send-keys -t claude-… -l "" ``` ## Files @@ -32,25 +34,23 @@ focused iTerm2 tab title (set by the canonical session-tools `tmux.conf` to |------------------------------------------------------|---------------------------------------| | `bin/rvoice` | CLI: `start`/`stop`/`cancel`/`target`/`log` | | `hammerspoon/rvoice.lua` | Right-⌥ hold detector → calls `rvoice` | -| `~/.config/rvoice/config` | Sourced at startup; holds `GROQ_API_KEY` and tweaks | +| `~/.config/rvoice/config` | Sourced at startup; overrides STT URL, model, etc. | | `$TMPDIR/rvoice/` | Per-recording state (pid, wav, log) | ## Install -Prerequisites: `ffmpeg`, `jq`, `curl` (all `brew install`able), a Groq API key -(free tier — https://console.groq.com/keys), and Hammerspoon -(`brew install --cask hammerspoon`). 
+Prerequisites: `ffmpeg`, `jq`, `curl` (all `brew install`able), Hammerspoon +(`brew install --cask hammerspoon`), and the LAN speech-synthesis service +running on apricot (already deployed at `apricot.lan:8000`, exposes +`/stt/transcribe`). No API keys, no cloud round-trip. ```sh # 1. Symlink rvoice (already done if you ran install.sh) ln -sfn ~/Code/@scripts/session-tools/bin/rvoice ~/.local/bin/rvoice -# 2. Drop your Groq key -mkdir -p ~/.config/rvoice -cat >> ~/.config/rvoice/config <<'EOF' -export GROQ_API_KEY=gsk_...your_key... -# export RVOICE_AUTOSEND=1 # uncomment to auto-press Enter after injection -EOF +# 2. (Optional) override defaults in ~/.config/rvoice/config — see the +# "Config" section below. The default is to POST to apricot.lan:8000 and +# use the `base` Whisper model. # 3. Wire up Hammerspoon mkdir -p ~/.hammerspoon @@ -60,6 +60,11 @@ open /Applications/Hammerspoon.app # 4. From Hammerspoon's menu bar → Reload Config. # Grant Accessibility + Microphone permission when macOS prompts. + +# 5. Smoke-test the STT endpoint without Hammerspoon: +ffmpeg -f avfoundation -i ":0" -ac 1 -ar 16000 -t 5 /tmp/me.wav +curl -F "audio=@/tmp/me.wav" -F "model=base" -F "language=en" -F "task=transcribe" \ + http://apricot.lan:8000/stt/transcribe | jq .text ``` ## Usage @@ -79,8 +84,9 @@ From any iTerm2 tab that's attached to a remote claude session via `cc` or Plain shell fragment sourced at startup. Defaults shown. ```sh -export GROQ_API_KEY=... 
# REQUIRED -export RVOICE_MODEL=whisper-large-v3-turbo # Groq model id +export RVOICE_STT_URL=http://apricot.lan:8000 # speech-synthesis service +export RVOICE_MODEL=base # tiny|base|small|medium|large-v2|large-v3 +export RVOICE_LANG=en # language hint; unset defaults to en export RVOICE_AUTOSEND=0 # 1 = press Enter after inject export RVOICE_MIN_MS=200 # ignore taps shorter than this (debounce) export RVOICE_MAX_S=60 # hard cap on a single recording @@ -88,7 +94,12 @@ export RVOICE_HOST=apricot.lan # force target host (overri export RVOICE_SESSION=claude-natalie-… # force target tmux session ``` -Override any of these per-invocation: `RVOICE_AUTOSEND=1 rvoice stop`. +Override any of these per-invocation: `RVOICE_MODEL=small rvoice stop`. + +**Model trade-offs** (on apricot's GPU; latencies are rough estimates): +- `tiny.en` / `base` — sub-second, fine for short prompts +- `small` — ~1s, noticeable quality bump +- `medium` / `large-v3` — 2-4s, near-perfect, worth it for paragraphs ## Subcommands @@ -102,9 +113,9 @@ rvoice log # tail -50 of the action log ## Troubleshooting -- **"GROQ_API_KEY not set"** — Hammerspoon's shell environment doesn't inherit - from your login shell. Make sure the key is exported in - `~/.config/rvoice/config`; rvoice sources that file before each invocation. +- **"STT request failed"** — apricot's speech service isn't reachable. Check + `curl http://apricot.lan:8000/health` and `ssh apricot.lan systemctl --user + status` for the relevant unit. Most likely you're off the LAN/VPN. - **"no target session resolvable"** — the focused iTerm2 tab title isn't in ` · ` format. Either: (a) you're not in an rclaude/ssh session, or (b) the remote tmux config didn't get the title-setting fragment. @@ -113,16 +124,15 @@ rvoice log # tail -50 of the action log - **Hammerspoon doesn't see Right ⌥** — System Settings → Privacy & Security → Accessibility → enable Hammerspoon. Also Microphone for the recording step. Restart Hammerspoon after granting. 
-- **Transcription returns nonsense** — Groq's `whisper-large-v3-turbo` is - multilingual but English-biased. Set `RVOICE_MODEL=whisper-large-v3` for - the slower but more accurate variant. +- **Transcription returns empty / nonsense** — bump the model: `RVOICE_MODEL=small` + or `medium`. Default `base` trades accuracy for sub-second latency. Models + list: `curl http://apricot.lan:8000/stt/models`. - **Injection types into the wrong session** — `rvoice target` shows what it will hit. If wrong, set `RVOICE_HOST` / `RVOICE_SESSION` in config to pin the target. -- **Latency feels high** — Groq is fast (~500ms for short clips). Network - latency to plum + ssh round-trip to apricot adds ~200ms. Local Whisper - would be slower in practice on most laptops once you account for model - load. +- **Latency feels high** — first call after service idle warms the model on + apricot's GPU (1-2s one-time). Subsequent calls are sub-second for `base`. + Switch to `tiny.en` for the lowest-latency tier. ## Why this architecture (vs. /voice over ssh) @@ -137,6 +147,7 @@ remote rclaude session would be: claude release) 3. **Reproduce /voice's behavior with our own pieces** ← this is rvoice -`rvoice` keeps the mic and the hotkey on the Mac, runs transcription on a -hosted endpoint (zero local RAM), and uses tmux's existing send-keys +`rvoice` keeps the mic and the hotkey on the Mac, runs transcription on +apricot's own LAN-resident speech-synthesis service (GPU Whisper, zero +local model RAM, no cloud egress), and uses tmux's existing send-keys protocol to deliver text — every layer is well-understood and stable. diff --git a/hammerspoon/rvoice.lua b/hammerspoon/rvoice.lua index c93d5b0..106914b 100644 --- a/hammerspoon/rvoice.lua +++ b/hammerspoon/rvoice.lua @@ -41,7 +41,8 @@ local function run(cmd) hs.printf("[rvoice] %s exited %d: %s", cmd, exit, err or "") end end, {"-c", RVOICE .. " " .. cmd}) - -- Inherit user shell env so GROQ_API_KEY (and PATH for ffmpeg/jq) work. 
+ -- Meant to inherit user shell env so PATH for ffmpeg/jq is set; rvoice sources + -- ~/.config/rvoice/config itself for overrides. NOTE(review): arg below is always nil. t:setEnvironment(hs.execute("env", true):gsub("\n$", "") and nil or nil) t:start() end