From fedabb0924f79387db6f49891cb69a7969e43478 Mon Sep 17 00:00:00 2001
From: Natalie <natalie@lilithuwu.com>
Date: Sun, 17 May 2026 18:12:14 -0700
Subject: [PATCH] =?UTF-8?q?docs(@scripts):=20=E2=9C=A8=20update=20rvoice?=
 =?UTF-8?q?=20docs=20to=20use=20LAN=20speech-synthesis?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
---
 bin/rvoice             | 40 ++++++++++++----------
 docs/rvoice.md         | 77 ++++++++++++++++++++++++------------------
 hammerspoon/rvoice.lua |  3 +-
 3 files changed, 69 insertions(+), 51 deletions(-)

diff --git a/bin/rvoice b/bin/rvoice
index 583c4f8..b523ebc 100755
--- a/bin/rvoice
+++ b/bin/rvoice
@@ -4,8 +4,9 @@
 # Designed for the case where claude runs on another host (apricot) and the
 # mic + keyboard are on the local Mac. /voice doesn't work over ssh because
 # the claude binary tries to open the *remote* host's microphone. This
-# helper records locally, transcribes via Groq Whisper (no local RAM hit),
-# and injects the transcript into the active remote tmux session via
+# helper records locally, transcribes via apricot's LAN speech-synthesis
+# service (Whisper, GPU-accelerated, no external API dependency), and
+# injects the transcript into the active remote tmux session via
 # `tmux send-keys` over ssh.
 #
 # Usage:
@@ -16,8 +17,12 @@
 #                       active iTerm2 tab (debug)
 #
 # Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude).
-# Required env: GROQ_API_KEY. Optional:
-#   RVOICE_MODEL=whisper-large-v3-turbo       (default; very fast)
+# Transcription uses the LAN-resident speech-synthesis service on apricot
+# (Whisper, GPU-accelerated, no external API dependency); no API key needed.
+# Optional env:
+#   RVOICE_STT_URL=http://apricot.lan:8000   (speech-synthesis service base URL)
+#   RVOICE_MODEL=base                         (tiny|base|small|medium|large-v2|large-v3)
+#   RVOICE_LANG=en                            (default en; set empty for auto-detect)
 #   RVOICE_HOST=apricot.lan                   (overrides iTerm2 detection)
 #   RVOICE_SESSION=claude-natalie-...         (overrides iTerm2 detection)
 #   RVOICE_AUTOSEND=1                         (append Enter; default 0)
@@ -31,7 +36,9 @@ set -eu
 CONF_DIR=${XDG_CONFIG_HOME:-$HOME/.config}/rvoice
 [ -r "$CONF_DIR/config" ] && . "$CONF_DIR/config"
 
-MODEL=${RVOICE_MODEL:-whisper-large-v3-turbo}
+STT_URL=${RVOICE_STT_URL:-http://apricot.lan:8000}
+MODEL=${RVOICE_MODEL:-base}
+LANG_HINT=${RVOICE_LANG-en}   # no colon: unset -> en, explicitly empty -> no language field (auto-detect)
 AUTOSEND=${RVOICE_AUTOSEND:-0}
 MIN_MS=${RVOICE_MIN_MS:-200}
 MAX_S=${RVOICE_MAX_S:-60}
@@ -129,19 +136,18 @@ cmd_stop() {
     fi
     [ -s "$WAV_FILE" ] || { log "stop: empty recording"; notify "empty recording" err; return 1; }
 
-    if [ -z "${GROQ_API_KEY:-}" ]; then
-        notify "GROQ_API_KEY not set" err
-        log "GROQ_API_KEY missing"
-        return 1
-    fi
-    log "transcribing ${_dur_ms}ms via $MODEL"
-    _txt=$(curl -sS --fail-with-body \
-        -H "Authorization: Bearer $GROQ_API_KEY" \
-        -F "file=@$WAV_FILE" \
+    log "transcribing ${_dur_ms}ms via $STT_URL (model=$MODEL lang=$LANG_HINT)"
+    _resp=$(curl -sS --fail-with-body \
+        -F "audio=@$WAV_FILE" \
         -F "model=$MODEL" \
-        -F "response_format=json" \
-        https://api.groq.com/openai/v1/audio/transcriptions \
-        | jq -r '.text // empty')
+        ${LANG_HINT:+-F "language=$LANG_HINT"} \
+        -F "task=transcribe" \
+        "$STT_URL/stt/transcribe" 2>>"$LOG_FILE") || {
+        notify "STT request failed" err
+        log "curl failed against $STT_URL/stt/transcribe"
+        return 1
+    }
+    _txt=$(printf '%s' "$_resp" | jq -r '.text // empty')
     rm -f "$WAV_FILE"
     if [ -z "$_txt" ]; then
         notify "transcription empty" err
diff --git a/docs/rvoice.md b/docs/rvoice.md
index c5ad7ad..f220cb0 100644
--- a/docs/rvoice.md
+++ b/docs/rvoice.md
@@ -4,11 +4,12 @@
 running on**. When you're sshed to apricot through `cc` / `rclaude resume`,
 that's apricot — which has no mic. `rvoice` fills the gap.
 
-It records audio locally on macOS, transcribes via Groq Whisper (no local model
-RAM), and injects the transcript into the active remote tmux session via
-`tmux send-keys` over ssh. The target session is auto-detected from the
-focused iTerm2 tab title (set by the canonical session-tools `tmux.conf` to
-`<host> · <session>`).
+It records audio locally on macOS, transcribes via the **LAN speech-synthesis
+service on apricot** (Whisper, GPU-accelerated, no API keys / no network
+egress beyond the local LAN), and injects the transcript into the active
+remote tmux session via `tmux send-keys` over ssh. The target session is
+auto-detected from the focused iTerm2 tab title (set by the canonical
+session-tools `tmux.conf` to `<host> · <session>`).
 
 ## Architecture
 
@@ -17,13 +18,14 @@ focused iTerm2 tab title (set by the canonical session-tools `tmux.conf` to
 [ Right ⌥ up ]    ──Hammerspoon──▶  rvoice stop
                                           │
                                           ▼
-                            POST WAV → Groq /audio/transcriptions
+                  POST WAV → http://apricot.lan:8000/stt/transcribe
+                          (faster-whisper on GPU, default model: base)
                                           │
                                           ▼
-                            iTerm2 active tab title → "apricot · claude-…"
+                  iTerm2 active tab title → "apricot · claude-…"
                                           │
                                           ▼
-                            ssh apricot tmux send-keys -t claude-… -l "<text>"
+                  ssh apricot tmux send-keys -t claude-… -l "<text>"
 ```
 
 ## Files
@@ -32,25 +34,23 @@ focused iTerm2 tab title (set by the canonical session-tools `tmux.conf` to
 |------------------------------------------------------|---------------------------------------|
 | `bin/rvoice`                                         | CLI: `start`/`stop`/`cancel`/`target`/`log` |
 | `hammerspoon/rvoice.lua`                             | Right-⌥ hold detector → calls `rvoice` |
-| `~/.config/rvoice/config`                            | Sourced at startup; holds `GROQ_API_KEY` and tweaks |
+| `~/.config/rvoice/config`                            | Sourced at startup; overrides STT URL, model, etc. |
 | `$TMPDIR/rvoice/`                                    | Per-recording state (pid, wav, log)   |
 
 ## Install
 
-Prerequisites: `ffmpeg`, `jq`, `curl` (all `brew install`able), a Groq API key
-(free tier — https://console.groq.com/keys), and Hammerspoon
-(`brew install --cask hammerspoon`).
+Prerequisites: `ffmpeg`, `jq`, `curl` (all `brew install`able), Hammerspoon
+(`brew install --cask hammerspoon`), and the LAN speech-synthesis service
+running on apricot (already deployed at `apricot.lan:8000`, exposes
+`/stt/transcribe`). No API keys, no cloud round-trip.
 
 ```sh
 # 1. Symlink rvoice (already done if you ran install.sh)
 ln -sfn ~/Code/@scripts/session-tools/bin/rvoice ~/.local/bin/rvoice
 
-# 2. Drop your Groq key
-mkdir -p ~/.config/rvoice
-cat >> ~/.config/rvoice/config <<'EOF'
-export GROQ_API_KEY=gsk_...your_key...
-# export RVOICE_AUTOSEND=1     # uncomment to auto-press Enter after injection
-EOF
+# 2. (Optional) override defaults in ~/.config/rvoice/config — see the
+#    "Config" section below. The default is to POST to apricot.lan:8000 and
+#    use the `base` Whisper model.
 
 # 3. Wire up Hammerspoon
 mkdir -p ~/.hammerspoon
@@ -60,6 +60,11 @@ open /Applications/Hammerspoon.app
 
 # 4. From Hammerspoon's menu bar → Reload Config.
 #    Grant Accessibility + Microphone permission when macOS prompts.
+
+# 5. Smoke-test the STT endpoint without Hammerspoon:
+ffmpeg -f avfoundation -i ":0" -ac 1 -ar 16000 -t 5 /tmp/me.wav
+curl -F "audio=@/tmp/me.wav" -F "model=base" -F "language=en" -F "task=transcribe" \
+  http://apricot.lan:8000/stt/transcribe | jq .text
 ```
 
 ## Usage
@@ -79,8 +84,9 @@ From any iTerm2 tab that's attached to a remote claude session via `cc` or
 Plain shell fragment sourced at startup. Defaults shown.
 
 ```sh
-export GROQ_API_KEY=...                              # REQUIRED
-export RVOICE_MODEL=whisper-large-v3-turbo           # Groq model id
+export RVOICE_STT_URL=http://apricot.lan:8000        # speech-synthesis service
+export RVOICE_MODEL=base                             # tiny|base|small|medium|large-v2|large-v3
+export RVOICE_LANG=en                                # language hint; empty = auto-detect
 export RVOICE_AUTOSEND=0                             # 1 = press Enter after inject
 export RVOICE_MIN_MS=200                             # ignore taps shorter than this (debounce)
 export RVOICE_MAX_S=60                               # hard cap on a single recording
@@ -88,7 +94,12 @@ export RVOICE_HOST=apricot.lan                       # force target host (overri
 export RVOICE_SESSION=claude-natalie-…               # force target tmux session
 ```
 
-Override any of these per-invocation: `RVOICE_AUTOSEND=1 rvoice stop`.
+Override any of these per-invocation: `RVOICE_MODEL=small rvoice stop`.
+
+**Model trade-offs** (apricot's GPU; latency rough):
+- `tiny` / `base` — sub-second, fine for short prompts
+- `small` — ~1s, noticeable quality bump
+- `medium` / `large-v3` — 2-4s, near-perfect, worth it for paragraphs
 
 ## Subcommands
 
@@ -102,9 +113,9 @@ rvoice log      # tail -50 of the action log
 
 ## Troubleshooting
 
-- **"GROQ_API_KEY not set"** — Hammerspoon's shell environment doesn't inherit
-  from your login shell. Make sure the key is exported in
-  `~/.config/rvoice/config`; rvoice sources that file before each invocation.
+- **"STT request failed"** — apricot's speech service is unreachable or returned
+  an HTTP error. Check `curl http://apricot.lan:8000/health` and the relevant unit
+  in `ssh apricot.lan systemctl --user status`. Most likely you're off the LAN/VPN.
 - **"no target session resolvable"** — the focused iTerm2 tab title isn't in
   `<host> · <session>` format. Either: (a) you're not in an rclaude/ssh
   session, or (b) the remote tmux config didn't get the title-setting fragment.
@@ -113,16 +124,15 @@ rvoice log      # tail -50 of the action log
 - **Hammerspoon doesn't see Right ⌥** — System Settings → Privacy &
   Security → Accessibility → enable Hammerspoon. Also Microphone for the
   recording step. Restart Hammerspoon after granting.
-- **Transcription returns nonsense** — Groq's `whisper-large-v3-turbo` is
-  multilingual but English-biased. Set `RVOICE_MODEL=whisper-large-v3` for
-  the slower but more accurate variant.
+- **Transcription returns empty / nonsense** — bump the model: `RVOICE_MODEL=small`
+  or `medium`. Default `base` trades accuracy for sub-second latency. Models
+  list: `curl http://apricot.lan:8000/stt/models`.
 - **Injection types into the wrong session** — `rvoice target` shows what it
   will hit. If wrong, set `RVOICE_HOST` / `RVOICE_SESSION` in config to pin
   the target.
-- **Latency feels high** — Groq is fast (~500ms for short clips). Network
-  latency to plum + ssh round-trip to apricot adds ~200ms. Local Whisper
-  would be slower in practice on most laptops once you account for model
-  load.
+- **Latency feels high** — first call after service idle warms the model on
+  apricot's GPU (1-2s one-time). Subsequent calls are sub-second for `base`.
+  Switch to `tiny` for the lowest-latency tier.
 
 ## Why this architecture (vs. /voice over ssh)
 
@@ -137,6 +147,7 @@ remote rclaude session would be:
    claude release)
 3. **Reproduce /voice's behavior with our own pieces** ← this is rvoice
 
-`rvoice` keeps the mic and the hotkey on the Mac, runs transcription on a
-hosted endpoint (zero local RAM), and uses tmux's existing send-keys
+`rvoice` keeps the mic and the hotkey on the Mac, runs transcription on
+apricot's own LAN-resident speech-synthesis service (GPU Whisper, zero
+local model RAM, no cloud egress), and uses tmux's existing send-keys
 protocol to deliver text — every layer is well-understood and stable.
diff --git a/hammerspoon/rvoice.lua b/hammerspoon/rvoice.lua
index c93d5b0..106914b 100644
--- a/hammerspoon/rvoice.lua
+++ b/hammerspoon/rvoice.lua
@@ -41,7 +41,8 @@ local function run(cmd)
             hs.printf("[rvoice] %s exited %d: %s", cmd, exit, err or "")
         end
     end, {"-c", RVOICE .. " " .. cmd})
-    -- Inherit user shell env so GROQ_API_KEY (and PATH for ffmpeg/jq) work.
+    -- Inherit user shell env so PATH for ffmpeg/jq is set and rvoice can
+    -- source ~/.config/rvoice/config to pick up any user overrides.
     t:setEnvironment(hs.execute("env", true):gsub("\n$", "") and nil or nil)
     t:start()
 end