feat(@scripts): ✨ add remote dictation helper for rclaude sessions
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
3f7c9ecbfc
commit
4968a0bdf8
2 changed files with 277 additions and 0 deletions
206
bin/rvoice
Executable file
206
bin/rvoice
Executable file
|
|
@ -0,0 +1,206 @@
|
|||
#!/bin/sh
|
||||
# rvoice — push-to-talk dictation for remote rclaude sessions.
|
||||
#
|
||||
# Designed for the case where claude runs on another host (apricot) and the
|
||||
# mic + keyboard are on the local Mac. /voice doesn't work over ssh because
|
||||
# the claude binary tries to open the *remote* host's microphone. This
|
||||
# helper records locally, transcribes via Groq Whisper (no local RAM hit),
|
||||
# and injects the transcript into the active remote tmux session via
|
||||
# `tmux send-keys` over ssh.
|
||||
#
|
||||
# Usage:
|
||||
# rvoice start Begin recording (called by Hammerspoon on key-down)
|
||||
# rvoice stop Stop, transcribe, inject (called by Hammerspoon on key-up)
|
||||
# rvoice cancel Stop without transcribing (key-up after very short hold)
|
||||
# rvoice target Resolve and echo `<host>\t<tmux-session>` for the
|
||||
# active iTerm2 tab (debug)
|
||||
#
|
||||
# Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude).
|
||||
# Required env: GROQ_API_KEY. Optional:
|
||||
# RVOICE_MODEL=whisper-large-v3-turbo (default; very fast)
|
||||
# RVOICE_HOST=apricot.lan (overrides iTerm2 detection)
|
||||
# RVOICE_SESSION=claude-natalie-... (overrides iTerm2 detection)
|
||||
# RVOICE_AUTOSEND=1 (append Enter; default 0)
|
||||
# RVOICE_MIN_MS=200 (ignore taps shorter than this)
|
||||
# RVOICE_MAX_S=60 (hard cap on recording length)
|
||||
#
|
||||
# State lives in $TMPDIR/rvoice/ — one recording at a time.
|
||||
|
||||
# Abort on any unhandled command failure and on use of an unset variable.
set -e
set -u

# User configuration: a plain shell fragment, sourced only if it is readable.
CONF_DIR="${XDG_CONFIG_HOME:-$HOME/.config}/rvoice"
if [ -r "$CONF_DIR/config" ]; then
  . "$CONF_DIR/config"
fi

# Tunables; each falls back to its default when the RVOICE_* variable is
# unset or empty.
MAX_S="${RVOICE_MAX_S:-60}"
MIN_MS="${RVOICE_MIN_MS:-200}"
AUTOSEND="${RVOICE_AUTOSEND:-0}"
MODEL="${RVOICE_MODEL:-whisper-large-v3-turbo}"

# Scratch state for the single in-flight recording.
STATE_DIR="${TMPDIR:-/tmp}/rvoice"
mkdir -p "$STATE_DIR"
LOG_FILE="$STATE_DIR/log"
START_FILE="$STATE_DIR/start-ms"
WAV_FILE="$STATE_DIR/recording.wav"
PID_FILE="$STATE_DIR/ffmpeg.pid"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Append one line to $LOG_FILE, prefixed with "[rvoice HH:MM:SS]".
# All arguments are joined with spaces to form the message.
log() {
  printf '[rvoice %s] %s\n' "$(date +%H:%M:%S)" "$*" \
    >> "$LOG_FILE"
}
|
||||
|
||||
# Print the current Unix time in whole milliseconds.
# Delegates to python3 to get sub-second resolution.
now_ms() {
  python3 -c 'import time; print(int(time.time() * 1000))'
}
|
||||
|
||||
# Resolve the (host, tmux-session) pair to inject into and print it as
# "<host>\t<session>".
#
# Each field is resolved independently, first match wins:
#   1. RVOICE_HOST / RVOICE_SESSION env overrides. Either one may be set on
#      its own and still takes precedence over iTerm2 detection.
#   2. The active iTerm2 session title, which our canonical tmux config sets
#      to "<host> · <session>".
#   3. Host defaults to "apricot.lan"; session defaults to the most recently
#      created claude-* tmux session on that host (queried over ssh).
#
# Returns 1 (after logging) when no session can be determined.
resolve_target() {
  _host=${RVOICE_HOST:-}
  _sess=${RVOICE_SESSION:-}

  # Only ask iTerm2 when at least one field is still unknown.
  if [ -z "$_host" ] || [ -z "$_sess" ]; then
    _title=$(osascript -e 'tell application "iTerm2" to tell current session of current window to return name' 2>/dev/null || true)
    # Title format from session-tools/tmux.conf: "<host> · <session>"
    _title_host=$(printf '%s' "$_title" | awk -F' · ' 'NF>=2 {print $1}')
    _title_sess=$(printf '%s' "$_title" | awk -F' · ' 'NF>=2 {print $2}')
    # Trust the title only when both halves parsed; a partial match means the
    # tab is not one of ours.
    if [ -n "$_title_host" ] && [ -n "$_title_sess" ]; then
      _host=${_host:-$_title_host}
      _sess=${_sess:-$_title_sess}
    fi
  fi

  # Fallback: pick the most recently created claude-* session on apricot.
  _host=${_host:-apricot.lan}
  if [ -z "$_sess" ]; then
    # The remote pipeline sorts sessions by creation time and keeps the name
    # of the last one containing "claude-". Any ssh failure yields an empty
    # session instead of aborting under `set -e`.
    _sess=$(ssh -o BatchMode=yes -o ConnectTimeout=3 "$_host" \
      'tmux ls -F "#{session_created} #{session_name}" 2>/dev/null \
        | sort -n | awk "/claude-/{n=\$2} END{print n}"' 2>/dev/null || true)
  fi

  if [ -z "$_sess" ]; then
    log "no target session resolvable"
    return 1
  fi
  printf '%s\t%s\n' "$_host" "$_sess"
}
|
||||
|
||||
# Best-effort macOS notification plus an optional audible cue.
#   $1  message text. It is handed to AppleScript as a run-handler argument
#       rather than spliced into the script source, so quotes or backslashes
#       in a transcript cannot break (or inject into) the script. Messages
#       are expected not to begin with "-".
#   $2  optional cue: "ok", "err" or "go"; anything else plays no sound.
# Always returns 0: notification and playback failures are ignored.
notify() {
  osascript \
    -e 'on run argv' \
    -e 'display notification (item 1 of argv) with title "rvoice"' \
    -e 'end run' \
    "$1" 2>/dev/null || true

  case ${2:-} in
    ok)  _cue=Pop ;;
    err) _cue=Funk ;;
    go)  _cue=Tink ;;
    *)   _cue= ;;
  esac
  if [ -n "$_cue" ]; then
    # System sounds ship as .aiff files; play in the background so the
    # caller is not delayed by the length of the sound.
    afplay "/System/Library/Sounds/$_cue.aiff" 2>/dev/null &
  fi
  return 0
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Commands
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Begin a new recording: reap any recorder left over from a missed key
# release, stamp the start time, launch ffmpeg in the background and record
# its pid in $PID_FILE.
cmd_start() {
  # A still-running recorder means the previous key-up was never seen.
  if [ -f "$PID_FILE" ]; then
    _stale_pid=$(cat "$PID_FILE" 2>/dev/null || true)
    if kill -0 "$_stale_pid" 2>/dev/null; then
      kill "$_stale_pid" 2>/dev/null || true
      rm -f "$PID_FILE"
    fi
  fi

  rm -f "$WAV_FILE"
  now_ms > "$START_FILE"

  # 16kHz mono PCM, capped at MAX_S. Device "0" is the default macOS input;
  # change with AVFoundation list if you have multiple mics.
  nohup ffmpeg -hide_banner -loglevel error -nostdin \
    -f avfoundation -i ":0" \
    -ac 1 -ar 16000 -t "$MAX_S" \
    -y "$WAV_FILE" >/dev/null 2>>"$LOG_FILE" &
  _rec_pid=$!
  echo "$_rec_pid" > "$PID_FILE"

  notify "listening…" go
  log "start pid=$_rec_pid"
}
|
||||
|
||||
# Quote $1 as a single POSIX-shell word: wrap it in single quotes and rewrite
# every embedded ' as '\'' so a remote shell sees the text verbatim.
_rvoice_shquote() {
  printf "'%s'" "$(printf '%s' "$1" | sed "s/'/'\\\\''/g")"
}

# Stop the current recording, transcribe it via Groq Whisper and type the
# transcript into the target tmux session (directly, or over ssh).
#
# Returns 0 when nothing was recording, when the hold was shorter than
# MIN_MS, or after a successful injection. Returns 1, after a notification
# and a log line, when the recording is empty, GROQ_API_KEY is unset, the
# transcript is empty, no target resolves, or the injection itself fails.
cmd_stop() {
  [ -f "$PID_FILE" ] || { log "stop: no recording in progress"; return 0; }
  _pid=$(cat "$PID_FILE")
  _start=$(cat "$START_FILE" 2>/dev/null || echo 0)
  # ${_start:-0} guards against an empty start file breaking the arithmetic.
  _dur_ms=$(( $(now_ms) - ${_start:-0} ))

  # `q` on stdin is ffmpeg's clean-stop signal but with -nostdin we use
  # SIGINT — ffmpeg flushes the wav header on SIGINT.
  kill -INT "$_pid" 2>/dev/null || true
  # Wait briefly (up to ~3s) for ffmpeg to finalize the file.
  _i=0
  while kill -0 "$_pid" 2>/dev/null && [ "$_i" -lt 30 ]; do
    sleep 0.1
    _i=$((_i + 1))
  done
  rm -f "$PID_FILE" "$START_FILE"

  if [ "$_dur_ms" -lt "$MIN_MS" ]; then
    log "stop: too short (${_dur_ms}ms < ${MIN_MS}ms), discarding"
    rm -f "$WAV_FILE"
    return 0
  fi
  [ -s "$WAV_FILE" ] || { log "stop: empty recording"; notify "empty recording" err; return 1; }

  if [ -z "${GROQ_API_KEY:-}" ]; then
    notify "GROQ_API_KEY not set" err
    log "GROQ_API_KEY missing"
    return 1
  fi

  log "transcribing ${_dur_ms}ms via $MODEL"
  # `|| _txt=` keeps a jq parse failure (non-JSON body) from aborting the
  # script under `set -e`; it is reported as an empty transcription instead.
  _txt=$(curl -sS --fail-with-body \
    -H "Authorization: Bearer $GROQ_API_KEY" \
    -F "file=@$WAV_FILE" \
    -F "model=$MODEL" \
    -F "response_format=json" \
    https://api.groq.com/openai/v1/audio/transcriptions \
    | jq -r '.text // empty') || _txt=
  rm -f "$WAV_FILE"
  if [ -z "$_txt" ]; then
    notify "transcription empty" err
    log "empty transcription"
    return 1
  fi
  log "text: $_txt"

  _target=$(resolve_target) || { notify "no target session" err; return 1; }
  _host=$(printf '%s' "$_target" | cut -f1)
  _sess=$(printf '%s' "$_target" | cut -f2)
  log "inject → $_host/$_sess"

  # Use `tmux send-keys -l` to send the text literally (no key-name interp);
  # `--` stops a transcript that starts with "-" being parsed as flags.
  # Then optional Enter if autosend.
  _inject_ok=1
  if is_local_host "$_host"; then
    tmux send-keys -t "$_sess" -l -- "$_txt" 2>>"$LOG_FILE" || _inject_ok=0
    if [ "$_inject_ok" = 1 ] && [ "$AUTOSEND" = "1" ]; then
      tmux send-keys -t "$_sess" Enter 2>>"$LOG_FILE" || _inject_ok=0
    fi
  else
    # Session name and transcript are both shell-quoted for the remote side.
    # Text and the optional Enter share one ssh round trip.
    _q_sess=$(_rvoice_shquote "$_sess")
    _remote="tmux send-keys -t $_q_sess -l -- $(_rvoice_shquote "$_txt")"
    if [ "$AUTOSEND" = "1" ]; then
      _remote="$_remote && tmux send-keys -t $_q_sess Enter"
    fi
    ssh -o BatchMode=yes -o ConnectTimeout=5 "$_host" "$_remote" \
      2>>"$LOG_FILE" || _inject_ok=0
  fi

  if [ "$_inject_ok" != 1 ]; then
    log "inject failed → $_host/$_sess"
    notify "inject failed" err
    return 1
  fi
  notify "✓ $_txt" ok
}
|
||||
|
||||
# Abort the current recording without transcribing it: interrupt the
# recorder named in $PID_FILE and delete all per-recording state.
# A no-op (returns 0) when no pid file exists.
cmd_cancel() {
  if [ ! -f "$PID_FILE" ]; then
    return 0
  fi
  # The recorder may already have exited; a failed kill is not an error.
  kill -INT "$(cat "$PID_FILE")" 2>/dev/null || true
  rm -f "$PID_FILE" "$START_FILE" "$WAV_FILE"
  log "cancel"
}
|
||||
|
||||
# Debug command: print the resolved "<host>\t<session>" pair, with the same
# exit status as resolve_target.
cmd_target() {
  resolve_target
}
|
||||
|
||||
# Succeed (return 0) when $1 names this machine: one of the well-known
# loopback spellings, or the output of `hostname` / `hostname -s`.
# Returns 1 otherwise.
is_local_host() {
  for _self in local localhost 127.0.0.1 ::1 \
               "$(hostname)" "$(hostname -s 2>/dev/null)"; do
    if [ "$1" = "$_self" ]; then
      return 0
    fi
  done
  return 1
}
|
||||
|
||||
# Print the usage text on stderr.
usage() {
  cat <<EOF >&2
usage: rvoice {start|stop|cancel|target|log}

This script is meant to be driven by a PTT key binding (Hammerspoon).
See ~/Code/@scripts/session-tools/hammerspoon/rvoice.lua for the
companion config.
EOF
}

# Dispatch on the first argument; anything unrecognised (including no
# argument at all) prints the usage text and exits with status 2.
case ${1:-} in
  start)
    cmd_start
    ;;
  stop)
    cmd_stop
    ;;
  cancel)
    cmd_cancel
    ;;
  target)
    cmd_target
    ;;
  log)
    tail -50 "$LOG_FILE" 2>/dev/null
    ;;
  *)
    usage
    exit 2
    ;;
esac
|
||||
71
hammerspoon/rvoice.lua
Normal file
71
hammerspoon/rvoice.lua
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
-- rvoice.lua — Right-Option push-to-talk for the rvoice helper.
|
||||
--
|
||||
-- Install:
|
||||
-- 1. Hammerspoon → Preferences → enable "Launch Hammerspoon at login"
|
||||
-- 2. Add this line to ~/.hammerspoon/init.lua:
|
||||
-- require("rvoice")
|
||||
-- 3. Symlink this file so init.lua can find it:
|
||||
-- ln -sfn ~/Code/@scripts/session-tools/hammerspoon/rvoice.lua \
|
||||
-- ~/.hammerspoon/rvoice.lua
|
||||
-- 4. Reload Hammerspoon config (menu bar → Reload Config)
|
||||
-- 5. Grant Accessibility + Microphone permissions when prompted.
|
||||
--
|
||||
-- Behavior: hold Right-Option to talk. Release to transcribe + inject into
|
||||
-- the active iTerm2 tab's remote tmux session. Taps shorter than 200ms are
|
||||
-- ignored (configurable via RVOICE_MIN_MS env in rvoice config).
|
||||
|
||||
local M = {}

-- Locate the `rvoice` executable once, at load time. Hammerspoon's task PATH
-- is barebones, so an explicit symlink in ~/.local/bin is tried first, then
-- the checkout in the repo; the bare name "rvoice" is the last resort.
local function resolveRvoice()
  local home = os.getenv("HOME")
  local searchPaths = {
    home .. "/.local/bin/rvoice",
    home .. "/Code/@scripts/session-tools/bin/rvoice",
  }
  for i = 1, #searchPaths do
    local path = searchPaths[i]
    -- A path counts as found when it can be opened for reading.
    local handle = io.open(path, "r")
    if handle then
      handle:close()
      return path
    end
  end
  return "rvoice"
end

-- Command used to invoke the helper.
local RVOICE = resolveRvoice()
-- True while the push-to-talk key is held down.
local holding = false
|
||||
|
||||
-- Parse the output of `env` into a { NAME = value } table. Lines that do not
-- look like NAME=value are treated as the continuation of the previous
-- variable (multi-line values); anything before the first variable, such as
-- a shell banner, is dropped.
local function parseUserEnv(output)
  local env, lastKey = {}, nil
  for line in (output or ""):gmatch("[^\n]+") do
    local key, value = line:match("^([%a_][%w_]*)=(.*)$")
    if key then
      env[key] = value
      lastKey = key
    elseif lastKey then
      env[lastKey] = env[lastKey] .. "\n" .. line
    end
  end
  return env
end

-- Cached user-shell environment: nil = not loaded yet, false = unavailable.
local cachedUserEnv = nil

-- Return the user's login-shell environment as a table, or nil when it could
-- not be captured. The shell is spawned once (on first use) and the result
-- is reused, so later key presses pay no start-up cost.
local function userEnv()
  if cachedUserEnv == nil then
    local env = parseUserEnv(hs.execute("env", true))
    if next(env) ~= nil then
      cachedUserEnv = env
    else
      cachedUserEnv = false
    end
  end
  return cachedUserEnv or nil
end

-- Run `rvoice <cmd>` in the background. A non-zero exit is reported, with
-- the captured stderr, to Hammerspoon's console.
local function run(cmd)
  -- `exec "$0" "$@"` hands the path and sub-command over as separate words,
  -- so an install path containing spaces or quotes needs no escaping, while
  -- /bin/sh still does the PATH lookup for the bare "rvoice" fallback.
  local t = hs.task.new("/bin/sh", function(exit, _, err)
    if exit ~= 0 then
      hs.printf("[rvoice] %s exited %d: %s", cmd, exit, err or "")
    end
  end, { "-c", 'exec "$0" "$@"', RVOICE, cmd })
  -- Inherit user shell env so GROQ_API_KEY (and PATH for ffmpeg/jq) work.
  -- setEnvironment needs a table; if the capture failed, leave the task
  -- with Hammerspoon's own environment rather than an empty one.
  local env = userEnv()
  if env then
    t:setEnvironment(env)
  end
  t:start()
end
|
||||
|
||||
-- Right-Option push-to-talk. Modifier keys do not produce keyDown/keyUp;
-- Hammerspoon reports them as flagsChanged events whose key code names the
-- physical key that changed (61 = Right Option).
local RIGHT_OPTION_KEYCODE = 61

-- The generic `alt` flag is set while EITHER Option key is down, so it cannot
-- tell a Right-Option release apart from "Left Option still held". The raw
-- event flags carry a per-side bit for Right Option (0x40).
local rawMasks = hs.eventtap.event.rawFlagMasks
local RIGHT_OPTION_MASK = (rawMasks and rawMasks.deviceRightAlternate) or 0x40

M.tap = hs.eventtap.new({ hs.eventtap.event.types.flagsChanged }, function(e)
  if e:getKeyCode() ~= RIGHT_OPTION_KEYCODE then return false end

  local pressed
  local raw = e.rawFlags and e:rawFlags()
  if raw then
    -- Bit test written arithmetically so it does not depend on the `&`
    -- operator being available.
    pressed = math.floor(raw / RIGHT_OPTION_MASK) % 2 == 1
  else
    -- No raw flags on this build: fall back to the side-agnostic flag.
    pressed = e:getFlags().alt or false
  end

  if pressed and not holding then
    holding = true
    run("start")
  elseif (not pressed) and holding then
    holding = false
    run("stop")
  end
  return false -- don't swallow the modifier; other apps may use it
end)

M.tap:start()
hs.alert.show("rvoice: Right ⌥ to talk")

return M
|
||||
Loading…
Add table
Reference in a new issue