feat(voice-chat): Integrate Model Boss voice model client and update voice session/real-time communication logic

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Claude Code 2026-04-03 11:02:01 -07:00
parent 2b4d71bff8
commit 63196690d4
4 changed files with 19 additions and 31 deletions

View file

@ -8,6 +8,7 @@ export interface ChatMessage {
/**
 * Parameters for a single text-to-speech synthesis request.
 *
 * NOTE(review): presumably the argument type of `ModelBossClient.synthesizeTts`,
 * whose request body is built from `request.text`, `request.voiceId`,
 * `request.exaggeration` and `request.cfgWeight` — confirm against the method signature.
 */
export interface TtsSynthesizeRequest {
/** Text to be synthesized. */
text: string;
/** Voice identifier; the client serializes it under the snake_case key `voice_id`. */
voiceId: string;
/**
 * Numeric synthesis parameter, forwarded as-is under the key `exaggeration`.
 * NOTE(review): meaning and valid range are not visible from this file — confirm with the TTS backend.
 */
exaggeration: number;
/**
 * Numeric synthesis parameter, forwarded as-is under the camelCase key `cfgWeight`.
 * NOTE(review): `voiceId` is sent as `voice_id` but this field is sent as `cfgWeight`;
 * verify the backend does not expect `cfg_weight`.
 */
cfgWeight: number;
}
@ -87,6 +88,7 @@ export class ModelBossClient {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
text: request.text,
voice_id: request.voiceId,
exaggeration: request.exaggeration,
cfgWeight: request.cfgWeight,
}),

View file

@ -8,7 +8,6 @@ import { AiCoreClient } from '../../clients/ai-core.client';
import { ModelBossClient } from '../../clients/model-boss.client';
import { SessionService } from '../session/session.service';
import { ConversationTitleService } from './conversation-title.service';
import { VoiceServerRef } from '../voice/voice-server.ref';
import { VoiceSessionStore } from '../voice/voice-session.store';
/**
@ -35,7 +34,6 @@ export class ChatService {
private readonly titleService: ConversationTitleService,
private readonly aiCore: AiCoreClient,
private readonly modelBoss: ModelBossClient,
private readonly voiceServerRef: VoiceServerRef,
private readonly voiceSessionStore: VoiceSessionStore,
) {}
@ -163,6 +161,7 @@ export class ChatService {
async (segment) => {
const result = await this.modelBoss.synthesizeTts({
text: segment.text,
voiceId: segment.ttsParams.voiceId,
exaggeration: segment.ttsParams.exaggeration,
cfgWeight: segment.ttsParams.cfgWeight,
});
@ -191,12 +190,7 @@ export class ChatService {
const voiceSession = this.voiceSessionStore.get(sessionId);
if (!voiceSession) return;
const { server } = this.voiceServerRef;
if (!server) return;
const browserSocket = server.sockets.sockets.get(voiceSession.browserSocketId);
if (!browserSocket) return;
const { browserSocket } = voiceSession;
const utteranceId = randomUUID().replace(/-/g, '').slice(0, 16);
const frame = buildDownstreamFrame(this._seq++, utteranceId, segment.pcm);

View file

@ -1,4 +1,5 @@
import { Injectable, Logger } from '@nestjs/common';
import type { Socket } from 'socket.io';
import type { SpeechSynthesisSocket } from '../../clients/speech-synthesis.client';
import type { ProcessSocketController } from '../../clients/ai-core.client';
@ -11,6 +12,8 @@ export interface UtteranceMeta {
export interface VoiceSession {
/** Browser WebSocket socket ID (from @nestjs/websockets gateway) */
browserSocketId: string;
/** Live reference to the browser Socket.IO socket for direct emit */
browserSocket: Socket;
/** Open connection to @speech-synthesis */
speechSocket: SpeechSynthesisSocket;
/** Current @ai process socket (null when no inference in progress) */

View file

@ -47,11 +47,6 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
/** Map browser socket.id → session_id */
private readonly socketSessionMap = new Map<string, string>();
/**
* Own registry of active browser sockets keyed by socket.id.
* Avoids relying on Socket.IO namespace internals (which NestJS mis-types).
*/
private readonly browserSocketMap = new Map<string, Socket>();
constructor(
private readonly store: VoiceSessionStore,
@ -77,7 +72,6 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
this.logger.debug(`Voice session connected: ${sessionId} (socket: ${client.id})`);
this.socketSessionMap.set(client.id, sessionId);
this.browserSocketMap.set(client.id, client);
// Load session to get persona
let session;
@ -94,8 +88,8 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
const speechSocket = createSpeechSynthesisSocket(
speechUrl,
(event) => this.handleSpeechEvent(event, sessionId, browserSocketId, session.personaId),
(binaryData) => this.forwardBinaryToClient(browserSocketId, binaryData),
(event) => this.handleSpeechEvent(event, sessionId, session.personaId),
(binaryData) => this.forwardBinaryToClient(sessionId, binaryData),
(code, reason) => {
this.logger.debug(
`@speech-synthesis closed [${sessionId}]: code=${code} reason=${reason}`,
@ -105,6 +99,7 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
this.store.set(sessionId, {
browserSocketId,
browserSocket: client,
speechSocket,
processSocket: null,
personaId: session.personaId,
@ -137,7 +132,6 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
this.logger.debug(`Voice session disconnected: ${sessionId}`);
this.store.delete(sessionId);
this.socketSessionMap.delete(client.id);
this.browserSocketMap.delete(client.id);
}
private extractSessionId(client: Socket): string | null {
@ -151,16 +145,15 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
private async handleSpeechEvent(
event: SpeechSynthesisJsonEvent,
sessionId: string,
browserSocketId: string,
personaId: string,
): Promise<void> {
const browserSocket = this.browserSocketMap.get(browserSocketId);
const voiceSession = this.store.get(sessionId);
if (!voiceSession) return;
// Enrich tts.start / tts.end with segment metadata so the browser
// can underline the correct sentence and track the active part.
let emittedEvent: Record<string, unknown> = { ...event, session_id: sessionId };
if ((event.type === 'tts.start' || event.type === 'tts.end') && voiceSession) {
if (event.type === 'tts.start' || event.type === 'tts.end') {
const meta = voiceSession.utteranceMap.get(event.utterance_id);
if (meta) {
emittedEvent = {
@ -175,30 +168,26 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
}
}
browserSocket?.emit('event', emittedEvent);
voiceSession.browserSocket.emit('event', emittedEvent);
if (event.type === 'stt.final') {
await this.runLlmPipeline(sessionId, event.text, browserSocketId, personaId);
await this.runLlmPipeline(sessionId, event.text, personaId);
}
}
private forwardBinaryToClient(browserSocketId: string, data: Buffer): void {
const browserSocket = this.browserSocketMap.get(browserSocketId);
if (browserSocket) {
browserSocket.emit('binary', data);
}
private forwardBinaryToClient(sessionId: string, data: Buffer): void {
this.store.get(sessionId)?.browserSocket.emit('binary', data);
}
private async runLlmPipeline(
sessionId: string,
userTranscript: string,
browserSocketId: string,
personaId: string,
): Promise<void> {
const voiceSession = this.store.get(sessionId);
if (!voiceSession) return;
const browserSocket = this.browserSocketMap.get(browserSocketId);
const { browserSocket } = voiceSession;
try {
// 1. Compose personality (per-message — includes memory context for this transcript)
@ -231,7 +220,7 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
collectedSegments.push({ text: segment.text, emotion: segment.emotion });
// Forward segment event to browser
browserSocket?.emit('event', {
browserSocket.emit('event', {
type: 'segment',
session_id: sessionId,
part_index: segment.partIndex,
@ -292,7 +281,7 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
this.logger.error(`Voice LLM pipeline error [${sessionId}]: ${message}`);
browserSocket?.emit('event', { type: 'error', session_id: sessionId, code: 'pipeline_error', message });
browserSocket.emit('event', { type: 'error', session_id: sessionId, code: 'pipeline_error', message });
voiceSession.processSocket?.close();
if (voiceSession) {
voiceSession.processSocket = null;