feat(voice-chat): ✨ Pass voiceId to Model Boss TTS requests and keep the live browser socket on the voice session (replaces VoiceServerRef / browserSocketMap lookups)

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-03 11:02:01 -07:00 · 2026-04-03 11:02:01 -07:00 · 63196690d4
commit 63196690d4
parent 2b4d71bff8
4 changed files with 19 additions and 31 deletions
--- a/@applications/api/src/clients/model-boss.client.ts
+++ b/@applications/api/src/clients/model-boss.client.ts
@ -8,6 +8,7 @@ export interface ChatMessage {

 export interface TtsSynthesizeRequest {
  text: string;
+  voiceId: string;
  exaggeration: number;
  cfgWeight: number;
 }
@ -87,6 +88,7 @@ export class ModelBossClient {
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        text: request.text,
+        voice_id: request.voiceId,
        exaggeration: request.exaggeration,
        cfgWeight: request.cfgWeight,
      }),
--- a/@applications/api/src/modules/chat/chat.service.ts
+++ b/@applications/api/src/modules/chat/chat.service.ts
@ -8,7 +8,6 @@ import { AiCoreClient } from '../../clients/ai-core.client';
 import { ModelBossClient } from '../../clients/model-boss.client';
 import { SessionService } from '../session/session.service';
 import { ConversationTitleService } from './conversation-title.service';
-import { VoiceServerRef } from '../voice/voice-server.ref';
 import { VoiceSessionStore } from '../voice/voice-session.store';

 /**
@ -35,7 +34,6 @@ export class ChatService {
    private readonly titleService: ConversationTitleService,
    private readonly aiCore: AiCoreClient,
    private readonly modelBoss: ModelBossClient,
-    private readonly voiceServerRef: VoiceServerRef,
    private readonly voiceSessionStore: VoiceSessionStore,
  ) {}

@ -163,6 +161,7 @@ export class ChatService {
      async (segment) => {
        const result = await this.modelBoss.synthesizeTts({
          text: segment.text,
+          voiceId: segment.ttsParams.voiceId,
          exaggeration: segment.ttsParams.exaggeration,
          cfgWeight: segment.ttsParams.cfgWeight,
        });
@ -191,12 +190,7 @@ export class ChatService {
    const voiceSession = this.voiceSessionStore.get(sessionId);
    if (!voiceSession) return;

-    const { server } = this.voiceServerRef;
-    if (!server) return;
-
-    const browserSocket = server.sockets.sockets.get(voiceSession.browserSocketId);
-    if (!browserSocket) return;
-
+    const { browserSocket } = voiceSession;
    const utteranceId = randomUUID().replace(/-/g, '').slice(0, 16);
    const frame = buildDownstreamFrame(this._seq++, utteranceId, segment.pcm);

--- a/@applications/api/src/modules/voice/voice-session.store.ts
+++ b/@applications/api/src/modules/voice/voice-session.store.ts
@ -1,4 +1,5 @@
 import { Injectable, Logger } from '@nestjs/common';
+import type { Socket } from 'socket.io';
 import type { SpeechSynthesisSocket } from '../../clients/speech-synthesis.client';
 import type { ProcessSocketController } from '../../clients/ai-core.client';

@ -11,6 +12,8 @@ export interface UtteranceMeta {
 export interface VoiceSession {
  /** Browser WebSocket socket ID (from @nestjs/websockets gateway) */
  browserSocketId: string;
+  /** Live reference to the browser Socket.IO socket for direct emit */
+  browserSocket: Socket;
  /** Open connection to @speech-synthesis */
  speechSocket: SpeechSynthesisSocket;
  /** Current @ai process socket (null when no inference in progress) */
--- a/@applications/api/src/modules/voice/voice.gateway.ts
+++ b/@applications/api/src/modules/voice/voice.gateway.ts
@ -47,11 +47,6 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
  /** Map browser socket.id → session_id */
  private readonly socketSessionMap = new Map<string, string>();

-  /**
-   * Own registry of active browser sockets keyed by socket.id.
-   * Avoids relying on Socket.IO namespace internals (which NestJS mis-types).
-   */
-  private readonly browserSocketMap = new Map<string, Socket>();

  constructor(
    private readonly store: VoiceSessionStore,
@ -77,7 +72,6 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O

    this.logger.debug(`Voice session connected: ${sessionId} (socket: ${client.id})`);
    this.socketSessionMap.set(client.id, sessionId);
-    this.browserSocketMap.set(client.id, client);

    // Load session to get persona
    let session;
@ -94,8 +88,8 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O

    const speechSocket = createSpeechSynthesisSocket(
      speechUrl,
-      (event) => this.handleSpeechEvent(event, sessionId, browserSocketId, session.personaId),
-      (binaryData) => this.forwardBinaryToClient(browserSocketId, binaryData),
+      (event) => this.handleSpeechEvent(event, sessionId, session.personaId),
+      (binaryData) => this.forwardBinaryToClient(sessionId, binaryData),
      (code, reason) => {
        this.logger.debug(
          `@speech-synthesis closed [${sessionId}]: code=${code} reason=${reason}`,
@ -105,6 +99,7 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O

    this.store.set(sessionId, {
      browserSocketId,
+      browserSocket: client,
      speechSocket,
      processSocket: null,
      personaId: session.personaId,
@ -137,7 +132,6 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
    this.logger.debug(`Voice session disconnected: ${sessionId}`);
    this.store.delete(sessionId);
    this.socketSessionMap.delete(client.id);
-    this.browserSocketMap.delete(client.id);
  }

  private extractSessionId(client: Socket): string | null {
@ -151,16 +145,15 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
  private async handleSpeechEvent(
    event: SpeechSynthesisJsonEvent,
    sessionId: string,
-    browserSocketId: string,
    personaId: string,
  ): Promise<void> {
-    const browserSocket = this.browserSocketMap.get(browserSocketId);
    const voiceSession = this.store.get(sessionId);
+    if (!voiceSession) return;

    // Enrich tts.start / tts.end with segment metadata so the browser
    // can underline the correct sentence and track the active part.
    let emittedEvent: Record<string, unknown> = { ...event, session_id: sessionId };
-    if ((event.type === 'tts.start' || event.type === 'tts.end') && voiceSession) {
+    if (event.type === 'tts.start' || event.type === 'tts.end') {
      const meta = voiceSession.utteranceMap.get(event.utterance_id);
      if (meta) {
        emittedEvent = {
@ -175,30 +168,26 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
      }
    }

-    browserSocket?.emit('event', emittedEvent);
+    voiceSession.browserSocket.emit('event', emittedEvent);

    if (event.type === 'stt.final') {
-      await this.runLlmPipeline(sessionId, event.text, browserSocketId, personaId);
+      await this.runLlmPipeline(sessionId, event.text, personaId);
    }
  }

-  private forwardBinaryToClient(browserSocketId: string, data: Buffer): void {
-    const browserSocket = this.browserSocketMap.get(browserSocketId);
-    if (browserSocket) {
-      browserSocket.emit('binary', data);
-    }
+  private forwardBinaryToClient(sessionId: string, data: Buffer): void {
+    this.store.get(sessionId)?.browserSocket.emit('binary', data);
  }

  private async runLlmPipeline(
    sessionId: string,
    userTranscript: string,
-    browserSocketId: string,
    personaId: string,
  ): Promise<void> {
    const voiceSession = this.store.get(sessionId);
    if (!voiceSession) return;

-    const browserSocket = this.browserSocketMap.get(browserSocketId);
+    const { browserSocket } = voiceSession;

    try {
      // 1. Compose personality (per-message — includes memory context for this transcript)
@ -231,7 +220,7 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
        collectedSegments.push({ text: segment.text, emotion: segment.emotion });

        // Forward segment event to browser
-        browserSocket?.emit('event', {
+        browserSocket.emit('event', {
          type: 'segment',
          session_id: sessionId,
          part_index: segment.partIndex,
@ -292,7 +281,7 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      this.logger.error(`Voice LLM pipeline error [${sessionId}]: ${message}`);
-      browserSocket?.emit('event', { type: 'error', session_id: sessionId, code: 'pipeline_error', message });
+      browserSocket.emit('event', { type: 'error', session_id: sessionId, code: 'pipeline_error', message });
      voiceSession.processSocket?.close();
      if (voiceSession) {
        voiceSession.processSocket = null;