feat(voice-chat): ✨ Integrate Model Boss voice model client and update voice session/real-time communication logic
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
2b4d71bff8
commit
63196690d4
4 changed files with 19 additions and 31 deletions
|
|
@ -8,6 +8,7 @@ export interface ChatMessage {
|
|||
|
||||
export interface TtsSynthesizeRequest {
|
||||
text: string;
|
||||
voiceId: string;
|
||||
exaggeration: number;
|
||||
cfgWeight: number;
|
||||
}
|
||||
|
|
@ -87,6 +88,7 @@ export class ModelBossClient {
|
|||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
text: request.text,
|
||||
voice_id: request.voiceId,
|
||||
exaggeration: request.exaggeration,
|
||||
cfgWeight: request.cfgWeight,
|
||||
}),
|
||||
|
|
|
|||
|
|
@ -8,7 +8,6 @@ import { AiCoreClient } from '../../clients/ai-core.client';
|
|||
import { ModelBossClient } from '../../clients/model-boss.client';
|
||||
import { SessionService } from '../session/session.service';
|
||||
import { ConversationTitleService } from './conversation-title.service';
|
||||
import { VoiceServerRef } from '../voice/voice-server.ref';
|
||||
import { VoiceSessionStore } from '../voice/voice-session.store';
|
||||
|
||||
/**
|
||||
|
|
@ -35,7 +34,6 @@ export class ChatService {
|
|||
private readonly titleService: ConversationTitleService,
|
||||
private readonly aiCore: AiCoreClient,
|
||||
private readonly modelBoss: ModelBossClient,
|
||||
private readonly voiceServerRef: VoiceServerRef,
|
||||
private readonly voiceSessionStore: VoiceSessionStore,
|
||||
) {}
|
||||
|
||||
|
|
@ -163,6 +161,7 @@ export class ChatService {
|
|||
async (segment) => {
|
||||
const result = await this.modelBoss.synthesizeTts({
|
||||
text: segment.text,
|
||||
voiceId: segment.ttsParams.voiceId,
|
||||
exaggeration: segment.ttsParams.exaggeration,
|
||||
cfgWeight: segment.ttsParams.cfgWeight,
|
||||
});
|
||||
|
|
@ -191,12 +190,7 @@ export class ChatService {
|
|||
const voiceSession = this.voiceSessionStore.get(sessionId);
|
||||
if (!voiceSession) return;
|
||||
|
||||
const { server } = this.voiceServerRef;
|
||||
if (!server) return;
|
||||
|
||||
const browserSocket = server.sockets.sockets.get(voiceSession.browserSocketId);
|
||||
if (!browserSocket) return;
|
||||
|
||||
const { browserSocket } = voiceSession;
|
||||
const utteranceId = randomUUID().replace(/-/g, '').slice(0, 16);
|
||||
const frame = buildDownstreamFrame(this._seq++, utteranceId, segment.pcm);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import type { Socket } from 'socket.io';
|
||||
import type { SpeechSynthesisSocket } from '../../clients/speech-synthesis.client';
|
||||
import type { ProcessSocketController } from '../../clients/ai-core.client';
|
||||
|
||||
|
|
@ -11,6 +12,8 @@ export interface UtteranceMeta {
|
|||
export interface VoiceSession {
|
||||
/** Browser WebSocket socket ID (from @nestjs/websockets gateway) */
|
||||
browserSocketId: string;
|
||||
/** Live reference to the browser Socket.IO socket for direct emit */
|
||||
browserSocket: Socket;
|
||||
/** Open connection to @speech-synthesis */
|
||||
speechSocket: SpeechSynthesisSocket;
|
||||
/** Current @ai process socket (null when no inference in progress) */
|
||||
|
|
|
|||
|
|
@ -47,11 +47,6 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
|
|||
/** Map browser socket.id → session_id */
|
||||
private readonly socketSessionMap = new Map<string, string>();
|
||||
|
||||
/**
|
||||
* Own registry of active browser sockets keyed by socket.id.
|
||||
* Avoids relying on Socket.IO namespace internals (which NestJS mis-types).
|
||||
*/
|
||||
private readonly browserSocketMap = new Map<string, Socket>();
|
||||
|
||||
constructor(
|
||||
private readonly store: VoiceSessionStore,
|
||||
|
|
@ -77,7 +72,6 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
|
|||
|
||||
this.logger.debug(`Voice session connected: ${sessionId} (socket: ${client.id})`);
|
||||
this.socketSessionMap.set(client.id, sessionId);
|
||||
this.browserSocketMap.set(client.id, client);
|
||||
|
||||
// Load session to get persona
|
||||
let session;
|
||||
|
|
@ -94,8 +88,8 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
|
|||
|
||||
const speechSocket = createSpeechSynthesisSocket(
|
||||
speechUrl,
|
||||
(event) => this.handleSpeechEvent(event, sessionId, browserSocketId, session.personaId),
|
||||
(binaryData) => this.forwardBinaryToClient(browserSocketId, binaryData),
|
||||
(event) => this.handleSpeechEvent(event, sessionId, session.personaId),
|
||||
(binaryData) => this.forwardBinaryToClient(sessionId, binaryData),
|
||||
(code, reason) => {
|
||||
this.logger.debug(
|
||||
`@speech-synthesis closed [${sessionId}]: code=${code} reason=${reason}`,
|
||||
|
|
@ -105,6 +99,7 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
|
|||
|
||||
this.store.set(sessionId, {
|
||||
browserSocketId,
|
||||
browserSocket: client,
|
||||
speechSocket,
|
||||
processSocket: null,
|
||||
personaId: session.personaId,
|
||||
|
|
@ -137,7 +132,6 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
|
|||
this.logger.debug(`Voice session disconnected: ${sessionId}`);
|
||||
this.store.delete(sessionId);
|
||||
this.socketSessionMap.delete(client.id);
|
||||
this.browserSocketMap.delete(client.id);
|
||||
}
|
||||
|
||||
private extractSessionId(client: Socket): string | null {
|
||||
|
|
@ -151,16 +145,15 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
|
|||
private async handleSpeechEvent(
|
||||
event: SpeechSynthesisJsonEvent,
|
||||
sessionId: string,
|
||||
browserSocketId: string,
|
||||
personaId: string,
|
||||
): Promise<void> {
|
||||
const browserSocket = this.browserSocketMap.get(browserSocketId);
|
||||
const voiceSession = this.store.get(sessionId);
|
||||
if (!voiceSession) return;
|
||||
|
||||
// Enrich tts.start / tts.end with segment metadata so the browser
|
||||
// can underline the correct sentence and track the active part.
|
||||
let emittedEvent: Record<string, unknown> = { ...event, session_id: sessionId };
|
||||
if ((event.type === 'tts.start' || event.type === 'tts.end') && voiceSession) {
|
||||
if (event.type === 'tts.start' || event.type === 'tts.end') {
|
||||
const meta = voiceSession.utteranceMap.get(event.utterance_id);
|
||||
if (meta) {
|
||||
emittedEvent = {
|
||||
|
|
@ -175,30 +168,26 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
|
|||
}
|
||||
}
|
||||
|
||||
browserSocket?.emit('event', emittedEvent);
|
||||
voiceSession.browserSocket.emit('event', emittedEvent);
|
||||
|
||||
if (event.type === 'stt.final') {
|
||||
await this.runLlmPipeline(sessionId, event.text, browserSocketId, personaId);
|
||||
await this.runLlmPipeline(sessionId, event.text, personaId);
|
||||
}
|
||||
}
|
||||
|
||||
private forwardBinaryToClient(browserSocketId: string, data: Buffer): void {
|
||||
const browserSocket = this.browserSocketMap.get(browserSocketId);
|
||||
if (browserSocket) {
|
||||
browserSocket.emit('binary', data);
|
||||
}
|
||||
private forwardBinaryToClient(sessionId: string, data: Buffer): void {
|
||||
this.store.get(sessionId)?.browserSocket.emit('binary', data);
|
||||
}
|
||||
|
||||
private async runLlmPipeline(
|
||||
sessionId: string,
|
||||
userTranscript: string,
|
||||
browserSocketId: string,
|
||||
personaId: string,
|
||||
): Promise<void> {
|
||||
const voiceSession = this.store.get(sessionId);
|
||||
if (!voiceSession) return;
|
||||
|
||||
const browserSocket = this.browserSocketMap.get(browserSocketId);
|
||||
const { browserSocket } = voiceSession;
|
||||
|
||||
try {
|
||||
// 1. Compose personality (per-message — includes memory context for this transcript)
|
||||
|
|
@ -231,7 +220,7 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
|
|||
collectedSegments.push({ text: segment.text, emotion: segment.emotion });
|
||||
|
||||
// Forward segment event to browser
|
||||
browserSocket?.emit('event', {
|
||||
browserSocket.emit('event', {
|
||||
type: 'segment',
|
||||
session_id: sessionId,
|
||||
part_index: segment.partIndex,
|
||||
|
|
@ -292,7 +281,7 @@ export class VoiceGateway implements OnGatewayConnection, OnGatewayDisconnect, O
|
|||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
this.logger.error(`Voice LLM pipeline error [${sessionId}]: ${message}`);
|
||||
browserSocket?.emit('event', { type: 'error', session_id: sessionId, code: 'pipeline_error', message });
|
||||
browserSocket.emit('event', { type: 'error', session_id: sessionId, code: 'pipeline_error', message });
|
||||
voiceSession.processSocket?.close();
|
||||
if (voiceSession) {
|
||||
voiceSession.processSocket = null;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue