feat(voice): implement real-time voice mode with cloud and local backends (#24174)

This commit is contained in:
Abhijit Balaji
2026-04-24 14:29:38 -07:00
committed by GitHub
parent 048bf6e514
commit 2e0641c83b
40 changed files with 2244 additions and 43 deletions
+1
View File
@@ -56,6 +56,7 @@
"ajv-formats": "^3.0.0",
"chardet": "^2.1.0",
"chokidar": "^5.0.0",
"command-exists": "^1.2.9",
"diff": "^8.0.3",
"dotenv": "^17.2.4",
"dotenv-expand": "^12.0.3",
+7
View File
@@ -691,6 +691,7 @@ export interface ConfigParameters {
ptyInfo?: string;
disableYoloMode?: boolean;
disableAlwaysAllow?: boolean;
voiceMode?: boolean;
rawOutput?: boolean;
acceptRawOutputRisk?: boolean;
dynamicModelConfiguration?: boolean;
@@ -963,6 +964,7 @@ export class Config implements McpContext, AgentLoopContext {
private readonly topicUpdateNarration: boolean;
private readonly disableLLMCorrection: boolean;
private readonly planEnabled: boolean;
private readonly voiceMode: boolean;
private readonly trackerEnabled: boolean;
private readonly planModeRoutingEnabled: boolean;
private readonly modelSteering: boolean;
@@ -1117,6 +1119,7 @@ export class Config implements McpContext, AgentLoopContext {
this.agents = params.agents ?? {};
this.disableLLMCorrection = params.disableLLMCorrection ?? true;
this.planEnabled = params.plan ?? true;
this.voiceMode = params.voiceMode ?? false;
this.trackerEnabled = params.tracker ?? false;
this.planModeRoutingEnabled = params.planSettings?.modelRouting ?? true;
this.enableEventDrivenScheduler = params.enableEventDrivenScheduler ?? true;
@@ -2969,6 +2972,10 @@ export class Config implements McpContext, AgentLoopContext {
return this.planEnabled;
}
/** Whether real-time voice input mode is enabled (from the `voiceMode` config parameter; defaults to false). */
isVoiceModeEnabled(): boolean {
  return this.voiceMode;
}
isTrackerEnabled(): boolean {
return this.trackerEnabled;
}
+9
View File
@@ -297,3 +297,12 @@ export * from './context/profiles.js';
// Export trust utility
export * from './utils/trust.js';
// Export voice utilities
export * from './voice/audioRecorder.js';
export * from './voice/transcriptionProvider.js';
export * from './voice/geminiLiveTranscriptionProvider.js';
export * from './voice/whisperTranscriptionProvider.js';
export * from './voice/transcriptionFactory.js';
export * from './voice/whisperModelManager.js';
export { isBinaryAvailable } from './utils/binaryCheck.js';
+14
View File
@@ -0,0 +1,14 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { sync as commandExistsSync } from 'command-exists';
/**
 * Checks if a binary is available in the system PATH.
 *
 * @param binaryName Name of the executable to look for (e.g. `sox`).
 * @returns `true` when the binary can be resolved on PATH, `false` otherwise.
 */
export function isBinaryAvailable(binaryName: string): boolean {
  const found = commandExistsSync(binaryName);
  return found;
}
+115
View File
@@ -0,0 +1,115 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import { EventEmitter } from 'node:events';
import commandExists from 'command-exists';
export interface AudioRecorderEvents {
data: [Buffer];
start: [];
stop: [];
error: [Error];
}
/**
 * Captures audio from the microphone using `sox` (`rec`).
 * Emits 16kHz, 16-bit, mono PCM chunks.
 *
 * Events: 'start' once capture begins, 'data' with raw PCM Buffers,
 * 'error' on process failure, 'stop' when the session ends.
 */
export class AudioRecorder extends EventEmitter<AudioRecorderEvents> {
  // Handle to the spawned `rec` process; null whenever not recording.
  private recProcess: ChildProcessWithoutNullStreams | null = null;
  // Set synchronously at the top of start()/stop() so overlapping calls
  // (including stop() during the async part of start()) short-circuit.
  private isRecordingInternal = false;

  /** True while a recording session is active (including async startup). */
  get isRecording(): boolean {
    return this.isRecordingInternal;
  }

  /**
   * Checks if `rec` (sox) is available on the system.
   */
  static async isAvailable(): Promise<boolean> {
    try {
      await commandExists('rec');
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Starts capturing microphone audio.
   *
   * Emits 'start' once the capture process is spawned and wired up.
   * @throws Error if `rec` (SoX) is not installed; the recording flag is
   *   reset on any startup failure so start() can be retried.
   */
  async start(): Promise<void> {
    if (this.isRecordingInternal) return;
    // Flag is raised before the async availability check so that a stop()
    // issued in the meantime is observed by the checks below.
    this.isRecordingInternal = true;
    try {
      const available = await AudioRecorder.isAvailable();
      if (!this.isRecordingInternal) return; // Check if stopped while checking availability
      if (!available) {
        throw new Error(
          'The `rec` command (provided by SoX) is required for voice mode. Please install SoX (e.g., `brew install sox` on macOS or `sudo apt install sox libsox-fmt-all` on Linux).',
        );
      }
      // rec -q -V0 -e signed -c 1 -b 16 -r 16000 -t raw -
      // -q/-V0 suppress sox chatter; the remaining flags select signed
      // 16-bit mono 16kHz raw PCM streamed to stdout ('-').
      this.recProcess = spawn('rec', [
        '-q',
        '-V0',
        '-e',
        'signed',
        '-c',
        '1',
        '-b',
        '16',
        '-r',
        '16000',
        '-t',
        'raw',
        '-',
      ]);
      // stop() may have been called while spawning; tear down immediately.
      if (!this.isRecordingInternal) {
        this.recProcess.kill('SIGTERM');
        this.recProcess = null;
        return;
      }
      this.recProcess.stdout.on('data', (data: Buffer) => {
        this.emit('data', data);
      });
      this.recProcess.stderr.on('data', (_data: Buffer) => {
        // rec might print warnings to stderr, we could log them or ignore
        // console.warn(`rec stderr: ${data.toString()}`);
      });
      this.recProcess.on('error', (err) => {
        this.emit('error', err);
        this.stop();
      });
      this.recProcess.on('close', () => {
        // Treat an unexpected process exit the same as an explicit stop();
        // stop() is idempotent, so a stop()-triggered close is harmless here.
        this.stop();
      });
      this.emit('start');
    } catch (err) {
      this.isRecordingInternal = false;
      throw err;
    }
  }

  /**
   * Stops recording and terminates the capture process.
   * Safe to call repeatedly; emits 'stop' once per active session.
   */
  stop(): void {
    if (!this.isRecordingInternal) return;
    this.isRecordingInternal = false;
    if (this.recProcess) {
      this.recProcess.kill('SIGTERM');
      this.recProcess = null;
    }
    this.emit('stop');
  }
}
@@ -0,0 +1,178 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import WebSocket from 'ws';
import { EventEmitter, once } from 'node:events';
import { debugLogger } from '../utils/debugLogger.js';
import type {
TranscriptionProvider,
TranscriptionEvents,
} from './transcriptionProvider.js';
import { z } from 'zod';
// Zod schema for the subset of Live API server messages this provider
// consumes. safeParse tolerates and ignores any fields not listed here.
const LiveAPIResponseSchema = z.object({
  // Server acknowledgement of the initial setup message.
  setupComplete: z.record(z.unknown()).optional(),
  serverContent: z
    .object({
      // True when the model considers the current speaking turn finished.
      turnComplete: z.boolean().optional(),
      // Transcription of the user's audio input (the part this provider uses).
      inputTranscription: z
        .object({
          text: z.string().optional(),
        })
        .optional(),
      // Transcription of the model's audio output (parsed but not consumed
      // by the message handler below).
      outputTranscription: z
        .object({
          text: z.string().optional(),
        })
        .optional(),
      // Model response parts: text and/or base64 inline data (e.g. audio).
      modelTurn: z
        .object({
          parts: z
            .array(
              z.object({
                text: z.string().optional(),
                inlineData: z
                  .object({
                    data: z.string(),
                  })
                  .optional(),
              }),
            )
            .optional(),
        })
        .optional(),
    })
    .optional(),
});
/**
 * Connects to the Gemini Live API using raw WebSockets to support API Key authentication.
 *
 * Streams 16kHz PCM audio to the Live API and emits 'transcription' events
 * with the text the server returns for the user's audio input.
 */
export class GeminiLiveTranscriptionProvider
  extends EventEmitter<TranscriptionEvents>
  implements TranscriptionProvider
{
  // Active socket; null before connect() and after close/disconnect.
  private ws: WebSocket | null = null;
  // Latest input-transcription text received from the server.
  private currentTranscription = '';

  constructor(private readonly apiKey: string) {
    super();
  }

  /**
   * Opens the WebSocket, waits for 'open', then sends the session setup
   * message requesting audio responses plus input-audio transcription.
   *
   * @throws Error if no API key is provided or the connection fails
   *   (a socket 'error' before 'open' rejects the awaited `once`).
   */
  async connect(): Promise<void> {
    const modelName = 'gemini-3.1-flash-live-preview';
    const baseUrl =
      'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
    if (!this.apiKey) {
      throw new Error('No API key provided');
    }
    // NOTE: The Generative Language WebSocket API requires the API key to be passed via the 'key' query parameter.
    const url = `${baseUrl}?key=${this.apiKey}`;
    debugLogger.debug(
      `[GeminiLiveTranscription] Connecting to model ${modelName} via raw WebSocket with API Key...`,
    );
    try {
      this.ws = new WebSocket(url, {
        maxPayload: 1 << 20, // 1MB limit for safety
      });
      // Listeners are attached before awaiting 'open' so no early server
      // message or error is missed.
      this.ws.on('message', (data) => {
        try {
          const parsedData: unknown = JSON.parse(data.toString());
          const result = LiveAPIResponseSchema.safeParse(parsedData);
          if (result.success) {
            const response = result.data;
            if (response.serverContent) {
              const content = response.serverContent;
              if (content.turnComplete) {
                this.emit('turnComplete');
              }
              if (content.inputTranscription?.text) {
                const text = content.inputTranscription.text;
                debugLogger.debug(
                  `[GeminiLiveTranscription] Transcription received (Cloud): "${text}"`,
                );
                // Replace (not append) the buffer with each message's text —
                // assumes the server sends the running transcription in full;
                // TODO(review): confirm against the Live API contract.
                this.currentTranscription = text;
                this.emit('transcription', this.currentTranscription);
              }
            }
          }
        } catch (e) {
          debugLogger.error(
            '[GeminiLiveTranscription] Error parsing message:',
            e,
          );
        }
      });
      this.ws.on('error', (error) => {
        debugLogger.error('[GeminiLiveTranscription] WebSocket Error:', error);
        this.emit('error', error);
      });
      this.ws.on('close', (code, reason) => {
        debugLogger.debug(
          `[GeminiLiveTranscription] Connection Closed. Code: ${code}, Reason: ${reason}`,
        );
        this.emit('close');
        this.ws = null;
      });
      await once(this.ws, 'open');
      // Setup must be the first message after the socket opens: selects the
      // model, requests audio responses, and enables input transcription.
      const setupMessage = {
        setup: {
          model: `models/${modelName}`,
          generation_config: {
            response_modalities: ['audio'],
          },
          input_audio_transcription: {},
        },
      };
      this.ws.send(JSON.stringify(setupMessage));
      this.currentTranscription = '';
    } catch (err) {
      debugLogger.error(
        '[GeminiLiveTranscription] Failed to establish connection:',
        err,
      );
      throw err;
    }
  }

  /**
   * Sends one chunk of raw 16kHz PCM audio. Silently dropped when the
   * socket is not open (e.g. before connect() or after close).
   */
  sendAudioChunk(chunk: Buffer): void {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
    const audioMessage = {
      realtime_input: {
        audio: {
          data: chunk.toString('base64'),
          mime_type: 'audio/pcm;rate=16000',
        },
      },
    };
    this.ws.send(JSON.stringify(audioMessage));
  }

  /** Returns the most recent transcription text for this session. */
  getTranscription(): string {
    return this.currentTranscription;
  }

  /** Closes the socket if open; the 'close' handler emits 'close'. */
  disconnect(): void {
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
  }
}
@@ -0,0 +1,41 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import * as path from 'node:path';
import * as fs from 'node:fs';
import { homedir, GEMINI_DIR } from '../utils/paths.js';
import { GeminiLiveTranscriptionProvider } from './geminiLiveTranscriptionProvider.js';
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
import type { TranscriptionProvider } from './transcriptionProvider.js';
/**
 * Creates the configured {@link TranscriptionProvider} implementation.
 */
export class TranscriptionFactory {
  /**
   * Builds a transcription provider for the requested backend.
   *
   * @param voiceConfig Optional voice settings: `backend` selects 'whisper'
   *   (local) — anything else falls back to Gemini Live (cloud);
   *   `whisperModel` is a model file name inside the managed models dir.
   * @param apiKey Gemini API key, used only by the cloud backend.
   * @returns A ready-to-connect provider instance.
   * @throws Error if `whisperModel` resolves outside the models directory.
   */
  static createProvider(
    voiceConfig: { backend?: string; whisperModel?: string } | undefined,
    apiKey: string,
  ): TranscriptionProvider {
    const backend = voiceConfig?.backend ?? 'gemini-live';
    if (backend === 'whisper') {
      const modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
      // Recursive mkdir is a no-op when the directory already exists.
      fs.mkdirSync(modelsDir, { recursive: true });
      const modelName = voiceConfig?.whisperModel ?? 'ggml-base.en.bin';
      const modelPath = path.resolve(modelsDir, modelName);
      // `whisperModel` comes from user-editable configuration; reject values
      // (e.g. '../../etc/x') that would escape the managed models directory.
      const relative = path.relative(modelsDir, modelPath);
      if (relative.startsWith('..') || path.isAbsolute(relative)) {
        throw new Error(`Invalid whisper model name: ${modelName}`);
      }
      return new WhisperTranscriptionProvider({
        modelPath,
        threads: 4,
        step: 0,
        length: 5000,
      });
    }
    // Default to Gemini Live
    return new GeminiLiveTranscriptionProvider(apiKey);
  }
}
@@ -0,0 +1,33 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import type { EventEmitter } from 'node:events';
/** Event map shared by all transcription backends. */
export interface TranscriptionEvents {
  /** Emitted when partial or full transcription text is available. */
  transcription: [string];
  /** Emitted when a speaking turn is considered complete. */
  turnComplete: [];
  /** Emitted when an error occurs during transcription. */
  error: [Error];
  /** Emitted when the transcription service connection is closed. */
  close: [];
}

/**
 * Common interface for all transcription backends (Cloud or Local).
 *
 * Implementations also emit {@link TranscriptionEvents} to report
 * transcription updates, errors, and connection lifecycle changes.
 */
export interface TranscriptionProvider
  extends EventEmitter<TranscriptionEvents> {
  /** Establish connection to the transcription service. */
  connect(): Promise<void>;
  /** Send a chunk of raw audio data to the service. */
  sendAudioChunk(chunk: Buffer): void;
  /** Disconnect from the transcription service. */
  disconnect(): void;
  /** Get the current full transcription for the session. */
  getTranscription(): string;
}
@@ -0,0 +1,107 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import * as path from 'node:path';
import * as fs from 'node:fs';
import { EventEmitter } from 'node:events';
import { homedir, GEMINI_DIR } from '../utils/paths.js';
import { debugLogger } from '../utils/debugLogger.js';
/** Progress snapshot emitted while a Whisper model file downloads. */
export interface WhisperModelProgress {
  /** File name of the model being downloaded. */
  modelName: string;
  /** Bytes received so far. */
  transferred: number;
  /** Total bytes expected (0 when the server omits content-length). */
  total: number;
  /** Fraction complete in [0, 1]; 0 when the total is unknown. */
  percentage: number;
}

/** Events emitted by {@link WhisperModelManager}. */
export interface WhisperModelManagerEvents {
  /** Emitted after each downloaded chunk with cumulative progress. */
  progress: [WhisperModelProgress];
}

// Allow-list of downloadable model file names. Anything else is rejected,
// which also prevents arbitrary paths/URLs being built from a model name.
const ALLOWED_MODELS = [
  'ggml-tiny.en.bin',
  'ggml-base.en.bin',
  'ggml-large-v3-turbo-q5_0.bin',
  'ggml-large-v3-turbo-q8_0.bin',
];
/**
 * Manages Whisper models (checking existence, downloading).
 */
export class WhisperModelManager extends EventEmitter<WhisperModelManagerEvents> {
  // Directory under the user's Gemini dir where model files are stored.
  private readonly modelsDir: string;

  constructor() {
    super();
    this.modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
  }

  /** Returns true if the given allow-listed model already exists on disk. */
  isModelInstalled(modelName: string): boolean {
    this.validateModelName(modelName);
    return fs.existsSync(path.join(this.modelsDir, modelName));
  }

  /** Returns the on-disk path for the given allow-listed model. */
  getModelPath(modelName: string): string {
    this.validateModelName(modelName);
    return path.join(this.modelsDir, modelName);
  }

  /**
   * Downloads a model from the whisper.cpp Hugging Face repo, emitting
   * 'progress' events as chunks arrive.
   *
   * Writes to a `.partial` temp file and renames on success, so an
   * interrupted download never leaves a truncated file that
   * {@link isModelInstalled} would mistake for a valid model.
   *
   * @throws Error if the model name is not allow-listed, the HTTP request
   *   fails, or the response body is unreadable.
   */
  async downloadModel(modelName: string): Promise<void> {
    this.validateModelName(modelName);
    if (!fs.existsSync(this.modelsDir)) {
      fs.mkdirSync(this.modelsDir, { recursive: true });
    }
    const destination = path.join(this.modelsDir, modelName);
    const tempDestination = `${destination}.partial`;
    const url = `https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${modelName}`;
    debugLogger.debug(
      `[WhisperModelManager] Downloading ${modelName} from ${url}`,
    );
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to download model: ${response.statusText}`);
    }
    const total = parseInt(response.headers.get('content-length') || '0', 10);
    let transferred = 0;
    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('Response body is not readable');
    }
    const writer = fs.createWriteStream(tempDestination);
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        transferred += value.length;
        // Respect write-stream backpressure so multi-GB models are not
        // buffered entirely in memory when disk I/O lags the network.
        if (!writer.write(value)) {
          await new Promise<void>((resolve) => writer.once('drain', resolve));
        }
        const percentage = total > 0 ? transferred / total : 0;
        this.emit('progress', {
          modelName,
          transferred,
          total,
          percentage,
        });
      }
      // Wait until all buffered data is flushed before publishing the file;
      // end()'s callback fires on 'finish', 'error' rejects.
      await new Promise<void>((resolve, reject) => {
        writer.once('error', reject);
        writer.end(resolve);
      });
      fs.renameSync(tempDestination, destination);
    } catch (err) {
      // Clean up the partial file so a retry starts from a known state.
      writer.destroy();
      fs.rmSync(tempDestination, { force: true });
      throw err;
    }
  }

  /** Rejects any model name outside the ALLOWED_MODELS allow-list. */
  private validateModelName(modelName: string): void {
    if (!ALLOWED_MODELS.includes(modelName)) {
      throw new Error(`Unauthorized model name: ${modelName}`);
    }
  }
}
@@ -0,0 +1,31 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
import commandExists from 'command-exists';
// vitest hoists vi.mock() calls; the factory must be self-contained.
vi.mock('command-exists', () => ({
  default: vi.fn(),
}));

describe('WhisperTranscriptionProvider', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('should throw a friendly error if whisper-stream is not available', async () => {
    // Simulate the binary being absent from PATH.
    vi.mocked(commandExists).mockRejectedValue(new Error('not found'));

    const provider = new WhisperTranscriptionProvider({
      modelPath: 'test-model.bin',
    });

    const pendingConnect = provider.connect();
    await expect(pendingConnect).rejects.toThrow(
      'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
    );
  });
});
@@ -0,0 +1,199 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import { EventEmitter } from 'node:events';
import commandExists from 'command-exists';
import { debugLogger } from '../utils/debugLogger.js';
import type {
TranscriptionProvider,
TranscriptionEvents,
} from './transcriptionProvider.js';
/** Options used to spawn `whisper-stream`. */
export interface WhisperProviderOptions {
  /** Path to the ggml model file, passed via `-m`. */
  modelPath: string;
  /** Worker thread count passed via `-t`; defaults to 4. */
  threads?: number;
  /** `--step` value; 0 (the default) enables sliding-window VAD mode. */
  step?: number;
  /** `--length` window size; defaults to 5000 — presumably milliseconds, confirm against whisper.cpp stream docs. */
  length?: number;
}
/**
 * Local transcription provider using `whisper-stream` from whisper.cpp.
 *
 * Uses the Sliding Window Mode with VAD (--step 0) for stable,
 * non-overlapping transcription blocks that can be appended directly.
 */
export class WhisperTranscriptionProvider
  extends EventEmitter<TranscriptionEvents>
  implements TranscriptionProvider
{
  // Spawned whisper-stream process; null when disconnected.
  private process: ChildProcessWithoutNullStreams | null = null;
  // Transcription accumulated over the current session.
  private currentTranscription = '';

  constructor(private readonly options: WhisperProviderOptions) {
    super();
  }

  /**
   * Checks if `whisper-stream` is available on the system.
   */
  static async isAvailable(): Promise<boolean> {
    try {
      await commandExists('whisper-stream');
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Spawns `whisper-stream` and resolves once it reports that it is
   * processing audio (or after a 10s fallback timeout). Rejects if the
   * binary is missing, the spawn fails, the process prints an error, or
   * the process exits before startup completes.
   */
  async connect(): Promise<void> {
    const { modelPath, threads = 4, step = 0, length = 5000 } = this.options;
    this.currentTranscription = '';
    const available = await WhisperTranscriptionProvider.isAvailable();
    if (!available) {
      return Promise.reject(
        new Error(
          'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
        ),
      );
    }
    debugLogger.debug(
      `[WhisperTranscription] Starting whisper-stream with model: ${modelPath} (VAD mode: step=${step}, length=${length})`,
    );
    return new Promise((resolve, reject) => {
      let isResolved = false;
      let readyTimeout: ReturnType<typeof setTimeout> | null = null;
      // Settles the promise exactly once and clears the fallback timer so it
      // can neither fire after settlement nor keep the event loop alive.
      const settle = (action: () => void) => {
        if (isResolved) return;
        isResolved = true;
        if (readyTimeout !== null) {
          clearTimeout(readyTimeout);
          readyTimeout = null;
        }
        action();
      };
      try {
        // whisper-stream -m <model_path> -t <threads> --step 0 --length <length> -vth 0.6
        // Setting step == 0 enables sliding window mode with VAD, which outputs
        // non-overlapping transcription blocks suitable for appending.
        this.process = spawn('whisper-stream', [
          '-m',
          modelPath,
          '-t',
          threads.toString(),
          '--step',
          step.toString(),
          '--length',
          length.toString(),
          '-vth',
          '0.6',
        ]);
        this.process.stdout.on('data', (data: Buffer) => {
          const output = data.toString();
          this.parseOutput(output);
        });
        this.process.stderr.on('data', (data: Buffer) => {
          const msg = data.toString();
          if (msg.includes('error')) {
            debugLogger.error(`[WhisperTranscription] stderr: ${msg}`);
            // Fail startup and kill the process so it is not leaked.
            settle(() => {
              this.disconnect();
              reject(new Error(msg));
            });
          }
          // whisper-stream prints "whisper_init_from_file_with_params_no_state: loading model from..."
          // and finally "main: processing, press Ctrl+C to stop" when ready.
          if (!isResolved && msg.includes('main: processing')) {
            debugLogger.debug('[WhisperTranscription] whisper-stream is ready');
            settle(resolve);
          }
        });
        this.process.on('error', (err) => {
          debugLogger.error('[WhisperTranscription] Process error:', err);
          this.emit('error', err);
          settle(() => reject(err));
        });
        this.process.on('close', (code) => {
          debugLogger.debug(
            `[WhisperTranscription] Process closed with code ${code}`,
          );
          this.emit('close');
          this.process = null;
          // Exiting before startup completed is a connection failure; without
          // this, connect() would falsely resolve via the fallback timeout.
          settle(() =>
            reject(new Error(`whisper-stream exited with code ${code}`)),
          );
        });
        // Fallback timeout in case "main: processing" is never seen.
        readyTimeout = setTimeout(() => {
          settle(() => {
            debugLogger.warn(
              '[WhisperTranscription] Connection timeout (fallback resolve)',
            );
            resolve();
          });
        }, 10000);
      } catch (err) {
        debugLogger.error(
          '[WhisperTranscription] Failed to spawn process:',
          err,
        );
        settle(() => reject(err));
      }
    });
  }

  /**
   * Parses whisper-stream stdout lines of the form
   * "[00:00:00.000 --> 00:00:02.000] Hello world." and appends the cleaned
   * text to the running transcription, emitting 'transcription' each time.
   */
  private parseOutput(output: string): void {
    const lines = output.split('\n');
    for (const line of lines) {
      const match = line.match(/\[.* --> .*\]\s+(.*)/);
      if (match && match[1]) {
        let text = match[1].trim();
        // Filter out [Silence], [music], (laughter), etc.
        text = text
          .replace(/\[[^\]]*\]/g, '')
          .replace(/\([^)]*\)/g, '')
          .trim();
        if (text) {
          // In VAD mode (step=0), each line is a completed speech block.
          // Append it to the buffer to ensure it doesn't disappear.
          this.currentTranscription = this.currentTranscription
            ? `${this.currentTranscription} ${text}`
            : text;
          debugLogger.debug(
            `[WhisperTranscription] Transcription updated (Local-VAD): "${this.currentTranscription}"`,
          );
          this.emit('transcription', this.currentTranscription);
        }
      }
    }
  }

  /** No-op: whisper-stream captures audio from the microphone itself. */
  sendAudioChunk(_chunk: Buffer): void {
    // whisper-stream handles its own audio capture.
  }

  /** Returns the transcription accumulated for this session. */
  getTranscription(): string {
    return this.currentTranscription;
  }

  /** Terminates the whisper-stream process if running. */
  disconnect(): void {
    if (this.process) {
      this.process.kill('SIGTERM');
      this.process = null;
    }
  }
}