feat(voice): implement real-time voice mode with cloud and local backends (#24174)

This commit is contained in:
Abhijit Balaji
2026-04-24 14:29:38 -07:00
committed by GitHub
parent 048bf6e514
commit 2e0641c83b
40 changed files with 2244 additions and 43 deletions
+1
View File
@@ -56,6 +56,7 @@
"ajv-formats": "^3.0.0",
"chardet": "^2.1.0",
"chokidar": "^5.0.0",
"command-exists": "^1.2.9",
"diff": "^8.0.3",
"dotenv": "^17.2.4",
"dotenv-expand": "^12.0.3",
+7
View File
@@ -691,6 +691,7 @@ export interface ConfigParameters {
ptyInfo?: string;
disableYoloMode?: boolean;
disableAlwaysAllow?: boolean;
voiceMode?: boolean;
rawOutput?: boolean;
acceptRawOutputRisk?: boolean;
dynamicModelConfiguration?: boolean;
@@ -963,6 +964,7 @@ export class Config implements McpContext, AgentLoopContext {
private readonly topicUpdateNarration: boolean;
private readonly disableLLMCorrection: boolean;
private readonly planEnabled: boolean;
private readonly voiceMode: boolean;
private readonly trackerEnabled: boolean;
private readonly planModeRoutingEnabled: boolean;
private readonly modelSteering: boolean;
@@ -1117,6 +1119,7 @@ export class Config implements McpContext, AgentLoopContext {
this.agents = params.agents ?? {};
this.disableLLMCorrection = params.disableLLMCorrection ?? true;
this.planEnabled = params.plan ?? true;
this.voiceMode = params.voiceMode ?? false;
this.trackerEnabled = params.tracker ?? false;
this.planModeRoutingEnabled = params.planSettings?.modelRouting ?? true;
this.enableEventDrivenScheduler = params.enableEventDrivenScheduler ?? true;
@@ -2969,6 +2972,10 @@ export class Config implements McpContext, AgentLoopContext {
return this.planEnabled;
}
/** Whether real-time voice input mode is enabled (from the `voiceMode` config parameter; defaults to false). */
isVoiceModeEnabled(): boolean {
  return this.voiceMode;
}
isTrackerEnabled(): boolean {
return this.trackerEnabled;
}
+9
View File
@@ -297,3 +297,12 @@ export * from './context/profiles.js';
// Export trust utility
export * from './utils/trust.js';
// Export voice utilities
export * from './voice/audioRecorder.js';
export * from './voice/transcriptionProvider.js';
export * from './voice/geminiLiveTranscriptionProvider.js';
export * from './voice/whisperTranscriptionProvider.js';
export * from './voice/transcriptionFactory.js';
export * from './voice/whisperModelManager.js';
export { isBinaryAvailable } from './utils/binaryCheck.js';
+14
View File
@@ -0,0 +1,14 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { sync as commandExistsSync } from 'command-exists';
/**
 * Checks if a binary is available in the system PATH.
 *
 * @param binaryName Name of the executable to look for (e.g. `sox`).
 * @returns `true` when the binary can be resolved on PATH, `false` otherwise.
 */
export function isBinaryAvailable(binaryName: string): boolean {
  const found = commandExistsSync(binaryName);
  return found;
}
+115
View File
@@ -0,0 +1,115 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import { EventEmitter } from 'node:events';
import commandExists from 'command-exists';
export interface AudioRecorderEvents {
data: [Buffer];
start: [];
stop: [];
error: [Error];
}
/**
 * Captures audio from the microphone using `sox` (`rec`).
 * Emits 16kHz, 16-bit, mono PCM chunks.
 *
 * Events: 'start' once capture begins, 'data' with raw PCM Buffers,
 * 'error' on process failure, 'stop' when the session ends.
 */
export class AudioRecorder extends EventEmitter<AudioRecorderEvents> {
  // Handle to the spawned `rec` process; null whenever not recording.
  private recProcess: ChildProcessWithoutNullStreams | null = null;
  // Set synchronously at the top of start()/stop() so overlapping calls
  // (including stop() during the async part of start()) short-circuit.
  private isRecordingInternal = false;

  /** True while a recording session is active (including async startup). */
  get isRecording(): boolean {
    return this.isRecordingInternal;
  }

  /**
   * Checks if `rec` (sox) is available on the system.
   */
  static async isAvailable(): Promise<boolean> {
    try {
      await commandExists('rec');
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Starts capturing microphone audio.
   *
   * Emits 'start' once the capture process is spawned and wired up.
   * @throws Error if `rec` (SoX) is not installed; the recording flag is
   *   reset on any startup failure so start() can be retried.
   */
  async start(): Promise<void> {
    if (this.isRecordingInternal) return;
    // Flag is raised before the async availability check so that a stop()
    // issued in the meantime is observed by the checks below.
    this.isRecordingInternal = true;
    try {
      const available = await AudioRecorder.isAvailable();
      if (!this.isRecordingInternal) return; // Check if stopped while checking availability
      if (!available) {
        throw new Error(
          'The `rec` command (provided by SoX) is required for voice mode. Please install SoX (e.g., `brew install sox` on macOS or `sudo apt install sox libsox-fmt-all` on Linux).',
        );
      }
      // rec -q -V0 -e signed -c 1 -b 16 -r 16000 -t raw -
      // -q/-V0 suppress sox chatter; the remaining flags select signed
      // 16-bit mono 16kHz raw PCM streamed to stdout ('-').
      this.recProcess = spawn('rec', [
        '-q',
        '-V0',
        '-e',
        'signed',
        '-c',
        '1',
        '-b',
        '16',
        '-r',
        '16000',
        '-t',
        'raw',
        '-',
      ]);
      // stop() may have been called while spawning; tear down immediately.
      if (!this.isRecordingInternal) {
        this.recProcess.kill('SIGTERM');
        this.recProcess = null;
        return;
      }
      this.recProcess.stdout.on('data', (data: Buffer) => {
        this.emit('data', data);
      });
      this.recProcess.stderr.on('data', (_data: Buffer) => {
        // rec might print warnings to stderr, we could log them or ignore
        // console.warn(`rec stderr: ${data.toString()}`);
      });
      this.recProcess.on('error', (err) => {
        this.emit('error', err);
        this.stop();
      });
      this.recProcess.on('close', () => {
        // Treat an unexpected process exit the same as an explicit stop();
        // stop() is idempotent, so a stop()-triggered close is harmless here.
        this.stop();
      });
      this.emit('start');
    } catch (err) {
      this.isRecordingInternal = false;
      throw err;
    }
  }

  /**
   * Stops recording and terminates the capture process.
   * Safe to call repeatedly; emits 'stop' once per active session.
   */
  stop(): void {
    if (!this.isRecordingInternal) return;
    this.isRecordingInternal = false;
    if (this.recProcess) {
      this.recProcess.kill('SIGTERM');
      this.recProcess = null;
    }
    this.emit('stop');
  }
}
@@ -0,0 +1,178 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import WebSocket from 'ws';
import { EventEmitter, once } from 'node:events';
import { debugLogger } from '../utils/debugLogger.js';
import type {
TranscriptionProvider,
TranscriptionEvents,
} from './transcriptionProvider.js';
import { z } from 'zod';
// Zod schema for the subset of Live API server messages this provider
// consumes. safeParse tolerates and ignores any fields not listed here.
const LiveAPIResponseSchema = z.object({
  // Server acknowledgement of the initial setup message.
  setupComplete: z.record(z.unknown()).optional(),
  serverContent: z
    .object({
      // True when the model considers the current speaking turn finished.
      turnComplete: z.boolean().optional(),
      // Transcription of the user's audio input (the part this provider uses).
      inputTranscription: z
        .object({
          text: z.string().optional(),
        })
        .optional(),
      // Transcription of the model's audio output (parsed but not consumed
      // by the message handler below).
      outputTranscription: z
        .object({
          text: z.string().optional(),
        })
        .optional(),
      // Model response parts: text and/or base64 inline data (e.g. audio).
      modelTurn: z
        .object({
          parts: z
            .array(
              z.object({
                text: z.string().optional(),
                inlineData: z
                  .object({
                    data: z.string(),
                  })
                  .optional(),
              }),
            )
            .optional(),
        })
        .optional(),
    })
    .optional(),
});
/**
 * Connects to the Gemini Live API using raw WebSockets to support API Key authentication.
 *
 * Streams 16kHz PCM audio to the Live API and emits 'transcription' events
 * with the text the server returns for the user's audio input.
 */
export class GeminiLiveTranscriptionProvider
  extends EventEmitter<TranscriptionEvents>
  implements TranscriptionProvider
{
  // Active socket; null before connect() and after close/disconnect.
  private ws: WebSocket | null = null;
  // Latest input-transcription text received from the server.
  private currentTranscription = '';

  constructor(private readonly apiKey: string) {
    super();
  }

  /**
   * Opens the WebSocket, waits for 'open', then sends the session setup
   * message requesting audio responses plus input-audio transcription.
   *
   * @throws Error if no API key is provided or the connection fails
   *   (a socket 'error' before 'open' rejects the awaited `once`).
   */
  async connect(): Promise<void> {
    const modelName = 'gemini-3.1-flash-live-preview';
    const baseUrl =
      'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
    if (!this.apiKey) {
      throw new Error('No API key provided');
    }
    // NOTE: The Generative Language WebSocket API requires the API key to be passed via the 'key' query parameter.
    const url = `${baseUrl}?key=${this.apiKey}`;
    debugLogger.debug(
      `[GeminiLiveTranscription] Connecting to model ${modelName} via raw WebSocket with API Key...`,
    );
    try {
      this.ws = new WebSocket(url, {
        maxPayload: 1 << 20, // 1MB limit for safety
      });
      // Listeners are attached before awaiting 'open' so no early server
      // message or error is missed.
      this.ws.on('message', (data) => {
        try {
          const parsedData: unknown = JSON.parse(data.toString());
          const result = LiveAPIResponseSchema.safeParse(parsedData);
          if (result.success) {
            const response = result.data;
            if (response.serverContent) {
              const content = response.serverContent;
              if (content.turnComplete) {
                this.emit('turnComplete');
              }
              if (content.inputTranscription?.text) {
                const text = content.inputTranscription.text;
                debugLogger.debug(
                  `[GeminiLiveTranscription] Transcription received (Cloud): "${text}"`,
                );
                // Replace (not append) the buffer with each message's text —
                // assumes the server sends the running transcription in full;
                // TODO(review): confirm against the Live API contract.
                this.currentTranscription = text;
                this.emit('transcription', this.currentTranscription);
              }
            }
          }
        } catch (e) {
          debugLogger.error(
            '[GeminiLiveTranscription] Error parsing message:',
            e,
          );
        }
      });
      this.ws.on('error', (error) => {
        debugLogger.error('[GeminiLiveTranscription] WebSocket Error:', error);
        this.emit('error', error);
      });
      this.ws.on('close', (code, reason) => {
        debugLogger.debug(
          `[GeminiLiveTranscription] Connection Closed. Code: ${code}, Reason: ${reason}`,
        );
        this.emit('close');
        this.ws = null;
      });
      await once(this.ws, 'open');
      // Setup must be the first message after the socket opens: selects the
      // model, requests audio responses, and enables input transcription.
      const setupMessage = {
        setup: {
          model: `models/${modelName}`,
          generation_config: {
            response_modalities: ['audio'],
          },
          input_audio_transcription: {},
        },
      };
      this.ws.send(JSON.stringify(setupMessage));
      this.currentTranscription = '';
    } catch (err) {
      debugLogger.error(
        '[GeminiLiveTranscription] Failed to establish connection:',
        err,
      );
      throw err;
    }
  }

  /**
   * Sends one chunk of raw 16kHz PCM audio. Silently dropped when the
   * socket is not open (e.g. before connect() or after close).
   */
  sendAudioChunk(chunk: Buffer): void {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
    const audioMessage = {
      realtime_input: {
        audio: {
          data: chunk.toString('base64'),
          mime_type: 'audio/pcm;rate=16000',
        },
      },
    };
    this.ws.send(JSON.stringify(audioMessage));
  }

  /** Returns the most recent transcription text for this session. */
  getTranscription(): string {
    return this.currentTranscription;
  }

  /** Closes the socket if open; the 'close' handler emits 'close'. */
  disconnect(): void {
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
  }
}
@@ -0,0 +1,41 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import * as path from 'node:path';
import * as fs from 'node:fs';
import { homedir, GEMINI_DIR } from '../utils/paths.js';
import { GeminiLiveTranscriptionProvider } from './geminiLiveTranscriptionProvider.js';
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
import type { TranscriptionProvider } from './transcriptionProvider.js';
/**
 * Creates the configured {@link TranscriptionProvider} implementation.
 */
export class TranscriptionFactory {
  /**
   * Builds a transcription provider for the requested backend.
   *
   * @param voiceConfig Optional voice settings: `backend` selects 'whisper'
   *   (local) — anything else falls back to Gemini Live (cloud);
   *   `whisperModel` is a model file name inside the managed models dir.
   * @param apiKey Gemini API key, used only by the cloud backend.
   * @returns A ready-to-connect provider instance.
   * @throws Error if `whisperModel` resolves outside the models directory.
   */
  static createProvider(
    voiceConfig: { backend?: string; whisperModel?: string } | undefined,
    apiKey: string,
  ): TranscriptionProvider {
    const backend = voiceConfig?.backend ?? 'gemini-live';
    if (backend === 'whisper') {
      const modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
      // Recursive mkdir is a no-op when the directory already exists.
      fs.mkdirSync(modelsDir, { recursive: true });
      const modelName = voiceConfig?.whisperModel ?? 'ggml-base.en.bin';
      const modelPath = path.resolve(modelsDir, modelName);
      // `whisperModel` comes from user-editable configuration; reject values
      // (e.g. '../../etc/x') that would escape the managed models directory.
      const relative = path.relative(modelsDir, modelPath);
      if (relative.startsWith('..') || path.isAbsolute(relative)) {
        throw new Error(`Invalid whisper model name: ${modelName}`);
      }
      return new WhisperTranscriptionProvider({
        modelPath,
        threads: 4,
        step: 0,
        length: 5000,
      });
    }
    // Default to Gemini Live
    return new GeminiLiveTranscriptionProvider(apiKey);
  }
}
@@ -0,0 +1,33 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import type { EventEmitter } from 'node:events';
/** Event map shared by all transcription backends. */
export interface TranscriptionEvents {
  /** Emitted when partial or full transcription text is available. */
  transcription: [string];
  /** Emitted when a speaking turn is considered complete. */
  turnComplete: [];
  /** Emitted when an error occurs during transcription. */
  error: [Error];
  /** Emitted when the transcription service connection is closed. */
  close: [];
}

/**
 * Common interface for all transcription backends (Cloud or Local).
 *
 * Implementations also emit {@link TranscriptionEvents} to report
 * transcription updates, errors, and connection lifecycle changes.
 */
export interface TranscriptionProvider
  extends EventEmitter<TranscriptionEvents> {
  /** Establish connection to the transcription service. */
  connect(): Promise<void>;
  /** Send a chunk of raw audio data to the service. */
  sendAudioChunk(chunk: Buffer): void;
  /** Disconnect from the transcription service. */
  disconnect(): void;
  /** Get the current full transcription for the session. */
  getTranscription(): string;
}
@@ -0,0 +1,107 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import * as path from 'node:path';
import * as fs from 'node:fs';
import { EventEmitter } from 'node:events';
import { homedir, GEMINI_DIR } from '../utils/paths.js';
import { debugLogger } from '../utils/debugLogger.js';
/** Progress snapshot emitted while a Whisper model file downloads. */
export interface WhisperModelProgress {
  /** File name of the model being downloaded. */
  modelName: string;
  /** Bytes received so far. */
  transferred: number;
  /** Total bytes expected (0 when the server omits content-length). */
  total: number;
  /** Fraction complete in [0, 1]; 0 when the total is unknown. */
  percentage: number;
}

/** Events emitted by {@link WhisperModelManager}. */
export interface WhisperModelManagerEvents {
  /** Emitted after each downloaded chunk with cumulative progress. */
  progress: [WhisperModelProgress];
}

// Allow-list of downloadable model file names. Anything else is rejected,
// which also prevents arbitrary paths/URLs being built from a model name.
const ALLOWED_MODELS = [
  'ggml-tiny.en.bin',
  'ggml-base.en.bin',
  'ggml-large-v3-turbo-q5_0.bin',
  'ggml-large-v3-turbo-q8_0.bin',
];
/**
 * Manages Whisper models (checking existence, downloading).
 */
export class WhisperModelManager extends EventEmitter<WhisperModelManagerEvents> {
  // Directory under the user's Gemini dir where model files are stored.
  private readonly modelsDir: string;

  constructor() {
    super();
    this.modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
  }

  /** Returns true if the given allow-listed model already exists on disk. */
  isModelInstalled(modelName: string): boolean {
    this.validateModelName(modelName);
    return fs.existsSync(path.join(this.modelsDir, modelName));
  }

  /** Returns the on-disk path for the given allow-listed model. */
  getModelPath(modelName: string): string {
    this.validateModelName(modelName);
    return path.join(this.modelsDir, modelName);
  }

  /**
   * Downloads a model from the whisper.cpp Hugging Face repo, emitting
   * 'progress' events as chunks arrive.
   *
   * Writes to a `.partial` temp file and renames on success, so an
   * interrupted download never leaves a truncated file that
   * {@link isModelInstalled} would mistake for a valid model.
   *
   * @throws Error if the model name is not allow-listed, the HTTP request
   *   fails, or the response body is unreadable.
   */
  async downloadModel(modelName: string): Promise<void> {
    this.validateModelName(modelName);
    if (!fs.existsSync(this.modelsDir)) {
      fs.mkdirSync(this.modelsDir, { recursive: true });
    }
    const destination = path.join(this.modelsDir, modelName);
    const tempDestination = `${destination}.partial`;
    const url = `https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${modelName}`;
    debugLogger.debug(
      `[WhisperModelManager] Downloading ${modelName} from ${url}`,
    );
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to download model: ${response.statusText}`);
    }
    const total = parseInt(response.headers.get('content-length') || '0', 10);
    let transferred = 0;
    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('Response body is not readable');
    }
    const writer = fs.createWriteStream(tempDestination);
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        transferred += value.length;
        // Respect write-stream backpressure so multi-GB models are not
        // buffered entirely in memory when disk I/O lags the network.
        if (!writer.write(value)) {
          await new Promise<void>((resolve) => writer.once('drain', resolve));
        }
        const percentage = total > 0 ? transferred / total : 0;
        this.emit('progress', {
          modelName,
          transferred,
          total,
          percentage,
        });
      }
      // Wait until all buffered data is flushed before publishing the file;
      // end()'s callback fires on 'finish', 'error' rejects.
      await new Promise<void>((resolve, reject) => {
        writer.once('error', reject);
        writer.end(resolve);
      });
      fs.renameSync(tempDestination, destination);
    } catch (err) {
      // Clean up the partial file so a retry starts from a known state.
      writer.destroy();
      fs.rmSync(tempDestination, { force: true });
      throw err;
    }
  }

  /** Rejects any model name outside the ALLOWED_MODELS allow-list. */
  private validateModelName(modelName: string): void {
    if (!ALLOWED_MODELS.includes(modelName)) {
      throw new Error(`Unauthorized model name: ${modelName}`);
    }
  }
}
@@ -0,0 +1,31 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
import commandExists from 'command-exists';
// vitest hoists vi.mock() calls; the factory must be self-contained.
vi.mock('command-exists', () => ({
  default: vi.fn(),
}));

describe('WhisperTranscriptionProvider', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('should throw a friendly error if whisper-stream is not available', async () => {
    // Simulate the binary being absent from PATH.
    vi.mocked(commandExists).mockRejectedValue(new Error('not found'));

    const provider = new WhisperTranscriptionProvider({
      modelPath: 'test-model.bin',
    });

    const pendingConnect = provider.connect();
    await expect(pendingConnect).rejects.toThrow(
      'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
    );
  });
});
@@ -0,0 +1,199 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import { EventEmitter } from 'node:events';
import commandExists from 'command-exists';
import { debugLogger } from '../utils/debugLogger.js';
import type {
TranscriptionProvider,
TranscriptionEvents,
} from './transcriptionProvider.js';
/** Options used to spawn `whisper-stream`. */
export interface WhisperProviderOptions {
  /** Path to the ggml model file, passed via `-m`. */
  modelPath: string;
  /** Worker thread count passed via `-t`; defaults to 4. */
  threads?: number;
  /** `--step` value; 0 (the default) enables sliding-window VAD mode. */
  step?: number;
  /** `--length` window size; defaults to 5000 — presumably milliseconds, confirm against whisper.cpp stream docs. */
  length?: number;
}
/**
 * Local transcription provider using `whisper-stream` from whisper.cpp.
 *
 * Uses the Sliding Window Mode with VAD (--step 0) for stable,
 * non-overlapping transcription blocks that can be appended directly.
 */
export class WhisperTranscriptionProvider
  extends EventEmitter<TranscriptionEvents>
  implements TranscriptionProvider
{
  // Spawned whisper-stream process; null when disconnected.
  private process: ChildProcessWithoutNullStreams | null = null;
  // Transcription accumulated over the current session.
  private currentTranscription = '';

  constructor(private readonly options: WhisperProviderOptions) {
    super();
  }

  /**
   * Checks if `whisper-stream` is available on the system.
   */
  static async isAvailable(): Promise<boolean> {
    try {
      await commandExists('whisper-stream');
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Spawns `whisper-stream` and resolves once it reports that it is
   * processing audio (or after a 10s fallback timeout). Rejects if the
   * binary is missing, the spawn fails, the process prints an error, or
   * the process exits before startup completes.
   */
  async connect(): Promise<void> {
    const { modelPath, threads = 4, step = 0, length = 5000 } = this.options;
    this.currentTranscription = '';
    const available = await WhisperTranscriptionProvider.isAvailable();
    if (!available) {
      return Promise.reject(
        new Error(
          'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
        ),
      );
    }
    debugLogger.debug(
      `[WhisperTranscription] Starting whisper-stream with model: ${modelPath} (VAD mode: step=${step}, length=${length})`,
    );
    return new Promise((resolve, reject) => {
      let isResolved = false;
      let readyTimeout: ReturnType<typeof setTimeout> | null = null;
      // Settles the promise exactly once and clears the fallback timer so it
      // can neither fire after settlement nor keep the event loop alive.
      const settle = (action: () => void) => {
        if (isResolved) return;
        isResolved = true;
        if (readyTimeout !== null) {
          clearTimeout(readyTimeout);
          readyTimeout = null;
        }
        action();
      };
      try {
        // whisper-stream -m <model_path> -t <threads> --step 0 --length <length> -vth 0.6
        // Setting step == 0 enables sliding window mode with VAD, which outputs
        // non-overlapping transcription blocks suitable for appending.
        this.process = spawn('whisper-stream', [
          '-m',
          modelPath,
          '-t',
          threads.toString(),
          '--step',
          step.toString(),
          '--length',
          length.toString(),
          '-vth',
          '0.6',
        ]);
        this.process.stdout.on('data', (data: Buffer) => {
          const output = data.toString();
          this.parseOutput(output);
        });
        this.process.stderr.on('data', (data: Buffer) => {
          const msg = data.toString();
          if (msg.includes('error')) {
            debugLogger.error(`[WhisperTranscription] stderr: ${msg}`);
            // Fail startup and kill the process so it is not leaked.
            settle(() => {
              this.disconnect();
              reject(new Error(msg));
            });
          }
          // whisper-stream prints "whisper_init_from_file_with_params_no_state: loading model from..."
          // and finally "main: processing, press Ctrl+C to stop" when ready.
          if (!isResolved && msg.includes('main: processing')) {
            debugLogger.debug('[WhisperTranscription] whisper-stream is ready');
            settle(resolve);
          }
        });
        this.process.on('error', (err) => {
          debugLogger.error('[WhisperTranscription] Process error:', err);
          this.emit('error', err);
          settle(() => reject(err));
        });
        this.process.on('close', (code) => {
          debugLogger.debug(
            `[WhisperTranscription] Process closed with code ${code}`,
          );
          this.emit('close');
          this.process = null;
          // Exiting before startup completed is a connection failure; without
          // this, connect() would falsely resolve via the fallback timeout.
          settle(() =>
            reject(new Error(`whisper-stream exited with code ${code}`)),
          );
        });
        // Fallback timeout in case "main: processing" is never seen.
        readyTimeout = setTimeout(() => {
          settle(() => {
            debugLogger.warn(
              '[WhisperTranscription] Connection timeout (fallback resolve)',
            );
            resolve();
          });
        }, 10000);
      } catch (err) {
        debugLogger.error(
          '[WhisperTranscription] Failed to spawn process:',
          err,
        );
        settle(() => reject(err));
      }
    });
  }

  /**
   * Parses whisper-stream stdout lines of the form
   * "[00:00:00.000 --> 00:00:02.000] Hello world." and appends the cleaned
   * text to the running transcription, emitting 'transcription' each time.
   */
  private parseOutput(output: string): void {
    const lines = output.split('\n');
    for (const line of lines) {
      const match = line.match(/\[.* --> .*\]\s+(.*)/);
      if (match && match[1]) {
        let text = match[1].trim();
        // Filter out [Silence], [music], (laughter), etc.
        text = text
          .replace(/\[[^\]]*\]/g, '')
          .replace(/\([^)]*\)/g, '')
          .trim();
        if (text) {
          // In VAD mode (step=0), each line is a completed speech block.
          // Append it to the buffer to ensure it doesn't disappear.
          this.currentTranscription = this.currentTranscription
            ? `${this.currentTranscription} ${text}`
            : text;
          debugLogger.debug(
            `[WhisperTranscription] Transcription updated (Local-VAD): "${this.currentTranscription}"`,
          );
          this.emit('transcription', this.currentTranscription);
        }
      }
    }
  }

  /** No-op: whisper-stream captures audio from the microphone itself. */
  sendAudioChunk(_chunk: Buffer): void {
    // whisper-stream handles its own audio capture.
  }

  /** Returns the transcription accumulated for this session. */
  getTranscription(): string {
    return this.currentTranscription;
  }

  /** Terminates the whisper-stream process if running. */
  disconnect(): void {
    if (this.process) {
      this.process.kill('SIGTERM');
      this.process = null;
    }
  }
}