mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-26 04:54:25 -07:00
feat(voice): implement real-time voice mode with cloud and local backends (#24174)
This commit is contained in:
@@ -56,6 +56,7 @@
|
||||
"ajv-formats": "^3.0.0",
|
||||
"chardet": "^2.1.0",
|
||||
"chokidar": "^5.0.0",
|
||||
"command-exists": "^1.2.9",
|
||||
"diff": "^8.0.3",
|
||||
"dotenv": "^17.2.4",
|
||||
"dotenv-expand": "^12.0.3",
|
||||
|
||||
@@ -691,6 +691,7 @@ export interface ConfigParameters {
|
||||
ptyInfo?: string;
|
||||
disableYoloMode?: boolean;
|
||||
disableAlwaysAllow?: boolean;
|
||||
voiceMode?: boolean;
|
||||
rawOutput?: boolean;
|
||||
acceptRawOutputRisk?: boolean;
|
||||
dynamicModelConfiguration?: boolean;
|
||||
@@ -963,6 +964,7 @@ export class Config implements McpContext, AgentLoopContext {
|
||||
private readonly topicUpdateNarration: boolean;
|
||||
private readonly disableLLMCorrection: boolean;
|
||||
private readonly planEnabled: boolean;
|
||||
private readonly voiceMode: boolean;
|
||||
private readonly trackerEnabled: boolean;
|
||||
private readonly planModeRoutingEnabled: boolean;
|
||||
private readonly modelSteering: boolean;
|
||||
@@ -1117,6 +1119,7 @@ export class Config implements McpContext, AgentLoopContext {
|
||||
this.agents = params.agents ?? {};
|
||||
this.disableLLMCorrection = params.disableLLMCorrection ?? true;
|
||||
this.planEnabled = params.plan ?? true;
|
||||
this.voiceMode = params.voiceMode ?? false;
|
||||
this.trackerEnabled = params.tracker ?? false;
|
||||
this.planModeRoutingEnabled = params.planSettings?.modelRouting ?? true;
|
||||
this.enableEventDrivenScheduler = params.enableEventDrivenScheduler ?? true;
|
||||
@@ -2969,6 +2972,10 @@ export class Config implements McpContext, AgentLoopContext {
|
||||
return this.planEnabled;
|
||||
}
|
||||
|
||||
isVoiceModeEnabled(): boolean {
|
||||
return this.voiceMode;
|
||||
}
|
||||
|
||||
isTrackerEnabled(): boolean {
|
||||
return this.trackerEnabled;
|
||||
}
|
||||
|
||||
@@ -297,3 +297,12 @@ export * from './context/profiles.js';
|
||||
|
||||
// Export trust utility
|
||||
export * from './utils/trust.js';
|
||||
|
||||
// Export voice utilities
|
||||
export * from './voice/audioRecorder.js';
|
||||
export * from './voice/transcriptionProvider.js';
|
||||
export * from './voice/geminiLiveTranscriptionProvider.js';
|
||||
export * from './voice/whisperTranscriptionProvider.js';
|
||||
export * from './voice/transcriptionFactory.js';
|
||||
export * from './voice/whisperModelManager.js';
|
||||
export { isBinaryAvailable } from './utils/binaryCheck.js';
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { sync as commandExistsSync } from 'command-exists';
|
||||
|
||||
/**
|
||||
* Checks if a binary is available in the system PATH.
|
||||
*/
|
||||
export function isBinaryAvailable(binaryName: string): boolean {
|
||||
return commandExistsSync(binaryName);
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
|
||||
import { EventEmitter } from 'node:events';
|
||||
import commandExists from 'command-exists';
|
||||
|
||||
/** Event map for the microphone recorder. */
export interface AudioRecorderEvents {
  /** A raw PCM audio chunk captured from the microphone. */
  data: [Buffer];
  /** Recording started (subprocess spawned). */
  start: [];
  /** Recording stopped (subprocess terminated). */
  stop: [];
  /** Recording failed, e.g. the subprocess could not be spawned. */
  error: [Error];
}
|
||||
|
||||
/**
 * Captures audio from the microphone using `sox` (`rec`).
 * Emits 16kHz, 16-bit, mono PCM chunks.
 *
 * Lifecycle: `start()` spawns the `rec` subprocess and begins emitting
 * `data` events; `stop()` terminates it. Both are safe to call repeatedly.
 */
export class AudioRecorder extends EventEmitter<AudioRecorderEvents> {
  // Handle to the spawned `rec` process; null whenever not recording.
  private recProcess: ChildProcessWithoutNullStreams | null = null;
  // Set synchronously in start()/stop() so a concurrent stop() can be
  // observed while the async availability check in start() is in flight.
  private isRecordingInternal = false;

  /** Whether a recording session is currently active (or still starting). */
  get isRecording(): boolean {
    return this.isRecordingInternal;
  }

  /**
   * Checks if `rec` (sox) is available on the system.
   */
  static async isAvailable(): Promise<boolean> {
    try {
      await commandExists('rec');
      return true;
    } catch {
      // commandExists rejects when the binary is not on PATH.
      return false;
    }
  }

  /**
   * Starts capturing microphone audio.
   *
   * Emits `start` once the subprocess is running, then `data` with raw PCM
   * chunks (16 kHz, signed 16-bit, mono). No-op if already recording.
   *
   * @throws If SoX (`rec`) is not installed.
   */
  async start(): Promise<void> {
    if (this.isRecordingInternal) return;
    this.isRecordingInternal = true;

    try {
      const available = await AudioRecorder.isAvailable();
      if (!this.isRecordingInternal) return; // Check if stopped while checking availability

      if (!available) {
        throw new Error(
          'The `rec` command (provided by SoX) is required for voice mode. Please install SoX (e.g., `brew install sox` on macOS or `sudo apt install sox libsox-fmt-all` on Linux).',
        );
      }

      // rec -q -V0 -e signed -c 1 -b 16 -r 16000 -t raw -
      // (quiet, no progress meter, signed 16-bit mono @ 16kHz, raw PCM to stdout)
      this.recProcess = spawn('rec', [
        '-q',
        '-V0',
        '-e',
        'signed',
        '-c',
        '1',
        '-b',
        '16',
        '-r',
        '16000',
        '-t',
        'raw',
        '-',
      ]);

      // stop() may have been called between the availability check and the
      // spawn; if so, tear the process down immediately.
      if (!this.isRecordingInternal) {
        this.recProcess.kill('SIGTERM');
        this.recProcess = null;
        return;
      }

      this.recProcess.stdout.on('data', (data: Buffer) => {
        this.emit('data', data);
      });

      this.recProcess.stderr.on('data', (_data: Buffer) => {
        // rec might print warnings to stderr, we could log them or ignore
        // console.warn(`rec stderr: ${data.toString()}`);
      });

      this.recProcess.on('error', (err) => {
        this.emit('error', err);
        this.stop();
      });

      // If the subprocess exits on its own, fall back to the stopped state.
      // stop()'s flag guard prevents re-entry when stop() itself killed it.
      this.recProcess.on('close', () => {
        this.stop();
      });

      this.emit('start');
    } catch (err) {
      // Roll back the flag so a later start() can retry.
      this.isRecordingInternal = false;
      throw err;
    }
  }

  /**
   * Stops capturing and kills the `rec` subprocess. Emits `stop`.
   * Safe to call multiple times; the flag guard makes repeat calls no-ops.
   */
  stop(): void {
    if (!this.isRecordingInternal) return;
    this.isRecordingInternal = false;

    if (this.recProcess) {
      this.recProcess.kill('SIGTERM');
      this.recProcess = null;
    }

    this.emit('stop');
  }
}
|
||||
@@ -0,0 +1,178 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import WebSocket from 'ws';
|
||||
import { EventEmitter, once } from 'node:events';
|
||||
import { debugLogger } from '../utils/debugLogger.js';
|
||||
import type {
|
||||
TranscriptionProvider,
|
||||
TranscriptionEvents,
|
||||
} from './transcriptionProvider.js';
|
||||
|
||||
import { z } from 'zod';
|
||||
|
||||
// Runtime validator for messages arriving on the Live API WebSocket.
// Every field is optional because the server interleaves setup acks,
// transcriptions, and model turns as separate messages.
const LiveAPIResponseSchema = z.object({
  setupComplete: z.record(z.unknown()).optional(),
  serverContent: z
    .object({
      turnComplete: z.boolean().optional(),
      // Transcription of the user's input audio (the part voice mode uses).
      inputTranscription: z
        .object({
          text: z.string().optional(),
        })
        .optional(),
      outputTranscription: z
        .object({
          text: z.string().optional(),
        })
        .optional(),
      // Model response parts; inlineData carries base64-encoded payloads.
      modelTurn: z
        .object({
          parts: z
            .array(
              z.object({
                text: z.string().optional(),
                inlineData: z
                  .object({
                    data: z.string(),
                  })
                  .optional(),
              }),
            )
            .optional(),
        })
        .optional(),
    })
    .optional(),
});
|
||||
|
||||
/**
 * Connects to the Gemini Live API using raw WebSockets to support API Key authentication.
 *
 * Audio chunks are streamed up via `sendAudioChunk`; `transcription` events
 * are emitted as the server returns input transcriptions, and `turnComplete`
 * marks the end of a speaking turn.
 */
export class GeminiLiveTranscriptionProvider
  extends EventEmitter<TranscriptionEvents>
  implements TranscriptionProvider
{
  // Active socket, or null when disconnected/closed.
  private ws: WebSocket | null = null;
  // Most recent input-transcription text received from the server.
  private currentTranscription = '';

  constructor(private readonly apiKey: string) {
    super();
  }

  /**
   * Opens the WebSocket, waits for it to connect, and sends the session
   * setup message.
   *
   * @throws If no API key was provided or the connection fails.
   */
  async connect(): Promise<void> {
    const modelName = 'gemini-3.1-flash-live-preview';
    const baseUrl =
      'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';

    if (!this.apiKey) {
      throw new Error('No API key provided');
    }

    // NOTE: The Generative Language WebSocket API requires the API key to be passed via the 'key' query parameter.
    const url = `${baseUrl}?key=${this.apiKey}`;
    debugLogger.debug(
      `[GeminiLiveTranscription] Connecting to model ${modelName} via raw WebSocket with API Key...`,
    );

    try {
      this.ws = new WebSocket(url, {
        maxPayload: 1 << 20, // 1MB limit for safety
      });

      this.ws.on('message', (data) => {
        try {
          const parsedData: unknown = JSON.parse(data.toString());
          // Validate the payload shape instead of trusting the server.
          const result = LiveAPIResponseSchema.safeParse(parsedData);

          if (result.success) {
            const response = result.data;
            if (response.serverContent) {
              const content = response.serverContent;

              if (content.turnComplete) {
                this.emit('turnComplete');
              }

              if (content.inputTranscription?.text) {
                const text = content.inputTranscription.text;
                debugLogger.debug(
                  `[GeminiLiveTranscription] Transcription received (Cloud): "${text}"`,
                );
                // Replaces (does not append) the stored text on each update;
                // presumably the server sends cumulative text — TODO confirm.
                this.currentTranscription = text;
                this.emit('transcription', this.currentTranscription);
              }
            }
          }
        } catch (e) {
          // Malformed JSON from the server; log and keep the session alive.
          debugLogger.error(
            '[GeminiLiveTranscription] Error parsing message:',
            e,
          );
        }
      });

      this.ws.on('error', (error) => {
        debugLogger.error('[GeminiLiveTranscription] WebSocket Error:', error);
        this.emit('error', error);
      });

      this.ws.on('close', (code, reason) => {
        debugLogger.debug(
          `[GeminiLiveTranscription] Connection Closed. Code: ${code}, Reason: ${reason}`,
        );
        this.emit('close');
        this.ws = null;
      });

      // Resolves on 'open'; rejects if an 'error' event fires first.
      await once(this.ws, 'open');

      // Configure the session: audio responses plus transcription of the
      // user's input audio.
      const setupMessage = {
        setup: {
          model: `models/${modelName}`,
          generation_config: {
            response_modalities: ['audio'],
          },
          input_audio_transcription: {},
        },
      };

      this.ws.send(JSON.stringify(setupMessage));
      this.currentTranscription = '';
    } catch (err) {
      debugLogger.error(
        '[GeminiLiveTranscription] Failed to establish connection:',
        err,
      );
      throw err;
    }
  }

  /** Forwards a raw PCM chunk to the Live API; no-op while not open. */
  sendAudioChunk(chunk: Buffer): void {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;

    const audioMessage = {
      realtime_input: {
        audio: {
          data: chunk.toString('base64'),
          mime_type: 'audio/pcm;rate=16000',
        },
      },
    };
    this.ws.send(JSON.stringify(audioMessage));
  }

  /** Latest transcription text for the session. */
  getTranscription(): string {
    return this.currentTranscription;
  }

  /** Closes the socket; the 'close' handler emits `close` and clears state. */
  disconnect(): void {
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
  }
}
|
||||
@@ -0,0 +1,41 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import * as path from 'node:path';
|
||||
import * as fs from 'node:fs';
|
||||
import { homedir, GEMINI_DIR } from '../utils/paths.js';
|
||||
import { GeminiLiveTranscriptionProvider } from './geminiLiveTranscriptionProvider.js';
|
||||
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
|
||||
import type { TranscriptionProvider } from './transcriptionProvider.js';
|
||||
|
||||
export class TranscriptionFactory {
|
||||
static createProvider(
|
||||
voiceConfig: { backend?: string; whisperModel?: string } | undefined,
|
||||
apiKey: string,
|
||||
): TranscriptionProvider {
|
||||
const backend = voiceConfig?.backend ?? 'gemini-live';
|
||||
|
||||
if (backend === 'whisper') {
|
||||
const modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
|
||||
if (!fs.existsSync(modelsDir)) {
|
||||
fs.mkdirSync(modelsDir, { recursive: true });
|
||||
}
|
||||
|
||||
const modelName = voiceConfig?.whisperModel ?? 'ggml-base.en.bin';
|
||||
const modelPath = path.join(modelsDir, modelName);
|
||||
|
||||
return new WhisperTranscriptionProvider({
|
||||
modelPath,
|
||||
threads: 4,
|
||||
step: 0,
|
||||
length: 5000,
|
||||
});
|
||||
}
|
||||
|
||||
// Default to Gemini Live
|
||||
return new GeminiLiveTranscriptionProvider(apiKey);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import type { EventEmitter } from 'node:events';
|
||||
|
||||
/** Event map shared by all transcription providers. */
export interface TranscriptionEvents {
  /** Emitted when partial or full transcription text is available. */
  transcription: [string];
  /** Emitted when a speaking turn is considered complete. */
  turnComplete: [];
  /** Emitted when an error occurs during transcription. */
  error: [Error];
  /** Emitted when the transcription service connection is closed. */
  close: [];
}
|
||||
|
||||
/**
 * Common interface for all transcription backends (Cloud or Local).
 *
 * Implementations are EventEmitters: consumers subscribe to the events in
 * {@link TranscriptionEvents} in addition to calling these methods.
 */
export interface TranscriptionProvider
  extends EventEmitter<TranscriptionEvents> {
  /** Establish connection to the transcription service. */
  connect(): Promise<void>;
  /** Send a chunk of raw audio data to the service. */
  sendAudioChunk(chunk: Buffer): void;
  /** Disconnect from the transcription service. */
  disconnect(): void;
  /** Get the current full transcription for the session. */
  getTranscription(): string;
}
|
||||
@@ -0,0 +1,107 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import * as path from 'node:path';
|
||||
import * as fs from 'node:fs';
|
||||
import { EventEmitter } from 'node:events';
|
||||
import { homedir, GEMINI_DIR } from '../utils/paths.js';
|
||||
import { debugLogger } from '../utils/debugLogger.js';
|
||||
|
||||
/** Snapshot of an in-flight model download, emitted via 'progress'. */
export interface WhisperModelProgress {
  /** File name of the model being downloaded. */
  modelName: string;
  /** Bytes received so far. */
  transferred: number;
  /** Total size in bytes from Content-Length (0 when unknown). */
  total: number;
  /** Completion as a 0..1 fraction (0 when total size is unknown). */
  percentage: number;
}
|
||||
|
||||
/** Event map for {@link WhisperModelProgress} download notifications. */
export interface WhisperModelManagerEvents {
  /** Emitted repeatedly during a download with cumulative byte counts. */
  progress: [WhisperModelProgress];
}
|
||||
|
||||
// Allow-list of downloadable whisper.cpp model file names. Anything outside
// this list is rejected by validateModelName, which also blocks path
// traversal / arbitrary URL components in the download path.
const ALLOWED_MODELS = [
  'ggml-tiny.en.bin',
  'ggml-base.en.bin',
  'ggml-large-v3-turbo-q5_0.bin',
  'ggml-large-v3-turbo-q8_0.bin',
];
|
||||
|
||||
/**
|
||||
* Manages Whisper models (checking existence, downloading).
|
||||
*/
|
||||
export class WhisperModelManager extends EventEmitter<WhisperModelManagerEvents> {
|
||||
private readonly modelsDir: string;
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
|
||||
}
|
||||
|
||||
isModelInstalled(modelName: string): boolean {
|
||||
this.validateModelName(modelName);
|
||||
return fs.existsSync(path.join(this.modelsDir, modelName));
|
||||
}
|
||||
|
||||
getModelPath(modelName: string): string {
|
||||
this.validateModelName(modelName);
|
||||
return path.join(this.modelsDir, modelName);
|
||||
}
|
||||
|
||||
async downloadModel(modelName: string): Promise<void> {
|
||||
this.validateModelName(modelName);
|
||||
|
||||
if (!fs.existsSync(this.modelsDir)) {
|
||||
fs.mkdirSync(this.modelsDir, { recursive: true });
|
||||
}
|
||||
|
||||
const destination = path.join(this.modelsDir, modelName);
|
||||
const url = `https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${modelName}`;
|
||||
|
||||
debugLogger.debug(
|
||||
`[WhisperModelManager] Downloading ${modelName} from ${url}`,
|
||||
);
|
||||
|
||||
const response = await fetch(url);
|
||||
if (!response.ok) {
|
||||
throw new Error(`Failed to download model: ${response.statusText}`);
|
||||
}
|
||||
|
||||
const total = parseInt(response.headers.get('content-length') || '0', 10);
|
||||
let transferred = 0;
|
||||
|
||||
const reader = response.body?.getReader();
|
||||
if (!reader) {
|
||||
throw new Error('Response body is not readable');
|
||||
}
|
||||
|
||||
const writer = fs.createWriteStream(destination);
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
if (done) break;
|
||||
|
||||
transferred += value.length;
|
||||
writer.write(value);
|
||||
|
||||
const percentage = total > 0 ? transferred / total : 0;
|
||||
this.emit('progress', {
|
||||
modelName,
|
||||
transferred,
|
||||
total,
|
||||
percentage,
|
||||
});
|
||||
}
|
||||
} finally {
|
||||
writer.end();
|
||||
}
|
||||
}
|
||||
|
||||
private validateModelName(modelName: string): void {
|
||||
if (!ALLOWED_MODELS.includes(modelName)) {
|
||||
throw new Error(`Unauthorized model name: ${modelName}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
||||
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
|
||||
import commandExists from 'command-exists';
|
||||
|
||||
vi.mock('command-exists', () => ({
|
||||
default: vi.fn(),
|
||||
}));
|
||||
|
||||
describe('WhisperTranscriptionProvider', () => {
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
it('should throw a friendly error if whisper-stream is not available', async () => {
|
||||
vi.mocked(commandExists).mockRejectedValue(new Error('not found'));
|
||||
|
||||
const provider = new WhisperTranscriptionProvider({
|
||||
modelPath: 'test-model.bin',
|
||||
});
|
||||
|
||||
await expect(provider.connect()).rejects.toThrow(
|
||||
'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
|
||||
);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,199 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
|
||||
import { EventEmitter } from 'node:events';
|
||||
import commandExists from 'command-exists';
|
||||
import { debugLogger } from '../utils/debugLogger.js';
|
||||
import type {
|
||||
TranscriptionProvider,
|
||||
TranscriptionEvents,
|
||||
} from './transcriptionProvider.js';
|
||||
|
||||
/** Tunables for the local `whisper-stream` subprocess. */
export interface WhisperProviderOptions {
  /** Filesystem path to the ggml model file. */
  modelPath: string;
  /** Worker threads passed via `-t` (default 4). */
  threads?: number;
  /** `--step` value in ms; 0 selects sliding-window VAD mode (default 0). */
  step?: number;
  /** `--length` audio window in ms (default 5000). */
  length?: number;
}
|
||||
|
||||
/**
|
||||
* Local transcription provider using `whisper-stream` from whisper.cpp.
|
||||
*
|
||||
* Uses the Sliding Window Mode with VAD (--step 0) for stable,
|
||||
* non-overlapping transcription blocks that can be appended directly.
|
||||
*/
|
||||
export class WhisperTranscriptionProvider
|
||||
extends EventEmitter<TranscriptionEvents>
|
||||
implements TranscriptionProvider
|
||||
{
|
||||
private process: ChildProcessWithoutNullStreams | null = null;
|
||||
private currentTranscription = '';
|
||||
|
||||
constructor(private readonly options: WhisperProviderOptions) {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if `whisper-stream` is available on the system.
|
||||
*/
|
||||
static async isAvailable(): Promise<boolean> {
|
||||
try {
|
||||
await commandExists('whisper-stream');
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
async connect(): Promise<void> {
|
||||
const { modelPath, threads = 4, step = 0, length = 5000 } = this.options;
|
||||
|
||||
this.currentTranscription = '';
|
||||
|
||||
const available = await WhisperTranscriptionProvider.isAvailable();
|
||||
if (!available) {
|
||||
return Promise.reject(
|
||||
new Error(
|
||||
'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
debugLogger.debug(
|
||||
`[WhisperTranscription] Starting whisper-stream with model: ${modelPath} (VAD mode: step=${step}, length=${length})`,
|
||||
);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
let isResolved = false;
|
||||
|
||||
try {
|
||||
// whisper-stream -m <model_path> -t <threads> --step 0 --length <length> -vth 0.6
|
||||
// Setting step == 0 enables sliding window mode with VAD, which outputs
|
||||
// non-overlapping transcription blocks suitable for appending.
|
||||
this.process = spawn('whisper-stream', [
|
||||
'-m',
|
||||
modelPath,
|
||||
'-t',
|
||||
threads.toString(),
|
||||
'--step',
|
||||
step.toString(),
|
||||
'--length',
|
||||
length.toString(),
|
||||
'-vth',
|
||||
'0.6',
|
||||
]);
|
||||
|
||||
this.process.stdout.on('data', (data: Buffer) => {
|
||||
const output = data.toString();
|
||||
this.parseOutput(output);
|
||||
});
|
||||
|
||||
this.process.stderr.on('data', (data: Buffer) => {
|
||||
const msg = data.toString();
|
||||
if (msg.includes('error')) {
|
||||
debugLogger.error(`[WhisperTranscription] stderr: ${msg}`);
|
||||
if (!isResolved) {
|
||||
isResolved = true;
|
||||
reject(new Error(msg));
|
||||
}
|
||||
}
|
||||
|
||||
// whisper-stream prints "whisper_init_from_file_with_params_no_state: loading model from..."
|
||||
// and finally "main: processing, press Ctrl+C to stop" when ready.
|
||||
if (!isResolved && msg.includes('main: processing')) {
|
||||
debugLogger.debug('[WhisperTranscription] whisper-stream is ready');
|
||||
isResolved = true;
|
||||
resolve();
|
||||
}
|
||||
});
|
||||
|
||||
this.process.on('error', (err) => {
|
||||
debugLogger.error('[WhisperTranscription] Process error:', err);
|
||||
this.emit('error', err);
|
||||
if (!isResolved) {
|
||||
isResolved = true;
|
||||
reject(err);
|
||||
}
|
||||
});
|
||||
|
||||
this.process.on('close', (code) => {
|
||||
debugLogger.debug(
|
||||
`[WhisperTranscription] Process closed with code ${code}`,
|
||||
);
|
||||
this.emit('close');
|
||||
this.process = null;
|
||||
});
|
||||
|
||||
// Fallback timeout in case "main: processing" is never seen
|
||||
setTimeout(() => {
|
||||
if (!isResolved) {
|
||||
debugLogger.warn(
|
||||
'[WhisperTranscription] Connection timeout (fallback resolve)',
|
||||
);
|
||||
isResolved = true;
|
||||
resolve();
|
||||
}
|
||||
}, 10000);
|
||||
} catch (err) {
|
||||
debugLogger.error(
|
||||
'[WhisperTranscription] Failed to spawn process:',
|
||||
err,
|
||||
);
|
||||
if (!isResolved) {
|
||||
isResolved = true;
|
||||
reject(err);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private parseOutput(output: string): void {
|
||||
// whisper-stream output format: "[00:00:00.000 --> 00:00:02.000] Hello world."
|
||||
const lines = output.split('\n');
|
||||
|
||||
for (const line of lines) {
|
||||
const match = line.match(/\[.* --> .*\]\s+(.*)/);
|
||||
if (match && match[1]) {
|
||||
let text = match[1].trim();
|
||||
|
||||
// Filter out [Silence], [music], (laughter), etc.
|
||||
text = text
|
||||
.replace(/\[[^\]]*\]/g, '')
|
||||
.replace(/\([^)]*\)/g, '')
|
||||
.trim();
|
||||
|
||||
if (text) {
|
||||
// In VAD mode (step=0), each line is a completed speech block.
|
||||
// Append it to the buffer to ensure it doesn't disappear.
|
||||
this.currentTranscription = this.currentTranscription
|
||||
? `${this.currentTranscription} ${text}`
|
||||
: text;
|
||||
|
||||
debugLogger.debug(
|
||||
`[WhisperTranscription] Transcription updated (Local-VAD): "${this.currentTranscription}"`,
|
||||
);
|
||||
this.emit('transcription', this.currentTranscription);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sendAudioChunk(_chunk: Buffer): void {
|
||||
// whisper-stream handles its own audio capture.
|
||||
}
|
||||
|
||||
getTranscription(): string {
|
||||
return this.currentTranscription;
|
||||
}
|
||||
|
||||
disconnect(): void {
|
||||
if (this.process) {
|
||||
this.process.kill('SIGTERM');
|
||||
this.process = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user