feat(voice): implement real-time voice mode with cloud and local backends (#24174)

Abhijit Balaji
2026-04-24 14:29:38 -07:00
committed by GitHub
parent 048bf6e514
commit 2e0641c83b
40 changed files with 2244 additions and 43 deletions
+1
@@ -1000,6 +1000,7 @@ export async function loadCliConfig(
enableExtensionReloading: settings.experimental?.extensionReloading,
enableAgents: settings.experimental?.enableAgents,
plan: settings.general?.plan?.enabled ?? true,
voiceMode: settings.experimental?.voiceMode,
tracker: settings.experimental?.taskTracker,
directWebFetch: settings.experimental?.directWebFetch,
planSettings: settings.general?.plan?.directory
+81
@@ -2061,6 +2061,87 @@ const SETTINGS_SCHEMA = {
description: 'Enable access to Gemma 4 models (experimental).',
showInDialog: true,
},
voiceMode: {
type: 'boolean',
label: 'Voice Mode',
category: 'Experimental',
requiresRestart: false,
default: false,
description:
'Enable experimental voice dictation and commands (/voice, /voice model).',
showInDialog: true,
},
voice: {
type: 'object',
label: 'Voice',
category: 'Experimental',
requiresRestart: false,
default: {},
description: 'Settings for voice mode and transcription.',
showInDialog: false,
properties: {
activationMode: {
type: 'enum',
label: 'Voice Activation Mode',
category: 'Experimental',
requiresRestart: false,
default: 'push-to-talk',
description: 'How to trigger voice recording with the Space key.',
showInDialog: true,
options: [
{ value: 'push-to-talk', label: 'Push-To-Talk (Hold Space)' },
{ value: 'toggle', label: 'Toggle (Press Space to start/stop)' },
],
},
backend: {
type: 'enum',
label: 'Voice Transcription Backend',
category: 'Experimental',
requiresRestart: false,
default: 'gemini-live',
description: 'The backend to use for voice transcription.',
showInDialog: true,
options: [
{ value: 'gemini-live', label: 'Gemini Live API (Cloud)' },
{ value: 'whisper', label: 'Whisper (Local)' },
],
},
whisperModel: {
type: 'enum',
label: 'Whisper Model',
category: 'Experimental',
requiresRestart: false,
default: 'ggml-base.en.bin',
description: 'The Whisper model to use for local transcription.',
showInDialog: true,
options: [
{ value: 'ggml-tiny.en.bin', label: 'Tiny (EN) - Fast (~75MB)' },
{
value: 'ggml-base.en.bin',
label: 'Base (EN) - Balanced (~142MB)',
},
{
value: 'ggml-large-v3-turbo-q5_0.bin',
label: 'Large v3 Turbo (Q5_0) - High Accuracy (~547MB)',
},
{
value: 'ggml-large-v3-turbo-q8_0.bin',
label: 'Large v3 Turbo (Q8_0) - Max Accuracy (~834MB)',
},
],
},
stopGracePeriodMs: {
type: 'number',
label: 'Voice Stop Grace Period (ms)',
category: 'Experimental',
requiresRestart: false,
default: 1000,
description:
'How long to wait for final transcription after stopping recording.',
showInDialog: true,
},
},
},
adk: {
type: 'object',
label: 'ADK',
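Taken together, the schema entries above imply merged settings shaped roughly like the following sketch (hypothetical values; the exact nesting is inferred from the schema, not shown in this diff):
// Hypothetical settings object inferred from the schema above (not part of the diff):
// a user opting into local Whisper transcription with toggle activation.
const exampleVoiceSettings = {
  experimental: {
    voiceMode: true, // gates the /voice command
    voice: {
      activationMode: 'toggle', // default: 'push-to-talk'
      backend: 'whisper', // default: 'gemini-live'
      whisperModel: 'ggml-base.en.bin',
      stopGracePeriodMs: 1000, // drain window for final transcription
    },
  },
};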
@@ -170,6 +170,7 @@ describe('BuiltinCommandLoader', () => {
getAllSkills: vi.fn().mockReturnValue([]),
isAdminEnabled: vi.fn().mockReturnValue(true),
}),
isVoiceModeEnabled: vi.fn().mockReturnValue(true),
getContentGeneratorConfig: vi.fn().mockReturnValue({
authType: 'other',
}),
@@ -396,6 +397,7 @@ describe('BuiltinCommandLoader profile', () => {
getAllSkills: vi.fn().mockReturnValue([]),
isAdminEnabled: vi.fn().mockReturnValue(true),
}),
isVoiceModeEnabled: vi.fn().mockReturnValue(true),
getContentGeneratorConfig: vi.fn().mockReturnValue({
authType: 'other',
}),
@@ -62,6 +62,7 @@ import { setupGithubCommand } from '../ui/commands/setupGithubCommand.js';
import { terminalSetupCommand } from '../ui/commands/terminalSetupCommand.js';
import { upgradeCommand } from '../ui/commands/upgradeCommand.js';
import { gemmaStatusCommand } from '../ui/commands/gemmaStatusCommand.js';
import { voiceCommand } from '../ui/commands/voiceCommand.js';
/**
* Loads the core, hard-coded slash commands that are an integral part
@@ -227,6 +228,7 @@ export class BuiltinCommandLoader implements ICommandLoader {
vimCommand,
setupGithubCommand,
terminalSetupCommand,
...(this.config?.isVoiceModeEnabled() ? [voiceCommand] : []),
...(this.config?.getContentGeneratorConfig()?.authType ===
AuthType.LOGIN_WITH_GOOGLE
? [upgradeCommand]
+3
@@ -552,6 +552,8 @@ const mockUIActions: UIActions = {
exitPrivacyNotice: vi.fn(),
closeSettingsDialog: vi.fn(),
closeModelDialog: vi.fn(),
openVoiceModelDialog: vi.fn(),
closeVoiceModelDialog: vi.fn(),
openAgentConfigDialog: vi.fn(),
closeAgentConfigDialog: vi.fn(),
openPermissionsDialog: vi.fn(),
@@ -598,6 +600,7 @@ const mockUIActions: UIActions = {
handleNewAgentsSelect: vi.fn(),
getPreferredEditor: vi.fn(),
clearAccountSuspension: vi.fn(),
setVoiceModeEnabled: vi.fn(),
};
import { type TextBuffer } from '../ui/components/shared/text-buffer.js';
+24
@@ -103,6 +103,7 @@ import { useQuotaAndFallback } from './hooks/useQuotaAndFallback.js';
import { useEditorSettings } from './hooks/useEditorSettings.js';
import { useSettingsCommand } from './hooks/useSettingsCommand.js';
import { useModelCommand } from './hooks/useModelCommand.js';
import { useVoiceModelCommand } from './hooks/useVoiceModelCommand.js';
import { useSlashCommandProcessor } from './hooks/slashCommandProcessor.js';
import { useVimMode } from './contexts/VimModeContext.js';
import {
@@ -312,6 +313,7 @@ export const AppContainer = (props: AppContainerProps) => {
);
const [shellModeActive, setShellModeActive] = useState(false);
const [isVoiceModeEnabled, setVoiceModeEnabled] = useState(false);
const [modelSwitchedFromQuotaError, setModelSwitchedFromQuotaError] =
useState<boolean>(false);
const [historyRemountKey, setHistoryRemountKey] = useState(0);
@@ -946,6 +948,12 @@ Logging in with Google... Restarting Gemini CLI to continue.
const { isModelDialogOpen, openModelDialog, closeModelDialog } =
useModelCommand();
const {
isVoiceModelDialogOpen,
openVoiceModelDialog,
closeVoiceModelDialog,
} = useVoiceModelCommand();
const { toggleVimEnabled } = useVimMode();
const setIsBackgroundTaskListOpenRef = useRef<(open: boolean) => void>(
@@ -969,6 +977,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
openSettingsDialog,
openSessionBrowser,
openModelDialog,
openVoiceModelDialog,
openAgentConfigDialog,
openPermissionsDialog,
quit: (messages: HistoryItem[]) => {
@@ -981,6 +990,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
},
setDebugMessage,
toggleCorgiMode: () => setCorgiMode((prev) => !prev),
toggleVoiceMode: () => setVoiceModeEnabled((prev) => !prev),
toggleDebugProfiler,
dispatchExtensionStateUpdate,
addConfirmUpdateExtensionRequest,
@@ -1006,6 +1016,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
openSettingsDialog,
openSessionBrowser,
openModelDialog,
openVoiceModelDialog,
openAgentConfigDialog,
setQuittingMessages,
setDebugMessage,
@@ -2191,6 +2202,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
isThemeDialogOpen ||
isSettingsDialogOpen ||
isModelDialogOpen ||
isVoiceModelDialogOpen ||
isAgentConfigDialogOpen ||
isPermissionsDialogOpen ||
isAuthenticating ||
@@ -2448,6 +2460,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
isSettingsDialogOpen,
isSessionBrowserOpen,
isModelDialogOpen,
isVoiceModelDialogOpen,
isAgentConfigDialogOpen,
selectedAgentName,
selectedAgentDisplayName,
@@ -2468,6 +2481,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
pendingGeminiHistoryItems,
thought,
isInputActive,
isVoiceModeEnabled,
isResuming,
shouldShowIdePrompt,
isFolderTrustDialogOpen: isFolderTrustDialogOpen ?? false,
@@ -2559,6 +2573,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
isSettingsDialogOpen,
isSessionBrowserOpen,
isModelDialogOpen,
isVoiceModelDialogOpen,
isAgentConfigDialogOpen,
selectedAgentName,
selectedAgentDisplayName,
@@ -2579,6 +2594,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
pendingGeminiHistoryItems,
thought,
isInputActive,
isVoiceModeEnabled,
isResuming,
shouldShowIdePrompt,
isFolderTrustDialogOpen,
@@ -2671,6 +2687,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
exitPrivacyNotice,
closeSettingsDialog,
closeModelDialog,
openVoiceModelDialog,
closeVoiceModelDialog,
openAgentConfigDialog,
closeAgentConfigDialog,
openPermissionsDialog,
@@ -2751,6 +2769,9 @@ Logging in with Google... Restarting Gemini CLI to continue.
setAccountSuspensionInfo(null);
setAuthState(AuthState.Updating);
},
setVoiceModeEnabled: (value: boolean) => {
setVoiceModeEnabled(value);
},
}),
[
handleThemeSelect,
@@ -2764,6 +2785,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
exitPrivacyNotice,
closeSettingsDialog,
closeModelDialog,
openVoiceModelDialog,
closeVoiceModelDialog,
openAgentConfigDialog,
closeAgentConfigDialog,
openPermissionsDialog,
@@ -2807,6 +2830,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
config,
historyManager,
getPreferredEditor,
setVoiceModeEnabled,
],
);
+2
@@ -72,6 +72,7 @@ export interface CommandContext {
loadHistory: (history: HistoryItem[], postLoadInput?: string) => void;
/** Toggles a special display mode. */
toggleCorgiMode: () => void;
toggleVoiceMode: () => void;
toggleDebugProfiler: () => void;
toggleVimEnabled: () => Promise<boolean>;
reloadCommands: () => void;
@@ -125,6 +126,7 @@ export interface OpenDialogActionReturn {
| 'settings'
| 'sessionBrowser'
| 'model'
| 'voice-model'
| 'agentConfig'
| 'permissions';
}
@@ -0,0 +1,30 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { CommandKind, type SlashCommand } from './types.js';
export const voiceCommand: SlashCommand = {
name: 'voice',
altNames: [],
description: 'Toggle voice dictation mode',
kind: CommandKind.BUILT_IN,
autoExecute: true,
action: (context) => {
context.ui.toggleVoiceMode();
},
subCommands: [
{
name: 'model',
description: 'Manage voice transcription models',
kind: CommandKind.BUILT_IN,
autoExecute: true,
action: async () => ({
type: 'dialog',
dialog: 'voice-model',
}),
},
],
};
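A minimal test sketch for this command (hypothetical, not part of the diff; assumes the (context, args) action signature and vitest mocks used elsewhere in this repo):
// Hypothetical test, mirroring the command shape above.
it('voice toggles voice mode via the UI context', () => {
  const toggleVoiceMode = vi.fn();
  const context = { ui: { toggleVoiceMode } } as unknown as CommandContext;
  voiceCommand.action?.(context, '');
  expect(toggleVoiceMode).toHaveBeenCalledTimes(1);
});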
@@ -25,6 +25,7 @@ import { relaunchApp } from '../../utils/processUtils.js';
import { SessionBrowser } from './SessionBrowser.js';
import { PermissionsModifyTrustDialog } from './PermissionsModifyTrustDialog.js';
import { ModelDialog } from './ModelDialog.js';
import { VoiceModelDialog } from './VoiceModelDialog.js';
import { theme } from '../semantic-colors.js';
import { useUIState } from '../contexts/UIStateContext.js';
import { useQuotaState } from '../contexts/QuotaContext.js';
@@ -238,6 +239,9 @@ export const DialogManager = ({
if (uiState.isModelDialogOpen) {
return <ModelDialog onClose={uiActions.closeModelDialog} />;
}
if (uiState.isVoiceModelDialogOpen) {
return <VoiceModelDialog onClose={uiActions.closeVoiceModelDialog} />;
}
if (
uiState.isAgentConfigDialogOpen &&
uiState.selectedAgentName &&
@@ -9,12 +9,41 @@ import { createMockSettings } from '../../test-utils/settings.js';
import { makeFakeConfig } from '@google/gemini-cli-core';
import { waitFor } from '../../test-utils/async.js';
import { act, useState, useMemo } from 'react';
import type { EventEmitter } from 'node:events';
const { fakeTranscriptionProvider } = vi.hoisted(() => {
// Use require within hoisted block for immediate synchronous access
// eslint-disable-next-line @typescript-eslint/no-require-imports, no-restricted-syntax
const { EventEmitter } = require('node:events');
class FakeTranscriptionProvider extends EventEmitter {
connect = vi.fn().mockResolvedValue(undefined);
disconnect = vi.fn();
sendAudioChunk = vi.fn();
getTranscription = vi.fn().mockReturnValue('');
}
return {
fakeTranscriptionProvider: new FakeTranscriptionProvider(),
};
});
vi.mock('@google/gemini-cli-core', async (importOriginal) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const actual = (await importOriginal()) as any;
return {
...actual,
TranscriptionFactory: {
createProvider: vi.fn(() => fakeTranscriptionProvider),
},
};
});
import {
InputPrompt,
tryTogglePasteExpansion,
type InputPromptProps,
} from './InputPrompt.js';
import { InputContext } from '../contexts/InputContext.js';
import { type UIState } from '../contexts/UIStateContext.js';
import {
calculateTransformationsForLine,
calculateTransformedLine,
@@ -417,6 +446,7 @@ describe('InputPrompt', () => {
getWorkspaceContext: () => ({
getDirectories: () => ['/test/project/src'],
}),
getContentGeneratorConfig: () => ({ apiKey: 'test-api-key' }),
} as unknown as Config,
slashCommands: mockSlashCommands,
commandContext: mockCommandContext,
@@ -4925,6 +4955,383 @@ describe('InputPrompt', () => {
},
);
});
describe('Voice Mode', () => {
beforeEach(() => {
(
fakeTranscriptionProvider as unknown as EventEmitter
).removeAllListeners();
vi.clearAllMocks();
});
it('should start recording when space is pressed and voice mode is enabled (toggle)', async () => {
await act(async () => {
mockBuffer.setText('');
});
const { stdin, unmount, lastFrame } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'toggle' } },
}),
},
);
// Initially not recording
expect(lastFrame()).not.toContain('🎙️ Listening...');
expect(lastFrame()).toContain(
'Voice mode: Space to start/stop recording',
);
// Press space to start
await act(async () => {
stdin.write(' ');
});
// Now should show listening
await waitFor(() => {
expect(lastFrame()).toContain('🎙️ Listening...');
});
unmount();
});
it('should toggle recording off when space is pressed again (toggle)', async () => {
await act(async () => {
mockBuffer.setText('');
});
const { stdin, unmount, lastFrame } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'toggle' } },
}),
},
);
// Start recording
await act(async () => {
stdin.write(' ');
});
await waitFor(() => {
expect(lastFrame()).toContain('🎙️ Listening...');
});
// Stop recording
await act(async () => {
stdin.write(' ');
});
await waitFor(() => {
expect(lastFrame()).not.toContain('🎙️ Listening...');
expect(lastFrame()).toContain(
'Voice mode: Space to start/stop recording',
);
});
unmount();
});
it('should resume recording when space is pressed even if buffer is not empty (toggle)', async () => {
await act(async () => {
mockBuffer.setText('some existing text');
});
const { stdin, unmount, lastFrame } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'toggle' } },
}),
},
);
// Should show voice mode hint even if buffer is not empty (new behavior)
expect(lastFrame()).toContain(
'Voice mode: Space to start/stop recording',
);
expect(lastFrame()).toContain('some existing text');
// Press space to start recording again
await act(async () => {
stdin.write(' ');
});
await waitFor(() => {
expect(lastFrame()).toContain('🎙️ Listening...');
});
unmount();
});
it('should not start recording if voice mode is disabled (toggle)', async () => {
await act(async () => {
mockBuffer.setText('');
});
const { stdin, unmount, lastFrame } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: false } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'toggle' } },
}),
},
);
// Press space
await act(async () => {
stdin.write(' ');
});
// Should NOT show listening; the space should instead fall through to the buffer's handleInput
expect(lastFrame()).not.toContain('🎙️ Listening...');
expect(mockBuffer.handleInput).toHaveBeenCalled();
unmount();
});
it('should append transcription correctly across multiple turn updates (toggle)', async () => {
await act(async () => {
mockBuffer.setText('initial');
});
const { stdin, unmount } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'toggle' } },
}),
},
);
// Start recording
await act(async () => {
stdin.write(' ');
});
// Emit first transcription
await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
'transcription',
'hello',
);
});
await waitFor(() => {
expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
});
// Emit turnComplete (Gemini Live starts over after this)
await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
'turnComplete',
);
});
// Emit second part (Gemini Live sends new turn text starting from empty)
await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
'transcription',
'world',
);
});
await waitFor(() => {
// Should have appended 'world' to the baseline 'initial hello'
expect(mockBuffer.setText).toHaveBeenCalledWith(
'initial hello world',
'end',
);
});
unmount();
});
it('should append transcription correctly when resuming voice mode (toggle)', async () => {
await act(async () => {
mockBuffer.setText('First turn.');
});
const { stdin, unmount } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'toggle' } },
}),
},
);
// Start recording (resumed)
await act(async () => {
stdin.write(' ');
});
// Emit transcription
await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
'transcription',
'Second turn.',
);
});
await waitFor(() => {
expect(mockBuffer.setText).toHaveBeenCalledWith(
'First turn. Second turn.',
'end',
);
});
unmount();
});
describe('push-to-talk', () => {
beforeEach(() => {
vi.useFakeTimers();
});
afterEach(() => {
vi.useRealTimers();
});
it('should insert a space on a single tap', async () => {
const { stdin, unmount, lastFrame } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'push-to-talk' } },
}),
},
);
expect(lastFrame()).toContain('Voice mode: Hold Space to record');
// Press space once
await act(async () => {
stdin.write(' ');
});
// Should insert space optimistically
expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
expect(lastFrame()).not.toContain('🎙️ Listening...');
// Advance timer past HOLD_DELAY_MS
await act(async () => {
vi.advanceTimersByTime(700);
});
expect(lastFrame()).not.toContain('🎙️ Listening...');
unmount();
});
it('should start recording on hold (simulated by repeat spaces)', async () => {
const { stdin, unmount, lastFrame } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'push-to-talk' } },
}),
},
);
// First space
await act(async () => {
stdin.write(' ');
});
expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
// Second space (repeat)
await act(async () => {
stdin.write(' ');
});
await waitFor(() => {
// Should have backspaced the optimistic space
expect(mockBuffer.backspace).toHaveBeenCalled();
// Should show listening
expect(lastFrame()).toContain('🎙️ Listening...');
});
unmount();
});
it('should stop recording when space heartbeat stops (release)', async () => {
const { stdin, unmount, lastFrame } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'push-to-talk' } },
}),
},
);
// Start hold
await act(async () => {
stdin.write(' ');
stdin.write(' ');
});
// Use a short interval in waitFor to prevent advancing fake timers past the 300ms RELEASE_DELAY_MS
await waitFor(
() => {
expect(lastFrame()).toContain('🎙️ Listening...');
},
{ interval: 10 },
);
// Simulate heartbeat (held key) - send space first to reset timer, then advance
await act(async () => {
stdin.write(' ');
vi.advanceTimersByTime(100);
});
expect(lastFrame()).toContain('🎙️ Listening...');
// Stop heartbeat (release)
await act(async () => {
vi.advanceTimersByTime(400); // Past RELEASE_DELAY_MS
});
await waitFor(() => {
expect(lastFrame()).not.toContain('🎙️ Listening...');
});
unmount();
});
it('should cancel hold state if non-space key is pressed after first space', async () => {
const { stdin, unmount } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'push-to-talk' } },
}),
},
);
// First space
await act(async () => {
stdin.write(' ');
});
// Type 'a'
await act(async () => {
stdin.write('a');
});
// Should NOT start recording on next space even if fast
await act(async () => {
stdin.write(' ');
});
expect(mockBuffer.insert).toHaveBeenCalledTimes(2); // Two spaces inserted
expect(mockBuffer.handleInput).toHaveBeenCalledWith(
expect.objectContaining({ name: 'a' }),
);
unmount();
});
});
});
});
function clean(str: string | undefined): string {
+53 -16
@@ -56,6 +56,7 @@ import {
debugLogger,
type Config,
} from '@google/gemini-cli-core';
import { useVoiceMode } from '../hooks/useVoiceMode.js';
import {
parseInputForHighlighting,
parseSegmentsFromTokens,
@@ -159,7 +160,6 @@ export function isLargePaste(text: string): boolean {
}
const DOUBLE_TAB_CLEAN_UI_TOGGLE_WINDOW_MS = 350;
/**
* Attempt to toggle expansion of a paste placeholder in the buffer.
* Returns true if a toggle action was performed or hint was shown, false otherwise.
@@ -238,6 +238,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
setEmbeddedShellFocused,
setShortcutsHelpVisible,
toggleCleanUiDetailsVisible,
setVoiceModeEnabled,
} = useUIActions();
const {
terminalWidth,
@@ -246,6 +247,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
backgroundTasks,
backgroundTaskHeight,
shortcutsHelpVisible,
isVoiceModeEnabled,
} = useUIState();
const [suppressCompletion, setSuppressCompletion] = useState(false);
const { handlePress: registerPlainTabPress, resetCount: resetPlainTabPress } =
@@ -263,6 +265,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
resetEscapeState();
if (buffer.text.length > 0) {
buffer.setText('');
resetTurnBaseline();
resetCompletionState();
} else if (history.length > 0) {
onSubmit('/rewind');
@@ -281,6 +284,16 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
const hasUserNavigatedSuggestions = useRef(false);
const listRef = useRef<ScrollableListRef<ScrollableItem>>(null);
const { isRecording, handleVoiceInput, resetTurnBaseline } = useVoiceMode({
buffer,
config,
settings,
setQueueErrorMessage,
isVoiceModeEnabled,
setVoiceModeEnabled,
keyMatchers,
});
const [reverseSearchActive, setReverseSearchActive] = useState(false);
const [commandSearchActive, setCommandSearchActive] = useState(false);
const [textBeforeReverseSearch, setTextBeforeReverseSearch] = useState('');
@@ -387,6 +400,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
// Clear the buffer *before* calling onSubmit to prevent potential re-submission
// if onSubmit triggers a re-render while the buffer still holds the old value.
buffer.setText('');
resetTurnBaseline();
onSubmit(processedValue);
resetCompletionState();
resetReverseSearchCompletionState();
@@ -398,6 +412,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
shellModeActive,
shellHistory,
resetReverseSearchCompletionState,
resetTurnBaseline,
],
);
@@ -647,6 +662,8 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
const handleInput = useCallback(
(key: Key) => {
if (handleVoiceInput(key)) return true;
// Determine if this keypress is a history navigation command
const isHistoryUp =
!shellModeActive &&
@@ -873,9 +890,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
) {
setShellModeActive(!shellModeActive);
buffer.setText(''); // Clear the '!' from input
resetTurnBaseline();
return true;
}
if (keyMatchers[Command.ESCAPE](key)) {
const cancelSearch = (
setActive: (active: boolean) => void,
@@ -1360,6 +1377,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
backgroundTaskHeight,
streamingState,
handleEscPress,
resetTurnBaseline,
registerPlainTabPress,
resetPlainTabPress,
toggleCleanUiDetailsVisible,
@@ -1369,9 +1387,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
keyMatchers,
isHelpDismissKey,
settings,
handleVoiceInput,
],
);
useKeypress(handleInput, {
isActive: !isEmbeddedShellFocused && !copyModeEnabled,
priority: true,
@@ -1792,20 +1810,39 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
)}{' '}
</Text>
<Box flexGrow={1} flexDirection="column" ref={innerBoxRef}>
{isRecording && (
<Box flexDirection="row" marginBottom={0}>
<Text color={theme.status.success}>🎙️ Listening...</Text>
</Box>
)}
{isVoiceModeEnabled && !isRecording && (
<Box flexDirection="row" marginBottom={0}>
<Text color={theme.text.secondary}>
&gt; Voice mode:{' '}
{(settings.experimental.voice?.activationMode ??
'push-to-talk') === 'push-to-talk'
? 'Hold Space to record'
: 'Space to start/stop recording'}{' '}
(Esc to exit)
</Text>
</Box>
)}
{buffer.text.length === 0 && !isRecording ? (
!isVoiceModeEnabled && placeholder ? (
showCursor ? (
<Text
terminalCursorFocus={showCursor}
terminalCursorPosition={0}
>
{chalk.inverse(placeholder.slice(0, 1))}
<Text color={theme.text.secondary}>
{placeholder.slice(1)}
</Text>
</Text>
) : (
<Text color={theme.text.secondary}>{placeholder}</Text>
)
) : null
) : (
<Box
flexDirection="column"
@@ -0,0 +1,236 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import type React from 'react';
import { useCallback, useMemo, useState } from 'react';
import { Box, Text } from 'ink';
import { theme } from '../semantic-colors.js';
import { DescriptiveRadioButtonSelect } from './shared/DescriptiveRadioButtonSelect.js';
import { useSettingsStore } from '../contexts/SettingsContext.js';
import { SettingScope } from '../../config/settings.js';
import { useKeypress, type Key } from '../hooks/useKeypress.js';
import { isBinaryAvailable } from '@google/gemini-cli-core';
import {
WhisperModelManager,
type WhisperModelProgress,
} from '@google/gemini-cli-core';
import { CliSpinner } from './CliSpinner.js';
interface VoiceModelDialogProps {
onClose: () => void;
}
type DialogView = 'backend' | 'whisper-models';
const WHISPER_MODELS = [
{
value: 'ggml-tiny.en.bin',
label: 'Tiny (EN)',
description: 'Fastest, lower accuracy (~75MB)',
},
{
value: 'ggml-base.en.bin',
label: 'Base (EN)',
description: 'Balanced speed and accuracy (~142MB)',
},
{
value: 'ggml-large-v3-turbo-q5_0.bin',
label: 'Large v3 Turbo (Q5_0)',
description: 'High accuracy, quantized (~547MB)',
},
{
value: 'ggml-large-v3-turbo-q8_0.bin',
label: 'Large v3 Turbo (Q8_0)',
description: 'Maximum accuracy, high memory (~834MB)',
},
];
export function VoiceModelDialog({
onClose,
}: VoiceModelDialogProps): React.JSX.Element {
const { settings, setSetting } = useSettingsStore();
const [view, setView] = useState<DialogView>('backend');
const [downloadProgress, setDownloadProgress] =
useState<WhisperModelProgress | null>(null);
const [error, setError] = useState<string | null>(null);
const whisperInstalled = useMemo(
() => isBinaryAvailable('whisper-stream'),
[],
);
const modelManager = useMemo(() => new WhisperModelManager(), []);
const currentBackend =
settings.merged.experimental.voice?.backend ?? 'gemini-live';
const currentWhisperModel =
settings.merged.experimental.voice?.whisperModel ?? 'ggml-base.en.bin';
const handleKeypress = useCallback(
(key: Key) => {
if (key.name === 'escape') {
if (view === 'whisper-models') {
setView('backend');
} else {
onClose();
}
return true;
}
return false;
},
[view, onClose],
);
useKeypress(handleKeypress, { isActive: true });
const handleBackendSelect = useCallback(
(value: string) => {
if (value === 'whisper') {
setView('whisper-models');
} else {
setSetting(
SettingScope.User,
'experimental.voice.backend',
'gemini-live',
);
onClose();
}
},
[setSetting, onClose],
);
const handleWhisperModelSelect = useCallback(
async (modelName: string) => {
if (modelManager.isModelInstalled(modelName)) {
setSetting(SettingScope.User, 'experimental.voice.backend', 'whisper');
setSetting(
SettingScope.User,
'experimental.voice.whisperModel',
modelName,
);
onClose();
} else {
setError(null);
const onProgress = (p: WhisperModelProgress) => setDownloadProgress(p);
modelManager.on('progress', onProgress);
try {
await modelManager.downloadModel(modelName);
setSetting(
SettingScope.User,
'experimental.voice.backend',
'whisper',
);
setSetting(
SettingScope.User,
'experimental.voice.whisperModel',
modelName,
);
onClose();
} catch (err) {
setError(
`Failed to download: ${err instanceof Error ? err.message : String(err)}`,
);
} finally {
modelManager.off('progress', onProgress);
setDownloadProgress(null);
}
}
},
[modelManager, setSetting, onClose],
);
const backendOptions = useMemo(
() => [
{
value: 'gemini-live',
title: 'Gemini Live API (Cloud)',
description: 'Real-time cloud transcription via Gemini Live API.',
key: 'gemini-live',
},
{
value: 'whisper',
title: 'Whisper (Local)',
description: whisperInstalled
? 'Local transcription using whisper.cpp.'
: 'Local transcription (Requires: brew install whisper-cpp)',
key: 'whisper',
},
],
[whisperInstalled],
);
const whisperOptions = useMemo(
() =>
WHISPER_MODELS.map((m) => ({
value: m.value,
title: `${m.label}${modelManager.isModelInstalled(m.value) ? ' (Installed)' : ' (Download)'}`,
description: m.description,
key: m.value,
})),
[modelManager],
);
return (
<Box
borderStyle="round"
borderColor={theme.border.default}
flexDirection="column"
padding={1}
width="100%"
>
<Text bold>
{view === 'backend'
? 'Select Voice Transcription Backend'
: 'Select Whisper Model'}
</Text>
{error && (
<Box marginTop={1}>
<Text color={theme.status.error}>{error}</Text>
</Box>
)}
{downloadProgress ? (
<Box marginTop={1} flexDirection="column">
<Box>
<Text>Downloading {downloadProgress.modelName}... </Text>
<CliSpinner />
<Text> {Math.round(downloadProgress.percentage * 100)}%</Text>
</Box>
</Box>
) : (
<Box marginTop={1}>
{view === 'backend' ? (
<DescriptiveRadioButtonSelect
items={backendOptions}
onSelect={handleBackendSelect}
initialIndex={currentBackend === 'whisper' ? 1 : 0}
showNumbers={true}
/>
) : (
<DescriptiveRadioButtonSelect
items={whisperOptions}
onSelect={handleWhisperModelSelect}
initialIndex={whisperOptions.findIndex(
(o) => o.value === currentWhisperModel,
)}
showNumbers={true}
/>
)}
</Box>
)}
<Box marginTop={1} flexDirection="column">
<Text color={theme.text.secondary}>
{view === 'whisper-models'
? '(Press Esc to go back)'
: '(Press Esc to close)'}
</Text>
</Box>
</Box>
);
}
@@ -168,13 +168,6 @@ exports[`InputPrompt > mouse interaction > should toggle paste expansion on doub
"
`;
exports[`InputPrompt > mouse interaction > should toggle paste expansion on double-click 4`] = `
"▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀
> [Pasted Text: 10 lines]
▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
"
`;
exports[`InputPrompt > multiline rendering > should correctly render multiline input including blank lines 1`] = `
"────────────────────────────────────────────────────────────────────────────────────────────────────
> hello
@@ -41,6 +41,8 @@ export interface UIActions {
exitPrivacyNotice: () => void;
closeSettingsDialog: () => void;
closeModelDialog: () => void;
openVoiceModelDialog: () => void;
closeVoiceModelDialog: () => void;
openAgentConfigDialog: (
name: string,
displayName: string,
@@ -93,6 +95,7 @@ export interface UIActions {
handleNewAgentsSelect: (choice: NewAgentsChoice) => Promise<void>;
getPreferredEditor: () => EditorType | undefined;
clearAccountSuspension: () => void;
setVoiceModeEnabled: (value: boolean) => void;
}
export const UIActionsContext = createContext<UIActions | null>(null);
@@ -112,6 +112,7 @@ export interface UIState {
isSettingsDialogOpen: boolean;
isSessionBrowserOpen: boolean;
isModelDialogOpen: boolean;
isVoiceModelDialogOpen: boolean;
isAgentConfigDialogOpen: boolean;
selectedAgentName?: string;
selectedAgentDisplayName?: string;
@@ -132,6 +133,7 @@ export interface UIState {
pendingGeminiHistoryItems: HistoryItemWithoutId[];
thought: ThoughtSummary | null;
isInputActive: boolean;
isVoiceModeEnabled: boolean;
isResuming: boolean;
shouldShowIdePrompt: boolean;
isFolderTrustDialogOpen: boolean;
@@ -205,11 +205,13 @@ describe('useSlashCommandProcessor', () => {
openSettingsDialog: vi.fn(),
openSessionBrowser: vi.fn(),
openModelDialog: mockOpenModelDialog,
openVoiceModelDialog: vi.fn(),
openAgentConfigDialog,
openPermissionsDialog: vi.fn(),
quit: mockSetQuittingMessages,
setDebugMessage: vi.fn(),
toggleCorgiMode: vi.fn(),
toggleVoiceMode: vi.fn(),
toggleDebugProfiler: vi.fn(),
dispatchExtensionStateUpdate: vi.fn(),
addConfirmUpdateExtensionRequest: vi.fn(),
@@ -72,6 +72,7 @@ interface SlashCommandProcessorActions {
openSettingsDialog: () => void;
openSessionBrowser: () => void;
openModelDialog: () => void;
openVoiceModelDialog: () => void;
openAgentConfigDialog: (
name: string,
displayName: string,
@@ -81,6 +82,7 @@ interface SlashCommandProcessorActions {
quit: (messages: HistoryItem[]) => void;
setDebugMessage: (message: string) => void;
toggleCorgiMode: () => void;
toggleVoiceMode: () => void;
toggleDebugProfiler: () => void;
dispatchExtensionStateUpdate: (action: ExtensionUpdateAction) => void;
addConfirmUpdateExtensionRequest: (request: ConfirmationRequest) => void;
@@ -232,6 +234,7 @@ export const useSlashCommandProcessor = (
pendingItem,
setPendingItem,
toggleCorgiMode: actions.toggleCorgiMode,
toggleVoiceMode: actions.toggleVoiceMode,
toggleDebugProfiler: actions.toggleDebugProfiler,
toggleVimEnabled,
reloadCommands,
@@ -503,6 +506,9 @@ export const useSlashCommandProcessor = (
case 'model':
actions.openModelDialog();
return { type: 'handled' };
case 'voice-model':
actions.openVoiceModelDialog();
return { type: 'handled' };
case 'agentConfig': {
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
const props = result.props as Record<string, unknown>;
+429
@@ -0,0 +1,429 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { useState, useRef, useCallback, useEffect } from 'react';
import {
AudioRecorder,
TranscriptionFactory,
debugLogger,
type Config,
type TranscriptionProvider,
} from '@google/gemini-cli-core';
import type { TextBuffer } from '../components/shared/text-buffer.js';
import type { MergedSettings } from '../../config/settingsSchema.js';
import type { Key } from './useKeypress.js';
import { Command } from '../key/keyMatchers.js';
interface UseVoiceModeProps {
buffer: TextBuffer;
config: Config;
settings: MergedSettings;
setQueueErrorMessage: (message: string | null) => void;
isVoiceModeEnabled: boolean;
setVoiceModeEnabled: (enabled: boolean) => void;
keyMatchers: Record<Command, (key: Key) => boolean>;
}
const HOLD_DELAY_MS = 600;
const RELEASE_DELAY_MS = 300;
export function useVoiceMode({
buffer,
config,
settings,
setQueueErrorMessage,
isVoiceModeEnabled,
setVoiceModeEnabled,
keyMatchers,
}: UseVoiceModeProps) {
const [isRecording, setIsRecording] = useState(false);
const [isConnecting, setIsConnecting] = useState(false);
const liveTranscriptionRef = useRef('');
const stopRequestedRef = useRef(false);
const isRecordingRef = useRef(false);
const lastFailureTimeRef = useRef(0);
const recordingInProgressRef = useRef(false);
const voiceTimeoutRef = useRef<NodeJS.Timeout | null>(null);
const recorderRef = useRef<AudioRecorder | null>(null);
const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null);
const turnBaselineRef = useRef<string | null>(null);
const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle');
const pttTimerRef = useRef<NodeJS.Timeout | null>(null);
const disconnectTimerRef = useRef<NodeJS.Timeout | null>(null);
const bufferRef = useRef(buffer);
bufferRef.current = buffer;
const stopVoiceRecording = useCallback(() => {
if (stopRequestedRef.current) return;
debugLogger.debug('[Voice] Stop requested');
stopRequestedRef.current = true;
setIsRecording(false);
isRecordingRef.current = false;
setIsConnecting(false);
if (recorderRef.current) {
recorderRef.current.stop();
recorderRef.current = null;
}
const serviceToDisconnect = transcriptionServiceRef.current;
transcriptionServiceRef.current = null;
if (serviceToDisconnect) {
const isLive = settings.experimental.voice?.backend === 'gemini-live';
const gracePeriodMs =
settings.experimental.voice?.stopGracePeriodMs ??
(isLive ? 2000 : 1000);
debugLogger.debug(
`[Voice] Draining transcription for ${gracePeriodMs}ms`,
);
if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
disconnectTimerRef.current = setTimeout(() => {
debugLogger.debug('[Voice] Grace period ended, disconnecting service');
serviceToDisconnect.disconnect();
disconnectTimerRef.current = null;
}, gracePeriodMs);
}
liveTranscriptionRef.current = '';
pttStateRef.current = 'idle';
}, [settings.experimental.voice]);
const startVoiceRecording = useCallback(() => {
if (
isRecordingRef.current ||
Date.now() - lastFailureTimeRef.current < 2000
) {
return;
}
if (disconnectTimerRef.current) {
clearTimeout(disconnectTimerRef.current);
disconnectTimerRef.current = null;
}
recordingInProgressRef.current = true;
turnBaselineRef.current = bufferRef.current.text;
setIsConnecting(true);
setIsRecording(true);
isRecordingRef.current = true;
liveTranscriptionRef.current = '';
stopRequestedRef.current = false;
const apiKey =
config.getContentGeneratorConfig()?.apiKey ||
process.env['GEMINI_API_KEY'] ||
'';
const startAsync = async () => {
// If there's an active draining service, disconnect it immediately
// before starting a new one to prevent orphaned event collisions.
if (disconnectTimerRef.current) {
clearTimeout(disconnectTimerRef.current);
disconnectTimerRef.current = null;
}
if (transcriptionServiceRef.current) {
transcriptionServiceRef.current.disconnect();
transcriptionServiceRef.current = null;
}
const cleanupIfStopped = () => {
if (stopRequestedRef.current) {
if (recorderRef.current) {
recorderRef.current.stop();
recorderRef.current = null;
}
if (transcriptionServiceRef.current) {
transcriptionServiceRef.current.disconnect();
transcriptionServiceRef.current = null;
}
setIsRecording(false);
isRecordingRef.current = false;
setIsConnecting(false);
recordingInProgressRef.current = false;
return true;
}
return false;
};
if (cleanupIfStopped()) return;
const voiceBackend =
settings.experimental.voice?.backend ?? 'gemini-live';
if (!apiKey && voiceBackend === 'gemini-live') {
setQueueErrorMessage(
'Cloud voice mode requires a GEMINI_API_KEY. Please set it in your environment or ~/.gemini/.env.',
);
setIsRecording(false);
isRecordingRef.current = false;
setIsConnecting(false);
recordingInProgressRef.current = false;
lastFailureTimeRef.current = Date.now();
return;
}
if (voiceBackend === 'gemini-live') {
recorderRef.current = new AudioRecorder();
}
const currentService = TranscriptionFactory.createProvider(
settings.experimental.voice,
apiKey,
);
transcriptionServiceRef.current = currentService;
currentService.on('transcription', (text) => {
if (
transcriptionServiceRef.current !== currentService &&
stopRequestedRef.current
) {
// If this is an orphaned service that was replaced by a new session, ignore its events
return;
}
if (text) {
const currentBufferText = bufferRef.current.text;
const previousTranscription = liveTranscriptionRef.current;
let newTotalText = currentBufferText;
if (
previousTranscription &&
currentBufferText.endsWith(previousTranscription)
) {
newTotalText = currentBufferText.slice(
0,
-previousTranscription.length,
);
} else if (
currentBufferText &&
!currentBufferText.endsWith(' ') &&
!currentBufferText.endsWith('\n')
) {
newTotalText += ' ';
}
newTotalText += text;
bufferRef.current.setText(newTotalText, 'end');
}
liveTranscriptionRef.current = text;
});
currentService.on('turnComplete', () => {
if (
transcriptionServiceRef.current !== currentService &&
stopRequestedRef.current
)
return;
liveTranscriptionRef.current = '';
});
currentService.on('error', (err) => {
if (transcriptionServiceRef.current !== currentService) return;
debugLogger.error('[Voice] Transcription error:', err);
lastFailureTimeRef.current = Date.now();
recordingInProgressRef.current = false;
});
currentService.on('close', () => {
if (transcriptionServiceRef.current !== currentService) return;
if (!stopRequestedRef.current) {
setIsRecording(false);
isRecordingRef.current = false;
setIsConnecting(false);
recordingInProgressRef.current = false;
lastFailureTimeRef.current = Date.now();
}
});
try {
await currentService.connect();
if (cleanupIfStopped()) return;
await recorderRef.current?.start();
if (cleanupIfStopped()) return;
setIsConnecting(false);
const currentVoiceBackend =
settings.experimental.voice?.backend ?? 'gemini-live';
recorderRef.current?.on('data', (chunk) => {
if (currentVoiceBackend === 'gemini-live') {
currentService.sendAudioChunk(chunk);
}
});
recorderRef.current?.on('error', (err) => {
debugLogger.error('[Voice] Recorder error:', err);
stopVoiceRecording();
lastFailureTimeRef.current = Date.now();
});
} catch (err: unknown) {
if (transcriptionServiceRef.current !== currentService) return;
const message = err instanceof Error ? err.message : String(err);
setQueueErrorMessage(`Voice mode failure: ${message}`);
setIsRecording(false);
isRecordingRef.current = false;
setIsConnecting(false);
recordingInProgressRef.current = false;
lastFailureTimeRef.current = Date.now();
if (recorderRef.current) {
recorderRef.current.stop();
recorderRef.current = null;
}
if (transcriptionServiceRef.current) {
transcriptionServiceRef.current.disconnect();
transcriptionServiceRef.current = null;
}
}
};
void startAsync();
}, [
config,
settings.experimental.voice,
setQueueErrorMessage,
stopVoiceRecording,
]);
useEffect(
() => () => {
if (voiceTimeoutRef.current) clearTimeout(voiceTimeoutRef.current);
if (recorderRef.current) {
recorderRef.current.stop();
recorderRef.current = null;
}
if (transcriptionServiceRef.current) {
transcriptionServiceRef.current.disconnect();
transcriptionServiceRef.current = null;
}
if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
},
[],
);
const handleVoiceInput = useCallback(
(key: Key): boolean => {
const activeRecording = isRecording || isRecordingRef.current;
if (activeRecording) {
const activationMode =
settings.experimental.voice?.activationMode ?? 'push-to-talk';
if (keyMatchers[Command.ESCAPE](key)) {
stopVoiceRecording();
return true;
}
if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
if (activationMode === 'push-to-talk') {
if (pttTimerRef.current) {
clearTimeout(pttTimerRef.current);
}
pttTimerRef.current = setTimeout(() => {
stopVoiceRecording();
pttTimerRef.current = null;
}, RELEASE_DELAY_MS);
return true;
} else {
stopVoiceRecording();
return true;
}
}
return true;
}
if (isVoiceModeEnabled) {
const activationMode =
settings.experimental.voice?.activationMode ?? 'push-to-talk';
if (keyMatchers[Command.ESCAPE](key) && buffer.text === '') {
setVoiceModeEnabled(false);
return true;
}
if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
if (
key.name === 'space' &&
!key.ctrl &&
!key.alt &&
!key.shift &&
!key.cmd
) {
if (activationMode === 'toggle') {
startVoiceRecording();
return true;
} else {
if (pttStateRef.current === 'idle') {
buffer.insert(' ');
pttStateRef.current = 'possible-hold';
if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
pttTimerRef.current = setTimeout(() => {
pttStateRef.current = 'idle';
pttTimerRef.current = null;
}, HOLD_DELAY_MS);
return true;
} else if (pttStateRef.current === 'possible-hold') {
if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
buffer.backspace();
pttStateRef.current = 'recording';
startVoiceRecording();
pttTimerRef.current = setTimeout(() => {
stopVoiceRecording();
pttTimerRef.current = null;
}, RELEASE_DELAY_MS);
return true;
}
}
}
}
if (pttStateRef.current === 'possible-hold') {
pttStateRef.current = 'idle';
if (pttTimerRef.current) {
clearTimeout(pttTimerRef.current);
pttTimerRef.current = null;
}
}
}
return false;
},
[
isRecording,
isVoiceModeEnabled,
settings.experimental.voice,
keyMatchers,
stopVoiceRecording,
startVoiceRecording,
buffer,
setVoiceModeEnabled,
],
);
return {
isRecording,
isConnecting,
startVoiceRecording,
stopVoiceRecording,
handleVoiceInput,
resetTurnBaseline: () => {
turnBaselineRef.current = null;
},
};
}
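Because terminals deliver only key-down events, push-to-talk cannot observe a key release; instead the hook treats the terminal's auto-repeat as a heartbeat and infers release once spaces stop arriving for RELEASE_DELAY_MS. A stripped-down sketch of that state machine (illustrative only, not the hook's exact code):
// Illustrative distillation of the push-to-talk logic in handleVoiceInput above.
type PttState = 'idle' | 'possible-hold' | 'recording';

function makePushToTalk(start: () => void, stop: () => void) {
  let state: PttState = 'idle';
  let timer: NodeJS.Timeout | null = null;
  const arm = (ms: number, fn: () => void) => {
    if (timer) clearTimeout(timer);
    timer = setTimeout(fn, ms);
  };
  return function onSpaceKeypress() {
    if (state === 'idle') {
      // First tap: could be a real space or the start of a hold.
      state = 'possible-hold';
      arm(600 /* HOLD_DELAY_MS */, () => {
        state = 'idle'; // no repeat arrived: it was a single tap
      });
    } else if (state === 'possible-hold') {
      // An auto-repeat arrived quickly: treat it as a hold and start recording.
      state = 'recording';
      start();
      arm(300 /* RELEASE_DELAY_MS */, () => {
        state = 'idle';
        stop();
      });
    } else {
      // Heartbeat while recording: each repeat pushes the release deadline out.
      arm(300 /* RELEASE_DELAY_MS */, () => {
        state = 'idle';
        stop();
      });
    }
  };
}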
@@ -0,0 +1,31 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { useState, useCallback } from 'react';
interface UseVoiceModelCommandReturn {
isVoiceModelDialogOpen: boolean;
openVoiceModelDialog: () => void;
closeVoiceModelDialog: () => void;
}
export const useVoiceModelCommand = (): UseVoiceModelCommandReturn => {
const [isVoiceModelDialogOpen, setIsVoiceModelDialogOpen] = useState(false);
const openVoiceModelDialog = useCallback(() => {
setIsVoiceModelDialogOpen(true);
}, []);
const closeVoiceModelDialog = useCallback(() => {
setIsVoiceModelDialogOpen(false);
}, []);
return {
isVoiceModelDialogOpen,
openVoiceModelDialog,
closeVoiceModelDialog,
};
};
+8 -3
@@ -97,6 +97,7 @@ export enum Command {
RESTART_APP = 'app.restart',
SUSPEND_APP = 'app.suspend',
SHOW_SHELL_INPUT_UNFOCUS_WARNING = 'app.showShellUnfocusWarning',
VOICE_MODE_PTT = 'app.voiceModePTT',
// Background Shell Controls
BACKGROUND_SHELL_ESCAPE = 'background.escape',
@@ -407,9 +408,7 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
[Command.RESTART_APP, [new KeyBinding('r'), new KeyBinding('shift+r')]],
[Command.SUSPEND_APP, [new KeyBinding('ctrl+z')]],
[Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING, [new KeyBinding('tab')]],
[Command.DUMP_FRAME, [new KeyBinding('f8')]],
[Command.START_RECORDING, [new KeyBinding('f6')]],
[Command.STOP_RECORDING, [new KeyBinding('f7')]],
[Command.VOICE_MODE_PTT, [new KeyBinding('space')]],
// Background Shell Controls
[Command.BACKGROUND_SHELL_ESCAPE, [new KeyBinding('escape')]],
@@ -424,6 +423,10 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
// Extension Controls
[Command.UPDATE_EXTENSION, [new KeyBinding('i')]],
[Command.LINK_EXTENSION, [new KeyBinding('l')]],
[Command.DUMP_FRAME, [new KeyBinding('f8')]],
[Command.START_RECORDING, [new KeyBinding('f6')]],
[Command.STOP_RECORDING, [new KeyBinding('f7')]],
]);
interface CommandCategory {
@@ -538,6 +541,7 @@ export const commandCategories: readonly CommandCategory[] = [
Command.RESTART_APP,
Command.SUSPEND_APP,
Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING,
Command.VOICE_MODE_PTT,
],
},
{
@@ -658,6 +662,7 @@ export const commandDescriptions: Readonly<Record<Command, string>> = {
[Command.SUSPEND_APP]: 'Suspend the CLI and move it to the background.',
[Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING]:
'Show warning when trying to move focus away from shell input.',
[Command.VOICE_MODE_PTT]: 'Hold to speak in Voice Mode.',
// Background Shell Controls
[Command.BACKGROUND_SHELL_ESCAPE]: 'Dismiss background shell list.',
@@ -43,5 +43,6 @@ export function createNonInteractiveUI(): CommandContext['ui'] {
removeComponent: () => {},
toggleBackgroundTasks: () => {},
toggleShortcutsHelp: () => {},
toggleVoiceMode: () => {},
};
}
+1
@@ -56,6 +56,7 @@
"ajv-formats": "^3.0.0",
"chardet": "^2.1.0",
"chokidar": "^5.0.0",
"command-exists": "^1.2.9",
"diff": "^8.0.3",
"dotenv": "^17.2.4",
"dotenv-expand": "^12.0.3",
+7
@@ -691,6 +691,7 @@ export interface ConfigParameters {
ptyInfo?: string;
disableYoloMode?: boolean;
disableAlwaysAllow?: boolean;
voiceMode?: boolean;
rawOutput?: boolean;
acceptRawOutputRisk?: boolean;
dynamicModelConfiguration?: boolean;
@@ -963,6 +964,7 @@ export class Config implements McpContext, AgentLoopContext {
private readonly topicUpdateNarration: boolean;
private readonly disableLLMCorrection: boolean;
private readonly planEnabled: boolean;
private readonly voiceMode: boolean;
private readonly trackerEnabled: boolean;
private readonly planModeRoutingEnabled: boolean;
private readonly modelSteering: boolean;
@@ -1117,6 +1119,7 @@ export class Config implements McpContext, AgentLoopContext {
this.agents = params.agents ?? {};
this.disableLLMCorrection = params.disableLLMCorrection ?? true;
this.planEnabled = params.plan ?? true;
this.voiceMode = params.voiceMode ?? false;
this.trackerEnabled = params.tracker ?? false;
this.planModeRoutingEnabled = params.planSettings?.modelRouting ?? true;
this.enableEventDrivenScheduler = params.enableEventDrivenScheduler ?? true;
@@ -2969,6 +2972,10 @@ export class Config implements McpContext, AgentLoopContext {
return this.planEnabled;
}
isVoiceModeEnabled(): boolean {
return this.voiceMode;
}
isTrackerEnabled(): boolean {
return this.trackerEnabled;
}
+9
@@ -297,3 +297,12 @@ export * from './context/profiles.js';
// Export trust utility
export * from './utils/trust.js';
// Export voice utilities
export * from './voice/audioRecorder.js';
export * from './voice/transcriptionProvider.js';
export * from './voice/geminiLiveTranscriptionProvider.js';
export * from './voice/whisperTranscriptionProvider.js';
export * from './voice/transcriptionFactory.js';
export * from './voice/whisperModelManager.js';
export { isBinaryAvailable } from './utils/binaryCheck.js';
+14
@@ -0,0 +1,14 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { sync as commandExistsSync } from 'command-exists';
/**
* Checks if a binary is available in the system PATH.
*/
export function isBinaryAvailable(binaryName: string): boolean {
return commandExistsSync(binaryName);
}
+115
@@ -0,0 +1,115 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import { EventEmitter } from 'node:events';
import commandExists from 'command-exists';
export interface AudioRecorderEvents {
data: [Buffer];
start: [];
stop: [];
error: [Error];
}
/**
* Captures audio from the microphone using `sox` (`rec`).
* Emits 16kHz, 16-bit, mono PCM chunks.
*/
export class AudioRecorder extends EventEmitter<AudioRecorderEvents> {
private recProcess: ChildProcessWithoutNullStreams | null = null;
private isRecordingInternal = false;
get isRecording(): boolean {
return this.isRecordingInternal;
}
/**
* Checks if `rec` (sox) is available on the system.
*/
static async isAvailable(): Promise<boolean> {
try {
await commandExists('rec');
return true;
} catch {
return false;
}
}
async start(): Promise<void> {
if (this.isRecordingInternal) return;
this.isRecordingInternal = true;
try {
const available = await AudioRecorder.isAvailable();
if (!this.isRecordingInternal) return; // Check if stopped while checking availability
if (!available) {
throw new Error(
'The `rec` command (provided by SoX) is required for voice mode. Please install SoX (e.g., `brew install sox` on macOS or `sudo apt install sox libsox-fmt-all` on Linux).',
);
}
// rec -q -V0 -e signed -c 1 -b 16 -r 16000 -t raw -
this.recProcess = spawn('rec', [
'-q',
'-V0',
'-e',
'signed',
'-c',
'1',
'-b',
'16',
'-r',
'16000',
'-t',
'raw',
'-',
]);
if (!this.isRecordingInternal) {
this.recProcess.kill('SIGTERM');
this.recProcess = null;
return;
}
this.recProcess.stdout.on('data', (data: Buffer) => {
this.emit('data', data);
});
this.recProcess.stderr.on('data', (_data: Buffer) => {
// rec may print warnings to stderr; we ignore them here (could log if needed)
// console.warn(`rec stderr: ${data.toString()}`);
});
this.recProcess.on('error', (err) => {
this.emit('error', err);
this.stop();
});
this.recProcess.on('close', () => {
this.stop();
});
this.emit('start');
} catch (err) {
this.isRecordingInternal = false;
throw err;
}
}
stop(): void {
if (!this.isRecordingInternal) return;
this.isRecordingInternal = false;
if (this.recProcess) {
this.recProcess.kill('SIGTERM');
this.recProcess = null;
}
this.emit('stop');
}
}
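A hedged standalone usage sketch for the recorder (assumes SoX's rec is installed; chunk sizes depend on sox buffering):
// Hypothetical usage of AudioRecorder as defined above.
const recorder = new AudioRecorder();
recorder.on('data', (chunk) => {
  // chunk is raw 16kHz / 16-bit / mono PCM; forward it to a transcription provider.
  console.log(`captured ${chunk.length} bytes of PCM`);
});
recorder.on('error', (err) => console.error('recorder error:', err));
await recorder.start();
// ...later, when the user releases push-to-talk:
recorder.stop();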
@@ -0,0 +1,178 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import WebSocket from 'ws';
import { EventEmitter, once } from 'node:events';
import { debugLogger } from '../utils/debugLogger.js';
import type {
TranscriptionProvider,
TranscriptionEvents,
} from './transcriptionProvider.js';
import { z } from 'zod';
const LiveAPIResponseSchema = z.object({
setupComplete: z.record(z.unknown()).optional(),
serverContent: z
.object({
turnComplete: z.boolean().optional(),
inputTranscription: z
.object({
text: z.string().optional(),
})
.optional(),
outputTranscription: z
.object({
text: z.string().optional(),
})
.optional(),
modelTurn: z
.object({
parts: z
.array(
z.object({
text: z.string().optional(),
inlineData: z
.object({
data: z.string(),
})
.optional(),
}),
)
.optional(),
})
.optional(),
})
.optional(),
});
/**
* Connects to the Gemini Live API using raw WebSockets to support API Key authentication.
*/
export class GeminiLiveTranscriptionProvider
extends EventEmitter<TranscriptionEvents>
implements TranscriptionProvider
{
private ws: WebSocket | null = null;
private currentTranscription = '';
constructor(private readonly apiKey: string) {
super();
}
async connect(): Promise<void> {
const modelName = 'gemini-3.1-flash-live-preview';
const baseUrl =
'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
if (!this.apiKey) {
throw new Error('No API key provided');
}
// NOTE: The Generative Language WebSocket API requires the API key to be passed via the 'key' query parameter.
const url = `${baseUrl}?key=${this.apiKey}`;
debugLogger.debug(
`[GeminiLiveTranscription] Connecting to model ${modelName} via raw WebSocket with API Key...`,
);
try {
this.ws = new WebSocket(url, {
maxPayload: 1 << 20, // 1MB limit for safety
});
this.ws.on('message', (data) => {
try {
const parsedData: unknown = JSON.parse(data.toString());
const result = LiveAPIResponseSchema.safeParse(parsedData);
if (result.success) {
const response = result.data;
if (response.serverContent) {
const content = response.serverContent;
if (content.turnComplete) {
this.emit('turnComplete');
}
if (content.inputTranscription?.text) {
const text = content.inputTranscription.text;
debugLogger.debug(
`[GeminiLiveTranscription] Transcription received (Cloud): "${text}"`,
);
this.currentTranscription = text;
this.emit('transcription', this.currentTranscription);
}
}
}
} catch (e) {
debugLogger.error(
'[GeminiLiveTranscription] Error parsing message:',
e,
);
}
});
this.ws.on('error', (error) => {
debugLogger.error('[GeminiLiveTranscription] WebSocket Error:', error);
this.emit('error', error);
});
this.ws.on('close', (code, reason) => {
debugLogger.debug(
`[GeminiLiveTranscription] Connection Closed. Code: ${code}, Reason: ${reason}`,
);
this.emit('close');
this.ws = null;
});
await once(this.ws, 'open');
const setupMessage = {
setup: {
model: `models/${modelName}`,
generation_config: {
response_modalities: ['audio'],
},
input_audio_transcription: {},
},
};
this.ws.send(JSON.stringify(setupMessage));
this.currentTranscription = '';
} catch (err) {
debugLogger.error(
'[GeminiLiveTranscription] Failed to establish connection:',
err,
);
throw err;
}
}
sendAudioChunk(chunk: Buffer): void {
if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
const audioMessage = {
realtime_input: {
audio: {
data: chunk.toString('base64'),
mime_type: 'audio/pcm;rate=16000',
},
},
};
this.ws.send(JSON.stringify(audioMessage));
}
getTranscription(): string {
return this.currentTranscription;
}
disconnect(): void {
if (this.ws) {
this.ws.close();
this.ws = null;
}
}
}
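Putting the recorder and the cloud provider together, a minimal wiring sketch (assumes a valid GEMINI_API_KEY in the environment; error handling elided):
// Illustrative wiring of AudioRecorder into GeminiLiveTranscriptionProvider (sketch only).
const provider = new GeminiLiveTranscriptionProvider(
  process.env['GEMINI_API_KEY'] ?? '',
);
provider.on('transcription', (text) => console.log('heard:', text));
provider.on('turnComplete', () => console.log('-- turn complete --'));
await provider.connect();

const mic = new AudioRecorder();
mic.on('data', (chunk) => provider.sendAudioChunk(chunk));
await mic.start();
// ...on stop:
mic.stop();
provider.disconnect();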
@@ -0,0 +1,41 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import * as path from 'node:path';
import * as fs from 'node:fs';
import { homedir, GEMINI_DIR } from '../utils/paths.js';
import { GeminiLiveTranscriptionProvider } from './geminiLiveTranscriptionProvider.js';
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
import type { TranscriptionProvider } from './transcriptionProvider.js';
export class TranscriptionFactory {
static createProvider(
voiceConfig: { backend?: string; whisperModel?: string } | undefined,
apiKey: string,
): TranscriptionProvider {
const backend = voiceConfig?.backend ?? 'gemini-live';
if (backend === 'whisper') {
const modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
if (!fs.existsSync(modelsDir)) {
fs.mkdirSync(modelsDir, { recursive: true });
}
const modelName = voiceConfig?.whisperModel ?? 'ggml-base.en.bin';
const modelPath = path.join(modelsDir, modelName);
      return new WhisperTranscriptionProvider({
        modelPath,
        threads: 4,
        // step=0 selects whisper-stream's sliding-window VAD mode; see
        // WhisperTranscriptionProvider for details.
        step: 0,
        length: 5000,
      });
}
// Default to Gemini Live
return new GeminiLiveTranscriptionProvider(apiKey);
}
}
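
// Usage sketch (assumes a voice settings object carrying optional `backend`
// and `whisperModel` fields):
//
//   const provider = TranscriptionFactory.createProvider(
//     { backend: 'whisper', whisperModel: 'ggml-base.en.bin' },
//     apiKey, // unused by the whisper backend
//   );
//   await provider.connect();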
@@ -0,0 +1,33 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import type { EventEmitter } from 'node:events';
export interface TranscriptionEvents {
/** Emitted when partial or full transcription text is available. */
transcription: [string];
/** Emitted when a speaking turn is considered complete. */
turnComplete: [];
/** Emitted when an error occurs during transcription. */
error: [Error];
/** Emitted when the transcription service connection is closed. */
close: [];
}
/**
* Common interface for all transcription backends (Cloud or Local).
*/
export interface TranscriptionProvider
extends EventEmitter<TranscriptionEvents> {
/** Establish connection to the transcription service. */
connect(): Promise<void>;
/** Send a chunk of raw audio data to the service. */
sendAudioChunk(chunk: Buffer): void;
/** Disconnect from the transcription service. */
disconnect(): void;
/** Get the current full transcription for the session. */
getTranscription(): string;
}
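
// Minimal in-memory implementation sketch (hypothetical, e.g. for tests; it
// would need a value import of EventEmitter, not the type-only import above):
//
//   class FakeTranscriptionProvider
//     extends EventEmitter<TranscriptionEvents>
//     implements TranscriptionProvider
//   {
//     private text = '';
//     async connect(): Promise<void> {}
//     sendAudioChunk(_chunk: Buffer): void {}
//     disconnect(): void { this.emit('close'); }
//     getTranscription(): string { return this.text; }
//     push(t: string): void { this.text = t; this.emit('transcription', t); }
//   }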
@@ -0,0 +1,107 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import * as path from 'node:path';
import * as fs from 'node:fs';
import { EventEmitter, once } from 'node:events';
import { homedir, GEMINI_DIR } from '../utils/paths.js';
import { debugLogger } from '../utils/debugLogger.js';
export interface WhisperModelProgress {
  modelName: string;
  /** Bytes downloaded so far. */
  transferred: number;
  /** Total bytes expected (0 when the server omits Content-Length). */
  total: number;
  /** Download progress as a fraction in [0, 1]. */
  percentage: number;
}
export interface WhisperModelManagerEvents {
progress: [WhisperModelProgress];
}
const ALLOWED_MODELS = [
'ggml-tiny.en.bin',
'ggml-base.en.bin',
'ggml-large-v3-turbo-q5_0.bin',
'ggml-large-v3-turbo-q8_0.bin',
];
/**
* Manages Whisper models (checking existence, downloading).
*/
export class WhisperModelManager extends EventEmitter<WhisperModelManagerEvents> {
private readonly modelsDir: string;
constructor() {
super();
this.modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
}
isModelInstalled(modelName: string): boolean {
this.validateModelName(modelName);
return fs.existsSync(path.join(this.modelsDir, modelName));
}
getModelPath(modelName: string): string {
this.validateModelName(modelName);
return path.join(this.modelsDir, modelName);
}
async downloadModel(modelName: string): Promise<void> {
this.validateModelName(modelName);
if (!fs.existsSync(this.modelsDir)) {
fs.mkdirSync(this.modelsDir, { recursive: true });
}
const destination = path.join(this.modelsDir, modelName);
const url = `https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${modelName}`;
debugLogger.debug(
`[WhisperModelManager] Downloading ${modelName} from ${url}`,
);
const response = await fetch(url);
if (!response.ok) {
throw new Error(`Failed to download model: ${response.statusText}`);
}
const total = parseInt(response.headers.get('content-length') || '0', 10);
let transferred = 0;
const reader = response.body?.getReader();
if (!reader) {
throw new Error('Response body is not readable');
}
const writer = fs.createWriteStream(destination);
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        transferred += value.length;
        // Honor stream backpressure so multi-hundred-MB models are written to
        // disk instead of accumulating in memory. `once` rejects if the
        // writer emits 'error', so disk failures surface to the caller.
        if (!writer.write(value)) {
          await once(writer, 'drain');
        }
        const percentage = total > 0 ? transferred / total : 0;
        this.emit('progress', {
          modelName,
          transferred,
          total,
          percentage,
        });
      }
    } finally {
      writer.end();
    }
}
  private validateModelName(modelName: string): void {
    // Allowlist check; also prevents path traversal via crafted model names.
    if (!ALLOWED_MODELS.includes(modelName)) {
      throw new Error(`Unauthorized model name: ${modelName}`);
    }
  }
}
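
// Usage sketch (hypothetical caller; `renderProgressBar` is a placeholder):
//
//   const manager = new WhisperModelManager();
//   const model = 'ggml-base.en.bin';
//   if (!manager.isModelInstalled(model)) {
//     manager.on('progress', ({ percentage }) => {
//       renderProgressBar(percentage); // fraction in [0, 1]
//     });
//     await manager.downloadModel(model);
//   }
//   const modelPath = manager.getModelPath(model);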
@@ -0,0 +1,31 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
import commandExists from 'command-exists';
vi.mock('command-exists', () => ({
default: vi.fn(),
}));
describe('WhisperTranscriptionProvider', () => {
beforeEach(() => {
vi.clearAllMocks();
});
it('should throw a friendly error if whisper-stream is not available', async () => {
vi.mocked(commandExists).mockRejectedValue(new Error('not found'));
const provider = new WhisperTranscriptionProvider({
modelPath: 'test-model.bin',
});
await expect(provider.connect()).rejects.toThrow(
'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
);
});
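  // Additional sketch (assumes the same `command-exists` mock as above):
  it('reports availability when whisper-stream is installed', async () => {
    vi.mocked(commandExists).mockResolvedValue('whisper-stream' as never);
    await expect(WhisperTranscriptionProvider.isAvailable()).resolves.toBe(
      true,
    );
  });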
});
@@ -0,0 +1,199 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import { EventEmitter } from 'node:events';
import commandExists from 'command-exists';
import { debugLogger } from '../utils/debugLogger.js';
import type {
TranscriptionProvider,
TranscriptionEvents,
} from './transcriptionProvider.js';
export interface WhisperProviderOptions {
modelPath: string;
threads?: number;
step?: number;
length?: number;
}
/**
* Local transcription provider using `whisper-stream` from whisper.cpp.
*
 * Uses sliding-window mode with VAD (`--step 0`) for stable,
 * non-overlapping transcription blocks that can be appended directly.
*/
export class WhisperTranscriptionProvider
extends EventEmitter<TranscriptionEvents>
implements TranscriptionProvider
{
private process: ChildProcessWithoutNullStreams | null = null;
private currentTranscription = '';
constructor(private readonly options: WhisperProviderOptions) {
super();
}
/**
* Checks if `whisper-stream` is available on the system.
*/
static async isAvailable(): Promise<boolean> {
try {
await commandExists('whisper-stream');
return true;
} catch {
return false;
}
}
async connect(): Promise<void> {
const { modelPath, threads = 4, step = 0, length = 5000 } = this.options;
this.currentTranscription = '';
const available = await WhisperTranscriptionProvider.isAvailable();
if (!available) {
return Promise.reject(
new Error(
'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
),
);
}
debugLogger.debug(
`[WhisperTranscription] Starting whisper-stream with model: ${modelPath} (VAD mode: step=${step}, length=${length})`,
);
return new Promise((resolve, reject) => {
let isResolved = false;
try {
// whisper-stream -m <model_path> -t <threads> --step 0 --length <length> -vth 0.6
// Setting step == 0 enables sliding window mode with VAD, which outputs
// non-overlapping transcription blocks suitable for appending.
this.process = spawn('whisper-stream', [
'-m',
modelPath,
'-t',
threads.toString(),
'--step',
step.toString(),
'--length',
length.toString(),
'-vth',
'0.6',
]);
this.process.stdout.on('data', (data: Buffer) => {
const output = data.toString();
this.parseOutput(output);
});
        this.process.stderr.on('data', (data: Buffer) => {
          const msg = data.toString();
          // whisper-stream writes its status logs to stderr; lines containing
          // "error" are treated as fatal (a heuristic match).
          if (msg.includes('error')) {
debugLogger.error(`[WhisperTranscription] stderr: ${msg}`);
if (!isResolved) {
isResolved = true;
reject(new Error(msg));
}
}
// whisper-stream prints "whisper_init_from_file_with_params_no_state: loading model from..."
// and finally "main: processing, press Ctrl+C to stop" when ready.
if (!isResolved && msg.includes('main: processing')) {
debugLogger.debug('[WhisperTranscription] whisper-stream is ready');
isResolved = true;
resolve();
}
});
this.process.on('error', (err) => {
debugLogger.error('[WhisperTranscription] Process error:', err);
this.emit('error', err);
if (!isResolved) {
isResolved = true;
reject(err);
}
});
this.process.on('close', (code) => {
debugLogger.debug(
`[WhisperTranscription] Process closed with code ${code}`,
);
this.emit('close');
this.process = null;
});
        // Fallback timeout in case "main: processing" is never seen.
        // unref() keeps a pending timer from holding the event loop open.
        const fallbackTimer = setTimeout(() => {
          if (!isResolved) {
            debugLogger.warn(
              '[WhisperTranscription] Connection timeout (fallback resolve)',
            );
            isResolved = true;
            resolve();
          }
        }, 10000);
        fallbackTimer.unref();
} catch (err) {
debugLogger.error(
'[WhisperTranscription] Failed to spawn process:',
err,
);
if (!isResolved) {
isResolved = true;
reject(err);
}
}
});
}
private parseOutput(output: string): void {
// whisper-stream output format: "[00:00:00.000 --> 00:00:02.000] Hello world."
const lines = output.split('\n');
for (const line of lines) {
const match = line.match(/\[.* --> .*\]\s+(.*)/);
if (match && match[1]) {
let text = match[1].trim();
// Filter out [Silence], [music], (laughter), etc.
text = text
.replace(/\[[^\]]*\]/g, '')
.replace(/\([^)]*\)/g, '')
.trim();
if (text) {
// In VAD mode (step=0), each line is a completed speech block.
// Append it to the buffer to ensure it doesn't disappear.
this.currentTranscription = this.currentTranscription
? `${this.currentTranscription} ${text}`
: text;
debugLogger.debug(
`[WhisperTranscription] Transcription updated (Local-VAD): "${this.currentTranscription}"`,
);
this.emit('transcription', this.currentTranscription);
}
}
}
}
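  // Example (illustrative): given the stdout line
  //   "[00:00:00.000 --> 00:00:02.000] [Silence] Hello world."
  // parseOutput() strips the bracketed annotation, appends "Hello world." to
  // the running buffer, and emits 'transcription' with the full buffer.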
  sendAudioChunk(_chunk: Buffer): void {
    // No-op: whisper-stream captures audio from the microphone itself, so
    // chunks recorded elsewhere are intentionally ignored for this backend.
  }
getTranscription(): string {
return this.currentTranscription;
}
disconnect(): void {
if (this.process) {
this.process.kill('SIGTERM');
this.process = null;
}
}
}
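
// Usage sketch (hypothetical caller; the model path below is a placeholder,
// and whisper-stream records from the system microphone itself):
//
//   const provider = new WhisperTranscriptionProvider({
//     modelPath: '/path/to/ggml-base.en.bin',
//   });
//   provider.on('transcription', (text) => render(text));
//   await provider.connect();
//   // ...user speaks; whisper-stream emits completed VAD blocks...
//   provider.disconnect();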