mirror of https://github.com/google-gemini/gemini-cli.git
synced 2026-04-25 20:44:46 -07:00

feat(voice): implement real-time voice mode with cloud and local backends (#24174)
@@ -1000,6 +1000,7 @@ export async function loadCliConfig(
    enableExtensionReloading: settings.experimental?.extensionReloading,
    enableAgents: settings.experimental?.enableAgents,
    plan: settings.general?.plan?.enabled ?? true,
    voiceMode: settings.experimental?.voiceMode,
    tracker: settings.experimental?.taskTracker,
    directWebFetch: settings.experimental?.directWebFetch,
    planSettings: settings.general?.plan?.directory

@@ -2061,6 +2061,87 @@ const SETTINGS_SCHEMA = {
      description: 'Enable access to Gemma 4 models (experimental).',
      showInDialog: true,
    },
    voiceMode: {
      type: 'boolean',
      label: 'Voice Mode',
      category: 'Experimental',
      requiresRestart: false,
      default: false,
      description:
        'Enable experimental voice dictation and commands (/voice, /voice model).',
      showInDialog: true,
    },
    voice: {
      type: 'object',
      label: 'Voice',
      category: 'Experimental',
      requiresRestart: false,
      default: {},
      description: 'Settings for voice mode and transcription.',
      showInDialog: false,
      properties: {
        activationMode: {
          type: 'enum',
          label: 'Voice Activation Mode',
          category: 'Experimental',
          requiresRestart: false,
          default: 'push-to-talk',
          description: 'How to trigger voice recording with the Space key.',
          showInDialog: true,
          options: [
            { value: 'push-to-talk', label: 'Push-To-Talk (Hold Space)' },
            { value: 'toggle', label: 'Toggle (Press Space to start/stop)' },
          ],
        },
        backend: {
          type: 'enum',
          label: 'Voice Transcription Backend',
          category: 'Experimental',
          requiresRestart: false,
          default: 'gemini-live',
          description: 'The backend to use for voice transcription.',
          showInDialog: true,
          options: [
            { value: 'gemini-live', label: 'Gemini Live API (Cloud)' },
            { value: 'whisper', label: 'Whisper (Local)' },
          ],
        },
        whisperModel: {
          type: 'enum',
          label: 'Whisper Model',
          category: 'Experimental',
          requiresRestart: false,
          default: 'ggml-base.en.bin',
          description: 'The Whisper model to use for local transcription.',
          showInDialog: true,
          options: [
            { value: 'ggml-tiny.en.bin', label: 'Tiny (EN) - Fast (~75MB)' },
            {
              value: 'ggml-base.en.bin',
              label: 'Base (EN) - Balanced (~142MB)',
            },
            {
              value: 'ggml-large-v3-turbo-q5_0.bin',
              label: 'Large v3 Turbo (Q5_0) - High Accuracy (~547MB)',
            },
            {
              value: 'ggml-large-v3-turbo-q8_0.bin',
              label: 'Large v3 Turbo (Q8_0) - Max Accuracy (~834MB)',
            },
          ],
        },
        stopGracePeriodMs: {
          type: 'number',
          label: 'Voice Stop Grace Period (ms)',
          category: 'Experimental',
          requiresRestart: false,
          default: 1000,
          description:
            'How long to wait for final transcription after stopping recording.',
          showInDialog: true,
        },
      },
    },
    adk: {
      type: 'object',
      label: 'ADK',

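Taken together, the schema above corresponds to user configuration along these lines (an illustrative sketch, written as a TypeScript literal mirroring the keys that would live under `experimental` in settings.json; values are examples, not defaults):

```ts
// Hypothetical settings enabling local Whisper transcription in toggle mode.
const exampleVoiceSettings = {
  experimental: {
    voiceMode: true, // unlocks /voice and /voice model
    voice: {
      activationMode: 'toggle', // default: 'push-to-talk'
      backend: 'whisper', // default: 'gemini-live'
      whisperModel: 'ggml-base.en.bin',
      stopGracePeriodMs: 1000, // drain window for the final transcription
    },
  },
};
```
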
@@ -170,6 +170,7 @@ describe('BuiltinCommandLoader', () => {
      getAllSkills: vi.fn().mockReturnValue([]),
      isAdminEnabled: vi.fn().mockReturnValue(true),
    }),
    isVoiceModeEnabled: vi.fn().mockReturnValue(true),
    getContentGeneratorConfig: vi.fn().mockReturnValue({
      authType: 'other',
    }),
@@ -396,6 +397,7 @@ describe('BuiltinCommandLoader profile', () => {
      getAllSkills: vi.fn().mockReturnValue([]),
      isAdminEnabled: vi.fn().mockReturnValue(true),
    }),
    isVoiceModeEnabled: vi.fn().mockReturnValue(true),
    getContentGeneratorConfig: vi.fn().mockReturnValue({
      authType: 'other',
    }),

@@ -62,6 +62,7 @@ import { setupGithubCommand } from '../ui/commands/setupGithubCommand.js';
import { terminalSetupCommand } from '../ui/commands/terminalSetupCommand.js';
import { upgradeCommand } from '../ui/commands/upgradeCommand.js';
import { gemmaStatusCommand } from '../ui/commands/gemmaStatusCommand.js';
import { voiceCommand } from '../ui/commands/voiceCommand.js';

/**
 * Loads the core, hard-coded slash commands that are an integral part
@@ -227,6 +228,7 @@ export class BuiltinCommandLoader implements ICommandLoader {
      vimCommand,
      setupGithubCommand,
      terminalSetupCommand,
      ...(this.config?.isVoiceModeEnabled() ? [voiceCommand] : []),
      ...(this.config?.getContentGeneratorConfig()?.authType ===
      AuthType.LOGIN_WITH_GOOGLE
        ? [upgradeCommand]

@@ -552,6 +552,8 @@ const mockUIActions: UIActions = {
  exitPrivacyNotice: vi.fn(),
  closeSettingsDialog: vi.fn(),
  closeModelDialog: vi.fn(),
  openVoiceModelDialog: vi.fn(),
  closeVoiceModelDialog: vi.fn(),
  openAgentConfigDialog: vi.fn(),
  closeAgentConfigDialog: vi.fn(),
  openPermissionsDialog: vi.fn(),
@@ -598,6 +600,7 @@ const mockUIActions: UIActions = {
  handleNewAgentsSelect: vi.fn(),
  getPreferredEditor: vi.fn(),
  clearAccountSuspension: vi.fn(),
  setVoiceModeEnabled: vi.fn(),
};

import { type TextBuffer } from '../ui/components/shared/text-buffer.js';

@@ -103,6 +103,7 @@ import { useQuotaAndFallback } from './hooks/useQuotaAndFallback.js';
import { useEditorSettings } from './hooks/useEditorSettings.js';
import { useSettingsCommand } from './hooks/useSettingsCommand.js';
import { useModelCommand } from './hooks/useModelCommand.js';
import { useVoiceModelCommand } from './hooks/useVoiceModelCommand.js';
import { useSlashCommandProcessor } from './hooks/slashCommandProcessor.js';
import { useVimMode } from './contexts/VimModeContext.js';
import {
@@ -312,6 +313,7 @@ export const AppContainer = (props: AppContainerProps) => {
  );

  const [shellModeActive, setShellModeActive] = useState(false);
  const [isVoiceModeEnabled, setVoiceModeEnabled] = useState(false);
  const [modelSwitchedFromQuotaError, setModelSwitchedFromQuotaError] =
    useState<boolean>(false);
  const [historyRemountKey, setHistoryRemountKey] = useState(0);
@@ -946,6 +948,12 @@ Logging in with Google... Restarting Gemini CLI to continue.
  const { isModelDialogOpen, openModelDialog, closeModelDialog } =
    useModelCommand();

  const {
    isVoiceModelDialogOpen,
    openVoiceModelDialog,
    closeVoiceModelDialog,
  } = useVoiceModelCommand();

  const { toggleVimEnabled } = useVimMode();

  const setIsBackgroundTaskListOpenRef = useRef<(open: boolean) => void>(
@@ -969,6 +977,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
    openSettingsDialog,
    openSessionBrowser,
    openModelDialog,
    openVoiceModelDialog,
    openAgentConfigDialog,
    openPermissionsDialog,
    quit: (messages: HistoryItem[]) => {
@@ -981,6 +990,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
    },
    setDebugMessage,
    toggleCorgiMode: () => setCorgiMode((prev) => !prev),
    toggleVoiceMode: () => setVoiceModeEnabled((prev) => !prev),
    toggleDebugProfiler,
    dispatchExtensionStateUpdate,
    addConfirmUpdateExtensionRequest,
@@ -1006,6 +1016,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
    openSettingsDialog,
    openSessionBrowser,
    openModelDialog,
    openVoiceModelDialog,
    openAgentConfigDialog,
    setQuittingMessages,
    setDebugMessage,
@@ -2191,6 +2202,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
    isThemeDialogOpen ||
    isSettingsDialogOpen ||
    isModelDialogOpen ||
    isVoiceModelDialogOpen ||
    isAgentConfigDialogOpen ||
    isPermissionsDialogOpen ||
    isAuthenticating ||
@@ -2448,6 +2460,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
    isSettingsDialogOpen,
    isSessionBrowserOpen,
    isModelDialogOpen,
    isVoiceModelDialogOpen,
    isAgentConfigDialogOpen,
    selectedAgentName,
    selectedAgentDisplayName,
@@ -2468,6 +2481,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
    pendingGeminiHistoryItems,
    thought,
    isInputActive,
    isVoiceModeEnabled,
    isResuming,
    shouldShowIdePrompt,
    isFolderTrustDialogOpen: isFolderTrustDialogOpen ?? false,
@@ -2559,6 +2573,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
    isSettingsDialogOpen,
    isSessionBrowserOpen,
    isModelDialogOpen,
    isVoiceModelDialogOpen,
    isAgentConfigDialogOpen,
    selectedAgentName,
    selectedAgentDisplayName,
@@ -2579,6 +2594,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
    pendingGeminiHistoryItems,
    thought,
    isInputActive,
    isVoiceModeEnabled,
    isResuming,
    shouldShowIdePrompt,
    isFolderTrustDialogOpen,
@@ -2671,6 +2687,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
    exitPrivacyNotice,
    closeSettingsDialog,
    closeModelDialog,
    openVoiceModelDialog,
    closeVoiceModelDialog,
    openAgentConfigDialog,
    closeAgentConfigDialog,
    openPermissionsDialog,
@@ -2751,6 +2769,9 @@ Logging in with Google... Restarting Gemini CLI to continue.
      setAccountSuspensionInfo(null);
      setAuthState(AuthState.Updating);
    },
    setVoiceModeEnabled: (value: boolean) => {
      setVoiceModeEnabled(value);
    },
  }),
  [
    handleThemeSelect,
@@ -2764,6 +2785,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
    exitPrivacyNotice,
    closeSettingsDialog,
    closeModelDialog,
    openVoiceModelDialog,
    closeVoiceModelDialog,
    openAgentConfigDialog,
    closeAgentConfigDialog,
    openPermissionsDialog,
@@ -2807,6 +2830,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
    config,
    historyManager,
    getPreferredEditor,
    setVoiceModeEnabled,
  ],
);

@@ -72,6 +72,7 @@ export interface CommandContext {
  loadHistory: (history: HistoryItem[], postLoadInput?: string) => void;
  /** Toggles a special display mode. */
  toggleCorgiMode: () => void;
  toggleVoiceMode: () => void;
  toggleDebugProfiler: () => void;
  toggleVimEnabled: () => Promise<boolean>;
  reloadCommands: () => void;
@@ -125,6 +126,7 @@ export interface OpenDialogActionReturn {
    | 'settings'
    | 'sessionBrowser'
    | 'model'
    | 'voice-model'
    | 'agentConfig'
    | 'permissions';
}

@@ -0,0 +1,30 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { CommandKind, type SlashCommand } from './types.js';

export const voiceCommand: SlashCommand = {
  name: 'voice',
  altNames: [],
  description: 'Toggle voice dictation mode',
  kind: CommandKind.BUILT_IN,
  autoExecute: true,
  action: (context) => {
    context.ui.toggleVoiceMode();
  },
  subCommands: [
    {
      name: 'model',
      description: 'Manage voice transcription models',
      kind: CommandKind.BUILT_IN,
      autoExecute: true,
      action: async () => ({
        type: 'dialog',
        dialog: 'voice-model',
      }),
    },
  ],
};

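A note on the shape above: the subcommand's `{ type: 'dialog', dialog: 'voice-model' }` return is the new `OpenDialogActionReturn` variant added to types.ts earlier in this diff, and is routed by the `voice-model` case in useSlashCommandProcessor below. A rough sketch of dispatching it by hand, assuming the usual `(context, args)` action signature:

```ts
import { voiceCommand } from './voiceCommand.js';
import type { CommandContext } from './types.js';

// Hypothetical helper: resolve "/voice model" programmatically and inspect
// the dialog action it returns.
async function runVoiceModelCommand(context: CommandContext) {
  const model = voiceCommand.subCommands?.find((c) => c.name === 'model');
  const result = await model?.action?.(context, '');
  if (result && typeof result === 'object' && result.type === 'dialog') {
    // result.dialog === 'voice-model' -> DialogManager renders VoiceModelDialog
  }
  return result;
}
```
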
@@ -25,6 +25,7 @@ import { relaunchApp } from '../../utils/processUtils.js';
import { SessionBrowser } from './SessionBrowser.js';
import { PermissionsModifyTrustDialog } from './PermissionsModifyTrustDialog.js';
import { ModelDialog } from './ModelDialog.js';
import { VoiceModelDialog } from './VoiceModelDialog.js';
import { theme } from '../semantic-colors.js';
import { useUIState } from '../contexts/UIStateContext.js';
import { useQuotaState } from '../contexts/QuotaContext.js';
@@ -238,6 +239,9 @@ export const DialogManager = ({
  if (uiState.isModelDialogOpen) {
    return <ModelDialog onClose={uiActions.closeModelDialog} />;
  }
  if (uiState.isVoiceModelDialogOpen) {
    return <VoiceModelDialog onClose={uiActions.closeVoiceModelDialog} />;
  }
  if (
    uiState.isAgentConfigDialogOpen &&
    uiState.selectedAgentName &&

@@ -9,12 +9,41 @@ import { createMockSettings } from '../../test-utils/settings.js';
import { makeFakeConfig } from '@google/gemini-cli-core';
import { waitFor } from '../../test-utils/async.js';
import { act, useState, useMemo } from 'react';
import type { EventEmitter } from 'node:events';

const { fakeTranscriptionProvider } = vi.hoisted(() => {
  // Use require within hoisted block for immediate synchronous access
  // eslint-disable-next-line @typescript-eslint/no-require-imports, no-restricted-syntax
  const { EventEmitter } = require('node:events');
  class FakeTranscriptionProvider extends EventEmitter {
    connect = vi.fn().mockResolvedValue(undefined);
    disconnect = vi.fn();
    sendAudioChunk = vi.fn();
    getTranscription = vi.fn().mockReturnValue('');
  }
  return {
    fakeTranscriptionProvider: new FakeTranscriptionProvider(),
  };
});

vi.mock('@google/gemini-cli-core', async (importOriginal) => {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const actual = (await importOriginal()) as any;
  return {
    ...actual,
    TranscriptionFactory: {
      createProvider: vi.fn(() => fakeTranscriptionProvider),
    },
  };
});

import {
  InputPrompt,
  tryTogglePasteExpansion,
  type InputPromptProps,
} from './InputPrompt.js';
import { InputContext } from '../contexts/InputContext.js';
import { type UIState } from '../contexts/UIStateContext.js';
import {
  calculateTransformationsForLine,
  calculateTransformedLine,
@@ -417,6 +446,7 @@ describe('InputPrompt', () => {
    getWorkspaceContext: () => ({
      getDirectories: () => ['/test/project/src'],
    }),
    getContentGeneratorConfig: () => ({ apiKey: 'test-api-key' }),
  } as unknown as Config,
  slashCommands: mockSlashCommands,
  commandContext: mockCommandContext,
@@ -4925,6 +4955,383 @@ describe('InputPrompt', () => {
      },
    );
  });

  describe('Voice Mode', () => {
    beforeEach(() => {
      (
        fakeTranscriptionProvider as unknown as EventEmitter
      ).removeAllListeners();
      vi.clearAllMocks();
    });

    it('should start recording when space is pressed and voice mode is enabled (toggle)', async () => {
      await act(async () => {
        mockBuffer.setText('');
      });
      const { stdin, unmount, lastFrame } = await renderWithProviders(
        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
        {
          uiState: { isVoiceModeEnabled: true } as UIState,
          settings: createMockSettings({
            experimental: { voice: { activationMode: 'toggle' } },
          }),
        },
      );

      // Initially not recording
      expect(lastFrame()).not.toContain('🎙️ Listening...');
      expect(lastFrame()).toContain(
        'Voice mode: Space to start/stop recording',
      );

      // Press space to start
      await act(async () => {
        stdin.write(' ');
      });

      // Now should show listening
      await waitFor(() => {
        expect(lastFrame()).toContain('🎙️ Listening...');
      });

      unmount();
    });

    it('should toggle recording off when space is pressed again (toggle)', async () => {
      await act(async () => {
        mockBuffer.setText('');
      });
      const { stdin, unmount, lastFrame } = await renderWithProviders(
        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
        {
          uiState: { isVoiceModeEnabled: true } as UIState,
          settings: createMockSettings({
            experimental: { voice: { activationMode: 'toggle' } },
          }),
        },
      );

      // Start recording
      await act(async () => {
        stdin.write(' ');
      });
      await waitFor(() => {
        expect(lastFrame()).toContain('🎙️ Listening...');
      });

      // Stop recording
      await act(async () => {
        stdin.write(' ');
      });
      await waitFor(() => {
        expect(lastFrame()).not.toContain('🎙️ Listening...');
        expect(lastFrame()).toContain(
          'Voice mode: Space to start/stop recording',
        );
      });

      unmount();
    });

    it('should resume recording when space is pressed even if buffer is not empty (toggle)', async () => {
      await act(async () => {
        mockBuffer.setText('some existing text');
      });
      const { stdin, unmount, lastFrame } = await renderWithProviders(
        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
        {
          uiState: { isVoiceModeEnabled: true } as UIState,
          settings: createMockSettings({
            experimental: { voice: { activationMode: 'toggle' } },
          }),
        },
      );

      // Should show voice mode hint even if buffer is not empty (new behavior)
      expect(lastFrame()).toContain(
        'Voice mode: Space to start/stop recording',
      );
      expect(lastFrame()).toContain('some existing text');

      // Press space to start recording again
      await act(async () => {
        stdin.write(' ');
      });

      await waitFor(() => {
        expect(lastFrame()).toContain('🎙️ Listening...');
      });

      unmount();
    });

    it('should not start recording if voice mode is disabled (toggle)', async () => {
      await act(async () => {
        mockBuffer.setText('');
      });
      const { stdin, unmount, lastFrame } = await renderWithProviders(
        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
        {
          uiState: { isVoiceModeEnabled: false } as UIState,
          settings: createMockSettings({
            experimental: { voice: { activationMode: 'toggle' } },
          }),
        },
      );

      // Press space
      await act(async () => {
        stdin.write(' ');
      });

      // Should NOT show listening, instead should call handleInput which handles space
      expect(lastFrame()).not.toContain('🎙️ Listening...');
      expect(mockBuffer.handleInput).toHaveBeenCalled();
      unmount();
    });

    it('should append transcription correctly across multiple turn updates (toggle)', async () => {
      await act(async () => {
        mockBuffer.setText('initial');
      });
      const { stdin, unmount } = await renderWithProviders(
        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
        {
          uiState: { isVoiceModeEnabled: true } as UIState,
          settings: createMockSettings({
            experimental: { voice: { activationMode: 'toggle' } },
          }),
        },
      );

      // Start recording
      await act(async () => {
        stdin.write(' ');
      });

      // Emit first transcription
      await act(async () => {
        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
          'transcription',
          'hello',
        );
      });
      await waitFor(() => {
        expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
      });

      // Emit turnComplete (Gemini Live starts over after this)
      await act(async () => {
        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
          'turnComplete',
        );
      });

      // Emit second part (Gemini Live sends new turn text starting from empty)
      await act(async () => {
        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
          'transcription',
          'world',
        );
      });
      await waitFor(() => {
        // Should have appended 'world' to the baseline 'initial hello'
        expect(mockBuffer.setText).toHaveBeenCalledWith(
          'initial hello world',
          'end',
        );
      });

      unmount();
    });

    it('should append transcription correctly when resuming voice mode (toggle)', async () => {
      await act(async () => {
        mockBuffer.setText('First turn.');
      });
      const { stdin, unmount } = await renderWithProviders(
        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
        {
          uiState: { isVoiceModeEnabled: true } as UIState,
          settings: createMockSettings({
            experimental: { voice: { activationMode: 'toggle' } },
          }),
        },
      );

      // Start recording (resumed)
      await act(async () => {
        stdin.write(' ');
      });

      // Emit transcription
      await act(async () => {
        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
          'transcription',
          'Second turn.',
        );
      });

      await waitFor(() => {
        expect(mockBuffer.setText).toHaveBeenCalledWith(
          'First turn. Second turn.',
          'end',
        );
      });

      unmount();
    });

    describe('push-to-talk', () => {
      beforeEach(() => {
        vi.useFakeTimers();
      });

      afterEach(() => {
        vi.useRealTimers();
      });

      it('should insert a space on a single tap', async () => {
        const { stdin, unmount, lastFrame } = await renderWithProviders(
          <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
          {
            uiState: { isVoiceModeEnabled: true } as UIState,
            settings: createMockSettings({
              experimental: { voice: { activationMode: 'push-to-talk' } },
            }),
          },
        );

        expect(lastFrame()).toContain('Voice mode: Hold Space to record');

        // Press space once
        await act(async () => {
          stdin.write(' ');
        });

        // Should insert space optimistically
        expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
        expect(lastFrame()).not.toContain('🎙️ Listening...');

        // Advance timer past HOLD_DELAY_MS
        await act(async () => {
          vi.advanceTimersByTime(700);
        });

        expect(lastFrame()).not.toContain('🎙️ Listening...');
        unmount();
      });

      it('should start recording on hold (simulated by repeat spaces)', async () => {
        const { stdin, unmount, lastFrame } = await renderWithProviders(
          <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
          {
            uiState: { isVoiceModeEnabled: true } as UIState,
            settings: createMockSettings({
              experimental: { voice: { activationMode: 'push-to-talk' } },
            }),
          },
        );

        // First space
        await act(async () => {
          stdin.write(' ');
        });
        expect(mockBuffer.insert).toHaveBeenCalledWith(' ');

        // Second space (repeat)
        await act(async () => {
          stdin.write(' ');
        });

        await waitFor(() => {
          // Should have backspaced the optimistic space
          expect(mockBuffer.backspace).toHaveBeenCalled();
          // Should show listening
          expect(lastFrame()).toContain('🎙️ Listening...');
        });

        unmount();
      });

      it('should stop recording when space heartbeat stops (release)', async () => {
        const { stdin, unmount, lastFrame } = await renderWithProviders(
          <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
          {
            uiState: { isVoiceModeEnabled: true } as UIState,
            settings: createMockSettings({
              experimental: { voice: { activationMode: 'push-to-talk' } },
            }),
          },
        );

        // Start hold
        await act(async () => {
          stdin.write(' ');
          stdin.write(' ');
        });

        // Use a short interval in waitFor to prevent advancing fake timers past the 300ms RELEASE_DELAY_MS
        await waitFor(
          () => {
            expect(lastFrame()).toContain('🎙️ Listening...');
          },
          { interval: 10 },
        );

        // Simulate heartbeat (held key) - send space first to reset timer, then advance
        await act(async () => {
          stdin.write(' ');
          vi.advanceTimersByTime(100);
        });
        expect(lastFrame()).toContain('🎙️ Listening...');

        // Stop heartbeat (release)
        await act(async () => {
          vi.advanceTimersByTime(400); // Past RELEASE_DELAY_MS
        });

        await waitFor(() => {
          expect(lastFrame()).not.toContain('🎙️ Listening...');
        });

        unmount();
      });

      it('should cancel hold state if non-space key is pressed after first space', async () => {
        const { stdin, unmount } = await renderWithProviders(
          <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
          {
            uiState: { isVoiceModeEnabled: true } as UIState,
            settings: createMockSettings({
              experimental: { voice: { activationMode: 'push-to-talk' } },
            }),
          },
        );

        // First space
        await act(async () => {
          stdin.write(' ');
        });

        // Type 'a'
        await act(async () => {
          stdin.write('a');
        });

        // Should NOT start recording on next space even if fast
        await act(async () => {
          stdin.write(' ');
        });

        expect(mockBuffer.insert).toHaveBeenCalledTimes(2); // Two spaces inserted
        expect(mockBuffer.handleInput).toHaveBeenCalledWith(
          expect.objectContaining({ name: 'a' }),
        );
        unmount();
      });
    });
  });
});

function clean(str: string | undefined): string {

@@ -56,6 +56,7 @@ import {
  debugLogger,
  type Config,
} from '@google/gemini-cli-core';
import { useVoiceMode } from '../hooks/useVoiceMode.js';
import {
  parseInputForHighlighting,
  parseSegmentsFromTokens,
@@ -159,7 +160,6 @@ export function isLargePaste(text: string): boolean {
}

const DOUBLE_TAB_CLEAN_UI_TOGGLE_WINDOW_MS = 350;

/**
 * Attempt to toggle expansion of a paste placeholder in the buffer.
 * Returns true if a toggle action was performed or hint was shown, false otherwise.
@@ -238,6 +238,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
    setEmbeddedShellFocused,
    setShortcutsHelpVisible,
    toggleCleanUiDetailsVisible,
    setVoiceModeEnabled,
  } = useUIActions();
  const {
    terminalWidth,
@@ -246,6 +247,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
    backgroundTasks,
    backgroundTaskHeight,
    shortcutsHelpVisible,
    isVoiceModeEnabled,
  } = useUIState();
  const [suppressCompletion, setSuppressCompletion] = useState(false);
  const { handlePress: registerPlainTabPress, resetCount: resetPlainTabPress } =
@@ -263,6 +265,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
    resetEscapeState();
    if (buffer.text.length > 0) {
      buffer.setText('');
      resetTurnBaseline();
      resetCompletionState();
    } else if (history.length > 0) {
      onSubmit('/rewind');
@@ -281,6 +284,16 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
  const hasUserNavigatedSuggestions = useRef(false);
  const listRef = useRef<ScrollableListRef<ScrollableItem>>(null);

  const { isRecording, handleVoiceInput, resetTurnBaseline } = useVoiceMode({
    buffer,
    config,
    settings,
    setQueueErrorMessage,
    isVoiceModeEnabled,
    setVoiceModeEnabled,
    keyMatchers,
  });

  const [reverseSearchActive, setReverseSearchActive] = useState(false);
  const [commandSearchActive, setCommandSearchActive] = useState(false);
  const [textBeforeReverseSearch, setTextBeforeReverseSearch] = useState('');
@@ -387,6 +400,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
      // Clear the buffer *before* calling onSubmit to prevent potential re-submission
      // if onSubmit triggers a re-render while the buffer still holds the old value.
      buffer.setText('');
      resetTurnBaseline();
      onSubmit(processedValue);
      resetCompletionState();
      resetReverseSearchCompletionState();
@@ -398,6 +412,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
      shellModeActive,
      shellHistory,
      resetReverseSearchCompletionState,
      resetTurnBaseline,
    ],
  );

@@ -647,6 +662,8 @@ export const InputPrompt: React.FC<InputPromptProps> = ({

  const handleInput = useCallback(
    (key: Key) => {
      if (handleVoiceInput(key)) return true;

      // Determine if this keypress is a history navigation command
      const isHistoryUp =
        !shellModeActive &&
@@ -873,9 +890,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
      ) {
        setShellModeActive(!shellModeActive);
        buffer.setText(''); // Clear the '!' from input
        resetTurnBaseline();
        return true;
      }

      if (keyMatchers[Command.ESCAPE](key)) {
        const cancelSearch = (
          setActive: (active: boolean) => void,
@@ -1360,6 +1377,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
      backgroundTaskHeight,
      streamingState,
      handleEscPress,
      resetTurnBaseline,
      registerPlainTabPress,
      resetPlainTabPress,
      toggleCleanUiDetailsVisible,
@@ -1369,9 +1387,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
      keyMatchers,
      isHelpDismissKey,
      settings,
      handleVoiceInput,
    ],
  );

  useKeypress(handleInput, {
    isActive: !isEmbeddedShellFocused && !copyModeEnabled,
    priority: true,
@@ -1792,20 +1810,39 @@
          )}{' '}
        </Text>
        <Box flexGrow={1} flexDirection="column" ref={innerBoxRef}>
          {buffer.text.length === 0 && placeholder ? (
            showCursor ? (
              <Text
                terminalCursorFocus={showCursor}
                terminalCursorPosition={0}
              >
                {chalk.inverse(placeholder.slice(0, 1))}
                <Text color={theme.text.secondary}>
                  {placeholder.slice(1)}
                </Text>
          {isRecording && (
            <Box flexDirection="row" marginBottom={0}>
              <Text color={theme.status.success}>🎙️ Listening...</Text>
            </Box>
          )}
          {isVoiceModeEnabled && !isRecording && (
            <Box flexDirection="row" marginBottom={0}>
              <Text color={theme.text.secondary}>
                > Voice mode:{' '}
                {(settings.experimental.voice?.activationMode ??
                  'push-to-talk') === 'push-to-talk'
                  ? 'Hold Space to record'
                  : 'Space to start/stop recording'}{' '}
                (Esc to exit)
              </Text>
            ) : (
              <Text color={theme.text.secondary}>{placeholder}</Text>
            )
            </Box>
          )}
          {buffer.text.length === 0 && !isRecording ? (
            !isVoiceModeEnabled && placeholder ? (
              showCursor ? (
                <Text
                  terminalCursorFocus={showCursor}
                  terminalCursorPosition={0}
                >
                  {chalk.inverse(placeholder.slice(0, 1))}
                  <Text color={theme.text.secondary}>
                    {placeholder.slice(1)}
                  </Text>
                </Text>
              ) : (
                <Text color={theme.text.secondary}>{placeholder}</Text>
              )
            ) : null
          ) : (
            <Box
              flexDirection="column"

@@ -0,0 +1,236 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import type React from 'react';
import { useCallback, useMemo, useState } from 'react';
import { Box, Text } from 'ink';
import { theme } from '../semantic-colors.js';
import { DescriptiveRadioButtonSelect } from './shared/DescriptiveRadioButtonSelect.js';
import { useSettingsStore } from '../contexts/SettingsContext.js';
import { SettingScope } from '../../config/settings.js';
import { useKeypress, type Key } from '../hooks/useKeypress.js';
import { isBinaryAvailable } from '@google/gemini-cli-core';
import {
  WhisperModelManager,
  type WhisperModelProgress,
} from '@google/gemini-cli-core';
import { CliSpinner } from './CliSpinner.js';

interface VoiceModelDialogProps {
  onClose: () => void;
}

type DialogView = 'backend' | 'whisper-models';

const WHISPER_MODELS = [
  {
    value: 'ggml-tiny.en.bin',
    label: 'Tiny (EN)',
    description: 'Fastest, lower accuracy (~75MB)',
  },
  {
    value: 'ggml-base.en.bin',
    label: 'Base (EN)',
    description: 'Balanced speed and accuracy (~142MB)',
  },
  {
    value: 'ggml-large-v3-turbo-q5_0.bin',
    label: 'Large v3 Turbo (Q5_0)',
    description: 'High accuracy, quantized (~547MB)',
  },
  {
    value: 'ggml-large-v3-turbo-q8_0.bin',
    label: 'Large v3 Turbo (Q8_0)',
    description: 'Maximum accuracy, high memory (~834MB)',
  },
];

export function VoiceModelDialog({
  onClose,
}: VoiceModelDialogProps): React.JSX.Element {
  const { settings, setSetting } = useSettingsStore();
  const [view, setView] = useState<DialogView>('backend');
  const [downloadProgress, setDownloadProgress] =
    useState<WhisperModelProgress | null>(null);
  const [error, setError] = useState<string | null>(null);

  const whisperInstalled = useMemo(
    () => isBinaryAvailable('whisper-stream'),
    [],
  );
  const modelManager = useMemo(() => new WhisperModelManager(), []);

  const currentBackend =
    settings.merged.experimental.voice?.backend ?? 'gemini-live';
  const currentWhisperModel =
    settings.merged.experimental.voice?.whisperModel ?? 'ggml-base.en.bin';

  const handleKeypress = useCallback(
    (key: Key) => {
      if (key.name === 'escape') {
        if (view === 'whisper-models') {
          setView('backend');
        } else {
          onClose();
        }
        return true;
      }
      return false;
    },
    [view, onClose],
  );

  useKeypress(handleKeypress, { isActive: true });

  const handleBackendSelect = useCallback(
    (value: string) => {
      if (value === 'whisper') {
        setView('whisper-models');
      } else {
        setSetting(
          SettingScope.User,
          'experimental.voice.backend',
          'gemini-live',
        );
        onClose();
      }
    },
    [setSetting, onClose],
  );

  const handleWhisperModelSelect = useCallback(
    async (modelName: string) => {
      if (modelManager.isModelInstalled(modelName)) {
        setSetting(SettingScope.User, 'experimental.voice.backend', 'whisper');
        setSetting(
          SettingScope.User,
          'experimental.voice.whisperModel',
          modelName,
        );
        onClose();
      } else {
        setError(null);
        const onProgress = (p: WhisperModelProgress) => setDownloadProgress(p);
        modelManager.on('progress', onProgress);

        try {
          await modelManager.downloadModel(modelName);

          setSetting(
            SettingScope.User,
            'experimental.voice.backend',
            'whisper',
          );
          setSetting(
            SettingScope.User,
            'experimental.voice.whisperModel',
            modelName,
          );
          onClose();
        } catch (err) {
          setError(
            `Failed to download: ${err instanceof Error ? err.message : String(err)}`,
          );
        } finally {
          modelManager.off('progress', onProgress);
          setDownloadProgress(null);
        }
      }
    },
    [modelManager, setSetting, onClose],
  );

  const backendOptions = useMemo(
    () => [
      {
        value: 'gemini-live',
        title: 'Gemini Live API (Cloud)',
        description: 'Real-time cloud transcription via Gemini Live API.',
        key: 'gemini-live',
      },
      {
        value: 'whisper',
        title: 'Whisper (Local)',
        description: whisperInstalled
          ? 'Local transcription using whisper.cpp.'
          : 'Local transcription (Requires: brew install whisper-cpp)',
        key: 'whisper',
      },
    ],
    [whisperInstalled],
  );

  const whisperOptions = useMemo(
    () =>
      WHISPER_MODELS.map((m) => ({
        value: m.value,
        title: `${m.label}${modelManager.isModelInstalled(m.value) ? ' (Installed)' : ' (Download)'}`,
        description: m.description,
        key: m.value,
      })),
    [modelManager],
  );

  return (
    <Box
      borderStyle="round"
      borderColor={theme.border.default}
      flexDirection="column"
      padding={1}
      width="100%"
    >
      <Text bold>
        {view === 'backend'
          ? 'Select Voice Transcription Backend'
          : 'Select Whisper Model'}
      </Text>

      {error && (
        <Box marginTop={1}>
          <Text color={theme.status.error}>{error}</Text>
        </Box>
      )}

      {downloadProgress ? (
        <Box marginTop={1} flexDirection="column">
          <Box>
            <Text>Downloading {downloadProgress.modelName}... </Text>
            <CliSpinner />
            <Text> {Math.round(downloadProgress.percentage * 100)}%</Text>
          </Box>
        </Box>
      ) : (
        <Box marginTop={1}>
          {view === 'backend' ? (
            <DescriptiveRadioButtonSelect
              items={backendOptions}
              onSelect={handleBackendSelect}
              initialIndex={currentBackend === 'whisper' ? 1 : 0}
              showNumbers={true}
            />
          ) : (
            <DescriptiveRadioButtonSelect
              items={whisperOptions}
              onSelect={handleWhisperModelSelect}
              initialIndex={whisperOptions.findIndex(
                (o) => o.value === currentWhisperModel,
              )}
              showNumbers={true}
            />
          )}
        </Box>
      )}

      <Box marginTop={1} flexDirection="column">
        <Text color={theme.text.secondary}>
          {view === 'whisper-models'
            ? '(Press Esc to go back)'
            : '(Press Esc to close)'}
        </Text>
      </Box>
    </Box>
  );
}

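The dialog's download path can be read in isolation; below is a minimal sketch of the same flow, using only the WhisperModelManager surface visible in this diff (isModelInstalled, downloadModel, and a 'progress' event carrying { modelName, percentage }, where percentage appears to be in the 0-1 range judging by the `* 100` above):

```ts
import {
  WhisperModelManager,
  type WhisperModelProgress,
} from '@google/gemini-cli-core';

// Ensure a Whisper model is present locally, reporting download progress.
async function ensureWhisperModel(name: string): Promise<void> {
  const manager = new WhisperModelManager();
  if (manager.isModelInstalled(name)) return;

  const onProgress = (p: WhisperModelProgress) =>
    console.log(`${p.modelName}: ${Math.round(p.percentage * 100)}%`);
  manager.on('progress', onProgress);
  try {
    await manager.downloadModel(name); // resolves once the file is on disk
  } finally {
    manager.off('progress', onProgress); // always detach, as the dialog does
  }
}
```
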
@@ -168,13 +168,6 @@ exports[`InputPrompt > mouse interaction > should toggle paste expansion on doub
"
`;

exports[`InputPrompt > mouse interaction > should toggle paste expansion on double-click 4`] = `
"▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀
 > [Pasted Text: 10 lines]
▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
"
`;

exports[`InputPrompt > multiline rendering > should correctly render multiline input including blank lines 1`] = `
"────────────────────────────────────────────────────────────────────────────────────────────────────
 > hello

@@ -41,6 +41,8 @@ export interface UIActions {
  exitPrivacyNotice: () => void;
  closeSettingsDialog: () => void;
  closeModelDialog: () => void;
  openVoiceModelDialog: () => void;
  closeVoiceModelDialog: () => void;
  openAgentConfigDialog: (
    name: string,
    displayName: string,
@@ -93,6 +95,7 @@ export interface UIActions {
  handleNewAgentsSelect: (choice: NewAgentsChoice) => Promise<void>;
  getPreferredEditor: () => EditorType | undefined;
  clearAccountSuspension: () => void;
  setVoiceModeEnabled: (value: boolean) => void;
}

export const UIActionsContext = createContext<UIActions | null>(null);

@@ -112,6 +112,7 @@ export interface UIState {
  isSettingsDialogOpen: boolean;
  isSessionBrowserOpen: boolean;
  isModelDialogOpen: boolean;
  isVoiceModelDialogOpen: boolean;
  isAgentConfigDialogOpen: boolean;
  selectedAgentName?: string;
  selectedAgentDisplayName?: string;
@@ -132,6 +133,7 @@ export interface UIState {
  pendingGeminiHistoryItems: HistoryItemWithoutId[];
  thought: ThoughtSummary | null;
  isInputActive: boolean;
  isVoiceModeEnabled: boolean;
  isResuming: boolean;
  shouldShowIdePrompt: boolean;
  isFolderTrustDialogOpen: boolean;

@@ -205,11 +205,13 @@ describe('useSlashCommandProcessor', () => {
    openSettingsDialog: vi.fn(),
    openSessionBrowser: vi.fn(),
    openModelDialog: mockOpenModelDialog,
    openVoiceModelDialog: vi.fn(),
    openAgentConfigDialog,
    openPermissionsDialog: vi.fn(),
    quit: mockSetQuittingMessages,
    setDebugMessage: vi.fn(),
    toggleCorgiMode: vi.fn(),
    toggleVoiceMode: vi.fn(),
    toggleDebugProfiler: vi.fn(),
    dispatchExtensionStateUpdate: vi.fn(),
    addConfirmUpdateExtensionRequest: vi.fn(),

@@ -72,6 +72,7 @@ interface SlashCommandProcessorActions {
  openSettingsDialog: () => void;
  openSessionBrowser: () => void;
  openModelDialog: () => void;
  openVoiceModelDialog: () => void;
  openAgentConfigDialog: (
    name: string,
    displayName: string,
@@ -81,6 +82,7 @@ interface SlashCommandProcessorActions {
  quit: (messages: HistoryItem[]) => void;
  setDebugMessage: (message: string) => void;
  toggleCorgiMode: () => void;
  toggleVoiceMode: () => void;
  toggleDebugProfiler: () => void;
  dispatchExtensionStateUpdate: (action: ExtensionUpdateAction) => void;
  addConfirmUpdateExtensionRequest: (request: ConfirmationRequest) => void;
@@ -232,6 +234,7 @@ export const useSlashCommandProcessor = (
      pendingItem,
      setPendingItem,
      toggleCorgiMode: actions.toggleCorgiMode,
      toggleVoiceMode: actions.toggleVoiceMode,
      toggleDebugProfiler: actions.toggleDebugProfiler,
      toggleVimEnabled,
      reloadCommands,
@@ -503,6 +506,9 @@ export const useSlashCommandProcessor = (
        case 'model':
          actions.openModelDialog();
          return { type: 'handled' };
        case 'voice-model':
          actions.openVoiceModelDialog();
          return { type: 'handled' };
        case 'agentConfig': {
          // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
          const props = result.props as Record<string, unknown>;

@@ -0,0 +1,429 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { useState, useRef, useCallback, useEffect } from 'react';
|
||||
import {
|
||||
AudioRecorder,
|
||||
TranscriptionFactory,
|
||||
debugLogger,
|
||||
type Config,
|
||||
type TranscriptionProvider,
|
||||
} from '@google/gemini-cli-core';
|
||||
import type { TextBuffer } from '../components/shared/text-buffer.js';
|
||||
import type { MergedSettings } from '../../config/settingsSchema.js';
|
||||
import type { Key } from './useKeypress.js';
|
||||
import { Command } from '../key/keyMatchers.js';
|
||||
|
||||
interface UseVoiceModeProps {
|
||||
buffer: TextBuffer;
|
||||
config: Config;
|
||||
settings: MergedSettings;
|
||||
setQueueErrorMessage: (message: string | null) => void;
|
||||
isVoiceModeEnabled: boolean;
|
||||
setVoiceModeEnabled: (enabled: boolean) => void;
|
||||
keyMatchers: Record<Command, (key: Key) => boolean>;
|
||||
}
|
||||
|
||||
const HOLD_DELAY_MS = 600;
|
||||
const RELEASE_DELAY_MS = 300;
|
||||
|
||||
export function useVoiceMode({
|
||||
buffer,
|
||||
config,
|
||||
settings,
|
||||
setQueueErrorMessage,
|
||||
isVoiceModeEnabled,
|
||||
setVoiceModeEnabled,
|
||||
keyMatchers,
|
||||
}: UseVoiceModeProps) {
|
||||
const [isRecording, setIsRecording] = useState(false);
|
||||
const [isConnecting, setIsConnecting] = useState(false);
|
||||
|
||||
const liveTranscriptionRef = useRef('');
|
||||
const stopRequestedRef = useRef(false);
|
||||
const isRecordingRef = useRef(false);
|
||||
const lastFailureTimeRef = useRef(0);
|
||||
const recordingInProgressRef = useRef(false);
|
||||
const voiceTimeoutRef = useRef<NodeJS.Timeout | null>(null);
|
||||
const recorderRef = useRef<AudioRecorder | null>(null);
|
||||
const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null);
|
||||
const turnBaselineRef = useRef<string | null>(null);
|
||||
|
||||
const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle');
|
||||
const pttTimerRef = useRef<NodeJS.Timeout | null>(null);
|
||||
const disconnectTimerRef = useRef<NodeJS.Timeout | null>(null);
|
||||
|
||||
const bufferRef = useRef(buffer);
|
||||
bufferRef.current = buffer;
|
||||
|
||||
const stopVoiceRecording = useCallback(() => {
|
||||
if (stopRequestedRef.current) return;
|
||||
debugLogger.debug('[Voice] Stop requested');
|
||||
stopRequestedRef.current = true;
|
||||
|
||||
setIsRecording(false);
|
||||
isRecordingRef.current = false;
|
||||
setIsConnecting(false);
|
||||
|
||||
if (recorderRef.current) {
|
||||
recorderRef.current.stop();
|
||||
recorderRef.current = null;
|
||||
}
|
||||
|
||||
const serviceToDisconnect = transcriptionServiceRef.current;
|
||||
transcriptionServiceRef.current = null;
|
||||
|
||||
if (serviceToDisconnect) {
|
||||
const isLive = settings.experimental.voice?.backend === 'gemini-live';
|
||||
const gracePeriodMs =
|
||||
settings.experimental.voice?.stopGracePeriodMs ??
|
||||
(isLive ? 2000 : 1000);
|
||||
debugLogger.debug(
|
||||
`[Voice] Draining transcription for ${gracePeriodMs}ms`,
|
||||
);
|
||||
|
||||
if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
|
||||
disconnectTimerRef.current = setTimeout(() => {
|
||||
debugLogger.debug('[Voice] Grace period ended, disconnecting service');
|
||||
serviceToDisconnect.disconnect();
|
||||
disconnectTimerRef.current = null;
|
||||
}, gracePeriodMs);
|
||||
}
|
||||
|
||||
liveTranscriptionRef.current = '';
|
||||
pttStateRef.current = 'idle';
|
||||
}, [settings.experimental.voice]);
|
||||
|
||||
const startVoiceRecording = useCallback(() => {
|
||||
if (
|
||||
isRecordingRef.current ||
|
||||
Date.now() - lastFailureTimeRef.current < 2000
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (disconnectTimerRef.current) {
|
||||
clearTimeout(disconnectTimerRef.current);
|
||||
disconnectTimerRef.current = null;
|
||||
}
|
||||
|
||||
recordingInProgressRef.current = true;
|
||||
turnBaselineRef.current = bufferRef.current.text;
|
||||
|
||||
setIsConnecting(true);
|
||||
setIsRecording(true);
|
||||
isRecordingRef.current = true;
|
||||
|
||||
liveTranscriptionRef.current = '';
|
||||
stopRequestedRef.current = false;
|
||||
|
||||
const apiKey =
|
||||
config.getContentGeneratorConfig()?.apiKey ||
|
||||
process.env['GEMINI_API_KEY'] ||
|
||||
'';
|
||||
|
||||
const startAsync = async () => {
|
||||
// If there's an active draining service, disconnect it immediately
|
||||
// before starting a new one to prevent orphaned event collisions.
|
||||
if (disconnectTimerRef.current) {
|
||||
clearTimeout(disconnectTimerRef.current);
|
||||
disconnectTimerRef.current = null;
|
||||
}
|
||||
if (transcriptionServiceRef.current) {
|
||||
transcriptionServiceRef.current.disconnect();
|
||||
transcriptionServiceRef.current = null;
|
||||
}
|
||||
|
||||
const cleanupIfStopped = () => {
|
||||
if (stopRequestedRef.current) {
|
||||
if (recorderRef.current) {
|
||||
recorderRef.current.stop();
|
||||
recorderRef.current = null;
|
||||
}
|
||||
if (transcriptionServiceRef.current) {
|
||||
transcriptionServiceRef.current.disconnect();
|
||||
transcriptionServiceRef.current = null;
|
||||
}
|
||||
setIsRecording(false);
|
||||
isRecordingRef.current = false;
|
||||
setIsConnecting(false);
|
||||
recordingInProgressRef.current = false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
if (cleanupIfStopped()) return;
|
||||
|
||||
const voiceBackend =
|
||||
settings.experimental.voice?.backend ?? 'gemini-live';
|
||||
|
||||
if (!apiKey && voiceBackend === 'gemini-live') {
|
||||
setQueueErrorMessage(
|
||||
'Cloud voice mode requires a GEMINI_API_KEY. Please set it in your environment or ~/.gemini/.env.',
|
||||
);
|
||||
setIsRecording(false);
|
||||
isRecordingRef.current = false;
|
||||
setIsConnecting(false);
|
||||
recordingInProgressRef.current = false;
|
||||
lastFailureTimeRef.current = Date.now();
|
||||
return;
|
||||
}
|
||||
|
||||
if (voiceBackend === 'gemini-live') {
|
||||
recorderRef.current = new AudioRecorder();
|
||||
}
|
||||
|
||||
const currentService = TranscriptionFactory.createProvider(
|
||||
settings.experimental.voice,
|
||||
apiKey,
|
||||
);
|
||||
transcriptionServiceRef.current = currentService;
|
||||
|
||||
currentService.on('transcription', (text) => {
|
||||
if (
|
||||
transcriptionServiceRef.current !== currentService &&
|
||||
stopRequestedRef.current
|
||||
) {
|
||||
// If this is an orphaned service that was replaced by a new session, ignore its events
|
||||
return;
|
||||
}
|
||||
|
||||
if (text) {
|
||||
const currentBufferText = bufferRef.current.text;
|
||||
const previousTranscription = liveTranscriptionRef.current;
|
||||
|
||||
let newTotalText = currentBufferText;
|
||||
|
||||
if (
|
||||
previousTranscription &&
|
||||
currentBufferText.endsWith(previousTranscription)
|
||||
) {
|
||||
newTotalText = currentBufferText.slice(
|
||||
0,
|
||||
-previousTranscription.length,
|
||||
);
|
||||
} else if (
|
||||
currentBufferText &&
|
||||
!currentBufferText.endsWith(' ') &&
|
||||
!currentBufferText.endsWith('\n')
|
||||
) {
|
||||
newTotalText += ' ';
|
||||
}
|
||||
|
||||
newTotalText += text;
|
||||
bufferRef.current.setText(newTotalText, 'end');
|
||||
}
|
||||
liveTranscriptionRef.current = text;
|
||||
});
|
||||
|
||||
currentService.on('turnComplete', () => {
|
||||
if (
|
||||
transcriptionServiceRef.current !== currentService &&
|
||||
stopRequestedRef.current
|
||||
)
|
||||
return;
|
||||
liveTranscriptionRef.current = '';
|
||||
});
|
||||
|
||||
currentService.on('error', (err) => {
|
||||
if (transcriptionServiceRef.current !== currentService) return;
|
||||
debugLogger.error('[Voice] Transcription error:', err);
|
||||
lastFailureTimeRef.current = Date.now();
|
||||
recordingInProgressRef.current = false;
|
||||
});
|
||||
|
||||
currentService.on('close', () => {
|
||||
if (transcriptionServiceRef.current !== currentService) return;
|
||||
if (!stopRequestedRef.current) {
|
||||
setIsRecording(false);
|
||||
isRecordingRef.current = false;
|
||||
setIsConnecting(false);
|
||||
recordingInProgressRef.current = false;
|
||||
lastFailureTimeRef.current = Date.now();
|
||||
}
|
||||
});
|
||||
|
||||
try {
|
||||
await currentService.connect();
|
||||
if (cleanupIfStopped()) return;
|
||||
|
||||
await recorderRef.current?.start();
|
||||
if (cleanupIfStopped()) return;
|
||||
|
||||
setIsConnecting(false);
|
||||
|
||||
const currentVoiceBackend =
|
||||
settings.experimental.voice?.backend ?? 'gemini-live';
|
||||
|
||||
recorderRef.current?.on('data', (chunk) => {
|
||||
if (currentVoiceBackend === 'gemini-live') {
|
||||
currentService.sendAudioChunk(chunk);
|
||||
}
|
||||
});
|
||||
recorderRef.current?.on('error', (err) => {
|
||||
debugLogger.error('[Voice] Recorder error:', err);
|
||||
stopVoiceRecording();
|
||||
lastFailureTimeRef.current = Date.now();
|
||||
});
|
||||
} catch (err: unknown) {
|
||||
if (transcriptionServiceRef.current !== currentService) return;
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
setQueueErrorMessage(`Voice mode failure: ${message}`);
|
||||
setIsRecording(false);
|
||||
isRecordingRef.current = false;
|
||||
setIsConnecting(false);
|
||||
recordingInProgressRef.current = false;
|
||||
lastFailureTimeRef.current = Date.now();
|
||||
|
||||
if (recorderRef.current) {
|
||||
recorderRef.current.stop();
|
||||
recorderRef.current = null;
|
||||
}
|
||||
if (transcriptionServiceRef.current) {
|
||||
          transcriptionServiceRef.current.disconnect();
          transcriptionServiceRef.current = null;
        }
      }
    };

    void startAsync();
  }, [
    config,
    settings.experimental.voice,
    setQueueErrorMessage,
    stopVoiceRecording,
  ]);

  // Tear down timers, the recorder, and the transcription service on unmount.
  useEffect(
    () => () => {
      if (voiceTimeoutRef.current) clearTimeout(voiceTimeoutRef.current);
      if (recorderRef.current) {
        recorderRef.current.stop();
        recorderRef.current = null;
      }
      if (transcriptionServiceRef.current) {
        transcriptionServiceRef.current.disconnect();
        transcriptionServiceRef.current = null;
      }
      if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
      if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
    },
    [],
  );

  const handleVoiceInput = useCallback(
    (key: Key): boolean => {
      const activeRecording = isRecording || isRecordingRef.current;

      if (activeRecording) {
        const activationMode =
          settings.experimental.voice?.activationMode ?? 'push-to-talk';

        if (keyMatchers[Command.ESCAPE](key)) {
          stopVoiceRecording();
          return true;
        }

        if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
          if (activationMode === 'push-to-talk') {
            // While Space is held, the terminal delivers key-repeat events.
            // Each repeat pushes the stop deadline back; once repeats cease
            // (the key was released), the timer fires and recording stops.
            if (pttTimerRef.current) {
              clearTimeout(pttTimerRef.current);
            }
            pttTimerRef.current = setTimeout(() => {
              stopVoiceRecording();
              pttTimerRef.current = null;
            }, RELEASE_DELAY_MS);
            return true;
          } else {
            stopVoiceRecording();
            return true;
          }
        }
        // Swallow all other keys while recording.
        return true;
      }

      if (isVoiceModeEnabled) {
        const activationMode =
          settings.experimental.voice?.activationMode ?? 'push-to-talk';

        if (keyMatchers[Command.ESCAPE](key) && buffer.text === '') {
          setVoiceModeEnabled(false);
          return true;
        }

        if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
          if (
            key.name === 'space' &&
            !key.ctrl &&
            !key.alt &&
            !key.shift &&
            !key.cmd
          ) {
            if (activationMode === 'toggle') {
              startVoiceRecording();
              return true;
            } else {
              if (pttStateRef.current === 'idle') {
                // First Space press: type the space optimistically and wait
                // HOLD_DELAY_MS to see whether key repeats follow (a hold).
                buffer.insert(' ');
                pttStateRef.current = 'possible-hold';

                if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
                pttTimerRef.current = setTimeout(() => {
                  pttStateRef.current = 'idle';
                  pttTimerRef.current = null;
                }, HOLD_DELAY_MS);
                return true;
              } else if (pttStateRef.current === 'possible-hold') {
                // A repeat arrived quickly: this is a hold, not a typed
                // space. Undo the optimistic insert and start recording.
                if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
                buffer.backspace();
                pttStateRef.current = 'recording';
                startVoiceRecording();

                pttTimerRef.current = setTimeout(() => {
                  stopVoiceRecording();
                  pttTimerRef.current = null;
                }, RELEASE_DELAY_MS);
                return true;
              }
            }
          }
        }

        // Any other key cancels a pending hold detection.
        if (pttStateRef.current === 'possible-hold') {
          pttStateRef.current = 'idle';
          if (pttTimerRef.current) {
            clearTimeout(pttTimerRef.current);
            pttTimerRef.current = null;
          }
        }
      }

      return false;
    },
    [
      isRecording,
      isVoiceModeEnabled,
      settings.experimental.voice,
      keyMatchers,
      stopVoiceRecording,
      startVoiceRecording,
      buffer,
      setVoiceModeEnabled,
    ],
  );

  return {
    isRecording,
    isConnecting,
    startVoiceRecording,
    stopVoiceRecording,
    handleVoiceInput,
    resetTurnBaseline: () => {
      turnBaselineRef.current = null;
    },
  };
}
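// Illustrative sketch, not part of this commit: `handleVoiceInput` follows a
// "return true if the key was consumed" contract, so it composes with other
// key handlers. `KeyHandler` and `dispatchKey` are hypothetical names used
// only for this example.
type KeyHandler = (key: Key) => boolean;

function dispatchKey(key: Key, handlers: readonly KeyHandler[]): boolean {
  for (const handler of handlers) {
    // e.g. handlers = [handleVoiceInput, shellInputHandler, ...]
    if (handler(key)) return true; // consumed; stop propagation
  }
  return false; // fall through to default text-buffer input
}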
@@ -0,0 +1,31 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { useState, useCallback } from 'react';

interface UseVoiceModelCommandReturn {
  isVoiceModelDialogOpen: boolean;
  openVoiceModelDialog: () => void;
  closeVoiceModelDialog: () => void;
}

export const useVoiceModelCommand = (): UseVoiceModelCommandReturn => {
  const [isVoiceModelDialogOpen, setIsVoiceModelDialogOpen] = useState(false);

  const openVoiceModelDialog = useCallback(() => {
    setIsVoiceModelDialogOpen(true);
  }, []);

  const closeVoiceModelDialog = useCallback(() => {
    setIsVoiceModelDialogOpen(false);
  }, []);

  return {
    isVoiceModelDialogOpen,
    openVoiceModelDialog,
    closeVoiceModelDialog,
  };
};
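// Illustrative usage, not part of this commit: a component can gate a model
// picker on this hook's state. `VoiceModelDialog` is a hypothetical component
// name; in practice `openVoiceModelDialog` would be wired to the
// `/voice model` slash command.
const VoiceModelGate = () => {
  const { isVoiceModelDialogOpen, closeVoiceModelDialog } =
    useVoiceModelCommand();

  if (!isVoiceModelDialogOpen) return null;
  return <VoiceModelDialog onClose={closeVoiceModelDialog} />;
};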
@@ -97,6 +97,7 @@ export enum Command {
  RESTART_APP = 'app.restart',
  SUSPEND_APP = 'app.suspend',
  SHOW_SHELL_INPUT_UNFOCUS_WARNING = 'app.showShellUnfocusWarning',
  VOICE_MODE_PTT = 'app.voiceModePTT',

  // Background Shell Controls
  BACKGROUND_SHELL_ESCAPE = 'background.escape',
@@ -407,9 +408,7 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
  [Command.RESTART_APP, [new KeyBinding('r'), new KeyBinding('shift+r')]],
  [Command.SUSPEND_APP, [new KeyBinding('ctrl+z')]],
  [Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING, [new KeyBinding('tab')]],
  [Command.DUMP_FRAME, [new KeyBinding('f8')]],
  [Command.START_RECORDING, [new KeyBinding('f6')]],
  [Command.STOP_RECORDING, [new KeyBinding('f7')]],
  [Command.VOICE_MODE_PTT, [new KeyBinding('space')]],

  // Background Shell Controls
  [Command.BACKGROUND_SHELL_ESCAPE, [new KeyBinding('escape')]],
@@ -424,6 +423,10 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
  // Extension Controls
  [Command.UPDATE_EXTENSION, [new KeyBinding('i')]],
  [Command.LINK_EXTENSION, [new KeyBinding('l')]],

  [Command.DUMP_FRAME, [new KeyBinding('f8')]],
  [Command.START_RECORDING, [new KeyBinding('f6')]],
  [Command.STOP_RECORDING, [new KeyBinding('f7')]],
]);

interface CommandCategory {
@@ -538,6 +541,7 @@ export const commandCategories: readonly CommandCategory[] = [
      Command.RESTART_APP,
      Command.SUSPEND_APP,
      Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING,
      Command.VOICE_MODE_PTT,
    ],
  },
  {
@@ -658,6 +662,7 @@ export const commandDescriptions: Readonly<Record<Command, string>> = {
  [Command.SUSPEND_APP]: 'Suspend the CLI and move it to the background.',
  [Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING]:
    'Show warning when trying to move focus away from shell input.',
  [Command.VOICE_MODE_PTT]: 'Hold to speak in Voice Mode.',

  // Background Shell Controls
  [Command.BACKGROUND_SHELL_ESCAPE]: 'Dismiss background shell list.',

@@ -43,5 +43,6 @@ export function createNonInteractiveUI(): CommandContext['ui'] {
    removeComponent: () => {},
    toggleBackgroundTasks: () => {},
    toggleShortcutsHelp: () => {},
    toggleVoiceMode: () => {},
  };
}

@@ -56,6 +56,7 @@
    "ajv-formats": "^3.0.0",
    "chardet": "^2.1.0",
    "chokidar": "^5.0.0",
    "command-exists": "^1.2.9",
    "diff": "^8.0.3",
    "dotenv": "^17.2.4",
    "dotenv-expand": "^12.0.3",

@@ -691,6 +691,7 @@ export interface ConfigParameters {
  ptyInfo?: string;
  disableYoloMode?: boolean;
  disableAlwaysAllow?: boolean;
  voiceMode?: boolean;
  rawOutput?: boolean;
  acceptRawOutputRisk?: boolean;
  dynamicModelConfiguration?: boolean;
@@ -963,6 +964,7 @@ export class Config implements McpContext, AgentLoopContext {
  private readonly topicUpdateNarration: boolean;
  private readonly disableLLMCorrection: boolean;
  private readonly planEnabled: boolean;
  private readonly voiceMode: boolean;
  private readonly trackerEnabled: boolean;
  private readonly planModeRoutingEnabled: boolean;
  private readonly modelSteering: boolean;
@@ -1117,6 +1119,7 @@ export class Config implements McpContext, AgentLoopContext {
    this.agents = params.agents ?? {};
    this.disableLLMCorrection = params.disableLLMCorrection ?? true;
    this.planEnabled = params.plan ?? true;
    this.voiceMode = params.voiceMode ?? false;
    this.trackerEnabled = params.tracker ?? false;
    this.planModeRoutingEnabled = params.planSettings?.modelRouting ?? true;
    this.enableEventDrivenScheduler = params.enableEventDrivenScheduler ?? true;
@@ -2969,6 +2972,10 @@ export class Config implements McpContext, AgentLoopContext {
    return this.planEnabled;
  }

  isVoiceModeEnabled(): boolean {
    return this.voiceMode;
  }

  isTrackerEnabled(): boolean {
    return this.trackerEnabled;
  }

@@ -297,3 +297,12 @@ export * from './context/profiles.js';

// Export trust utility
export * from './utils/trust.js';

// Export voice utilities
export * from './voice/audioRecorder.js';
export * from './voice/transcriptionProvider.js';
export * from './voice/geminiLiveTranscriptionProvider.js';
export * from './voice/whisperTranscriptionProvider.js';
export * from './voice/transcriptionFactory.js';
export * from './voice/whisperModelManager.js';
export { isBinaryAvailable } from './utils/binaryCheck.js';

@@ -0,0 +1,14 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { sync as commandExistsSync } from 'command-exists';

/**
 * Checks if a binary is available in the system PATH.
 */
export function isBinaryAvailable(binaryName: string): boolean {
  return commandExistsSync(binaryName);
}
@@ -0,0 +1,115 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import { EventEmitter } from 'node:events';
import commandExists from 'command-exists';

export interface AudioRecorderEvents {
  data: [Buffer];
  start: [];
  stop: [];
  error: [Error];
}

/**
 * Captures audio from the microphone using `sox` (`rec`).
 * Emits 16kHz, 16-bit, mono PCM chunks.
 */
export class AudioRecorder extends EventEmitter<AudioRecorderEvents> {
  private recProcess: ChildProcessWithoutNullStreams | null = null;
  private isRecordingInternal = false;

  get isRecording(): boolean {
    return this.isRecordingInternal;
  }

  /**
   * Checks if `rec` (sox) is available on the system.
   */
  static async isAvailable(): Promise<boolean> {
    try {
      await commandExists('rec');
      return true;
    } catch {
      return false;
    }
  }

  async start(): Promise<void> {
    if (this.isRecordingInternal) return;
    this.isRecordingInternal = true;

    try {
      const available = await AudioRecorder.isAvailable();
      if (!this.isRecordingInternal) return; // Stopped while checking availability.

      if (!available) {
        throw new Error(
          'The `rec` command (provided by SoX) is required for voice mode. Please install SoX (e.g., `brew install sox` on macOS or `sudo apt install sox libsox-fmt-all` on Linux).',
        );
      }

      // rec -q -V0 -e signed -c 1 -b 16 -r 16000 -t raw -
      this.recProcess = spawn('rec', [
        '-q',
        '-V0',
        '-e',
        'signed',
        '-c',
        '1',
        '-b',
        '16',
        '-r',
        '16000',
        '-t',
        'raw',
        '-',
      ]);

      if (!this.isRecordingInternal) {
        this.recProcess.kill('SIGTERM');
        this.recProcess = null;
        return;
      }

      this.recProcess.stdout.on('data', (data: Buffer) => {
        this.emit('data', data);
      });

      this.recProcess.stderr.on('data', (_data: Buffer) => {
        // rec may print warnings to stderr; ignore them for now.
        // console.warn(`rec stderr: ${data.toString()}`);
      });

      this.recProcess.on('error', (err) => {
        this.emit('error', err);
        this.stop();
      });

      this.recProcess.on('close', () => {
        this.stop();
      });

      this.emit('start');
    } catch (err) {
      this.isRecordingInternal = false;
      throw err;
    }
  }

  stop(): void {
    if (!this.isRecordingInternal) return;
    this.isRecordingInternal = false;

    if (this.recProcess) {
      this.recProcess.kill('SIGTERM');
      this.recProcess = null;
    }

    this.emit('stop');
  }
}
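// Illustrative usage, not part of this commit: pipe recorder chunks into any
// transcription backend. `TranscriptionProvider` comes from
// './transcriptionProvider.js' (import assumed); error handling is minimal
// by design.
export async function recordInto(
  provider: TranscriptionProvider,
  onError: (err: Error) => void,
): Promise<AudioRecorder> {
  const recorder = new AudioRecorder();
  recorder.on('data', (chunk) => provider.sendAudioChunk(chunk));
  recorder.on('error', onError);
  await recorder.start(); // throws if `rec` (SoX) is not installed
  return recorder; // call recorder.stop() to end the capture
}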
@@ -0,0 +1,178 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import WebSocket from 'ws';
import { EventEmitter, once } from 'node:events';
import { debugLogger } from '../utils/debugLogger.js';
import type {
  TranscriptionProvider,
  TranscriptionEvents,
} from './transcriptionProvider.js';

import { z } from 'zod';

const LiveAPIResponseSchema = z.object({
  setupComplete: z.record(z.unknown()).optional(),
  serverContent: z
    .object({
      turnComplete: z.boolean().optional(),
      inputTranscription: z
        .object({
          text: z.string().optional(),
        })
        .optional(),
      outputTranscription: z
        .object({
          text: z.string().optional(),
        })
        .optional(),
      modelTurn: z
        .object({
          parts: z
            .array(
              z.object({
                text: z.string().optional(),
                inlineData: z
                  .object({
                    data: z.string(),
                  })
                  .optional(),
              }),
            )
            .optional(),
        })
        .optional(),
    })
    .optional(),
});

/**
 * Connects to the Gemini Live API using raw WebSockets to support API Key authentication.
 */
export class GeminiLiveTranscriptionProvider
  extends EventEmitter<TranscriptionEvents>
  implements TranscriptionProvider
{
  private ws: WebSocket | null = null;
  private currentTranscription = '';

  constructor(private readonly apiKey: string) {
    super();
  }

  async connect(): Promise<void> {
    const modelName = 'gemini-3.1-flash-live-preview';
    const baseUrl =
      'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';

    if (!this.apiKey) {
      throw new Error('No API key provided');
    }

    // NOTE: The Generative Language WebSocket API requires the API key to be passed via the 'key' query parameter.
    const url = `${baseUrl}?key=${this.apiKey}`;
    debugLogger.debug(
      `[GeminiLiveTranscription] Connecting to model ${modelName} via raw WebSocket with API Key...`,
    );

    try {
      this.ws = new WebSocket(url, {
        maxPayload: 1 << 20, // 1MB limit for safety
      });

      this.ws.on('message', (data) => {
        try {
          const parsedData: unknown = JSON.parse(data.toString());
          const result = LiveAPIResponseSchema.safeParse(parsedData);

          if (result.success) {
            const response = result.data;
            if (response.serverContent) {
              const content = response.serverContent;

              if (content.turnComplete) {
                this.emit('turnComplete');
              }

              if (content.inputTranscription?.text) {
                const text = content.inputTranscription.text;
                debugLogger.debug(
                  `[GeminiLiveTranscription] Transcription received (Cloud): "${text}"`,
                );
                this.currentTranscription = text;
                this.emit('transcription', this.currentTranscription);
              }
            }
          }
        } catch (e) {
          debugLogger.error(
            '[GeminiLiveTranscription] Error parsing message:',
            e,
          );
        }
      });

      this.ws.on('error', (error) => {
        debugLogger.error('[GeminiLiveTranscription] WebSocket Error:', error);
        this.emit('error', error);
      });

      this.ws.on('close', (code, reason) => {
        debugLogger.debug(
          `[GeminiLiveTranscription] Connection Closed. Code: ${code}, Reason: ${reason}`,
        );
        this.emit('close');
        this.ws = null;
      });

      await once(this.ws, 'open');

      const setupMessage = {
        setup: {
          model: `models/${modelName}`,
          generation_config: {
            response_modalities: ['audio'],
          },
          input_audio_transcription: {},
        },
      };

      this.ws.send(JSON.stringify(setupMessage));
      this.currentTranscription = '';
    } catch (err) {
      debugLogger.error(
        '[GeminiLiveTranscription] Failed to establish connection:',
        err,
      );
      throw err;
    }
  }

  sendAudioChunk(chunk: Buffer): void {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;

    const audioMessage = {
      realtime_input: {
        audio: {
          data: chunk.toString('base64'),
          mime_type: 'audio/pcm;rate=16000',
        },
      },
    };
    this.ws.send(JSON.stringify(audioMessage));
  }

  getTranscription(): string {
    return this.currentTranscription;
  }

  disconnect(): void {
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
  }
}
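// Illustrative usage, not part of this commit: transcribe a buffered clip via
// the cloud backend. The one-second flush delay is an assumption that mirrors
// the `stopGracePeriodMs` default from the settings schema.
export async function transcribeClip(
  apiKey: string,
  pcmChunks: Buffer[], // 16kHz, 16-bit, mono PCM
): Promise<string> {
  const provider = new GeminiLiveTranscriptionProvider(apiKey);
  await provider.connect();
  for (const chunk of pcmChunks) {
    provider.sendAudioChunk(chunk);
  }
  // Give the server a moment to emit the final inputTranscription event.
  await new Promise((resolve) => setTimeout(resolve, 1000));
  const text = provider.getTranscription();
  provider.disconnect();
  return text;
}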
@@ -0,0 +1,41 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import * as path from 'node:path';
import * as fs from 'node:fs';
import { homedir, GEMINI_DIR } from '../utils/paths.js';
import { GeminiLiveTranscriptionProvider } from './geminiLiveTranscriptionProvider.js';
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
import type { TranscriptionProvider } from './transcriptionProvider.js';

export class TranscriptionFactory {
  static createProvider(
    voiceConfig: { backend?: string; whisperModel?: string } | undefined,
    apiKey: string,
  ): TranscriptionProvider {
    const backend = voiceConfig?.backend ?? 'gemini-live';

    if (backend === 'whisper') {
      const modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
      if (!fs.existsSync(modelsDir)) {
        fs.mkdirSync(modelsDir, { recursive: true });
      }

      const modelName = voiceConfig?.whisperModel ?? 'ggml-base.en.bin';
      const modelPath = path.join(modelsDir, modelName);

      return new WhisperTranscriptionProvider({
        modelPath,
        threads: 4,
        step: 0,
        length: 5000,
      });
    }

    // Default to Gemini Live
    return new GeminiLiveTranscriptionProvider(apiKey);
  }
}
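// Illustrative usage, not part of this commit: resolve a provider from the
// user's voice settings. The settings shape matches the factory's parameter;
// where the API key comes from in the real CLI is not shown by this diff.
const provider = TranscriptionFactory.createProvider(
  { backend: 'whisper', whisperModel: 'ggml-base.en.bin' },
  process.env['GEMINI_API_KEY'] ?? '', // unused by the whisper backend
);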
@@ -0,0 +1,33 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import type { EventEmitter } from 'node:events';

export interface TranscriptionEvents {
  /** Emitted when partial or full transcription text is available. */
  transcription: [string];
  /** Emitted when a speaking turn is considered complete. */
  turnComplete: [];
  /** Emitted when an error occurs during transcription. */
  error: [Error];
  /** Emitted when the transcription service connection is closed. */
  close: [];
}

/**
 * Common interface for all transcription backends (Cloud or Local).
 */
export interface TranscriptionProvider
  extends EventEmitter<TranscriptionEvents> {
  /** Establish connection to the transcription service. */
  connect(): Promise<void>;
  /** Send a chunk of raw audio data to the service. */
  sendAudioChunk(chunk: Buffer): void;
  /** Disconnect from the transcription service. */
  disconnect(): void;
  /** Get the current full transcription for the session. */
  getTranscription(): string;
}
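// Illustrative sketch, not part of this commit: a minimal in-memory fake that
// satisfies this interface, handy for unit-testing UI plumbing without a
// microphone or network. Entirely hypothetical test scaffolding.
import { EventEmitter as NodeEventEmitter } from 'node:events';

export class FakeTranscriptionProvider
  extends NodeEventEmitter<TranscriptionEvents>
  implements TranscriptionProvider
{
  private text = '';

  async connect(): Promise<void> {}

  sendAudioChunk(_chunk: Buffer): void {
    // Pretend every chunk transcribes to one more word.
    this.text = this.text ? `${this.text} test` : 'test';
    this.emit('transcription', this.text);
  }

  getTranscription(): string {
    return this.text;
  }

  disconnect(): void {
    this.emit('close');
  }
}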
@@ -0,0 +1,107 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import * as path from 'node:path';
import * as fs from 'node:fs';
import { EventEmitter } from 'node:events';
import { homedir, GEMINI_DIR } from '../utils/paths.js';
import { debugLogger } from '../utils/debugLogger.js';

export interface WhisperModelProgress {
  modelName: string;
  transferred: number;
  total: number;
  /** Download progress as a fraction in the range [0, 1]. */
  percentage: number;
}

export interface WhisperModelManagerEvents {
  progress: [WhisperModelProgress];
}

const ALLOWED_MODELS = [
  'ggml-tiny.en.bin',
  'ggml-base.en.bin',
  'ggml-large-v3-turbo-q5_0.bin',
  'ggml-large-v3-turbo-q8_0.bin',
];

/**
 * Manages Whisper models (checking existence, downloading).
 */
export class WhisperModelManager extends EventEmitter<WhisperModelManagerEvents> {
  private readonly modelsDir: string;

  constructor() {
    super();
    this.modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
  }

  isModelInstalled(modelName: string): boolean {
    this.validateModelName(modelName);
    return fs.existsSync(path.join(this.modelsDir, modelName));
  }

  getModelPath(modelName: string): string {
    this.validateModelName(modelName);
    return path.join(this.modelsDir, modelName);
  }

  async downloadModel(modelName: string): Promise<void> {
    this.validateModelName(modelName);

    if (!fs.existsSync(this.modelsDir)) {
      fs.mkdirSync(this.modelsDir, { recursive: true });
    }

    const destination = path.join(this.modelsDir, modelName);
    const url = `https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${modelName}`;

    debugLogger.debug(
      `[WhisperModelManager] Downloading ${modelName} from ${url}`,
    );

    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to download model: ${response.statusText}`);
    }

    const total = parseInt(response.headers.get('content-length') || '0', 10);
    let transferred = 0;

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('Response body is not readable');
    }

    const writer = fs.createWriteStream(destination);

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        transferred += value.length;
        writer.write(value);

        const percentage = total > 0 ? transferred / total : 0;
        this.emit('progress', {
          modelName,
          transferred,
          total,
          percentage,
        });
      }
    } finally {
      writer.end();
    }
  }

  private validateModelName(modelName: string): void {
    if (!ALLOWED_MODELS.includes(modelName)) {
      throw new Error(`Unauthorized model name: ${modelName}`);
    }
  }
}
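// Illustrative usage, not part of this commit: make sure a model is present
// before starting local transcription, printing download progress.
// `ensureModel` is a hypothetical helper name.
export async function ensureModel(modelName: string): Promise<string> {
  const manager = new WhisperModelManager();
  if (!manager.isModelInstalled(modelName)) {
    manager.on('progress', ({ percentage }) => {
      // `percentage` is a fraction in [0, 1]; scale it for display.
      process.stderr.write(
        `\rDownloading ${modelName}: ${Math.round(percentage * 100)}%`,
      );
    });
    await manager.downloadModel(modelName);
    process.stderr.write('\n');
  }
  return manager.getModelPath(modelName);
}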
@@ -0,0 +1,31 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, it, expect, vi, beforeEach } from 'vitest';
import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
import commandExists from 'command-exists';

vi.mock('command-exists', () => ({
  default: vi.fn(),
}));

describe('WhisperTranscriptionProvider', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('should throw a friendly error if whisper-stream is not available', async () => {
    vi.mocked(commandExists).mockRejectedValue(new Error('not found'));

    const provider = new WhisperTranscriptionProvider({
      modelPath: 'test-model.bin',
    });

    await expect(provider.connect()).rejects.toThrow(
      'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
    );
  });
});
@@ -0,0 +1,199 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import { EventEmitter } from 'node:events';
import commandExists from 'command-exists';
import { debugLogger } from '../utils/debugLogger.js';
import type {
  TranscriptionProvider,
  TranscriptionEvents,
} from './transcriptionProvider.js';

export interface WhisperProviderOptions {
  modelPath: string;
  threads?: number;
  step?: number;
  length?: number;
}

/**
 * Local transcription provider using `whisper-stream` from whisper.cpp.
 *
 * Uses the Sliding Window Mode with VAD (--step 0) for stable,
 * non-overlapping transcription blocks that can be appended directly.
 */
export class WhisperTranscriptionProvider
  extends EventEmitter<TranscriptionEvents>
  implements TranscriptionProvider
{
  private process: ChildProcessWithoutNullStreams | null = null;
  private currentTranscription = '';

  constructor(private readonly options: WhisperProviderOptions) {
    super();
  }

  /**
   * Checks if `whisper-stream` is available on the system.
   */
  static async isAvailable(): Promise<boolean> {
    try {
      await commandExists('whisper-stream');
      return true;
    } catch {
      return false;
    }
  }

  async connect(): Promise<void> {
    const { modelPath, threads = 4, step = 0, length = 5000 } = this.options;

    this.currentTranscription = '';

    const available = await WhisperTranscriptionProvider.isAvailable();
    if (!available) {
      return Promise.reject(
        new Error(
          'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
        ),
      );
    }

    debugLogger.debug(
      `[WhisperTranscription] Starting whisper-stream with model: ${modelPath} (VAD mode: step=${step}, length=${length})`,
    );

    return new Promise((resolve, reject) => {
      let isResolved = false;

      try {
        // whisper-stream -m <model_path> -t <threads> --step 0 --length <length> -vth 0.6
        // Setting step == 0 enables sliding window mode with VAD, which outputs
        // non-overlapping transcription blocks suitable for appending.
        this.process = spawn('whisper-stream', [
          '-m',
          modelPath,
          '-t',
          threads.toString(),
          '--step',
          step.toString(),
          '--length',
          length.toString(),
          '-vth',
          '0.6',
        ]);

        this.process.stdout.on('data', (data: Buffer) => {
          const output = data.toString();
          this.parseOutput(output);
        });

        this.process.stderr.on('data', (data: Buffer) => {
          const msg = data.toString();
          if (msg.includes('error')) {
            debugLogger.error(`[WhisperTranscription] stderr: ${msg}`);
            if (!isResolved) {
              isResolved = true;
              reject(new Error(msg));
            }
          }

          // whisper-stream prints "whisper_init_from_file_with_params_no_state: loading model from..."
          // and finally "main: processing, press Ctrl+C to stop" when ready.
          if (!isResolved && msg.includes('main: processing')) {
            debugLogger.debug('[WhisperTranscription] whisper-stream is ready');
            isResolved = true;
            resolve();
          }
        });

        this.process.on('error', (err) => {
          debugLogger.error('[WhisperTranscription] Process error:', err);
          this.emit('error', err);
          if (!isResolved) {
            isResolved = true;
            reject(err);
          }
        });

        this.process.on('close', (code) => {
          debugLogger.debug(
            `[WhisperTranscription] Process closed with code ${code}`,
          );
          this.emit('close');
          this.process = null;
        });

        // Fallback timeout in case "main: processing" is never seen
        setTimeout(() => {
          if (!isResolved) {
            debugLogger.warn(
              '[WhisperTranscription] Connection timeout (fallback resolve)',
            );
            isResolved = true;
            resolve();
          }
        }, 10000);
      } catch (err) {
        debugLogger.error(
          '[WhisperTranscription] Failed to spawn process:',
          err,
        );
        if (!isResolved) {
          isResolved = true;
          reject(err);
        }
      }
    });
  }

  private parseOutput(output: string): void {
    // whisper-stream output format: "[00:00:00.000 --> 00:00:02.000] Hello world."
    const lines = output.split('\n');

    for (const line of lines) {
      const match = line.match(/\[.* --> .*\]\s+(.*)/);
      if (match && match[1]) {
        let text = match[1].trim();

        // Filter out [Silence], [music], (laughter), etc.
        text = text
          .replace(/\[[^\]]*\]/g, '')
          .replace(/\([^)]*\)/g, '')
          .trim();

        if (text) {
          // In VAD mode (step=0), each line is a completed speech block.
          // Append it to the running transcription so earlier blocks are
          // not lost when a new block arrives.
          this.currentTranscription = this.currentTranscription
            ? `${this.currentTranscription} ${text}`
            : text;

          debugLogger.debug(
            `[WhisperTranscription] Transcription updated (Local-VAD): "${this.currentTranscription}"`,
          );
          this.emit('transcription', this.currentTranscription);
        }
      }
    }
  }

  sendAudioChunk(_chunk: Buffer): void {
    // whisper-stream handles its own audio capture.
  }

  getTranscription(): string {
    return this.currentTranscription;
  }

  disconnect(): void {
    if (this.process) {
      this.process.kill('SIGTERM');
      this.process = null;
    }
  }
}
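// Illustrative usage, not part of this commit: run the local backend end to
// end. Unlike the cloud provider, whisper-stream captures microphone audio
// itself, so there is no recorder to wire up and sendAudioChunk is a no-op.
// `startLocalDictation` is a hypothetical helper name.
export async function startLocalDictation(
  modelPath: string,
  onText: (text: string) => void,
): Promise<WhisperTranscriptionProvider> {
  const provider = new WhisperTranscriptionProvider({ modelPath });
  provider.on('transcription', onText);
  await provider.connect(); // resolves once whisper-stream prints "main: processing"
  return provider; // call provider.disconnect() to stop
}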