mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-04 00:44:05 -07:00
feat(voice): implement real-time voice mode with cloud and local backends (#24174)
This commit is contained in:
@@ -103,6 +103,7 @@ import { useQuotaAndFallback } from './hooks/useQuotaAndFallback.js';
|
||||
import { useEditorSettings } from './hooks/useEditorSettings.js';
|
||||
import { useSettingsCommand } from './hooks/useSettingsCommand.js';
|
||||
import { useModelCommand } from './hooks/useModelCommand.js';
|
||||
import { useVoiceModelCommand } from './hooks/useVoiceModelCommand.js';
|
||||
import { useSlashCommandProcessor } from './hooks/slashCommandProcessor.js';
|
||||
import { useVimMode } from './contexts/VimModeContext.js';
|
||||
import {
|
||||
@@ -312,6 +313,7 @@ export const AppContainer = (props: AppContainerProps) => {
|
||||
);
|
||||
|
||||
const [shellModeActive, setShellModeActive] = useState(false);
|
||||
const [isVoiceModeEnabled, setVoiceModeEnabled] = useState(false);
|
||||
const [modelSwitchedFromQuotaError, setModelSwitchedFromQuotaError] =
|
||||
useState<boolean>(false);
|
||||
const [historyRemountKey, setHistoryRemountKey] = useState(0);
|
||||
@@ -946,6 +948,12 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
const { isModelDialogOpen, openModelDialog, closeModelDialog } =
|
||||
useModelCommand();
|
||||
|
||||
const {
|
||||
isVoiceModelDialogOpen,
|
||||
openVoiceModelDialog,
|
||||
closeVoiceModelDialog,
|
||||
} = useVoiceModelCommand();
|
||||
|
||||
const { toggleVimEnabled } = useVimMode();
|
||||
|
||||
const setIsBackgroundTaskListOpenRef = useRef<(open: boolean) => void>(
|
||||
@@ -969,6 +977,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
openSettingsDialog,
|
||||
openSessionBrowser,
|
||||
openModelDialog,
|
||||
openVoiceModelDialog,
|
||||
openAgentConfigDialog,
|
||||
openPermissionsDialog,
|
||||
quit: (messages: HistoryItem[]) => {
|
||||
@@ -981,6 +990,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
},
|
||||
setDebugMessage,
|
||||
toggleCorgiMode: () => setCorgiMode((prev) => !prev),
|
||||
toggleVoiceMode: () => setVoiceModeEnabled((prev) => !prev),
|
||||
toggleDebugProfiler,
|
||||
dispatchExtensionStateUpdate,
|
||||
addConfirmUpdateExtensionRequest,
|
||||
@@ -1006,6 +1016,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
openSettingsDialog,
|
||||
openSessionBrowser,
|
||||
openModelDialog,
|
||||
openVoiceModelDialog,
|
||||
openAgentConfigDialog,
|
||||
setQuittingMessages,
|
||||
setDebugMessage,
|
||||
@@ -2191,6 +2202,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
isThemeDialogOpen ||
|
||||
isSettingsDialogOpen ||
|
||||
isModelDialogOpen ||
|
||||
isVoiceModelDialogOpen ||
|
||||
isAgentConfigDialogOpen ||
|
||||
isPermissionsDialogOpen ||
|
||||
isAuthenticating ||
|
||||
@@ -2448,6 +2460,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
isSettingsDialogOpen,
|
||||
isSessionBrowserOpen,
|
||||
isModelDialogOpen,
|
||||
isVoiceModelDialogOpen,
|
||||
isAgentConfigDialogOpen,
|
||||
selectedAgentName,
|
||||
selectedAgentDisplayName,
|
||||
@@ -2468,6 +2481,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
pendingGeminiHistoryItems,
|
||||
thought,
|
||||
isInputActive,
|
||||
isVoiceModeEnabled,
|
||||
isResuming,
|
||||
shouldShowIdePrompt,
|
||||
isFolderTrustDialogOpen: isFolderTrustDialogOpen ?? false,
|
||||
@@ -2559,6 +2573,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
isSettingsDialogOpen,
|
||||
isSessionBrowserOpen,
|
||||
isModelDialogOpen,
|
||||
isVoiceModelDialogOpen,
|
||||
isAgentConfigDialogOpen,
|
||||
selectedAgentName,
|
||||
selectedAgentDisplayName,
|
||||
@@ -2579,6 +2594,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
pendingGeminiHistoryItems,
|
||||
thought,
|
||||
isInputActive,
|
||||
isVoiceModeEnabled,
|
||||
isResuming,
|
||||
shouldShowIdePrompt,
|
||||
isFolderTrustDialogOpen,
|
||||
@@ -2671,6 +2687,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
exitPrivacyNotice,
|
||||
closeSettingsDialog,
|
||||
closeModelDialog,
|
||||
openVoiceModelDialog,
|
||||
closeVoiceModelDialog,
|
||||
openAgentConfigDialog,
|
||||
closeAgentConfigDialog,
|
||||
openPermissionsDialog,
|
||||
@@ -2751,6 +2769,9 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
setAccountSuspensionInfo(null);
|
||||
setAuthState(AuthState.Updating);
|
||||
},
|
||||
setVoiceModeEnabled: (value: boolean) => {
|
||||
setVoiceModeEnabled(value);
|
||||
},
|
||||
}),
|
||||
[
|
||||
handleThemeSelect,
|
||||
@@ -2764,6 +2785,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
exitPrivacyNotice,
|
||||
closeSettingsDialog,
|
||||
closeModelDialog,
|
||||
openVoiceModelDialog,
|
||||
closeVoiceModelDialog,
|
||||
openAgentConfigDialog,
|
||||
closeAgentConfigDialog,
|
||||
openPermissionsDialog,
|
||||
@@ -2807,6 +2830,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
config,
|
||||
historyManager,
|
||||
getPreferredEditor,
|
||||
setVoiceModeEnabled,
|
||||
],
|
||||
);
|
||||
|
||||
|
||||
@@ -72,6 +72,7 @@ export interface CommandContext {
|
||||
loadHistory: (history: HistoryItem[], postLoadInput?: string) => void;
|
||||
/** Toggles a special display mode. */
|
||||
toggleCorgiMode: () => void;
|
||||
toggleVoiceMode: () => void;
|
||||
toggleDebugProfiler: () => void;
|
||||
toggleVimEnabled: () => Promise<boolean>;
|
||||
reloadCommands: () => void;
|
||||
@@ -125,6 +126,7 @@ export interface OpenDialogActionReturn {
|
||||
| 'settings'
|
||||
| 'sessionBrowser'
|
||||
| 'model'
|
||||
| 'voice-model'
|
||||
| 'agentConfig'
|
||||
| 'permissions';
|
||||
}
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { CommandKind, type SlashCommand } from './types.js';
|
||||
|
||||
export const voiceCommand: SlashCommand = {
|
||||
name: 'voice',
|
||||
altNames: [],
|
||||
description: 'Toggle voice dictation mode',
|
||||
kind: CommandKind.BUILT_IN,
|
||||
autoExecute: true,
|
||||
action: (context) => {
|
||||
context.ui.toggleVoiceMode();
|
||||
},
|
||||
subCommands: [
|
||||
{
|
||||
name: 'model',
|
||||
description: 'Manage voice transcription models',
|
||||
kind: CommandKind.BUILT_IN,
|
||||
autoExecute: true,
|
||||
action: async () => ({
|
||||
type: 'dialog',
|
||||
dialog: 'voice-model',
|
||||
}),
|
||||
},
|
||||
],
|
||||
};
|
||||
@@ -25,6 +25,7 @@ import { relaunchApp } from '../../utils/processUtils.js';
|
||||
import { SessionBrowser } from './SessionBrowser.js';
|
||||
import { PermissionsModifyTrustDialog } from './PermissionsModifyTrustDialog.js';
|
||||
import { ModelDialog } from './ModelDialog.js';
|
||||
import { VoiceModelDialog } from './VoiceModelDialog.js';
|
||||
import { theme } from '../semantic-colors.js';
|
||||
import { useUIState } from '../contexts/UIStateContext.js';
|
||||
import { useQuotaState } from '../contexts/QuotaContext.js';
|
||||
@@ -238,6 +239,9 @@ export const DialogManager = ({
|
||||
if (uiState.isModelDialogOpen) {
|
||||
return <ModelDialog onClose={uiActions.closeModelDialog} />;
|
||||
}
|
||||
if (uiState.isVoiceModelDialogOpen) {
|
||||
return <VoiceModelDialog onClose={uiActions.closeVoiceModelDialog} />;
|
||||
}
|
||||
if (
|
||||
uiState.isAgentConfigDialogOpen &&
|
||||
uiState.selectedAgentName &&
|
||||
|
||||
@@ -9,12 +9,41 @@ import { createMockSettings } from '../../test-utils/settings.js';
|
||||
import { makeFakeConfig } from '@google/gemini-cli-core';
|
||||
import { waitFor } from '../../test-utils/async.js';
|
||||
import { act, useState, useMemo } from 'react';
|
||||
import type { EventEmitter } from 'node:events';
|
||||
|
||||
const { fakeTranscriptionProvider } = vi.hoisted(() => {
|
||||
// Use require within hoisted block for immediate synchronous access
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports, no-restricted-syntax
|
||||
const { EventEmitter } = require('node:events');
|
||||
class FakeTranscriptionProvider extends EventEmitter {
|
||||
connect = vi.fn().mockResolvedValue(undefined);
|
||||
disconnect = vi.fn();
|
||||
sendAudioChunk = vi.fn();
|
||||
getTranscription = vi.fn().mockReturnValue('');
|
||||
}
|
||||
return {
|
||||
fakeTranscriptionProvider: new FakeTranscriptionProvider(),
|
||||
};
|
||||
});
|
||||
|
||||
vi.mock('@google/gemini-cli-core', async (importOriginal) => {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const actual = (await importOriginal()) as any;
|
||||
return {
|
||||
...actual,
|
||||
TranscriptionFactory: {
|
||||
createProvider: vi.fn(() => fakeTranscriptionProvider),
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
import {
|
||||
InputPrompt,
|
||||
tryTogglePasteExpansion,
|
||||
type InputPromptProps,
|
||||
} from './InputPrompt.js';
|
||||
import { InputContext } from '../contexts/InputContext.js';
|
||||
import { type UIState } from '../contexts/UIStateContext.js';
|
||||
import {
|
||||
calculateTransformationsForLine,
|
||||
calculateTransformedLine,
|
||||
@@ -417,6 +446,7 @@ describe('InputPrompt', () => {
|
||||
getWorkspaceContext: () => ({
|
||||
getDirectories: () => ['/test/project/src'],
|
||||
}),
|
||||
getContentGeneratorConfig: () => ({ apiKey: 'test-api-key' }),
|
||||
} as unknown as Config,
|
||||
slashCommands: mockSlashCommands,
|
||||
commandContext: mockCommandContext,
|
||||
@@ -4925,6 +4955,383 @@ describe('InputPrompt', () => {
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
describe('Voice Mode', () => {
|
||||
beforeEach(() => {
|
||||
(
|
||||
fakeTranscriptionProvider as unknown as EventEmitter
|
||||
).removeAllListeners();
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
it('should start recording when space is pressed and voice mode is enabled (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('');
|
||||
});
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Initially not recording
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
expect(lastFrame()).toContain(
|
||||
'Voice mode: Space to start/stop recording',
|
||||
);
|
||||
|
||||
// Press space to start
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Now should show listening
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should toggle recording off when space is pressed again (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('');
|
||||
});
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Start recording
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
// Stop recording
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
expect(lastFrame()).toContain(
|
||||
'Voice mode: Space to start/stop recording',
|
||||
);
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should resume recording when space is pressed even if buffer is not empty (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('some existing text');
|
||||
});
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Should show voice mode hint even if buffer is not empty (new behavior)
|
||||
expect(lastFrame()).toContain(
|
||||
'Voice mode: Space to start/stop recording',
|
||||
);
|
||||
expect(lastFrame()).toContain('some existing text');
|
||||
|
||||
// Press space to start recording again
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should not start recording if voice mode is disabled (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('');
|
||||
});
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: false } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Press space
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Should NOT show listening, instead should call handleInput which handles space
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
expect(mockBuffer.handleInput).toHaveBeenCalled();
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should append transcription correctly across multiple turn updates (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('initial');
|
||||
});
|
||||
const { stdin, unmount } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Start recording
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Emit first transcription
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'transcription',
|
||||
'hello',
|
||||
);
|
||||
});
|
||||
await waitFor(() => {
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
|
||||
});
|
||||
|
||||
// Emit turnComplete (Gemini Live starts over after this)
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'turnComplete',
|
||||
);
|
||||
});
|
||||
|
||||
// Emit second part (Gemini Live sends new turn text starting from empty)
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'transcription',
|
||||
'world',
|
||||
);
|
||||
});
|
||||
await waitFor(() => {
|
||||
// Should have appended 'world' to the baseline 'initial hello'
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith(
|
||||
'initial hello world',
|
||||
'end',
|
||||
);
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should append transcription correctly when resuming voice mode (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('First turn.');
|
||||
});
|
||||
const { stdin, unmount } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Start recording (resumed)
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Emit transcription
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'transcription',
|
||||
'Second turn.',
|
||||
);
|
||||
});
|
||||
|
||||
await waitFor(() => {
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith(
|
||||
'First turn. Second turn.',
|
||||
'end',
|
||||
);
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
describe('push-to-talk', () => {
|
||||
beforeEach(() => {
|
||||
vi.useFakeTimers();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it('should insert a space on a single tap', async () => {
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'push-to-talk' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
expect(lastFrame()).toContain('Voice mode: Hold Space to record');
|
||||
|
||||
// Press space once
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Should insert space optimistically
|
||||
expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
|
||||
// Advance timer past HOLD_DELAY_MS
|
||||
await act(async () => {
|
||||
vi.advanceTimersByTime(700);
|
||||
});
|
||||
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should start recording on hold (simulated by repeat spaces)', async () => {
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'push-to-talk' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// First space
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
|
||||
|
||||
// Second space (repeat)
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
await waitFor(() => {
|
||||
// Should have backspaced the optimistic space
|
||||
expect(mockBuffer.backspace).toHaveBeenCalled();
|
||||
// Should show listening
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should stop recording when space heartbeat stops (release)', async () => {
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'push-to-talk' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Start hold
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Use a short interval in waitFor to prevent advancing fake timers past the 300ms RELEASE_DELAY_MS
|
||||
await waitFor(
|
||||
() => {
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
},
|
||||
{ interval: 10 },
|
||||
);
|
||||
|
||||
// Simulate heartbeat (held key) - send space first to reset timer, then advance
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
vi.advanceTimersByTime(100);
|
||||
});
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
|
||||
// Stop heartbeat (release)
|
||||
await act(async () => {
|
||||
vi.advanceTimersByTime(400); // Past RELEASE_DELAY_MS
|
||||
});
|
||||
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should cancel hold state if non-space key is pressed after first space', async () => {
|
||||
const { stdin, unmount } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'push-to-talk' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// First space
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Type 'a'
|
||||
await act(async () => {
|
||||
stdin.write('a');
|
||||
});
|
||||
|
||||
// Should NOT start recording on next space even if fast
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
expect(mockBuffer.insert).toHaveBeenCalledTimes(2); // Two spaces inserted
|
||||
expect(mockBuffer.handleInput).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ name: 'a' }),
|
||||
);
|
||||
unmount();
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
function clean(str: string | undefined): string {
|
||||
|
||||
@@ -56,6 +56,7 @@ import {
|
||||
debugLogger,
|
||||
type Config,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { useVoiceMode } from '../hooks/useVoiceMode.js';
|
||||
import {
|
||||
parseInputForHighlighting,
|
||||
parseSegmentsFromTokens,
|
||||
@@ -159,7 +160,6 @@ export function isLargePaste(text: string): boolean {
|
||||
}
|
||||
|
||||
const DOUBLE_TAB_CLEAN_UI_TOGGLE_WINDOW_MS = 350;
|
||||
|
||||
/**
|
||||
* Attempt to toggle expansion of a paste placeholder in the buffer.
|
||||
* Returns true if a toggle action was performed or hint was shown, false otherwise.
|
||||
@@ -238,6 +238,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
setEmbeddedShellFocused,
|
||||
setShortcutsHelpVisible,
|
||||
toggleCleanUiDetailsVisible,
|
||||
setVoiceModeEnabled,
|
||||
} = useUIActions();
|
||||
const {
|
||||
terminalWidth,
|
||||
@@ -246,6 +247,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
backgroundTasks,
|
||||
backgroundTaskHeight,
|
||||
shortcutsHelpVisible,
|
||||
isVoiceModeEnabled,
|
||||
} = useUIState();
|
||||
const [suppressCompletion, setSuppressCompletion] = useState(false);
|
||||
const { handlePress: registerPlainTabPress, resetCount: resetPlainTabPress } =
|
||||
@@ -263,6 +265,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
resetEscapeState();
|
||||
if (buffer.text.length > 0) {
|
||||
buffer.setText('');
|
||||
resetTurnBaseline();
|
||||
resetCompletionState();
|
||||
} else if (history.length > 0) {
|
||||
onSubmit('/rewind');
|
||||
@@ -281,6 +284,16 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
const hasUserNavigatedSuggestions = useRef(false);
|
||||
const listRef = useRef<ScrollableListRef<ScrollableItem>>(null);
|
||||
|
||||
const { isRecording, handleVoiceInput, resetTurnBaseline } = useVoiceMode({
|
||||
buffer,
|
||||
config,
|
||||
settings,
|
||||
setQueueErrorMessage,
|
||||
isVoiceModeEnabled,
|
||||
setVoiceModeEnabled,
|
||||
keyMatchers,
|
||||
});
|
||||
|
||||
const [reverseSearchActive, setReverseSearchActive] = useState(false);
|
||||
const [commandSearchActive, setCommandSearchActive] = useState(false);
|
||||
const [textBeforeReverseSearch, setTextBeforeReverseSearch] = useState('');
|
||||
@@ -387,6 +400,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
// Clear the buffer *before* calling onSubmit to prevent potential re-submission
|
||||
// if onSubmit triggers a re-render while the buffer still holds the old value.
|
||||
buffer.setText('');
|
||||
resetTurnBaseline();
|
||||
onSubmit(processedValue);
|
||||
resetCompletionState();
|
||||
resetReverseSearchCompletionState();
|
||||
@@ -398,6 +412,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
shellModeActive,
|
||||
shellHistory,
|
||||
resetReverseSearchCompletionState,
|
||||
resetTurnBaseline,
|
||||
],
|
||||
);
|
||||
|
||||
@@ -647,6 +662,8 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
|
||||
const handleInput = useCallback(
|
||||
(key: Key) => {
|
||||
if (handleVoiceInput(key)) return true;
|
||||
|
||||
// Determine if this keypress is a history navigation command
|
||||
const isHistoryUp =
|
||||
!shellModeActive &&
|
||||
@@ -873,9 +890,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
) {
|
||||
setShellModeActive(!shellModeActive);
|
||||
buffer.setText(''); // Clear the '!' from input
|
||||
resetTurnBaseline();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (keyMatchers[Command.ESCAPE](key)) {
|
||||
const cancelSearch = (
|
||||
setActive: (active: boolean) => void,
|
||||
@@ -1360,6 +1377,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
backgroundTaskHeight,
|
||||
streamingState,
|
||||
handleEscPress,
|
||||
resetTurnBaseline,
|
||||
registerPlainTabPress,
|
||||
resetPlainTabPress,
|
||||
toggleCleanUiDetailsVisible,
|
||||
@@ -1369,9 +1387,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
keyMatchers,
|
||||
isHelpDismissKey,
|
||||
settings,
|
||||
handleVoiceInput,
|
||||
],
|
||||
);
|
||||
|
||||
useKeypress(handleInput, {
|
||||
isActive: !isEmbeddedShellFocused && !copyModeEnabled,
|
||||
priority: true,
|
||||
@@ -1792,20 +1810,39 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
)}{' '}
|
||||
</Text>
|
||||
<Box flexGrow={1} flexDirection="column" ref={innerBoxRef}>
|
||||
{buffer.text.length === 0 && placeholder ? (
|
||||
showCursor ? (
|
||||
<Text
|
||||
terminalCursorFocus={showCursor}
|
||||
terminalCursorPosition={0}
|
||||
>
|
||||
{chalk.inverse(placeholder.slice(0, 1))}
|
||||
<Text color={theme.text.secondary}>
|
||||
{placeholder.slice(1)}
|
||||
</Text>
|
||||
{isRecording && (
|
||||
<Box flexDirection="row" marginBottom={0}>
|
||||
<Text color={theme.status.success}>🎙️ Listening...</Text>
|
||||
</Box>
|
||||
)}
|
||||
{isVoiceModeEnabled && !isRecording && (
|
||||
<Box flexDirection="row" marginBottom={0}>
|
||||
<Text color={theme.text.secondary}>
|
||||
> Voice mode:{' '}
|
||||
{(settings.experimental.voice?.activationMode ??
|
||||
'push-to-talk') === 'push-to-talk'
|
||||
? 'Hold Space to record'
|
||||
: 'Space to start/stop recording'}{' '}
|
||||
(Esc to exit)
|
||||
</Text>
|
||||
) : (
|
||||
<Text color={theme.text.secondary}>{placeholder}</Text>
|
||||
)
|
||||
</Box>
|
||||
)}
|
||||
{buffer.text.length === 0 && !isRecording ? (
|
||||
!isVoiceModeEnabled && placeholder ? (
|
||||
showCursor ? (
|
||||
<Text
|
||||
terminalCursorFocus={showCursor}
|
||||
terminalCursorPosition={0}
|
||||
>
|
||||
{chalk.inverse(placeholder.slice(0, 1))}
|
||||
<Text color={theme.text.secondary}>
|
||||
{placeholder.slice(1)}
|
||||
</Text>
|
||||
</Text>
|
||||
) : (
|
||||
<Text color={theme.text.secondary}>{placeholder}</Text>
|
||||
)
|
||||
) : null
|
||||
) : (
|
||||
<Box
|
||||
flexDirection="column"
|
||||
|
||||
@@ -0,0 +1,236 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import type React from 'react';
|
||||
import { useCallback, useMemo, useState } from 'react';
|
||||
import { Box, Text } from 'ink';
|
||||
import { theme } from '../semantic-colors.js';
|
||||
import { DescriptiveRadioButtonSelect } from './shared/DescriptiveRadioButtonSelect.js';
|
||||
import { useSettingsStore } from '../contexts/SettingsContext.js';
|
||||
import { SettingScope } from '../../config/settings.js';
|
||||
import { useKeypress, type Key } from '../hooks/useKeypress.js';
|
||||
import { isBinaryAvailable } from '@google/gemini-cli-core';
|
||||
import {
|
||||
WhisperModelManager,
|
||||
type WhisperModelProgress,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { CliSpinner } from './CliSpinner.js';
|
||||
|
||||
interface VoiceModelDialogProps {
|
||||
onClose: () => void;
|
||||
}
|
||||
|
||||
type DialogView = 'backend' | 'whisper-models';
|
||||
|
||||
const WHISPER_MODELS = [
|
||||
{
|
||||
value: 'ggml-tiny.en.bin',
|
||||
label: 'Tiny (EN)',
|
||||
description: 'Fastest, lower accuracy (~75MB)',
|
||||
},
|
||||
{
|
||||
value: 'ggml-base.en.bin',
|
||||
label: 'Base (EN)',
|
||||
description: 'Balanced speed and accuracy (~142MB)',
|
||||
},
|
||||
{
|
||||
value: 'ggml-large-v3-turbo-q5_0.bin',
|
||||
label: 'Large v3 Turbo (Q5_0)',
|
||||
description: 'High accuracy, quantized (~547MB)',
|
||||
},
|
||||
{
|
||||
value: 'ggml-large-v3-turbo-q8_0.bin',
|
||||
label: 'Large v3 Turbo (Q8_0)',
|
||||
description: 'Maximum accuracy, high memory (~834MB)',
|
||||
},
|
||||
];
|
||||
|
||||
export function VoiceModelDialog({
|
||||
onClose,
|
||||
}: VoiceModelDialogProps): React.JSX.Element {
|
||||
const { settings, setSetting } = useSettingsStore();
|
||||
const [view, setView] = useState<DialogView>('backend');
|
||||
const [downloadProgress, setDownloadProgress] =
|
||||
useState<WhisperModelProgress | null>(null);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
const whisperInstalled = useMemo(
|
||||
() => isBinaryAvailable('whisper-stream'),
|
||||
[],
|
||||
);
|
||||
const modelManager = useMemo(() => new WhisperModelManager(), []);
|
||||
|
||||
const currentBackend =
|
||||
settings.merged.experimental.voice?.backend ?? 'gemini-live';
|
||||
const currentWhisperModel =
|
||||
settings.merged.experimental.voice?.whisperModel ?? 'ggml-base.en.bin';
|
||||
|
||||
const handleKeypress = useCallback(
|
||||
(key: Key) => {
|
||||
if (key.name === 'escape') {
|
||||
if (view === 'whisper-models') {
|
||||
setView('backend');
|
||||
} else {
|
||||
onClose();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
},
|
||||
[view, onClose],
|
||||
);
|
||||
|
||||
useKeypress(handleKeypress, { isActive: true });
|
||||
|
||||
const handleBackendSelect = useCallback(
|
||||
(value: string) => {
|
||||
if (value === 'whisper') {
|
||||
setView('whisper-models');
|
||||
} else {
|
||||
setSetting(
|
||||
SettingScope.User,
|
||||
'experimental.voice.backend',
|
||||
'gemini-live',
|
||||
);
|
||||
onClose();
|
||||
}
|
||||
},
|
||||
[setSetting, onClose],
|
||||
);
|
||||
|
||||
const handleWhisperModelSelect = useCallback(
|
||||
async (modelName: string) => {
|
||||
if (modelManager.isModelInstalled(modelName)) {
|
||||
setSetting(SettingScope.User, 'experimental.voice.backend', 'whisper');
|
||||
setSetting(
|
||||
SettingScope.User,
|
||||
'experimental.voice.whisperModel',
|
||||
modelName,
|
||||
);
|
||||
onClose();
|
||||
} else {
|
||||
setError(null);
|
||||
const onProgress = (p: WhisperModelProgress) => setDownloadProgress(p);
|
||||
modelManager.on('progress', onProgress);
|
||||
|
||||
try {
|
||||
await modelManager.downloadModel(modelName);
|
||||
|
||||
setSetting(
|
||||
SettingScope.User,
|
||||
'experimental.voice.backend',
|
||||
'whisper',
|
||||
);
|
||||
setSetting(
|
||||
SettingScope.User,
|
||||
'experimental.voice.whisperModel',
|
||||
modelName,
|
||||
);
|
||||
onClose();
|
||||
} catch (err) {
|
||||
setError(
|
||||
`Failed to download: ${err instanceof Error ? err.message : String(err)}`,
|
||||
);
|
||||
} finally {
|
||||
modelManager.off('progress', onProgress);
|
||||
setDownloadProgress(null);
|
||||
}
|
||||
}
|
||||
},
|
||||
[modelManager, setSetting, onClose],
|
||||
);
|
||||
|
||||
const backendOptions = useMemo(
|
||||
() => [
|
||||
{
|
||||
value: 'gemini-live',
|
||||
title: 'Gemini Live API (Cloud)',
|
||||
description: 'Real-time cloud transcription via Gemini Live API.',
|
||||
key: 'gemini-live',
|
||||
},
|
||||
{
|
||||
value: 'whisper',
|
||||
title: 'Whisper (Local)',
|
||||
description: whisperInstalled
|
||||
? 'Local transcription using whisper.cpp.'
|
||||
: 'Local transcription (Requires: brew install whisper-cpp)',
|
||||
key: 'whisper',
|
||||
},
|
||||
],
|
||||
[whisperInstalled],
|
||||
);
|
||||
|
||||
// Selectable Whisper models, each labelled with its local install status.
// NOTE(review): memoized on the `modelManager` instance, which is presumably
// stable across renders — so the "(Installed)"/"(Download)" suffix will not
// refresh if install state changes while this view is open. A successful
// download closes the dialog, so this appears benign; confirm if that flow
// changes.
const whisperOptions = useMemo(
  () =>
    WHISPER_MODELS.map((m) => ({
      value: m.value,
      title: `${m.label}${modelManager.isModelInstalled(m.value) ? ' (Installed)' : ' (Download)'}`,
      description: m.description,
      key: m.value,
    })),
  [modelManager],
);
|
||||
|
||||
// Render: a bordered panel whose heading, body, and footer all switch on
// the current sub-view ('backend' vs 'whisper-models').
return (
  <Box
    borderStyle="round"
    borderColor={theme.border.default}
    flexDirection="column"
    padding={1}
    width="100%"
  >
    <Text bold>
      {view === 'backend'
        ? 'Select Voice Transcription Backend'
        : 'Select Whisper Model'}
    </Text>

    {/* Most recent download/selection error, if any. */}
    {error && (
      <Box marginTop={1}>
        <Text color={theme.status.error}>{error}</Text>
      </Box>
    )}

    {/* While a model download is in flight, replace the pickers with a
        spinner + percentage line; otherwise show the selector for the
        active view. */}
    {downloadProgress ? (
      <Box marginTop={1} flexDirection="column">
        <Box>
          <Text>Downloading {downloadProgress.modelName}... </Text>
          <CliSpinner />
          <Text> {Math.round(downloadProgress.percentage * 100)}%</Text>
        </Box>
      </Box>
    ) : (
      <Box marginTop={1}>
        {view === 'backend' ? (
          <DescriptiveRadioButtonSelect
            items={backendOptions}
            onSelect={handleBackendSelect}
            initialIndex={currentBackend === 'whisper' ? 1 : 0}
            showNumbers={true}
          />
        ) : (
          /* NOTE(review): findIndex yields -1 when currentWhisperModel
             matches no option — verify the select component treats -1 as
             "no initial selection". */
          <DescriptiveRadioButtonSelect
            items={whisperOptions}
            onSelect={handleWhisperModelSelect}
            initialIndex={whisperOptions.findIndex(
              (o) => o.value === currentWhisperModel,
            )}
            showNumbers={true}
          />
        )}
      </Box>
    )}

    <Box marginTop={1} flexDirection="column">
      <Text color={theme.text.secondary}>
        {view === 'whisper-models'
          ? '(Press Esc to go back)'
          : '(Press Esc to close)'}
      </Text>
    </Box>
  </Box>
);
}
|
||||
@@ -168,13 +168,6 @@ exports[`InputPrompt > mouse interaction > should toggle paste expansion on doub
|
||||
"
|
||||
`;
|
||||
|
||||
exports[`InputPrompt > mouse interaction > should toggle paste expansion on double-click 4`] = `
|
||||
"▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀
|
||||
> [Pasted Text: 10 lines]
|
||||
▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
|
||||
"
|
||||
`;
|
||||
|
||||
exports[`InputPrompt > multiline rendering > should correctly render multiline input including blank lines 1`] = `
|
||||
"────────────────────────────────────────────────────────────────────────────────────────────────────
|
||||
> hello
|
||||
|
||||
@@ -41,6 +41,8 @@ export interface UIActions {
|
||||
exitPrivacyNotice: () => void;
|
||||
closeSettingsDialog: () => void;
|
||||
closeModelDialog: () => void;
|
||||
openVoiceModelDialog: () => void;
|
||||
closeVoiceModelDialog: () => void;
|
||||
openAgentConfigDialog: (
|
||||
name: string,
|
||||
displayName: string,
|
||||
@@ -93,6 +95,7 @@ export interface UIActions {
|
||||
handleNewAgentsSelect: (choice: NewAgentsChoice) => Promise<void>;
|
||||
getPreferredEditor: () => EditorType | undefined;
|
||||
clearAccountSuspension: () => void;
|
||||
setVoiceModeEnabled: (value: boolean) => void;
|
||||
}
|
||||
|
||||
export const UIActionsContext = createContext<UIActions | null>(null);
|
||||
|
||||
@@ -112,6 +112,7 @@ export interface UIState {
|
||||
isSettingsDialogOpen: boolean;
|
||||
isSessionBrowserOpen: boolean;
|
||||
isModelDialogOpen: boolean;
|
||||
isVoiceModelDialogOpen: boolean;
|
||||
isAgentConfigDialogOpen: boolean;
|
||||
selectedAgentName?: string;
|
||||
selectedAgentDisplayName?: string;
|
||||
@@ -132,6 +133,7 @@ export interface UIState {
|
||||
pendingGeminiHistoryItems: HistoryItemWithoutId[];
|
||||
thought: ThoughtSummary | null;
|
||||
isInputActive: boolean;
|
||||
isVoiceModeEnabled: boolean;
|
||||
isResuming: boolean;
|
||||
shouldShowIdePrompt: boolean;
|
||||
isFolderTrustDialogOpen: boolean;
|
||||
|
||||
@@ -205,11 +205,13 @@ describe('useSlashCommandProcessor', () => {
|
||||
openSettingsDialog: vi.fn(),
|
||||
openSessionBrowser: vi.fn(),
|
||||
openModelDialog: mockOpenModelDialog,
|
||||
openVoiceModelDialog: vi.fn(),
|
||||
openAgentConfigDialog,
|
||||
openPermissionsDialog: vi.fn(),
|
||||
quit: mockSetQuittingMessages,
|
||||
setDebugMessage: vi.fn(),
|
||||
toggleCorgiMode: vi.fn(),
|
||||
toggleVoiceMode: vi.fn(),
|
||||
toggleDebugProfiler: vi.fn(),
|
||||
dispatchExtensionStateUpdate: vi.fn(),
|
||||
addConfirmUpdateExtensionRequest: vi.fn(),
|
||||
|
||||
@@ -72,6 +72,7 @@ interface SlashCommandProcessorActions {
|
||||
openSettingsDialog: () => void;
|
||||
openSessionBrowser: () => void;
|
||||
openModelDialog: () => void;
|
||||
openVoiceModelDialog: () => void;
|
||||
openAgentConfigDialog: (
|
||||
name: string,
|
||||
displayName: string,
|
||||
@@ -81,6 +82,7 @@ interface SlashCommandProcessorActions {
|
||||
quit: (messages: HistoryItem[]) => void;
|
||||
setDebugMessage: (message: string) => void;
|
||||
toggleCorgiMode: () => void;
|
||||
toggleVoiceMode: () => void;
|
||||
toggleDebugProfiler: () => void;
|
||||
dispatchExtensionStateUpdate: (action: ExtensionUpdateAction) => void;
|
||||
addConfirmUpdateExtensionRequest: (request: ConfirmationRequest) => void;
|
||||
@@ -232,6 +234,7 @@ export const useSlashCommandProcessor = (
|
||||
pendingItem,
|
||||
setPendingItem,
|
||||
toggleCorgiMode: actions.toggleCorgiMode,
|
||||
toggleVoiceMode: actions.toggleVoiceMode,
|
||||
toggleDebugProfiler: actions.toggleDebugProfiler,
|
||||
toggleVimEnabled,
|
||||
reloadCommands,
|
||||
@@ -503,6 +506,9 @@ export const useSlashCommandProcessor = (
|
||||
case 'model':
|
||||
actions.openModelDialog();
|
||||
return { type: 'handled' };
|
||||
case 'voice-model':
|
||||
actions.openVoiceModelDialog();
|
||||
return { type: 'handled' };
|
||||
case 'agentConfig': {
|
||||
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
|
||||
const props = result.props as Record<string, unknown>;
|
||||
|
||||
@@ -0,0 +1,429 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { useState, useRef, useCallback, useEffect } from 'react';
|
||||
import {
|
||||
AudioRecorder,
|
||||
TranscriptionFactory,
|
||||
debugLogger,
|
||||
type Config,
|
||||
type TranscriptionProvider,
|
||||
} from '@google/gemini-cli-core';
|
||||
import type { TextBuffer } from '../components/shared/text-buffer.js';
|
||||
import type { MergedSettings } from '../../config/settingsSchema.js';
|
||||
import type { Key } from './useKeypress.js';
|
||||
import { Command } from '../key/keyMatchers.js';
|
||||
|
||||
/** Inputs required by the `useVoiceMode` hook. */
interface UseVoiceModeProps {
  /** Input-prompt text buffer; transcribed text is written into it. */
  buffer: TextBuffer;
  /** CLI config, used to resolve the content-generator API key. */
  config: Config;
  /** Merged settings; `experimental.voice.*` selects backend and behavior. */
  settings: MergedSettings;
  /** Surfaces voice-mode error messages to the user (null clears). */
  setQueueErrorMessage: (message: string | null) => void;
  /** Whether voice mode is currently enabled in the UI. */
  isVoiceModeEnabled: boolean;
  /** Enables/disables voice mode. */
  setVoiceModeEnabled: (enabled: boolean) => void;
  /** Key-binding matchers used to recognize the PTT and escape keys. */
  keyMatchers: Record<Command, (key: Key) => boolean>;
}

// Window after an initial space press during which a second matching key
// event (e.g. a key repeat from holding the key) is interpreted as a hold
// that starts push-to-talk, rather than a typed space.
const HOLD_DELAY_MS = 600;

// While recording in push-to-talk mode, recording stops this long after the
// last matching key event — repeats from a held key keep it alive.
const RELEASE_DELAY_MS = 300;
|
||||
|
||||
/**
 * Real-time voice input for the prompt: manages the microphone recorder,
 * a streaming transcription provider (cloud Gemini Live or local Whisper),
 * and the push-to-talk / toggle key handling.
 *
 * Returns `isRecording` / `isConnecting` UI flags, imperative
 * `startVoiceRecording` / `stopVoiceRecording` controls, a
 * `handleVoiceInput` key interceptor (returns true when the key was
 * consumed), and `resetTurnBaseline`.
 */
export function useVoiceMode({
  buffer,
  config,
  settings,
  setQueueErrorMessage,
  isVoiceModeEnabled,
  setVoiceModeEnabled,
  keyMatchers,
}: UseVoiceModeProps) {
  const [isRecording, setIsRecording] = useState(false);
  const [isConnecting, setIsConnecting] = useState(false);

  // Mutable session state kept in refs so timers and provider event
  // handlers always see current values without re-subscribing.
  const liveTranscriptionRef = useRef(''); // last partial transcription applied to the buffer
  const stopRequestedRef = useRef(false); // set once stop begins; gates late async work
  const isRecordingRef = useRef(false); // mirrors isRecording for non-render code paths
  const lastFailureTimeRef = useRef(0); // used to debounce restarts for 2s after a failure
  const recordingInProgressRef = useRef(false); // NOTE(review): set/cleared here but never read in this hook — confirm external use or remove
  const voiceTimeoutRef = useRef<NodeJS.Timeout | null>(null);
  const recorderRef = useRef<AudioRecorder | null>(null);
  const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null);
  const turnBaselineRef = useRef<string | null>(null); // buffer text at recording start; cleared via resetTurnBaseline

  // Push-to-talk key state machine (see handleVoiceInput).
  const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle');
  const pttTimerRef = useRef<NodeJS.Timeout | null>(null);
  const disconnectTimerRef = useRef<NodeJS.Timeout | null>(null);

  // Keep a ref to the latest buffer so async callbacks edit the live one.
  const bufferRef = useRef(buffer);
  bufferRef.current = buffer;

  // Stops capture, then lets the transcription service drain for a grace
  // period before disconnecting so in-flight text still lands in the buffer.
  const stopVoiceRecording = useCallback(() => {
    if (stopRequestedRef.current) return; // idempotent
    debugLogger.debug('[Voice] Stop requested');
    stopRequestedRef.current = true;

    setIsRecording(false);
    isRecordingRef.current = false;
    setIsConnecting(false);

    if (recorderRef.current) {
      recorderRef.current.stop();
      recorderRef.current = null;
    }

    // Detach the service from the ref first so late events from it can be
    // recognized as orphaned by the identity checks in the handlers.
    const serviceToDisconnect = transcriptionServiceRef.current;
    transcriptionServiceRef.current = null;

    if (serviceToDisconnect) {
      // Cloud gets a longer default drain window than local transcription.
      const isLive = settings.experimental.voice?.backend === 'gemini-live';
      const gracePeriodMs =
        settings.experimental.voice?.stopGracePeriodMs ??
        (isLive ? 2000 : 1000);
      debugLogger.debug(
        `[Voice] Draining transcription for ${gracePeriodMs}ms`,
      );

      if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
      disconnectTimerRef.current = setTimeout(() => {
        debugLogger.debug('[Voice] Grace period ended, disconnecting service');
        serviceToDisconnect.disconnect();
        disconnectTimerRef.current = null;
      }, gracePeriodMs);
    }

    liveTranscriptionRef.current = '';
    pttStateRef.current = 'idle';
  }, [settings.experimental.voice]);

  // Starts a new capture + transcription session. Synchronous part flips
  // the UI flags immediately; the rest runs in startAsync.
  const startVoiceRecording = useCallback(() => {
    // Ignore if already recording, or within 2s of the last failure
    // (prevents rapid retry loops from a held PTT key).
    if (
      isRecordingRef.current ||
      Date.now() - lastFailureTimeRef.current < 2000
    ) {
      return;
    }

    // Cancel any pending drain-disconnect from a previous stop.
    if (disconnectTimerRef.current) {
      clearTimeout(disconnectTimerRef.current);
      disconnectTimerRef.current = null;
    }

    recordingInProgressRef.current = true;
    turnBaselineRef.current = bufferRef.current.text;

    setIsConnecting(true);
    setIsRecording(true);
    isRecordingRef.current = true;

    liveTranscriptionRef.current = '';
    stopRequestedRef.current = false;

    const apiKey =
      config.getContentGeneratorConfig()?.apiKey ||
      process.env['GEMINI_API_KEY'] ||
      '';

    const startAsync = async () => {
      // If there's an active draining service, disconnect it immediately
      // before starting a new one to prevent orphaned event collisions.
      if (disconnectTimerRef.current) {
        clearTimeout(disconnectTimerRef.current);
        disconnectTimerRef.current = null;
      }
      if (transcriptionServiceRef.current) {
        transcriptionServiceRef.current.disconnect();
        transcriptionServiceRef.current = null;
      }

      // If the user released the key while we were connecting, tear down
      // everything created so far. Returns true when we should bail out.
      const cleanupIfStopped = () => {
        if (stopRequestedRef.current) {
          if (recorderRef.current) {
            recorderRef.current.stop();
            recorderRef.current = null;
          }
          if (transcriptionServiceRef.current) {
            transcriptionServiceRef.current.disconnect();
            transcriptionServiceRef.current = null;
          }
          setIsRecording(false);
          isRecordingRef.current = false;
          setIsConnecting(false);
          recordingInProgressRef.current = false;
          return true;
        }
        return false;
      };

      if (cleanupIfStopped()) return;

      const voiceBackend =
        settings.experimental.voice?.backend ?? 'gemini-live';

      // Cloud backend requires an API key; fail fast with guidance.
      if (!apiKey && voiceBackend === 'gemini-live') {
        setQueueErrorMessage(
          'Cloud voice mode requires a GEMINI_API_KEY. Please set it in your environment or ~/.gemini/.env.',
        );
        setIsRecording(false);
        isRecordingRef.current = false;
        setIsConnecting(false);
        recordingInProgressRef.current = false;
        lastFailureTimeRef.current = Date.now();
        return;
      }

      // Only the cloud backend needs this hook to drive the recorder;
      // presumably the whisper provider captures audio itself — confirm.
      if (voiceBackend === 'gemini-live') {
        recorderRef.current = new AudioRecorder();
      }

      const currentService = TranscriptionFactory.createProvider(
        settings.experimental.voice,
        apiKey,
      );
      transcriptionServiceRef.current = currentService;

      // Streaming partials: each event carries the full current partial for
      // the turn, so the previous partial is peeled off the buffer tail and
      // replaced rather than appended.
      currentService.on('transcription', (text) => {
        if (
          transcriptionServiceRef.current !== currentService &&
          stopRequestedRef.current
        ) {
          // If this is an orphaned service that was replaced by a new session, ignore its events
          return;
        }

        if (text) {
          const currentBufferText = bufferRef.current.text;
          const previousTranscription = liveTranscriptionRef.current;

          let newTotalText = currentBufferText;

          if (
            previousTranscription &&
            currentBufferText.endsWith(previousTranscription)
          ) {
            // Remove the previous partial before appending the new one.
            newTotalText = currentBufferText.slice(
              0,
              -previousTranscription.length,
            );
          } else if (
            currentBufferText &&
            !currentBufferText.endsWith(' ') &&
            !currentBufferText.endsWith('\n')
          ) {
            // Separate transcription from pre-existing typed text.
            newTotalText += ' ';
          }

          newTotalText += text;
          bufferRef.current.setText(newTotalText, 'end');
        }
        liveTranscriptionRef.current = text;
      });

      // A completed turn finalizes the current partial in place.
      currentService.on('turnComplete', () => {
        if (
          transcriptionServiceRef.current !== currentService &&
          stopRequestedRef.current
        )
          return;
        liveTranscriptionRef.current = '';
      });

      currentService.on('error', (err) => {
        if (transcriptionServiceRef.current !== currentService) return;
        debugLogger.error('[Voice] Transcription error:', err);
        lastFailureTimeRef.current = Date.now();
        recordingInProgressRef.current = false;
      });

      // Unsolicited close (not user-initiated stop) resets the UI state.
      currentService.on('close', () => {
        if (transcriptionServiceRef.current !== currentService) return;
        if (!stopRequestedRef.current) {
          setIsRecording(false);
          isRecordingRef.current = false;
          setIsConnecting(false);
          recordingInProgressRef.current = false;
          lastFailureTimeRef.current = Date.now();
        }
      });

      try {
        await currentService.connect();
        if (cleanupIfStopped()) return;

        await recorderRef.current?.start();
        if (cleanupIfStopped()) return;

        setIsConnecting(false);

        const currentVoiceBackend =
          settings.experimental.voice?.backend ?? 'gemini-live';

        // Pump mic chunks into the cloud service. (The recorder only
        // exists for gemini-live; the guard keeps that invariant local.)
        recorderRef.current?.on('data', (chunk) => {
          if (currentVoiceBackend === 'gemini-live') {
            currentService.sendAudioChunk(chunk);
          }
        });
        recorderRef.current?.on('error', (err) => {
          debugLogger.error('[Voice] Recorder error:', err);
          stopVoiceRecording();
          lastFailureTimeRef.current = Date.now();
        });
      } catch (err: unknown) {
        // Only report the failure if this session is still the active one.
        if (transcriptionServiceRef.current !== currentService) return;
        const message = err instanceof Error ? err.message : String(err);
        setQueueErrorMessage(`Voice mode failure: ${message}`);
        setIsRecording(false);
        isRecordingRef.current = false;
        setIsConnecting(false);
        recordingInProgressRef.current = false;
        lastFailureTimeRef.current = Date.now();

        if (recorderRef.current) {
          recorderRef.current.stop();
          recorderRef.current = null;
        }
        if (transcriptionServiceRef.current) {
          transcriptionServiceRef.current.disconnect();
          transcriptionServiceRef.current = null;
        }
      }
    };

    void startAsync();
  }, [
    config,
    settings.experimental.voice,
    setQueueErrorMessage,
    stopVoiceRecording,
  ]);

  // Unmount cleanup: cancel all timers and release recorder + service.
  useEffect(
    () => () => {
      if (voiceTimeoutRef.current) clearTimeout(voiceTimeoutRef.current);
      if (recorderRef.current) {
        recorderRef.current.stop();
        recorderRef.current = null;
      }
      if (transcriptionServiceRef.current) {
        transcriptionServiceRef.current.disconnect();
        transcriptionServiceRef.current = null;
      }
      if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
      if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
    },
    [],
  );

  // Key interceptor. Returns true when the key was consumed by voice mode.
  const handleVoiceInput = useCallback(
    (key: Key): boolean => {
      const activeRecording = isRecording || isRecordingRef.current;

      // --- While recording: every key is swallowed. ---
      if (activeRecording) {
        const activationMode =
          settings.experimental.voice?.activationMode ?? 'push-to-talk';

        if (keyMatchers[Command.ESCAPE](key)) {
          stopVoiceRecording();
          return true;
        }

        if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
          if (activationMode === 'push-to-talk') {
            // Each repeat of the held key re-arms the release timer; when
            // repeats cease for RELEASE_DELAY_MS, recording stops.
            if (pttTimerRef.current) {
              clearTimeout(pttTimerRef.current);
            }
            pttTimerRef.current = setTimeout(() => {
              stopVoiceRecording();
              pttTimerRef.current = null;
            }, RELEASE_DELAY_MS);
            return true;
          } else {
            // Toggle mode: a second press stops recording.
            stopVoiceRecording();
            return true;
          }
        }
        return true;
      }

      // --- Voice mode enabled but not recording. ---
      if (isVoiceModeEnabled) {
        const activationMode =
          settings.experimental.voice?.activationMode ?? 'push-to-talk';

        // Esc on an empty buffer leaves voice mode entirely.
        if (keyMatchers[Command.ESCAPE](key) && buffer.text === '') {
          setVoiceModeEnabled(false);
          return true;
        }

        if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
          // Only an unmodified plain space participates in PTT detection,
          // so space still works as a character with modifiers.
          if (
            key.name === 'space' &&
            !key.ctrl &&
            !key.alt &&
            !key.shift &&
            !key.cmd
          ) {
            if (activationMode === 'toggle') {
              startVoiceRecording();
              return true;
            } else {
              if (pttStateRef.current === 'idle') {
                // First press: optimistically type the space, then wait
                // HOLD_DELAY_MS for a repeat that would indicate a hold.
                buffer.insert(' ');
                pttStateRef.current = 'possible-hold';

                if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
                pttTimerRef.current = setTimeout(() => {
                  pttStateRef.current = 'idle';
                  pttTimerRef.current = null;
                }, HOLD_DELAY_MS);
                return true;
              } else if (pttStateRef.current === 'possible-hold') {
                // Repeat arrived: it was a hold. Undo the typed space and
                // start recording; arm the release timer.
                if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
                buffer.backspace();
                pttStateRef.current = 'recording';
                startVoiceRecording();

                pttTimerRef.current = setTimeout(() => {
                  stopVoiceRecording();
                  pttTimerRef.current = null;
                }, RELEASE_DELAY_MS);
                return true;
              }
            }
          }
        }

        // Any other key cancels a pending hold detection.
        if (pttStateRef.current === 'possible-hold') {
          pttStateRef.current = 'idle';
          if (pttTimerRef.current) {
            clearTimeout(pttTimerRef.current);
            pttTimerRef.current = null;
          }
        }
      }

      return false;
    },
    [
      isRecording,
      isVoiceModeEnabled,
      settings.experimental.voice,
      keyMatchers,
      stopVoiceRecording,
      startVoiceRecording,
      buffer,
      setVoiceModeEnabled,
    ],
  );

  return {
    isRecording,
    isConnecting,
    startVoiceRecording,
    stopVoiceRecording,
    handleVoiceInput,
    resetTurnBaseline: () => {
      turnBaselineRef.current = null;
    },
  };
}
|
||||
@@ -0,0 +1,31 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { useState, useCallback } from 'react';
|
||||
|
||||
/** Shape returned by `useVoiceModelCommand`. */
interface UseVoiceModelCommandReturn {
  /** True while the voice-model selection dialog is displayed. */
  isVoiceModelDialogOpen: boolean;
  /** Shows the dialog. */
  openVoiceModelDialog: () => void;
  /** Hides the dialog. */
  closeVoiceModelDialog: () => void;
}
|
||||
|
||||
export const useVoiceModelCommand = (): UseVoiceModelCommandReturn => {
|
||||
const [isVoiceModelDialogOpen, setIsVoiceModelDialogOpen] = useState(false);
|
||||
|
||||
const openVoiceModelDialog = useCallback(() => {
|
||||
setIsVoiceModelDialogOpen(true);
|
||||
}, []);
|
||||
|
||||
const closeVoiceModelDialog = useCallback(() => {
|
||||
setIsVoiceModelDialogOpen(false);
|
||||
}, []);
|
||||
|
||||
return {
|
||||
isVoiceModelDialogOpen,
|
||||
openVoiceModelDialog,
|
||||
closeVoiceModelDialog,
|
||||
};
|
||||
};
|
||||
@@ -97,6 +97,7 @@ export enum Command {
|
||||
RESTART_APP = 'app.restart',
|
||||
SUSPEND_APP = 'app.suspend',
|
||||
SHOW_SHELL_INPUT_UNFOCUS_WARNING = 'app.showShellUnfocusWarning',
|
||||
VOICE_MODE_PTT = 'app.voiceModePTT',
|
||||
|
||||
// Background Shell Controls
|
||||
BACKGROUND_SHELL_ESCAPE = 'background.escape',
|
||||
@@ -407,9 +408,7 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
|
||||
[Command.RESTART_APP, [new KeyBinding('r'), new KeyBinding('shift+r')]],
|
||||
[Command.SUSPEND_APP, [new KeyBinding('ctrl+z')]],
|
||||
[Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING, [new KeyBinding('tab')]],
|
||||
[Command.DUMP_FRAME, [new KeyBinding('f8')]],
|
||||
[Command.START_RECORDING, [new KeyBinding('f6')]],
|
||||
[Command.STOP_RECORDING, [new KeyBinding('f7')]],
|
||||
[Command.VOICE_MODE_PTT, [new KeyBinding('space')]],
|
||||
|
||||
// Background Shell Controls
|
||||
[Command.BACKGROUND_SHELL_ESCAPE, [new KeyBinding('escape')]],
|
||||
@@ -424,6 +423,10 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
|
||||
// Extension Controls
|
||||
[Command.UPDATE_EXTENSION, [new KeyBinding('i')]],
|
||||
[Command.LINK_EXTENSION, [new KeyBinding('l')]],
|
||||
|
||||
[Command.DUMP_FRAME, [new KeyBinding('f8')]],
|
||||
[Command.START_RECORDING, [new KeyBinding('f6')]],
|
||||
[Command.STOP_RECORDING, [new KeyBinding('f7')]],
|
||||
]);
|
||||
|
||||
interface CommandCategory {
|
||||
@@ -538,6 +541,7 @@ export const commandCategories: readonly CommandCategory[] = [
|
||||
Command.RESTART_APP,
|
||||
Command.SUSPEND_APP,
|
||||
Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING,
|
||||
Command.VOICE_MODE_PTT,
|
||||
],
|
||||
},
|
||||
{
|
||||
@@ -658,6 +662,7 @@ export const commandDescriptions: Readonly<Record<Command, string>> = {
|
||||
[Command.SUSPEND_APP]: 'Suspend the CLI and move it to the background.',
|
||||
[Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING]:
|
||||
'Show warning when trying to move focus away from shell input.',
|
||||
[Command.VOICE_MODE_PTT]: 'Hold to speak in Voice Mode.',
|
||||
|
||||
// Background Shell Controls
|
||||
[Command.BACKGROUND_SHELL_ESCAPE]: 'Dismiss background shell list.',
|
||||
|
||||
@@ -43,5 +43,6 @@ export function createNonInteractiveUI(): CommandContext['ui'] {
|
||||
removeComponent: () => {},
|
||||
toggleBackgroundTasks: () => {},
|
||||
toggleShortcutsHelp: () => {},
|
||||
toggleVoiceMode: () => {},
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user