mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-14 13:53:02 -07:00
feat(voice): implement real-time voice mode with cloud and local backends (#24174)
This commit is contained in:
@@ -25,6 +25,7 @@ import { relaunchApp } from '../../utils/processUtils.js';
|
||||
import { SessionBrowser } from './SessionBrowser.js';
|
||||
import { PermissionsModifyTrustDialog } from './PermissionsModifyTrustDialog.js';
|
||||
import { ModelDialog } from './ModelDialog.js';
|
||||
import { VoiceModelDialog } from './VoiceModelDialog.js';
|
||||
import { theme } from '../semantic-colors.js';
|
||||
import { useUIState } from '../contexts/UIStateContext.js';
|
||||
import { useQuotaState } from '../contexts/QuotaContext.js';
|
||||
@@ -238,6 +239,9 @@ export const DialogManager = ({
|
||||
if (uiState.isModelDialogOpen) {
|
||||
return <ModelDialog onClose={uiActions.closeModelDialog} />;
|
||||
}
|
||||
if (uiState.isVoiceModelDialogOpen) {
|
||||
return <VoiceModelDialog onClose={uiActions.closeVoiceModelDialog} />;
|
||||
}
|
||||
if (
|
||||
uiState.isAgentConfigDialogOpen &&
|
||||
uiState.selectedAgentName &&
|
||||
|
||||
@@ -9,12 +9,41 @@ import { createMockSettings } from '../../test-utils/settings.js';
|
||||
import { makeFakeConfig } from '@google/gemini-cli-core';
|
||||
import { waitFor } from '../../test-utils/async.js';
|
||||
import { act, useState, useMemo } from 'react';
|
||||
import type { EventEmitter } from 'node:events';
|
||||
|
||||
/**
 * Shared fake transcription provider used by the voice-mode tests.
 *
 * It is created inside `vi.hoisted` so that it already exists when the
 * `vi.mock` factory for `@google/gemini-cli-core` runs. The instance is an
 * EventEmitter, so tests can emit provider events on it directly; its provider
 * methods are plain mocks.
 */
const { fakeTranscriptionProvider } = vi.hoisted(() => {
  // A synchronous require is used because the hoisted callback needs
  // immediate access to the module.
  // eslint-disable-next-line @typescript-eslint/no-require-imports, no-restricted-syntax
  const nodeEvents = require('node:events');

  class FakeTranscriptionProvider extends nodeEvents.EventEmitter {
    connect = vi.fn().mockResolvedValue(undefined);
    disconnect = vi.fn();
    sendAudioChunk = vi.fn();
    getTranscription = vi.fn().mockReturnValue('');
  }

  const provider = new FakeTranscriptionProvider();
  return { fakeTranscriptionProvider: provider };
});
|
||||
|
||||
/**
 * Partially mocks `@google/gemini-cli-core`: every real export is kept, and
 * only `TranscriptionFactory.createProvider` is replaced so that it returns
 * the shared fake provider.
 */
vi.mock('@google/gemini-cli-core', async (importOriginal) => {
  // Typing the original module through the generic parameter keeps the
  // spread below type-checked and avoids an `any` cast.
  const actual =
    await importOriginal<typeof import('@google/gemini-cli-core')>();
  return {
    ...actual,
    TranscriptionFactory: {
      createProvider: vi.fn(() => fakeTranscriptionProvider),
    },
  };
});
|
||||
|
||||
import {
|
||||
InputPrompt,
|
||||
tryTogglePasteExpansion,
|
||||
type InputPromptProps,
|
||||
} from './InputPrompt.js';
|
||||
import { InputContext } from '../contexts/InputContext.js';
|
||||
import { type UIState } from '../contexts/UIStateContext.js';
|
||||
import {
|
||||
calculateTransformationsForLine,
|
||||
calculateTransformedLine,
|
||||
@@ -417,6 +446,7 @@ describe('InputPrompt', () => {
|
||||
getWorkspaceContext: () => ({
|
||||
getDirectories: () => ['/test/project/src'],
|
||||
}),
|
||||
getContentGeneratorConfig: () => ({ apiKey: 'test-api-key' }),
|
||||
} as unknown as Config,
|
||||
slashCommands: mockSlashCommands,
|
||||
commandContext: mockCommandContext,
|
||||
@@ -4925,6 +4955,383 @@ describe('InputPrompt', () => {
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
describe('Voice Mode', () => {
  // Strings rendered by InputPrompt that the tests look for in the frame.
  const LISTENING_INDICATOR = '🎙️ Listening...';
  const TOGGLE_HINT = 'Voice mode: Space to start/stop recording';
  const PUSH_TO_TALK_HINT = 'Voice mode: Hold Space to record';

  type ActivationMode = 'toggle' | 'push-to-talk';

  /**
   * Renders the prompt with the given voice activation mode.
   * `props` and `mockBuffer` are read at call time, so each test sees the
   * values prepared by the enclosing suite's setup.
   */
  const renderVoicePrompt = (
    activationMode: ActivationMode,
    isVoiceModeEnabled = true,
  ) =>
    renderWithProviders(
      <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
      {
        uiState: { isVoiceModeEnabled } as UIState,
        settings: createMockSettings({
          experimental: { voice: { activationMode } },
        }),
      },
    );

  /** Sets the buffer text inside `act` before the prompt is rendered. */
  const seedBuffer = async (text: string) => {
    await act(async () => {
      mockBuffer.setText(text);
    });
  };

  /** Writes the given keys to stdin inside a single `act` call. */
  const pressKeys = async (
    stdin: { write: (data: string) => unknown },
    ...keys: string[]
  ) => {
    await act(async () => {
      for (const key of keys) {
        stdin.write(key);
      }
    });
  };

  /** Emits an event on the shared fake transcription provider inside `act`. */
  const emitProviderEvent = async (event: string, ...args: unknown[]) => {
    await act(async () => {
      (fakeTranscriptionProvider as unknown as EventEmitter).emit(
        event,
        ...args,
      );
    });
  };

  beforeEach(() => {
    // The fake provider is shared across tests, so drop listeners left over
    // from the previous test.
    (
      fakeTranscriptionProvider as unknown as EventEmitter
    ).removeAllListeners();
    vi.clearAllMocks();
  });

  it('should start recording when space is pressed and voice mode is enabled (toggle)', async () => {
    await seedBuffer('');
    const { stdin, unmount, lastFrame } = await renderVoicePrompt('toggle');

    // Initially not recording
    expect(lastFrame()).not.toContain(LISTENING_INDICATOR);
    expect(lastFrame()).toContain(TOGGLE_HINT);

    // Press space to start
    await pressKeys(stdin, ' ');

    // Now should show listening
    await waitFor(() => {
      expect(lastFrame()).toContain(LISTENING_INDICATOR);
    });

    unmount();
  });

  it('should toggle recording off when space is pressed again (toggle)', async () => {
    await seedBuffer('');
    const { stdin, unmount, lastFrame } = await renderVoicePrompt('toggle');

    // Start recording
    await pressKeys(stdin, ' ');
    await waitFor(() => {
      expect(lastFrame()).toContain(LISTENING_INDICATOR);
    });

    // Stop recording
    await pressKeys(stdin, ' ');
    await waitFor(() => {
      expect(lastFrame()).not.toContain(LISTENING_INDICATOR);
      expect(lastFrame()).toContain(TOGGLE_HINT);
    });

    unmount();
  });

  it('should resume recording when space is pressed even if buffer is not empty (toggle)', async () => {
    await seedBuffer('some existing text');
    const { stdin, unmount, lastFrame } = await renderVoicePrompt('toggle');

    // Should show voice mode hint even if buffer is not empty (new behavior)
    expect(lastFrame()).toContain(TOGGLE_HINT);
    expect(lastFrame()).toContain('some existing text');

    // Press space to start recording again
    await pressKeys(stdin, ' ');

    await waitFor(() => {
      expect(lastFrame()).toContain(LISTENING_INDICATOR);
    });

    unmount();
  });

  it('should not start recording if voice mode is disabled (toggle)', async () => {
    await seedBuffer('');
    const { stdin, unmount, lastFrame } = await renderVoicePrompt(
      'toggle',
      false,
    );

    // Press space
    await pressKeys(stdin, ' ');

    // Should NOT show listening, instead should call handleInput which handles space
    expect(lastFrame()).not.toContain(LISTENING_INDICATOR);
    expect(mockBuffer.handleInput).toHaveBeenCalled();
    unmount();
  });

  it('should append transcription correctly across multiple turn updates (toggle)', async () => {
    await seedBuffer('initial');
    const { stdin, unmount } = await renderVoicePrompt('toggle');

    // Start recording
    await pressKeys(stdin, ' ');

    // Emit first transcription
    await emitProviderEvent('transcription', 'hello');
    await waitFor(() => {
      expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
    });

    // Emit turnComplete (Gemini Live starts over after this)
    await emitProviderEvent('turnComplete');

    // Emit second part (Gemini Live sends new turn text starting from empty)
    await emitProviderEvent('transcription', 'world');
    await waitFor(() => {
      // Should have appended 'world' to the baseline 'initial hello'
      expect(mockBuffer.setText).toHaveBeenCalledWith(
        'initial hello world',
        'end',
      );
    });

    unmount();
  });

  it('should append transcription correctly when resuming voice mode (toggle)', async () => {
    await seedBuffer('First turn.');
    const { stdin, unmount } = await renderVoicePrompt('toggle');

    // Start recording (resumed)
    await pressKeys(stdin, ' ');

    // Emit transcription
    await emitProviderEvent('transcription', 'Second turn.');

    await waitFor(() => {
      expect(mockBuffer.setText).toHaveBeenCalledWith(
        'First turn. Second turn.',
        'end',
      );
    });

    unmount();
  });

  describe('push-to-talk', () => {
    beforeEach(() => {
      vi.useFakeTimers();
    });

    afterEach(() => {
      vi.useRealTimers();
    });

    it('should insert a space on a single tap', async () => {
      const { stdin, unmount, lastFrame } =
        await renderVoicePrompt('push-to-talk');

      expect(lastFrame()).toContain(PUSH_TO_TALK_HINT);

      // Press space once
      await pressKeys(stdin, ' ');

      // Should insert space optimistically
      expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
      expect(lastFrame()).not.toContain(LISTENING_INDICATOR);

      // Advance timer past HOLD_DELAY_MS
      await act(async () => {
        vi.advanceTimersByTime(700);
      });

      expect(lastFrame()).not.toContain(LISTENING_INDICATOR);
      unmount();
    });

    it('should start recording on hold (simulated by repeat spaces)', async () => {
      const { stdin, unmount, lastFrame } =
        await renderVoicePrompt('push-to-talk');

      // First space
      await pressKeys(stdin, ' ');
      expect(mockBuffer.insert).toHaveBeenCalledWith(' ');

      // Second space (repeat)
      await pressKeys(stdin, ' ');

      await waitFor(() => {
        // Should have backspaced the optimistic space
        expect(mockBuffer.backspace).toHaveBeenCalled();
        // Should show listening
        expect(lastFrame()).toContain(LISTENING_INDICATOR);
      });

      unmount();
    });

    it('should stop recording when space heartbeat stops (release)', async () => {
      const { stdin, unmount, lastFrame } =
        await renderVoicePrompt('push-to-talk');

      // Start hold: both spaces are written within the same act call
      await pressKeys(stdin, ' ', ' ');

      // Use a short interval in waitFor to prevent advancing fake timers past the 300ms RELEASE_DELAY_MS
      await waitFor(
        () => {
          expect(lastFrame()).toContain(LISTENING_INDICATOR);
        },
        { interval: 10 },
      );

      // Simulate heartbeat (held key) - send space first to reset timer, then advance
      await act(async () => {
        stdin.write(' ');
        vi.advanceTimersByTime(100);
      });
      expect(lastFrame()).toContain(LISTENING_INDICATOR);

      // Stop heartbeat (release)
      await act(async () => {
        vi.advanceTimersByTime(400); // Past RELEASE_DELAY_MS
      });

      await waitFor(() => {
        expect(lastFrame()).not.toContain(LISTENING_INDICATOR);
      });

      unmount();
    });

    it('should cancel hold state if non-space key is pressed after first space', async () => {
      const { stdin, unmount } = await renderVoicePrompt('push-to-talk');

      // First space
      await pressKeys(stdin, ' ');

      // Type 'a'
      await pressKeys(stdin, 'a');

      // Should NOT start recording on next space even if fast
      await pressKeys(stdin, ' ');

      expect(mockBuffer.insert).toHaveBeenCalledTimes(2); // Two spaces inserted
      expect(mockBuffer.handleInput).toHaveBeenCalledWith(
        expect.objectContaining({ name: 'a' }),
      );
      unmount();
    });
  });
});
|
||||
});
|
||||
|
||||
function clean(str: string | undefined): string {
|
||||
|
||||
@@ -56,6 +56,7 @@ import {
|
||||
debugLogger,
|
||||
type Config,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { useVoiceMode } from '../hooks/useVoiceMode.js';
|
||||
import {
|
||||
parseInputForHighlighting,
|
||||
parseSegmentsFromTokens,
|
||||
@@ -159,7 +160,6 @@ export function isLargePaste(text: string): boolean {
|
||||
}
|
||||
|
||||
const DOUBLE_TAB_CLEAN_UI_TOGGLE_WINDOW_MS = 350;
|
||||
|
||||
/**
|
||||
* Attempt to toggle expansion of a paste placeholder in the buffer.
|
||||
* Returns true if a toggle action was performed or hint was shown, false otherwise.
|
||||
@@ -238,6 +238,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
setEmbeddedShellFocused,
|
||||
setShortcutsHelpVisible,
|
||||
toggleCleanUiDetailsVisible,
|
||||
setVoiceModeEnabled,
|
||||
} = useUIActions();
|
||||
const {
|
||||
terminalWidth,
|
||||
@@ -246,6 +247,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
backgroundTasks,
|
||||
backgroundTaskHeight,
|
||||
shortcutsHelpVisible,
|
||||
isVoiceModeEnabled,
|
||||
} = useUIState();
|
||||
const [suppressCompletion, setSuppressCompletion] = useState(false);
|
||||
const { handlePress: registerPlainTabPress, resetCount: resetPlainTabPress } =
|
||||
@@ -263,6 +265,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
resetEscapeState();
|
||||
if (buffer.text.length > 0) {
|
||||
buffer.setText('');
|
||||
resetTurnBaseline();
|
||||
resetCompletionState();
|
||||
} else if (history.length > 0) {
|
||||
onSubmit('/rewind');
|
||||
@@ -281,6 +284,16 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
const hasUserNavigatedSuggestions = useRef(false);
|
||||
const listRef = useRef<ScrollableListRef<ScrollableItem>>(null);
|
||||
|
||||
const { isRecording, handleVoiceInput, resetTurnBaseline } = useVoiceMode({
|
||||
buffer,
|
||||
config,
|
||||
settings,
|
||||
setQueueErrorMessage,
|
||||
isVoiceModeEnabled,
|
||||
setVoiceModeEnabled,
|
||||
keyMatchers,
|
||||
});
|
||||
|
||||
const [reverseSearchActive, setReverseSearchActive] = useState(false);
|
||||
const [commandSearchActive, setCommandSearchActive] = useState(false);
|
||||
const [textBeforeReverseSearch, setTextBeforeReverseSearch] = useState('');
|
||||
@@ -387,6 +400,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
// Clear the buffer *before* calling onSubmit to prevent potential re-submission
|
||||
// if onSubmit triggers a re-render while the buffer still holds the old value.
|
||||
buffer.setText('');
|
||||
resetTurnBaseline();
|
||||
onSubmit(processedValue);
|
||||
resetCompletionState();
|
||||
resetReverseSearchCompletionState();
|
||||
@@ -398,6 +412,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
shellModeActive,
|
||||
shellHistory,
|
||||
resetReverseSearchCompletionState,
|
||||
resetTurnBaseline,
|
||||
],
|
||||
);
|
||||
|
||||
@@ -647,6 +662,8 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
|
||||
const handleInput = useCallback(
|
||||
(key: Key) => {
|
||||
if (handleVoiceInput(key)) return true;
|
||||
|
||||
// Determine if this keypress is a history navigation command
|
||||
const isHistoryUp =
|
||||
!shellModeActive &&
|
||||
@@ -873,9 +890,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
) {
|
||||
setShellModeActive(!shellModeActive);
|
||||
buffer.setText(''); // Clear the '!' from input
|
||||
resetTurnBaseline();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (keyMatchers[Command.ESCAPE](key)) {
|
||||
const cancelSearch = (
|
||||
setActive: (active: boolean) => void,
|
||||
@@ -1360,6 +1377,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
backgroundTaskHeight,
|
||||
streamingState,
|
||||
handleEscPress,
|
||||
resetTurnBaseline,
|
||||
registerPlainTabPress,
|
||||
resetPlainTabPress,
|
||||
toggleCleanUiDetailsVisible,
|
||||
@@ -1369,9 +1387,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
keyMatchers,
|
||||
isHelpDismissKey,
|
||||
settings,
|
||||
handleVoiceInput,
|
||||
],
|
||||
);
|
||||
|
||||
useKeypress(handleInput, {
|
||||
isActive: !isEmbeddedShellFocused && !copyModeEnabled,
|
||||
priority: true,
|
||||
@@ -1792,20 +1810,39 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
)}{' '}
|
||||
</Text>
|
||||
<Box flexGrow={1} flexDirection="column" ref={innerBoxRef}>
|
||||
{buffer.text.length === 0 && placeholder ? (
|
||||
showCursor ? (
|
||||
<Text
|
||||
terminalCursorFocus={showCursor}
|
||||
terminalCursorPosition={0}
|
||||
>
|
||||
{chalk.inverse(placeholder.slice(0, 1))}
|
||||
<Text color={theme.text.secondary}>
|
||||
{placeholder.slice(1)}
|
||||
</Text>
|
||||
{isRecording && (
|
||||
<Box flexDirection="row" marginBottom={0}>
|
||||
<Text color={theme.status.success}>🎙️ Listening...</Text>
|
||||
</Box>
|
||||
)}
|
||||
{isVoiceModeEnabled && !isRecording && (
|
||||
<Box flexDirection="row" marginBottom={0}>
|
||||
<Text color={theme.text.secondary}>
|
||||
> Voice mode:{' '}
|
||||
{(settings.experimental.voice?.activationMode ??
|
||||
'push-to-talk') === 'push-to-talk'
|
||||
? 'Hold Space to record'
|
||||
: 'Space to start/stop recording'}{' '}
|
||||
(Esc to exit)
|
||||
</Text>
|
||||
) : (
|
||||
<Text color={theme.text.secondary}>{placeholder}</Text>
|
||||
)
|
||||
</Box>
|
||||
)}
|
||||
{buffer.text.length === 0 && !isRecording ? (
|
||||
!isVoiceModeEnabled && placeholder ? (
|
||||
showCursor ? (
|
||||
<Text
|
||||
terminalCursorFocus={showCursor}
|
||||
terminalCursorPosition={0}
|
||||
>
|
||||
{chalk.inverse(placeholder.slice(0, 1))}
|
||||
<Text color={theme.text.secondary}>
|
||||
{placeholder.slice(1)}
|
||||
</Text>
|
||||
</Text>
|
||||
) : (
|
||||
<Text color={theme.text.secondary}>{placeholder}</Text>
|
||||
)
|
||||
) : null
|
||||
) : (
|
||||
<Box
|
||||
flexDirection="column"
|
||||
|
||||
@@ -0,0 +1,236 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import type React from 'react';
|
||||
import { useCallback, useMemo, useState } from 'react';
|
||||
import { Box, Text } from 'ink';
|
||||
import { theme } from '../semantic-colors.js';
|
||||
import { DescriptiveRadioButtonSelect } from './shared/DescriptiveRadioButtonSelect.js';
|
||||
import { useSettingsStore } from '../contexts/SettingsContext.js';
|
||||
import { SettingScope } from '../../config/settings.js';
|
||||
import { useKeypress, type Key } from '../hooks/useKeypress.js';
|
||||
import { isBinaryAvailable } from '@google/gemini-cli-core';
|
||||
import {
|
||||
WhisperModelManager,
|
||||
type WhisperModelProgress,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { CliSpinner } from './CliSpinner.js';
|
||||
|
||||
/** Props accepted by {@link VoiceModelDialog}. */
interface VoiceModelDialogProps {
  /** Called when the dialog should be dismissed. */
  onClose: () => void;
}

/**
 * Screens the dialog can show: the backend picker, or the Whisper model
 * picker.
 */
type DialogView = 'backend' | 'whisper-models';
|
||||
|
||||
/** One selectable Whisper model entry shown in the model picker. */
interface WhisperModelOption {
  /** Model file name; used as the persisted `whisperModel` setting value. */
  value: string;
  /** Short human-readable name. */
  label: string;
  /** One-line summary of the speed/accuracy trade-off and download size. */
  description: string;
}

/** Whisper models offered in the dialog, in display order. */
const WHISPER_MODELS: WhisperModelOption[] = [
  {
    value: 'ggml-tiny.en.bin',
    label: 'Tiny (EN)',
    description: 'Fastest, lower accuracy (~75MB)',
  },
  {
    value: 'ggml-base.en.bin',
    label: 'Base (EN)',
    description: 'Balanced speed and accuracy (~142MB)',
  },
  {
    value: 'ggml-large-v3-turbo-q5_0.bin',
    label: 'Large v3 Turbo (Q5_0)',
    description: 'High accuracy, quantized (~547MB)',
  },
  {
    value: 'ggml-large-v3-turbo-q8_0.bin',
    label: 'Large v3 Turbo (Q8_0)',
    description: 'Maximum accuracy, high memory (~834MB)',
  },
];
|
||||
|
||||
/**
 * Dialog for choosing the voice transcription backend.
 *
 * The first view offers the Gemini Live (cloud) backend and the Whisper
 * (local) backend. Choosing Whisper opens a second view listing the known
 * Whisper models; picking a model that is not installed downloads it first.
 * Selections are written to user-scope settings and the dialog is then closed
 * through `onClose`.
 *
 * Esc goes back from the model view to the backend view, and closes the
 * dialog from the backend view.
 *
 * @param onClose Called when the dialog should be dismissed.
 */
export function VoiceModelDialog({
  onClose,
}: VoiceModelDialogProps): React.JSX.Element {
  const { settings, setSetting } = useSettingsStore();
  const [view, setView] = useState<DialogView>('backend');
  const [downloadProgress, setDownloadProgress] =
    useState<WhisperModelProgress | null>(null);
  // Name of the model currently being downloaded. Set as soon as a download
  // starts so the model list is hidden before the first progress event
  // arrives; otherwise a second selection could start another download.
  const [downloadingModel, setDownloadingModel] = useState<string | null>(
    null,
  );
  const [error, setError] = useState<string | null>(null);

  // Both values are computed once per mount.
  const whisperInstalled = useMemo(
    () => isBinaryAvailable('whisper-stream'),
    [],
  );
  const modelManager = useMemo(() => new WhisperModelManager(), []);

  const currentBackend =
    settings.merged.experimental.voice?.backend ?? 'gemini-live';
  const currentWhisperModel =
    settings.merged.experimental.voice?.whisperModel ?? 'ggml-base.en.bin';

  const handleKeypress = useCallback(
    (key: Key) => {
      if (key.name !== 'escape') {
        return false;
      }
      if (view === 'whisper-models') {
        setView('backend');
      } else {
        onClose();
      }
      return true;
    },
    [view, onClose],
  );

  useKeypress(handleKeypress, { isActive: true });

  const handleBackendSelect = useCallback(
    (value: string) => {
      if (value === 'whisper') {
        // Whisper needs a model choice before anything is persisted.
        setView('whisper-models');
        return;
      }
      setSetting(
        SettingScope.User,
        'experimental.voice.backend',
        'gemini-live',
      );
      onClose();
    },
    [setSetting, onClose],
  );

  // Persists the Whisper backend together with the chosen model, then closes.
  const applyWhisperModel = useCallback(
    (modelName: string) => {
      setSetting(SettingScope.User, 'experimental.voice.backend', 'whisper');
      setSetting(
        SettingScope.User,
        'experimental.voice.whisperModel',
        modelName,
      );
      onClose();
    },
    [setSetting, onClose],
  );

  const handleWhisperModelSelect = useCallback(
    async (modelName: string) => {
      if (modelManager.isModelInstalled(modelName)) {
        applyWhisperModel(modelName);
        return;
      }

      setError(null);
      setDownloadingModel(modelName);
      const onProgress = (p: WhisperModelProgress) => setDownloadProgress(p);
      modelManager.on('progress', onProgress);

      try {
        await modelManager.downloadModel(modelName);
        applyWhisperModel(modelName);
      } catch (err) {
        setError(
          `Failed to download: ${err instanceof Error ? err.message : String(err)}`,
        );
      } finally {
        // Always detach the listener and leave the download view, whether
        // the download succeeded or failed.
        modelManager.off('progress', onProgress);
        setDownloadProgress(null);
        setDownloadingModel(null);
      }
    },
    [modelManager, applyWhisperModel],
  );

  const backendOptions = useMemo(
    () => [
      {
        value: 'gemini-live',
        title: 'Gemini Live API (Cloud)',
        description: 'Real-time cloud transcription via Gemini Live API.',
        key: 'gemini-live',
      },
      {
        value: 'whisper',
        title: 'Whisper (Local)',
        description: whisperInstalled
          ? 'Local transcription using whisper.cpp.'
          : 'Local transcription (Requires: brew install whisper-cpp)',
        key: 'whisper',
      },
    ],
    [whisperInstalled],
  );

  const whisperOptions = useMemo(
    () =>
      WHISPER_MODELS.map((m) => ({
        value: m.value,
        title: `${m.label}${modelManager.isModelInstalled(m.value) ? ' (Installed)' : ' (Download)'}`,
        description: m.description,
        key: m.value,
      })),
    [modelManager],
  );

  // The configured model may not be one of the listed models (for example a
  // hand-edited setting). findIndex returns -1 in that case, so fall back to
  // the first entry rather than passing a negative index to the list.
  const currentWhisperIndex = Math.max(
    0,
    whisperOptions.findIndex((o) => o.value === currentWhisperModel),
  );

  const isDownloading = downloadingModel !== null || downloadProgress !== null;

  return (
    <Box
      borderStyle="round"
      borderColor={theme.border.default}
      flexDirection="column"
      padding={1}
      width="100%"
    >
      <Text bold>
        {view === 'backend'
          ? 'Select Voice Transcription Backend'
          : 'Select Whisper Model'}
      </Text>

      {error && (
        <Box marginTop={1}>
          <Text color={theme.status.error}>{error}</Text>
        </Box>
      )}

      {isDownloading ? (
        <Box marginTop={1} flexDirection="column">
          <Box>
            <Text>Downloading {downloadProgress?.modelName ?? downloadingModel}... </Text>
            <CliSpinner />
            {downloadProgress && (
              <Text> {Math.round(downloadProgress.percentage * 100)}%</Text>
            )}
          </Box>
        </Box>
      ) : (
        <Box marginTop={1}>
          {view === 'backend' ? (
            <DescriptiveRadioButtonSelect
              items={backendOptions}
              onSelect={handleBackendSelect}
              initialIndex={currentBackend === 'whisper' ? 1 : 0}
              showNumbers={true}
            />
          ) : (
            <DescriptiveRadioButtonSelect
              items={whisperOptions}
              onSelect={handleWhisperModelSelect}
              initialIndex={currentWhisperIndex}
              showNumbers={true}
            />
          )}
        </Box>
      )}

      <Box marginTop={1} flexDirection="column">
        <Text color={theme.text.secondary}>
          {view === 'whisper-models'
            ? '(Press Esc to go back)'
            : '(Press Esc to close)'}
        </Text>
      </Box>
    </Box>
  );
}
|
||||
@@ -168,13 +168,6 @@ exports[`InputPrompt > mouse interaction > should toggle paste expansion on doub
|
||||
"
|
||||
`;
|
||||
|
||||
exports[`InputPrompt > mouse interaction > should toggle paste expansion on double-click 4`] = `
|
||||
"▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀
|
||||
> [Pasted Text: 10 lines]
|
||||
▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
|
||||
"
|
||||
`;
|
||||
|
||||
exports[`InputPrompt > multiline rendering > should correctly render multiline input including blank lines 1`] = `
|
||||
"────────────────────────────────────────────────────────────────────────────────────────────────────
|
||||
> hello
|
||||
|
||||
Reference in New Issue
Block a user