feat(voice): implement real-time voice mode with cloud and local backends (#24174)

This commit is contained in:
Abhijit Balaji
2026-04-24 14:29:38 -07:00
committed by GitHub
parent 048bf6e514
commit 2e0641c83b
40 changed files with 2244 additions and 43 deletions
+53 -16
View File
@@ -56,6 +56,7 @@ import {
debugLogger,
type Config,
} from '@google/gemini-cli-core';
import { useVoiceMode } from '../hooks/useVoiceMode.js';
import {
parseInputForHighlighting,
parseSegmentsFromTokens,
@@ -159,7 +160,6 @@ export function isLargePaste(text: string): boolean {
}
const DOUBLE_TAB_CLEAN_UI_TOGGLE_WINDOW_MS = 350;
/**
* Attempt to toggle expansion of a paste placeholder in the buffer.
* Returns true if a toggle action was performed or hint was shown, false otherwise.
@@ -238,6 +238,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
setEmbeddedShellFocused,
setShortcutsHelpVisible,
toggleCleanUiDetailsVisible,
setVoiceModeEnabled,
} = useUIActions();
const {
terminalWidth,
@@ -246,6 +247,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
backgroundTasks,
backgroundTaskHeight,
shortcutsHelpVisible,
isVoiceModeEnabled,
} = useUIState();
const [suppressCompletion, setSuppressCompletion] = useState(false);
const { handlePress: registerPlainTabPress, resetCount: resetPlainTabPress } =
@@ -263,6 +265,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
resetEscapeState();
if (buffer.text.length > 0) {
buffer.setText('');
resetTurnBaseline();
resetCompletionState();
} else if (history.length > 0) {
onSubmit('/rewind');
@@ -281,6 +284,16 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
const hasUserNavigatedSuggestions = useRef(false);
const listRef = useRef<ScrollableListRef<ScrollableItem>>(null);
const { isRecording, handleVoiceInput, resetTurnBaseline } = useVoiceMode({
buffer,
config,
settings,
setQueueErrorMessage,
isVoiceModeEnabled,
setVoiceModeEnabled,
keyMatchers,
});
const [reverseSearchActive, setReverseSearchActive] = useState(false);
const [commandSearchActive, setCommandSearchActive] = useState(false);
const [textBeforeReverseSearch, setTextBeforeReverseSearch] = useState('');
@@ -387,6 +400,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
// Clear the buffer *before* calling onSubmit to prevent potential re-submission
// if onSubmit triggers a re-render while the buffer still holds the old value.
buffer.setText('');
resetTurnBaseline();
onSubmit(processedValue);
resetCompletionState();
resetReverseSearchCompletionState();
@@ -398,6 +412,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
shellModeActive,
shellHistory,
resetReverseSearchCompletionState,
resetTurnBaseline,
],
);
@@ -647,6 +662,8 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
const handleInput = useCallback(
(key: Key) => {
if (handleVoiceInput(key)) return true;
// Determine if this keypress is a history navigation command
const isHistoryUp =
!shellModeActive &&
@@ -873,9 +890,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
) {
setShellModeActive(!shellModeActive);
buffer.setText(''); // Clear the '!' from input
resetTurnBaseline();
return true;
}
if (keyMatchers[Command.ESCAPE](key)) {
const cancelSearch = (
setActive: (active: boolean) => void,
@@ -1360,6 +1377,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
backgroundTaskHeight,
streamingState,
handleEscPress,
resetTurnBaseline,
registerPlainTabPress,
resetPlainTabPress,
toggleCleanUiDetailsVisible,
@@ -1369,9 +1387,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
keyMatchers,
isHelpDismissKey,
settings,
handleVoiceInput,
],
);
useKeypress(handleInput, {
isActive: !isEmbeddedShellFocused && !copyModeEnabled,
priority: true,
@@ -1792,20 +1810,39 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
)}{' '}
</Text>
<Box flexGrow={1} flexDirection="column" ref={innerBoxRef}>
{buffer.text.length === 0 && placeholder ? (
showCursor ? (
<Text
terminalCursorFocus={showCursor}
terminalCursorPosition={0}
>
{chalk.inverse(placeholder.slice(0, 1))}
<Text color={theme.text.secondary}>
{placeholder.slice(1)}
</Text>
{isRecording && (
<Box flexDirection="row" marginBottom={0}>
<Text color={theme.status.success}>🎙 Listening...</Text>
</Box>
)}
{isVoiceModeEnabled && !isRecording && (
<Box flexDirection="row" marginBottom={0}>
<Text color={theme.text.secondary}>
&gt; Voice mode:{' '}
{(settings.experimental.voice?.activationMode ??
'push-to-talk') === 'push-to-talk'
? 'Hold Space to record'
: 'Space to start/stop recording'}{' '}
(Esc to exit)
</Text>
) : (
<Text color={theme.text.secondary}>{placeholder}</Text>
)
</Box>
)}
{buffer.text.length === 0 && !isRecording ? (
!isVoiceModeEnabled && placeholder ? (
showCursor ? (
<Text
terminalCursorFocus={showCursor}
terminalCursorPosition={0}
>
{chalk.inverse(placeholder.slice(0, 1))}
<Text color={theme.text.secondary}>
{placeholder.slice(1)}
</Text>
</Text>
) : (
<Text color={theme.text.secondary}>{placeholder}</Text>
)
) : null
) : (
<Box
flexDirection="column"