mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-04 00:44:05 -07:00
feat(voice): implement real-time voice mode with cloud and local backends (#24174)
This commit is contained in:
@@ -103,6 +103,7 @@ import { useQuotaAndFallback } from './hooks/useQuotaAndFallback.js';
|
||||
import { useEditorSettings } from './hooks/useEditorSettings.js';
|
||||
import { useSettingsCommand } from './hooks/useSettingsCommand.js';
|
||||
import { useModelCommand } from './hooks/useModelCommand.js';
|
||||
import { useVoiceModelCommand } from './hooks/useVoiceModelCommand.js';
|
||||
import { useSlashCommandProcessor } from './hooks/slashCommandProcessor.js';
|
||||
import { useVimMode } from './contexts/VimModeContext.js';
|
||||
import {
|
||||
@@ -312,6 +313,7 @@ export const AppContainer = (props: AppContainerProps) => {
|
||||
);
|
||||
|
||||
const [shellModeActive, setShellModeActive] = useState(false);
|
||||
const [isVoiceModeEnabled, setVoiceModeEnabled] = useState(false);
|
||||
const [modelSwitchedFromQuotaError, setModelSwitchedFromQuotaError] =
|
||||
useState<boolean>(false);
|
||||
const [historyRemountKey, setHistoryRemountKey] = useState(0);
|
||||
@@ -946,6 +948,12 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
const { isModelDialogOpen, openModelDialog, closeModelDialog } =
|
||||
useModelCommand();
|
||||
|
||||
const {
|
||||
isVoiceModelDialogOpen,
|
||||
openVoiceModelDialog,
|
||||
closeVoiceModelDialog,
|
||||
} = useVoiceModelCommand();
|
||||
|
||||
const { toggleVimEnabled } = useVimMode();
|
||||
|
||||
const setIsBackgroundTaskListOpenRef = useRef<(open: boolean) => void>(
|
||||
@@ -969,6 +977,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
openSettingsDialog,
|
||||
openSessionBrowser,
|
||||
openModelDialog,
|
||||
openVoiceModelDialog,
|
||||
openAgentConfigDialog,
|
||||
openPermissionsDialog,
|
||||
quit: (messages: HistoryItem[]) => {
|
||||
@@ -981,6 +990,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
},
|
||||
setDebugMessage,
|
||||
toggleCorgiMode: () => setCorgiMode((prev) => !prev),
|
||||
toggleVoiceMode: () => setVoiceModeEnabled((prev) => !prev),
|
||||
toggleDebugProfiler,
|
||||
dispatchExtensionStateUpdate,
|
||||
addConfirmUpdateExtensionRequest,
|
||||
@@ -1006,6 +1016,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
openSettingsDialog,
|
||||
openSessionBrowser,
|
||||
openModelDialog,
|
||||
openVoiceModelDialog,
|
||||
openAgentConfigDialog,
|
||||
setQuittingMessages,
|
||||
setDebugMessage,
|
||||
@@ -2191,6 +2202,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
isThemeDialogOpen ||
|
||||
isSettingsDialogOpen ||
|
||||
isModelDialogOpen ||
|
||||
isVoiceModelDialogOpen ||
|
||||
isAgentConfigDialogOpen ||
|
||||
isPermissionsDialogOpen ||
|
||||
isAuthenticating ||
|
||||
@@ -2448,6 +2460,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
isSettingsDialogOpen,
|
||||
isSessionBrowserOpen,
|
||||
isModelDialogOpen,
|
||||
isVoiceModelDialogOpen,
|
||||
isAgentConfigDialogOpen,
|
||||
selectedAgentName,
|
||||
selectedAgentDisplayName,
|
||||
@@ -2468,6 +2481,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
pendingGeminiHistoryItems,
|
||||
thought,
|
||||
isInputActive,
|
||||
isVoiceModeEnabled,
|
||||
isResuming,
|
||||
shouldShowIdePrompt,
|
||||
isFolderTrustDialogOpen: isFolderTrustDialogOpen ?? false,
|
||||
@@ -2559,6 +2573,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
isSettingsDialogOpen,
|
||||
isSessionBrowserOpen,
|
||||
isModelDialogOpen,
|
||||
isVoiceModelDialogOpen,
|
||||
isAgentConfigDialogOpen,
|
||||
selectedAgentName,
|
||||
selectedAgentDisplayName,
|
||||
@@ -2579,6 +2594,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
pendingGeminiHistoryItems,
|
||||
thought,
|
||||
isInputActive,
|
||||
isVoiceModeEnabled,
|
||||
isResuming,
|
||||
shouldShowIdePrompt,
|
||||
isFolderTrustDialogOpen,
|
||||
@@ -2671,6 +2687,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
exitPrivacyNotice,
|
||||
closeSettingsDialog,
|
||||
closeModelDialog,
|
||||
openVoiceModelDialog,
|
||||
closeVoiceModelDialog,
|
||||
openAgentConfigDialog,
|
||||
closeAgentConfigDialog,
|
||||
openPermissionsDialog,
|
||||
@@ -2751,6 +2769,9 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
setAccountSuspensionInfo(null);
|
||||
setAuthState(AuthState.Updating);
|
||||
},
|
||||
setVoiceModeEnabled: (value: boolean) => {
|
||||
setVoiceModeEnabled(value);
|
||||
},
|
||||
}),
|
||||
[
|
||||
handleThemeSelect,
|
||||
@@ -2764,6 +2785,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
exitPrivacyNotice,
|
||||
closeSettingsDialog,
|
||||
closeModelDialog,
|
||||
openVoiceModelDialog,
|
||||
closeVoiceModelDialog,
|
||||
openAgentConfigDialog,
|
||||
closeAgentConfigDialog,
|
||||
openPermissionsDialog,
|
||||
@@ -2807,6 +2830,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
|
||||
config,
|
||||
historyManager,
|
||||
getPreferredEditor,
|
||||
setVoiceModeEnabled,
|
||||
],
|
||||
);
|
||||
|
||||
|
||||
@@ -72,6 +72,7 @@ export interface CommandContext {
|
||||
loadHistory: (history: HistoryItem[], postLoadInput?: string) => void;
|
||||
/** Toggles a special display mode. */
|
||||
toggleCorgiMode: () => void;
|
||||
toggleVoiceMode: () => void;
|
||||
toggleDebugProfiler: () => void;
|
||||
toggleVimEnabled: () => Promise<boolean>;
|
||||
reloadCommands: () => void;
|
||||
@@ -125,6 +126,7 @@ export interface OpenDialogActionReturn {
|
||||
| 'settings'
|
||||
| 'sessionBrowser'
|
||||
| 'model'
|
||||
| 'voice-model'
|
||||
| 'agentConfig'
|
||||
| 'permissions';
|
||||
}
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { CommandKind, type SlashCommand } from './types.js';
|
||||
|
||||
export const voiceCommand: SlashCommand = {
|
||||
name: 'voice',
|
||||
altNames: [],
|
||||
description: 'Toggle voice dictation mode',
|
||||
kind: CommandKind.BUILT_IN,
|
||||
autoExecute: true,
|
||||
action: (context) => {
|
||||
context.ui.toggleVoiceMode();
|
||||
},
|
||||
subCommands: [
|
||||
{
|
||||
name: 'model',
|
||||
description: 'Manage voice transcription models',
|
||||
kind: CommandKind.BUILT_IN,
|
||||
autoExecute: true,
|
||||
action: async () => ({
|
||||
type: 'dialog',
|
||||
dialog: 'voice-model',
|
||||
}),
|
||||
},
|
||||
],
|
||||
};
|
||||
@@ -25,6 +25,7 @@ import { relaunchApp } from '../../utils/processUtils.js';
|
||||
import { SessionBrowser } from './SessionBrowser.js';
|
||||
import { PermissionsModifyTrustDialog } from './PermissionsModifyTrustDialog.js';
|
||||
import { ModelDialog } from './ModelDialog.js';
|
||||
import { VoiceModelDialog } from './VoiceModelDialog.js';
|
||||
import { theme } from '../semantic-colors.js';
|
||||
import { useUIState } from '../contexts/UIStateContext.js';
|
||||
import { useQuotaState } from '../contexts/QuotaContext.js';
|
||||
@@ -238,6 +239,9 @@ export const DialogManager = ({
|
||||
if (uiState.isModelDialogOpen) {
|
||||
return <ModelDialog onClose={uiActions.closeModelDialog} />;
|
||||
}
|
||||
if (uiState.isVoiceModelDialogOpen) {
|
||||
return <VoiceModelDialog onClose={uiActions.closeVoiceModelDialog} />;
|
||||
}
|
||||
if (
|
||||
uiState.isAgentConfigDialogOpen &&
|
||||
uiState.selectedAgentName &&
|
||||
|
||||
@@ -9,12 +9,41 @@ import { createMockSettings } from '../../test-utils/settings.js';
|
||||
import { makeFakeConfig } from '@google/gemini-cli-core';
|
||||
import { waitFor } from '../../test-utils/async.js';
|
||||
import { act, useState, useMemo } from 'react';
|
||||
import type { EventEmitter } from 'node:events';
|
||||
|
||||
const { fakeTranscriptionProvider } = vi.hoisted(() => {
|
||||
// Use require within hoisted block for immediate synchronous access
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports, no-restricted-syntax
|
||||
const { EventEmitter } = require('node:events');
|
||||
class FakeTranscriptionProvider extends EventEmitter {
|
||||
connect = vi.fn().mockResolvedValue(undefined);
|
||||
disconnect = vi.fn();
|
||||
sendAudioChunk = vi.fn();
|
||||
getTranscription = vi.fn().mockReturnValue('');
|
||||
}
|
||||
return {
|
||||
fakeTranscriptionProvider: new FakeTranscriptionProvider(),
|
||||
};
|
||||
});
|
||||
|
||||
vi.mock('@google/gemini-cli-core', async (importOriginal) => {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const actual = (await importOriginal()) as any;
|
||||
return {
|
||||
...actual,
|
||||
TranscriptionFactory: {
|
||||
createProvider: vi.fn(() => fakeTranscriptionProvider),
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
import {
|
||||
InputPrompt,
|
||||
tryTogglePasteExpansion,
|
||||
type InputPromptProps,
|
||||
} from './InputPrompt.js';
|
||||
import { InputContext } from '../contexts/InputContext.js';
|
||||
import { type UIState } from '../contexts/UIStateContext.js';
|
||||
import {
|
||||
calculateTransformationsForLine,
|
||||
calculateTransformedLine,
|
||||
@@ -417,6 +446,7 @@ describe('InputPrompt', () => {
|
||||
getWorkspaceContext: () => ({
|
||||
getDirectories: () => ['/test/project/src'],
|
||||
}),
|
||||
getContentGeneratorConfig: () => ({ apiKey: 'test-api-key' }),
|
||||
} as unknown as Config,
|
||||
slashCommands: mockSlashCommands,
|
||||
commandContext: mockCommandContext,
|
||||
@@ -4925,6 +4955,383 @@ describe('InputPrompt', () => {
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
describe('Voice Mode', () => {
|
||||
beforeEach(() => {
|
||||
(
|
||||
fakeTranscriptionProvider as unknown as EventEmitter
|
||||
).removeAllListeners();
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
it('should start recording when space is pressed and voice mode is enabled (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('');
|
||||
});
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Initially not recording
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
expect(lastFrame()).toContain(
|
||||
'Voice mode: Space to start/stop recording',
|
||||
);
|
||||
|
||||
// Press space to start
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Now should show listening
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should toggle recording off when space is pressed again (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('');
|
||||
});
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Start recording
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
// Stop recording
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
expect(lastFrame()).toContain(
|
||||
'Voice mode: Space to start/stop recording',
|
||||
);
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should resume recording when space is pressed even if buffer is not empty (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('some existing text');
|
||||
});
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Should show voice mode hint even if buffer is not empty (new behavior)
|
||||
expect(lastFrame()).toContain(
|
||||
'Voice mode: Space to start/stop recording',
|
||||
);
|
||||
expect(lastFrame()).toContain('some existing text');
|
||||
|
||||
// Press space to start recording again
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should not start recording if voice mode is disabled (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('');
|
||||
});
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: false } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Press space
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Should NOT show listening, instead should call handleInput which handles space
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
expect(mockBuffer.handleInput).toHaveBeenCalled();
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should append transcription correctly across multiple turn updates (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('initial');
|
||||
});
|
||||
const { stdin, unmount } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Start recording
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Emit first transcription
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'transcription',
|
||||
'hello',
|
||||
);
|
||||
});
|
||||
await waitFor(() => {
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
|
||||
});
|
||||
|
||||
// Emit turnComplete (Gemini Live starts over after this)
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'turnComplete',
|
||||
);
|
||||
});
|
||||
|
||||
// Emit second part (Gemini Live sends new turn text starting from empty)
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'transcription',
|
||||
'world',
|
||||
);
|
||||
});
|
||||
await waitFor(() => {
|
||||
// Should have appended 'world' to the baseline 'initial hello'
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith(
|
||||
'initial hello world',
|
||||
'end',
|
||||
);
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should append transcription correctly when resuming voice mode (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('First turn.');
|
||||
});
|
||||
const { stdin, unmount } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Start recording (resumed)
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Emit transcription
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'transcription',
|
||||
'Second turn.',
|
||||
);
|
||||
});
|
||||
|
||||
await waitFor(() => {
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith(
|
||||
'First turn. Second turn.',
|
||||
'end',
|
||||
);
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
describe('push-to-talk', () => {
|
||||
beforeEach(() => {
|
||||
vi.useFakeTimers();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it('should insert a space on a single tap', async () => {
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'push-to-talk' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
expect(lastFrame()).toContain('Voice mode: Hold Space to record');
|
||||
|
||||
// Press space once
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Should insert space optimistically
|
||||
expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
|
||||
// Advance timer past HOLD_DELAY_MS
|
||||
await act(async () => {
|
||||
vi.advanceTimersByTime(700);
|
||||
});
|
||||
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should start recording on hold (simulated by repeat spaces)', async () => {
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'push-to-talk' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// First space
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
|
||||
|
||||
// Second space (repeat)
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
await waitFor(() => {
|
||||
// Should have backspaced the optimistic space
|
||||
expect(mockBuffer.backspace).toHaveBeenCalled();
|
||||
// Should show listening
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should stop recording when space heartbeat stops (release)', async () => {
|
||||
const { stdin, unmount, lastFrame } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'push-to-talk' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// Start hold
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Use a short interval in waitFor to prevent advancing fake timers past the 300ms RELEASE_DELAY_MS
|
||||
await waitFor(
|
||||
() => {
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
},
|
||||
{ interval: 10 },
|
||||
);
|
||||
|
||||
// Simulate heartbeat (held key) - send space first to reset timer, then advance
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
vi.advanceTimersByTime(100);
|
||||
});
|
||||
expect(lastFrame()).toContain('🎙️ Listening...');
|
||||
|
||||
// Stop heartbeat (release)
|
||||
await act(async () => {
|
||||
vi.advanceTimersByTime(400); // Past RELEASE_DELAY_MS
|
||||
});
|
||||
|
||||
await waitFor(() => {
|
||||
expect(lastFrame()).not.toContain('🎙️ Listening...');
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
it('should cancel hold state if non-space key is pressed after first space', async () => {
|
||||
const { stdin, unmount } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'push-to-talk' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// First space
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
// Type 'a'
|
||||
await act(async () => {
|
||||
stdin.write('a');
|
||||
});
|
||||
|
||||
// Should NOT start recording on next space even if fast
|
||||
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
|
||||
expect(mockBuffer.insert).toHaveBeenCalledTimes(2); // Two spaces inserted
|
||||
expect(mockBuffer.handleInput).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ name: 'a' }),
|
||||
);
|
||||
unmount();
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
function clean(str: string | undefined): string {
|
||||
|
||||
@@ -56,6 +56,7 @@ import {
|
||||
debugLogger,
|
||||
type Config,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { useVoiceMode } from '../hooks/useVoiceMode.js';
|
||||
import {
|
||||
parseInputForHighlighting,
|
||||
parseSegmentsFromTokens,
|
||||
@@ -159,7 +160,6 @@ export function isLargePaste(text: string): boolean {
|
||||
}
|
||||
|
||||
const DOUBLE_TAB_CLEAN_UI_TOGGLE_WINDOW_MS = 350;
|
||||
|
||||
/**
|
||||
* Attempt to toggle expansion of a paste placeholder in the buffer.
|
||||
* Returns true if a toggle action was performed or hint was shown, false otherwise.
|
||||
@@ -238,6 +238,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
setEmbeddedShellFocused,
|
||||
setShortcutsHelpVisible,
|
||||
toggleCleanUiDetailsVisible,
|
||||
setVoiceModeEnabled,
|
||||
} = useUIActions();
|
||||
const {
|
||||
terminalWidth,
|
||||
@@ -246,6 +247,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
backgroundTasks,
|
||||
backgroundTaskHeight,
|
||||
shortcutsHelpVisible,
|
||||
isVoiceModeEnabled,
|
||||
} = useUIState();
|
||||
const [suppressCompletion, setSuppressCompletion] = useState(false);
|
||||
const { handlePress: registerPlainTabPress, resetCount: resetPlainTabPress } =
|
||||
@@ -263,6 +265,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
resetEscapeState();
|
||||
if (buffer.text.length > 0) {
|
||||
buffer.setText('');
|
||||
resetTurnBaseline();
|
||||
resetCompletionState();
|
||||
} else if (history.length > 0) {
|
||||
onSubmit('/rewind');
|
||||
@@ -281,6 +284,16 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
const hasUserNavigatedSuggestions = useRef(false);
|
||||
const listRef = useRef<ScrollableListRef<ScrollableItem>>(null);
|
||||
|
||||
const { isRecording, handleVoiceInput, resetTurnBaseline } = useVoiceMode({
|
||||
buffer,
|
||||
config,
|
||||
settings,
|
||||
setQueueErrorMessage,
|
||||
isVoiceModeEnabled,
|
||||
setVoiceModeEnabled,
|
||||
keyMatchers,
|
||||
});
|
||||
|
||||
const [reverseSearchActive, setReverseSearchActive] = useState(false);
|
||||
const [commandSearchActive, setCommandSearchActive] = useState(false);
|
||||
const [textBeforeReverseSearch, setTextBeforeReverseSearch] = useState('');
|
||||
@@ -387,6 +400,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
// Clear the buffer *before* calling onSubmit to prevent potential re-submission
|
||||
// if onSubmit triggers a re-render while the buffer still holds the old value.
|
||||
buffer.setText('');
|
||||
resetTurnBaseline();
|
||||
onSubmit(processedValue);
|
||||
resetCompletionState();
|
||||
resetReverseSearchCompletionState();
|
||||
@@ -398,6 +412,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
shellModeActive,
|
||||
shellHistory,
|
||||
resetReverseSearchCompletionState,
|
||||
resetTurnBaseline,
|
||||
],
|
||||
);
|
||||
|
||||
@@ -647,6 +662,8 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
|
||||
const handleInput = useCallback(
|
||||
(key: Key) => {
|
||||
if (handleVoiceInput(key)) return true;
|
||||
|
||||
// Determine if this keypress is a history navigation command
|
||||
const isHistoryUp =
|
||||
!shellModeActive &&
|
||||
@@ -873,9 +890,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
) {
|
||||
setShellModeActive(!shellModeActive);
|
||||
buffer.setText(''); // Clear the '!' from input
|
||||
resetTurnBaseline();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (keyMatchers[Command.ESCAPE](key)) {
|
||||
const cancelSearch = (
|
||||
setActive: (active: boolean) => void,
|
||||
@@ -1360,6 +1377,7 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
backgroundTaskHeight,
|
||||
streamingState,
|
||||
handleEscPress,
|
||||
resetTurnBaseline,
|
||||
registerPlainTabPress,
|
||||
resetPlainTabPress,
|
||||
toggleCleanUiDetailsVisible,
|
||||
@@ -1369,9 +1387,9 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
keyMatchers,
|
||||
isHelpDismissKey,
|
||||
settings,
|
||||
handleVoiceInput,
|
||||
],
|
||||
);
|
||||
|
||||
useKeypress(handleInput, {
|
||||
isActive: !isEmbeddedShellFocused && !copyModeEnabled,
|
||||
priority: true,
|
||||
@@ -1792,20 +1810,39 @@ export const InputPrompt: React.FC<InputPromptProps> = ({
|
||||
)}{' '}
|
||||
</Text>
|
||||
<Box flexGrow={1} flexDirection="column" ref={innerBoxRef}>
|
||||
{buffer.text.length === 0 && placeholder ? (
|
||||
showCursor ? (
|
||||
<Text
|
||||
terminalCursorFocus={showCursor}
|
||||
terminalCursorPosition={0}
|
||||
>
|
||||
{chalk.inverse(placeholder.slice(0, 1))}
|
||||
<Text color={theme.text.secondary}>
|
||||
{placeholder.slice(1)}
|
||||
</Text>
|
||||
{isRecording && (
|
||||
<Box flexDirection="row" marginBottom={0}>
|
||||
<Text color={theme.status.success}>🎙️ Listening...</Text>
|
||||
</Box>
|
||||
)}
|
||||
{isVoiceModeEnabled && !isRecording && (
|
||||
<Box flexDirection="row" marginBottom={0}>
|
||||
<Text color={theme.text.secondary}>
|
||||
> Voice mode:{' '}
|
||||
{(settings.experimental.voice?.activationMode ??
|
||||
'push-to-talk') === 'push-to-talk'
|
||||
? 'Hold Space to record'
|
||||
: 'Space to start/stop recording'}{' '}
|
||||
(Esc to exit)
|
||||
</Text>
|
||||
) : (
|
||||
<Text color={theme.text.secondary}>{placeholder}</Text>
|
||||
)
|
||||
</Box>
|
||||
)}
|
||||
{buffer.text.length === 0 && !isRecording ? (
|
||||
!isVoiceModeEnabled && placeholder ? (
|
||||
showCursor ? (
|
||||
<Text
|
||||
terminalCursorFocus={showCursor}
|
||||
terminalCursorPosition={0}
|
||||
>
|
||||
{chalk.inverse(placeholder.slice(0, 1))}
|
||||
<Text color={theme.text.secondary}>
|
||||
{placeholder.slice(1)}
|
||||
</Text>
|
||||
</Text>
|
||||
) : (
|
||||
<Text color={theme.text.secondary}>{placeholder}</Text>
|
||||
)
|
||||
) : null
|
||||
) : (
|
||||
<Box
|
||||
flexDirection="column"
|
||||
|
||||
@@ -0,0 +1,236 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import type React from 'react';
|
||||
import { useCallback, useMemo, useState } from 'react';
|
||||
import { Box, Text } from 'ink';
|
||||
import { theme } from '../semantic-colors.js';
|
||||
import { DescriptiveRadioButtonSelect } from './shared/DescriptiveRadioButtonSelect.js';
|
||||
import { useSettingsStore } from '../contexts/SettingsContext.js';
|
||||
import { SettingScope } from '../../config/settings.js';
|
||||
import { useKeypress, type Key } from '../hooks/useKeypress.js';
|
||||
import { isBinaryAvailable } from '@google/gemini-cli-core';
|
||||
import {
|
||||
WhisperModelManager,
|
||||
type WhisperModelProgress,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { CliSpinner } from './CliSpinner.js';
|
||||
|
||||
interface VoiceModelDialogProps {
|
||||
onClose: () => void;
|
||||
}
|
||||
|
||||
type DialogView = 'backend' | 'whisper-models';
|
||||
|
||||
const WHISPER_MODELS = [
|
||||
{
|
||||
value: 'ggml-tiny.en.bin',
|
||||
label: 'Tiny (EN)',
|
||||
description: 'Fastest, lower accuracy (~75MB)',
|
||||
},
|
||||
{
|
||||
value: 'ggml-base.en.bin',
|
||||
label: 'Base (EN)',
|
||||
description: 'Balanced speed and accuracy (~142MB)',
|
||||
},
|
||||
{
|
||||
value: 'ggml-large-v3-turbo-q5_0.bin',
|
||||
label: 'Large v3 Turbo (Q5_0)',
|
||||
description: 'High accuracy, quantized (~547MB)',
|
||||
},
|
||||
{
|
||||
value: 'ggml-large-v3-turbo-q8_0.bin',
|
||||
label: 'Large v3 Turbo (Q8_0)',
|
||||
description: 'Maximum accuracy, high memory (~834MB)',
|
||||
},
|
||||
];
|
||||
|
||||
export function VoiceModelDialog({
|
||||
onClose,
|
||||
}: VoiceModelDialogProps): React.JSX.Element {
|
||||
const { settings, setSetting } = useSettingsStore();
|
||||
const [view, setView] = useState<DialogView>('backend');
|
||||
const [downloadProgress, setDownloadProgress] =
|
||||
useState<WhisperModelProgress | null>(null);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
const whisperInstalled = useMemo(
|
||||
() => isBinaryAvailable('whisper-stream'),
|
||||
[],
|
||||
);
|
||||
const modelManager = useMemo(() => new WhisperModelManager(), []);
|
||||
|
||||
const currentBackend =
|
||||
settings.merged.experimental.voice?.backend ?? 'gemini-live';
|
||||
const currentWhisperModel =
|
||||
settings.merged.experimental.voice?.whisperModel ?? 'ggml-base.en.bin';
|
||||
|
||||
const handleKeypress = useCallback(
|
||||
(key: Key) => {
|
||||
if (key.name === 'escape') {
|
||||
if (view === 'whisper-models') {
|
||||
setView('backend');
|
||||
} else {
|
||||
onClose();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
},
|
||||
[view, onClose],
|
||||
);
|
||||
|
||||
useKeypress(handleKeypress, { isActive: true });
|
||||
|
||||
const handleBackendSelect = useCallback(
|
||||
(value: string) => {
|
||||
if (value === 'whisper') {
|
||||
setView('whisper-models');
|
||||
} else {
|
||||
setSetting(
|
||||
SettingScope.User,
|
||||
'experimental.voice.backend',
|
||||
'gemini-live',
|
||||
);
|
||||
onClose();
|
||||
}
|
||||
},
|
||||
[setSetting, onClose],
|
||||
);
|
||||
|
||||
const handleWhisperModelSelect = useCallback(
|
||||
async (modelName: string) => {
|
||||
if (modelManager.isModelInstalled(modelName)) {
|
||||
setSetting(SettingScope.User, 'experimental.voice.backend', 'whisper');
|
||||
setSetting(
|
||||
SettingScope.User,
|
||||
'experimental.voice.whisperModel',
|
||||
modelName,
|
||||
);
|
||||
onClose();
|
||||
} else {
|
||||
setError(null);
|
||||
const onProgress = (p: WhisperModelProgress) => setDownloadProgress(p);
|
||||
modelManager.on('progress', onProgress);
|
||||
|
||||
try {
|
||||
await modelManager.downloadModel(modelName);
|
||||
|
||||
setSetting(
|
||||
SettingScope.User,
|
||||
'experimental.voice.backend',
|
||||
'whisper',
|
||||
);
|
||||
setSetting(
|
||||
SettingScope.User,
|
||||
'experimental.voice.whisperModel',
|
||||
modelName,
|
||||
);
|
||||
onClose();
|
||||
} catch (err) {
|
||||
setError(
|
||||
`Failed to download: ${err instanceof Error ? err.message : String(err)}`,
|
||||
);
|
||||
} finally {
|
||||
modelManager.off('progress', onProgress);
|
||||
setDownloadProgress(null);
|
||||
}
|
||||
}
|
||||
},
|
||||
[modelManager, setSetting, onClose],
|
||||
);
|
||||
|
||||
const backendOptions = useMemo(
|
||||
() => [
|
||||
{
|
||||
value: 'gemini-live',
|
||||
title: 'Gemini Live API (Cloud)',
|
||||
description: 'Real-time cloud transcription via Gemini Live API.',
|
||||
key: 'gemini-live',
|
||||
},
|
||||
{
|
||||
value: 'whisper',
|
||||
title: 'Whisper (Local)',
|
||||
description: whisperInstalled
|
||||
? 'Local transcription using whisper.cpp.'
|
||||
: 'Local transcription (Requires: brew install whisper-cpp)',
|
||||
key: 'whisper',
|
||||
},
|
||||
],
|
||||
[whisperInstalled],
|
||||
);
|
||||
|
||||
// Selectable Whisper models, each labelled with its local install status.
// NOTE(review): memoized on the `modelManager` instance, which is presumably
// stable across renders — so the "(Installed)"/"(Download)" suffix will not
// refresh if install state changes while this view is open. A successful
// download closes the dialog, so this appears benign; confirm if that flow
// changes.
const whisperOptions = useMemo(
  () =>
    WHISPER_MODELS.map((m) => ({
      value: m.value,
      title: `${m.label}${modelManager.isModelInstalled(m.value) ? ' (Installed)' : ' (Download)'}`,
      description: m.description,
      key: m.value,
    })),
  [modelManager],
);
|
||||
|
||||
// Render: a bordered panel whose heading, body, and footer all switch on
// the current sub-view ('backend' vs 'whisper-models').
return (
  <Box
    borderStyle="round"
    borderColor={theme.border.default}
    flexDirection="column"
    padding={1}
    width="100%"
  >
    <Text bold>
      {view === 'backend'
        ? 'Select Voice Transcription Backend'
        : 'Select Whisper Model'}
    </Text>

    {/* Most recent download/selection error, if any. */}
    {error && (
      <Box marginTop={1}>
        <Text color={theme.status.error}>{error}</Text>
      </Box>
    )}

    {/* While a model download is in flight, replace the pickers with a
        spinner + percentage line; otherwise show the selector for the
        active view. */}
    {downloadProgress ? (
      <Box marginTop={1} flexDirection="column">
        <Box>
          <Text>Downloading {downloadProgress.modelName}... </Text>
          <CliSpinner />
          <Text> {Math.round(downloadProgress.percentage * 100)}%</Text>
        </Box>
      </Box>
    ) : (
      <Box marginTop={1}>
        {view === 'backend' ? (
          <DescriptiveRadioButtonSelect
            items={backendOptions}
            onSelect={handleBackendSelect}
            initialIndex={currentBackend === 'whisper' ? 1 : 0}
            showNumbers={true}
          />
        ) : (
          /* NOTE(review): findIndex yields -1 when currentWhisperModel
             matches no option — verify the select component treats -1 as
             "no initial selection". */
          <DescriptiveRadioButtonSelect
            items={whisperOptions}
            onSelect={handleWhisperModelSelect}
            initialIndex={whisperOptions.findIndex(
              (o) => o.value === currentWhisperModel,
            )}
            showNumbers={true}
          />
        )}
      </Box>
    )}

    <Box marginTop={1} flexDirection="column">
      <Text color={theme.text.secondary}>
        {view === 'whisper-models'
          ? '(Press Esc to go back)'
          : '(Press Esc to close)'}
      </Text>
    </Box>
  </Box>
);
}
|
||||
@@ -168,13 +168,6 @@ exports[`InputPrompt > mouse interaction > should toggle paste expansion on doub
|
||||
"
|
||||
`;
|
||||
|
||||
exports[`InputPrompt > mouse interaction > should toggle paste expansion on double-click 4`] = `
|
||||
"▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀
|
||||
> [Pasted Text: 10 lines]
|
||||
▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
|
||||
"
|
||||
`;
|
||||
|
||||
exports[`InputPrompt > multiline rendering > should correctly render multiline input including blank lines 1`] = `
|
||||
"────────────────────────────────────────────────────────────────────────────────────────────────────
|
||||
> hello
|
||||
|
||||
@@ -41,6 +41,8 @@ export interface UIActions {
|
||||
exitPrivacyNotice: () => void;
|
||||
closeSettingsDialog: () => void;
|
||||
closeModelDialog: () => void;
|
||||
openVoiceModelDialog: () => void;
|
||||
closeVoiceModelDialog: () => void;
|
||||
openAgentConfigDialog: (
|
||||
name: string,
|
||||
displayName: string,
|
||||
@@ -93,6 +95,7 @@ export interface UIActions {
|
||||
handleNewAgentsSelect: (choice: NewAgentsChoice) => Promise<void>;
|
||||
getPreferredEditor: () => EditorType | undefined;
|
||||
clearAccountSuspension: () => void;
|
||||
setVoiceModeEnabled: (value: boolean) => void;
|
||||
}
|
||||
|
||||
export const UIActionsContext = createContext<UIActions | null>(null);
|
||||
|
||||
@@ -112,6 +112,7 @@ export interface UIState {
|
||||
isSettingsDialogOpen: boolean;
|
||||
isSessionBrowserOpen: boolean;
|
||||
isModelDialogOpen: boolean;
|
||||
isVoiceModelDialogOpen: boolean;
|
||||
isAgentConfigDialogOpen: boolean;
|
||||
selectedAgentName?: string;
|
||||
selectedAgentDisplayName?: string;
|
||||
@@ -132,6 +133,7 @@ export interface UIState {
|
||||
pendingGeminiHistoryItems: HistoryItemWithoutId[];
|
||||
thought: ThoughtSummary | null;
|
||||
isInputActive: boolean;
|
||||
isVoiceModeEnabled: boolean;
|
||||
isResuming: boolean;
|
||||
shouldShowIdePrompt: boolean;
|
||||
isFolderTrustDialogOpen: boolean;
|
||||
|
||||
@@ -205,11 +205,13 @@ describe('useSlashCommandProcessor', () => {
|
||||
openSettingsDialog: vi.fn(),
|
||||
openSessionBrowser: vi.fn(),
|
||||
openModelDialog: mockOpenModelDialog,
|
||||
openVoiceModelDialog: vi.fn(),
|
||||
openAgentConfigDialog,
|
||||
openPermissionsDialog: vi.fn(),
|
||||
quit: mockSetQuittingMessages,
|
||||
setDebugMessage: vi.fn(),
|
||||
toggleCorgiMode: vi.fn(),
|
||||
toggleVoiceMode: vi.fn(),
|
||||
toggleDebugProfiler: vi.fn(),
|
||||
dispatchExtensionStateUpdate: vi.fn(),
|
||||
addConfirmUpdateExtensionRequest: vi.fn(),
|
||||
|
||||
@@ -72,6 +72,7 @@ interface SlashCommandProcessorActions {
|
||||
openSettingsDialog: () => void;
|
||||
openSessionBrowser: () => void;
|
||||
openModelDialog: () => void;
|
||||
openVoiceModelDialog: () => void;
|
||||
openAgentConfigDialog: (
|
||||
name: string,
|
||||
displayName: string,
|
||||
@@ -81,6 +82,7 @@ interface SlashCommandProcessorActions {
|
||||
quit: (messages: HistoryItem[]) => void;
|
||||
setDebugMessage: (message: string) => void;
|
||||
toggleCorgiMode: () => void;
|
||||
toggleVoiceMode: () => void;
|
||||
toggleDebugProfiler: () => void;
|
||||
dispatchExtensionStateUpdate: (action: ExtensionUpdateAction) => void;
|
||||
addConfirmUpdateExtensionRequest: (request: ConfirmationRequest) => void;
|
||||
@@ -232,6 +234,7 @@ export const useSlashCommandProcessor = (
|
||||
pendingItem,
|
||||
setPendingItem,
|
||||
toggleCorgiMode: actions.toggleCorgiMode,
|
||||
toggleVoiceMode: actions.toggleVoiceMode,
|
||||
toggleDebugProfiler: actions.toggleDebugProfiler,
|
||||
toggleVimEnabled,
|
||||
reloadCommands,
|
||||
@@ -503,6 +506,9 @@ export const useSlashCommandProcessor = (
|
||||
case 'model':
|
||||
actions.openModelDialog();
|
||||
return { type: 'handled' };
|
||||
case 'voice-model':
|
||||
actions.openVoiceModelDialog();
|
||||
return { type: 'handled' };
|
||||
case 'agentConfig': {
|
||||
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
|
||||
const props = result.props as Record<string, unknown>;
|
||||
|
||||
@@ -0,0 +1,429 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { useState, useRef, useCallback, useEffect } from 'react';
|
||||
import {
|
||||
AudioRecorder,
|
||||
TranscriptionFactory,
|
||||
debugLogger,
|
||||
type Config,
|
||||
type TranscriptionProvider,
|
||||
} from '@google/gemini-cli-core';
|
||||
import type { TextBuffer } from '../components/shared/text-buffer.js';
|
||||
import type { MergedSettings } from '../../config/settingsSchema.js';
|
||||
import type { Key } from './useKeypress.js';
|
||||
import { Command } from '../key/keyMatchers.js';
|
||||
|
||||
/** Inputs required by the `useVoiceMode` hook. */
interface UseVoiceModeProps {
  /** Input-prompt text buffer; transcribed text is written into it. */
  buffer: TextBuffer;
  /** CLI config, used to resolve the content-generator API key. */
  config: Config;
  /** Merged settings; `experimental.voice.*` selects backend and behavior. */
  settings: MergedSettings;
  /** Surfaces voice-mode error messages to the user (null clears). */
  setQueueErrorMessage: (message: string | null) => void;
  /** Whether voice mode is currently enabled in the UI. */
  isVoiceModeEnabled: boolean;
  /** Enables/disables voice mode. */
  setVoiceModeEnabled: (enabled: boolean) => void;
  /** Key-binding matchers used to recognize the PTT and escape keys. */
  keyMatchers: Record<Command, (key: Key) => boolean>;
}

// Window after an initial space press during which a second matching key
// event (e.g. a key repeat from holding the key) is interpreted as a hold
// that starts push-to-talk, rather than a typed space.
const HOLD_DELAY_MS = 600;

// While recording in push-to-talk mode, recording stops this long after the
// last matching key event — repeats from a held key keep it alive.
const RELEASE_DELAY_MS = 300;
|
||||
|
||||
/**
 * Real-time voice input for the prompt: manages the microphone recorder,
 * a streaming transcription provider (cloud Gemini Live or local Whisper),
 * and the push-to-talk / toggle key handling.
 *
 * Returns `isRecording` / `isConnecting` UI flags, imperative
 * `startVoiceRecording` / `stopVoiceRecording` controls, a
 * `handleVoiceInput` key interceptor (returns true when the key was
 * consumed), and `resetTurnBaseline`.
 */
export function useVoiceMode({
  buffer,
  config,
  settings,
  setQueueErrorMessage,
  isVoiceModeEnabled,
  setVoiceModeEnabled,
  keyMatchers,
}: UseVoiceModeProps) {
  const [isRecording, setIsRecording] = useState(false);
  const [isConnecting, setIsConnecting] = useState(false);

  // Mutable session state kept in refs so timers and provider event
  // handlers always see current values without re-subscribing.
  const liveTranscriptionRef = useRef(''); // last partial transcription applied to the buffer
  const stopRequestedRef = useRef(false); // set once stop begins; gates late async work
  const isRecordingRef = useRef(false); // mirrors isRecording for non-render code paths
  const lastFailureTimeRef = useRef(0); // used to debounce restarts for 2s after a failure
  const recordingInProgressRef = useRef(false); // NOTE(review): set/cleared here but never read in this hook — confirm external use or remove
  const voiceTimeoutRef = useRef<NodeJS.Timeout | null>(null);
  const recorderRef = useRef<AudioRecorder | null>(null);
  const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null);
  const turnBaselineRef = useRef<string | null>(null); // buffer text at recording start; cleared via resetTurnBaseline

  // Push-to-talk key state machine (see handleVoiceInput).
  const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle');
  const pttTimerRef = useRef<NodeJS.Timeout | null>(null);
  const disconnectTimerRef = useRef<NodeJS.Timeout | null>(null);

  // Keep a ref to the latest buffer so async callbacks edit the live one.
  const bufferRef = useRef(buffer);
  bufferRef.current = buffer;

  // Stops capture, then lets the transcription service drain for a grace
  // period before disconnecting so in-flight text still lands in the buffer.
  const stopVoiceRecording = useCallback(() => {
    if (stopRequestedRef.current) return; // idempotent
    debugLogger.debug('[Voice] Stop requested');
    stopRequestedRef.current = true;

    setIsRecording(false);
    isRecordingRef.current = false;
    setIsConnecting(false);

    if (recorderRef.current) {
      recorderRef.current.stop();
      recorderRef.current = null;
    }

    // Detach the service from the ref first so late events from it can be
    // recognized as orphaned by the identity checks in the handlers.
    const serviceToDisconnect = transcriptionServiceRef.current;
    transcriptionServiceRef.current = null;

    if (serviceToDisconnect) {
      // Cloud gets a longer default drain window than local transcription.
      const isLive = settings.experimental.voice?.backend === 'gemini-live';
      const gracePeriodMs =
        settings.experimental.voice?.stopGracePeriodMs ??
        (isLive ? 2000 : 1000);
      debugLogger.debug(
        `[Voice] Draining transcription for ${gracePeriodMs}ms`,
      );

      if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
      disconnectTimerRef.current = setTimeout(() => {
        debugLogger.debug('[Voice] Grace period ended, disconnecting service');
        serviceToDisconnect.disconnect();
        disconnectTimerRef.current = null;
      }, gracePeriodMs);
    }

    liveTranscriptionRef.current = '';
    pttStateRef.current = 'idle';
  }, [settings.experimental.voice]);

  // Starts a new capture + transcription session. Synchronous part flips
  // the UI flags immediately; the rest runs in startAsync.
  const startVoiceRecording = useCallback(() => {
    // Ignore if already recording, or within 2s of the last failure
    // (prevents rapid retry loops from a held PTT key).
    if (
      isRecordingRef.current ||
      Date.now() - lastFailureTimeRef.current < 2000
    ) {
      return;
    }

    // Cancel any pending drain-disconnect from a previous stop.
    if (disconnectTimerRef.current) {
      clearTimeout(disconnectTimerRef.current);
      disconnectTimerRef.current = null;
    }

    recordingInProgressRef.current = true;
    turnBaselineRef.current = bufferRef.current.text;

    setIsConnecting(true);
    setIsRecording(true);
    isRecordingRef.current = true;

    liveTranscriptionRef.current = '';
    stopRequestedRef.current = false;

    const apiKey =
      config.getContentGeneratorConfig()?.apiKey ||
      process.env['GEMINI_API_KEY'] ||
      '';

    const startAsync = async () => {
      // If there's an active draining service, disconnect it immediately
      // before starting a new one to prevent orphaned event collisions.
      if (disconnectTimerRef.current) {
        clearTimeout(disconnectTimerRef.current);
        disconnectTimerRef.current = null;
      }
      if (transcriptionServiceRef.current) {
        transcriptionServiceRef.current.disconnect();
        transcriptionServiceRef.current = null;
      }

      // If the user released the key while we were connecting, tear down
      // everything created so far. Returns true when we should bail out.
      const cleanupIfStopped = () => {
        if (stopRequestedRef.current) {
          if (recorderRef.current) {
            recorderRef.current.stop();
            recorderRef.current = null;
          }
          if (transcriptionServiceRef.current) {
            transcriptionServiceRef.current.disconnect();
            transcriptionServiceRef.current = null;
          }
          setIsRecording(false);
          isRecordingRef.current = false;
          setIsConnecting(false);
          recordingInProgressRef.current = false;
          return true;
        }
        return false;
      };

      if (cleanupIfStopped()) return;

      const voiceBackend =
        settings.experimental.voice?.backend ?? 'gemini-live';

      // Cloud backend requires an API key; fail fast with guidance.
      if (!apiKey && voiceBackend === 'gemini-live') {
        setQueueErrorMessage(
          'Cloud voice mode requires a GEMINI_API_KEY. Please set it in your environment or ~/.gemini/.env.',
        );
        setIsRecording(false);
        isRecordingRef.current = false;
        setIsConnecting(false);
        recordingInProgressRef.current = false;
        lastFailureTimeRef.current = Date.now();
        return;
      }

      // Only the cloud backend needs this hook to drive the recorder;
      // presumably the whisper provider captures audio itself — confirm.
      if (voiceBackend === 'gemini-live') {
        recorderRef.current = new AudioRecorder();
      }

      const currentService = TranscriptionFactory.createProvider(
        settings.experimental.voice,
        apiKey,
      );
      transcriptionServiceRef.current = currentService;

      // Streaming partials: each event carries the full current partial for
      // the turn, so the previous partial is peeled off the buffer tail and
      // replaced rather than appended.
      currentService.on('transcription', (text) => {
        if (
          transcriptionServiceRef.current !== currentService &&
          stopRequestedRef.current
        ) {
          // If this is an orphaned service that was replaced by a new session, ignore its events
          return;
        }

        if (text) {
          const currentBufferText = bufferRef.current.text;
          const previousTranscription = liveTranscriptionRef.current;

          let newTotalText = currentBufferText;

          if (
            previousTranscription &&
            currentBufferText.endsWith(previousTranscription)
          ) {
            // Remove the previous partial before appending the new one.
            newTotalText = currentBufferText.slice(
              0,
              -previousTranscription.length,
            );
          } else if (
            currentBufferText &&
            !currentBufferText.endsWith(' ') &&
            !currentBufferText.endsWith('\n')
          ) {
            // Separate transcription from pre-existing typed text.
            newTotalText += ' ';
          }

          newTotalText += text;
          bufferRef.current.setText(newTotalText, 'end');
        }
        liveTranscriptionRef.current = text;
      });

      // A completed turn finalizes the current partial in place.
      currentService.on('turnComplete', () => {
        if (
          transcriptionServiceRef.current !== currentService &&
          stopRequestedRef.current
        )
          return;
        liveTranscriptionRef.current = '';
      });

      currentService.on('error', (err) => {
        if (transcriptionServiceRef.current !== currentService) return;
        debugLogger.error('[Voice] Transcription error:', err);
        lastFailureTimeRef.current = Date.now();
        recordingInProgressRef.current = false;
      });

      // Unsolicited close (not user-initiated stop) resets the UI state.
      currentService.on('close', () => {
        if (transcriptionServiceRef.current !== currentService) return;
        if (!stopRequestedRef.current) {
          setIsRecording(false);
          isRecordingRef.current = false;
          setIsConnecting(false);
          recordingInProgressRef.current = false;
          lastFailureTimeRef.current = Date.now();
        }
      });

      try {
        await currentService.connect();
        if (cleanupIfStopped()) return;

        await recorderRef.current?.start();
        if (cleanupIfStopped()) return;

        setIsConnecting(false);

        const currentVoiceBackend =
          settings.experimental.voice?.backend ?? 'gemini-live';

        // Pump mic chunks into the cloud service. (The recorder only
        // exists for gemini-live; the guard keeps that invariant local.)
        recorderRef.current?.on('data', (chunk) => {
          if (currentVoiceBackend === 'gemini-live') {
            currentService.sendAudioChunk(chunk);
          }
        });
        recorderRef.current?.on('error', (err) => {
          debugLogger.error('[Voice] Recorder error:', err);
          stopVoiceRecording();
          lastFailureTimeRef.current = Date.now();
        });
      } catch (err: unknown) {
        // Only report the failure if this session is still the active one.
        if (transcriptionServiceRef.current !== currentService) return;
        const message = err instanceof Error ? err.message : String(err);
        setQueueErrorMessage(`Voice mode failure: ${message}`);
        setIsRecording(false);
        isRecordingRef.current = false;
        setIsConnecting(false);
        recordingInProgressRef.current = false;
        lastFailureTimeRef.current = Date.now();

        if (recorderRef.current) {
          recorderRef.current.stop();
          recorderRef.current = null;
        }
        if (transcriptionServiceRef.current) {
          transcriptionServiceRef.current.disconnect();
          transcriptionServiceRef.current = null;
        }
      }
    };

    void startAsync();
  }, [
    config,
    settings.experimental.voice,
    setQueueErrorMessage,
    stopVoiceRecording,
  ]);

  // Unmount cleanup: cancel all timers and release recorder + service.
  useEffect(
    () => () => {
      if (voiceTimeoutRef.current) clearTimeout(voiceTimeoutRef.current);
      if (recorderRef.current) {
        recorderRef.current.stop();
        recorderRef.current = null;
      }
      if (transcriptionServiceRef.current) {
        transcriptionServiceRef.current.disconnect();
        transcriptionServiceRef.current = null;
      }
      if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
      if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
    },
    [],
  );

  // Key interceptor. Returns true when the key was consumed by voice mode.
  const handleVoiceInput = useCallback(
    (key: Key): boolean => {
      const activeRecording = isRecording || isRecordingRef.current;

      // --- While recording: every key is swallowed. ---
      if (activeRecording) {
        const activationMode =
          settings.experimental.voice?.activationMode ?? 'push-to-talk';

        if (keyMatchers[Command.ESCAPE](key)) {
          stopVoiceRecording();
          return true;
        }

        if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
          if (activationMode === 'push-to-talk') {
            // Each repeat of the held key re-arms the release timer; when
            // repeats cease for RELEASE_DELAY_MS, recording stops.
            if (pttTimerRef.current) {
              clearTimeout(pttTimerRef.current);
            }
            pttTimerRef.current = setTimeout(() => {
              stopVoiceRecording();
              pttTimerRef.current = null;
            }, RELEASE_DELAY_MS);
            return true;
          } else {
            // Toggle mode: a second press stops recording.
            stopVoiceRecording();
            return true;
          }
        }
        return true;
      }

      // --- Voice mode enabled but not recording. ---
      if (isVoiceModeEnabled) {
        const activationMode =
          settings.experimental.voice?.activationMode ?? 'push-to-talk';

        // Esc on an empty buffer leaves voice mode entirely.
        if (keyMatchers[Command.ESCAPE](key) && buffer.text === '') {
          setVoiceModeEnabled(false);
          return true;
        }

        if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
          // Only an unmodified plain space participates in PTT detection,
          // so space still works as a character with modifiers.
          if (
            key.name === 'space' &&
            !key.ctrl &&
            !key.alt &&
            !key.shift &&
            !key.cmd
          ) {
            if (activationMode === 'toggle') {
              startVoiceRecording();
              return true;
            } else {
              if (pttStateRef.current === 'idle') {
                // First press: optimistically type the space, then wait
                // HOLD_DELAY_MS for a repeat that would indicate a hold.
                buffer.insert(' ');
                pttStateRef.current = 'possible-hold';

                if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
                pttTimerRef.current = setTimeout(() => {
                  pttStateRef.current = 'idle';
                  pttTimerRef.current = null;
                }, HOLD_DELAY_MS);
                return true;
              } else if (pttStateRef.current === 'possible-hold') {
                // Repeat arrived: it was a hold. Undo the typed space and
                // start recording; arm the release timer.
                if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
                buffer.backspace();
                pttStateRef.current = 'recording';
                startVoiceRecording();

                pttTimerRef.current = setTimeout(() => {
                  stopVoiceRecording();
                  pttTimerRef.current = null;
                }, RELEASE_DELAY_MS);
                return true;
              }
            }
          }
        }

        // Any other key cancels a pending hold detection.
        if (pttStateRef.current === 'possible-hold') {
          pttStateRef.current = 'idle';
          if (pttTimerRef.current) {
            clearTimeout(pttTimerRef.current);
            pttTimerRef.current = null;
          }
        }
      }

      return false;
    },
    [
      isRecording,
      isVoiceModeEnabled,
      settings.experimental.voice,
      keyMatchers,
      stopVoiceRecording,
      startVoiceRecording,
      buffer,
      setVoiceModeEnabled,
    ],
  );

  return {
    isRecording,
    isConnecting,
    startVoiceRecording,
    stopVoiceRecording,
    handleVoiceInput,
    resetTurnBaseline: () => {
      turnBaselineRef.current = null;
    },
  };
}
|
||||
@@ -0,0 +1,31 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { useState, useCallback } from 'react';
|
||||
|
||||
/** Shape returned by `useVoiceModelCommand`. */
interface UseVoiceModelCommandReturn {
  /** True while the voice-model selection dialog is displayed. */
  isVoiceModelDialogOpen: boolean;
  /** Shows the dialog. */
  openVoiceModelDialog: () => void;
  /** Hides the dialog. */
  closeVoiceModelDialog: () => void;
}
|
||||
|
||||
export const useVoiceModelCommand = (): UseVoiceModelCommandReturn => {
|
||||
const [isVoiceModelDialogOpen, setIsVoiceModelDialogOpen] = useState(false);
|
||||
|
||||
const openVoiceModelDialog = useCallback(() => {
|
||||
setIsVoiceModelDialogOpen(true);
|
||||
}, []);
|
||||
|
||||
const closeVoiceModelDialog = useCallback(() => {
|
||||
setIsVoiceModelDialogOpen(false);
|
||||
}, []);
|
||||
|
||||
return {
|
||||
isVoiceModelDialogOpen,
|
||||
openVoiceModelDialog,
|
||||
closeVoiceModelDialog,
|
||||
};
|
||||
};
|
||||
@@ -97,6 +97,7 @@ export enum Command {
|
||||
RESTART_APP = 'app.restart',
|
||||
SUSPEND_APP = 'app.suspend',
|
||||
SHOW_SHELL_INPUT_UNFOCUS_WARNING = 'app.showShellUnfocusWarning',
|
||||
VOICE_MODE_PTT = 'app.voiceModePTT',
|
||||
|
||||
// Background Shell Controls
|
||||
BACKGROUND_SHELL_ESCAPE = 'background.escape',
|
||||
@@ -407,9 +408,7 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
|
||||
[Command.RESTART_APP, [new KeyBinding('r'), new KeyBinding('shift+r')]],
|
||||
[Command.SUSPEND_APP, [new KeyBinding('ctrl+z')]],
|
||||
[Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING, [new KeyBinding('tab')]],
|
||||
[Command.DUMP_FRAME, [new KeyBinding('f8')]],
|
||||
[Command.START_RECORDING, [new KeyBinding('f6')]],
|
||||
[Command.STOP_RECORDING, [new KeyBinding('f7')]],
|
||||
[Command.VOICE_MODE_PTT, [new KeyBinding('space')]],
|
||||
|
||||
// Background Shell Controls
|
||||
[Command.BACKGROUND_SHELL_ESCAPE, [new KeyBinding('escape')]],
|
||||
@@ -424,6 +423,10 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
|
||||
// Extension Controls
|
||||
[Command.UPDATE_EXTENSION, [new KeyBinding('i')]],
|
||||
[Command.LINK_EXTENSION, [new KeyBinding('l')]],
|
||||
|
||||
[Command.DUMP_FRAME, [new KeyBinding('f8')]],
|
||||
[Command.START_RECORDING, [new KeyBinding('f6')]],
|
||||
[Command.STOP_RECORDING, [new KeyBinding('f7')]],
|
||||
]);
|
||||
|
||||
interface CommandCategory {
|
||||
@@ -538,6 +541,7 @@ export const commandCategories: readonly CommandCategory[] = [
|
||||
Command.RESTART_APP,
|
||||
Command.SUSPEND_APP,
|
||||
Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING,
|
||||
Command.VOICE_MODE_PTT,
|
||||
],
|
||||
},
|
||||
{
|
||||
@@ -658,6 +662,7 @@ export const commandDescriptions: Readonly<Record<Command, string>> = {
|
||||
[Command.SUSPEND_APP]: 'Suspend the CLI and move it to the background.',
|
||||
[Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING]:
|
||||
'Show warning when trying to move focus away from shell input.',
|
||||
[Command.VOICE_MODE_PTT]: 'Hold to speak in Voice Mode.',
|
||||
|
||||
// Background Shell Controls
|
||||
[Command.BACKGROUND_SHELL_ESCAPE]: 'Dismiss background shell list.',
|
||||
|
||||
@@ -43,5 +43,6 @@ export function createNonInteractiveUI(): CommandContext['ui'] {
|
||||
removeComponent: () => {},
|
||||
toggleBackgroundTasks: () => {},
|
||||
toggleShortcutsHelp: () => {},
|
||||
toggleVoiceMode: () => {},
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user