diff --git a/.gemini/settings.json b/.gemini/settings.json
index 4ad7bc3ed6..e7ff785b7c 100644
--- a/.gemini/settings.json
+++ b/.gemini/settings.json
@@ -3,7 +3,10 @@
"extensionReloading": true,
"modelSteering": true,
"autoMemory": true,
- "gemma": true
+ "gemma": true,
+ "memoryManager": true,
+ "topicUpdateNarration": true,
+ "voiceMode": true
},
"general": {
"devtools": true
diff --git a/Dockerfile b/Dockerfile
index 25d27d46c6..44ba343902 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -40,8 +40,8 @@ ENV PATH=$PATH:/usr/local/share/npm-global/bin
USER node
# install gemini-cli and clean up
-COPY packages/cli/dist/google-gemini-cli-*.tgz /tmp/gemini-cli.tgz
-COPY packages/core/dist/google-gemini-cli-core-*.tgz /tmp/gemini-core.tgz
+COPY --chown=node:node packages/cli/dist/google-gemini-cli-*.tgz /tmp/gemini-cli.tgz
+COPY --chown=node:node packages/core/dist/google-gemini-cli-core-*.tgz /tmp/gemini-core.tgz
RUN npm install -g /tmp/gemini-core.tgz \
&& npm install -g /tmp/gemini-cli.tgz \
&& node -e "const fs=require('node:fs'); JSON.parse(fs.readFileSync('/usr/local/share/npm-global/lib/node_modules/@google/gemini-cli/package.json','utf8')); JSON.parse(fs.readFileSync('/usr/local/share/npm-global/lib/node_modules/@google/gemini-cli-core/package.json','utf8'));" \
diff --git a/docs/cli/settings.md b/docs/cli/settings.md
index 10bfee644f..834750fdf9 100644
--- a/docs/cli/settings.md
+++ b/docs/cli/settings.md
@@ -161,20 +161,25 @@ they appear in the UI.
### Experimental
-| UI Label | Setting | Description | Default |
-| ---------------------------------------------------- | ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
-| Gemma Models | `experimental.gemma` | Enable access to Gemma 4 models (experimental). | `false` |
-| Enable Git Worktrees | `experimental.worktrees` | Enable automated Git worktree management for parallel work. | `false` |
-| Use OSC 52 Paste | `experimental.useOSC52Paste` | Use OSC 52 for pasting. This may be more robust than the default system when using remote terminal sessions (if your terminal is configured to allow it). | `false` |
-| Use OSC 52 Copy | `experimental.useOSC52Copy` | Use OSC 52 for copying. This may be more robust than the default system when using remote terminal sessions (if your terminal is configured to allow it). | `false` |
-| Model Steering | `experimental.modelSteering` | Enable model steering (user hints) to guide the model during tool execution. | `false` |
-| Direct Web Fetch | `experimental.directWebFetch` | Enable web fetch behavior that bypasses LLM summarization. | `false` |
-| Enable Gemma Model Router | `experimental.gemmaModelRouter.enabled` | Enable the Gemma Model Router (experimental). Requires a local endpoint serving Gemma via the Gemini API using LiteRT-LM shim. | `false` |
-| Auto-start LiteRT Server | `experimental.gemmaModelRouter.autoStartServer` | Automatically start the LiteRT-LM server when Gemini CLI starts and the Gemma router is enabled. | `false` |
-| Memory v2 | `experimental.memoryV2` | Disable the built-in save_memory tool and let the main agent persist project context by editing markdown files directly with edit/write_file. Route facts across four tiers: team-shared conventions go to project GEMINI.md files, project-specific personal notes go to the per-project private memory folder (MEMORY.md as index + sibling .md files for detail), and cross-project personal preferences go to the global ~/.gemini/GEMINI.md (the only file under ~/.gemini/ that the agent can edit — settings, credentials, etc. remain off-limits). Set to false to fall back to the legacy save_memory tool. | `true` |
-| Auto Memory | `experimental.autoMemory` | Automatically extract reusable skills from past sessions in the background. Review results with /memory inbox. | `false` |
-| Use the generalist profile to manage agent contexts. | `experimental.generalistProfile` | Suitable for general coding and software development tasks. | `false` |
-| Enable Context Management | `experimental.contextManagement` | Enable logic for context management. | `false` |
+| UI Label | Setting | Description | Default |
+| ---------------------------------------------------- | ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- |
+| Gemma Models | `experimental.gemma` | Enable access to Gemma 4 models (experimental). | `false` |
+| Voice Mode | `experimental.voiceMode` | Enable experimental voice dictation and commands (/voice, /voice model). | `false` |
+| Voice Activation Mode | `experimental.voice.activationMode` | How to trigger voice recording with the Space key. | `"push-to-talk"` |
+| Voice Transcription Backend | `experimental.voice.backend` | The backend to use for voice transcription. | `"gemini-live"` |
+| Whisper Model | `experimental.voice.whisperModel` | The Whisper model to use for local transcription. | `"ggml-base.en.bin"` |
+| Voice Stop Grace Period (ms) | `experimental.voice.stopGracePeriodMs` | How long to wait for final transcription after stopping recording. | `1000` |
+| Enable Git Worktrees | `experimental.worktrees` | Enable automated Git worktree management for parallel work. | `false` |
+| Use OSC 52 Paste | `experimental.useOSC52Paste` | Use OSC 52 for pasting. This may be more robust than the default system when using remote terminal sessions (if your terminal is configured to allow it). | `false` |
+| Use OSC 52 Copy | `experimental.useOSC52Copy` | Use OSC 52 for copying. This may be more robust than the default system when using remote terminal sessions (if your terminal is configured to allow it). | `false` |
+| Model Steering | `experimental.modelSteering` | Enable model steering (user hints) to guide the model during tool execution. | `false` |
+| Direct Web Fetch | `experimental.directWebFetch` | Enable web fetch behavior that bypasses LLM summarization. | `false` |
+| Enable Gemma Model Router | `experimental.gemmaModelRouter.enabled` | Enable the Gemma Model Router (experimental). Requires a local endpoint serving Gemma via the Gemini API using LiteRT-LM shim. | `false` |
+| Auto-start LiteRT Server | `experimental.gemmaModelRouter.autoStartServer` | Automatically start the LiteRT-LM server when Gemini CLI starts and the Gemma router is enabled. | `false` |
+| Memory v2 | `experimental.memoryV2` | Disable the built-in save_memory tool and let the main agent persist project context by editing markdown files directly with edit/write_file. Route facts across four tiers: team-shared conventions go to project GEMINI.md files, project-specific personal notes go to the per-project private memory folder (MEMORY.md as index + sibling .md files for detail), and cross-project personal preferences go to the global ~/.gemini/GEMINI.md (the only file under ~/.gemini/ that the agent can edit — settings, credentials, etc. remain off-limits). Set to false to fall back to the legacy save_memory tool. | `true` |
+| Auto Memory | `experimental.autoMemory` | Automatically extract reusable skills from past sessions in the background. Review results with /memory inbox. | `false` |
+| Use the generalist profile to manage agent contexts. | `experimental.generalistProfile` | Suitable for general coding and software development tasks. | `false` |
+| Enable Context Management | `experimental.contextManagement` | Enable logic for context management. | `false` |
### Skills
diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md
index b2d8955d5f..94a64cc197 100644
--- a/docs/reference/configuration.md
+++ b/docs/reference/configuration.md
@@ -1691,6 +1691,32 @@ their corresponding top-level category object in your `settings.json` file.
- **Default:** `false`
- **Requires restart:** Yes
+- **`experimental.voiceMode`** (boolean):
+ - **Description:** Enable experimental voice dictation and commands (/voice,
+ /voice model).
+ - **Default:** `false`
+
+- **`experimental.voice.activationMode`** (enum):
+ - **Description:** How to trigger voice recording with the Space key.
+ - **Default:** `"push-to-talk"`
+ - **Values:** `"push-to-talk"`, `"toggle"`
+
+- **`experimental.voice.backend`** (enum):
+ - **Description:** The backend to use for voice transcription.
+ - **Default:** `"gemini-live"`
+ - **Values:** `"gemini-live"`, `"whisper"`
+
+- **`experimental.voice.whisperModel`** (enum):
+ - **Description:** The Whisper model to use for local transcription.
+ - **Default:** `"ggml-base.en.bin"`
+ - **Values:** `"ggml-tiny.en.bin"`, `"ggml-base.en.bin"`,
+ `"ggml-large-v3-turbo-q5_0.bin"`, `"ggml-large-v3-turbo-q8_0.bin"`
+
+- **`experimental.voice.stopGracePeriodMs`** (number):
+ - **Description:** How long to wait for final transcription after stopping
+ recording.
+ - **Default:** `1000`
+
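+As an illustrative example, the following `settings.json` snippet turns on
+voice mode and selects the local Whisper backend; the remaining values are
+the documented defaults:
+
+```json
+{
+  "experimental": {
+    "voiceMode": true,
+    "voice": {
+      "activationMode": "push-to-talk",
+      "backend": "whisper",
+      "whisperModel": "ggml-base.en.bin",
+      "stopGracePeriodMs": 1000
+    }
+  }
+}
+```
+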
- **`experimental.adk.agentSessionNoninteractiveEnabled`** (boolean):
- **Description:** Enable non-interactive agent sessions.
- **Default:** `false`
diff --git a/docs/reference/keyboard-shortcuts.md b/docs/reference/keyboard-shortcuts.md
index 98d31c0ae2..6f7a8cce4a 100644
--- a/docs/reference/keyboard-shortcuts.md
+++ b/docs/reference/keyboard-shortcuts.md
@@ -115,6 +115,7 @@ available combinations.
| `app.restart` | Restart the application. | `R` `Shift+R` |
| `app.suspend` | Suspend the CLI and move it to the background. | `Ctrl+Z` |
| `app.showShellUnfocusWarning` | Show warning when trying to move focus away from shell input. | `Tab` |
+| `app.voiceModePTT` | Hold to speak in Voice Mode. | `Space` |
#### Background Shell Controls
diff --git a/integration-tests/voice-mode.test.ts b/integration-tests/voice-mode.test.ts
new file mode 100644
index 0000000000..49844494a8
--- /dev/null
+++ b/integration-tests/voice-mode.test.ts
@@ -0,0 +1,76 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import { TestRig } from './test-helper.js';
+import {
+ WhisperModelManager,
+ WhisperTranscriptionProvider,
+} from '@google/gemini-cli-core';
+import * as fs from 'node:fs';
+import commandExists from 'command-exists';
+
+describe('Voice Mode Integration', () => {
+ let rig: TestRig;
+
+ beforeEach(() => {
+ rig = new TestRig();
+ });
+
+ afterEach(async () => await rig.cleanup());
+
+ it('should be able to download tiny whisper model', async () => {
+ // This test doesn't require the binary, only network access.
+ // However, it's slow and downloads 75MB. We'll keep it for now but
+ // wrap it in a try-catch to avoid failing on network flakiness in CI.
+ const manager = new WhisperModelManager();
+ const modelName = 'ggml-tiny.en.bin';
+
+ try {
+ // Cleanup if already exists to ensure we actually test download
+ const modelPath = manager.getModelPath(modelName);
+ if (fs.existsSync(modelPath)) {
+ fs.unlinkSync(modelPath);
+ }
+
+ await manager.downloadModel(modelName);
+ expect(fs.existsSync(modelPath)).toBe(true);
+ expect(fs.statSync(modelPath).size).toBeGreaterThan(70 * 1024 * 1024); // ~75MB
+ } catch (e) {
+ console.warn(
+ 'Skipping whisper model download test due to error (possibly network):',
+ e,
+ );
+ }
+ }, 300000); // 5 min timeout for download
+
+ it('should initialize WhisperTranscriptionProvider and handle process', async () => {
+ // Skip this test if whisper-stream is not installed (typical for CI)
+ try {
+ await commandExists('whisper-stream');
+ } catch {
+ console.log(
+ 'Skipping Whisper transcription test: whisper-stream not found',
+ );
+ return;
+ }
+
+ const manager = new WhisperModelManager();
+ const modelName = 'ggml-tiny.en.bin';
+ if (!manager.isModelInstalled(modelName)) {
+ await manager.downloadModel(modelName);
+ }
+
+ const provider = new WhisperTranscriptionProvider({
+ modelPath: manager.getModelPath(modelName),
+ });
+
+ // Since we can't easily provide real mic input in CI,
+ // we just verify it can start and be disconnected.
+ await provider.connect();
+ provider.disconnect();
+ });
+});
diff --git a/package-lock.json b/package-lock.json
index 89a358ef9e..d3b8904735 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -18390,6 +18390,7 @@
"ajv-formats": "^3.0.0",
"chardet": "^2.1.0",
"chokidar": "^5.0.0",
+ "command-exists": "^1.2.9",
"diff": "^8.0.3",
"dotenv": "^17.2.4",
"dotenv-expand": "^12.0.3",
diff --git a/packages/cli/src/config/config.ts b/packages/cli/src/config/config.ts
index 6b99a3606d..1ce93cf9ff 100755
--- a/packages/cli/src/config/config.ts
+++ b/packages/cli/src/config/config.ts
@@ -1000,6 +1000,7 @@ export async function loadCliConfig(
enableExtensionReloading: settings.experimental?.extensionReloading,
enableAgents: settings.experimental?.enableAgents,
plan: settings.general?.plan?.enabled ?? true,
+ voiceMode: settings.experimental?.voiceMode,
tracker: settings.experimental?.taskTracker,
directWebFetch: settings.experimental?.directWebFetch,
planSettings: settings.general?.plan?.directory
diff --git a/packages/cli/src/config/settingsSchema.ts b/packages/cli/src/config/settingsSchema.ts
index 2b6c959397..08edbd66f9 100644
--- a/packages/cli/src/config/settingsSchema.ts
+++ b/packages/cli/src/config/settingsSchema.ts
@@ -2061,6 +2061,87 @@ const SETTINGS_SCHEMA = {
description: 'Enable access to Gemma 4 models (experimental).',
showInDialog: true,
},
+ voiceMode: {
+ type: 'boolean',
+ label: 'Voice Mode',
+ category: 'Experimental',
+ requiresRestart: false,
+ default: false,
+ description:
+ 'Enable experimental voice dictation and commands (/voice, /voice model).',
+ showInDialog: true,
+ },
+ voice: {
+ type: 'object',
+ label: 'Voice',
+ category: 'Experimental',
+ requiresRestart: false,
+ default: {},
+ description: 'Settings for voice mode and transcription.',
+ showInDialog: false,
+ properties: {
+ activationMode: {
+ type: 'enum',
+ label: 'Voice Activation Mode',
+ category: 'Experimental',
+ requiresRestart: false,
+ default: 'push-to-talk',
+ description: 'How to trigger voice recording with the Space key.',
+ showInDialog: true,
+ options: [
+ { value: 'push-to-talk', label: 'Push-To-Talk (Hold Space)' },
+ { value: 'toggle', label: 'Toggle (Press Space to start/stop)' },
+ ],
+ },
+ backend: {
+ type: 'enum',
+ label: 'Voice Transcription Backend',
+ category: 'Experimental',
+ requiresRestart: false,
+ default: 'gemini-live',
+ description: 'The backend to use for voice transcription.',
+ showInDialog: true,
+ options: [
+ { value: 'gemini-live', label: 'Gemini Live API (Cloud)' },
+ { value: 'whisper', label: 'Whisper (Local)' },
+ ],
+ },
+ whisperModel: {
+ type: 'enum',
+ label: 'Whisper Model',
+ category: 'Experimental',
+ requiresRestart: false,
+ default: 'ggml-base.en.bin',
+ description: 'The Whisper model to use for local transcription.',
+ showInDialog: true,
+ options: [
+ { value: 'ggml-tiny.en.bin', label: 'Tiny (EN) - Fast (~75MB)' },
+ {
+ value: 'ggml-base.en.bin',
+ label: 'Base (EN) - Balanced (~142MB)',
+ },
+ {
+ value: 'ggml-large-v3-turbo-q5_0.bin',
+ label: 'Large v3 Turbo (Q5_0) - High Accuracy (~547MB)',
+ },
+ {
+ value: 'ggml-large-v3-turbo-q8_0.bin',
+ label: 'Large v3 Turbo (Q8_0) - Max Accuracy (~834MB)',
+ },
+ ],
+ },
+ stopGracePeriodMs: {
+ type: 'number',
+ label: 'Voice Stop Grace Period (ms)',
+ category: 'Experimental',
+ requiresRestart: false,
+ default: 1000,
+ description:
+ 'How long to wait for final transcription after stopping recording.',
+ showInDialog: true,
+ },
+ },
+ },
adk: {
type: 'object',
label: 'ADK',
diff --git a/packages/cli/src/services/BuiltinCommandLoader.test.ts b/packages/cli/src/services/BuiltinCommandLoader.test.ts
index f166c161cd..d53273134c 100644
--- a/packages/cli/src/services/BuiltinCommandLoader.test.ts
+++ b/packages/cli/src/services/BuiltinCommandLoader.test.ts
@@ -170,6 +170,7 @@ describe('BuiltinCommandLoader', () => {
getAllSkills: vi.fn().mockReturnValue([]),
isAdminEnabled: vi.fn().mockReturnValue(true),
}),
+ isVoiceModeEnabled: vi.fn().mockReturnValue(true),
getContentGeneratorConfig: vi.fn().mockReturnValue({
authType: 'other',
}),
@@ -396,6 +397,7 @@ describe('BuiltinCommandLoader profile', () => {
getAllSkills: vi.fn().mockReturnValue([]),
isAdminEnabled: vi.fn().mockReturnValue(true),
}),
+ isVoiceModeEnabled: vi.fn().mockReturnValue(true),
getContentGeneratorConfig: vi.fn().mockReturnValue({
authType: 'other',
}),
diff --git a/packages/cli/src/services/BuiltinCommandLoader.ts b/packages/cli/src/services/BuiltinCommandLoader.ts
index 94b5986eb3..1c5288707c 100644
--- a/packages/cli/src/services/BuiltinCommandLoader.ts
+++ b/packages/cli/src/services/BuiltinCommandLoader.ts
@@ -62,6 +62,7 @@ import { setupGithubCommand } from '../ui/commands/setupGithubCommand.js';
import { terminalSetupCommand } from '../ui/commands/terminalSetupCommand.js';
import { upgradeCommand } from '../ui/commands/upgradeCommand.js';
import { gemmaStatusCommand } from '../ui/commands/gemmaStatusCommand.js';
+import { voiceCommand } from '../ui/commands/voiceCommand.js';
/**
* Loads the core, hard-coded slash commands that are an integral part
@@ -227,6 +228,7 @@ export class BuiltinCommandLoader implements ICommandLoader {
vimCommand,
setupGithubCommand,
terminalSetupCommand,
+ ...(this.config?.isVoiceModeEnabled() ? [voiceCommand] : []),
...(this.config?.getContentGeneratorConfig()?.authType ===
AuthType.LOGIN_WITH_GOOGLE
? [upgradeCommand]
diff --git a/packages/cli/src/test-utils/render.tsx b/packages/cli/src/test-utils/render.tsx
index a9f786f11c..83e69d6663 100644
--- a/packages/cli/src/test-utils/render.tsx
+++ b/packages/cli/src/test-utils/render.tsx
@@ -552,6 +552,8 @@ const mockUIActions: UIActions = {
exitPrivacyNotice: vi.fn(),
closeSettingsDialog: vi.fn(),
closeModelDialog: vi.fn(),
+ openVoiceModelDialog: vi.fn(),
+ closeVoiceModelDialog: vi.fn(),
openAgentConfigDialog: vi.fn(),
closeAgentConfigDialog: vi.fn(),
openPermissionsDialog: vi.fn(),
@@ -598,6 +600,7 @@ const mockUIActions: UIActions = {
handleNewAgentsSelect: vi.fn(),
getPreferredEditor: vi.fn(),
clearAccountSuspension: vi.fn(),
+ setVoiceModeEnabled: vi.fn(),
};
import { type TextBuffer } from '../ui/components/shared/text-buffer.js';
diff --git a/packages/cli/src/ui/AppContainer.tsx b/packages/cli/src/ui/AppContainer.tsx
index fdbaf57fbe..f5294ae23e 100644
--- a/packages/cli/src/ui/AppContainer.tsx
+++ b/packages/cli/src/ui/AppContainer.tsx
@@ -103,6 +103,7 @@ import { useQuotaAndFallback } from './hooks/useQuotaAndFallback.js';
import { useEditorSettings } from './hooks/useEditorSettings.js';
import { useSettingsCommand } from './hooks/useSettingsCommand.js';
import { useModelCommand } from './hooks/useModelCommand.js';
+import { useVoiceModelCommand } from './hooks/useVoiceModelCommand.js';
import { useSlashCommandProcessor } from './hooks/slashCommandProcessor.js';
import { useVimMode } from './contexts/VimModeContext.js';
import {
@@ -312,6 +313,7 @@ export const AppContainer = (props: AppContainerProps) => {
);
const [shellModeActive, setShellModeActive] = useState(false);
+ const [isVoiceModeEnabled, setVoiceModeEnabled] = useState(false);
const [modelSwitchedFromQuotaError, setModelSwitchedFromQuotaError] =
useState(false);
const [historyRemountKey, setHistoryRemountKey] = useState(0);
@@ -946,6 +948,12 @@ Logging in with Google... Restarting Gemini CLI to continue.
const { isModelDialogOpen, openModelDialog, closeModelDialog } =
useModelCommand();
+ const {
+ isVoiceModelDialogOpen,
+ openVoiceModelDialog,
+ closeVoiceModelDialog,
+ } = useVoiceModelCommand();
+
const { toggleVimEnabled } = useVimMode();
const setIsBackgroundTaskListOpenRef = useRef<(open: boolean) => void>(
@@ -969,6 +977,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
openSettingsDialog,
openSessionBrowser,
openModelDialog,
+ openVoiceModelDialog,
openAgentConfigDialog,
openPermissionsDialog,
quit: (messages: HistoryItem[]) => {
@@ -981,6 +990,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
},
setDebugMessage,
toggleCorgiMode: () => setCorgiMode((prev) => !prev),
+ toggleVoiceMode: () => setVoiceModeEnabled((prev) => !prev),
toggleDebugProfiler,
dispatchExtensionStateUpdate,
addConfirmUpdateExtensionRequest,
@@ -1006,6 +1016,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
openSettingsDialog,
openSessionBrowser,
openModelDialog,
+ openVoiceModelDialog,
openAgentConfigDialog,
setQuittingMessages,
setDebugMessage,
@@ -2191,6 +2202,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
isThemeDialogOpen ||
isSettingsDialogOpen ||
isModelDialogOpen ||
+ isVoiceModelDialogOpen ||
isAgentConfigDialogOpen ||
isPermissionsDialogOpen ||
isAuthenticating ||
@@ -2448,6 +2460,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
isSettingsDialogOpen,
isSessionBrowserOpen,
isModelDialogOpen,
+ isVoiceModelDialogOpen,
isAgentConfigDialogOpen,
selectedAgentName,
selectedAgentDisplayName,
@@ -2468,6 +2481,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
pendingGeminiHistoryItems,
thought,
isInputActive,
+ isVoiceModeEnabled,
isResuming,
shouldShowIdePrompt,
isFolderTrustDialogOpen: isFolderTrustDialogOpen ?? false,
@@ -2559,6 +2573,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
isSettingsDialogOpen,
isSessionBrowserOpen,
isModelDialogOpen,
+ isVoiceModelDialogOpen,
isAgentConfigDialogOpen,
selectedAgentName,
selectedAgentDisplayName,
@@ -2579,6 +2594,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
pendingGeminiHistoryItems,
thought,
isInputActive,
+ isVoiceModeEnabled,
isResuming,
shouldShowIdePrompt,
isFolderTrustDialogOpen,
@@ -2671,6 +2687,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
exitPrivacyNotice,
closeSettingsDialog,
closeModelDialog,
+ openVoiceModelDialog,
+ closeVoiceModelDialog,
openAgentConfigDialog,
closeAgentConfigDialog,
openPermissionsDialog,
@@ -2751,6 +2769,9 @@ Logging in with Google... Restarting Gemini CLI to continue.
setAccountSuspensionInfo(null);
setAuthState(AuthState.Updating);
},
+ setVoiceModeEnabled: (value: boolean) => {
+ setVoiceModeEnabled(value);
+ },
}),
[
handleThemeSelect,
@@ -2764,6 +2785,8 @@ Logging in with Google... Restarting Gemini CLI to continue.
exitPrivacyNotice,
closeSettingsDialog,
closeModelDialog,
+ openVoiceModelDialog,
+ closeVoiceModelDialog,
openAgentConfigDialog,
closeAgentConfigDialog,
openPermissionsDialog,
@@ -2807,6 +2830,7 @@ Logging in with Google... Restarting Gemini CLI to continue.
config,
historyManager,
getPreferredEditor,
+ setVoiceModeEnabled,
],
);
diff --git a/packages/cli/src/ui/commands/types.ts b/packages/cli/src/ui/commands/types.ts
index 466e70c994..f0805f4426 100644
--- a/packages/cli/src/ui/commands/types.ts
+++ b/packages/cli/src/ui/commands/types.ts
@@ -72,6 +72,7 @@ export interface CommandContext {
loadHistory: (history: HistoryItem[], postLoadInput?: string) => void;
/** Toggles a special display mode. */
toggleCorgiMode: () => void;
+ toggleVoiceMode: () => void;
toggleDebugProfiler: () => void;
toggleVimEnabled: () => Promise<boolean>;
reloadCommands: () => void;
@@ -125,6 +126,7 @@ export interface OpenDialogActionReturn {
| 'settings'
| 'sessionBrowser'
| 'model'
+ | 'voice-model'
| 'agentConfig'
| 'permissions';
}
diff --git a/packages/cli/src/ui/commands/voiceCommand.ts b/packages/cli/src/ui/commands/voiceCommand.ts
new file mode 100644
index 0000000000..b9df28ca27
--- /dev/null
+++ b/packages/cli/src/ui/commands/voiceCommand.ts
@@ -0,0 +1,30 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { CommandKind, type SlashCommand } from './types.js';
+
+export const voiceCommand: SlashCommand = {
+ name: 'voice',
+ altNames: [],
+ description: 'Toggle voice dictation mode',
+ kind: CommandKind.BUILT_IN,
+ autoExecute: true,
+ action: (context) => {
+ context.ui.toggleVoiceMode();
+ },
+ subCommands: [
+ {
+ name: 'model',
+ description: 'Manage voice transcription models',
+ kind: CommandKind.BUILT_IN,
+ autoExecute: true,
+ action: async () => ({
+ type: 'dialog',
+ dialog: 'voice-model',
+ }),
+ },
+ ],
+};
diff --git a/packages/cli/src/ui/components/DialogManager.tsx b/packages/cli/src/ui/components/DialogManager.tsx
index b231a62db5..40f0b06138 100644
--- a/packages/cli/src/ui/components/DialogManager.tsx
+++ b/packages/cli/src/ui/components/DialogManager.tsx
@@ -25,6 +25,7 @@ import { relaunchApp } from '../../utils/processUtils.js';
import { SessionBrowser } from './SessionBrowser.js';
import { PermissionsModifyTrustDialog } from './PermissionsModifyTrustDialog.js';
import { ModelDialog } from './ModelDialog.js';
+import { VoiceModelDialog } from './VoiceModelDialog.js';
import { theme } from '../semantic-colors.js';
import { useUIState } from '../contexts/UIStateContext.js';
import { useQuotaState } from '../contexts/QuotaContext.js';
@@ -238,6 +239,9 @@ export const DialogManager = ({
if (uiState.isModelDialogOpen) {
return <ModelDialog />;
}
+ if (uiState.isVoiceModelDialogOpen) {
+ return <VoiceModelDialog onClose={uiActions.closeVoiceModelDialog} />;
+ }
if (
uiState.isAgentConfigDialogOpen &&
uiState.selectedAgentName &&
diff --git a/packages/cli/src/ui/components/InputPrompt.test.tsx b/packages/cli/src/ui/components/InputPrompt.test.tsx
index e50a2f1d81..5be237a15f 100644
--- a/packages/cli/src/ui/components/InputPrompt.test.tsx
+++ b/packages/cli/src/ui/components/InputPrompt.test.tsx
@@ -9,12 +9,41 @@ import { createMockSettings } from '../../test-utils/settings.js';
import { makeFakeConfig } from '@google/gemini-cli-core';
import { waitFor } from '../../test-utils/async.js';
import { act, useState, useMemo } from 'react';
+import type { EventEmitter } from 'node:events';
+
+const { fakeTranscriptionProvider } = vi.hoisted(() => {
+ // Use require within hoisted block for immediate synchronous access
+ // eslint-disable-next-line @typescript-eslint/no-require-imports, no-restricted-syntax
+ const { EventEmitter } = require('node:events');
+ class FakeTranscriptionProvider extends EventEmitter {
+ connect = vi.fn().mockResolvedValue(undefined);
+ disconnect = vi.fn();
+ sendAudioChunk = vi.fn();
+ getTranscription = vi.fn().mockReturnValue('');
+ }
+ return {
+ fakeTranscriptionProvider: new FakeTranscriptionProvider(),
+ };
+});
+
+vi.mock('@google/gemini-cli-core', async (importOriginal) => {
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ const actual = (await importOriginal()) as any;
+ return {
+ ...actual,
+ TranscriptionFactory: {
+ createProvider: vi.fn(() => fakeTranscriptionProvider),
+ },
+ };
+});
+
import {
InputPrompt,
tryTogglePasteExpansion,
type InputPromptProps,
} from './InputPrompt.js';
import { InputContext } from '../contexts/InputContext.js';
+import { type UIState } from '../contexts/UIStateContext.js';
import {
calculateTransformationsForLine,
calculateTransformedLine,
@@ -417,6 +446,7 @@ describe('InputPrompt', () => {
getWorkspaceContext: () => ({
getDirectories: () => ['/test/project/src'],
}),
+ getContentGeneratorConfig: () => ({ apiKey: 'test-api-key' }),
} as unknown as Config,
slashCommands: mockSlashCommands,
commandContext: mockCommandContext,
@@ -4925,6 +4955,383 @@ describe('InputPrompt', () => {
},
);
});
+
+ describe('Voice Mode', () => {
+ beforeEach(() => {
+ (
+ fakeTranscriptionProvider as unknown as EventEmitter
+ ).removeAllListeners();
+ vi.clearAllMocks();
+ });
+
+ it('should start recording when space is pressed and voice mode is enabled (toggle)', async () => {
+ await act(async () => {
+ mockBuffer.setText('');
+ });
+ const { stdin, unmount, lastFrame } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: true } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'toggle' } },
+     }),
+   },
+ );
+
+ // Initially not recording
+ expect(lastFrame()).not.toContain('🎙️ Listening...');
+ expect(lastFrame()).toContain(
+ 'Voice mode: Space to start/stop recording',
+ );
+
+ // Press space to start
+ await act(async () => {
+ stdin.write(' ');
+ });
+
+ // Now should show listening
+ await waitFor(() => {
+ expect(lastFrame()).toContain('🎙️ Listening...');
+ });
+
+ unmount();
+ });
+
+ it('should toggle recording off when space is pressed again (toggle)', async () => {
+ await act(async () => {
+ mockBuffer.setText('');
+ });
+ const { stdin, unmount, lastFrame } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: true } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'toggle' } },
+     }),
+   },
+ );
+
+ // Start recording
+ await act(async () => {
+ stdin.write(' ');
+ });
+ await waitFor(() => {
+ expect(lastFrame()).toContain('🎙️ Listening...');
+ });
+
+ // Stop recording
+ await act(async () => {
+ stdin.write(' ');
+ });
+ await waitFor(() => {
+ expect(lastFrame()).not.toContain('🎙️ Listening...');
+ expect(lastFrame()).toContain(
+ 'Voice mode: Space to start/stop recording',
+ );
+ });
+
+ unmount();
+ });
+
+ it('should resume recording when space is pressed even if buffer is not empty (toggle)', async () => {
+ await act(async () => {
+ mockBuffer.setText('some existing text');
+ });
+ const { stdin, unmount, lastFrame } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: true } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'toggle' } },
+     }),
+   },
+ );
+
+ // Should show voice mode hint even if buffer is not empty (new behavior)
+ expect(lastFrame()).toContain(
+ 'Voice mode: Space to start/stop recording',
+ );
+ expect(lastFrame()).toContain('some existing text');
+
+ // Press space to start recording again
+ await act(async () => {
+ stdin.write(' ');
+ });
+
+ await waitFor(() => {
+ expect(lastFrame()).toContain('🎙️ Listening...');
+ });
+
+ unmount();
+ });
+
+ it('should not start recording if voice mode is disabled (toggle)', async () => {
+ await act(async () => {
+ mockBuffer.setText('');
+ });
+ const { stdin, unmount, lastFrame } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: false } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'toggle' } },
+     }),
+   },
+ );
+
+ // Press space
+ await act(async () => {
+ stdin.write(' ');
+ });
+
+ // Should NOT show listening, instead should call handleInput which handles space
+ expect(lastFrame()).not.toContain('🎙️ Listening...');
+ expect(mockBuffer.handleInput).toHaveBeenCalled();
+ unmount();
+ });
+
+ it('should append transcription correctly across multiple turn updates (toggle)', async () => {
+ await act(async () => {
+ mockBuffer.setText('initial');
+ });
+ const { stdin, unmount } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: true } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'toggle' } },
+     }),
+   },
+ );
+
+ // Start recording
+ await act(async () => {
+ stdin.write(' ');
+ });
+
+ // Emit first transcription
+ await act(async () => {
+ (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+ 'transcription',
+ 'hello',
+ );
+ });
+ await waitFor(() => {
+ expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
+ });
+
+ // Emit turnComplete (Gemini Live starts over after this)
+ await act(async () => {
+ (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+ 'turnComplete',
+ );
+ });
+
+ // Emit second part (Gemini Live sends new turn text starting from empty)
+ await act(async () => {
+ (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+ 'transcription',
+ 'world',
+ );
+ });
+ await waitFor(() => {
+ // Should have appended 'world' to the baseline 'initial hello'
+ expect(mockBuffer.setText).toHaveBeenCalledWith(
+ 'initial hello world',
+ 'end',
+ );
+ });
+
+ unmount();
+ });
+
+ it('should append transcription correctly when resuming voice mode (toggle)', async () => {
+ await act(async () => {
+ mockBuffer.setText('First turn.');
+ });
+ const { stdin, unmount } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: true } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'toggle' } },
+     }),
+   },
+ );
+
+ // Start recording (resumed)
+ await act(async () => {
+ stdin.write(' ');
+ });
+
+ // Emit transcription
+ await act(async () => {
+ (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+ 'transcription',
+ 'Second turn.',
+ );
+ });
+
+ await waitFor(() => {
+ expect(mockBuffer.setText).toHaveBeenCalledWith(
+ 'First turn. Second turn.',
+ 'end',
+ );
+ });
+
+ unmount();
+ });
+
+ describe('push-to-talk', () => {
+ beforeEach(() => {
+ vi.useFakeTimers();
+ });
+
+ afterEach(() => {
+ vi.useRealTimers();
+ });
+
+ it('should insert a space on a single tap', async () => {
+ const { stdin, unmount, lastFrame } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: true } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'push-to-talk' } },
+     }),
+   },
+ );
+
+ expect(lastFrame()).toContain('Voice mode: Hold Space to record');
+
+ // Press space once
+ await act(async () => {
+ stdin.write(' ');
+ });
+
+ // Should insert space optimistically
+ expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
+ expect(lastFrame()).not.toContain('🎙️ Listening...');
+
+ // Advance timer past HOLD_DELAY_MS
+ await act(async () => {
+ vi.advanceTimersByTime(700);
+ });
+
+ expect(lastFrame()).not.toContain('🎙️ Listening...');
+ unmount();
+ });
+
+ it('should start recording on hold (simulated by repeat spaces)', async () => {
+ const { stdin, unmount, lastFrame } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: true } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'push-to-talk' } },
+     }),
+   },
+ );
+
+ // First space
+ await act(async () => {
+ stdin.write(' ');
+ });
+ expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
+
+ // Second space (repeat)
+ await act(async () => {
+ stdin.write(' ');
+ });
+
+ await waitFor(() => {
+ // Should have backspaced the optimistic space
+ expect(mockBuffer.backspace).toHaveBeenCalled();
+ // Should show listening
+ expect(lastFrame()).toContain('🎙️ Listening...');
+ });
+
+ unmount();
+ });
+
+ it('should stop recording when space heartbeat stops (release)', async () => {
+ const { stdin, unmount, lastFrame } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: true } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'push-to-talk' } },
+     }),
+   },
+ );
+
+ // Start hold
+ await act(async () => {
+ stdin.write(' ');
+ stdin.write(' ');
+ });
+
+ // Use a short interval in waitFor to prevent advancing fake timers past the 300ms RELEASE_DELAY_MS
+ await waitFor(
+ () => {
+ expect(lastFrame()).toContain('🎙️ Listening...');
+ },
+ { interval: 10 },
+ );
+
+ // Simulate heartbeat (held key) - send space first to reset timer, then advance
+ await act(async () => {
+ stdin.write(' ');
+ vi.advanceTimersByTime(100);
+ });
+ expect(lastFrame()).toContain('🎙️ Listening...');
+
+ // Stop heartbeat (release)
+ await act(async () => {
+ vi.advanceTimersByTime(400); // Past RELEASE_DELAY_MS
+ });
+
+ await waitFor(() => {
+ expect(lastFrame()).not.toContain('🎙️ Listening...');
+ });
+
+ unmount();
+ });
+
+ it('should cancel hold state if non-space key is pressed after first space', async () => {
+ const { stdin, unmount } = await renderWithProviders(
+   <InputPrompt {...props} />,
+   {
+     uiState: { isVoiceModeEnabled: true } as UIState,
+     settings: createMockSettings({
+       experimental: { voice: { activationMode: 'push-to-talk' } },
+     }),
+   },
+ );
+
+ // First space
+ await act(async () => {
+ stdin.write(' ');
+ });
+
+ // Type 'a'
+ await act(async () => {
+ stdin.write('a');
+ });
+
+ // Should NOT start recording on next space even if fast
+ await act(async () => {
+ stdin.write(' ');
+ });
+
+ expect(mockBuffer.insert).toHaveBeenCalledTimes(2); // Two spaces inserted
+ expect(mockBuffer.handleInput).toHaveBeenCalledWith(
+ expect.objectContaining({ name: 'a' }),
+ );
+ unmount();
+ });
+ });
+ });
});
function clean(str: string | undefined): string {
diff --git a/packages/cli/src/ui/components/InputPrompt.tsx b/packages/cli/src/ui/components/InputPrompt.tsx
index c9f75c740b..f69138c8c7 100644
--- a/packages/cli/src/ui/components/InputPrompt.tsx
+++ b/packages/cli/src/ui/components/InputPrompt.tsx
@@ -56,6 +56,7 @@ import {
debugLogger,
type Config,
} from '@google/gemini-cli-core';
+import { useVoiceMode } from '../hooks/useVoiceMode.js';
import {
parseInputForHighlighting,
parseSegmentsFromTokens,
@@ -159,7 +160,6 @@ export function isLargePaste(text: string): boolean {
}
const DOUBLE_TAB_CLEAN_UI_TOGGLE_WINDOW_MS = 350;
-
/**
* Attempt to toggle expansion of a paste placeholder in the buffer.
* Returns true if a toggle action was performed or hint was shown, false otherwise.
@@ -238,6 +238,7 @@ export const InputPrompt: React.FC = ({
setEmbeddedShellFocused,
setShortcutsHelpVisible,
toggleCleanUiDetailsVisible,
+ setVoiceModeEnabled,
} = useUIActions();
const {
terminalWidth,
@@ -246,6 +247,7 @@ export const InputPrompt: React.FC = ({
backgroundTasks,
backgroundTaskHeight,
shortcutsHelpVisible,
+ isVoiceModeEnabled,
} = useUIState();
const [suppressCompletion, setSuppressCompletion] = useState(false);
const { handlePress: registerPlainTabPress, resetCount: resetPlainTabPress } =
@@ -263,6 +265,7 @@ export const InputPrompt: React.FC = ({
resetEscapeState();
if (buffer.text.length > 0) {
buffer.setText('');
+ resetTurnBaseline();
resetCompletionState();
} else if (history.length > 0) {
onSubmit('/rewind');
@@ -281,6 +284,16 @@ export const InputPrompt: React.FC = ({
const hasUserNavigatedSuggestions = useRef(false);
const listRef = useRef>(null);
+ const { isRecording, handleVoiceInput, resetTurnBaseline } = useVoiceMode({
+ buffer,
+ config,
+ settings,
+ setQueueErrorMessage,
+ isVoiceModeEnabled,
+ setVoiceModeEnabled,
+ keyMatchers,
+ });
+
const [reverseSearchActive, setReverseSearchActive] = useState(false);
const [commandSearchActive, setCommandSearchActive] = useState(false);
const [textBeforeReverseSearch, setTextBeforeReverseSearch] = useState('');
@@ -387,6 +400,7 @@ export const InputPrompt: React.FC = ({
// Clear the buffer *before* calling onSubmit to prevent potential re-submission
// if onSubmit triggers a re-render while the buffer still holds the old value.
buffer.setText('');
+ resetTurnBaseline();
onSubmit(processedValue);
resetCompletionState();
resetReverseSearchCompletionState();
@@ -398,6 +412,7 @@ export const InputPrompt: React.FC = ({
shellModeActive,
shellHistory,
resetReverseSearchCompletionState,
+ resetTurnBaseline,
],
);
@@ -647,6 +662,8 @@ export const InputPrompt: React.FC = ({
const handleInput = useCallback(
(key: Key) => {
+ if (handleVoiceInput(key)) return true;
+
// Determine if this keypress is a history navigation command
const isHistoryUp =
!shellModeActive &&
@@ -873,9 +890,9 @@ export const InputPrompt: React.FC = ({
) {
setShellModeActive(!shellModeActive);
buffer.setText(''); // Clear the '!' from input
+ resetTurnBaseline();
return true;
}
-
if (keyMatchers[Command.ESCAPE](key)) {
const cancelSearch = (
setActive: (active: boolean) => void,
@@ -1360,6 +1377,7 @@ export const InputPrompt: React.FC = ({
backgroundTaskHeight,
streamingState,
handleEscPress,
+ resetTurnBaseline,
registerPlainTabPress,
resetPlainTabPress,
toggleCleanUiDetailsVisible,
@@ -1369,9 +1387,9 @@ export const InputPrompt: React.FC = ({
keyMatchers,
isHelpDismissKey,
settings,
+ handleVoiceInput,
],
);
-
useKeypress(handleInput, {
isActive: !isEmbeddedShellFocused && !copyModeEnabled,
priority: true,
@@ -1792,20 +1810,39 @@ export const InputPrompt: React.FC = ({
)}{' '}
- {buffer.text.length === 0 && placeholder ? (
-   showCursor ? (
-     <Text>
-       {chalk.inverse(placeholder.slice(0, 1))}
-       <Text color={theme.text.secondary}>{placeholder.slice(1)}</Text>
-     </Text>
-   ) : (
-     <Text color={theme.text.secondary}>{placeholder}</Text>
-   )
+ {isRecording && (
+   <Text>🎙️ Listening...</Text>
+ )}
+ {isVoiceModeEnabled && !isRecording && (
+   <Box>
+     <Text>
+       > Voice mode:{' '}
+       {(settings.experimental.voice?.activationMode ??
+         'push-to-talk') === 'push-to-talk'
+         ? 'Hold Space to record'
+         : 'Space to start/stop recording'}{' '}
+       (Esc to exit)
+     </Text>
+   </Box>
+ )}
+ {buffer.text.length === 0 && !isRecording ? (
+   !isVoiceModeEnabled && placeholder ? (
+     showCursor ? (
+       <Text>
+         {chalk.inverse(placeholder.slice(0, 1))}
+         <Text color={theme.text.secondary}>{placeholder.slice(1)}</Text>
+       </Text>
+     ) : (
+       <Text color={theme.text.secondary}>{placeholder}</Text>
+     )
+   ) : null
) : (
diff --git a/packages/cli/src/ui/components/VoiceModelDialog.tsx b/packages/cli/src/ui/components/VoiceModelDialog.tsx
new file mode 100644
--- /dev/null
+++ b/packages/cli/src/ui/components/VoiceModelDialog.tsx
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import React, { useCallback, useMemo, useState } from 'react';
+import { Box, Text } from 'ink';
+import {
+  WhisperModelManager,
+  isBinaryAvailable,
+  type WhisperModelProgress,
+} from '@google/gemini-cli-core';
+import { SettingScope } from '../../config/settings.js';
+import { useSettingsStore } from '../contexts/SettingsContext.js';
+import { useKeypress, type Key } from '../hooks/useKeypress.js';
+import { RadioButtonSelect } from './shared/RadioButtonSelect.js';
+import { theme } from '../semantic-colors.js';
+
+interface VoiceModelDialogProps {
+  onClose: () => void;
+}
+
+type DialogView = 'backend' | 'whisper-models';
+
+const WHISPER_MODELS = [
+ {
+ value: 'ggml-tiny.en.bin',
+ label: 'Tiny (EN)',
+ description: 'Fastest, lower accuracy (~75MB)',
+ },
+ {
+ value: 'ggml-base.en.bin',
+ label: 'Base (EN)',
+ description: 'Balanced speed and accuracy (~142MB)',
+ },
+ {
+ value: 'ggml-large-v3-turbo-q5_0.bin',
+ label: 'Large v3 Turbo (Q5_0)',
+ description: 'High accuracy, quantized (~547MB)',
+ },
+ {
+ value: 'ggml-large-v3-turbo-q8_0.bin',
+ label: 'Large v3 Turbo (Q8_0)',
+ description: 'Maximum accuracy, high memory (~834MB)',
+ },
+];
+
+export function VoiceModelDialog({
+ onClose,
+}: VoiceModelDialogProps): React.JSX.Element {
+ const { settings, setSetting } = useSettingsStore();
+  const [view, setView] = useState<DialogView>('backend');
+  const [downloadProgress, setDownloadProgress] =
+    useState<WhisperModelProgress | null>(null);
+  const [error, setError] = useState<string | null>(null);
+
+ const whisperInstalled = useMemo(
+ () => isBinaryAvailable('whisper-stream'),
+ [],
+ );
+ const modelManager = useMemo(() => new WhisperModelManager(), []);
+
+ const currentBackend =
+ settings.merged.experimental.voice?.backend ?? 'gemini-live';
+ const currentWhisperModel =
+ settings.merged.experimental.voice?.whisperModel ?? 'ggml-base.en.bin';
+
+ const handleKeypress = useCallback(
+ (key: Key) => {
+ if (key.name === 'escape') {
+ if (view === 'whisper-models') {
+ setView('backend');
+ } else {
+ onClose();
+ }
+ return true;
+ }
+ return false;
+ },
+ [view, onClose],
+ );
+
+ useKeypress(handleKeypress, { isActive: true });
+
+ const handleBackendSelect = useCallback(
+ (value: string) => {
+ if (value === 'whisper') {
+ setView('whisper-models');
+ } else {
+ setSetting(
+ SettingScope.User,
+ 'experimental.voice.backend',
+ 'gemini-live',
+ );
+ onClose();
+ }
+ },
+ [setSetting, onClose],
+ );
+
+ const handleWhisperModelSelect = useCallback(
+ async (modelName: string) => {
+ if (modelManager.isModelInstalled(modelName)) {
+ setSetting(SettingScope.User, 'experimental.voice.backend', 'whisper');
+ setSetting(
+ SettingScope.User,
+ 'experimental.voice.whisperModel',
+ modelName,
+ );
+ onClose();
+ } else {
+ setError(null);
+ const onProgress = (p: WhisperModelProgress) => setDownloadProgress(p);
+ modelManager.on('progress', onProgress);
+
+ try {
+ await modelManager.downloadModel(modelName);
+
+ setSetting(
+ SettingScope.User,
+ 'experimental.voice.backend',
+ 'whisper',
+ );
+ setSetting(
+ SettingScope.User,
+ 'experimental.voice.whisperModel',
+ modelName,
+ );
+ onClose();
+ } catch (err) {
+ setError(
+ `Failed to download: ${err instanceof Error ? err.message : String(err)}`,
+ );
+ } finally {
+ modelManager.off('progress', onProgress);
+ setDownloadProgress(null);
+ }
+ }
+ },
+ [modelManager, setSetting, onClose],
+ );
+
+ const backendOptions = useMemo(
+ () => [
+ {
+ value: 'gemini-live',
+ title: 'Gemini Live API (Cloud)',
+ description: 'Real-time cloud transcription via Gemini Live API.',
+ key: 'gemini-live',
+ },
+ {
+ value: 'whisper',
+ title: 'Whisper (Local)',
+ description: whisperInstalled
+ ? 'Local transcription using whisper.cpp.'
+ : 'Local transcription (Requires: brew install whisper-cpp)',
+ key: 'whisper',
+ },
+ ],
+ [whisperInstalled],
+ );
+
+ const whisperOptions = useMemo(
+ () =>
+ WHISPER_MODELS.map((m) => ({
+ value: m.value,
+ title: `${m.label}${modelManager.isModelInstalled(m.value) ? ' (Installed)' : ' (Download)'}`,
+ description: m.description,
+ key: m.value,
+ })),
+ [modelManager],
+ );
+
+  return (
+    <Box flexDirection="column">
+      <Text bold>
+        {view === 'backend'
+          ? 'Select Voice Transcription Backend'
+          : 'Select Whisper Model'}
+      </Text>
+
+      {error && (
+        <Box>
+          <Text color={theme.status.error}>{error}</Text>
+        </Box>
+      )}
+
+      {downloadProgress ? (
+        <Box>
+          <Text>
+            Downloading {downloadProgress.modelName}...{' '}
+            {Math.round(downloadProgress.percentage * 100)}%
+          </Text>
+        </Box>
+      ) : (
+        <Box>
+          {view === 'backend' ? (
+            <RadioButtonSelect
+              items={backendOptions}
+              initialIndex={backendOptions.findIndex(
+                (o) => o.value === currentBackend,
+              )}
+              onSelect={handleBackendSelect}
+              showNumbers={true}
+            />
+          ) : (
+            <RadioButtonSelect
+              items={whisperOptions}
+              initialIndex={whisperOptions.findIndex(
+                (o) => o.value === currentWhisperModel,
+              )}
+              onSelect={handleWhisperModelSelect}
+              showNumbers={true}
+            />
+          )}
+        </Box>
+      )}
+
+      <Box>
+        <Text color={theme.text.secondary}>
+          {view === 'whisper-models'
+            ? '(Press Esc to go back)'
+            : '(Press Esc to close)'}
+        </Text>
+      </Box>
+    </Box>
+  );
+}
diff --git a/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap b/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap
index 4830e90db1..db449ce4d7 100644
--- a/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap
+++ b/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap
@@ -168,13 +168,6 @@ exports[`InputPrompt > mouse interaction > should toggle paste expansion on doub
"
`;
-exports[`InputPrompt > mouse interaction > should toggle paste expansion on double-click 4`] = `
-"▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀
- > [Pasted Text: 10 lines]
-▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
-"
-`;
-
exports[`InputPrompt > multiline rendering > should correctly render multiline input including blank lines 1`] = `
"────────────────────────────────────────────────────────────────────────────────────────────────────
> hello
diff --git a/packages/cli/src/ui/contexts/UIActionsContext.tsx b/packages/cli/src/ui/contexts/UIActionsContext.tsx
index f1959c0173..fb979e7c17 100644
--- a/packages/cli/src/ui/contexts/UIActionsContext.tsx
+++ b/packages/cli/src/ui/contexts/UIActionsContext.tsx
@@ -41,6 +41,8 @@ export interface UIActions {
exitPrivacyNotice: () => void;
closeSettingsDialog: () => void;
closeModelDialog: () => void;
+ openVoiceModelDialog: () => void;
+ closeVoiceModelDialog: () => void;
openAgentConfigDialog: (
name: string,
displayName: string,
@@ -93,6 +95,7 @@ export interface UIActions {
handleNewAgentsSelect: (choice: NewAgentsChoice) => Promise<void>;
getPreferredEditor: () => EditorType | undefined;
clearAccountSuspension: () => void;
+ setVoiceModeEnabled: (value: boolean) => void;
}
export const UIActionsContext = createContext<UIActions | null>(null);
diff --git a/packages/cli/src/ui/contexts/UIStateContext.tsx b/packages/cli/src/ui/contexts/UIStateContext.tsx
index ed33c21ee5..2150218d44 100644
--- a/packages/cli/src/ui/contexts/UIStateContext.tsx
+++ b/packages/cli/src/ui/contexts/UIStateContext.tsx
@@ -112,6 +112,7 @@ export interface UIState {
isSettingsDialogOpen: boolean;
isSessionBrowserOpen: boolean;
isModelDialogOpen: boolean;
+ isVoiceModelDialogOpen: boolean;
isAgentConfigDialogOpen: boolean;
selectedAgentName?: string;
selectedAgentDisplayName?: string;
@@ -132,6 +133,7 @@ export interface UIState {
pendingGeminiHistoryItems: HistoryItemWithoutId[];
thought: ThoughtSummary | null;
isInputActive: boolean;
+ isVoiceModeEnabled: boolean;
isResuming: boolean;
shouldShowIdePrompt: boolean;
isFolderTrustDialogOpen: boolean;
diff --git a/packages/cli/src/ui/hooks/slashCommandProcessor.test.tsx b/packages/cli/src/ui/hooks/slashCommandProcessor.test.tsx
index 3e521a6627..f4b18d5bbf 100644
--- a/packages/cli/src/ui/hooks/slashCommandProcessor.test.tsx
+++ b/packages/cli/src/ui/hooks/slashCommandProcessor.test.tsx
@@ -205,11 +205,13 @@ describe('useSlashCommandProcessor', () => {
openSettingsDialog: vi.fn(),
openSessionBrowser: vi.fn(),
openModelDialog: mockOpenModelDialog,
+ openVoiceModelDialog: vi.fn(),
openAgentConfigDialog,
openPermissionsDialog: vi.fn(),
quit: mockSetQuittingMessages,
setDebugMessage: vi.fn(),
toggleCorgiMode: vi.fn(),
+ toggleVoiceMode: vi.fn(),
toggleDebugProfiler: vi.fn(),
dispatchExtensionStateUpdate: vi.fn(),
addConfirmUpdateExtensionRequest: vi.fn(),
diff --git a/packages/cli/src/ui/hooks/slashCommandProcessor.ts b/packages/cli/src/ui/hooks/slashCommandProcessor.ts
index 20de86002c..3007a96a73 100644
--- a/packages/cli/src/ui/hooks/slashCommandProcessor.ts
+++ b/packages/cli/src/ui/hooks/slashCommandProcessor.ts
@@ -72,6 +72,7 @@ interface SlashCommandProcessorActions {
openSettingsDialog: () => void;
openSessionBrowser: () => void;
openModelDialog: () => void;
+ openVoiceModelDialog: () => void;
openAgentConfigDialog: (
name: string,
displayName: string,
@@ -81,6 +82,7 @@ interface SlashCommandProcessorActions {
quit: (messages: HistoryItem[]) => void;
setDebugMessage: (message: string) => void;
toggleCorgiMode: () => void;
+ toggleVoiceMode: () => void;
toggleDebugProfiler: () => void;
dispatchExtensionStateUpdate: (action: ExtensionUpdateAction) => void;
addConfirmUpdateExtensionRequest: (request: ConfirmationRequest) => void;
@@ -232,6 +234,7 @@ export const useSlashCommandProcessor = (
pendingItem,
setPendingItem,
toggleCorgiMode: actions.toggleCorgiMode,
+ toggleVoiceMode: actions.toggleVoiceMode,
toggleDebugProfiler: actions.toggleDebugProfiler,
toggleVimEnabled,
reloadCommands,
@@ -503,6 +506,9 @@ export const useSlashCommandProcessor = (
case 'model':
actions.openModelDialog();
return { type: 'handled' };
+ case 'voice-model':
+ actions.openVoiceModelDialog();
+ return { type: 'handled' };
case 'agentConfig': {
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
const props = result.props as Record<string, unknown>;
diff --git a/packages/cli/src/ui/hooks/useVoiceMode.ts b/packages/cli/src/ui/hooks/useVoiceMode.ts
new file mode 100644
index 0000000000..0f37c66357
--- /dev/null
+++ b/packages/cli/src/ui/hooks/useVoiceMode.ts
@@ -0,0 +1,429 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { useState, useRef, useCallback, useEffect } from 'react';
+import {
+ AudioRecorder,
+ TranscriptionFactory,
+ debugLogger,
+ type Config,
+ type TranscriptionProvider,
+} from '@google/gemini-cli-core';
+import type { TextBuffer } from '../components/shared/text-buffer.js';
+import type { MergedSettings } from '../../config/settingsSchema.js';
+import type { Key } from './useKeypress.js';
+import { Command } from '../key/keyMatchers.js';
+
+interface UseVoiceModeProps {
+ buffer: TextBuffer;
+ config: Config;
+ settings: MergedSettings;
+ setQueueErrorMessage: (message: string | null) => void;
+ isVoiceModeEnabled: boolean;
+ setVoiceModeEnabled: (enabled: boolean) => void;
+ keyMatchers: Record<Command, (key: Key) => boolean>;
+}
+
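+// Terminals do not deliver key-up events, so push-to-talk is inferred from
+// key-repeat timing: a second Space arriving within HOLD_DELAY_MS is treated
+// as a held key, and recording stops once repeats cease for RELEASE_DELAY_MS.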
+const HOLD_DELAY_MS = 600;
+const RELEASE_DELAY_MS = 300;
+
+export function useVoiceMode({
+ buffer,
+ config,
+ settings,
+ setQueueErrorMessage,
+ isVoiceModeEnabled,
+ setVoiceModeEnabled,
+ keyMatchers,
+}: UseVoiceModeProps) {
+ const [isRecording, setIsRecording] = useState(false);
+ const [isConnecting, setIsConnecting] = useState(false);
+
+ const liveTranscriptionRef = useRef('');
+ const stopRequestedRef = useRef(false);
+ const isRecordingRef = useRef(false);
+ const lastFailureTimeRef = useRef(0);
+ const recordingInProgressRef = useRef(false);
+ const voiceTimeoutRef = useRef<NodeJS.Timeout | null>(null);
+ const recorderRef = useRef<AudioRecorder | null>(null);
+ const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null);
+ const turnBaselineRef = useRef<string | null>(null);
+
+ const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle');
+ const pttTimerRef = useRef<NodeJS.Timeout | null>(null);
+ const disconnectTimerRef = useRef<NodeJS.Timeout | null>(null);
+
+ const bufferRef = useRef(buffer);
+ bufferRef.current = buffer;
+
+ const stopVoiceRecording = useCallback(() => {
+ if (stopRequestedRef.current) return;
+ debugLogger.debug('[Voice] Stop requested');
+ stopRequestedRef.current = true;
+
+ setIsRecording(false);
+ isRecordingRef.current = false;
+ setIsConnecting(false);
+
+ if (recorderRef.current) {
+ recorderRef.current.stop();
+ recorderRef.current = null;
+ }
+
+ const serviceToDisconnect = transcriptionServiceRef.current;
+ transcriptionServiceRef.current = null;
+
+ if (serviceToDisconnect) {
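+ // Keep the outgoing provider connected for a grace period so in-flight
+ // transcription for audio that was already sent can still be delivered.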
+ const isLive = settings.experimental.voice?.backend === 'gemini-live';
+ const gracePeriodMs =
+ settings.experimental.voice?.stopGracePeriodMs ??
+ (isLive ? 2000 : 1000);
+ debugLogger.debug(
+ `[Voice] Draining transcription for ${gracePeriodMs}ms`,
+ );
+
+ if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
+ disconnectTimerRef.current = setTimeout(() => {
+ debugLogger.debug('[Voice] Grace period ended, disconnecting service');
+ serviceToDisconnect.disconnect();
+ disconnectTimerRef.current = null;
+ }, gracePeriodMs);
+ }
+
+ liveTranscriptionRef.current = '';
+ pttStateRef.current = 'idle';
+ }, [settings.experimental.voice]);
+
+ const startVoiceRecording = useCallback(() => {
+ if (
+ isRecordingRef.current ||
+ Date.now() - lastFailureTimeRef.current < 2000
+ ) {
+ return;
+ }
+
+ if (disconnectTimerRef.current) {
+ clearTimeout(disconnectTimerRef.current);
+ disconnectTimerRef.current = null;
+ }
+
+ recordingInProgressRef.current = true;
+ turnBaselineRef.current = bufferRef.current.text;
+
+ setIsConnecting(true);
+ setIsRecording(true);
+ isRecordingRef.current = true;
+
+ liveTranscriptionRef.current = '';
+ stopRequestedRef.current = false;
+
+ const apiKey =
+ config.getContentGeneratorConfig()?.apiKey ||
+ process.env['GEMINI_API_KEY'] ||
+ '';
+
+ const startAsync = async () => {
+ // If there's an active draining service, disconnect it immediately
+ // before starting a new one to prevent orphaned event collisions.
+ if (disconnectTimerRef.current) {
+ clearTimeout(disconnectTimerRef.current);
+ disconnectTimerRef.current = null;
+ }
+ if (transcriptionServiceRef.current) {
+ transcriptionServiceRef.current.disconnect();
+ transcriptionServiceRef.current = null;
+ }
+
+ const cleanupIfStopped = () => {
+ if (stopRequestedRef.current) {
+ if (recorderRef.current) {
+ recorderRef.current.stop();
+ recorderRef.current = null;
+ }
+ if (transcriptionServiceRef.current) {
+ transcriptionServiceRef.current.disconnect();
+ transcriptionServiceRef.current = null;
+ }
+ setIsRecording(false);
+ isRecordingRef.current = false;
+ setIsConnecting(false);
+ recordingInProgressRef.current = false;
+ return true;
+ }
+ return false;
+ };
+
+ if (cleanupIfStopped()) return;
+
+ const voiceBackend =
+ settings.experimental.voice?.backend ?? 'gemini-live';
+
+ if (!apiKey && voiceBackend === 'gemini-live') {
+ setQueueErrorMessage(
+ 'Cloud voice mode requires a GEMINI_API_KEY. Please set it in your environment or ~/.gemini/.env.',
+ );
+ setIsRecording(false);
+ isRecordingRef.current = false;
+ setIsConnecting(false);
+ recordingInProgressRef.current = false;
+ lastFailureTimeRef.current = Date.now();
+ return;
+ }
+
+ if (voiceBackend === 'gemini-live') {
+ recorderRef.current = new AudioRecorder();
+ }
+
+ const currentService = TranscriptionFactory.createProvider(
+ settings.experimental.voice,
+ apiKey,
+ );
+ transcriptionServiceRef.current = currentService;
+
+ currentService.on('transcription', (text) => {
+ if (
+ transcriptionServiceRef.current !== currentService &&
+ stopRequestedRef.current
+ ) {
+ // If this is an orphaned service that was replaced by a new session, ignore its events
+ return;
+ }
+
+ if (text) {
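+ // The provider streams a growing transcript for the current turn. Strip
+ // the previously inserted partial (still a suffix of the buffer) before
+ // appending the latest text, so partials are replaced rather than duplicated.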
+ const currentBufferText = bufferRef.current.text;
+ const previousTranscription = liveTranscriptionRef.current;
+
+ let newTotalText = currentBufferText;
+
+ if (
+ previousTranscription &&
+ currentBufferText.endsWith(previousTranscription)
+ ) {
+ newTotalText = currentBufferText.slice(
+ 0,
+ -previousTranscription.length,
+ );
+ } else if (
+ currentBufferText &&
+ !currentBufferText.endsWith(' ') &&
+ !currentBufferText.endsWith('\n')
+ ) {
+ newTotalText += ' ';
+ }
+
+ newTotalText += text;
+ bufferRef.current.setText(newTotalText, 'end');
+ }
+ liveTranscriptionRef.current = text;
+ });
+
+ currentService.on('turnComplete', () => {
+ if (
+ transcriptionServiceRef.current !== currentService &&
+ stopRequestedRef.current
+ )
+ return;
+ liveTranscriptionRef.current = '';
+ });
+
+ currentService.on('error', (err) => {
+ if (transcriptionServiceRef.current !== currentService) return;
+ debugLogger.error('[Voice] Transcription error:', err);
+ lastFailureTimeRef.current = Date.now();
+ recordingInProgressRef.current = false;
+ });
+
+ currentService.on('close', () => {
+ if (transcriptionServiceRef.current !== currentService) return;
+ if (!stopRequestedRef.current) {
+ setIsRecording(false);
+ isRecordingRef.current = false;
+ setIsConnecting(false);
+ recordingInProgressRef.current = false;
+ lastFailureTimeRef.current = Date.now();
+ }
+ });
+
+ try {
+ await currentService.connect();
+ if (cleanupIfStopped()) return;
+
+ await recorderRef.current?.start();
+ if (cleanupIfStopped()) return;
+
+ setIsConnecting(false);
+
+ const currentVoiceBackend =
+ settings.experimental.voice?.backend ?? 'gemini-live';
+
+ recorderRef.current?.on('data', (chunk) => {
+ if (currentVoiceBackend === 'gemini-live') {
+ currentService.sendAudioChunk(chunk);
+ }
+ });
+ recorderRef.current?.on('error', (err) => {
+ debugLogger.error('[Voice] Recorder error:', err);
+ stopVoiceRecording();
+ lastFailureTimeRef.current = Date.now();
+ });
+ } catch (err: unknown) {
+ if (transcriptionServiceRef.current !== currentService) return;
+ const message = err instanceof Error ? err.message : String(err);
+ setQueueErrorMessage(`Voice mode failure: ${message}`);
+ setIsRecording(false);
+ isRecordingRef.current = false;
+ setIsConnecting(false);
+ recordingInProgressRef.current = false;
+ lastFailureTimeRef.current = Date.now();
+
+ if (recorderRef.current) {
+ recorderRef.current.stop();
+ recorderRef.current = null;
+ }
+ if (transcriptionServiceRef.current) {
+ transcriptionServiceRef.current.disconnect();
+ transcriptionServiceRef.current = null;
+ }
+ }
+ };
+
+ void startAsync();
+ }, [
+ config,
+ settings.experimental.voice,
+ setQueueErrorMessage,
+ stopVoiceRecording,
+ ]);
+
+ useEffect(
+ () => () => {
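+      // Unmount cleanup: clear all timers and tear down the recorder and any open transcription connection.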
+ if (voiceTimeoutRef.current) clearTimeout(voiceTimeoutRef.current);
+ if (recorderRef.current) {
+ recorderRef.current.stop();
+ recorderRef.current = null;
+ }
+ if (transcriptionServiceRef.current) {
+ transcriptionServiceRef.current.disconnect();
+ transcriptionServiceRef.current = null;
+ }
+ if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
+ if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current);
+ },
+ [],
+ );
+
+ const handleVoiceInput = useCallback(
+ (key: Key): boolean => {
+ const activeRecording = isRecording || isRecordingRef.current;
+
+ if (activeRecording) {
+ const activationMode =
+ settings.experimental.voice?.activationMode ?? 'push-to-talk';
+
+ if (keyMatchers[Command.ESCAPE](key)) {
+ stopVoiceRecording();
+ return true;
+ }
+
+ if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
+ if (activationMode === 'push-to-talk') {
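+          // A held key reaches the terminal as repeated keypresses: each repeat re-arms the release timer, and recording stops once repeats cease for RELEASE_DELAY_MS.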
+ if (pttTimerRef.current) {
+ clearTimeout(pttTimerRef.current);
+ }
+ pttTimerRef.current = setTimeout(() => {
+ stopVoiceRecording();
+ pttTimerRef.current = null;
+ }, RELEASE_DELAY_MS);
+ return true;
+ } else {
+ stopVoiceRecording();
+ return true;
+ }
+ }
+ return true;
+ }
+
+ if (isVoiceModeEnabled) {
+ const activationMode =
+ settings.experimental.voice?.activationMode ?? 'push-to-talk';
+
+ if (keyMatchers[Command.ESCAPE](key) && buffer.text === '') {
+ setVoiceModeEnabled(false);
+ return true;
+ }
+
+ if (keyMatchers[Command.VOICE_MODE_PTT](key)) {
+ if (
+ key.name === 'space' &&
+ !key.ctrl &&
+ !key.alt &&
+ !key.shift &&
+ !key.cmd
+ ) {
+ if (activationMode === 'toggle') {
+ startVoiceRecording();
+ return true;
+ } else {
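+            // First Space press inserts a literal space and arms a short hold window; a repeat within HOLD_DELAY_MS retracts the space and starts recording instead.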
+ if (pttStateRef.current === 'idle') {
+ buffer.insert(' ');
+ pttStateRef.current = 'possible-hold';
+
+ if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
+ pttTimerRef.current = setTimeout(() => {
+ pttStateRef.current = 'idle';
+ pttTimerRef.current = null;
+ }, HOLD_DELAY_MS);
+ return true;
+ } else if (pttStateRef.current === 'possible-hold') {
+ if (pttTimerRef.current) clearTimeout(pttTimerRef.current);
+ buffer.backspace();
+ pttStateRef.current = 'recording';
+ startVoiceRecording();
+
+ pttTimerRef.current = setTimeout(() => {
+ stopVoiceRecording();
+ pttTimerRef.current = null;
+ }, RELEASE_DELAY_MS);
+ return true;
+ }
+ }
+ }
+ }
+
+ if (pttStateRef.current === 'possible-hold') {
+ pttStateRef.current = 'idle';
+ if (pttTimerRef.current) {
+ clearTimeout(pttTimerRef.current);
+ pttTimerRef.current = null;
+ }
+ }
+ }
+
+ return false;
+ },
+ [
+ isRecording,
+ isVoiceModeEnabled,
+ settings.experimental.voice,
+ keyMatchers,
+ stopVoiceRecording,
+ startVoiceRecording,
+ buffer,
+ setVoiceModeEnabled,
+ ],
+ );
+
+ return {
+ isRecording,
+ isConnecting,
+ startVoiceRecording,
+ stopVoiceRecording,
+ handleVoiceInput,
+ resetTurnBaseline: () => {
+ turnBaselineRef.current = null;
+ },
+ };
+}
diff --git a/packages/cli/src/ui/hooks/useVoiceModelCommand.ts b/packages/cli/src/ui/hooks/useVoiceModelCommand.ts
new file mode 100644
index 0000000000..943c65ce30
--- /dev/null
+++ b/packages/cli/src/ui/hooks/useVoiceModelCommand.ts
@@ -0,0 +1,31 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { useState, useCallback } from 'react';
+
+interface UseVoiceModelCommandReturn {
+ isVoiceModelDialogOpen: boolean;
+ openVoiceModelDialog: () => void;
+ closeVoiceModelDialog: () => void;
+}
+
+export const useVoiceModelCommand = (): UseVoiceModelCommandReturn => {
+ const [isVoiceModelDialogOpen, setIsVoiceModelDialogOpen] = useState(false);
+
+ const openVoiceModelDialog = useCallback(() => {
+ setIsVoiceModelDialogOpen(true);
+ }, []);
+
+ const closeVoiceModelDialog = useCallback(() => {
+ setIsVoiceModelDialogOpen(false);
+ }, []);
+
+ return {
+ isVoiceModelDialogOpen,
+ openVoiceModelDialog,
+ closeVoiceModelDialog,
+ };
+};
diff --git a/packages/cli/src/ui/key/keyBindings.ts b/packages/cli/src/ui/key/keyBindings.ts
index e3fbcd8262..a038f6173c 100644
--- a/packages/cli/src/ui/key/keyBindings.ts
+++ b/packages/cli/src/ui/key/keyBindings.ts
@@ -97,6 +97,7 @@ export enum Command {
RESTART_APP = 'app.restart',
SUSPEND_APP = 'app.suspend',
SHOW_SHELL_INPUT_UNFOCUS_WARNING = 'app.showShellUnfocusWarning',
+ VOICE_MODE_PTT = 'app.voiceModePTT',
// Background Shell Controls
BACKGROUND_SHELL_ESCAPE = 'background.escape',
@@ -407,9 +408,7 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
[Command.RESTART_APP, [new KeyBinding('r'), new KeyBinding('shift+r')]],
[Command.SUSPEND_APP, [new KeyBinding('ctrl+z')]],
[Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING, [new KeyBinding('tab')]],
- [Command.DUMP_FRAME, [new KeyBinding('f8')]],
- [Command.START_RECORDING, [new KeyBinding('f6')]],
- [Command.STOP_RECORDING, [new KeyBinding('f7')]],
+ [Command.VOICE_MODE_PTT, [new KeyBinding('space')]],
// Background Shell Controls
[Command.BACKGROUND_SHELL_ESCAPE, [new KeyBinding('escape')]],
@@ -424,6 +423,10 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([
// Extension Controls
[Command.UPDATE_EXTENSION, [new KeyBinding('i')]],
[Command.LINK_EXTENSION, [new KeyBinding('l')]],
+
+ [Command.DUMP_FRAME, [new KeyBinding('f8')]],
+ [Command.START_RECORDING, [new KeyBinding('f6')]],
+ [Command.STOP_RECORDING, [new KeyBinding('f7')]],
]);
interface CommandCategory {
@@ -538,6 +541,7 @@ export const commandCategories: readonly CommandCategory[] = [
Command.RESTART_APP,
Command.SUSPEND_APP,
Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING,
+ Command.VOICE_MODE_PTT,
],
},
{
@@ -658,6 +662,7 @@ export const commandDescriptions: Readonly<Record<Command, string>> = {
[Command.SUSPEND_APP]: 'Suspend the CLI and move it to the background.',
[Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING]:
'Show warning when trying to move focus away from shell input.',
+ [Command.VOICE_MODE_PTT]: 'Hold to speak in Voice Mode.',
// Background Shell Controls
[Command.BACKGROUND_SHELL_ESCAPE]: 'Dismiss background shell list.',
diff --git a/packages/cli/src/ui/noninteractive/nonInteractiveUi.ts b/packages/cli/src/ui/noninteractive/nonInteractiveUi.ts
index 3aff41d2de..9118518455 100644
--- a/packages/cli/src/ui/noninteractive/nonInteractiveUi.ts
+++ b/packages/cli/src/ui/noninteractive/nonInteractiveUi.ts
@@ -43,5 +43,6 @@ export function createNonInteractiveUI(): CommandContext['ui'] {
removeComponent: () => {},
toggleBackgroundTasks: () => {},
toggleShortcutsHelp: () => {},
+ toggleVoiceMode: () => {},
};
}
diff --git a/packages/core/package.json b/packages/core/package.json
index eda0e1e5fe..48412418c4 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -56,6 +56,7 @@
"ajv-formats": "^3.0.0",
"chardet": "^2.1.0",
"chokidar": "^5.0.0",
+ "command-exists": "^1.2.9",
"diff": "^8.0.3",
"dotenv": "^17.2.4",
"dotenv-expand": "^12.0.3",
diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts
index 939fa77d70..11f7a24841 100644
--- a/packages/core/src/config/config.ts
+++ b/packages/core/src/config/config.ts
@@ -691,6 +691,7 @@ export interface ConfigParameters {
ptyInfo?: string;
disableYoloMode?: boolean;
disableAlwaysAllow?: boolean;
+ voiceMode?: boolean;
rawOutput?: boolean;
acceptRawOutputRisk?: boolean;
dynamicModelConfiguration?: boolean;
@@ -963,6 +964,7 @@ export class Config implements McpContext, AgentLoopContext {
private readonly topicUpdateNarration: boolean;
private readonly disableLLMCorrection: boolean;
private readonly planEnabled: boolean;
+ private readonly voiceMode: boolean;
private readonly trackerEnabled: boolean;
private readonly planModeRoutingEnabled: boolean;
private readonly modelSteering: boolean;
@@ -1117,6 +1119,7 @@ export class Config implements McpContext, AgentLoopContext {
this.agents = params.agents ?? {};
this.disableLLMCorrection = params.disableLLMCorrection ?? true;
this.planEnabled = params.plan ?? true;
+ this.voiceMode = params.voiceMode ?? false;
this.trackerEnabled = params.tracker ?? false;
this.planModeRoutingEnabled = params.planSettings?.modelRouting ?? true;
this.enableEventDrivenScheduler = params.enableEventDrivenScheduler ?? true;
@@ -2969,6 +2972,10 @@ export class Config implements McpContext, AgentLoopContext {
return this.planEnabled;
}
+ isVoiceModeEnabled(): boolean {
+ return this.voiceMode;
+ }
+
isTrackerEnabled(): boolean {
return this.trackerEnabled;
}
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 3123dd9096..86b36a6b0e 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -297,3 +297,12 @@ export * from './context/profiles.js';
// Export trust utility
export * from './utils/trust.js';
+
+// Export voice utilities
+export * from './voice/audioRecorder.js';
+export * from './voice/transcriptionProvider.js';
+export * from './voice/geminiLiveTranscriptionProvider.js';
+export * from './voice/whisperTranscriptionProvider.js';
+export * from './voice/transcriptionFactory.js';
+export * from './voice/whisperModelManager.js';
+export { isBinaryAvailable } from './utils/binaryCheck.js';
diff --git a/packages/core/src/utils/binaryCheck.ts b/packages/core/src/utils/binaryCheck.ts
new file mode 100644
index 0000000000..8d37f0def4
--- /dev/null
+++ b/packages/core/src/utils/binaryCheck.ts
@@ -0,0 +1,14 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { sync as commandExistsSync } from 'command-exists';
+
+/**
+ * Checks if a binary is available in the system PATH.
+ */
+export function isBinaryAvailable(binaryName: string): boolean {
+ return commandExistsSync(binaryName);
+}
diff --git a/packages/core/src/voice/audioRecorder.ts b/packages/core/src/voice/audioRecorder.ts
new file mode 100644
index 0000000000..c1217e5d7a
--- /dev/null
+++ b/packages/core/src/voice/audioRecorder.ts
@@ -0,0 +1,115 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
+import { EventEmitter } from 'node:events';
+import commandExists from 'command-exists';
+
+export interface AudioRecorderEvents {
+ data: [Buffer];
+ start: [];
+ stop: [];
+ error: [Error];
+}
+
+/**
+ * Captures audio from the microphone using `sox` (`rec`).
+ * Emits 16kHz, 16-bit, mono PCM chunks.
+ */
+export class AudioRecorder extends EventEmitter<AudioRecorderEvents> {
+ private recProcess: ChildProcessWithoutNullStreams | null = null;
+ private isRecordingInternal = false;
+
+ get isRecording(): boolean {
+ return this.isRecordingInternal;
+ }
+
+ /**
+ * Checks if `rec` (sox) is available on the system.
+ */
+  static async isAvailable(): Promise<boolean> {
+ try {
+ await commandExists('rec');
+ return true;
+ } catch {
+ return false;
+ }
+ }
+
+  async start(): Promise<void> {
+ if (this.isRecordingInternal) return;
+ this.isRecordingInternal = true;
+
+ try {
+ const available = await AudioRecorder.isAvailable();
+ if (!this.isRecordingInternal) return; // Check if stopped while checking availability
+
+ if (!available) {
+ throw new Error(
+ 'The `rec` command (provided by SoX) is required for voice mode. Please install SoX (e.g., `brew install sox` on macOS or `sudo apt install sox libsox-fmt-all` on Linux).',
+ );
+ }
+
+ // rec -q -V0 -e signed -c 1 -b 16 -r 16000 -t raw -
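+      // -q/-V0 silence sox's progress output; the remaining flags request signed 16-bit mono PCM at 16 kHz, written raw to stdout ('-').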
+ this.recProcess = spawn('rec', [
+ '-q',
+ '-V0',
+ '-e',
+ 'signed',
+ '-c',
+ '1',
+ '-b',
+ '16',
+ '-r',
+ '16000',
+ '-t',
+ 'raw',
+ '-',
+ ]);
+
+ if (!this.isRecordingInternal) {
+ this.recProcess.kill('SIGTERM');
+ this.recProcess = null;
+ return;
+ }
+
+ this.recProcess.stdout.on('data', (data: Buffer) => {
+ this.emit('data', data);
+ });
+
+ this.recProcess.stderr.on('data', (_data: Buffer) => {
+        // rec may print warnings to stderr; they are intentionally ignored here
+        // (log `_data.toString()` when debugging capture problems).
+ });
+
+ this.recProcess.on('error', (err) => {
+ this.emit('error', err);
+ this.stop();
+ });
+
+ this.recProcess.on('close', () => {
+ this.stop();
+ });
+
+ this.emit('start');
+ } catch (err) {
+ this.isRecordingInternal = false;
+ throw err;
+ }
+ }
+
+ stop(): void {
+ if (!this.isRecordingInternal) return;
+ this.isRecordingInternal = false;
+
+ if (this.recProcess) {
+ this.recProcess.kill('SIGTERM');
+ this.recProcess = null;
+ }
+
+ this.emit('stop');
+ }
+}
diff --git a/packages/core/src/voice/geminiLiveTranscriptionProvider.ts b/packages/core/src/voice/geminiLiveTranscriptionProvider.ts
new file mode 100644
index 0000000000..4895a60e56
--- /dev/null
+++ b/packages/core/src/voice/geminiLiveTranscriptionProvider.ts
@@ -0,0 +1,178 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import WebSocket from 'ws';
+import { EventEmitter, once } from 'node:events';
+import { debugLogger } from '../utils/debugLogger.js';
+import type {
+ TranscriptionProvider,
+ TranscriptionEvents,
+} from './transcriptionProvider.js';
+
+import { z } from 'zod';
+
+const LiveAPIResponseSchema = z.object({
+ setupComplete: z.record(z.unknown()).optional(),
+ serverContent: z
+ .object({
+ turnComplete: z.boolean().optional(),
+ inputTranscription: z
+ .object({
+ text: z.string().optional(),
+ })
+ .optional(),
+ outputTranscription: z
+ .object({
+ text: z.string().optional(),
+ })
+ .optional(),
+ modelTurn: z
+ .object({
+ parts: z
+ .array(
+ z.object({
+ text: z.string().optional(),
+ inlineData: z
+ .object({
+ data: z.string(),
+ })
+ .optional(),
+ }),
+ )
+ .optional(),
+ })
+ .optional(),
+ })
+ .optional(),
+});
+
+/**
+ * Connects to the Gemini Live API using raw WebSockets to support API Key authentication.
+ */
+export class GeminiLiveTranscriptionProvider
+  extends EventEmitter<TranscriptionEvents>
+ implements TranscriptionProvider
+{
+ private ws: WebSocket | null = null;
+ private currentTranscription = '';
+
+ constructor(private readonly apiKey: string) {
+ super();
+ }
+
+  async connect(): Promise<void> {
+ const modelName = 'gemini-3.1-flash-live-preview';
+ const baseUrl =
+ 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
+
+ if (!this.apiKey) {
+ throw new Error('No API key provided');
+ }
+
+ // NOTE: The Generative Language WebSocket API requires the API key to be passed via the 'key' query parameter.
+ const url = `${baseUrl}?key=${this.apiKey}`;
+ debugLogger.debug(
+ `[GeminiLiveTranscription] Connecting to model ${modelName} via raw WebSocket with API Key...`,
+ );
+
+ try {
+ this.ws = new WebSocket(url, {
+ maxPayload: 1 << 20, // 1MB limit for safety
+ });
+
+ this.ws.on('message', (data) => {
+ try {
+ const parsedData: unknown = JSON.parse(data.toString());
+ const result = LiveAPIResponseSchema.safeParse(parsedData);
+
+ if (result.success) {
+ const response = result.data;
+ if (response.serverContent) {
+ const content = response.serverContent;
+
+ if (content.turnComplete) {
+ this.emit('turnComplete');
+ }
+
+ if (content.inputTranscription?.text) {
+ const text = content.inputTranscription.text;
+ debugLogger.debug(
+ `[GeminiLiveTranscription] Transcription received (Cloud): "${text}"`,
+ );
+ this.currentTranscription = text;
+ this.emit('transcription', this.currentTranscription);
+ }
+ }
+ }
+ } catch (e) {
+ debugLogger.error(
+ '[GeminiLiveTranscription] Error parsing message:',
+ e,
+ );
+ }
+ });
+
+ this.ws.on('error', (error) => {
+ debugLogger.error('[GeminiLiveTranscription] WebSocket Error:', error);
+ this.emit('error', error);
+ });
+
+ this.ws.on('close', (code, reason) => {
+ debugLogger.debug(
+ `[GeminiLiveTranscription] Connection Closed. Code: ${code}, Reason: ${reason}`,
+ );
+ this.emit('close');
+ this.ws = null;
+ });
+
+ await once(this.ws, 'open');
+
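+      // BidiGenerateContent setup: request the audio response modality and enable input transcription so the user's speech is returned as text events.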
+ const setupMessage = {
+ setup: {
+ model: `models/${modelName}`,
+ generation_config: {
+ response_modalities: ['audio'],
+ },
+ input_audio_transcription: {},
+ },
+ };
+
+ this.ws.send(JSON.stringify(setupMessage));
+ this.currentTranscription = '';
+ } catch (err) {
+ debugLogger.error(
+ '[GeminiLiveTranscription] Failed to establish connection:',
+ err,
+ );
+ throw err;
+ }
+ }
+
+ sendAudioChunk(chunk: Buffer): void {
+ if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
+
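+    // The Live API expects base64-encoded PCM with the sample rate declared in the MIME type.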
+ const audioMessage = {
+ realtime_input: {
+ audio: {
+ data: chunk.toString('base64'),
+ mime_type: 'audio/pcm;rate=16000',
+ },
+ },
+ };
+ this.ws.send(JSON.stringify(audioMessage));
+ }
+
+ getTranscription(): string {
+ return this.currentTranscription;
+ }
+
+ disconnect(): void {
+ if (this.ws) {
+ this.ws.close();
+ this.ws = null;
+ }
+ }
+}
diff --git a/packages/core/src/voice/transcriptionFactory.ts b/packages/core/src/voice/transcriptionFactory.ts
new file mode 100644
index 0000000000..ee0ac64700
--- /dev/null
+++ b/packages/core/src/voice/transcriptionFactory.ts
@@ -0,0 +1,41 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import * as path from 'node:path';
+import * as fs from 'node:fs';
+import { homedir, GEMINI_DIR } from '../utils/paths.js';
+import { GeminiLiveTranscriptionProvider } from './geminiLiveTranscriptionProvider.js';
+import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
+import type { TranscriptionProvider } from './transcriptionProvider.js';
+
+export class TranscriptionFactory {
+ static createProvider(
+ voiceConfig: { backend?: string; whisperModel?: string } | undefined,
+ apiKey: string,
+ ): TranscriptionProvider {
+ const backend = voiceConfig?.backend ?? 'gemini-live';
+
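+    // Local Whisper models live under the Gemini config directory (whisper_models/); create it on first use.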
+ if (backend === 'whisper') {
+ const modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
+ if (!fs.existsSync(modelsDir)) {
+ fs.mkdirSync(modelsDir, { recursive: true });
+ }
+
+ const modelName = voiceConfig?.whisperModel ?? 'ggml-base.en.bin';
+ const modelPath = path.join(modelsDir, modelName);
+
+ return new WhisperTranscriptionProvider({
+ modelPath,
+ threads: 4,
+ step: 0,
+ length: 5000,
+ });
+ }
+
+ // Default to Gemini Live
+ return new GeminiLiveTranscriptionProvider(apiKey);
+ }
+}
diff --git a/packages/core/src/voice/transcriptionProvider.ts b/packages/core/src/voice/transcriptionProvider.ts
new file mode 100644
index 0000000000..6b3bf20c0a
--- /dev/null
+++ b/packages/core/src/voice/transcriptionProvider.ts
@@ -0,0 +1,33 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import type { EventEmitter } from 'node:events';
+
+export interface TranscriptionEvents {
+ /** Emitted when partial or full transcription text is available. */
+ transcription: [string];
+ /** Emitted when a speaking turn is considered complete. */
+ turnComplete: [];
+ /** Emitted when an error occurs during transcription. */
+ error: [Error];
+ /** Emitted when the transcription service connection is closed. */
+ close: [];
+}
+
+/**
+ * Common interface for all transcription backends (Cloud or Local).
+ */
+export interface TranscriptionProvider
+  extends EventEmitter<TranscriptionEvents> {
+ /** Establish connection to the transcription service. */
+ connect(): Promise;
+ /** Send a chunk of raw audio data to the service. */
+ sendAudioChunk(chunk: Buffer): void;
+ /** Disconnect from the transcription service. */
+ disconnect(): void;
+ /** Get the current full transcription for the session. */
+ getTranscription(): string;
+}
diff --git a/packages/core/src/voice/whisperModelManager.ts b/packages/core/src/voice/whisperModelManager.ts
new file mode 100644
index 0000000000..64988eb48d
--- /dev/null
+++ b/packages/core/src/voice/whisperModelManager.ts
@@ -0,0 +1,107 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import * as path from 'node:path';
+import * as fs from 'node:fs';
+import { EventEmitter } from 'node:events';
+import { homedir, GEMINI_DIR } from '../utils/paths.js';
+import { debugLogger } from '../utils/debugLogger.js';
+
+export interface WhisperModelProgress {
+ modelName: string;
+ transferred: number;
+ total: number;
+ percentage: number;
+}
+
+export interface WhisperModelManagerEvents {
+ progress: [WhisperModelProgress];
+}
+
+const ALLOWED_MODELS = [
+ 'ggml-tiny.en.bin',
+ 'ggml-base.en.bin',
+ 'ggml-large-v3-turbo-q5_0.bin',
+ 'ggml-large-v3-turbo-q8_0.bin',
+];
+
+/**
+ * Manages Whisper models (checking existence, downloading).
+ */
+export class WhisperModelManager extends EventEmitter<WhisperModelManagerEvents> {
+ private readonly modelsDir: string;
+
+ constructor() {
+ super();
+ this.modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models');
+ }
+
+ isModelInstalled(modelName: string): boolean {
+ this.validateModelName(modelName);
+ return fs.existsSync(path.join(this.modelsDir, modelName));
+ }
+
+ getModelPath(modelName: string): string {
+ this.validateModelName(modelName);
+ return path.join(this.modelsDir, modelName);
+ }
+
+  async downloadModel(modelName: string): Promise<void> {
+ this.validateModelName(modelName);
+
+ if (!fs.existsSync(this.modelsDir)) {
+ fs.mkdirSync(this.modelsDir, { recursive: true });
+ }
+
+ const destination = path.join(this.modelsDir, modelName);
+ const url = `https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${modelName}`;
+
+ debugLogger.debug(
+ `[WhisperModelManager] Downloading ${modelName} from ${url}`,
+ );
+
+ const response = await fetch(url);
+ if (!response.ok) {
+ throw new Error(`Failed to download model: ${response.statusText}`);
+ }
+
+ const total = parseInt(response.headers.get('content-length') || '0', 10);
+ let transferred = 0;
+
+ const reader = response.body?.getReader();
+ if (!reader) {
+ throw new Error('Response body is not readable');
+ }
+
+ const writer = fs.createWriteStream(destination);
+
+ try {
+ while (true) {
+ const { done, value } = await reader.read();
+ if (done) break;
+
+ transferred += value.length;
+ writer.write(value);
+
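+        // 'percentage' is a 0..1 fraction; it remains 0 when the server omits content-length.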
+ const percentage = total > 0 ? transferred / total : 0;
+ this.emit('progress', {
+ modelName,
+ transferred,
+ total,
+ percentage,
+ });
+ }
+ } finally {
+ writer.end();
+ }
+ }
+
+ private validateModelName(modelName: string): void {
+ if (!ALLOWED_MODELS.includes(modelName)) {
+ throw new Error(`Unauthorized model name: ${modelName}`);
+ }
+ }
+}
diff --git a/packages/core/src/voice/whisperTranscriptionProvider.test.ts b/packages/core/src/voice/whisperTranscriptionProvider.test.ts
new file mode 100644
index 0000000000..69b48fbb02
--- /dev/null
+++ b/packages/core/src/voice/whisperTranscriptionProvider.test.ts
@@ -0,0 +1,31 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js';
+import commandExists from 'command-exists';
+
+vi.mock('command-exists', () => ({
+ default: vi.fn(),
+}));
+
+describe('WhisperTranscriptionProvider', () => {
+ beforeEach(() => {
+ vi.clearAllMocks();
+ });
+
+ it('should throw a friendly error if whisper-stream is not available', async () => {
+ vi.mocked(commandExists).mockRejectedValue(new Error('not found'));
+
+ const provider = new WhisperTranscriptionProvider({
+ modelPath: 'test-model.bin',
+ });
+
+ await expect(provider.connect()).rejects.toThrow(
+ 'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
+ );
+ });
+});
diff --git a/packages/core/src/voice/whisperTranscriptionProvider.ts b/packages/core/src/voice/whisperTranscriptionProvider.ts
new file mode 100644
index 0000000000..b5b871df83
--- /dev/null
+++ b/packages/core/src/voice/whisperTranscriptionProvider.ts
@@ -0,0 +1,199 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
+import { EventEmitter } from 'node:events';
+import commandExists from 'command-exists';
+import { debugLogger } from '../utils/debugLogger.js';
+import type {
+ TranscriptionProvider,
+ TranscriptionEvents,
+} from './transcriptionProvider.js';
+
+export interface WhisperProviderOptions {
+ modelPath: string;
+ threads?: number;
+ step?: number;
+ length?: number;
+}
+
+/**
+ * Local transcription provider using `whisper-stream` from whisper.cpp.
+ *
+ * Uses the Sliding Window Mode with VAD (--step 0) for stable,
+ * non-overlapping transcription blocks that can be appended directly.
+ */
+export class WhisperTranscriptionProvider
+  extends EventEmitter<TranscriptionEvents>
+ implements TranscriptionProvider
+{
+ private process: ChildProcessWithoutNullStreams | null = null;
+ private currentTranscription = '';
+
+ constructor(private readonly options: WhisperProviderOptions) {
+ super();
+ }
+
+ /**
+ * Checks if `whisper-stream` is available on the system.
+ */
+  static async isAvailable(): Promise<boolean> {
+ try {
+ await commandExists('whisper-stream');
+ return true;
+ } catch {
+ return false;
+ }
+ }
+
+  async connect(): Promise<void> {
+ const { modelPath, threads = 4, step = 0, length = 5000 } = this.options;
+
+ this.currentTranscription = '';
+
+ const available = await WhisperTranscriptionProvider.isAvailable();
+ if (!available) {
+ return Promise.reject(
+ new Error(
+ 'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).',
+ ),
+ );
+ }
+
+ debugLogger.debug(
+ `[WhisperTranscription] Starting whisper-stream with model: ${modelPath} (VAD mode: step=${step}, length=${length})`,
+ );
+
+    return new Promise<void>((resolve, reject) => {
+ let isResolved = false;
+
+ try {
+        // whisper-stream -m <model> -t <threads> --step 0 --length <length> -vth 0.6
+ // Setting step == 0 enables sliding window mode with VAD, which outputs
+ // non-overlapping transcription blocks suitable for appending.
+ this.process = spawn('whisper-stream', [
+ '-m',
+ modelPath,
+ '-t',
+ threads.toString(),
+ '--step',
+ step.toString(),
+ '--length',
+ length.toString(),
+ '-vth',
+ '0.6',
+ ]);
+
+ this.process.stdout.on('data', (data: Buffer) => {
+ const output = data.toString();
+ this.parseOutput(output);
+ });
+
+ this.process.stderr.on('data', (data: Buffer) => {
+ const msg = data.toString();
+ if (msg.includes('error')) {
+ debugLogger.error(`[WhisperTranscription] stderr: ${msg}`);
+ if (!isResolved) {
+ isResolved = true;
+ reject(new Error(msg));
+ }
+ }
+
+ // whisper-stream prints "whisper_init_from_file_with_params_no_state: loading model from..."
+ // and finally "main: processing, press Ctrl+C to stop" when ready.
+ if (!isResolved && msg.includes('main: processing')) {
+ debugLogger.debug('[WhisperTranscription] whisper-stream is ready');
+ isResolved = true;
+ resolve();
+ }
+ });
+
+ this.process.on('error', (err) => {
+ debugLogger.error('[WhisperTranscription] Process error:', err);
+ this.emit('error', err);
+ if (!isResolved) {
+ isResolved = true;
+ reject(err);
+ }
+ });
+
+ this.process.on('close', (code) => {
+ debugLogger.debug(
+ `[WhisperTranscription] Process closed with code ${code}`,
+ );
+ this.emit('close');
+ this.process = null;
+ });
+
+ // Fallback timeout in case "main: processing" is never seen
+ setTimeout(() => {
+ if (!isResolved) {
+ debugLogger.warn(
+ '[WhisperTranscription] Connection timeout (fallback resolve)',
+ );
+ isResolved = true;
+ resolve();
+ }
+ }, 10000);
+ } catch (err) {
+ debugLogger.error(
+ '[WhisperTranscription] Failed to spawn process:',
+ err,
+ );
+ if (!isResolved) {
+ isResolved = true;
+ reject(err);
+ }
+ }
+ });
+ }
+
+ private parseOutput(output: string): void {
+ // whisper-stream output format: "[00:00:00.000 --> 00:00:02.000] Hello world."
+ const lines = output.split('\n');
+
+ for (const line of lines) {
+ const match = line.match(/\[.* --> .*\]\s+(.*)/);
+ if (match && match[1]) {
+ let text = match[1].trim();
+
+ // Filter out [Silence], [music], (laughter), etc.
+ text = text
+ .replace(/\[[^\]]*\]/g, '')
+ .replace(/\([^)]*\)/g, '')
+ .trim();
+
+ if (text) {
+ // In VAD mode (step=0), each line is a completed speech block.
+ // Append it to the buffer to ensure it doesn't disappear.
+ this.currentTranscription = this.currentTranscription
+ ? `${this.currentTranscription} ${text}`
+ : text;
+
+ debugLogger.debug(
+ `[WhisperTranscription] Transcription updated (Local-VAD): "${this.currentTranscription}"`,
+ );
+ this.emit('transcription', this.currentTranscription);
+ }
+ }
+ }
+ }
+
+ sendAudioChunk(_chunk: Buffer): void {
+ // whisper-stream handles its own audio capture.
+ }
+
+ getTranscription(): string {
+ return this.currentTranscription;
+ }
+
+ disconnect(): void {
+ if (this.process) {
+ this.process.kill('SIGTERM');
+ this.process = null;
+ }
+ }
+}
diff --git a/schemas/settings.schema.json b/schemas/settings.schema.json
index f4263fcc3e..a883150d66 100644
--- a/schemas/settings.schema.json
+++ b/schemas/settings.schema.json
@@ -2910,6 +2910,59 @@
"default": false,
"type": "boolean"
},
+ "voiceMode": {
+ "title": "Voice Mode",
+ "description": "Enable experimental voice dictation and commands (/voice, /voice model).",
+ "markdownDescription": "Enable experimental voice dictation and commands (/voice, /voice model).\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `false`",
+ "default": false,
+ "type": "boolean"
+ },
+ "voice": {
+ "title": "Voice",
+ "description": "Settings for voice mode and transcription.",
+ "markdownDescription": "Settings for voice mode and transcription.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `{}`",
+ "default": {},
+ "type": "object",
+ "properties": {
+ "activationMode": {
+ "title": "Voice Activation Mode",
+ "description": "How to trigger voice recording with the Space key.",
+ "markdownDescription": "How to trigger voice recording with the Space key.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `push-to-talk`",
+ "default": "push-to-talk",
+ "type": "string",
+ "enum": ["push-to-talk", "toggle"]
+ },
+ "backend": {
+ "title": "Voice Transcription Backend",
+ "description": "The backend to use for voice transcription.",
+ "markdownDescription": "The backend to use for voice transcription.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `gemini-live`",
+ "default": "gemini-live",
+ "type": "string",
+ "enum": ["gemini-live", "whisper"]
+ },
+ "whisperModel": {
+ "title": "Whisper Model",
+ "description": "The Whisper model to use for local transcription.",
+ "markdownDescription": "The Whisper model to use for local transcription.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `ggml-base.en.bin`",
+ "default": "ggml-base.en.bin",
+ "type": "string",
+ "enum": [
+ "ggml-tiny.en.bin",
+ "ggml-base.en.bin",
+ "ggml-large-v3-turbo-q5_0.bin",
+ "ggml-large-v3-turbo-q8_0.bin"
+ ]
+ },
+ "stopGracePeriodMs": {
+ "title": "Voice Stop Grace Period (ms)",
+ "description": "How long to wait for final transcription after stopping recording.",
+ "markdownDescription": "How long to wait for final transcription after stopping recording.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `1000`",
+ "default": 1000,
+ "type": "number"
+ }
+ },
+ "additionalProperties": false
+ },
"adk": {
"title": "ADK",
"description": "Settings for the Agent Development Kit (ADK).",