From 2e0641c83b012042ccbc012d420cfe6a5d46fdd7 Mon Sep 17 00:00:00 2001 From: Abhijit Balaji Date: Fri, 24 Apr 2026 14:29:38 -0700 Subject: [PATCH] feat(voice): implement real-time voice mode with cloud and local backends (#24174) --- .gemini/settings.json | 5 +- Dockerfile | 4 +- docs/cli/settings.md | 33 +- docs/reference/configuration.md | 26 ++ docs/reference/keyboard-shortcuts.md | 1 + integration-tests/voice-mode.test.ts | 76 ++++ package-lock.json | 1 + packages/cli/src/config/config.ts | 1 + packages/cli/src/config/settingsSchema.ts | 81 ++++ .../src/services/BuiltinCommandLoader.test.ts | 2 + .../cli/src/services/BuiltinCommandLoader.ts | 2 + packages/cli/src/test-utils/render.tsx | 3 + packages/cli/src/ui/AppContainer.tsx | 24 + packages/cli/src/ui/commands/types.ts | 2 + packages/cli/src/ui/commands/voiceCommand.ts | 30 ++ .../cli/src/ui/components/DialogManager.tsx | 4 + .../src/ui/components/InputPrompt.test.tsx | 407 +++++++++++++++++ .../cli/src/ui/components/InputPrompt.tsx | 69 ++- .../src/ui/components/VoiceModelDialog.tsx | 236 ++++++++++ .../__snapshots__/InputPrompt.test.tsx.snap | 7 - .../cli/src/ui/contexts/UIActionsContext.tsx | 3 + .../cli/src/ui/contexts/UIStateContext.tsx | 2 + .../ui/hooks/slashCommandProcessor.test.tsx | 2 + .../cli/src/ui/hooks/slashCommandProcessor.ts | 6 + packages/cli/src/ui/hooks/useVoiceMode.ts | 429 ++++++++++++++++++ .../cli/src/ui/hooks/useVoiceModelCommand.ts | 31 ++ packages/cli/src/ui/key/keyBindings.ts | 11 +- .../src/ui/noninteractive/nonInteractiveUi.ts | 1 + packages/core/package.json | 1 + packages/core/src/config/config.ts | 7 + packages/core/src/index.ts | 9 + packages/core/src/utils/binaryCheck.ts | 14 + packages/core/src/voice/audioRecorder.ts | 115 +++++ .../voice/geminiLiveTranscriptionProvider.ts | 178 ++++++++ .../core/src/voice/transcriptionFactory.ts | 41 ++ .../core/src/voice/transcriptionProvider.ts | 33 ++ .../core/src/voice/whisperModelManager.ts | 107 +++++ .../whisperTranscriptionProvider.test.ts | 31 ++ .../src/voice/whisperTranscriptionProvider.ts | 199 ++++++++ schemas/settings.schema.json | 53 +++ 40 files changed, 2244 insertions(+), 43 deletions(-) create mode 100644 integration-tests/voice-mode.test.ts create mode 100644 packages/cli/src/ui/commands/voiceCommand.ts create mode 100644 packages/cli/src/ui/components/VoiceModelDialog.tsx create mode 100644 packages/cli/src/ui/hooks/useVoiceMode.ts create mode 100644 packages/cli/src/ui/hooks/useVoiceModelCommand.ts create mode 100644 packages/core/src/utils/binaryCheck.ts create mode 100644 packages/core/src/voice/audioRecorder.ts create mode 100644 packages/core/src/voice/geminiLiveTranscriptionProvider.ts create mode 100644 packages/core/src/voice/transcriptionFactory.ts create mode 100644 packages/core/src/voice/transcriptionProvider.ts create mode 100644 packages/core/src/voice/whisperModelManager.ts create mode 100644 packages/core/src/voice/whisperTranscriptionProvider.test.ts create mode 100644 packages/core/src/voice/whisperTranscriptionProvider.ts diff --git a/.gemini/settings.json b/.gemini/settings.json index 4ad7bc3ed6..e7ff785b7c 100644 --- a/.gemini/settings.json +++ b/.gemini/settings.json @@ -3,7 +3,10 @@ "extensionReloading": true, "modelSteering": true, "autoMemory": true, - "gemma": true + "gemma": true, + "memoryManager": true, + "topicUpdateNarration": true, + "voiceMode": true }, "general": { "devtools": true diff --git a/Dockerfile b/Dockerfile index 25d27d46c6..44ba343902 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,8 +40,8 @@ ENV 
PATH=$PATH:/usr/local/share/npm-global/bin USER node # install gemini-cli and clean up -COPY packages/cli/dist/google-gemini-cli-*.tgz /tmp/gemini-cli.tgz -COPY packages/core/dist/google-gemini-cli-core-*.tgz /tmp/gemini-core.tgz +COPY --chown=node:node packages/cli/dist/google-gemini-cli-*.tgz /tmp/gemini-cli.tgz +COPY --chown=node:node packages/core/dist/google-gemini-cli-core-*.tgz /tmp/gemini-core.tgz RUN npm install -g /tmp/gemini-core.tgz \ && npm install -g /tmp/gemini-cli.tgz \ && node -e "const fs=require('node:fs'); JSON.parse(fs.readFileSync('/usr/local/share/npm-global/lib/node_modules/@google/gemini-cli/package.json','utf8')); JSON.parse(fs.readFileSync('/usr/local/share/npm-global/lib/node_modules/@google/gemini-cli-core/package.json','utf8'));" \ diff --git a/docs/cli/settings.md b/docs/cli/settings.md index 10bfee644f..834750fdf9 100644 --- a/docs/cli/settings.md +++ b/docs/cli/settings.md @@ -161,20 +161,25 @@ they appear in the UI. ### Experimental -| UI Label | Setting | Description | Default | -| ---------------------------------------------------- | ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Gemma Models | `experimental.gemma` | Enable access to Gemma 4 models (experimental). | `false` | -| Enable Git Worktrees | `experimental.worktrees` | Enable automated Git worktree management for parallel work. | `false` | -| Use OSC 52 Paste | `experimental.useOSC52Paste` | Use OSC 52 for pasting. This may be more robust than the default system when using remote terminal sessions (if your terminal is configured to allow it). | `false` | -| Use OSC 52 Copy | `experimental.useOSC52Copy` | Use OSC 52 for copying. This may be more robust than the default system when using remote terminal sessions (if your terminal is configured to allow it). | `false` | -| Model Steering | `experimental.modelSteering` | Enable model steering (user hints) to guide the model during tool execution. | `false` | -| Direct Web Fetch | `experimental.directWebFetch` | Enable web fetch behavior that bypasses LLM summarization. | `false` | -| Enable Gemma Model Router | `experimental.gemmaModelRouter.enabled` | Enable the Gemma Model Router (experimental). Requires a local endpoint serving Gemma via the Gemini API using LiteRT-LM shim. | `false` | -| Auto-start LiteRT Server | `experimental.gemmaModelRouter.autoStartServer` | Automatically start the LiteRT-LM server when Gemini CLI starts and the Gemma router is enabled. | `false` | -| Memory v2 | `experimental.memoryV2` | Disable the built-in save_memory tool and let the main agent persist project context by editing markdown files directly with edit/write_file. 
Route facts across four tiers: team-shared conventions go to project GEMINI.md files, project-specific personal notes go to the per-project private memory folder (MEMORY.md as index + sibling .md files for detail), and cross-project personal preferences go to the global ~/.gemini/GEMINI.md (the only file under ~/.gemini/ that the agent can edit — settings, credentials, etc. remain off-limits). Set to false to fall back to the legacy save_memory tool. | `true` | -| Auto Memory | `experimental.autoMemory` | Automatically extract reusable skills from past sessions in the background. Review results with /memory inbox. | `false` | -| Use the generalist profile to manage agent contexts. | `experimental.generalistProfile` | Suitable for general coding and software development tasks. | `false` | -| Enable Context Management | `experimental.contextManagement` | Enable logic for context management. | `false` | +| UI Label | Setting | Description | Default | +| ---------------------------------------------------- | ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- | +| Gemma Models | `experimental.gemma` | Enable access to Gemma 4 models (experimental). | `false` | +| Voice Mode | `experimental.voiceMode` | Enable experimental voice dictation and commands (/voice, /voice model). | `false` | +| Voice Activation Mode | `experimental.voice.activationMode` | How to trigger voice recording with the Space key. | `"push-to-talk"` | +| Voice Transcription Backend | `experimental.voice.backend` | The backend to use for voice transcription. | `"gemini-live"` | +| Whisper Model | `experimental.voice.whisperModel` | The Whisper model to use for local transcription. | `"ggml-base.en.bin"` | +| Voice Stop Grace Period (ms) | `experimental.voice.stopGracePeriodMs` | How long to wait for final transcription after stopping recording. | `1000` | +| Enable Git Worktrees | `experimental.worktrees` | Enable automated Git worktree management for parallel work. | `false` | +| Use OSC 52 Paste | `experimental.useOSC52Paste` | Use OSC 52 for pasting. This may be more robust than the default system when using remote terminal sessions (if your terminal is configured to allow it). | `false` | +| Use OSC 52 Copy | `experimental.useOSC52Copy` | Use OSC 52 for copying. This may be more robust than the default system when using remote terminal sessions (if your terminal is configured to allow it). | `false` | +| Model Steering | `experimental.modelSteering` | Enable model steering (user hints) to guide the model during tool execution. | `false` | +| Direct Web Fetch | `experimental.directWebFetch` | Enable web fetch behavior that bypasses LLM summarization. | `false` | +| Enable Gemma Model Router | `experimental.gemmaModelRouter.enabled` | Enable the Gemma Model Router (experimental). Requires a local endpoint serving Gemma via the Gemini API using LiteRT-LM shim. 
| `false` | +| Auto-start LiteRT Server | `experimental.gemmaModelRouter.autoStartServer` | Automatically start the LiteRT-LM server when Gemini CLI starts and the Gemma router is enabled. | `false` | +| Memory v2 | `experimental.memoryV2` | Disable the built-in save_memory tool and let the main agent persist project context by editing markdown files directly with edit/write_file. Route facts across four tiers: team-shared conventions go to project GEMINI.md files, project-specific personal notes go to the per-project private memory folder (MEMORY.md as index + sibling .md files for detail), and cross-project personal preferences go to the global ~/.gemini/GEMINI.md (the only file under ~/.gemini/ that the agent can edit — settings, credentials, etc. remain off-limits). Set to false to fall back to the legacy save_memory tool. | `true` | +| Auto Memory | `experimental.autoMemory` | Automatically extract reusable skills from past sessions in the background. Review results with /memory inbox. | `false` | +| Use the generalist profile to manage agent contexts. | `experimental.generalistProfile` | Suitable for general coding and software development tasks. | `false` | +| Enable Context Management | `experimental.contextManagement` | Enable logic for context management. | `false` | ### Skills diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index b2d8955d5f..94a64cc197 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -1691,6 +1691,32 @@ their corresponding top-level category object in your `settings.json` file. - **Default:** `false` - **Requires restart:** Yes +- **`experimental.voiceMode`** (boolean): + - **Description:** Enable experimental voice dictation and commands (/voice, + /voice model). + - **Default:** `false` + +- **`experimental.voice.activationMode`** (enum): + - **Description:** How to trigger voice recording with the Space key. + - **Default:** `"push-to-talk"` + - **Values:** `"push-to-talk"`, `"toggle"` + +- **`experimental.voice.backend`** (enum): + - **Description:** The backend to use for voice transcription. + - **Default:** `"gemini-live"` + - **Values:** `"gemini-live"`, `"whisper"` + +- **`experimental.voice.whisperModel`** (enum): + - **Description:** The Whisper model to use for local transcription. + - **Default:** `"ggml-base.en.bin"` + - **Values:** `"ggml-tiny.en.bin"`, `"ggml-base.en.bin"`, + `"ggml-large-v3-turbo-q5_0.bin"`, `"ggml-large-v3-turbo-q8_0.bin"` + +- **`experimental.voice.stopGracePeriodMs`** (number): + - **Description:** How long to wait for final transcription after stopping + recording. + - **Default:** `1000` + - **`experimental.adk.agentSessionNoninteractiveEnabled`** (boolean): - **Description:** Enable non-interactive agent sessions. - **Default:** `false` diff --git a/docs/reference/keyboard-shortcuts.md b/docs/reference/keyboard-shortcuts.md index 98d31c0ae2..6f7a8cce4a 100644 --- a/docs/reference/keyboard-shortcuts.md +++ b/docs/reference/keyboard-shortcuts.md @@ -115,6 +115,7 @@ available combinations. | `app.restart` | Restart the application. | `R`
`Shift+R` | | `app.suspend` | Suspend the CLI and move it to the background. | `Ctrl+Z` | | `app.showShellUnfocusWarning` | Show warning when trying to move focus away from shell input. | `Tab` | +| `app.voiceModePTT` | Hold to speak in Voice Mode. | `Space` | #### Background Shell Controls diff --git a/integration-tests/voice-mode.test.ts b/integration-tests/voice-mode.test.ts new file mode 100644 index 0000000000..49844494a8 --- /dev/null +++ b/integration-tests/voice-mode.test.ts @@ -0,0 +1,76 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { TestRig } from './test-helper.js'; +import { + WhisperModelManager, + WhisperTranscriptionProvider, +} from '@google/gemini-cli-core'; +import * as fs from 'node:fs'; +import commandExists from 'command-exists'; + +describe('Voice Mode Integration', () => { + let rig: TestRig; + + beforeEach(() => { + rig = new TestRig(); + }); + + afterEach(async () => await rig.cleanup()); + + it('should be able to download tiny whisper model', async () => { + // This test doesn't require the binary, only network access. + // However, it's slow and downloads 75MB. We'll keep it for now but + // wrap it in a try-catch to avoid failing on network flakiness in CI. + const manager = new WhisperModelManager(); + const modelName = 'ggml-tiny.en.bin'; + + try { + // Cleanup if already exists to ensure we actually test download + const modelPath = manager.getModelPath(modelName); + if (fs.existsSync(modelPath)) { + fs.unlinkSync(modelPath); + } + + await manager.downloadModel(modelName); + expect(fs.existsSync(modelPath)).toBe(true); + expect(fs.statSync(modelPath).size).toBeGreaterThan(70 * 1024 * 1024); // ~75MB + } catch (e) { + console.warn( + 'Skipping whisper model download test due to error (possibly network):', + e, + ); + } + }, 300000); // 5 min timeout for download + + it('should initialize WhisperTranscriptionProvider and handle process', async () => { + // Skip this test if whisper-stream is not installed (typical for CI) + try { + await commandExists('whisper-stream'); + } catch { + console.log( + 'Skipping Whisper transcription test: whisper-stream not found', + ); + return; + } + + const manager = new WhisperModelManager(); + const modelName = 'ggml-tiny.en.bin'; + if (!manager.isModelInstalled(modelName)) { + await manager.downloadModel(modelName); + } + + const provider = new WhisperTranscriptionProvider({ + modelPath: manager.getModelPath(modelName), + }); + + // Since we can't easily provide real mic input in CI, + // we just verify it can start and be disconnected. + await provider.connect(); + provider.disconnect(); + }); +}); diff --git a/package-lock.json b/package-lock.json index 89a358ef9e..d3b8904735 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18390,6 +18390,7 @@ "ajv-formats": "^3.0.0", "chardet": "^2.1.0", "chokidar": "^5.0.0", + "command-exists": "^1.2.9", "diff": "^8.0.3", "dotenv": "^17.2.4", "dotenv-expand": "^12.0.3", diff --git a/packages/cli/src/config/config.ts b/packages/cli/src/config/config.ts index 6b99a3606d..1ce93cf9ff 100755 --- a/packages/cli/src/config/config.ts +++ b/packages/cli/src/config/config.ts @@ -1000,6 +1000,7 @@ export async function loadCliConfig( enableExtensionReloading: settings.experimental?.extensionReloading, enableAgents: settings.experimental?.enableAgents, plan: settings.general?.plan?.enabled ?? 
true, + voiceMode: settings.experimental?.voiceMode, tracker: settings.experimental?.taskTracker, directWebFetch: settings.experimental?.directWebFetch, planSettings: settings.general?.plan?.directory diff --git a/packages/cli/src/config/settingsSchema.ts b/packages/cli/src/config/settingsSchema.ts index 2b6c959397..08edbd66f9 100644 --- a/packages/cli/src/config/settingsSchema.ts +++ b/packages/cli/src/config/settingsSchema.ts @@ -2061,6 +2061,87 @@ const SETTINGS_SCHEMA = { description: 'Enable access to Gemma 4 models (experimental).', showInDialog: true, }, + voiceMode: { + type: 'boolean', + label: 'Voice Mode', + category: 'Experimental', + requiresRestart: false, + default: false, + description: + 'Enable experimental voice dictation and commands (/voice, /voice model).', + showInDialog: true, + }, + voice: { + type: 'object', + label: 'Voice', + category: 'Experimental', + requiresRestart: false, + default: {}, + description: 'Settings for voice mode and transcription.', + showInDialog: false, + properties: { + activationMode: { + type: 'enum', + label: 'Voice Activation Mode', + category: 'Experimental', + requiresRestart: false, + default: 'push-to-talk', + description: 'How to trigger voice recording with the Space key.', + showInDialog: true, + options: [ + { value: 'push-to-talk', label: 'Push-To-Talk (Hold Space)' }, + { value: 'toggle', label: 'Toggle (Press Space to start/stop)' }, + ], + }, + backend: { + type: 'enum', + label: 'Voice Transcription Backend', + category: 'Experimental', + requiresRestart: false, + default: 'gemini-live', + description: 'The backend to use for voice transcription.', + showInDialog: true, + options: [ + { value: 'gemini-live', label: 'Gemini Live API (Cloud)' }, + { value: 'whisper', label: 'Whisper (Local)' }, + ], + }, + whisperModel: { + type: 'enum', + label: 'Whisper Model', + category: 'Experimental', + requiresRestart: false, + default: 'ggml-base.en.bin', + description: 'The Whisper model to use for local transcription.', + showInDialog: true, + options: [ + { value: 'ggml-tiny.en.bin', label: 'Tiny (EN) - Fast (~75MB)' }, + { + value: 'ggml-base.en.bin', + label: 'Base (EN) - Balanced (~142MB)', + }, + { + value: 'ggml-large-v3-turbo-q5_0.bin', + label: 'Large v3 Turbo (Q5_0) - High Accuracy (~547MB)', + }, + { + value: 'ggml-large-v3-turbo-q8_0.bin', + label: 'Large v3 Turbo (Q8_0) - Max Accuracy (~834MB)', + }, + ], + }, + stopGracePeriodMs: { + type: 'number', + label: 'Voice Stop Grace Period (ms)', + category: 'Experimental', + requiresRestart: false, + default: 1000, + description: + 'How long to wait for final transcription after stopping recording.', + showInDialog: true, + }, + }, + }, adk: { type: 'object', label: 'ADK', diff --git a/packages/cli/src/services/BuiltinCommandLoader.test.ts b/packages/cli/src/services/BuiltinCommandLoader.test.ts index f166c161cd..d53273134c 100644 --- a/packages/cli/src/services/BuiltinCommandLoader.test.ts +++ b/packages/cli/src/services/BuiltinCommandLoader.test.ts @@ -170,6 +170,7 @@ describe('BuiltinCommandLoader', () => { getAllSkills: vi.fn().mockReturnValue([]), isAdminEnabled: vi.fn().mockReturnValue(true), }), + isVoiceModeEnabled: vi.fn().mockReturnValue(true), getContentGeneratorConfig: vi.fn().mockReturnValue({ authType: 'other', }), @@ -396,6 +397,7 @@ describe('BuiltinCommandLoader profile', () => { getAllSkills: vi.fn().mockReturnValue([]), isAdminEnabled: vi.fn().mockReturnValue(true), }), + isVoiceModeEnabled: vi.fn().mockReturnValue(true), getContentGeneratorConfig: 
vi.fn().mockReturnValue({ authType: 'other', }), diff --git a/packages/cli/src/services/BuiltinCommandLoader.ts b/packages/cli/src/services/BuiltinCommandLoader.ts index 94b5986eb3..1c5288707c 100644 --- a/packages/cli/src/services/BuiltinCommandLoader.ts +++ b/packages/cli/src/services/BuiltinCommandLoader.ts @@ -62,6 +62,7 @@ import { setupGithubCommand } from '../ui/commands/setupGithubCommand.js'; import { terminalSetupCommand } from '../ui/commands/terminalSetupCommand.js'; import { upgradeCommand } from '../ui/commands/upgradeCommand.js'; import { gemmaStatusCommand } from '../ui/commands/gemmaStatusCommand.js'; +import { voiceCommand } from '../ui/commands/voiceCommand.js'; /** * Loads the core, hard-coded slash commands that are an integral part @@ -227,6 +228,7 @@ export class BuiltinCommandLoader implements ICommandLoader { vimCommand, setupGithubCommand, terminalSetupCommand, + ...(this.config?.isVoiceModeEnabled() ? [voiceCommand] : []), ...(this.config?.getContentGeneratorConfig()?.authType === AuthType.LOGIN_WITH_GOOGLE ? [upgradeCommand] diff --git a/packages/cli/src/test-utils/render.tsx b/packages/cli/src/test-utils/render.tsx index a9f786f11c..83e69d6663 100644 --- a/packages/cli/src/test-utils/render.tsx +++ b/packages/cli/src/test-utils/render.tsx @@ -552,6 +552,8 @@ const mockUIActions: UIActions = { exitPrivacyNotice: vi.fn(), closeSettingsDialog: vi.fn(), closeModelDialog: vi.fn(), + openVoiceModelDialog: vi.fn(), + closeVoiceModelDialog: vi.fn(), openAgentConfigDialog: vi.fn(), closeAgentConfigDialog: vi.fn(), openPermissionsDialog: vi.fn(), @@ -598,6 +600,7 @@ const mockUIActions: UIActions = { handleNewAgentsSelect: vi.fn(), getPreferredEditor: vi.fn(), clearAccountSuspension: vi.fn(), + setVoiceModeEnabled: vi.fn(), }; import { type TextBuffer } from '../ui/components/shared/text-buffer.js'; diff --git a/packages/cli/src/ui/AppContainer.tsx b/packages/cli/src/ui/AppContainer.tsx index fdbaf57fbe..f5294ae23e 100644 --- a/packages/cli/src/ui/AppContainer.tsx +++ b/packages/cli/src/ui/AppContainer.tsx @@ -103,6 +103,7 @@ import { useQuotaAndFallback } from './hooks/useQuotaAndFallback.js'; import { useEditorSettings } from './hooks/useEditorSettings.js'; import { useSettingsCommand } from './hooks/useSettingsCommand.js'; import { useModelCommand } from './hooks/useModelCommand.js'; +import { useVoiceModelCommand } from './hooks/useVoiceModelCommand.js'; import { useSlashCommandProcessor } from './hooks/slashCommandProcessor.js'; import { useVimMode } from './contexts/VimModeContext.js'; import { @@ -312,6 +313,7 @@ export const AppContainer = (props: AppContainerProps) => { ); const [shellModeActive, setShellModeActive] = useState(false); + const [isVoiceModeEnabled, setVoiceModeEnabled] = useState(false); const [modelSwitchedFromQuotaError, setModelSwitchedFromQuotaError] = useState(false); const [historyRemountKey, setHistoryRemountKey] = useState(0); @@ -946,6 +948,12 @@ Logging in with Google... Restarting Gemini CLI to continue. const { isModelDialogOpen, openModelDialog, closeModelDialog } = useModelCommand(); + const { + isVoiceModelDialogOpen, + openVoiceModelDialog, + closeVoiceModelDialog, + } = useVoiceModelCommand(); + const { toggleVimEnabled } = useVimMode(); const setIsBackgroundTaskListOpenRef = useRef<(open: boolean) => void>( @@ -969,6 +977,7 @@ Logging in with Google... Restarting Gemini CLI to continue. 
openSettingsDialog, openSessionBrowser, openModelDialog, + openVoiceModelDialog, openAgentConfigDialog, openPermissionsDialog, quit: (messages: HistoryItem[]) => { @@ -981,6 +990,7 @@ Logging in with Google... Restarting Gemini CLI to continue. }, setDebugMessage, toggleCorgiMode: () => setCorgiMode((prev) => !prev), + toggleVoiceMode: () => setVoiceModeEnabled((prev) => !prev), toggleDebugProfiler, dispatchExtensionStateUpdate, addConfirmUpdateExtensionRequest, @@ -1006,6 +1016,7 @@ Logging in with Google... Restarting Gemini CLI to continue. openSettingsDialog, openSessionBrowser, openModelDialog, + openVoiceModelDialog, openAgentConfigDialog, setQuittingMessages, setDebugMessage, @@ -2191,6 +2202,7 @@ Logging in with Google... Restarting Gemini CLI to continue. isThemeDialogOpen || isSettingsDialogOpen || isModelDialogOpen || + isVoiceModelDialogOpen || isAgentConfigDialogOpen || isPermissionsDialogOpen || isAuthenticating || @@ -2448,6 +2460,7 @@ Logging in with Google... Restarting Gemini CLI to continue. isSettingsDialogOpen, isSessionBrowserOpen, isModelDialogOpen, + isVoiceModelDialogOpen, isAgentConfigDialogOpen, selectedAgentName, selectedAgentDisplayName, @@ -2468,6 +2481,7 @@ Logging in with Google... Restarting Gemini CLI to continue. pendingGeminiHistoryItems, thought, isInputActive, + isVoiceModeEnabled, isResuming, shouldShowIdePrompt, isFolderTrustDialogOpen: isFolderTrustDialogOpen ?? false, @@ -2559,6 +2573,7 @@ Logging in with Google... Restarting Gemini CLI to continue. isSettingsDialogOpen, isSessionBrowserOpen, isModelDialogOpen, + isVoiceModelDialogOpen, isAgentConfigDialogOpen, selectedAgentName, selectedAgentDisplayName, @@ -2579,6 +2594,7 @@ Logging in with Google... Restarting Gemini CLI to continue. pendingGeminiHistoryItems, thought, isInputActive, + isVoiceModeEnabled, isResuming, shouldShowIdePrompt, isFolderTrustDialogOpen, @@ -2671,6 +2687,8 @@ Logging in with Google... Restarting Gemini CLI to continue. exitPrivacyNotice, closeSettingsDialog, closeModelDialog, + openVoiceModelDialog, + closeVoiceModelDialog, openAgentConfigDialog, closeAgentConfigDialog, openPermissionsDialog, @@ -2751,6 +2769,9 @@ Logging in with Google... Restarting Gemini CLI to continue. setAccountSuspensionInfo(null); setAuthState(AuthState.Updating); }, + setVoiceModeEnabled: (value: boolean) => { + setVoiceModeEnabled(value); + }, }), [ handleThemeSelect, @@ -2764,6 +2785,8 @@ Logging in with Google... Restarting Gemini CLI to continue. exitPrivacyNotice, closeSettingsDialog, closeModelDialog, + openVoiceModelDialog, + closeVoiceModelDialog, openAgentConfigDialog, closeAgentConfigDialog, openPermissionsDialog, @@ -2807,6 +2830,7 @@ Logging in with Google... Restarting Gemini CLI to continue. config, historyManager, getPreferredEditor, + setVoiceModeEnabled, ], ); diff --git a/packages/cli/src/ui/commands/types.ts b/packages/cli/src/ui/commands/types.ts index 466e70c994..f0805f4426 100644 --- a/packages/cli/src/ui/commands/types.ts +++ b/packages/cli/src/ui/commands/types.ts @@ -72,6 +72,7 @@ export interface CommandContext { loadHistory: (history: HistoryItem[], postLoadInput?: string) => void; /** Toggles a special display mode. 
*/ toggleCorgiMode: () => void; + toggleVoiceMode: () => void; toggleDebugProfiler: () => void; toggleVimEnabled: () => Promise; reloadCommands: () => void; @@ -125,6 +126,7 @@ export interface OpenDialogActionReturn { | 'settings' | 'sessionBrowser' | 'model' + | 'voice-model' | 'agentConfig' | 'permissions'; } diff --git a/packages/cli/src/ui/commands/voiceCommand.ts b/packages/cli/src/ui/commands/voiceCommand.ts new file mode 100644 index 0000000000..b9df28ca27 --- /dev/null +++ b/packages/cli/src/ui/commands/voiceCommand.ts @@ -0,0 +1,30 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { CommandKind, type SlashCommand } from './types.js'; + +export const voiceCommand: SlashCommand = { + name: 'voice', + altNames: [], + description: 'Toggle voice dictation mode', + kind: CommandKind.BUILT_IN, + autoExecute: true, + action: (context) => { + context.ui.toggleVoiceMode(); + }, + subCommands: [ + { + name: 'model', + description: 'Manage voice transcription models', + kind: CommandKind.BUILT_IN, + autoExecute: true, + action: async () => ({ + type: 'dialog', + dialog: 'voice-model', + }), + }, + ], +}; diff --git a/packages/cli/src/ui/components/DialogManager.tsx b/packages/cli/src/ui/components/DialogManager.tsx index b231a62db5..40f0b06138 100644 --- a/packages/cli/src/ui/components/DialogManager.tsx +++ b/packages/cli/src/ui/components/DialogManager.tsx @@ -25,6 +25,7 @@ import { relaunchApp } from '../../utils/processUtils.js'; import { SessionBrowser } from './SessionBrowser.js'; import { PermissionsModifyTrustDialog } from './PermissionsModifyTrustDialog.js'; import { ModelDialog } from './ModelDialog.js'; +import { VoiceModelDialog } from './VoiceModelDialog.js'; import { theme } from '../semantic-colors.js'; import { useUIState } from '../contexts/UIStateContext.js'; import { useQuotaState } from '../contexts/QuotaContext.js'; @@ -238,6 +239,9 @@ export const DialogManager = ({ if (uiState.isModelDialogOpen) { return ; } + if (uiState.isVoiceModelDialogOpen) { + return ; + } if ( uiState.isAgentConfigDialogOpen && uiState.selectedAgentName && diff --git a/packages/cli/src/ui/components/InputPrompt.test.tsx b/packages/cli/src/ui/components/InputPrompt.test.tsx index e50a2f1d81..5be237a15f 100644 --- a/packages/cli/src/ui/components/InputPrompt.test.tsx +++ b/packages/cli/src/ui/components/InputPrompt.test.tsx @@ -9,12 +9,41 @@ import { createMockSettings } from '../../test-utils/settings.js'; import { makeFakeConfig } from '@google/gemini-cli-core'; import { waitFor } from '../../test-utils/async.js'; import { act, useState, useMemo } from 'react'; +import type { EventEmitter } from 'node:events'; + +const { fakeTranscriptionProvider } = vi.hoisted(() => { + // Use require within hoisted block for immediate synchronous access + // eslint-disable-next-line @typescript-eslint/no-require-imports, no-restricted-syntax + const { EventEmitter } = require('node:events'); + class FakeTranscriptionProvider extends EventEmitter { + connect = vi.fn().mockResolvedValue(undefined); + disconnect = vi.fn(); + sendAudioChunk = vi.fn(); + getTranscription = vi.fn().mockReturnValue(''); + } + return { + fakeTranscriptionProvider: new FakeTranscriptionProvider(), + }; +}); + +vi.mock('@google/gemini-cli-core', async (importOriginal) => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const actual = (await importOriginal()) as any; + return { + ...actual, + TranscriptionFactory: { + createProvider: vi.fn(() => 
fakeTranscriptionProvider),
+    },
+  };
+});
+
 import {
   InputPrompt,
   tryTogglePasteExpansion,
   type InputPromptProps,
 } from './InputPrompt.js';
 import { InputContext } from '../contexts/InputContext.js';
+import { type UIState } from '../contexts/UIStateContext.js';
 import {
   calculateTransformationsForLine,
   calculateTransformedLine,
@@ -417,6 +446,7 @@
       getWorkspaceContext: () => ({
         getDirectories: () => ['/test/project/src'],
       }),
+      getContentGeneratorConfig: () => ({ apiKey: 'test-api-key' }),
     } as unknown as Config,
     slashCommands: mockSlashCommands,
     commandContext: mockCommandContext,
@@ -4925,6 +4955,383 @@
       },
     );
   });
+
+  describe('Voice Mode', () => {
+    beforeEach(() => {
+      (
+        fakeTranscriptionProvider as unknown as EventEmitter
+      ).removeAllListeners();
+      vi.clearAllMocks();
+    });
+
+    it('should start recording when space is pressed and voice mode is enabled (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('');
+      });
+      const { stdin, unmount, lastFrame } = await renderWithProviders(
+        <InputPrompt {...props} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Initially not recording
+      expect(lastFrame()).not.toContain('🎙️ Listening...');
+      expect(lastFrame()).toContain(
+        'Voice mode: Space to start/stop recording',
+      );
+
+      // Press space to start
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      // Now should show listening
+      await waitFor(() => {
+        expect(lastFrame()).toContain('🎙️ Listening...');
+      });
+
+      unmount();
+    });
+
+    it('should toggle recording off when space is pressed again (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('');
+      });
+      const { stdin, unmount, lastFrame } = await renderWithProviders(
+        <InputPrompt {...props} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Start recording
+      await act(async () => {
+        stdin.write(' ');
+      });
+      await waitFor(() => {
+        expect(lastFrame()).toContain('🎙️ Listening...');
+      });
+
+      // Stop recording
+      await act(async () => {
+        stdin.write(' ');
+      });
+      await waitFor(() => {
+        expect(lastFrame()).not.toContain('🎙️ Listening...');
+        expect(lastFrame()).toContain(
+          'Voice mode: Space to start/stop recording',
+        );
+      });
+
+      unmount();
+    });
+
+    it('should resume recording when space is pressed even if buffer is not empty (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('some existing text');
+      });
+      const { stdin, unmount, lastFrame } = await renderWithProviders(
+        <InputPrompt {...props} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Should show voice mode hint even if buffer is not empty (new behavior)
+      expect(lastFrame()).toContain(
+        'Voice mode: Space to start/stop recording',
+      );
+      expect(lastFrame()).toContain('some existing text');
+
+      // Press space to start recording again
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      await waitFor(() => {
+        expect(lastFrame()).toContain('🎙️ Listening...');
+      });
+
+      unmount();
+    });
+
+    it('should not start recording if voice mode is disabled (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('');
+      });
+      const { stdin, unmount, lastFrame } = await renderWithProviders(
+        <InputPrompt {...props} />,
+        {
+          uiState: { isVoiceModeEnabled: false } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Press space
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      // Should NOT show listening, instead should call handleInput which handles space
+      expect(lastFrame()).not.toContain('🎙️ Listening...');
+      expect(mockBuffer.handleInput).toHaveBeenCalled();
+      unmount();
+    });
+
+    it('should append transcription correctly across multiple turn updates (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('initial');
+      });
+      const { stdin, unmount } = await renderWithProviders(
+        <InputPrompt {...props} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Start recording
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      // Emit first transcription
+      await act(async () => {
+        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+          'transcription',
+          'hello',
+        );
+      });
+      await waitFor(() => {
+        expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
+      });
+
+      // Emit turnComplete (Gemini Live starts over after this)
+      await act(async () => {
+        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+          'turnComplete',
+        );
+      });
+
+      // Emit second part (Gemini Live sends new turn text starting from empty)
+      await act(async () => {
+        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+          'transcription',
+          'world',
+        );
+      });
+      await waitFor(() => {
+        // Should have appended 'world' to the baseline 'initial hello'
+        expect(mockBuffer.setText).toHaveBeenCalledWith(
+          'initial hello world',
+          'end',
+        );
+      });
+
+      unmount();
+    });
+
+    it('should append transcription correctly when resuming voice mode (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('First turn.');
+      });
+      const { stdin, unmount } = await renderWithProviders(
+        <InputPrompt {...props} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Start recording (resumed)
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      // Emit transcription
+      await act(async () => {
+        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+          'transcription',
+          'Second turn.',
+        );
+      });
+
+      await waitFor(() => {
+        expect(mockBuffer.setText).toHaveBeenCalledWith(
+          'First turn. Second turn.',
+          'end',
+        );
+      });
+
+      unmount();
+    });
+
+    describe('push-to-talk', () => {
+      beforeEach(() => {
+        vi.useFakeTimers();
+      });
+
+      afterEach(() => {
+        vi.useRealTimers();
+      });
+
+      it('should insert a space on a single tap', async () => {
+        const { stdin, unmount, lastFrame } = await renderWithProviders(
+          <InputPrompt {...props} />,
+          {
+            uiState: { isVoiceModeEnabled: true } as UIState,
+            settings: createMockSettings({
+              experimental: { voice: { activationMode: 'push-to-talk' } },
+            }),
+          },
+        );
+
+        expect(lastFrame()).toContain('Voice mode: Hold Space to record');
+
+        // Press space once
+        await act(async () => {
+          stdin.write(' ');
+        });
+
+        // Should insert space optimistically
+        expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
+        expect(lastFrame()).not.toContain('🎙️ Listening...');
+
+        // Advance timer past HOLD_DELAY_MS
+        await act(async () => {
+          vi.advanceTimersByTime(700);
+        });
+
+        expect(lastFrame()).not.toContain('🎙️ Listening...');
+        unmount();
+      });
+
+      it('should start recording on hold (simulated by repeat spaces)', async () => {
+        const { stdin, unmount, lastFrame } = await renderWithProviders(
+          <InputPrompt {...props} />,
+          {
+            uiState: { isVoiceModeEnabled: true } as UIState,
+            settings: createMockSettings({
+              experimental: { voice: { activationMode: 'push-to-talk' } },
+            }),
+          },
+        );
+
+        // First space
+        await act(async () => {
+          stdin.write(' ');
+        });
+        expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
+
+        // Second space (repeat)
+        await act(async () => {
+          stdin.write(' ');
+        });
+
+        await waitFor(() => {
+          // Should have backspaced the optimistic space
+          expect(mockBuffer.backspace).toHaveBeenCalled();
+          // Should show listening
+          expect(lastFrame()).toContain('🎙️ Listening...');
+        });
+
+        unmount();
+      });
+
+      it('should stop recording when space heartbeat stops (release)', async () => {
+        const { stdin, unmount, lastFrame } = await renderWithProviders(
+          <InputPrompt {...props} />,
+          {
+            uiState: { isVoiceModeEnabled: true } as UIState,
+            settings: createMockSettings({
+              experimental: { voice: { activationMode: 'push-to-talk' } },
+            }),
+          },
+        );
+
+        // Start hold
+        await act(async () => {
+          stdin.write(' ');
+          stdin.write(' ');
+        });
+
+        // Use a short interval in waitFor to prevent advancing fake timers past the 300ms RELEASE_DELAY_MS
+        await waitFor(
+          () => {
+            expect(lastFrame()).toContain('🎙️ Listening...');
+          },
+          { interval: 10 },
+        );
+
+        // Simulate heartbeat (held key) - send space first to reset timer, then advance
+        await act(async () => {
+          stdin.write(' ');
+          vi.advanceTimersByTime(100);
+        });
+        expect(lastFrame()).toContain('🎙️ Listening...');
+
+        // Stop heartbeat (release)
+        await act(async () => {
+          vi.advanceTimersByTime(400); // Past RELEASE_DELAY_MS
+        });
+
+        await waitFor(() => {
+          expect(lastFrame()).not.toContain('🎙️ Listening...');
+        });
+
+        unmount();
+      });
+
+      it('should cancel hold state if non-space key is pressed after first space', async () => {
+        const { stdin, unmount } = await renderWithProviders(
+          <InputPrompt {...props} />,
+          {
+            uiState: { isVoiceModeEnabled: true } as UIState,
+            settings: createMockSettings({
+              experimental: { voice: { activationMode: 'push-to-talk' } },
+            }),
+          },
+        );
+
+        // First space
+        await act(async () => {
+          stdin.write(' ');
+        });
+
+        // Type 'a'
+        await act(async () => {
+          stdin.write('a');
+        });
+
+        // Should NOT start recording on next space even if fast
+        await act(async () => {
+          stdin.write(' ');
+        });
+
+        expect(mockBuffer.insert).toHaveBeenCalledTimes(2); // Two spaces inserted
+        expect(mockBuffer.handleInput).toHaveBeenCalledWith(
+          expect.objectContaining({ name:
'a' }), + ); + unmount(); + }); + }); + }); }); function clean(str: string | undefined): string { diff --git a/packages/cli/src/ui/components/InputPrompt.tsx b/packages/cli/src/ui/components/InputPrompt.tsx index c9f75c740b..f69138c8c7 100644 --- a/packages/cli/src/ui/components/InputPrompt.tsx +++ b/packages/cli/src/ui/components/InputPrompt.tsx @@ -56,6 +56,7 @@ import { debugLogger, type Config, } from '@google/gemini-cli-core'; +import { useVoiceMode } from '../hooks/useVoiceMode.js'; import { parseInputForHighlighting, parseSegmentsFromTokens, @@ -159,7 +160,6 @@ export function isLargePaste(text: string): boolean { } const DOUBLE_TAB_CLEAN_UI_TOGGLE_WINDOW_MS = 350; - /** * Attempt to toggle expansion of a paste placeholder in the buffer. * Returns true if a toggle action was performed or hint was shown, false otherwise. @@ -238,6 +238,7 @@ export const InputPrompt: React.FC = ({ setEmbeddedShellFocused, setShortcutsHelpVisible, toggleCleanUiDetailsVisible, + setVoiceModeEnabled, } = useUIActions(); const { terminalWidth, @@ -246,6 +247,7 @@ export const InputPrompt: React.FC = ({ backgroundTasks, backgroundTaskHeight, shortcutsHelpVisible, + isVoiceModeEnabled, } = useUIState(); const [suppressCompletion, setSuppressCompletion] = useState(false); const { handlePress: registerPlainTabPress, resetCount: resetPlainTabPress } = @@ -263,6 +265,7 @@ export const InputPrompt: React.FC = ({ resetEscapeState(); if (buffer.text.length > 0) { buffer.setText(''); + resetTurnBaseline(); resetCompletionState(); } else if (history.length > 0) { onSubmit('/rewind'); @@ -281,6 +284,16 @@ export const InputPrompt: React.FC = ({ const hasUserNavigatedSuggestions = useRef(false); const listRef = useRef>(null); + const { isRecording, handleVoiceInput, resetTurnBaseline } = useVoiceMode({ + buffer, + config, + settings, + setQueueErrorMessage, + isVoiceModeEnabled, + setVoiceModeEnabled, + keyMatchers, + }); + const [reverseSearchActive, setReverseSearchActive] = useState(false); const [commandSearchActive, setCommandSearchActive] = useState(false); const [textBeforeReverseSearch, setTextBeforeReverseSearch] = useState(''); @@ -387,6 +400,7 @@ export const InputPrompt: React.FC = ({ // Clear the buffer *before* calling onSubmit to prevent potential re-submission // if onSubmit triggers a re-render while the buffer still holds the old value. buffer.setText(''); + resetTurnBaseline(); onSubmit(processedValue); resetCompletionState(); resetReverseSearchCompletionState(); @@ -398,6 +412,7 @@ export const InputPrompt: React.FC = ({ shellModeActive, shellHistory, resetReverseSearchCompletionState, + resetTurnBaseline, ], ); @@ -647,6 +662,8 @@ export const InputPrompt: React.FC = ({ const handleInput = useCallback( (key: Key) => { + if (handleVoiceInput(key)) return true; + // Determine if this keypress is a history navigation command const isHistoryUp = !shellModeActive && @@ -873,9 +890,9 @@ export const InputPrompt: React.FC = ({ ) { setShellModeActive(!shellModeActive); buffer.setText(''); // Clear the '!' 
from input + resetTurnBaseline(); return true; } - if (keyMatchers[Command.ESCAPE](key)) { const cancelSearch = ( setActive: (active: boolean) => void, @@ -1360,6 +1377,7 @@ export const InputPrompt: React.FC = ({ backgroundTaskHeight, streamingState, handleEscPress, + resetTurnBaseline, registerPlainTabPress, resetPlainTabPress, toggleCleanUiDetailsVisible, @@ -1369,9 +1387,9 @@ export const InputPrompt: React.FC = ({ keyMatchers, isHelpDismissKey, settings, + handleVoiceInput, ], ); - useKeypress(handleInput, { isActive: !isEmbeddedShellFocused && !copyModeEnabled, priority: true, @@ -1792,20 +1810,39 @@ export const InputPrompt: React.FC = ({ )}{' '} - {buffer.text.length === 0 && placeholder ? ( - showCursor ? ( - - {chalk.inverse(placeholder.slice(0, 1))} - - {placeholder.slice(1)} - + {isRecording && ( + + 🎙️ Listening... + + )} + {isVoiceModeEnabled && !isRecording && ( + + + > Voice mode:{' '} + {(settings.experimental.voice?.activationMode ?? + 'push-to-talk') === 'push-to-talk' + ? 'Hold Space to record' + : 'Space to start/stop recording'}{' '} + (Esc to exit) - ) : ( - {placeholder} - ) + + )} + {buffer.text.length === 0 && !isRecording ? ( + !isVoiceModeEnabled && placeholder ? ( + showCursor ? ( + + {chalk.inverse(placeholder.slice(0, 1))} + + {placeholder.slice(1)} + + + ) : ( + {placeholder} + ) + ) : null ) : ( void; +} + +type DialogView = 'backend' | 'whisper-models'; + +const WHISPER_MODELS = [ + { + value: 'ggml-tiny.en.bin', + label: 'Tiny (EN)', + description: 'Fastest, lower accuracy (~75MB)', + }, + { + value: 'ggml-base.en.bin', + label: 'Base (EN)', + description: 'Balanced speed and accuracy (~142MB)', + }, + { + value: 'ggml-large-v3-turbo-q5_0.bin', + label: 'Large v3 Turbo (Q5_0)', + description: 'High accuracy, quantized (~547MB)', + }, + { + value: 'ggml-large-v3-turbo-q8_0.bin', + label: 'Large v3 Turbo (Q8_0)', + description: 'Maximum accuracy, high memory (~834MB)', + }, +]; + +export function VoiceModelDialog({ + onClose, +}: VoiceModelDialogProps): React.JSX.Element { + const { settings, setSetting } = useSettingsStore(); + const [view, setView] = useState('backend'); + const [downloadProgress, setDownloadProgress] = + useState(null); + const [error, setError] = useState(null); + + const whisperInstalled = useMemo( + () => isBinaryAvailable('whisper-stream'), + [], + ); + const modelManager = useMemo(() => new WhisperModelManager(), []); + + const currentBackend = + settings.merged.experimental.voice?.backend ?? 'gemini-live'; + const currentWhisperModel = + settings.merged.experimental.voice?.whisperModel ?? 
'ggml-base.en.bin'; + + const handleKeypress = useCallback( + (key: Key) => { + if (key.name === 'escape') { + if (view === 'whisper-models') { + setView('backend'); + } else { + onClose(); + } + return true; + } + return false; + }, + [view, onClose], + ); + + useKeypress(handleKeypress, { isActive: true }); + + const handleBackendSelect = useCallback( + (value: string) => { + if (value === 'whisper') { + setView('whisper-models'); + } else { + setSetting( + SettingScope.User, + 'experimental.voice.backend', + 'gemini-live', + ); + onClose(); + } + }, + [setSetting, onClose], + ); + + const handleWhisperModelSelect = useCallback( + async (modelName: string) => { + if (modelManager.isModelInstalled(modelName)) { + setSetting(SettingScope.User, 'experimental.voice.backend', 'whisper'); + setSetting( + SettingScope.User, + 'experimental.voice.whisperModel', + modelName, + ); + onClose(); + } else { + setError(null); + const onProgress = (p: WhisperModelProgress) => setDownloadProgress(p); + modelManager.on('progress', onProgress); + + try { + await modelManager.downloadModel(modelName); + + setSetting( + SettingScope.User, + 'experimental.voice.backend', + 'whisper', + ); + setSetting( + SettingScope.User, + 'experimental.voice.whisperModel', + modelName, + ); + onClose(); + } catch (err) { + setError( + `Failed to download: ${err instanceof Error ? err.message : String(err)}`, + ); + } finally { + modelManager.off('progress', onProgress); + setDownloadProgress(null); + } + } + }, + [modelManager, setSetting, onClose], + ); + + const backendOptions = useMemo( + () => [ + { + value: 'gemini-live', + title: 'Gemini Live API (Cloud)', + description: 'Real-time cloud transcription via Gemini Live API.', + key: 'gemini-live', + }, + { + value: 'whisper', + title: 'Whisper (Local)', + description: whisperInstalled + ? 'Local transcription using whisper.cpp.' + : 'Local transcription (Requires: brew install whisper-cpp)', + key: 'whisper', + }, + ], + [whisperInstalled], + ); + + const whisperOptions = useMemo( + () => + WHISPER_MODELS.map((m) => ({ + value: m.value, + title: `${m.label}${modelManager.isModelInstalled(m.value) ? ' (Installed)' : ' (Download)'}`, + description: m.description, + key: m.value, + })), + [modelManager], + ); + + return ( + + + {view === 'backend' + ? 'Select Voice Transcription Backend' + : 'Select Whisper Model'} + + + {error && ( + + {error} + + )} + + {downloadProgress ? ( + + + Downloading {downloadProgress.modelName}... + + {Math.round(downloadProgress.percentage * 100)}% + + + ) : ( + + {view === 'backend' ? ( + + ) : ( + o.value === currentWhisperModel, + )} + showNumbers={true} + /> + )} + + )} + + + + {view === 'whisper-models' + ? 
'(Press Esc to go back)' + : '(Press Esc to close)'} + + + + ); +} diff --git a/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap b/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap index 4830e90db1..db449ce4d7 100644 --- a/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap +++ b/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap @@ -168,13 +168,6 @@ exports[`InputPrompt > mouse interaction > should toggle paste expansion on doub " `; -exports[`InputPrompt > mouse interaction > should toggle paste expansion on double-click 4`] = ` -"▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀ - > [Pasted Text: 10 lines] -▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄ -" -`; - exports[`InputPrompt > multiline rendering > should correctly render multiline input including blank lines 1`] = ` "──────────────────────────────────────────────────────────────────────────────────────────────────── > hello diff --git a/packages/cli/src/ui/contexts/UIActionsContext.tsx b/packages/cli/src/ui/contexts/UIActionsContext.tsx index f1959c0173..fb979e7c17 100644 --- a/packages/cli/src/ui/contexts/UIActionsContext.tsx +++ b/packages/cli/src/ui/contexts/UIActionsContext.tsx @@ -41,6 +41,8 @@ export interface UIActions { exitPrivacyNotice: () => void; closeSettingsDialog: () => void; closeModelDialog: () => void; + openVoiceModelDialog: () => void; + closeVoiceModelDialog: () => void; openAgentConfigDialog: ( name: string, displayName: string, @@ -93,6 +95,7 @@ export interface UIActions { handleNewAgentsSelect: (choice: NewAgentsChoice) => Promise; getPreferredEditor: () => EditorType | undefined; clearAccountSuspension: () => void; + setVoiceModeEnabled: (value: boolean) => void; } export const UIActionsContext = createContext(null); diff --git a/packages/cli/src/ui/contexts/UIStateContext.tsx b/packages/cli/src/ui/contexts/UIStateContext.tsx index ed33c21ee5..2150218d44 100644 --- a/packages/cli/src/ui/contexts/UIStateContext.tsx +++ b/packages/cli/src/ui/contexts/UIStateContext.tsx @@ -112,6 +112,7 @@ export interface UIState { isSettingsDialogOpen: boolean; isSessionBrowserOpen: boolean; isModelDialogOpen: boolean; + isVoiceModelDialogOpen: boolean; isAgentConfigDialogOpen: boolean; selectedAgentName?: string; selectedAgentDisplayName?: string; @@ -132,6 +133,7 @@ export interface UIState { pendingGeminiHistoryItems: HistoryItemWithoutId[]; thought: ThoughtSummary | null; isInputActive: boolean; + isVoiceModeEnabled: boolean; isResuming: boolean; shouldShowIdePrompt: boolean; isFolderTrustDialogOpen: boolean; diff --git a/packages/cli/src/ui/hooks/slashCommandProcessor.test.tsx b/packages/cli/src/ui/hooks/slashCommandProcessor.test.tsx index 3e521a6627..f4b18d5bbf 100644 --- a/packages/cli/src/ui/hooks/slashCommandProcessor.test.tsx +++ b/packages/cli/src/ui/hooks/slashCommandProcessor.test.tsx @@ -205,11 +205,13 @@ describe('useSlashCommandProcessor', () => { openSettingsDialog: vi.fn(), openSessionBrowser: vi.fn(), openModelDialog: mockOpenModelDialog, + openVoiceModelDialog: vi.fn(), openAgentConfigDialog, openPermissionsDialog: vi.fn(), quit: mockSetQuittingMessages, setDebugMessage: vi.fn(), toggleCorgiMode: vi.fn(), + toggleVoiceMode: vi.fn(), toggleDebugProfiler: vi.fn(), dispatchExtensionStateUpdate: vi.fn(), addConfirmUpdateExtensionRequest: vi.fn(), diff --git a/packages/cli/src/ui/hooks/slashCommandProcessor.ts 
b/packages/cli/src/ui/hooks/slashCommandProcessor.ts index 20de86002c..3007a96a73 100644 --- a/packages/cli/src/ui/hooks/slashCommandProcessor.ts +++ b/packages/cli/src/ui/hooks/slashCommandProcessor.ts @@ -72,6 +72,7 @@ interface SlashCommandProcessorActions { openSettingsDialog: () => void; openSessionBrowser: () => void; openModelDialog: () => void; + openVoiceModelDialog: () => void; openAgentConfigDialog: ( name: string, displayName: string, @@ -81,6 +82,7 @@ interface SlashCommandProcessorActions { quit: (messages: HistoryItem[]) => void; setDebugMessage: (message: string) => void; toggleCorgiMode: () => void; + toggleVoiceMode: () => void; toggleDebugProfiler: () => void; dispatchExtensionStateUpdate: (action: ExtensionUpdateAction) => void; addConfirmUpdateExtensionRequest: (request: ConfirmationRequest) => void; @@ -232,6 +234,7 @@ export const useSlashCommandProcessor = ( pendingItem, setPendingItem, toggleCorgiMode: actions.toggleCorgiMode, + toggleVoiceMode: actions.toggleVoiceMode, toggleDebugProfiler: actions.toggleDebugProfiler, toggleVimEnabled, reloadCommands, @@ -503,6 +506,9 @@ export const useSlashCommandProcessor = ( case 'model': actions.openModelDialog(); return { type: 'handled' }; + case 'voice-model': + actions.openVoiceModelDialog(); + return { type: 'handled' }; case 'agentConfig': { // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion const props = result.props as Record; diff --git a/packages/cli/src/ui/hooks/useVoiceMode.ts b/packages/cli/src/ui/hooks/useVoiceMode.ts new file mode 100644 index 0000000000..0f37c66357 --- /dev/null +++ b/packages/cli/src/ui/hooks/useVoiceMode.ts @@ -0,0 +1,429 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { useState, useRef, useCallback, useEffect } from 'react'; +import { + AudioRecorder, + TranscriptionFactory, + debugLogger, + type Config, + type TranscriptionProvider, +} from '@google/gemini-cli-core'; +import type { TextBuffer } from '../components/shared/text-buffer.js'; +import type { MergedSettings } from '../../config/settingsSchema.js'; +import type { Key } from './useKeypress.js'; +import { Command } from '../key/keyMatchers.js'; + +interface UseVoiceModeProps { + buffer: TextBuffer; + config: Config; + settings: MergedSettings; + setQueueErrorMessage: (message: string | null) => void; + isVoiceModeEnabled: boolean; + setVoiceModeEnabled: (enabled: boolean) => void; + keyMatchers: Record boolean>; +} + +const HOLD_DELAY_MS = 600; +const RELEASE_DELAY_MS = 300; + +export function useVoiceMode({ + buffer, + config, + settings, + setQueueErrorMessage, + isVoiceModeEnabled, + setVoiceModeEnabled, + keyMatchers, +}: UseVoiceModeProps) { + const [isRecording, setIsRecording] = useState(false); + const [isConnecting, setIsConnecting] = useState(false); + + const liveTranscriptionRef = useRef(''); + const stopRequestedRef = useRef(false); + const isRecordingRef = useRef(false); + const lastFailureTimeRef = useRef(0); + const recordingInProgressRef = useRef(false); + const voiceTimeoutRef = useRef(null); + const recorderRef = useRef(null); + const transcriptionServiceRef = useRef(null); + const turnBaselineRef = useRef(null); + + const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle'); + const pttTimerRef = useRef(null); + const disconnectTimerRef = useRef(null); + + const bufferRef = useRef(buffer); + bufferRef.current = buffer; + + const stopVoiceRecording = useCallback(() => { + if (stopRequestedRef.current) return; + 
debugLogger.debug('[Voice] Stop requested'); + stopRequestedRef.current = true; + + setIsRecording(false); + isRecordingRef.current = false; + setIsConnecting(false); + + if (recorderRef.current) { + recorderRef.current.stop(); + recorderRef.current = null; + } + + const serviceToDisconnect = transcriptionServiceRef.current; + transcriptionServiceRef.current = null; + + if (serviceToDisconnect) { + const isLive = settings.experimental.voice?.backend === 'gemini-live'; + const gracePeriodMs = + settings.experimental.voice?.stopGracePeriodMs ?? + (isLive ? 2000 : 1000); + debugLogger.debug( + `[Voice] Draining transcription for ${gracePeriodMs}ms`, + ); + + if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current); + disconnectTimerRef.current = setTimeout(() => { + debugLogger.debug('[Voice] Grace period ended, disconnecting service'); + serviceToDisconnect.disconnect(); + disconnectTimerRef.current = null; + }, gracePeriodMs); + } + + liveTranscriptionRef.current = ''; + pttStateRef.current = 'idle'; + }, [settings.experimental.voice]); + + const startVoiceRecording = useCallback(() => { + if ( + isRecordingRef.current || + Date.now() - lastFailureTimeRef.current < 2000 + ) { + return; + } + + if (disconnectTimerRef.current) { + clearTimeout(disconnectTimerRef.current); + disconnectTimerRef.current = null; + } + + recordingInProgressRef.current = true; + turnBaselineRef.current = bufferRef.current.text; + + setIsConnecting(true); + setIsRecording(true); + isRecordingRef.current = true; + + liveTranscriptionRef.current = ''; + stopRequestedRef.current = false; + + const apiKey = + config.getContentGeneratorConfig()?.apiKey || + process.env['GEMINI_API_KEY'] || + ''; + + const startAsync = async () => { + // If there's an active draining service, disconnect it immediately + // before starting a new one to prevent orphaned event collisions. + if (disconnectTimerRef.current) { + clearTimeout(disconnectTimerRef.current); + disconnectTimerRef.current = null; + } + if (transcriptionServiceRef.current) { + transcriptionServiceRef.current.disconnect(); + transcriptionServiceRef.current = null; + } + + const cleanupIfStopped = () => { + if (stopRequestedRef.current) { + if (recorderRef.current) { + recorderRef.current.stop(); + recorderRef.current = null; + } + if (transcriptionServiceRef.current) { + transcriptionServiceRef.current.disconnect(); + transcriptionServiceRef.current = null; + } + setIsRecording(false); + isRecordingRef.current = false; + setIsConnecting(false); + recordingInProgressRef.current = false; + return true; + } + return false; + }; + + if (cleanupIfStopped()) return; + + const voiceBackend = + settings.experimental.voice?.backend ?? 'gemini-live'; + + if (!apiKey && voiceBackend === 'gemini-live') { + setQueueErrorMessage( + 'Cloud voice mode requires a GEMINI_API_KEY. 
Please set it in your environment or ~/.gemini/.env.', + ); + setIsRecording(false); + isRecordingRef.current = false; + setIsConnecting(false); + recordingInProgressRef.current = false; + lastFailureTimeRef.current = Date.now(); + return; + } + + if (voiceBackend === 'gemini-live') { + recorderRef.current = new AudioRecorder(); + } + + const currentService = TranscriptionFactory.createProvider( + settings.experimental.voice, + apiKey, + ); + transcriptionServiceRef.current = currentService; + + currentService.on('transcription', (text) => { + if ( + transcriptionServiceRef.current !== currentService && + stopRequestedRef.current + ) { + // If this is an orphaned service that was replaced by a new session, ignore its events + return; + } + + if (text) { + const currentBufferText = bufferRef.current.text; + const previousTranscription = liveTranscriptionRef.current; + + let newTotalText = currentBufferText; + + if ( + previousTranscription && + currentBufferText.endsWith(previousTranscription) + ) { + newTotalText = currentBufferText.slice( + 0, + -previousTranscription.length, + ); + } else if ( + currentBufferText && + !currentBufferText.endsWith(' ') && + !currentBufferText.endsWith('\n') + ) { + newTotalText += ' '; + } + + newTotalText += text; + bufferRef.current.setText(newTotalText, 'end'); + } + liveTranscriptionRef.current = text; + }); + + currentService.on('turnComplete', () => { + if ( + transcriptionServiceRef.current !== currentService && + stopRequestedRef.current + ) + return; + liveTranscriptionRef.current = ''; + }); + + currentService.on('error', (err) => { + if (transcriptionServiceRef.current !== currentService) return; + debugLogger.error('[Voice] Transcription error:', err); + lastFailureTimeRef.current = Date.now(); + recordingInProgressRef.current = false; + }); + + currentService.on('close', () => { + if (transcriptionServiceRef.current !== currentService) return; + if (!stopRequestedRef.current) { + setIsRecording(false); + isRecordingRef.current = false; + setIsConnecting(false); + recordingInProgressRef.current = false; + lastFailureTimeRef.current = Date.now(); + } + }); + + try { + await currentService.connect(); + if (cleanupIfStopped()) return; + + await recorderRef.current?.start(); + if (cleanupIfStopped()) return; + + setIsConnecting(false); + + const currentVoiceBackend = + settings.experimental.voice?.backend ?? 'gemini-live'; + + recorderRef.current?.on('data', (chunk) => { + if (currentVoiceBackend === 'gemini-live') { + currentService.sendAudioChunk(chunk); + } + }); + recorderRef.current?.on('error', (err) => { + debugLogger.error('[Voice] Recorder error:', err); + stopVoiceRecording(); + lastFailureTimeRef.current = Date.now(); + }); + } catch (err: unknown) { + if (transcriptionServiceRef.current !== currentService) return; + const message = err instanceof Error ? 
err.message : String(err); + setQueueErrorMessage(`Voice mode failure: ${message}`); + setIsRecording(false); + isRecordingRef.current = false; + setIsConnecting(false); + recordingInProgressRef.current = false; + lastFailureTimeRef.current = Date.now(); + + if (recorderRef.current) { + recorderRef.current.stop(); + recorderRef.current = null; + } + if (transcriptionServiceRef.current) { + transcriptionServiceRef.current.disconnect(); + transcriptionServiceRef.current = null; + } + } + }; + + void startAsync(); + }, [ + config, + settings.experimental.voice, + setQueueErrorMessage, + stopVoiceRecording, + ]); + + useEffect( + () => () => { + if (voiceTimeoutRef.current) clearTimeout(voiceTimeoutRef.current); + if (recorderRef.current) { + recorderRef.current.stop(); + recorderRef.current = null; + } + if (transcriptionServiceRef.current) { + transcriptionServiceRef.current.disconnect(); + transcriptionServiceRef.current = null; + } + if (pttTimerRef.current) clearTimeout(pttTimerRef.current); + if (disconnectTimerRef.current) clearTimeout(disconnectTimerRef.current); + }, + [], + ); + + const handleVoiceInput = useCallback( + (key: Key): boolean => { + const activeRecording = isRecording || isRecordingRef.current; + + if (activeRecording) { + const activationMode = + settings.experimental.voice?.activationMode ?? 'push-to-talk'; + + if (keyMatchers[Command.ESCAPE](key)) { + stopVoiceRecording(); + return true; + } + + if (keyMatchers[Command.VOICE_MODE_PTT](key)) { + if (activationMode === 'push-to-talk') { + if (pttTimerRef.current) { + clearTimeout(pttTimerRef.current); + } + pttTimerRef.current = setTimeout(() => { + stopVoiceRecording(); + pttTimerRef.current = null; + }, RELEASE_DELAY_MS); + return true; + } else { + stopVoiceRecording(); + return true; + } + } + return true; + } + + if (isVoiceModeEnabled) { + const activationMode = + settings.experimental.voice?.activationMode ?? 
'push-to-talk'; + + if (keyMatchers[Command.ESCAPE](key) && buffer.text === '') { + setVoiceModeEnabled(false); + return true; + } + + if (keyMatchers[Command.VOICE_MODE_PTT](key)) { + if ( + key.name === 'space' && + !key.ctrl && + !key.alt && + !key.shift && + !key.cmd + ) { + if (activationMode === 'toggle') { + startVoiceRecording(); + return true; + } else { + if (pttStateRef.current === 'idle') { + buffer.insert(' '); + pttStateRef.current = 'possible-hold'; + + if (pttTimerRef.current) clearTimeout(pttTimerRef.current); + pttTimerRef.current = setTimeout(() => { + pttStateRef.current = 'idle'; + pttTimerRef.current = null; + }, HOLD_DELAY_MS); + return true; + } else if (pttStateRef.current === 'possible-hold') { + if (pttTimerRef.current) clearTimeout(pttTimerRef.current); + buffer.backspace(); + pttStateRef.current = 'recording'; + startVoiceRecording(); + + pttTimerRef.current = setTimeout(() => { + stopVoiceRecording(); + pttTimerRef.current = null; + }, RELEASE_DELAY_MS); + return true; + } + } + } + } + + if (pttStateRef.current === 'possible-hold') { + pttStateRef.current = 'idle'; + if (pttTimerRef.current) { + clearTimeout(pttTimerRef.current); + pttTimerRef.current = null; + } + } + } + + return false; + }, + [ + isRecording, + isVoiceModeEnabled, + settings.experimental.voice, + keyMatchers, + stopVoiceRecording, + startVoiceRecording, + buffer, + setVoiceModeEnabled, + ], + ); + + return { + isRecording, + isConnecting, + startVoiceRecording, + stopVoiceRecording, + handleVoiceInput, + resetTurnBaseline: () => { + turnBaselineRef.current = null; + }, + }; +} diff --git a/packages/cli/src/ui/hooks/useVoiceModelCommand.ts b/packages/cli/src/ui/hooks/useVoiceModelCommand.ts new file mode 100644 index 0000000000..943c65ce30 --- /dev/null +++ b/packages/cli/src/ui/hooks/useVoiceModelCommand.ts @@ -0,0 +1,31 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { useState, useCallback } from 'react'; + +interface UseVoiceModelCommandReturn { + isVoiceModelDialogOpen: boolean; + openVoiceModelDialog: () => void; + closeVoiceModelDialog: () => void; +} + +export const useVoiceModelCommand = (): UseVoiceModelCommandReturn => { + const [isVoiceModelDialogOpen, setIsVoiceModelDialogOpen] = useState(false); + + const openVoiceModelDialog = useCallback(() => { + setIsVoiceModelDialogOpen(true); + }, []); + + const closeVoiceModelDialog = useCallback(() => { + setIsVoiceModelDialogOpen(false); + }, []); + + return { + isVoiceModelDialogOpen, + openVoiceModelDialog, + closeVoiceModelDialog, + }; +}; diff --git a/packages/cli/src/ui/key/keyBindings.ts b/packages/cli/src/ui/key/keyBindings.ts index e3fbcd8262..a038f6173c 100644 --- a/packages/cli/src/ui/key/keyBindings.ts +++ b/packages/cli/src/ui/key/keyBindings.ts @@ -97,6 +97,7 @@ export enum Command { RESTART_APP = 'app.restart', SUSPEND_APP = 'app.suspend', SHOW_SHELL_INPUT_UNFOCUS_WARNING = 'app.showShellUnfocusWarning', + VOICE_MODE_PTT = 'app.voiceModePTT', // Background Shell Controls BACKGROUND_SHELL_ESCAPE = 'background.escape', @@ -407,9 +408,7 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([ [Command.RESTART_APP, [new KeyBinding('r'), new KeyBinding('shift+r')]], [Command.SUSPEND_APP, [new KeyBinding('ctrl+z')]], [Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING, [new KeyBinding('tab')]], - [Command.DUMP_FRAME, [new KeyBinding('f8')]], - [Command.START_RECORDING, [new KeyBinding('f6')]], - [Command.STOP_RECORDING, [new KeyBinding('f7')]], + 
[Command.VOICE_MODE_PTT, [new KeyBinding('space')]], // Background Shell Controls [Command.BACKGROUND_SHELL_ESCAPE, [new KeyBinding('escape')]], @@ -424,6 +423,10 @@ export const defaultKeyBindingConfig: KeyBindingConfig = new Map([ // Extension Controls [Command.UPDATE_EXTENSION, [new KeyBinding('i')]], [Command.LINK_EXTENSION, [new KeyBinding('l')]], + + [Command.DUMP_FRAME, [new KeyBinding('f8')]], + [Command.START_RECORDING, [new KeyBinding('f6')]], + [Command.STOP_RECORDING, [new KeyBinding('f7')]], ]); interface CommandCategory { @@ -538,6 +541,7 @@ export const commandCategories: readonly CommandCategory[] = [ Command.RESTART_APP, Command.SUSPEND_APP, Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING, + Command.VOICE_MODE_PTT, ], }, { @@ -658,6 +662,7 @@ export const commandDescriptions: Readonly> = { [Command.SUSPEND_APP]: 'Suspend the CLI and move it to the background.', [Command.SHOW_SHELL_INPUT_UNFOCUS_WARNING]: 'Show warning when trying to move focus away from shell input.', + [Command.VOICE_MODE_PTT]: 'Hold to speak in Voice Mode.', // Background Shell Controls [Command.BACKGROUND_SHELL_ESCAPE]: 'Dismiss background shell list.', diff --git a/packages/cli/src/ui/noninteractive/nonInteractiveUi.ts b/packages/cli/src/ui/noninteractive/nonInteractiveUi.ts index 3aff41d2de..9118518455 100644 --- a/packages/cli/src/ui/noninteractive/nonInteractiveUi.ts +++ b/packages/cli/src/ui/noninteractive/nonInteractiveUi.ts @@ -43,5 +43,6 @@ export function createNonInteractiveUI(): CommandContext['ui'] { removeComponent: () => {}, toggleBackgroundTasks: () => {}, toggleShortcutsHelp: () => {}, + toggleVoiceMode: () => {}, }; } diff --git a/packages/core/package.json b/packages/core/package.json index eda0e1e5fe..48412418c4 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -56,6 +56,7 @@ "ajv-formats": "^3.0.0", "chardet": "^2.1.0", "chokidar": "^5.0.0", + "command-exists": "^1.2.9", "diff": "^8.0.3", "dotenv": "^17.2.4", "dotenv-expand": "^12.0.3", diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts index 939fa77d70..11f7a24841 100644 --- a/packages/core/src/config/config.ts +++ b/packages/core/src/config/config.ts @@ -691,6 +691,7 @@ export interface ConfigParameters { ptyInfo?: string; disableYoloMode?: boolean; disableAlwaysAllow?: boolean; + voiceMode?: boolean; rawOutput?: boolean; acceptRawOutputRisk?: boolean; dynamicModelConfiguration?: boolean; @@ -963,6 +964,7 @@ export class Config implements McpContext, AgentLoopContext { private readonly topicUpdateNarration: boolean; private readonly disableLLMCorrection: boolean; private readonly planEnabled: boolean; + private readonly voiceMode: boolean; private readonly trackerEnabled: boolean; private readonly planModeRoutingEnabled: boolean; private readonly modelSteering: boolean; @@ -1117,6 +1119,7 @@ export class Config implements McpContext, AgentLoopContext { this.agents = params.agents ?? {}; this.disableLLMCorrection = params.disableLLMCorrection ?? true; this.planEnabled = params.plan ?? true; + this.voiceMode = params.voiceMode ?? false; this.trackerEnabled = params.tracker ?? false; this.planModeRoutingEnabled = params.planSettings?.modelRouting ?? true; this.enableEventDrivenScheduler = params.enableEventDrivenScheduler ?? 
true; @@ -2969,6 +2972,10 @@ export class Config implements McpContext, AgentLoopContext { return this.planEnabled; } + isVoiceModeEnabled(): boolean { + return this.voiceMode; + } + isTrackerEnabled(): boolean { return this.trackerEnabled; } diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 3123dd9096..86b36a6b0e 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -297,3 +297,12 @@ export * from './context/profiles.js'; // Export trust utility export * from './utils/trust.js'; + +// Export voice utilities +export * from './voice/audioRecorder.js'; +export * from './voice/transcriptionProvider.js'; +export * from './voice/geminiLiveTranscriptionProvider.js'; +export * from './voice/whisperTranscriptionProvider.js'; +export * from './voice/transcriptionFactory.js'; +export * from './voice/whisperModelManager.js'; +export { isBinaryAvailable } from './utils/binaryCheck.js'; diff --git a/packages/core/src/utils/binaryCheck.ts b/packages/core/src/utils/binaryCheck.ts new file mode 100644 index 0000000000..8d37f0def4 --- /dev/null +++ b/packages/core/src/utils/binaryCheck.ts @@ -0,0 +1,14 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { sync as commandExistsSync } from 'command-exists'; + +/** + * Checks if a binary is available in the system PATH. + */ +export function isBinaryAvailable(binaryName: string): boolean { + return commandExistsSync(binaryName); +} diff --git a/packages/core/src/voice/audioRecorder.ts b/packages/core/src/voice/audioRecorder.ts new file mode 100644 index 0000000000..c1217e5d7a --- /dev/null +++ b/packages/core/src/voice/audioRecorder.ts @@ -0,0 +1,115 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process'; +import { EventEmitter } from 'node:events'; +import commandExists from 'command-exists'; + +export interface AudioRecorderEvents { + data: [Buffer]; + start: []; + stop: []; + error: [Error]; +} + +/** + * Captures audio from the microphone using `sox` (`rec`). + * Emits 16kHz, 16-bit, mono PCM chunks. + */ +export class AudioRecorder extends EventEmitter { + private recProcess: ChildProcessWithoutNullStreams | null = null; + private isRecordingInternal = false; + + get isRecording(): boolean { + return this.isRecordingInternal; + } + + /** + * Checks if `rec` (sox) is available on the system. + */ + static async isAvailable(): Promise { + try { + await commandExists('rec'); + return true; + } catch { + return false; + } + } + + async start(): Promise { + if (this.isRecordingInternal) return; + this.isRecordingInternal = true; + + try { + const available = await AudioRecorder.isAvailable(); + if (!this.isRecordingInternal) return; // Check if stopped while checking availability + + if (!available) { + throw new Error( + 'The `rec` command (provided by SoX) is required for voice mode. 
Please install SoX (e.g., `brew install sox` on macOS or `sudo apt install sox libsox-fmt-all` on Linux).', + ); + } + + // rec -q -V0 -e signed -c 1 -b 16 -r 16000 -t raw - + this.recProcess = spawn('rec', [ + '-q', + '-V0', + '-e', + 'signed', + '-c', + '1', + '-b', + '16', + '-r', + '16000', + '-t', + 'raw', + '-', + ]); + + if (!this.isRecordingInternal) { + this.recProcess.kill('SIGTERM'); + this.recProcess = null; + return; + } + + this.recProcess.stdout.on('data', (data: Buffer) => { + this.emit('data', data); + }); + + this.recProcess.stderr.on('data', (_data: Buffer) => { + // rec might print warnings to stderr, we could log them or ignore + // console.warn(`rec stderr: ${data.toString()}`); + }); + + this.recProcess.on('error', (err) => { + this.emit('error', err); + this.stop(); + }); + + this.recProcess.on('close', () => { + this.stop(); + }); + + this.emit('start'); + } catch (err) { + this.isRecordingInternal = false; + throw err; + } + } + + stop(): void { + if (!this.isRecordingInternal) return; + this.isRecordingInternal = false; + + if (this.recProcess) { + this.recProcess.kill('SIGTERM'); + this.recProcess = null; + } + + this.emit('stop'); + } +} diff --git a/packages/core/src/voice/geminiLiveTranscriptionProvider.ts b/packages/core/src/voice/geminiLiveTranscriptionProvider.ts new file mode 100644 index 0000000000..4895a60e56 --- /dev/null +++ b/packages/core/src/voice/geminiLiveTranscriptionProvider.ts @@ -0,0 +1,178 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import WebSocket from 'ws'; +import { EventEmitter, once } from 'node:events'; +import { debugLogger } from '../utils/debugLogger.js'; +import type { + TranscriptionProvider, + TranscriptionEvents, +} from './transcriptionProvider.js'; + +import { z } from 'zod'; + +const LiveAPIResponseSchema = z.object({ + setupComplete: z.record(z.unknown()).optional(), + serverContent: z + .object({ + turnComplete: z.boolean().optional(), + inputTranscription: z + .object({ + text: z.string().optional(), + }) + .optional(), + outputTranscription: z + .object({ + text: z.string().optional(), + }) + .optional(), + modelTurn: z + .object({ + parts: z + .array( + z.object({ + text: z.string().optional(), + inlineData: z + .object({ + data: z.string(), + }) + .optional(), + }), + ) + .optional(), + }) + .optional(), + }) + .optional(), +}); + +/** + * Connects to the Gemini Live API using raw WebSockets to support API Key authentication. + */ +export class GeminiLiveTranscriptionProvider + extends EventEmitter + implements TranscriptionProvider +{ + private ws: WebSocket | null = null; + private currentTranscription = ''; + + constructor(private readonly apiKey: string) { + super(); + } + + async connect(): Promise { + const modelName = 'gemini-3.1-flash-live-preview'; + const baseUrl = + 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent'; + + if (!this.apiKey) { + throw new Error('No API key provided'); + } + + // NOTE: The Generative Language WebSocket API requires the API key to be passed via the 'key' query parameter. 
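+      // The resulting endpoint is the BidiGenerateContent WebSocket URL from
+      // `baseUrl` above with the key appended, e.g.
+      // `wss://generativelanguage.googleapis.com/ws/...BidiGenerateContent?key=<API key>`.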
+ const url = `${baseUrl}?key=${this.apiKey}`; + debugLogger.debug( + `[GeminiLiveTranscription] Connecting to model ${modelName} via raw WebSocket with API Key...`, + ); + + try { + this.ws = new WebSocket(url, { + maxPayload: 1 << 20, // 1MB limit for safety + }); + + this.ws.on('message', (data) => { + try { + const parsedData: unknown = JSON.parse(data.toString()); + const result = LiveAPIResponseSchema.safeParse(parsedData); + + if (result.success) { + const response = result.data; + if (response.serverContent) { + const content = response.serverContent; + + if (content.turnComplete) { + this.emit('turnComplete'); + } + + if (content.inputTranscription?.text) { + const text = content.inputTranscription.text; + debugLogger.debug( + `[GeminiLiveTranscription] Transcription received (Cloud): "${text}"`, + ); + this.currentTranscription = text; + this.emit('transcription', this.currentTranscription); + } + } + } + } catch (e) { + debugLogger.error( + '[GeminiLiveTranscription] Error parsing message:', + e, + ); + } + }); + + this.ws.on('error', (error) => { + debugLogger.error('[GeminiLiveTranscription] WebSocket Error:', error); + this.emit('error', error); + }); + + this.ws.on('close', (code, reason) => { + debugLogger.debug( + `[GeminiLiveTranscription] Connection Closed. Code: ${code}, Reason: ${reason}`, + ); + this.emit('close'); + this.ws = null; + }); + + await once(this.ws, 'open'); + + const setupMessage = { + setup: { + model: `models/${modelName}`, + generation_config: { + response_modalities: ['audio'], + }, + input_audio_transcription: {}, + }, + }; + + this.ws.send(JSON.stringify(setupMessage)); + this.currentTranscription = ''; + } catch (err) { + debugLogger.error( + '[GeminiLiveTranscription] Failed to establish connection:', + err, + ); + throw err; + } + } + + sendAudioChunk(chunk: Buffer): void { + if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return; + + const audioMessage = { + realtime_input: { + audio: { + data: chunk.toString('base64'), + mime_type: 'audio/pcm;rate=16000', + }, + }, + }; + this.ws.send(JSON.stringify(audioMessage)); + } + + getTranscription(): string { + return this.currentTranscription; + } + + disconnect(): void { + if (this.ws) { + this.ws.close(); + this.ws = null; + } + } +} diff --git a/packages/core/src/voice/transcriptionFactory.ts b/packages/core/src/voice/transcriptionFactory.ts new file mode 100644 index 0000000000..ee0ac64700 --- /dev/null +++ b/packages/core/src/voice/transcriptionFactory.ts @@ -0,0 +1,41 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import * as path from 'node:path'; +import * as fs from 'node:fs'; +import { homedir, GEMINI_DIR } from '../utils/paths.js'; +import { GeminiLiveTranscriptionProvider } from './geminiLiveTranscriptionProvider.js'; +import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js'; +import type { TranscriptionProvider } from './transcriptionProvider.js'; + +export class TranscriptionFactory { + static createProvider( + voiceConfig: { backend?: string; whisperModel?: string } | undefined, + apiKey: string, + ): TranscriptionProvider { + const backend = voiceConfig?.backend ?? 'gemini-live'; + + if (backend === 'whisper') { + const modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models'); + if (!fs.existsSync(modelsDir)) { + fs.mkdirSync(modelsDir, { recursive: true }); + } + + const modelName = voiceConfig?.whisperModel ?? 
'ggml-base.en.bin'; + const modelPath = path.join(modelsDir, modelName); + + return new WhisperTranscriptionProvider({ + modelPath, + threads: 4, + step: 0, + length: 5000, + }); + } + + // Default to Gemini Live + return new GeminiLiveTranscriptionProvider(apiKey); + } +} diff --git a/packages/core/src/voice/transcriptionProvider.ts b/packages/core/src/voice/transcriptionProvider.ts new file mode 100644 index 0000000000..6b3bf20c0a --- /dev/null +++ b/packages/core/src/voice/transcriptionProvider.ts @@ -0,0 +1,33 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { EventEmitter } from 'node:events'; + +export interface TranscriptionEvents { + /** Emitted when partial or full transcription text is available. */ + transcription: [string]; + /** Emitted when a speaking turn is considered complete. */ + turnComplete: []; + /** Emitted when an error occurs during transcription. */ + error: [Error]; + /** Emitted when the transcription service connection is closed. */ + close: []; +} + +/** + * Common interface for all transcription backends (Cloud or Local). + */ +export interface TranscriptionProvider + extends EventEmitter { + /** Establish connection to the transcription service. */ + connect(): Promise; + /** Send a chunk of raw audio data to the service. */ + sendAudioChunk(chunk: Buffer): void; + /** Disconnect from the transcription service. */ + disconnect(): void; + /** Get the current full transcription for the session. */ + getTranscription(): string; +} diff --git a/packages/core/src/voice/whisperModelManager.ts b/packages/core/src/voice/whisperModelManager.ts new file mode 100644 index 0000000000..64988eb48d --- /dev/null +++ b/packages/core/src/voice/whisperModelManager.ts @@ -0,0 +1,107 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import * as path from 'node:path'; +import * as fs from 'node:fs'; +import { EventEmitter } from 'node:events'; +import { homedir, GEMINI_DIR } from '../utils/paths.js'; +import { debugLogger } from '../utils/debugLogger.js'; + +export interface WhisperModelProgress { + modelName: string; + transferred: number; + total: number; + percentage: number; +} + +export interface WhisperModelManagerEvents { + progress: [WhisperModelProgress]; +} + +const ALLOWED_MODELS = [ + 'ggml-tiny.en.bin', + 'ggml-base.en.bin', + 'ggml-large-v3-turbo-q5_0.bin', + 'ggml-large-v3-turbo-q8_0.bin', +]; + +/** + * Manages Whisper models (checking existence, downloading). 
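+ * Models are stored in the `whisper_models` directory inside the Gemini config
+ * directory and are fetched on demand from the ggerganov/whisper.cpp repository
+ * on Hugging Face; only names listed in ALLOWED_MODELS are accepted.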
+ */ +export class WhisperModelManager extends EventEmitter { + private readonly modelsDir: string; + + constructor() { + super(); + this.modelsDir = path.join(homedir(), GEMINI_DIR, 'whisper_models'); + } + + isModelInstalled(modelName: string): boolean { + this.validateModelName(modelName); + return fs.existsSync(path.join(this.modelsDir, modelName)); + } + + getModelPath(modelName: string): string { + this.validateModelName(modelName); + return path.join(this.modelsDir, modelName); + } + + async downloadModel(modelName: string): Promise { + this.validateModelName(modelName); + + if (!fs.existsSync(this.modelsDir)) { + fs.mkdirSync(this.modelsDir, { recursive: true }); + } + + const destination = path.join(this.modelsDir, modelName); + const url = `https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${modelName}`; + + debugLogger.debug( + `[WhisperModelManager] Downloading ${modelName} from ${url}`, + ); + + const response = await fetch(url); + if (!response.ok) { + throw new Error(`Failed to download model: ${response.statusText}`); + } + + const total = parseInt(response.headers.get('content-length') || '0', 10); + let transferred = 0; + + const reader = response.body?.getReader(); + if (!reader) { + throw new Error('Response body is not readable'); + } + + const writer = fs.createWriteStream(destination); + + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + transferred += value.length; + writer.write(value); + + const percentage = total > 0 ? transferred / total : 0; + this.emit('progress', { + modelName, + transferred, + total, + percentage, + }); + } + } finally { + writer.end(); + } + } + + private validateModelName(modelName: string): void { + if (!ALLOWED_MODELS.includes(modelName)) { + throw new Error(`Unauthorized model name: ${modelName}`); + } + } +} diff --git a/packages/core/src/voice/whisperTranscriptionProvider.test.ts b/packages/core/src/voice/whisperTranscriptionProvider.test.ts new file mode 100644 index 0000000000..69b48fbb02 --- /dev/null +++ b/packages/core/src/voice/whisperTranscriptionProvider.test.ts @@ -0,0 +1,31 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { WhisperTranscriptionProvider } from './whisperTranscriptionProvider.js'; +import commandExists from 'command-exists'; + +vi.mock('command-exists', () => ({ + default: vi.fn(), +})); + +describe('WhisperTranscriptionProvider', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('should throw a friendly error if whisper-stream is not available', async () => { + vi.mocked(commandExists).mockRejectedValue(new Error('not found')); + + const provider = new WhisperTranscriptionProvider({ + modelPath: 'test-model.bin', + }); + + await expect(provider.connect()).rejects.toThrow( + 'The `whisper-stream` command is required for local voice mode. 
Please install it (e.g., `brew install whisper-cpp` on macOS).', + ); + }); +}); diff --git a/packages/core/src/voice/whisperTranscriptionProvider.ts b/packages/core/src/voice/whisperTranscriptionProvider.ts new file mode 100644 index 0000000000..b5b871df83 --- /dev/null +++ b/packages/core/src/voice/whisperTranscriptionProvider.ts @@ -0,0 +1,199 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process'; +import { EventEmitter } from 'node:events'; +import commandExists from 'command-exists'; +import { debugLogger } from '../utils/debugLogger.js'; +import type { + TranscriptionProvider, + TranscriptionEvents, +} from './transcriptionProvider.js'; + +export interface WhisperProviderOptions { + modelPath: string; + threads?: number; + step?: number; + length?: number; +} + +/** + * Local transcription provider using `whisper-stream` from whisper.cpp. + * + * Uses the Sliding Window Mode with VAD (--step 0) for stable, + * non-overlapping transcription blocks that can be appended directly. + */ +export class WhisperTranscriptionProvider + extends EventEmitter + implements TranscriptionProvider +{ + private process: ChildProcessWithoutNullStreams | null = null; + private currentTranscription = ''; + + constructor(private readonly options: WhisperProviderOptions) { + super(); + } + + /** + * Checks if `whisper-stream` is available on the system. + */ + static async isAvailable(): Promise { + try { + await commandExists('whisper-stream'); + return true; + } catch { + return false; + } + } + + async connect(): Promise { + const { modelPath, threads = 4, step = 0, length = 5000 } = this.options; + + this.currentTranscription = ''; + + const available = await WhisperTranscriptionProvider.isAvailable(); + if (!available) { + return Promise.reject( + new Error( + 'The `whisper-stream` command is required for local voice mode. Please install it (e.g., `brew install whisper-cpp` on macOS).', + ), + ); + } + + debugLogger.debug( + `[WhisperTranscription] Starting whisper-stream with model: ${modelPath} (VAD mode: step=${step}, length=${length})`, + ); + + return new Promise((resolve, reject) => { + let isResolved = false; + + try { + // whisper-stream -m -t --step 0 --length -vth 0.6 + // Setting step == 0 enables sliding window mode with VAD, which outputs + // non-overlapping transcription blocks suitable for appending. + this.process = spawn('whisper-stream', [ + '-m', + modelPath, + '-t', + threads.toString(), + '--step', + step.toString(), + '--length', + length.toString(), + '-vth', + '0.6', + ]); + + this.process.stdout.on('data', (data: Buffer) => { + const output = data.toString(); + this.parseOutput(output); + }); + + this.process.stderr.on('data', (data: Buffer) => { + const msg = data.toString(); + if (msg.includes('error')) { + debugLogger.error(`[WhisperTranscription] stderr: ${msg}`); + if (!isResolved) { + isResolved = true; + reject(new Error(msg)); + } + } + + // whisper-stream prints "whisper_init_from_file_with_params_no_state: loading model from..." + // and finally "main: processing, press Ctrl+C to stop" when ready. 
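+          // Readiness is detected on stderr; stdout only carries the timestamped
+          // transcription lines once processing has started. If the banner never
+          // appears, the fallback timeout below resolves after 10 seconds.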
+ if (!isResolved && msg.includes('main: processing')) { + debugLogger.debug('[WhisperTranscription] whisper-stream is ready'); + isResolved = true; + resolve(); + } + }); + + this.process.on('error', (err) => { + debugLogger.error('[WhisperTranscription] Process error:', err); + this.emit('error', err); + if (!isResolved) { + isResolved = true; + reject(err); + } + }); + + this.process.on('close', (code) => { + debugLogger.debug( + `[WhisperTranscription] Process closed with code ${code}`, + ); + this.emit('close'); + this.process = null; + }); + + // Fallback timeout in case "main: processing" is never seen + setTimeout(() => { + if (!isResolved) { + debugLogger.warn( + '[WhisperTranscription] Connection timeout (fallback resolve)', + ); + isResolved = true; + resolve(); + } + }, 10000); + } catch (err) { + debugLogger.error( + '[WhisperTranscription] Failed to spawn process:', + err, + ); + if (!isResolved) { + isResolved = true; + reject(err); + } + } + }); + } + + private parseOutput(output: string): void { + // whisper-stream output format: "[00:00:00.000 --> 00:00:02.000] Hello world." + const lines = output.split('\n'); + + for (const line of lines) { + const match = line.match(/\[.* --> .*\]\s+(.*)/); + if (match && match[1]) { + let text = match[1].trim(); + + // Filter out [Silence], [music], (laughter), etc. + text = text + .replace(/\[[^\]]*\]/g, '') + .replace(/\([^)]*\)/g, '') + .trim(); + + if (text) { + // In VAD mode (step=0), each line is a completed speech block. + // Append it to the buffer to ensure it doesn't disappear. + this.currentTranscription = this.currentTranscription + ? `${this.currentTranscription} ${text}` + : text; + + debugLogger.debug( + `[WhisperTranscription] Transcription updated (Local-VAD): "${this.currentTranscription}"`, + ); + this.emit('transcription', this.currentTranscription); + } + } + } + } + + sendAudioChunk(_chunk: Buffer): void { + // whisper-stream handles its own audio capture. 
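+    // Unlike the Gemini Live provider, this backend needs no AudioRecorder chunks:
+    // whisper-stream captures from the microphone itself, so data passed in is ignored.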
+ } + + getTranscription(): string { + return this.currentTranscription; + } + + disconnect(): void { + if (this.process) { + this.process.kill('SIGTERM'); + this.process = null; + } + } +} diff --git a/schemas/settings.schema.json b/schemas/settings.schema.json index f4263fcc3e..a883150d66 100644 --- a/schemas/settings.schema.json +++ b/schemas/settings.schema.json @@ -2910,6 +2910,59 @@ "default": false, "type": "boolean" }, + "voiceMode": { + "title": "Voice Mode", + "description": "Enable experimental voice dictation and commands (/voice, /voice model).", + "markdownDescription": "Enable experimental voice dictation and commands (/voice, /voice model).\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `false`", + "default": false, + "type": "boolean" + }, + "voice": { + "title": "Voice", + "description": "Settings for voice mode and transcription.", + "markdownDescription": "Settings for voice mode and transcription.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `{}`", + "default": {}, + "type": "object", + "properties": { + "activationMode": { + "title": "Voice Activation Mode", + "description": "How to trigger voice recording with the Space key.", + "markdownDescription": "How to trigger voice recording with the Space key.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `push-to-talk`", + "default": "push-to-talk", + "type": "string", + "enum": ["push-to-talk", "toggle"] + }, + "backend": { + "title": "Voice Transcription Backend", + "description": "The backend to use for voice transcription.", + "markdownDescription": "The backend to use for voice transcription.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `gemini-live`", + "default": "gemini-live", + "type": "string", + "enum": ["gemini-live", "whisper"] + }, + "whisperModel": { + "title": "Whisper Model", + "description": "The Whisper model to use for local transcription.", + "markdownDescription": "The Whisper model to use for local transcription.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `ggml-base.en.bin`", + "default": "ggml-base.en.bin", + "type": "string", + "enum": [ + "ggml-tiny.en.bin", + "ggml-base.en.bin", + "ggml-large-v3-turbo-q5_0.bin", + "ggml-large-v3-turbo-q8_0.bin" + ] + }, + "stopGracePeriodMs": { + "title": "Voice Stop Grace Period (ms)", + "description": "How long to wait for final transcription after stopping recording.", + "markdownDescription": "How long to wait for final transcription after stopping recording.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `1000`", + "default": 1000, + "type": "number" + } + }, + "additionalProperties": false + }, "adk": { "title": "ADK", "description": "Settings for the Agent Development Kit (ADK).",