feat(voice): implement real-time voice mode with cloud and local backends (#24174)

2026-05-14 05:42:54 -07:00 · 2026-04-24 14:29:38 -07:00
parent 048bf6e514
commit 2e0641c83b
40 changed files with 2244 additions and 43 deletions
@@ -9,12 +9,41 @@ import { createMockSettings } from '../../test-utils/settings.js';
 import { makeFakeConfig } from '@google/gemini-cli-core';
 import { waitFor } from '../../test-utils/async.js';
 import { act, useState, useMemo } from 'react';
+import type { EventEmitter } from 'node:events';
+
+const { fakeTranscriptionProvider } = vi.hoisted(() => {
+  // Load node:events synchronously via require so the fake can be built inside the hoisted callback
+  // eslint-disable-next-line @typescript-eslint/no-require-imports, no-restricted-syntax
+  const nodeEvents = require('node:events');
+
+  // Event-emitting stand-in for a transcription provider; every method is a vi.fn stub.
+  class FakeTranscriptionProvider extends nodeEvents.EventEmitter {
+    connect = vi.fn().mockResolvedValue(undefined);
+    disconnect = vi.fn();
+    sendAudioChunk = vi.fn();
+    getTranscription = vi.fn().mockReturnValue('');
+  }
+
+  // A single shared instance is exposed so tests can emit events on it directly.
+  const sharedProvider = new FakeTranscriptionProvider();
+  return { fakeTranscriptionProvider: sharedProvider };
+});
+
+vi.mock('@google/gemini-cli-core', async (importOriginal) => {
+  // Keep the real module's exports and override only TranscriptionFactory.
+  // Typing importOriginal with the module's own type avoids an `any` cast.
+  const actual =
+    await importOriginal<typeof import('@google/gemini-cli-core')>();
+  return {
+    ...actual,
+    TranscriptionFactory: {
+      // Every provider request returns the shared fake created in vi.hoisted.
+      createProvider: vi.fn(() => fakeTranscriptionProvider),
+    },
+  };
+});
+
 import {
  InputPrompt,
  tryTogglePasteExpansion,
  type InputPromptProps,
 } from './InputPrompt.js';
 import { InputContext } from '../contexts/InputContext.js';
+import { type UIState } from '../contexts/UIStateContext.js';
 import {
  calculateTransformationsForLine,
  calculateTransformedLine,
@@ -417,6 +446,7 @@ describe('InputPrompt', () => {
        getWorkspaceContext: () => ({
          getDirectories: () => ['/test/project/src'],
        }),
+        getContentGeneratorConfig: () => ({ apiKey: 'test-api-key' }),
      } as unknown as Config,
      slashCommands: mockSlashCommands,
      commandContext: mockCommandContext,
@@ -4925,6 +4955,383 @@ describe('InputPrompt', () => {
      },
    );
  });
+
+  describe('Voice Mode', () => {
+    beforeEach(() => {
+      // Remove every listener registered on the shared fake provider, then clear recorded mock calls.
+      const providerEvents =
+        fakeTranscriptionProvider as unknown as EventEmitter;
+      providerEvents.removeAllListeners();
+      vi.clearAllMocks();
+    });
+
+    it('should start recording when space is pressed and voice mode is enabled (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('');
+      });
+      const { stdin, unmount, lastFrame } = await renderWithProviders(
+        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Before any key press: no listening indicator, only the toggle-mode hint is rendered
+      expect(lastFrame()).not.toContain('🎙️ Listening...');
+      expect(lastFrame()).toContain(
+        'Voice mode: Space to start/stop recording',
+      );
+
+      // A single space keystroke should toggle recording on
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      // Wait until the rendered frame contains the listening indicator
+      await waitFor(() => {
+        expect(lastFrame()).toContain('🎙️ Listening...');
+      });
+
+      unmount();
+    });
+
+    it('should toggle recording off when space is pressed again (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('');
+      });
+      const { stdin, unmount, lastFrame } = await renderWithProviders(
+        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // First space: start recording and wait for the listening indicator
+      await act(async () => {
+        stdin.write(' ');
+      });
+      await waitFor(() => {
+        expect(lastFrame()).toContain('🎙️ Listening...');
+      });
+
+      // Second space: stop recording; the indicator must disappear and the idle hint must return
+      await act(async () => {
+        stdin.write(' ');
+      });
+      await waitFor(() => {
+        expect(lastFrame()).not.toContain('🎙️ Listening...');
+        expect(lastFrame()).toContain(
+          'Voice mode: Space to start/stop recording',
+        );
+      });
+
+      unmount();
+    });
+
+    it('should resume recording when space is pressed even if buffer is not empty (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('some existing text');
+      });
+      const { stdin, unmount, lastFrame } = await renderWithProviders(
+        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // The voice mode hint is rendered alongside the existing buffer text (new behavior)
+      expect(lastFrame()).toContain(
+        'Voice mode: Space to start/stop recording',
+      );
+      expect(lastFrame()).toContain('some existing text');
+
+      // Space should start recording even though the buffer already holds text
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      await waitFor(() => {
+        expect(lastFrame()).toContain('🎙️ Listening...');
+      });
+
+      unmount();
+    });
+
+    it('should not start recording if voice mode is disabled (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('');
+      });
+      const { stdin, unmount, lastFrame } = await renderWithProviders(
+        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+        {
+          uiState: { isVoiceModeEnabled: false } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Press space while voice mode is disabled
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      // No listening indicator is expected; the keystroke should reach the buffer's handleInput instead
+      expect(lastFrame()).not.toContain('🎙️ Listening...');
+      expect(mockBuffer.handleInput).toHaveBeenCalled();
+      unmount();
+    });
+
+    it('should append transcription correctly across multiple turn updates (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('initial');
+      });
+      const { stdin, unmount } = await renderWithProviders(
+        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Start recording with a space keystroke
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      // First transcription event: 'hello' must be appended to 'initial' with a separating space
+      await act(async () => {
+        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+          'transcription',
+          'hello',
+        );
+      });
+      await waitFor(() => {
+        expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
+      });
+
+      // Emit turnComplete (Gemini Live starts over after this)
+      await act(async () => {
+        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+          'turnComplete',
+        );
+      });
+
+      // Emit second part (Gemini Live sends new turn text starting from empty)
+      await act(async () => {
+        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+          'transcription',
+          'world',
+        );
+      });
+      await waitFor(() => {
+        // 'world' must be appended to the baseline 'initial hello' rather than replacing it
+        expect(mockBuffer.setText).toHaveBeenCalledWith(
+          'initial hello world',
+          'end',
+        );
+      });
+
+      unmount();
+    });
+
+    it('should append transcription correctly when resuming voice mode (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('First turn.');
+      });
+      const { stdin, unmount } = await renderWithProviders(
+        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      // Start recording on top of text that is already in the buffer (resumed)
+      await act(async () => {
+        stdin.write(' ');
+      });
+
+      // Emit a transcription; it must be appended after the existing text, not replace it
+      await act(async () => {
+        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+          'transcription',
+          'Second turn.',
+        );
+      });
+
+      await waitFor(() => {
+        expect(mockBuffer.setText).toHaveBeenCalledWith(
+          'First turn. Second turn.',
+          'end',
+        );
+      });
+
+      unmount();
+    });
+
+    describe('push-to-talk', () => {
+      beforeEach(() => {
+        vi.useFakeTimers(); // tests in this describe drive the clock manually via vi.advanceTimersByTime
+      });
+
+      afterEach(() => {
+        vi.useRealTimers(); // switch back to real timers after each push-to-talk test
+      });
+
+      it('should insert a space on a single tap', async () => {
+        const { stdin, unmount, lastFrame } = await renderWithProviders(
+          <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+          {
+            uiState: { isVoiceModeEnabled: true } as UIState,
+            settings: createMockSettings({
+              experimental: { voice: { activationMode: 'push-to-talk' } },
+            }),
+          },
+        );
+
+        expect(lastFrame()).toContain('Voice mode: Hold Space to record');
+
+        // Press space once (a tap, with no repeated space following it)
+        await act(async () => {
+          stdin.write(' ');
+        });
+
+        // The space is inserted immediately (optimistically) and recording has not started
+        expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
+        expect(lastFrame()).not.toContain('🎙️ Listening...');
+
+        // Advance timer past HOLD_DELAY_MS; with no further input, recording must still not start
+        await act(async () => {
+          vi.advanceTimersByTime(700);
+        });
+
+        expect(lastFrame()).not.toContain('🎙️ Listening...');
+        unmount();
+      });
+
+      it('should start recording on hold (simulated by repeat spaces)', async () => {
+        const { stdin, unmount, lastFrame } = await renderWithProviders(
+          <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+          {
+            uiState: { isVoiceModeEnabled: true } as UIState,
+            settings: createMockSettings({
+              experimental: { voice: { activationMode: 'push-to-talk' } },
+            }),
+          },
+        );
+
+        // First space: inserted optimistically, exactly as for a single tap
+        await act(async () => {
+          stdin.write(' ');
+        });
+        expect(mockBuffer.insert).toHaveBeenCalledWith(' ');
+
+        // Second space (repeat): this is how the test simulates a held key
+        await act(async () => {
+          stdin.write(' ');
+        });
+
+        await waitFor(() => {
+          // Should have backspaced the optimistic space
+          expect(mockBuffer.backspace).toHaveBeenCalled();
+          // Should show listening
+          expect(lastFrame()).toContain('🎙️ Listening...');
+        });
+
+        unmount();
+      });
+
+      it('should stop recording when space heartbeat stops (release)', async () => {
+        const { stdin, unmount, lastFrame } = await renderWithProviders(
+          <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+          {
+            uiState: { isVoiceModeEnabled: true } as UIState,
+            settings: createMockSettings({
+              experimental: { voice: { activationMode: 'push-to-talk' } },
+            }),
+          },
+        );
+
+        // Start hold: two spaces written within the same act() call
+        await act(async () => {
+          stdin.write(' ');
+          stdin.write(' ');
+        });
+
+        // Use a short interval in waitFor to prevent advancing fake timers past the 300ms RELEASE_DELAY_MS
+        await waitFor(
+          () => {
+            expect(lastFrame()).toContain('🎙️ Listening...');
+          },
+          { interval: 10 },
+        );
+
+        // Simulate heartbeat (held key) - send space first to reset timer, then advance
+        await act(async () => {
+          stdin.write(' ');
+          vi.advanceTimersByTime(100);
+        });
+        expect(lastFrame()).toContain('🎙️ Listening...');
+
+        // Stop heartbeat (release): let time pass with no further space input
+        await act(async () => {
+          vi.advanceTimersByTime(400); // Past RELEASE_DELAY_MS
+        });
+
+        await waitFor(() => {
+          expect(lastFrame()).not.toContain('🎙️ Listening...');
+        });
+
+        unmount();
+      });
+
+      it('should cancel hold state if non-space key is pressed after first space', async () => {
+        const { stdin, unmount } = await renderWithProviders(
+          <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+          {
+            uiState: { isVoiceModeEnabled: true } as UIState,
+            settings: createMockSettings({
+              experimental: { voice: { activationMode: 'push-to-talk' } },
+            }),
+          },
+        );
+
+        // First space: would begin hold detection if followed by another space
+        await act(async () => {
+          stdin.write(' ');
+        });
+
+        // Type 'a', a non-space key, between the two spaces
+        await act(async () => {
+          stdin.write('a');
+        });
+
+        // Should NOT start recording on next space even if fast (checked indirectly via the insert count below)
+        await act(async () => {
+          stdin.write(' ');
+        });
+
+        expect(mockBuffer.insert).toHaveBeenCalledTimes(2); // Two spaces inserted
+        expect(mockBuffer.handleInput).toHaveBeenCalledWith(
+          expect.objectContaining({ name: 'a' }),
+        );
+        unmount();
+      });
+    });
+  });
 });

 function clean(str: string | undefined): string {