fix(cli): insert voice transcription at cursor position instead of appending to end (#26287)

Co-authored-by: Zheyuan <zlin252@emory.edu>
This commit is contained in:
Zheyuan Lin
2026-05-01 12:41:17 -04:00
committed by GitHub
parent d9f273e440
commit 7213822e84
2 changed files with 59 additions and 28 deletions
@@ -348,7 +348,7 @@ describe('InputPrompt', () => {
visualToLogicalMap: [[0, 0]], visualToLogicalMap: [[0, 0]],
visualToTransformedMap: [0], visualToTransformedMap: [0],
transformationsByLine: [], transformationsByLine: [],
getOffset: vi.fn().mockReturnValue(0), getOffset: vi.fn().mockImplementation(() => mockBuffer.cursor[1]),
pastedContent: {}, pastedContent: {},
} as unknown as TextBuffer; } as unknown as TextBuffer;
@@ -5114,17 +5114,15 @@ describe('InputPrompt', () => {
); );
}); });
await waitFor(() => { await waitFor(() => {
expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end'); expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 13);
}); });
// Emit turnComplete (Gemini Live starts over after this) // turnComplete advances the baseline; next turn appends after it
await act(async () => { await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit( (fakeTranscriptionProvider as unknown as EventEmitter).emit(
'turnComplete', 'turnComplete',
); );
}); });
// Emit second part (Gemini Live sends new turn text starting from empty)
await act(async () => { await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit( (fakeTranscriptionProvider as unknown as EventEmitter).emit(
'transcription', 'transcription',
@@ -5132,10 +5130,9 @@ describe('InputPrompt', () => {
); );
}); });
await waitFor(() => { await waitFor(() => {
// Should have appended 'world' to the baseline 'initial hello'
expect(mockBuffer.setText).toHaveBeenCalledWith( expect(mockBuffer.setText).toHaveBeenCalledWith(
'initial hello world', 'initial hello world',
'end', 19,
); );
}); });
@@ -5172,13 +5169,48 @@ describe('InputPrompt', () => {
await waitFor(() => { await waitFor(() => {
expect(mockBuffer.setText).toHaveBeenCalledWith( expect(mockBuffer.setText).toHaveBeenCalledWith(
'First turn. Second turn.', 'First turn. Second turn.',
'end', 24,
); );
}); });
unmount(); unmount();
}); });
it('should insert transcription at cursor position when buffer has text before and after (toggle)', async () => {
await act(async () => {
mockBuffer.setText('hello world');
mockBuffer.cursor = [0, 5]; // cursor after 'hello'
});
const { stdin, unmount } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'toggle' } },
}),
},
);
await act(async () => {
stdin.write(' ');
});
await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
'transcription',
'there',
);
});
// 'hello'(5) + ' '(1) + 'there'(5) = cursor at 11; ' world' preserved after
await waitFor(() => {
expect(mockBuffer.setText).toHaveBeenCalledWith(
'hello there world',
11,
);
});
unmount();
});
describe('push-to-talk', () => { describe('push-to-talk', () => {
beforeEach(() => { beforeEach(() => {
vi.useFakeTimers(); vi.useFakeTimers();
+19 -20
View File
@@ -51,6 +51,7 @@ export function useVoiceMode({
const recorderRef = useRef<AudioRecorder | null>(null); const recorderRef = useRef<AudioRecorder | null>(null);
const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null); const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null);
const turnBaselineRef = useRef<string | null>(null); const turnBaselineRef = useRef<string | null>(null);
const turnBaselineCursorOffsetRef = useRef<number>(0);
const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle'); const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle');
const pttTimerRef = useRef<NodeJS.Timeout | null>(null); const pttTimerRef = useRef<NodeJS.Timeout | null>(null);
@@ -112,6 +113,7 @@ export function useVoiceMode({
recordingInProgressRef.current = true; recordingInProgressRef.current = true;
turnBaselineRef.current = bufferRef.current.text; turnBaselineRef.current = bufferRef.current.text;
turnBaselineCursorOffsetRef.current = bufferRef.current.getOffset();
setIsConnecting(true); setIsConnecting(true);
setIsRecording(true); setIsRecording(true);
@@ -193,29 +195,23 @@ export function useVoiceMode({
} }
if (text) { if (text) {
const currentBufferText = bufferRef.current.text; const baseline = turnBaselineRef.current ?? '';
const previousTranscription = liveTranscriptionRef.current; const insertOffset = turnBaselineCursorOffsetRef.current;
const textBefore = baseline.slice(0, insertOffset);
const textAfter = baseline.slice(insertOffset);
let newTotalText = currentBufferText; const prefix =
textBefore.length > 0 && !/\s$/.test(textBefore)
? textBefore + ' '
: textBefore;
if ( const suffix =
previousTranscription && text.length > 0 && textAfter.length > 0 && !/^\s/.test(textAfter)
currentBufferText.endsWith(previousTranscription) ? ' '
) { : '';
newTotalText = currentBufferText.slice(
0,
-previousTranscription.length,
);
} else if (
currentBufferText &&
!currentBufferText.endsWith(' ') &&
!currentBufferText.endsWith('\n')
) {
newTotalText += ' ';
}
newTotalText += text; const newTotalText = prefix + text + suffix + textAfter;
bufferRef.current.setText(newTotalText, 'end'); bufferRef.current.setText(newTotalText, prefix.length + text.length);
} }
liveTranscriptionRef.current = text; liveTranscriptionRef.current = text;
}); });
@@ -226,6 +222,9 @@ export function useVoiceMode({
stopRequestedRef.current stopRequestedRef.current
) )
return; return;
// Advance the baseline so subsequent turns append after this turn's text
turnBaselineRef.current = bufferRef.current.text;
turnBaselineCursorOffsetRef.current = bufferRef.current.getOffset();
liveTranscriptionRef.current = ''; liveTranscriptionRef.current = '';
}); });