fix(cli): insert voice transcription at cursor position instead of appending (#26287)

Co-authored-by: Zheyuan <zlin252@emory.edu>
2026-05-13 05:12:55 -07:00 · 2026-05-01 12:41:17 -04:00
parent d9f273e440
commit 7213822e84
2 changed files with 59 additions and 28 deletions
@@ -348,7 +348,7 @@ describe('InputPrompt', () => {
      visualToLogicalMap: [[0, 0]],
      visualToTransformedMap: [0],
      transformationsByLine: [],
-      getOffset: vi.fn().mockReturnValue(0),
+      getOffset: vi.fn().mockImplementation(() => mockBuffer.cursor[1]),
      pastedContent: {},
    } as unknown as TextBuffer;

@@ -5114,17 +5114,15 @@ describe('InputPrompt', () => {
        );
      });
      await waitFor(() => {
-        expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
+        expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 13);
      });

-      // Emit turnComplete (Gemini Live starts over after this)
+      // turnComplete advances the baseline; next turn appends after it
      await act(async () => {
        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
          'turnComplete',
        );
      });
-
-      // Emit second part (Gemini Live sends new turn text starting from empty)
      await act(async () => {
        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
          'transcription',
@@ -5132,10 +5130,9 @@ describe('InputPrompt', () => {
        );
      });
      await waitFor(() => {
-        // Should have appended 'world' to the baseline 'initial hello'
        expect(mockBuffer.setText).toHaveBeenCalledWith(
          'initial hello world',
-          'end',
+          19,
        );
      });

@@ -5172,13 +5169,48 @@ describe('InputPrompt', () => {
      await waitFor(() => {
        expect(mockBuffer.setText).toHaveBeenCalledWith(
          'First turn. Second turn.',
-          'end',
+          24,
        );
      });

      unmount();
    });

+    it('should insert transcription at cursor position when buffer has text before and after (toggle)', async () => {
+      await act(async () => {
+        mockBuffer.setText('hello world');
+        mockBuffer.cursor = [0, 5]; // cursor after 'hello'
+      });
+      const { stdin, unmount } = await renderWithProviders(
+        <TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
+        {
+          uiState: { isVoiceModeEnabled: true } as UIState,
+          settings: createMockSettings({
+            experimental: { voice: { activationMode: 'toggle' } },
+          }),
+        },
+      );
+
+      await act(async () => {
+        stdin.write(' ');
+      });
+      await act(async () => {
+        (fakeTranscriptionProvider as unknown as EventEmitter).emit(
+          'transcription',
+          'there',
+        );
+      });
+
+      // 'hello'(5) + ' '(1) + 'there'(5) = cursor at 11; ' world' preserved after
+      await waitFor(() => {
+        expect(mockBuffer.setText).toHaveBeenCalledWith(
+          'hello there world',
+          11,
+        );
+      });
+      unmount();
+    });
+
    describe('push-to-talk', () => {
      beforeEach(() => {
        vi.useFakeTimers();
@@ -51,6 +51,7 @@ export function useVoiceMode({
  const recorderRef = useRef<AudioRecorder | null>(null);
  const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null);
  const turnBaselineRef = useRef<string | null>(null);
+  const turnBaselineCursorOffsetRef = useRef<number>(0);

  const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle');
  const pttTimerRef = useRef<NodeJS.Timeout | null>(null);
@@ -112,6 +113,7 @@ export function useVoiceMode({

    recordingInProgressRef.current = true;
    turnBaselineRef.current = bufferRef.current.text;
+    turnBaselineCursorOffsetRef.current = bufferRef.current.getOffset();

    setIsConnecting(true);
    setIsRecording(true);
@@ -193,29 +195,23 @@ export function useVoiceMode({
        }

        if (text) {
-          const currentBufferText = bufferRef.current.text;
-          const previousTranscription = liveTranscriptionRef.current;
+          const baseline = turnBaselineRef.current ?? '';
+          const insertOffset = turnBaselineCursorOffsetRef.current;
+          const textBefore = baseline.slice(0, insertOffset);
+          const textAfter = baseline.slice(insertOffset);

-          let newTotalText = currentBufferText;
+          const prefix =
+            textBefore.length > 0 && !/\s$/.test(textBefore)
+              ? textBefore + ' '
+              : textBefore;

-          if (
-            previousTranscription &&
-            currentBufferText.endsWith(previousTranscription)
-          ) {
-            newTotalText = currentBufferText.slice(
-              0,
-              -previousTranscription.length,
-            );
-          } else if (
-            currentBufferText &&
-            !currentBufferText.endsWith(' ') &&
-            !currentBufferText.endsWith('\n')
-          ) {
-            newTotalText += ' ';
-          }
+          const suffix =
+            text.length > 0 && textAfter.length > 0 && !/^\s/.test(textAfter)
+              ? ' '
+              : '';

-          newTotalText += text;
-          bufferRef.current.setText(newTotalText, 'end');
+          const newTotalText = prefix + text + suffix + textAfter;
+          bufferRef.current.setText(newTotalText, prefix.length + text.length);
        }
        liveTranscriptionRef.current = text;
      });
@@ -226,6 +222,9 @@ export function useVoiceMode({
          stopRequestedRef.current
        )
          return;
+        // Advance the baseline so subsequent turns append after this turn's text
+        turnBaselineRef.current = bufferRef.current.text;
+        turnBaselineCursorOffsetRef.current = bufferRef.current.getOffset();
        liveTranscriptionRef.current = '';
      });