fix(cli): insert voice transcription at cursor position instead of appending to end (#26287)

Co-authored-by: Zheyuan <zlin252@emory.edu>
This commit is contained in:
Zheyuan Lin
2026-05-01 12:41:17 -04:00
committed by GitHub
parent d9f273e440
commit 7213822e84
2 changed files with 59 additions and 28 deletions
@@ -348,7 +348,7 @@ describe('InputPrompt', () => {
visualToLogicalMap: [[0, 0]], visualToLogicalMap: [[0, 0]],
visualToTransformedMap: [0], visualToTransformedMap: [0],
transformationsByLine: [], transformationsByLine: [],
getOffset: vi.fn().mockReturnValue(0), getOffset: vi.fn().mockImplementation(() => mockBuffer.cursor[1]),
pastedContent: {}, pastedContent: {},
} as unknown as TextBuffer; } as unknown as TextBuffer;
@@ -5114,17 +5114,15 @@ describe('InputPrompt', () => {
); );
}); });
await waitFor(() => { await waitFor(() => {
expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end'); expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 13);
}); });
// Emit turnComplete (Gemini Live starts over after this) // turnComplete advances the baseline; next turn appends after it
await act(async () => { await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit( (fakeTranscriptionProvider as unknown as EventEmitter).emit(
'turnComplete', 'turnComplete',
); );
}); });
// Emit second part (Gemini Live sends new turn text starting from empty)
await act(async () => { await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit( (fakeTranscriptionProvider as unknown as EventEmitter).emit(
'transcription', 'transcription',
@@ -5132,10 +5130,9 @@ describe('InputPrompt', () => {
); );
}); });
await waitFor(() => { await waitFor(() => {
// Should have appended 'world' to the baseline 'initial hello'
expect(mockBuffer.setText).toHaveBeenCalledWith( expect(mockBuffer.setText).toHaveBeenCalledWith(
'initial hello world', 'initial hello world',
'end', 19,
); );
}); });
@@ -5172,13 +5169,48 @@ describe('InputPrompt', () => {
await waitFor(() => { await waitFor(() => {
expect(mockBuffer.setText).toHaveBeenCalledWith( expect(mockBuffer.setText).toHaveBeenCalledWith(
'First turn. Second turn.', 'First turn. Second turn.',
'end', 24,
); );
}); });
unmount(); unmount();
}); });
it('should insert transcription at cursor position when buffer has text before and after (toggle)', async () => {
await act(async () => {
mockBuffer.setText('hello world');
mockBuffer.cursor = [0, 5]; // cursor after 'hello'
});
const { stdin, unmount } = await renderWithProviders(
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
{
uiState: { isVoiceModeEnabled: true } as UIState,
settings: createMockSettings({
experimental: { voice: { activationMode: 'toggle' } },
}),
},
);
await act(async () => {
stdin.write(' ');
});
await act(async () => {
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
'transcription',
'there',
);
});
// 'hello'(5) + ' '(1) + 'there'(5) = cursor at 11; ' world' preserved after
await waitFor(() => {
expect(mockBuffer.setText).toHaveBeenCalledWith(
'hello there world',
11,
);
});
unmount();
});
describe('push-to-talk', () => { describe('push-to-talk', () => {
beforeEach(() => { beforeEach(() => {
vi.useFakeTimers(); vi.useFakeTimers();
+19 -20
View File
@@ -51,6 +51,7 @@ export function useVoiceMode({
const recorderRef = useRef<AudioRecorder | null>(null); const recorderRef = useRef<AudioRecorder | null>(null);
const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null); const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null);
const turnBaselineRef = useRef<string | null>(null); const turnBaselineRef = useRef<string | null>(null);
const turnBaselineCursorOffsetRef = useRef<number>(0);
const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle'); const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle');
const pttTimerRef = useRef<NodeJS.Timeout | null>(null); const pttTimerRef = useRef<NodeJS.Timeout | null>(null);
@@ -112,6 +113,7 @@ export function useVoiceMode({
recordingInProgressRef.current = true; recordingInProgressRef.current = true;
turnBaselineRef.current = bufferRef.current.text; turnBaselineRef.current = bufferRef.current.text;
turnBaselineCursorOffsetRef.current = bufferRef.current.getOffset();
setIsConnecting(true); setIsConnecting(true);
setIsRecording(true); setIsRecording(true);
@@ -193,29 +195,23 @@ export function useVoiceMode({
} }
if (text) { if (text) {
const currentBufferText = bufferRef.current.text; const baseline = turnBaselineRef.current ?? '';
const previousTranscription = liveTranscriptionRef.current; const insertOffset = turnBaselineCursorOffsetRef.current;
const textBefore = baseline.slice(0, insertOffset);
const textAfter = baseline.slice(insertOffset);
let newTotalText = currentBufferText; const prefix =
textBefore.length > 0 && !/\s$/.test(textBefore)
? textBefore + ' '
: textBefore;
if ( const suffix =
previousTranscription && text.length > 0 && textAfter.length > 0 && !/^\s/.test(textAfter)
currentBufferText.endsWith(previousTranscription) ? ' '
) { : '';
newTotalText = currentBufferText.slice(
0,
-previousTranscription.length,
);
} else if (
currentBufferText &&
!currentBufferText.endsWith(' ') &&
!currentBufferText.endsWith('\n')
) {
newTotalText += ' ';
}
newTotalText += text; const newTotalText = prefix + text + suffix + textAfter;
bufferRef.current.setText(newTotalText, 'end'); bufferRef.current.setText(newTotalText, prefix.length + text.length);
} }
liveTranscriptionRef.current = text; liveTranscriptionRef.current = text;
}); });
@@ -226,6 +222,9 @@ export function useVoiceMode({
stopRequestedRef.current stopRequestedRef.current
) )
return; return;
// Advance the baseline so subsequent turns append after this turn's text
turnBaselineRef.current = bufferRef.current.text;
turnBaselineCursorOffsetRef.current = bufferRef.current.getOffset();
liveTranscriptionRef.current = ''; liveTranscriptionRef.current = '';
}); });