mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-13 05:12:55 -07:00
fix(cli): insert voice transcription at cursor position instead of appending (#26287)
Co-authored-by: Zheyuan <zlin252@emory.edu>
This commit is contained in:
@@ -348,7 +348,7 @@ describe('InputPrompt', () => {
|
||||
visualToLogicalMap: [[0, 0]],
|
||||
visualToTransformedMap: [0],
|
||||
transformationsByLine: [],
|
||||
getOffset: vi.fn().mockReturnValue(0),
|
||||
getOffset: vi.fn().mockImplementation(() => mockBuffer.cursor[1]),
|
||||
pastedContent: {},
|
||||
} as unknown as TextBuffer;
|
||||
|
||||
@@ -5114,17 +5114,15 @@ describe('InputPrompt', () => {
|
||||
);
|
||||
});
|
||||
await waitFor(() => {
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 'end');
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith('initial hello', 13);
|
||||
});
|
||||
|
||||
// Emit turnComplete (Gemini Live starts over after this)
|
||||
// turnComplete advances the baseline; next turn appends after it
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'turnComplete',
|
||||
);
|
||||
});
|
||||
|
||||
// Emit second part (Gemini Live sends new turn text starting from empty)
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'transcription',
|
||||
@@ -5132,10 +5130,9 @@ describe('InputPrompt', () => {
|
||||
);
|
||||
});
|
||||
await waitFor(() => {
|
||||
// Should have appended 'world' to the baseline 'initial hello'
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith(
|
||||
'initial hello world',
|
||||
'end',
|
||||
19,
|
||||
);
|
||||
});
|
||||
|
||||
@@ -5172,13 +5169,48 @@ describe('InputPrompt', () => {
|
||||
await waitFor(() => {
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith(
|
||||
'First turn. Second turn.',
|
||||
'end',
|
||||
24,
|
||||
);
|
||||
});
|
||||
|
||||
unmount();
|
||||
});
|
||||
|
||||
// Toggle-mode test: with the cursor placed inside existing text, a
// 'transcription' event must lead to setText being called with the spoken
// word placed between the text before and after the cursor, and with a
// numeric cursor offset that sits just past the inserted word.
it('should insert transcription at cursor position when buffer has text before and after (toggle)', async () => {
|
||||
await act(async () => {
|
||||
mockBuffer.setText('hello world');
|
||||
mockBuffer.cursor = [0, 5]; // cursor after 'hello'
|
||||
});
|
||||
const { stdin, unmount } = await renderWithProviders(
|
||||
<TestInputPrompt {...props} focus={true} buffer={mockBuffer} />,
|
||||
{
|
||||
uiState: { isVoiceModeEnabled: true } as UIState,
|
||||
settings: createMockSettings({
|
||||
experimental: { voice: { activationMode: 'toggle' } },
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
// NOTE(review): presumably the space keypress toggles voice recording on in
|
// 'toggle' activation mode — confirm against the prompt's key handling.
await act(async () => {
|
||||
stdin.write(' ');
|
||||
});
|
||||
await act(async () => {
|
||||
(fakeTranscriptionProvider as unknown as EventEmitter).emit(
|
||||
'transcription',
|
||||
'there',
|
||||
);
|
||||
});
|
||||
|
||||
// 'hello'(5) + ' '(1) + 'there'(5) = cursor at 11; ' world' preserved after
|
||||
await waitFor(() => {
|
||||
expect(mockBuffer.setText).toHaveBeenCalledWith(
|
||||
'hello there world',
|
||||
11,
|
||||
);
|
||||
});
|
||||
unmount();
|
||||
});
|
||||
|
||||
describe('push-to-talk', () => {
|
||||
beforeEach(() => {
|
||||
vi.useFakeTimers();
|
||||
|
||||
@@ -51,6 +51,7 @@ export function useVoiceMode({
|
||||
const recorderRef = useRef<AudioRecorder | null>(null);
|
||||
const transcriptionServiceRef = useRef<TranscriptionProvider | null>(null);
|
||||
const turnBaselineRef = useRef<string | null>(null);
|
||||
const turnBaselineCursorOffsetRef = useRef<number>(0);
|
||||
|
||||
const pttStateRef = useRef<'idle' | 'possible-hold' | 'recording'>('idle');
|
||||
const pttTimerRef = useRef<NodeJS.Timeout | null>(null);
|
||||
@@ -112,6 +113,7 @@ export function useVoiceMode({
|
||||
|
||||
recordingInProgressRef.current = true;
|
||||
turnBaselineRef.current = bufferRef.current.text;
|
||||
turnBaselineCursorOffsetRef.current = bufferRef.current.getOffset();
|
||||
|
||||
setIsConnecting(true);
|
||||
setIsRecording(true);
|
||||
@@ -193,29 +195,23 @@ export function useVoiceMode({
|
||||
}
|
||||
|
||||
if (text) {
|
||||
const currentBufferText = bufferRef.current.text;
|
||||
const previousTranscription = liveTranscriptionRef.current;
|
||||
const baseline = turnBaselineRef.current ?? '';
|
||||
const insertOffset = turnBaselineCursorOffsetRef.current;
|
||||
const textBefore = baseline.slice(0, insertOffset);
|
||||
const textAfter = baseline.slice(insertOffset);
|
||||
|
||||
let newTotalText = currentBufferText;
|
||||
const prefix =
|
||||
textBefore.length > 0 && !/\s$/.test(textBefore)
|
||||
? textBefore + ' '
|
||||
: textBefore;
|
||||
|
||||
if (
|
||||
previousTranscription &&
|
||||
currentBufferText.endsWith(previousTranscription)
|
||||
) {
|
||||
newTotalText = currentBufferText.slice(
|
||||
0,
|
||||
-previousTranscription.length,
|
||||
);
|
||||
} else if (
|
||||
currentBufferText &&
|
||||
!currentBufferText.endsWith(' ') &&
|
||||
!currentBufferText.endsWith('\n')
|
||||
) {
|
||||
newTotalText += ' ';
|
||||
}
|
||||
const suffix =
|
||||
text.length > 0 && textAfter.length > 0 && !/^\s/.test(textAfter)
|
||||
? ' '
|
||||
: '';
|
||||
|
||||
newTotalText += text;
|
||||
bufferRef.current.setText(newTotalText, 'end');
|
||||
const newTotalText = prefix + text + suffix + textAfter;
|
||||
bufferRef.current.setText(newTotalText, prefix.length + text.length);
|
||||
}
|
||||
liveTranscriptionRef.current = text;
|
||||
});
|
||||
@@ -226,6 +222,9 @@ export function useVoiceMode({
|
||||
stopRequestedRef.current
|
||||
)
|
||||
return;
|
||||
// Advance the baseline so subsequent turns append after this turn's text
|
||||
turnBaselineRef.current = bufferRef.current.text;
|
||||
turnBaselineCursorOffsetRef.current = bufferRef.current.getOffset();
|
||||
liveTranscriptionRef.current = '';
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user