diff --git a/docs/sidebar.json b/docs/sidebar.json index 7c201e0071..e26004a973 100644 --- a/docs/sidebar.json +++ b/docs/sidebar.json @@ -111,7 +111,7 @@ "badge": "🔬", "slug": "docs/cli/notifications" }, - { "label": "Plan mode", "badge": "🔬", "slug": "docs/cli/plan-mode" }, + { "label": "Plan mode", "slug": "docs/cli/plan-mode" }, { "label": "Subagents", "badge": "🔬", diff --git a/evals/tracker.eval.ts b/evals/tracker.eval.ts new file mode 100644 index 0000000000..7afb41dbec --- /dev/null +++ b/evals/tracker.eval.ts @@ -0,0 +1,116 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { + TRACKER_CREATE_TASK_TOOL_NAME, + TRACKER_UPDATE_TASK_TOOL_NAME, +} from '@google/gemini-cli-core'; +import { evalTest, assertModelHasOutput } from './test-helper.js'; +import fs from 'node:fs'; +import path from 'node:path'; + +const FILES = { + 'package.json': JSON.stringify({ + name: 'test-project', + version: '1.0.0', + scripts: { test: 'echo "All tests passed!"' }, + }), + 'src/login.js': + 'function login(username, password) {\n if (!username) throw new Error("Missing username");\n // BUG: missing password check\n return true;\n}', +} as const; + +describe('tracker_mode', () => { + evalTest('USUALLY_PASSES', { + name: 'should manage tasks in the tracker when explicitly requested during a bug fix', + params: { + settings: { experimental: { taskTracker: true } }, + }, + files: FILES, + prompt: + 'We have a bug in src/login.js: the password check is missing. First, create a task in the tracker to fix it. Then fix the bug, and mark the task as closed.', + assert: async (rig, result) => { + const wasCreateCalled = await rig.waitForToolCall( + TRACKER_CREATE_TASK_TOOL_NAME, + ); + expect( + wasCreateCalled, + 'Expected tracker_create_task tool to be called', + ).toBe(true); + + const toolLogs = rig.readToolLogs(); + const createCall = toolLogs.find( + (log) => log.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME, + ); + expect(createCall).toBeDefined(); + const args = JSON.parse(createCall!.toolRequest.args); + expect( + (args.title?.toLowerCase() ?? '') + + (args.description?.toLowerCase() ?? ''), + ).toContain('login'); + + const wasUpdateCalled = await rig.waitForToolCall( + TRACKER_UPDATE_TASK_TOOL_NAME, + ); + expect( + wasUpdateCalled, + 'Expected tracker_update_task tool to be called', + ).toBe(true); + + const updateCall = toolLogs.find( + (log) => log.toolRequest.name === TRACKER_UPDATE_TASK_TOOL_NAME, + ); + expect(updateCall).toBeDefined(); + const updateArgs = JSON.parse(updateCall!.toolRequest.args); + expect(updateArgs.status).toBe('closed'); + + const loginContent = fs.readFileSync( + path.join(rig.testDir!, 'src/login.js'), + 'utf-8', + ); + expect(loginContent).not.toContain('// BUG: missing password check'); + + assertModelHasOutput(result); + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should implicitly create tasks when asked to build a feature plan', + params: { + settings: { experimental: { taskTracker: true } }, + }, + files: FILES, + prompt: + 'I need to build a complex new feature for user authentication in our project. Create a detailed implementation plan and organize the work into bite-sized chunks. Do not actually implement the code yet, just plan it.', + assert: async (rig, result) => { + // The model should proactively use tracker_create_task to organize the work + const wasToolCalled = await rig.waitForToolCall( + TRACKER_CREATE_TASK_TOOL_NAME, + ); + expect( + wasToolCalled, + 'Expected tracker_create_task to be called implicitly to organize plan', + ).toBe(true); + + const toolLogs = rig.readToolLogs(); + const createCalls = toolLogs.filter( + (log) => log.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME, + ); + + // We expect it to create at least one task for authentication, likely more. + expect(createCalls.length).toBeGreaterThan(0); + + // Verify it didn't write any code since we asked it to just plan + const loginContent = fs.readFileSync( + path.join(rig.testDir!, 'src/login.js'), + 'utf-8', + ); + expect(loginContent).toContain('// BUG: missing password check'); + + assertModelHasOutput(result); + }, + }); +}); diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx index 1f2ef5f90c..a1251f4143 100644 --- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx +++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx @@ -3510,6 +3510,116 @@ describe('useGeminiStream', () => { expect(result.current.loopDetectionConfirmationRequest).not.toBeNull(); }); }); + + describe('Race Condition Prevention', () => { + it('should reject concurrent submitQuery when already responding', async () => { + // Stream that stays open (simulates "still responding") + mockSendMessageStream.mockReturnValue( + (async function* () { + yield { + type: ServerGeminiEventType.Content, + value: 'First response', + }; + // Keep the stream open + await new Promise(() => {}); + })(), + ); + + const { result } = renderTestHook(); + + // Start first query without awaiting (fire-and-forget, like existing tests) + await act(async () => { + // eslint-disable-next-line @typescript-eslint/no-floating-promises + result.current.submitQuery('first query'); + }); + + // Wait for the stream to start responding + await waitFor(() => { + expect(result.current.streamingState).toBe(StreamingState.Responding); + }); + + // Try a second query while first is still responding + await act(async () => { + // eslint-disable-next-line @typescript-eslint/no-floating-promises + result.current.submitQuery('second query'); + }); + + // Should have only called sendMessageStream once (second was rejected) + expect(mockSendMessageStream).toHaveBeenCalledTimes(1); + }); + + it('should allow continuation queries via loop detection retry', async () => { + const mockLoopDetectionService = { + disableForSession: vi.fn(), + }; + const mockClient = { + ...new MockedGeminiClientClass(mockConfig), + getLoopDetectionService: () => mockLoopDetectionService, + }; + mockConfig.getGeminiClient = vi.fn().mockReturnValue(mockClient); + + // First call triggers loop detection + mockSendMessageStream.mockReturnValueOnce( + (async function* () { + yield { + type: ServerGeminiEventType.LoopDetected, + }; + })(), + ); + + // Retry call succeeds + mockSendMessageStream.mockReturnValueOnce( + (async function* () { + yield { + type: ServerGeminiEventType.Content, + value: 'Retry success', + }; + yield { + type: ServerGeminiEventType.Finished, + value: { reason: 'STOP' }, + }; + })(), + ); + + const { result } = renderTestHook(); + + await act(async () => { + await result.current.submitQuery('test query'); + }); + + await waitFor(() => { + expect( + result.current.loopDetectionConfirmationRequest, + ).not.toBeNull(); + }); + + // User selects "disable" which triggers a continuation query + await act(async () => { + result.current.loopDetectionConfirmationRequest?.onComplete({ + userSelection: 'disable', + }); + }); + + // Verify disableForSession was called + expect( + mockLoopDetectionService.disableForSession, + ).toHaveBeenCalledTimes(1); + + // Continuation query should have gone through (2 total calls) + await waitFor(() => { + expect(mockSendMessageStream).toHaveBeenCalledTimes(2); + expect(mockSendMessageStream).toHaveBeenNthCalledWith( + 2, + 'test query', + expect.any(AbortSignal), + expect.any(String), + undefined, + false, + 'test query', + ); + }); + }); + }); }); describe('Agent Execution Events', () => { diff --git a/packages/cli/src/ui/hooks/useGeminiStream.ts b/packages/cli/src/ui/hooks/useGeminiStream.ts index d254902a94..d2e485db1f 100644 --- a/packages/cli/src/ui/hooks/useGeminiStream.ts +++ b/packages/cli/src/ui/hooks/useGeminiStream.ts @@ -216,7 +216,15 @@ export const useGeminiStream = ( const previousApprovalModeRef = useRef( config.getApprovalMode(), ); - const [isResponding, setIsResponding] = useState(false); + const [isResponding, setIsRespondingState] = useState(false); + const isRespondingRef = useRef(false); + const setIsResponding = useCallback( + (value: boolean) => { + setIsRespondingState(value); + isRespondingRef.current = value; + }, + [setIsRespondingState], + ); const [thought, thoughtRef, setThought] = useStateAndRef(null); const [pendingHistoryItem, pendingHistoryItemRef, setPendingHistoryItem] = @@ -320,11 +328,14 @@ export const useGeminiStream = ( return (executingShellTool as TrackedExecutingToolCall | undefined)?.pid; }, [toolCalls]); - const onExec = useCallback(async (done: Promise) => { - setIsResponding(true); - await done; - setIsResponding(false); - }, []); + const onExec = useCallback( + async (done: Promise) => { + setIsResponding(true); + await done; + setIsResponding(false); + }, + [setIsResponding], + ); const { handleShellCommand, @@ -538,7 +549,7 @@ export const useGeminiStream = ( setIsResponding(false); } prevActiveShellPtyIdRef.current = activeShellPtyId; - }, [activeShellPtyId, addItem]); + }, [activeShellPtyId, addItem, setIsResponding]); useEffect(() => { if ( @@ -700,6 +711,7 @@ export const useGeminiStream = ( cancelAllToolCalls, toolCalls, activeShellPtyId, + setIsResponding, ]); useKeypress( @@ -952,7 +964,13 @@ export const useGeminiStream = ( setIsResponding(false); setThought(null); // Reset thought when user cancels }, - [addItem, pendingHistoryItemRef, setPendingHistoryItem, setThought], + [ + addItem, + pendingHistoryItemRef, + setPendingHistoryItem, + setThought, + setIsResponding, + ], ); const handleErrorEvent = useCallback( @@ -1358,14 +1376,15 @@ export const useGeminiStream = ( async ({ metadata: spanMetadata }) => { spanMetadata.input = query; - const queryId = `${Date.now()}-${Math.random()}`; - activeQueryIdRef.current = queryId; if ( - (streamingState === StreamingState.Responding || + (isRespondingRef.current || + streamingState === StreamingState.Responding || streamingState === StreamingState.WaitingForConfirmation) && !options?.isContinuation ) return; + const queryId = `${Date.now()}-${Math.random()}`; + activeQueryIdRef.current = queryId; const userMessageTimestamp = Date.now(); @@ -1452,7 +1471,7 @@ export const useGeminiStream = ( loopDetectedRef.current = false; // Show the confirmation dialog to choose whether to disable loop detection setLoopDetectionConfirmationRequest({ - onComplete: (result: { + onComplete: async (result: { userSelection: 'disable' | 'keep'; }) => { setLoopDetectionConfirmationRequest(null); @@ -1468,8 +1487,7 @@ export const useGeminiStream = ( }); if (lastQueryRef.current && lastPromptIdRef.current) { - // eslint-disable-next-line @typescript-eslint/no-floating-promises - submitQuery( + await submitQuery( lastQueryRef.current, { isContinuation: true }, lastPromptIdRef.current, @@ -1537,6 +1555,7 @@ export const useGeminiStream = ( maybeAddSuppressedToolErrorNote, maybeAddLowVerbosityFailureNote, settings.merged.billing?.overageStrategy, + setIsResponding, ], ); @@ -1803,6 +1822,7 @@ export const useGeminiStream = ( isLowErrorVerbosity, maybeAddSuppressedToolErrorNote, maybeAddLowVerbosityFailureNote, + setIsResponding, ], );