mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-10 22:21:22 -07:00
Merge branch 'main' into feat/browser-allowed-domain
This commit is contained in:
@@ -111,7 +111,7 @@
|
||||
"badge": "🔬",
|
||||
"slug": "docs/cli/notifications"
|
||||
},
|
||||
{ "label": "Plan mode", "badge": "🔬", "slug": "docs/cli/plan-mode" },
|
||||
{ "label": "Plan mode", "slug": "docs/cli/plan-mode" },
|
||||
{
|
||||
"label": "Subagents",
|
||||
"badge": "🔬",
|
||||
|
||||
116
evals/tracker.eval.ts
Normal file
116
evals/tracker.eval.ts
Normal file
@@ -0,0 +1,116 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import {
|
||||
TRACKER_CREATE_TASK_TOOL_NAME,
|
||||
TRACKER_UPDATE_TASK_TOOL_NAME,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { evalTest, assertModelHasOutput } from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
|
||||
const FILES = {
|
||||
'package.json': JSON.stringify({
|
||||
name: 'test-project',
|
||||
version: '1.0.0',
|
||||
scripts: { test: 'echo "All tests passed!"' },
|
||||
}),
|
||||
'src/login.js':
|
||||
'function login(username, password) {\n if (!username) throw new Error("Missing username");\n // BUG: missing password check\n return true;\n}',
|
||||
} as const;
|
||||
|
||||
describe('tracker_mode', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: 'should manage tasks in the tracker when explicitly requested during a bug fix',
|
||||
params: {
|
||||
settings: { experimental: { taskTracker: true } },
|
||||
},
|
||||
files: FILES,
|
||||
prompt:
|
||||
'We have a bug in src/login.js: the password check is missing. First, create a task in the tracker to fix it. Then fix the bug, and mark the task as closed.',
|
||||
assert: async (rig, result) => {
|
||||
const wasCreateCalled = await rig.waitForToolCall(
|
||||
TRACKER_CREATE_TASK_TOOL_NAME,
|
||||
);
|
||||
expect(
|
||||
wasCreateCalled,
|
||||
'Expected tracker_create_task tool to be called',
|
||||
).toBe(true);
|
||||
|
||||
const toolLogs = rig.readToolLogs();
|
||||
const createCall = toolLogs.find(
|
||||
(log) => log.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME,
|
||||
);
|
||||
expect(createCall).toBeDefined();
|
||||
const args = JSON.parse(createCall!.toolRequest.args);
|
||||
expect(
|
||||
(args.title?.toLowerCase() ?? '') +
|
||||
(args.description?.toLowerCase() ?? ''),
|
||||
).toContain('login');
|
||||
|
||||
const wasUpdateCalled = await rig.waitForToolCall(
|
||||
TRACKER_UPDATE_TASK_TOOL_NAME,
|
||||
);
|
||||
expect(
|
||||
wasUpdateCalled,
|
||||
'Expected tracker_update_task tool to be called',
|
||||
).toBe(true);
|
||||
|
||||
const updateCall = toolLogs.find(
|
||||
(log) => log.toolRequest.name === TRACKER_UPDATE_TASK_TOOL_NAME,
|
||||
);
|
||||
expect(updateCall).toBeDefined();
|
||||
const updateArgs = JSON.parse(updateCall!.toolRequest.args);
|
||||
expect(updateArgs.status).toBe('closed');
|
||||
|
||||
const loginContent = fs.readFileSync(
|
||||
path.join(rig.testDir!, 'src/login.js'),
|
||||
'utf-8',
|
||||
);
|
||||
expect(loginContent).not.toContain('// BUG: missing password check');
|
||||
|
||||
assertModelHasOutput(result);
|
||||
},
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: 'should implicitly create tasks when asked to build a feature plan',
|
||||
params: {
|
||||
settings: { experimental: { taskTracker: true } },
|
||||
},
|
||||
files: FILES,
|
||||
prompt:
|
||||
'I need to build a complex new feature for user authentication in our project. Create a detailed implementation plan and organize the work into bite-sized chunks. Do not actually implement the code yet, just plan it.',
|
||||
assert: async (rig, result) => {
|
||||
// The model should proactively use tracker_create_task to organize the work
|
||||
const wasToolCalled = await rig.waitForToolCall(
|
||||
TRACKER_CREATE_TASK_TOOL_NAME,
|
||||
);
|
||||
expect(
|
||||
wasToolCalled,
|
||||
'Expected tracker_create_task to be called implicitly to organize plan',
|
||||
).toBe(true);
|
||||
|
||||
const toolLogs = rig.readToolLogs();
|
||||
const createCalls = toolLogs.filter(
|
||||
(log) => log.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME,
|
||||
);
|
||||
|
||||
// We expect it to create at least one task for authentication, likely more.
|
||||
expect(createCalls.length).toBeGreaterThan(0);
|
||||
|
||||
// Verify it didn't write any code since we asked it to just plan
|
||||
const loginContent = fs.readFileSync(
|
||||
path.join(rig.testDir!, 'src/login.js'),
|
||||
'utf-8',
|
||||
);
|
||||
expect(loginContent).toContain('// BUG: missing password check');
|
||||
|
||||
assertModelHasOutput(result);
|
||||
},
|
||||
});
|
||||
});
|
||||
@@ -3510,6 +3510,116 @@ describe('useGeminiStream', () => {
|
||||
expect(result.current.loopDetectionConfirmationRequest).not.toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
describe('Race Condition Prevention', () => {
|
||||
it('should reject concurrent submitQuery when already responding', async () => {
|
||||
// Stream that stays open (simulates "still responding")
|
||||
mockSendMessageStream.mockReturnValue(
|
||||
(async function* () {
|
||||
yield {
|
||||
type: ServerGeminiEventType.Content,
|
||||
value: 'First response',
|
||||
};
|
||||
// Keep the stream open
|
||||
await new Promise(() => {});
|
||||
})(),
|
||||
);
|
||||
|
||||
const { result } = renderTestHook();
|
||||
|
||||
// Start first query without awaiting (fire-and-forget, like existing tests)
|
||||
await act(async () => {
|
||||
// eslint-disable-next-line @typescript-eslint/no-floating-promises
|
||||
result.current.submitQuery('first query');
|
||||
});
|
||||
|
||||
// Wait for the stream to start responding
|
||||
await waitFor(() => {
|
||||
expect(result.current.streamingState).toBe(StreamingState.Responding);
|
||||
});
|
||||
|
||||
// Try a second query while first is still responding
|
||||
await act(async () => {
|
||||
// eslint-disable-next-line @typescript-eslint/no-floating-promises
|
||||
result.current.submitQuery('second query');
|
||||
});
|
||||
|
||||
// Should have only called sendMessageStream once (second was rejected)
|
||||
expect(mockSendMessageStream).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('should allow continuation queries via loop detection retry', async () => {
|
||||
const mockLoopDetectionService = {
|
||||
disableForSession: vi.fn(),
|
||||
};
|
||||
const mockClient = {
|
||||
...new MockedGeminiClientClass(mockConfig),
|
||||
getLoopDetectionService: () => mockLoopDetectionService,
|
||||
};
|
||||
mockConfig.getGeminiClient = vi.fn().mockReturnValue(mockClient);
|
||||
|
||||
// First call triggers loop detection
|
||||
mockSendMessageStream.mockReturnValueOnce(
|
||||
(async function* () {
|
||||
yield {
|
||||
type: ServerGeminiEventType.LoopDetected,
|
||||
};
|
||||
})(),
|
||||
);
|
||||
|
||||
// Retry call succeeds
|
||||
mockSendMessageStream.mockReturnValueOnce(
|
||||
(async function* () {
|
||||
yield {
|
||||
type: ServerGeminiEventType.Content,
|
||||
value: 'Retry success',
|
||||
};
|
||||
yield {
|
||||
type: ServerGeminiEventType.Finished,
|
||||
value: { reason: 'STOP' },
|
||||
};
|
||||
})(),
|
||||
);
|
||||
|
||||
const { result } = renderTestHook();
|
||||
|
||||
await act(async () => {
|
||||
await result.current.submitQuery('test query');
|
||||
});
|
||||
|
||||
await waitFor(() => {
|
||||
expect(
|
||||
result.current.loopDetectionConfirmationRequest,
|
||||
).not.toBeNull();
|
||||
});
|
||||
|
||||
// User selects "disable" which triggers a continuation query
|
||||
await act(async () => {
|
||||
result.current.loopDetectionConfirmationRequest?.onComplete({
|
||||
userSelection: 'disable',
|
||||
});
|
||||
});
|
||||
|
||||
// Verify disableForSession was called
|
||||
expect(
|
||||
mockLoopDetectionService.disableForSession,
|
||||
).toHaveBeenCalledTimes(1);
|
||||
|
||||
// Continuation query should have gone through (2 total calls)
|
||||
await waitFor(() => {
|
||||
expect(mockSendMessageStream).toHaveBeenCalledTimes(2);
|
||||
expect(mockSendMessageStream).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
'test query',
|
||||
expect.any(AbortSignal),
|
||||
expect.any(String),
|
||||
undefined,
|
||||
false,
|
||||
'test query',
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('Agent Execution Events', () => {
|
||||
|
||||
@@ -216,7 +216,15 @@ export const useGeminiStream = (
|
||||
const previousApprovalModeRef = useRef<ApprovalMode>(
|
||||
config.getApprovalMode(),
|
||||
);
|
||||
const [isResponding, setIsResponding] = useState<boolean>(false);
|
||||
const [isResponding, setIsRespondingState] = useState<boolean>(false);
|
||||
const isRespondingRef = useRef<boolean>(false);
|
||||
const setIsResponding = useCallback(
|
||||
(value: boolean) => {
|
||||
setIsRespondingState(value);
|
||||
isRespondingRef.current = value;
|
||||
},
|
||||
[setIsRespondingState],
|
||||
);
|
||||
const [thought, thoughtRef, setThought] =
|
||||
useStateAndRef<ThoughtSummary | null>(null);
|
||||
const [pendingHistoryItem, pendingHistoryItemRef, setPendingHistoryItem] =
|
||||
@@ -320,11 +328,14 @@ export const useGeminiStream = (
|
||||
return (executingShellTool as TrackedExecutingToolCall | undefined)?.pid;
|
||||
}, [toolCalls]);
|
||||
|
||||
const onExec = useCallback(async (done: Promise<void>) => {
|
||||
setIsResponding(true);
|
||||
await done;
|
||||
setIsResponding(false);
|
||||
}, []);
|
||||
const onExec = useCallback(
|
||||
async (done: Promise<void>) => {
|
||||
setIsResponding(true);
|
||||
await done;
|
||||
setIsResponding(false);
|
||||
},
|
||||
[setIsResponding],
|
||||
);
|
||||
|
||||
const {
|
||||
handleShellCommand,
|
||||
@@ -538,7 +549,7 @@ export const useGeminiStream = (
|
||||
setIsResponding(false);
|
||||
}
|
||||
prevActiveShellPtyIdRef.current = activeShellPtyId;
|
||||
}, [activeShellPtyId, addItem]);
|
||||
}, [activeShellPtyId, addItem, setIsResponding]);
|
||||
|
||||
useEffect(() => {
|
||||
if (
|
||||
@@ -700,6 +711,7 @@ export const useGeminiStream = (
|
||||
cancelAllToolCalls,
|
||||
toolCalls,
|
||||
activeShellPtyId,
|
||||
setIsResponding,
|
||||
]);
|
||||
|
||||
useKeypress(
|
||||
@@ -952,7 +964,13 @@ export const useGeminiStream = (
|
||||
setIsResponding(false);
|
||||
setThought(null); // Reset thought when user cancels
|
||||
},
|
||||
[addItem, pendingHistoryItemRef, setPendingHistoryItem, setThought],
|
||||
[
|
||||
addItem,
|
||||
pendingHistoryItemRef,
|
||||
setPendingHistoryItem,
|
||||
setThought,
|
||||
setIsResponding,
|
||||
],
|
||||
);
|
||||
|
||||
const handleErrorEvent = useCallback(
|
||||
@@ -1358,14 +1376,15 @@ export const useGeminiStream = (
|
||||
async ({ metadata: spanMetadata }) => {
|
||||
spanMetadata.input = query;
|
||||
|
||||
const queryId = `${Date.now()}-${Math.random()}`;
|
||||
activeQueryIdRef.current = queryId;
|
||||
if (
|
||||
(streamingState === StreamingState.Responding ||
|
||||
(isRespondingRef.current ||
|
||||
streamingState === StreamingState.Responding ||
|
||||
streamingState === StreamingState.WaitingForConfirmation) &&
|
||||
!options?.isContinuation
|
||||
)
|
||||
return;
|
||||
const queryId = `${Date.now()}-${Math.random()}`;
|
||||
activeQueryIdRef.current = queryId;
|
||||
|
||||
const userMessageTimestamp = Date.now();
|
||||
|
||||
@@ -1452,7 +1471,7 @@ export const useGeminiStream = (
|
||||
loopDetectedRef.current = false;
|
||||
// Show the confirmation dialog to choose whether to disable loop detection
|
||||
setLoopDetectionConfirmationRequest({
|
||||
onComplete: (result: {
|
||||
onComplete: async (result: {
|
||||
userSelection: 'disable' | 'keep';
|
||||
}) => {
|
||||
setLoopDetectionConfirmationRequest(null);
|
||||
@@ -1468,8 +1487,7 @@ export const useGeminiStream = (
|
||||
});
|
||||
|
||||
if (lastQueryRef.current && lastPromptIdRef.current) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-floating-promises
|
||||
submitQuery(
|
||||
await submitQuery(
|
||||
lastQueryRef.current,
|
||||
{ isContinuation: true },
|
||||
lastPromptIdRef.current,
|
||||
@@ -1537,6 +1555,7 @@ export const useGeminiStream = (
|
||||
maybeAddSuppressedToolErrorNote,
|
||||
maybeAddLowVerbosityFailureNote,
|
||||
settings.merged.billing?.overageStrategy,
|
||||
setIsResponding,
|
||||
],
|
||||
);
|
||||
|
||||
@@ -1803,6 +1822,7 @@ export const useGeminiStream = (
|
||||
isLowErrorVerbosity,
|
||||
maybeAddSuppressedToolErrorNote,
|
||||
maybeAddLowVerbosityFailureNote,
|
||||
setIsResponding,
|
||||
],
|
||||
);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user