mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-29 22:44:45 -07:00
c17400b830
Co-authored-by: anj-s <anjalisridhar@google.com>
182 lines
6.2 KiB
TypeScript
182 lines
6.2 KiB
TypeScript
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
|
|
|
|
import { describe, expect } from 'vitest';
|
|
import {
|
|
TRACKER_CREATE_TASK_TOOL_NAME,
|
|
TRACKER_UPDATE_TASK_TOOL_NAME,
|
|
} from '@google/gemini-cli-core';
|
|
import { evalTest, assertModelHasOutput } from './test-helper.js';
|
|
import fs from 'node:fs';
|
|
import path from 'node:path';
|
|
|
|
/**
 * Fixture files handed to the evals through the `files` option.
 *
 * - `package.json`: a minimal manifest whose `test` script only echoes a
 *   success message.
 * - `src/login.js`: a login function with a deliberately missing password
 *   check; the `// BUG: missing password check` marker is what the assertions
 *   look for to tell whether the file was edited.
 */
const FILES = {
  'package.json': JSON.stringify({
    name: 'test-project',
    version: '1.0.0',
    scripts: { test: 'echo "All tests passed!"' },
  }),
  'src/login.js':
    'function login(username, password) {\n if (!username) throw new Error("Missing username");\n // BUG: missing password check\n return true;\n}',
} as const;
|
|
|
|
/**
 * Behavioral evals for the experimental task tracker.
 *
 * Every case enables `experimental.taskTracker` in the settings and then
 * inspects the tool calls recorded by the rig (and, where relevant, the
 * contents of `src/login.js` in the rig's test directory).
 */
describe('tracker_mode', () => {
  /**
   * Reads the current contents of `src/login.js` from the rig's test directory.
   *
   * @param testDir - The rig's `testDir`; must be set.
   * @returns The file contents decoded as UTF-8.
   * @throws Error when `testDir` is missing, instead of failing inside `path.join`.
   */
  const readLoginSource = (testDir: string | null | undefined): string => {
    if (!testDir) {
      throw new Error(
        'Test rig has no test directory to read src/login.js from',
      );
    }
    return fs.readFileSync(path.join(testDir, 'src/login.js'), 'utf-8');
  };

  evalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'should manage tasks in the tracker when explicitly requested during a bug fix',
    params: {
      settings: { experimental: { taskTracker: true } },
    },
    files: FILES,
    prompt:
      'We have a bug in src/login.js: the password check is missing. First, create a task in the tracker to fix it. Then fix the bug, and mark the task as closed.',
    assert: async (rig, result) => {
      const wasCreateCalled = await rig.waitForToolCall(
        TRACKER_CREATE_TASK_TOOL_NAME,
      );
      expect(
        wasCreateCalled,
        'Expected tracker_create_task tool to be called',
      ).toBe(true);

      const wasUpdateCalled = await rig.waitForToolCall(
        TRACKER_UPDATE_TASK_TOOL_NAME,
      );
      expect(
        wasUpdateCalled,
        'Expected tracker_update_task tool to be called',
      ).toBe(true);

      // Read the logs only after both waits: a snapshot taken before waiting
      // for the update call could miss that call.
      const toolLogs = rig.readToolLogs();

      const createCall = toolLogs.find(
        (log) => log.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME,
      );
      expect(createCall).toBeDefined();
      const createArgs: { title?: string; description?: string } = JSON.parse(
        createCall!.toolRequest.args,
      );
      // The task must mention the file being fixed in its title or description.
      expect(
        (createArgs.title?.toLowerCase() ?? '') +
          (createArgs.description?.toLowerCase() ?? ''),
      ).toContain('login');

      const updateCalls = toolLogs.filter(
        (log) => log.toolRequest.name === TRACKER_UPDATE_TASK_TOOL_NAME,
      );
      expect(updateCalls.length).toBeGreaterThan(0);
      // Only the final update matters: the task has to end up closed.
      const lastUpdateArgs: { status?: string } = JSON.parse(
        updateCalls[updateCalls.length - 1].toolRequest.args,
      );
      expect(lastUpdateArgs.status).toBe('closed');

      // The bug marker disappearing shows that login.js was actually edited.
      expect(readLoginSource(rig.testDir)).not.toContain(
        '// BUG: missing password check',
      );

      assertModelHasOutput(result);
    },
  });

  evalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'should implicitly create tasks when asked to build a feature plan',
    params: {
      settings: { experimental: { taskTracker: true } },
    },
    files: FILES,
    prompt:
      'I need to build a complex new feature for user authentication in our project. Create a detailed implementation plan and organize the work into bite-sized chunks. Do not actually implement the code yet, just plan it.',
    assert: async (rig, result) => {
      // The model should proactively use tracker_create_task to organize the work
      const wasToolCalled = await rig.waitForToolCall(
        TRACKER_CREATE_TASK_TOOL_NAME,
      );
      expect(
        wasToolCalled,
        'Expected tracker_create_task to be called implicitly to organize plan',
      ).toBe(true);

      const createCalls = rig
        .readToolLogs()
        .filter(
          (log) => log.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME,
        );

      // We expect it to create at least one task for authentication, likely more.
      expect(createCalls.length).toBeGreaterThan(0);

      // Verify it didn't write any code since we asked it to just plan
      expect(readLoginSource(rig.testDir)).toContain(
        '// BUG: missing password check',
      );

      assertModelHasOutput(result);
    },
  });

  evalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'should correctly identify the task tracker storage location from the system prompt',
    params: {
      settings: { experimental: { taskTracker: true } },
    },
    prompt:
      'Where is my task tracker storage located? Please provide the absolute path in your response.',
    assert: async (rig, result) => {
      // The response should contain the dynamic path which follows the
      // .gemini/tmp/.../tracker structure. Either path separator is accepted
      // so a Windows-style absolute path also matches.
      expect(result).toMatch(/\.gemini[\\/]tmp[\\/].*[\\/]tracker/);
    },
  });

  evalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'should update the tracker in the same turn as the task completion to save turns',
    params: {
      settings: { experimental: { taskTracker: true } },
    },
    files: FILES,
    prompt:
      'We have a bug in src/login.js: the password check is missing. Fix this bug. Then, create a new file src/auth.js that exports a simple verifyToken function. Please organize this into tasks and execute them.',
    assert: async (rig, result) => {
      // Check the wait results explicitly so a missing tracker call is reported
      // as such rather than surfacing through a later assertion.
      const wasCreateCalled = await rig.waitForToolCall(
        TRACKER_CREATE_TASK_TOOL_NAME,
      );
      expect(
        wasCreateCalled,
        'Expected tracker_create_task tool to be called',
      ).toBe(true);

      const wasUpdateCalled = await rig.waitForToolCall(
        TRACKER_UPDATE_TASK_TOOL_NAME,
      );
      expect(
        wasUpdateCalled,
        'Expected tracker_update_task tool to be called',
      ).toBe(true);

      const toolLogs = rig.readToolLogs();

      // Get the prompt ID of the fix for login.js
      const loginEditCalls = toolLogs.filter(
        (log) =>
          (log.toolRequest.name === 'replace' ||
            log.toolRequest.name === 'write_file') &&
          log.toolRequest.args.includes('login.js'),
      );

      expect(loginEditCalls.length).toBeGreaterThan(0);
      const loginEditPromptId =
        loginEditCalls[loginEditCalls.length - 1].toolRequest.prompt_id;

      // Verify there is an update to the tracker in the exact same turn
      const parallelTrackerUpdates = toolLogs.filter(
        (log) =>
          log.toolRequest.name === TRACKER_UPDATE_TASK_TOOL_NAME &&
          log.toolRequest.prompt_id === loginEditPromptId,
      );

      expect(
        parallelTrackerUpdates.length,
        'Expected tracker_update_task to be called in the same turn as the login.js fix',
      ).toBeGreaterThan(0);

      assertModelHasOutput(result);
    },
  });
});
|