diff --git a/evals/tracker.eval.ts b/evals/tracker.eval.ts index 83ffc61d68..d4044b8cb6 100644 --- a/evals/tracker.eval.ts +++ b/evals/tracker.eval.ts @@ -62,11 +62,13 @@ describe('tracker_mode', () => { 'Expected tracker_update_task tool to be called', ).toBe(true); - const updateCall = toolLogs.find( + const updateCalls = toolLogs.filter( (log) => log.toolRequest.name === TRACKER_UPDATE_TASK_TOOL_NAME, ); - expect(updateCall).toBeDefined(); - const updateArgs = JSON.parse(updateCall!.toolRequest.args); + expect(updateCalls.length).toBeGreaterThan(0); + const updateArgs = JSON.parse( + updateCalls[updateCalls.length - 1].toolRequest.args, + ); expect(updateArgs.status).toBe('closed'); const loginContent = fs.readFileSync( @@ -128,12 +130,52 @@ describe('tracker_mode', () => { prompt: 'Where is my task tracker storage located? Please provide the absolute path in your response.', assert: async (rig, result) => { - // The rig sets GEMINI_CLI_HOME to rig.homeDir - const homeDir = rig.homeDir!; - // The response should contain the dynamic path which includes the home directory - // and follows the .gemini/tmp/.../tracker structure. - expect(result).toContain(homeDir); + // The response should contain the dynamic path which follows the .gemini/tmp/.../tracker structure. expect(result).toMatch(/\.gemini\/tmp\/.*\/tracker/); }, }); + + evalTest('USUALLY_PASSES', { + suiteName: 'default', + suiteType: 'behavioral', + name: 'should update the tracker in the same turn as the task completion to save turns', + params: { + settings: { experimental: { taskTracker: true } }, + }, + files: FILES, + prompt: + 'We have a bug in src/login.js: the password check is missing. Fix this bug. Then, create a new file src/auth.js that exports a simple verifyToken function. Please organize this into tasks and execute them.', + assert: async (rig, result) => { + await rig.waitForToolCall(TRACKER_CREATE_TASK_TOOL_NAME); + await rig.waitForToolCall(TRACKER_UPDATE_TASK_TOOL_NAME); + + const toolLogs = rig.readToolLogs(); + + // Get the prompt ID of the fix for login.js + const loginEditCalls = toolLogs.filter( + (log) => + (log.toolRequest.name === 'replace' || + log.toolRequest.name === 'write_file') && + log.toolRequest.args.includes('login.js'), + ); + + expect(loginEditCalls.length).toBeGreaterThan(0); + const loginEditPromptId = + loginEditCalls[loginEditCalls.length - 1].toolRequest.prompt_id; + + // Verify there is an update to the tracker in the exact same turn + const parallelTrackerUpdates = toolLogs.filter( + (log) => + log.toolRequest.name === TRACKER_UPDATE_TASK_TOOL_NAME && + log.toolRequest.prompt_id === loginEditPromptId, + ); + + expect( + parallelTrackerUpdates.length, + 'Expected tracker_update_task to be called in the same turn as the login.js fix', + ).toBeGreaterThan(0); + + assertModelHasOutput(result); + }, + }); }); diff --git a/packages/core/src/core/__snapshots__/prompts.test.ts.snap b/packages/core/src/core/__snapshots__/prompts.test.ts.snap index 596fec846a..3f2974aa9b 100644 --- a/packages/core/src/core/__snapshots__/prompts.test.ts.snap +++ b/packages/core/src/core/__snapshots__/prompts.test.ts.snap @@ -2965,6 +2965,7 @@ You are operating with a persistent file-based task tracking system located at \ 6. **STATE OVER CHAT**: If the user says "I think we finished that," but the tool says it is 'pending', trust the tool--or verify explicitly before updating. 7. **DEPENDENCY MANAGEMENT**: Respect task topology. Never attempt to execute a task if its dependencies are not marked as 'closed'. If you are blocked, focus only on the leaf nodes of the task graph. 8. **DETAILED TASKS**: Ensure that the tasks created have highly detailed titles and descriptions. The description MUST provide significantly more specific details and technical context than the title. +9. **TURN EFFICIENCY**: Update the tracker immediately when a step is completed. Combine \`tracker_update_task\` calls with other tool calls in the same turn to save turns. # Operational Guidelines @@ -3151,6 +3152,7 @@ You are operating with a persistent file-based task tracking system located at \ 6. **STATE OVER CHAT**: If the user says "I think we finished that," but the tool says it is 'pending', trust the tool--or verify explicitly before updating. 7. **DEPENDENCY MANAGEMENT**: Respect task topology. Never attempt to execute a task if its dependencies are not marked as 'closed'. If you are blocked, focus only on the leaf nodes of the task graph. 8. **DETAILED TASKS**: Ensure that the tasks created have highly detailed titles and descriptions. The description MUST provide significantly more specific details and technical context than the title. +9. **TURN EFFICIENCY**: Update the tracker immediately when a step is completed. Combine \`tracker_update_task\` calls with other tool calls in the same turn to save turns. # Operational Guidelines diff --git a/packages/core/src/prompts/snippets.legacy.ts b/packages/core/src/prompts/snippets.legacy.ts index df11011403..65f37c27b1 100644 --- a/packages/core/src/prompts/snippets.legacy.ts +++ b/packages/core/src/prompts/snippets.legacy.ts @@ -510,7 +510,8 @@ You are operating with a persistent file-based task tracking system located at \ 5. **VERIFICATION**: Before marking a task as complete, verify the work is actually done (e.g., run the test, check the file existence). 6. **STATE OVER CHAT**: If the user says "I think we finished that," but the tool says it is 'pending', trust the tool--or verify explicitly before updating. 7. **DEPENDENCY MANAGEMENT**: Respect task topology. Never attempt to execute a task if its dependencies are not marked as 'closed'. If you are blocked, focus only on the leaf nodes of the task graph. -8. **DETAILED TASKS**: Ensure that the tasks created have highly detailed titles and descriptions. The description MUST provide significantly more specific details and technical context than the title.`.trim(); +8. **DETAILED TASKS**: Ensure that the tasks created have highly detailed titles and descriptions. The description MUST provide significantly more specific details and technical context than the title. +9. **TURN EFFICIENCY**: Update the tracker immediately when a step is completed. Combine \`${TRACKER_UPDATE_TASK_TOOL_NAME}\` calls with other tool calls in the same turn to save turns.`.trim(); } // --- Leaf Helpers (Strictly strings or simple calls) --- diff --git a/packages/core/src/prompts/snippets.ts b/packages/core/src/prompts/snippets.ts index fc03975d97..9a45011785 100644 --- a/packages/core/src/prompts/snippets.ts +++ b/packages/core/src/prompts/snippets.ts @@ -577,7 +577,8 @@ You are operating with a persistent file-based task tracking system located at \ 5. **VERIFICATION**: Before marking a task as complete, verify the work is actually done (e.g., run the test, check the file existence). 6. **STATE OVER CHAT**: If the user says "I think we finished that," but the tool says it is 'pending', trust the tool--or verify explicitly before updating. 7. **DEPENDENCY MANAGEMENT**: Respect task topology. Never attempt to execute a task if its dependencies are not marked as 'closed'. If you are blocked, focus only on the leaf nodes of the task graph. -8. **DETAILED TASKS**: Ensure that the tasks created have highly detailed titles and descriptions. The description MUST provide significantly more specific details and technical context than the title.`.trim(); +8. **DETAILED TASKS**: Ensure that the tasks created have highly detailed titles and descriptions. The description MUST provide significantly more specific details and technical context than the title. +9. **TURN EFFICIENCY**: Update the tracker immediately when a step is completed. Combine ${trackerUpdate} calls with other tool calls in the same turn to save turns.`.trim(); } export function renderPlanningWorkflow(