diff --git a/evals/concurrency-safety.eval.ts b/evals/concurrency-safety.eval.ts
new file mode 100644
index 0000000000..f2f9e24be9
--- /dev/null
+++ b/evals/concurrency-safety.eval.ts
@@ -0,0 +1,67 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+/**
+ * Agent definition (YAML front matter plus instructions) for a subagent named
+ * `mutation-agent` that is limited to the `write_file` tool and a single turn.
+ * The eval below supplies it as `.gemini/agents/mutation-agent.md`.
+ */
+const MUTATION_AGENT_DEFINITION = `---
+name: mutation-agent
+description: An agent that modifies the workspace (writes, deletes, git operations, etc).
+max_turns: 1
+tools:
+  - write_file
+---
+
+You are the mutation agent. Do the mutation requested.
+`;
+
+/**
+ * Checks that mutating subagents are still dispatched in parallel when the
+ * user explicitly asks for parallel execution.
+ */
+describe('concurrency safety eval test cases', () => {
+  evalTest('USUALLY_PASSES', {
+    name: 'mutation agents are run in parallel when explicitly requested',
+    params: {
+      settings: {
+        experimental: {
+          enableAgents: true,
+        },
+      },
+    },
+    prompt:
+      'Update A.txt to say "A" and update B.txt to say "B". Delegate these tasks to two separate mutation-agent subagents. You MUST run these subagents in parallel at the same time.',
+    files: {
+      '.gemini/agents/mutation-agent.md': MUTATION_AGENT_DEFINITION,
+    },
+    assert: async (rig) => {
+      const logs = rig.readToolLogs();
+      const mutationCalls = logs.filter(
+        (log) => log.toolRequest?.name === 'mutation-agent',
+      );
+
+      expect(
+        mutationCalls.length,
+        'Agent should have called the mutation-agent at least twice',
+      ).toBeGreaterThanOrEqual(2);
+
+      // Per the assertion message below, calls issued in the same turn share
+      // a prompt_id, so equal ids on the first two calls indicate parallelism.
+      const firstPromptId = mutationCalls[0].toolRequest.prompt_id;
+      const secondPromptId = mutationCalls[1].toolRequest.prompt_id;
+
+      expect(
+        firstPromptId,
+        'mutation agents should be called in parallel (same turn / prompt_ids) when explicitly requested',
+      ).toEqual(secondPromptId);
+    },
+  });
+});
diff --git a/evals/test-helper.ts b/evals/test-helper.ts
index 44c538c197..786ec0e418 100644
--- a/evals/test-helper.ts
+++ b/evals/test-helper.ts
@@ -112,6 +112,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
     // commands.
execSync('git config core.editor "true"', execOptions); execSync('git config core.pager "cat"', execOptions); + execSync('git config commit.gpgsign false', execOptions); execSync('git add .', execOptions); execSync('git commit --allow-empty -m "Initial commit"', execOptions); } diff --git a/packages/core/src/core/__snapshots__/prompts.test.ts.snap b/packages/core/src/core/__snapshots__/prompts.test.ts.snap index 82c7a8f996..5a5099f0f1 100644 --- a/packages/core/src/core/__snapshots__/prompts.test.ts.snap +++ b/packages/core/src/core/__snapshots__/prompts.test.ts.snap @@ -62,6 +62,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -230,6 +232,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. 
This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -517,6 +521,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -685,6 +691,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. 
+**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -1571,6 +1579,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -1734,6 +1744,8 @@ Operate as a **strategic orchestrator**. 
Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -1889,6 +1901,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). 
@@ -2044,6 +2058,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -2195,6 +2211,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). 
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -2346,6 +2364,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -2489,6 +2509,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. 
+ **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -2639,6 +2661,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -2790,6 +2814,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. 
Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -3193,6 +3219,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -3344,6 +3372,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. 
This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -3607,6 +3637,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). @@ -3758,6 +3790,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. 
+**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). - **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches). diff --git a/packages/core/src/prompts/snippets.ts b/packages/core/src/prompts/snippets.ts index 041946c397..edbb577d17 100644 --- a/packages/core/src/prompts/snippets.ts +++ b/packages/core/src/prompts/snippets.ts @@ -251,6 +251,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean. +**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user. + **High-Impact Delegation Candidates:** - **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project"). 
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).