mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-10 14:10:37 -07:00
This commit is contained in:
56
evals/concurrency-safety.eval.ts
Normal file
56
evals/concurrency-safety.eval.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
/**
 * Text of the custom agent definition used by the eval in this file, which
 * supplies it as the content of `.gemini/agents/mutation-agent.md`.
 *
 * The front matter (between the `---` markers) declares the agent name
 * `mutation-agent`, a description, `max_turns: 1`, and a `tools` list
 * containing `write_file`. The text after the closing `---` is the agent's
 * instruction body.
 */
const MUTATION_AGENT_DEFINITION = `---
|
||||
name: mutation-agent
|
||||
description: An agent that modifies the workspace (writes, deletes, git operations, etc).
|
||||
max_turns: 1
|
||||
tools:
|
||||
- write_file
|
||||
---
|
||||
|
||||
You are the mutation agent. Do the mutation requested.
|
||||
`;
|
||||
|
||||
/**
 * Eval suite for subagent concurrency behaviour.
 *
 * The single case asks the main agent to delegate two file updates to two
 * separate `mutation-agent` subagents and explicitly requests that they run in
 * parallel. It then inspects the tool logs and checks that the first two
 * `mutation-agent` calls carry the same `prompt_id`.
 */
describe('concurrency safety eval test cases', () => {
  evalTest('USUALLY_PASSES', {
    name: 'mutation agents are run in parallel when explicitly requested',
    params: {
      settings: {
        experimental: {
          enableAgents: true,
        },
      },
    },
    prompt:
      'Update A.txt to say "A" and update B.txt to say "B". Delegate these tasks to two separate mutation-agent subagents. You MUST run these subagents in parallel at the same time.',
    files: {
      // Registers the custom agent the prompt refers to.
      '.gemini/agents/mutation-agent.md': MUTATION_AGENT_DEFINITION,
    },
    assert: async (rig) => {
      const logs = rig.readToolLogs();
      // Keep only the tool calls that targeted the mutation agent.
      const mutationCalls = logs.filter(
        (log) => log.toolRequest?.name === 'mutation-agent',
      );

      expect(
        mutationCalls.length,
        'Agent should have called the mutation-agent at least twice',
      ).toBeGreaterThanOrEqual(2);

      const firstPromptId = mutationCalls[0].toolRequest.prompt_id;
      const secondPromptId = mutationCalls[1].toolRequest.prompt_id;

      // Without this guard the equality check below would pass vacuously
      // (undefined equals undefined) if the logs carried no prompt_id at all.
      expect(
        firstPromptId,
        'mutation-agent tool logs should record a prompt_id',
      ).toBeDefined();

      expect(
        firstPromptId,
        'mutation agents should be called in parallel (same turn / prompt_ids) when explicitly requested',
      ).toEqual(secondPromptId);
    },
  });
});
|
||||
@@ -112,6 +112,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
// commands.
|
||||
execSync('git config core.editor "true"', execOptions);
|
||||
execSync('git config core.pager "cat"', execOptions);
|
||||
execSync('git config commit.gpgsign false', execOptions);
|
||||
execSync('git add .', execOptions);
|
||||
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
|
||||
}
|
||||
|
||||
@@ -62,6 +62,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -230,6 +232,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -517,6 +521,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -685,6 +691,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -1571,6 +1579,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -1734,6 +1744,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -1889,6 +1901,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -2044,6 +2058,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -2195,6 +2211,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -2346,6 +2364,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -2489,6 +2509,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -2639,6 +2661,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -2790,6 +2814,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -3193,6 +3219,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -3344,6 +3372,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -3607,6 +3637,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
@@ -3758,6 +3790,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
|
||||
@@ -251,6 +251,8 @@ Operate as a **strategic orchestrator**. Your own context window is your most pr
|
||||
|
||||
When you delegate, the sub-agent's entire execution is consolidated into a single summary in your history, keeping your main loop lean.
|
||||
|
||||
**Concurrency Safety and Mandate:** You should NEVER run multiple subagents in a single turn if their abilities mutate the same files or resources. This is to prevent race conditions and ensure that the workspace is in a consistent state. Only run multiple subagents in parallel when their tasks are independent (e.g., multiple concurrent research or read-only tasks) or if parallel execution is explicitly requested by the user.
|
||||
|
||||
**High-Impact Delegation Candidates:**
|
||||
- **Repetitive Batch Tasks:** Tasks involving more than 3 files or repeated steps (e.g., "Add license headers to all files in src/", "Fix all lint errors in the project").
|
||||
- **High-Volume Output:** Commands or tools expected to return large amounts of data (e.g., verbose builds, exhaustive file searches).
|
||||
|
||||
Reference in New Issue
Block a user