fix(evals): add typecheck coverage for evals, integration-tests, and memory-tests (#25480)

This commit is contained in:
Sandy Tao
2026-04-16 11:20:27 -07:00
committed by GitHub
parent f16f1cced3
commit fafe3e35d2
15 changed files with 503 additions and 198 deletions

View File

@@ -11,6 +11,8 @@ import path from 'node:path';
describe('Background Process Monitoring', () => {
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should naturally use read output tool to find token',
prompt:
"Run the script using 'bash generate_token.sh'. It will emit a token after a short delay and continue running. Find the token and tell me what it is.",
@@ -50,6 +52,8 @@ sleep 100
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should naturally use list tool to verify multiple processes',
prompt:
"Start three background processes that run 'sleep 100', 'sleep 200', and 'sleep 300' respectively. Verify that all three are currently running.",

View File

@@ -298,6 +298,8 @@ describe('plan_mode', () => {
});
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should transition from plan mode to normal execution and create a plan file from scratch',
params: {
settings,
@@ -333,7 +335,7 @@ describe('plan_mode', () => {
expect(
planWrite?.toolRequest.success,
`Expected write_file to succeed, but got error: ${planWrite?.toolRequest.error}`,
`Expected write_file to succeed, but got error: ${(planWrite?.toolRequest as any).error}`,
).toBe(true);
assertModelHasOutput(result);
@@ -341,6 +343,8 @@ describe('plan_mode', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should not exit plan mode or draft before informal agreement',
approvalMode: ApprovalMode.PLAN,
params: {

View File

@@ -5,10 +5,7 @@
*/
import { describe, expect } from 'vitest';
import {
TRACKER_CREATE_TASK_TOOL_NAME,
TRACKER_UPDATE_TASK_TOOL_NAME,
} from '@google/gemini-cli-core';
import { TRACKER_CREATE_TASK_TOOL_NAME } from '@google/gemini-cli-core';
import { evalTest, TEST_AGENTS } from './test-helper.js';
describe('subtask delegation eval test cases', () => {
@@ -22,6 +19,8 @@ describe('subtask delegation eval test cases', () => {
* 3. Documenting (doc expert)
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should delegate sequential subtasks to relevant experts using the task tracker',
params: {
settings: {
@@ -90,6 +89,8 @@ You are the doc expert. Document the provided implementation clearly.`,
* to multiple subagents in parallel using the task tracker to manage state.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should delegate independent subtasks to specialists using the task tracker',
params: {
settings: {

View File

@@ -119,6 +119,8 @@ describe('tracker_mode', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should correctly identify the task tracker storage location from the system prompt',
params: {
settings: { experimental: { taskTracker: true } },

13
evals/tsconfig.json Normal file
View File

@@ -0,0 +1,13 @@
{
"extends": "../tsconfig.json",
"compilerOptions": {
"noEmit": true,
"paths": {
"@google/gemini-cli-core": ["../packages/core/index.ts"],
"@google/gemini-cli": ["../packages/cli/index.ts"]
}
},
"include": ["**/*.ts"],
"exclude": ["logs"],
"references": [{ "path": "../packages/core" }, { "path": "../packages/cli" }]
}

View File

@@ -7,6 +7,8 @@
import { evalTest, TestRig } from './test-helper.js';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Reproduction: Agent uses Object.create() for cloning/delegation',
prompt:
'Create a utility function `createScopedConfig(config: Config, additionalDirectories: string[]): Config` in `packages/core/src/config/scoped-config.ts` that returns a new Config instance. This instance should override `getWorkspaceContext()` to include the additional directories, but delegate all other method calls (like `isPathAllowed` or `validatePathAccess`) to the original config. Note that `Config` is a complex class with private state and cannot be easily shallow-copied or reconstructed.',

View File

@@ -21,6 +21,8 @@ describe('update_topic_behavior', () => {
* more than 1/4 turns.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'update_topic should be used at start, end and middle for complex tasks',
prompt: `Create a simple users REST API using Express.
1. Initialize a new npm project and install express.
@@ -117,6 +119,8 @@ describe('update_topic_behavior', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'update_topic should NOT be used for informational coding tasks (Obvious)',
approvalMode: 'default',
prompt:
@@ -142,6 +146,8 @@ describe('update_topic_behavior', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',
approvalMode: 'default',
prompt:
@@ -169,6 +175,8 @@ describe('update_topic_behavior', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'update_topic should be used for medium complexity multi-step tasks',
prompt:
'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
@@ -212,7 +220,9 @@ export default app;
expect(topicCalls.length).toBeGreaterThanOrEqual(2);
// Verify it actually did the refactoring to ensure it didn't just fail immediately
expect(fs.existsSync(path.join(rig.testDir, 'src/routes.ts'))).toBe(true);
expect(fs.existsSync(path.join(rig.testDir!, 'src/routes.ts'))).toBe(
true,
);
},
});
@@ -224,6 +234,8 @@ export default app;
* the prompt change that improves the behavior.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'update_topic should not be called twice in a row',
prompt: `
We need to build a C compiler.