mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-19 17:50:37 -07:00
fix(evals): add typecheck coverage for evals, integration-tests, and memory-tests (#25480)
This commit is contained in:
@@ -11,6 +11,8 @@ import path from 'node:path';
|
||||
|
||||
describe('Background Process Monitoring', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should naturally use read output tool to find token',
|
||||
prompt:
|
||||
"Run the script using 'bash generate_token.sh'. It will emit a token after a short delay and continue running. Find the token and tell me what it is.",
|
||||
@@ -50,6 +52,8 @@ sleep 100
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should naturally use list tool to verify multiple processes',
|
||||
prompt:
|
||||
"Start three background processes that run 'sleep 100', 'sleep 200', and 'sleep 300' respectively. Verify that all three are currently running.",
|
||||
|
||||
@@ -298,6 +298,8 @@ describe('plan_mode', () => {
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should transition from plan mode to normal execution and create a plan file from scratch',
|
||||
params: {
|
||||
settings,
|
||||
@@ -333,7 +335,7 @@ describe('plan_mode', () => {
|
||||
|
||||
expect(
|
||||
planWrite?.toolRequest.success,
|
||||
`Expected write_file to succeed, but got error: ${planWrite?.toolRequest.error}`,
|
||||
`Expected write_file to succeed, but got error: ${(planWrite?.toolRequest as any).error}`,
|
||||
).toBe(true);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
@@ -341,6 +343,8 @@ describe('plan_mode', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should not exit plan mode or draft before informal agreement',
|
||||
approvalMode: ApprovalMode.PLAN,
|
||||
params: {
|
||||
|
||||
@@ -5,10 +5,7 @@
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import {
|
||||
TRACKER_CREATE_TASK_TOOL_NAME,
|
||||
TRACKER_UPDATE_TASK_TOOL_NAME,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { TRACKER_CREATE_TASK_TOOL_NAME } from '@google/gemini-cli-core';
|
||||
import { evalTest, TEST_AGENTS } from './test-helper.js';
|
||||
|
||||
describe('subtask delegation eval test cases', () => {
|
||||
@@ -22,6 +19,8 @@ describe('subtask delegation eval test cases', () => {
|
||||
* 3. Documenting (doc expert)
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should delegate sequential subtasks to relevant experts using the task tracker',
|
||||
params: {
|
||||
settings: {
|
||||
@@ -90,6 +89,8 @@ You are the doc expert. Document the provided implementation clearly.`,
|
||||
* to multiple subagents in parallel using the task tracker to manage state.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should delegate independent subtasks to specialists using the task tracker',
|
||||
params: {
|
||||
settings: {
|
||||
|
||||
@@ -119,6 +119,8 @@ describe('tracker_mode', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should correctly identify the task tracker storage location from the system prompt',
|
||||
params: {
|
||||
settings: { experimental: { taskTracker: true } },
|
||||
|
||||
13
evals/tsconfig.json
Normal file
13
evals/tsconfig.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"extends": "../tsconfig.json",
|
||||
"compilerOptions": {
|
||||
"noEmit": true,
|
||||
"paths": {
|
||||
"@google/gemini-cli-core": ["../packages/core/index.ts"],
|
||||
"@google/gemini-cli": ["../packages/cli/index.ts"]
|
||||
}
|
||||
},
|
||||
"include": ["**/*.ts"],
|
||||
"exclude": ["logs"],
|
||||
"references": [{ "path": "../packages/core" }, { "path": "../packages/cli" }]
|
||||
}
|
||||
@@ -7,6 +7,8 @@
|
||||
import { evalTest, TestRig } from './test-helper.js';
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Reproduction: Agent uses Object.create() for cloning/delegation',
|
||||
prompt:
|
||||
'Create a utility function `createScopedConfig(config: Config, additionalDirectories: string[]): Config` in `packages/core/src/config/scoped-config.ts` that returns a new Config instance. This instance should override `getWorkspaceContext()` to include the additional directories, but delegate all other method calls (like `isPathAllowed` or `validatePathAccess`) to the original config. Note that `Config` is a complex class with private state and cannot be easily shallow-copied or reconstructed.',
|
||||
|
||||
@@ -21,6 +21,8 @@ describe('update_topic_behavior', () => {
|
||||
* more than 1/4 turns.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'update_topic should be used at start, end and middle for complex tasks',
|
||||
prompt: `Create a simple users REST API using Express.
|
||||
1. Initialize a new npm project and install express.
|
||||
@@ -117,6 +119,8 @@ describe('update_topic_behavior', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'update_topic should NOT be used for informational coding tasks (Obvious)',
|
||||
approvalMode: 'default',
|
||||
prompt:
|
||||
@@ -142,6 +146,8 @@ describe('update_topic_behavior', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',
|
||||
approvalMode: 'default',
|
||||
prompt:
|
||||
@@ -169,6 +175,8 @@ describe('update_topic_behavior', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'update_topic should be used for medium complexity multi-step tasks',
|
||||
prompt:
|
||||
'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
|
||||
@@ -212,7 +220,9 @@ export default app;
|
||||
expect(topicCalls.length).toBeGreaterThanOrEqual(2);
|
||||
|
||||
// Verify it actually did the refactoring to ensure it didn't just fail immediately
|
||||
expect(fs.existsSync(path.join(rig.testDir, 'src/routes.ts'))).toBe(true);
|
||||
expect(fs.existsSync(path.join(rig.testDir!, 'src/routes.ts'))).toBe(
|
||||
true,
|
||||
);
|
||||
},
|
||||
});
|
||||
|
||||
@@ -224,6 +234,8 @@ export default app;
|
||||
* the prompt change that improves the behavior.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'update_topic should not be called twice in a row',
|
||||
prompt: `
|
||||
We need to build a C compiler.
|
||||
|
||||
Reference in New Issue
Block a user