fix(evals): add typecheck coverage for evals, integration-tests, and memory-tests (#25480)

2026-04-19 17:50:37 -07:00 · 2026-04-16 11:20:27 -07:00
parent f16f1cced3
commit fafe3e35d2
15 changed files with 503 additions and 198 deletions
--- a/evals/background_processes.eval.ts
+++ b/evals/background_processes.eval.ts
@@ -11,6 +11,8 @@ import path from 'node:path';

 describe('Background Process Monitoring', () => {
  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'should naturally use read output tool to find token',
    prompt:
      "Run the script using 'bash generate_token.sh'. It will emit a token after a short delay and continue running. Find the token and tell me what it is.",
@@ -50,6 +52,8 @@ sleep 100
  });

  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'should naturally use list tool to verify multiple processes',
    prompt:
      "Start three background processes that run 'sleep 100', 'sleep 200', and 'sleep 300' respectively. Verify that all three are currently running.",
--- a/evals/plan_mode.eval.ts
+++ b/evals/plan_mode.eval.ts
@@ -298,6 +298,8 @@ describe('plan_mode', () => {
  });

  evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'should transition from plan mode to normal execution and create a plan file from scratch',
    params: {
      settings,
@@ -333,7 +335,7 @@ describe('plan_mode', () => {

      expect(
        planWrite?.toolRequest.success,
-        `Expected write_file to succeed, but got error: ${planWrite?.toolRequest.error}`,
+        `Expected write_file to succeed, but got error: ${(planWrite?.toolRequest as any)?.error}`,
      ).toBe(true);

      assertModelHasOutput(result);
@@ -341,6 +343,8 @@ describe('plan_mode', () => {
  });

  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'should not exit plan mode or draft before informal agreement',
    approvalMode: ApprovalMode.PLAN,
    params: {
--- a/evals/subtask_delegation.eval.ts
+++ b/evals/subtask_delegation.eval.ts
@@ -5,10 +5,7 @@
 */

 import { describe, expect } from 'vitest';
-import {
-  TRACKER_CREATE_TASK_TOOL_NAME,
-  TRACKER_UPDATE_TASK_TOOL_NAME,
-} from '@google/gemini-cli-core';
+import { TRACKER_CREATE_TASK_TOOL_NAME } from '@google/gemini-cli-core';
 import { evalTest, TEST_AGENTS } from './test-helper.js';

 describe('subtask delegation eval test cases', () => {
@@ -22,6 +19,8 @@ describe('subtask delegation eval test cases', () => {
   * 3. Documenting (doc expert)
   */
  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'should delegate sequential subtasks to relevant experts using the task tracker',
    params: {
      settings: {
@@ -90,6 +89,8 @@ You are the doc expert. Document the provided implementation clearly.`,
   * to multiple subagents in parallel using the task tracker to manage state.
   */
  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'should delegate independent subtasks to specialists using the task tracker',
    params: {
      settings: {
--- a/evals/tracker.eval.ts
+++ b/evals/tracker.eval.ts
@@ -119,6 +119,8 @@ describe('tracker_mode', () => {
  });

  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'should correctly identify the task tracker storage location from the system prompt',
    params: {
      settings: { experimental: { taskTracker: true } },
--- a/evals/tsconfig.json
+++ b/evals/tsconfig.json
@@ -0,0 +1,13 @@
+{
+  "extends": "../tsconfig.json",
+  "compilerOptions": {
+    "noEmit": true,
+    "paths": {
+      "@google/gemini-cli-core": ["../packages/core/index.ts"],
+      "@google/gemini-cli": ["../packages/cli/index.ts"]
+    }
+  },
+  "include": ["**/*.ts"],
+  "exclude": ["logs"],
+  "references": [{ "path": "../packages/core" }, { "path": "../packages/cli" }]
+}
--- a/evals/unsafe-cloning.eval.ts
+++ b/evals/unsafe-cloning.eval.ts
@@ -7,6 +7,8 @@
 import { evalTest, TestRig } from './test-helper.js';

 evalTest('USUALLY_PASSES', {
+  suiteName: 'default',
+  suiteType: 'behavioral',
  name: 'Reproduction: Agent uses Object.create() for cloning/delegation',
  prompt:
    'Create a utility function `createScopedConfig(config: Config, additionalDirectories: string[]): Config` in `packages/core/src/config/scoped-config.ts` that returns a new Config instance. This instance should override `getWorkspaceContext()` to include the additional directories, but delegate all other method calls (like `isPathAllowed` or `validatePathAccess`) to the original config. Note that `Config` is a complex class with private state and cannot be easily shallow-copied or reconstructed.',
--- a/evals/update_topic.eval.ts
+++ b/evals/update_topic.eval.ts
@@ -21,6 +21,8 @@ describe('update_topic_behavior', () => {
   * more than 1/4 turns.
   */
  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'update_topic should be used at start, end and middle for complex tasks',
    prompt: `Create a simple users REST API using Express. 
 1. Initialize a new npm project and install express.
@@ -117,6 +119,8 @@ describe('update_topic_behavior', () => {
  });

  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'update_topic should NOT be used for informational coding tasks (Obvious)',
    approvalMode: 'default',
    prompt:
@@ -142,6 +146,8 @@ describe('update_topic_behavior', () => {
  });

  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',
    approvalMode: 'default',
    prompt:
@@ -169,6 +175,8 @@ describe('update_topic_behavior', () => {
  });

  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'update_topic should be used for medium complexity multi-step tasks',
    prompt:
      'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
@@ -212,7 +220,9 @@ export default app;
      expect(topicCalls.length).toBeGreaterThanOrEqual(2);

      // Verify it actually did the refactoring to ensure it didn't just fail immediately
-      expect(fs.existsSync(path.join(rig.testDir, 'src/routes.ts'))).toBe(true);
+      expect(fs.existsSync(path.join(rig.testDir!, 'src/routes.ts'))).toBe(
+        true,
+      );
    },
  });

@@ -224,6 +234,8 @@ export default app;
   * the prompt change that improves the behavior.
   */
  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'update_topic should not be called twice in a row',
    prompt: `
      We need to build a C compiler.