Initial evals infra split.

2026-06-15 13:57:45 -07:00 · 2026-04-01 17:26:43 -07:00
parent e293424bb4
commit 704ad573f1
8 changed files with 370 additions and 89 deletions
@@ -10,6 +10,9 @@ import {
  runEval,
  prepareLogDir,
  symlinkNodeModules,
+  withEvalRetries,
+  prepareWorkspace,
+  BaseEvalCase,
 } from './test-helper.js';
 import fs from 'node:fs';
 import path from 'node:path';
@@ -32,12 +35,9 @@ interface EvalConfigOverrides {
  [key: string]: unknown;
 }

-export interface AppEvalCase {
-  name: string;
+/** An eval case that is run against the rendered app through an `AppRig`. */
+export interface AppEvalCase extends BaseEvalCase {
  configOverrides?: EvalConfigOverrides;
+  /** Message sent to the app after it has rendered and gone idle. */
  prompt: string;
-  timeout?: number;
-  files?: Record<string, string>;
+  /** Optional hook run once the rig is initialized, before the app renders. */
  setup?: (rig: AppRig) => Promise<void>;
+  /** Receives the rig and the static output read after the prompt was sent. */
  assert: (rig: AppRig, output: string) => Promise<void>;
 }
@@ -48,55 +48,54 @@ export interface AppEvalCase {
 */
 export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
  const fn = async () => {
-    const rig = new AppRig({
-      configOverrides: {
-        model: DEFAULT_GEMINI_MODEL,
-        ...evalCase.configOverrides,
-      },
-    });
+    await withEvalRetries(evalCase.name, async () => {
+      const rig = new AppRig({
+        configOverrides: {
+          model: DEFAULT_GEMINI_MODEL,
+          ...evalCase.configOverrides,
+        },
+      });

-    const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
-    const logFile = path.join(logDir, `${sanitizedName}.log`);
+      const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
+      const logFile = path.join(logDir, `${sanitizedName}.log`);

-    try {
-      await rig.initialize();
+      try {
+        await rig.initialize();

-      const testDir = rig.getTestDir();
-      symlinkNodeModules(testDir);
+        const testDir = rig.getTestDir();
+        symlinkNodeModules(testDir);

-      // Setup initial files
-      if (evalCase.files) {
-        for (const [filePath, content] of Object.entries(evalCase.files)) {
-          const fullPath = path.join(testDir, filePath);
-          fs.mkdirSync(path.dirname(fullPath), { recursive: true });
-          fs.writeFileSync(fullPath, content);
+        // Setup initial files
+        if (evalCase.files) {
+          // Note: AppRig does not use a separate homeDir, so we use testDir twice
+          await prepareWorkspace(testDir, testDir, evalCase.files);
        }
+
+        // Run custom setup if provided (e.g. for breakpoints)
+        if (evalCase.setup) {
+          await evalCase.setup(rig);
+        }
+
+        // Render the app!
+        await rig.render();
+
+        // Wait for initial ready state
+        await rig.waitForIdle();
+
+        // Send the initial prompt
+        await rig.sendMessage(evalCase.prompt);
+
+        // Run assertion. Interaction-heavy tests can do their own waiting/steering here.
+        const output = rig.getStaticOutput();
+        await evalCase.assert(rig, output);
+      } finally {
+        const output = rig.getStaticOutput();
+        if (output) {
+          await fs.promises.writeFile(logFile, output);
+        }
+        await rig.unmount();
      }
-
-      // Run custom setup if provided (e.g. for breakpoints)
-      if (evalCase.setup) {
-        await evalCase.setup(rig);
-      }
-
-      // Render the app!
-      await rig.render();
-
-      // Wait for initial ready state
-      await rig.waitForIdle();
-
-      // Send the initial prompt
-      await rig.sendMessage(evalCase.prompt);
-
-      // Run assertion. Interaction-heavy tests can do their own waiting/steering here.
-      const output = rig.getStaticOutput();
-      await evalCase.assert(rig, output);
-    } finally {
-      const output = rig.getStaticOutput();
-      if (output) {
-        await fs.promises.writeFile(logFile, output);
-      }
-      await rig.unmount();
-    }
+    });
  };

  runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
@@ -0,0 +1,133 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import {
+  type EvalPolicy,
+  runEval,
+  prepareLogDir,
+  withEvalRetries,
+  prepareWorkspace,
+  type BaseEvalCase,
+} from './test-helper.js';
+import fs from 'node:fs';
+import path from 'node:path';
+import os from 'node:os';
+import { randomUUID } from 'node:crypto';
+import {
+  Config,
+  type ConfigParameters,
+  AuthType,
+  ApprovalMode,
+  createPolicyEngineConfig,
+  ExtensionLoader,
+  IntegrityDataStatus,
+  makeFakeConfig,
+} from '@google/gemini-cli-core';
+import { createMockSettings } from '../packages/cli/src/test-utils/settings.js';
+
+/**
+ * A minimal mock ExtensionManager to bypass integrity checks.
+ *
+ * Every member is a stub: no extensions are reported, the consent/setting
+ * hooks do nothing, and the integrity manager reports VERIFIED for every
+ * check without storing anything.
+ */
+class MockExtensionManager extends ExtensionLoader {
+  // Always reports an empty extension list.
+  getExtensions = () => [];
+  // No-op hooks.
+  setRequestConsent = () => {};
+  setRequestSetting = () => {};
+  integrityManager = {
+    // Always resolves to VERIFIED, regardless of input.
+    verifyExtensionIntegrity: async () => IntegrityDataStatus.VERIFIED,
+    // Resolves without persisting anything.
+    storeExtensionIntegrity: async () => undefined,
+  };
+}
+
+/**
+ * An eval case that runs directly against a `Config` instead of a rendered UI.
+ */
+export interface ComponentEvalCase extends BaseEvalCase {
+  /** Spread over the rig's default `ConfigParameters`, so these values win. */
+  configOverrides?: Partial<ConfigParameters>;
+  /** Optional hook run after workspace files are prepared, before `assert`. */
+  setup?: (config: Config) => Promise<void>;
+  /** Receives the initialized config; reject/throw to signal a failure. */
+  assert: (config: Config) => Promise<void>;
+}
+
+/**
+ * Test rig that builds a `Config` rooted in a fresh temporary directory so
+ * backend components can be exercised without rendering the UI.
+ */
+export class ComponentRig {
+  /** Set by `initialize()`; `undefined` until then. */
+  public config: Config | undefined;
+  /** Temporary directory created by the constructor. */
+  public testDir: string;
+  /** `test-session-` followed by the UUID generated in the constructor. */
+  public sessionId: string;
+
+  constructor(
+    private options: { configOverrides?: Partial<ConfigParameters> } = {},
+  ) {
+    const id = randomUUID();
+    const dirPrefix = `gemini-component-rig-${id.slice(0, 8)}-`;
+    this.testDir = fs.mkdtempSync(path.join(os.tmpdir(), dirPrefix));
+    this.sessionId = `test-session-${id}`;
+  }
+
+  /**
+   * Builds the config from mock settings plus any overrides, initializes it,
+   * and refreshes auth with `AuthType.USE_GEMINI`.
+   */
+  async initialize() {
+    const { merged } = createMockSettings();
+    const policyEngineConfig = await createPolicyEngineConfig(
+      merged,
+      ApprovalMode.DEFAULT,
+    );
+    const { sessionId, testDir } = this;
+
+    // Overrides are spread last so a test can replace any default above them.
+    const params: ConfigParameters = {
+      sessionId,
+      targetDir: testDir,
+      cwd: testDir,
+      debugMode: false,
+      model: 'test-model',
+      interactive: false,
+      approvalMode: ApprovalMode.DEFAULT,
+      policyEngineConfig,
+      enableEventDrivenScheduler: false, // Don't need scheduler for direct component tests
+      extensionLoader: new MockExtensionManager() as any,
+      useAlternateBuffer: false,
+      ...this.options.configOverrides,
+    };
+
+    const config = makeFakeConfig(params);
+    this.config = config;
+    await config.initialize();
+
+    // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient
+    await config.refreshAuth(AuthType.USE_GEMINI);
+  }
+
+  /** Recursively removes the temporary directory created by the constructor. */
+  async cleanup() {
+    fs.rmSync(this.testDir, { force: true, recursive: true });
+  }
+}
+
+/**
+ * A helper for running behavioral evaluations directly against backend components.
+ * It provides a fully initialized Config with real API access, bypassing the UI.
+ *
+ * Each attempt made by `withEvalRetries` gets its own `ComponentRig`, which is
+ * always cleaned up, whether the attempt succeeds or throws.
+ *
+ * @param policy Forwarded unchanged to `runEval`.
+ * @param evalCase The case to run; `timeout` falls back to 60000 and is passed
+ *   to `runEval` with 10000 added.
+ */
+export function componentEvalTest(
+  policy: EvalPolicy,
+  evalCase: ComponentEvalCase,
+) {
+  const fn = async () => {
+    await withEvalRetries(evalCase.name, async () => {
+      // Runs before the rig exists so that a failure here cannot leak the
+      // temporary directory that the ComponentRig constructor creates.
+      // NOTE(review): nothing in this helper writes into the log dir; confirm
+      // whether this call is still needed.
+      await prepareLogDir(evalCase.name);
+
+      const rig = new ComponentRig({
+        configOverrides: evalCase.configOverrides,
+      });
+
+      try {
+        await rig.initialize();
+
+        // One explicit guard instead of repeated non-null assertions.
+        const { config } = rig;
+        if (!config) {
+          throw new Error('ComponentRig.initialize() did not produce a Config');
+        }
+
+        if (evalCase.files) {
+          // ComponentRig has no separate home dir, so testDir is used twice.
+          await prepareWorkspace(rig.testDir, rig.testDir, evalCase.files);
+        }
+
+        if (evalCase.setup) {
+          await evalCase.setup(config);
+        }
+
+        await evalCase.assert(config);
+      } finally {
+        await rig.cleanup();
+      }
+    });
+  };
+
+  runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
+}
@@ -0,0 +1,121 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { expect } from 'vitest';
+import { componentEvalTest } from './component-test-helper.js';
+import {
+  AgentHistoryProvider,
+  ChatCompressionService,
+  CompressionStatus,
+  GeminiChat,
+} from '@google/gemini-cli-core';
+import type { Content } from '@google/genai';
+
+/**
+ * Builds a long, highly repetitive chat history intended to trigger
+ * compression. Every turn adds one `user` message followed by one `model`
+ * message, both mentioning a load of `turn % 100` percent.
+ *
+ * @param numTurns Number of user/model pairs to generate (default 30).
+ * @returns The generated messages in conversation order.
+ */
+const createMockLongHistory = (numTurns: number = 30): Content[] => {
+  const toMessage = (role: 'user' | 'model', text: string): Content => ({
+    role,
+    parts: [{ text }],
+  });
+
+  const messages: Content[] = [];
+  let turn = 0;
+  while (turn < numTurns) {
+    const load = turn % 100;
+    messages.push(
+      toMessage(
+        'user',
+        `Here is a repetitive piece of context: The system is running nominally. The load is ${load}%. All components operational. Please acknowledge and summarize the previous items.`,
+      ),
+      toMessage(
+        'model',
+        `Acknowledged. The system load is ${load}%. I am maintaining readiness. The previous items are nominal.`,
+      ),
+    );
+    turn += 1;
+  }
+  return messages;
+};
+
+// --- AgentHistoryProvider Eval ---
+// Feeds a 30-turn repetitive history through AgentHistoryProvider with very
+// small token budgets and checks that the result is shorter and contains an
+// injected summary message.
+componentEvalTest('USUALLY_PASSES', {
+  name: 'AgentHistoryProvider correctly enforces High Watermark token limits',
+  assert: async (config) => {
+    // Configure provider with very tight constraints to force truncation immediately
+    const providerConfig = {
+      isTruncationEnabled: true,
+      // Needed so that a summary is generated.
+      // NOTE(review): this comment used to name <state_snapshot>, but the
+      // assertion below looks for <intent_summary> - confirm which tag the
+      // provider actually emits.
+      isSummarizationEnabled: true,
+      maxTokens: 500, // Trigger limit
+      retainedTokens: 200, // Target budget after truncation
+      normalMessageTokens: 100, // Limit for old messages
+      maximumMessageTokens: 200, // Limit for newest messages
+      normalizationHeadRatio: 0.1, // Required by AgentHistoryProviderConfig
+    };
+
+    const provider = new AgentHistoryProvider(providerConfig, config);
+    const mockHistory = createMockLongHistory(30);
+
+    const originalLength = mockHistory.length;
+    const resultHistory = await provider.manageHistory(mockHistory);
+
+    // The returned history should be compressed (fewer turns, as the older turns were summarized)
+    expect(resultHistory.length).toBeLessThan(originalLength);
+
+    // A user-role message whose first part contains <intent_summary> should
+    // have been injected into the history.
+    const hasSummarizedContent = resultHistory.some(
+      (content) =>
+        content.role === 'user' &&
+        content.parts?.[0]?.text?.includes('<intent_summary>'),
+    );
+    expect(hasSummarizedContent).toBe(true);
+  },
+});
+
+// --- ChatCompressionService Eval ---
+// Forces ChatCompressionService to compress a 30-turn repetitive chat and
+// checks that both the history length and the reported token count shrink.
+componentEvalTest('USUALLY_PASSES', {
+  name: 'ChatCompressionService correctly condenses prompt history via Verification Probe',
+  assert: async (config) => {
+    const promptId = 'test-prompt-id';
+    const compressor = new ChatCompressionService();
+
+    // Only `config` and `promptId` are real; every other context member is
+    // deliberately left undefined for this test.
+    const context = {
+      config,
+      promptId,
+      toolRegistry: undefined as any,
+      promptRegistry: undefined as any,
+      resourceRegistry: undefined as any,
+      messageBus: undefined as any,
+      geminiClient: undefined as any,
+      sandboxManager: undefined as any,
+    };
+
+    const longHistory = createMockLongHistory(30);
+    const chat = new GeminiChat(context, '', [], longHistory);
+
+    const { newHistory, info } = await compressor.compress(
+      chat,
+      promptId,
+      true, // force compression
+      'test-model',
+      config,
+      false, // hasFailedCompressionAttempt
+    );
+
+    expect(newHistory).toBeDefined();
+    expect(newHistory).not.toBeNull();
+
+    // Verify it returned a condensed history array
+    expect(newHistory!.length).toBeLessThan(chat.getHistory().length);
+
+    // Verify info metadata indicates a successful compression token reduction
+    expect(info.newTokenCount).toBeLessThan(info.originalTokenCount);
+    expect(info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
+  },
+});
@@ -0,0 +1 @@
+{"numTotalTestSuites":1,"numPassedTestSuites":1,"numFailedTestSuites":0,"numPendingTestSuites":0,"numTotalTests":2,"numPassedTests":0,"numFailedTests":0,"numPendingTests":2,"numTodoTests":0,"snapshot":{"added":0,"failure":false,"filesAdded":0,"filesRemoved":0,"filesRemovedList":[],"filesUnmatched":0,"filesUpdated":0,"matched":0,"total":0,"unchecked":0,"uncheckedKeysByFile":[],"unmatched":0,"updated":0,"didUpdate":false},"startTime":1775089246511,"success":true,"testResults":[{"assertionResults":[{"ancestorTitles":[],"fullName":"AgentHistoryProvider correctly enforces High Watermark token limits","status":"skipped","title":"AgentHistoryProvider correctly enforces High Watermark token limits","failureMessages":[],"location":{"line":46,"column":1},"meta":{}},{"ancestorTitles":[],"fullName":"ChatCompressionService correctly condenses prompt history via Verification Probe","status":"skipped","title":"ChatCompressionService correctly condenses prompt history via Verification Probe","failureMessages":[],"location":{"line":83,"column":1},"meta":{}}],"startTime":1775089246511,"endTime":1775089246511,"status":"passed","message":"","name":"/Users/gundermanc/code/gemini-cli/compression/evals/compression.eval.ts"}]}
@@ -47,11 +47,47 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
  );
 }

-export async function internalEvalTest(evalCase: EvalCase) {
+/**
+ * Runs `attemptFn`, retrying when it fails with a recognized API error.
+ *
+ * A failure counts as an API error when `getApiErrorCode` returns a code for
+ * the error message. Such failures are retried up to `maxRetries` (3) times.
+ * If the final retry also fails with an API error, the event is logged as
+ * `SKIP` and this function resolves normally, so the eval is not reported as
+ * failed. Any other error is rethrown unchanged.
+ *
+ * @param name Eval name used in reliability events and warning messages.
+ * @param attemptFn Work to run; called with the zero-based attempt index.
+ */
+export async function withEvalRetries(
+  name: string,
+  attemptFn: (attempt: number) => Promise<void>,
+) {
  const maxRetries = 3;
  let attempt = 0;

  while (attempt <= maxRetries) {
+    try {
+      await attemptFn(attempt);
+      return; // Success! Exit the retry loop.
+    } catch (error: unknown) {
+      const errorMessage =
+        error instanceof Error ? error.message : String(error);
+      const errorCode = getApiErrorCode(errorMessage);
+
+      if (errorCode) {
+        // Logged with the zero-based index of the attempt that just failed.
+        const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
+        logReliabilityEvent(name, attempt, status, errorCode, errorMessage);
+
+        if (attempt < maxRetries) {
+          // Incremented before the warning, so the message shows the 1-based
+          // number of the attempt that failed.
+          attempt++;
+          console.warn(
+            `[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
+          );
+          continue; // Retry
+        }
+
+        console.warn(
+          `[Eval] '${name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
+        );
+        return; // Gracefully exit without failing the test
+      }
+
+      throw error; // Real failure
+    }
+  }
+}
+
+export async function internalEvalTest(evalCase: EvalCase) {
+  await withEvalRetries(evalCase.name, async () => {
    const rig = new TestRig();
    const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
    const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
@@ -66,7 +102,7 @@ export async function internalEvalTest(evalCase: EvalCase) {
      }

      if (evalCase.files) {
-        await setupTestFiles(rig, evalCase.files);
+        await prepareWorkspace(rig.testDir!, rig.homeDir!, evalCase.files);
      }

      symlinkNodeModules(rig.testDir || '');
@@ -139,37 +175,6 @@ export async function internalEvalTest(evalCase: EvalCase) {

      await evalCase.assert(rig, result);
      isSuccess = true;
-      return; // Success! Exit the retry loop.
-    } catch (error: unknown) {
-      const errorMessage =
-        error instanceof Error ? error.message : String(error);
-      const errorCode = getApiErrorCode(errorMessage);
-
-      if (errorCode) {
-        const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
-        logReliabilityEvent(
-          evalCase.name,
-          attempt,
-          status,
-          errorCode,
-          errorMessage,
-        );
-
-        if (attempt < maxRetries) {
-          attempt++;
-          console.warn(
-            `[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
-          );
-          continue; // Retry
-        }
-
-        console.warn(
-          `[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
-        );
-        return; // Gracefully exit without failing the test
-      }
-
-      throw error; // Real failure
    } finally {
      if (isSuccess) {
        await fs.promises.unlink(activityLogFile).catch((err) => {
@@ -188,7 +193,7 @@ export async function internalEvalTest(evalCase: EvalCase) {
      );
      await rig.cleanup();
    }
-  }
+  });
 }

 function getApiErrorCode(message: string): '500' | '503' | undefined {
@@ -252,9 +257,13 @@ function logReliabilityEvent(
 * intentionally uses synchronous filesystem and child_process operations
 * for simplicity and to ensure sequential environment preparation.
 */
-async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
+export async function prepareWorkspace(
+  testDir: string,
+  homeDir: string,
+  files: Record<string, string>,
+) {
  const acknowledgedAgents: Record<string, Record<string, string>> = {};
-  const projectRoot = fs.realpathSync(rig.testDir!);
+  const projectRoot = fs.realpathSync(testDir);

  for (const [filePath, content] of Object.entries(files)) {
    if (filePath.includes('..') || path.isAbsolute(filePath)) {
@@ -290,7 +299,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {

  if (Object.keys(acknowledgedAgents).length > 0) {
    const ackPath = path.join(
-      rig.homeDir!,
+      homeDir,
      '.gemini',
      'acknowledgments',
      'agents.json',
@@ -299,7 +308,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
    fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
  }

-  const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
+  const execOptions = { cwd: testDir, stdio: 'ignore' as const };
  execSync('git init --initial-branch=main', execOptions);
  execSync('git config user.email "test@example.com"', execOptions);
  execSync('git config user.name "Test User"', execOptions);
@@ -366,15 +375,18 @@ interface ForbiddenToolSettings {
  };
 }

-export interface EvalCase {
+/** Fields shared by the eval case types that extend this interface. */
+export interface BaseEvalCase {
  name: string;
+  /**
+   * Optional timeout. Helpers such as `appEvalTest` and `componentEvalTest`
+   * fall back to 60000 when unset and add 10000 before passing it to
+   * `runEval` (presumably milliseconds - confirm against `runEval`).
+   */
+  timeout?: number;
+  /** Map of file path to file content, handed to `prepareWorkspace`. */
+  files?: Record<string, string>;
+}
+
+export interface EvalCase extends BaseEvalCase {
  params?: {
    settings?: ForbiddenToolSettings & Record<string, unknown>;
    [key: string]: unknown;
  };
  prompt: string;
-  timeout?: number;
-  files?: Record<string, string>;
  setup?: (rig: TestRig) => Promise<void> | void;
  /** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
  messages?: Record<string, unknown>[];
@@ -0,0 +1,13 @@
+{
+  "extends": "../tsconfig.json",
+  "compilerOptions": {
+    "jsx": "react-jsx",
+    "lib": ["DOM", "DOM.Iterable", "ES2023"],
+    "types": ["node", "vitest/globals"]
+  },
+  "include": [
+    "**/*.ts",
+    "**/*.tsx"
+  ],
+  "exclude": ["node_modules", "logs"]
+}
@@ -24,7 +24,7 @@ export default defineConfig({
    environment: 'node',
    globals: true,
    alias: {
-      react: path.resolve(__dirname, '../node_modules/react'),
+      '@google/gemini-cli-core': path.resolve(__dirname, '../packages/core/index.ts'),
    },
    setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
    server: {
@@ -126,6 +126,8 @@ export * from './utils/cache.js';
 export * from './utils/markdownUtils.js';

 // Export services
+export * from './services/agentHistoryProvider.js';
+export * from './services/chatCompressionService.js';
 export * from './services/fileDiscoveryService.js';
 export * from './services/gitService.js';
 export * from './services/FolderTrustDiscoveryService.js';
				`@@ -0,0 +1 @@`
				{"numTotalTestSuites":1,"numPassedTestSuites":1,"numFailedTestSuites":0,"numPendingTestSuites":0,"numTotalTests":2,"numPassedTests":0,"numFailedTests":0,"numPendingTests":2,"numTodoTests":0,"snapshot":{"added":0,"failure":false,"filesAdded":0,"filesRemoved":0,"filesRemovedList":[],"filesUnmatched":0,"filesUpdated":0,"matched":0,"total":0,"unchecked":0,"uncheckedKeysByFile":[],"unmatched":0,"updated":0,"didUpdate":false},"startTime":1775089246511,"success":true,"testResults":[{"assertionResults":[{"ancestorTitles":[],"fullName":"AgentHistoryProvider correctly enforces High Watermark token limits","status":"skipped","title":"AgentHistoryProvider correctly enforces High Watermark token limits","failureMessages":[],"location":{"line":46,"column":1},"meta":{}},{"ancestorTitles":[],"fullName":"ChatCompressionService correctly condenses prompt history via Verification Probe","status":"skipped","title":"ChatCompressionService correctly condenses prompt history via Verification Probe","failureMessages":[],"location":{"line":83,"column":1},"meta":{}}],"startTime":1775089246511,"endTime":1775089246511,"status":"passed","message":"","name":"/Users/gundermanc/code/gemini-cli/compression/evals/compression.eval.ts"}]}