Patch summary (free text ahead of the first "diff --git" header; patch tools skip it)

What this patch does:
  * evals/test-helper.ts: moves the API-error retry loop out of internalEvalTest
    into an exported withEvalRetries(name, attemptFn) (maxRetries = 3; errors for
    which getApiErrorCode returns a code are retried, then logged and swallowed;
    any other error is rethrown). Renames setupTestFiles(rig, files) to an
    exported prepareWorkspace(testDir, homeDir, files). Introduces BaseEvalCase
    (name, timeout, files), which EvalCase now extends.
  * evals/app-test-helper.ts: AppEvalCase extends BaseEvalCase; appEvalTest runs
    its body inside withEvalRetries and seeds files through prepareWorkspace.
  * evals/component-test-helper.ts (new): ComponentRig and componentEvalTest for
    running evals directly against a Config, without the UI.
  * evals/compression.eval.ts (new): two component evals, for
    AgentHistoryProvider and ChatCompressionService.
  * evals/tsconfig.json (new), evals/vitest.config.ts (alias change),
    packages/core/src/index.ts (two new service re-exports).
  * evals/evals/logs/report.json (new): a test report file.

Review notes:
  * evals/evals/logs/report.json reports both tests as "skipped" and embeds an
    absolute local path in its "name" field; it looks like a generated artifact
    -- confirm it is meant to be committed.
  * componentEvalTest assigns logFile but never reads it.
  * evals/vitest.config.ts replaces the "react" alias instead of adding the new
    alias next to it -- confirm the react alias is no longer needed.
  * prepareWorkspace changes the git child-process stdio from 'inherit' to
    'ignore', so git output is no longer shown.
  * appEvalTest previously only wrote the files; through prepareWorkspace it now
    also runs the git init/config steps visible in that function.
  * NOTE(review): in this copy several type annotations carry no type arguments
    (for example "Record;", "Promise;", "Partial;", "Record> = {}") and one
    includes('') call has an empty argument. They are reproduced exactly as
    found; restore them from the original change before applying.
  * NOTE(review): blank context lines were re-inserted so that each hunk matches
    its header counts; the one before "await evalCase.assert(rig, result);" in
    the -139,37 hunk is the least certain placement.

diff --git a/evals/app-test-helper.ts b/evals/app-test-helper.ts
index 8ea842aa38..017063a3f7 100644
--- a/evals/app-test-helper.ts
+++ b/evals/app-test-helper.ts
@@ -10,6 +10,9 @@ import {
   runEval,
   prepareLogDir,
   symlinkNodeModules,
+  withEvalRetries,
+  prepareWorkspace,
+  BaseEvalCase,
 } from './test-helper.js';
 import fs from 'node:fs';
 import path from 'node:path';
@@ -32,12 +35,9 @@ interface EvalConfigOverrides {
   [key: string]: unknown;
 }
 
-export interface AppEvalCase {
-  name: string;
+export interface AppEvalCase extends BaseEvalCase {
   configOverrides?: EvalConfigOverrides;
   prompt: string;
-  timeout?: number;
-  files?: Record;
   setup?: (rig: AppRig) => Promise;
   assert: (rig: AppRig, output: string) => Promise;
 }
@@ -48,55 +48,54 @@ export interface AppEvalCase {
  */
 export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
   const fn = async () => {
-    const rig = new AppRig({
-      configOverrides: {
-        model: DEFAULT_GEMINI_MODEL,
-        ...evalCase.configOverrides,
-      },
-    });
+    await withEvalRetries(evalCase.name, async () => {
+      const rig = new AppRig({
+        configOverrides: {
+          model: DEFAULT_GEMINI_MODEL,
+          ...evalCase.configOverrides,
+        },
+      });
 
-    const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
-    const logFile = path.join(logDir, `${sanitizedName}.log`);
+      const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
+      const logFile = path.join(logDir, `${sanitizedName}.log`);
 
-    try {
-      await rig.initialize();
+      try {
+        await rig.initialize();
 
-      const testDir = rig.getTestDir();
-      symlinkNodeModules(testDir);
+        const testDir = rig.getTestDir();
+        symlinkNodeModules(testDir);
 
-      // Setup initial files
-      if (evalCase.files) {
-        for (const [filePath, content] of Object.entries(evalCase.files)) {
-          const fullPath = path.join(testDir, filePath);
-          fs.mkdirSync(path.dirname(fullPath), { recursive: true });
-          fs.writeFileSync(fullPath, content);
+        // Setup initial files
+        if (evalCase.files) {
+          // Note: AppRig does not use a separate homeDir, so we use testDir twice
+          await prepareWorkspace(testDir, testDir, evalCase.files);
         }
+
+        // Run custom setup if provided (e.g. for breakpoints)
+        if (evalCase.setup) {
+          await evalCase.setup(rig);
+        }
+
+        // Render the app!
+        await rig.render();
+
+        // Wait for initial ready state
+        await rig.waitForIdle();
+
+        // Send the initial prompt
+        await rig.sendMessage(evalCase.prompt);
+
+        // Run assertion. Interaction-heavy tests can do their own waiting/steering here.
+        const output = rig.getStaticOutput();
+        await evalCase.assert(rig, output);
+      } finally {
+        const output = rig.getStaticOutput();
+        if (output) {
+          await fs.promises.writeFile(logFile, output);
+        }
+        await rig.unmount();
       }
-
-      // Run custom setup if provided (e.g. for breakpoints)
-      if (evalCase.setup) {
-        await evalCase.setup(rig);
-      }
-
-      // Render the app!
-      await rig.render();
-
-      // Wait for initial ready state
-      await rig.waitForIdle();
-
-      // Send the initial prompt
-      await rig.sendMessage(evalCase.prompt);
-
-      // Run assertion. Interaction-heavy tests can do their own waiting/steering here.
-      const output = rig.getStaticOutput();
-      await evalCase.assert(rig, output);
-    } finally {
-      const output = rig.getStaticOutput();
-      if (output) {
-        await fs.promises.writeFile(logFile, output);
-      }
-      await rig.unmount();
-    }
+    });
   };
 
   runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
diff --git a/evals/component-test-helper.ts b/evals/component-test-helper.ts
new file mode 100644
index 0000000000..9e5c9aa567
--- /dev/null
+++ b/evals/component-test-helper.ts
@@ -0,0 +1,133 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import {
+  type EvalPolicy,
+  runEval,
+  prepareLogDir,
+  withEvalRetries,
+  prepareWorkspace,
+  type BaseEvalCase,
+} from './test-helper.js';
+import fs from 'node:fs';
+import path from 'node:path';
+import os from 'node:os';
+import { randomUUID } from 'node:crypto';
+import {
+  Config,
+  type ConfigParameters,
+  AuthType,
+  ApprovalMode,
+  createPolicyEngineConfig,
+  ExtensionLoader,
+  IntegrityDataStatus,
+  makeFakeConfig,
+} from '@google/gemini-cli-core';
+import { createMockSettings } from '../packages/cli/src/test-utils/settings.js';
+
+// A minimal mock ExtensionManager to bypass integrity checks
+class MockExtensionManager extends ExtensionLoader {
+  getExtensions = () => [];
+  setRequestConsent = () => {};
+  setRequestSetting = () => {};
+  integrityManager = {
+    verifyExtensionIntegrity: async () => IntegrityDataStatus.VERIFIED,
+    storeExtensionIntegrity: async () => undefined,
+  };
+}
+
+export interface ComponentEvalCase extends BaseEvalCase {
+  configOverrides?: Partial;
+  setup?: (config: Config) => Promise;
+  assert: (config: Config) => Promise;
+}
+
+export class ComponentRig {
+  public config: Config | undefined;
+  public testDir: string;
+  public sessionId: string;
+
+  constructor(
+    private options: { configOverrides?: Partial } = {},
+  ) {
+    const uniqueId = randomUUID();
+    this.testDir = fs.mkdtempSync(
+      path.join(os.tmpdir(), `gemini-component-rig-${uniqueId.slice(0, 8)}-`),
+    );
+    this.sessionId = `test-session-${uniqueId}`;
+  }
+
+  async initialize() {
+    const settings = createMockSettings();
+    const policyEngineConfig = await createPolicyEngineConfig(
+      settings.merged,
+      ApprovalMode.DEFAULT,
+    );
+
+    const configParams: ConfigParameters = {
+      sessionId: this.sessionId,
+      targetDir: this.testDir,
+      cwd: this.testDir,
+      debugMode: false,
+      model: 'test-model',
+      interactive: false,
+      approvalMode: ApprovalMode.DEFAULT,
+      policyEngineConfig,
+      enableEventDrivenScheduler: false, // Don't need scheduler for direct component tests
+      extensionLoader: new MockExtensionManager() as any,
+      useAlternateBuffer: false,
+      ...this.options.configOverrides,
+    };
+
+    this.config = makeFakeConfig(configParams);
+    await this.config.initialize();
+
+    // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient
+    await this.config.refreshAuth(AuthType.USE_GEMINI);
+  }
+
+  async cleanup() {
+    fs.rmSync(this.testDir, { recursive: true, force: true });
+  }
+}
+
+/**
+ * A helper for running behavioral evaluations directly against backend components.
+ * It provides a fully initialized Config with real API access, bypassing the UI.
+ */
+export function componentEvalTest(
+  policy: EvalPolicy,
+  evalCase: ComponentEvalCase,
+) {
+  const fn = async () => {
+    await withEvalRetries(evalCase.name, async () => {
+      const rig = new ComponentRig({
+        configOverrides: evalCase.configOverrides,
+      });
+
+      const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
+      const logFile = path.join(logDir, `${sanitizedName}-component.log`);
+
+      try {
+        await rig.initialize();
+
+        if (evalCase.files) {
+          await prepareWorkspace(rig.testDir, rig.testDir, evalCase.files);
+        }
+
+        if (evalCase.setup) {
+          await evalCase.setup(rig.config!);
+        }
+
+        await evalCase.assert(rig.config!);
+      } finally {
+        await rig.cleanup();
+      }
+    });
+  };
+
+  runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
+}
diff --git a/evals/compression.eval.ts b/evals/compression.eval.ts
new file mode 100644
index 0000000000..c67eefb0bf
--- /dev/null
+++ b/evals/compression.eval.ts
@@ -0,0 +1,121 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { expect } from 'vitest';
+import { componentEvalTest } from './component-test-helper.js';
+import {
+  AgentHistoryProvider,
+  ChatCompressionService,
+  CompressionStatus,
+  GeminiChat,
+} from '@google/gemini-cli-core';
+import type { Content } from '@google/genai';
+
+// Create a highly repetitive and long chat history to trigger compression.
+const createMockLongHistory = (numTurns: number = 30): Content[] => {
+  const history: Content[] = [];
+  for (let i = 0; i < numTurns; i++) {
+    history.push({
+      role: 'user',
+      parts: [
+        {
+          text: `Here is a repetitive piece of context: The system is running nominally. The load is ${
+            i % 100
+          }%. All components operational. Please acknowledge and summarize the previous items.`,
+        },
+      ],
+    });
+    history.push({
+      role: 'model',
+      parts: [
+        {
+          text: `Acknowledged. The system load is ${
+            i % 100
+          }%. I am maintaining readiness. The previous items are nominal.`,
+        },
+      ],
+    });
+  }
+  return history;
+};
+
+// --- AgentHistoryProvider Eval ---
+componentEvalTest('USUALLY_PASSES', {
+  name: 'AgentHistoryProvider correctly enforces High Watermark token limits',
+  setup: async (config) => {
+    // Optional setup before assertion
+  },
+  assert: async (config) => {
+    // Configure provider with very tight constraints to force truncation immediately
+    const providerConfig = {
+      isTruncationEnabled: true,
+      isSummarizationEnabled: true, // Need this to generate
+      maxTokens: 500, // Trigger limit
+      retainedTokens: 200, // Target budget after truncation
+      normalMessageTokens: 100, // Limit for old messages
+      maximumMessageTokens: 200, // Limit for newest messages
+      normalizationHeadRatio: 0.1, // Required by AgentHistoryProviderConfig
+    };
+
+    const provider = new AgentHistoryProvider(providerConfig, config);
+    const mockHistory = createMockLongHistory(30);
+
+    const originalLength = mockHistory.length;
+    const resultHistory = await provider.manageHistory(mockHistory);
+
+    // The returned history should be compressed (fewer turns, as the older turns were summarized)
+    expect(resultHistory.length).toBeLessThan(originalLength);
+
+    // There should be a system prompt or a summarized state snapshot injected into the history
+    const hasSummarizedContent = resultHistory.some(
+      (content) =>
+        content.role === 'user' &&
+        content.parts?.[0]?.text?.includes(''),
+    );
+    expect(hasSummarizedContent).toBe(true);
+  },
+});
+
+// --- ChatCompressionService Eval ---
+componentEvalTest('USUALLY_PASSES', {
+  name: 'ChatCompressionService correctly condenses prompt history via Verification Probe',
+  assert: async (config) => {
+    const chatService = new ChatCompressionService();
+    const mockContext = {
+      config,
+      promptId: 'test-prompt-id',
+      toolRegistry: undefined as any,
+      promptRegistry: undefined as any,
+      resourceRegistry: undefined as any,
+      messageBus: undefined as any,
+      geminiClient: undefined as any,
+      sandboxManager: undefined as any,
+    };
+
+    const chat = new GeminiChat(mockContext, '', [], createMockLongHistory(30));
+
+    const result = await chatService.compress(
+      chat,
+      'test-prompt-id',
+      true, // force compression
+      'test-model',
+      config,
+      false, // hasFailedCompressionAttempt
+    );
+
+    expect(result.newHistory).toBeDefined();
+    expect(result.newHistory).not.toBeNull();
+
+    // Verify it returned a condensed history array
+    expect(result.newHistory!.length).toBeLessThan(chat.getHistory().length);
+
+    // Verify info metadata indicates a successful compression token reduction
+    expect(result.info.newTokenCount).toBeLessThan(
+      result.info.originalTokenCount,
+    );
+    expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
+  },
+});
diff --git a/evals/evals/logs/report.json b/evals/evals/logs/report.json
new file mode 100644
index 0000000000..c47c54b0af
--- /dev/null
+++ b/evals/evals/logs/report.json
@@ -0,0 +1 @@
+{"numTotalTestSuites":1,"numPassedTestSuites":1,"numFailedTestSuites":0,"numPendingTestSuites":0,"numTotalTests":2,"numPassedTests":0,"numFailedTests":0,"numPendingTests":2,"numTodoTests":0,"snapshot":{"added":0,"failure":false,"filesAdded":0,"filesRemoved":0,"filesRemovedList":[],"filesUnmatched":0,"filesUpdated":0,"matched":0,"total":0,"unchecked":0,"uncheckedKeysByFile":[],"unmatched":0,"updated":0,"didUpdate":false},"startTime":1775089246511,"success":true,"testResults":[{"assertionResults":[{"ancestorTitles":[],"fullName":"AgentHistoryProvider correctly enforces High Watermark token limits","status":"skipped","title":"AgentHistoryProvider correctly enforces High Watermark token limits","failureMessages":[],"location":{"line":46,"column":1},"meta":{}},{"ancestorTitles":[],"fullName":"ChatCompressionService correctly condenses prompt history via Verification Probe","status":"skipped","title":"ChatCompressionService correctly condenses prompt history via Verification Probe","failureMessages":[],"location":{"line":83,"column":1},"meta":{}}],"startTime":1775089246511,"endTime":1775089246511,"status":"passed","message":"","name":"/Users/gundermanc/code/gemini-cli/compression/evals/compression.eval.ts"}]}
\ No newline at end of file
diff --git a/evals/test-helper.ts b/evals/test-helper.ts
index 2bf9188eee..cf7ef273e9 100644
--- a/evals/test-helper.ts
+++ b/evals/test-helper.ts
@@ -47,11 +47,47 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
   );
 }
 
-export async function internalEvalTest(evalCase: EvalCase) {
+export async function withEvalRetries(
+  name: string,
+  attemptFn: (attempt: number) => Promise,
+) {
   const maxRetries = 3;
   let attempt = 0;
 
   while (attempt <= maxRetries) {
+    try {
+      await attemptFn(attempt);
+      return; // Success! Exit the retry loop.
+    } catch (error: unknown) {
+      const errorMessage =
+        error instanceof Error ? error.message : String(error);
+      const errorCode = getApiErrorCode(errorMessage);
+
+      if (errorCode) {
+        const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
+        logReliabilityEvent(name, attempt, status, errorCode, errorMessage);
+
+        if (attempt < maxRetries) {
+          attempt++;
+          console.warn(
+            `[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
+          );
+          continue; // Retry
+        }
+
+        console.warn(
+          `[Eval] '${name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
+        );
+        return; // Gracefully exit without failing the test
+      }
+
+      throw error; // Real failure
+    }
+  }
+}
+
+export async function internalEvalTest(evalCase: EvalCase) {
+  await withEvalRetries(evalCase.name, async () => {
     const rig = new TestRig();
     const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
     const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
@@ -66,7 +102,7 @@ export async function internalEvalTest(evalCase: EvalCase) {
       }
 
       if (evalCase.files) {
-        await setupTestFiles(rig, evalCase.files);
+        await prepareWorkspace(rig.testDir!, rig.homeDir!, evalCase.files);
       }
 
       symlinkNodeModules(rig.testDir || '');
@@ -139,37 +175,6 @@ export async function internalEvalTest(evalCase: EvalCase) {
 
       await evalCase.assert(rig, result);
       isSuccess = true;
-      return; // Success! Exit the retry loop.
-    } catch (error: unknown) {
-      const errorMessage =
-        error instanceof Error ? error.message : String(error);
-      const errorCode = getApiErrorCode(errorMessage);
-
-      if (errorCode) {
-        const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
-        logReliabilityEvent(
-          evalCase.name,
-          attempt,
-          status,
-          errorCode,
-          errorMessage,
-        );
-
-        if (attempt < maxRetries) {
-          attempt++;
-          console.warn(
-            `[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
-          );
-          continue; // Retry
-        }
-
-        console.warn(
-          `[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
-        );
-        return; // Gracefully exit without failing the test
-      }
-
-      throw error; // Real failure
     } finally {
       if (isSuccess) {
         await fs.promises.unlink(activityLogFile).catch((err) => {
@@ -188,7 +193,7 @@ export async function internalEvalTest(evalCase: EvalCase) {
       );
       await rig.cleanup();
     }
-  }
+  });
 }
 
 function getApiErrorCode(message: string): '500' | '503' | undefined {
@@ -252,9 +257,13 @@ function logReliabilityEvent(
  * intentionally uses synchronous filesystem and child_process operations
  * for simplicity and to ensure sequential environment preparation.
  */
-async function setupTestFiles(rig: TestRig, files: Record) {
+export async function prepareWorkspace(
+  testDir: string,
+  homeDir: string,
+  files: Record,
+) {
   const acknowledgedAgents: Record> = {};
-  const projectRoot = fs.realpathSync(rig.testDir!);
+  const projectRoot = fs.realpathSync(testDir);
 
   for (const [filePath, content] of Object.entries(files)) {
     if (filePath.includes('..') || path.isAbsolute(filePath)) {
@@ -290,7 +299,7 @@ async function setupTestFiles(rig: TestRig, files: Record) {
 
   if (Object.keys(acknowledgedAgents).length > 0) {
     const ackPath = path.join(
-      rig.homeDir!,
+      homeDir,
       '.gemini',
       'acknowledgments',
       'agents.json',
@@ -299,7 +308,7 @@ async function setupTestFiles(rig: TestRig, files: Record) {
     fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
   }
 
-  const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
+  const execOptions = { cwd: testDir, stdio: 'ignore' as const };
   execSync('git init --initial-branch=main', execOptions);
   execSync('git config user.email "test@example.com"', execOptions);
   execSync('git config user.name "Test User"', execOptions);
@@ -366,15 +375,18 @@ interface ForbiddenToolSettings {
   };
 }
 
-export interface EvalCase {
+export interface BaseEvalCase {
   name: string;
+  timeout?: number;
+  files?: Record;
+}
+
+export interface EvalCase extends BaseEvalCase {
   params?: {
     settings?: ForbiddenToolSettings & Record;
     [key: string]: unknown;
   };
   prompt: string;
-  timeout?: number;
-  files?: Record;
   setup?: (rig: TestRig) => Promise | void;
   /** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
   messages?: Record[];
diff --git a/evals/tsconfig.json b/evals/tsconfig.json
new file mode 100644
index 0000000000..8a8150e90d
--- /dev/null
+++ b/evals/tsconfig.json
@@ -0,0 +1,13 @@
+{
+  "extends": "../tsconfig.json",
+  "compilerOptions": {
+    "jsx": "react-jsx",
+    "lib": ["DOM", "DOM.Iterable", "ES2023"],
+    "types": ["node", "vitest/globals"]
+  },
+  "include": [
+    "**/*.ts",
+    "**/*.tsx"
+  ],
+  "exclude": ["node_modules", "logs"]
+}
\ No newline at end of file
diff --git a/evals/vitest.config.ts b/evals/vitest.config.ts
index 50733a999c..f92aa6b455 100644
--- a/evals/vitest.config.ts
+++ b/evals/vitest.config.ts
@@ -24,7 +24,7 @@ export default defineConfig({
     environment: 'node',
     globals: true,
     alias: {
-      react: path.resolve(__dirname, '../node_modules/react'),
+      '@google/gemini-cli-core': path.resolve(__dirname, '../packages/core/index.ts'),
     },
     setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
     server: {
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 4633b5f4c3..48d46918f3 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -126,6 +126,8 @@ export * from './utils/cache.js';
 export * from './utils/markdownUtils.js';
 
 // Export services
+export * from './services/agentHistoryProvider.js';
+export * from './services/chatCompressionService.js';
 export * from './services/fileDiscoveryService.js';
 export * from './services/gitService.js';
 export * from './services/FolderTrustDiscoveryService.js';