Initial evals infra split.

This commit is contained in:
Christian Gunderman
2026-04-01 17:26:43 -07:00
parent e293424bb4
commit 704ad573f1
8 changed files with 370 additions and 89 deletions
+46 -47
View File
@@ -10,6 +10,9 @@ import {
runEval,
prepareLogDir,
symlinkNodeModules,
withEvalRetries,
prepareWorkspace,
BaseEvalCase,
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
@@ -32,12 +35,9 @@ interface EvalConfigOverrides {
[key: string]: unknown;
}
export interface AppEvalCase {
name: string;
export interface AppEvalCase extends BaseEvalCase {
configOverrides?: EvalConfigOverrides;
prompt: string;
timeout?: number;
files?: Record<string, string>;
setup?: (rig: AppRig) => Promise<void>;
assert: (rig: AppRig, output: string) => Promise<void>;
}
@@ -48,55 +48,54 @@ export interface AppEvalCase {
*/
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const fn = async () => {
const rig = new AppRig({
configOverrides: {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
},
});
await withEvalRetries(evalCase.name, async () => {
const rig = new AppRig({
configOverrides: {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
},
});
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const logFile = path.join(logDir, `${sanitizedName}.log`);
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const logFile = path.join(logDir, `${sanitizedName}.log`);
try {
await rig.initialize();
try {
await rig.initialize();
const testDir = rig.getTestDir();
symlinkNodeModules(testDir);
const testDir = rig.getTestDir();
symlinkNodeModules(testDir);
// Setup initial files
if (evalCase.files) {
for (const [filePath, content] of Object.entries(evalCase.files)) {
const fullPath = path.join(testDir, filePath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content);
// Setup initial files
if (evalCase.files) {
// Note: AppRig does not use a separate homeDir, so we use testDir twice
await prepareWorkspace(testDir, testDir, evalCase.files);
}
// Run custom setup if provided (e.g. for breakpoints)
if (evalCase.setup) {
await evalCase.setup(rig);
}
// Render the app!
await rig.render();
// Wait for initial ready state
await rig.waitForIdle();
// Send the initial prompt
await rig.sendMessage(evalCase.prompt);
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
const output = rig.getStaticOutput();
await evalCase.assert(rig, output);
} finally {
const output = rig.getStaticOutput();
if (output) {
await fs.promises.writeFile(logFile, output);
}
await rig.unmount();
}
// Run custom setup if provided (e.g. for breakpoints)
if (evalCase.setup) {
await evalCase.setup(rig);
}
// Render the app!
await rig.render();
// Wait for initial ready state
await rig.waitForIdle();
// Send the initial prompt
await rig.sendMessage(evalCase.prompt);
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
const output = rig.getStaticOutput();
await evalCase.assert(rig, output);
} finally {
const output = rig.getStaticOutput();
if (output) {
await fs.promises.writeFile(logFile, output);
}
await rig.unmount();
}
});
};
runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
+133
View File
@@ -0,0 +1,133 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import {
type EvalPolicy,
runEval,
prepareLogDir,
withEvalRetries,
prepareWorkspace,
type BaseEvalCase,
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { randomUUID } from 'node:crypto';
import {
Config,
type ConfigParameters,
AuthType,
ApprovalMode,
createPolicyEngineConfig,
ExtensionLoader,
IntegrityDataStatus,
makeFakeConfig,
} from '@google/gemini-cli-core';
import { createMockSettings } from '../packages/cli/src/test-utils/settings.js';
/**
 * A minimal mock ExtensionManager to bypass integrity checks.
 *
 * Every member is a stub: no extensions are reported, the consent/setting
 * hooks do nothing, and the integrity manager always reports success.
 */
class MockExtensionManager extends ExtensionLoader {
// Always reports an empty extension list.
getExtensions = () => [];
// No-op hooks; whatever is passed in is ignored.
setRequestConsent = () => {};
setRequestSetting = () => {};
// Stub integrity manager: verification always resolves to VERIFIED and
// storing integrity data resolves to undefined without persisting anything.
integrityManager = {
verifyExtensionIntegrity: async () => IntegrityDataStatus.VERIFIED,
storeExtensionIntegrity: async () => undefined,
};
}
/**
 * Description of a single component-level eval, consumed by
 * `componentEvalTest`. Inherits `name`, `timeout` and `files` from
 * `BaseEvalCase`.
 */
export interface ComponentEvalCase extends BaseEvalCase {
/** Optional `ConfigParameters` overrides handed to the `ComponentRig` for this case. */
configOverrides?: Partial<ConfigParameters>;
/** Optional hook awaited with the initialized `Config` before `assert` runs. */
setup?: (config: Config) => Promise<void>;
/** The eval body; receives the initialized `Config` and should reject on failure. */
assert: (config: Config) => Promise<void>;
}
/**
 * Lightweight rig for component evals: owns a throwaway temp directory and,
 * once `initialize()` has run, a `Config` rooted in that directory.
 */
export class ComponentRig {
  /** Set by `initialize()`; stays `undefined` until then. */
  public config: Config | undefined;
  /** Temp directory created in the constructor; used as `targetDir` and `cwd`. */
  public testDir: string;
  /** Session identifier of the form `test-session-<uuid>`. */
  public sessionId: string;

  /**
   * Creates the temp directory and session id. No `Config` is built here.
   *
   * @param options Optional `configOverrides` merged over the rig defaults
   *   when `initialize()` builds the `Config`.
   */
  constructor(
    private options: { configOverrides?: Partial<ConfigParameters> } = {},
  ) {
    const rigId = randomUUID();
    // Only the first 8 characters of the UUID go into the directory name;
    // the session id carries the full UUID.
    const dirPrefix = path.join(
      os.tmpdir(),
      `gemini-component-rig-${rigId.slice(0, 8)}-`,
    );
    this.testDir = fs.mkdtempSync(dirPrefix);
    this.sessionId = `test-session-${rigId}`;
  }

  /**
   * Builds the `Config` from mock settings plus any overrides, initializes it,
   * and refreshes auth with `AuthType.USE_GEMINI`.
   */
  async initialize() {
    const { merged: mergedSettings } = createMockSettings();
    const policyEngineConfig = await createPolicyEngineConfig(
      mergedSettings,
      ApprovalMode.DEFAULT,
    );

    const defaultParams: ConfigParameters = {
      sessionId: this.sessionId,
      targetDir: this.testDir,
      cwd: this.testDir,
      debugMode: false,
      model: 'test-model',
      interactive: false,
      approvalMode: ApprovalMode.DEFAULT,
      policyEngineConfig,
      // Direct component tests do not need the scheduler.
      enableEventDrivenScheduler: false,
      extensionLoader: new MockExtensionManager() as any,
      useAlternateBuffer: false,
    };
    // Caller-supplied overrides win over every default above.
    const effectiveParams: ConfigParameters = {
      ...defaultParams,
      ...this.options.configOverrides,
    };

    const builtConfig = makeFakeConfig(effectiveParams);
    this.config = builtConfig;
    await builtConfig.initialize();

    // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient.
    await builtConfig.refreshAuth(AuthType.USE_GEMINI);
  }

  /** Recursively removes the temp directory; a missing directory is not an error. */
  async cleanup() {
    const removal = { recursive: true, force: true };
    fs.rmSync(this.testDir, removal);
  }
}
/**
 * A helper for running behavioral evaluations directly against backend components.
 * It provides a fully initialized Config with real API access, bypassing the UI.
 *
 * Each attempt (see `withEvalRetries`) gets a fresh `ComponentRig`, which is
 * always cleaned up afterwards, even when preparation or the assertion fails.
 *
 * @param policy Eval policy forwarded to `runEval`.
 * @param evalCase The case to run. `files` are written into the rig's test
 *   directory, then `setup` (if any) and `assert` are awaited with the
 *   initialized `Config`.
 */
export function componentEvalTest(
  policy: EvalPolicy,
  evalCase: ComponentEvalCase,
) {
  const fn = async () => {
    await withEvalRetries(evalCase.name, async () => {
      const rig = new ComponentRig({
        configOverrides: evalCase.configOverrides,
      });

      // The rig's constructor has already created a temp directory, so every
      // later step, including log-dir preparation, runs under the `finally`
      // that removes it.
      try {
        // Kept for whatever preparation it performs for this eval's log
        // directory; this helper does not write a log file itself.
        await prepareLogDir(evalCase.name);

        await rig.initialize();

        // Narrow once instead of asserting non-null at every use.
        const config = rig.config;
        if (!config) {
          throw new Error(
            `ComponentRig.initialize() did not produce a Config for '${evalCase.name}'`,
          );
        }

        if (evalCase.files) {
          // ComponentRig has no separate home directory, so testDir is used for both.
          await prepareWorkspace(rig.testDir, rig.testDir, evalCase.files);
        }

        if (evalCase.setup) {
          await evalCase.setup(config);
        }

        await evalCase.assert(config);
      } finally {
        await rig.cleanup();
      }
    });
  };

  // The fixed 10000 on top of the case timeout (default 60000) is the
  // allowance passed to runEval for the whole attempt loop.
  runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
}
+121
View File
@@ -0,0 +1,121 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { expect } from 'vitest';
import { componentEvalTest } from './component-test-helper.js';
import {
AgentHistoryProvider,
ChatCompressionService,
CompressionStatus,
GeminiChat,
} from '@google/gemini-cli-core';
import type { Content } from '@google/genai';
/**
 * Builds a long, highly repetitive chat transcript intended to trigger
 * compression.
 *
 * Each turn contributes one `user` message followed by one `model` message,
 * so the result holds `2 * numTurns` entries. The reported load cycles through
 * 0-99 (`turn % 100`).
 *
 * @param numTurns Number of user/model exchanges to generate.
 * @returns The alternating user/model history.
 */
const createMockLongHistory = (numTurns: number = 30): Content[] => {
  const userMessage = (load: number): Content => ({
    role: 'user',
    parts: [
      {
        text: `Here is a repetitive piece of context: The system is running nominally. The load is ${load}%. All components operational. Please acknowledge and summarize the previous items.`,
      },
    ],
  });

  const modelMessage = (load: number): Content => ({
    role: 'model',
    parts: [
      {
        text: `Acknowledged. The system load is ${load}%. I am maintaining readiness. The previous items are nominal.`,
      },
    ],
  });

  const transcript: Content[] = [];
  let turn = 0;
  while (turn < numTurns) {
    const load = turn % 100;
    transcript.push(userMessage(load), modelMessage(load));
    turn += 1;
  }
  return transcript;
};
// --- AgentHistoryProvider Eval ---
// Feeds a long synthetic history through AgentHistoryProvider with very small
// token budgets and checks that the managed history is shorter and contains a
// user-role entry carrying an `<intent_summary>` block.
componentEvalTest('USUALLY_PASSES', {
  name: 'AgentHistoryProvider correctly enforces High Watermark token limits',
  // No preparation is required; the hook is kept as an explicit no-op.
  setup: async (_config) => {},
  assert: async (config) => {
    // Very tight constraints so truncation kicks in immediately.
    // NOTE(review): an earlier comment tied summarization to `<state_snapshot>`,
    // while the check below looks for `<intent_summary>` — confirm which tag
    // the provider emits.
    const tightLimits = {
      isTruncationEnabled: true,
      isSummarizationEnabled: true,
      maxTokens: 500, // Trigger limit
      retainedTokens: 200, // Target budget after truncation
      normalMessageTokens: 100, // Limit for old messages
      maximumMessageTokens: 200, // Limit for newest messages
      normalizationHeadRatio: 0.1, // Required by AgentHistoryProviderConfig
    };
    const historyProvider = new AgentHistoryProvider(tightLimits, config);

    const inputHistory = createMockLongHistory(30);
    // Capture the size up front, before the provider sees the array.
    const entriesBefore = inputHistory.length;

    const managedHistory = await historyProvider.manageHistory(inputHistory);

    // Fewer entries means older turns were folded away.
    expect(managedHistory.length).toBeLessThan(entriesBefore);

    // At least one user-role entry must carry the summary marker.
    const sawSummary = managedHistory.some((entry) => {
      if (entry.role !== 'user') {
        return false;
      }
      return entry.parts?.[0]?.text?.includes('<intent_summary>');
    });
    expect(sawSummary).toBe(true);
  },
});
// --- ChatCompressionService Eval ---
// Forces ChatCompressionService to compress a long synthetic chat and checks
// that both the history length and the reported token count shrink.
componentEvalTest('USUALLY_PASSES', {
  name: 'ChatCompressionService correctly condenses prompt history via Verification Probe',
  assert: async (config) => {
    const promptId = 'test-prompt-id';

    // Only `config` and `promptId` are real; the remaining fields are left
    // undefined behind `as any` casts.
    const stubContext = {
      config,
      promptId,
      toolRegistry: undefined as any,
      promptRegistry: undefined as any,
      resourceRegistry: undefined as any,
      messageBus: undefined as any,
      geminiClient: undefined as any,
      sandboxManager: undefined as any,
    };

    const seededHistory = createMockLongHistory(30);
    const chat = new GeminiChat(stubContext, '', [], seededHistory);
    const compressor = new ChatCompressionService();

    const forceCompression = true;
    const hasFailedCompressionAttempt = false;
    const outcome = await compressor.compress(
      chat,
      promptId,
      forceCompression,
      'test-model',
      config,
      hasFailedCompressionAttempt,
    );

    // A condensed history must have been returned...
    const condensed = outcome.newHistory;
    expect(condensed).toBeDefined();
    expect(condensed).not.toBeNull();

    // ...and it must be shorter than what the chat currently holds.
    expect(condensed!.length).toBeLessThan(chat.getHistory().length);

    // The metadata must report a token reduction and a COMPRESSED status.
    const stats = outcome.info;
    expect(stats.newTokenCount).toBeLessThan(stats.originalTokenCount);
    expect(stats.compressionStatus).toBe(CompressionStatus.COMPRESSED);
  },
});
+1
View File
@@ -0,0 +1 @@
{"numTotalTestSuites":1,"numPassedTestSuites":1,"numFailedTestSuites":0,"numPendingTestSuites":0,"numTotalTests":2,"numPassedTests":0,"numFailedTests":0,"numPendingTests":2,"numTodoTests":0,"snapshot":{"added":0,"failure":false,"filesAdded":0,"filesRemoved":0,"filesRemovedList":[],"filesUnmatched":0,"filesUpdated":0,"matched":0,"total":0,"unchecked":0,"uncheckedKeysByFile":[],"unmatched":0,"updated":0,"didUpdate":false},"startTime":1775089246511,"success":true,"testResults":[{"assertionResults":[{"ancestorTitles":[],"fullName":"AgentHistoryProvider correctly enforces High Watermark token limits","status":"skipped","title":"AgentHistoryProvider correctly enforces High Watermark token limits","failureMessages":[],"location":{"line":46,"column":1},"meta":{}},{"ancestorTitles":[],"fullName":"ChatCompressionService correctly condenses prompt history via Verification Probe","status":"skipped","title":"ChatCompressionService correctly condenses prompt history via Verification Probe","failureMessages":[],"location":{"line":83,"column":1},"meta":{}}],"startTime":1775089246511,"endTime":1775089246511,"status":"passed","message":"","name":"/Users/gundermanc/code/gemini-cli/compression/evals/compression.eval.ts"}]}
+53 -41
View File
@@ -47,11 +47,47 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
);
}
export async function internalEvalTest(evalCase: EvalCase) {
/**
 * Runs `attemptFn`, retrying when it fails with an error that
 * `getApiErrorCode` recognizes as an API error.
 *
 * - Success on any attempt resolves immediately.
 * - A recognized API error is logged via `logReliabilityEvent` and retried, up
 *   to 3 retries (4 attempts in total). Once retries are exhausted the failure
 *   is swallowed and the function resolves, so the eval does not fail.
 * - Any other error is rethrown unchanged.
 *
 * @param name Eval name used in reliability events and console warnings.
 * @param attemptFn Work to perform; receives the zero-based attempt index.
 */
export async function withEvalRetries(
  name: string,
  attemptFn: (attempt: number) => Promise<void>,
) {
  const maxRetries = 3;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      await attemptFn(attempt);
      return; // Success: nothing left to do.
    } catch (error: unknown) {
      const errorMessage =
        error instanceof Error ? error.message : String(error);
      const errorCode = getApiErrorCode(errorMessage);

      // Not an API error: this is a real failure.
      if (!errorCode) {
        throw error;
      }

      const retriesRemain = attempt < maxRetries;
      logReliabilityEvent(
        name,
        attempt,
        retriesRemain ? 'RETRY' : 'SKIP',
        errorCode,
        errorMessage,
      );

      if (!retriesRemain) {
        console.warn(
          `[Eval] '${name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
        );
        return; // Gracefully exit without failing the test.
      }

      // The warning reports the attempt as a one-based number.
      console.warn(
        `[Eval] Attempt ${attempt + 1} failed with ${errorCode} Error. Retrying...`,
      );
    }
  }
}
export async function internalEvalTest(evalCase: EvalCase) {
await withEvalRetries(evalCase.name, async () => {
const rig = new TestRig();
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
@@ -66,7 +102,7 @@ export async function internalEvalTest(evalCase: EvalCase) {
}
if (evalCase.files) {
await setupTestFiles(rig, evalCase.files);
await prepareWorkspace(rig.testDir!, rig.homeDir!, evalCase.files);
}
symlinkNodeModules(rig.testDir || '');
@@ -139,37 +175,6 @@ export async function internalEvalTest(evalCase: EvalCase) {
await evalCase.assert(rig, result);
isSuccess = true;
return; // Success! Exit the retry loop.
} catch (error: unknown) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const errorCode = getApiErrorCode(errorMessage);
if (errorCode) {
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
logReliabilityEvent(
evalCase.name,
attempt,
status,
errorCode,
errorMessage,
);
if (attempt < maxRetries) {
attempt++;
console.warn(
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
);
continue; // Retry
}
console.warn(
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
);
return; // Gracefully exit without failing the test
}
throw error; // Real failure
} finally {
if (isSuccess) {
await fs.promises.unlink(activityLogFile).catch((err) => {
@@ -188,7 +193,7 @@ export async function internalEvalTest(evalCase: EvalCase) {
);
await rig.cleanup();
}
}
});
}
function getApiErrorCode(message: string): '500' | '503' | undefined {
@@ -252,9 +257,13 @@ function logReliabilityEvent(
* intentionally uses synchronous filesystem and child_process operations
* for simplicity and to ensure sequential environment preparation.
*/
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
export async function prepareWorkspace(
testDir: string,
homeDir: string,
files: Record<string, string>,
) {
const acknowledgedAgents: Record<string, Record<string, string>> = {};
const projectRoot = fs.realpathSync(rig.testDir!);
const projectRoot = fs.realpathSync(testDir);
for (const [filePath, content] of Object.entries(files)) {
if (filePath.includes('..') || path.isAbsolute(filePath)) {
@@ -290,7 +299,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
if (Object.keys(acknowledgedAgents).length > 0) {
const ackPath = path.join(
rig.homeDir!,
homeDir,
'.gemini',
'acknowledgments',
'agents.json',
@@ -299,7 +308,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
}
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
const execOptions = { cwd: testDir, stdio: 'ignore' as const };
execSync('git init --initial-branch=main', execOptions);
execSync('git config user.email "test@example.com"', execOptions);
execSync('git config user.name "Test User"', execOptions);
@@ -366,15 +375,18 @@ interface ForbiddenToolSettings {
};
}
export interface EvalCase {
/**
 * Fields shared by every eval case flavor; extended by the rig-specific
 * case interfaces.
 */
export interface BaseEvalCase {
/** Eval name; passed to `runEval`, `prepareLogDir` and `withEvalRetries`. */
name: string;
/** Optional per-case timeout (presumably milliseconds — confirm against `runEval`). */
timeout?: number;
/** Optional initial workspace files, keyed by file path with the file content as value. */
files?: Record<string, string>;
}
export interface EvalCase extends BaseEvalCase {
params?: {
settings?: ForbiddenToolSettings & Record<string, unknown>;
[key: string]: unknown;
};
prompt: string;
timeout?: number;
files?: Record<string, string>;
setup?: (rig: TestRig) => Promise<void> | void;
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
messages?: Record<string, unknown>[];
+13
View File
@@ -0,0 +1,13 @@
{
"extends": "../tsconfig.json",
"compilerOptions": {
"jsx": "react-jsx",
"lib": ["DOM", "DOM.Iterable", "ES2023"],
"types": ["node", "vitest/globals"]
},
"include": [
"**/*.ts",
"**/*.tsx"
],
"exclude": ["node_modules", "logs"]
}
+1 -1
View File
@@ -24,7 +24,7 @@ export default defineConfig({
environment: 'node',
globals: true,
alias: {
react: path.resolve(__dirname, '../node_modules/react'),
'@google/gemini-cli-core': path.resolve(__dirname, '../packages/core/index.ts'),
},
setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
server: {
+2
View File
@@ -126,6 +126,8 @@ export * from './utils/cache.js';
export * from './utils/markdownUtils.js';
// Export services
export * from './services/agentHistoryProvider.js';
export * from './services/chatCompressionService.js';
export * from './services/fileDiscoveryService.js';
export * from './services/gitService.js';
export * from './services/FolderTrustDiscoveryService.js';