feat(evals): add reliability harvester and 500/503 retry support (#23626)

This commit is contained in:
Alisa
2026-03-25 18:48:45 -07:00
committed by GitHub
parent c1e4dbd157
commit 2e03e3aed5
6 changed files with 509 additions and 75 deletions
+207
View File
@@ -0,0 +1,207 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { internalEvalTest } from './test-helper.js';
import { TestRig } from '@google/gemini-cli-test-utils';
// Replace the real TestRig with a stub whose methods each test programs
// individually (success, rejection, etc.) to drive the retry logic.
vi.mock('@google/gemini-cli-test-utils', () => {
  const makeStubRig = () => ({
    setup: vi.fn(),
    run: vi.fn(),
    cleanup: vi.fn(),
    readToolLogs: vi.fn().mockReturnValue([]),
    _lastRunStderr: '',
  });
  return { TestRig: vi.fn().mockImplementation(makeStubRig) };
});
describe('evalTest reliability logic', () => {
  // internalEvalTest appends one JSONL line per retry/skip event to this
  // file; the tests assert on its contents, so it must start absent.
  const LOG_DIR = path.resolve(process.cwd(), 'evals/logs');
  const RELIABILITY_LOG = path.join(LOG_DIR, 'api-reliability.jsonl');
  beforeEach(() => {
    vi.clearAllMocks();
    if (fs.existsSync(RELIABILITY_LOG)) {
      fs.unlinkSync(RELIABILITY_LOG);
    }
  });
  afterEach(() => {
    // Remove the log so a failing test does not leak entries into later runs.
    if (fs.existsSync(RELIABILITY_LOG)) {
      fs.unlinkSync(RELIABILITY_LOG);
    }
  });
  it('should retry 3 times on 500 INTERNAL error and then SKIP', async () => {
    const mockRig = new TestRig() as any;
    // internalEvalTest constructs its own TestRig; make that construction
    // return our stub so we can inspect call counts.
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate permanent 500 error
    mockRig.run.mockRejectedValue(new Error('status: INTERNAL - API Down'));
    // Execute the test function directly
    await internalEvalTest({
      name: 'test-api-failure',
      prompt: 'do something',
      assert: async () => {},
    });
    // Verify retries: 1 initial + 3 retries = 4 setups/runs
    expect(mockRig.run).toHaveBeenCalledTimes(4);
    // Verify log content
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    expect(logContent.length).toBe(4);
    const entries = logContent.map((line) => JSON.parse(line));
    // Attempts 0-2 are logged as RETRY; the exhausted attempt 3 as SKIP.
    expect(entries[0].status).toBe('RETRY');
    expect(entries[0].attempt).toBe(0);
    expect(entries[3].status).toBe('SKIP');
    expect(entries[3].attempt).toBe(3);
    expect(entries[3].testName).toBe('test-api-failure');
  });
  it('should fail immediately on non-500 errors (like assertion failures)', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate a real logic error/bug: the API call succeeds but the
    // test-case assertion throws.
    mockRig.run.mockResolvedValue('Success');
    const assertError = new Error('Assertion failed: expected foo to be bar');
    // Expect the test function to throw immediately (no retry masking).
    await expect(
      internalEvalTest({
        name: 'test-logic-failure',
        prompt: 'do something',
        assert: async () => {
          throw assertError;
        },
      }),
    ).rejects.toThrow('Assertion failed');
    // Verify NO retries: only 1 attempt
    expect(mockRig.run).toHaveBeenCalledTimes(1);
    // Verify NO reliability log was created (it's not an API error)
    expect(fs.existsSync(RELIABILITY_LOG)).toBe(false);
  });
  it('should recover if a retry succeeds', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Fail once, then succeed
    mockRig.run
      .mockRejectedValueOnce(new Error('status: INTERNAL'))
      .mockResolvedValueOnce('Success');
    await internalEvalTest({
      name: 'test-recovery',
      prompt: 'do something',
      assert: async () => {},
    });
    // Ran twice: initial (fail) + retry 1 (success)
    expect(mockRig.run).toHaveBeenCalledTimes(2);
    // Log should only have the one RETRY entry
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    expect(logContent.length).toBe(1);
    expect(JSON.parse(logContent[0]).status).toBe('RETRY');
  });
  it('should retry 3 times on 503 UNAVAILABLE error and then SKIP', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate permanent 503 error
    mockRig.run.mockRejectedValue(
      new Error('status: UNAVAILABLE - Service Busy'),
    );
    await internalEvalTest({
      name: 'test-api-503',
      prompt: 'do something',
      assert: async () => {},
    });
    // Same retry budget as 500s: 1 initial attempt + 3 retries.
    expect(mockRig.run).toHaveBeenCalledTimes(4);
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    const entries = logContent.map((line) => JSON.parse(line));
    // The error is classified as '503' and the final attempt is SKIP.
    expect(entries[0].errorCode).toBe('503');
    expect(entries[3].status).toBe('SKIP');
  });
  it('should throw if an absolute path is used in files', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Give the stub rig a real directory so path validation has a root.
    mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
    if (!fs.existsSync(mockRig.testDir)) {
      fs.mkdirSync(mockRig.testDir, { recursive: true });
    }
    try {
      await expect(
        internalEvalTest({
          name: 'test-absolute-path',
          prompt: 'do something',
          files: {
            '/etc/passwd': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: /etc/passwd');
    } finally {
      // Always remove the temp dir, even if the expectation fails.
      if (fs.existsSync(mockRig.testDir)) {
        fs.rmSync(mockRig.testDir, { recursive: true, force: true });
      }
    }
  });
  it('should throw if directory traversal is detected in files', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
    // Create a mock test-dir
    if (!fs.existsSync(mockRig.testDir)) {
      fs.mkdirSync(mockRig.testDir, { recursive: true });
    }
    try {
      await expect(
        internalEvalTest({
          name: 'test-traversal',
          prompt: 'do something',
          files: {
            '../sensitive.txt': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: ../sensitive.txt');
    } finally {
      // Always remove the temp dir, even if the expectation fails.
      if (fs.existsSync(mockRig.testDir)) {
        fs.rmSync(mockRig.testDir, { recursive: true, force: true });
      }
    }
  });
});
+171 -71
View File
@@ -39,87 +39,34 @@ export * from '@google/gemini-cli-test-utils';
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
const fn = async () => {
runEval(
policy,
evalCase.name,
() => internalEvalTest(evalCase),
evalCase.timeout,
);
}
export async function internalEvalTest(evalCase: EvalCase) {
const maxRetries = 3;
let attempt = 0;
while (attempt <= maxRetries) {
const rig = new TestRig();
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
const logFile = path.join(logDir, `${sanitizedName}.log`);
let isSuccess = false;
try {
rig.setup(evalCase.name, evalCase.params);
// Symlink node modules to reduce the amount of time needed to
// bootstrap test projects.
symlinkNodeModules(rig.testDir || '');
if (evalCase.files) {
const acknowledgedAgents: Record<string, Record<string, string>> = {};
const projectRoot = fs.realpathSync(rig.testDir!);
for (const [filePath, content] of Object.entries(evalCase.files)) {
const fullPath = path.join(rig.testDir!, filePath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content);
// If it's an agent file, calculate hash for acknowledgement
if (
filePath.startsWith('.gemini/agents/') &&
filePath.endsWith('.md')
) {
const hash = crypto
.createHash('sha256')
.update(content)
.digest('hex');
try {
const agentDefs = await parseAgentMarkdown(fullPath, content);
if (agentDefs.length > 0) {
const agentName = agentDefs[0].name;
if (!acknowledgedAgents[projectRoot]) {
acknowledgedAgents[projectRoot] = {};
}
acknowledgedAgents[projectRoot][agentName] = hash;
}
} catch (error) {
console.warn(
`Failed to parse agent for test acknowledgement: ${filePath}`,
error,
);
}
}
}
// Write acknowledged_agents.json to the home directory
if (Object.keys(acknowledgedAgents).length > 0) {
const ackPath = path.join(
rig.homeDir!,
'.gemini',
'acknowledgments',
'agents.json',
);
fs.mkdirSync(path.dirname(ackPath), { recursive: true });
fs.writeFileSync(
ackPath,
JSON.stringify(acknowledgedAgents, null, 2),
);
}
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
execSync('git init', execOptions);
execSync('git config user.email "test@example.com"', execOptions);
execSync('git config user.name "Test User"', execOptions);
// Temporarily disable the interactive editor and git pager
// to avoid hanging the tests. It seems the agent isn't
// consistently honoring the instructions to avoid interactive
// commands.
execSync('git config core.editor "true"', execOptions);
execSync('git config core.pager "cat"', execOptions);
execSync('git config commit.gpgsign false', execOptions);
execSync('git add .', execOptions);
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
await setupTestFiles(rig, evalCase.files);
}
symlinkNodeModules(rig.testDir || '');
// If messages are provided, write a session file so --resume can load it.
let sessionId: string | undefined;
if (evalCase.messages) {
@@ -188,6 +135,37 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
await evalCase.assert(rig, result);
isSuccess = true;
return; // Success! Exit the retry loop.
} catch (error: unknown) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const errorCode = getApiErrorCode(errorMessage);
if (errorCode) {
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
logReliabilityEvent(
evalCase.name,
attempt,
status,
errorCode,
errorMessage,
);
if (attempt < maxRetries) {
attempt++;
console.warn(
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
);
continue; // Retry
}
console.warn(
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
);
return; // Gracefully exit without failing the test
}
throw error; // Real failure
} finally {
if (isSuccess) {
await fs.promises.unlink(activityLogFile).catch((err) => {
@@ -206,9 +184,131 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
);
await rig.cleanup();
}
}
}
/**
 * Classify an error message as a transient API failure.
 *
 * Returns '503' or '500' when the message matches a known server-error
 * marker, or undefined for anything else (e.g. assertion failures), which
 * the caller treats as a real, non-retryable test failure.
 */
function getApiErrorCode(message: string): '500' | '503' | undefined {
  // Checked in order; '503' markers take precedence, matching the
  // original if-chain ordering.
  const markers: ReadonlyArray<['500' | '503', string[]]> = [
    ['503', ['status: UNAVAILABLE', 'code: 503', 'Service Unavailable']],
    ['500', ['status: INTERNAL', 'code: 500', 'Internal error encountered']],
  ];
  for (const [code, needles] of markers) {
    if (needles.some((needle) => message.includes(needle))) {
      return code;
    }
  }
  return undefined;
}
/**
 * Log reliability event for later harvesting.
 *
 * Appends one JSON object per line (JSONL) to evals/logs/api-reliability.jsonl
 * so the harvester can stream entries without parsing a whole document.
 *
 * Note: Uses synchronous file I/O to ensure the log is persisted even if the
 * test process is abruptly terminated by a timeout or CI crash. Performance
 * impact is negligible compared to long-running evaluation tests.
 *
 * Fix: removed a stray `runEval(policy, evalCase.name, fn, evalCase.timeout);`
 * statement that referenced identifiers not in this function's scope
 * (it belongs to evalTest) and would not compile here.
 */
function logReliabilityEvent(
  testName: string,
  attempt: number,
  status: 'RETRY' | 'SKIP',
  errorCode: '500' | '503',
  errorMessage: string,
) {
  const reliabilityLog = {
    timestamp: new Date().toISOString(),
    testName,
    // GEMINI_MODEL may be unset in local runs; record a placeholder.
    model: process.env.GEMINI_MODEL || 'unknown',
    attempt,
    status,
    errorCode,
    error: errorMessage,
  };
  try {
    const relDir = path.resolve(process.cwd(), 'evals/logs');
    fs.mkdirSync(relDir, { recursive: true });
    fs.appendFileSync(
      path.join(relDir, 'api-reliability.jsonl'),
      JSON.stringify(reliabilityLog) + '\n',
    );
  } catch (logError) {
    // Logging must never fail the eval itself; report and continue.
    console.error('Failed to write reliability log:', logError);
  }
}
/**
 * Helper to setup test files and git repository.
 *
 * Writes each entry of `files` into the rig's test directory (rejecting
 * absolute and traversal paths), records acknowledgement hashes for any
 * `.gemini/agents/*.md` agent definitions, then initializes a git repo
 * with a non-interactive configuration and an initial commit.
 *
 * Note: While this is an async function (due to parseAgentMarkdown), it
 * intentionally uses synchronous filesystem and child_process operations
 * for simplicity and to ensure sequential environment preparation.
 */
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
  // projectRoot -> { agentName -> sha256(content) }, written out below so
  // agent files are pre-acknowledged and the CLI does not prompt.
  const acknowledgedAgents: Record<string, Record<string, string>> = {};
  const projectRoot = fs.realpathSync(rig.testDir!);
  for (const [filePath, content] of Object.entries(files)) {
    // Reject traversal ('..') and absolute paths before touching the fs.
    if (filePath.includes('..') || path.isAbsolute(filePath)) {
      throw new Error(`Invalid file path in test case: ${filePath}`);
    }
    const fullPath = path.join(projectRoot, filePath);
    // Defense in depth: the resolved path must stay under the project root.
    if (!fullPath.startsWith(projectRoot)) {
      throw new Error(`Path traversal detected: ${filePath}`);
    }
    fs.mkdirSync(path.dirname(fullPath), { recursive: true });
    fs.writeFileSync(fullPath, content);
    // Agent definition files additionally get a content hash recorded.
    if (filePath.startsWith('.gemini/agents/') && filePath.endsWith('.md')) {
      const hash = crypto.createHash('sha256').update(content).digest('hex');
      try {
        const agentDefs = await parseAgentMarkdown(fullPath, content);
        if (agentDefs.length > 0) {
          const agentName = agentDefs[0].name;
          if (!acknowledgedAgents[projectRoot]) {
            acknowledgedAgents[projectRoot] = {};
          }
          acknowledgedAgents[projectRoot][agentName] = hash;
        }
      } catch (error) {
        // A malformed agent file should not abort the whole setup.
        console.warn(
          `Failed to parse agent for test acknowledgement: ${filePath}`,
          error,
        );
      }
    }
  }
  // Persist acknowledgements into the rig's home directory, if any were found.
  if (Object.keys(acknowledgedAgents).length > 0) {
    const ackPath = path.join(
      rig.homeDir!,
      '.gemini',
      'acknowledgments',
      'agents.json',
    );
    fs.mkdirSync(path.dirname(ackPath), { recursive: true });
    fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
  }
  const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
  execSync('git init --initial-branch=main', execOptions);
  execSync('git config user.email "test@example.com"', execOptions);
  execSync('git config user.name "Test User"', execOptions);
  // Temporarily disable the interactive editor and git pager
  // to avoid hanging the tests. It seems the agent isn't
  // consistently honoring the instructions to avoid interactive
  // commands.
  execSync('git config core.editor "true"', execOptions);
  execSync('git config core.pager "cat"', execOptions);
  execSync('git config commit.gpgsign false', execOptions);
  execSync('git add .', execOptions);
  execSync('git commit --allow-empty -m "Initial commit"', execOptions);
}
/**
-4
View File
@@ -16,10 +16,6 @@ export default defineConfig({
},
test: {
testTimeout: 300000, // 5 minutes
// Retry in CI but not nightly to avoid blocking on API error.
retry: process.env['VITEST_RETRY']
? parseInt(process.env['VITEST_RETRY'], 10)
: 3,
reporters: ['default', 'json'],
outputFile: {
json: 'evals/logs/report.json',