mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-21 02:24:09 -07:00
feat(evals): add reliability harvester and 500/503 retry support (#23626)
This commit is contained in:
@@ -0,0 +1,207 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { internalEvalTest } from './test-helper.js';
|
||||
import { TestRig } from '@google/gemini-cli-test-utils';
|
||||
|
||||
// Mock TestRig to control API success/failure
|
||||
vi.mock('@google/gemini-cli-test-utils', () => {
|
||||
return {
|
||||
TestRig: vi.fn().mockImplementation(() => ({
|
||||
setup: vi.fn(),
|
||||
run: vi.fn(),
|
||||
cleanup: vi.fn(),
|
||||
readToolLogs: vi.fn().mockReturnValue([]),
|
||||
_lastRunStderr: '',
|
||||
})),
|
||||
};
|
||||
});
|
||||
|
||||
describe('evalTest reliability logic', () => {
|
||||
const LOG_DIR = path.resolve(process.cwd(), 'evals/logs');
|
||||
const RELIABILITY_LOG = path.join(LOG_DIR, 'api-reliability.jsonl');
|
||||
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
if (fs.existsSync(RELIABILITY_LOG)) {
|
||||
fs.unlinkSync(RELIABILITY_LOG);
|
||||
}
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
if (fs.existsSync(RELIABILITY_LOG)) {
|
||||
fs.unlinkSync(RELIABILITY_LOG);
|
||||
}
|
||||
});
|
||||
|
||||
it('should retry 3 times on 500 INTERNAL error and then SKIP', async () => {
|
||||
const mockRig = new TestRig() as any;
|
||||
(TestRig as any).mockReturnValue(mockRig);
|
||||
|
||||
// Simulate permanent 500 error
|
||||
mockRig.run.mockRejectedValue(new Error('status: INTERNAL - API Down'));
|
||||
|
||||
// Execute the test function directly
|
||||
await internalEvalTest({
|
||||
name: 'test-api-failure',
|
||||
prompt: 'do something',
|
||||
assert: async () => {},
|
||||
});
|
||||
|
||||
// Verify retries: 1 initial + 3 retries = 4 setups/runs
|
||||
expect(mockRig.run).toHaveBeenCalledTimes(4);
|
||||
|
||||
// Verify log content
|
||||
const logContent = fs
|
||||
.readFileSync(RELIABILITY_LOG, 'utf-8')
|
||||
.trim()
|
||||
.split('\n');
|
||||
expect(logContent.length).toBe(4);
|
||||
|
||||
const entries = logContent.map((line) => JSON.parse(line));
|
||||
expect(entries[0].status).toBe('RETRY');
|
||||
expect(entries[0].attempt).toBe(0);
|
||||
expect(entries[3].status).toBe('SKIP');
|
||||
expect(entries[3].attempt).toBe(3);
|
||||
expect(entries[3].testName).toBe('test-api-failure');
|
||||
});
|
||||
|
||||
it('should fail immediately on non-500 errors (like assertion failures)', async () => {
|
||||
const mockRig = new TestRig() as any;
|
||||
(TestRig as any).mockReturnValue(mockRig);
|
||||
|
||||
// Simulate a real logic error/bug
|
||||
mockRig.run.mockResolvedValue('Success');
|
||||
const assertError = new Error('Assertion failed: expected foo to be bar');
|
||||
|
||||
// Expect the test function to throw immediately
|
||||
await expect(
|
||||
internalEvalTest({
|
||||
name: 'test-logic-failure',
|
||||
prompt: 'do something',
|
||||
assert: async () => {
|
||||
throw assertError;
|
||||
},
|
||||
}),
|
||||
).rejects.toThrow('Assertion failed');
|
||||
|
||||
// Verify NO retries: only 1 attempt
|
||||
expect(mockRig.run).toHaveBeenCalledTimes(1);
|
||||
|
||||
// Verify NO reliability log was created (it's not an API error)
|
||||
expect(fs.existsSync(RELIABILITY_LOG)).toBe(false);
|
||||
});
|
||||
|
||||
it('should recover if a retry succeeds', async () => {
|
||||
const mockRig = new TestRig() as any;
|
||||
(TestRig as any).mockReturnValue(mockRig);
|
||||
|
||||
// Fail once, then succeed
|
||||
mockRig.run
|
||||
.mockRejectedValueOnce(new Error('status: INTERNAL'))
|
||||
.mockResolvedValueOnce('Success');
|
||||
|
||||
await internalEvalTest({
|
||||
name: 'test-recovery',
|
||||
prompt: 'do something',
|
||||
assert: async () => {},
|
||||
});
|
||||
|
||||
// Ran twice: initial (fail) + retry 1 (success)
|
||||
expect(mockRig.run).toHaveBeenCalledTimes(2);
|
||||
|
||||
// Log should only have the one RETRY entry
|
||||
const logContent = fs
|
||||
.readFileSync(RELIABILITY_LOG, 'utf-8')
|
||||
.trim()
|
||||
.split('\n');
|
||||
expect(logContent.length).toBe(1);
|
||||
expect(JSON.parse(logContent[0]).status).toBe('RETRY');
|
||||
});
|
||||
|
||||
it('should retry 3 times on 503 UNAVAILABLE error and then SKIP', async () => {
|
||||
const mockRig = new TestRig() as any;
|
||||
(TestRig as any).mockReturnValue(mockRig);
|
||||
|
||||
// Simulate permanent 503 error
|
||||
mockRig.run.mockRejectedValue(
|
||||
new Error('status: UNAVAILABLE - Service Busy'),
|
||||
);
|
||||
|
||||
await internalEvalTest({
|
||||
name: 'test-api-503',
|
||||
prompt: 'do something',
|
||||
assert: async () => {},
|
||||
});
|
||||
|
||||
expect(mockRig.run).toHaveBeenCalledTimes(4);
|
||||
|
||||
const logContent = fs
|
||||
.readFileSync(RELIABILITY_LOG, 'utf-8')
|
||||
.trim()
|
||||
.split('\n');
|
||||
const entries = logContent.map((line) => JSON.parse(line));
|
||||
expect(entries[0].errorCode).toBe('503');
|
||||
expect(entries[3].status).toBe('SKIP');
|
||||
});
|
||||
|
||||
it('should throw if an absolute path is used in files', async () => {
|
||||
const mockRig = new TestRig() as any;
|
||||
(TestRig as any).mockReturnValue(mockRig);
|
||||
mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
|
||||
if (!fs.existsSync(mockRig.testDir)) {
|
||||
fs.mkdirSync(mockRig.testDir, { recursive: true });
|
||||
}
|
||||
|
||||
try {
|
||||
await expect(
|
||||
internalEvalTest({
|
||||
name: 'test-absolute-path',
|
||||
prompt: 'do something',
|
||||
files: {
|
||||
'/etc/passwd': 'hacked',
|
||||
},
|
||||
assert: async () => {},
|
||||
}),
|
||||
).rejects.toThrow('Invalid file path in test case: /etc/passwd');
|
||||
} finally {
|
||||
if (fs.existsSync(mockRig.testDir)) {
|
||||
fs.rmSync(mockRig.testDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it('should throw if directory traversal is detected in files', async () => {
|
||||
const mockRig = new TestRig() as any;
|
||||
(TestRig as any).mockReturnValue(mockRig);
|
||||
mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
|
||||
|
||||
// Create a mock test-dir
|
||||
if (!fs.existsSync(mockRig.testDir)) {
|
||||
fs.mkdirSync(mockRig.testDir, { recursive: true });
|
||||
}
|
||||
|
||||
try {
|
||||
await expect(
|
||||
internalEvalTest({
|
||||
name: 'test-traversal',
|
||||
prompt: 'do something',
|
||||
files: {
|
||||
'../sensitive.txt': 'hacked',
|
||||
},
|
||||
assert: async () => {},
|
||||
}),
|
||||
).rejects.toThrow('Invalid file path in test case: ../sensitive.txt');
|
||||
} finally {
|
||||
if (fs.existsSync(mockRig.testDir)) {
|
||||
fs.rmSync(mockRig.testDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
+171
-71
@@ -39,87 +39,34 @@ export * from '@google/gemini-cli-test-utils';
|
||||
// How strictly a case's outcome is enforced by the eval runner.
// NOTE(review): semantics inferred from the names only — presumably
// 'ALWAYS_PASSES' cases hard-fail on error while 'USUALLY_PASSES' tolerates
// flakiness; confirm against runEval's handling of the policy argument.
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
|
||||
|
||||
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
const fn = async () => {
|
||||
runEval(
|
||||
policy,
|
||||
evalCase.name,
|
||||
() => internalEvalTest(evalCase),
|
||||
evalCase.timeout,
|
||||
);
|
||||
}
|
||||
|
||||
export async function internalEvalTest(evalCase: EvalCase) {
|
||||
const maxRetries = 3;
|
||||
let attempt = 0;
|
||||
|
||||
while (attempt <= maxRetries) {
|
||||
const rig = new TestRig();
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
|
||||
const logFile = path.join(logDir, `${sanitizedName}.log`);
|
||||
let isSuccess = false;
|
||||
|
||||
try {
|
||||
rig.setup(evalCase.name, evalCase.params);
|
||||
|
||||
// Symlink node modules to reduce the amount of time needed to
|
||||
// bootstrap test projects.
|
||||
symlinkNodeModules(rig.testDir || '');
|
||||
|
||||
if (evalCase.files) {
|
||||
const acknowledgedAgents: Record<string, Record<string, string>> = {};
|
||||
const projectRoot = fs.realpathSync(rig.testDir!);
|
||||
|
||||
for (const [filePath, content] of Object.entries(evalCase.files)) {
|
||||
const fullPath = path.join(rig.testDir!, filePath);
|
||||
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
|
||||
fs.writeFileSync(fullPath, content);
|
||||
|
||||
// If it's an agent file, calculate hash for acknowledgement
|
||||
if (
|
||||
filePath.startsWith('.gemini/agents/') &&
|
||||
filePath.endsWith('.md')
|
||||
) {
|
||||
const hash = crypto
|
||||
.createHash('sha256')
|
||||
.update(content)
|
||||
.digest('hex');
|
||||
|
||||
try {
|
||||
const agentDefs = await parseAgentMarkdown(fullPath, content);
|
||||
if (agentDefs.length > 0) {
|
||||
const agentName = agentDefs[0].name;
|
||||
if (!acknowledgedAgents[projectRoot]) {
|
||||
acknowledgedAgents[projectRoot] = {};
|
||||
}
|
||||
acknowledgedAgents[projectRoot][agentName] = hash;
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(
|
||||
`Failed to parse agent for test acknowledgement: ${filePath}`,
|
||||
error,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write acknowledged_agents.json to the home directory
|
||||
if (Object.keys(acknowledgedAgents).length > 0) {
|
||||
const ackPath = path.join(
|
||||
rig.homeDir!,
|
||||
'.gemini',
|
||||
'acknowledgments',
|
||||
'agents.json',
|
||||
);
|
||||
fs.mkdirSync(path.dirname(ackPath), { recursive: true });
|
||||
fs.writeFileSync(
|
||||
ackPath,
|
||||
JSON.stringify(acknowledgedAgents, null, 2),
|
||||
);
|
||||
}
|
||||
|
||||
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
|
||||
execSync('git init', execOptions);
|
||||
execSync('git config user.email "test@example.com"', execOptions);
|
||||
execSync('git config user.name "Test User"', execOptions);
|
||||
|
||||
// Temporarily disable the interactive editor and git pager
|
||||
// to avoid hanging the tests. It seems the agent isn't
|
||||
// consistently honoring the instructions to avoid interactive
|
||||
// commands.
|
||||
execSync('git config core.editor "true"', execOptions);
|
||||
execSync('git config core.pager "cat"', execOptions);
|
||||
execSync('git config commit.gpgsign false', execOptions);
|
||||
execSync('git add .', execOptions);
|
||||
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
|
||||
await setupTestFiles(rig, evalCase.files);
|
||||
}
|
||||
|
||||
symlinkNodeModules(rig.testDir || '');
|
||||
|
||||
// If messages are provided, write a session file so --resume can load it.
|
||||
let sessionId: string | undefined;
|
||||
if (evalCase.messages) {
|
||||
@@ -188,6 +135,37 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
|
||||
await evalCase.assert(rig, result);
|
||||
isSuccess = true;
|
||||
return; // Success! Exit the retry loop.
|
||||
} catch (error: unknown) {
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
const errorCode = getApiErrorCode(errorMessage);
|
||||
|
||||
if (errorCode) {
|
||||
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
|
||||
logReliabilityEvent(
|
||||
evalCase.name,
|
||||
attempt,
|
||||
status,
|
||||
errorCode,
|
||||
errorMessage,
|
||||
);
|
||||
|
||||
if (attempt < maxRetries) {
|
||||
attempt++;
|
||||
console.warn(
|
||||
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
|
||||
);
|
||||
continue; // Retry
|
||||
}
|
||||
|
||||
console.warn(
|
||||
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
|
||||
);
|
||||
return; // Gracefully exit without failing the test
|
||||
}
|
||||
|
||||
throw error; // Real failure
|
||||
} finally {
|
||||
if (isSuccess) {
|
||||
await fs.promises.unlink(activityLogFile).catch((err) => {
|
||||
@@ -206,9 +184,131 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
);
|
||||
await rig.cleanup();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function getApiErrorCode(message: string): '500' | '503' | undefined {
|
||||
if (
|
||||
message.includes('status: UNAVAILABLE') ||
|
||||
message.includes('code: 503') ||
|
||||
message.includes('Service Unavailable')
|
||||
) {
|
||||
return '503';
|
||||
}
|
||||
if (
|
||||
message.includes('status: INTERNAL') ||
|
||||
message.includes('code: 500') ||
|
||||
message.includes('Internal error encountered')
|
||||
) {
|
||||
return '500';
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Log reliability event for later harvesting.
|
||||
*
|
||||
* Note: Uses synchronous file I/O to ensure the log is persisted even if the
|
||||
* test process is abruptly terminated by a timeout or CI crash. Performance
|
||||
* impact is negligible compared to long-running evaluation tests.
|
||||
*/
|
||||
function logReliabilityEvent(
|
||||
testName: string,
|
||||
attempt: number,
|
||||
status: 'RETRY' | 'SKIP',
|
||||
errorCode: '500' | '503',
|
||||
errorMessage: string,
|
||||
) {
|
||||
const reliabilityLog = {
|
||||
timestamp: new Date().toISOString(),
|
||||
testName,
|
||||
model: process.env.GEMINI_MODEL || 'unknown',
|
||||
attempt,
|
||||
status,
|
||||
errorCode,
|
||||
error: errorMessage,
|
||||
};
|
||||
|
||||
runEval(policy, evalCase.name, fn, evalCase.timeout);
|
||||
try {
|
||||
const relDir = path.resolve(process.cwd(), 'evals/logs');
|
||||
fs.mkdirSync(relDir, { recursive: true });
|
||||
fs.appendFileSync(
|
||||
path.join(relDir, 'api-reliability.jsonl'),
|
||||
JSON.stringify(reliabilityLog) + '\n',
|
||||
);
|
||||
} catch (logError) {
|
||||
console.error('Failed to write reliability log:', logError);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to setup test files and git repository.
|
||||
*
|
||||
* Note: While this is an async function (due to parseAgentMarkdown), it
|
||||
* intentionally uses synchronous filesystem and child_process operations
|
||||
* for simplicity and to ensure sequential environment preparation.
|
||||
*/
|
||||
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
|
||||
const acknowledgedAgents: Record<string, Record<string, string>> = {};
|
||||
const projectRoot = fs.realpathSync(rig.testDir!);
|
||||
|
||||
for (const [filePath, content] of Object.entries(files)) {
|
||||
if (filePath.includes('..') || path.isAbsolute(filePath)) {
|
||||
throw new Error(`Invalid file path in test case: ${filePath}`);
|
||||
}
|
||||
const fullPath = path.join(projectRoot, filePath);
|
||||
if (!fullPath.startsWith(projectRoot)) {
|
||||
throw new Error(`Path traversal detected: ${filePath}`);
|
||||
}
|
||||
|
||||
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
|
||||
fs.writeFileSync(fullPath, content);
|
||||
|
||||
if (filePath.startsWith('.gemini/agents/') && filePath.endsWith('.md')) {
|
||||
const hash = crypto.createHash('sha256').update(content).digest('hex');
|
||||
try {
|
||||
const agentDefs = await parseAgentMarkdown(fullPath, content);
|
||||
if (agentDefs.length > 0) {
|
||||
const agentName = agentDefs[0].name;
|
||||
if (!acknowledgedAgents[projectRoot]) {
|
||||
acknowledgedAgents[projectRoot] = {};
|
||||
}
|
||||
acknowledgedAgents[projectRoot][agentName] = hash;
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(
|
||||
`Failed to parse agent for test acknowledgement: ${filePath}`,
|
||||
error,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Object.keys(acknowledgedAgents).length > 0) {
|
||||
const ackPath = path.join(
|
||||
rig.homeDir!,
|
||||
'.gemini',
|
||||
'acknowledgments',
|
||||
'agents.json',
|
||||
);
|
||||
fs.mkdirSync(path.dirname(ackPath), { recursive: true });
|
||||
fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
|
||||
}
|
||||
|
||||
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
|
||||
execSync('git init --initial-branch=main', execOptions);
|
||||
execSync('git config user.email "test@example.com"', execOptions);
|
||||
execSync('git config user.name "Test User"', execOptions);
|
||||
|
||||
// Temporarily disable the interactive editor and git pager
|
||||
// to avoid hanging the tests. It seems the the agent isn't
|
||||
// consistently honoring the instructions to avoid interactive
|
||||
// commands.
|
||||
execSync('git config core.editor "true"', execOptions);
|
||||
execSync('git config core.pager "cat"', execOptions);
|
||||
execSync('git config commit.gpgsign false', execOptions);
|
||||
execSync('git add .', execOptions);
|
||||
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -16,10 +16,6 @@ export default defineConfig({
|
||||
},
|
||||
test: {
|
||||
testTimeout: 300000, // 5 minutes
|
||||
// Retry in CI but not nightly to avoid blocking on API error.
|
||||
retry: process.env['VITEST_RETRY']
|
||||
? parseInt(process.env['VITEST_RETRY'], 10)
|
||||
: 3,
|
||||
reporters: ['default', 'json'],
|
||||
outputFile: {
|
||||
json: 'evals/logs/report.json',
|
||||
|
||||
Reference in New Issue
Block a user