feat(evals): add reliability harvester and 500/503 retry support (#23626)

This commit is contained in:
Alisa
2026-03-25 18:48:45 -07:00
committed by GitHub
parent c1e4dbd157
commit 2e03e3aed5
6 changed files with 509 additions and 75 deletions

View File

@@ -334,8 +334,20 @@ jobs:
if: "${{ steps.check_evals.outputs.should_run == 'true' }}"
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
GEMINI_MODEL: 'gemini-3-pro-preview'
# Disable Vitest internal retries to avoid double-retrying;
# custom retry logic is handled in evals/test-helper.ts
VITEST_RETRY: 0
run: 'npm run test:always_passing_evals'
- name: 'Upload Reliability Logs'
if: "always() && steps.check_evals.outputs.should_run == 'true'"
uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
with:
name: 'eval-logs-${{ github.run_id }}-${{ github.run_attempt }}'
path: 'evals/logs/api-reliability.jsonl'
retention-days: 7
e2e:
name: 'E2E'
if: |

View File

@@ -61,6 +61,8 @@ jobs:
GEMINI_MODEL: '${{ matrix.model }}'
RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}"
TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
# Disable Vitest internal retries to avoid double-retrying;
# custom retry logic is handled in evals/test-helper.ts
VITEST_RETRY: 0
run: |
CMD="npm run test:all_evals"

207
evals/test-helper.test.ts Normal file
View File

@@ -0,0 +1,207 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { internalEvalTest } from './test-helper.js';
import { TestRig } from '@google/gemini-cli-test-utils';
// Mock TestRig to control API success/failure
vi.mock('@google/gemini-cli-test-utils', () => {
  // One inert stub rig per construction; every method is a spy so tests
  // can script run() outcomes without spawning a real CLI process.
  const buildRigStub = () => ({
    setup: vi.fn(),
    run: vi.fn(),
    cleanup: vi.fn(),
    readToolLogs: vi.fn().mockReturnValue([]),
    _lastRunStderr: '',
  });
  return { TestRig: vi.fn().mockImplementation(buildRigStub) };
});
describe('evalTest reliability logic', () => {
  // internalEvalTest appends one JSON line per API-retry event to this file
  // (see logReliabilityEvent in test-helper.ts). The tests delete it before
  // and after each run so line-count assertions start from a clean slate.
  const LOG_DIR = path.resolve(process.cwd(), 'evals/logs');
  const RELIABILITY_LOG = path.join(LOG_DIR, 'api-reliability.jsonl');
  beforeEach(() => {
    vi.clearAllMocks();
    if (fs.existsSync(RELIABILITY_LOG)) {
      fs.unlinkSync(RELIABILITY_LOG);
    }
  });
  afterEach(() => {
    if (fs.existsSync(RELIABILITY_LOG)) {
      fs.unlinkSync(RELIABILITY_LOG);
    }
  });
  it('should retry 3 times on 500 INTERNAL error and then SKIP', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate permanent 500 error: every rig.run() rejects with an
    // INTERNAL status, which the helper classifies as retryable.
    mockRig.run.mockRejectedValue(new Error('status: INTERNAL - API Down'));
    // Execute the test function directly
    await internalEvalTest({
      name: 'test-api-failure',
      prompt: 'do something',
      assert: async () => {},
    });
    // Verify retries: 1 initial + 3 retries = 4 setups/runs
    expect(mockRig.run).toHaveBeenCalledTimes(4);
    // Verify log content: one JSONL entry per failed attempt.
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    expect(logContent.length).toBe(4);
    const entries = logContent.map((line) => JSON.parse(line));
    // First attempts log RETRY; the final one logs SKIP and the eval
    // resolves without throwing, so a persistent outage doesn't fail CI.
    expect(entries[0].status).toBe('RETRY');
    expect(entries[0].attempt).toBe(0);
    expect(entries[3].status).toBe('SKIP');
    expect(entries[3].attempt).toBe(3);
    expect(entries[3].testName).toBe('test-api-failure');
  });
  it('should fail immediately on non-500 errors (like assertion failures)', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate a real logic error/bug: the API call succeeds but the
    // eval's own assert callback throws.
    mockRig.run.mockResolvedValue('Success');
    const assertError = new Error('Assertion failed: expected foo to be bar');
    // Expect the test function to throw immediately
    await expect(
      internalEvalTest({
        name: 'test-logic-failure',
        prompt: 'do something',
        assert: async () => {
          throw assertError;
        },
      }),
    ).rejects.toThrow('Assertion failed');
    // Verify NO retries: only 1 attempt
    expect(mockRig.run).toHaveBeenCalledTimes(1);
    // Verify NO reliability log was created (it's not an API error)
    expect(fs.existsSync(RELIABILITY_LOG)).toBe(false);
  });
  it('should recover if a retry succeeds', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Fail once, then succeed
    mockRig.run
      .mockRejectedValueOnce(new Error('status: INTERNAL'))
      .mockResolvedValueOnce('Success');
    await internalEvalTest({
      name: 'test-recovery',
      prompt: 'do something',
      assert: async () => {},
    });
    // Ran twice: initial (fail) + retry 1 (success)
    expect(mockRig.run).toHaveBeenCalledTimes(2);
    // Log should only have the one RETRY entry (successful attempts are
    // not logged).
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    expect(logContent.length).toBe(1);
    expect(JSON.parse(logContent[0]).status).toBe('RETRY');
  });
  it('should retry 3 times on 503 UNAVAILABLE error and then SKIP', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate permanent 503 error; same retry budget as the 500 case but
    // the log entries should carry errorCode '503'.
    mockRig.run.mockRejectedValue(
      new Error('status: UNAVAILABLE - Service Busy'),
    );
    await internalEvalTest({
      name: 'test-api-503',
      prompt: 'do something',
      assert: async () => {},
    });
    expect(mockRig.run).toHaveBeenCalledTimes(4);
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    const entries = logContent.map((line) => JSON.parse(line));
    expect(entries[0].errorCode).toBe('503');
    expect(entries[3].status).toBe('SKIP');
  });
  it('should throw if an absolute path is used in files', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Give the mocked rig a real scratch directory, since setup() is inert.
    mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
    if (!fs.existsSync(mockRig.testDir)) {
      fs.mkdirSync(mockRig.testDir, { recursive: true });
    }
    try {
      // Absolute paths in the files map must be rejected before anything
      // is written (error message comes from setupTestFiles).
      await expect(
        internalEvalTest({
          name: 'test-absolute-path',
          prompt: 'do something',
          files: {
            '/etc/passwd': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: /etc/passwd');
    } finally {
      if (fs.existsSync(mockRig.testDir)) {
        fs.rmSync(mockRig.testDir, { recursive: true, force: true });
      }
    }
  });
  it('should throw if directory traversal is detected in files', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
    // Create a mock test-dir
    if (!fs.existsSync(mockRig.testDir)) {
      fs.mkdirSync(mockRig.testDir, { recursive: true });
    }
    try {
      // '..' segments must be rejected so a test case cannot escape the
      // rig's test directory.
      await expect(
        internalEvalTest({
          name: 'test-traversal',
          prompt: 'do something',
          files: {
            '../sensitive.txt': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: ../sensitive.txt');
    } finally {
      if (fs.existsSync(mockRig.testDir)) {
        fs.rmSync(mockRig.testDir, { recursive: true, force: true });
      }
    }
  });
});

View File

@@ -39,87 +39,34 @@ export * from '@google/gemini-cli-test-utils';
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
const fn = async () => {
runEval(
policy,
evalCase.name,
() => internalEvalTest(evalCase),
evalCase.timeout,
);
}
export async function internalEvalTest(evalCase: EvalCase) {
const maxRetries = 3;
let attempt = 0;
while (attempt <= maxRetries) {
const rig = new TestRig();
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
const logFile = path.join(logDir, `${sanitizedName}.log`);
let isSuccess = false;
try {
rig.setup(evalCase.name, evalCase.params);
// Symlink node modules to reduce the amount of time needed to
// bootstrap test projects.
symlinkNodeModules(rig.testDir || '');
if (evalCase.files) {
const acknowledgedAgents: Record<string, Record<string, string>> = {};
const projectRoot = fs.realpathSync(rig.testDir!);
for (const [filePath, content] of Object.entries(evalCase.files)) {
const fullPath = path.join(rig.testDir!, filePath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content);
// If it's an agent file, calculate hash for acknowledgement
if (
filePath.startsWith('.gemini/agents/') &&
filePath.endsWith('.md')
) {
const hash = crypto
.createHash('sha256')
.update(content)
.digest('hex');
try {
const agentDefs = await parseAgentMarkdown(fullPath, content);
if (agentDefs.length > 0) {
const agentName = agentDefs[0].name;
if (!acknowledgedAgents[projectRoot]) {
acknowledgedAgents[projectRoot] = {};
}
acknowledgedAgents[projectRoot][agentName] = hash;
}
} catch (error) {
console.warn(
`Failed to parse agent for test acknowledgement: ${filePath}`,
error,
);
}
}
}
// Write acknowledged_agents.json to the home directory
if (Object.keys(acknowledgedAgents).length > 0) {
const ackPath = path.join(
rig.homeDir!,
'.gemini',
'acknowledgments',
'agents.json',
);
fs.mkdirSync(path.dirname(ackPath), { recursive: true });
fs.writeFileSync(
ackPath,
JSON.stringify(acknowledgedAgents, null, 2),
);
}
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
execSync('git init', execOptions);
execSync('git config user.email "test@example.com"', execOptions);
execSync('git config user.name "Test User"', execOptions);
// Temporarily disable the interactive editor and git pager
// to avoid hanging the tests. It seems the agent isn't
// consistently honoring the instructions to avoid interactive
// commands.
execSync('git config core.editor "true"', execOptions);
execSync('git config core.pager "cat"', execOptions);
execSync('git config commit.gpgsign false', execOptions);
execSync('git add .', execOptions);
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
await setupTestFiles(rig, evalCase.files);
}
symlinkNodeModules(rig.testDir || '');
// If messages are provided, write a session file so --resume can load it.
let sessionId: string | undefined;
if (evalCase.messages) {
@@ -188,6 +135,37 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
await evalCase.assert(rig, result);
isSuccess = true;
return; // Success! Exit the retry loop.
} catch (error: unknown) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const errorCode = getApiErrorCode(errorMessage);
if (errorCode) {
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
logReliabilityEvent(
evalCase.name,
attempt,
status,
errorCode,
errorMessage,
);
if (attempt < maxRetries) {
attempt++;
console.warn(
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
);
continue; // Retry
}
console.warn(
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
);
return; // Gracefully exit without failing the test
}
throw error; // Real failure
} finally {
if (isSuccess) {
await fs.promises.unlink(activityLogFile).catch((err) => {
@@ -206,9 +184,131 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
);
await rig.cleanup();
}
}
}
/**
 * Classify an error message as a retryable Gemini API failure.
 *
 * Returns '503' for service-unavailable signatures, '500' for internal
 * server errors, and undefined for anything else (e.g. assertion
 * failures), which callers treat as a genuine test failure.
 * 503 markers are checked first, so a message matching both yields '503'.
 */
function getApiErrorCode(message: string): '500' | '503' | undefined {
  const unavailableMarkers = [
    'status: UNAVAILABLE',
    'code: 503',
    'Service Unavailable',
  ];
  if (unavailableMarkers.some((marker) => message.includes(marker))) {
    return '503';
  }
  const internalMarkers = [
    'status: INTERNAL',
    'code: 500',
    'Internal error encountered',
  ];
  if (internalMarkers.some((marker) => message.includes(marker))) {
    return '500';
  }
  return undefined;
}
/**
 * Log reliability event for later harvesting.
 *
 * Appends one JSON line per event to evals/logs/api-reliability.jsonl so
 * the harvester script can aggregate 500/503 statistics across CI runs.
 *
 * Note: Uses synchronous file I/O to ensure the log is persisted even if the
 * test process is abruptly terminated by a timeout or CI crash. Performance
 * impact is negligible compared to long-running evaluation tests.
 *
 * @param testName Name of the eval test that hit the API error.
 * @param attempt Zero-based attempt number at which the error occurred.
 * @param status 'RETRY' when another attempt follows, 'SKIP' when giving up.
 * @param errorCode Classified HTTP-style error code ('500' or '503').
 * @param errorMessage Raw error message, kept for later debugging.
 */
function logReliabilityEvent(
  testName: string,
  attempt: number,
  status: 'RETRY' | 'SKIP',
  errorCode: '500' | '503',
  errorMessage: string,
) {
  const reliabilityLog = {
    timestamp: new Date().toISOString(),
    testName,
    model: process.env.GEMINI_MODEL || 'unknown',
    attempt,
    status,
    errorCode,
    error: errorMessage,
  };
  // Removed a stray `runEval(policy, ...)` statement that referenced
  // out-of-scope names and could never compile here.
  try {
    const relDir = path.resolve(process.cwd(), 'evals/logs');
    fs.mkdirSync(relDir, { recursive: true });
    fs.appendFileSync(
      path.join(relDir, 'api-reliability.jsonl'),
      JSON.stringify(reliabilityLog) + '\n',
    );
  } catch (logError) {
    // Logging must never fail the eval itself; report and move on.
    console.error('Failed to write reliability log:', logError);
  }
}
/**
 * Helper to setup test files and git repository.
 *
 * Writes the eval case's files into the rig's test directory, records
 * SHA-256 acknowledgement hashes for any agent markdown files, and
 * initializes a git repository with an initial commit.
 *
 * Note: While this is an async function (due to parseAgentMarkdown), it
 * intentionally uses synchronous filesystem and child_process operations
 * for simplicity and to ensure sequential environment preparation.
 *
 * @param rig Test rig whose testDir/homeDir receive the files.
 * @param files Map of relative file paths to file contents.
 * @throws Error if a file path is absolute, contains '..', or would
 *   resolve outside the test directory.
 */
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
  const acknowledgedAgents: Record<string, Record<string, string>> = {};
  const projectRoot = fs.realpathSync(rig.testDir!);
  for (const [filePath, content] of Object.entries(files)) {
    if (filePath.includes('..') || path.isAbsolute(filePath)) {
      throw new Error(`Invalid file path in test case: ${filePath}`);
    }
    const fullPath = path.join(projectRoot, filePath);
    // Defense-in-depth: path.relative catches prefix-collision escapes
    // (e.g. "/root/project-evil" vs root "/root/project") that a raw
    // startsWith(projectRoot) check would miss.
    const relativePath = path.relative(projectRoot, fullPath);
    if (relativePath.startsWith('..') || path.isAbsolute(relativePath)) {
      throw new Error(`Path traversal detected: ${filePath}`);
    }
    fs.mkdirSync(path.dirname(fullPath), { recursive: true });
    fs.writeFileSync(fullPath, content);
    // Agent definition files need an acknowledgement hash so the CLI
    // trusts them without an interactive prompt.
    if (filePath.startsWith('.gemini/agents/') && filePath.endsWith('.md')) {
      const hash = crypto.createHash('sha256').update(content).digest('hex');
      try {
        const agentDefs = await parseAgentMarkdown(fullPath, content);
        if (agentDefs.length > 0) {
          const agentName = agentDefs[0].name;
          if (!acknowledgedAgents[projectRoot]) {
            acknowledgedAgents[projectRoot] = {};
          }
          acknowledgedAgents[projectRoot][agentName] = hash;
        }
      } catch (error) {
        // Best-effort: an unparsable agent file should not abort setup.
        console.warn(
          `Failed to parse agent for test acknowledgement: ${filePath}`,
          error,
        );
      }
    }
  }
  // Persist the acknowledgements into the rig's home directory.
  if (Object.keys(acknowledgedAgents).length > 0) {
    const ackPath = path.join(
      rig.homeDir!,
      '.gemini',
      'acknowledgments',
      'agents.json',
    );
    fs.mkdirSync(path.dirname(ackPath), { recursive: true });
    fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
  }
  const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
  execSync('git init --initial-branch=main', execOptions);
  execSync('git config user.email "test@example.com"', execOptions);
  execSync('git config user.name "Test User"', execOptions);
  // Temporarily disable the interactive editor and git pager
  // to avoid hanging the tests. It seems the agent isn't
  // consistently honoring the instructions to avoid interactive
  // commands.
  execSync('git config core.editor "true"', execOptions);
  execSync('git config core.pager "cat"', execOptions);
  execSync('git config commit.gpgsign false', execOptions);
  execSync('git add .', execOptions);
  execSync('git commit --allow-empty -m "Initial commit"', execOptions);
}
/**

View File

@@ -16,10 +16,6 @@ export default defineConfig({
},
test: {
testTimeout: 300000, // 5 minutes
// Retry in CI but not nightly to avoid blocking on API error.
retry: process.env['VITEST_RETRY']
? parseInt(process.env['VITEST_RETRY'], 10)
: 3,
reporters: ['default', 'json'],
outputFile: {
json: 'evals/logs/report.json',

View File

@@ -0,0 +1,117 @@
#!/bin/bash
# Gemini API Reliability Harvester
# -------------------------------
# This script gathers data about 500 API errors encountered during evaluation runs
# (eval.yml) from GitHub Actions. It is used to analyze developer friction caused
# by transient API failures.
#
# Usage:
#   ./scripts/harvest_api_reliability.sh [SINCE] [LIMIT] [BRANCH]
#
# Examples:
#   ./scripts/harvest_api_reliability.sh                          # Last 7 days, all branches
#   ./scripts/harvest_api_reliability.sh 14d 500                  # Last 14 days, limit 500
#   ./scripts/harvest_api_reliability.sh 2026-03-01 100 my-branch # Specific date and branch
#
# Prerequisites:
#   - GitHub CLI (gh) installed and authenticated (`gh auth login`)
#   - jq installed

# Arguments & Defaults: accept an absolute date (YYYY-MM-DD), a relative
# "<N>d" window, or default to the last 7 days.
if [[ -n "$1" && $1 =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
  SINCE="$1"
elif [[ -n "$1" && $1 =~ ^([0-9]+)d$ ]]; then
  DAYS="${BASH_REMATCH[1]}"
  # BSD date (macOS) and GNU date use different relative-date flags.
  if [[ "$OSTYPE" == "darwin"* ]]; then
    SINCE=$(date -u -v-"${DAYS}"d +%Y-%m-%d)
  else
    SINCE=$(date -u -d "${DAYS} days ago" +%Y-%m-%d)
  fi
else
  # Default to 7 days ago in YYYY-MM-DD format (UTC)
  if [[ "$OSTYPE" == "darwin"* ]]; then
    SINCE=$(date -u -v-7d +%Y-%m-%d)
  else
    SINCE=$(date -u -d "7 days ago" +%Y-%m-%d)
  fi
fi
LIMIT=${2:-300}
BRANCH=${3:-""}
WORKFLOWS=("Testing: E2E (Chained)" "Evals: Nightly")
DEST_DIR=$(mktemp -d -t gemini-reliability-XXXXXX)
MERGED_FILE="api-reliability-summary.jsonl"

# Ensure cleanup on exit
trap 'rm -rf "$DEST_DIR"' EXIT

if ! command -v gh &> /dev/null; then
  echo "❌ Error: GitHub CLI (gh) is not installed."
  exit 1
fi
if ! command -v jq &> /dev/null; then
  echo "❌ Error: jq is not installed."
  exit 1
fi

# Clean start
rm -f "$MERGED_FILE"

# gh run list --created expects a date (YYYY-MM-DD) or a range
CREATED_QUERY=">=$SINCE"

for WORKFLOW in "${WORKFLOWS[@]}"; do
  echo "🔍 Fetching runs for '$WORKFLOW' created since $SINCE (max $LIMIT runs, branch: ${BRANCH:-all})..."
  # Construct arguments for gh run list
  GH_ARGS=("--workflow" "$WORKFLOW" "--created" "$CREATED_QUERY" "--limit" "$LIMIT" "--json" "databaseId" "--jq" ".[].databaseId")
  if [ -n "$BRANCH" ]; then
    GH_ARGS+=("--branch" "$BRANCH")
  fi
  RUN_IDS=$(gh run list "${GH_ARGS[@]}")
  exit_code=$?
  if [ $exit_code -ne 0 ]; then
    echo "❌ Failed to fetch runs for '$WORKFLOW' (exit code: $exit_code). Please check 'gh auth status' and permissions." >&2
    continue
  fi
  if [ -z "$RUN_IDS" ]; then
    echo "📭 No runs found for workflow '$WORKFLOW' since $SINCE."
    continue
  fi
  for ID in $RUN_IDS; do
    # Download artifacts named 'eval-logs-*'
    # Silencing output because many older runs won't have artifacts
    gh run download "$ID" -p "eval-logs-*" -D "$DEST_DIR/$ID" &>/dev/null || continue
    # Append to master log
    # Use find to locate api-reliability.jsonl in any subdirectory of $DEST_DIR/$ID
    find "$DEST_DIR/$ID" -type f -name "api-reliability.jsonl" -exec cat {} + >> "$MERGED_FILE" 2>/dev/null
  done
done

# The append redirection above creates $MERGED_FILE even when find matched
# nothing, so test for non-empty (-s) rather than mere existence (-f).
if [ ! -s "$MERGED_FILE" ]; then
  echo "📭 No reliability data found in the retrieved logs."
  exit 0
fi

echo -e "\n✅ Harvest Complete! Data merged into: $MERGED_FILE"
echo "------------------------------------------------"
echo "📊 Gemini API Reliability Summary (Since $SINCE)"
echo "------------------------------------------------"
# Slurp the JSONL into a single array and aggregate counts per model.
jq -s '
  group_by(.model) | map({
    model: .[0].model,
    "500s": (map(select(.errorCode == "500")) | length),
    "503s": (map(select(.errorCode == "503")) | length),
    retries: (map(select(.status == "RETRY")) | length),
    skips: (map(select(.status == "SKIP")) | length)
  })' "$MERGED_FILE"

echo -e "\n💡 Total events captured: $(wc -l < "$MERGED_FILE")"