diff --git a/.github/workflows/chained_e2e.yml b/.github/workflows/chained_e2e.yml index 8d714b34b0..fe87fb1d5d 100644 --- a/.github/workflows/chained_e2e.yml +++ b/.github/workflows/chained_e2e.yml @@ -334,8 +334,20 @@ jobs: if: "${{ steps.check_evals.outputs.should_run == 'true' }}" env: GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' + GEMINI_MODEL: 'gemini-3-pro-preview' + # Disable Vitest internal retries to avoid double-retrying; + # custom retry logic is handled in evals/test-helper.ts + VITEST_RETRY: 0 run: 'npm run test:always_passing_evals' + - name: 'Upload Reliability Logs' + if: "always() && steps.check_evals.outputs.should_run == 'true'" + uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4 + with: + name: 'eval-logs-${{ github.run_id }}-${{ github.run_attempt }}' + path: 'evals/logs/api-reliability.jsonl' + retention-days: 7 + e2e: name: 'E2E' if: | diff --git a/.github/workflows/evals-nightly.yml b/.github/workflows/evals-nightly.yml index ee17a95121..9acc1de050 100644 --- a/.github/workflows/evals-nightly.yml +++ b/.github/workflows/evals-nightly.yml @@ -61,6 +61,8 @@ jobs: GEMINI_MODEL: '${{ matrix.model }}' RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}" TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}' + # Disable Vitest internal retries to avoid double-retrying; + # custom retry logic is handled in evals/test-helper.ts VITEST_RETRY: 0 run: | CMD="npm run test:all_evals" diff --git a/evals/test-helper.test.ts b/evals/test-helper.test.ts new file mode 100644 index 0000000000..c0147cda75 --- /dev/null +++ b/evals/test-helper.test.ts @@ -0,0 +1,207 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import fs from 'node:fs'; +import path from 'node:path'; +import { internalEvalTest } from './test-helper.js'; +import { TestRig } from 
// Replace the real TestRig with a stub so each test can script what `run()`
// does without touching the Gemini API.
vi.mock('@google/gemini-cli-test-utils', () => ({
  TestRig: vi.fn().mockImplementation(() => ({
    setup: vi.fn(),
    run: vi.fn(),
    cleanup: vi.fn(),
    readToolLogs: vi.fn().mockReturnValue([]),
    _lastRunStderr: '',
  })),
}));

describe('evalTest reliability logic', () => {
  const LOG_DIR = path.resolve(process.cwd(), 'evals/logs');
  const RELIABILITY_LOG = path.join(LOG_DIR, 'api-reliability.jsonl');

  /** Deletes the reliability log if one exists. */
  const removeReliabilityLog = () => {
    if (fs.existsSync(RELIABILITY_LOG)) {
      fs.unlinkSync(RELIABILITY_LOG);
    }
  };

  /** Returns the raw JSONL lines currently in the reliability log. */
  const readLogLines = (): string[] =>
    fs.readFileSync(RELIABILITY_LOG, 'utf-8').trim().split('\n');

  /**
   * Creates one stub rig and makes every later `new TestRig()` hand back that
   * same instance, so the test can inspect the calls made on it.
   */
  const installMockRig = () => {
    const stubRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(stubRig);
    return stubRig;
  };

  /**
   * Points the rig at a scratch directory under the current working directory
   * for the duration of `body`, then removes the directory again.
   */
  const withScratchTestDir = async (
    stubRig: any,
    body: () => Promise<void>,
  ) => {
    stubRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
    if (!fs.existsSync(stubRig.testDir)) {
      fs.mkdirSync(stubRig.testDir, { recursive: true });
    }
    try {
      await body();
    } finally {
      if (fs.existsSync(stubRig.testDir)) {
        fs.rmSync(stubRig.testDir, { recursive: true, force: true });
      }
    }
  };

  beforeEach(() => {
    vi.clearAllMocks();
    removeReliabilityLog();
  });

  afterEach(() => {
    removeReliabilityLog();
  });

  it('should retry 3 times on 500 INTERNAL error and then SKIP', async () => {
    const stubRig = installMockRig();

    // Every attempt fails with the same 500-class error.
    stubRig.run.mockRejectedValue(new Error('status: INTERNAL - API Down'));

    // Resolves rather than rejects: a persistent API error ends in a SKIP.
    await internalEvalTest({
      name: 'test-api-failure',
      prompt: 'do something',
      assert: async () => {},
    });

    // 1 initial attempt + 3 retries.
    expect(stubRig.run).toHaveBeenCalledTimes(4);

    // One log line per failed attempt.
    const lines = readLogLines();
    expect(lines.length).toBe(4);

    const entries = lines.map((line) => JSON.parse(line));
    const firstEntry = entries[0];
    const lastEntry = entries[3];
    expect(firstEntry.status).toBe('RETRY');
    expect(firstEntry.attempt).toBe(0);
    expect(lastEntry.status).toBe('SKIP');
    expect(lastEntry.attempt).toBe(3);
    expect(lastEntry.testName).toBe('test-api-failure');
  });

  it('should fail immediately on non-500 errors (like assertion failures)', async () => {
    const stubRig = installMockRig();

    // The run itself succeeds; the failure comes from the case's own assert.
    stubRig.run.mockResolvedValue('Success');
    const assertError = new Error('Assertion failed: expected foo to be bar');
    const failingAssert = async () => {
      throw assertError;
    };

    await expect(
      internalEvalTest({
        name: 'test-logic-failure',
        prompt: 'do something',
        assert: failingAssert,
      }),
    ).rejects.toThrow('Assertion failed');

    // A single attempt, no retries.
    expect(stubRig.run).toHaveBeenCalledTimes(1);

    // Not an API error, so nothing may have been logged.
    expect(fs.existsSync(RELIABILITY_LOG)).toBe(false);
  });

  it('should recover if a retry succeeds', async () => {
    const stubRig = installMockRig();

    // First attempt fails with a 500-class error, the second one succeeds.
    stubRig.run.mockRejectedValueOnce(new Error('status: INTERNAL'));
    stubRig.run.mockResolvedValueOnce('Success');

    await internalEvalTest({
      name: 'test-recovery',
      prompt: 'do something',
      assert: async () => {},
    });

    // Initial attempt (failed) + one retry (succeeded).
    expect(stubRig.run).toHaveBeenCalledTimes(2);

    // Only the failed attempt is logged, as a RETRY.
    const lines = readLogLines();
    expect(lines.length).toBe(1);
    expect(JSON.parse(lines[0]).status).toBe('RETRY');
  });

  it('should retry 3 times on 503 UNAVAILABLE error and then SKIP', async () => {
    const stubRig = installMockRig();

    // Every attempt fails with the same 503-class error.
    stubRig.run.mockRejectedValue(
      new Error('status: UNAVAILABLE - Service Busy'),
    );

    await internalEvalTest({
      name: 'test-api-503',
      prompt: 'do something',
      assert: async () => {},
    });

    expect(stubRig.run).toHaveBeenCalledTimes(4);

    const entries = readLogLines().map((line) => JSON.parse(line));
    expect(entries[0].errorCode).toBe('503');
    expect(entries[3].status).toBe('SKIP');
  });

  it('should throw if an absolute path is used in files', async () => {
    const stubRig = installMockRig();

    await withScratchTestDir(stubRig, async () => {
      await expect(
        internalEvalTest({
          name: 'test-absolute-path',
          prompt: 'do something',
          files: {
            '/etc/passwd': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: /etc/passwd');
    });
  });

  it('should throw if directory traversal is detected in files', async () => {
    const stubRig = installMockRig();

    await withScratchTestDir(stubRig, async () => {
      await expect(
        internalEvalTest({
          name: 'test-traversal',
          prompt: 'do something',
          files: {
            '../sensitive.txt': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: ../sensitive.txt');
    });
  });
});
/**
 * Registers one eval case with the runner.
 *
 * The case body is not executed here: it is wrapped in a thunk that calls
 * `internalEvalTest` and handed to `runEval` together with the policy, the
 * case name and the case's optional timeout.
 *
 * NOTE(review): `internalEvalTest` may make several attempts inside that one
 * thunk; whether they all share the single `evalCase.timeout` budget depends
 * on how `runEval` applies the timeout — confirm.
 */
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
  const { name, timeout } = evalCase;
  const runCase = () => internalEvalTest(evalCase);
  runEval(policy, name, runCase, timeout);
}
0; + + while (attempt <= maxRetries) { const rig = new TestRig(); const { logDir, sanitizedName } = await prepareLogDir(evalCase.name); const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`); const logFile = path.join(logDir, `${sanitizedName}.log`); let isSuccess = false; + try { rig.setup(evalCase.name, evalCase.params); - // Symlink node modules to reduce the amount of time needed to - // bootstrap test projects. - symlinkNodeModules(rig.testDir || ''); - if (evalCase.files) { - const acknowledgedAgents: Record> = {}; - const projectRoot = fs.realpathSync(rig.testDir!); - - for (const [filePath, content] of Object.entries(evalCase.files)) { - const fullPath = path.join(rig.testDir!, filePath); - fs.mkdirSync(path.dirname(fullPath), { recursive: true }); - fs.writeFileSync(fullPath, content); - - // If it's an agent file, calculate hash for acknowledgement - if ( - filePath.startsWith('.gemini/agents/') && - filePath.endsWith('.md') - ) { - const hash = crypto - .createHash('sha256') - .update(content) - .digest('hex'); - - try { - const agentDefs = await parseAgentMarkdown(fullPath, content); - if (agentDefs.length > 0) { - const agentName = agentDefs[0].name; - if (!acknowledgedAgents[projectRoot]) { - acknowledgedAgents[projectRoot] = {}; - } - acknowledgedAgents[projectRoot][agentName] = hash; - } - } catch (error) { - console.warn( - `Failed to parse agent for test acknowledgement: ${filePath}`, - error, - ); - } - } - } - - // Write acknowledged_agents.json to the home directory - if (Object.keys(acknowledgedAgents).length > 0) { - const ackPath = path.join( - rig.homeDir!, - '.gemini', - 'acknowledgments', - 'agents.json', - ); - fs.mkdirSync(path.dirname(ackPath), { recursive: true }); - fs.writeFileSync( - ackPath, - JSON.stringify(acknowledgedAgents, null, 2), - ); - } - - const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const }; - execSync('git init', execOptions); - execSync('git config user.email "test@example.com"', 
execOptions); - execSync('git config user.name "Test User"', execOptions); - - // Temporarily disable the interactive editor and git pager - // to avoid hanging the tests. It seems the the agent isn't - // consistently honoring the instructions to avoid interactive - // commands. - execSync('git config core.editor "true"', execOptions); - execSync('git config core.pager "cat"', execOptions); - execSync('git config commit.gpgsign false', execOptions); - execSync('git add .', execOptions); - execSync('git commit --allow-empty -m "Initial commit"', execOptions); + await setupTestFiles(rig, evalCase.files); } + symlinkNodeModules(rig.testDir || ''); + // If messages are provided, write a session file so --resume can load it. let sessionId: string | undefined; if (evalCase.messages) { @@ -188,6 +135,37 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { await evalCase.assert(rig, result); isSuccess = true; + return; // Success! Exit the retry loop. + } catch (error: unknown) { + const errorMessage = + error instanceof Error ? error.message : String(error); + const errorCode = getApiErrorCode(errorMessage); + + if (errorCode) { + const status = attempt < maxRetries ? 'RETRY' : 'SKIP'; + logReliabilityEvent( + evalCase.name, + attempt, + status, + errorCode, + errorMessage, + ); + + if (attempt < maxRetries) { + attempt++; + console.warn( + `[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`, + ); + continue; // Retry + } + + console.warn( + `[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. 
/**
 * Classifies an error message as a 500- or 503-class API failure.
 *
 * The match is a plain, case-sensitive substring search. The 503 markers are
 * tried first, so a message containing markers of both kinds is reported as
 * '503'.
 *
 * @param message Error text to inspect.
 * @returns '503' or '500' when a known marker is present, otherwise
 *   `undefined`.
 */
function getApiErrorCode(message: string): '500' | '503' | undefined {
  const unavailableMarkers = [
    'status: UNAVAILABLE',
    'code: 503',
    'Service Unavailable',
  ];
  const internalMarkers = [
    'status: INTERNAL',
    'code: 500',
    'Internal error encountered',
  ];
  const mentions = (marker: string) => message.includes(marker);

  if (unavailableMarkers.some(mentions)) {
    return '503';
  }
  return internalMarkers.some(mentions) ? '500' : undefined;
}

/**
 * Appends one reliability record, as a single JSON line, to
 * `evals/logs/api-reliability.jsonl` under the current working directory.
 *
 * The write is synchronous on purpose, so the entry is on disk before the
 * function returns even if the test process is terminated shortly afterwards
 * (timeout, CI crash). A failure to write is reported on stderr and otherwise
 * ignored, so logging never changes the outcome of the eval itself.
 *
 * @param testName Name of the eval case that hit the API error.
 * @param attempt Zero-based index of the attempt that failed.
 * @param status 'RETRY' when another attempt follows, 'SKIP' when giving up.
 * @param errorCode Error class as returned by the message classifier.
 * @param errorMessage Full text of the error that was caught.
 */
function logReliabilityEvent(
  testName: string,
  attempt: number,
  status: 'RETRY' | 'SKIP',
  errorCode: '500' | '503',
  errorMessage: string,
) {
  const timestamp = new Date().toISOString();
  const model = process.env.GEMINI_MODEL || 'unknown';
  const entry = {
    timestamp,
    testName,
    model,
    attempt,
    status,
    errorCode,
    error: errorMessage,
  };

  try {
    const logDir = path.resolve(process.cwd(), 'evals/logs');
    fs.mkdirSync(logDir, { recursive: true });
    const logPath = path.join(logDir, 'api-reliability.jsonl');
    fs.appendFileSync(logPath, JSON.stringify(entry) + '\n');
  } catch (writeError) {
    console.error('Failed to write reliability log:', writeError);
  }
}
/**
 * Resolves a fixture path from an eval case against the project root and
 * rejects anything that would land outside of it.
 *
 * Only whole `..` path segments count as traversal, so a name such as
 * `notes..md` is accepted. Containment is decided with `path.relative`
 * rather than a string-prefix test, because a prefix test would treat
 * `/tmp/proj-other` as being inside `/tmp/proj`.
 *
 * @throws Error when the path is absolute, contains a `..` segment, or
 *   resolves outside `projectRoot`.
 */
function resolveFixturePath(projectRoot: string, filePath: string): string {
  const segments = filePath.split(/[\\/]+/);
  if (path.isAbsolute(filePath) || segments.includes('..')) {
    throw new Error(`Invalid file path in test case: ${filePath}`);
  }

  const fullPath = path.join(projectRoot, filePath);
  const relative = path.relative(projectRoot, fullPath);
  const escapesRoot =
    relative === '..' ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);
  if (escapesRoot) {
    throw new Error(`Path traversal detected: ${filePath}`);
  }
  return fullPath;
}

/**
 * Writes an eval case's fixture files into the rig's test directory and turns
 * that directory into a git repository with one initial commit.
 *
 * For every fixture under `.gemini/agents/` ending in `.md`, the SHA-256 of
 * its content is recorded under the agent's parsed name. If at least one
 * agent was recorded, the map is written to
 * `<homeDir>/.gemini/acknowledgments/agents.json`. A fixture that fails to
 * parse as an agent is still written; only a warning is logged.
 *
 * The function is async solely because of `parseAgentMarkdown`; filesystem
 * and git steps are synchronous so the environment is prepared strictly in
 * order.
 *
 * @param rig Test rig providing `testDir` and, when agents are present,
 *   `homeDir`.
 * @param files Map of project-relative path to file content.
 * @throws Error when `rig.testDir` is unset, when a fixture path is absolute
 *   or escapes the test directory, or when agents were found but
 *   `rig.homeDir` is unset. Filesystem and git failures propagate unchanged.
 */
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
  const testDir = rig.testDir;
  if (!testDir) {
    throw new Error('Cannot set up test files: rig.testDir is not set');
  }

  const acknowledgedAgents: Record<string, Record<string, string>> = {};
  // Resolve symlinks once so containment checks and the acknowledgement key
  // both use the canonical location of the project.
  const projectRoot = fs.realpathSync(testDir);

  for (const [filePath, content] of Object.entries(files)) {
    const fullPath = resolveFixturePath(projectRoot, filePath);

    fs.mkdirSync(path.dirname(fullPath), { recursive: true });
    fs.writeFileSync(fullPath, content);

    if (filePath.startsWith('.gemini/agents/') && filePath.endsWith('.md')) {
      const hash = crypto.createHash('sha256').update(content).digest('hex');
      try {
        const agentDefs = await parseAgentMarkdown(fullPath, content);
        if (agentDefs.length > 0) {
          // Only the first definition in the file is acknowledged.
          const agentName = agentDefs[0].name;
          if (!acknowledgedAgents[projectRoot]) {
            acknowledgedAgents[projectRoot] = {};
          }
          acknowledgedAgents[projectRoot][agentName] = hash;
        }
      } catch (error) {
        console.warn(
          `Failed to parse agent for test acknowledgement: ${filePath}`,
          error,
        );
      }
    }
  }

  if (Object.keys(acknowledgedAgents).length > 0) {
    if (!rig.homeDir) {
      throw new Error(
        'Cannot write agent acknowledgements: rig.homeDir is not set',
      );
    }
    const ackPath = path.join(
      rig.homeDir,
      '.gemini',
      'acknowledgments',
      'agents.json',
    );
    fs.mkdirSync(path.dirname(ackPath), { recursive: true });
    fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
  }

  const execOptions = { cwd: testDir, stdio: 'inherit' as const };
  execSync('git init --initial-branch=main', execOptions);
  execSync('git config user.email "test@example.com"', execOptions);
  execSync('git config user.name "Test User"', execOptions);

  // Temporarily disable the interactive editor and git pager to avoid
  // hanging the tests. It seems the agent isn't consistently honoring the
  // instructions to avoid interactive commands.
  execSync('git config core.editor "true"', execOptions);
  execSync('git config core.pager "cat"', execOptions);
  execSync('git config commit.gpgsign false', execOptions);
  execSync('git add .', execOptions);
  execSync('git commit --allow-empty -m "Initial commit"', execOptions);
}
#!/bin/bash

# Gemini API Reliability Harvester
# --------------------------------
# Downloads the 'eval-logs-*' artifacts from recent GitHub Actions runs of the
# workflows listed in WORKFLOWS, merges every api-reliability.jsonl found in
# them into one file, and prints a per-model summary of the 500 and 503 API
# errors (and the resulting retries/skips) recorded during evaluation runs.
# It is used to analyze developer friction caused by transient API failures.
#
# Usage:
#   ./scripts/harvest_api_reliability.sh [SINCE] [LIMIT] [BRANCH]
#
#   SINCE   YYYY-MM-DD, or '<N>d' for N days ago (default: 7d)
#   LIMIT   maximum number of runs fetched per workflow (default: 300)
#   BRANCH  restrict to runs on this branch (default: all branches)
#
# Examples:
#   ./scripts/harvest_api_reliability.sh                           # Last 7 days, all branches
#   ./scripts/harvest_api_reliability.sh 14d 500                   # Last 14 days, limit 500
#   ./scripts/harvest_api_reliability.sh 2026-03-01 100 my-branch  # Specific date and branch
#
# Output:
#   api-reliability-summary.jsonl in the current directory (overwritten on
#   every invocation).
#
# Prerequisites:
#   - GitHub CLI (gh) installed and authenticated (`gh auth login`)
#   - jq installed

# Prints the UTC date N days ago as YYYY-MM-DD.
# BSD date (macOS) and GNU date spell relative dates differently.
days_ago() {
  local days="$1"
  if [[ "$OSTYPE" == "darwin"* ]]; then
    date -u -v-"${days}"d +%Y-%m-%d
  else
    date -u -d "${days} days ago" +%Y-%m-%d
  fi
}

# Arguments & Defaults
SINCE_ARG="${1:-}"
if [[ "$SINCE_ARG" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
  SINCE="$SINCE_ARG"
elif [[ "$SINCE_ARG" =~ ^([0-9]+)d$ ]]; then
  SINCE=$(days_ago "${BASH_REMATCH[1]}")
else
  if [[ -n "$SINCE_ARG" ]]; then
    # Keep the historical fallback, but do not apply it silently.
    echo "⚠️  Unrecognized SINCE value '$SINCE_ARG' (expected YYYY-MM-DD or <N>d); using the last 7 days." >&2
  fi
  SINCE=$(days_ago 7)
fi

LIMIT=${2:-300}
BRANCH=${3:-""}
WORKFLOWS=("Testing: E2E (Chained)" "Evals: Nightly")
MERGED_FILE="api-reliability-summary.jsonl"

if ! [[ "$LIMIT" =~ ^[0-9]+$ ]]; then
  echo "❌ Error: LIMIT must be a non-negative integer (got '$LIMIT')." >&2
  exit 1
fi

if ! command -v gh &> /dev/null; then
  echo "❌ Error: GitHub CLI (gh) is not installed." >&2
  exit 1
fi

if ! command -v jq &> /dev/null; then
  echo "❌ Error: jq is not installed." >&2
  exit 1
fi

DEST_DIR=$(mktemp -d -t gemini-reliability-XXXXXX)

# Ensure cleanup on exit
trap 'rm -rf "$DEST_DIR"' EXIT

# Clean start
rm -f "$MERGED_FILE"

# gh run list --created expects a date (YYYY-MM-DD) or a range
CREATED_QUERY=">=$SINCE"

for WORKFLOW in "${WORKFLOWS[@]}"; do
  echo "🔍 Fetching runs for '$WORKFLOW' created since $SINCE (max $LIMIT runs, branch: ${BRANCH:-all})..."

  # Construct arguments for gh run list
  GH_ARGS=("--workflow" "$WORKFLOW" "--created" "$CREATED_QUERY" "--limit" "$LIMIT" "--json" "databaseId" "--jq" ".[].databaseId")
  if [ -n "$BRANCH" ]; then
    GH_ARGS+=("--branch" "$BRANCH")
  fi

  RUN_IDS=$(gh run list "${GH_ARGS[@]}")
  exit_code=$?

  if [ $exit_code -ne 0 ]; then
    echo "❌ Failed to fetch runs for '$WORKFLOW' (exit code: $exit_code). Please check 'gh auth status' and permissions." >&2
    continue
  fi

  if [ -z "$RUN_IDS" ]; then
    echo "📭 No runs found for workflow '$WORKFLOW' since $SINCE."
    continue
  fi

  # RUN_IDS is a newline-separated list of numeric ids; word splitting is intended.
  for ID in $RUN_IDS; do
    # Download artifacts named 'eval-logs-*'
    # Silencing output because many older runs won't have artifacts
    gh run download "$ID" -p "eval-logs-*" -D "$DEST_DIR/$ID" &>/dev/null || continue

    # Append to master log
    # Use find to locate api-reliability.jsonl in any subdirectory of $DEST_DIR/$ID
    find "$DEST_DIR/$ID" -type f -name "api-reliability.jsonl" -exec cat {} + >> "$MERGED_FILE" 2>/dev/null
  done
done

# The '>>' redirection above creates the file even when find matches nothing,
# so test for a non-empty file rather than mere existence.
if [ ! -s "$MERGED_FILE" ]; then
  rm -f "$MERGED_FILE"
  echo "📭 No reliability data found in the retrieved logs."
  exit 0
fi

echo -e "\n✅ Harvest Complete! Data merged into: $MERGED_FILE"
echo "------------------------------------------------"
echo "📊 Gemini API Reliability Summary (Since $SINCE)"
echo "------------------------------------------------"

jq -s '
  group_by(.model) | map({
    model: .[0].model,
    "500s": (map(select(.errorCode == "500")) | length),
    "503s": (map(select(.errorCode == "503")) | length),
    retries: (map(select(.status == "RETRY")) | length),
    skips: (map(select(.status == "SKIP")) | length)
  })' "$MERGED_FILE"

# BSD wc pads its output with spaces; strip them for a tidy message.
echo -e "\n💡 Total events captured: $(wc -l < "$MERGED_FILE" | tr -d ' ')"