feat(evals): add reliability harvester and 500/503 retry support (#23626)

This commit is contained in:
Alisa
2026-03-25 18:48:45 -07:00
committed by GitHub
parent c1e4dbd157
commit 2e03e3aed5
6 changed files with 509 additions and 75 deletions

View File

@@ -334,8 +334,20 @@ jobs:
if: "${{ steps.check_evals.outputs.should_run == 'true' }}"
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
GEMINI_MODEL: 'gemini-3-pro-preview'
# Disable Vitest internal retries to avoid double-retrying;
# custom retry logic is handled in evals/test-helper.ts
VITEST_RETRY: 0
run: 'npm run test:always_passing_evals'
- name: 'Upload Reliability Logs'
if: "always() && steps.check_evals.outputs.should_run == 'true'"
uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
with:
name: 'eval-logs-${{ github.run_id }}-${{ github.run_attempt }}'
path: 'evals/logs/api-reliability.jsonl'
retention-days: 7
e2e:
name: 'E2E'
if: |

View File

@@ -61,6 +61,8 @@ jobs:
GEMINI_MODEL: '${{ matrix.model }}'
RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}"
TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
# Disable Vitest internal retries to avoid double-retrying;
# custom retry logic is handled in evals/test-helper.ts
VITEST_RETRY: 0
run: |
CMD="npm run test:all_evals"

207
evals/test-helper.test.ts Normal file
View File

@@ -0,0 +1,207 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { internalEvalTest } from './test-helper.js';
import { TestRig } from '@google/gemini-cli-test-utils';
// Mock TestRig to control API success/failure
vi.mock('@google/gemini-cli-test-utils', () => {
  // One inert stub rig per construction; every method is a spy so tests
  // can script run() outcomes without spawning a real CLI process.
  const buildRigStub = () => ({
    setup: vi.fn(),
    run: vi.fn(),
    cleanup: vi.fn(),
    readToolLogs: vi.fn().mockReturnValue([]),
    _lastRunStderr: '',
  });
  return { TestRig: vi.fn().mockImplementation(buildRigStub) };
});
describe('evalTest reliability logic', () => {
  // internalEvalTest appends one JSON line per API-retry event to this file
  // (see logReliabilityEvent in test-helper.ts). The tests delete it before
  // and after each run so line-count assertions start from a clean slate.
  const LOG_DIR = path.resolve(process.cwd(), 'evals/logs');
  const RELIABILITY_LOG = path.join(LOG_DIR, 'api-reliability.jsonl');
  beforeEach(() => {
    vi.clearAllMocks();
    if (fs.existsSync(RELIABILITY_LOG)) {
      fs.unlinkSync(RELIABILITY_LOG);
    }
  });
  afterEach(() => {
    if (fs.existsSync(RELIABILITY_LOG)) {
      fs.unlinkSync(RELIABILITY_LOG);
    }
  });
  it('should retry 3 times on 500 INTERNAL error and then SKIP', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate permanent 500 error: every rig.run() rejects with an
    // INTERNAL status, which the helper classifies as retryable.
    mockRig.run.mockRejectedValue(new Error('status: INTERNAL - API Down'));
    // Execute the test function directly
    await internalEvalTest({
      name: 'test-api-failure',
      prompt: 'do something',
      assert: async () => {},
    });
    // Verify retries: 1 initial + 3 retries = 4 setups/runs
    expect(mockRig.run).toHaveBeenCalledTimes(4);
    // Verify log content: one JSONL entry per failed attempt.
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    expect(logContent.length).toBe(4);
    const entries = logContent.map((line) => JSON.parse(line));
    // First attempts log RETRY; the final one logs SKIP and the eval
    // resolves without throwing, so a persistent outage doesn't fail CI.
    expect(entries[0].status).toBe('RETRY');
    expect(entries[0].attempt).toBe(0);
    expect(entries[3].status).toBe('SKIP');
    expect(entries[3].attempt).toBe(3);
    expect(entries[3].testName).toBe('test-api-failure');
  });
  it('should fail immediately on non-500 errors (like assertion failures)', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate a real logic error/bug: the API call succeeds but the
    // eval's own assert callback throws.
    mockRig.run.mockResolvedValue('Success');
    const assertError = new Error('Assertion failed: expected foo to be bar');
    // Expect the test function to throw immediately
    await expect(
      internalEvalTest({
        name: 'test-logic-failure',
        prompt: 'do something',
        assert: async () => {
          throw assertError;
        },
      }),
    ).rejects.toThrow('Assertion failed');
    // Verify NO retries: only 1 attempt
    expect(mockRig.run).toHaveBeenCalledTimes(1);
    // Verify NO reliability log was created (it's not an API error)
    expect(fs.existsSync(RELIABILITY_LOG)).toBe(false);
  });
  it('should recover if a retry succeeds', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Fail once, then succeed
    mockRig.run
      .mockRejectedValueOnce(new Error('status: INTERNAL'))
      .mockResolvedValueOnce('Success');
    await internalEvalTest({
      name: 'test-recovery',
      prompt: 'do something',
      assert: async () => {},
    });
    // Ran twice: initial (fail) + retry 1 (success)
    expect(mockRig.run).toHaveBeenCalledTimes(2);
    // Log should only have the one RETRY entry (successful attempts are
    // not logged).
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    expect(logContent.length).toBe(1);
    expect(JSON.parse(logContent[0]).status).toBe('RETRY');
  });
  it('should retry 3 times on 503 UNAVAILABLE error and then SKIP', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate permanent 503 error; same retry budget as the 500 case but
    // the log entries should carry errorCode '503'.
    mockRig.run.mockRejectedValue(
      new Error('status: UNAVAILABLE - Service Busy'),
    );
    await internalEvalTest({
      name: 'test-api-503',
      prompt: 'do something',
      assert: async () => {},
    });
    expect(mockRig.run).toHaveBeenCalledTimes(4);
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    const entries = logContent.map((line) => JSON.parse(line));
    expect(entries[0].errorCode).toBe('503');
    expect(entries[3].status).toBe('SKIP');
  });
  it('should throw if an absolute path is used in files', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Give the mocked rig a real scratch directory, since setup() is inert.
    mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
    if (!fs.existsSync(mockRig.testDir)) {
      fs.mkdirSync(mockRig.testDir, { recursive: true });
    }
    try {
      // Absolute paths in the files map must be rejected before anything
      // is written (error message comes from setupTestFiles).
      await expect(
        internalEvalTest({
          name: 'test-absolute-path',
          prompt: 'do something',
          files: {
            '/etc/passwd': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: /etc/passwd');
    } finally {
      if (fs.existsSync(mockRig.testDir)) {
        fs.rmSync(mockRig.testDir, { recursive: true, force: true });
      }
    }
  });
  it('should throw if directory traversal is detected in files', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
    // Create a mock test-dir
    if (!fs.existsSync(mockRig.testDir)) {
      fs.mkdirSync(mockRig.testDir, { recursive: true });
    }
    try {
      // '..' segments must be rejected so a test case cannot escape the
      // rig's test directory.
      await expect(
        internalEvalTest({
          name: 'test-traversal',
          prompt: 'do something',
          files: {
            '../sensitive.txt': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: ../sensitive.txt');
    } finally {
      if (fs.existsSync(mockRig.testDir)) {
        fs.rmSync(mockRig.testDir, { recursive: true, force: true });
      }
    }
  });
});

View File

@@ -39,87 +39,34 @@ export * from '@google/gemini-cli-test-utils';
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
const fn = async () => {
runEval(
policy,
evalCase.name,
() => internalEvalTest(evalCase),
evalCase.timeout,
);
}
export async function internalEvalTest(evalCase: EvalCase) {
const maxRetries = 3;
let attempt = 0;
while (attempt <= maxRetries) {
const rig = new TestRig();
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
const logFile = path.join(logDir, `${sanitizedName}.log`);
let isSuccess = false;
try {
rig.setup(evalCase.name, evalCase.params);
// Symlink node modules to reduce the amount of time needed to
// bootstrap test projects.
symlinkNodeModules(rig.testDir || '');
if (evalCase.files) {
const acknowledgedAgents: Record<string, Record<string, string>> = {};
const projectRoot = fs.realpathSync(rig.testDir!);
for (const [filePath, content] of Object.entries(evalCase.files)) {
const fullPath = path.join(rig.testDir!, filePath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content);
// If it's an agent file, calculate hash for acknowledgement
if (
filePath.startsWith('.gemini/agents/') &&
filePath.endsWith('.md')
) {
const hash = crypto
.createHash('sha256')
.update(content)
.digest('hex');
try {
const agentDefs = await parseAgentMarkdown(fullPath, content);
if (agentDefs.length > 0) {
const agentName = agentDefs[0].name;
if (!acknowledgedAgents[projectRoot]) {
acknowledgedAgents[projectRoot] = {};
}
acknowledgedAgents[projectRoot][agentName] = hash;
}
} catch (error) {
console.warn(
`Failed to parse agent for test acknowledgement: ${filePath}`,
error,
);
}
}
}
// Write acknowledged_agents.json to the home directory
if (Object.keys(acknowledgedAgents).length > 0) {
const ackPath = path.join(
rig.homeDir!,
'.gemini',
'acknowledgments',
'agents.json',
);
fs.mkdirSync(path.dirname(ackPath), { recursive: true });
fs.writeFileSync(
ackPath,
JSON.stringify(acknowledgedAgents, null, 2),
);
}
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
execSync('git init', execOptions);
execSync('git config user.email "test@example.com"', execOptions);
execSync('git config user.name "Test User"', execOptions);
// Temporarily disable the interactive editor and git pager
// to avoid hanging the tests. It seems the agent isn't
// consistently honoring the instructions to avoid interactive
// commands.
execSync('git config core.editor "true"', execOptions);
execSync('git config core.pager "cat"', execOptions);
execSync('git config commit.gpgsign false', execOptions);
execSync('git add .', execOptions);
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
await setupTestFiles(rig, evalCase.files);
}
symlinkNodeModules(rig.testDir || '');
// If messages are provided, write a session file so --resume can load it.
let sessionId: string | undefined;
if (evalCase.messages) {
@@ -188,6 +135,37 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
await evalCase.assert(rig, result);
isSuccess = true;
return; // Success! Exit the retry loop.
} catch (error: unknown) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const errorCode = getApiErrorCode(errorMessage);
if (errorCode) {
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
logReliabilityEvent(
evalCase.name,
attempt,
status,
errorCode,
errorMessage,
);
if (attempt < maxRetries) {
attempt++;
console.warn(
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
);
continue; // Retry
}
console.warn(
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
);
return; // Gracefully exit without failing the test
}
throw error; // Real failure
} finally {
if (isSuccess) {
await fs.promises.unlink(activityLogFile).catch((err) => {
@@ -206,9 +184,131 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
);
await rig.cleanup();
}
}
}
/**
 * Classify an error message as a retryable Gemini API failure.
 *
 * Returns '503' for service-unavailable signatures, '500' for internal
 * server errors, and undefined for anything else (e.g. assertion
 * failures), which callers treat as a genuine test failure.
 * 503 markers are checked first, so a message matching both yields '503'.
 */
function getApiErrorCode(message: string): '500' | '503' | undefined {
  const unavailableMarkers = [
    'status: UNAVAILABLE',
    'code: 503',
    'Service Unavailable',
  ];
  if (unavailableMarkers.some((marker) => message.includes(marker))) {
    return '503';
  }
  const internalMarkers = [
    'status: INTERNAL',
    'code: 500',
    'Internal error encountered',
  ];
  if (internalMarkers.some((marker) => message.includes(marker))) {
    return '500';
  }
  return undefined;
}
/**
 * Log reliability event for later harvesting.
 *
 * Appends one JSON line per event to evals/logs/api-reliability.jsonl so
 * the harvester script can aggregate 500/503 statistics across CI runs.
 *
 * Note: Uses synchronous file I/O to ensure the log is persisted even if the
 * test process is abruptly terminated by a timeout or CI crash. Performance
 * impact is negligible compared to long-running evaluation tests.
 *
 * @param testName Name of the eval test that hit the API error.
 * @param attempt Zero-based attempt number at which the error occurred.
 * @param status 'RETRY' when another attempt follows, 'SKIP' when giving up.
 * @param errorCode Classified HTTP-style error code ('500' or '503').
 * @param errorMessage Raw error message, kept for later debugging.
 */
function logReliabilityEvent(
  testName: string,
  attempt: number,
  status: 'RETRY' | 'SKIP',
  errorCode: '500' | '503',
  errorMessage: string,
) {
  const reliabilityLog = {
    timestamp: new Date().toISOString(),
    testName,
    model: process.env.GEMINI_MODEL || 'unknown',
    attempt,
    status,
    errorCode,
    error: errorMessage,
  };
  // Removed a stray `runEval(policy, ...)` statement that referenced
  // out-of-scope names and could never compile here.
  try {
    const relDir = path.resolve(process.cwd(), 'evals/logs');
    fs.mkdirSync(relDir, { recursive: true });
    fs.appendFileSync(
      path.join(relDir, 'api-reliability.jsonl'),
      JSON.stringify(reliabilityLog) + '\n',
    );
  } catch (logError) {
    // Logging must never fail the eval itself; report and move on.
    console.error('Failed to write reliability log:', logError);
  }
}
/**
 * Helper to setup test files and git repository.
 *
 * Writes the eval case's files into the rig's test directory, records
 * SHA-256 acknowledgement hashes for any agent markdown files, and
 * initializes a git repository with an initial commit.
 *
 * Note: While this is an async function (due to parseAgentMarkdown), it
 * intentionally uses synchronous filesystem and child_process operations
 * for simplicity and to ensure sequential environment preparation.
 *
 * @param rig Test rig whose testDir/homeDir receive the files.
 * @param files Map of relative file paths to file contents.
 * @throws Error if a file path is absolute, contains '..', or would
 *   resolve outside the test directory.
 */
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
  const acknowledgedAgents: Record<string, Record<string, string>> = {};
  const projectRoot = fs.realpathSync(rig.testDir!);
  for (const [filePath, content] of Object.entries(files)) {
    if (filePath.includes('..') || path.isAbsolute(filePath)) {
      throw new Error(`Invalid file path in test case: ${filePath}`);
    }
    const fullPath = path.join(projectRoot, filePath);
    // Defense-in-depth: path.relative catches prefix-collision escapes
    // (e.g. "/root/project-evil" vs root "/root/project") that a raw
    // startsWith(projectRoot) check would miss.
    const relativePath = path.relative(projectRoot, fullPath);
    if (relativePath.startsWith('..') || path.isAbsolute(relativePath)) {
      throw new Error(`Path traversal detected: ${filePath}`);
    }
    fs.mkdirSync(path.dirname(fullPath), { recursive: true });
    fs.writeFileSync(fullPath, content);
    // Agent definition files need an acknowledgement hash so the CLI
    // trusts them without an interactive prompt.
    if (filePath.startsWith('.gemini/agents/') && filePath.endsWith('.md')) {
      const hash = crypto.createHash('sha256').update(content).digest('hex');
      try {
        const agentDefs = await parseAgentMarkdown(fullPath, content);
        if (agentDefs.length > 0) {
          const agentName = agentDefs[0].name;
          if (!acknowledgedAgents[projectRoot]) {
            acknowledgedAgents[projectRoot] = {};
          }
          acknowledgedAgents[projectRoot][agentName] = hash;
        }
      } catch (error) {
        // Best-effort: an unparsable agent file should not abort setup.
        console.warn(
          `Failed to parse agent for test acknowledgement: ${filePath}`,
          error,
        );
      }
    }
  }
  // Persist the acknowledgements into the rig's home directory.
  if (Object.keys(acknowledgedAgents).length > 0) {
    const ackPath = path.join(
      rig.homeDir!,
      '.gemini',
      'acknowledgments',
      'agents.json',
    );
    fs.mkdirSync(path.dirname(ackPath), { recursive: true });
    fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
  }
  const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
  execSync('git init --initial-branch=main', execOptions);
  execSync('git config user.email "test@example.com"', execOptions);
  execSync('git config user.name "Test User"', execOptions);
  // Temporarily disable the interactive editor and git pager
  // to avoid hanging the tests. It seems the agent isn't
  // consistently honoring the instructions to avoid interactive
  // commands.
  execSync('git config core.editor "true"', execOptions);
  execSync('git config core.pager "cat"', execOptions);
  execSync('git config commit.gpgsign false', execOptions);
  execSync('git add .', execOptions);
  execSync('git commit --allow-empty -m "Initial commit"', execOptions);
}
/**

View File

@@ -16,10 +16,6 @@ export default defineConfig({
},
test: {
testTimeout: 300000, // 5 minutes
// Retry in CI but not nightly to avoid blocking on API error.
retry: process.env['VITEST_RETRY']
? parseInt(process.env['VITEST_RETRY'], 10)
: 3,
reporters: ['default', 'json'],
outputFile: {
json: 'evals/logs/report.json',

View File

@@ -0,0 +1,117 @@
#!/bin/bash
# Gemini API Reliability Harvester
# -------------------------------
# This script gathers data about 500 API errors encountered during evaluation runs
# (eval.yml) from GitHub Actions. It is used to analyze developer friction caused
# by transient API failures.
#
# Usage:
#   ./scripts/harvest_api_reliability.sh [SINCE] [LIMIT] [BRANCH]
#
# Examples:
#   ./scripts/harvest_api_reliability.sh                          # Last 7 days, all branches
#   ./scripts/harvest_api_reliability.sh 14d 500                  # Last 14 days, limit 500
#   ./scripts/harvest_api_reliability.sh 2026-03-01 100 my-branch # Specific date and branch
#
# Prerequisites:
#   - GitHub CLI (gh) installed and authenticated (`gh auth login`)
#   - jq installed

# Arguments & Defaults: accept an absolute date (YYYY-MM-DD), a relative
# "<N>d" window, or default to the last 7 days.
if [[ -n "$1" && $1 =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
  SINCE="$1"
elif [[ -n "$1" && $1 =~ ^([0-9]+)d$ ]]; then
  DAYS="${BASH_REMATCH[1]}"
  # BSD date (macOS) and GNU date use different relative-date flags.
  if [[ "$OSTYPE" == "darwin"* ]]; then
    SINCE=$(date -u -v-"${DAYS}"d +%Y-%m-%d)
  else
    SINCE=$(date -u -d "${DAYS} days ago" +%Y-%m-%d)
  fi
else
  # Default to 7 days ago in YYYY-MM-DD format (UTC)
  if [[ "$OSTYPE" == "darwin"* ]]; then
    SINCE=$(date -u -v-7d +%Y-%m-%d)
  else
    SINCE=$(date -u -d "7 days ago" +%Y-%m-%d)
  fi
fi
LIMIT=${2:-300}
BRANCH=${3:-""}
WORKFLOWS=("Testing: E2E (Chained)" "Evals: Nightly")
DEST_DIR=$(mktemp -d -t gemini-reliability-XXXXXX)
MERGED_FILE="api-reliability-summary.jsonl"

# Ensure cleanup on exit
trap 'rm -rf "$DEST_DIR"' EXIT

if ! command -v gh &> /dev/null; then
  echo "❌ Error: GitHub CLI (gh) is not installed."
  exit 1
fi
if ! command -v jq &> /dev/null; then
  echo "❌ Error: jq is not installed."
  exit 1
fi

# Clean start
rm -f "$MERGED_FILE"

# gh run list --created expects a date (YYYY-MM-DD) or a range
CREATED_QUERY=">=$SINCE"

for WORKFLOW in "${WORKFLOWS[@]}"; do
  echo "🔍 Fetching runs for '$WORKFLOW' created since $SINCE (max $LIMIT runs, branch: ${BRANCH:-all})..."
  # Construct arguments for gh run list
  GH_ARGS=("--workflow" "$WORKFLOW" "--created" "$CREATED_QUERY" "--limit" "$LIMIT" "--json" "databaseId" "--jq" ".[].databaseId")
  if [ -n "$BRANCH" ]; then
    GH_ARGS+=("--branch" "$BRANCH")
  fi
  RUN_IDS=$(gh run list "${GH_ARGS[@]}")
  exit_code=$?
  if [ $exit_code -ne 0 ]; then
    echo "❌ Failed to fetch runs for '$WORKFLOW' (exit code: $exit_code). Please check 'gh auth status' and permissions." >&2
    continue
  fi
  if [ -z "$RUN_IDS" ]; then
    echo "📭 No runs found for workflow '$WORKFLOW' since $SINCE."
    continue
  fi
  for ID in $RUN_IDS; do
    # Download artifacts named 'eval-logs-*'
    # Silencing output because many older runs won't have artifacts
    gh run download "$ID" -p "eval-logs-*" -D "$DEST_DIR/$ID" &>/dev/null || continue
    # Append to master log
    # Use find to locate api-reliability.jsonl in any subdirectory of $DEST_DIR/$ID
    find "$DEST_DIR/$ID" -type f -name "api-reliability.jsonl" -exec cat {} + >> "$MERGED_FILE" 2>/dev/null
  done
done

# The append redirection above creates $MERGED_FILE even when find matched
# nothing, so test for non-empty (-s) rather than mere existence (-f).
if [ ! -s "$MERGED_FILE" ]; then
  echo "📭 No reliability data found in the retrieved logs."
  exit 0
fi

echo -e "\n✅ Harvest Complete! Data merged into: $MERGED_FILE"
echo "------------------------------------------------"
echo "📊 Gemini API Reliability Summary (Since $SINCE)"
echo "------------------------------------------------"
# Slurp the JSONL into a single array and aggregate counts per model.
jq -s '
  group_by(.model) | map({
    model: .[0].model,
    "500s": (map(select(.errorCode == "500")) | length),
    "503s": (map(select(.errorCode == "503")) | length),
    retries: (map(select(.status == "RETRY")) | length),
    skips: (map(select(.status == "SKIP")) | length)
  })' "$MERGED_FILE"

echo -e "\n💡 Total events captured: $(wc -l < "$MERGED_FILE")"