feat(evals): add reliability harvester and 500/503 retry support (#23626)

This commit is contained in:
Alisa
2026-03-25 18:48:45 -07:00
committed by GitHub
parent c1e4dbd157
commit 2e03e3aed5
6 changed files with 509 additions and 75 deletions
+207
View File
@@ -0,0 +1,207 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { internalEvalTest } from './test-helper.js';
import { TestRig } from '@google/gemini-cli-test-utils';
// Replace the real TestRig with a stub whose methods each test programs
// individually (success, rejection, etc.) to drive the retry logic.
vi.mock('@google/gemini-cli-test-utils', () => {
  const makeStubRig = () => ({
    setup: vi.fn(),
    run: vi.fn(),
    cleanup: vi.fn(),
    readToolLogs: vi.fn().mockReturnValue([]),
    _lastRunStderr: '',
  });
  return { TestRig: vi.fn().mockImplementation(makeStubRig) };
});
describe('evalTest reliability logic', () => {
  // internalEvalTest appends one JSONL line per retry/skip event to this
  // file; the tests assert on its contents, so it must start absent.
  const LOG_DIR = path.resolve(process.cwd(), 'evals/logs');
  const RELIABILITY_LOG = path.join(LOG_DIR, 'api-reliability.jsonl');
  beforeEach(() => {
    vi.clearAllMocks();
    if (fs.existsSync(RELIABILITY_LOG)) {
      fs.unlinkSync(RELIABILITY_LOG);
    }
  });
  afterEach(() => {
    // Remove the log so a failing test does not leak entries into later runs.
    if (fs.existsSync(RELIABILITY_LOG)) {
      fs.unlinkSync(RELIABILITY_LOG);
    }
  });
  it('should retry 3 times on 500 INTERNAL error and then SKIP', async () => {
    const mockRig = new TestRig() as any;
    // internalEvalTest constructs its own TestRig; make that construction
    // return our stub so we can inspect call counts.
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate permanent 500 error
    mockRig.run.mockRejectedValue(new Error('status: INTERNAL - API Down'));
    // Execute the test function directly
    await internalEvalTest({
      name: 'test-api-failure',
      prompt: 'do something',
      assert: async () => {},
    });
    // Verify retries: 1 initial + 3 retries = 4 setups/runs
    expect(mockRig.run).toHaveBeenCalledTimes(4);
    // Verify log content
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    expect(logContent.length).toBe(4);
    const entries = logContent.map((line) => JSON.parse(line));
    // Attempts 0-2 are logged as RETRY; the exhausted attempt 3 as SKIP.
    expect(entries[0].status).toBe('RETRY');
    expect(entries[0].attempt).toBe(0);
    expect(entries[3].status).toBe('SKIP');
    expect(entries[3].attempt).toBe(3);
    expect(entries[3].testName).toBe('test-api-failure');
  });
  it('should fail immediately on non-500 errors (like assertion failures)', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate a real logic error/bug: the API call succeeds but the
    // test-case assertion throws.
    mockRig.run.mockResolvedValue('Success');
    const assertError = new Error('Assertion failed: expected foo to be bar');
    // Expect the test function to throw immediately (no retry masking).
    await expect(
      internalEvalTest({
        name: 'test-logic-failure',
        prompt: 'do something',
        assert: async () => {
          throw assertError;
        },
      }),
    ).rejects.toThrow('Assertion failed');
    // Verify NO retries: only 1 attempt
    expect(mockRig.run).toHaveBeenCalledTimes(1);
    // Verify NO reliability log was created (it's not an API error)
    expect(fs.existsSync(RELIABILITY_LOG)).toBe(false);
  });
  it('should recover if a retry succeeds', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Fail once, then succeed
    mockRig.run
      .mockRejectedValueOnce(new Error('status: INTERNAL'))
      .mockResolvedValueOnce('Success');
    await internalEvalTest({
      name: 'test-recovery',
      prompt: 'do something',
      assert: async () => {},
    });
    // Ran twice: initial (fail) + retry 1 (success)
    expect(mockRig.run).toHaveBeenCalledTimes(2);
    // Log should only have the one RETRY entry
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    expect(logContent.length).toBe(1);
    expect(JSON.parse(logContent[0]).status).toBe('RETRY');
  });
  it('should retry 3 times on 503 UNAVAILABLE error and then SKIP', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Simulate permanent 503 error
    mockRig.run.mockRejectedValue(
      new Error('status: UNAVAILABLE - Service Busy'),
    );
    await internalEvalTest({
      name: 'test-api-503',
      prompt: 'do something',
      assert: async () => {},
    });
    // Same retry budget as 500s: 1 initial attempt + 3 retries.
    expect(mockRig.run).toHaveBeenCalledTimes(4);
    const logContent = fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n');
    const entries = logContent.map((line) => JSON.parse(line));
    // The error is classified as '503' and the final attempt is SKIP.
    expect(entries[0].errorCode).toBe('503');
    expect(entries[3].status).toBe('SKIP');
  });
  it('should throw if an absolute path is used in files', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    // Give the stub rig a real directory so path validation has a root.
    mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
    if (!fs.existsSync(mockRig.testDir)) {
      fs.mkdirSync(mockRig.testDir, { recursive: true });
    }
    try {
      await expect(
        internalEvalTest({
          name: 'test-absolute-path',
          prompt: 'do something',
          files: {
            '/etc/passwd': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: /etc/passwd');
    } finally {
      // Always remove the temp dir, even if the expectation fails.
      if (fs.existsSync(mockRig.testDir)) {
        fs.rmSync(mockRig.testDir, { recursive: true, force: true });
      }
    }
  });
  it('should throw if directory traversal is detected in files', async () => {
    const mockRig = new TestRig() as any;
    (TestRig as any).mockReturnValue(mockRig);
    mockRig.testDir = path.resolve(process.cwd(), 'test-dir-tmp');
    // Create a mock test-dir
    if (!fs.existsSync(mockRig.testDir)) {
      fs.mkdirSync(mockRig.testDir, { recursive: true });
    }
    try {
      await expect(
        internalEvalTest({
          name: 'test-traversal',
          prompt: 'do something',
          files: {
            '../sensitive.txt': 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow('Invalid file path in test case: ../sensitive.txt');
    } finally {
      // Always remove the temp dir, even if the expectation fails.
      if (fs.existsSync(mockRig.testDir)) {
        fs.rmSync(mockRig.testDir, { recursive: true, force: true });
      }
    }
  });
});
+171 -71
View File
@@ -39,87 +39,34 @@ export * from '@google/gemini-cli-test-utils';
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
const fn = async () => {
runEval(
policy,
evalCase.name,
() => internalEvalTest(evalCase),
evalCase.timeout,
);
}
export async function internalEvalTest(evalCase: EvalCase) {
const maxRetries = 3;
let attempt = 0;
while (attempt <= maxRetries) {
const rig = new TestRig();
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
const logFile = path.join(logDir, `${sanitizedName}.log`);
let isSuccess = false;
try {
rig.setup(evalCase.name, evalCase.params);
// Symlink node modules to reduce the amount of time needed to
// bootstrap test projects.
symlinkNodeModules(rig.testDir || '');
if (evalCase.files) {
const acknowledgedAgents: Record<string, Record<string, string>> = {};
const projectRoot = fs.realpathSync(rig.testDir!);
for (const [filePath, content] of Object.entries(evalCase.files)) {
const fullPath = path.join(rig.testDir!, filePath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content);
// If it's an agent file, calculate hash for acknowledgement
if (
filePath.startsWith('.gemini/agents/') &&
filePath.endsWith('.md')
) {
const hash = crypto
.createHash('sha256')
.update(content)
.digest('hex');
try {
const agentDefs = await parseAgentMarkdown(fullPath, content);
if (agentDefs.length > 0) {
const agentName = agentDefs[0].name;
if (!acknowledgedAgents[projectRoot]) {
acknowledgedAgents[projectRoot] = {};
}
acknowledgedAgents[projectRoot][agentName] = hash;
}
} catch (error) {
console.warn(
`Failed to parse agent for test acknowledgement: ${filePath}`,
error,
);
}
}
}
// Write acknowledged_agents.json to the home directory
if (Object.keys(acknowledgedAgents).length > 0) {
const ackPath = path.join(
rig.homeDir!,
'.gemini',
'acknowledgments',
'agents.json',
);
fs.mkdirSync(path.dirname(ackPath), { recursive: true });
fs.writeFileSync(
ackPath,
JSON.stringify(acknowledgedAgents, null, 2),
);
}
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
execSync('git init', execOptions);
execSync('git config user.email "test@example.com"', execOptions);
execSync('git config user.name "Test User"', execOptions);
// Temporarily disable the interactive editor and git pager
// to avoid hanging the tests. It seems the agent isn't
// consistently honoring the instructions to avoid interactive
// commands.
execSync('git config core.editor "true"', execOptions);
execSync('git config core.pager "cat"', execOptions);
execSync('git config commit.gpgsign false', execOptions);
execSync('git add .', execOptions);
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
await setupTestFiles(rig, evalCase.files);
}
symlinkNodeModules(rig.testDir || '');
// If messages are provided, write a session file so --resume can load it.
let sessionId: string | undefined;
if (evalCase.messages) {
@@ -188,6 +135,37 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
await evalCase.assert(rig, result);
isSuccess = true;
return; // Success! Exit the retry loop.
} catch (error: unknown) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const errorCode = getApiErrorCode(errorMessage);
if (errorCode) {
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
logReliabilityEvent(
evalCase.name,
attempt,
status,
errorCode,
errorMessage,
);
if (attempt < maxRetries) {
attempt++;
console.warn(
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
);
continue; // Retry
}
console.warn(
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
);
return; // Gracefully exit without failing the test
}
throw error; // Real failure
} finally {
if (isSuccess) {
await fs.promises.unlink(activityLogFile).catch((err) => {
@@ -206,9 +184,131 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
);
await rig.cleanup();
}
}
}
/**
 * Classify an error message as a transient API failure.
 *
 * Returns '503' or '500' when the message matches a known server-error
 * marker, or undefined for anything else (e.g. assertion failures), which
 * the caller treats as a real, non-retryable test failure.
 */
function getApiErrorCode(message: string): '500' | '503' | undefined {
  // Checked in order; '503' markers take precedence, matching the
  // original if-chain ordering.
  const markers: ReadonlyArray<['500' | '503', string[]]> = [
    ['503', ['status: UNAVAILABLE', 'code: 503', 'Service Unavailable']],
    ['500', ['status: INTERNAL', 'code: 500', 'Internal error encountered']],
  ];
  for (const [code, needles] of markers) {
    if (needles.some((needle) => message.includes(needle))) {
      return code;
    }
  }
  return undefined;
}
/**
 * Log reliability event for later harvesting.
 *
 * Appends one JSON object per line (JSONL) to evals/logs/api-reliability.jsonl
 * so the harvester can stream entries without parsing a whole document.
 *
 * Note: Uses synchronous file I/O to ensure the log is persisted even if the
 * test process is abruptly terminated by a timeout or CI crash. Performance
 * impact is negligible compared to long-running evaluation tests.
 *
 * Fix: removed a stray `runEval(policy, evalCase.name, fn, evalCase.timeout);`
 * statement that referenced identifiers not in this function's scope
 * (it belongs to evalTest) and would not compile here.
 */
function logReliabilityEvent(
  testName: string,
  attempt: number,
  status: 'RETRY' | 'SKIP',
  errorCode: '500' | '503',
  errorMessage: string,
) {
  const reliabilityLog = {
    timestamp: new Date().toISOString(),
    testName,
    // GEMINI_MODEL may be unset in local runs; record a placeholder.
    model: process.env.GEMINI_MODEL || 'unknown',
    attempt,
    status,
    errorCode,
    error: errorMessage,
  };
  try {
    const relDir = path.resolve(process.cwd(), 'evals/logs');
    fs.mkdirSync(relDir, { recursive: true });
    fs.appendFileSync(
      path.join(relDir, 'api-reliability.jsonl'),
      JSON.stringify(reliabilityLog) + '\n',
    );
  } catch (logError) {
    // Logging must never fail the eval itself; report and continue.
    console.error('Failed to write reliability log:', logError);
  }
}
/**
 * Helper to setup test files and git repository.
 *
 * Writes each entry of `files` into the rig's test directory (rejecting
 * absolute and traversal paths), records acknowledgement hashes for any
 * `.gemini/agents/*.md` agent definitions, then initializes a git repo
 * with a non-interactive configuration and an initial commit.
 *
 * Note: While this is an async function (due to parseAgentMarkdown), it
 * intentionally uses synchronous filesystem and child_process operations
 * for simplicity and to ensure sequential environment preparation.
 */
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
  // projectRoot -> { agentName -> sha256(content) }, written out below so
  // agent files are pre-acknowledged and the CLI does not prompt.
  const acknowledgedAgents: Record<string, Record<string, string>> = {};
  const projectRoot = fs.realpathSync(rig.testDir!);
  for (const [filePath, content] of Object.entries(files)) {
    // Reject traversal ('..') and absolute paths before touching the fs.
    if (filePath.includes('..') || path.isAbsolute(filePath)) {
      throw new Error(`Invalid file path in test case: ${filePath}`);
    }
    const fullPath = path.join(projectRoot, filePath);
    // Defense in depth: the resolved path must stay under the project root.
    if (!fullPath.startsWith(projectRoot)) {
      throw new Error(`Path traversal detected: ${filePath}`);
    }
    fs.mkdirSync(path.dirname(fullPath), { recursive: true });
    fs.writeFileSync(fullPath, content);
    // Agent definition files additionally get a content hash recorded.
    if (filePath.startsWith('.gemini/agents/') && filePath.endsWith('.md')) {
      const hash = crypto.createHash('sha256').update(content).digest('hex');
      try {
        const agentDefs = await parseAgentMarkdown(fullPath, content);
        if (agentDefs.length > 0) {
          const agentName = agentDefs[0].name;
          if (!acknowledgedAgents[projectRoot]) {
            acknowledgedAgents[projectRoot] = {};
          }
          acknowledgedAgents[projectRoot][agentName] = hash;
        }
      } catch (error) {
        // A malformed agent file should not abort the whole setup.
        console.warn(
          `Failed to parse agent for test acknowledgement: ${filePath}`,
          error,
        );
      }
    }
  }
  // Persist acknowledgements into the rig's home directory, if any were found.
  if (Object.keys(acknowledgedAgents).length > 0) {
    const ackPath = path.join(
      rig.homeDir!,
      '.gemini',
      'acknowledgments',
      'agents.json',
    );
    fs.mkdirSync(path.dirname(ackPath), { recursive: true });
    fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
  }
  const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
  execSync('git init --initial-branch=main', execOptions);
  execSync('git config user.email "test@example.com"', execOptions);
  execSync('git config user.name "Test User"', execOptions);
  // Temporarily disable the interactive editor and git pager
  // to avoid hanging the tests. It seems the agent isn't
  // consistently honoring the instructions to avoid interactive
  // commands.
  execSync('git config core.editor "true"', execOptions);
  execSync('git config core.pager "cat"', execOptions);
  execSync('git config commit.gpgsign false', execOptions);
  execSync('git add .', execOptions);
  execSync('git commit --allow-empty -m "Initial commit"', execOptions);
}
/**
-4
View File
@@ -16,10 +16,6 @@ export default defineConfig({
},
test: {
testTimeout: 300000, // 5 minutes
// Retry in CI but not nightly to avoid blocking on API error.
retry: process.env['VITEST_RETRY']
? parseInt(process.env['VITEST_RETRY'], 10)
: 3,
reporters: ['default', 'json'],
outputFile: {
json: 'evals/logs/report.json',