mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-26 22:11:57 -07:00
feat(evals): add reliability harvester and 500/503 retry support (#23626)
This commit is contained in:
12
.github/workflows/chained_e2e.yml
vendored
12
.github/workflows/chained_e2e.yml
vendored
@@ -334,8 +334,20 @@ jobs:
|
||||
if: "${{ steps.check_evals.outputs.should_run == 'true' }}"
|
||||
env:
|
||||
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
|
||||
GEMINI_MODEL: 'gemini-3-pro-preview'
|
||||
# Disable Vitest internal retries to avoid double-retrying;
|
||||
# custom retry logic is handled in evals/test-helper.ts
|
||||
VITEST_RETRY: 0
|
||||
run: 'npm run test:always_passing_evals'
|
||||
|
||||
- name: 'Upload Reliability Logs'
|
||||
if: "always() && steps.check_evals.outputs.should_run == 'true'"
|
||||
uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
|
||||
with:
|
||||
name: 'eval-logs-${{ github.run_id }}-${{ github.run_attempt }}'
|
||||
path: 'evals/logs/api-reliability.jsonl'
|
||||
retention-days: 7
|
||||
|
||||
e2e:
|
||||
name: 'E2E'
|
||||
if: |
|
||||
|
||||
2
.github/workflows/evals-nightly.yml
vendored
2
.github/workflows/evals-nightly.yml
vendored
@@ -61,6 +61,8 @@ jobs:
|
||||
GEMINI_MODEL: '${{ matrix.model }}'
|
||||
RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}"
|
||||
TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
|
||||
# Disable Vitest internal retries to avoid double-retrying;
|
||||
# custom retry logic is handled in evals/test-helper.ts
|
||||
VITEST_RETRY: 0
|
||||
run: |
|
||||
CMD="npm run test:all_evals"
|
||||
|
||||
207
evals/test-helper.test.ts
Normal file
207
evals/test-helper.test.ts
Normal file
@@ -0,0 +1,207 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { internalEvalTest } from './test-helper.js';
|
||||
import { TestRig } from '@google/gemini-cli-test-utils';
|
||||
|
||||
// Replace the real TestRig with a stub so each test decides whether the
// simulated API call (`run`) resolves or rejects.
vi.mock('@google/gemini-cli-test-utils', () => ({
  TestRig: vi.fn().mockImplementation(() => ({
    setup: vi.fn(),
    run: vi.fn(),
    cleanup: vi.fn(),
    readToolLogs: vi.fn().mockReturnValue([]),
    _lastRunStderr: '',
  })),
}));
|
||||
|
||||
describe('evalTest reliability logic', () => {
  const LOG_DIR = path.resolve(process.cwd(), 'evals/logs');
  const RELIABILITY_LOG = path.join(LOG_DIR, 'api-reliability.jsonl');
  const TEMP_TEST_DIR = path.resolve(process.cwd(), 'test-dir-tmp');

  /** Fields of a reliability log line that these tests inspect. */
  interface ReliabilityEntry {
    status: string;
    attempt: number;
    testName: string;
    errorCode: string;
  }

  /**
   * Builds a fresh rig stub and makes every `new TestRig()` return it.
   *
   * A new object is created per test so that `run` behaviour and `testDir`
   * configured by one test can never leak into the next one.
   */
  const installRigStub = (testDir?: string) => {
    const rig = {
      setup: vi.fn(),
      run: vi.fn(),
      cleanup: vi.fn(),
      readToolLogs: vi.fn().mockReturnValue([]),
      _lastRunStderr: '',
      testDir,
    };
    // The stub only implements the members listed above, so it cannot be
    // typed as a full TestRig without a cast.
    vi.mocked(TestRig).mockReturnValue(rig as unknown as TestRig);
    return rig;
  };

  /** Deletes the reliability log if present (no-op otherwise). */
  const removeReliabilityLog = () => {
    fs.rmSync(RELIABILITY_LOG, { force: true });
  };

  /** Reads the reliability log and parses each JSONL line. */
  const readReliabilityEntries = (): ReliabilityEntry[] =>
    fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n')
      .map((line) => JSON.parse(line) as ReliabilityEntry);

  /** Runs `body` with TEMP_TEST_DIR created, removing it afterwards. */
  const withTempTestDir = async (body: () => Promise<void>) => {
    fs.mkdirSync(TEMP_TEST_DIR, { recursive: true });
    try {
      await body();
    } finally {
      fs.rmSync(TEMP_TEST_DIR, { recursive: true, force: true });
    }
  };

  beforeEach(() => {
    vi.clearAllMocks();
    removeReliabilityLog();
  });

  afterEach(() => {
    removeReliabilityLog();
  });

  it.each([
    {
      label: '500 INTERNAL',
      name: 'test-api-failure',
      message: 'status: INTERNAL - API Down',
      errorCode: '500',
    },
    {
      label: '503 UNAVAILABLE',
      name: 'test-api-503',
      message: 'status: UNAVAILABLE - Service Busy',
      errorCode: '503',
    },
  ])(
    'should retry 3 times on $label error and then SKIP',
    async ({ name, message, errorCode }) => {
      const rig = installRigStub();

      // Simulate a permanent API error.
      rig.run.mockRejectedValue(new Error(message));

      // Must resolve (not reject): persistent API errors are skipped.
      await internalEvalTest({
        name,
        prompt: 'do something',
        assert: async () => {},
      });

      // 1 initial attempt + 3 retries.
      expect(rig.run).toHaveBeenCalledTimes(4);

      const entries = readReliabilityEntries();
      expect(entries).toHaveLength(4);
      expect(entries.map((entry) => entry.status)).toEqual([
        'RETRY',
        'RETRY',
        'RETRY',
        'SKIP',
      ]);
      expect(entries.map((entry) => entry.attempt)).toEqual([0, 1, 2, 3]);
      expect(entries[0].errorCode).toBe(errorCode);
      expect(entries[3].testName).toBe(name);
    },
  );

  it('should fail immediately on non-500 errors (like assertion failures)', async () => {
    const rig = installRigStub();

    // The API call succeeds; the failure comes from the eval's own assertion.
    rig.run.mockResolvedValue('Success');
    const assertError = new Error('Assertion failed: expected foo to be bar');

    await expect(
      internalEvalTest({
        name: 'test-logic-failure',
        prompt: 'do something',
        assert: async () => {
          throw assertError;
        },
      }),
    ).rejects.toThrow('Assertion failed');

    // No retries: only the single initial attempt.
    expect(rig.run).toHaveBeenCalledTimes(1);

    // Not an API error, so nothing is written to the reliability log.
    expect(fs.existsSync(RELIABILITY_LOG)).toBe(false);
  });

  it('should recover if a retry succeeds', async () => {
    const rig = installRigStub();

    // Fail once, then succeed.
    rig.run
      .mockRejectedValueOnce(new Error('status: INTERNAL'))
      .mockResolvedValueOnce('Success');

    await internalEvalTest({
      name: 'test-recovery',
      prompt: 'do something',
      assert: async () => {},
    });

    // Initial attempt (fail) + first retry (success).
    expect(rig.run).toHaveBeenCalledTimes(2);

    // Only the single RETRY event is logged.
    const entries = readReliabilityEntries();
    expect(entries).toHaveLength(1);
    expect(entries[0].status).toBe('RETRY');
  });

  it.each([
    {
      label: 'an absolute path is used',
      name: 'test-absolute-path',
      filePath: '/etc/passwd',
    },
    {
      label: 'directory traversal is detected',
      name: 'test-traversal',
      filePath: '../sensitive.txt',
    },
  ])('should throw if $label in files', async ({ name, filePath }) => {
    installRigStub(TEMP_TEST_DIR);

    await withTempTestDir(async () => {
      await expect(
        internalEvalTest({
          name,
          prompt: 'do something',
          files: {
            [filePath]: 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow(`Invalid file path in test case: ${filePath}`);
    });
  });
});
|
||||
@@ -39,87 +39,34 @@ export * from '@google/gemini-cli-test-utils';
|
||||
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
|
||||
|
||||
/**
 * Registers an eval case with the eval runner.
 *
 * The case body is delegated to `internalEvalTest`, which is exported
 * separately so it can be invoked directly (e.g. from unit tests).
 *
 * @param policy How reliably the eval is expected to pass.
 * @param evalCase The eval definition; its `name` and `timeout` are forwarded
 *   to `runEval`.
 */
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
  runEval(
    policy,
    evalCase.name,
    () => internalEvalTest(evalCase),
    evalCase.timeout,
  );
}
|
||||
|
||||
export async function internalEvalTest(evalCase: EvalCase) {
|
||||
const maxRetries = 3;
|
||||
let attempt = 0;
|
||||
|
||||
while (attempt <= maxRetries) {
|
||||
const rig = new TestRig();
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
|
||||
const logFile = path.join(logDir, `${sanitizedName}.log`);
|
||||
let isSuccess = false;
|
||||
|
||||
try {
|
||||
rig.setup(evalCase.name, evalCase.params);
|
||||
|
||||
// Symlink node modules to reduce the amount of time needed to
|
||||
// bootstrap test projects.
|
||||
symlinkNodeModules(rig.testDir || '');
|
||||
|
||||
if (evalCase.files) {
|
||||
const acknowledgedAgents: Record<string, Record<string, string>> = {};
|
||||
const projectRoot = fs.realpathSync(rig.testDir!);
|
||||
|
||||
for (const [filePath, content] of Object.entries(evalCase.files)) {
|
||||
const fullPath = path.join(rig.testDir!, filePath);
|
||||
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
|
||||
fs.writeFileSync(fullPath, content);
|
||||
|
||||
// If it's an agent file, calculate hash for acknowledgement
|
||||
if (
|
||||
filePath.startsWith('.gemini/agents/') &&
|
||||
filePath.endsWith('.md')
|
||||
) {
|
||||
const hash = crypto
|
||||
.createHash('sha256')
|
||||
.update(content)
|
||||
.digest('hex');
|
||||
|
||||
try {
|
||||
const agentDefs = await parseAgentMarkdown(fullPath, content);
|
||||
if (agentDefs.length > 0) {
|
||||
const agentName = agentDefs[0].name;
|
||||
if (!acknowledgedAgents[projectRoot]) {
|
||||
acknowledgedAgents[projectRoot] = {};
|
||||
}
|
||||
acknowledgedAgents[projectRoot][agentName] = hash;
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(
|
||||
`Failed to parse agent for test acknowledgement: ${filePath}`,
|
||||
error,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write acknowledged_agents.json to the home directory
|
||||
if (Object.keys(acknowledgedAgents).length > 0) {
|
||||
const ackPath = path.join(
|
||||
rig.homeDir!,
|
||||
'.gemini',
|
||||
'acknowledgments',
|
||||
'agents.json',
|
||||
);
|
||||
fs.mkdirSync(path.dirname(ackPath), { recursive: true });
|
||||
fs.writeFileSync(
|
||||
ackPath,
|
||||
JSON.stringify(acknowledgedAgents, null, 2),
|
||||
);
|
||||
}
|
||||
|
||||
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
|
||||
execSync('git init', execOptions);
|
||||
execSync('git config user.email "test@example.com"', execOptions);
|
||||
execSync('git config user.name "Test User"', execOptions);
|
||||
|
||||
// Temporarily disable the interactive editor and git pager
|
||||
// to avoid hanging the tests. It seems the the agent isn't
|
||||
// consistently honoring the instructions to avoid interactive
|
||||
// commands.
|
||||
execSync('git config core.editor "true"', execOptions);
|
||||
execSync('git config core.pager "cat"', execOptions);
|
||||
execSync('git config commit.gpgsign false', execOptions);
|
||||
execSync('git add .', execOptions);
|
||||
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
|
||||
await setupTestFiles(rig, evalCase.files);
|
||||
}
|
||||
|
||||
symlinkNodeModules(rig.testDir || '');
|
||||
|
||||
// If messages are provided, write a session file so --resume can load it.
|
||||
let sessionId: string | undefined;
|
||||
if (evalCase.messages) {
|
||||
@@ -188,6 +135,37 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
|
||||
await evalCase.assert(rig, result);
|
||||
isSuccess = true;
|
||||
return; // Success! Exit the retry loop.
|
||||
} catch (error: unknown) {
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
const errorCode = getApiErrorCode(errorMessage);
|
||||
|
||||
if (errorCode) {
|
||||
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
|
||||
logReliabilityEvent(
|
||||
evalCase.name,
|
||||
attempt,
|
||||
status,
|
||||
errorCode,
|
||||
errorMessage,
|
||||
);
|
||||
|
||||
if (attempt < maxRetries) {
|
||||
attempt++;
|
||||
console.warn(
|
||||
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
|
||||
);
|
||||
continue; // Retry
|
||||
}
|
||||
|
||||
console.warn(
|
||||
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
|
||||
);
|
||||
return; // Gracefully exit without failing the test
|
||||
}
|
||||
|
||||
throw error; // Real failure
|
||||
} finally {
|
||||
if (isSuccess) {
|
||||
await fs.promises.unlink(activityLogFile).catch((err) => {
|
||||
@@ -206,9 +184,131 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
);
|
||||
await rig.cleanup();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Classifies an error message as a transient API failure.
 *
 * A message is matched against known textual markers for HTTP 503
 * (checked first) and HTTP 500. Numeric markers (`code: 503`, `code: 500`)
 * must not be followed by another digit, so unrelated values such as
 * `exit code: 5003` are not mistaken for an API outage.
 *
 * @param message The error message to inspect.
 * @returns `'503'` or `'500'` when a marker is found, otherwise `undefined`.
 */
function getApiErrorCode(message: string): '500' | '503' | undefined {
  const isUnavailable =
    message.includes('status: UNAVAILABLE') ||
    /code: 503(?!\d)/.test(message) ||
    message.includes('Service Unavailable');
  if (isUnavailable) {
    return '503';
  }

  const isInternal =
    message.includes('status: INTERNAL') ||
    /code: 500(?!\d)/.test(message) ||
    message.includes('Internal error encountered');
  if (isInternal) {
    return '500';
  }

  return undefined;
}
|
||||
|
||||
/**
 * Appends one API reliability event, as a single JSON line, to
 * `evals/logs/api-reliability.jsonl` under the current working directory.
 *
 * Note: Uses synchronous file I/O to ensure the log is persisted even if the
 * test process is abruptly terminated by a timeout or CI crash. Performance
 * impact is negligible compared to long-running evaluation tests.
 *
 * Failures to write the log are reported on stderr and otherwise ignored, so
 * that logging can never change the outcome of an eval.
 *
 * @param testName Name of the eval case that hit the API error.
 * @param attempt Zero-based attempt number on which the error occurred.
 * @param status `'RETRY'` if another attempt follows, `'SKIP'` if giving up.
 * @param errorCode The classified API error code.
 * @param errorMessage The raw error message, recorded under the `error` key.
 */
function logReliabilityEvent(
  testName: string,
  attempt: number,
  status: 'RETRY' | 'SKIP',
  errorCode: '500' | '503',
  errorMessage: string,
) {
  const reliabilityLog = {
    timestamp: new Date().toISOString(),
    testName,
    // `||` (not `??`) so that an empty GEMINI_MODEL is also reported as unknown.
    model: process.env.GEMINI_MODEL || 'unknown',
    attempt,
    status,
    errorCode,
    error: errorMessage,
  };

  try {
    const relDir = path.resolve(process.cwd(), 'evals/logs');
    fs.mkdirSync(relDir, { recursive: true });
    fs.appendFileSync(
      path.join(relDir, 'api-reliability.jsonl'),
      JSON.stringify(reliabilityLog) + '\n',
    );
  } catch (logError) {
    console.error('Failed to write reliability log:', logError);
  }
}
|
||||
|
||||
/**
 * Resolves a test-case file path against the project root, rejecting any
 * path that could land outside of it.
 *
 * `..` is rejected only as a whole path segment, so file names that merely
 * contain two dots (e.g. `v1..v2.md`) are allowed.
 *
 * @param projectRoot Absolute directory the file must stay inside.
 * @param filePath Relative path taken from the eval case.
 * @returns The absolute path to write to.
 * @throws Error if `filePath` is absolute, contains a `..` segment, or
 *   resolves outside `projectRoot`.
 */
function resolveTestFilePath(projectRoot: string, filePath: string): string {
  const hasParentSegment = filePath.split(/[\\/]/).includes('..');
  if (hasParentSegment || path.isAbsolute(filePath)) {
    throw new Error(`Invalid file path in test case: ${filePath}`);
  }

  const fullPath = path.join(projectRoot, filePath);
  // Compare against the root *with* a trailing separator so that a sibling
  // such as `/tmp/root-other` is not accepted for root `/tmp/root`.
  const rootPrefix = projectRoot.endsWith(path.sep)
    ? projectRoot
    : projectRoot + path.sep;
  if (fullPath !== projectRoot && !fullPath.startsWith(rootPrefix)) {
    throw new Error(`Path traversal detected: ${filePath}`);
  }
  return fullPath;
}

/**
 * Helper to setup test files and git repository.
 *
 * Writes every entry of `files` into the rig's test directory, records a
 * SHA-256 acknowledgement for each agent definition under `.gemini/agents/`,
 * and then initialises a git repository with a single initial commit.
 *
 * Note: While this is an async function (due to parseAgentMarkdown), it
 * intentionally uses synchronous filesystem and child_process operations
 * for simplicity and to ensure sequential environment preparation.
 *
 * @param rig Test rig providing `testDir` and `homeDir`.
 * @param files Map of relative file path to file content.
 * @throws Error if a file path is absolute or escapes the test directory.
 */
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
  const acknowledgedAgents: Record<string, Record<string, string>> = {};
  const projectRoot = fs.realpathSync(rig.testDir!);

  for (const [filePath, content] of Object.entries(files)) {
    const fullPath = resolveTestFilePath(projectRoot, filePath);

    fs.mkdirSync(path.dirname(fullPath), { recursive: true });
    fs.writeFileSync(fullPath, content);

    const isAgentFile =
      filePath.startsWith('.gemini/agents/') && filePath.endsWith('.md');
    if (!isAgentFile) {
      continue;
    }

    // Agent files are acknowledged by content hash, keyed by project root
    // and agent name.
    const hash = crypto.createHash('sha256').update(content).digest('hex');
    try {
      const agentDefs = await parseAgentMarkdown(fullPath, content);
      if (agentDefs.length > 0) {
        const agentName = agentDefs[0].name;
        if (!acknowledgedAgents[projectRoot]) {
          acknowledgedAgents[projectRoot] = {};
        }
        acknowledgedAgents[projectRoot][agentName] = hash;
      }
    } catch (error) {
      // Best effort: an unparsable agent file is simply not acknowledged.
      console.warn(
        `Failed to parse agent for test acknowledgement: ${filePath}`,
        error,
      );
    }
  }

  if (Object.keys(acknowledgedAgents).length > 0) {
    const ackPath = path.join(
      rig.homeDir!,
      '.gemini',
      'acknowledgments',
      'agents.json',
    );
    fs.mkdirSync(path.dirname(ackPath), { recursive: true });
    fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
  }

  const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
  execSync('git init --initial-branch=main', execOptions);
  execSync('git config user.email "test@example.com"', execOptions);
  execSync('git config user.name "Test User"', execOptions);

  // Temporarily disable the interactive editor and git pager
  // to avoid hanging the tests. It seems the agent isn't
  // consistently honoring the instructions to avoid interactive
  // commands.
  execSync('git config core.editor "true"', execOptions);
  execSync('git config core.pager "cat"', execOptions);
  execSync('git config commit.gpgsign false', execOptions);
  execSync('git add .', execOptions);
  execSync('git commit --allow-empty -m "Initial commit"', execOptions);
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -16,10 +16,6 @@ export default defineConfig({
|
||||
},
|
||||
test: {
|
||||
testTimeout: 300000, // 5 minutes
|
||||
// Retry in CI but not nightly to avoid blocking on API error.
|
||||
retry: process.env['VITEST_RETRY']
|
||||
? parseInt(process.env['VITEST_RETRY'], 10)
|
||||
: 3,
|
||||
reporters: ['default', 'json'],
|
||||
outputFile: {
|
||||
json: 'evals/logs/report.json',
|
||||
|
||||
117
scripts/harvest_api_reliability.sh
Executable file
117
scripts/harvest_api_reliability.sh
Executable file
@@ -0,0 +1,117 @@
|
||||
#!/bin/bash

# Gemini API Reliability Harvester
# -------------------------------
# This script gathers data about 500/503 API errors encountered during
# evaluation runs of the "Testing: E2E (Chained)" and "Evals: Nightly"
# GitHub Actions workflows. It downloads the 'eval-logs-*' artifacts of each
# run, merges every api-reliability.jsonl it finds into a single file in the
# current directory, and prints a per-model summary. It is used to analyze
# developer friction caused by transient API failures.
#
# Usage:
#   ./scripts/harvest_api_reliability.sh [SINCE] [LIMIT] [BRANCH]
#
#   SINCE   A date (YYYY-MM-DD) or a number of days (e.g. 14d). Default: 7d.
#   LIMIT   Maximum number of runs to inspect per workflow. Default: 300.
#   BRANCH  Only inspect runs of this branch. Default: all branches.
#
# Examples:
#   ./scripts/harvest_api_reliability.sh                           # Last 7 days, all branches
#   ./scripts/harvest_api_reliability.sh 14d 500                   # Last 14 days, limit 500
#   ./scripts/harvest_api_reliability.sh 2026-03-01 100 my-branch  # Specific date and branch
#
# Prerequisites:
#   - GitHub CLI (gh) installed and authenticated (`gh auth login`)
#   - jq installed

# Print the UTC date (YYYY-MM-DD) that lies $1 days in the past.
# BSD date (macOS) and GNU date use different flags for date arithmetic.
days_ago() {
  local days="$1"
  if [[ "$OSTYPE" == "darwin"* ]]; then
    date -u -v-"${days}"d +%Y-%m-%d
  else
    date -u -d "${days} days ago" +%Y-%m-%d
  fi
}

# Arguments & Defaults
if [[ -n "$1" && $1 =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
  SINCE="$1"
elif [[ -n "$1" && $1 =~ ^([0-9]+)d$ ]]; then
  SINCE=$(days_ago "${BASH_REMATCH[1]}")
else
  if [[ -n "$1" ]]; then
    # Do not silently ignore a mistyped value such as "14" or "2026-3-1".
    echo "⚠️  Unrecognized SINCE value '$1' (expected YYYY-MM-DD or <N>d); defaulting to 7 days." >&2
  fi
  # Default to 7 days ago in YYYY-MM-DD format (UTC)
  SINCE=$(days_ago 7)
fi

LIMIT=${2:-300}
BRANCH=${3:-""}
WORKFLOWS=("Testing: E2E (Chained)" "Evals: Nightly")
MERGED_FILE="api-reliability-summary.jsonl"

if ! [[ "$LIMIT" =~ ^[0-9]+$ ]]; then
  echo "❌ Error: LIMIT must be a non-negative integer (got '$LIMIT')." >&2
  exit 1
fi

if ! command -v gh &> /dev/null; then
  echo "❌ Error: GitHub CLI (gh) is not installed." >&2
  exit 1
fi

if ! command -v jq &> /dev/null; then
  echo "❌ Error: jq is not installed." >&2
  exit 1
fi

DEST_DIR=$(mktemp -d -t gemini-reliability-XXXXXX)

# Ensure cleanup on exit
trap 'rm -rf "$DEST_DIR"' EXIT

# Clean start
rm -f "$MERGED_FILE"

# gh run list --created expects a date (YYYY-MM-DD) or a range
CREATED_QUERY=">=$SINCE"

for WORKFLOW in "${WORKFLOWS[@]}"; do
  echo "🔍 Fetching runs for '$WORKFLOW' created since $SINCE (max $LIMIT runs, branch: ${BRANCH:-all})..."

  # Construct arguments for gh run list
  GH_ARGS=("--workflow" "$WORKFLOW" "--created" "$CREATED_QUERY" "--limit" "$LIMIT" "--json" "databaseId" "--jq" ".[].databaseId")
  if [ -n "$BRANCH" ]; then
    GH_ARGS+=("--branch" "$BRANCH")
  fi

  RUN_IDS=$(gh run list "${GH_ARGS[@]}")
  exit_code=$?

  if [ $exit_code -ne 0 ]; then
    echo "❌ Failed to fetch runs for '$WORKFLOW' (exit code: $exit_code). Please check 'gh auth status' and permissions." >&2
    continue
  fi

  if [ -z "$RUN_IDS" ]; then
    echo "📭 No runs found for workflow '$WORKFLOW' since $SINCE."
    continue
  fi

  for ID in $RUN_IDS; do
    # Download artifacts named 'eval-logs-*'
    # Silencing output because many older runs won't have artifacts
    gh run download "$ID" -p "eval-logs-*" -D "$DEST_DIR/$ID" &>/dev/null || continue

    # Append to master log
    # Use find to locate api-reliability.jsonl in any subdirectory of $DEST_DIR/$ID
    find "$DEST_DIR/$ID" -type f -name "api-reliability.jsonl" -exec cat {} + >> "$MERGED_FILE" 2>/dev/null
  done
done

# -s (exists and is non-empty): the redirect above creates the file even when
# a downloaded artifact contained no reliability log.
if [ ! -s "$MERGED_FILE" ]; then
  rm -f "$MERGED_FILE"
  echo "📭 No reliability data found in the retrieved logs."
  exit 0
fi

echo -e "\n✅ Harvest Complete! Data merged into: $MERGED_FILE"
echo "------------------------------------------------"
echo "📊 Gemini API Reliability Summary (Since $SINCE)"
echo "------------------------------------------------"

jq -s '
  group_by(.model) | map({
    model: .[0].model,
    "500s": (map(select(.errorCode == "500")) | length),
    "503s": (map(select(.errorCode == "503")) | length),
    retries: (map(select(.status == "RETRY")) | length),
    skips: (map(select(.status == "SKIP")) | length)
  })' "$MERGED_FILE"

# BSD wc pads its output with spaces; strip them for a clean message.
TOTAL_EVENTS=$(wc -l < "$MERGED_FILE" | tr -d '[:space:]')
echo -e "\n💡 Total events captured: $TOTAL_EVENTS"
|
||||
Reference in New Issue
Block a user