mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-06-10 11:12:35 -07:00
feat(evals): implement related evaluation system for targeted testing
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
name: 'Evals: PR Evaluation & Regression'
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
pull_request:
|
||||
types: ['opened', 'synchronize', 'reopened', 'ready_for_review']
|
||||
paths:
|
||||
- 'packages/core/src/prompts/**'
|
||||
@@ -153,9 +153,10 @@ jobs:
|
||||
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
|
||||
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
||||
MODEL_LIST: '${{ env.MODEL_LIST }}'
|
||||
GITHUB_BASE_REF: '${{ github.base_ref }}'
|
||||
run: |
|
||||
# Run the regression check loop. The script saves the report to a file.
|
||||
node scripts/run_eval_regression.js
|
||||
# Run the related regression check loop.
|
||||
node scripts/run_eval_regression.js --related
|
||||
|
||||
# Use the generated report file if it exists
|
||||
if [[ -f eval_regression_report.md ]]; then
|
||||
|
||||
+42
-2
@@ -227,10 +227,18 @@ in Pull Requests. These can also be run locally for debugging.
|
||||
### Running Regression Checks Locally
|
||||
|
||||
You can simulate the PR regression check locally to verify your changes before
|
||||
pushing:
|
||||
pushing. To optimize your workflow and reduce LLM costs, use the **`--related`**
|
||||
flag to run only the tests relevant to your specific changes:
|
||||
|
||||
```bash
|
||||
# Run the full regression loop for a specific model
|
||||
# Run the targeted regression loop for your changes
|
||||
MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js --related
|
||||
```
|
||||
|
||||
To run the full regression loop for a specific model (all stable tests):
|
||||
|
||||
```bash
|
||||
# Run everything
|
||||
MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js
|
||||
```
|
||||
|
||||
@@ -244,6 +252,38 @@ OUTPUT=$(node scripts/get_trustworthy_evals.js "gemini-3-flash-preview")
|
||||
node scripts/run_regression_check.js "gemini-3-flash-preview" "$OUTPUT"
|
||||
```
|
||||
|
||||
### Related Testing with `--related`
|
||||
|
||||
The project uses a "Smart Eval" system to identify which behavioral evaluations
|
||||
are affected by your code changes. This is controlled by the `--related` flag
|
||||
available in `scripts/run_eval_regression.js`.
|
||||
|
||||
#### How it Works
|
||||
|
||||
1. **Change Detection**: The system uses Git to identify modified files in your
|
||||
branch compared to `main`.
|
||||
2. **Suite Mapping**: Modified files are matched against patterns in
|
||||
`evals/suites.json`. This file maps core files (e.g., `grep.ts`) to their
|
||||
corresponding evaluations.
|
||||
3. **Targeted Execution**: Only the evaluations belonging to the affected
|
||||
suites are executed.
|
||||
4. **Global Fallback**: If changes are detected in core system prompts or
|
||||
unmapped files, the system automatically falls back to running the full
|
||||
stable evaluation suite for safety.
|
||||
|
||||
#### Updating Detection Logic
|
||||
|
||||
If you add a new tool or functional area, you should update `evals/suites.json`
|
||||
to ensure your new evaluations are triggered correctly.
|
||||
|
||||
```json
|
||||
"my_new_tool": {
|
||||
"description": "Description of the tool",
|
||||
"patterns": ["packages/core/src/tools/my-new-tool.ts"],
|
||||
"evals": ["evals/my_new_tool.eval.ts"]
|
||||
}
|
||||
```
|
||||
|
||||
### The Regression Quality Bar
|
||||
|
||||
Because LLMs are non-deterministic, the PR regression check uses a high-signal
|
||||
|
||||
@@ -0,0 +1,171 @@
|
||||
{
|
||||
"allowedOverlaps": [],
|
||||
"grep": {
|
||||
"description": "Grep search functionality",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/grep.ts",
|
||||
"packages/core/src/tools/ripGrep.ts",
|
||||
"packages/core/src/tools/grep-utils.ts",
|
||||
"evals/grep_search_functionality.eval.ts"
|
||||
],
|
||||
"evals": ["evals/grep_search_functionality.eval.ts"]
|
||||
},
|
||||
"memory": {
|
||||
"description": "Memory tool and fact persistence",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/memoryTool.ts",
|
||||
"packages/core/src/persistence/storage.ts",
|
||||
"evals/save_memory.eval.ts",
|
||||
"evals/hierarchical_memory.eval.ts"
|
||||
],
|
||||
"evals": ["evals/save_memory.eval.ts", "evals/hierarchical_memory.eval.ts"]
|
||||
},
|
||||
"read_file": {
|
||||
"description": "File reading and content extraction",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/read-file.ts",
|
||||
"packages/core/src/tools/read-many-files.ts",
|
||||
"evals/frugalReads.eval.ts"
|
||||
],
|
||||
"evals": ["evals/frugalReads.eval.ts"]
|
||||
},
|
||||
"glob": {
|
||||
"description": "File discovery and globbing",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/glob.ts",
|
||||
"evals/frugalSearch.eval.ts"
|
||||
],
|
||||
"evals": ["evals/frugalSearch.eval.ts"]
|
||||
},
|
||||
"tracker": {
|
||||
"description": "Task and progress tracking",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/trackerTools.ts",
|
||||
"evals/tracker.eval.ts"
|
||||
],
|
||||
"evals": ["evals/tracker.eval.ts"]
|
||||
},
|
||||
"ask_user": {
|
||||
"description": "Interactive user confirmation and input",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/ask-user.ts",
|
||||
"evals/ask_user.eval.ts"
|
||||
],
|
||||
"evals": ["evals/ask_user.eval.ts"]
|
||||
},
|
||||
"plan_mode": {
|
||||
"description": "Plan Mode orchestration",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/enter-plan-mode.ts",
|
||||
"packages/core/src/tools/exit-plan-mode.ts",
|
||||
"packages/core/src/agents/planAgent.ts",
|
||||
"evals/plan_mode.eval.ts"
|
||||
],
|
||||
"evals": ["evals/plan_mode.eval.ts"]
|
||||
},
|
||||
"git": {
|
||||
"description": "Git repository operations",
|
||||
"patterns": [
|
||||
"packages/core/src/utils/git.ts",
|
||||
"packages/core/src/tools/shell.ts",
|
||||
"evals/gitRepo.eval.ts",
|
||||
"evals/unsafe-cloning.eval.ts"
|
||||
],
|
||||
"evals": ["evals/gitRepo.eval.ts", "evals/unsafe-cloning.eval.ts"]
|
||||
},
|
||||
"agents": {
|
||||
"description": "Agent delegation and help",
|
||||
"patterns": [
|
||||
"packages/core/src/agents/**",
|
||||
"packages/core/src/routing/**",
|
||||
"evals/subagents.eval.ts",
|
||||
"evals/generalist_agent.eval.ts",
|
||||
"evals/generalist_delegation.eval.ts",
|
||||
"evals/cli_help_delegation.eval.ts"
|
||||
],
|
||||
"evals": [
|
||||
"evals/subagents.eval.ts",
|
||||
"evals/generalist_agent.eval.ts",
|
||||
"evals/generalist_delegation.eval.ts",
|
||||
"evals/cli_help_delegation.eval.ts"
|
||||
]
|
||||
},
|
||||
"background": {
|
||||
"description": "Background process management",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/shellBackgroundTools.ts",
|
||||
"evals/background_processes.eval.ts"
|
||||
],
|
||||
"evals": ["evals/background_processes.eval.ts"]
|
||||
},
|
||||
"core_steering": {
|
||||
"description": "System prompts and core model steering",
|
||||
"patterns": [
|
||||
"packages/core/src/prompts/**",
|
||||
"evals/answer-vs-act.eval.ts",
|
||||
"evals/model_steering.eval.ts",
|
||||
"evals/redundant_casts.eval.ts"
|
||||
],
|
||||
"evals": [
|
||||
"ALL_ALWAYS_PASSING",
|
||||
"evals/answer-vs-act.eval.ts",
|
||||
"evals/model_steering.eval.ts",
|
||||
"evals/redundant_casts.eval.ts"
|
||||
]
|
||||
},
|
||||
"edit_fidelity": {
|
||||
"description": "Code edit accuracy and location checks",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/edit.ts",
|
||||
"packages/core/src/tools/write-file.ts",
|
||||
"evals/edit-locations-eval.eval.ts",
|
||||
"evals/validation_fidelity.eval.ts",
|
||||
"evals/validation_fidelity_pre_existing_errors.eval.ts"
|
||||
],
|
||||
"evals": [
|
||||
"evals/edit-locations-eval.eval.ts",
|
||||
"evals/validation_fidelity.eval.ts",
|
||||
"evals/validation_fidelity_pre_existing_errors.eval.ts"
|
||||
]
|
||||
},
|
||||
"reliability": {
|
||||
"description": "Core safety and sandbox reliability",
|
||||
"patterns": [
|
||||
"packages/core/src/safety/**",
|
||||
"packages/core/src/fallback/**",
|
||||
"evals/concurrency-safety.eval.ts",
|
||||
"evals/sandbox_recovery.eval.ts",
|
||||
"evals/interactive-hang.eval.ts",
|
||||
"evals/tool_output_masking.eval.ts"
|
||||
],
|
||||
"evals": [
|
||||
"evals/concurrency-safety.eval.ts",
|
||||
"evals/sandbox_recovery.eval.ts",
|
||||
"evals/interactive-hang.eval.ts",
|
||||
"evals/tool_output_masking.eval.ts"
|
||||
]
|
||||
},
|
||||
"orchestration": {
|
||||
"description": "Agent task lifecycle and topic management",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/update-topic.ts",
|
||||
"packages/core/src/tools/complete-task.ts",
|
||||
"evals/update_topic.eval.ts"
|
||||
],
|
||||
"evals": ["evals/update_topic.eval.ts"]
|
||||
},
|
||||
"general_tools": {
|
||||
"description": "General tool usage and efficiency",
|
||||
"patterns": [
|
||||
"packages/core/src/tools/tools.ts",
|
||||
"packages/core/src/tools/tool-registry.ts",
|
||||
"packages/core/src/tools/shell.ts",
|
||||
"evals/automated-tool-use.eval.ts",
|
||||
"evals/shell-efficiency.eval.ts"
|
||||
],
|
||||
"evals": [
|
||||
"evals/automated-tool-use.eval.ts",
|
||||
"evals/shell-efficiency.eval.ts"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -47,6 +47,7 @@
|
||||
"posttest": "npm run build",
|
||||
"test:always_passing_evals": "vitest run --config evals/vitest.config.ts",
|
||||
"test:all_evals": "cross-env RUN_EVALS=1 vitest run --config evals/vitest.config.ts",
|
||||
"test:evals:related": "node scripts/run_eval_regression.js --related",
|
||||
"test:e2e": "cross-env VERBOSE=true KEEP_OUTPUT=true npm run test:integration:sandbox:none",
|
||||
"test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman",
|
||||
"test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none",
|
||||
|
||||
+167
-26
@@ -3,7 +3,23 @@
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
/**
|
||||
* @fileoverview Intelligence layer for detecting steering and behavior changes.
|
||||
*
|
||||
* This script identifies if code changes affect model steering (system prompts,
|
||||
* tool definitions, agent instructions) and maps them to relevant evaluation
|
||||
* suites. It supports both CI (GitHub Actions) and local development workflows.
|
||||
*
|
||||
* Detection Methods:
|
||||
* 1. Path-based: Monitors critical steering and tool directories.
|
||||
* 2. Signature-based: Scans diff content for core steering primitives
|
||||
* (e.g., ToolDefinition, inputSchema).
|
||||
* 3. Suite-aware: Uses evals/suites.json to identify related tests for surgical runs.
|
||||
*/
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import fs from 'node:fs';
|
||||
|
||||
const CORE_STEERING_PATHS = [
|
||||
'packages/core/src/prompts/',
|
||||
@@ -20,46 +36,132 @@ const STEERING_SIGNATURES = [
|
||||
"kind: 'local'",
|
||||
];
|
||||
|
||||
/**
 * Minimal glob matcher for the repository-relative path patterns used in
 * evals/suites.json. Intentionally tiny so the script needs no dependency.
 *
 * Supported syntax:
 *  - no wildcard: the pattern must equal the file path exactly;
 *  - `*`: any run of characters within a single path segment (never `/`);
 *  - `**`: any run of characters, including `/`;
 *  - `**` followed by `/`: zero or more whole directories, so `a/**` + `/b`
 *    style patterns match both `a/b` and `a/x/y/b`.
 *
 * Every other character is matched literally (regex metacharacters are
 * escaped).
 *
 * @param {string} file - Forward-slash separated path to test.
 * @param {string} pattern - Glob pattern to match against.
 * @returns {boolean} True when the whole path matches the pattern.
 */
function minimatch(file, pattern) {
  // Patterns without a wildcard are plain paths and must match exactly.
  if (!pattern.includes('*')) {
    return file === pattern;
  }

  // Translate the glob in a single pass. Doing it with chained replace()
  // calls is unsafe: a later replacement would rewrite the `*` inside the
  // `.*` emitted for `**`.
  let source = '';
  for (let i = 0; i < pattern.length; i++) {
    const char = pattern[i];

    if (char !== '*') {
      source += char.replace(/[.+?^${}()|[\]\\]/g, '\\$&');
      continue;
    }

    if (pattern[i + 1] !== '*') {
      // Single star stays inside one path segment.
      source += '[^/]*';
      continue;
    }

    i++; // Consume the second `*` of the globstar.
    if (pattern[i + 1] === '/') {
      i++; // Consume the slash so the directory part can be empty.
      source += '(?:.*/)?';
    } else {
      source += '.*';
    }
  }

  return new RegExp(`^${source}$`).test(file);
}
|
||||
|
||||
function main() {
|
||||
const targetBranch = process.env.GITHUB_BASE_REF || 'main';
|
||||
const verbose = process.argv.includes('--verbose');
|
||||
const steeringOnly = process.argv.includes('--steering-only');
|
||||
const isRelatedMode = process.argv.includes('--related');
|
||||
const isJsonMode = process.argv.includes('--json');
|
||||
|
||||
try {
|
||||
const remoteUrl = process.env.GITHUB_REPOSITORY
|
||||
? `https://github.com/${process.env.GITHUB_REPOSITORY}.git`
|
||||
: 'origin';
|
||||
|
||||
// Fetch target branch from the remote.
|
||||
execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
|
||||
stdio: 'ignore',
|
||||
});
|
||||
let changedFiles = [];
|
||||
const isCi = !!process.env.GITHUB_ACTIONS;
|
||||
|
||||
// Get changed files using the triple-dot syntax which correctly handles merge commits
|
||||
const head = process.env.PR_HEAD_SHA || 'HEAD';
|
||||
const changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
|
||||
encoding: 'utf-8',
|
||||
})
|
||||
.split('\n')
|
||||
.filter(Boolean);
|
||||
if (isCi) {
|
||||
try {
|
||||
// 1. Try fetching from remote (CI environment)
|
||||
execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
|
||||
stdio: 'ignore',
|
||||
});
|
||||
|
||||
// Get changed files using the triple-dot syntax which correctly handles merge commits
|
||||
const head = process.env.PR_HEAD_SHA || 'HEAD';
|
||||
changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
|
||||
encoding: 'utf-8',
|
||||
})
|
||||
.split('\n')
|
||||
.filter(Boolean);
|
||||
} catch (e) {
|
||||
if (verbose)
|
||||
process.stderr.write(
|
||||
`Warning: git fetch failed in CI: ${e.message}\n`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Local fallback or if CI fetch failed: Try diffing against target branch
|
||||
if (changedFiles.length === 0) {
|
||||
try {
|
||||
changedFiles = execSync(`git diff --name-only ${targetBranch}`, {
|
||||
encoding: 'utf-8',
|
||||
})
|
||||
.split('\n')
|
||||
.filter(Boolean);
|
||||
} catch {
|
||||
// 3. Last resort: Just diff against HEAD (uncommitted changes only)
|
||||
changedFiles = execSync('git diff --name-only HEAD', {
|
||||
encoding: 'utf-8',
|
||||
})
|
||||
.split('\n')
|
||||
.filter(Boolean);
|
||||
}
|
||||
|
||||
// Also include untracked files in local mode
|
||||
const untracked = execSync('git ls-files --others --exclude-standard', {
|
||||
encoding: 'utf-8',
|
||||
})
|
||||
.split('\n')
|
||||
.filter(Boolean);
|
||||
changedFiles = [...new Set([...changedFiles, ...untracked])];
|
||||
}
|
||||
|
||||
let detected = false;
|
||||
const reasons = [];
|
||||
const affectedSuites = new Set();
|
||||
const rationales = [];
|
||||
|
||||
// Load suites for --related mode
|
||||
let suitesConfig = null;
|
||||
if (isRelatedMode) {
|
||||
try {
|
||||
suitesConfig = JSON.parse(
|
||||
fs.readFileSync('evals/suites.json', 'utf-8'),
|
||||
);
|
||||
} catch {
|
||||
process.stderr.write(`Warning: Could not load evals/suites.json\n`);
|
||||
}
|
||||
}
|
||||
|
||||
// 1. Path-based detection
|
||||
for (const file of changedFiles) {
|
||||
if (CORE_STEERING_PATHS.some((prefix) => file.startsWith(prefix))) {
|
||||
detected = true;
|
||||
reasons.push(`Matched core steering path: ${file}`);
|
||||
if (!verbose) break;
|
||||
}
|
||||
if (
|
||||
!steeringOnly &&
|
||||
TEST_PATHS.some((prefix) => file.startsWith(prefix))
|
||||
TEST_PATHS.some((prefix) => file.startsWith(prefix)) &&
|
||||
file.endsWith('.eval.ts')
|
||||
) {
|
||||
detected = true;
|
||||
reasons.push(`Matched test path: ${file}`);
|
||||
if (!verbose) break;
|
||||
reasons.push(`Matched test file: ${file}`);
|
||||
}
|
||||
|
||||
// Related suite detection
|
||||
if (suitesConfig) {
|
||||
for (const [suiteName, suite] of Object.entries(suitesConfig)) {
|
||||
if (suiteName === 'allowedOverlaps' || !suite.patterns) continue;
|
||||
|
||||
if (suite.patterns.some((pattern) => minimatch(file, pattern))) {
|
||||
affectedSuites.add(suiteName);
|
||||
rationales.push(
|
||||
`Testing **${suiteName}** because **${file}** was modified.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,15 +172,30 @@ function main() {
|
||||
);
|
||||
if (coreChanges.length > 0) {
|
||||
// Get the actual diff content for core files
|
||||
const diff = execSync(
|
||||
`git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`,
|
||||
{ encoding: 'utf-8' },
|
||||
);
|
||||
// We need to be careful with the diff command depending on if we have FETCH_HEAD
|
||||
let diffCmd = '';
|
||||
try {
|
||||
const head = process.env.PR_HEAD_SHA || 'HEAD';
|
||||
diffCmd = `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`;
|
||||
execSync('git rev-parse FETCH_HEAD', { stdio: 'ignore' });
|
||||
} catch {
|
||||
diffCmd = `git diff -U0 ${targetBranch} -- packages/core/src/`;
|
||||
}
|
||||
|
||||
const diff = execSync(diffCmd, { encoding: 'utf-8' });
|
||||
for (const sig of STEERING_SIGNATURES) {
|
||||
if (diff.includes(sig)) {
|
||||
detected = true;
|
||||
reasons.push(`Matched steering signature in core: ${sig}`);
|
||||
if (!verbose) break;
|
||||
|
||||
// If we detected a steering signature, mark core_steering suite
|
||||
if (isRelatedMode) {
|
||||
affectedSuites.add('core_steering');
|
||||
rationales.push(
|
||||
`Testing **core_steering** because matched signature '${sig}' in core files.`,
|
||||
);
|
||||
}
|
||||
if (!verbose && !isRelatedMode) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -89,14 +206,38 @@ function main() {
|
||||
reasons.forEach((r) => process.stderr.write(` - ${r}\n`));
|
||||
}
|
||||
|
||||
process.stdout.write(detected ? 'true' : 'false');
|
||||
if (isJsonMode) {
|
||||
process.stdout.write(
|
||||
JSON.stringify(
|
||||
{
|
||||
detected,
|
||||
reasons,
|
||||
affectedSuites: Array.from(affectedSuites),
|
||||
rationales,
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
);
|
||||
} else {
|
||||
process.stdout.write(detected ? 'true' : 'false');
|
||||
}
|
||||
} catch (error) {
|
||||
// If anything fails (e.g., no git history), run evals/guidance to be safe
|
||||
process.stderr.write(
|
||||
'Warning: Failed to determine if changes occurred. Defaulting to true.\n',
|
||||
);
|
||||
if (isJsonMode) {
|
||||
process.stdout.write(
|
||||
JSON.stringify({
|
||||
detected: true,
|
||||
reasons: [`Error during detection: ${error.message}`],
|
||||
affectedSuites: ['core_steering'],
|
||||
rationales: [
|
||||
'Error during detection: running all stable evals for safety.',
|
||||
],
|
||||
}),
|
||||
);
|
||||
} else {
|
||||
process.stdout.write('true');
|
||||
}
|
||||
process.stderr.write(String(error) + '\n');
|
||||
process.stdout.write('true');
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
* to ensure high-signal validation and minimize noise.
|
||||
*/
|
||||
|
||||
import fs from 'node:fs';
|
||||
import { fetchNightlyHistory, escapeRegex } from './eval_utils.js';
|
||||
|
||||
const LOOKBACK_COUNT = 6;
|
||||
@@ -25,11 +26,24 @@ const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18)
|
||||
*/
|
||||
function main() {
|
||||
const targetModel = process.argv[2];
|
||||
if (!targetModel) {
|
||||
if (!targetModel || targetModel.startsWith('--')) {
|
||||
console.error('❌ Error: No target model specified.');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Parse --suites argument
|
||||
const suitesArgIndex = process.argv.indexOf('--suites');
|
||||
let requestedSuites = null;
|
||||
if (suitesArgIndex !== -1 && process.argv[suitesArgIndex + 1]) {
|
||||
requestedSuites = process.argv[suitesArgIndex + 1]
|
||||
.split(',')
|
||||
.map((s) => s.trim());
|
||||
}
|
||||
|
||||
console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
|
||||
if (requestedSuites) {
|
||||
console.error(`📂 Filtering by suites: ${requestedSuites.join(', ')}`);
|
||||
}
|
||||
|
||||
const history = fetchNightlyHistory(LOOKBACK_COUNT);
|
||||
if (history.length === 0) {
|
||||
@@ -37,6 +51,32 @@ function main() {
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Load suites configuration
|
||||
let allowedFiles = null;
|
||||
let runAllStable = false;
|
||||
if (requestedSuites) {
|
||||
try {
|
||||
const suitesConfig = JSON.parse(
|
||||
fs.readFileSync('evals/suites.json', 'utf-8'),
|
||||
);
|
||||
allowedFiles = new Set();
|
||||
for (const suiteName of requestedSuites) {
|
||||
const suite = suitesConfig[suiteName];
|
||||
if (suite) {
|
||||
if (suite.evals.includes('ALL_ALWAYS_PASSING')) {
|
||||
runAllStable = true;
|
||||
} else {
|
||||
suite.evals.forEach((file) => allowedFiles.add(file));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(
|
||||
`⚠️ Warning: Could not load evals/suites.json or match suites: ${e.message}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Aggregate results for the target model across all history
|
||||
const testHistories = {}; // { [testName]: { totalPassed: 0, totalRuns: 0, dailyRates: [], file: string } }
|
||||
|
||||
@@ -83,11 +123,28 @@ function main() {
|
||||
const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
|
||||
|
||||
if (isDailyStable && isAggregateHighSignal) {
|
||||
trustworthyTests.push(testName);
|
||||
if (info.file) {
|
||||
const match = info.file.match(/evals\/.*\.eval\.ts/);
|
||||
if (match) {
|
||||
trustworthyFiles.add(match[0]);
|
||||
// Suite filtering logic
|
||||
let isFileAllowed = true;
|
||||
if (requestedSuites && !runAllStable) {
|
||||
if (info.file) {
|
||||
const match = info.file.match(/evals\/.*\.eval\.ts/);
|
||||
if (match && !allowedFiles.has(match[0])) {
|
||||
isFileAllowed = false;
|
||||
} else if (!match) {
|
||||
isFileAllowed = false;
|
||||
}
|
||||
} else {
|
||||
isFileAllowed = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (isFileAllowed) {
|
||||
trustworthyTests.push(testName);
|
||||
if (info.file) {
|
||||
const match = info.file.match(/evals\/.*\.eval\.ts/);
|
||||
if (match) {
|
||||
trustworthyFiles.add(match[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -99,10 +156,14 @@ function main() {
|
||||
`✅ Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`,
|
||||
);
|
||||
trustworthyTests.sort().forEach((name) => console.error(` - ${name}`));
|
||||
console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
|
||||
console.error(
|
||||
`🆕 Ignored ${newTests.length} tests with insufficient history.`,
|
||||
);
|
||||
if (volatileTests.length > 0) {
|
||||
console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
|
||||
}
|
||||
if (newTests.length > 0) {
|
||||
console.error(
|
||||
`🆕 Ignored ${newTests.length} tests with insufficient history.`,
|
||||
);
|
||||
}
|
||||
|
||||
// Output the list of names as a regex-friendly pattern for vitest -t
|
||||
const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|');
|
||||
|
||||
@@ -500,6 +500,9 @@ function main() {
|
||||
if (args.includes('--check-github-actions-pinning')) {
|
||||
runGithubActionsPinningLinter();
|
||||
}
|
||||
if (args.includes('--eval-suites')) {
|
||||
runEvalSuiteLinter();
|
||||
}
|
||||
|
||||
if (args.length === 0) {
|
||||
setupLinters();
|
||||
@@ -511,8 +514,18 @@ function main() {
|
||||
runSensitiveKeywordLinter();
|
||||
runTSConfigLinter();
|
||||
runGithubActionsPinningLinter();
|
||||
runEvalSuiteLinter();
|
||||
console.log('\nAll linting checks passed!');
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs the eval suite validator script as a lint step.
 *
 * The validator is executed as a child process that shares this process's
 * stdio, so whatever it prints appears directly in the lint output. If the
 * command fails, the whole lint run is terminated with exit code 1.
 */
export function runEvalSuiteLinter() {
  const validatorOptions = { stdio: 'inherit' };

  console.log('\nRunning eval suite linter...');

  try {
    execSync('node scripts/validate_eval_suites.js', validatorOptions);
  } catch {
    // execSync throws when the command fails; abort the lint run.
    process.exit(1);
  }
}
|
||||
|
||||
main();
|
||||
|
||||
@@ -22,22 +22,62 @@ import fs from 'node:fs';
|
||||
async function main() {
|
||||
const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview';
|
||||
const models = modelList.split(',').map((m) => m.trim());
|
||||
const isRelatedMode = process.argv.includes('--related');
|
||||
|
||||
let combinedReport = '';
|
||||
let hasRegression = false;
|
||||
let detectionRationale = '';
|
||||
let affectedSuitesStr = '';
|
||||
|
||||
console.log(
|
||||
`🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
|
||||
);
|
||||
|
||||
if (isRelatedMode) {
|
||||
console.log('🔍 Identifying related evaluations based on changes...');
|
||||
try {
|
||||
const detectionOutput = execSync(
|
||||
`node scripts/changed_prompt.js --related --json`,
|
||||
{ encoding: 'utf-8', stdio: ['inherit', 'pipe', 'inherit'] },
|
||||
).trim();
|
||||
const detection = JSON.parse(detectionOutput);
|
||||
|
||||
if (detection.affectedSuites && detection.affectedSuites.length > 0) {
|
||||
affectedSuitesStr = detection.affectedSuites.join(',');
|
||||
detectionRationale = '### 🧪 Related Evaluation Rationale\n\n';
|
||||
detection.rationales.forEach((r) => {
|
||||
detectionRationale += `- ${r}\n`;
|
||||
});
|
||||
detectionRationale +=
|
||||
'\n_Something missing? [Update evals/suites.json](evals/README.md#related-testing-with-related) to adjust detection logic._\n\n---\n\n';
|
||||
} else if (!detection.detected) {
|
||||
console.log('✅ No related changes detected. Skipping evaluations.');
|
||||
process.exit(0);
|
||||
} else {
|
||||
console.log(
|
||||
'⚠️ Changes detected but no specific suites matched. Running full stable suite for safety.',
|
||||
);
|
||||
detectionRationale =
|
||||
'### 🧪 Related Evaluation Rationale\n\n- No specific suites matched. Running full stable suite for safety.\n\n---\n\n';
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(`❌ Error during suite detection: ${e.message}`);
|
||||
detectionRationale =
|
||||
'### 🧪 Related Evaluation Rationale\n\n- Error during suite detection. Running full stable suite for safety.\n\n---\n\n';
|
||||
}
|
||||
}
|
||||
|
||||
for (const model of models) {
|
||||
console.log(`\n--- Processing Model: ${model} ---`);
|
||||
|
||||
try {
|
||||
// 1. Identify Trustworthy Evals
|
||||
console.log(`🔍 Identifying trustworthy tests for ${model}...`);
|
||||
const suitesFlag = affectedSuitesStr
|
||||
? `--suites ${affectedSuitesStr}`
|
||||
: '';
|
||||
const output = execSync(
|
||||
`node scripts/get_trustworthy_evals.js "${model}"`,
|
||||
`node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`,
|
||||
{
|
||||
encoding: 'utf-8',
|
||||
stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr
|
||||
@@ -83,7 +123,8 @@ async function main() {
|
||||
|
||||
// Always save the combined report to a file so the workflow can capture it cleanly
|
||||
if (combinedReport) {
|
||||
fs.writeFileSync('eval_regression_report.md', combinedReport);
|
||||
const finalReport = detectionRationale + combinedReport;
|
||||
fs.writeFileSync('eval_regression_report.md', finalReport);
|
||||
console.log(
|
||||
'\n📊 Final Markdown report saved to eval_regression_report.md',
|
||||
);
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
|
||||
const SUITES_PATH = 'evals/suites.json';
|
||||
const EVALS_DIR = 'evals';
|
||||
|
||||
/**
 * Validates the suite mapping in evals/suites.json against the eval files on
 * disk.
 *
 * Problems reported:
 *  - a suite references an eval file that does not exist;
 *  - an eval file is missing from its own suite's `patterns` array;
 *  - an eval file is listed in more than one suite without being named in
 *    `allowedOverlaps`;
 *  - a `*.eval.ts` file in the evals directory is not mapped to any suite.
 *
 * Exits the process with code 1 when the config is missing or unparsable, or
 * when any problem is found; otherwise logs a success message.
 */
function main() {
  if (!fs.existsSync(SUITES_PATH)) {
    console.error(`❌ Error: ${SUITES_PATH} not found.`);
    process.exit(1);
  }

  let suitesConfig;
  try {
    suitesConfig = JSON.parse(fs.readFileSync(SUITES_PATH, 'utf-8'));
  } catch (error) {
    // Report malformed JSON like any other validation failure instead of
    // letting an uncaught exception escape.
    console.error(`❌ Error: Could not parse ${SUITES_PATH}: ${error.message}`);
    process.exit(1);
  }

  const allowedOverlaps = new Set(suitesConfig.allowedOverlaps || []);

  // suites.json always uses forward slashes, so build the on-disk paths with
  // the POSIX joiner; the platform joiner would produce backslashes on
  // Windows and make every eval look orphaned.
  const evalFilesOnDisk = fs
    .readdirSync(EVALS_DIR)
    .filter((f) => f.endsWith('.eval.ts'))
    .map((f) => path.posix.join(EVALS_DIR, f));

  /** @type {Map<string, string[]>} eval file -> names of suites that own it */
  const evalToSuiteMap = new Map();
  const errors = [];

  // 1. Map evals to suites and check for overlaps/trigger-coverage
  for (const [suiteName, suite] of Object.entries(suitesConfig)) {
    if (suiteName === 'allowedOverlaps' || !suite.evals) continue;

    for (const evalFile of suite.evals) {
      // Sentinel meaning "run every stable eval"; it is not a file path.
      if (evalFile === 'ALL_ALWAYS_PASSING') continue;

      if (!fs.existsSync(evalFile)) {
        errors.push(
          `Suite **${suiteName}** references non-existent file: **${evalFile}**`,
        );
        continue;
      }

      // Check if the eval file itself is in the suite's trigger patterns
      if (!suite.patterns || !suite.patterns.includes(evalFile)) {
        errors.push(
          `Trigger coverage missing: **${evalFile}** is in the **${suiteName}** suite but is missing from its **patterns** array. (Changes to the test won't trigger itself correctly).`,
        );
      }

      const owners = evalToSuiteMap.get(evalFile);
      if (owners && !allowedOverlaps.has(evalFile)) {
        errors.push(
          `Overlap detected: **${evalFile}** is present in both **${owners.join(',')}** and **${suiteName}** suites.`,
        );
      } else {
        evalToSuiteMap.set(evalFile, [...(owners ?? []), suiteName]);
      }
    }
  }

  // 2. Check for orphaned evals (on disk but not in suites.json)
  for (const diskFile of evalFilesOnDisk) {
    if (!evalToSuiteMap.has(diskFile)) {
      errors.push(
        `Orphaned eval detected: **${diskFile}** is not mapped to any suite in ${SUITES_PATH}.`,
      );
    }
  }

  if (errors.length > 0) {
    console.error('\n❌ Eval Suite Validation Failed:');
    errors.forEach((err) => console.error(`  - ${err}`));

    const hasOverlap = errors.some((err) => err.includes('Overlap detected'));
    if (hasOverlap) {
      console.error(
        `\n💡 Tip: If this overlap is intentional, add the file path to the 'allowedOverlaps' list in ${SUITES_PATH}.`,
      );
    } else {
      console.error(`\n💡 Tip: Update ${SUITES_PATH} to resolve these issues.`);
    }
    process.exit(1);
  }

  console.log(
    '✅ Eval Suite Validation Passed: All files mapped and no overlaps found.',
  );
}
|
||||
|
||||
main();
|
||||
Reference in New Issue
Block a user