feat(evals): implement related evaluation system for targeted testing

This commit is contained in:
Alisa Novikova
2026-04-07 19:35:12 -07:00
parent 06fcdc231c
commit db8910c39b
9 changed files with 610 additions and 43 deletions
+4 -3
View File
@@ -1,7 +1,7 @@
name: 'Evals: PR Evaluation & Regression'
on:
pull_request_target:
pull_request:
types: ['opened', 'synchronize', 'reopened', 'ready_for_review']
paths:
- 'packages/core/src/prompts/**'
@@ -153,9 +153,10 @@ jobs:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
MODEL_LIST: '${{ env.MODEL_LIST }}'
GITHUB_BASE_REF: '${{ github.base_ref }}'
run: |
# Run the regression check loop. The script saves the report to a file.
node scripts/run_eval_regression.js
# Run the related regression check loop.
node scripts/run_eval_regression.js --related
# Use the generated report file if it exists
if [[ -f eval_regression_report.md ]]; then
+42 -2
View File
@@ -227,10 +227,18 @@ in Pull Requests. These can also be run locally for debugging.
### Running Regression Checks Locally
You can simulate the PR regression check locally to verify your changes before
pushing:
pushing. To optimize your workflow and reduce LLM costs, use the **`--related`**
flag to run only the tests relevant to your specific changes:
```bash
# Run the full regression loop for a specific model
# Run the targeted regression loop for your changes
MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js --related
```
To run the full regression loop for a specific model (all stable tests):
```bash
# Run everything
MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js
```
@@ -244,6 +252,38 @@ OUTPUT=$(node scripts/get_trustworthy_evals.js "gemini-3-flash-preview")
node scripts/run_regression_check.js "gemini-3-flash-preview" "$OUTPUT"
```
### Related Testing with `--related`
The project uses a "Smart Eval" system to identify which behavioral evaluations
are affected by your code changes. This is controlled by the `--related` flag
available in `scripts/run_eval_regression.js`.
#### How it Works
1. **Change Detection**: The system uses Git to identify modified files in your
branch compared to the target branch (`GITHUB_BASE_REF` in CI, `main` otherwise).
2. **Suite Mapping**: Modified files are matched against patterns in
`evals/suites.json`. This file maps core files (e.g., `grep.ts`) to their
corresponding evaluations.
3. **Targeted Execution**: Only the evaluations belonging to the affected
suites are executed.
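4. **Global Fallback**: If changes touch core system prompts, or a steering-related
change is detected that matches no suite, the system automatically falls back to
running the full stable evaluation suite for safety. If nothing relevant is
detected at all, the evaluations are skipped.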
#### Updating Detection Logic
If you add a new tool or functional area, you should update `evals/suites.json`
to ensure your new evaluations are triggered correctly.
```json
"my_new_tool": {
"description": "Description of the tool",
"patterns": ["packages/core/src/tools/my-new-tool.ts"],
"evals": ["evals/my_new_tool.eval.ts"]
}
```
### The Regression Quality Bar
Because LLMs are non-deterministic, the PR regression check uses a high-signal
+171
View File
@@ -0,0 +1,171 @@
{
"allowedOverlaps": [],
"grep": {
"description": "Grep search functionality",
"patterns": [
"packages/core/src/tools/grep.ts",
"packages/core/src/tools/ripGrep.ts",
"packages/core/src/tools/grep-utils.ts",
"evals/grep_search_functionality.eval.ts"
],
"evals": ["evals/grep_search_functionality.eval.ts"]
},
"memory": {
"description": "Memory tool and fact persistence",
"patterns": [
"packages/core/src/tools/memoryTool.ts",
"packages/core/src/persistence/storage.ts",
"evals/save_memory.eval.ts",
"evals/hierarchical_memory.eval.ts"
],
"evals": ["evals/save_memory.eval.ts", "evals/hierarchical_memory.eval.ts"]
},
"read_file": {
"description": "File reading and content extraction",
"patterns": [
"packages/core/src/tools/read-file.ts",
"packages/core/src/tools/read-many-files.ts",
"evals/frugalReads.eval.ts"
],
"evals": ["evals/frugalReads.eval.ts"]
},
"glob": {
"description": "File discovery and globbing",
"patterns": [
"packages/core/src/tools/glob.ts",
"evals/frugalSearch.eval.ts"
],
"evals": ["evals/frugalSearch.eval.ts"]
},
"tracker": {
"description": "Task and progress tracking",
"patterns": [
"packages/core/src/tools/trackerTools.ts",
"evals/tracker.eval.ts"
],
"evals": ["evals/tracker.eval.ts"]
},
"ask_user": {
"description": "Interactive user confirmation and input",
"patterns": [
"packages/core/src/tools/ask-user.ts",
"evals/ask_user.eval.ts"
],
"evals": ["evals/ask_user.eval.ts"]
},
"plan_mode": {
"description": "Plan Mode orchestration",
"patterns": [
"packages/core/src/tools/enter-plan-mode.ts",
"packages/core/src/tools/exit-plan-mode.ts",
"packages/core/src/agents/planAgent.ts",
"evals/plan_mode.eval.ts"
],
"evals": ["evals/plan_mode.eval.ts"]
},
"git": {
"description": "Git repository operations",
"patterns": [
"packages/core/src/utils/git.ts",
"packages/core/src/tools/shell.ts",
"evals/gitRepo.eval.ts",
"evals/unsafe-cloning.eval.ts"
],
"evals": ["evals/gitRepo.eval.ts", "evals/unsafe-cloning.eval.ts"]
},
"agents": {
"description": "Agent delegation and help",
"patterns": [
"packages/core/src/agents/**",
"packages/core/src/routing/**",
"evals/subagents.eval.ts",
"evals/generalist_agent.eval.ts",
"evals/generalist_delegation.eval.ts",
"evals/cli_help_delegation.eval.ts"
],
"evals": [
"evals/subagents.eval.ts",
"evals/generalist_agent.eval.ts",
"evals/generalist_delegation.eval.ts",
"evals/cli_help_delegation.eval.ts"
]
},
"background": {
"description": "Background process management",
"patterns": [
"packages/core/src/tools/shellBackgroundTools.ts",
"evals/background_processes.eval.ts"
],
"evals": ["evals/background_processes.eval.ts"]
},
"core_steering": {
"description": "System prompts and core model steering",
"patterns": [
"packages/core/src/prompts/**",
"evals/answer-vs-act.eval.ts",
"evals/model_steering.eval.ts",
"evals/redundant_casts.eval.ts"
],
"evals": [
"ALL_ALWAYS_PASSING",
"evals/answer-vs-act.eval.ts",
"evals/model_steering.eval.ts",
"evals/redundant_casts.eval.ts"
]
},
"edit_fidelity": {
"description": "Code edit accuracy and location checks",
"patterns": [
"packages/core/src/tools/edit.ts",
"packages/core/src/tools/write-file.ts",
"evals/edit-locations-eval.eval.ts",
"evals/validation_fidelity.eval.ts",
"evals/validation_fidelity_pre_existing_errors.eval.ts"
],
"evals": [
"evals/edit-locations-eval.eval.ts",
"evals/validation_fidelity.eval.ts",
"evals/validation_fidelity_pre_existing_errors.eval.ts"
]
},
"reliability": {
"description": "Core safety and sandbox reliability",
"patterns": [
"packages/core/src/safety/**",
"packages/core/src/fallback/**",
"evals/concurrency-safety.eval.ts",
"evals/sandbox_recovery.eval.ts",
"evals/interactive-hang.eval.ts",
"evals/tool_output_masking.eval.ts"
],
"evals": [
"evals/concurrency-safety.eval.ts",
"evals/sandbox_recovery.eval.ts",
"evals/interactive-hang.eval.ts",
"evals/tool_output_masking.eval.ts"
]
},
"orchestration": {
"description": "Agent task lifecycle and topic management",
"patterns": [
"packages/core/src/tools/update-topic.ts",
"packages/core/src/tools/complete-task.ts",
"evals/update_topic.eval.ts"
],
"evals": ["evals/update_topic.eval.ts"]
},
"general_tools": {
"description": "General tool usage and efficiency",
"patterns": [
"packages/core/src/tools/tools.ts",
"packages/core/src/tools/tool-registry.ts",
"packages/core/src/tools/shell.ts",
"evals/automated-tool-use.eval.ts",
"evals/shell-efficiency.eval.ts"
],
"evals": [
"evals/automated-tool-use.eval.ts",
"evals/shell-efficiency.eval.ts"
]
}
}
+1
View File
@@ -47,6 +47,7 @@
"posttest": "npm run build",
"test:always_passing_evals": "vitest run --config evals/vitest.config.ts",
"test:all_evals": "cross-env RUN_EVALS=1 vitest run --config evals/vitest.config.ts",
"test:evals:related": "node scripts/run_eval_regression.js --related",
"test:e2e": "cross-env VERBOSE=true KEEP_OUTPUT=true npm run test:integration:sandbox:none",
"test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman",
"test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none",
+167 -26
View File
@@ -3,7 +3,23 @@
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @fileoverview Intelligence layer for detecting steering and behavior changes.
*
* This script identifies if code changes affect model steering (system prompts,
* tool definitions, agent instructions) and maps them to relevant evaluation
* suites. It supports both CI (GitHub Actions) and local development workflows.
*
* Detection Methods:
* 1. Path-based: Monitors critical steering and tool directories.
* 2. Signature-based: Scans diff content for core steering primitives
* (e.g., ToolDefinition, inputSchema).
* 3. Suite-aware: Uses evals/suites.json to identify related tests for surgical runs.
*/
import { execSync } from 'node:child_process';
import fs from 'node:fs';
const CORE_STEERING_PATHS = [
'packages/core/src/prompts/',
@@ -20,46 +36,132 @@ const STEERING_SIGNATURES = [
"kind: 'local'",
];
/**
 * Minimal glob matcher for repo-relative, forward-slash file paths.
 *
 * Supported syntax:
 *  - `*`   matches any run of characters within a single path segment.
 *  - `**`  matches any run of characters, including `/`.
 *  - `**` followed by `/` matches zero or more whole directories.
 *  - Anything else is matched literally.
 *
 * @param {string} file Path to test, e.g. `packages/core/src/tools/grep.ts`.
 * @param {string} pattern Literal path or glob pattern.
 * @returns {boolean} True when `file` matches `pattern` in its entirety.
 */
function minimatch(file, pattern) {
  // Fast path: patterns without wildcards are exact path comparisons.
  if (!pattern.includes('*')) {
    return file === pattern;
  }

  // Translate in a single pass so that the output of one substitution
  // (e.g. the `*` inside `.*`) can never be rewritten by a later one, and
  // escape every regex metacharacter rather than only `.`.
  const source = pattern.replace(
    /\*\*\/|\*\*|\*|[.+?^${}()|[\]\\]/g,
    (token) => {
      switch (token) {
        case '**/':
          return '(?:.*/)?';
        case '**':
          return '.*';
        case '*':
          return '[^/]*';
        default:
          return `\\${token}`;
      }
    },
  );

  // Anchored on both ends: `dir/**` keeps its slash, so it matches files
  // under `dir/` but not siblings that merely share the prefix (`dirX/...`).
  return new RegExp(`^${source}$`).test(file);
}
function main() {
const targetBranch = process.env.GITHUB_BASE_REF || 'main';
const verbose = process.argv.includes('--verbose');
const steeringOnly = process.argv.includes('--steering-only');
const isRelatedMode = process.argv.includes('--related');
const isJsonMode = process.argv.includes('--json');
try {
const remoteUrl = process.env.GITHUB_REPOSITORY
? `https://github.com/${process.env.GITHUB_REPOSITORY}.git`
: 'origin';
// Fetch target branch from the remote.
execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
stdio: 'ignore',
});
let changedFiles = [];
const isCi = !!process.env.GITHUB_ACTIONS;
// Get changed files using the triple-dot syntax which correctly handles merge commits
const head = process.env.PR_HEAD_SHA || 'HEAD';
const changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
if (isCi) {
try {
// 1. Try fetching from remote (CI environment)
execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
stdio: 'ignore',
});
// Get changed files using the triple-dot syntax which correctly handles merge commits
const head = process.env.PR_HEAD_SHA || 'HEAD';
changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
} catch (e) {
if (verbose)
process.stderr.write(
`Warning: git fetch failed in CI: ${e.message}\n`,
);
}
}
// 2. Local fallback or if CI fetch failed: Try diffing against target branch
if (changedFiles.length === 0) {
try {
changedFiles = execSync(`git diff --name-only ${targetBranch}`, {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
} catch {
// 3. Last resort: Just diff against HEAD (uncommitted changes only)
changedFiles = execSync('git diff --name-only HEAD', {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
}
// Also include untracked files in local mode
const untracked = execSync('git ls-files --others --exclude-standard', {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
changedFiles = [...new Set([...changedFiles, ...untracked])];
}
let detected = false;
const reasons = [];
const affectedSuites = new Set();
const rationales = [];
// Load suites for --related mode
let suitesConfig = null;
if (isRelatedMode) {
try {
suitesConfig = JSON.parse(
fs.readFileSync('evals/suites.json', 'utf-8'),
);
} catch {
process.stderr.write(`Warning: Could not load evals/suites.json\n`);
}
}
// 1. Path-based detection
for (const file of changedFiles) {
if (CORE_STEERING_PATHS.some((prefix) => file.startsWith(prefix))) {
detected = true;
reasons.push(`Matched core steering path: ${file}`);
if (!verbose) break;
}
if (
!steeringOnly &&
TEST_PATHS.some((prefix) => file.startsWith(prefix))
TEST_PATHS.some((prefix) => file.startsWith(prefix)) &&
file.endsWith('.eval.ts')
) {
detected = true;
reasons.push(`Matched test path: ${file}`);
if (!verbose) break;
reasons.push(`Matched test file: ${file}`);
}
// Related suite detection
if (suitesConfig) {
for (const [suiteName, suite] of Object.entries(suitesConfig)) {
if (suiteName === 'allowedOverlaps' || !suite.patterns) continue;
if (suite.patterns.some((pattern) => minimatch(file, pattern))) {
affectedSuites.add(suiteName);
rationales.push(
`Testing **${suiteName}** because **${file}** was modified.`,
);
}
}
}
}
@@ -70,15 +172,30 @@ function main() {
);
if (coreChanges.length > 0) {
// Get the actual diff content for core files
const diff = execSync(
`git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`,
{ encoding: 'utf-8' },
);
// We need to be careful with the diff command depending on if we have FETCH_HEAD
let diffCmd = '';
try {
const head = process.env.PR_HEAD_SHA || 'HEAD';
diffCmd = `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`;
execSync('git rev-parse FETCH_HEAD', { stdio: 'ignore' });
} catch {
diffCmd = `git diff -U0 ${targetBranch} -- packages/core/src/`;
}
const diff = execSync(diffCmd, { encoding: 'utf-8' });
for (const sig of STEERING_SIGNATURES) {
if (diff.includes(sig)) {
detected = true;
reasons.push(`Matched steering signature in core: ${sig}`);
if (!verbose) break;
// If we detected a steering signature, mark core_steering suite
if (isRelatedMode) {
affectedSuites.add('core_steering');
rationales.push(
`Testing **core_steering** because matched signature '${sig}' in core files.`,
);
}
if (!verbose && !isRelatedMode) break;
}
}
}
@@ -89,14 +206,38 @@ function main() {
reasons.forEach((r) => process.stderr.write(` - ${r}\n`));
}
process.stdout.write(detected ? 'true' : 'false');
if (isJsonMode) {
process.stdout.write(
JSON.stringify(
{
detected,
reasons,
affectedSuites: Array.from(affectedSuites),
rationales,
},
null,
2,
),
);
} else {
process.stdout.write(detected ? 'true' : 'false');
}
} catch (error) {
// If anything fails (e.g., no git history), run evals/guidance to be safe
process.stderr.write(
'Warning: Failed to determine if changes occurred. Defaulting to true.\n',
);
if (isJsonMode) {
process.stdout.write(
JSON.stringify({
detected: true,
reasons: [`Error during detection: ${error.message}`],
affectedSuites: ['core_steering'],
rationales: [
'Error during detection: running all stable evals for safety.',
],
}),
);
} else {
process.stdout.write('true');
}
process.stderr.write(String(error) + '\n');
process.stdout.write('true');
}
}
+71 -10
View File
@@ -13,6 +13,7 @@
* to ensure high-signal validation and minimize noise.
*/
import fs from 'node:fs';
import { fetchNightlyHistory, escapeRegex } from './eval_utils.js';
const LOOKBACK_COUNT = 6;
@@ -25,11 +26,24 @@ const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18)
*/
function main() {
const targetModel = process.argv[2];
if (!targetModel) {
if (!targetModel || targetModel.startsWith('--')) {
console.error('❌ Error: No target model specified.');
process.exit(1);
}
// Parse --suites argument
const suitesArgIndex = process.argv.indexOf('--suites');
let requestedSuites = null;
if (suitesArgIndex !== -1 && process.argv[suitesArgIndex + 1]) {
requestedSuites = process.argv[suitesArgIndex + 1]
.split(',')
.map((s) => s.trim());
}
console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
if (requestedSuites) {
console.error(`📂 Filtering by suites: ${requestedSuites.join(', ')}`);
}
const history = fetchNightlyHistory(LOOKBACK_COUNT);
if (history.length === 0) {
@@ -37,6 +51,32 @@ function main() {
process.exit(1);
}
// Load suites configuration
let allowedFiles = null;
let runAllStable = false;
if (requestedSuites) {
try {
const suitesConfig = JSON.parse(
fs.readFileSync('evals/suites.json', 'utf-8'),
);
allowedFiles = new Set();
for (const suiteName of requestedSuites) {
const suite = suitesConfig[suiteName];
if (suite) {
if (suite.evals.includes('ALL_ALWAYS_PASSING')) {
runAllStable = true;
} else {
suite.evals.forEach((file) => allowedFiles.add(file));
}
}
}
} catch (e) {
console.error(
`⚠️ Warning: Could not load evals/suites.json or match suites: ${e.message}`,
);
}
}
// Aggregate results for the target model across all history
const testHistories = {}; // { [testName]: { totalPassed: 0, totalRuns: 0, dailyRates: [], file: string } }
@@ -83,11 +123,28 @@ function main() {
const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
if (isDailyStable && isAggregateHighSignal) {
trustworthyTests.push(testName);
if (info.file) {
const match = info.file.match(/evals\/.*\.eval\.ts/);
if (match) {
trustworthyFiles.add(match[0]);
// Suite filtering logic
let isFileAllowed = true;
if (requestedSuites && !runAllStable) {
if (info.file) {
const match = info.file.match(/evals\/.*\.eval\.ts/);
if (match && !allowedFiles.has(match[0])) {
isFileAllowed = false;
} else if (!match) {
isFileAllowed = false;
}
} else {
isFileAllowed = false;
}
}
if (isFileAllowed) {
trustworthyTests.push(testName);
if (info.file) {
const match = info.file.match(/evals\/.*\.eval\.ts/);
if (match) {
trustworthyFiles.add(match[0]);
}
}
}
} else {
@@ -99,10 +156,14 @@ function main() {
`✅ Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`,
);
trustworthyTests.sort().forEach((name) => console.error(` - ${name}`));
console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
console.error(
`🆕 Ignored ${newTests.length} tests with insufficient history.`,
);
if (volatileTests.length > 0) {
console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
}
if (newTests.length > 0) {
console.error(
`🆕 Ignored ${newTests.length} tests with insufficient history.`,
);
}
// Output the list of names as a regex-friendly pattern for vitest -t
const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|');
+13
View File
@@ -500,6 +500,9 @@ function main() {
if (args.includes('--check-github-actions-pinning')) {
runGithubActionsPinningLinter();
}
if (args.includes('--eval-suites')) {
runEvalSuiteLinter();
}
if (args.length === 0) {
setupLinters();
@@ -511,8 +514,18 @@ function main() {
runSensitiveKeywordLinter();
runTSConfigLinter();
runGithubActionsPinningLinter();
runEvalSuiteLinter();
console.log('\nAll linting checks passed!');
}
}
/**
 * Runs the eval suite validator script as a lint step.
 *
 * The child process inherits this process's stdio, so its output is shown
 * directly. If the command fails, the current process exits with status 1.
 */
export function runEvalSuiteLinter() {
  console.log('\nRunning eval suite linter...');

  const validatorCommand = 'node scripts/validate_eval_suites.js';
  let succeeded = true;
  try {
    execSync(validatorCommand, { stdio: 'inherit' });
  } catch {
    succeeded = false;
  }

  if (!succeeded) {
    process.exit(1);
  }
}
main();
+43 -2
View File
@@ -22,22 +22,62 @@ import fs from 'node:fs';
async function main() {
const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview';
const models = modelList.split(',').map((m) => m.trim());
const isRelatedMode = process.argv.includes('--related');
let combinedReport = '';
let hasRegression = false;
let detectionRationale = '';
let affectedSuitesStr = '';
console.log(
`🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
);
if (isRelatedMode) {
console.log('🔍 Identifying related evaluations based on changes...');
try {
const detectionOutput = execSync(
`node scripts/changed_prompt.js --related --json`,
{ encoding: 'utf-8', stdio: ['inherit', 'pipe', 'inherit'] },
).trim();
const detection = JSON.parse(detectionOutput);
if (detection.affectedSuites && detection.affectedSuites.length > 0) {
affectedSuitesStr = detection.affectedSuites.join(',');
detectionRationale = '### 🧪 Related Evaluation Rationale\n\n';
detection.rationales.forEach((r) => {
detectionRationale += `- ${r}\n`;
});
detectionRationale +=
'\n_Something missing? [Update evals/suites.json](evals/README.md#related-testing-with-related) to adjust detection logic._\n\n---\n\n';
} else if (!detection.detected) {
console.log('✅ No related changes detected. Skipping evaluations.');
process.exit(0);
} else {
console.log(
'⚠️ Changes detected but no specific suites matched. Running full stable suite for safety.',
);
detectionRationale =
'### 🧪 Related Evaluation Rationale\n\n- No specific suites matched. Running full stable suite for safety.\n\n---\n\n';
}
} catch (e) {
console.error(`❌ Error during suite detection: ${e.message}`);
detectionRationale =
'### 🧪 Related Evaluation Rationale\n\n- Error during suite detection. Running full stable suite for safety.\n\n---\n\n';
}
}
for (const model of models) {
console.log(`\n--- Processing Model: ${model} ---`);
try {
// 1. Identify Trustworthy Evals
console.log(`🔍 Identifying trustworthy tests for ${model}...`);
const suitesFlag = affectedSuitesStr
? `--suites ${affectedSuitesStr}`
: '';
const output = execSync(
`node scripts/get_trustworthy_evals.js "${model}"`,
`node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`,
{
encoding: 'utf-8',
stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr
@@ -83,7 +123,8 @@ async function main() {
// Always save the combined report to a file so the workflow can capture it cleanly
if (combinedReport) {
fs.writeFileSync('eval_regression_report.md', combinedReport);
const finalReport = detectionRationale + combinedReport;
fs.writeFileSync('eval_regression_report.md', finalReport);
console.log(
'\n📊 Final Markdown report saved to eval_regression_report.md',
);
+98
View File
@@ -0,0 +1,98 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import fs from 'node:fs';
import path from 'node:path';
const SUITES_PATH = 'evals/suites.json';
const EVALS_DIR = 'evals';
/**
 * Validates the eval-to-suite mapping declared in `SUITES_PATH`.
 *
 * Checks performed:
 *  1. Every eval file referenced by a suite exists on disk.
 *  2. Every eval file in a suite is also listed verbatim in that suite's
 *     `patterns`, so editing the test triggers its own suite.
 *  3. No eval file belongs to more than one suite unless it is listed in the
 *     top-level `allowedOverlaps` array.
 *  4. Every `*.eval.ts` file directly inside `EVALS_DIR` belongs to a suite.
 *
 * All problems are collected and printed together. The process exits with
 * status 1 if the config is missing, unparsable, or any check fails.
 */
function main() {
  if (!fs.existsSync(SUITES_PATH)) {
    console.error(`❌ Error: ${SUITES_PATH} not found.`);
    process.exit(1);
  }

  let suitesConfig;
  try {
    suitesConfig = JSON.parse(fs.readFileSync(SUITES_PATH, 'utf-8'));
  } catch (e) {
    // Report malformed JSON as a lint failure instead of an uncaught stack trace.
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`❌ Error: Could not parse ${SUITES_PATH}: ${reason}`);
    process.exit(1);
  }

  const allowedOverlaps = new Set(suitesConfig.allowedOverlaps || []);

  // suites.json always uses forward slashes, so build disk paths with the
  // POSIX joiner; the platform joiner would yield `evals\x.eval.ts` on
  // Windows and flag every eval as orphaned.
  const evalFilesOnDisk = fs
    .readdirSync(EVALS_DIR)
    .filter((f) => f.endsWith('.eval.ts'))
    .map((f) => path.posix.join(EVALS_DIR, f));

  // eval file path -> names of the suites that own it
  const evalToSuiteMap = new Map();
  const errors = [];

  // 1. Map evals to suites and check for overlaps/trigger-coverage
  for (const [suiteName, suite] of Object.entries(suitesConfig)) {
    // `allowedOverlaps` is a config key, not a suite.
    if (suiteName === 'allowedOverlaps' || !suite.evals) continue;

    for (const evalFile of suite.evals) {
      // Sentinel meaning "run every stable eval"; not a real file.
      if (evalFile === 'ALL_ALWAYS_PASSING') continue;

      if (!fs.existsSync(evalFile)) {
        errors.push(
          `Suite **${suiteName}** references non-existent file: **${evalFile}**`,
        );
        continue;
      }

      // Check if the eval file itself is in the suite's trigger patterns
      if (!suite.patterns || !suite.patterns.includes(evalFile)) {
        errors.push(
          `Trigger coverage missing: **${evalFile}** is in the **${suiteName}** suite but is missing from its **patterns** array. (Changes to the test won't trigger itself correctly).`,
        );
      }

      const owners = evalToSuiteMap.get(evalFile);
      if (owners && !allowedOverlaps.has(evalFile)) {
        errors.push(
          `Overlap detected: **${evalFile}** is present in both **${owners.join(', ')}** and **${suiteName}** suites.`,
        );
      } else {
        evalToSuiteMap.set(evalFile, [...(owners || []), suiteName]);
      }
    }
  }

  // 2. Check for orphaned evals (on disk but not in suites.json)
  for (const diskFile of evalFilesOnDisk) {
    if (!evalToSuiteMap.has(diskFile)) {
      errors.push(
        `Orphaned eval detected: **${diskFile}** is not mapped to any suite in ${SUITES_PATH}.`,
      );
    }
  }

  if (errors.length > 0) {
    console.error('\n❌ Eval Suite Validation Failed:');
    errors.forEach((err) => console.error(`  - ${err}`));

    const hasOverlap = errors.some((err) => err.includes('Overlap detected'));
    if (hasOverlap) {
      console.error(
        `\n💡 Tip: If this overlap is intentional, add the file path to the 'allowedOverlaps' list in ${SUITES_PATH}.`,
      );
    } else {
      console.error(`\n💡 Tip: Update ${SUITES_PATH} to resolve these issues.`);
    }
    process.exit(1);
  }

  console.log(
    '✅ Eval Suite Validation Passed: All files mapped and no overlaps found.',
  );
}
main();