From db8910c39be479488814ecbf31581dbe8553f971 Mon Sep 17 00:00:00 2001 From: Alisa Novikova <62909685+alisa-alisa@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:35:12 -0700 Subject: [PATCH] feat(evals): implement related evaluation system for targeted testing --- .github/workflows/eval-pr.yml | 7 +- evals/README.md | 44 ++++++- evals/suites.json | 171 +++++++++++++++++++++++++++ package.json | 1 + scripts/changed_prompt.js | 193 ++++++++++++++++++++++++++----- scripts/get_trustworthy_evals.js | 81 +++++++++++-- scripts/lint.js | 13 +++ scripts/run_eval_regression.js | 45 ++++++- scripts/validate_eval_suites.js | 98 ++++++++++++++++ 9 files changed, 610 insertions(+), 43 deletions(-) create mode 100644 evals/suites.json create mode 100644 scripts/validate_eval_suites.js diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml index 3e6784960c..6ade8c7542 100644 --- a/.github/workflows/eval-pr.yml +++ b/.github/workflows/eval-pr.yml @@ -1,7 +1,7 @@ name: 'Evals: PR Evaluation & Regression' on: - pull_request_target: + pull_request: types: ['opened', 'synchronize', 'reopened', 'ready_for_review'] paths: - 'packages/core/src/prompts/**' @@ -153,9 +153,10 @@ jobs: GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' MODEL_LIST: '${{ env.MODEL_LIST }}' + GITHUB_BASE_REF: '${{ github.base_ref }}' run: | - # Run the regression check loop. The script saves the report to a file. - node scripts/run_eval_regression.js + # Run the related regression check loop. + node scripts/run_eval_regression.js --related # Use the generated report file if it exists if [[ -f eval_regression_report.md ]]; then diff --git a/evals/README.md b/evals/README.md index aebfe38ebc..36e543fd88 100644 --- a/evals/README.md +++ b/evals/README.md @@ -227,10 +227,18 @@ in Pull Requests. These can also be run locally for debugging. ### Running Regression Checks Locally You can simulate the PR regression check locally to verify your changes before -pushing: +pushing. To optimize your workflow and reduce LLM costs, use the **`--related`** +flag to run only the tests relevant to your specific changes: ```bash -# Run the full regression loop for a specific model +# Run the targeted regression loop for your changes +MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js --related +``` + +To run the full regression loop for a specific model (all stable tests): + +```bash +# Run everything MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js ``` @@ -244,6 +252,38 @@ OUTPUT=$(node scripts/get_trustworthy_evals.js "gemini-3-flash-preview") node scripts/run_regression_check.js "gemini-3-flash-preview" "$OUTPUT" ``` +### Related Testing with `--related` + +The project uses a "Smart Eval" system to identify which behavioral evaluations +are affected by your code changes. This is controlled by the `--related` flag +available in `scripts/run_eval_regression.js`. + +#### How it Works + +1. **Change Detection**: The system uses Git to identify modified files in your + branch compared to `main`. +2. **Suite Mapping**: Modified files are matched against patterns in + `evals/suites.json`. This file maps core files (e.g., `grep.ts`) to their + corresponding evaluations. +3. **Targeted Execution**: Only the evaluations belonging to the affected + suites are executed. +4. **Global Fallback**: If changes are detected in core system prompts or + unmapped files, the system automatically falls back to running the full + stable evaluation suite for safety. + +#### Updating Detection Logic + +If you add a new tool or functional area, you should update `evals/suites.json` +to ensure your new evaluations are triggered correctly. + +```json +"my_new_tool": { + "description": "Description of the tool", + "patterns": ["packages/core/src/tools/my-new-tool.ts"], + "evals": ["evals/my_new_tool.eval.ts"] +} +``` + ### The Regression Quality Bar Because LLMs are non-deterministic, the PR regression check uses a high-signal diff --git a/evals/suites.json b/evals/suites.json new file mode 100644 index 0000000000..feedfce4b3 --- /dev/null +++ b/evals/suites.json @@ -0,0 +1,171 @@ +{ + "allowedOverlaps": [], + "grep": { + "description": "Grep search functionality", + "patterns": [ + "packages/core/src/tools/grep.ts", + "packages/core/src/tools/ripGrep.ts", + "packages/core/src/tools/grep-utils.ts", + "evals/grep_search_functionality.eval.ts" + ], + "evals": ["evals/grep_search_functionality.eval.ts"] + }, + "memory": { + "description": "Memory tool and fact persistence", + "patterns": [ + "packages/core/src/tools/memoryTool.ts", + "packages/core/src/persistence/storage.ts", + "evals/save_memory.eval.ts", + "evals/hierarchical_memory.eval.ts" + ], + "evals": ["evals/save_memory.eval.ts", "evals/hierarchical_memory.eval.ts"] + }, + "read_file": { + "description": "File reading and content extraction", + "patterns": [ + "packages/core/src/tools/read-file.ts", + "packages/core/src/tools/read-many-files.ts", + "evals/frugalReads.eval.ts" + ], + "evals": ["evals/frugalReads.eval.ts"] + }, + "glob": { + "description": "File discovery and globbing", + "patterns": [ + "packages/core/src/tools/glob.ts", + "evals/frugalSearch.eval.ts" + ], + "evals": ["evals/frugalSearch.eval.ts"] + }, + "tracker": { + "description": "Task and progress tracking", + "patterns": [ + "packages/core/src/tools/trackerTools.ts", + "evals/tracker.eval.ts" + ], + "evals": ["evals/tracker.eval.ts"] + }, + "ask_user": { + "description": "Interactive user confirmation and input", + "patterns": [ + "packages/core/src/tools/ask-user.ts", + "evals/ask_user.eval.ts" + ], + "evals": ["evals/ask_user.eval.ts"] + }, + "plan_mode": { + "description": "Plan Mode orchestration", + "patterns": [ + "packages/core/src/tools/enter-plan-mode.ts", + "packages/core/src/tools/exit-plan-mode.ts", + "packages/core/src/agents/planAgent.ts", + "evals/plan_mode.eval.ts" + ], + "evals": ["evals/plan_mode.eval.ts"] + }, + "git": { + "description": "Git repository operations", + "patterns": [ + "packages/core/src/utils/git.ts", + "packages/core/src/tools/shell.ts", + "evals/gitRepo.eval.ts", + "evals/unsafe-cloning.eval.ts" + ], + "evals": ["evals/gitRepo.eval.ts", "evals/unsafe-cloning.eval.ts"] + }, + "agents": { + "description": "Agent delegation and help", + "patterns": [ + "packages/core/src/agents/**", + "packages/core/src/routing/**", + "evals/subagents.eval.ts", + "evals/generalist_agent.eval.ts", + "evals/generalist_delegation.eval.ts", + "evals/cli_help_delegation.eval.ts" + ], + "evals": [ + "evals/subagents.eval.ts", + "evals/generalist_agent.eval.ts", + "evals/generalist_delegation.eval.ts", + "evals/cli_help_delegation.eval.ts" + ] + }, + "background": { + "description": "Background process management", + "patterns": [ + "packages/core/src/tools/shellBackgroundTools.ts", + "evals/background_processes.eval.ts" + ], + "evals": ["evals/background_processes.eval.ts"] + }, + "core_steering": { + "description": "System prompts and core model steering", + "patterns": [ + "packages/core/src/prompts/**", + "evals/answer-vs-act.eval.ts", + "evals/model_steering.eval.ts", + "evals/redundant_casts.eval.ts" + ], + "evals": [ + "ALL_ALWAYS_PASSING", + "evals/answer-vs-act.eval.ts", + "evals/model_steering.eval.ts", + "evals/redundant_casts.eval.ts" + ] + }, + "edit_fidelity": { + "description": "Code edit accuracy and location checks", + "patterns": [ + "packages/core/src/tools/edit.ts", + "packages/core/src/tools/write-file.ts", + "evals/edit-locations-eval.eval.ts", + "evals/validation_fidelity.eval.ts", + "evals/validation_fidelity_pre_existing_errors.eval.ts" + ], + "evals": [ + "evals/edit-locations-eval.eval.ts", + "evals/validation_fidelity.eval.ts", + "evals/validation_fidelity_pre_existing_errors.eval.ts" + ] + }, + "reliability": { + "description": "Core safety and sandbox reliability", + "patterns": [ + "packages/core/src/safety/**", + "packages/core/src/fallback/**", + "evals/concurrency-safety.eval.ts", + "evals/sandbox_recovery.eval.ts", + "evals/interactive-hang.eval.ts", + "evals/tool_output_masking.eval.ts" + ], + "evals": [ + "evals/concurrency-safety.eval.ts", + "evals/sandbox_recovery.eval.ts", + "evals/interactive-hang.eval.ts", + "evals/tool_output_masking.eval.ts" + ] + }, + "orchestration": { + "description": "Agent task lifecycle and topic management", + "patterns": [ + "packages/core/src/tools/update-topic.ts", + "packages/core/src/tools/complete-task.ts", + "evals/update_topic.eval.ts" + ], + "evals": ["evals/update_topic.eval.ts"] + }, + "general_tools": { + "description": "General tool usage and efficiency", + "patterns": [ + "packages/core/src/tools/tools.ts", + "packages/core/src/tools/tool-registry.ts", + "packages/core/src/tools/shell.ts", + "evals/automated-tool-use.eval.ts", + "evals/shell-efficiency.eval.ts" + ], + "evals": [ + "evals/automated-tool-use.eval.ts", + "evals/shell-efficiency.eval.ts" + ] + } +} diff --git a/package.json b/package.json index e24f6a20b5..cde3743e16 100644 --- a/package.json +++ b/package.json @@ -47,6 +47,7 @@ "posttest": "npm run build", "test:always_passing_evals": "vitest run --config evals/vitest.config.ts", "test:all_evals": "cross-env RUN_EVALS=1 vitest run --config evals/vitest.config.ts", + "test:evals:related": "node scripts/run_eval_regression.js --related", "test:e2e": "cross-env VERBOSE=true KEEP_OUTPUT=true npm run test:integration:sandbox:none", "test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman", "test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none", diff --git a/scripts/changed_prompt.js b/scripts/changed_prompt.js index 3fe33443a0..4c76e90b38 100644 --- a/scripts/changed_prompt.js +++ b/scripts/changed_prompt.js @@ -3,7 +3,23 @@ * Copyright 2026 Google LLC * SPDX-License-Identifier: Apache-2.0 */ + +/** + * @fileoverview Intelligence layer for detecting steering and behavior changes. + * + * This script identifies if code changes affect model steering (system prompts, + * tool definitions, agent instructions) and maps them to relevant evaluation + * suites. It supports both CI (GitHub Actions) and local development workflows. + * + * Detection Methods: + * 1. Path-based: Monitors critical steering and tool directories. + * 2. Signature-based: Scans diff content for core steering primitives + * (e.g., ToolDefinition, inputSchema). + * 3. Suite-aware: Uses evals/suites.json to identify related tests for surgical runs. + */ + import { execSync } from 'node:child_process'; +import fs from 'node:fs'; const CORE_STEERING_PATHS = [ 'packages/core/src/prompts/', @@ -20,46 +36,132 @@ const STEERING_SIGNATURES = [ "kind: 'local'", ]; +function minimatch(file, pattern) { + if (pattern.endsWith('/**')) { + const prefix = pattern.slice(0, -3); + return file.startsWith(prefix); + } + if (pattern.includes('*')) { + const regex = new RegExp( + '^' + + pattern + .replace(/\./g, '\\.') + .replace(/\*\*/g, '.*') + .replace(/\*/g, '[^/]*') + + '$', + ); + return regex.test(file); + } + return file === pattern; +} + function main() { const targetBranch = process.env.GITHUB_BASE_REF || 'main'; const verbose = process.argv.includes('--verbose'); const steeringOnly = process.argv.includes('--steering-only'); + const isRelatedMode = process.argv.includes('--related'); + const isJsonMode = process.argv.includes('--json'); try { const remoteUrl = process.env.GITHUB_REPOSITORY ? `https://github.com/${process.env.GITHUB_REPOSITORY}.git` : 'origin'; - // Fetch target branch from the remote. - execSync(`git fetch ${remoteUrl} ${targetBranch}`, { - stdio: 'ignore', - }); + let changedFiles = []; + const isCi = !!process.env.GITHUB_ACTIONS; - // Get changed files using the triple-dot syntax which correctly handles merge commits - const head = process.env.PR_HEAD_SHA || 'HEAD'; - const changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, { - encoding: 'utf-8', - }) - .split('\n') - .filter(Boolean); + if (isCi) { + try { + // 1. Try fetching from remote (CI environment) + execSync(`git fetch ${remoteUrl} ${targetBranch}`, { + stdio: 'ignore', + }); + + // Get changed files using the triple-dot syntax which correctly handles merge commits + const head = process.env.PR_HEAD_SHA || 'HEAD'; + changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, { + encoding: 'utf-8', + }) + .split('\n') + .filter(Boolean); + } catch (e) { + if (verbose) + process.stderr.write( + `Warning: git fetch failed in CI: ${e.message}\n`, + ); + } + } + + // 2. Local fallback or if CI fetch failed: Try diffing against target branch + if (changedFiles.length === 0) { + try { + changedFiles = execSync(`git diff --name-only ${targetBranch}`, { + encoding: 'utf-8', + }) + .split('\n') + .filter(Boolean); + } catch { + // 3. Last resort: Just diff against HEAD (uncommitted changes only) + changedFiles = execSync('git diff --name-only HEAD', { + encoding: 'utf-8', + }) + .split('\n') + .filter(Boolean); + } + + // Also include untracked files in local mode + const untracked = execSync('git ls-files --others --exclude-standard', { + encoding: 'utf-8', + }) + .split('\n') + .filter(Boolean); + changedFiles = [...new Set([...changedFiles, ...untracked])]; + } let detected = false; const reasons = []; + const affectedSuites = new Set(); + const rationales = []; + + // Load suites for --related mode + let suitesConfig = null; + if (isRelatedMode) { + try { + suitesConfig = JSON.parse( + fs.readFileSync('evals/suites.json', 'utf-8'), + ); + } catch { + process.stderr.write(`Warning: Could not load evals/suites.json\n`); + } + } // 1. Path-based detection for (const file of changedFiles) { if (CORE_STEERING_PATHS.some((prefix) => file.startsWith(prefix))) { detected = true; reasons.push(`Matched core steering path: ${file}`); - if (!verbose) break; } if ( !steeringOnly && - TEST_PATHS.some((prefix) => file.startsWith(prefix)) + TEST_PATHS.some((prefix) => file.startsWith(prefix)) && + file.endsWith('.eval.ts') ) { detected = true; - reasons.push(`Matched test path: ${file}`); - if (!verbose) break; + reasons.push(`Matched test file: ${file}`); + } + + // Related suite detection + if (suitesConfig) { + for (const [suiteName, suite] of Object.entries(suitesConfig)) { + if (suiteName === 'allowedOverlaps' || !suite.patterns) continue; + + if (suite.patterns.some((pattern) => minimatch(file, pattern))) { + affectedSuites.add(suiteName); + rationales.push( + `Testing **${suiteName}** because **${file}** was modified.`, + ); + } + } } } @@ -70,15 +172,30 @@ function main() { ); if (coreChanges.length > 0) { // Get the actual diff content for core files - const diff = execSync( - `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`, - { encoding: 'utf-8' }, - ); + // We need to be careful with the diff command depending on if we have FETCH_HEAD + let diffCmd = ''; + try { + const head = process.env.PR_HEAD_SHA || 'HEAD'; + diffCmd = `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`; + execSync('git rev-parse FETCH_HEAD', { stdio: 'ignore' }); + } catch { + diffCmd = `git diff -U0 ${targetBranch} -- packages/core/src/`; + } + + const diff = execSync(diffCmd, { encoding: 'utf-8' }); for (const sig of STEERING_SIGNATURES) { if (diff.includes(sig)) { detected = true; reasons.push(`Matched steering signature in core: ${sig}`); - if (!verbose) break; + + // If we detected a steering signature, mark core_steering suite + if (isRelatedMode) { + affectedSuites.add('core_steering'); + rationales.push( + `Testing **core_steering** because matched signature '${sig}' in core files.`, + ); + } + if (!verbose && !isRelatedMode) break; } } } @@ -89,14 +206,38 @@ function main() { reasons.forEach((r) => process.stderr.write(` - ${r}\n`)); } - process.stdout.write(detected ? 'true' : 'false'); + if (isJsonMode) { + process.stdout.write( + JSON.stringify( + { + detected, + reasons, + affectedSuites: Array.from(affectedSuites), + rationales, + }, + null, + 2, + ), + ); + } else { + process.stdout.write(detected ? 'true' : 'false'); + } } catch (error) { - // If anything fails (e.g., no git history), run evals/guidance to be safe - process.stderr.write( - 'Warning: Failed to determine if changes occurred. Defaulting to true.\n', - ); + if (isJsonMode) { + process.stdout.write( + JSON.stringify({ + detected: true, + reasons: [`Error during detection: ${error.message}`], + affectedSuites: ['core_steering'], + rationales: [ + 'Error during detection: running all stable evals for safety.', + ], + }), + ); + } else { + process.stdout.write('true'); + } process.stderr.write(String(error) + '\n'); - process.stdout.write('true'); } } diff --git a/scripts/get_trustworthy_evals.js b/scripts/get_trustworthy_evals.js index c87d148e7a..ae25550134 100644 --- a/scripts/get_trustworthy_evals.js +++ b/scripts/get_trustworthy_evals.js @@ -13,6 +13,7 @@ * to ensure high-signal validation and minimize noise. */ +import fs from 'node:fs'; import { fetchNightlyHistory, escapeRegex } from './eval_utils.js'; const LOOKBACK_COUNT = 6; @@ -25,11 +26,24 @@ const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18) */ function main() { const targetModel = process.argv[2]; - if (!targetModel) { + if (!targetModel || targetModel.startsWith('--')) { console.error('โŒ Error: No target model specified.'); process.exit(1); } + + // Parse --suites argument + const suitesArgIndex = process.argv.indexOf('--suites'); + let requestedSuites = null; + if (suitesArgIndex !== -1 && process.argv[suitesArgIndex + 1]) { + requestedSuites = process.argv[suitesArgIndex + 1] + .split(',') + .map((s) => s.trim()); + } + console.error(`๐Ÿ” Identifying trustworthy evals for model: ${targetModel}`); + if (requestedSuites) { + console.error(`๐Ÿ“‚ Filtering by suites: ${requestedSuites.join(', ')}`); + } const history = fetchNightlyHistory(LOOKBACK_COUNT); if (history.length === 0) { @@ -37,6 +51,32 @@ function main() { process.exit(1); } + // Load suites configuration + let allowedFiles = null; + let runAllStable = false; + if (requestedSuites) { + try { + const suitesConfig = JSON.parse( + fs.readFileSync('evals/suites.json', 'utf-8'), + ); + allowedFiles = new Set(); + for (const suiteName of requestedSuites) { + const suite = suitesConfig[suiteName]; + if (suite) { + if (suite.evals.includes('ALL_ALWAYS_PASSING')) { + runAllStable = true; + } else { + suite.evals.forEach((file) => allowedFiles.add(file)); + } + } + } + } catch (e) { + console.error( + `โš ๏ธ Warning: Could not load evals/suites.json or match suites: ${e.message}`, + ); + } + } + // Aggregate results for the target model across all history const testHistories = {}; // { [testName]: { totalPassed: 0, totalRuns: 0, dailyRates: [], file: string } } @@ -83,11 +123,28 @@ function main() { const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD; if (isDailyStable && isAggregateHighSignal) { - trustworthyTests.push(testName); - if (info.file) { - const match = info.file.match(/evals\/.*\.eval\.ts/); - if (match) { - trustworthyFiles.add(match[0]); + // Suite filtering logic + let isFileAllowed = true; + if (requestedSuites && !runAllStable) { + if (info.file) { + const match = info.file.match(/evals\/.*\.eval\.ts/); + if (match && !allowedFiles.has(match[0])) { + isFileAllowed = false; + } else if (!match) { + isFileAllowed = false; + } + } else { + isFileAllowed = false; + } + } + + if (isFileAllowed) { + trustworthyTests.push(testName); + if (info.file) { + const match = info.file.match(/evals\/.*\.eval\.ts/); + if (match) { + trustworthyFiles.add(match[0]); + } } } } else { @@ -99,10 +156,14 @@ function main() { `โœ… Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`, ); trustworthyTests.sort().forEach((name) => console.error(` - ${name}`)); - console.error(`\nโšช Ignored ${volatileTests.length} volatile tests.`); - console.error( - `๐Ÿ†• Ignored ${newTests.length} tests with insufficient history.`, - ); + if (volatileTests.length > 0) { + console.error(`\nโšช Ignored ${volatileTests.length} volatile tests.`); + } + if (newTests.length > 0) { + console.error( + `๐Ÿ†• Ignored ${newTests.length} tests with insufficient history.`, + ); + } // Output the list of names as a regex-friendly pattern for vitest -t const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|'); diff --git a/scripts/lint.js b/scripts/lint.js index 0cf51cb8ba..f5782ce467 100644 --- a/scripts/lint.js +++ b/scripts/lint.js @@ -500,6 +500,9 @@ function main() { if (args.includes('--check-github-actions-pinning')) { runGithubActionsPinningLinter(); } + if (args.includes('--eval-suites')) { + runEvalSuiteLinter(); + } if (args.length === 0) { setupLinters(); @@ -511,8 +514,18 @@ function main() { runSensitiveKeywordLinter(); runTSConfigLinter(); runGithubActionsPinningLinter(); + runEvalSuiteLinter(); console.log('\nAll linting checks passed!'); } } +export function runEvalSuiteLinter() { + console.log('\nRunning eval suite linter...'); + try { + execSync('node scripts/validate_eval_suites.js', { stdio: 'inherit' }); + } catch { + process.exit(1); + } +} + main(); diff --git a/scripts/run_eval_regression.js b/scripts/run_eval_regression.js index 7a64a6a2f9..03cdde90c1 100644 --- a/scripts/run_eval_regression.js +++ b/scripts/run_eval_regression.js @@ -22,22 +22,62 @@ import fs from 'node:fs'; async function main() { const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview'; const models = modelList.split(',').map((m) => m.trim()); + const isRelatedMode = process.argv.includes('--related'); let combinedReport = ''; let hasRegression = false; + let detectionRationale = ''; + let affectedSuitesStr = ''; console.log( `๐Ÿš€ Starting evaluation orchestration for models: ${models.join(', ')}`, ); + if (isRelatedMode) { + console.log('๐Ÿ” Identifying related evaluations based on changes...'); + try { + const detectionOutput = execSync( + `node scripts/changed_prompt.js --related --json`, + { encoding: 'utf-8', stdio: ['inherit', 'pipe', 'inherit'] }, + ).trim(); + const detection = JSON.parse(detectionOutput); + + if (detection.affectedSuites && detection.affectedSuites.length > 0) { + affectedSuitesStr = detection.affectedSuites.join(','); + detectionRationale = '### ๐Ÿงช Related Evaluation Rationale\n\n'; + detection.rationales.forEach((r) => { + detectionRationale += `- ${r}\n`; + }); + detectionRationale += + '\n_Something missing? [Update evals/suites.json](evals/README.md#related-testing-with-related) to adjust detection logic._\n\n---\n\n'; + } else if (!detection.detected) { + console.log('โœ… No related changes detected. Skipping evaluations.'); + process.exit(0); + } else { + console.log( + 'โš ๏ธ Changes detected but no specific suites matched. Running full stable suite for safety.', + ); + detectionRationale = + '### ๐Ÿงช Related Evaluation Rationale\n\n- No specific suites matched. Running full stable suite for safety.\n\n---\n\n'; + } + } catch (e) { + console.error(`โŒ Error during suite detection: ${e.message}`); + detectionRationale = + '### ๐Ÿงช Related Evaluation Rationale\n\n- Error during suite detection. Running full stable suite for safety.\n\n---\n\n'; + } + } + for (const model of models) { console.log(`\n--- Processing Model: ${model} ---`); try { // 1. Identify Trustworthy Evals console.log(`๐Ÿ” Identifying trustworthy tests for ${model}...`); + const suitesFlag = affectedSuitesStr + ? `--suites ${affectedSuitesStr}` + : ''; const output = execSync( - `node scripts/get_trustworthy_evals.js "${model}"`, + `node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`, { encoding: 'utf-8', stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr @@ -83,7 +123,8 @@ async function main() { // Always save the combined report to a file so the workflow can capture it cleanly if (combinedReport) { - fs.writeFileSync('eval_regression_report.md', combinedReport); + const finalReport = detectionRationale + combinedReport; + fs.writeFileSync('eval_regression_report.md', finalReport); console.log( '\n๐Ÿ“Š Final Markdown report saved to eval_regression_report.md', ); diff --git a/scripts/validate_eval_suites.js b/scripts/validate_eval_suites.js new file mode 100644 index 0000000000..9854856021 --- /dev/null +++ b/scripts/validate_eval_suites.js @@ -0,0 +1,98 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import fs from 'node:fs'; +import path from 'node:path'; + +const SUITES_PATH = 'evals/suites.json'; +const EVALS_DIR = 'evals'; + +/** + * Validates that all eval files are mapped in suites.json and that there are no overlaps. + */ +function main() { + if (!fs.existsSync(SUITES_PATH)) { + console.error(`โŒ Error: ${SUITES_PATH} not found.`); + process.exit(1); + } + + const suitesConfig = JSON.parse(fs.readFileSync(SUITES_PATH, 'utf-8')); + const allowedOverlaps = new Set(suitesConfig.allowedOverlaps || []); + const evalFilesOnDisk = fs + .readdirSync(EVALS_DIR) + .filter((f) => f.endsWith('.eval.ts')) + .map((f) => path.join(EVALS_DIR, f)); + + const evalToSuiteMap = new Map(); + const errors = []; + + // 1. Map evals to suites and check for overlaps/trigger-coverage + for (const [suiteName, suite] of Object.entries(suitesConfig)) { + if (suiteName === 'allowedOverlaps' || !suite.evals) continue; + + for (const evalFile of suite.evals) { + if (evalFile === 'ALL_ALWAYS_PASSING') continue; + + if (!fs.existsSync(evalFile)) { + errors.push( + `Suite **${suiteName}** references non-existent file: **${evalFile}**`, + ); + continue; + } + + // Check if the eval file itself is in the suite's trigger patterns + if (!suite.patterns || !suite.patterns.includes(evalFile)) { + errors.push( + `Trigger coverage missing: **${evalFile}** is in the **${suiteName}** suite but is missing from its **patterns** array. (Changes to the test won't trigger itself correctly).`, + ); + } + + if (evalToSuiteMap.has(evalFile) && !allowedOverlaps.has(evalFile)) { + errors.push( + `Overlap detected: **${evalFile}** is present in both **${evalToSuiteMap.get(evalFile)}** and **${suiteName}** suites.`, + ); + } else { + const existingSuites = evalToSuiteMap.get(evalFile) || []; + evalToSuiteMap.set( + evalFile, + Array.isArray(existingSuites) + ? [...existingSuites, suiteName] + : [existingSuites, suiteName], + ); + } + } + } + + // 2. Check for orphaned evals (on disk but not in suites.json) + for (const diskFile of evalFilesOnDisk) { + if (!evalToSuiteMap.has(diskFile)) { + errors.push( + `Orphaned eval detected: **${diskFile}** is not mapped to any suite in ${SUITES_PATH}.`, + ); + } + } + + if (errors.length > 0) { + console.error('\nโŒ Eval Suite Validation Failed:'); + errors.forEach((err) => console.error(` - ${err}`)); + + const hasOverlap = errors.some((err) => err.includes('Overlap detected')); + if (hasOverlap) { + console.error( + `\n๐Ÿ’ก Tip: If this overlap is intentional, add the file path to the 'allowedOverlaps' list in ${SUITES_PATH}.`, + ); + } else { + console.error(`\n๐Ÿ’ก Tip: Update ${SUITES_PATH} to resolve these issues.`); + } + process.exit(1); + } + + console.log( + 'โœ… Eval Suite Validation Passed: All files mapped and no overlaps found.', + ); +} + +main();