feat(evals): implement related evaluation system for targeted testing

This commit is contained in:
Alisa Novikova
2026-04-07 19:35:12 -07:00
parent 06fcdc231c
commit db8910c39b
9 changed files with 610 additions and 43 deletions
+167 -26
View File
@@ -3,7 +3,23 @@
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @fileoverview Intelligence layer for detecting steering and behavior changes.
*
* This script identifies if code changes affect model steering (system prompts,
* tool definitions, agent instructions) and maps them to relevant evaluation
* suites. It supports both CI (GitHub Actions) and local development workflows.
*
* Detection Methods:
* 1. Path-based: Monitors critical steering and tool directories.
* 2. Signature-based: Scans diff content for core steering primitives
* (e.g., ToolDefinition, inputSchema).
* 3. Suite-aware: Uses evals/suites.json to identify related tests for surgical runs.
*/
import { execSync } from 'node:child_process';
import fs from 'node:fs';
const CORE_STEERING_PATHS = [
'packages/core/src/prompts/',
@@ -20,46 +36,132 @@ const STEERING_SIGNATURES = [
"kind: 'local'",
];
/**
 * Minimal glob matcher used to map changed files onto suite patterns.
 *
 * Supported syntax:
 *   - `*`    matches any run of characters within a single path segment.
 *   - `**`   matches any run of characters, including `/`.
 *   - `**` followed by `/` matches zero or more whole directories, so
 *     `src/**` + `/foo.ts` style patterns also match a file directly in `src/`.
 * Every other character is matched literally. A pattern without `*` must
 * equal the file path exactly.
 *
 * @param {string} file - Path to test (forward-slash separated).
 * @param {string} pattern - Glob pattern to test against.
 * @returns {boolean} True when the whole path matches the pattern.
 */
function minimatch(file, pattern) {
  if (!pattern.includes('*')) {
    return file === pattern;
  }

  // Tokenise first and translate each token exactly once. Chaining
  // String.replace calls would re-process the `*` inside an already
  // translated `.*` and silently break globstar matching.
  const source = pattern
    .split(/(\*\*\/|\*\*|\*)/)
    .map((token) => {
      switch (token) {
        case '**/':
          return '(?:.*/)?';
        case '**':
          return '.*';
        case '*':
          return '[^/]*';
        default:
          // Literal text: escape every regex metacharacter, not just dots.
          return token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      }
    })
    .join('');

  return new RegExp(`^${source}$`).test(file);
}
function main() {
const targetBranch = process.env.GITHUB_BASE_REF || 'main';
const verbose = process.argv.includes('--verbose');
const steeringOnly = process.argv.includes('--steering-only');
const isRelatedMode = process.argv.includes('--related');
const isJsonMode = process.argv.includes('--json');
try {
const remoteUrl = process.env.GITHUB_REPOSITORY
? `https://github.com/${process.env.GITHUB_REPOSITORY}.git`
: 'origin';
// Fetch target branch from the remote.
execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
stdio: 'ignore',
});
let changedFiles = [];
const isCi = !!process.env.GITHUB_ACTIONS;
// Get changed files using the triple-dot syntax which correctly handles merge commits
const head = process.env.PR_HEAD_SHA || 'HEAD';
const changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
if (isCi) {
try {
// 1. Try fetching from remote (CI environment)
execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
stdio: 'ignore',
});
// Get changed files using the triple-dot syntax which correctly handles merge commits
const head = process.env.PR_HEAD_SHA || 'HEAD';
changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
} catch (e) {
if (verbose)
process.stderr.write(
`Warning: git fetch failed in CI: ${e.message}\n`,
);
}
}
// 2. Local fallback or if CI fetch failed: Try diffing against target branch
if (changedFiles.length === 0) {
try {
changedFiles = execSync(`git diff --name-only ${targetBranch}`, {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
} catch {
// 3. Last resort: Just diff against HEAD (uncommitted changes only)
changedFiles = execSync('git diff --name-only HEAD', {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
}
// Also include untracked files in local mode
const untracked = execSync('git ls-files --others --exclude-standard', {
encoding: 'utf-8',
})
.split('\n')
.filter(Boolean);
changedFiles = [...new Set([...changedFiles, ...untracked])];
}
let detected = false;
const reasons = [];
const affectedSuites = new Set();
const rationales = [];
// Load suites for --related mode
let suitesConfig = null;
if (isRelatedMode) {
try {
suitesConfig = JSON.parse(
fs.readFileSync('evals/suites.json', 'utf-8'),
);
} catch {
process.stderr.write(`Warning: Could not load evals/suites.json\n`);
}
}
// 1. Path-based detection
for (const file of changedFiles) {
if (CORE_STEERING_PATHS.some((prefix) => file.startsWith(prefix))) {
detected = true;
reasons.push(`Matched core steering path: ${file}`);
if (!verbose) break;
}
if (
!steeringOnly &&
TEST_PATHS.some((prefix) => file.startsWith(prefix))
TEST_PATHS.some((prefix) => file.startsWith(prefix)) &&
file.endsWith('.eval.ts')
) {
detected = true;
reasons.push(`Matched test path: ${file}`);
if (!verbose) break;
reasons.push(`Matched test file: ${file}`);
}
// Related suite detection
if (suitesConfig) {
for (const [suiteName, suite] of Object.entries(suitesConfig)) {
if (suiteName === 'allowedOverlaps' || !suite.patterns) continue;
if (suite.patterns.some((pattern) => minimatch(file, pattern))) {
affectedSuites.add(suiteName);
rationales.push(
`Testing **${suiteName}** because **${file}** was modified.`,
);
}
}
}
}
@@ -70,15 +172,30 @@ function main() {
);
if (coreChanges.length > 0) {
// Get the actual diff content for core files
const diff = execSync(
`git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`,
{ encoding: 'utf-8' },
);
// We need to be careful with the diff command depending on if we have FETCH_HEAD
let diffCmd = '';
try {
const head = process.env.PR_HEAD_SHA || 'HEAD';
diffCmd = `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`;
execSync('git rev-parse FETCH_HEAD', { stdio: 'ignore' });
} catch {
diffCmd = `git diff -U0 ${targetBranch} -- packages/core/src/`;
}
const diff = execSync(diffCmd, { encoding: 'utf-8' });
for (const sig of STEERING_SIGNATURES) {
if (diff.includes(sig)) {
detected = true;
reasons.push(`Matched steering signature in core: ${sig}`);
if (!verbose) break;
// If we detected a steering signature, mark core_steering suite
if (isRelatedMode) {
affectedSuites.add('core_steering');
rationales.push(
`Testing **core_steering** because matched signature '${sig}' in core files.`,
);
}
if (!verbose && !isRelatedMode) break;
}
}
}
@@ -89,14 +206,38 @@ function main() {
reasons.forEach((r) => process.stderr.write(` - ${r}\n`));
}
process.stdout.write(detected ? 'true' : 'false');
if (isJsonMode) {
process.stdout.write(
JSON.stringify(
{
detected,
reasons,
affectedSuites: Array.from(affectedSuites),
rationales,
},
null,
2,
),
);
} else {
process.stdout.write(detected ? 'true' : 'false');
}
} catch (error) {
// If anything fails (e.g., no git history), run evals/guidance to be safe
process.stderr.write(
'Warning: Failed to determine if changes occurred. Defaulting to true.\n',
);
if (isJsonMode) {
process.stdout.write(
JSON.stringify({
detected: true,
reasons: [`Error during detection: ${error.message}`],
affectedSuites: ['core_steering'],
rationales: [
'Error during detection: running all stable evals for safety.',
],
}),
);
} else {
process.stdout.write('true');
}
process.stderr.write(String(error) + '\n');
process.stdout.write('true');
}
}
+71 -10
View File
@@ -13,6 +13,7 @@
* to ensure high-signal validation and minimize noise.
*/
import fs from 'node:fs';
import { fetchNightlyHistory, escapeRegex } from './eval_utils.js';
const LOOKBACK_COUNT = 6;
@@ -25,11 +26,24 @@ const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18)
*/
function main() {
const targetModel = process.argv[2];
if (!targetModel) {
if (!targetModel || targetModel.startsWith('--')) {
console.error('❌ Error: No target model specified.');
process.exit(1);
}
// Parse --suites argument
const suitesArgIndex = process.argv.indexOf('--suites');
let requestedSuites = null;
if (suitesArgIndex !== -1 && process.argv[suitesArgIndex + 1]) {
requestedSuites = process.argv[suitesArgIndex + 1]
.split(',')
.map((s) => s.trim());
}
console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
if (requestedSuites) {
console.error(`📂 Filtering by suites: ${requestedSuites.join(', ')}`);
}
const history = fetchNightlyHistory(LOOKBACK_COUNT);
if (history.length === 0) {
@@ -37,6 +51,32 @@ function main() {
process.exit(1);
}
// Load suites configuration
let allowedFiles = null;
let runAllStable = false;
if (requestedSuites) {
try {
const suitesConfig = JSON.parse(
fs.readFileSync('evals/suites.json', 'utf-8'),
);
allowedFiles = new Set();
for (const suiteName of requestedSuites) {
const suite = suitesConfig[suiteName];
if (suite) {
if (suite.evals.includes('ALL_ALWAYS_PASSING')) {
runAllStable = true;
} else {
suite.evals.forEach((file) => allowedFiles.add(file));
}
}
}
} catch (e) {
console.error(
`⚠️ Warning: Could not load evals/suites.json or match suites: ${e.message}`,
);
}
}
// Aggregate results for the target model across all history
const testHistories = {}; // { [testName]: { totalPassed: 0, totalRuns: 0, dailyRates: [], file: string } }
@@ -83,11 +123,28 @@ function main() {
const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
if (isDailyStable && isAggregateHighSignal) {
trustworthyTests.push(testName);
if (info.file) {
const match = info.file.match(/evals\/.*\.eval\.ts/);
if (match) {
trustworthyFiles.add(match[0]);
// Suite filtering logic
let isFileAllowed = true;
if (requestedSuites && !runAllStable) {
if (info.file) {
const match = info.file.match(/evals\/.*\.eval\.ts/);
if (match && !allowedFiles.has(match[0])) {
isFileAllowed = false;
} else if (!match) {
isFileAllowed = false;
}
} else {
isFileAllowed = false;
}
}
if (isFileAllowed) {
trustworthyTests.push(testName);
if (info.file) {
const match = info.file.match(/evals\/.*\.eval\.ts/);
if (match) {
trustworthyFiles.add(match[0]);
}
}
}
} else {
@@ -99,10 +156,14 @@ function main() {
`✅ Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`,
);
trustworthyTests.sort().forEach((name) => console.error(` - ${name}`));
console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
console.error(
`🆕 Ignored ${newTests.length} tests with insufficient history.`,
);
if (volatileTests.length > 0) {
console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
}
if (newTests.length > 0) {
console.error(
`🆕 Ignored ${newTests.length} tests with insufficient history.`,
);
}
// Output the list of names as a regex-friendly pattern for vitest -t
const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|');
+13
View File
@@ -500,6 +500,9 @@ function main() {
if (args.includes('--check-github-actions-pinning')) {
runGithubActionsPinningLinter();
}
if (args.includes('--eval-suites')) {
runEvalSuiteLinter();
}
if (args.length === 0) {
setupLinters();
@@ -511,8 +514,18 @@ function main() {
runSensitiveKeywordLinter();
runTSConfigLinter();
runGithubActionsPinningLinter();
runEvalSuiteLinter();
console.log('\nAll linting checks passed!');
}
}
/**
 * Runs the eval suite validation script as a child Node process.
 *
 * The child inherits this process's stdio, so its output is shown directly.
 * If the command throws (execSync does so when the child fails), the current
 * process exits with status 1.
 */
export function runEvalSuiteLinter() {
  console.log('\nRunning eval suite linter...');

  const validatorCommand = 'node scripts/validate_eval_suites.js';
  const spawnOptions = { stdio: 'inherit' };

  try {
    execSync(validatorCommand, spawnOptions);
  } catch {
    // The child has already reported its own errors via inherited stdio.
    process.exit(1);
  }
}
main();
+43 -2
View File
@@ -22,22 +22,62 @@ import fs from 'node:fs';
async function main() {
const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview';
const models = modelList.split(',').map((m) => m.trim());
const isRelatedMode = process.argv.includes('--related');
let combinedReport = '';
let hasRegression = false;
let detectionRationale = '';
let affectedSuitesStr = '';
console.log(
`🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
);
if (isRelatedMode) {
console.log('🔍 Identifying related evaluations based on changes...');
try {
const detectionOutput = execSync(
`node scripts/changed_prompt.js --related --json`,
{ encoding: 'utf-8', stdio: ['inherit', 'pipe', 'inherit'] },
).trim();
const detection = JSON.parse(detectionOutput);
if (detection.affectedSuites && detection.affectedSuites.length > 0) {
affectedSuitesStr = detection.affectedSuites.join(',');
detectionRationale = '### 🧪 Related Evaluation Rationale\n\n';
detection.rationales.forEach((r) => {
detectionRationale += `- ${r}\n`;
});
detectionRationale +=
'\n_Something missing? [Update evals/suites.json](evals/README.md#related-testing-with-related) to adjust detection logic._\n\n---\n\n';
} else if (!detection.detected) {
console.log('✅ No related changes detected. Skipping evaluations.');
process.exit(0);
} else {
console.log(
'⚠️ Changes detected but no specific suites matched. Running full stable suite for safety.',
);
detectionRationale =
'### 🧪 Related Evaluation Rationale\n\n- No specific suites matched. Running full stable suite for safety.\n\n---\n\n';
}
} catch (e) {
console.error(`❌ Error during suite detection: ${e.message}`);
detectionRationale =
'### 🧪 Related Evaluation Rationale\n\n- Error during suite detection. Running full stable suite for safety.\n\n---\n\n';
}
}
for (const model of models) {
console.log(`\n--- Processing Model: ${model} ---`);
try {
// 1. Identify Trustworthy Evals
console.log(`🔍 Identifying trustworthy tests for ${model}...`);
const suitesFlag = affectedSuitesStr
? `--suites ${affectedSuitesStr}`
: '';
const output = execSync(
`node scripts/get_trustworthy_evals.js "${model}"`,
`node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`,
{
encoding: 'utf-8',
stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr
@@ -83,7 +123,8 @@ async function main() {
// Always save the combined report to a file so the workflow can capture it cleanly
if (combinedReport) {
fs.writeFileSync('eval_regression_report.md', combinedReport);
const finalReport = detectionRationale + combinedReport;
fs.writeFileSync('eval_regression_report.md', finalReport);
console.log(
'\n📊 Final Markdown report saved to eval_regression_report.md',
);
+98
View File
@@ -0,0 +1,98 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import fs from 'node:fs';
import path from 'node:path';
const SUITES_PATH = 'evals/suites.json';
const EVALS_DIR = 'evals';
/**
 * Validates the suites file (SUITES_PATH) against the eval files found
 * directly inside EVALS_DIR.
 *
 * Checks performed:
 *   1. Every eval listed in a suite exists on disk.
 *   2. Every listed eval also appears verbatim in its suite's `patterns`
 *      array, so that editing the eval triggers its own suite.
 *   3. No eval is listed more than once unless it is named in the top-level
 *      `allowedOverlaps` list.
 *   4. Every `*.eval.ts` file in EVALS_DIR belongs to at least one suite.
 *
 * Prints all problems to stderr and exits with status 1 when any check
 * fails, or when the suites file is missing or cannot be parsed as a JSON
 * object. Prints a success message otherwise.
 */
function main() {
  if (!fs.existsSync(SUITES_PATH)) {
    console.error(`❌ Error: ${SUITES_PATH} not found.`);
    process.exit(1);
  }

  let suitesConfig;
  try {
    suitesConfig = JSON.parse(fs.readFileSync(SUITES_PATH, 'utf-8'));
    if (
      suitesConfig === null ||
      typeof suitesConfig !== 'object' ||
      Array.isArray(suitesConfig)
    ) {
      throw new TypeError('top-level value must be a JSON object');
    }
  } catch (error) {
    // Report a malformed config as a lint failure instead of crashing with
    // an uncaught exception and a raw stack trace.
    console.error(`❌ Error: Could not parse ${SUITES_PATH}: ${error.message}`);
    process.exit(1);
  }

  const allowedOverlaps = new Set(suitesConfig.allowedOverlaps || []);

  // On-disk paths are compared as plain strings against suite entries, so
  // always join with '/' rather than the platform separator; otherwise every
  // eval would look orphaned on Windows (assuming suite entries use '/').
  const evalFilesOnDisk = fs
    .readdirSync(EVALS_DIR)
    .filter((f) => f.endsWith('.eval.ts'))
    .map((f) => path.posix.join(EVALS_DIR, f));

  // eval file path -> names of the suites that list it
  const evalToSuiteMap = new Map();
  const errors = [];

  // 1. Map evals to suites and check for overlaps/trigger-coverage
  for (const [suiteName, suite] of Object.entries(suitesConfig)) {
    if (suiteName === 'allowedOverlaps' || !suite || !suite.evals) continue;

    for (const evalFile of suite.evals) {
      // Sentinel entry, not a real file path.
      if (evalFile === 'ALL_ALWAYS_PASSING') continue;

      if (!fs.existsSync(evalFile)) {
        errors.push(
          `Suite **${suiteName}** references non-existent file: **${evalFile}**`,
        );
        continue;
      }

      // Check if the eval file itself is in the suite's trigger patterns
      if (!suite.patterns || !suite.patterns.includes(evalFile)) {
        errors.push(
          `Trigger coverage missing: **${evalFile}** is in the **${suiteName}** suite but is missing from its **patterns** array. (Changes to the test won't trigger itself correctly).`,
        );
      }

      const owningSuites = evalToSuiteMap.get(evalFile);
      if (owningSuites && !allowedOverlaps.has(evalFile)) {
        errors.push(
          `Overlap detected: **${evalFile}** is present in both **${owningSuites.join(', ')}** and **${suiteName}** suites.`,
        );
      } else {
        evalToSuiteMap.set(
          evalFile,
          owningSuites ? [...owningSuites, suiteName] : [suiteName],
        );
      }
    }
  }

  // 2. Check for orphaned evals (on disk but not in suites.json)
  for (const diskFile of evalFilesOnDisk) {
    if (!evalToSuiteMap.has(diskFile)) {
      errors.push(
        `Orphaned eval detected: **${diskFile}** is not mapped to any suite in ${SUITES_PATH}.`,
      );
    }
  }

  if (errors.length > 0) {
    console.error('\n❌ Eval Suite Validation Failed:');
    errors.forEach((err) => console.error(`  - ${err}`));

    const hasOverlap = errors.some((err) => err.includes('Overlap detected'));
    if (hasOverlap) {
      console.error(
        `\n💡 Tip: If this overlap is intentional, add the file path to the 'allowedOverlaps' list in ${SUITES_PATH}.`,
      );
    } else {
      console.error(`\n💡 Tip: Update ${SUITES_PATH} to resolve these issues.`);
    }
    process.exit(1);
  }

  console.log(
    '✅ Eval Suite Validation Passed: All files mapped and no overlaps found.',
  );
}
main();