diff --git a/package.json b/package.json
index 507a28f889..d2e0900914 100644
--- a/package.json
+++ b/package.json
@@ -62,8 +62,8 @@
     "release:version": "node scripts/version.js",
     "telemetry": "node scripts/telemetry.js",
     "data:validate": "tsx scripts/validate-data.ts",
-    "data:format": "prettier --write 'data/**/*.ts' 'data/*.json' 'scripts/validate-data.ts'",
-    "data:lint": "eslint 'data/**/*.ts' 'scripts/validate-data.ts'",
+    "data:format": "prettier --write 'data/**/*.ts' 'data/*.json' 'scripts/validate-data.ts' 'packages/core/src/evals/**/*.ts'",
+    "data:lint": "eslint 'data/**/*.ts' 'scripts/validate-data.ts' 'packages/core/src/evals/**/*.ts'",
     "check:lockfile": "node scripts/check-lockfile.js",
     "clean": "node scripts/clean.js",
     "pre-commit": "node scripts/pre-commit.js"
diff --git a/packages/core/src/evals/config.ts b/packages/core/src/evals/config.ts
new file mode 100644
index 0000000000..c45d2089e1
--- /dev/null
+++ b/packages/core/src/evals/config.ts
@@ -0,0 +1,100 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { OptimizationDirection } from './types.js';
+
+/**
+ * Configuration for the Tool Alignment objective (The Accuracy Dimension).
+ */
+export interface AlignmentConfig {
+  /**
+   * Whether to increase or decrease the alignment score.
+   */
+  direction: OptimizationDirection.MAXIMIZE;
+
+  /**
+   * The relative importance of accuracy vs other objectives in the Pareto frontier.
+   */
+  weight: number;
+
+  /**
+   * Strongest negative signal (0.0): used when model falls into a known shell trap.
+   */
+  hardFailureScore: number;
+
+  /**
+   * Neutral negative signal (0.1): used when model fails to produce a valid tool call.
+   */
+  invalidResponseScore: number;
+
+  /**
+   * Partial positive signal (0.4): model chose the right tool but hallucinated arguments.
+   */
+  toolNameMatchOnlyScore: number;
+
+  /**
+   * Maximum positive signal (1.0): model matched the golden signature perfectly.
+   */
+  functionalSuccessScore: number;
+}
+
+/**
+ * Configuration for the Token Frugality objective (The Density Dimension).
+ */
+export interface FrugalityConfig {
+  /**
+   * Whether to increase or decrease the token count.
+   */
+  direction: OptimizationDirection.MINIMIZE;
+
+  /**
+   * Importance of brevity relative to accuracy.
+   */
+  weight: number;
+
+  /**
+   * The 'conversational budget' - max chars of non-tool text allowed before penalty.
+   */
+  chattyThresholdChars: number;
+
+  /**
+   * Amount subtracted from the functional score if the model is too verbose.
+   */
+  chattyPenalty: number;
+}
+
+/**
+ * Global evaluation configuration for multi-objective optimization.
+ */
+export interface EvalConfig {
+  objectives: {
+    alignment: AlignmentConfig;
+    frugality: FrugalityConfig;
+  };
+}
+
+/**
+ * Default weights and thresholds for the Genetic-Pareto (GEPA) engine.
+ * These constants drive the 'Selection Pressure' that evolves the prompt.
+ */
+export const DEFAULT_EVAL_CONFIG: EvalConfig = {
+  objectives: {
+    alignment: {
+      direction: OptimizationDirection.MAXIMIZE,
+      weight: 1.0, // PRIMARY: Accuracy cannot be sacrificed.
+      hardFailureScore: 0.0,
+      invalidResponseScore: 0.1,
+      toolNameMatchOnlyScore: 0.4,
+      functionalSuccessScore: 1.0,
+    },
+    frugality: {
+      direction: OptimizationDirection.MINIMIZE,
+      weight: 0.6, // SECONDARY: Reward brevity once accuracy is high.
+      chattyThresholdChars: 30, // Budget for 'I have updated the file' etc.
+      chattyPenalty: 0.2, // Penalty creates a 'Reward Gap' for concise models.
+    },
+  },
+};
diff --git a/packages/core/src/evals/metrics/tokenFrugality.test.ts b/packages/core/src/evals/metrics/tokenFrugality.test.ts
new file mode 100644
index 0000000000..599c23fdd4
--- /dev/null
+++ b/packages/core/src/evals/metrics/tokenFrugality.test.ts
@@ -0,0 +1,41 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { evaluateTokenFrugality } from './tokenFrugality.js';
+import { MetricObjective, OptimizationDirection } from '../types.js';
+
+describe('evaluateTokenFrugality', () => {
+  it('should return the raw character count as the score', () => {
+    const prediction = { output_text: 'Hello' };
+    const result = evaluateTokenFrugality(prediction);
+    expect(result.score).toBe(5);
+    expect(result.objective).toBe(MetricObjective.FRUGALITY);
+    expect(result.direction).toBe(OptimizationDirection.MINIMIZE);
+    expect(result.reason).toContain('contains 5 characters');
+  });
+
+  it('should flag if response is succinct (under threshold)', () => {
+    const prediction = { output_text: 'Short' };
+    const result = evaluateTokenFrugality(prediction);
+    expect(result.metadata?.['isOverThreshold']).toBe(false);
+    expect(result.reason).toContain('Succinct response');
+  });
+
+  it('should flag if response exceeds chatter threshold', () => {
+    const prediction = { output_text: 'a'.repeat(50) };
+    const result = evaluateTokenFrugality(prediction);
+    expect(result.metadata?.['isOverThreshold']).toBe(true);
+    expect(result.reason).toContain('Exceeds threshold');
+  });
+
+  it('should handle missing output text as 0 chars', () => {
+    const prediction = {};
+    const result = evaluateTokenFrugality(prediction);
+    expect(result.score).toBe(0);
+    expect(result.reason).toContain('contains 0 characters');
+  });
+});
diff --git a/packages/core/src/evals/metrics/tokenFrugality.ts b/packages/core/src/evals/metrics/tokenFrugality.ts
new file mode 100644
index 0000000000..683f972f57
--- /dev/null
+++ b/packages/core/src/evals/metrics/tokenFrugality.ts
@@ -0,0 +1,49 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { debugLogger } from '../../utils/debugLogger.js';
+import { DEFAULT_EVAL_CONFIG } from '../config.js';
+import { MetricObjective, OptimizationDirection } from '../types.js';
+import type { MetricResult } from '../types.js';
+
+/**
+ * Evaluates the frugality of a model's response by measuring total character count.
+ * Focuses on reducing conversational noise ("chatter").
+ */
+export function evaluateTokenFrugality(
+  prediction: { output_text?: string },
+  config = DEFAULT_EVAL_CONFIG.objectives.frugality,
+): MetricResult {
+  const chatter = prediction.output_text ?? '';
+  const chatterLength = chatter.length;
+
+  debugLogger.debug(
+    `[Eval:Frugality] Measuring output text length: ${chatterLength} chars.`,
+  );
+
+  // In Genetic-Pareto, the raw score (length) is the value to be MINIMIZED.
+  // We provide the raw count as the score, and the direction tells the optimizer how to handle it.
+
+  let reason = `Response contains ${chatterLength} characters of non-tool text.`;
+
+  if (chatterLength > config.chattyThresholdChars) {
+    reason += ` (Exceeds threshold of ${config.chattyThresholdChars})`;
+  } else {
+    reason += ' (Succinct response)';
+  }
+
+  return {
+    score: chatterLength,
+    objective: MetricObjective.FRUGALITY,
+    direction: OptimizationDirection.MINIMIZE,
+    reason,
+    metadata: {
+      charCount: chatterLength,
+      threshold: config.chattyThresholdChars,
+      isOverThreshold: chatterLength > config.chattyThresholdChars,
+    },
+  };
+}
diff --git a/packages/core/src/evals/metrics/toolAlignment.test.ts b/packages/core/src/evals/metrics/toolAlignment.test.ts
new file mode 100644
index 0000000000..9f624e8dc0
--- /dev/null
+++ b/packages/core/src/evals/metrics/toolAlignment.test.ts
@@ -0,0 +1,110 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { evaluateToolAlignment } from './toolAlignment.js';
+import { MetricObjective, OptimizationDirection } from '../types.js';
+import type { Scenario } from '../schema.js';
+
+describe('evaluateToolAlignment', () => {
+  const mockScenario: Scenario = {
+    id: 'test-scenario',
+    metadata: { tags: ['test'], created_at: '2026-03-02' },
+    input: { user_query: 'test query' },
+    expected: {
+      tool_calls: [{ name: 'read_file', arguments: { file_path: 'test.ts' } }],
+      rationale: 'Testing alignment',
+    },
+    negatives: [
+      {
+        tool_calls: [
+          { name: 'run_shell_command', arguments: { command: 'cat test.ts' } },
+        ],
+        reason: 'Avoid shell',
+        severity: 'high',
+      },
+    ],
+  };
+
+  it('should return 1.0 for a perfect match', () => {
+    const prediction = {
+      tool_calls: [{ name: 'read_file', arguments: { file_path: 'test.ts' } }],
+    };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(1.0);
+    expect(result.objective).toBe(MetricObjective.ALIGNMENT);
+    expect(result.direction).toBe(OptimizationDirection.MAXIMIZE);
+    expect(result.reason).toContain('Functional Success');
+  });
+
+  it('should return 0.0 for a hard failure (negative match)', () => {
+    const prediction = {
+      tool_calls: [
+        { name: 'run_shell_command', arguments: { command: 'cat test.ts' } },
+      ],
+    };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(0.0);
+    expect(result.reason).toContain('Hard Failure');
+    expect(result.metadata?.['matchedNegativeReason']).toBe('Avoid shell');
+  });
+
+  it('should return 0.1 for an incorrect tool selection', () => {
+    const prediction = {
+      tool_calls: [
+        {
+          name: 'write_file',
+          arguments: { file_path: 'test.ts', content: 'test' },
+        },
+      ],
+    };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(0.1);
+    expect(result.reason).toContain('wrong tool');
+  });
+
+  it('should return 0.4 for correct tool but wrong arguments', () => {
+    const prediction = {
+      tool_calls: [{ name: 'read_file', arguments: { file_path: 'wrong.ts' } }],
+    };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(0.4);
+    expect(result.reason).toContain('arguments are incorrect');
+  });
+
+  it('should return 0.1 for an empty tool call list', () => {
+    const prediction = { tool_calls: [] };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(0.1);
+    expect(result.reason).toContain('failed to produce any tool calls');
+  });
+
+  it('should ignore argument key order when matching', () => {
+    const scenario: Scenario = {
+      ...mockScenario,
+      expected: {
+        ...mockScenario.expected,
+        tool_calls: [
+          {
+            name: 'read_file',
+            arguments: { file_path: 'test.ts', encoding: 'utf8' },
+          },
+        ],
+      },
+    };
+    const prediction = {
+      tool_calls: [
+        {
+          name: 'read_file',
+          arguments: { encoding: 'utf8', file_path: 'test.ts' },
+        },
+      ],
+    };
+    const result = evaluateToolAlignment(prediction, scenario);
+    expect(result.score).toBe(1.0);
+    expect(result.reason).toContain('Functional Success');
+  });
+});
diff --git a/packages/core/src/evals/metrics/toolAlignment.ts b/packages/core/src/evals/metrics/toolAlignment.ts
new file mode 100644
index 0000000000..d48f4dcce9
--- /dev/null
+++ b/packages/core/src/evals/metrics/toolAlignment.ts
@@ -0,0 +1,148 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { debugLogger } from '../../utils/debugLogger.js';
+import type { Scenario, ToolCall } from '../schema.js';
+import { DEFAULT_EVAL_CONFIG } from '../config.js';
+import { MetricObjective, OptimizationDirection } from '../types.js';
+import type { MetricResult } from '../types.js';
+
+/**
+ * Evaluates the alignment of a model's predicted tool calls against a golden scenario.
+ * Focuses on accuracy and shell avoidance.
+ */
+export function evaluateToolAlignment(
+  prediction: { tool_calls: ToolCall[] },
+  example: Scenario,
+  config = DEFAULT_EVAL_CONFIG.objectives.alignment,
+): MetricResult {
+  const { tool_calls: predictedCalls } = prediction;
+  const { expected, negatives, id: scenarioId } = example;
+
+  debugLogger.debug(`[Eval:${scenarioId}] Evaluating tool alignment...`);
+
+  // 1. Check for Hard Failures (Explicit Negatives)
+  for (const negative of negatives) {
+    const isNegativeMatch = negative.tool_calls.every((negCall: ToolCall) =>
+      predictedCalls.some(
+        (predCall: ToolCall) =>
+          predCall.name === negCall.name &&
+          areArgsMatching(negCall.arguments, predCall.arguments),
+      ),
+    );
+
+    if (isNegativeMatch && negative.tool_calls.length > 0) {
+      debugLogger.debug(
+        `[Eval:${scenarioId}] Hard Failure: Matched negative pattern.`,
+      );
+      return {
+        score: config.hardFailureScore,
+        objective: MetricObjective.ALIGNMENT,
+        direction: OptimizationDirection.MAXIMIZE,
+        reason: `Hard Failure: ${negative.reason}`,
+        metadata: {
+          matchedNegativeReason: negative.reason,
+          severity: negative.severity,
+        },
+      };
+    }
+  }
+
+  // 2. Structural Check
+  if (predictedCalls.length === 0) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Invalid Response: No tool calls found.`,
+    );
+    return {
+      score: config.invalidResponseScore,
+      objective: MetricObjective.ALIGNMENT,
+      direction: OptimizationDirection.MAXIMIZE,
+      reason: 'Model failed to produce any tool calls.',
+    };
+  }
+
+  // 3. Functional Alignment Check
+  const expectedCalls = expected.tool_calls;
+
+  // Check if all expected tool names are present
+  const namesMatch = expectedCalls.every((exp: ToolCall) =>
+    predictedCalls.some((pred: ToolCall) => pred.name === exp.name),
+  );
+
+  if (!namesMatch) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Failure: Incorrect tool selection.`,
+    );
+    return {
+      score: config.invalidResponseScore,
+      objective: MetricObjective.ALIGNMENT,
+      direction: OptimizationDirection.MAXIMIZE,
+      reason: 'Model selected the wrong tool(s).',
+    };
+  }
+
+  // Check for Argument Precision
+  const argsMatch = expectedCalls.every((exp: ToolCall) =>
+    predictedCalls.some(
+      (pred: ToolCall) =>
+        pred.name === exp.name &&
+        areArgsMatching(exp.arguments, pred.arguments),
+    ),
+  );
+
+  if (!argsMatch) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Partial Success: Right tool, wrong arguments.`,
+    );
+    return {
+      score: config.toolNameMatchOnlyScore,
+      objective: MetricObjective.ALIGNMENT,
+      direction: OptimizationDirection.MAXIMIZE,
+      reason: 'Correct tool selected, but arguments are incorrect or missing.',
+    };
+  }
+
+  // 4. Perfect Success
+  debugLogger.debug(
+    `[Eval:${scenarioId}] Perfect Functional Alignment achieved.`,
+  );
+  return {
+    score: config.functionalSuccessScore,
+    objective: MetricObjective.ALIGNMENT,
+    direction: OptimizationDirection.MAXIMIZE,
+    reason:
+      'Functional Success: Tool and arguments align perfectly with golden scenario.',
+  };
+}
+
+/**
+ * Serializes a value to JSON with object keys sorted at every nesting level,
+ * so that two structurally equal values always yield the same string.
+ */
+function toCanonicalJson(value: unknown): string {
+  return JSON.stringify(value, (_key: string, node: unknown) => {
+    if (node === null || typeof node !== 'object' || Array.isArray(node)) {
+      return node;
+    }
+    // Rebuild plain objects with keys in sorted order; arrays keep their order.
+    return Object.fromEntries(
+      Object.entries(node).sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0)),
+    );
+  });
+}
+
+/**
+ * Deep equality check for tool arguments.
+ *
+ * Comparison follows JSON semantics and ignores object key order, so
+ * `{ a: 1, b: 2 }` and `{ b: 2, a: 1 }` are treated as the same arguments.
+ */
+function areArgsMatching(
+  expected: Record<string, unknown>,
+  predicted: Record<string, unknown>,
+): boolean {
+  return toCanonicalJson(expected) === toCanonicalJson(predicted);
+}
diff --git a/data/schema.ts b/packages/core/src/evals/schema.ts
similarity index 100%
rename from data/schema.ts
rename to packages/core/src/evals/schema.ts
diff --git a/packages/core/src/evals/types.ts b/packages/core/src/evals/types.ts
new file mode 100644
index 0000000000..f918588a71
--- /dev/null
+++ b/packages/core/src/evals/types.ts
@@ -0,0 +1,52 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Defines whether an objective should be increased or decreased during optimization.
+ */
+export enum OptimizationDirection {
+  MINIMIZE = 'minimize',
+  MAXIMIZE = 'maximize',
+}
+
+/**
+ * The specific dimensions being measured by the evaluation pipeline.
+ */
+export enum MetricObjective {
+  ALIGNMENT = 'alignment',
+  FRUGALITY = 'frugality',
+}
+
+/**
+ * Standardized result for any metric calculation.
+ * Designed for consumption by the Genetic-Pareto (GEPA) multi-objective function.
+ */
+export interface MetricResult {
+  /**
+   * The numeric score calculated by the metric.
+   */
+  score: number;
+
+  /**
+   * The specific objective this result corresponds to.
+   */
+  objective: MetricObjective;
+
+  /**
+   * Whether the goal is to increase or decrease this specific score.
+   */
+  direction: OptimizationDirection;
+
+  /**
+   * A human-readable (and optimizer-reflective) reason for the score.
+   */
+  reason: string;
+
+  /**
+   * Additional data points (e.g., char counts, matched negative IDs).
+   */
+  metadata?: Record<string, unknown>;
+}
diff --git a/scripts/validate-data.ts b/scripts/validate-data.ts
index 42dfbd103d..295179eb74 100644
--- a/scripts/validate-data.ts
+++ b/scripts/validate-data.ts
@@ -6,7 +6,7 @@
 
 import * as fs from 'node:fs';
 import * as path from 'node:path';
-import type { Scenario } from '../data/schema.ts';
+import type { Scenario } from '../packages/core/src/evals/schema.ts';
 
 const MANIFEST_FILE = 'data/manifest.json';
 const DEFAULT_DATA_DIR = 'data';