mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-06-12 20:37:08 -07:00
feat(prompt-optimization): implement multi-objective evaluation metrics
Established a Pareto-ready evaluation foundation for the Genetic-Pareto (GEPA)
optimizer, supporting simultaneous optimization of accuracy and density.
Key improvements:
- Core Architecture: Defined standardized `MetricResult` and `OptimizationDirection`
types in `packages/core/src/evals/types.ts` to support multi-objective fitness.
- Centralized Config: Implemented `packages/core/src/evals/config.ts` with tunable
weights and detailed documentation for scoring gradients.
- Tool Alignment Metric: Created `metrics/toolAlignment.ts` to measure functional
accuracy, argument precision, and explicit shell avoidance.
- Token Frugality Metric: Created `metrics/tokenFrugality.ts` to measure and
penalize conversational noise ("chatter") using a configurable threshold.
- Verification Suite: Added comprehensive unit tests for all metrics, achieving
100% coverage of scoring logic and gradient steps.
- Project Integration: Relocated `schema.ts` to the core package for build safety,
updated the data validator, and extended project-wide lint/format scripts.
This commit is contained in:
+2
-2
@@ -62,8 +62,8 @@
|
||||
"release:version": "node scripts/version.js",
|
||||
"telemetry": "node scripts/telemetry.js",
|
||||
"data:validate": "tsx scripts/validate-data.ts",
|
||||
"data:format": "prettier --write 'data/**/*.ts' 'data/*.json' 'scripts/validate-data.ts'",
|
||||
"data:lint": "eslint 'data/**/*.ts' 'scripts/validate-data.ts'",
|
||||
"data:format": "prettier --write 'data/**/*.ts' 'data/*.json' 'scripts/validate-data.ts' 'packages/core/src/evals/**/*.ts'",
|
||||
"data:lint": "eslint 'data/**/*.ts' 'scripts/validate-data.ts' 'packages/core/src/evals/**/*.ts'",
|
||||
"check:lockfile": "node scripts/check-lockfile.js",
|
||||
"clean": "node scripts/clean.js",
|
||||
"pre-commit": "node scripts/pre-commit.js"
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { OptimizationDirection } from './types.js';
|
||||
|
||||
/**
|
||||
* Configuration for the Tool Alignment objective (The Accuracy Dimension).
|
||||
*/
|
||||
export interface AlignmentConfig {
|
||||
/**
|
||||
* Whether to increase or decrease the alignment score.
|
||||
*/
|
||||
direction: OptimizationDirection.MAXIMIZE;
|
||||
|
||||
/**
|
||||
* The relative importance of accuracy vs other objectives in the Pareto frontier.
|
||||
*/
|
||||
weight: number;
|
||||
|
||||
/**
|
||||
* Strongest negative signal (0.0): used when model falls into a known shell trap.
|
||||
*/
|
||||
hardFailureScore: number;
|
||||
|
||||
/**
|
||||
* Neutral negative signal (0.1): used when model fails to produce a valid tool call.
|
||||
*/
|
||||
invalidResponseScore: number;
|
||||
|
||||
/**
|
||||
* Partial positive signal (0.4): model chose the right tool but hallucinated arguments.
|
||||
*/
|
||||
toolNameMatchOnlyScore: number;
|
||||
|
||||
/**
|
||||
* Maximum positive signal (1.0): model matched the golden signature perfectly.
|
||||
*/
|
||||
functionalSuccessScore: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configuration for the Token Frugality objective (The Density Dimension).
|
||||
*/
|
||||
export interface FrugalityConfig {
|
||||
/**
|
||||
* Whether to increase or decrease the token count.
|
||||
*/
|
||||
direction: OptimizationDirection.MINIMIZE;
|
||||
|
||||
/**
|
||||
* Importance of brevity relative to accuracy.
|
||||
*/
|
||||
weight: number;
|
||||
|
||||
/**
|
||||
* The 'conversational budget' - max chars of non-tool text allowed before penalty.
|
||||
*/
|
||||
chattyThresholdChars: number;
|
||||
|
||||
/**
|
||||
* Amount subtracted from the functional score if the model is too verbose.
|
||||
*/
|
||||
chattyPenalty: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Global evaluation configuration for multi-objective optimization.
|
||||
*/
|
||||
export interface EvalConfig {
|
||||
objectives: {
|
||||
alignment: AlignmentConfig;
|
||||
frugality: FrugalityConfig;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Default weights and thresholds for the Genetic-Pareto (GEPA) engine.
|
||||
* These constants drive the 'Selection Pressure' that evolves the prompt.
|
||||
*/
|
||||
export const DEFAULT_EVAL_CONFIG: EvalConfig = {
|
||||
objectives: {
|
||||
alignment: {
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
weight: 1.0, // PRIMARY: Accuracy cannot be sacrificed.
|
||||
hardFailureScore: 0.0,
|
||||
invalidResponseScore: 0.1,
|
||||
toolNameMatchOnlyScore: 0.4,
|
||||
functionalSuccessScore: 1.0,
|
||||
},
|
||||
frugality: {
|
||||
direction: OptimizationDirection.MINIMIZE,
|
||||
weight: 0.6, // SECONDARY: Reward brevity once accuracy is high.
|
||||
chattyThresholdChars: 30, // Budget for 'I have updated the file' etc.
|
||||
chattyPenalty: 0.2, // Penalty creates a 'Reward Gap' for concise models.
|
||||
},
|
||||
},
|
||||
};
|
||||
@@ -0,0 +1,41 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { evaluateTokenFrugality } from './tokenFrugality.js';
|
||||
import { MetricObjective, OptimizationDirection } from '../types.js';
|
||||
|
||||
describe('evaluateTokenFrugality', () => {
|
||||
it('should return the raw character count as the score', () => {
|
||||
const prediction = { output_text: 'Hello' };
|
||||
const result = evaluateTokenFrugality(prediction);
|
||||
expect(result.score).toBe(5);
|
||||
expect(result.objective).toBe(MetricObjective.FRUGALITY);
|
||||
expect(result.direction).toBe(OptimizationDirection.MINIMIZE);
|
||||
expect(result.reason).toContain('contains 5 characters');
|
||||
});
|
||||
|
||||
it('should flag if response is succinct (under threshold)', () => {
|
||||
const prediction = { output_text: 'Short' };
|
||||
const result = evaluateTokenFrugality(prediction);
|
||||
expect(result.metadata?.['isOverThreshold']).toBe(false);
|
||||
expect(result.reason).toContain('Succinct response');
|
||||
});
|
||||
|
||||
it('should flag if response exceeds chatter threshold', () => {
|
||||
const prediction = { output_text: 'a'.repeat(50) };
|
||||
const result = evaluateTokenFrugality(prediction);
|
||||
expect(result.metadata?.['isOverThreshold']).toBe(true);
|
||||
expect(result.reason).toContain('Exceeds threshold');
|
||||
});
|
||||
|
||||
it('should handle missing output text as 0 chars', () => {
|
||||
const prediction = {};
|
||||
const result = evaluateTokenFrugality(prediction);
|
||||
expect(result.score).toBe(0);
|
||||
expect(result.reason).toContain('contains 0 characters');
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,49 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { debugLogger } from '../../utils/debugLogger.js';
|
||||
import { DEFAULT_EVAL_CONFIG } from '../config.js';
|
||||
import { MetricObjective, OptimizationDirection } from '../types.js';
|
||||
import type { MetricResult } from '../types.js';
|
||||
|
||||
/**
|
||||
* Evaluates the frugality of a model's response by measuring total character count.
|
||||
* Focuses on reducing conversational noise ("chatter").
|
||||
*/
|
||||
export function evaluateTokenFrugality(
|
||||
prediction: { output_text?: string },
|
||||
config = DEFAULT_EVAL_CONFIG.objectives.frugality,
|
||||
): MetricResult {
|
||||
const chatter = prediction.output_text ?? '';
|
||||
const chatterLength = chatter.length;
|
||||
|
||||
debugLogger.debug(
|
||||
`[Eval:Frugality] Measuring output text length: ${chatterLength} chars.`,
|
||||
);
|
||||
|
||||
// In Genetic-Pareto, the raw score (length) is the value to be MINIMIZED.
|
||||
// We provide the raw count as the score, and the direction tells the optimizer how to handle it.
|
||||
|
||||
let reason = `Response contains ${chatterLength} characters of non-tool text.`;
|
||||
|
||||
if (chatterLength > config.chattyThresholdChars) {
|
||||
reason += ` (Exceeds threshold of ${config.chattyThresholdChars})`;
|
||||
} else {
|
||||
reason += ' (Succinct response)';
|
||||
}
|
||||
|
||||
return {
|
||||
score: chatterLength,
|
||||
objective: MetricObjective.FRUGALITY,
|
||||
direction: OptimizationDirection.MINIMIZE,
|
||||
reason,
|
||||
metadata: {
|
||||
charCount: chatterLength,
|
||||
threshold: config.chattyThresholdChars,
|
||||
isOverThreshold: chatterLength > config.chattyThresholdChars,
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { evaluateToolAlignment } from './toolAlignment.js';
|
||||
import { MetricObjective, OptimizationDirection } from '../types.js';
|
||||
import type { Scenario } from '../schema.js';
|
||||
|
||||
describe('evaluateToolAlignment', () => {
|
||||
const mockScenario: Scenario = {
|
||||
id: 'test-scenario',
|
||||
metadata: { tags: ['test'], created_at: '2026-03-02' },
|
||||
input: { user_query: 'test query' },
|
||||
expected: {
|
||||
tool_calls: [{ name: 'read_file', arguments: { file_path: 'test.ts' } }],
|
||||
rationale: 'Testing alignment',
|
||||
},
|
||||
negatives: [
|
||||
{
|
||||
tool_calls: [
|
||||
{ name: 'run_shell_command', arguments: { command: 'cat test.ts' } },
|
||||
],
|
||||
reason: 'Avoid shell',
|
||||
severity: 'high',
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
it('should return 1.0 for a perfect match', () => {
|
||||
const prediction = {
|
||||
tool_calls: [{ name: 'read_file', arguments: { file_path: 'test.ts' } }],
|
||||
};
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(1.0);
|
||||
expect(result.objective).toBe(MetricObjective.ALIGNMENT);
|
||||
expect(result.direction).toBe(OptimizationDirection.MAXIMIZE);
|
||||
expect(result.reason).toContain('Functional Success');
|
||||
});
|
||||
|
||||
it('should return 0.0 for a hard failure (negative match)', () => {
|
||||
const prediction = {
|
||||
tool_calls: [
|
||||
{ name: 'run_shell_command', arguments: { command: 'cat test.ts' } },
|
||||
],
|
||||
};
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(0.0);
|
||||
expect(result.reason).toContain('Hard Failure');
|
||||
expect(result.metadata?.['matchedNegativeReason']).toBe('Avoid shell');
|
||||
});
|
||||
|
||||
it('should return 0.1 for an incorrect tool selection', () => {
|
||||
const prediction = {
|
||||
tool_calls: [
|
||||
{
|
||||
name: 'write_file',
|
||||
arguments: { file_path: 'test.ts', content: 'test' },
|
||||
},
|
||||
],
|
||||
};
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(0.1);
|
||||
expect(result.reason).toContain('wrong tool');
|
||||
});
|
||||
|
||||
it('should return 0.4 for correct tool but wrong arguments', () => {
|
||||
const prediction = {
|
||||
tool_calls: [{ name: 'read_file', arguments: { file_path: 'wrong.ts' } }],
|
||||
};
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(0.4);
|
||||
expect(result.reason).toContain('arguments are incorrect');
|
||||
});
|
||||
|
||||
it('should return 0.1 for an empty tool call list', () => {
|
||||
const prediction = { tool_calls: [] };
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(0.1);
|
||||
expect(result.reason).toContain('failed to produce any tool calls');
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,129 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { debugLogger } from '../../utils/debugLogger.js';
|
||||
import type { Scenario, ToolCall } from '../schema.js';
|
||||
import { DEFAULT_EVAL_CONFIG } from '../config.js';
|
||||
import { MetricObjective, OptimizationDirection } from '../types.js';
|
||||
import type { MetricResult } from '../types.js';
|
||||
|
||||
/**
|
||||
* Evaluates the alignment of a model's predicted tool calls against a golden scenario.
|
||||
* Focuses on accuracy and shell avoidance.
|
||||
*/
|
||||
export function evaluateToolAlignment(
|
||||
prediction: { tool_calls: ToolCall[] },
|
||||
example: Scenario,
|
||||
config = DEFAULT_EVAL_CONFIG.objectives.alignment,
|
||||
): MetricResult {
|
||||
const { tool_calls: predictedCalls } = prediction;
|
||||
const { expected, negatives, id: scenarioId } = example;
|
||||
|
||||
debugLogger.debug(`[Eval:${scenarioId}] Evaluating tool alignment...`);
|
||||
|
||||
// 1. Check for Hard Failures (Explicit Negatives)
|
||||
for (const negative of negatives) {
|
||||
const isNegativeMatch = negative.tool_calls.every((negCall: ToolCall) =>
|
||||
predictedCalls.some(
|
||||
(predCall: ToolCall) =>
|
||||
predCall.name === negCall.name &&
|
||||
areArgsMatching(negCall.arguments, predCall.arguments),
|
||||
),
|
||||
);
|
||||
|
||||
if (isNegativeMatch && negative.tool_calls.length > 0) {
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Hard Failure: Matched negative pattern.`,
|
||||
);
|
||||
return {
|
||||
score: config.hardFailureScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason: `Hard Failure: ${negative.reason}`,
|
||||
metadata: {
|
||||
matchedNegativeReason: negative.reason,
|
||||
severity: negative.severity,
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Structural Check
|
||||
if (predictedCalls.length === 0) {
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Invalid Response: No tool calls found.`,
|
||||
);
|
||||
return {
|
||||
score: config.invalidResponseScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason: 'Model failed to produce any tool calls.',
|
||||
};
|
||||
}
|
||||
|
||||
// 3. Functional Alignment Check
|
||||
const expectedCalls = expected.tool_calls;
|
||||
|
||||
// Check if all expected tool names are present
|
||||
const namesMatch = expectedCalls.every((exp: ToolCall) =>
|
||||
predictedCalls.some((pred: ToolCall) => pred.name === exp.name),
|
||||
);
|
||||
|
||||
if (!namesMatch) {
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Failure: Incorrect tool selection.`,
|
||||
);
|
||||
return {
|
||||
score: config.invalidResponseScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason: 'Model selected the wrong tool(s).',
|
||||
};
|
||||
}
|
||||
|
||||
// Check for Argument Precision
|
||||
const argsMatch = expectedCalls.every((exp: ToolCall) =>
|
||||
predictedCalls.some(
|
||||
(pred: ToolCall) =>
|
||||
pred.name === exp.name &&
|
||||
areArgsMatching(exp.arguments, pred.arguments),
|
||||
),
|
||||
);
|
||||
|
||||
if (!argsMatch) {
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Partial Success: Right tool, wrong arguments.`,
|
||||
);
|
||||
return {
|
||||
score: config.toolNameMatchOnlyScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason: 'Correct tool selected, but arguments are incorrect or missing.',
|
||||
};
|
||||
}
|
||||
|
||||
// 4. Perfect Success
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Perfect Functional Alignment achieved.`,
|
||||
);
|
||||
return {
|
||||
score: config.functionalSuccessScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason:
|
||||
'Functional Success: Tool and arguments align perfectly with golden scenario.',
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Deep equality check for tool arguments.
|
||||
*/
|
||||
function areArgsMatching(
|
||||
expected: Record<string, unknown>,
|
||||
predicted: Record<string, unknown>,
|
||||
): boolean {
|
||||
return JSON.stringify(expected) === JSON.stringify(predicted);
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
/**
|
||||
* Defines whether an objective should be increased or decreased during optimization.
|
||||
*/
|
||||
export enum OptimizationDirection {
|
||||
MINIMIZE = 'minimize',
|
||||
MAXIMIZE = 'maximize',
|
||||
}
|
||||
|
||||
/**
|
||||
* The specific dimensions being measured by the evaluation pipeline.
|
||||
*/
|
||||
export enum MetricObjective {
|
||||
ALIGNMENT = 'alignment',
|
||||
FRUGALITY = 'frugality',
|
||||
}
|
||||
|
||||
/**
|
||||
* Standardized result for any metric calculation.
|
||||
* Designed for consumption by the Genetic-Pareto (GEPA) multi-objective function.
|
||||
*/
|
||||
export interface MetricResult {
|
||||
/**
|
||||
* The numeric score calculated by the metric.
|
||||
*/
|
||||
score: number;
|
||||
|
||||
/**
|
||||
* The specific objective this result corresponds to.
|
||||
*/
|
||||
objective: MetricObjective;
|
||||
|
||||
/**
|
||||
* Whether the goal is to increase or decrease this specific score.
|
||||
*/
|
||||
direction: OptimizationDirection;
|
||||
|
||||
/**
|
||||
* A human-readable (and optimizer-reflective) reason for the score.
|
||||
*/
|
||||
reason: string;
|
||||
|
||||
/**
|
||||
* Additional data points (e.g., char counts, matched negative IDs).
|
||||
*/
|
||||
metadata?: Record<string, unknown>;
|
||||
}
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
import * as fs from 'node:fs';
|
||||
import * as path from 'node:path';
|
||||
import type { Scenario } from '../data/schema.ts';
|
||||
import type { Scenario } from '../packages/core/src/evals/schema.ts';
|
||||
|
||||
const MANIFEST_FILE = 'data/manifest.json';
|
||||
const DEFAULT_DATA_DIR = 'data';
|
||||
|
||||
Reference in New Issue
Block a user