mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-06-08 01:53:49 -07:00
feat(prompt-optimization): implement multi-objective evaluation metrics
Established a Pareto-ready evaluation foundation for the Genetic-Pareto (GEPA)
optimizer, supporting simultaneous optimization of accuracy and density.
Key improvements:
- Core Architecture: Defined standardized `MetricResult` and `OptimizationDirection`
types in `packages/core/src/evals/types.ts` to support multi-objective fitness.
- Centralized Config: Implemented `packages/core/src/evals/config.ts` with tunable
weights and detailed documentation for scoring gradients.
- Tool Alignment Metric: Created `metrics/toolAlignment.ts` to measure functional
accuracy, argument precision, and explicit shell avoidance.
- Token Frugality Metric: Created `metrics/tokenFrugality.ts` to measure and
penalize conversational noise ("chatter") using a configurable threshold.
- Verification Suite: Added comprehensive unit tests for all metrics, achieving
100% coverage of scoring logic and gradient steps.
- Project Integration: Relocated `schema.ts` to the core package for build safety,
updated the data validator, and extended project-wide lint/format scripts.
This commit is contained in:
@@ -1,49 +0,0 @@
|
||||
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
|
||||
|
||||
/**
 * The core data interface for the Tool Alignment Dataset.
 * Designed to be extensible for custom error reports and metrics.
 */
|
||||
|
||||
/**
 * A single tool invocation: the name of the tool and the arguments passed to it.
 */
export interface ToolCall {
  /** Name of the tool being invoked. */
  name: string;
  /**
   * Arguments supplied to the tool as a string-keyed record. Values are
   * `unknown`, so consumers must narrow them before use.
   */
  arguments: Record<string, unknown>;
}
|
||||
|
||||
/**
 * One failure mode attached to a scenario: the tool calls (and optionally the
 * text) that were produced, together with why they are wrong and how severe
 * the failure is.
 */
export interface NegativeExample {
  /** Optional identifier for this negative example. */
  id?: string;
  /** The tool calls that make up this failure. */
  tool_calls: ToolCall[];
  /** For "too chatty" or "hallucination" failures. */
  output_text?: string;
  /** e.g., "Defaulted to shell 'cat'", "Included conversational filler". */
  reason: string;
  /** Helps the optimizer prioritize fixes. */
  severity: 'low' | 'medium' | 'high';
}
|
||||
|
||||
/**
 * One dataset entry: the input given to the model, the expected ("golden")
 * tool calls, and the known failure modes for that input.
 */
export interface Scenario {
  /** Unique identifier (e.g., 'read_file-01'). */
  id: string;
  /** Descriptive information about the scenario itself. */
  metadata: {
    /** e.g., ['tool-alignment', 'shell-avoidance']. */
    tags: string[];
    /** Creation timestamp as a string; the format is not constrained by this type. */
    created_at: string;
    /** To handle platform-specific shell variations. */
    platform?: 'darwin' | 'linux' | 'win32';
    /** Placeholder for future tracking. */
    model_info?: {
      name?: string;
      version?: string;
    };
  };
  /** What the model is given. */
  input: {
    /** The user's request text. */
    user_query: string;
    /** Optional workspace context accompanying the query. */
    context?: {
      current_file?: string;
      directory_structure?: string[];
    };
  };
  /** The reference answer for this scenario. */
  expected: {
    /** The tool calls the model is expected to make. */
    tool_calls: ToolCall[];
    /** Why this is the 'Golden' choice. */
    rationale: string;
  };
  /** Array of multiple failure modes. */
  negatives: NegativeExample[];
}
|
||||
Reference in New Issue
Block a user