Files
gemini-cli/scripts/optimization/evals/config.ts
T
Abhijit Balaji 9bdf5d5995 feat(optimization): consolidate extraction pipeline and metrics
- Flatten directory structure by moving masking and evals to scripts root.
- Merge evaluation metrics into scripts/optimization/evals.
- Restore and verify extraction tests for the new structure.
2026-03-04 15:14:45 -08:00

107 lines
2.7 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
 * Scoring knobs for the Tool Alignment objective (the accuracy dimension).
 *
 * Besides the objective's weight, every field is the score awarded for one
 * outcome tier. The value quoted on each field is the one used by
 * `DEFAULT_EVAL_CONFIG`.
 */
export interface AlignmentConfig {
  /** Relative importance of accuracy versus the other objectives in the Pareto frontier. */
  weight: number;

  /** Strongest negative signal (default 0.0): the model fell into a known shell trap. */
  hardFailureScore: number;

  /** Neutral negative signal (default 0.1): the model failed to produce a valid tool call. */
  invalidResponseScore: number;

  /** Partial positive signal (default 0.4): the right tool was chosen but its arguments were hallucinated. */
  toolNameMatchOnlyScore: number;

  /** Maximum positive signal (default 1.0): the model matched the golden signature perfectly. */
  functionalSuccessScore: number;
}
/**
 * Scoring knobs for the Brevity objective (the density dimension).
 *
 * Brevity is scored with a word-count step function so that GEPA receives a
 * high-contrast signal: each tier pairs a word-count threshold with the score
 * awarded to responses that land in that tier. The value quoted on each field
 * is the one used by `DEFAULT_EVAL_CONFIG`.
 */
export interface BrevityConfig {
  /** Importance of brevity relative to accuracy. */
  weight: number;

  /** Tier 1 threshold: response is perfectly succinct (e.g., <= 10 words). */
  succinctThresholdWords: number;
  /** Tier 1 score (default 1.0). */
  succinctScore: number;

  /** Tier 2 threshold: response is acceptable but slightly verbose (e.g., <= 25 words). */
  acceptableThresholdWords: number;
  /** Tier 2 score (default 0.7). */
  acceptableScore: number;

  /** Tier 3 threshold: response is verbose (e.g., <= 50 words). */
  verboseThresholdWords: number;
  /** Tier 3 score (default 0.4). */
  verboseScore: number;

  /** Tier 4 score (default 0.1): response is very heavy (e.g., > 50 words). */
  heavyScore: number;
}
/**
 * Top-level evaluation configuration for multi-objective optimization.
 *
 * Groups the per-objective settings under a single `objectives` key.
 */
export interface EvalConfig {
  objectives: {
    /** Settings for the Tool Alignment (accuracy) objective. */
    alignment: AlignmentConfig;

    /** Settings for the Brevity (density) objective. */
    brevity: BrevityConfig;
  };
}
/**
 * Default Tool Alignment scores: the accuracy half of `DEFAULT_EVAL_CONFIG`.
 */
const defaultAlignmentObjective = {
  // PRIMARY objective: accuracy cannot be sacrificed.
  weight: 1.0,
  hardFailureScore: 0.0,
  invalidResponseScore: 0.1,
  toolNameMatchOnlyScore: 0.4,
  functionalSuccessScore: 1.0,
};

/**
 * Default Brevity thresholds and scores: the density half of
 * `DEFAULT_EVAL_CONFIG`.
 */
const defaultBrevityObjective = {
  // SECONDARY objective: reward brevity once accuracy is high.
  weight: 0.6,
  succinctThresholdWords: 10,
  succinctScore: 1.0,
  acceptableThresholdWords: 25,
  acceptableScore: 0.7,
  verboseThresholdWords: 50,
  verboseScore: 0.4,
  // Brevity is never hard-zeroed, so gradient improvement stays possible.
  heavyScore: 0.1,
};

/**
 * Default weights and thresholds for the Genetic-Pareto (GEPA) engine.
 *
 * These constants drive the "selection pressure" that evolves the prompt.
 * GEPA always MAXIMIZES, so a higher score represents better performance.
 */
export const DEFAULT_EVAL_CONFIG: EvalConfig = {
  objectives: {
    alignment: defaultAlignmentObjective,
    brevity: defaultBrevityObjective,
  },
};