Files
gemini-cli/scripts/optimization/lib/evals/config.ts
T
Abhijit Balaji 59d377e5e0 feat(optimization): implement manifest-driven extraction pipeline
- Implement `extract.ts` with robust character-aware parsing for snippets and tools.
- Consolidate research dependencies by moving `@ax-llm/ax` to root `optionalDependencies`.
- Relocate evaluation logic from `packages/core` to `scripts/optimization/lib/evals` to keep the production core lean.
- Add `optimization_targets` to `data/manifest.json` as the single source of truth for the pipeline.
- Implement comprehensive unit tests for extraction and variable masking with 100% pass rate.
- Update global config and linting rules to support the new optimization infrastructure.
2026-03-04 14:25:17 -08:00

107 lines
2.7 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* Configuration for the Tool Alignment objective (The Accuracy Dimension).
*/
export interface AlignmentConfig {
/**
* The relative importance of accuracy vs other objectives in the Pareto frontier.
*/
weight: number;
/**
* Strongest negative signal (0.0): used when model falls into a known shell trap.
*/
hardFailureScore: number;
/**
* Neutral negative signal (0.1): used when model fails to produce a valid tool call.
*/
invalidResponseScore: number;
/**
* Partial positive signal (0.4): model chose the right tool but hallucinated arguments.
*/
toolNameMatchOnlyScore: number;
/**
* Maximum positive signal (1.0): model matched the golden signature perfectly.
*/
functionalSuccessScore: number;
}
/**
* Configuration for the Brevity objective (The Density Dimension).
* Uses a word-count step-function to provide high-contrast signal for GEPA.
*/
export interface BrevityConfig {
/**
* Importance of brevity relative to accuracy.
*/
weight: number;
/**
* TIER 1: Response is perfectly succinct (e.g., <= 10 words).
*/
succinctThresholdWords: number;
succinctScore: number; // 1.0
/**
* TIER 2: Response is acceptable but slightly verbose (e.g., <= 25 words).
*/
acceptableThresholdWords: number;
acceptableScore: number; // 0.7
/**
* TIER 3: Response is verbose (e.g., <= 50 words).
*/
verboseThresholdWords: number;
verboseScore: number; // 0.4
/**
* TIER 4: Response is very heavy (e.g., > 50 words).
*/
heavyScore: number; // 0.1
}
/**
* Global evaluation configuration for multi-objective optimization.
*/
export interface EvalConfig {
objectives: {
alignment: AlignmentConfig;
brevity: BrevityConfig;
};
}
/**
* Default weights and thresholds for the Genetic-Pareto (GEPA) engine.
* These constants drive the 'Selection Pressure' that evolves the prompt.
* GEPA always MAXIMIZES, so higher scores represent better performance.
*/
export const DEFAULT_EVAL_CONFIG: EvalConfig = {
objectives: {
alignment: {
weight: 1.0, // PRIMARY: Accuracy cannot be sacrificed.
hardFailureScore: 0.0,
invalidResponseScore: 0.1,
toolNameMatchOnlyScore: 0.4,
functionalSuccessScore: 1.0,
},
brevity: {
weight: 0.6, // SECONDARY: Reward brevity once accuracy is high.
succinctThresholdWords: 10,
succinctScore: 1.0,
acceptableThresholdWords: 25,
acceptableScore: 0.7,
verboseThresholdWords: 50,
verboseScore: 0.4,
heavyScore: 0.1, // Never hard-zero brevity to allow gradient improvement.
},
},
};