mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-28 12:42:42 -07:00
9bdf5d5995
- Flatten directory structure by moving masking and evals to scripts root. - Merge evaluation metrics into scripts/optimization/evals. - Restore and verify extraction tests for the new structure.
125 lines
3.6 KiB
TypeScript
125 lines
3.6 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2026 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js';
|
|
import type { Scenario, ToolCall } from '../schema.js';
|
|
import { DEFAULT_EVAL_CONFIG } from '../config.js';
|
|
import { MetricObjective } from '../types.js';
|
|
import type { MetricResult } from '../types.js';
|
|
|
|
/**
 * Scores a model's predicted tool calls against a golden scenario for the
 * ALIGNMENT objective.
 *
 * The checks run in order and the first one that applies decides the result:
 *  1. Any non-empty negative pattern whose calls are all present in the
 *     prediction (same name and matching arguments) -> `config.hardFailureScore`.
 *  2. No predicted tool calls at all -> `config.invalidResponseScore`.
 *  3. Some expected tool name is absent from the prediction
 *     -> `config.invalidResponseScore`.
 *  4. Every expected name is present, but some expected call has no predicted
 *     call with the same name and matching arguments
 *     -> `config.toolNameMatchOnlyScore`.
 *  5. Otherwise -> `config.functionalSuccessScore`.
 *
 * Matching is existential: each expected/negative call only needs *some*
 * predicted call that matches it, so additional predicted calls do not lower
 * the score and one predicted call may satisfy several identical expectations.
 *
 * @param prediction - The model output; only `tool_calls` is read.
 * @param example - The golden scenario (`expected`, `negatives` and `id` are read).
 * @param config - Score values for each outcome; defaults to the alignment
 *   objective of `DEFAULT_EVAL_CONFIG`.
 * @returns The metric result with score, objective and a human-readable reason.
 */
export function evaluateToolAlignment(
  prediction: { tool_calls: ToolCall[] },
  example: Scenario,
  config = DEFAULT_EVAL_CONFIG.objectives.alignment,
): MetricResult {
  const predicted = prediction.tool_calls;
  const logPrefix = `[Eval:${example.id}]`;
  const objective = MetricObjective.ALIGNMENT;

  // True when the prediction contains a call with the wanted name and,
  // if requested, arguments matching the wanted call's arguments.
  const predictedHas = (wanted: ToolCall, compareArgs: boolean): boolean =>
    predicted.some(
      (candidate: ToolCall) =>
        candidate.name === wanted.name &&
        (!compareArgs ||
          areArgsMatching(wanted.arguments, candidate.arguments)),
    );

  debugLogger.debug(`${logPrefix} Evaluating tool alignment...`);

  // Step 1: explicit negatives are hard failures and take precedence over
  // every other outcome.
  for (const negative of example.negatives) {
    const pattern = negative.tool_calls;
    // An empty pattern would vacuously satisfy `every`, so it never counts.
    const patternTriggered =
      pattern.length > 0 &&
      pattern.every((forbidden: ToolCall) => predictedHas(forbidden, true));

    if (!patternTriggered) {
      continue;
    }

    debugLogger.debug(`${logPrefix} Hard Failure: Matched negative pattern.`);
    return {
      score: config.hardFailureScore,
      objective,
      reason: `Hard Failure: ${negative.reason}`,
      metadata: {
        matchedNegativeReason: negative.reason,
        severity: negative.severity,
      },
    };
  }

  // Step 2: a response without any tool call cannot be aligned.
  if (predicted.length === 0) {
    debugLogger.debug(`${logPrefix} Invalid Response: No tool calls found.`);
    return {
      score: config.invalidResponseScore,
      objective,
      reason: 'Model failed to produce any tool calls.',
    };
  }

  // Step 3: compare against the golden calls, first by name only.
  const goldenCalls = example.expected.tool_calls;

  const someToolMissing = goldenCalls.some(
    (golden: ToolCall) => !predictedHas(golden, false),
  );
  if (someToolMissing) {
    debugLogger.debug(`${logPrefix} Failure: Incorrect tool selection.`);
    return {
      score: config.invalidResponseScore,
      objective,
      reason: 'Model selected the wrong tool(s).',
    };
  }

  // Names are all present; now require matching arguments as well.
  const someArgsWrong = goldenCalls.some(
    (golden: ToolCall) => !predictedHas(golden, true),
  );
  if (someArgsWrong) {
    debugLogger.debug(
      `${logPrefix} Partial Success: Right tool, wrong arguments.`,
    );
    return {
      score: config.toolNameMatchOnlyScore,
      objective,
      reason: 'Correct tool selected, but arguments are incorrect or missing.',
    };
  }

  // Step 4: every expected call was found with matching arguments.
  debugLogger.debug(`${logPrefix} Perfect Functional Alignment achieved.`);
  return {
    score: config.functionalSuccessScore,
    objective,
    reason:
      'Functional Success: Tool and arguments align perfectly with golden scenario.',
  };
}
|
|
|
|
/**
 * Deep equality check for tool arguments.
 *
 * Both values are serialised to JSON with object keys sorted recursively, so
 * two argument objects that differ only in property order compare as equal.
 * Array element order remains significant. All other `JSON.stringify`
 * semantics are kept (for example, properties whose value is `undefined` are
 * dropped before comparison).
 *
 * @param expected - Arguments from the golden (or negative) tool call.
 * @param predicted - Arguments from the model's predicted tool call.
 * @returns `true` when both serialise to the same canonical JSON.
 */
function areArgsMatching(
  expected: Record<string, unknown>,
  predicted: Record<string, unknown>,
): boolean {
  return toCanonicalJson(expected) === toCanonicalJson(predicted);
}

/**
 * Serialises a value to JSON with the keys of every plain object emitted in
 * sorted order, making the output independent of property insertion order.
 */
function toCanonicalJson(value: unknown): string | undefined {
  return JSON.stringify(value, (_key: string, node: unknown): unknown => {
    // Primitives and arrays are serialised as-is; only objects need reordering.
    if (node === null || typeof node !== 'object' || Array.isArray(node)) {
      return node;
    }
    const sortedEntries = Object.entries(node).sort(([left], [right]) =>
      left < right ? -1 : left > right ? 1 : 0,
    );
    return Object.fromEntries(sortedEntries);
  });
}
|