mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-06-13 21:07:00 -07:00
feat(optimization): finalized iterative-surgical optimization suite (checkpoint)
This commit is contained in:
@@ -4,7 +4,7 @@
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js';
|
||||
import { debugLogger } from '../../../../packages/core/src/utils/debugLogger.js';
|
||||
import { DEFAULT_EVAL_CONFIG } from '../config.js';
|
||||
import { MetricObjective } from '../types.js';
|
||||
import type { MetricResult } from '../types.js';
|
||||
|
||||
@@ -7,77 +7,62 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { evaluateToolAlignment } from './toolAlignment.js';
|
||||
import { MetricObjective } from '../types.js';
|
||||
import type { Scenario } from '../schema.js';
|
||||
|
||||
describe('evaluateToolAlignment', () => {
|
||||
const mockScenario: Scenario = {
|
||||
const mockScenario = {
|
||||
id: 'test-scenario',
|
||||
metadata: { tags: ['test'], created_at: '2026-03-02' },
|
||||
input: { user_query: 'test query' },
|
||||
expected: {
|
||||
tool_calls: [{ name: 'read_file', arguments: { file_path: 'test.ts' } }],
|
||||
rationale: 'Testing alignment',
|
||||
tool_calls: [{ name: 'test_tool', arguments: { arg: 1 } }],
|
||||
},
|
||||
negatives: [
|
||||
{
|
||||
tool_calls: [
|
||||
{ name: 'run_shell_command', arguments: { command: 'cat test.ts' } },
|
||||
],
|
||||
reason: 'Avoid shell',
|
||||
tool_calls: [{ name: 'shell', arguments: { cmd: 'rm -rf' } }],
|
||||
reason: 'Matched negative shell pattern',
|
||||
severity: 'high',
|
||||
},
|
||||
}
|
||||
],
|
||||
};
|
||||
} as any;
|
||||
|
||||
it('should return 1.0 for a perfect match', () => {
|
||||
it('should return 1.0 for a perfect functional match', () => {
|
||||
const prediction = {
|
||||
tool_calls: [{ name: 'read_file', arguments: { file_path: 'test.ts' } }],
|
||||
tool_calls: [{ name: 'test_tool', arguments: { arg: 1 } }],
|
||||
};
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(1.0);
|
||||
expect(result.objective).toBe(MetricObjective.ALIGNMENT);
|
||||
expect(result.reason).toContain('Functional Success');
|
||||
});
|
||||
|
||||
it('should return 0.0 for a hard failure (negative match)', () => {
|
||||
const prediction = {
|
||||
tool_calls: [
|
||||
{ name: 'run_shell_command', arguments: { command: 'cat test.ts' } },
|
||||
],
|
||||
tool_calls: [{ name: 'shell', arguments: { cmd: 'rm -rf' } }],
|
||||
};
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(0.0);
|
||||
expect(result.reason).toContain('Hard Failure');
|
||||
expect(result.metadata?.['matchedNegativeReason']).toBe('Avoid shell');
|
||||
expect(result.reason).toContain('Matched negative shell pattern');
|
||||
});
|
||||
|
||||
it('should return 0.1 for an incorrect tool selection', () => {
|
||||
const prediction = {
|
||||
tool_calls: [
|
||||
{
|
||||
name: 'write_file',
|
||||
arguments: { file_path: 'test.ts', content: 'test' },
|
||||
},
|
||||
],
|
||||
tool_calls: [{ name: 'wrong_tool', arguments: { arg: 1 } }],
|
||||
};
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(0.1);
|
||||
expect(result.reason).toContain('wrong tool');
|
||||
});
|
||||
|
||||
it('should return 0.4 for correct tool but wrong arguments', () => {
|
||||
const prediction = {
|
||||
tool_calls: [{ name: 'read_file', arguments: { file_path: 'wrong.ts' } }],
|
||||
tool_calls: [{ name: 'test_tool', arguments: { arg: 999 } }],
|
||||
};
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(0.4);
|
||||
expect(result.reason).toContain('arguments are incorrect');
|
||||
});
|
||||
|
||||
it('should return 0.1 for an empty tool call list', () => {
|
||||
const prediction = { tool_calls: [] };
|
||||
const prediction = {
|
||||
tool_calls: [],
|
||||
};
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(0.1);
|
||||
expect(result.reason).toContain('failed to produce any tool calls');
|
||||
});
|
||||
});
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js';
|
||||
import { debugLogger } from '../../../../packages/core/src/utils/debugLogger.js';
|
||||
import type { Scenario, ToolCall } from '../schema.js';
|
||||
import { DEFAULT_EVAL_CONFIG } from '../config.js';
|
||||
import { MetricObjective } from '../types.js';
|
||||
@@ -12,7 +12,7 @@ import type { MetricResult } from '../types.js';
|
||||
|
||||
/**
|
||||
* Evaluates the alignment of a model's predicted tool calls against a golden scenario.
|
||||
* Focuses on accuracy and shell avoidance.
|
||||
* Focuses strictly on functional correctness (tool selection and argument precision).
|
||||
*/
|
||||
export function evaluateToolAlignment(
|
||||
prediction: { tool_calls: ToolCall[] },
|
||||
@@ -25,6 +25,7 @@ export function evaluateToolAlignment(
|
||||
debugLogger.debug(`[Eval:${scenarioId}] Evaluating tool alignment...`);
|
||||
|
||||
// 1. Check for Hard Failures (Explicit Negatives)
|
||||
// These are for specific "Forbidden" tool uses (e.g., using shell instead of read_file)
|
||||
for (const negative of negatives) {
|
||||
const isNegativeMatch = negative.tool_calls.every((negCall: ToolCall) =>
|
||||
predictedCalls.some(
|
||||
@@ -35,26 +36,17 @@ export function evaluateToolAlignment(
|
||||
);
|
||||
|
||||
if (isNegativeMatch && negative.tool_calls.length > 0) {
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Hard Failure: Matched negative pattern.`,
|
||||
);
|
||||
return {
|
||||
score: config.hardFailureScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
reason: `Hard Failure: ${negative.reason}`,
|
||||
metadata: {
|
||||
matchedNegativeReason: negative.reason,
|
||||
severity: negative.severity,
|
||||
},
|
||||
metadata: { matchedNegativeReason: negative.reason },
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Structural Check
|
||||
if (predictedCalls.length === 0) {
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Invalid Response: No tool calls found.`,
|
||||
);
|
||||
return {
|
||||
score: config.invalidResponseScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
@@ -71,9 +63,6 @@ export function evaluateToolAlignment(
|
||||
);
|
||||
|
||||
if (!namesMatch) {
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Failure: Incorrect tool selection.`,
|
||||
);
|
||||
return {
|
||||
score: config.invalidResponseScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
@@ -91,9 +80,6 @@ export function evaluateToolAlignment(
|
||||
);
|
||||
|
||||
if (!argsMatch) {
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Partial Success: Right tool, wrong arguments.`,
|
||||
);
|
||||
return {
|
||||
score: config.toolNameMatchOnlyScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
@@ -102,14 +88,10 @@ export function evaluateToolAlignment(
|
||||
}
|
||||
|
||||
// 4. Perfect Success
|
||||
debugLogger.debug(
|
||||
`[Eval:${scenarioId}] Perfect Functional Alignment achieved.`,
|
||||
);
|
||||
return {
|
||||
score: config.functionalSuccessScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
reason:
|
||||
'Functional Success: Tool and arguments align perfectly with golden scenario.',
|
||||
reason: 'Functional Success: Tool and arguments align perfectly.',
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user