mirror of https://github.com/google-gemini/gemini-cli.git
synced 2026-05-13 13:22:35 -07:00
148 lines · 4.6 KiB · TypeScript
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
import { describe, expect } from 'vitest';
import {
  componentEvalTest,
  type ComponentEvalCase,
} from './component-test-helper.js';
import { type EvalPolicy } from './test-helper.js';
import { SnapshotGenerator } from '@google/gemini-cli-core';
import { NodeType, type ConcreteNode } from '@google/gemini-cli-core';
import { LLMJudge } from './llm-judge.js';
function snapshotEvalTest(policy: EvalPolicy, evalCase: ComponentEvalCase) {
|
|
return componentEvalTest(policy, evalCase);
|
|
}
|
|
|
|
// Behavioral eval: verifies the SnapshotGenerator's CURRENT system prompt
// preserves concrete, checkable facts from a transcript instead of
// abstracting them away. Verdict is produced by an LLM judge with a
// 3-run self-consistency majority vote.
describe('snapshot_fidelity', () => {
  snapshotEvalTest('ALWAYS_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'SnapshotGenerator strictly retains specific empirical facts',
    assert: async (config) => {
      // 1. Construct a highly specific mock transcript containing 3 empirical facts we can test for:
      // Fact A: File path -> src/compiler/server.ts
      // Fact B: Error code -> COMPILE_ERR_404
      // Fact C: Active Directive -> "do not fix it just yet"
      const mockNodes: ConcreteNode[] = [
        // Turn 1: user states the problem (no target facts yet).
        {
          id: '1',
          turnId: '1',
          type: NodeType.USER_PROMPT,
          timestamp: Date.now(),
          role: 'user',
          payload: {
            text: 'I am trying to debug a weird timeout issue when compiling the TS server.',
          },
        },
        // Turn 2 (model half): tool call issuing a grep.
        {
          id: '2',
          turnId: '2',
          type: NodeType.TOOL_EXECUTION,
          timestamp: Date.now() + 100,
          role: 'model',
          payload: {
            functionCall: {
              name: 'run_shell_command',
              args: { cmd: 'grep -rn "timeout" src/' },
            },
          },
        },
        // Turn 2 (response half): tool output carrying Facts A and B verbatim.
        {
          id: '3',
          turnId: '2',
          type: NodeType.TOOL_EXECUTION,
          timestamp: Date.now() + 200,
          role: 'user',
          payload: {
            functionResponse: {
              name: 'run_shell_command',
              response: {
                output:
                  'src/compiler/server.ts:442: setTimeout(() => reject(new Error("COMPILE_ERR_404")), 5000);',
              },
            },
          },
        },
        // Turn 3: model restates the finding (Fact B repeated in prose).
        {
          id: '4',
          turnId: '3',
          type: NodeType.AGENT_YIELD,
          timestamp: Date.now() + 300,
          role: 'model',
          payload: {
            text: 'I found the exact line. It looks like the compiler throws COMPILE_ERR_404 if it hits 5 seconds.',
          },
        },
        // Turn 4: user issues the active directive (Fact C) and pivots the task.
        {
          id: '5',
          turnId: '4',
          type: NodeType.USER_PROMPT,
          timestamp: Date.now() + 400,
          role: 'user',
          payload: {
            text: 'Okay, do not fix it just yet. I want you to remember this error code (COMPILE_ERR_404) and file path. First, list all the files in the directory.',
          },
        },
      ];

      // 2. Extract the LLM Client from the component container
      const llmClient = config.getBaseLlmClient();

      const generator = new SnapshotGenerator({
        llmClient,
        promptId: 'eval-snapshot-test',
        tokenCalculator: {
          // NOTE(review): this estimates 4 tokens PER character; common
          // heuristics are ~4 characters per token (i.e. str.length / 4).
          // If the deliberate overestimate is meant to force aggressive
          // summarization, say so here — otherwise confirm the ratio.
          estimateTokensForString(str: string): number {
            return str.length * 4;
          },
        },
        // TODO(review): `as any` hides the rest of the SnapshotGenerator
        // config contract; a typed partial would surface newly-required
        // fields at compile time.
      } as any);

      // 3. Generate the snapshot using the CURRENT system prompt
      const snapshotText = await generator.synthesizeSnapshot(mockNodes);

      // 4. Use LLM-as-a-Judge with Self-Consistency to evaluate factual fidelity
      const judge = new LLMJudge(llmClient);

      // The snapshot is embedded as evidence; the judge must answer a strict
      // three-part YES/NO question — any abstraction of a fact counts as NO.
      const judgePrompt = `
EVIDENCE (CONTEXT SNAPSHOT):
"""
${snapshotText}
"""

QUESTION:
Does the EVIDENCE explicitly contain all three of the following facts:
1. The specific file path "src/compiler/server.ts"
2. The specific error code "COMPILE_ERR_404"
3. The user's active constraint/directive to "do not fix it just yet" (or equivalent warning that implementation is paused)

Answer ONLY with "YES" if all three are unambiguously present.
Answer "NO" if any of the three are missing, abstracted away, or generalized (e.g., if it says "found an error" instead of "COMPILE_ERR_404").`;

      // Use a self-consistency of 3 runs to get a robust majority vote
      const result = await judge.judgeYesNo(judgePrompt, {
        selfConsistencyRuns: 3,
      });

      // 5. Assert the verdict, embedding votes, reasoning, and the full
      // snapshot in the failure message so a red run is debuggable from CI
      // logs alone.
      const formattedVotes = JSON.stringify(result.votes);
      const formattedReasoning = JSON.stringify(result.reasoning);

      expect(
        result.verdict,
        `Snapshot failed to retain empirical facts.
Votes: ${formattedVotes}
Reasoning: ${formattedReasoning}

Generated Snapshot:
${snapshotText}`,
      ).toBe(true);
    },
  });
});