mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-06-13 21:07:00 -07:00
115 lines
3.3 KiB
TypeScript
115 lines
3.3 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2026 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
import { LlmRole, type BaseLlmClient } from '@google/gemini-cli-core';
|
|
|
|
export interface JudgeOptions {
|
|
/**
|
|
* The number of parallel generations to run for majority voting.
|
|
* Defaults to 1. Use 3 or 5 for self-consistency.
|
|
*/
|
|
selfConsistencyRuns?: number;
|
|
/**
|
|
* The model to use for judging. Defaults to gemini-3-flash-base.
|
|
*/
|
|
model?: string;
|
|
}
|
|
|
|
export interface JudgeResult {
|
|
verdict: boolean;
|
|
reasoning: string[];
|
|
votes: { yes: number; no: number; other: number };
|
|
}
|
|
|
|
/**
|
|
* A reusable LLM-as-a-judge utility for behavioral evaluations.
|
|
*/
|
|
export class LLMJudge {
|
|
constructor(private readonly llmClient: BaseLlmClient) {}
|
|
|
|
/**
|
|
* Asks the LLM a Yes/No question and returns a boolean verdict.
|
|
* If selfConsistencyRuns > 1, it runs in parallel and returns the majority vote.
|
|
*/
|
|
async judgeYesNo(
|
|
question: string,
|
|
options: JudgeOptions = {},
|
|
): Promise<JudgeResult> {
|
|
const runs = options.selfConsistencyRuns ?? 1;
|
|
const model = options.model ?? 'gemini-3-flash-base';
|
|
|
|
const systemPrompt = `You are a strict, impartial expert judge. Read the provided evidence and question carefully. You MUST answer the question with ONLY "YES" or "NO". Do not provide any conversational filler or explanation before your answer.`;
|
|
|
|
const generateCall = async (): Promise<string> => {
|
|
try {
|
|
const response = await this.llmClient.generateContent({
|
|
modelConfigKey: { model },
|
|
contents: [{ role: 'user', parts: [{ text: question }] }],
|
|
systemInstruction: {
|
|
role: 'system',
|
|
parts: [{ text: systemPrompt }],
|
|
},
|
|
promptId: 'llm-judge-eval',
|
|
role: LlmRole.UTILITY_TOOL,
|
|
abortSignal: new AbortController().signal,
|
|
});
|
|
|
|
const text =
|
|
response.candidates?.[0]?.content?.parts?.[0]?.text
|
|
?.trim()
|
|
?.toUpperCase() || 'ERROR';
|
|
return text;
|
|
} catch (e: any) {
|
|
return `ERROR: ${e.message}`;
|
|
}
|
|
};
|
|
|
|
const promises = Array.from({ length: runs }).map(() => generateCall());
|
|
const rawResults = await Promise.all(promises);
|
|
|
|
let yes = 0;
|
|
let no = 0;
|
|
let other = 0;
|
|
|
|
for (const res of rawResults) {
|
|
// Remove any punctuation the model might have appended
|
|
const cleanRes = res.replace(/[^A-Z ]/g, '');
|
|
if (
|
|
cleanRes.includes('THE ANSWER IS YES') ||
|
|
cleanRes.includes('ANSWER IS YES') ||
|
|
cleanRes.endsWith('YES')
|
|
) {
|
|
yes++;
|
|
} else if (
|
|
cleanRes.includes('THE ANSWER IS NO') ||
|
|
cleanRes.includes('ANSWER IS NO') ||
|
|
cleanRes.endsWith('NO')
|
|
) {
|
|
no++;
|
|
} else if (cleanRes.trim() === 'YES') {
|
|
yes++;
|
|
} else if (cleanRes.trim() === 'NO') {
|
|
no++;
|
|
} else {
|
|
// Fallback: look for YES or NO as standalone words or at the end
|
|
const words = cleanRes.split(/\s+/);
|
|
if (words.includes('YES')) yes++;
|
|
else if (words.includes('NO')) no++;
|
|
else other++;
|
|
}
|
|
}
|
|
|
|
// Pass if YES > NO and YES > OTHER (plurality)
|
|
const pass = yes > no && yes > other;
|
|
|
|
return {
|
|
verdict: pass,
|
|
reasoning: rawResults,
|
|
votes: { yes, no, other },
|
|
};
|
|
}
|
|
}
|