From ff4e816a706a161f961f7e52c044111edd1463dc Mon Sep 17 00:00:00 2001 From: cocosheng-g Date: Tue, 3 Feb 2026 21:02:55 -0500 Subject: [PATCH] refactor(evals): isolate workflow evals and target specific models --- evals/test-helper.ts | 33 +- evals/workflows/batch_triage.eval.ts | 271 ++++++++++++++++ evals/workflows/constants.ts | 10 + evals/workflows/dedup.eval.ts | 428 +++++++++++++++++++++++++ evals/workflows/dedup_refresh.eval.ts | 83 +++++ evals/workflows/triage.eval.ts | 433 ++++++++++++++++++++++++++ 6 files changed, 1247 insertions(+), 11 deletions(-) create mode 100644 evals/workflows/batch_triage.eval.ts create mode 100644 evals/workflows/constants.ts create mode 100644 evals/workflows/dedup.eval.ts create mode 100644 evals/workflows/dedup_refresh.eval.ts create mode 100644 evals/workflows/triage.eval.ts diff --git a/evals/test-helper.ts b/evals/test-helper.ts index 1167be53eb..bee3fafa0d 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -35,6 +35,18 @@ export * from '@google/gemini-cli-test-utils'; // This may take a really long time and is not recommended. 
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES'; +export interface EvalCase { + name: string; + params?: Record<string, any>; + prompt: string | string[]; + timeout?: number; + env?: Record<string, string>; + files?: Record<string, string>; + approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan'; + targetModels?: string[]; + assert: (rig: TestRig, result: string) => Promise<void>; +} + export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { const fn = async () => { const rig = new TestRig() as any; @@ -157,6 +169,16 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { } }; + const currentModel = process.env.GEMINI_MODEL; + if ( + evalCase.targetModels && + currentModel && + !evalCase.targetModels.includes(currentModel) + ) { + it.skip(`${evalCase.name} (skipped for model ${currentModel})`, fn); + return; + } + if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) { it.skip(evalCase.name, fn); } else { @@ -170,14 +192,3 @@ async function prepareLogDir(name: string) { const sanitizedName = name.replace(/[^a-z0-9]/gi, '_').toLowerCase(); return { logDir, sanitizedName }; } - -export interface EvalCase { - name: string; - params?: Record<string, any>; - prompt: string | string[]; - timeout?: number; - env?: Record<string, string>; - files?: Record<string, string>; - approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan'; - assert: (rig: TestRig, result: string) => Promise<void>; -} diff --git a/evals/workflows/batch_triage.eval.ts b/evals/workflows/batch_triage.eval.ts new file mode 100644 index 0000000000..14e38c4e41 --- /dev/null +++ b/evals/workflows/batch_triage.eval.ts @@ -0,0 +1,271 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { evalTest } from '../test-helper.js'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import yaml from 'js-yaml'; +import { WORKFLOW_TARGET_MODELS } from './constants.js'; + +// Read the workflow file to extract the prompt +const workflowPath = path.join( + 
process.cwd(), + '.github/workflows/gemini-scheduled-issue-triage.yml', +); +const workflowContent = await fs.readFile(workflowPath, 'utf8'); + +// Use a YAML parser for robustness +const workflowData = yaml.load(workflowContent) as { + jobs?: { + 'triage-issues'?: { + steps?: { + id?: string; + with?: { prompt?: string; script?: string }; + env?: { AVAILABLE_LABELS?: string }; + }[]; + }; + }; +}; + +const geminiStep = workflowData.jobs?.['triage-issues']?.steps?.find( + (step) => step.id === 'gemini_issue_analysis', +); + +const labelsStep = workflowData.jobs?.['triage-issues']?.steps?.find( + (step) => step.id === 'get_labels', +); + +const BATCH_TRIAGE_PROMPT_TEMPLATE = geminiStep?.with?.prompt; +const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}'); +const LABELS_SCRIPT = labelsStep?.with?.script; + +if (!BATCH_TRIAGE_PROMPT_TEMPLATE) { + throw new Error( + 'Could not extract prompt from workflow file. Check for `jobs.triage-issues.steps[id=gemini_issue_analysis].with.prompt` in the YAML file.', + ); +} + +// Extract available labels from the script +let availableLabels = ''; +if (LABELS_SCRIPT) { + const match = LABELS_SCRIPT.match( + /const labelNames = labels.map\(label => label.name\);/, + ); + // Wait, the script in scheduled triage is different! + // const labelNames = labels.map(label => label.name); + // It gets ALL labels. + // But the prompt expects "${AVAILABLE_LABELS}". + // In the test, we can just mock a reasonable set of labels. 
+ availableLabels = + 'area/agent, area/core, area/enterprise, area/extensions, area/non-interactive, area/platform, area/security, area/unknown, kind/bug, kind/feature, kind/question, priority/p0, priority/p1, priority/p2, priority/p3'; +} + +const createPrompt = () => { + return BATCH_TRIAGE_PROMPT_TEMPLATE.replace( + '${AVAILABLE_LABELS}', + availableLabels, + ); +}; + +const BATCH_TRIAGE_SETTINGS = { + ...ORIGINAL_SETTINGS, +}; +if (BATCH_TRIAGE_SETTINGS.telemetry) { + delete BATCH_TRIAGE_SETTINGS.telemetry; +} + +const escapeHtml = (str: string) => { + return str.replace(/[<>&'"]/g, (c) => { + switch (c) { + case '<': + return '&lt;'; + case '>': + return '&gt;'; + case '&': + return '&amp;'; + case "'": + return '&#39;'; + case '"': + return '&quot;'; + } + return ''; + }); +}; + +const assertHasIssueLabel = (issueNumber: number, expectedLabel: string) => { + return async (rig: any, result: string) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + // The model response JSON is in the 'response' field + const responseText = output.response; + let jsonString: string; + const match = responseText.match(/```json\s*([\s\S]*?)\s*```/); + if (match?.[1]) { + jsonString = match[1]; + } else { + const firstBracket = responseText.indexOf('['); + const lastBracket = responseText.lastIndexOf(']'); + if ( + firstBracket === -1 || + lastBracket === -1 || + lastBracket < firstBracket + ) { + throw new Error( + `Could not find a JSON array in the response: "${escapeHtml(responseText)}"`, + ); + } + jsonString = responseText.substring(firstBracket, lastBracket + 1); + } + + let data: { issue_number: number; labels_to_add: string[] }[]; + try { + data = JSON.parse(jsonString); + } catch (e) { + const err = e as Error; + throw new Error( + `Failed to parse JSON. Error: ${err.message}. 
Response: "${escapeHtml(responseText)}"`, + ); + } + + const issue = data.find((i) => i.issue_number === issueNumber); + if (!issue) { + throw new Error( + `Issue #${issueNumber} not found in output: ${JSON.stringify(data)}`, + ); + } + + expect(issue.labels_to_add).toContain(expectedLabel); + }; +}; + +describe('batch_triage_agent', () => { + evalTest('USUALLY_PASSES', { + name: 'should identify area/core for local test failures in batch', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 101, + title: 'Local tests failing', + body: 'I am running npm test locally and it fails with an error.', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + targetModels: WORKFLOW_TARGET_MODELS, + assert: assertHasIssueLabel(101, 'area/core'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should identify area/platform for CI failures in batch', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 102, + title: 'CI pipeline failed', + body: 'The GitHub Action for tests failed on the main branch.', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + targetModels: WORKFLOW_TARGET_MODELS, + assert: assertHasIssueLabel(102, 'area/platform'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should handle mixed batch correctly', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 103, + title: 'Cannot install on MacOS', + body: 'Install fails with permission error.', + }, + { + number: 104, + title: 'Click to win', + body: 'Spam body', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + targetModels: WORKFLOW_TARGET_MODELS, + assert: async (rig: any, result) => { + // Assert issue 103 has area/core + await 
assertHasIssueLabel(103, 'area/core')(rig, result); + // Assert issue 104 has area/unknown + await assertHasIssueLabel(104, 'area/unknown')(rig, result); + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should handle issues needing retesting (old version)', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 105, + title: 'Crash on version 0.1.0', + body: 'I am using /about and it says 0.1.0. The app crashes when I run it.', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + targetModels: WORKFLOW_TARGET_MODELS, + assert: assertHasIssueLabel(105, 'status/need-retesting'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should handle issues needing more information', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 106, + title: 'It does not work', + body: 'Something is broken.', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + targetModels: WORKFLOW_TARGET_MODELS, + assert: assertHasIssueLabel(106, 'status/need-information'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should handle large batch of diverse issues', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { number: 107, title: 'Bug A', body: 'Local test failure' }, + { number: 108, title: 'Bug B', body: 'CI failure' }, + { number: 109, title: 'Bug C', body: 'Security leak' }, + { number: 110, title: 'Bug D', body: 'Spam' }, + { number: 111, title: 'Bug E', body: 'Old version 0.0.1' }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + targetModels: WORKFLOW_TARGET_MODELS, + assert: async (rig: any, result) => { + await assertHasIssueLabel(107, 'area/core')(rig, result); + await assertHasIssueLabel(108, 'area/platform')(rig, result); + 
await assertHasIssueLabel(109, 'area/security')(rig, result); + await assertHasIssueLabel(110, 'area/unknown')(rig, result); + await assertHasIssueLabel(111, 'status/need-retesting')(rig, result); + }, + }); +}); diff --git a/evals/workflows/constants.ts b/evals/workflows/constants.ts new file mode 100644 index 0000000000..23af437090 --- /dev/null +++ b/evals/workflows/constants.ts @@ -0,0 +1,10 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +// The models that workflow evals should target. +// These workflows (triage, dedup) run in GitHub Actions using the default CLI model. +// We restrict evals to this model to avoid noise from other models in the nightly matrix. +export const WORKFLOW_TARGET_MODELS = ['gemini-2.5-pro']; diff --git a/evals/workflows/dedup.eval.ts b/evals/workflows/dedup.eval.ts new file mode 100644 index 0000000000..3d580548a8 --- /dev/null +++ b/evals/workflows/dedup.eval.ts @@ -0,0 +1,428 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { evalTest } from '../test-helper.js'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import yaml from 'js-yaml'; +import { WORKFLOW_TARGET_MODELS } from './constants.js'; + +// Read the workflow file to extract the prompt and settings +const workflowPath = path.join( + process.cwd(), + '.github/workflows/gemini-automated-issue-dedup.yml', +); +const workflowContent = await fs.readFile(workflowPath, 'utf8'); + +const workflowData = yaml.load(workflowContent) as any; +const geminiStep = workflowData.jobs?.['find-duplicates']?.steps?.find( + (step: any) => step.id === 'gemini_issue_deduplication', +); + +const DEDUP_PROMPT_TEMPLATE = geminiStep?.with?.prompt; +const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}'); + +if (!DEDUP_PROMPT_TEMPLATE) { + throw new Error('Could not extract prompt from de-duplication workflow.'); +} 
+ +const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts'); + +const createPrompt = (issueNumber: number) => { + // The prompt uses ${{ github.event.issue.number }} but also references ${ISSUE_NUMBER} (env) + return DEDUP_PROMPT_TEMPLATE.replace( + /\${{ github\.repository }}/g, + 'google-gemini/gemini-cli', + ).replace(/\${{ github\.event\.issue\.number }}/g, issueNumber.toString()); +}; + +const DEDUP_SETTINGS = { + ...ORIGINAL_SETTINGS, + mcpServers: { + issue_deduplication: { + command: 'npx', + args: ['tsx', mockMcpPath], + }, + }, +}; +if (DEDUP_SETTINGS.telemetry) { + delete DEDUP_SETTINGS.telemetry; +} + +describe('dedup_agent', () => { + evalTest('USUALLY_PASSES', { + name: 'should identify duplicate issues', + prompt: ['--output-format', 'json', '--prompt', createPrompt(101)], + env: { + ISSUE_NUMBER: '101', + GITHUB_ENV: 'github_env', + }, + params: { + settings: DEDUP_SETTINGS, + }, + targetModels: WORKFLOW_TARGET_MODELS, + files: { + github_env: '', + // Mock gh binary + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '101') { + console.log(JSON.stringify({ + number: 101, + title: 'CLI crashes on start', + body: 'It segfaults immediately.', + comments: [] + })); + } else if (issueNum === '201') { + console.log(JSON.stringify({ + number: 201, + title: 'Segfault on launch', + body: 'The app crashes right away.', + comments: [] + })); + } else if (issueNum === '202') { + console.log(JSON.stringify({ + number: 202, + title: 'Unrelated bug', + body: 'Themes are not working.', + comments: [] + })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + expect(output.stats.tools.byName['duplicates']).toBeDefined(); + expect(output.stats.tools.byName['run_shell_command']).toBeDefined(); + + // Verify 
detailed tool usage via telemetry + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + // The current prompt uses echo to set GITHUB_ENV + // We check the tool call for the echo command + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + expect(envCall).toBeDefined(); + // Check the command content + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, + ); + expect(match).not.toBeNull(); + const issues = match![1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s); + expect(issues).toContain('201'); + expect(issues).not.toContain('202'); + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should respect "not a duplicate" comments', + prompt: ['--output-format', 'json', '--prompt', createPrompt(101)], + env: { + ISSUE_NUMBER: '101', + GITHUB_ENV: 'github_env', + }, + params: { + settings: DEDUP_SETTINGS, + }, + targetModels: WORKFLOW_TARGET_MODELS, + files: { + github_env: '', + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '101') { + console.log(JSON.stringify({ + number: 101, + title: 'CLI crashes on start', + body: 'It segfaults immediately.', + comments: [{ body: 'Note: This is NOT a duplicate of #201, different root cause.' 
}] + })); + } else if (issueNum === '201') { + console.log(JSON.stringify({ + number: 201, + title: 'Segfault on launch', + body: 'The app crashes right away.', + comments: [] + })); + } else { + console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + // It might not call echo if no duplicates are found, or it might echo an empty list. + // We'll check if it does call echo, that 201 is NOT in it. + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + if (envCall) { + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, + ); + const issues = match + ? 
match[1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s) + : []; + expect(issues).not.toContain('201'); + } + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should differentiate false positives with high similarity', + prompt: ['--output-format', 'json', '--prompt', createPrompt(301)], + env: { + ISSUE_NUMBER: '301', + GITHUB_ENV: 'github_env', + }, + params: { + settings: DEDUP_SETTINGS, + }, + targetModels: WORKFLOW_TARGET_MODELS, + files: { + github_env: '', + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '301') { + console.log(JSON.stringify({ + number: 301, + title: 'App crashes when I click Save', + body: 'I click the save button and it crashes.', + comments: [] + })); + } else if (issueNum === '302') { + console.log(JSON.stringify({ + number: 302, + title: 'App crashes when I click Load', + body: 'I click the load button and it crashes. This seems related to the loader component.', + comments: [] + })); + } else { + console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + if (envCall) { + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, + ); + const issues = match + ? 
match[1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s) + : []; + // Should NOT contain 302 because it's a different feature (Save vs Load) despite crash + expect(issues).not.toContain('302'); + } + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should reject matches with low similarity', + prompt: ['--output-format', 'json', '--prompt', createPrompt(401)], + env: { + ISSUE_NUMBER: '401', + GITHUB_ENV: 'github_env', + }, + params: { + settings: DEDUP_SETTINGS, + }, + targetModels: WORKFLOW_TARGET_MODELS, + files: { + github_env: '', + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '401') { + console.log(JSON.stringify({ + number: 401, + title: 'Feature request: Dark mode', + body: 'Please add dark mode.', + comments: [] + })); + } else if (issueNum === '402') { + console.log(JSON.stringify({ + number: 402, + title: 'Feature request: Light mode', + body: 'Please add light mode.', + comments: [] + })); + } else { + console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + if (envCall) { + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, + ); + const issues = match + ? 
match[1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s) + : []; + expect(issues).not.toContain('402'); + expect(issues.length).toBe(0); + } + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should identify multiple duplicates', + prompt: ['--output-format', 'json', '--prompt', createPrompt(501)], + env: { + ISSUE_NUMBER: '501', + GITHUB_ENV: 'github_env', + }, + params: { + settings: DEDUP_SETTINGS, + }, + targetModels: WORKFLOW_TARGET_MODELS, + files: { + github_env: '', + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '501') { + console.log(JSON.stringify({ + number: 501, + title: 'Crash on login', + body: 'The app crashes when I try to log in.', + comments: [] + })); + } else if (issueNum === '502') { + console.log(JSON.stringify({ + number: 502, + title: 'Crash on sign in', + body: 'Crashes during sign in process.', + comments: [] + })); + } else if (issueNum === '503') { + console.log(JSON.stringify({ + number: 503, + title: 'Crashes on login page', + body: 'I get a crash immediately on the login page.', + comments: [] + })); + } else { + console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + expect(envCall).toBeDefined(); + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, 
]*)\]?/, + ); + const issues = match + ? match[1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s) + : []; + expect(issues).toContain('502'); + expect(issues).toContain('503'); + }, + }); +}); diff --git a/evals/workflows/dedup_refresh.eval.ts b/evals/workflows/dedup_refresh.eval.ts new file mode 100644 index 0000000000..ad28013f8f --- /dev/null +++ b/evals/workflows/dedup_refresh.eval.ts @@ -0,0 +1,83 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { evalTest } from '../test-helper.js'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import yaml from 'js-yaml'; +import { WORKFLOW_TARGET_MODELS } from './constants.js'; + +// Read the workflow file to extract the prompt and settings +const workflowPath = path.join( + process.cwd(), + '.github/workflows/gemini-scheduled-issue-dedup.yml', +); +const workflowContent = await fs.readFile(workflowPath, 'utf8'); + +const workflowData = yaml.load(workflowContent) as any; +const geminiStep = workflowData.jobs?.['refresh-embeddings']?.steps?.find( + (step: any) => step.id === 'gemini_refresh_embeddings', +); + +const REFRESH_PROMPT_TEMPLATE = geminiStep?.with?.prompt; +const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}'); + +if (!REFRESH_PROMPT_TEMPLATE) { + throw new Error('Could not extract prompt from dedup refresh workflow.'); +} + +const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts'); + +const createPrompt = () => { + return REFRESH_PROMPT_TEMPLATE.replace( + /\${{ github\.repository }}/g, + 'google-gemini/gemini-cli', + ); +}; + +const REFRESH_SETTINGS = { + ...ORIGINAL_SETTINGS, + mcpServers: { + issue_deduplication: { + command: 'npx', + args: ['tsx', mockMcpPath], + }, + }, +}; +if (REFRESH_SETTINGS.telemetry) { + delete REFRESH_SETTINGS.telemetry; +} + +describe('dedup_refresh_agent', () => { + evalTest('USUALLY_PASSES', { + name: 
'should call refresh tool', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + approvalMode: 'yolo', + params: { + settings: REFRESH_SETTINGS, + }, + targetModels: WORKFLOW_TARGET_MODELS, + assert: async (rig: any, result) => { + // result is the JSON output + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolStats = output.stats.tools.byName; + expect(toolStats.refresh).toBeDefined(); + expect(toolStats.refresh.count).toBe(1); + expect(toolStats.refresh.success).toBe(1); + + // We still check telemetry for deep arg inspection if needed, + // but stats verify the high-level goal. + const toolLogs = rig.readToolLogs(); + const refreshCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'refresh', + ); + expect(refreshCall).toBeDefined(); + }, + }); +}); diff --git a/evals/workflows/triage.eval.ts b/evals/workflows/triage.eval.ts new file mode 100644 index 0000000000..4a363b78a2 --- /dev/null +++ b/evals/workflows/triage.eval.ts @@ -0,0 +1,433 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { evalTest } from '../test-helper.js'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import yaml from 'js-yaml'; +import { WORKFLOW_TARGET_MODELS } from './constants.js'; + +// Read the workflow file to extract the prompt +const workflowPath = path.join( + process.cwd(), + '.github/workflows/gemini-automated-issue-triage.yml', +); +const workflowContent = await fs.readFile(workflowPath, 'utf8'); + +// Use a YAML parser for robustness +const workflowData = yaml.load(workflowContent) as { + jobs?: { + 'triage-issue'?: { + steps?: { + id?: string; + with?: { prompt?: string; script?: string }; + }[]; + }; + }; +}; + +const triageStep = workflowData.jobs?.['triage-issue']?.steps?.find( + (step) => step.id === 'gemini_issue_analysis', +); + +const labelsStep = 
workflowData.jobs?.['triage-issue']?.steps?.find( + (step) => step.id === 'get_labels', +); + +const TRIAGE_PROMPT_TEMPLATE = triageStep?.with?.prompt; +const LABELS_SCRIPT = labelsStep?.with?.script; + +if (!TRIAGE_PROMPT_TEMPLATE) { + throw new Error( + 'Could not extract prompt from workflow file. Check for `jobs.triage-issue.steps[id=gemini_issue_analysis].with.prompt` in the YAML file.', + ); +} + +// Extract available labels from the script +let availableLabels = ''; +if (LABELS_SCRIPT) { + const match = LABELS_SCRIPT.match(/const allowedLabels = \[([\s\S]+?)\];/); + if (match && match[1]) { + // Clean up the extracted string: remove quotes, commas, and whitespace + availableLabels = match[1] + .replace(/['"\n\r]/g, '') + .split(',') + .map((s) => s.trim()) + .filter((s) => s.length > 0) + .join(', '); + } +} + +if (!availableLabels) { + throw new Error( + 'Could not extract available labels from workflow file. Check for `jobs.triage-issue.steps[id=get_labels].with.script` containing `const allowedLabels = [...]`.', + ); +} + +const createPrompt = (title: string, body: string) => { + // The placeholders in the YAML are ${{ env.ISSUE_TITLE }} etc. + // We need to replace them with the actual values for the test. 
+ return TRIAGE_PROMPT_TEMPLATE.replace('${{ env.ISSUE_TITLE }}', title) + .replace('${{ env.ISSUE_BODY }}', body) + .replace('${{ env.AVAILABLE_LABELS }}', availableLabels); +}; + +const TRIAGE_SETTINGS = {}; + +const escapeHtml = (str: string) => { + return str.replace(/[<>&'"]/g, (c) => { + switch (c) { + case '<': + return '&lt;'; + case '>': + return '&gt;'; + case '&': + return '&amp;'; + case "'": + return '&#39;'; + case '"': + return '&quot;'; + } + return ''; // Should not happen + }); +}; + +const assertHasLabel = (expectedLabel: string) => { + return async (rig: any, result: string) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + // The model response JSON is in the 'response' field + const responseText = output.response; + let jsonString: string; + const match = responseText.match(/```json\s*([\s\S]*?)\s*```/); + if (match?.[1]) { + jsonString = match[1]; + } else { + const firstBrace = responseText.indexOf('{'); + const lastBrace = responseText.lastIndexOf('}'); + if (firstBrace === -1 || lastBrace === -1 || lastBrace < firstBrace) { + throw new Error( + `Could not find a JSON object in the response: "${escapeHtml(responseText)}"`, + ); + } + jsonString = responseText.substring(firstBrace, lastBrace + 1); + } + + let data: { labels_to_set?: string[] }; + try { + data = JSON.parse(jsonString); + } catch (e) { + const err = e as Error; + throw new Error( + `Failed to parse JSON. Error: ${err.message}. 
Response: "${escapeHtml(responseText)}"`,
      );
    }

    expect(data).toHaveProperty('labels_to_set');
    expect(Array.isArray(data.labels_to_set)).toBe(true);
    expect(data.labels_to_set).toContain(expectedLabel);
  };
};

/**
 * Label-classification evals for the triage prompt.
 *
 * Every case is registered through `evalTest` with the 'USUALLY_PASSES'
 * policy and an identical configuration; only the case name, the two strings
 * handed to `createPrompt`, and the expected label differ. The cases are
 * therefore kept in a single table and registered in a loop, in table order.
 *
 * Each case passes when the `labels_to_set` array checked by `assertHasLabel`
 * contains `expectedLabel`.
 */
describe('triage_agent', () => {
  // NOTE(review): `issueTitle` / `issueBody` are named after what the strings
  // look like; confirm against the parameter names of `createPrompt`.
  const labelCases: ReadonlyArray<{
    name: string;
    issueTitle: string;
    issueBody: string;
    expectedLabel: string;
  }> = [
    {
      name: 'should identify area/core for windows installation issues',
      issueTitle: 'CLI failed to install on Windows',
      issueBody:
        'I tried running npm install but it failed with an error on Windows 11.',
      expectedLabel: 'area/core',
    },
    {
      name: 'should identify area/platform for CI/CD failures',
      issueTitle: 'Tests are failing in the CI/CD pipeline',
      issueBody: 'The github action is failing with a 500 error.',
      expectedLabel: 'area/platform',
    },
    {
      name: 'should identify area/platform for quota issues',
      issueTitle: 'Resource Exhausted 429',
      issueBody: 'I am getting a 429 error when running the CLI.',
      expectedLabel: 'area/platform',
    },
    {
      name: 'should identify area/core for local build failures',
      issueTitle: 'Local build failing',
      issueBody: 'I cannot build the project locally. npm run build fails.',
      expectedLabel: 'area/core',
    },
    {
      name: 'should identify area/platform for sandbox issues',
      issueTitle: 'Sandbox connection failed',
      issueBody: 'I cannot connect to the docker sandbox environment.',
      expectedLabel: 'area/platform',
    },
    {
      name: 'should identify area/core for local test failures',
      issueTitle: 'Local tests failing',
      issueBody: 'I am running npm test locally and it fails.',
      expectedLabel: 'area/core',
    },
    {
      name: 'should identify area/agent for questions about tools',
      issueTitle: 'Bug with web search?',
      issueBody:
        'I am trying to use web search but I do not know the syntax. Is it @web or /web?',
      expectedLabel: 'area/agent',
    },
    {
      name: 'should identify area/extensions for feature requests',
      issueTitle: 'Please add a python extension',
      issueBody: 'I want to write python scripts as an extension.',
      expectedLabel: 'area/extensions',
    },
    {
      name: 'should identify area/unknown for off-topic spam',
      issueTitle: 'Buy cheap rolex',
      issueBody: 'Click here for discount.',
      expectedLabel: 'area/unknown',
    },
    {
      name: 'should identify area/core for crash reports phrased as questions',
      issueTitle: 'Why does it segfault?',
      issueBody:
        'Why does the CLI segfault immediately when I run it on Ubuntu?',
      expectedLabel: 'area/core',
    },
    {
      name: 'should identify area/agent for feature requests for built-in tools',
      issueTitle: 'Can we have a diff tool?',
      issueBody:
        'Is it possible to add a built-in tool to show diffs before editing?',
      expectedLabel: 'area/agent',
    },
    {
      name: 'should identify area/enterprise for license questions',
      issueTitle: 'License key issue',
      issueBody:
        'Where do I enter my enterprise license key? I cannot find the setting.',
      expectedLabel: 'area/enterprise',
    },
    {
      name: 'should identify area/unknown for extremely vague reports',
      issueTitle: 'It does not work',
      issueBody: 'I tried to use it and it failed.',
      expectedLabel: 'area/unknown',
    },
    {
      name: 'should identify area/security for prompt injection reports',
      issueTitle: 'Prompt injection vulnerability',
      issueBody:
        'I found a way to make the agent ignore instructions by saying "Ignore all previous instructions".',
      expectedLabel: 'area/security',
    },
    {
      name: 'should identify area/non-interactive for headless crashes',
      issueTitle: 'Headless mode segfault',
      issueBody: 'When I run with --headless, the CLI crashes immediately.',
      expectedLabel: 'area/non-interactive',
    },
    {
      name: 'should identify area/agent for mixed feedback and tool bugs',
      issueTitle: 'Great tool but web search fails',
      issueBody:
        'I love using Gemini CLI, it is amazing! However, the @web tool gives me an error every time I search for "react".',
      expectedLabel: 'area/agent',
    },
    {
      name: 'should identify area/core for UI performance issues',
      issueTitle: 'UI is very slow',
      issueBody: 'The new interface is lagging and unresponsive when I scroll.',
      expectedLabel: 'area/core',
    },
    {
      name: 'should identify area/security for accidental secret leakage',
      issueTitle: 'Leaked API key in logs',
      issueBody:
        'I accidentally posted my API key in a previous issue comment. Can you delete it?',
      expectedLabel: 'area/security',
    },
    {
      name: 'should identify area/unknown for nonsensical input',
      issueTitle: 'asdfasdf',
      issueBody: 'qwerqwer zxcvbnm',
      expectedLabel: 'area/unknown',
    },
  ];

  for (const { name, issueTitle, issueBody, expectedLabel } of labelCases) {
    evalTest('USUALLY_PASSES', {
      name,
      // A fresh array/object per case, exactly as the hand-written calls did.
      prompt: [
        '--output-format',
        'json',
        '--prompt',
        createPrompt(issueTitle, issueBody),
      ],
      params: { settings: TRIAGE_SETTINGS },
      targetModels: WORKFLOW_TARGET_MODELS,
      assert: assertHasLabel(expectedLabel),
    });
  }
});