diff --git a/.github/workflows/gemini-automated-issue-triage.yml b/.github/workflows/gemini-automated-issue-triage.yml index 3b6a46f7a2..18a8b076ea 100644 --- a/.github/workflows/gemini-automated-issue-triage.yml +++ b/.github/workflows/gemini-automated-issue-triage.yml @@ -257,6 +257,13 @@ jobs: area/unknown - Description: Issues that do not clearly fit into any other defined area/ category, or where information is too limited to make a determination. Use this when no other area is appropriate. + ## Final Instructions + + - Output ONLY valid JSON format. + - Do NOT include any introductory or concluding remarks, explanations, or additional text. + - Do NOT include any thoughts or reasoning outside the JSON block. + - Ensure the output is a single JSON object with a "labels_to_set" array. + - name: 'Apply Labels to Issue' if: |- ${{ steps.gemini_issue_analysis.outputs.summary != '' }} diff --git a/.github/workflows/gemini-scheduled-issue-triage.yml b/.github/workflows/gemini-scheduled-issue-triage.yml index 25b0cdf4ec..9b19974afc 100644 --- a/.github/workflows/gemini-scheduled-issue-triage.yml +++ b/.github/workflows/gemini-scheduled-issue-triage.yml @@ -159,7 +159,7 @@ jobs: } ] ``` - If an issue cannot be classified, do not include it in the output array. + If an issue cannot be classified (e.g. spam), classify it as area/unknown. 9. For each issue please check if CLI version is present, this is usually in the output of the /about command and will look like 0.1.5 - Anything more than 6 versions older than the most recent should add the status/need-retesting label 10. If you see that the issue doesn't look like it has sufficient information recommend the status/need-information label and leave a comment politely requesting the relevant information, eg.. if repro steps are missing request for repro steps. if version information is missing request for version information into the explanation section below. 
@@ -207,7 +207,7 @@ jobs: area/enterprise: Telemetry, Policy, Quota / Licensing area/extensions: Gemini CLI extensions capability area/non-interactive: GitHub Actions, SDK, 3P Integrations, Shell Scripting, Command line automation - area/platform: Build infra, Release mgmt, Testing, Eval infra, Capacity, Quota mgmt + area/platform: Build infra, Release mgmt, Automated testing infrastructure (evals), Capacity, Quota mgmt. NOT for local test failures. area/security: security related issues Additional Context: @@ -215,6 +215,13 @@ jobs: - This product is designed to use different models eg.. using pro, downgrading to flash etc. - When users report that they dont expect the model to change those would be categorized as feature requests. + ## Final Instructions + + - Output ONLY valid JSON format. + - Do NOT include any introductory or concluding remarks, explanations, or additional text. + - Do NOT include any thoughts or reasoning outside the JSON block. + - Ensure the output is a single JSON array of objects. 
+ - name: 'Apply Labels to Issues' if: |- ${{ steps.gemini_issue_analysis.outcome == 'success' && diff --git a/evals/batch_triage.eval.ts b/evals/batch_triage.eval.ts new file mode 100644 index 0000000000..609dab9f57 --- /dev/null +++ b/evals/batch_triage.eval.ts @@ -0,0 +1,251 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { evalTest } from './test-helper.js'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import yaml from 'js-yaml'; + +// Read the workflow file to extract the prompt +const workflowPath = path.join( + process.cwd(), + '.github/workflows/gemini-scheduled-issue-triage.yml', +); +const workflowContent = await fs.readFile(workflowPath, 'utf8'); + +// Use a YAML parser for robustness +const workflowData = yaml.load(workflowContent) as { + jobs?: { + 'triage-issues'?: { + steps?: { + id?: string; + with?: { prompt?: string; script?: string; settings?: string }; + env?: { AVAILABLE_LABELS?: string }; + }[]; + }; + }; +}; + +const geminiStep = workflowData.jobs?.['triage-issues']?.steps?.find( + (step) => step.id === 'gemini_issue_analysis', +); + +const labelsStep = workflowData.jobs?.['triage-issues']?.steps?.find( + (step) => step.id === 'get_labels', +); + +const BATCH_TRIAGE_PROMPT_TEMPLATE = geminiStep?.with?.prompt; +const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}'); +const LABELS_SCRIPT = labelsStep?.with?.script; + +if (!BATCH_TRIAGE_PROMPT_TEMPLATE) { + throw new Error( + 'Could not extract prompt from workflow file. Check for `jobs.triage-issues.steps[id=gemini_issue_analysis].with.prompt` in the YAML file.', + ); +} + +// Label list substituted for ${AVAILABLE_LABELS} in the prompt (mocked below) +let availableLabels = ''; +if (LABELS_SCRIPT) { + // Note: the get_labels script in the scheduled triage workflow is different: 
+ // const labelNames = labels.map(label => label.name); + // It gets ALL labels. + // But the prompt expects "${AVAILABLE_LABELS}". + // In the test, mock a label set that covers every label the evals assert on. + availableLabels = + 'area/agent, area/core, area/enterprise, area/extensions, area/non-interactive, area/platform, area/security, area/unknown, kind/bug, kind/feature, kind/question, priority/p0, priority/p1, priority/p2, priority/p3, status/need-information, status/need-retesting'; +} + +const createPrompt = () => { + return BATCH_TRIAGE_PROMPT_TEMPLATE.replace( + '${AVAILABLE_LABELS}', + availableLabels, + ); +}; + +const BATCH_TRIAGE_SETTINGS = { + ...ORIGINAL_SETTINGS, +}; +if (BATCH_TRIAGE_SETTINGS.telemetry) { + delete BATCH_TRIAGE_SETTINGS.telemetry; +} + +const escapeHtml = (str: string) => { + return str.replace(/[<>&'"]/g, (c) => { + switch (c) { + case '<': + return '&lt;'; + case '>': + return '&gt;'; + case '&': + return '&amp;'; + case "'": + return '&#39;'; + case '"': + return '&quot;'; + } + return ''; + }); +}; + +const assertHasIssueLabel = (issueNumber: number, expectedLabel: string) => { + return async (_rig: unknown, result: string) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + // The model response JSON is in the 'response' field + const responseText = output.response; + const firstBrace = responseText.indexOf('['); + const lastBrace = responseText.lastIndexOf(']'); + if (firstBrace === -1 || lastBrace === -1 || lastBrace < firstBrace) { + throw new Error( + `Could not find a JSON array in the response: "${escapeHtml(responseText)}"`, + ); + } + const jsonString = responseText.substring(firstBrace, lastBrace + 1); + + let data: { issue_number: number; labels_to_add: string[] }[]; + try { + data = JSON.parse(jsonString); + } catch (e) { + const err = e as Error; + throw new Error( + `Failed to parse JSON. Error: ${err.message}. 
Response: "${escapeHtml(responseText)}"`, + ); + } + + const issue = data.find((i) => i.issue_number === issueNumber); + if (!issue) { + throw new Error( + `Issue #${issueNumber} not found in output: ${JSON.stringify(data)}`, + ); + } + + expect(issue.labels_to_add).toContain(expectedLabel); + }; +}; + +describe('batch_triage_agent', () => { + evalTest('USUALLY_PASSES', { + name: 'should identify area/core for local test failures in batch', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 101, + title: 'Local tests failing', + body: 'I am running npm test locally and it fails with an error.', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + assert: assertHasIssueLabel(101, 'area/core'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should identify area/platform for CI failures in batch', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 102, + title: 'CI pipeline failed', + body: 'The GitHub Action for tests failed on the main branch.', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + assert: assertHasIssueLabel(102, 'area/platform'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should handle mixed batch correctly', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 103, + title: 'Cannot install on MacOS', + body: 'Install fails with permission error.', + }, + { + number: 104, + title: 'Click to win', + body: 'Spam body', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + assert: async (rig: any, result) => { + // Assert issue 103 has area/core + await assertHasIssueLabel(103, 'area/core')(rig, result); + // Assert issue 104 has area/unknown + await assertHasIssueLabel(104, 
'area/unknown')(rig, result); + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should handle issues needing retesting (old version)', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 105, + title: 'Crash on version 0.1.0', + body: 'I am using /about and it says 0.1.0. The app crashes when I run it.', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + assert: assertHasIssueLabel(105, 'status/need-retesting'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should handle issues needing more information', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { + number: 106, + title: 'It does not work', + body: 'Something is broken.', + }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + assert: assertHasIssueLabel(106, 'status/need-information'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should handle large batch of diverse issues', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + env: { + AVAILABLE_LABELS: availableLabels, + ISSUES_TO_TRIAGE: JSON.stringify([ + { number: 107, title: 'Bug A', body: 'Local test failure' }, + { number: 108, title: 'Bug B', body: 'CI failure' }, + { number: 109, title: 'Bug C', body: 'Security leak' }, + { number: 110, title: 'Bug D', body: 'Spam' }, + { number: 111, title: 'Bug E', body: 'Old version 0.0.1' }, + ]), + }, + params: { settings: BATCH_TRIAGE_SETTINGS }, + assert: async (rig: any, result) => { + await assertHasIssueLabel(107, 'area/core')(rig, result); + await assertHasIssueLabel(108, 'area/platform')(rig, result); + await assertHasIssueLabel(109, 'area/security')(rig, result); + await assertHasIssueLabel(110, 'area/unknown')(rig, result); + await assertHasIssueLabel(111, 'status/need-retesting')(rig, result); + }, + }); +}); diff --git a/evals/dedup.eval.ts 
b/evals/dedup.eval.ts new file mode 100644 index 0000000000..3980eb689c --- /dev/null +++ b/evals/dedup.eval.ts @@ -0,0 +1,422 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { evalTest } from './test-helper.js'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import yaml from 'js-yaml'; + +// Read the workflow file to extract the prompt and settings +const workflowPath = path.join( + process.cwd(), + '.github/workflows/gemini-automated-issue-dedup.yml', +); +const workflowContent = await fs.readFile(workflowPath, 'utf8'); + +const workflowData = yaml.load(workflowContent) as any; +const geminiStep = workflowData.jobs?.['find-duplicates']?.steps?.find( + (step: any) => step.id === 'gemini_issue_deduplication', +); + +const DEDUP_PROMPT_TEMPLATE = geminiStep?.with?.prompt; +const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}'); + +if (!DEDUP_PROMPT_TEMPLATE) { + throw new Error('Could not extract prompt from de-duplication workflow.'); +} + +const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts'); + +const createPrompt = (issueNumber: number) => { + // The prompt uses ${{ github.event.issue.number }} but also references ${ISSUE_NUMBER} (env) + return DEDUP_PROMPT_TEMPLATE.replace( + /\${{ github\.repository }}/g, + 'google-gemini/gemini-cli', + ).replace(/\${{ github\.event\.issue\.number }}/g, issueNumber.toString()); +}; + +const DEDUP_SETTINGS = { + ...ORIGINAL_SETTINGS, + mcpServers: { + issue_deduplication: { + command: 'npx', + args: ['tsx', mockMcpPath], + }, + }, +}; +if (DEDUP_SETTINGS.telemetry) { + delete DEDUP_SETTINGS.telemetry; +} + +describe('dedup_agent', () => { + evalTest('USUALLY_PASSES', { + name: 'should identify duplicate issues', + prompt: ['--output-format', 'json', '--prompt', createPrompt(101)], + env: { + ISSUE_NUMBER: '101', + GITHUB_ENV: 'github_env', + }, + params: { + settings: 
DEDUP_SETTINGS, + }, + files: { + github_env: '', + // Mock gh binary + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '101') { + console.log(JSON.stringify({ + number: 101, + title: 'CLI crashes on start', + body: 'It segfaults immediately.', + comments: [] + })); + } else if (issueNum === '201') { + console.log(JSON.stringify({ + number: 201, + title: 'Segfault on launch', + body: 'The app crashes right away.', + comments: [] + })); + } else if (issueNum === '202') { + console.log(JSON.stringify({ + number: 202, + title: 'Unrelated bug', + body: 'Themes are not working.', + comments: [] + })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + expect(output.stats.tools.byName['duplicates']).toBeDefined(); + expect(output.stats.tools.byName['run_shell_command']).toBeDefined(); + + // Verify detailed tool usage via telemetry + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + // The current prompt uses echo to set GITHUB_ENV + // We check the tool call for the echo command + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + expect(envCall).toBeDefined(); + // Check the command content + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, + ); + expect(match).not.toBeNull(); + const issues = match![1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s); + expect(issues).toContain('201'); + expect(issues).not.toContain('202'); + }, + }); + + evalTest('USUALLY_PASSES', { + name: 
'should respect "not a duplicate" comments', + prompt: ['--output-format', 'json', '--prompt', createPrompt(101)], + env: { + ISSUE_NUMBER: '101', + GITHUB_ENV: 'github_env', + }, + params: { + settings: DEDUP_SETTINGS, + }, + files: { + github_env: '', + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '101') { + console.log(JSON.stringify({ + number: 101, + title: 'CLI crashes on start', + body: 'It segfaults immediately.', + comments: [{ body: 'Note: This is NOT a duplicate of #201, different root cause.' }] + })); + } else if (issueNum === '201') { + console.log(JSON.stringify({ + number: 201, + title: 'Segfault on launch', + body: 'The app crashes right away.', + comments: [] + })); + } else { + console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + // It might not call echo if no duplicates are found, or it might echo an empty list. + // We'll check if it does call echo, that 201 is NOT in it. + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + if (envCall) { + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, + ); + const issues = match + ? 
match[1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s) + : []; + expect(issues).not.toContain('201'); + } + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should differentiate false positives with high similarity', + prompt: ['--output-format', 'json', '--prompt', createPrompt(301)], + env: { + ISSUE_NUMBER: '301', + GITHUB_ENV: 'github_env', + }, + params: { + settings: DEDUP_SETTINGS, + }, + files: { + github_env: '', + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '301') { + console.log(JSON.stringify({ + number: 301, + title: 'App crashes when I click Save', + body: 'I click the save button and it crashes.', + comments: [] + })); + } else if (issueNum === '302') { + console.log(JSON.stringify({ + number: 302, + title: 'App crashes when I click Load', + body: 'I click the load button and it crashes. This seems related to the loader component.', + comments: [] + })); + } else { + console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + if (envCall) { + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, + ); + const issues = match + ? 
match[1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s) + : []; + // Should NOT contain 302 because it's a different feature (Save vs Load) despite crash + expect(issues).not.toContain('302'); + } + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should reject matches with low similarity', + prompt: ['--output-format', 'json', '--prompt', createPrompt(401)], + env: { + ISSUE_NUMBER: '401', + GITHUB_ENV: 'github_env', + }, + params: { + settings: DEDUP_SETTINGS, + }, + files: { + github_env: '', + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '401') { + console.log(JSON.stringify({ + number: 401, + title: 'Feature request: Dark mode', + body: 'Please add dark mode.', + comments: [] + })); + } else if (issueNum === '402') { + console.log(JSON.stringify({ + number: 402, + title: 'Feature request: Light mode', + body: 'Please add light mode.', + comments: [] + })); + } else { + console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + if (envCall) { + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, + ); + const issues = match + ? 
match[1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s) + : []; + expect(issues).not.toContain('402'); + expect(issues.length).toBe(0); + } + }, + }); + + evalTest('USUALLY_PASSES', { + name: 'should identify multiple duplicates', + prompt: ['--output-format', 'json', '--prompt', createPrompt(501)], + env: { + ISSUE_NUMBER: '501', + GITHUB_ENV: 'github_env', + }, + params: { + settings: DEDUP_SETTINGS, + }, + files: { + github_env: '', + 'bin/gh': `#!/usr/bin/env node +const args = process.argv.slice(2).join(' '); +if (args.includes('issue view')) { + const issueNum = args.match(/view (\\d+)/)?.[1]; + if (issueNum === '501') { + console.log(JSON.stringify({ + number: 501, + title: 'Crash on login', + body: 'The app crashes when I try to log in.', + comments: [] + })); + } else if (issueNum === '502') { + console.log(JSON.stringify({ + number: 502, + title: 'Crash on sign in', + body: 'Crashes during sign in process.', + comments: [] + })); + } else if (issueNum === '503') { + console.log(JSON.stringify({ + number: 503, + title: 'Crashes on login page', + body: 'I get a crash immediately on the login page.', + comments: [] + })); + } else { + console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); + } +} +`, + }, + assert: async (rig: any, result) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolLogs = rig.readToolLogs(); + const duplicatesCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'duplicates', + ); + expect(duplicatesCall).toBeDefined(); + + const shellCalls = toolLogs.filter( + (l: any) => l.toolRequest.name === 'run_shell_command', + ); + const envCall = shellCalls.find((call: any) => + call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), + ); + + expect(envCall).toBeDefined(); + const match = envCall.toolRequest.args.match( + /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, + ); + const issues = match + ? 
match[1] + .split(',') + .map((s: string) => s.trim()) + .filter((s: string) => s) + : []; + expect(issues).toContain('502'); + expect(issues).toContain('503'); + }, + }); +}); diff --git a/evals/dedup_refresh.eval.ts b/evals/dedup_refresh.eval.ts new file mode 100644 index 0000000000..480893bbd8 --- /dev/null +++ b/evals/dedup_refresh.eval.ts @@ -0,0 +1,81 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { evalTest } from './test-helper.js'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import yaml from 'js-yaml'; + +// Read the workflow file to extract the prompt and settings +const workflowPath = path.join( + process.cwd(), + '.github/workflows/gemini-scheduled-issue-dedup.yml', +); +const workflowContent = await fs.readFile(workflowPath, 'utf8'); + +const workflowData = yaml.load(workflowContent) as any; +const geminiStep = workflowData.jobs?.['refresh-embeddings']?.steps?.find( + (step: any) => step.id === 'gemini_refresh_embeddings', +); + +const REFRESH_PROMPT_TEMPLATE = geminiStep?.with?.prompt; +const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}'); + +if (!REFRESH_PROMPT_TEMPLATE) { + throw new Error('Could not extract prompt from dedup refresh workflow.'); +} + +const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts'); + +const createPrompt = () => { + return REFRESH_PROMPT_TEMPLATE.replace( + /\${{ github\.repository }}/g, + 'google-gemini/gemini-cli', + ); +}; + +const REFRESH_SETTINGS = { + ...ORIGINAL_SETTINGS, + mcpServers: { + issue_deduplication: { + command: 'npx', + args: ['tsx', mockMcpPath], + }, + }, +}; +if (REFRESH_SETTINGS.telemetry) { + delete REFRESH_SETTINGS.telemetry; +} + +describe('dedup_refresh_agent', () => { + evalTest('USUALLY_PASSES', { + name: 'should call refresh tool', + prompt: ['--output-format', 'json', '--prompt', createPrompt()], + approvalMode: 'yolo', + params: { 
+ settings: REFRESH_SETTINGS, + }, + assert: async (rig: any, result) => { + // result is the JSON output + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + const toolStats = output.stats.tools.byName; + expect(toolStats.refresh).toBeDefined(); + expect(toolStats.refresh.count).toBe(1); + expect(toolStats.refresh.success).toBe(1); + + // We still check telemetry for deep arg inspection if needed, + // but stats verify the high-level goal. + const toolLogs = rig.readToolLogs(); + const refreshCall = toolLogs.find( + (l: any) => l.toolRequest.name === 'refresh', + ); + expect(refreshCall).toBeDefined(); + }, + }); +}); diff --git a/evals/mocks/dedup_mcp.ts b/evals/mocks/dedup_mcp.ts new file mode 100644 index 0000000000..fa486448bd --- /dev/null +++ b/evals/mocks/dedup_mcp.ts @@ -0,0 +1,136 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { Server } from '@modelcontextprotocol/sdk/server/index.js'; +import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; +import { + CallToolRequestSchema, + ListToolsRequestSchema, +} from '@modelcontextprotocol/sdk/types.js'; + +const server = new Server( + { + name: 'mock-dedup-server', + version: '1.0.0', + }, + { + capabilities: { + tools: {}, + }, + }, +); + +server.setRequestHandler(ListToolsRequestSchema, async () => ({ + tools: [ + { + name: 'duplicates', + description: 'Find duplicate issues', + inputSchema: { + type: 'object', + properties: { + repo: { type: 'string' }, + issue_number: { type: 'number' }, + }, + required: ['repo', 'issue_number'], + }, + }, + { + name: 'refresh', + description: 'Refresh embeddings for all issues in a repository', + inputSchema: { + type: 'object', + properties: { + repo: { type: 'string' }, + force: { type: 'boolean' }, + }, + required: ['repo'], + }, + }, + ], +})); + +server.setRequestHandler(CallToolRequestSchema, async (request) => { + if (request.params.name === 'duplicates') { 
+ const issueNumber = request.params.arguments?.issue_number; + + // Logic to return different results based on issue number + if (issueNumber === 101) { + return { + content: [ + { + type: 'text', + text: JSON.stringify([ + { number: 201, similarity: 0.95 }, + { number: 202, similarity: 0.85 }, + ]), + }, + ], + }; + } + + // Edge Case: False Positive / Ambiguous + if (issueNumber === 301) { + return { + content: [ + { + type: 'text', + text: JSON.stringify([ + { number: 302, similarity: 0.88 }, // High similarity but different root cause + ]), + }, + ], + }; + } + + // Edge Case: Low similarity (should reject) + if (issueNumber === 401) { + return { + content: [ + { + type: 'text', + text: JSON.stringify([{ number: 402, similarity: 0.75 }]), + }, + ], + }; + } + + // Edge Case: Multiple duplicates + if (issueNumber === 501) { + return { + content: [ + { + type: 'text', + text: JSON.stringify([ + { number: 502, similarity: 0.92 }, + { number: 503, similarity: 0.91 }, + ]), + }, + ], + }; + } + + return { + content: [{ type: 'text', text: '[]' }], + }; + } + + if (request.params.name === 'refresh') { + return { + content: [ + { + type: 'text', + text: JSON.stringify({ status: 'success', refreshed_count: 10 }), + }, + ], + }; + } + + throw new Error('Tool not found'); +}); + +const transport = new StdioServerTransport(); +await server.connect(transport); +console.error('Mock Dedup MCP server running'); diff --git a/evals/test-helper.ts b/evals/test-helper.ts index 37d79eb6a4..1167be53eb 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -37,9 +37,10 @@ export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES'; export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { const fn = async () => { - const rig = new TestRig(); + const rig = new TestRig() as any; const { logDir, sanitizedName } = await prepareLogDir(evalCase.name); const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`); + rig._activityLogFile = activityLogFile; const 
logFile = path.join(logDir, `${sanitizedName}.log`); let isSuccess = false; try { @@ -53,6 +54,9 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { const fullPath = path.join(rig.testDir!, filePath); fs.mkdirSync(path.dirname(fullPath), { recursive: true }); fs.writeFileSync(fullPath, content); + if (filePath.startsWith('bin/')) { + fs.chmodSync(fullPath, 0o755); + } // If it's an agent file, calculate hash for acknowledgement if ( @@ -118,6 +122,8 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { timeout: evalCase.timeout, env: { GEMINI_CLI_ACTIVITY_LOG_FILE: activityLogFile, + PATH: `${path.join(rig.testDir!, 'bin')}${path.delimiter}${process.env.PATH}`, + ...evalCase.env, }, }); @@ -168,8 +174,9 @@ async function prepareLogDir(name: string) { export interface EvalCase { name: string; params?: Record<string, any>; - prompt: string; + prompt: string | string[]; timeout?: number; + env?: Record<string, string>; files?: Record<string, string>; approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan'; assert: (rig: TestRig, result: string) => Promise<void>; diff --git a/evals/triage.eval.ts b/evals/triage.eval.ts index f977caf949..4b6b2b80f3 100644 --- a/evals/triage.eval.ts +++ b/evals/triage.eval.ts @@ -75,6 +75,8 @@ const createPrompt = (title: string, body: string) => { .replace('${{ env.AVAILABLE_LABELS }}', availableLabels); }; +const TRIAGE_SETTINGS = {}; + const escapeHtml = (str: string) => { return str.replace(/[<>&'"]/g, (c) => { switch (c) { @@ -94,15 +96,21 @@ const escapeHtml = (str: string) => { }; const assertHasLabel = (expectedLabel: string) => { - return async (_rig: unknown, result: string) => { - const firstBrace = result.indexOf('{'); - const lastBrace = result.lastIndexOf('}'); + return async (rig: any, result: string) => { + // Verify JSON output stats + const output = JSON.parse(result); + expect(output.stats).toBeDefined(); + + // The model response JSON is in the 'response' field + const responseText = output.response; + const firstBrace = 
responseText.indexOf('{'); + const lastBrace = responseText.lastIndexOf('}'); if (firstBrace === -1 || lastBrace === -1 || lastBrace < firstBrace) { throw new Error( - `Could not find a JSON object in the result: "${escapeHtml(result)}"`, + `Could not find a JSON object in the response: "${escapeHtml(responseText)}"`, ); } - const jsonString = result.substring(firstBrace, lastBrace + 1); + const jsonString = responseText.substring(firstBrace, lastBrace + 1); let data: { labels_to_set?: string[] }; try { @@ -110,7 +118,7 @@ const assertHasLabel = (expectedLabel: string) => { } catch (e) { const err = e as Error; throw new Error( - `Failed to parse JSON. Error: ${err.message}. Result: "${escapeHtml(result)}"`, + `Failed to parse JSON. Error: ${err.message}. Response: "${escapeHtml(responseText)}"`, ); } @@ -123,142 +131,277 @@ const assertHasLabel = (expectedLabel: string) => { describe('triage_agent', () => { evalTest('USUALLY_PASSES', { name: 'should identify area/core for windows installation issues', - prompt: createPrompt( - 'CLI failed to install on Windows', - 'I tried running npm install but it failed with an error on Windows 11.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'CLI failed to install on Windows', + 'I tried running npm install but it failed with an error on Windows 11.', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/core'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/platform for CI/CD failures', - prompt: createPrompt( - 'Tests are failing in the CI/CD pipeline', - 'The github action is failing with a 500 error.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Tests are failing in the CI/CD pipeline', + 'The github action is failing with a 500 error.', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/platform'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/platform for quota 
issues', - prompt: createPrompt( - 'Resource Exhausted 429', - 'I am getting a 429 error when running the CLI.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Resource Exhausted 429', + 'I am getting a 429 error when running the CLI.', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/platform'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/core for local build failures', - prompt: createPrompt( - 'Local build failing', - 'I cannot build the project locally. npm run build fails.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Local build failing', + 'I cannot build the project locally. npm run build fails.', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/core'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/platform for sandbox issues', - prompt: createPrompt( - 'Sandbox connection failed', - 'I cannot connect to the docker sandbox environment.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Sandbox connection failed', + 'I cannot connect to the docker sandbox environment.', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/platform'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/core for local test failures', - prompt: createPrompt( - 'Local tests failing', - 'I am running npm test locally and it fails.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Local tests failing', + 'I am running npm test locally and it fails.', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/core'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/agent for questions about tools', - prompt: createPrompt( - 'Bug with web search?', - 'I am trying to use web search but I do not know the syntax. 
Is it @web or /web?', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Bug with web search?', + 'I am trying to use web search but I do not know the syntax. Is it @web or /web?', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/agent'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/extensions for feature requests', - prompt: createPrompt( - 'Please add a python extension', - 'I want to write python scripts as an extension.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Please add a python extension', + 'I want to write python scripts as an extension.', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/extensions'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/unknown for off-topic spam', - prompt: createPrompt('Buy cheap rolex', 'Click here for discount.'), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt('Buy cheap rolex', 'Click here for discount.'), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/unknown'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/core for crash reports phrased as questions', - prompt: createPrompt( - 'Why does it segfault?', - 'Why does the CLI segfault immediately when I run it on Ubuntu?', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Why does it segfault?', + 'Why does the CLI segfault immediately when I run it on Ubuntu?', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/core'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/agent for feature requests for built-in tools', - prompt: createPrompt( - 'Can we have a diff tool?', - 'Is it possible to add a built-in tool to show diffs before editing?', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Can we have a diff tool?', + 'Is it possible to add a 
built-in tool to show diffs before editing?', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/agent'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/enterprise for license questions', - prompt: createPrompt( - 'License key issue', - 'Where do I enter my enterprise license key? I cannot find the setting.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'License key issue', + 'Where do I enter my enterprise license key? I cannot find the setting.', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/enterprise'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/unknown for extremely vague reports', - prompt: createPrompt( - 'It does not work', - 'I tried to use it and it failed.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt('It does not work', 'I tried to use it and it failed.'), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/unknown'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/security for prompt injection reports', - prompt: createPrompt( - 'Prompt injection vulnerability', - 'I found a way to make the agent ignore instructions by saying "Ignore all previous instructions".', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Prompt injection vulnerability', + 'I found a way to make the agent ignore instructions by saying "Ignore all previous instructions".', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/security'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/non-interactive for headless crashes', - prompt: createPrompt( - 'Headless mode segfault', - 'When I run with --headless, the CLI crashes immediately.', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Headless mode segfault', + 'When I run with --headless, the CLI crashes immediately.', + ), + ], + 
params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/non-interactive'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/agent for mixed feedback and tool bugs', - prompt: createPrompt( - 'Great tool but web search fails', - 'I love using Gemini CLI, it is amazing! However, the @web tool gives me an error every time I search for "react".', - ), + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Great tool but web search fails', + 'I love using Gemini CLI, it is amazing! However, the @web tool gives me an error every time I search for "react".', + ), + ], + params: { settings: TRIAGE_SETTINGS }, assert: assertHasLabel('area/agent'), }); + + evalTest('USUALLY_PASSES', { + name: 'should identify area/core for UI performance issues', + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'UI is very slow', + 'The new interface is lagging and unresponsive when I scroll.', + ), + ], + params: { settings: TRIAGE_SETTINGS }, + assert: assertHasLabel('area/core'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should identify area/security for accidental secret leakage', + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt( + 'Leaked API key in logs', + 'I accidentally posted my API key in a previous issue comment. Can you delete it?', + ), + ], + params: { settings: TRIAGE_SETTINGS }, + assert: assertHasLabel('area/security'), + }); + + evalTest('USUALLY_PASSES', { + name: 'should identify area/unknown for nonsensical input', + prompt: [ + '--output-format', + 'json', + '--prompt', + createPrompt('asdfasdf', 'qwerqwer zxcvbnm'), + ], + params: { settings: TRIAGE_SETTINGS }, + assert: assertHasLabel('area/unknown'), + }); });