diff --git a/evals/batch_triage.eval.ts b/evals/batch_triage.eval.ts deleted file mode 100644 index 52376e7bc2..0000000000 --- a/evals/batch_triage.eval.ts +++ /dev/null @@ -1,264 +0,0 @@ -/** - * @license - * Copyright 2025 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; -import fs from 'node:fs/promises'; -import path from 'node:path'; -import yaml from 'js-yaml'; - -// Read the workflow file to extract the prompt -const workflowPath = path.join( - process.cwd(), - '.github/workflows/gemini-scheduled-issue-triage.yml', -); -const workflowContent = await fs.readFile(workflowPath, 'utf8'); - -// Use a YAML parser for robustness -const workflowData = yaml.load(workflowContent) as { - jobs?: { - 'triage-issues'?: { - steps?: { - id?: string; - with?: { prompt?: string; script?: string }; - env?: { AVAILABLE_LABELS?: string }; - }[]; - }; - }; -}; - -const geminiStep = workflowData.jobs?.['triage-issues']?.steps?.find( - (step) => step.id === 'gemini_issue_analysis', -); - -const labelsStep = workflowData.jobs?.['triage-issues']?.steps?.find( - (step) => step.id === 'get_labels', -); - -const BATCH_TRIAGE_PROMPT_TEMPLATE = geminiStep?.with?.prompt; -const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}'); -const LABELS_SCRIPT = labelsStep?.with?.script; - -if (!BATCH_TRIAGE_PROMPT_TEMPLATE) { - throw new Error( - 'Could not extract prompt from workflow file. Check for `jobs.triage-issues.steps[id=gemini_issue_analysis].with.prompt` in the YAML file.', - ); -} - -// Extract available labels from the script -let availableLabels = ''; -if (LABELS_SCRIPT) { - const match = LABELS_SCRIPT.match( - /const labelNames = labels.map\(label => label.name\);/, - ); - // Wait, the script in scheduled triage is different! - // const labelNames = labels.map(label => label.name); - // It gets ALL labels. - // But the prompt expects "${AVAILABLE_LABELS}". - // In the test, we can just mock a reasonable set of labels. - availableLabels = - 'area/agent, area/core, area/enterprise, area/extensions, area/non-interactive, area/platform, area/security, area/unknown, kind/bug, kind/feature, kind/question, priority/p0, priority/p1, priority/p2, priority/p3'; -} - -const createPrompt = () => { - return BATCH_TRIAGE_PROMPT_TEMPLATE.replace( - '${AVAILABLE_LABELS}', - availableLabels, - ); -}; - -const BATCH_TRIAGE_SETTINGS = { - ...ORIGINAL_SETTINGS, -}; -if (BATCH_TRIAGE_SETTINGS.telemetry) { - delete BATCH_TRIAGE_SETTINGS.telemetry; -} - -const escapeHtml = (str: string) => { - return str.replace(/[<>&'"]/g, (c) => { - switch (c) { - case '<': - return '<'; - case '>': - return '>'; - case '&': - return '&'; - case "'": - return '''; - case '"': - return '"'; - } - return ''; - }); -}; - -const assertHasIssueLabel = (issueNumber: number, expectedLabel: string) => { - return async (rig: any, result: string) => { - // Verify JSON output stats - const output = JSON.parse(result); - expect(output.stats).toBeDefined(); - - // The model response JSON is in the 'response' field - const responseText = output.response; - let jsonString: string; - const match = responseText.match(/```json\s*([\s\S]*?)\s*```/); - if (match?.[1]) { - jsonString = match[1]; - } else { - const firstBracket = responseText.indexOf('['); - const lastBracket = responseText.lastIndexOf(']'); - if ( - firstBracket === -1 || - lastBracket === -1 || - lastBracket < firstBracket - ) { - throw new Error( - `Could not find a JSON array in the response: "${escapeHtml(responseText)}"`, - ); - } - jsonString = responseText.substring(firstBracket, lastBracket + 1); - } - - let data: { issue_number: number; labels_to_add: string[] }[]; - try { - data = JSON.parse(jsonString); - } catch (e) { - const err = e as Error; - throw new Error( - `Failed to parse JSON. Error: ${err.message}. Response: "${escapeHtml(responseText)}"`, - ); - } - - const issue = data.find((i) => i.issue_number === issueNumber); - if (!issue) { - throw new Error( - `Issue #${issueNumber} not found in output: ${JSON.stringify(data)}`, - ); - } - - expect(issue.labels_to_add).toContain(expectedLabel); - }; -}; - -describe('batch_triage_agent', () => { - evalTest('USUALLY_PASSES', { - name: 'should identify area/core for local test failures in batch', - prompt: ['--output-format', 'json', '--prompt', createPrompt()], - env: { - AVAILABLE_LABELS: availableLabels, - ISSUES_TO_TRIAGE: JSON.stringify([ - { - number: 101, - title: 'Local tests failing', - body: 'I am running npm test locally and it fails with an error.', - }, - ]), - }, - params: { settings: BATCH_TRIAGE_SETTINGS }, - assert: assertHasIssueLabel(101, 'area/core'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/platform for CI failures in batch', - prompt: ['--output-format', 'json', '--prompt', createPrompt()], - env: { - AVAILABLE_LABELS: availableLabels, - ISSUES_TO_TRIAGE: JSON.stringify([ - { - number: 102, - title: 'CI pipeline failed', - body: 'The GitHub Action for tests failed on the main branch.', - }, - ]), - }, - params: { settings: BATCH_TRIAGE_SETTINGS }, - assert: assertHasIssueLabel(102, 'area/platform'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should handle mixed batch correctly', - prompt: ['--output-format', 'json', '--prompt', createPrompt()], - env: { - AVAILABLE_LABELS: availableLabels, - ISSUES_TO_TRIAGE: JSON.stringify([ - { - number: 103, - title: 'Cannot install on MacOS', - body: 'Install fails with permission error.', - }, - { - number: 104, - title: 'Click to win', - body: 'Spam body', - }, - ]), - }, - params: { settings: BATCH_TRIAGE_SETTINGS }, - assert: async (rig: any, result) => { - // Assert issue 103 has area/core - await assertHasIssueLabel(103, 'area/core')(rig, result); - // Assert issue 104 has area/unknown - await assertHasIssueLabel(104, 'area/unknown')(rig, result); - }, - }); - - evalTest('USUALLY_PASSES', { - name: 'should handle issues needing retesting (old version)', - prompt: ['--output-format', 'json', '--prompt', createPrompt()], - env: { - AVAILABLE_LABELS: availableLabels, - ISSUES_TO_TRIAGE: JSON.stringify([ - { - number: 105, - title: 'Crash on version 0.1.0', - body: 'I am using /about and it says 0.1.0. The app crashes when I run it.', - }, - ]), - }, - params: { settings: BATCH_TRIAGE_SETTINGS }, - assert: assertHasIssueLabel(105, 'status/need-retesting'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should handle issues needing more information', - prompt: ['--output-format', 'json', '--prompt', createPrompt()], - env: { - AVAILABLE_LABELS: availableLabels, - ISSUES_TO_TRIAGE: JSON.stringify([ - { - number: 106, - title: 'It does not work', - body: 'Something is broken.', - }, - ]), - }, - params: { settings: BATCH_TRIAGE_SETTINGS }, - assert: assertHasIssueLabel(106, 'status/need-information'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should handle large batch of diverse issues', - prompt: ['--output-format', 'json', '--prompt', createPrompt()], - env: { - AVAILABLE_LABELS: availableLabels, - ISSUES_TO_TRIAGE: JSON.stringify([ - { number: 107, title: 'Bug A', body: 'Local test failure' }, - { number: 108, title: 'Bug B', body: 'CI failure' }, - { number: 109, title: 'Bug C', body: 'Security leak' }, - { number: 110, title: 'Bug D', body: 'Spam' }, - { number: 111, title: 'Bug E', body: 'Old version 0.0.1' }, - ]), - }, - params: { settings: BATCH_TRIAGE_SETTINGS }, - assert: async (rig: any, result) => { - await assertHasIssueLabel(107, 'area/core')(rig, result); - await assertHasIssueLabel(108, 'area/platform')(rig, result); - await assertHasIssueLabel(109, 'area/security')(rig, result); - await assertHasIssueLabel(110, 'area/unknown')(rig, result); - await assertHasIssueLabel(111, 'status/need-retesting')(rig, result); - }, - }); -}); diff --git a/evals/dedup.eval.ts b/evals/dedup.eval.ts deleted file mode 100644 index 3980eb689c..0000000000 --- a/evals/dedup.eval.ts +++ /dev/null @@ -1,422 +0,0 @@ -/** - * @license - * Copyright 2025 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; -import fs from 'node:fs/promises'; -import path from 'node:path'; -import yaml from 'js-yaml'; - -// Read the workflow file to extract the prompt and settings -const workflowPath = path.join( - process.cwd(), - '.github/workflows/gemini-automated-issue-dedup.yml', -); -const workflowContent = await fs.readFile(workflowPath, 'utf8'); - -const workflowData = yaml.load(workflowContent) as any; -const geminiStep = workflowData.jobs?.['find-duplicates']?.steps?.find( - (step: any) => step.id === 'gemini_issue_deduplication', -); - -const DEDUP_PROMPT_TEMPLATE = geminiStep?.with?.prompt; -const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}'); - -if (!DEDUP_PROMPT_TEMPLATE) { - throw new Error('Could not extract prompt from de-duplication workflow.'); -} - -const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts'); - -const createPrompt = (issueNumber: number) => { - // The prompt uses ${{ github.event.issue.number }} but also references ${ISSUE_NUMBER} (env) - return DEDUP_PROMPT_TEMPLATE.replace( - /\${{ github\.repository }}/g, - 'google-gemini/gemini-cli', - ).replace(/\${{ github\.event\.issue\.number }}/g, issueNumber.toString()); -}; - -const DEDUP_SETTINGS = { - ...ORIGINAL_SETTINGS, - mcpServers: { - issue_deduplication: { - command: 'npx', - args: ['tsx', mockMcpPath], - }, - }, -}; -if (DEDUP_SETTINGS.telemetry) { - delete DEDUP_SETTINGS.telemetry; -} - -describe('dedup_agent', () => { - evalTest('USUALLY_PASSES', { - name: 'should identify duplicate issues', - prompt: ['--output-format', 'json', '--prompt', createPrompt(101)], - env: { - ISSUE_NUMBER: '101', - GITHUB_ENV: 'github_env', - }, - params: { - settings: DEDUP_SETTINGS, - }, - files: { - github_env: '', - // Mock gh binary - 'bin/gh': `#!/usr/bin/env node -const args = process.argv.slice(2).join(' '); -if (args.includes('issue view')) { - const issueNum = args.match(/view (\\d+)/)?.[1]; - if (issueNum === '101') { - console.log(JSON.stringify({ - number: 101, - title: 'CLI crashes on start', - body: 'It segfaults immediately.', - comments: [] - })); - } else if (issueNum === '201') { - console.log(JSON.stringify({ - number: 201, - title: 'Segfault on launch', - body: 'The app crashes right away.', - comments: [] - })); - } else if (issueNum === '202') { - console.log(JSON.stringify({ - number: 202, - title: 'Unrelated bug', - body: 'Themes are not working.', - comments: [] - })); - } -} -`, - }, - assert: async (rig: any, result) => { - // Verify JSON output stats - const output = JSON.parse(result); - expect(output.stats).toBeDefined(); - expect(output.stats.tools.byName['duplicates']).toBeDefined(); - expect(output.stats.tools.byName['run_shell_command']).toBeDefined(); - - // Verify detailed tool usage via telemetry - const toolLogs = rig.readToolLogs(); - const duplicatesCall = toolLogs.find( - (l: any) => l.toolRequest.name === 'duplicates', - ); - expect(duplicatesCall).toBeDefined(); - - // The current prompt uses echo to set GITHUB_ENV - // We check the tool call for the echo command - const shellCalls = toolLogs.filter( - (l: any) => l.toolRequest.name === 'run_shell_command', - ); - const envCall = shellCalls.find((call: any) => - call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), - ); - - expect(envCall).toBeDefined(); - // Check the command content - const match = envCall.toolRequest.args.match( - /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, - ); - expect(match).not.toBeNull(); - const issues = match![1] - .split(',') - .map((s: string) => s.trim()) - .filter((s: string) => s); - expect(issues).toContain('201'); - expect(issues).not.toContain('202'); - }, - }); - - evalTest('USUALLY_PASSES', { - name: 'should respect "not a duplicate" comments', - prompt: ['--output-format', 'json', '--prompt', createPrompt(101)], - env: { - ISSUE_NUMBER: '101', - GITHUB_ENV: 'github_env', - }, - params: { - settings: DEDUP_SETTINGS, - }, - files: { - github_env: '', - 'bin/gh': `#!/usr/bin/env node -const args = process.argv.slice(2).join(' '); -if (args.includes('issue view')) { - const issueNum = args.match(/view (\\d+)/)?.[1]; - if (issueNum === '101') { - console.log(JSON.stringify({ - number: 101, - title: 'CLI crashes on start', - body: 'It segfaults immediately.', - comments: [{ body: 'Note: This is NOT a duplicate of #201, different root cause.' }] - })); - } else if (issueNum === '201') { - console.log(JSON.stringify({ - number: 201, - title: 'Segfault on launch', - body: 'The app crashes right away.', - comments: [] - })); - } else { - console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); - } -} -`, - }, - assert: async (rig: any, result) => { - // Verify JSON output stats - const output = JSON.parse(result); - expect(output.stats).toBeDefined(); - - const toolLogs = rig.readToolLogs(); - const duplicatesCall = toolLogs.find( - (l: any) => l.toolRequest.name === 'duplicates', - ); - expect(duplicatesCall).toBeDefined(); - - const shellCalls = toolLogs.filter( - (l: any) => l.toolRequest.name === 'run_shell_command', - ); - // It might not call echo if no duplicates are found, or it might echo an empty list. - // We'll check if it does call echo, that 201 is NOT in it. - const envCall = shellCalls.find((call: any) => - call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), - ); - - if (envCall) { - const match = envCall.toolRequest.args.match( - /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, - ); - const issues = match - ? match[1] - .split(',') - .map((s: string) => s.trim()) - .filter((s: string) => s) - : []; - expect(issues).not.toContain('201'); - } - }, - }); - - evalTest('USUALLY_PASSES', { - name: 'should differentiate false positives with high similarity', - prompt: ['--output-format', 'json', '--prompt', createPrompt(301)], - env: { - ISSUE_NUMBER: '301', - GITHUB_ENV: 'github_env', - }, - params: { - settings: DEDUP_SETTINGS, - }, - files: { - github_env: '', - 'bin/gh': `#!/usr/bin/env node -const args = process.argv.slice(2).join(' '); -if (args.includes('issue view')) { - const issueNum = args.match(/view (\\d+)/)?.[1]; - if (issueNum === '301') { - console.log(JSON.stringify({ - number: 301, - title: 'App crashes when I click Save', - body: 'I click the save button and it crashes.', - comments: [] - })); - } else if (issueNum === '302') { - console.log(JSON.stringify({ - number: 302, - title: 'App crashes when I click Load', - body: 'I click the load button and it crashes. This seems related to the loader component.', - comments: [] - })); - } else { - console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); - } -} -`, - }, - assert: async (rig: any, result) => { - // Verify JSON output stats - const output = JSON.parse(result); - expect(output.stats).toBeDefined(); - - const toolLogs = rig.readToolLogs(); - const duplicatesCall = toolLogs.find( - (l: any) => l.toolRequest.name === 'duplicates', - ); - expect(duplicatesCall).toBeDefined(); - - const shellCalls = toolLogs.filter( - (l: any) => l.toolRequest.name === 'run_shell_command', - ); - const envCall = shellCalls.find((call: any) => - call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), - ); - - if (envCall) { - const match = envCall.toolRequest.args.match( - /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, - ); - const issues = match - ? match[1] - .split(',') - .map((s: string) => s.trim()) - .filter((s: string) => s) - : []; - // Should NOT contain 302 because it's a different feature (Save vs Load) despite crash - expect(issues).not.toContain('302'); - } - }, - }); - - evalTest('USUALLY_PASSES', { - name: 'should reject matches with low similarity', - prompt: ['--output-format', 'json', '--prompt', createPrompt(401)], - env: { - ISSUE_NUMBER: '401', - GITHUB_ENV: 'github_env', - }, - params: { - settings: DEDUP_SETTINGS, - }, - files: { - github_env: '', - 'bin/gh': `#!/usr/bin/env node -const args = process.argv.slice(2).join(' '); -if (args.includes('issue view')) { - const issueNum = args.match(/view (\\d+)/)?.[1]; - if (issueNum === '401') { - console.log(JSON.stringify({ - number: 401, - title: 'Feature request: Dark mode', - body: 'Please add dark mode.', - comments: [] - })); - } else if (issueNum === '402') { - console.log(JSON.stringify({ - number: 402, - title: 'Feature request: Light mode', - body: 'Please add light mode.', - comments: [] - })); - } else { - console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); - } -} -`, - }, - assert: async (rig: any, result) => { - // Verify JSON output stats - const output = JSON.parse(result); - expect(output.stats).toBeDefined(); - - const toolLogs = rig.readToolLogs(); - const duplicatesCall = toolLogs.find( - (l: any) => l.toolRequest.name === 'duplicates', - ); - expect(duplicatesCall).toBeDefined(); - - const shellCalls = toolLogs.filter( - (l: any) => l.toolRequest.name === 'run_shell_command', - ); - const envCall = shellCalls.find((call: any) => - call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), - ); - - if (envCall) { - const match = envCall.toolRequest.args.match( - /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, - ); - const issues = match - ? match[1] - .split(',') - .map((s: string) => s.trim()) - .filter((s: string) => s) - : []; - expect(issues).not.toContain('402'); - expect(issues.length).toBe(0); - } - }, - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify multiple duplicates', - prompt: ['--output-format', 'json', '--prompt', createPrompt(501)], - env: { - ISSUE_NUMBER: '501', - GITHUB_ENV: 'github_env', - }, - params: { - settings: DEDUP_SETTINGS, - }, - files: { - github_env: '', - 'bin/gh': `#!/usr/bin/env node -const args = process.argv.slice(2).join(' '); -if (args.includes('issue view')) { - const issueNum = args.match(/view (\\d+)/)?.[1]; - if (issueNum === '501') { - console.log(JSON.stringify({ - number: 501, - title: 'Crash on login', - body: 'The app crashes when I try to log in.', - comments: [] - })); - } else if (issueNum === '502') { - console.log(JSON.stringify({ - number: 502, - title: 'Crash on sign in', - body: 'Crashes during sign in process.', - comments: [] - })); - } else if (issueNum === '503') { - console.log(JSON.stringify({ - number: 503, - title: 'Crashes on login page', - body: 'I get a crash immediately on the login page.', - comments: [] - })); - } else { - console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] })); - } -} -`, - }, - assert: async (rig: any, result) => { - // Verify JSON output stats - const output = JSON.parse(result); - expect(output.stats).toBeDefined(); - - const toolLogs = rig.readToolLogs(); - const duplicatesCall = toolLogs.find( - (l: any) => l.toolRequest.name === 'duplicates', - ); - expect(duplicatesCall).toBeDefined(); - - const shellCalls = toolLogs.filter( - (l: any) => l.toolRequest.name === 'run_shell_command', - ); - const envCall = shellCalls.find((call: any) => - call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'), - ); - - expect(envCall).toBeDefined(); - const match = envCall.toolRequest.args.match( - /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/, - ); - const issues = match - ? match[1] - .split(',') - .map((s: string) => s.trim()) - .filter((s: string) => s) - : []; - expect(issues).toContain('502'); - expect(issues).toContain('503'); - }, - }); -}); diff --git a/evals/dedup_refresh.eval.ts b/evals/dedup_refresh.eval.ts deleted file mode 100644 index 480893bbd8..0000000000 --- a/evals/dedup_refresh.eval.ts +++ /dev/null @@ -1,81 +0,0 @@ -/** - * @license - * Copyright 2025 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; -import fs from 'node:fs/promises'; -import path from 'node:path'; -import yaml from 'js-yaml'; - -// Read the workflow file to extract the prompt and settings -const workflowPath = path.join( - process.cwd(), - '.github/workflows/gemini-scheduled-issue-dedup.yml', -); -const workflowContent = await fs.readFile(workflowPath, 'utf8'); - -const workflowData = yaml.load(workflowContent) as any; -const geminiStep = workflowData.jobs?.['refresh-embeddings']?.steps?.find( - (step: any) => step.id === 'gemini_refresh_embeddings', -); - -const REFRESH_PROMPT_TEMPLATE = geminiStep?.with?.prompt; -const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}'); - -if (!REFRESH_PROMPT_TEMPLATE) { - throw new Error('Could not extract prompt from dedup refresh workflow.'); -} - -const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts'); - -const createPrompt = () => { - return REFRESH_PROMPT_TEMPLATE.replace( - /\${{ github\.repository }}/g, - 'google-gemini/gemini-cli', - ); -}; - -const REFRESH_SETTINGS = { - ...ORIGINAL_SETTINGS, - mcpServers: { - issue_deduplication: { - command: 'npx', - args: ['tsx', mockMcpPath], - }, - }, -}; -if (REFRESH_SETTINGS.telemetry) { - delete REFRESH_SETTINGS.telemetry; -} - -describe('dedup_refresh_agent', () => { - evalTest('USUALLY_PASSES', { - name: 'should call refresh tool', - prompt: ['--output-format', 'json', '--prompt', createPrompt()], - approvalMode: 'yolo', - params: { - settings: REFRESH_SETTINGS, - }, - assert: async (rig: any, result) => { - // result is the JSON output - const output = JSON.parse(result); - expect(output.stats).toBeDefined(); - - const toolStats = output.stats.tools.byName; - expect(toolStats.refresh).toBeDefined(); - expect(toolStats.refresh.count).toBe(1); - expect(toolStats.refresh.success).toBe(1); - - // We still check telemetry for deep arg inspection if needed, - // but stats verify the high-level goal. - const toolLogs = rig.readToolLogs(); - const refreshCall = toolLogs.find( - (l: any) => l.toolRequest.name === 'refresh', - ); - expect(refreshCall).toBeDefined(); - }, - }); -}); diff --git a/evals/triage.eval.ts b/evals/triage.eval.ts deleted file mode 100644 index d32087d4dc..0000000000 --- a/evals/triage.eval.ts +++ /dev/null @@ -1,413 +0,0 @@ -/** - * @license - * Copyright 2025 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; -import fs from 'node:fs/promises'; -import path from 'node:path'; -import yaml from 'js-yaml'; - -// Read the workflow file to extract the prompt -const workflowPath = path.join( - process.cwd(), - '.github/workflows/gemini-automated-issue-triage.yml', -); -const workflowContent = await fs.readFile(workflowPath, 'utf8'); - -// Use a YAML parser for robustness -const workflowData = yaml.load(workflowContent) as { - jobs?: { - 'triage-issue'?: { - steps?: { - id?: string; - with?: { prompt?: string; script?: string }; - }[]; - }; - }; -}; - -const triageStep = workflowData.jobs?.['triage-issue']?.steps?.find( - (step) => step.id === 'gemini_issue_analysis', -); - -const labelsStep = workflowData.jobs?.['triage-issue']?.steps?.find( - (step) => step.id === 'get_labels', -); - -const TRIAGE_PROMPT_TEMPLATE = triageStep?.with?.prompt; -const LABELS_SCRIPT = labelsStep?.with?.script; - -if (!TRIAGE_PROMPT_TEMPLATE) { - throw new Error( - 'Could not extract prompt from workflow file. Check for `jobs.triage-issue.steps[id=gemini_issue_analysis].with.prompt` in the YAML file.', - ); -} - -// Extract available labels from the script -let availableLabels = ''; -if (LABELS_SCRIPT) { - const match = LABELS_SCRIPT.match(/const allowedLabels = \[([\s\S]+?)\];/); - if (match && match[1]) { - // Clean up the extracted string: remove quotes, commas, and whitespace - availableLabels = match[1] - .replace(/['"\n\r]/g, '') - .split(',') - .map((s) => s.trim()) - .filter((s) => s.length > 0) - .join(', '); - } -} - -if (!availableLabels) { - throw new Error( - 'Could not extract available labels from workflow file. Check for `jobs.triage-issue.steps[id=get_labels].with.script` containing `const allowedLabels = [...]`.', - ); -} - -const createPrompt = (title: string, body: string) => { - // The placeholders in the YAML are ${{ env.ISSUE_TITLE }} etc. - // We need to replace them with the actual values for the test. - return TRIAGE_PROMPT_TEMPLATE.replace('${{ env.ISSUE_TITLE }}', title) - .replace('${{ env.ISSUE_BODY }}', body) - .replace('${{ env.AVAILABLE_LABELS }}', availableLabels); -}; - -const TRIAGE_SETTINGS = {}; - -const escapeHtml = (str: string) => { - return str.replace(/[<>&'"]/g, (c) => { - switch (c) { - case '<': - return '<'; - case '>': - return '>'; - case '&': - return '&'; - case "'": - return '''; - case '"': - return '"'; - } - return ''; // Should not happen - }); -}; - -const assertHasLabel = (expectedLabel: string) => { - return async (rig: any, result: string) => { - // Verify JSON output stats - const output = JSON.parse(result); - expect(output.stats).toBeDefined(); - - // The model response JSON is in the 'response' field - const responseText = output.response; - let jsonString: string; - const match = responseText.match(/```json\s*([\s\S]*?)\s*```/); - if (match?.[1]) { - jsonString = match[1]; - } else { - const firstBrace = responseText.indexOf('{'); - const lastBrace = responseText.lastIndexOf('}'); - if (firstBrace === -1 || lastBrace === -1 || lastBrace < firstBrace) { - throw new Error( - `Could not find a JSON object in the response: "${escapeHtml(responseText)}"`, - ); - } - jsonString = responseText.substring(firstBrace, lastBrace + 1); - } - - let data: { labels_to_set?: string[] }; - try { - data = JSON.parse(jsonString); - } catch (e) { - const err = e as Error; - throw new Error( - `Failed to parse JSON. Error: ${err.message}. Response: "${escapeHtml(responseText)}"`, - ); - } - - expect(data).toHaveProperty('labels_to_set'); - expect(Array.isArray(data.labels_to_set)).toBe(true); - expect(data.labels_to_set).toContain(expectedLabel); - }; -}; - -describe('triage_agent', () => { - evalTest('USUALLY_PASSES', { - name: 'should identify area/core for windows installation issues', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'CLI failed to install on Windows', - 'I tried running npm install but it failed with an error on Windows 11.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/core'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/platform for CI/CD failures', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Tests are failing in the CI/CD pipeline', - 'The github action is failing with a 500 error.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/platform'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/platform for quota issues', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Resource Exhausted 429', - 'I am getting a 429 error when running the CLI.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/platform'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/core for local build failures', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Local build failing', - 'I cannot build the project locally. npm run build fails.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/core'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/platform for sandbox issues', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Sandbox connection failed', - 'I cannot connect to the docker sandbox environment.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/platform'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/core for local test failures', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Local tests failing', - 'I am running npm test locally and it fails.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/core'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/agent for questions about tools', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Bug with web search?', - 'I am trying to use web search but I do not know the syntax. Is it @web or /web?', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/agent'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/extensions for feature requests', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Please add a python extension', - 'I want to write python scripts as an extension.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/extensions'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/unknown for off-topic spam', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt('Buy cheap rolex', 'Click here for discount.'), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/unknown'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/core for crash reports phrased as questions', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Why does it segfault?', - 'Why does the CLI segfault immediately when I run it on Ubuntu?', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/core'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/agent for feature requests for built-in tools', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Can we have a diff tool?', - 'Is it possible to add a built-in tool to show diffs before editing?', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/agent'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/enterprise for license questions', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'License key issue', - 'Where do I enter my enterprise license key? I cannot find the setting.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/enterprise'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/unknown for extremely vague reports', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt('It does not work', 'I tried to use it and it failed.'), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/unknown'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/security for prompt injection reports', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Prompt injection vulnerability', - 'I found a way to make the agent ignore instructions by saying "Ignore all previous instructions".', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/security'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/non-interactive for headless crashes', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Headless mode segfault', - 'When I run with --headless, the CLI crashes immediately.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/non-interactive'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/agent for mixed feedback and tool bugs', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Great tool but web search fails', - 'I love using Gemini CLI, it is amazing! However, the @web tool gives me an error every time I search for "react".', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/agent'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/core for UI performance issues', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'UI is very slow', - 'The new interface is lagging and unresponsive when I scroll.', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/core'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/security for accidental secret leakage', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt( - 'Leaked API key in logs', - 'I accidentally posted my API key in a previous issue comment. Can you delete it?', - ), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/security'), - }); - - evalTest('USUALLY_PASSES', { - name: 'should identify area/unknown for nonsensical input', - prompt: [ - '--output-format', - 'json', - '--prompt', - createPrompt('asdfasdf', 'qwerqwer zxcvbnm'), - ], - params: { settings: TRIAGE_SETTINGS }, - assert: assertHasLabel('area/unknown'), - }); -});