mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-12 15:10:59 -07:00
chore: remove old eval files moved to workflows/
This commit is contained in:
@@ -1,264 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
import fs from 'node:fs/promises';
|
||||
import path from 'node:path';
|
||||
import yaml from 'js-yaml';
|
||||
|
||||
// Read the workflow file to extract the prompt
|
||||
// Path of the scheduled issue-triage workflow, resolved against the current
// working directory.
const workflowPath = path.join(
  process.cwd(),
  '.github/workflows/gemini-scheduled-issue-triage.yml',
);
const workflowContent = await fs.readFile(workflowPath, 'utf8');

// Use a YAML parser for robustness. The asserted shape lists only the fields
// this file reads; `with.settings` is included because it is read and
// JSON-parsed below.
const workflowData = yaml.load(workflowContent) as {
  jobs?: {
    'triage-issues'?: {
      steps?: {
        id?: string;
        with?: { prompt?: string; script?: string; settings?: string };
        env?: { AVAILABLE_LABELS?: string };
      }[];
    };
  };
};

// Step that runs the Gemini analysis; its `with` block carries the prompt and
// the settings JSON.
const geminiStep = workflowData.jobs?.['triage-issues']?.steps?.find(
  (step) => step.id === 'gemini_issue_analysis',
);

// Step whose `with.script` fetches the labels.
const labelsStep = workflowData.jobs?.['triage-issues']?.steps?.find(
  (step) => step.id === 'get_labels',
);

const BATCH_TRIAGE_PROMPT_TEMPLATE = geminiStep?.with?.prompt;
// `||` (not `??`) on purpose: an empty settings string must also fall back to
// '{}', because JSON.parse('') throws.
const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}');
const LABELS_SCRIPT = labelsStep?.with?.script;

// Fail fast at module load: without a prompt none of the evals can run.
if (!BATCH_TRIAGE_PROMPT_TEMPLATE) {
  throw new Error(
    'Could not extract prompt from workflow file. Check for `jobs.triage-issues.steps[id=gemini_issue_analysis].with.prompt` in the YAML file.',
  );
}
|
||||
|
||||
// Extract available labels from the script
|
||||
// Labels substituted for the `${AVAILABLE_LABELS}` placeholder and exported to
// the agent through the AVAILABLE_LABELS env variable.
//
// The scheduled workflow's `get_labels` script is not parsed here: per the
// original note it collects every repository label
// (`labels.map(label => label.name)`), so there is no static list to extract.
// A fixed, representative label set is used instead whenever the step defines
// a script; otherwise the list stays empty.
//
// NOTE(review): the tests in this file also expect `status/need-retesting` and
// `status/need-information`, which are not in this list — confirm the prompt
// allows those labels independently of AVAILABLE_LABELS.
const availableLabels: string = LABELS_SCRIPT
  ? 'area/agent, area/core, area/enterprise, area/extensions, area/non-interactive, area/platform, area/security, area/unknown, kind/bug, kind/feature, kind/question, priority/p0, priority/p1, priority/p2, priority/p3'
  : '';
|
||||
|
||||
/**
 * Builds the prompt passed to the CLI: the workflow's prompt template with the
 * first `${AVAILABLE_LABELS}` placeholder replaced by the mocked label list.
 */
const createPrompt = () => {
  const placeholder = '${AVAILABLE_LABELS}';
  const prompt = BATCH_TRIAGE_PROMPT_TEMPLATE.replace(
    placeholder,
    availableLabels,
  );
  return prompt;
};
|
||||
|
||||
// Settings handed to every eval run: a shallow copy of the settings parsed
// from the workflow, with a truthy `telemetry` entry removed.
const BATCH_TRIAGE_SETTINGS = Object.assign({}, ORIGINAL_SETTINGS);
const batchSettingsHaveTelemetry = Boolean(BATCH_TRIAGE_SETTINGS.telemetry);
if (batchSettingsHaveTelemetry) {
  delete BATCH_TRIAGE_SETTINGS.telemetry;
}
|
||||
|
||||
/**
 * Escapes the five HTML-significant characters (`<`, `>`, `&`, `'`, `"`) in
 * `str` so the text can be embedded in error messages without being
 * interpreted as markup.
 *
 * @param str Raw text to escape.
 * @returns `str` with each special character replaced by its HTML entity.
 */
const escapeHtml = (str: string) => {
  const entities: Record<string, string> = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    "'": '&#39;',
    '"': '&quot;',
  };
  // The character class only matches keys of `entities`; the '' fallback is
  // unreachable and kept purely for type safety.
  return str.replace(/[<>&'"]/g, (c) => entities[c] ?? '');
};
|
||||
|
||||
/**
 * Builds an eval assertion that checks the batch-triage agent proposed
 * `expectedLabel` for issue `issueNumber`.
 *
 * The returned callback:
 *  1. parses `result` as the CLI's JSON output and requires a `stats` field;
 *  2. takes the model text from the `response` field and extracts a JSON
 *     array from it — from a ```json fenced block when one is present,
 *     otherwise from the first `[` up to the last `]`;
 *  3. finds the entry whose `issue_number` equals `issueNumber` and expects
 *     its `labels_to_add` to contain `expectedLabel`.
 *
 * @param issueNumber Issue to look up in the agent's output.
 * @param expectedLabel Label that must be listed for that issue.
 * @returns Async assertion callback `(rig, result)`; `rig` is unused here.
 * @throws Error when `response` is not a string, no JSON array can be found,
 *   the payload does not parse to an array, or the issue is missing.
 */
const assertHasIssueLabel = (issueNumber: number, expectedLabel: string) => {
  return async (rig: any, result: string) => {
    // Verify JSON output stats
    const output = JSON.parse(result);
    expect(output.stats).toBeDefined();

    // The model response JSON is in the 'response' field
    const responseText: unknown = output.response;
    if (typeof responseText !== 'string') {
      throw new Error(
        `Expected the 'response' field of the output to be a string, got: ${JSON.stringify(responseText)}`,
      );
    }

    let jsonString: string;
    const fenced = responseText.match(/```json\s*([\s\S]*?)\s*```/);
    if (fenced?.[1]) {
      jsonString = fenced[1];
    } else {
      // No fenced block: fall back to the outermost bracket pair.
      const firstBracket = responseText.indexOf('[');
      const lastBracket = responseText.lastIndexOf(']');
      if (
        firstBracket === -1 ||
        lastBracket === -1 ||
        lastBracket < firstBracket
      ) {
        throw new Error(
          `Could not find a JSON array in the response: "${escapeHtml(responseText)}"`,
        );
      }
      jsonString = responseText.substring(firstBracket, lastBracket + 1);
    }

    let data: unknown;
    try {
      data = JSON.parse(jsonString);
    } catch (e) {
      const message = e instanceof Error ? e.message : String(e);
      throw new Error(
        `Failed to parse JSON. Error: ${message}. Response: "${escapeHtml(responseText)}"`,
      );
    }

    // A non-array payload would otherwise surface as an opaque TypeError on
    // `.find` below.
    if (!Array.isArray(data)) {
      throw new Error(
        `Expected a JSON array of triage results, got: ${JSON.stringify(data)}`,
      );
    }
    const entries: { issue_number?: number; labels_to_add?: string[] }[] =
      data;

    const issue = entries.find((entry) => entry?.issue_number === issueNumber);
    if (!issue) {
      throw new Error(
        `Issue #${issueNumber} not found in output: ${JSON.stringify(data)}`,
      );
    }

    expect(issue.labels_to_add).toContain(expectedLabel);
  };
};
|
||||
|
||||
// Evals for the scheduled (batch) triage prompt. Each case feeds a list of
// mocked issues through the ISSUES_TO_TRIAGE env variable and checks the
// labels the agent proposes for individual issue numbers.
describe('batch_triage_agent', () => {
  type MockIssue = { number: number; title: string; body: string };
  // [issue number, label expected in that issue's `labels_to_add`]
  type LabelExpectation = [number, string];

  // Registers one eval; expectations are checked in order, one after another.
  const registerBatchEval = (
    name: string,
    issues: MockIssue[],
    expectations: LabelExpectation[],
  ) => {
    evalTest('USUALLY_PASSES', {
      name,
      prompt: ['--output-format', 'json', '--prompt', createPrompt()],
      env: {
        AVAILABLE_LABELS: availableLabels,
        ISSUES_TO_TRIAGE: JSON.stringify(issues),
      },
      params: { settings: BATCH_TRIAGE_SETTINGS },
      assert: async (rig: any, result: string) => {
        for (const [issueNumber, label] of expectations) {
          await assertHasIssueLabel(issueNumber, label)(rig, result);
        }
      },
    });
  };

  registerBatchEval(
    'should identify area/core for local test failures in batch',
    [
      {
        number: 101,
        title: 'Local tests failing',
        body: 'I am running npm test locally and it fails with an error.',
      },
    ],
    [[101, 'area/core']],
  );

  registerBatchEval(
    'should identify area/platform for CI failures in batch',
    [
      {
        number: 102,
        title: 'CI pipeline failed',
        body: 'The GitHub Action for tests failed on the main branch.',
      },
    ],
    [[102, 'area/platform']],
  );

  registerBatchEval(
    'should handle mixed batch correctly',
    [
      {
        number: 103,
        title: 'Cannot install on MacOS',
        body: 'Install fails with permission error.',
      },
      {
        number: 104,
        title: 'Click to win',
        body: 'Spam body',
      },
    ],
    [
      [103, 'area/core'],
      [104, 'area/unknown'],
    ],
  );

  registerBatchEval(
    'should handle issues needing retesting (old version)',
    [
      {
        number: 105,
        title: 'Crash on version 0.1.0',
        body: 'I am using /about and it says 0.1.0. The app crashes when I run it.',
      },
    ],
    [[105, 'status/need-retesting']],
  );

  registerBatchEval(
    'should handle issues needing more information',
    [
      {
        number: 106,
        title: 'It does not work',
        body: 'Something is broken.',
      },
    ],
    [[106, 'status/need-information']],
  );

  registerBatchEval(
    'should handle large batch of diverse issues',
    [
      { number: 107, title: 'Bug A', body: 'Local test failure' },
      { number: 108, title: 'Bug B', body: 'CI failure' },
      { number: 109, title: 'Bug C', body: 'Security leak' },
      { number: 110, title: 'Bug D', body: 'Spam' },
      { number: 111, title: 'Bug E', body: 'Old version 0.0.1' },
    ],
    [
      [107, 'area/core'],
      [108, 'area/platform'],
      [109, 'area/security'],
      [110, 'area/unknown'],
      [111, 'status/need-retesting'],
    ],
  );
});
|
||||
@@ -1,422 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
import fs from 'node:fs/promises';
|
||||
import path from 'node:path';
|
||||
import yaml from 'js-yaml';
|
||||
|
||||
// Read the workflow file to extract the prompt and settings
|
||||
// Path of the automated issue de-duplication workflow, resolved against the
// current working directory.
const workflowPath = path.join(
  process.cwd(),
  '.github/workflows/gemini-automated-issue-dedup.yml',
);
const workflowContent = await fs.readFile(workflowPath, 'utf8');

// The asserted shape lists only the fields this file reads, instead of
// falling back to `any`.
const workflowData = yaml.load(workflowContent) as {
  jobs?: {
    'find-duplicates'?: {
      steps?: {
        id?: string;
        with?: { prompt?: string; settings?: string };
      }[];
    };
  };
};

// Step that runs the Gemini de-duplication; its `with` block carries the
// prompt and the settings JSON.
const geminiStep = workflowData.jobs?.['find-duplicates']?.steps?.find(
  (step) => step.id === 'gemini_issue_deduplication',
);

const DEDUP_PROMPT_TEMPLATE = geminiStep?.with?.prompt;
// `||` (not `??`) on purpose: an empty settings string must also fall back to
// '{}', because JSON.parse('') throws.
const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}');

// Fail fast at module load: without a prompt none of the evals can run.
if (!DEDUP_PROMPT_TEMPLATE) {
  throw new Error('Could not extract prompt from de-duplication workflow.');
}

// Script started (via `npx tsx`) as the `issue_deduplication` MCP server in
// the eval settings.
const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts');
|
||||
|
||||
/**
 * Builds the de-duplication prompt for one issue by filling in the GitHub
 * Actions expressions found in the workflow's prompt template.
 *
 * The template also references `${ISSUE_NUMBER}`; that placeholder is left
 * untouched here (the tests set an ISSUE_NUMBER env variable).
 *
 * @param issueNumber Number substituted for `${{ github.event.issue.number }}`.
 */
const createPrompt = (issueNumber: number) => {
  const withRepository = DEDUP_PROMPT_TEMPLATE.replace(
    /\${{ github\.repository }}/g,
    'google-gemini/gemini-cli',
  );
  const issueNumberText = issueNumber.toString();
  return withRepository.replace(
    /\${{ github\.event\.issue\.number }}/g,
    issueNumberText,
  );
};
|
||||
|
||||
// Settings handed to every eval run: the workflow's settings with
// `mcpServers` replaced by a single `issue_deduplication` server that runs the
// local mock script, and with a truthy `telemetry` entry removed.
const DEDUP_SETTINGS = Object.assign({}, ORIGINAL_SETTINGS, {
  mcpServers: {
    issue_deduplication: {
      command: 'npx',
      args: ['tsx', mockMcpPath],
    },
  },
});
const dedupSettingsHaveTelemetry = Boolean(DEDUP_SETTINGS.telemetry);
if (dedupSettingsHaveTelemetry) {
  delete DEDUP_SETTINGS.telemetry;
}
|
||||
|
||||
// Evals for the automated de-duplication prompt. Every case installs a mock
// `gh` binary that serves a fixed set of issues, runs the agent for one issue
// number, and inspects the shell command that sets DUPLICATE_ISSUES_CSV.
describe('dedup_agent', () => {
  /** One issue served by the mocked `gh issue view <n>` command. */
  interface MockIssue {
    number: number;
    title: string;
    body: string;
    comments: { body: string }[];
  }

  // Captures the CSV assigned to DUPLICATE_ISSUES_CSV; surrounding square
  // brackets are optional.
  const DUPLICATES_CSV = /DUPLICATE_ISSUES_CSV=\[?([\d, ]*)\]?/;

  /**
   * Generates the source of the mock `gh` binary. For `issue view <n>` it
   * prints the matching issue as JSON. For unknown numbers it prints an empty
   * placeholder issue when `answerUnknown` is true and nothing otherwise.
   */
  const createGhMock = (
    issues: MockIssue[],
    answerUnknown: boolean,
  ) => `#!/usr/bin/env node
const issues = ${JSON.stringify(issues)};
const args = process.argv.slice(2).join(' ');
if (args.includes('issue view')) {
  const issueNum = args.match(/view (\\d+)/)?.[1];
  const issue = issues.find((i) => String(i.number) === issueNum);
  if (issue) {
    console.log(JSON.stringify(issue));
  }${
    answerUnknown
      ? ` else {
    console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] }));
  }`
      : ''
  }
}
`;

  /**
   * Expects a `duplicates` tool call in the rig's tool logs, then returns the
   * first `run_shell_command` call whose args mention DUPLICATE_ISSUES_CSV
   * (undefined when there is none).
   */
  const findCsvShellCall = (rig: any) => {
    const toolLogs = rig.readToolLogs();
    const duplicatesCall = toolLogs.find(
      (l: any) => l.toolRequest.name === 'duplicates',
    );
    expect(duplicatesCall).toBeDefined();

    return toolLogs
      .filter((l: any) => l.toolRequest.name === 'run_shell_command')
      .find((call: any) =>
        call.toolRequest.args.includes('DUPLICATE_ISSUES_CSV'),
      );
  };

  /**
   * Parses the issue numbers (as strings) out of a DUPLICATE_ISSUES_CSV shell
   * call; returns null when the assignment cannot be matched.
   */
  const parseReportedDuplicates = (csvCall: any): string[] | null => {
    const match = csvCall.toolRequest.args.match(DUPLICATES_CSV);
    if (!match) {
      return null;
    }
    return match[1]
      .split(',')
      .map((s: string) => s.trim())
      .filter((s: string) => s);
  };

  /**
   * For evals where the agent may legitimately skip the env update: returns
   * the reported duplicates, or undefined when no CSV shell call was made.
   * An unparseable assignment counts as an empty list.
   */
  const optionalReportedDuplicates = (rig: any): string[] | undefined => {
    const csvCall = findCsvShellCall(rig);
    return csvCall ? (parseReportedDuplicates(csvCall) ?? []) : undefined;
  };

  /** Registers one eval with the shared prompt/env/settings/mock-file setup. */
  const registerDedupEval = (options: {
    name: string;
    issueNumber: number;
    issues: MockIssue[];
    answerUnknown: boolean;
    verify: (rig: any, output: any) => void;
  }) => {
    evalTest('USUALLY_PASSES', {
      name: options.name,
      prompt: [
        '--output-format',
        'json',
        '--prompt',
        createPrompt(options.issueNumber),
      ],
      env: {
        ISSUE_NUMBER: String(options.issueNumber),
        GITHUB_ENV: 'github_env',
      },
      params: {
        settings: DEDUP_SETTINGS,
      },
      files: {
        github_env: '',
        // Mock gh binary
        'bin/gh': createGhMock(options.issues, options.answerUnknown),
      },
      assert: async (rig: any, result: string) => {
        // Verify JSON output stats
        const output = JSON.parse(result);
        expect(output.stats).toBeDefined();
        options.verify(rig, output);
      },
    });
  };

  registerDedupEval({
    name: 'should identify duplicate issues',
    issueNumber: 101,
    issues: [
      {
        number: 101,
        title: 'CLI crashes on start',
        body: 'It segfaults immediately.',
        comments: [],
      },
      {
        number: 201,
        title: 'Segfault on launch',
        body: 'The app crashes right away.',
        comments: [],
      },
      {
        number: 202,
        title: 'Unrelated bug',
        body: 'Themes are not working.',
        comments: [],
      },
    ],
    answerUnknown: false,
    verify: (rig, output) => {
      expect(output.stats.tools.byName['duplicates']).toBeDefined();
      expect(output.stats.tools.byName['run_shell_command']).toBeDefined();

      // The current prompt uses echo to set GITHUB_ENV; check the tool call
      // for that echo command.
      const csvCall = findCsvShellCall(rig);
      expect(csvCall).toBeDefined();

      const issues = parseReportedDuplicates(csvCall);
      expect(issues).not.toBeNull();
      expect(issues).toContain('201');
      expect(issues).not.toContain('202');
    },
  });

  registerDedupEval({
    name: 'should respect "not a duplicate" comments',
    issueNumber: 101,
    issues: [
      {
        number: 101,
        title: 'CLI crashes on start',
        body: 'It segfaults immediately.',
        comments: [
          {
            body: 'Note: This is NOT a duplicate of #201, different root cause.',
          },
        ],
      },
      {
        number: 201,
        title: 'Segfault on launch',
        body: 'The app crashes right away.',
        comments: [],
      },
    ],
    answerUnknown: true,
    verify: (rig) => {
      // It might not call echo if no duplicates are found, or it might echo an
      // empty list. If it does call echo, 201 must NOT be in it.
      const issues = optionalReportedDuplicates(rig);
      if (issues) {
        expect(issues).not.toContain('201');
      }
    },
  });

  registerDedupEval({
    name: 'should differentiate false positives with high similarity',
    issueNumber: 301,
    issues: [
      {
        number: 301,
        title: 'App crashes when I click Save',
        body: 'I click the save button and it crashes.',
        comments: [],
      },
      {
        number: 302,
        title: 'App crashes when I click Load',
        body: 'I click the load button and it crashes. This seems related to the loader component.',
        comments: [],
      },
    ],
    answerUnknown: true,
    verify: (rig) => {
      const issues = optionalReportedDuplicates(rig);
      if (issues) {
        // Should NOT contain 302 because it's a different feature (Save vs
        // Load) despite the crash.
        expect(issues).not.toContain('302');
      }
    },
  });

  registerDedupEval({
    name: 'should reject matches with low similarity',
    issueNumber: 401,
    issues: [
      {
        number: 401,
        title: 'Feature request: Dark mode',
        body: 'Please add dark mode.',
        comments: [],
      },
      {
        number: 402,
        title: 'Feature request: Light mode',
        body: 'Please add light mode.',
        comments: [],
      },
    ],
    answerUnknown: true,
    verify: (rig) => {
      const issues = optionalReportedDuplicates(rig);
      if (issues) {
        expect(issues).not.toContain('402');
        expect(issues.length).toBe(0);
      }
    },
  });

  registerDedupEval({
    name: 'should identify multiple duplicates',
    issueNumber: 501,
    issues: [
      {
        number: 501,
        title: 'Crash on login',
        body: 'The app crashes when I try to log in.',
        comments: [],
      },
      {
        number: 502,
        title: 'Crash on sign in',
        body: 'Crashes during sign in process.',
        comments: [],
      },
      {
        number: 503,
        title: 'Crashes on login page',
        body: 'I get a crash immediately on the login page.',
        comments: [],
      },
    ],
    answerUnknown: true,
    verify: (rig) => {
      const csvCall = findCsvShellCall(rig);
      expect(csvCall).toBeDefined();

      // An unparseable assignment counts as "no duplicates reported".
      const issues = parseReportedDuplicates(csvCall) ?? [];
      expect(issues).toContain('502');
      expect(issues).toContain('503');
    },
  });
});
|
||||
@@ -1,81 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
import fs from 'node:fs/promises';
|
||||
import path from 'node:path';
|
||||
import yaml from 'js-yaml';
|
||||
|
||||
// Read the workflow file to extract the prompt and settings
|
||||
// Path of the scheduled de-duplication (embedding refresh) workflow, resolved
// against the current working directory.
const workflowPath = path.join(
  process.cwd(),
  '.github/workflows/gemini-scheduled-issue-dedup.yml',
);
const workflowContent = await fs.readFile(workflowPath, 'utf8');

// The asserted shape lists only the fields this file reads, instead of
// falling back to `any`.
const workflowData = yaml.load(workflowContent) as {
  jobs?: {
    'refresh-embeddings'?: {
      steps?: {
        id?: string;
        with?: { prompt?: string; settings?: string };
      }[];
    };
  };
};

// Step that runs the Gemini refresh; its `with` block carries the prompt and
// the settings JSON.
const geminiStep = workflowData.jobs?.['refresh-embeddings']?.steps?.find(
  (step) => step.id === 'gemini_refresh_embeddings',
);

const REFRESH_PROMPT_TEMPLATE = geminiStep?.with?.prompt;
// `||` (not `??`) on purpose: an empty settings string must also fall back to
// '{}', because JSON.parse('') throws.
const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}');

// Fail fast at module load: without a prompt the eval cannot run.
if (!REFRESH_PROMPT_TEMPLATE) {
  throw new Error('Could not extract prompt from dedup refresh workflow.');
}

// Script started (via `npx tsx`) as the `issue_deduplication` MCP server in
// the eval settings.
const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts');
|
||||
|
||||
/**
 * Builds the refresh prompt: the workflow's prompt template with every
 * `${{ github.repository }}` expression replaced by the repository slug.
 */
const createPrompt = () => {
  const repositoryExpression = /\${{ github\.repository }}/g;
  const prompt = REFRESH_PROMPT_TEMPLATE.replace(
    repositoryExpression,
    'google-gemini/gemini-cli',
  );
  return prompt;
};
|
||||
|
||||
// Settings handed to the eval run: the workflow's settings with `mcpServers`
// replaced by a single `issue_deduplication` server that runs the local mock
// script, and with a truthy `telemetry` entry removed.
const REFRESH_SETTINGS = Object.assign({}, ORIGINAL_SETTINGS, {
  mcpServers: {
    issue_deduplication: {
      command: 'npx',
      args: ['tsx', mockMcpPath],
    },
  },
});
const refreshSettingsHaveTelemetry = Boolean(REFRESH_SETTINGS.telemetry);
if (refreshSettingsHaveTelemetry) {
  delete REFRESH_SETTINGS.telemetry;
}
|
||||
|
||||
// Eval for the scheduled embedding-refresh prompt: the agent is expected to
// call the `refresh` tool exactly once, successfully.
describe('dedup_refresh_agent', () => {
  evalTest('USUALLY_PASSES', {
    name: 'should call refresh tool',
    prompt: ['--output-format', 'json', '--prompt', createPrompt()],
    approvalMode: 'yolo',
    params: {
      settings: REFRESH_SETTINGS,
    },
    assert: async (rig: any, result) => {
      // `result` is the CLI's JSON output.
      const { stats } = JSON.parse(result);
      expect(stats).toBeDefined();

      // High-level check: per-tool stats report one successful refresh call.
      const refreshStats = stats.tools.byName.refresh;
      expect(refreshStats).toBeDefined();
      expect(refreshStats.count).toBe(1);
      expect(refreshStats.success).toBe(1);

      // Telemetry check: the tool logs contain the refresh call too. This is
      // the place to inspect arguments more deeply if ever needed.
      const loggedRefresh = rig
        .readToolLogs()
        .find((l: any) => l.toolRequest.name === 'refresh');
      expect(loggedRefresh).toBeDefined();
    },
  });
});
|
||||
@@ -1,413 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
import fs from 'node:fs/promises';
|
||||
import path from 'node:path';
|
||||
import yaml from 'js-yaml';
|
||||
|
||||
// Read the workflow file to extract the prompt
|
||||
// Path of the automated issue-triage workflow, resolved against the current
// working directory.
const workflowPath = path.join(
  process.cwd(),
  '.github/workflows/gemini-automated-issue-triage.yml',
);
const workflowContent = await fs.readFile(workflowPath, 'utf8');

// Use a YAML parser for robustness. The asserted shape lists only the fields
// this file reads.
const workflowData = yaml.load(workflowContent) as {
  jobs?: {
    'triage-issue'?: {
      steps?: {
        id?: string;
        with?: { prompt?: string; script?: string };
      }[];
    };
  };
};

// Steps of the `triage-issue` job; undefined when the job or its steps are
// missing from the workflow.
const triageJobSteps = workflowData.jobs?.['triage-issue']?.steps;

const triageStep = triageJobSteps?.find(
  (step) => step.id === 'gemini_issue_analysis',
);

const labelsStep = triageJobSteps?.find((step) => step.id === 'get_labels');

const TRIAGE_PROMPT_TEMPLATE = triageStep?.with?.prompt;
const LABELS_SCRIPT = labelsStep?.with?.script;

// Fail fast at module load: without a prompt none of the evals can run.
if (!TRIAGE_PROMPT_TEMPLATE) {
  throw new Error(
    'Could not extract prompt from workflow file. Check for `jobs.triage-issue.steps[id=gemini_issue_analysis].with.prompt` in the YAML file.',
  );
}
|
||||
|
||||
// Extract available labels from the script
|
||||
// Comma-separated label list recovered from the `const allowedLabels = [...]`
// array literal inside the workflow's `get_labels` script.
let availableLabels = '';
const allowedLabelsLiteral = LABELS_SCRIPT
  ? LABELS_SCRIPT.match(/const allowedLabels = \[([\s\S]+?)\];/)?.[1]
  : undefined;
if (allowedLabelsLiteral) {
  // Strip quotes and line breaks, then keep every non-empty, trimmed entry.
  const labelNames: string[] = [];
  const pieces = allowedLabelsLiteral.replace(/['"\n\r]/g, '').split(',');
  for (const piece of pieces) {
    const labelName = piece.trim();
    if (labelName.length > 0) {
      labelNames.push(labelName);
    }
  }
  availableLabels = labelNames.join(', ');
}

// Fail fast at module load: the prompt needs a label list to choose from.
if (!availableLabels) {
  throw new Error(
    'Could not extract available labels from workflow file. Check for `jobs.triage-issue.steps[id=get_labels].with.script` containing `const allowedLabels = [...]`.',
  );
}
|
||||
|
||||
/**
 * Builds the triage prompt for one issue by filling in the workflow
 * template's `${{ env.ISSUE_TITLE }}`, `${{ env.ISSUE_BODY }}` and
 * `${{ env.AVAILABLE_LABELS }}` placeholders (first occurrence of each).
 *
 * Replacements are supplied through functions so the inserted text is taken
 * literally: with a plain string argument, sequences such as `$&`, `$1` or
 * `$$` inside an issue title or body would be interpreted as replacement
 * patterns by `String.prototype.replace`.
 *
 * @param title Issue title inserted into the prompt.
 * @param body Issue body inserted into the prompt.
 */
const createPrompt = (title: string, body: string) => {
  return TRIAGE_PROMPT_TEMPLATE.replace('${{ env.ISSUE_TITLE }}', () => title)
    .replace('${{ env.ISSUE_BODY }}', () => body)
    .replace('${{ env.AVAILABLE_LABELS }}', () => availableLabels);
};
|
||||
|
||||
// Settings object passed as `params.settings` to every eval in this file; no
// overrides are applied, so it is left empty.
const TRIAGE_SETTINGS = {};
|
||||
|
||||
/**
 * Escapes the five HTML-significant characters (`<`, `>`, `&`, `'`, `"`) in
 * `str` so the text can be embedded in error messages without being
 * interpreted as markup.
 *
 * @param str Raw text to escape.
 * @returns `str` with each special character replaced by its HTML entity.
 */
const escapeHtml = (str: string) => {
  const entities: Record<string, string> = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    "'": '&#39;',
    '"': '&quot;',
  };
  // The character class only matches keys of `entities`; the '' fallback
  // should not happen and is kept purely for type safety.
  return str.replace(/[<>&'"]/g, (c) => entities[c] ?? '');
};
|
||||
|
||||
/**
 * Builds an eval assertion that checks the triage agent proposed
 * `expectedLabel` in its `labels_to_set` list.
 *
 * The returned callback:
 *  1. parses `result` as the CLI's JSON output and requires a `stats` field;
 *  2. takes the model text from the `response` field and extracts a JSON
 *     object from it — from a ```json fenced block when one is present,
 *     otherwise from the first `{` up to the last `}`;
 *  3. expects the parsed object to have an array `labels_to_set` containing
 *     `expectedLabel`.
 *
 * @param expectedLabel Label that must be present in `labels_to_set`.
 * @returns Async assertion callback `(rig, result)`; `rig` is unused here.
 * @throws Error when `response` is not a string, no JSON object can be found,
 *   or the extracted text is not valid JSON.
 */
const assertHasLabel = (expectedLabel: string) => {
  return async (rig: any, result: string) => {
    // Verify JSON output stats
    const output = JSON.parse(result);
    expect(output.stats).toBeDefined();

    // The model response JSON is in the 'response' field
    const responseText: unknown = output.response;
    if (typeof responseText !== 'string') {
      throw new Error(
        `Expected the 'response' field of the output to be a string, got: ${JSON.stringify(responseText)}`,
      );
    }

    let jsonString: string;
    const fenced = responseText.match(/```json\s*([\s\S]*?)\s*```/);
    if (fenced?.[1]) {
      jsonString = fenced[1];
    } else {
      // No fenced block: fall back to the outermost brace pair.
      const firstBrace = responseText.indexOf('{');
      const lastBrace = responseText.lastIndexOf('}');
      if (firstBrace === -1 || lastBrace === -1 || lastBrace < firstBrace) {
        throw new Error(
          `Could not find a JSON object in the response: "${escapeHtml(responseText)}"`,
        );
      }
      jsonString = responseText.substring(firstBrace, lastBrace + 1);
    }

    let data: { labels_to_set?: string[] };
    try {
      data = JSON.parse(jsonString);
    } catch (e) {
      const message = e instanceof Error ? e.message : String(e);
      throw new Error(
        `Failed to parse JSON. Error: ${message}. Response: "${escapeHtml(responseText)}"`,
      );
    }

    expect(data).toHaveProperty('labels_to_set');
    expect(Array.isArray(data.labels_to_set)).toBe(true);
    expect(data.labels_to_set).toContain(expectedLabel);
  };
};
|
||||
|
||||
// Eval suite for the issue-triage prompt: each scenario feeds a synthetic issue
// (title + body) to the CLI and asserts on one label in `labels_to_set`.
describe('triage_agent', () => {
  const scenarios: ReadonlyArray<{
    name: string;
    title: string;
    body: string;
    label: string;
  }> = [
    {
      name: 'should identify area/core for windows installation issues',
      title: 'CLI failed to install on Windows',
      body: 'I tried running npm install but it failed with an error on Windows 11.',
      label: 'area/core',
    },
    {
      name: 'should identify area/platform for CI/CD failures',
      title: 'Tests are failing in the CI/CD pipeline',
      body: 'The github action is failing with a 500 error.',
      label: 'area/platform',
    },
    {
      name: 'should identify area/platform for quota issues',
      title: 'Resource Exhausted 429',
      body: 'I am getting a 429 error when running the CLI.',
      label: 'area/platform',
    },
    {
      name: 'should identify area/core for local build failures',
      title: 'Local build failing',
      body: 'I cannot build the project locally. npm run build fails.',
      label: 'area/core',
    },
    {
      name: 'should identify area/platform for sandbox issues',
      title: 'Sandbox connection failed',
      body: 'I cannot connect to the docker sandbox environment.',
      label: 'area/platform',
    },
    {
      name: 'should identify area/core for local test failures',
      title: 'Local tests failing',
      body: 'I am running npm test locally and it fails.',
      label: 'area/core',
    },
    {
      name: 'should identify area/agent for questions about tools',
      title: 'Bug with web search?',
      body: 'I am trying to use web search but I do not know the syntax. Is it @web or /web?',
      label: 'area/agent',
    },
    {
      name: 'should identify area/extensions for feature requests',
      title: 'Please add a python extension',
      body: 'I want to write python scripts as an extension.',
      label: 'area/extensions',
    },
    {
      name: 'should identify area/unknown for off-topic spam',
      title: 'Buy cheap rolex',
      body: 'Click here for discount.',
      label: 'area/unknown',
    },
    {
      name: 'should identify area/core for crash reports phrased as questions',
      title: 'Why does it segfault?',
      body: 'Why does the CLI segfault immediately when I run it on Ubuntu?',
      label: 'area/core',
    },
    {
      name: 'should identify area/agent for feature requests for built-in tools',
      title: 'Can we have a diff tool?',
      body: 'Is it possible to add a built-in tool to show diffs before editing?',
      label: 'area/agent',
    },
    {
      name: 'should identify area/enterprise for license questions',
      title: 'License key issue',
      body: 'Where do I enter my enterprise license key? I cannot find the setting.',
      label: 'area/enterprise',
    },
    {
      name: 'should identify area/unknown for extremely vague reports',
      title: 'It does not work',
      body: 'I tried to use it and it failed.',
      label: 'area/unknown',
    },
    {
      name: 'should identify area/security for prompt injection reports',
      title: 'Prompt injection vulnerability',
      body: 'I found a way to make the agent ignore instructions by saying "Ignore all previous instructions".',
      label: 'area/security',
    },
    {
      name: 'should identify area/non-interactive for headless crashes',
      title: 'Headless mode segfault',
      body: 'When I run with --headless, the CLI crashes immediately.',
      label: 'area/non-interactive',
    },
    {
      name: 'should identify area/agent for mixed feedback and tool bugs',
      title: 'Great tool but web search fails',
      body: 'I love using Gemini CLI, it is amazing! However, the @web tool gives me an error every time I search for "react".',
      label: 'area/agent',
    },
    {
      name: 'should identify area/core for UI performance issues',
      title: 'UI is very slow',
      body: 'The new interface is lagging and unresponsive when I scroll.',
      label: 'area/core',
    },
    {
      name: 'should identify area/security for accidental secret leakage',
      title: 'Leaked API key in logs',
      body: 'I accidentally posted my API key in a previous issue comment. Can you delete it?',
      label: 'area/security',
    },
    {
      name: 'should identify area/unknown for nonsensical input',
      title: 'asdfasdf',
      body: 'qwerqwer zxcvbnm',
      label: 'area/unknown',
    },
  ];

  // Register the cases in table order; every case shares the same CLI flags and settings.
  for (const { name, title, body, label } of scenarios) {
    evalTest('USUALLY_PASSES', {
      name,
      prompt: ['--output-format', 'json', '--prompt', createPrompt(title, body)],
      params: { settings: TRIAGE_SETTINGS },
      assert: assertHasLabel(label),
    });
  }
});
|
||||
Reference in New Issue
Block a user