feat(evals): add comprehensive workflow evaluations and tune prompts (Issue #219)

- Established evals for all agent workflows (triage, dedup, refresh).
- Refactored all evals to use modern --output-format=json flag for robust validation.
- Tuned prompts for strict JSON compliance and corrected spam handling in scheduled triage.
- Expanded edge case coverage for false positives, security leaks, and mixed batches.
2026-06-12 12:26:57 -07:00 · 2026-02-03 19:17:38 -05:00
parent 2daee0d066
commit 5c2f477adf
8 changed files with 1128 additions and 71 deletions
@@ -257,6 +257,13 @@ jobs:
            area/unknown
            - Description: Issues that do not clearly fit into any other defined area/ category, or where information is too limited to make a determination. Use this when no other area is appropriate.

+            ## Final Instructions
+
+            - Output ONLY valid JSON format.
+            - Do NOT include any introductory or concluding remarks, explanations, or additional text.
+            - Do NOT include any thoughts or reasoning outside the JSON block.
+            - Ensure the output is a single JSON object with a "labels_to_set" array.
+
      - name: 'Apply Labels to Issue'
        if: |-
          ${{ steps.gemini_issue_analysis.outputs.summary != '' }}
@@ -159,7 +159,7 @@ jobs:
                 }
               ]
               ```
-              If an issue cannot be classified, do not include it in the output array.
+              If an issue cannot be classified (e.g. spam), classify it as area/unknown.
            9. For each issue please check if CLI version is present, this is usually in the output of the /about command and will look like 0.1.5
              - Anything more than 6 versions older than the most recent should add the status/need-retesting label
            10. If you see that the issue doesn't look like it has sufficient information recommend the status/need-information label and leave a comment politely requesting the relevant information, eg.. if repro steps are missing request for repro steps. if version information is missing request for version information into the explanation section below.
@@ -207,7 +207,7 @@ jobs:
            area/enterprise: Telemetry, Policy, Quota / Licensing
            area/extensions: Gemini CLI extensions capability
            area/non-interactive: GitHub Actions, SDK, 3P Integrations, Shell Scripting, Command line automation
-            area/platform: Build infra, Release mgmt, Testing, Eval infra, Capacity, Quota mgmt
+            area/platform: Build infra, Release mgmt, Automated testing infrastructure (evals), Capacity, Quota mgmt. NOT for local test failures.
            area/security: security related issues

            Additional Context:
@@ -215,6 +215,13 @@ jobs:
            - This product is designed to use different models eg.. using pro, downgrading to flash etc.
            - When users report that they dont expect the model to change those would be categorized as feature requests.

+            ## Final Instructions
+
+            - Output ONLY valid JSON format.
+            - Do NOT include any introductory or concluding remarks, explanations, or additional text.
+            - Do NOT include any thoughts or reasoning outside the JSON block.
+            - Ensure the output is a single JSON array of objects.
+
      - name: 'Apply Labels to Issues'
        if: |-
          ${{ steps.gemini_issue_analysis.outcome == 'success' &&
@@ -0,0 +1,254 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+import fs from 'node:fs/promises';
+import path from 'node:path';
+import yaml from 'js-yaml';
+
+// Read the scheduled-triage workflow so the eval runs the prompt exactly as
+// it is defined for the GitHub Action.
+const workflowPath = path.join(
+  process.cwd(),
+  '.github/workflows/gemini-scheduled-issue-triage.yml',
+);
+const workflowContent = await fs.readFile(workflowPath, 'utf8');
+
+// Use a YAML parser for robustness. Only the fields this file reads are typed.
+const workflowData = yaml.load(workflowContent) as {
+  jobs?: {
+    'triage-issues'?: {
+      steps?: {
+        id?: string;
+        // `settings` is fed to JSON.parse below, so it is a JSON string.
+        with?: { prompt?: string; settings?: string };
+      }[];
+    };
+  };
+};
+
+const geminiStep = workflowData.jobs?.['triage-issues']?.steps?.find(
+  (step) => step.id === 'gemini_issue_analysis',
+);
+
+const BATCH_TRIAGE_PROMPT_TEMPLATE = geminiStep?.with?.prompt;
+// A missing or empty settings string falls back to an empty settings object.
+const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}');
+
+if (!BATCH_TRIAGE_PROMPT_TEMPLATE) {
+  throw new Error(
+    'Could not extract prompt from workflow file. Check for `jobs.triage-issues.steps[id=gemini_issue_analysis].with.prompt` in the YAML file.',
+  );
+}
+
+// The prompt expects "${AVAILABLE_LABELS}". The evals substitute a fixed,
+// representative label set instead of deriving one from the workflow. It must
+// contain every label the cases below assert on, including the status/* labels
+// that the prompt tells the model to recommend.
+const availableLabels = [
+  'area/agent',
+  'area/core',
+  'area/enterprise',
+  'area/extensions',
+  'area/non-interactive',
+  'area/platform',
+  'area/security',
+  'area/unknown',
+  'kind/bug',
+  'kind/feature',
+  'kind/question',
+  'priority/p0',
+  'priority/p1',
+  'priority/p2',
+  'priority/p3',
+  'status/need-information',
+  'status/need-retesting',
+].join(', ');
+
+/**
+ * Builds the prompt sent to the CLI by filling the `${AVAILABLE_LABELS}`
+ * placeholder of the workflow prompt with the mocked label list.
+ *
+ * split/join is used instead of `String.replace` with a string pattern so
+ * that every occurrence of the placeholder is substituted and `$` sequences
+ * in the label list are never interpreted as replacement patterns.
+ *
+ * @returns The workflow prompt with all label placeholders filled in.
+ */
+const createPrompt = () => {
+  return BATCH_TRIAGE_PROMPT_TEMPLATE.split('${AVAILABLE_LABELS}').join(
+    availableLabels,
+  );
+};
+
+// Work on a shallow copy so the settings parsed from the workflow are never
+// mutated, then drop a truthy `telemetry` entry from that copy.
+// NOTE(review): presumably the eval rig supplies its own telemetry config;
+// confirm against the test helper.
+const BATCH_TRIAGE_SETTINGS = { ...ORIGINAL_SETTINGS };
+if (BATCH_TRIAGE_SETTINGS.telemetry) {
+  Reflect.deleteProperty(BATCH_TRIAGE_SETTINGS, 'telemetry');
+}
+
+/**
+ * Replaces the characters `<`, `>`, `&`, `'` and `"` with their named HTML
+ * entities so raw model output can be embedded safely in error messages.
+ *
+ * @param str Text to escape.
+ * @returns The text with the five characters above replaced.
+ */
+const escapeHtml = (str: string) => {
+  const entities: Record<string, string> = {
+    '<': '&lt;',
+    '>': '&gt;',
+    '&': '&amp;',
+    "'": '&apos;',
+    '"': '&quot;',
+  };
+  // The pattern only matches keys of `entities`; the fallback is defensive.
+  return str.replace(/[<>&'"]/g, (c) => entities[c] ?? '');
+};
+
+/**
+ * Builds an eval assertion that checks the batch-triage output contains an
+ * entry for `issueNumber` whose `labels_to_add` includes `expectedLabel`.
+ *
+ * The CLI is run with `--output-format json`, so `result` is a JSON envelope
+ * with a `stats` object and the model's text in `response`. The model's JSON
+ * array is located between the first `[` and the last `]` of that text, which
+ * tolerates stray prose or code fences around it.
+ *
+ * @param issueNumber Issue number to look up in the model output.
+ * @param expectedLabel Label that must be present for that issue.
+ * @returns An async assertion with the `(rig, result)` shape `evalTest` uses.
+ * @throws Error when the response is not a string, holds no JSON array, the
+ *   array cannot be parsed, or the issue is missing from it.
+ */
+const assertHasIssueLabel = (issueNumber: number, expectedLabel: string) => {
+  return async (_rig: unknown, result: string) => {
+    // Verify JSON output stats
+    const output = JSON.parse(result);
+    expect(output.stats).toBeDefined();
+
+    // The model response JSON is in the 'response' field. Fail with a clear
+    // message instead of a TypeError when that field is absent or not text.
+    const responseText: unknown = output.response;
+    if (typeof responseText !== 'string') {
+      throw new Error(
+        `Expected the 'response' field to be a string, got: ${JSON.stringify(responseText)}`,
+      );
+    }
+
+    const arrayStart = responseText.indexOf('[');
+    const arrayEnd = responseText.lastIndexOf(']');
+    if (arrayStart === -1 || arrayEnd === -1 || arrayEnd < arrayStart) {
+      throw new Error(
+        `Could not find a JSON array in the response: "${escapeHtml(responseText)}"`,
+      );
+    }
+    const jsonString = responseText.substring(arrayStart, arrayEnd + 1);
+
+    let data: { issue_number: number; labels_to_add: string[] }[];
+    try {
+      data = JSON.parse(jsonString);
+    } catch (e) {
+      const message = e instanceof Error ? e.message : String(e);
+      throw new Error(
+        `Failed to parse JSON. Error: ${message}. Response: "${escapeHtml(responseText)}"`,
+      );
+    }
+
+    const issue = data.find((i) => i.issue_number === issueNumber);
+    if (!issue) {
+      throw new Error(
+        `Issue #${issueNumber} not found in output: ${JSON.stringify(data)}`,
+      );
+    }
+
+    expect(issue.labels_to_add).toContain(expectedLabel);
+  };
+};
+
+describe('batch_triage_agent', () => {
+  type QueuedIssue = { number: number; title: string; body: string };
+
+  // Every case runs the same prompt with the same settings; only the queued
+  // issues and the labels expected for them differ. Expectations are checked
+  // in order, so the first failing issue is the one reported.
+  const triageCase = (
+    name: string,
+    issues: QueuedIssue[],
+    expectations: [number, string][],
+  ) =>
+    evalTest('USUALLY_PASSES', {
+      name,
+      prompt: ['--output-format', 'json', '--prompt', createPrompt()],
+      env: {
+        AVAILABLE_LABELS: availableLabels,
+        ISSUES_TO_TRIAGE: JSON.stringify(issues),
+      },
+      params: { settings: BATCH_TRIAGE_SETTINGS },
+      assert: async (rig: any, result) => {
+        for (const [issueNumber, label] of expectations) {
+          await assertHasIssueLabel(issueNumber, label)(rig, result);
+        }
+      },
+    });
+
+  triageCase(
+    'should identify area/core for local test failures in batch',
+    [
+      {
+        number: 101,
+        title: 'Local tests failing',
+        body: 'I am running npm test locally and it fails with an error.',
+      },
+    ],
+    [[101, 'area/core']],
+  );
+
+  triageCase(
+    'should identify area/platform for CI failures in batch',
+    [
+      {
+        number: 102,
+        title: 'CI pipeline failed',
+        body: 'The GitHub Action for tests failed on the main branch.',
+      },
+    ],
+    [[102, 'area/platform']],
+  );
+
+  // Issue 104 is spam and is expected to land in area/unknown.
+  triageCase(
+    'should handle mixed batch correctly',
+    [
+      {
+        number: 103,
+        title: 'Cannot install on MacOS',
+        body: 'Install fails with permission error.',
+      },
+      {
+        number: 104,
+        title: 'Click to win',
+        body: 'Spam body',
+      },
+    ],
+    [
+      [103, 'area/core'],
+      [104, 'area/unknown'],
+    ],
+  );
+
+  triageCase(
+    'should handle issues needing retesting (old version)',
+    [
+      {
+        number: 105,
+        title: 'Crash on version 0.1.0',
+        body: 'I am using /about and it says 0.1.0. The app crashes when I run it.',
+      },
+    ],
+    [[105, 'status/need-retesting']],
+  );
+
+  triageCase(
+    'should handle issues needing more information',
+    [
+      {
+        number: 106,
+        title: 'It does not work',
+        body: 'Something is broken.',
+      },
+    ],
+    [[106, 'status/need-information']],
+  );
+
+  triageCase(
+    'should handle large batch of diverse issues',
+    [
+      { number: 107, title: 'Bug A', body: 'Local test failure' },
+      { number: 108, title: 'Bug B', body: 'CI failure' },
+      { number: 109, title: 'Bug C', body: 'Security leak' },
+      { number: 110, title: 'Bug D', body: 'Spam' },
+      { number: 111, title: 'Bug E', body: 'Old version 0.0.1' },
+    ],
+    [
+      [107, 'area/core'],
+      [108, 'area/platform'],
+      [109, 'area/security'],
+      [110, 'area/unknown'],
+      [111, 'status/need-retesting'],
+    ],
+  );
+});
@@ -0,0 +1,422 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+import fs from 'node:fs/promises';
+import path from 'node:path';
+import yaml from 'js-yaml';
+
+// Load the automated de-duplication workflow; the eval reuses the prompt and
+// settings of its Gemini step.
+const workflowPath = path.join(
+  process.cwd(),
+  '.github/workflows/gemini-automated-issue-dedup.yml',
+);
+const workflowContent = await fs.readFile(workflowPath, 'utf8');
+const workflowData = yaml.load(workflowContent) as any;
+
+const findDuplicatesSteps = workflowData.jobs?.['find-duplicates']?.steps;
+const geminiStep = findDuplicatesSteps?.find(
+  (step: any) => step.id === 'gemini_issue_deduplication',
+);
+
+const DEDUP_PROMPT_TEMPLATE = geminiStep?.with?.prompt;
+// A missing or empty settings string falls back to an empty settings object.
+const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}');
+
+// Without a prompt there is nothing to evaluate, so fail at import time.
+if (!DEDUP_PROMPT_TEMPLATE) {
+  throw new Error('Could not extract prompt from de-duplication workflow.');
+}
+
+// Script launched in place of the real issue_deduplication MCP server.
+const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts');
+
+/**
+ * Fills the GitHub Actions expressions of the de-duplication prompt with
+ * fixed eval values.
+ *
+ * The prompt uses ${{ github.event.issue.number }} but also references
+ * ${ISSUE_NUMBER}, which the test cases provide through the environment.
+ *
+ * @param issueNumber Issue number substituted for the issue-number expression.
+ * @returns The prompt with repository and issue number filled in.
+ */
+const createPrompt = (issueNumber: number) => {
+  const withRepository = DEDUP_PROMPT_TEMPLATE.replace(
+    /\${{ github\.repository }}/g,
+    'google-gemini/gemini-cli',
+  );
+  return withRepository.replace(
+    /\${{ github\.event\.issue\.number }}/g,
+    issueNumber.toString(),
+  );
+};
+
+// Runs the mock MCP script through tsx in place of the workflow's own
+// issue_deduplication server.
+const mockDedupServer = {
+  command: 'npx',
+  args: ['tsx', mockMcpPath],
+};
+
+// Workflow settings with `mcpServers` replaced wholesale by the mock server,
+// and with a truthy `telemetry` entry removed from the copy.
+const DEDUP_SETTINGS = {
+  ...ORIGINAL_SETTINGS,
+  mcpServers: { issue_deduplication: mockDedupServer },
+};
+if (DEDUP_SETTINGS.telemetry) {
+  Reflect.deleteProperty(DEDUP_SETTINGS, 'telemetry');
+}
+
+/** Normalizes tool-call arguments to text so they can be searched. */
+const toolArgsText = (toolArgs: unknown): string =>
+  typeof toolArgs === 'string' ? toolArgs : JSON.stringify(toolArgs ?? '');
+
+/**
+ * Returns the issue numbers assigned to DUPLICATE_ISSUES_CSV in a shell tool
+ * call, as trimmed strings.
+ *
+ * The value may be bare (`=201,202`), bracketed (`=[201, 202]`) or quoted
+ * (`="201"`, `='201'`, or `=\"201\"` when the arguments are serialized JSON),
+ * so leading quotes, backslashes and brackets are skipped before the digits
+ * are captured. An assignment without digits yields an empty list.
+ *
+ * @param toolArgs Arguments of a `run_shell_command` tool call.
+ * @returns The issue numbers found, or an empty list.
+ */
+const parseDuplicateIssues = (toolArgs: unknown): string[] => {
+  const match = toolArgsText(toolArgs).match(
+    /DUPLICATE_ISSUES_CSV=[\\"'[]*([\d, ]*)/,
+  );
+  return (match?.[1] ?? '')
+    .split(',')
+    .map((s) => s.trim())
+    .filter((s) => s.length > 0);
+};
+
+/**
+ * Runs the checks shared by every dedup case: the JSON output carries stats
+ * and the `duplicates` tool was called. Also locates the shell call that
+ * sets DUPLICATE_ISSUES_CSV, if there is one.
+ *
+ * @returns The parsed output, the env-setting shell call (or undefined) and
+ *   the issue numbers it reported (empty when there was no such call).
+ */
+const inspectDedupRun = (rig: any, result: string) => {
+  // Verify JSON output stats
+  const output = JSON.parse(result);
+  expect(output.stats).toBeDefined();
+
+  // Verify detailed tool usage via telemetry
+  const toolLogs: any[] = rig.readToolLogs();
+  const duplicatesCall = toolLogs.find(
+    (l) => l.toolRequest.name === 'duplicates',
+  );
+  expect(duplicatesCall).toBeDefined();
+
+  // The current prompt uses echo to set GITHUB_ENV, so look for the shell
+  // call that mentions the variable.
+  const envCall = toolLogs.find(
+    (l) =>
+      l.toolRequest.name === 'run_shell_command' &&
+      toolArgsText(l.toolRequest.args).includes('DUPLICATE_ISSUES_CSV'),
+  );
+
+  return {
+    output,
+    envCall,
+    issues: envCall ? parseDuplicateIssues(envCall.toolRequest.args) : [],
+  };
+};
+
+interface DedupCase {
+  name: string;
+  issueNumber: number;
+  /** Source of the mock `gh` executable written to `bin/gh`. */
+  ghScript: string;
+  assert: (rig: any, result: string) => Promise<void>;
+}
+
+/** Registers one dedup eval; all cases share prompt, env and settings. */
+const dedupCase = ({ name, issueNumber, ghScript, assert }: DedupCase) =>
+  evalTest('USUALLY_PASSES', {
+    name,
+    prompt: ['--output-format', 'json', '--prompt', createPrompt(issueNumber)],
+    env: {
+      ISSUE_NUMBER: issueNumber.toString(),
+      GITHUB_ENV: 'github_env',
+    },
+    params: {
+      settings: DEDUP_SETTINGS,
+    },
+    files: {
+      github_env: '',
+      // Mock gh binary
+      'bin/gh': ghScript,
+    },
+    assert,
+  });
+
+describe('dedup_agent', () => {
+  dedupCase({
+    name: 'should identify duplicate issues',
+    issueNumber: 101,
+    ghScript: `#!/usr/bin/env node
+const args = process.argv.slice(2).join(' ');
+if (args.includes('issue view')) {
+    const issueNum = args.match(/view (\\d+)/)?.[1];
+    if (issueNum === '101') {
+        console.log(JSON.stringify({
+            number: 101,
+            title: 'CLI crashes on start',
+            body: 'It segfaults immediately.',
+            comments: []
+        }));
+    } else if (issueNum === '201') {
+        console.log(JSON.stringify({
+            number: 201,
+            title: 'Segfault on launch',
+            body: 'The app crashes right away.',
+            comments: []
+        }));
+    } else if (issueNum === '202') {
+        console.log(JSON.stringify({
+            number: 202,
+            title: 'Unrelated bug',
+            body: 'Themes are not working.',
+            comments: []
+        }));
+    }
+}
+`,
+    assert: async (rig: any, result) => {
+      const { output, envCall, issues } = inspectDedupRun(rig, result);
+      expect(output.stats.tools.byName['duplicates']).toBeDefined();
+      expect(output.stats.tools.byName['run_shell_command']).toBeDefined();
+
+      expect(envCall).toBeDefined();
+      expect(issues).toContain('201');
+      expect(issues).not.toContain('202');
+    },
+  });
+
+  dedupCase({
+    name: 'should respect "not a duplicate" comments',
+    issueNumber: 101,
+    ghScript: `#!/usr/bin/env node
+const args = process.argv.slice(2).join(' ');
+if (args.includes('issue view')) {
+    const issueNum = args.match(/view (\\d+)/)?.[1];
+    if (issueNum === '101') {
+        console.log(JSON.stringify({
+            number: 101,
+            title: 'CLI crashes on start',
+            body: 'It segfaults immediately.',
+            comments: [{ body: 'Note: This is NOT a duplicate of #201, different root cause.' }]
+        }));
+    } else if (issueNum === '201') {
+        console.log(JSON.stringify({
+            number: 201,
+            title: 'Segfault on launch',
+            body: 'The app crashes right away.',
+            comments: []
+        }));
+    } else {
+        console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] }));
+    }
+}
+`,
+    assert: async (rig: any, result) => {
+      // It might not call echo if no duplicates are found, or it might echo
+      // an empty list. Either way 201 must NOT be reported.
+      const { issues } = inspectDedupRun(rig, result);
+      expect(issues).not.toContain('201');
+    },
+  });
+
+  dedupCase({
+    name: 'should differentiate false positives with high similarity',
+    issueNumber: 301,
+    ghScript: `#!/usr/bin/env node
+const args = process.argv.slice(2).join(' ');
+if (args.includes('issue view')) {
+    const issueNum = args.match(/view (\\d+)/)?.[1];
+    if (issueNum === '301') {
+        console.log(JSON.stringify({
+            number: 301,
+            title: 'App crashes when I click Save',
+            body: 'I click the save button and it crashes.',
+            comments: []
+        }));
+    } else if (issueNum === '302') {
+        console.log(JSON.stringify({
+            number: 302,
+            title: 'App crashes when I click Load',
+            body: 'I click the load button and it crashes. This seems related to the loader component.',
+            comments: []
+        }));
+    } else {
+        console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] }));
+    }
+}
+`,
+    assert: async (rig: any, result) => {
+      const { issues } = inspectDedupRun(rig, result);
+      // Should NOT contain 302 because it's a different feature (Save vs Load) despite crash
+      expect(issues).not.toContain('302');
+    },
+  });
+
+  dedupCase({
+    name: 'should reject matches with low similarity',
+    issueNumber: 401,
+    ghScript: `#!/usr/bin/env node
+const args = process.argv.slice(2).join(' ');
+if (args.includes('issue view')) {
+    const issueNum = args.match(/view (\\d+)/)?.[1];
+    if (issueNum === '401') {
+        console.log(JSON.stringify({
+            number: 401,
+            title: 'Feature request: Dark mode',
+            body: 'Please add dark mode.',
+            comments: []
+        }));
+    } else if (issueNum === '402') {
+        console.log(JSON.stringify({
+            number: 402,
+            title: 'Feature request: Light mode',
+            body: 'Please add light mode.',
+            comments: []
+        }));
+    } else {
+        console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] }));
+    }
+}
+`,
+    assert: async (rig: any, result) => {
+      const { issues } = inspectDedupRun(rig, result);
+      expect(issues).not.toContain('402');
+      expect(issues.length).toBe(0);
+    },
+  });
+
+  dedupCase({
+    name: 'should identify multiple duplicates',
+    issueNumber: 501,
+    ghScript: `#!/usr/bin/env node
+const args = process.argv.slice(2).join(' ');
+if (args.includes('issue view')) {
+    const issueNum = args.match(/view (\\d+)/)?.[1];
+    if (issueNum === '501') {
+        console.log(JSON.stringify({
+            number: 501,
+            title: 'Crash on login',
+            body: 'The app crashes when I try to log in.',
+            comments: []
+        }));
+    } else if (issueNum === '502') {
+        console.log(JSON.stringify({
+            number: 502,
+            title: 'Crash on sign in',
+            body: 'Crashes during sign in process.',
+            comments: []
+        }));
+    } else if (issueNum === '503') {
+        console.log(JSON.stringify({
+            number: 503,
+            title: 'Crashes on login page',
+            body: 'I get a crash immediately on the login page.',
+            comments: []
+        }));
+    } else {
+        console.log(JSON.stringify({ number: parseInt(issueNum), title: '', body: '', comments: [] }));
+    }
+}
+`,
+    assert: async (rig: any, result) => {
+      const { envCall, issues } = inspectDedupRun(rig, result);
+      expect(envCall).toBeDefined();
+      expect(issues).toContain('502');
+      expect(issues).toContain('503');
+    },
+  });
+});
@@ -0,0 +1,81 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+import fs from 'node:fs/promises';
+import path from 'node:path';
+import yaml from 'js-yaml';
+
+// Load the scheduled dedup workflow; the eval reuses the prompt and settings
+// of its embeddings-refresh step.
+const workflowPath = path.join(
+  process.cwd(),
+  '.github/workflows/gemini-scheduled-issue-dedup.yml',
+);
+const workflowContent = await fs.readFile(workflowPath, 'utf8');
+const workflowData = yaml.load(workflowContent) as any;
+
+const refreshSteps = workflowData.jobs?.['refresh-embeddings']?.steps;
+const geminiStep = refreshSteps?.find(
+  (step: any) => step.id === 'gemini_refresh_embeddings',
+);
+
+const REFRESH_PROMPT_TEMPLATE = geminiStep?.with?.prompt;
+// A missing or empty settings string falls back to an empty settings object.
+const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}');
+
+// Without a prompt there is nothing to evaluate, so fail at import time.
+if (!REFRESH_PROMPT_TEMPLATE) {
+  throw new Error('Could not extract prompt from dedup refresh workflow.');
+}
+
+// Script launched in place of the real issue_deduplication MCP server.
+const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts');
+
+/**
+ * Returns the refresh prompt with every `${{ github.repository }}`
+ * expression replaced by the fixed repository name used in the evals.
+ */
+const createPrompt = () =>
+  REFRESH_PROMPT_TEMPLATE.replace(
+    /\${{ github\.repository }}/g,
+    'google-gemini/gemini-cli',
+  );
+
+// Runs the mock MCP script through tsx in place of the workflow's own
+// issue_deduplication server.
+const mockDedupServer = {
+  command: 'npx',
+  args: ['tsx', mockMcpPath],
+};
+
+// Workflow settings with `mcpServers` replaced wholesale by the mock server,
+// and with a truthy `telemetry` entry removed from the copy.
+const REFRESH_SETTINGS = {
+  ...ORIGINAL_SETTINGS,
+  mcpServers: { issue_deduplication: mockDedupServer },
+};
+if (REFRESH_SETTINGS.telemetry) {
+  Reflect.deleteProperty(REFRESH_SETTINGS, 'telemetry');
+}
+
+describe('dedup_refresh_agent', () => {
+  evalTest('USUALLY_PASSES', {
+    name: 'should call refresh tool',
+    prompt: ['--output-format', 'json', '--prompt', createPrompt()],
+    approvalMode: 'yolo',
+    params: { settings: REFRESH_SETTINGS },
+    assert: async (rig: any, result) => {
+      // `result` is the CLI's JSON output; its stats summarize tool usage.
+      const { stats } = JSON.parse(result);
+      expect(stats).toBeDefined();
+
+      // The refresh tool must be called exactly once, and that call must
+      // be counted as a success.
+      const { refresh } = stats.tools.byName;
+      expect(refresh).toBeDefined();
+      expect(refresh.count).toBe(1);
+      expect(refresh.success).toBe(1);
+
+      // The stats verify the high-level goal; the tool logs remain available
+      // for deeper argument inspection if that is ever needed.
+      const refreshCall = rig
+        .readToolLogs()
+        .find((l: any) => l.toolRequest.name === 'refresh');
+      expect(refreshCall).toBeDefined();
+    },
+  });
+});
@@ -0,0 +1,136 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { Server } from '@modelcontextprotocol/sdk/server/index.js';
+import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
+import {
+  CallToolRequestSchema,
+  ListToolsRequestSchema,
+} from '@modelcontextprotocol/sdk/types.js';
+
+// Identity the mock reports to connecting clients.
+const serverInfo = {
+  name: 'mock-dedup-server',
+  version: '1.0.0',
+};
+
+// Only the tools capability is declared.
+const serverOptions = {
+  capabilities: {
+    tools: {},
+  },
+};
+
+const server = new Server(serverInfo, serverOptions);
+
+/** Builds an object-typed JSON schema with the given properties. */
+const objectSchema = (
+  properties: Record<string, { type: string }>,
+  required: string[],
+) => ({ type: 'object' as const, properties, required });
+
+// Advertises the two tools this mock exposes: `duplicates` and `refresh`.
+server.setRequestHandler(ListToolsRequestSchema, async () => {
+  const duplicatesTool = {
+    name: 'duplicates',
+    description: 'Find duplicate issues',
+    inputSchema: objectSchema(
+      { repo: { type: 'string' }, issue_number: { type: 'number' } },
+      ['repo', 'issue_number'],
+    ),
+  };
+  const refreshTool = {
+    name: 'refresh',
+    description: 'Refresh embeddings for all issues in a repository',
+    inputSchema: objectSchema(
+      { repo: { type: 'string' }, force: { type: 'boolean' } },
+      ['repo'],
+    ),
+  };
+  return { tools: [duplicatesTool, refreshTool] };
+});
+
+// Canned `duplicates` results, keyed by the issue number being checked.
+const DUPLICATE_FIXTURES = new Map<
+  number,
+  { number: number; similarity: number }[]
+>([
+  [
+    101,
+    [
+      { number: 201, similarity: 0.95 },
+      { number: 202, similarity: 0.85 },
+    ],
+  ],
+  // Edge Case: False Positive / Ambiguous
+  // High similarity but different root cause
+  [301, [{ number: 302, similarity: 0.88 }]],
+  // Edge Case: Low similarity (should reject)
+  [401, [{ number: 402, similarity: 0.75 }]],
+  // Edge Case: Multiple duplicates
+  [
+    501,
+    [
+      { number: 502, similarity: 0.92 },
+      { number: 503, similarity: 0.91 },
+    ],
+  ],
+]);
+
+/** Wraps a payload as a tool result holding one JSON text item. */
+const textResult = (payload: unknown) => ({
+  content: [{ type: 'text' as const, text: JSON.stringify(payload) }],
+});
+
+/**
+ * Serves tool calls from canned data.
+ *
+ * - `duplicates` returns the fixture for the requested issue, or an empty
+ *   list when there is none. `issue_number` is coerced with `Number()` so a
+ *   caller that sends it as a numeric string still hits its fixture.
+ * - `refresh` always reports success with a fixed count.
+ *
+ * @throws Error for any other tool name.
+ */
+server.setRequestHandler(CallToolRequestSchema, async (request) => {
+  const { name, arguments: toolArgs } = request.params;
+
+  if (name === 'duplicates') {
+    const issueNumber = Number(toolArgs?.issue_number);
+    return textResult(DUPLICATE_FIXTURES.get(issueNumber) ?? []);
+  }
+
+  if (name === 'refresh') {
+    return textResult({ status: 'success', refreshed_count: 10 });
+  }
+
+  throw new Error('Tool not found');
+});
+
+// Serve over stdio. The startup notice is written to stderr, presumably
+// because stdout is reserved for protocol traffic (confirm against the SDK).
+await server.connect(new StdioServerTransport());
+console.error('Mock Dedup MCP server running');
@@ -37,9 +37,10 @@ export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';

 export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
  const fn = async () => {
-    const rig = new TestRig();
+    const rig = new TestRig() as any;
    const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
    const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
+    rig._activityLogFile = activityLogFile;
    const logFile = path.join(logDir, `${sanitizedName}.log`);
    let isSuccess = false;
    try {
@@ -53,6 +54,9 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
          const fullPath = path.join(rig.testDir!, filePath);
          fs.mkdirSync(path.dirname(fullPath), { recursive: true });
          fs.writeFileSync(fullPath, content);
+          if (filePath.startsWith('bin/')) {
+            fs.chmodSync(fullPath, 0o755);
+          }

          // If it's an agent file, calculate hash for acknowledgement
          if (
@@ -118,6 +122,8 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
        timeout: evalCase.timeout,
        env: {
          GEMINI_CLI_ACTIVITY_LOG_FILE: activityLogFile,
+          PATH: `${path.join(rig.testDir!, 'bin')}${path.delimiter}${process.env.PATH}`,
+          ...evalCase.env,
        },
      });

@@ -168,8 +174,9 @@ async function prepareLogDir(name: string) {
 export interface EvalCase {
  name: string;
  params?: Record<string, any>;
-  prompt: string;
+  prompt: string | string[];
  timeout?: number;
+  env?: Record<string, string>;
  files?: Record<string, string>;
  approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';
  assert: (rig: TestRig, result: string) => Promise<void>;
@@ -75,6 +75,8 @@ const createPrompt = (title: string, body: string) => {
    .replace('${{ env.AVAILABLE_LABELS }}', availableLabels);
 };

+const TRIAGE_SETTINGS = {}; // Settings object handed to each triage_agent eval case as params.settings; currently empty.
+
 const escapeHtml = (str: string) => {
  return str.replace(/[<>&'"]/g, (c) => {
    switch (c) {
@@ -94,15 +96,21 @@ const escapeHtml = (str: string) => {
 };

 const assertHasLabel = (expectedLabel: string) => {
-  return async (_rig: unknown, result: string) => {
-    const firstBrace = result.indexOf('{');
-    const lastBrace = result.lastIndexOf('}');
+  return async (rig: any, result: string) => {
+    // Verify JSON output stats
+    const output = JSON.parse(result);
+    expect(output.stats).toBeDefined();
+
+    // The model response JSON is in the 'response' field
+    const responseText = output.response;
+    const firstBrace = responseText.indexOf('{');
+    const lastBrace = responseText.lastIndexOf('}');
    if (firstBrace === -1 || lastBrace === -1 || lastBrace < firstBrace) {
      throw new Error(
-        `Could not find a JSON object in the result: "${escapeHtml(result)}"`,
+        `Could not find a JSON object in the response: "${escapeHtml(responseText)}"`,
      );
    }
-    const jsonString = result.substring(firstBrace, lastBrace + 1);
+    const jsonString = responseText.substring(firstBrace, lastBrace + 1);

    let data: { labels_to_set?: string[] };
    try {
@@ -110,7 +118,7 @@ const assertHasLabel = (expectedLabel: string) => {
    } catch (e) {
      const err = e as Error;
      throw new Error(
-        `Failed to parse JSON. Error: ${err.message}. Result: "${escapeHtml(result)}"`,
+        `Failed to parse JSON. Error: ${err.message}. Response: "${escapeHtml(responseText)}"`,
      );
    }

@@ -123,142 +131,277 @@ const assertHasLabel = (expectedLabel: string) => {
 describe('triage_agent', () => {
  evalTest('USUALLY_PASSES', {
    name: 'should identify area/core for windows installation issues',
-    prompt: createPrompt(
-      'CLI failed to install on Windows',
-      'I tried running npm install but it failed with an error on Windows 11.',
-    ),
+    prompt: [ // Array prompt (same shape in every case of this suite): JSON output flags first, then --prompt with the text built by createPrompt.
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'CLI failed to install on Windows',
+        'I tried running npm install but it failed with an error on Windows 11.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS }, // Every case in this suite passes the same shared settings constant.
    assert: assertHasLabel('area/core'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/platform for CI/CD failures',
-    prompt: createPrompt(
-      'Tests are failing in the CI/CD pipeline',
-      'The github action is failing with a 500 error.',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Tests are failing in the CI/CD pipeline',
+        'The github action is failing with a 500 error.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/platform'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/platform for quota issues',
-    prompt: createPrompt(
-      'Resource Exhausted 429',
-      'I am getting a 429 error when running the CLI.',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Resource Exhausted 429',
+        'I am getting a 429 error when running the CLI.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/platform'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/core for local build failures',
-    prompt: createPrompt(
-      'Local build failing',
-      'I cannot build the project locally. npm run build fails.',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Local build failing',
+        'I cannot build the project locally. npm run build fails.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/core'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/platform for sandbox issues',
-    prompt: createPrompt(
-      'Sandbox connection failed',
-      'I cannot connect to the docker sandbox environment.',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Sandbox connection failed',
+        'I cannot connect to the docker sandbox environment.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/platform'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/core for local test failures',
-    prompt: createPrompt(
-      'Local tests failing',
-      'I am running npm test locally and it fails.',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Local tests failing',
+        'I am running npm test locally and it fails.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/core'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/agent for questions about tools',
-    prompt: createPrompt(
-      'Bug with web search?',
-      'I am trying to use web search but I do not know the syntax. Is it @web or /web?',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Bug with web search?',
+        'I am trying to use web search but I do not know the syntax. Is it @web or /web?',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/agent'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/extensions for feature requests',
-    prompt: createPrompt(
-      'Please add a python extension',
-      'I want to write python scripts as an extension.',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Please add a python extension',
+        'I want to write python scripts as an extension.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/extensions'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/unknown for off-topic spam',
-    prompt: createPrompt('Buy cheap rolex', 'Click here for discount.'),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt('Buy cheap rolex', 'Click here for discount.'),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/unknown'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/core for crash reports phrased as questions',
-    prompt: createPrompt(
-      'Why does it segfault?',
-      'Why does the CLI segfault immediately when I run it on Ubuntu?',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Why does it segfault?',
+        'Why does the CLI segfault immediately when I run it on Ubuntu?',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/core'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/agent for feature requests for built-in tools',
-    prompt: createPrompt(
-      'Can we have a diff tool?',
-      'Is it possible to add a built-in tool to show diffs before editing?',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Can we have a diff tool?',
+        'Is it possible to add a built-in tool to show diffs before editing?',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/agent'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/enterprise for license questions',
-    prompt: createPrompt(
-      'License key issue',
-      'Where do I enter my enterprise license key? I cannot find the setting.',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'License key issue',
+        'Where do I enter my enterprise license key? I cannot find the setting.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/enterprise'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/unknown for extremely vague reports',
-    prompt: createPrompt(
-      'It does not work',
-      'I tried to use it and it failed.',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt('It does not work', 'I tried to use it and it failed.'),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/unknown'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/security for prompt injection reports',
-    prompt: createPrompt(
-      'Prompt injection vulnerability',
-      'I found a way to make the agent ignore instructions by saying "Ignore all previous instructions".',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Prompt injection vulnerability',
+        'I found a way to make the agent ignore instructions by saying "Ignore all previous instructions".',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/security'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/non-interactive for headless crashes',
-    prompt: createPrompt(
-      'Headless mode segfault',
-      'When I run with --headless, the CLI crashes immediately.',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Headless mode segfault',
+        'When I run with --headless, the CLI crashes immediately.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/non-interactive'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/agent for mixed feedback and tool bugs',
-    prompt: createPrompt(
-      'Great tool but web search fails',
-      'I love using Gemini CLI, it is amazing! However, the @web tool gives me an error every time I search for "react".',
-    ),
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Great tool but web search fails',
+        'I love using Gemini CLI, it is amazing! However, the @web tool gives me an error every time I search for "react".',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
    assert: assertHasLabel('area/agent'),
  });
+
+  evalTest('USUALLY_PASSES', { // From here down: three newly added edge cases (UI performance, secret leakage, nonsensical input).
+    name: 'should identify area/core for UI performance issues',
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'UI is very slow',
+        'The new interface is lagging and unresponsive when I scroll.',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
+    assert: assertHasLabel('area/core'),
+  });
+
+  evalTest('USUALLY_PASSES', {
+    name: 'should identify area/security for accidental secret leakage',
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt(
+        'Leaked API key in logs',
+        'I accidentally posted my API key in a previous issue comment. Can you delete it?',
+      ),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
+    assert: assertHasLabel('area/security'),
+  });
+
+  evalTest('USUALLY_PASSES', {
+    name: 'should identify area/unknown for nonsensical input',
+    prompt: [
+      '--output-format',
+      'json',
+      '--prompt',
+      createPrompt('asdfasdf', 'qwerqwer zxcvbnm'),
+    ],
+    params: { settings: TRIAGE_SETTINGS },
+    assert: assertHasLabel('area/unknown'),
+  });
 });