diff --git a/.github/workflows/gemini-automated-issue-triage.yml b/.github/workflows/gemini-automated-issue-triage.yml index 08b97db0a2..3b6a46f7a2 100644 --- a/.github/workflows/gemini-automated-issue-triage.yml +++ b/.github/workflows/gemini-automated-issue-triage.yml @@ -209,7 +209,7 @@ jobs: "I am having trouble running the CLI in headless mode" area/core - - Description: Issues with the fundamental CLI app itself. This includes the user interface (UI/UX), installation, OS compatibility, and performance. + - Description: Issues with the fundamental CLI app itself. This includes the user interface (UI/UX), installation, OS compatibility, performance, and local development issues (e.g., local build or test failures). - Example Issues: "I am seeing my screen flicker when using the CLI." "The output in my terminal is malformed or unreadable." @@ -221,6 +221,8 @@ jobs: "High CPU or memory usage by the CLI process." "Issues related to multi-modality (e.g., handling image inputs)." "Problems with the IDE integration connection or installation" + "I cannot build the project locally." + "My local tests are failing." area/security - Description: Issues related to user authentication, authorization, data security, and privacy. @@ -234,7 +236,7 @@ jobs: "Preventing unauthorized data access." area/platform - - Description: Issues related to CI/CD, release management, testing, eval infrastructure, capacity, quota management, and sandbox environments. + - Description: Issues related to CI/CD pipelines, release management, automated testing infrastructure (evals), capacity, quota management, and sandbox environments. NOT for local test failures. - Example Issues: "I am getting a 429 'Resource Exhausted' or 500-level server error." "General slowness or high latency from the service." 
/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';

/** Label list substituted for the `AVAILABLE_LABELS` placeholder in the prompt. */
const AVAILABLE_LABELS =
  'area/agent, area/enterprise, area/non-interactive, area/core, area/security, area/platform, area/extensions, area/unknown';

// Read the workflow file so the evals run against the prompt text that is
// actually checked in. The path is resolved relative to process.cwd().
const workflowPath = path.join(
  process.cwd(),
  '.github/workflows/gemini-automated-issue-triage.yml',
);

// Normalise CRLF to LF so the extraction regex below (which matches "\n")
// also works on checkouts that use Windows line endings.
const workflowContent = fs
  .readFileSync(workflowPath, 'utf8')
  .replace(/\r\n/g, '\n');

// Extract the prompt block: everything after "prompt: |-" up to (but not
// including) the next step definition ("- name:").
const rawPrompt = workflowContent.match(
  /prompt: \|-\n([\s\S]+?)(?=\n\s+-\s+name:)/,
)?.[1];

if (rawPrompt === undefined) {
  throw new Error(
    'Could not extract prompt from workflow file. Check regex or file content.',
  );
}

// Strip the YAML block indentation. The width is taken from the leading
// whitespace of the first prompt line and removed from every line.
const promptLines = rawPrompt.split('\n');
const firstLineIndent = promptLines[0]?.match(/^\s*/)?.[0].length ?? 0;
const TRIAGE_PROMPT_TEMPLATE = promptLines
  .map((line) => line.slice(firstLineIndent))
  .join('\n');

/**
 * Builds the triage prompt for a single issue by filling in the workflow
 * placeholders (`ISSUE_TITLE`, `ISSUE_BODY`, `AVAILABLE_LABELS`).
 *
 * Substitution is literal and applies to every occurrence of a placeholder:
 * `split`/`join` is used instead of `String.prototype.replace` so that `$&`,
 * `$1`, `$$` etc. inside an issue title or body are inserted verbatim rather
 * than being interpreted as replacement patterns.
 *
 * @param title Issue title to insert into the prompt.
 * @param body Issue body to insert into the prompt.
 * @returns The prompt text with all three placeholders filled in.
 */
const createPrompt = (title: string, body: string): string => {
  const substitutions: ReadonlyArray<readonly [string, string]> = [
    ['${{ env.ISSUE_TITLE }}', title],
    ['${{ env.ISSUE_BODY }}', body],
    ['${{ env.AVAILABLE_LABELS }}', AVAILABLE_LABELS],
  ];

  return substitutions.reduce(
    (prompt, [placeholder, value]) => prompt.split(placeholder).join(value),
    TRIAGE_PROMPT_TEMPLATE,
  );
};

/**
 * Pulls the `labels_to_set` value out of the model's response.
 *
 * The response may wrap the JSON in other text, so the span from the first
 * `{` to the last `}` is parsed.
 *
 * @param result Raw text returned by the model.
 * @returns The value of `labels_to_set`, or `undefined` when the key is absent.
 * @throws Error when the response contains no `{...}` span at all.
 * @throws SyntaxError (from `JSON.parse`) when the span is not valid JSON.
 */
const extractLabels = (result: string): unknown => {
  const jsonText = result.match(/{[\s\S]*}/)?.[0];
  if (jsonText === undefined) {
    throw new Error(`Triage response contained no JSON object: ${result}`);
  }

  const parsed: unknown = JSON.parse(jsonText);
  // The regex only matches text delimited by braces, so a successful parse
  // yields an object; the cast just exposes the one key we care about.
  return (parsed as { labels_to_set?: unknown }).labels_to_set;
};

/** One triage scenario: an issue and the label the model is expected to set. */
interface TriageCase {
  readonly name: string;
  readonly title: string;
  readonly body: string;
  readonly expectedLabel: string;
}

const TRIAGE_CASES: readonly TriageCase[] = [
  {
    name: 'should identify area/core for windows installation issues',
    title: 'CLI failed to install on Windows',
    body: 'I tried running npm install but it failed with an error on Windows 11.',
    expectedLabel: 'area/core',
  },
  {
    name: 'should identify area/platform for CI/CD failures',
    title: 'Tests are failing in the CI/CD pipeline',
    body: 'The github action is failing with a 500 error.',
    expectedLabel: 'area/platform',
  },
  {
    name: 'should identify area/platform for quota issues',
    title: 'Resource Exhausted 429',
    body: 'I am getting a 429 error when running the CLI.',
    expectedLabel: 'area/platform',
  },
  {
    name: 'should identify area/core for local build failures',
    title: 'Local build failing',
    body: 'I cannot build the project locally. npm run build fails.',
    expectedLabel: 'area/core',
  },
  {
    name: 'should identify area/platform for sandbox issues',
    title: 'Sandbox connection failed',
    body: 'I cannot connect to the docker sandbox environment.',
    expectedLabel: 'area/platform',
  },
  {
    name: 'should identify area/core for local test failures',
    title: 'Local tests failing',
    body: 'I am running npm test locally and it fails.',
    expectedLabel: 'area/core',
  },
  {
    name: 'should identify area/agent for questions about tools',
    title: 'Bug with web search?',
    body: 'I am trying to use web search but I do not know the syntax. Is it @web or /web?',
    expectedLabel: 'area/agent',
  },
  {
    name: 'should identify area/extensions for feature requests',
    title: 'Please add a python extension',
    body: 'I want to write python scripts as an extension.',
    expectedLabel: 'area/extensions',
  },
  {
    name: 'should identify area/unknown for off-topic spam',
    title: 'Buy cheap rolex',
    body: 'Click here for discount.',
    expectedLabel: 'area/unknown',
  },
  {
    name: 'should identify area/core for crash reports phrased as questions',
    title: 'Why does it segfault?',
    body: 'Why does the CLI segfault immediately when I run it on Ubuntu?',
    expectedLabel: 'area/core',
  },
  {
    name: 'should identify area/agent for feature requests for built-in tools',
    title: 'Can we have a diff tool?',
    body: 'Is it possible to add a built-in tool to show diffs before editing?',
    expectedLabel: 'area/agent',
  },
  {
    name: 'should identify area/enterprise for license questions',
    title: 'License key issue',
    body: 'Where do I enter my enterprise license key? I cannot find the setting.',
    expectedLabel: 'area/enterprise',
  },
  {
    name: 'should identify area/unknown for extremely vague reports',
    title: 'It does not work',
    body: 'I tried to use it and it failed.',
    expectedLabel: 'area/unknown',
  },
];

describe('triage_agent', () => {
  // Register one eval per scenario, in table order.
  for (const { name, title, body, expectedLabel } of TRIAGE_CASES) {
    evalTest('USUALLY_PASSES', {
      name,
      prompt: createPrompt(title, body),
      assert: async (_rig, result) => {
        expect(extractLabels(result)).toContain(expectedLabel);
      },
    });
  }
});