mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-12 23:21:27 -07:00
fix(workflows): tune triage prompt and add robustness evals
This commit is contained in:
@@ -209,7 +209,7 @@ jobs:
|
||||
"I am having trouble running the CLI in headless mode"
|
||||
|
||||
area/core
|
||||
- Description: Issues with the fundamental CLI app itself. This includes the user interface (UI/UX), installation, OS compatibility, and performance.
|
||||
- Description: Issues with the fundamental CLI app itself. This includes the user interface (UI/UX), installation, OS compatibility, performance, and local development issues (e.g., local build or test failures).
|
||||
- Example Issues:
|
||||
"I am seeing my screen flicker when using the CLI."
|
||||
"The output in my terminal is malformed or unreadable."
|
||||
@@ -221,6 +221,8 @@ jobs:
|
||||
"High CPU or memory usage by the CLI process."
|
||||
"Issues related to multi-modality (e.g., handling image inputs)."
|
||||
"Problems with the IDE integration connection or installation"
|
||||
"I cannot build the project locally."
|
||||
"My local tests are failing."
|
||||
|
||||
area/security
|
||||
- Description: Issues related to user authentication, authorization, data security, and privacy.
|
||||
@@ -234,7 +236,7 @@ jobs:
|
||||
"Preventing unauthorized data access."
|
||||
|
||||
area/platform
|
||||
- Description: Issues related to CI/CD, release management, testing, eval infrastructure, capacity, quota management, and sandbox environments.
|
||||
- Description: Issues related to CI/CD pipelines, release management, automated testing infrastructure (evals), capacity, quota management, and sandbox environments. NOT for local test failures.
|
||||
- Example Issues:
|
||||
"I am getting a 429 'Resource Exhausted' or 500-level server error."
|
||||
"General slowness or high latency from the service."
|
||||
|
||||
204
evals/triage.eval.ts
Normal file
204
evals/triage.eval.ts
Normal file
@@ -0,0 +1,204 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
|
||||
// Read the workflow file to extract the prompt.
// The path is resolved against process.cwd(), so the evals are expected to be
// launched from the directory that contains `.github/`.
const workflowPath = path.join(
  process.cwd(),
  '.github/workflows/gemini-automated-issue-triage.yml',
);
// Normalize CRLF to LF right after reading: the extraction below matches a
// literal `\n` after `prompt: |-` and splits on `\n`, so a checkout with
// Windows line endings would otherwise fail to match (or leave stray `\r`s).
const workflowContent = fs
  .readFileSync(workflowPath, 'utf8')
  .replace(/\r\n/g, '\n');
|
||||
|
||||
// Extract the prompt block.
// The pattern anchors on "prompt: |-" and lazily captures everything after it,
// stopping right before the next step entry (newline, indentation, "- name:").
const promptBlockPattern = /prompt: \|-\n([\s\S]+?)(?=\n\s+-\s+name:)/;
const promptMatch = promptBlockPattern.exec(workflowContent);

// Fail loudly at import time: without a prompt none of the evals can run.
if (promptMatch === null) {
  throw new Error(
    'Could not extract prompt from workflow file. Check regex or file content.',
  );
}
|
||||
|
||||
const rawPrompt = promptMatch[1];
// Remove the YAML indentation that the block scalar carries inside the
// workflow file. YAML derives a block scalar's indentation from its first
// NON-EMPTY line, so measure that line: measuring a blank leading line would
// yield 0 and leave the whole template indented.
const lines = rawPrompt.split('\n');
const firstContentLine = lines.find((line) => line.trim() !== '') ?? '';
// Name kept for compatibility; it holds the indent of the first non-blank line.
const firstLineIndent = firstContentLine.match(/^\s*/)?.[0].length ?? 0;
// Drop that many leading characters from every line; blank or shorter lines
// simply become empty strings.
const TRIAGE_PROMPT_TEMPLATE = lines
  .map((line) => line.slice(firstLineIndent))
  .join('\n');
|
||||
|
||||
/**
 * Builds the prompt for one synthetic issue by filling in the GitHub Actions
 * expression placeholders found in TRIAGE_PROMPT_TEMPLATE.
 *
 * The placeholders in the YAML are `${{ env.ISSUE_TITLE }}`,
 * `${{ env.ISSUE_BODY }}` and `${{ env.AVAILABLE_LABELS }}`.
 *
 * Substitution is done in a single pass with a replacer function, so that:
 * - `$`-sequences in the issue text (`$&`, `$1`, `$$`, ...) are inserted
 *   literally instead of being interpreted as replacement patterns;
 * - placeholder-looking text inside a title or body is not expanded again;
 * - every occurrence of a placeholder is filled, not only the first one.
 *
 * @param title Issue title substituted for `${{ env.ISSUE_TITLE }}`.
 * @param body Issue body substituted for `${{ env.ISSUE_BODY }}`.
 * @returns The template text with all three placeholders replaced.
 */
const createPrompt = (title: string, body: string): string => {
  const substitutions: Record<string, string> = {
    ISSUE_TITLE: title,
    ISSUE_BODY: body,
    AVAILABLE_LABELS:
      'area/agent, area/enterprise, area/non-interactive, area/core, area/security, area/platform, area/extensions, area/unknown',
  };

  return TRIAGE_PROMPT_TEMPLATE.replace(
    /\$\{\{ env\.(ISSUE_TITLE|ISSUE_BODY|AVAILABLE_LABELS) \}\}/g,
    // The fallback is unreachable for the names listed in the pattern; it only
    // keeps the lookup total for the type checker.
    (placeholder: string, name: string) => substitutions[name] ?? placeholder,
  );
};
|
||||
|
||||
// Evals for the issue-triage prompt: each case feeds a synthetic issue
// (title + body) through the extracted prompt and checks that the JSON found
// in the model output lists the expected area label under `labels_to_set`.
describe('triage_agent', () => {
  /**
   * Creates the `assert` callback shared by every case below.
   *
   * The callback takes the span from the first `{` to the last `}` in the
   * result text (falling back to `{}` when there is none), parses it as JSON
   * and expects `labels_to_set` to contain `expectedLabel`.
   */
  const expectLabel =
    (expectedLabel: string) =>
    async (_rig: unknown, result: string): Promise<void> => {
      const jsonText = result.match(/{[\s\S]*}/)?.[0] || '{}';
      const parsed = JSON.parse(jsonText);
      expect(parsed.labels_to_set).toContain(expectedLabel);
    };

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/core for windows installation issues',
    prompt: createPrompt(
      'CLI failed to install on Windows',
      'I tried running npm install but it failed with an error on Windows 11.',
    ),
    assert: expectLabel('area/core'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/platform for CI/CD failures',
    prompt: createPrompt(
      'Tests are failing in the CI/CD pipeline',
      'The github action is failing with a 500 error.',
    ),
    assert: expectLabel('area/platform'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/platform for quota issues',
    prompt: createPrompt(
      'Resource Exhausted 429',
      'I am getting a 429 error when running the CLI.',
    ),
    assert: expectLabel('area/platform'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/core for local build failures',
    prompt: createPrompt(
      'Local build failing',
      'I cannot build the project locally. npm run build fails.',
    ),
    assert: expectLabel('area/core'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/platform for sandbox issues',
    prompt: createPrompt(
      'Sandbox connection failed',
      'I cannot connect to the docker sandbox environment.',
    ),
    assert: expectLabel('area/platform'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/core for local test failures',
    prompt: createPrompt(
      'Local tests failing',
      'I am running npm test locally and it fails.',
    ),
    assert: expectLabel('area/core'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/agent for questions about tools',
    prompt: createPrompt(
      'Bug with web search?',
      'I am trying to use web search but I do not know the syntax. Is it @web or /web?',
    ),
    assert: expectLabel('area/agent'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/extensions for feature requests',
    prompt: createPrompt(
      'Please add a python extension',
      'I want to write python scripts as an extension.',
    ),
    assert: expectLabel('area/extensions'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/unknown for off-topic spam',
    prompt: createPrompt('Buy cheap rolex', 'Click here for discount.'),
    assert: expectLabel('area/unknown'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/core for crash reports phrased as questions',
    prompt: createPrompt(
      'Why does it segfault?',
      'Why does the CLI segfault immediately when I run it on Ubuntu?',
    ),
    assert: expectLabel('area/core'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/agent for feature requests for built-in tools',
    prompt: createPrompt(
      'Can we have a diff tool?',
      'Is it possible to add a built-in tool to show diffs before editing?',
    ),
    assert: expectLabel('area/agent'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/enterprise for license questions',
    prompt: createPrompt(
      'License key issue',
      'Where do I enter my enterprise license key? I cannot find the setting.',
    ),
    assert: expectLabel('area/enterprise'),
  });

  evalTest('USUALLY_PASSES', {
    name: 'should identify area/unknown for extremely vague reports',
    prompt: createPrompt(
      'It does not work',
      'I tried to use it and it failed.',
    ),
    assert: expectLabel('area/unknown'),
  });
});
|
||||
Reference in New Issue
Block a user