From 9f8f31cce9ef10786a651796ab4f0e53ccf4f7f1 Mon Sep 17 00:00:00 2001 From: cocosheng-g Date: Tue, 3 Feb 2026 15:54:33 -0500 Subject: [PATCH] fix(evals): address review feedback on triage tests --- evals/triage.eval.ts | 143 ++++++++++++++++++++++--------------------- package.json | 2 + 2 files changed, 76 insertions(+), 69 deletions(-) diff --git a/evals/triage.eval.ts b/evals/triage.eval.ts index f1a966fef8..bc4bce4512 100644 --- a/evals/triage.eval.ts +++ b/evals/triage.eval.ts @@ -6,37 +6,38 @@ import { describe, expect } from 'vitest'; import { evalTest } from './test-helper.js'; -import fs from 'node:fs'; +import fs from 'node:fs/promises'; import path from 'node:path'; +import yaml from 'js-yaml'; // Read the workflow file to extract the prompt const workflowPath = path.join( process.cwd(), '.github/workflows/gemini-automated-issue-triage.yml', ); -const workflowContent = fs.readFileSync(workflowPath, 'utf8'); +const workflowContent = await fs.readFile(workflowPath, 'utf8'); -// Extract the prompt block -// Looking for "prompt: |-" followed by the content, until the next step definition -const promptMatch = workflowContent.match( - /prompt: \|-\n([\s\S]+?)(?=\n\s+-\s+name:)/, +// Use a YAML parser for robustness +const workflowData = yaml.load(workflowContent) as { + jobs?: { + 'triage-issue'?: { + steps?: { id?: string; with?: { prompt?: string } }[]; + }; + }; +}; + +const triageStep = workflowData.jobs?.['triage-issue']?.steps?.find( + (step) => step.id === 'gemini_issue_analysis', ); -if (!promptMatch) { +const TRIAGE_PROMPT_TEMPLATE = triageStep?.with?.prompt; + +if (!TRIAGE_PROMPT_TEMPLATE) { throw new Error( - 'Could not extract prompt from workflow file. Check regex or file content.', + 'Could not extract prompt from workflow file. 
Check for `jobs.triage-issue.steps[id=gemini_issue_analysis].with.prompt` in the YAML file.', ); } -const rawPrompt = promptMatch[1]; -// Remove the YAML indentation (12 spaces based on the file structure) -// We detect the indentation from the first line -const lines = rawPrompt.split('\n'); -const firstLineIndent = lines[0].match(/^\s*/)?.[0].length || 0; -const TRIAGE_PROMPT_TEMPLATE = lines - .map((line) => line.slice(firstLineIndent)) - .join('\n'); - const createPrompt = (title: string, body: string) => { // The placeholders in the YAML are ${{ env.ISSUE_TITLE }} etc. // We need to replace them with the actual values for the test. @@ -48,6 +49,49 @@ const createPrompt = (title: string, body: string) => { ); }; +const escapeHtml = (str: string) => { + return str.replace(/[<>&'"]/g, (c) => { + switch (c) { + case '<': + return '&lt;'; + case '>': + return '&gt;'; + case '&': + return '&amp;'; + case "'": + return '&#39;'; + case '"': + return '&quot;'; + } + return ''; // Should not happen + }); +}; + +const assertHasLabel = (expectedLabel: string) => { + return async (_rig: unknown, result: string) => { + const jsonMatch = result.match(/{[\s\S]*}/); + if (!jsonMatch || !jsonMatch[0]) { + throw new Error( + `Could not find a JSON object in the result: "${escapeHtml(result)}"`, + ); + } + + let data: { labels_to_set?: string[] }; + try { + data = JSON.parse(jsonMatch[0]); + } catch (e) { + const err = e as Error; + throw new Error( + `Failed to parse JSON. Error: ${err.message}. 
Result: "${escapeHtml(result)}"`, + ); + } + + expect(data).toHaveProperty('labels_to_set'); + expect(Array.isArray(data.labels_to_set)).toBe(true); + expect(data.labels_to_set).toContain(expectedLabel); + }; +}; + describe('triage_agent', () => { evalTest('USUALLY_PASSES', { name: 'should identify area/core for windows installation issues', @@ -55,10 +99,7 @@ describe('triage_agent', () => { 'CLI failed to install on Windows', 'I tried running npm install but it failed with an error on Windows 11.', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/core'); - }, + assert: assertHasLabel('area/core'), }); evalTest('USUALLY_PASSES', { @@ -67,10 +108,7 @@ describe('triage_agent', () => { 'Tests are failing in the CI/CD pipeline', 'The github action is failing with a 500 error.', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/platform'); - }, + assert: assertHasLabel('area/platform'), }); evalTest('USUALLY_PASSES', { @@ -79,10 +117,7 @@ describe('triage_agent', () => { 'Resource Exhausted 429', 'I am getting a 429 error when running the CLI.', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/platform'); - }, + assert: assertHasLabel('area/platform'), }); evalTest('USUALLY_PASSES', { @@ -91,10 +126,7 @@ describe('triage_agent', () => { 'Local build failing', 'I cannot build the project locally. 
npm run build fails.', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/core'); - }, + assert: assertHasLabel('area/core'), }); evalTest('USUALLY_PASSES', { @@ -103,10 +135,7 @@ describe('triage_agent', () => { 'Sandbox connection failed', 'I cannot connect to the docker sandbox environment.', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/platform'); - }, + assert: assertHasLabel('area/platform'), }); evalTest('USUALLY_PASSES', { @@ -115,10 +144,7 @@ describe('triage_agent', () => { 'Local tests failing', 'I am running npm test locally and it fails.', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/core'); - }, + assert: assertHasLabel('area/core'), }); evalTest('USUALLY_PASSES', { @@ -127,10 +153,7 @@ describe('triage_agent', () => { 'Bug with web search?', 'I am trying to use web search but I do not know the syntax. 
Is it @web or /web?', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/agent'); - }, + assert: assertHasLabel('area/agent'), }); evalTest('USUALLY_PASSES', { @@ -139,19 +162,13 @@ describe('triage_agent', () => { 'Please add a python extension', 'I want to write python scripts as an extension.', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/extensions'); - }, + assert: assertHasLabel('area/extensions'), }); evalTest('USUALLY_PASSES', { name: 'should identify area/unknown for off-topic spam', prompt: createPrompt('Buy cheap rolex', 'Click here for discount.'), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/unknown'); - }, + assert: assertHasLabel('area/unknown'), }); evalTest('USUALLY_PASSES', { @@ -160,10 +177,7 @@ describe('triage_agent', () => { 'Why does it segfault?', 'Why does the CLI segfault immediately when I run it on Ubuntu?', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/core'); - }, + assert: assertHasLabel('area/core'), }); evalTest('USUALLY_PASSES', { @@ -172,10 +186,7 @@ describe('triage_agent', () => { 'Can we have a diff tool?', 'Is it possible to add a built-in tool to show diffs before editing?', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/agent'); - }, + assert: assertHasLabel('area/agent'), }); evalTest('USUALLY_PASSES', { @@ -184,10 +195,7 @@ describe('triage_agent', () => { 'License key issue', 'Where do I enter my enterprise license key? 
I cannot find the setting.', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/enterprise'); - }, + assert: assertHasLabel('area/enterprise'), }); evalTest('USUALLY_PASSES', { @@ -196,9 +204,6 @@ describe('triage_agent', () => { 'It does not work', 'I tried to use it and it failed.', ), - assert: async (rig, result) => { - const json = JSON.parse(result.match(/{[\s\S]*}/)?.[0] || '{}'); - expect(json.labels_to_set).toContain('area/unknown'); - }, + assert: assertHasLabel('area/unknown'), }); }); diff --git a/package.json b/package.json index e64d547254..6f3f96632b 100644 --- a/package.json +++ b/package.json @@ -85,6 +85,7 @@ "@types/mime-types": "^3.0.1", "@types/minimatch": "^5.1.2", "@types/mock-fs": "^4.13.4", + "@types/js-yaml": "^4.0.9", "@types/prompts": "^2.4.9", "@types/react": "^19.2.0", "@types/react-dom": "^19.2.0", @@ -103,6 +104,7 @@ "eslint-plugin-react-hooks": "^5.2.0", "glob": "^12.0.0", "globals": "^16.0.0", + "js-yaml": "^4.1.0", "google-artifactregistry-auth": "^3.4.0", "husky": "^9.1.7", "ink-testing-library": "^4.0.0",