diff --git a/.github/workflows/evals-nightly.yml b/.github/workflows/evals-nightly.yml index b7a375d836..b2f23fcc55 100644 --- a/.github/workflows/evals-nightly.yml +++ b/.github/workflows/evals-nightly.yml @@ -70,7 +70,8 @@ jobs: $CMD -- -t "$PATTERN" fi else - $CMD + # Exclude workflow evals from the general matrix run + $CMD -- --exclude evals/workflows fi - name: 'Upload Logs' @@ -81,9 +82,48 @@ jobs: path: 'evals/logs' retention-days: 7 + workflow-evals: + name: 'Evals (Workflow) nightly run' + runs-on: 'gemini-cli-ubuntu-16-core' + steps: + - name: 'Checkout' + uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 + + - name: 'Set up Node.js' + uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4 + with: + node-version-file: '.nvmrc' + cache: 'npm' + + - name: 'Install dependencies' + run: 'npm ci' + + - name: 'Build project' + run: 'npm run build' + + - name: 'Create logs directory' + run: 'mkdir -p evals/logs' + + - name: 'Run Workflow Evals' + continue-on-error: true + env: + GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' + GEMINI_MODEL: 'gemini-2.5-pro' + RUN_EVALS: 'true' + # Explicitly target the workflow directory + run: 'npm run test:all_evals -- evals/workflows' + + - name: 'Upload Logs' + if: 'always()' + uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4 + with: + name: 'eval-logs-workflows-gemini-2.5-pro' + path: 'evals/logs' + retention-days: 7 + aggregate-results: name: 'Aggregate Results' - needs: ['evals'] + needs: ['evals', 'workflow-evals'] if: 'always()' runs-on: 'gemini-cli-ubuntu-16-core' steps: diff --git a/evals/workflows/batch_triage.eval.ts b/evals/workflows/batch_triage.eval.ts index 14e38c4e41..b5143117b1 100644 --- a/evals/workflows/batch_triage.eval.ts +++ b/evals/workflows/batch_triage.eval.ts @@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js'; import fs from 'node:fs/promises'; import path from 'node:path'; import yaml from 'js-yaml'; -import { WORKFLOW_TARGET_MODELS } from './constants.js'; // Read the workflow file to extract the prompt const workflowPath = path.join( @@ -159,7 +158,6 @@ describe('batch_triage_agent', () => { ]), }, params: { settings: BATCH_TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasIssueLabel(101, 'area/core'), }); @@ -177,7 +175,6 @@ describe('batch_triage_agent', () => { ]), }, params: { settings: BATCH_TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasIssueLabel(102, 'area/platform'), }); @@ -200,7 +197,6 @@ describe('batch_triage_agent', () => { ]), }, params: { settings: BATCH_TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: async (rig: any, result) => { // Assert issue 103 has area/core await assertHasIssueLabel(103, 'area/core')(rig, result); @@ -223,7 +219,6 @@ describe('batch_triage_agent', () => { ]), }, params: { settings: BATCH_TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasIssueLabel(105, 'status/need-retesting'), }); @@ -241,7 +236,6 @@ describe('batch_triage_agent', () => { ]), }, params: { settings: BATCH_TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasIssueLabel(106, 'status/need-information'), }); @@ -259,7 +253,6 @@ describe('batch_triage_agent', () => { ]), }, params: { settings: BATCH_TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: async (rig: any, result) => { await assertHasIssueLabel(107, 'area/core')(rig, result); await assertHasIssueLabel(108, 'area/platform')(rig, result); diff --git a/evals/workflows/constants.ts b/evals/workflows/constants.ts deleted file mode 100644 index 23af437090..0000000000 --- a/evals/workflows/constants.ts +++ /dev/null @@ -1,10 +0,0 @@ -/** - * @license - * Copyright 2025 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -// The models that workflow evals should target. -// These workflows (triage, dedup) run in GitHub Actions using the default CLI model. -// We restrict evals to this model to avoid noise from other models in the nightly matrix. -export const WORKFLOW_TARGET_MODELS = ['gemini-2.5-pro']; diff --git a/evals/workflows/dedup.eval.ts b/evals/workflows/dedup.eval.ts index e4e833ed29..8dd5057180 100644 --- a/evals/workflows/dedup.eval.ts +++ b/evals/workflows/dedup.eval.ts @@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js'; import fs from 'node:fs/promises'; import path from 'node:path'; import yaml from 'js-yaml'; -import { WORKFLOW_TARGET_MODELS } from './constants.js'; // Read the workflow file to extract the prompt and settings const workflowPath = path.join( @@ -66,7 +65,6 @@ describe('dedup_agent', () => { params: { settings: DEDUP_SETTINGS, }, - targetModels: WORKFLOW_TARGET_MODELS, files: { github_env: '', // Mock gh binary @@ -147,7 +145,6 @@ if (args.includes('issue view')) { params: { settings: DEDUP_SETTINGS, }, - targetModels: WORKFLOW_TARGET_MODELS, files: { github_env: '', 'bin/gh': `#!/usr/bin/env node @@ -219,7 +216,6 @@ if (args.includes('issue view')) { params: { settings: DEDUP_SETTINGS, }, - targetModels: WORKFLOW_TARGET_MODELS, files: { github_env: '', 'bin/gh': `#!/usr/bin/env node @@ -290,7 +286,6 @@ if (args.includes('issue view')) { params: { settings: DEDUP_SETTINGS, }, - targetModels: WORKFLOW_TARGET_MODELS, files: { github_env: '', 'bin/gh': `#!/usr/bin/env node @@ -361,7 +356,6 @@ if (args.includes('issue view')) { params: { settings: DEDUP_SETTINGS, }, - targetModels: WORKFLOW_TARGET_MODELS, files: { github_env: '', 'bin/gh': `#!/usr/bin/env node diff --git a/evals/workflows/dedup_refresh.eval.ts b/evals/workflows/dedup_refresh.eval.ts index ef0ef352ca..1331414343 100644 --- a/evals/workflows/dedup_refresh.eval.ts +++ b/evals/workflows/dedup_refresh.eval.ts @@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js'; import fs from 'node:fs/promises'; import path from 'node:path'; import yaml from 'js-yaml'; -import { WORKFLOW_TARGET_MODELS } from './constants.js'; // Read the workflow file to extract the prompt and settings const workflowPath = path.join( @@ -62,7 +61,6 @@ describe('dedup_refresh_agent', () => { params: { settings: REFRESH_SETTINGS, }, - targetModels: WORKFLOW_TARGET_MODELS, assert: async (rig: any, result) => { // result is the JSON output const output = JSON.parse(result); diff --git a/evals/workflows/triage.eval.ts b/evals/workflows/triage.eval.ts index 4a363b78a2..4d177159c7 100644 --- a/evals/workflows/triage.eval.ts +++ b/evals/workflows/triage.eval.ts @@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js'; import fs from 'node:fs/promises'; import path from 'node:path'; import yaml from 'js-yaml'; -import { WORKFLOW_TARGET_MODELS } from './constants.js'; // Read the workflow file to extract the prompt const workflowPath = path.join( @@ -148,7 +147,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/core'), }); @@ -164,7 +162,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/platform'), }); @@ -180,7 +177,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/platform'), }); @@ -196,7 +192,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/core'), }); @@ -212,7 +207,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/platform'), }); @@ -228,7 +222,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/core'), }); @@ -244,7 +237,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/agent'), }); @@ -260,7 +252,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/extensions'), }); @@ -273,7 +264,6 @@ describe('triage_agent', () => { createPrompt('Buy cheap rolex', 'Click here for discount.'), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/unknown'), }); @@ -289,7 +279,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/core'), }); @@ -305,7 +294,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/agent'), }); @@ -321,7 +309,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/enterprise'), }); @@ -334,7 +321,6 @@ describe('triage_agent', () => { createPrompt('It does not work', 'I tried to use it and it failed.'), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/unknown'), }); @@ -350,7 +336,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/security'), }); @@ -366,7 +351,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/non-interactive'), }); @@ -382,7 +366,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/agent'), }); @@ -398,7 +381,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/core'), }); @@ -414,7 +396,6 @@ describe('triage_agent', () => { ), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/security'), }); @@ -427,7 +408,6 @@ describe('triage_agent', () => { createPrompt('asdfasdf', 'qwerqwer zxcvbnm'), ], params: { settings: TRIAGE_SETTINGS }, - targetModels: WORKFLOW_TARGET_MODELS, assert: assertHasLabel('area/unknown'), }); });