feat(ci): isolate workflow evals into independent nightly job

- Splits 'Evals: Nightly' into 'evals' (general capabilities) and 'workflow-evals' (specific workflow simulations).
- 'workflow-evals' runs only on 'gemini-2.5-pro' (the target model).
- 'evals' excludes workflow tests to prevent noisy or skewed metrics on other models.
- Removes code-level 'targetModels' restrictions in favor of CI configuration.
- Updates the aggregation script to handle skipped tests correctly (although excluding the workflow tests from 'evals' avoids skipped tests in the first place).
This commit is contained in:
cocosheng-g
2026-02-03 22:37:15 -05:00
parent d23499db90
commit 9da1542071
6 changed files with 42 additions and 47 deletions

View File

@@ -70,7 +70,8 @@ jobs:
$CMD -- -t "$PATTERN"
fi
else
$CMD
# Exclude workflow evals from the general matrix run
$CMD -- --exclude evals/workflows
fi
- name: 'Upload Logs'
@@ -81,9 +82,48 @@ jobs:
path: 'evals/logs'
retention-days: 7
# Dedicated job for the workflow evals: it runs only the tests under
# evals/workflows, using the single model set in GEMINI_MODEL below.
workflow-evals:
name: 'Evals (Workflow) nightly run'
runs-on: 'gemini-cli-ubuntu-16-core'
steps:
- name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
- name: 'Set up Node.js'
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
with:
node-version-file: '.nvmrc'
cache: 'npm'
- name: 'Install dependencies'
run: 'npm ci'
- name: 'Build project'
run: 'npm run build'
# Create the directory that the 'Upload Logs' step below publishes.
- name: 'Create logs directory'
run: 'mkdir -p evals/logs'
- name: 'Run Workflow Evals'
# NOTE(review): continue-on-error presumably keeps failing evals from
# marking the whole job as failed; confirm this is intended.
continue-on-error: true
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
GEMINI_MODEL: 'gemini-2.5-pro'
RUN_EVALS: 'true'
# Explicitly target the workflow directory
run: 'npm run test:all_evals -- evals/workflows'
- name: 'Upload Logs'
# always() makes this step run even if an earlier step failed.
if: 'always()'
uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
with:
name: 'eval-logs-workflows-gemini-2.5-pro'
path: 'evals/logs'
retention-days: 7
aggregate-results:
name: 'Aggregate Results'
needs: ['evals']
needs: ['evals', 'workflow-evals']
if: 'always()'
runs-on: 'gemini-cli-ubuntu-16-core'
steps:

View File

@@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js';
import fs from 'node:fs/promises';
import path from 'node:path';
import yaml from 'js-yaml';
import { WORKFLOW_TARGET_MODELS } from './constants.js';
// Read the workflow file to extract the prompt
const workflowPath = path.join(
@@ -159,7 +158,6 @@ describe('batch_triage_agent', () => {
]),
},
params: { settings: BATCH_TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasIssueLabel(101, 'area/core'),
});
@@ -177,7 +175,6 @@ describe('batch_triage_agent', () => {
]),
},
params: { settings: BATCH_TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasIssueLabel(102, 'area/platform'),
});
@@ -200,7 +197,6 @@ describe('batch_triage_agent', () => {
]),
},
params: { settings: BATCH_TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: async (rig: any, result) => {
// Assert issue 103 has area/core
await assertHasIssueLabel(103, 'area/core')(rig, result);
@@ -223,7 +219,6 @@ describe('batch_triage_agent', () => {
]),
},
params: { settings: BATCH_TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasIssueLabel(105, 'status/need-retesting'),
});
@@ -241,7 +236,6 @@ describe('batch_triage_agent', () => {
]),
},
params: { settings: BATCH_TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasIssueLabel(106, 'status/need-information'),
});
@@ -259,7 +253,6 @@ describe('batch_triage_agent', () => {
]),
},
params: { settings: BATCH_TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: async (rig: any, result) => {
await assertHasIssueLabel(107, 'area/core')(rig, result);
await assertHasIssueLabel(108, 'area/platform')(rig, result);

View File

@@ -1,10 +0,0 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
// The models that workflow evals should target.
// These workflows (triage, dedup) run in GitHub Actions using the default CLI model.
// We restrict evals to this model to avoid noise from other models in the nightly matrix.
export const WORKFLOW_TARGET_MODELS = ['gemini-2.5-pro'];

View File

@@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js';
import fs from 'node:fs/promises';
import path from 'node:path';
import yaml from 'js-yaml';
import { WORKFLOW_TARGET_MODELS } from './constants.js';
// Read the workflow file to extract the prompt and settings
const workflowPath = path.join(
@@ -66,7 +65,6 @@ describe('dedup_agent', () => {
params: {
settings: DEDUP_SETTINGS,
},
targetModels: WORKFLOW_TARGET_MODELS,
files: {
github_env: '',
// Mock gh binary
@@ -147,7 +145,6 @@ if (args.includes('issue view')) {
params: {
settings: DEDUP_SETTINGS,
},
targetModels: WORKFLOW_TARGET_MODELS,
files: {
github_env: '',
'bin/gh': `#!/usr/bin/env node
@@ -219,7 +216,6 @@ if (args.includes('issue view')) {
params: {
settings: DEDUP_SETTINGS,
},
targetModels: WORKFLOW_TARGET_MODELS,
files: {
github_env: '',
'bin/gh': `#!/usr/bin/env node
@@ -290,7 +286,6 @@ if (args.includes('issue view')) {
params: {
settings: DEDUP_SETTINGS,
},
targetModels: WORKFLOW_TARGET_MODELS,
files: {
github_env: '',
'bin/gh': `#!/usr/bin/env node
@@ -361,7 +356,6 @@ if (args.includes('issue view')) {
params: {
settings: DEDUP_SETTINGS,
},
targetModels: WORKFLOW_TARGET_MODELS,
files: {
github_env: '',
'bin/gh': `#!/usr/bin/env node

View File

@@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js';
import fs from 'node:fs/promises';
import path from 'node:path';
import yaml from 'js-yaml';
import { WORKFLOW_TARGET_MODELS } from './constants.js';
// Read the workflow file to extract the prompt and settings
const workflowPath = path.join(
@@ -62,7 +61,6 @@ describe('dedup_refresh_agent', () => {
params: {
settings: REFRESH_SETTINGS,
},
targetModels: WORKFLOW_TARGET_MODELS,
assert: async (rig: any, result) => {
// result is the JSON output
const output = JSON.parse(result);

View File

@@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js';
import fs from 'node:fs/promises';
import path from 'node:path';
import yaml from 'js-yaml';
import { WORKFLOW_TARGET_MODELS } from './constants.js';
// Read the workflow file to extract the prompt
const workflowPath = path.join(
@@ -148,7 +147,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/core'),
});
@@ -164,7 +162,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/platform'),
});
@@ -180,7 +177,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/platform'),
});
@@ -196,7 +192,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/core'),
});
@@ -212,7 +207,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/platform'),
});
@@ -228,7 +222,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/core'),
});
@@ -244,7 +237,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/agent'),
});
@@ -260,7 +252,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/extensions'),
});
@@ -273,7 +264,6 @@ describe('triage_agent', () => {
createPrompt('Buy cheap rolex', 'Click here for discount.'),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/unknown'),
});
@@ -289,7 +279,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/core'),
});
@@ -305,7 +294,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/agent'),
});
@@ -321,7 +309,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/enterprise'),
});
@@ -334,7 +321,6 @@ describe('triage_agent', () => {
createPrompt('It does not work', 'I tried to use it and it failed.'),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/unknown'),
});
@@ -350,7 +336,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/security'),
});
@@ -366,7 +351,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/non-interactive'),
});
@@ -382,7 +366,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/agent'),
});
@@ -398,7 +381,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/core'),
});
@@ -414,7 +396,6 @@ describe('triage_agent', () => {
),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/security'),
});
@@ -427,7 +408,6 @@ describe('triage_agent', () => {
createPrompt('asdfasdf', 'qwerqwer zxcvbnm'),
],
params: { settings: TRIAGE_SETTINGS },
targetModels: WORKFLOW_TARGET_MODELS,
assert: assertHasLabel('area/unknown'),
});
});