feat(ci): isolate workflow evals into independent nightly job

- Splits 'Evals: Nightly' into 'evals' (general capabilities) and 'workflow-evals' (specific workflow simulations).
- 'workflow-evals' runs only on 'gemini-2.5-pro' (the target model).
- 'evals' excludes workflow tests to prevent noise/skewed metrics on other models.
- Removes code-level 'targetModels' restrictions in favor of CI configuration.
- Updates aggregation script to handle skipped tests correctly (though excluding the workflow evals from the general run avoids skipped tests in the first place).
2026-06-12 12:26:57 -07:00 · 2026-02-03 22:37:15 -05:00
parent d23499db90
commit 9da1542071
6 changed files with 42 additions and 47 deletions
@@ -70,7 +70,8 @@ jobs:
              $CMD -- -t "$PATTERN"
            fi
          else
-            $CMD
+            # Exclude workflow evals from the general matrix run
+            $CMD -- --exclude evals/workflows
          fi

      - name: 'Upload Logs'
@@ -81,9 +82,48 @@ jobs:
          path: 'evals/logs'
          retention-days: 7

+  workflow-evals:
+    name: 'Evals (Workflow) nightly run'
+    runs-on: 'gemini-cli-ubuntu-16-core'
+    steps:
+      - name: 'Checkout'
+        uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
+
+      - name: 'Set up Node.js'
+        uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
+        with:
+          node-version-file: '.nvmrc'
+          cache: 'npm'
+
+      - name: 'Install dependencies'
+        run: 'npm ci'
+
+      - name: 'Build project'
+        run: 'npm run build'
+
+      - name: 'Create logs directory'
+        run: 'mkdir -p evals/logs'
+
+      - name: 'Run Workflow Evals'
+        continue-on-error: true
+        env:
+          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
+          GEMINI_MODEL: 'gemini-2.5-pro'
+          RUN_EVALS: 'true'
+        # Explicitly target the workflow directory
+        run: 'npm run test:all_evals -- evals/workflows'
+
+      - name: 'Upload Logs'
+        if: 'always()'
+        uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
+        with:
+          name: 'eval-logs-workflows-gemini-2.5-pro'
+          path: 'evals/logs'
+          retention-days: 7
+
  aggregate-results:
    name: 'Aggregate Results'
-    needs: ['evals']
+    needs: ['evals', 'workflow-evals']
    if: 'always()'
    runs-on: 'gemini-cli-ubuntu-16-core'
    steps:
@@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js';
 import fs from 'node:fs/promises';
 import path from 'node:path';
 import yaml from 'js-yaml';
-import { WORKFLOW_TARGET_MODELS } from './constants.js';

 // Read the workflow file to extract the prompt
 const workflowPath = path.join(
@@ -159,7 +158,6 @@ describe('batch_triage_agent', () => {
      ]),
    },
    params: { settings: BATCH_TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasIssueLabel(101, 'area/core'),
  });

@@ -177,7 +175,6 @@ describe('batch_triage_agent', () => {
      ]),
    },
    params: { settings: BATCH_TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasIssueLabel(102, 'area/platform'),
  });

@@ -200,7 +197,6 @@ describe('batch_triage_agent', () => {
      ]),
    },
    params: { settings: BATCH_TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: async (rig: any, result) => {
      // Assert issue 103 has area/core
      await assertHasIssueLabel(103, 'area/core')(rig, result);
@@ -223,7 +219,6 @@ describe('batch_triage_agent', () => {
      ]),
    },
    params: { settings: BATCH_TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasIssueLabel(105, 'status/need-retesting'),
  });

@@ -241,7 +236,6 @@ describe('batch_triage_agent', () => {
      ]),
    },
    params: { settings: BATCH_TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasIssueLabel(106, 'status/need-information'),
  });

@@ -259,7 +253,6 @@ describe('batch_triage_agent', () => {
      ]),
    },
    params: { settings: BATCH_TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: async (rig: any, result) => {
      await assertHasIssueLabel(107, 'area/core')(rig, result);
      await assertHasIssueLabel(108, 'area/platform')(rig, result);
@@ -1,10 +0,0 @@
-/**
- * @license
- * Copyright 2025 Google LLC
- * SPDX-License-Identifier: Apache-2.0
- */
-
-// The models that workflow evals should target.
-// These workflows (triage, dedup) run in GitHub Actions using the default CLI model.
-// We restrict evals to this model to avoid noise from other models in the nightly matrix.
-export const WORKFLOW_TARGET_MODELS = ['gemini-2.5-pro'];
@@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js';
 import fs from 'node:fs/promises';
 import path from 'node:path';
 import yaml from 'js-yaml';
-import { WORKFLOW_TARGET_MODELS } from './constants.js';

 // Read the workflow file to extract the prompt and settings
 const workflowPath = path.join(
@@ -66,7 +65,6 @@ describe('dedup_agent', () => {
    params: {
      settings: DEDUP_SETTINGS,
    },
-    targetModels: WORKFLOW_TARGET_MODELS,
    files: {
      github_env: '',
      // Mock gh binary
@@ -147,7 +145,6 @@ if (args.includes('issue view')) {
    params: {
      settings: DEDUP_SETTINGS,
    },
-    targetModels: WORKFLOW_TARGET_MODELS,
    files: {
      github_env: '',
      'bin/gh': `#!/usr/bin/env node
@@ -219,7 +216,6 @@ if (args.includes('issue view')) {
    params: {
      settings: DEDUP_SETTINGS,
    },
-    targetModels: WORKFLOW_TARGET_MODELS,
    files: {
      github_env: '',
      'bin/gh': `#!/usr/bin/env node
@@ -290,7 +286,6 @@ if (args.includes('issue view')) {
    params: {
      settings: DEDUP_SETTINGS,
    },
-    targetModels: WORKFLOW_TARGET_MODELS,
    files: {
      github_env: '',
      'bin/gh': `#!/usr/bin/env node
@@ -361,7 +356,6 @@ if (args.includes('issue view')) {
    params: {
      settings: DEDUP_SETTINGS,
    },
-    targetModels: WORKFLOW_TARGET_MODELS,
    files: {
      github_env: '',
      'bin/gh': `#!/usr/bin/env node
@@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js';
 import fs from 'node:fs/promises';
 import path from 'node:path';
 import yaml from 'js-yaml';
-import { WORKFLOW_TARGET_MODELS } from './constants.js';

 // Read the workflow file to extract the prompt and settings
 const workflowPath = path.join(
@@ -62,7 +61,6 @@ describe('dedup_refresh_agent', () => {
    params: {
      settings: REFRESH_SETTINGS,
    },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: async (rig: any, result) => {
      // result is the JSON output
      const output = JSON.parse(result);
@@ -9,7 +9,6 @@ import { evalTest } from '../test-helper.js';
 import fs from 'node:fs/promises';
 import path from 'node:path';
 import yaml from 'js-yaml';
-import { WORKFLOW_TARGET_MODELS } from './constants.js';

 // Read the workflow file to extract the prompt
 const workflowPath = path.join(
@@ -148,7 +147,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/core'),
  });

@@ -164,7 +162,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/platform'),
  });

@@ -180,7 +177,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/platform'),
  });

@@ -196,7 +192,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/core'),
  });

@@ -212,7 +207,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/platform'),
  });

@@ -228,7 +222,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/core'),
  });

@@ -244,7 +237,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/agent'),
  });

@@ -260,7 +252,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/extensions'),
  });

@@ -273,7 +264,6 @@ describe('triage_agent', () => {
      createPrompt('Buy cheap rolex', 'Click here for discount.'),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/unknown'),
  });

@@ -289,7 +279,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/core'),
  });

@@ -305,7 +294,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/agent'),
  });

@@ -321,7 +309,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/enterprise'),
  });

@@ -334,7 +321,6 @@ describe('triage_agent', () => {
      createPrompt('It does not work', 'I tried to use it and it failed.'),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/unknown'),
  });

@@ -350,7 +336,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/security'),
  });

@@ -366,7 +351,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/non-interactive'),
  });

@@ -382,7 +366,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/agent'),
  });

@@ -398,7 +381,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/core'),
  });

@@ -414,7 +396,6 @@ describe('triage_agent', () => {
      ),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/security'),
  });

@@ -427,7 +408,6 @@ describe('triage_agent', () => {
      createPrompt('asdfasdf', 'qwerqwer zxcvbnm'),
    ],
    params: { settings: TRIAGE_SETTINGS },
-    targetModels: WORKFLOW_TARGET_MODELS,
    assert: assertHasLabel('area/unknown'),
  });
 });