fix(ci): isolate workflow evals, revert unrelated changes, and fix aggregation

2026-07-22 07:41:23 -07:00 · 2026-02-03 23:04:50 -05:00
parent 9da1542071
commit fe50e580fa
2 changed files with 14 additions and 22 deletions
@@ -35,18 +35,6 @@ export * from '@google/gemini-cli-test-utils';
 //   This may take a really long time and is not recommended.
 export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
 /**
  * Describes a single eval case handed to `evalTest`.
  */
 export interface EvalCase {
  /** Test title used when the case is registered with the test runner. */
  name: string;
  // Optional free-form parameters; how they are consumed is not visible
  // here — presumably forwarded to the test rig setup (TODO confirm).
  params?: Record<string, any>;
  /** A single prompt, or a list of prompts. */
  prompt: string | string[];
  // Optional timeout; units are not visible here — presumably
  // milliseconds (TODO confirm).
  timeout?: number;
  // Optional string-to-string map; presumably environment variables for
  // the run (TODO confirm).
  env?: Record<string, string>;
  // Optional string-to-string map; presumably file path -> file content
  // to seed before the run (TODO confirm).
  files?: Record<string, string>;
  /** Optional approval mode; restricted to the four literals below. */
  approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';
  /**
   * Optional list of model names. When this is set and the `GEMINI_MODEL`
   * environment variable names a model that is not in the list, `evalTest`
   * registers the case as skipped instead of running it.
   */
  targetModels?: string[];
  /**
   * Async check that receives the test rig and a result string.
   * NOTE(review): presumably signals failure by rejecting — confirm.
   */
  assert: (rig: TestRig, result: string) => Promise<void>;
 }
 export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
  const fn = async () => {
    const rig = new TestRig() as any;
@@ -169,16 +157,6 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
    }
  };
  const currentModel = process.env.GEMINI_MODEL;
  if (
    evalCase.targetModels &&
    currentModel &&
    !evalCase.targetModels.includes(currentModel)
  ) {
    it.skip(`${evalCase.name} (skipped for model ${currentModel})`, fn);
    return;
  }
  if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
    it.skip(evalCase.name, fn);
  } else {
@@ -192,3 +170,14 @@ async function prepareLogDir(name: string) {
  const sanitizedName = name.replace(/[^a-z0-9]/gi, '_').toLowerCase();
  return { logDir, sanitizedName };
 }
 /**
  * Describes a single eval case handed to `evalTest`.
  */
 export interface EvalCase {
  /** Test title used when the case is registered with the test runner. */
  name: string;
  // Optional free-form parameters; how they are consumed is not visible
  // here — presumably forwarded to the test rig setup (TODO confirm).
  params?: Record<string, any>;
  /** A single prompt, or a list of prompts. */
  prompt: string | string[];
  // Optional timeout; units are not visible here — presumably
  // milliseconds (TODO confirm).
  timeout?: number;
  // Optional string-to-string map; presumably environment variables for
  // the run (TODO confirm).
  env?: Record<string, string>;
  // Optional string-to-string map; presumably file path -> file content
  // to seed before the run (TODO confirm).
  files?: Record<string, string>;
  /** Optional approval mode; restricted to the four literals below. */
  approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';
  /**
   * Async check that receives the test rig and a result string.
   * NOTE(review): presumably signals failure by rejecting — confirm.
   */
  assert: (rig: TestRig, result: string) => Promise<void>;
 }
@@ -38,6 +38,9 @@ function getModelFromPath(reportPath) {
  const artifactDir = parts.find((p) => p.startsWith('eval-logs-'));
  if (!artifactDir) return 'unknown';
  const matchWorkflow = artifactDir.match(/^eval-logs-workflows-(.+)$/);
  if (matchWorkflow) return `${matchWorkflow[1]} (Workflow)`;
  const matchNew = artifactDir.match(/^eval-logs-(.+)-(\d+)$/);
  if (matchNew) return matchNew[1];