diff --git a/evals/test-helper.ts b/evals/test-helper.ts index bee3fafa0d..1167be53eb 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -35,18 +35,6 @@ export * from '@google/gemini-cli-test-utils'; // This may take a really long time and is not recommended. export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES'; -export interface EvalCase { - name: string; - params?: Record; - prompt: string | string[]; - timeout?: number; - env?: Record; - files?: Record; - approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan'; - targetModels?: string[]; - assert: (rig: TestRig, result: string) => Promise; -} - export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { const fn = async () => { const rig = new TestRig() as any; @@ -169,16 +157,6 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { } }; - const currentModel = process.env.GEMINI_MODEL; - if ( - evalCase.targetModels && - currentModel && - !evalCase.targetModels.includes(currentModel) - ) { - it.skip(`${evalCase.name} (skipped for model ${currentModel})`, fn); - return; - } - if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) { it.skip(evalCase.name, fn); } else { @@ -192,3 +170,14 @@ async function prepareLogDir(name: string) { const sanitizedName = name.replace(/[^a-z0-9]/gi, '_').toLowerCase(); return { logDir, sanitizedName }; } + +export interface EvalCase { + name: string; + params?: Record; + prompt: string | string[]; + timeout?: number; + env?: Record; + files?: Record; + approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan'; + assert: (rig: TestRig, result: string) => Promise; +} diff --git a/scripts/aggregate_evals.js b/scripts/aggregate_evals.js index 6f5e440bd2..ce913ca4f7 100644 --- a/scripts/aggregate_evals.js +++ b/scripts/aggregate_evals.js @@ -38,6 +38,9 @@ function getModelFromPath(reportPath) { const artifactDir = parts.find((p) => p.startsWith('eval-logs-')); if (!artifactDir) return 'unknown'; + const matchWorkflow = artifactDir.match(/^eval-logs-workflows-(.+)$/); + if (matchWorkflow) return `${matchWorkflow[1]} (Workflow)`; + const matchNew = artifactDir.match(/^eval-logs-(.+)-(\d+)$/); if (matchNew) return matchNew[1];