mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-24 03:54:43 -07:00
fix(ci): isolate workflow evals, revert unrelated changes, and fix aggregation
This commit is contained in:
+11
-22
@@ -35,18 +35,6 @@ export * from '@google/gemini-cli-test-utils';
|
|||||||
// This may take a really long time and is not recommended.
|
// This may take a really long time and is not recommended.
|
||||||
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
|
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
|
||||||
|
|
||||||
export interface EvalCase {
|
|
||||||
name: string;
|
|
||||||
params?: Record<string, any>;
|
|
||||||
prompt: string | string[];
|
|
||||||
timeout?: number;
|
|
||||||
env?: Record<string, string>;
|
|
||||||
files?: Record<string, string>;
|
|
||||||
approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';
|
|
||||||
targetModels?: string[];
|
|
||||||
assert: (rig: TestRig, result: string) => Promise<void>;
|
|
||||||
}
|
|
||||||
|
|
||||||
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||||
const fn = async () => {
|
const fn = async () => {
|
||||||
const rig = new TestRig() as any;
|
const rig = new TestRig() as any;
|
||||||
@@ -169,16 +157,6 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const currentModel = process.env.GEMINI_MODEL;
|
|
||||||
if (
|
|
||||||
evalCase.targetModels &&
|
|
||||||
currentModel &&
|
|
||||||
!evalCase.targetModels.includes(currentModel)
|
|
||||||
) {
|
|
||||||
it.skip(`${evalCase.name} (skipped for model ${currentModel})`, fn);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
|
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
|
||||||
it.skip(evalCase.name, fn);
|
it.skip(evalCase.name, fn);
|
||||||
} else {
|
} else {
|
||||||
@@ -192,3 +170,14 @@ async function prepareLogDir(name: string) {
|
|||||||
const sanitizedName = name.replace(/[^a-z0-9]/gi, '_').toLowerCase();
|
const sanitizedName = name.replace(/[^a-z0-9]/gi, '_').toLowerCase();
|
||||||
return { logDir, sanitizedName };
|
return { logDir, sanitizedName };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface EvalCase {
|
||||||
|
name: string;
|
||||||
|
params?: Record<string, any>;
|
||||||
|
prompt: string | string[];
|
||||||
|
timeout?: number;
|
||||||
|
env?: Record<string, string>;
|
||||||
|
files?: Record<string, string>;
|
||||||
|
approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';
|
||||||
|
assert: (rig: TestRig, result: string) => Promise<void>;
|
||||||
|
}
|
||||||
|
|||||||
@@ -38,6 +38,9 @@ function getModelFromPath(reportPath) {
|
|||||||
const artifactDir = parts.find((p) => p.startsWith('eval-logs-'));
|
const artifactDir = parts.find((p) => p.startsWith('eval-logs-'));
|
||||||
if (!artifactDir) return 'unknown';
|
if (!artifactDir) return 'unknown';
|
||||||
|
|
||||||
|
const matchWorkflow = artifactDir.match(/^eval-logs-workflows-(.+)$/);
|
||||||
|
if (matchWorkflow) return `${matchWorkflow[1]} (Workflow)`;
|
||||||
|
|
||||||
const matchNew = artifactDir.match(/^eval-logs-(.+)-(\d+)$/);
|
const matchNew = artifactDir.match(/^eval-logs-(.+)-(\d+)$/);
|
||||||
if (matchNew) return matchNew[1];
|
if (matchNew) return matchNew[1];
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user