Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)

This commit is contained in:
Christian Gunderman
2026-04-08 23:57:26 +00:00
committed by GitHub
parent bc3ed61adb
commit f1bb2af6de
32 changed files with 475 additions and 133 deletions

View File

@@ -335,6 +335,8 @@ jobs:
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
GEMINI_MODEL: 'gemini-3-pro-preview'
# Only run always passes behavioral tests.
EVAL_SUITE_TYPE: 'behavioral'
# Disable Vitest internal retries to avoid double-retrying;
# custom retry logic is handled in evals/test-helper.ts
VITEST_RETRY: 0

View File

@@ -5,10 +5,18 @@ on:
- cron: '0 1 * * *' # Runs at 1 AM every day
workflow_dispatch:
inputs:
run_all:
description: 'Run all evaluations (including usually passing)'
type: 'boolean'
default: true
suite_type:
description: 'Suite type to run'
type: 'choice'
options:
- 'behavioral'
- 'component-level'
- 'hero-scenario'
default: 'behavioral'
suite_name:
description: 'Specific suite name to run'
required: false
type: 'string'
test_name_pattern:
description: 'Test name pattern or file name'
required: false
@@ -59,7 +67,9 @@ jobs:
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
GEMINI_MODEL: '${{ matrix.model }}'
RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}"
RUN_EVALS: 'true'
EVAL_SUITE_TYPE: "${{ github.event.inputs.suite_type || 'behavioral' }}"
EVAL_SUITE_NAME: '${{ github.event.inputs.suite_name }}'
TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
# Disable Vitest internal retries to avoid double-retrying;
# custom retry logic is handled in evals/test-helper.ts