mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-13 07:30:52 -07:00
Behavioral evals framework. (#16047)
This commit is contained in:
committed by
GitHub
parent
933bc5774f
commit
8030404b08
35
.github/workflows/chained_e2e.yml
vendored
35
.github/workflows/chained_e2e.yml
vendored
@@ -277,6 +277,37 @@ jobs:
|
||||
shell: 'pwsh'
|
||||
run: 'npm run test:integration:sandbox:none'
|
||||
|
||||
evals:
|
||||
name: 'Evals (ALWAYS_PASSING)'
|
||||
needs:
|
||||
- 'merge_queue_skipper'
|
||||
- 'parse_run_context'
|
||||
runs-on: 'gemini-cli-ubuntu-16-core'
|
||||
if: |
|
||||
always() && (needs.merge_queue_skipper.result !='success' || needs.merge_queue_skipper.outputs.skip != 'true')
|
||||
steps:
|
||||
- name: 'Checkout'
|
||||
uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
|
||||
with:
|
||||
ref: '${{ needs.parse_run_context.outputs.sha }}'
|
||||
repository: '${{ needs.parse_run_context.outputs.repository }}'
|
||||
|
||||
- name: 'Set up Node.js 20.x'
|
||||
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
|
||||
with:
|
||||
node-version: '20.x'
|
||||
|
||||
- name: 'Install dependencies'
|
||||
run: 'npm ci'
|
||||
|
||||
- name: 'Build project'
|
||||
run: 'npm run build'
|
||||
|
||||
- name: 'Run Evals (Required to pass)'
|
||||
env:
|
||||
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
|
||||
run: 'npm run test:always_passing_evals'
|
||||
|
||||
e2e:
|
||||
name: 'E2E'
|
||||
if: |
|
||||
@@ -284,13 +315,15 @@ jobs:
|
||||
needs:
|
||||
- 'e2e_linux'
|
||||
- 'e2e_mac'
|
||||
- 'evals'
|
||||
- 'merge_queue_skipper'
|
||||
runs-on: 'gemini-cli-ubuntu-16-core'
|
||||
steps:
|
||||
- name: 'Check E2E test results'
|
||||
run: |
|
||||
if [[ ${{ needs.e2e_linux.result }} != 'success' || \
|
||||
${{ needs.e2e_mac.result }} != 'success' ]]; then
|
||||
${{ needs.e2e_mac.result }} != 'success' || \
|
||||
${{ needs.evals.result }} != 'success' ]]; then
|
||||
echo "One or more E2E jobs failed."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
41
.github/workflows/evals-nightly.yml
vendored
Normal file
41
.github/workflows/evals-nightly.yml
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
name: 'Evals: Nightly'
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 1 * * *' # Runs at 01:00 UTC every day
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
run_all:
|
||||
description: 'Run all evaluations (including usually passing)'
|
||||
type: 'boolean'
|
||||
default: true
|
||||
|
||||
permissions:
|
||||
contents: 'read'
|
||||
checks: 'write'
|
||||
|
||||
jobs:
|
||||
evals:
|
||||
name: 'Evals (USUALLY_PASSING) nightly run'
|
||||
runs-on: 'gemini-cli-ubuntu-16-core'
|
||||
steps:
|
||||
- name: 'Checkout'
|
||||
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
|
||||
|
||||
- name: 'Set up Node.js'
|
||||
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
|
||||
with:
|
||||
node-version-file: '.nvmrc'
|
||||
cache: 'npm'
|
||||
|
||||
- name: 'Install dependencies'
|
||||
run: 'npm ci'
|
||||
|
||||
- name: 'Build project'
|
||||
run: 'npm run build'
|
||||
|
||||
- name: 'Run Evals'
|
||||
env:
|
||||
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
|
||||
RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}"
|
||||
run: 'npm run test:all_evals'
|
||||
Reference in New Issue
Block a user