From 6b8b61290b6bf8ff8e76a8c18e669eefde526865 Mon Sep 17 00:00:00 2001
From: Alisa Novikova <62909685+alisa-alisa@users.noreply.github.com>
Date: Wed, 18 Mar 2026 16:30:48 -0700
Subject: [PATCH] feat(evals): consolidate PR impact check into a single job

---
# Reviewer notes. This region (between the "---" separator and the first
# "diff --git" line) is free-form commentary: it is neither part of the commit
# message nor part of the diff.
#
# What the patch does:
# * Replaces the 'eval-run' matrix job (run_attempt 1..3 x 2 models) and the
#   separate 'aggregate-impact' job with a single 'eval-impact' job.
# * The renamed step 'Run Evals (3 Attempts)' loops over both models and over
#   attempts 1..3 inside one bash script. GEMINI_MODEL is now set per command
#   instead of through the step's env block, and each report is written to
#   evals/logs/eval-logs-<model>-<attempt>/report.json.
# * Every eval invocation still ends in `|| true`, so a failing eval run does
#   not fail the step.
# * The 'Upload Logs' / 'Download Logs' artifact steps are removed; the report
#   step now reads 'evals/logs' directly instead of the downloaded 'artifacts'
#   directory.
# * 'Generate Impact Report' and 'Comment on PR' gain `if: 'always()'`, the
#   condition previously carried by the removed 'aggregate-impact' job.
#
# NOTE(review): GitHub's expression docs recommend `!cancelled()` over
#   `always()` for steps that should run after a failure but not after a
#   cancellation -- confirm which behaviour is intended for the report/comment.
# NOTE(review): the six eval runs now execute one after another in one job
#   rather than as separate matrix legs; presumably wall-clock time grows --
#   confirm this is acceptable and whether a `timeout-minutes` is wanted.
# NOTE(review): line breaks and indentation in the hunks below were
#   reconstructed from the YAML/shell structure -- verify against the target
#   file (or apply with whitespace tolerance) before relying on this patch.
#
 .github/workflows/eval-pr.yml | 56 +++++++++++++----------------------
 1 file changed, 20 insertions(+), 36 deletions(-)

diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml
index adb7099910..2fe135bbdf 100644
--- a/.github/workflows/eval-pr.yml
+++ b/.github/workflows/eval-pr.yml
@@ -14,15 +14,10 @@ permissions:
   pull-requests: 'write'
 
 jobs:
-  eval-run:
-    name: 'Eval Run (${{ matrix.model }}, attempt ${{ matrix.run_attempt }})'
+  eval-impact:
+    name: 'Eval Impact Analysis'
     runs-on: 'gemini-cli-ubuntu-16-core'
     if: "github.repository == 'google-gemini/gemini-cli'"
-    strategy:
-      fail-fast: false
-      matrix:
-        run_attempt: [1, 2, 3]
-        model: ["gemini-3.1-pro-preview-customtools", "gemini-3-flash-preview"]
     steps:
       - name: 'Checkout'
         uses: 'actions/checkout@v4'
@@ -39,52 +34,41 @@ jobs:
       - name: 'Build project'
         run: 'npm run build'
 
-      - name: 'Run Evals'
+      - name: 'Run Evals (3 Attempts)'
         env:
           GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
           RUN_EVALS: 'true'
-          GEMINI_MODEL: '${{ matrix.model }}'
         run: |
-          # Use a unique directory name for this matrix leg
-          DIR_NAME="eval-logs-${{ matrix.model }}-${{ matrix.run_attempt }}"
-          mkdir -p "evals/logs/$DIR_NAME"
-          npm run test:all_evals -- --outputFile.json="evals/logs/$DIR_NAME/report.json" || true
-
-      - name: 'Upload Logs'
-        uses: 'actions/upload-artifact@v4'
-        with:
-          name: 'eval-logs-${{ matrix.model }}-${{ matrix.run_attempt }}'
-          path: 'evals/logs'
-          retention-days: 1
-
-  aggregate-impact:
-    name: 'Aggregate Impact'
-    needs: ['eval-run']
-    if: 'always()'
-    runs-on: 'gemini-cli-ubuntu-16-core'
-    steps:
-      - name: 'Checkout'
-        uses: 'actions/checkout@v4'
-
-      - name: 'Download Logs'
-        uses: 'actions/download-artifact@v4'
-        with:
-          path: 'artifacts'
+          MODELS=("gemini-3.1-pro-preview-customtools" "gemini-3-flash-preview")
+          mkdir -p evals/logs
+
+          for model in "${MODELS[@]}"; do
+            for attempt in {1..3}; do
+              echo "::group::Running $model (Attempt $attempt)"
+              DIR_NAME="eval-logs-$model-$attempt"
+              mkdir -p "evals/logs/$DIR_NAME"
+              # Run sequentially to keep one clean job in the UI
+              GEMINI_MODEL=$model npm run test:all_evals -- --outputFile.json="evals/logs/$DIR_NAME/report.json" || true
+              echo "::endgroup::"
+            done
+          done
 
       - name: 'Generate Impact Report'
         id: 'generate-report'
+        if: 'always()'
         env:
           GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
         run: |
-          if [ ! -d "artifacts" ]; then
+          if [ ! -d "evals/logs" ]; then
             echo "No logs found, skipping report generation."
             exit 0
           fi
           echo "" > report.md
-          node scripts/aggregate_evals.js artifacts --compare-main --pr-comment >> report.md
+          node scripts/aggregate_evals.js evals/logs --compare-main --pr-comment >> report.md
           cat report.md >> "$GITHUB_STEP_SUMMARY"
 
       - name: 'Comment on PR'
+        if: 'always()'
         env:
           GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
         run: |