diff --git a/.github/workflows/memory-nightly.yml b/.github/workflows/memory-nightly.yml index ee4e5e589c..6b0fb519c5 100644 --- a/.github/workflows/memory-nightly.yml +++ b/.github/workflows/memory-nightly.yml @@ -10,9 +10,21 @@ permissions: jobs: memory-test: - name: 'Run Memory Usage Tests' - runs-on: 'gemini-cli-ubuntu-16-core' + name: 'Run Memory Usage Tests (${{ matrix.machine_family }})' if: "github.repository == 'google-gemini/gemini-cli'" + strategy: + fail-fast: false + matrix: + include: + - runs_on: 'gemini-cli-ubuntu-16-core' + machine_family: 'gemini-cli-ubuntu-16-core' + - runs_on: 'macos-latest' + machine_family: 'macos-latest' + - runs_on: 'gemini-cli-windows-16-core' + machine_family: 'gemini-cli-windows-16-core' + runs-on: '${{ matrix.runs_on }}' + env: + MEMORY_MACHINE_FAMILY: '${{ matrix.machine_family }}' steps: - name: 'Checkout' uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 diff --git a/.github/workflows/perf-nightly.yml b/.github/workflows/perf-nightly.yml index 3749df231a..cbca80bacc 100644 --- a/.github/workflows/perf-nightly.yml +++ b/.github/workflows/perf-nightly.yml @@ -10,9 +10,21 @@ permissions: jobs: perf-test: - name: 'Run Performance Usage Tests' - runs-on: 'gemini-cli-ubuntu-16-core' + name: 'Run Performance Tests (${{ matrix.machine_family }})' if: "github.repository == 'google-gemini/gemini-cli'" + strategy: + fail-fast: false + matrix: + include: + - runs_on: 'gemini-cli-ubuntu-16-core' + machine_family: 'gemini-cli-ubuntu-16-core' + - runs_on: 'macos-latest' + machine_family: 'macos-latest' + - runs_on: 'gemini-cli-windows-16-core' + machine_family: 'gemini-cli-windows-16-core' + runs-on: '${{ matrix.runs_on }}' + env: + PERF_MACHINE_FAMILY: '${{ matrix.machine_family }}' steps: - name: 'Checkout' uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 diff --git a/.github/workflows/update-baselines.yml 
b/.github/workflows/update-baselines.yml new file mode 100644 index 0000000000..e49a80af1d --- /dev/null +++ b/.github/workflows/update-baselines.yml @@ -0,0 +1,243 @@ +# Copyright 2026 Google LLC +# SPDX-License-Identifier: Apache-2.0 +# +# Update Perf/Memory Baselines +# +# Triggered by: +# 1. A PR comment starting with one of: +# /run perf — updates only perf baselines +# /run mem — updates only memory baselines +# /run perf+mem — updates both (default) +# 2. Manual workflow_dispatch from the Actions tab. +# +# Both paths are gated behind the 'perf-approvers' GitHub environment, +# which requires approval from the designated approvers group before the +# matrix runners are provisioned. +# +# After all per-platform runs complete, the updated baseline JSON files +# are committed back to the triggering branch automatically. + +name: 'Update Perf/Memory Baselines' + +on: + issue_comment: + types: ['created'] + workflow_dispatch: + inputs: + test_type: + description: 'Which baselines to update' + required: true + default: 'perf+mem' + type: 'choice' + options: + - 'perf' + - 'mem' + - 'perf+mem' + ref: + description: 'Branch/SHA to checkout and update baselines on (default: main)' + required: false + default: 'main' + +permissions: + contents: 'write' # push the updated baseline commit + pull-requests: 'write' # post the result comment + issues: 'read' + +jobs: + # ── 1. 
Parse slash command / workflow_dispatch ────────────────────────── + parse-command: + name: 'Parse Command' + runs-on: 'gemini-cli-ubuntu-16-core' + if: | + github.repository == 'google-gemini/gemini-cli' && ( + github.event_name == 'workflow_dispatch' || ( + github.event_name == 'issue_comment' && + github.event.issue.pull_request != null && + ( + startsWith(github.event.comment.body, '/run perf+mem') || + startsWith(github.event.comment.body, '/run perf') || + startsWith(github.event.comment.body, '/run mem') + ) + ) + ) + outputs: + test_type: '${{ steps.parse.outputs.test_type }}' + ref: '${{ steps.parse.outputs.ref }}' + pr_number: '${{ steps.parse.outputs.pr_number }}' + steps: + - name: 'Parse inputs' + id: 'parse' + env: + GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + COMMENT_BODY: '${{ github.event.comment.body }}' + DISPATCH_TEST_TYPE: '${{ inputs.test_type }}' + DISPATCH_REF: '${{ inputs.ref }}' + PR_NUMBER: '${{ github.event.issue.number }}' + EVENT_NAME: '${{ github.event_name }}' + run: | + if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then + TEST_TYPE="${DISPATCH_TEST_TYPE:-perf+mem}" + REF="${DISPATCH_REF:-main}" + echo "pr_number=" >> "$GITHUB_OUTPUT" + else + # Slash command: determine test_type from comment + if echo "$COMMENT_BODY" | grep -q "^/run perf+mem"; then + TEST_TYPE="perf+mem" + elif echo "$COMMENT_BODY" | grep -q "^/run perf"; then + TEST_TYPE="perf" + else + TEST_TYPE="mem" + fi + # Get the HEAD sha of the PR + REF=$(gh pr view "$PR_NUMBER" --json headRefName --jq '.headRefName') + echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT" + fi + echo "test_type=${TEST_TYPE}" >> "$GITHUB_OUTPUT" + echo "ref=${REF}" >> "$GITHUB_OUTPUT" + + - name: 'Post acknowledgement comment on PR' + if: "steps.parse.outputs.pr_number != ''" + env: + GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + run: | + gh pr comment "${{ steps.parse.outputs.pr_number }}" --body \ + "⏳ **Baseline update requested** (\`${{ steps.parse.outputs.test_type }}\`). 
+ A member of the \`perf-approvers\` group must approve this workflow before the runners start. + " + + # ── 2. Approval gate (perf-approvers environment) ────────────────────── + await-approval: + name: 'Await perf-approvers Approval' + needs: 'parse-command' + # This environment requires manual approval from the perf-approvers group + # before GitHub provisions any of the downstream runners. + environment: 'perf-approvers' + runs-on: 'gemini-cli-ubuntu-16-core' + steps: + - name: 'Approved' + run: 'echo "Approved by perf-approvers — launching baseline update matrix."' + + # ── 3. Run tests to capture fresh baselines on each platform ─────────── + update-baselines: + name: 'Update Baselines (${{ matrix.machine_family }})' + needs: 'await-approval' + strategy: + fail-fast: false + matrix: + include: + - runs_on: 'gemini-cli-ubuntu-16-core' + machine_family: 'gemini-cli-ubuntu-16-core' + - runs_on: 'macos-latest' + machine_family: 'macos-latest' + - runs_on: 'gemini-cli-windows-16-core' + machine_family: 'gemini-cli-windows-16-core' + runs-on: '${{ matrix.runs_on }}' + steps: + - name: 'Checkout' + uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 + with: + ref: '${{ needs.parse-command.outputs.ref }}' + # Need full history so we can push back + fetch-depth: 0 + + - name: 'Set up Node.js' + uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4 + with: + node-version-file: '.nvmrc' + cache: 'npm' + + - name: 'Install dependencies' + run: 'npm ci' + + - name: 'Build project' + run: 'npm run build' + + - name: 'Update Perf Baselines' + if: "contains(needs.parse-command.outputs.test_type, 'perf')" + run: 'npm run test:perf:update-baselines' + env: + PERF_MACHINE_FAMILY: '${{ matrix.machine_family }}' + + - name: 'Update Memory Baselines' + if: "contains(needs.parse-command.outputs.test_type, 'mem')" + run: 'npm run test:memory:update-baselines' + env: + MEMORY_MACHINE_FAMILY: 
'${{ matrix.machine_family }}' + + - name: 'Upload updated baseline files' + uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4 + with: + name: 'baselines-${{ matrix.machine_family }}' + # Upload the entire baselines/ subdirectories from both test roots + path: | + perf-tests/baselines/ + memory-tests/baselines/ + if-no-files-found: 'warn' + + # ── 4. Gather artifacts and commit everything back to the branch ──────── + commit-baselines: + name: 'Commit Updated Baselines' + needs: + - 'parse-command' + - 'update-baselines' + runs-on: 'gemini-cli-ubuntu-16-core' + steps: + - name: 'Checkout' + uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 + with: + ref: '${{ needs.parse-command.outputs.ref }}' + fetch-depth: 0 + + - name: 'Download all baseline artifacts' + uses: 'actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093' # ratchet:actions/download-artifact@v4 + with: + # Download each per-platform artifact into its own subdirectory so + # the paths mirror the test directory layout. + pattern: 'baselines-*' + merge-multiple: true + path: '.' + + - name: 'Commit and push' + env: + GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + # Stage only the per-platform baseline files (not the generic ones) + git add perf-tests/baselines/ memory-tests/baselines/ || true + + if git diff --cached --quiet; then + echo "No baseline files changed — nothing to commit." + else + git commit -m "chore: update ${{ needs.parse-command.outputs.test_type }} baselines [skip ci] + + Updated by 'Update Perf/Memory Baselines' workflow run #${{ github.run_id }}. 
+ Platforms: gemini-cli-ubuntu-16-core, macos-latest, gemini-cli-windows-16-core" + git push + fi + + - name: 'Post result comment on PR' + if: "needs.parse-command.outputs.pr_number != ''" + env: + GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + run: | + # Remove the acknowledgement comment before posting the result + COMMENT_ID=$(gh pr view "${{ needs.parse-command.outputs.pr_number }}" \ + --json comments \ + --jq '.comments[] | select(.body | contains("Baseline update requested")) | .url' \ + | grep -oE '[0-9]+$' | head -n 1) + if [ -n "$COMMENT_ID" ]; then + gh api -X DELETE "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}" + fi + + gh pr comment "${{ needs.parse-command.outputs.pr_number }}" --body \ + "✅ **Baselines updated** (\`${{ needs.parse-command.outputs.test_type }}\`). + + Fresh per-platform baseline files have been committed to this branch for: + - \`gemini-cli-ubuntu-16-core\` + - \`macos-latest\` + - \`gemini-cli-windows-16-core\` + + The nightly tests will now compare against these values. + " diff --git a/.gitignore b/.gitignore index 85902b4a7c..a076f9b813 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ .env~ # gemini-cli settings +.tmp-perf-baselines.json +.tmp-memory-baselines.json # We want to keep the .gemini in the root of the repo and ignore any .gemini # in subdirectories. In our root .gemini we want to allow for version control # for subcommands. diff --git a/docs/performance-and-memory-testing.md b/docs/performance-and-memory-testing.md new file mode 100644 index 0000000000..924f0069cc --- /dev/null +++ b/docs/performance-and-memory-testing.md @@ -0,0 +1,110 @@ +# Performance & Memory Testing Infrastructure + +## Overview + +Gemini CLI features a highly reliable performance and memory regression testing +pipeline. To curb anomalies and yield accurate results, the harness applies: + +- **IQR Outlier Filtering**: Discards anomalous metrics from evaluation safely.
+- **Median Sampling**: Takes `N` runs and reports the median of the + surviving samples. +- **Warmup Runs**: Discards the first run(s) to avoid JIT warmup artifacts. +- **Tolerance Boundary**: A default 15% tolerance prevents failures + caused by benign run-to-run noise. + +--- + +## Baseline Management + +There are two core strategies for calibrating tolerances on performance +benchmarks: + +- **Approach A: Normalize for Testing Servers**: Tests run directly on the + automated cloud servers, and those scores are recorded as official, static + baselines. +- **Approach B: Machine-Agnostic Daily Comparisons**: Static baseline files are + ignored. Every night, the test is run against today's and yesterday's code on + the exact same server. + +### Recommended Strategy: GitHub Action + Approach A + +#### Local Development & PR Checks + +- **Local Testing**: If you are a developer trying to quickly test your code + changes against performance or memory impacts, simply run the standard local + perf or memory tests directly without arguments. The harness automatically + stashes uncommitted changes, regenerates baselines from the latest + `main` branch into untracked temporary files, and then compares your + branch against them. +- **PR Merges**: Please note that if your alterations intentionally necessitate + adjustments across baseline metrics, you should trigger the GitHub Action to + recalibrate baselines in tandem with merging your PR. This ensures that + subsequent nightly audits compare their results + against the updated baselines. + +#### Nightly Build Health Audits + +- Strict Approach A procedures apply daily across platforms on dedicated + environments, avoiding the "boiling frog" issue where micro-regressions + quietly accumulate over time.
+ +--- + +## Running Tests + +### Performance CPU Tests + +```bash +# Run tests (compare against committed baselines) +npm run test:perf + +# Verbose output +VERBOSE=true npm run test:perf + +# Keep test artifacts for debugging +KEEP_OUTPUT=true npm run test:perf +``` + +### Memory Tests + +```bash +# Run memory tests (compare against local main baselines) +npm run test:memory +``` + +--- + +## Architecture & Configuration + +### Performance Tests Directory Tree + +- `perf-tests/baselines.json`: Committed baseline values +- `perf-tests/globalSetup.ts`: Test environment setup +- `perf-tests/perf-usage.test.ts`: Test scenarios +- `perf-tests/perf.*.responses`: Fake API responses per scenario + +### Memory Tests Directory Tree + +- `memory-tests/baselines.json`: Committed memory values +- `memory-tests/memory-usage.test.ts`: Memory test scenarios + +--- + +## CI Integration + +These tests are strictly excluded from `preflight` constraints and remain +designed strictly for nightly daily audits accurately: + +```yaml +- name: Performance regression tests + run: npm run test:perf +``` + +--- + +## Adding New Scenarios + +1. Add a fake response file: `perf..responses` or + `memory..responses`. +2. Add a test case in `perf-usage.test.ts` or `memory-usage.test.ts` applying + `harness.runScenario()`. 
diff --git a/memory-tests/baselines/.gitkeep b/memory-tests/baselines/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/memory-tests/memory-usage.test.ts b/memory-tests/memory-usage.test.ts index eb363a0135..41a594b646 100644 --- a/memory-tests/memory-usage.test.ts +++ b/memory-tests/memory-usage.test.ts @@ -5,7 +5,11 @@ */ import { describe, it, beforeAll, afterAll, afterEach } from 'vitest'; -import { TestRig, MemoryTestHarness } from '@google/gemini-cli-test-utils'; +import { + TestRig, + MemoryTestHarness, + resolveMemoryBaselinesPath, +} from '@google/gemini-cli-test-utils'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; import { @@ -19,7 +23,8 @@ import { import { randomUUID } from 'node:crypto'; const __dirname = dirname(fileURLToPath(import.meta.url)); -const BASELINES_PATH = join(__dirname, 'baselines.json'); +const MACHINE_FAMILY = process.env['MEMORY_MACHINE_FAMILY']; +const BASELINES_PATH = resolveMemoryBaselinesPath(__dirname, MACHINE_FAMILY); const UPDATE_BASELINES = process.env['UPDATE_MEMORY_BASELINES'] === 'true'; const TOLERANCE_PERCENT = 10; @@ -37,6 +42,7 @@ describe('Memory Usage Tests', () => { gcCycles: 3, gcDelayMs: 100, sampleCount: 3, + machineFamily: MACHINE_FAMILY, }); }); diff --git a/package.json b/package.json index 150abcf3c3..6f48126f19 100644 --- a/package.json +++ b/package.json @@ -51,9 +51,9 @@ "test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman", "test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none", "test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests", - "test:memory": "vitest run --root ./memory-tests", + "test:memory": "node scripts/run-perf-tests.js memory", "test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests", - 
"test:perf": "vitest run --root ./perf-tests", + "test:perf": "node scripts/run-perf-tests.js perf", "test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests", "test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests", "test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests", diff --git a/packages/test-utils/src/index.ts b/packages/test-utils/src/index.ts index e851e7ab8d..4665993df6 100644 --- a/packages/test-utils/src/index.ts +++ b/packages/test-utils/src/index.ts @@ -1,6 +1,6 @@ /** * @license - * Copyright 2025 Google LLC + * Copyright 2026 Google LLC * SPDX-License-Identifier: Apache-2.0 */ diff --git a/packages/test-utils/src/memory-baselines.ts b/packages/test-utils/src/memory-baselines.ts index 3a4578cc50..bcefe7ba69 100644 --- a/packages/test-utils/src/memory-baselines.ts +++ b/packages/test-utils/src/memory-baselines.ts @@ -5,6 +5,7 @@ */ import { readFileSync, writeFileSync, existsSync } from 'node:fs'; +import { join } from 'node:path'; /** * Baseline entry for a single memory test scenario. @@ -77,3 +78,25 @@ export function updateBaseline( }; saveBaselines(path, baselines); } + +/** + * Resolve the path to the correct memory baselines JSON file. + * + * - If `machineFamily` is provided → returns `/baselines/.json`. + * This file may not exist yet; the harness will hard-fail at assertion time if it doesn't. + * - If `machineFamily` is absent → returns `/baselines.json` + * (the legacy generic file used for local development). + * + * @param testRootDir - Absolute path to the directory containing the test root + * (e.g. `__dirname` inside `memory-tests/`). + * @param machineFamily - Optional CI runner label (e.g. `'gemini-cli-ubuntu-16-core'`). 
+ */ +export function resolveMemoryBaselinesPath( + testRootDir: string, + machineFamily?: string, +): string { + if (machineFamily) { + return join(testRootDir, 'baselines', `${machineFamily}.json`); + } + return join(testRootDir, 'baselines.json'); +} diff --git a/packages/test-utils/src/memory-test-harness.ts b/packages/test-utils/src/memory-test-harness.ts index c12c220458..2141d66947 100644 --- a/packages/test-utils/src/memory-test-harness.ts +++ b/packages/test-utils/src/memory-test-harness.ts @@ -6,6 +6,8 @@ import v8 from 'node:v8'; import { setTimeout as sleep } from 'node:timers/promises'; +import { mkdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; import { loadBaselines, updateBaseline } from './memory-baselines.js'; import type { MemoryBaseline, MemoryBaselineFile } from './memory-baselines.js'; @@ -66,6 +68,14 @@ export interface MemoryTestHarnessOptions { sampleCount?: number; /** Pause in ms between samples. Default: 50 */ samplePauseMs?: number; + /** + * The CI machine family (e.g. 'gemini-cli-ubuntu-16-core'). + * When set, baselines are loaded from and saved to + * `/baselines/.json`. If the file does not exist and + * UPDATE_MEMORY_BASELINES is not set, tests hard-fail with an actionable + * message instead of silently falling back. + */ + machineFamily?: string; } /** @@ -85,6 +95,7 @@ export class MemoryTestHarness { private readonly gcDelayMs: number; private readonly sampleCount: number; private readonly samplePauseMs: number; + private readonly machineFamily?: string; private allResults: MemoryTestResult[] = []; constructor(options: MemoryTestHarnessOptions) { @@ -94,6 +105,7 @@ export class MemoryTestHarness { this.gcDelayMs = options.gcDelayMs ?? 100; this.sampleCount = options.sampleCount ?? 3; this.samplePauseMs = options.samplePauseMs ?? 
50; + this.machineFamily = options.machineFamily; + this.baselines = loadBaselines(this.baselinesPath); } @@ -240,6 +252,16 @@ export class MemoryTestHarness { const tolerance = tolerancePercent ?? this.defaultTolerancePercent; if (!result.baseline) { + if (this.machineFamily) { + throw new Error( + `No baseline found for scenario "${result.scenarioName}" on machine family "${this.machineFamily}".\n` + + ` Expected file: ${this.baselinesPath}\n` + + ` To create it, trigger the 'Update Baselines' workflow:\n` + + ` .github/workflows/update-baselines.yml\n` + + ` Or locally:\n` + + ` UPDATE_MEMORY_BASELINES=true MEMORY_MACHINE_FAMILY=${this.machineFamily} npm run test:memory`, + ); + } console.warn( `⚠ No baseline found for "${result.scenarioName}". ` + `Run with UPDATE_MEMORY_BASELINES=true to create one. ` + @@ -268,9 +290,21 @@ /** * Update the baseline for a scenario with the current measured values. + * The target is `baselinesPath` itself, which is already machine-family + * specific when one is set; its parent directory is created if needed. */ updateScenarioBaseline(result: MemoryTestResult): void { - updateBaseline(this.baselinesPath, result.scenarioName, { + // NOTE: this.baselinesPath is already machine-family specific when + // machineFamily is set (resolveMemoryBaselinesPath returns + // baselines/<family>.json), so write to it directly. Nesting another + // baselines/ level here would make writes land in + // baselines/baselines/<family>.json while reads continue to use + // baselines/<family>.json. + const targetPath = this.baselinesPath; + if (this.machineFamily) { + mkdirSync(dirname(targetPath), { recursive: true }); + } + updateBaseline(targetPath, result.scenarioName, { heapUsedBytes: result.finalHeapUsed, heapTotalBytes: result.snapshots[result.snapshots.length - 1]?.heapTotal ?? 
0, @@ -391,6 +425,9 @@ export class MemoryTestHarness { lines.push(''); lines.push('═══════════════════════════════════════════════════'); lines.push(' MEMORY USAGE TEST REPORT'); + if (this.machineFamily) { + lines.push(` Machine family: ${this.machineFamily}`); + } lines.push('═══════════════════════════════════════════════════'); lines.push(''); diff --git a/packages/test-utils/src/perf-test-harness.ts b/packages/test-utils/src/perf-test-harness.ts index 2f376f58b6..0a567948d7 100644 --- a/packages/test-utils/src/perf-test-harness.ts +++ b/packages/test-utils/src/perf-test-harness.ts @@ -6,7 +6,8 @@ import { performance } from 'node:perf_hooks'; import { setTimeout as sleep } from 'node:timers/promises'; -import { readFileSync, writeFileSync, existsSync } from 'node:fs'; +import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; /** Configuration for asciichart plot function. */ interface PlotConfig { @@ -83,6 +84,14 @@ export interface PerfTestHarnessOptions { warmupCount?: number; /** Pause in ms between samples. Default: 100 */ samplePauseMs?: number; + /** + * The CI machine family (e.g. 'gemini-cli-ubuntu-16-core'). + * When set, baselines are loaded from and saved to + * `/baselines/.json`. If the file does not exist and + * UPDATE_PERF_BASELINES is not set, tests hard-fail with an actionable + * message instead of silently falling back. + */ + machineFamily?: string; } /** @@ -114,6 +123,7 @@ export class PerfTestHarness { private readonly sampleCount: number; private readonly warmupCount: number; private readonly samplePauseMs: number; + private readonly machineFamily?: string; private allResults: PerfTestResult[] = []; private activeTimers: Map = new Map(); @@ -124,6 +134,7 @@ export class PerfTestHarness { this.sampleCount = options.sampleCount ?? 5; this.warmupCount = options.warmupCount ?? 1; this.samplePauseMs = options.samplePauseMs ?? 
100; + this.machineFamily = options.machineFamily; this.baselines = loadPerfBaselines(this.baselinesPath); } @@ -284,6 +295,18 @@ export class PerfTestHarness { const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent; if (!result.baseline) { + if (this.machineFamily) { + // In CI with a declared machine family: hard-fail so the problem is + // immediately visible, rather than silently skipping the assertion. + throw new Error( + `No baseline found for scenario "${result.scenarioName}" on machine family "${this.machineFamily}".\n` + + ` Expected file: ${this.baselinesPath}\n` + + ` To create it, trigger the 'Update Baselines' workflow:\n` + + ` .github/workflows/update-baselines.yml\n` + + ` Or locally:\n` + + ` UPDATE_PERF_BASELINES=true PERF_MACHINE_FAMILY=${this.machineFamily} npm run test:perf`, + ); + } console.warn( `⚠ No baseline found for "${result.scenarioName}". ` + `Run with UPDATE_PERF_BASELINES=true to create one. ` + @@ -321,16 +344,30 @@ export class PerfTestHarness { /** * Update the baseline for a scenario with the current measured values. + * When `machineFamily` is set, writes to `baselines/.json` + * (creating the directory if needed). Otherwise writes to `baselinesPath`. */ updateScenarioBaseline(result: PerfTestResult): void { - updatePerfBaseline(this.baselinesPath, result.scenarioName, { + const targetPath = this.machineFamily + ? 
this.baselinesPath + // NOTE: this.baselinesPath is already resolved per machine family by + // resolvePerfBaselinesPath (baselines/<family>.json); nesting another + // baselines/ level here would split the write path from the path + // that loadPerfBaselines reads below. + : this.baselinesPath; + // Ensure the baselines/ subdirectory exists + if (this.machineFamily) { + mkdirSync(dirname(targetPath), { recursive: true }); + } + updatePerfBaseline(targetPath, result.scenarioName, { wallClockMs: result.median.wallClockMs, cpuTotalUs: result.median.cpuTotalUs, }); // Reload baselines after update this.baselines = loadPerfBaselines(this.baselinesPath); console.log( - `Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms`, + `Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms` + + (this.machineFamily ? ` [${this.machineFamily}]` : ''), ); } @@ -344,6 +381,9 @@ lines.push(''); lines.push('═══════════════════════════════════════════════════'); lines.push(' PERFORMANCE TEST REPORT'); + if (this.machineFamily) { + lines.push(` Machine family: ${this.machineFamily}`); + } lines.push('═══════════════════════════════════════════════════'); lines.push(''); @@ -484,6 +524,30 @@ } } +// ─── Baseline path resolution ──────────────────────────────────────── + +/** + * Resolve the path to the correct perf baselines JSON file. + * + * - If `machineFamily` is provided → returns `/baselines/.json`. + * This file may not exist yet; the harness will hard-fail at assertion time if it doesn't. + * - If `machineFamily` is absent → returns `/baselines.json` + * (the legacy generic file used for local development). + * + * @param testRootDir - Absolute path to the directory containing the test root + * (e.g. `__dirname` inside `perf-tests/`). + * @param machineFamily - Optional CI runner label (e.g. `'gemini-cli-ubuntu-16-core'`). 
+ */ +export function resolvePerfBaselinesPath( + testRootDir: string, + machineFamily?: string, +): string { + if (machineFamily) { + return join(testRootDir, 'baselines', `${machineFamily}.json`); + } + return join(testRootDir, 'baselines.json'); +} + // ─── Baseline management ───────────────────────────────────────────── /** diff --git a/perf-tests/README.md b/perf-tests/README.md deleted file mode 100644 index c8e9e448c1..0000000000 --- a/perf-tests/README.md +++ /dev/null @@ -1,121 +0,0 @@ -# CPU Performance Integration Test Harness - -## Overview - -This directory contains performance/CPU integration tests for the Gemini CLI. -These tests measure wall-clock time, CPU usage, and event loop responsiveness to -detect regressions across key scenarios. - -CPU performance is inherently noisy, especially in CI. The harness addresses -this with: - -- **IQR outlier filtering** — discards anomalous samples -- **Median sampling** — takes N runs, reports the median after filtering -- **Warmup runs** — discards the first run to mitigate JIT compilation noise -- **15% default tolerance** — won't panic at slight regressions - -## Running - -```bash -# Run tests (compare against committed baselines) -npm run test:perf - -# Update baselines (after intentional changes) -npm run test:perf:update-baselines - -# Verbose output -VERBOSE=true npm run test:perf - -# Keep test artifacts for debugging -KEEP_OUTPUT=true npm run test:perf -``` - -## How It Works - -### Measurement Primitives - -The `PerfTestHarness` class (in `packages/test-utils`) provides: - -- **`performance.now()`** — high-resolution wall-clock timing -- **`process.cpuUsage()`** — user + system CPU microseconds (delta between - start/stop) -- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram - (p50/p95/p99/max) - -### Noise Reduction - -1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts -2. **Multiple samples**: Each scenario runs N times (default 5) -3. 
**IQR filtering**: Samples outside Q1−1.5×IQR and Q3+1.5×IQR are discarded -4. **Median**: The median of remaining samples is used for comparison - -### Baseline Management - -Baselines are stored in `baselines.json` in this directory. Each scenario has: - -```json -{ - "cold-startup-time": { - "wallClockMs": 1234.5, - "cpuTotalUs": 567890, - "eventLoopDelayP99Ms": 12.3, - "timestamp": "2026-04-08T..." - } -} -``` - -Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance). - -To recalibrate after intentional changes: - -```bash -npm run test:perf:update-baselines -# then commit baselines.json -``` - -### Report Output - -After all tests, the harness prints an ASCII summary: - -``` -═══════════════════════════════════════════════════ - PERFORMANCE TEST REPORT -═══════════════════════════════════════════════════ - -cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅ -idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅ -skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅ -``` - -## Architecture - -``` -perf-tests/ -├── README.md ← you are here -├── baselines.json ← committed baseline values -├── globalSetup.ts ← test environment setup -├── perf-usage.test.ts ← test scenarios -├── perf.*.responses ← fake API responses per scenario -├── tsconfig.json ← TypeScript config -└── vitest.config.ts ← vitest config (serial, isolated) - -packages/test-utils/src/ -├── perf-test-harness.ts ← PerfTestHarness class -└── index.ts ← re-exports -``` - -## CI Integration - -These tests are **excluded from `preflight`** and designed for nightly CI: - -```yaml -- name: Performance regression tests - run: npm run test:perf -``` - -## Adding a New Scenario - -1. Add a fake response file: `perf..responses` -2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()` -3. Run `npm run test:perf:update-baselines` to establish initial baseline -4. 
Commit the updated `baselines.json` diff --git a/perf-tests/baselines/.gitkeep b/perf-tests/baselines/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/perf-tests/perf-usage.test.ts b/perf-tests/perf-usage.test.ts index 1a361eda5d..a85eb0b9ca 100644 --- a/perf-tests/perf-usage.test.ts +++ b/perf-tests/perf-usage.test.ts @@ -5,13 +5,18 @@ */ import { describe, it, beforeAll, afterAll } from 'vitest'; -import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils'; +import { + TestRig, + PerfTestHarness, + resolvePerfBaselinesPath, +} from '@google/gemini-cli-test-utils'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; import { existsSync, readFileSync } from 'node:fs'; const __dirname = dirname(fileURLToPath(import.meta.url)); -const BASELINES_PATH = join(__dirname, 'baselines.json'); +const MACHINE_FAMILY = process.env['PERF_MACHINE_FAMILY']; +const BASELINES_PATH = resolvePerfBaselinesPath(__dirname, MACHINE_FAMILY); const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true'; const TOLERANCE_PERCENT = 15; @@ -28,6 +33,7 @@ describe('CPU Performance Tests', () => { defaultTolerancePercent: TOLERANCE_PERCENT, sampleCount: SAMPLE_COUNT, warmupCount: WARMUP_COUNT, + machineFamily: MACHINE_FAMILY, }); }); diff --git a/scripts/clean.js b/scripts/clean.js index dbb3849b15..ef591a4697 100644 --- a/scripts/clean.js +++ b/scripts/clean.js @@ -27,6 +27,8 @@ const root = join(__dirname, '..'); // remove npm install/build artifacts rmSync(join(root, 'node_modules'), { recursive: true, force: true }); rmSync(join(root, 'bundle'), { recursive: true, force: true }); +rmSync(join(root, '.tmp-perf-baselines.json'), { force: true }); +rmSync(join(root, '.tmp-memory-baselines.json'), { force: true }); rmSync(join(root, 'packages/cli/src/generated/'), { recursive: true, force: true, diff --git a/scripts/run-perf-tests.js b/scripts/run-perf-tests.js new file mode 100644 index 0000000000..7e2b38bb85 --- 
/dev/null +++ b/scripts/run-perf-tests.js @@ -0,0 +1,118 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { execSync } from 'node:child_process'; +import path from 'node:path'; + +const type = process.argv[2]; // 'perf' or 'memory' +const args = process.argv.slice(3); + +if (type !== 'perf' && type !== 'memory') { + console.error('Invalid test type. Must be "perf" or "memory".'); + process.exit(1); +} + +const isLocal = !process.env.CI && !process.env.GITHUB_ACTIONS; +const noOptions = args.length === 0; +const testDir = type === 'perf' ? './perf-tests' : './memory-tests'; +const updateEnv = + type === 'perf' + ? 'UPDATE_PERF_BASELINES=true' + : 'UPDATE_MEMORY_BASELINES=true'; +const tempBaselinesPath = path.resolve( + process.cwd(), + `.tmp-${type}-baselines.json`, +); + +if (isLocal && noOptions) { + console.log( + `[Auto-Baseline] Detected local run without options for ${type} tests.`, + ); + console.log('[Auto-Baseline] Updating baselines from main branch first...'); + + let originalBranch = ''; + let isDirty = false; + + try { + originalBranch = execSync('git rev-parse --abbrev-ref HEAD', { + encoding: 'utf-8', + }).trim(); + const status = execSync('git status --porcelain', { + encoding: 'utf-8', + }).trim(); + isDirty = status !== ''; + + if (isDirty) { + console.log('[Auto-Baseline] Stashing current changes...'); + execSync('git stash push --include-untracked -m "temp-perf-test-run"'); + } + + console.log('[Auto-Baseline] Switching to main branch...'); + execSync('git checkout main', { stdio: 'inherit' }); + + try { + console.log( + '[Auto-Baseline] Pulling latest changes for main from origin...', + ); + execSync('git pull origin main', { stdio: 'inherit' }); + } catch { + console.warn( + '[Auto-Baseline] Warning: git pull failed. 
Proceeding with local main branch.', + ); + } + + console.log( + `[Auto-Baseline] Running update baselines for ${type} tests on main...`, + ); + execSync( + `npx cross-env ${updateEnv} TEMP_BASELINES_PATH=${tempBaselinesPath} npx vitest run --root ${testDir}`, + { stdio: 'inherit' }, + ); + } catch (err) { + console.error( + '[Auto-Baseline] Error during main-branch baseline update:', + err, + ); + } finally { + if (originalBranch) { + console.log( + `[Auto-Baseline] Returning to original branch: ${originalBranch}...`, + ); + try { + execSync(`git checkout ${originalBranch}`, { stdio: 'inherit' }); + if (isDirty) { + console.log('[Auto-Baseline] Restoring stashed changes...'); + execSync('git stash pop', { stdio: 'inherit' }); + } + } catch { + console.error( + '[Auto-Baseline] Critical error while trying to restore original branch state.', + ); + } + } + } + + console.log( + `[Auto-Baseline] Running tests on branch ${originalBranch} against updated baselines...`, + ); + try { + execSync( + `npx cross-env TEMP_BASELINES_PATH=${tempBaselinesPath} npx vitest run --root ${testDir}`, + { stdio: 'inherit' }, + ); + } catch { + process.exit(1); + } +} else { + // Just run standard tests directly + const command = `npx vitest run --root ${testDir} ${args.join(' ')}`; + console.log(`[Standard] Running tests: ${command}`); + try { + execSync(command, { stdio: 'inherit' }); + } catch { + process.exit(1); + } +}