test(perf): overhaul performance and memory baseline management

Comprehensive automation upgrades for performance and memory baselines. Includes GitHub Actions workflows for remote updates, automatic local comparisons against main, and git-ignored temporary baselines.

- Added update-baselines.yml GitHub Action to automate remote baseline updates in CI.
- Created scripts/run-perf-tests.js to wrap performance runs, stashing uncommitted changes and generating main-branch baselines locally when run without arguments.
- Enhanced PerfTestHarness and MemoryTestHarness to support per-machine-family baselines and tolerance assertions.
- Updated test files to read the TEMP_BASELINES_PATH environment variable, keeping tracked files clean during local runs.
- Added docs/performance-and-memory-testing.md as central documentation for the testing strategy.
- Removed the now-redundant perf-tests/README.md and memory-tests/README.md.
- Registered temporary baseline outputs in .gitignore and updated scripts/clean.js to remove them on npm run clean.
This commit is contained in:
Sri Pasumarthi
2026-04-16 16:10:31 -07:00
parent daf5006237
commit 6355e2d8a1
17 changed files with 650 additions and 136 deletions
+14 -2
View File
@@ -10,9 +10,21 @@ permissions:
jobs: jobs:
memory-test: memory-test:
name: 'Run Memory Usage Tests' name: 'Run Memory Usage Tests (${{ matrix.machine_family }})'
runs-on: 'gemini-cli-ubuntu-16-core'
if: "github.repository == 'google-gemini/gemini-cli'" if: "github.repository == 'google-gemini/gemini-cli'"
strategy:
fail-fast: false
matrix:
include:
- runs_on: 'gemini-cli-ubuntu-16-core'
machine_family: 'gemini-cli-ubuntu-16-core'
- runs_on: 'macos-latest'
machine_family: 'macos-latest'
- runs_on: 'gemini-cli-windows-16-core'
machine_family: 'gemini-cli-windows-16-core'
runs-on: '${{ matrix.runs_on }}'
env:
MEMORY_MACHINE_FAMILY: '${{ matrix.machine_family }}'
steps: steps:
- name: 'Checkout' - name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
+14 -2
View File
@@ -10,9 +10,21 @@ permissions:
jobs: jobs:
perf-test: perf-test:
name: 'Run Performance Usage Tests' name: 'Run Performance Tests (${{ matrix.machine_family }})'
runs-on: 'gemini-cli-ubuntu-16-core'
if: "github.repository == 'google-gemini/gemini-cli'" if: "github.repository == 'google-gemini/gemini-cli'"
strategy:
fail-fast: false
matrix:
include:
- runs_on: 'gemini-cli-ubuntu-16-core'
machine_family: 'gemini-cli-ubuntu-16-core'
- runs_on: 'macos-latest'
machine_family: 'macos-latest'
- runs_on: 'gemini-cli-windows-16-core'
machine_family: 'gemini-cli-windows-16-core'
runs-on: '${{ matrix.runs_on }}'
env:
PERF_MACHINE_FAMILY: '${{ matrix.machine_family }}'
steps: steps:
- name: 'Checkout' - name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
+243
View File
@@ -0,0 +1,243 @@
# Copyright 2026 Google LLC
# SPDX-License-Identifier: Apache-2.0
#
# Update Perf/Memory Baselines
#
# Triggered by:
#   1. A PR comment starting with one of:
#        /run perf     — updates only perf baselines
#        /run mem      — updates only memory baselines
#        /run perf+mem — updates both (default)
#   2. Manual workflow_dispatch from the Actions tab.
#
# Both paths are gated behind the 'perf-approvers' GitHub environment,
# which requires approval from the designated approvers group before the
# matrix runners are provisioned.
#
# After all per-platform runs complete, the updated baseline JSON files
# are committed back to the triggering branch automatically.

name: 'Update Perf/Memory Baselines'

on:
  issue_comment:
    types: ['created']
  workflow_dispatch:
    inputs:
      test_type:
        description: 'Which baselines to update'
        required: true
        default: 'perf+mem'
        type: 'choice'
        options:
          - 'perf'
          - 'mem'
          - 'perf+mem'
      ref:
        description: 'Branch/SHA to checkout and update baselines on (default: main)'
        required: false
        default: 'main'

permissions:
  contents: 'write' # push the updated baseline commit
  pull-requests: 'write' # post the result comment
  issues: 'read'

jobs:
  # ── 1. Parse slash command / workflow_dispatch ──────────────────────────
  parse-command:
    name: 'Parse Command'
    runs-on: 'gemini-cli-ubuntu-16-core'
    if: |
      github.repository == 'google-gemini/gemini-cli' && (
        github.event_name == 'workflow_dispatch' || (
          github.event_name == 'issue_comment' &&
          github.event.issue.pull_request != null &&
          (
            startsWith(github.event.comment.body, '/run perf+mem') ||
            startsWith(github.event.comment.body, '/run perf') ||
            startsWith(github.event.comment.body, '/run mem')
          )
        )
      )
    outputs:
      test_type: '${{ steps.parse.outputs.test_type }}'
      ref: '${{ steps.parse.outputs.ref }}'
      pr_number: '${{ steps.parse.outputs.pr_number }}'
    steps:
      - name: 'Parse inputs'
        id: 'parse'
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
          # This job never checks out sources, so gh needs the repo explicitly.
          GH_REPO: '${{ github.repository }}'
          COMMENT_BODY: '${{ github.event.comment.body }}'
          DISPATCH_TEST_TYPE: '${{ inputs.test_type }}'
          DISPATCH_REF: '${{ inputs.ref }}'
          PR_NUMBER: '${{ github.event.issue.number }}'
          EVENT_NAME: '${{ github.event_name }}'
        run: |
          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            TEST_TYPE="${DISPATCH_TEST_TYPE:-perf+mem}"
            REF="${DISPATCH_REF:-main}"
            echo "pr_number=" >> "$GITHUB_OUTPUT"
          else
            # Slash command: determine test_type from comment
            if echo "$COMMENT_BODY" | grep -q "^/run perf+mem"; then
              TEST_TYPE="perf+mem"
            elif echo "$COMMENT_BODY" | grep -q "^/run perf"; then
              TEST_TYPE="perf"
            else
              TEST_TYPE="mem"
            fi
            # Get the head branch *name* of the PR (not a SHA) so the
            # baseline commit can later be pushed back to that branch.
            REF=$(gh pr view "$PR_NUMBER" --json headRefName --jq '.headRefName')
            echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
          fi
          echo "test_type=${TEST_TYPE}" >> "$GITHUB_OUTPUT"
          echo "ref=${REF}" >> "$GITHUB_OUTPUT"
      - name: 'Post acknowledgement comment on PR'
        if: "steps.parse.outputs.pr_number != ''"
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
          # No checkout in this job — point gh at the repo explicitly.
          GH_REPO: '${{ github.repository }}'
        run: |
          gh pr comment "${{ steps.parse.outputs.pr_number }}" --body \
            "⏳ **Baseline update requested** (\`${{ steps.parse.outputs.test_type }}\`).
          A member of the \`perf-approvers\` group must approve this workflow before the runners start.
          <!-- baseline-update-ack -->"

  # ── 2. Approval gate (perf-approvers environment) ──────────────────────
  await-approval:
    name: 'Await perf-approvers Approval'
    needs: 'parse-command'
    # This environment requires manual approval from the perf-approvers group
    # before GitHub provisions any of the downstream runners.
    environment: 'perf-approvers'
    runs-on: 'gemini-cli-ubuntu-16-core'
    steps:
      - name: 'Approved'
        run: 'echo "Approved by perf-approvers — launching baseline update matrix."'

  # ── 3. Run tests to capture fresh baselines on each platform ───────────
  update-baselines:
    name: 'Update Baselines (${{ matrix.machine_family }})'
    # parse-command must be listed directly: GitHub Actions only exposes
    # outputs of jobs named in `needs`, not of transitive dependencies.
    needs:
      - 'parse-command'
      - 'await-approval'
    strategy:
      fail-fast: false
      matrix:
        include:
          - runs_on: 'gemini-cli-ubuntu-16-core'
            machine_family: 'gemini-cli-ubuntu-16-core'
          - runs_on: 'macos-latest'
            machine_family: 'macos-latest'
          - runs_on: 'gemini-cli-windows-16-core'
            machine_family: 'gemini-cli-windows-16-core'
    runs-on: '${{ matrix.runs_on }}'
    steps:
      - name: 'Checkout'
        uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
        with:
          ref: '${{ needs.parse-command.outputs.ref }}'
          # Need full history so we can push back
          fetch-depth: 0
      - name: 'Set up Node.js'
        uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
        with:
          node-version-file: '.nvmrc'
          cache: 'npm'
      - name: 'Install dependencies'
        run: 'npm ci'
      - name: 'Build project'
        run: 'npm run build'
      - name: 'Update Perf Baselines'
        if: "contains(needs.parse-command.outputs.test_type, 'perf')"
        run: 'npm run test:perf:update-baselines'
        env:
          PERF_MACHINE_FAMILY: '${{ matrix.machine_family }}'
      - name: 'Update Memory Baselines'
        if: "contains(needs.parse-command.outputs.test_type, 'mem')"
        run: 'npm run test:memory:update-baselines'
        env:
          MEMORY_MACHINE_FAMILY: '${{ matrix.machine_family }}'
      - name: 'Upload updated baseline files'
        uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
        with:
          name: 'baselines-${{ matrix.machine_family }}'
          # Upload the entire baselines/ subdirectories from both test roots
          path: |
            perf-tests/baselines/
            memory-tests/baselines/
          if-no-files-found: 'warn'

  # ── 4. Gather artifacts and commit everything back to the branch ────────
  commit-baselines:
    name: 'Commit Updated Baselines'
    needs:
      - 'parse-command'
      - 'update-baselines'
    runs-on: 'gemini-cli-ubuntu-16-core'
    steps:
      - name: 'Checkout'
        uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
        with:
          ref: '${{ needs.parse-command.outputs.ref }}'
          fetch-depth: 0
      - name: 'Download all baseline artifacts'
        uses: 'actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093' # ratchet:actions/download-artifact@v4
        with:
          # Merge each per-platform artifact into the working tree so the
          # extracted paths mirror the test directory layout.
          pattern: 'baselines-*'
          merge-multiple: true
          path: '.'
      - name: 'Commit and push'
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          # Stage only the per-platform baseline files (not the generic ones)
          git add perf-tests/baselines/ memory-tests/baselines/ || true
          if git diff --cached --quiet; then
            echo "No baseline files changed — nothing to commit."
          else
            git commit -m "chore: update ${{ needs.parse-command.outputs.test_type }} baselines [skip ci]

          Updated by 'Update Perf/Memory Baselines' workflow run #${{ github.run_id }}.
          Platforms: gemini-cli-ubuntu-16-core, macos-latest, gemini-cli-windows-16-core"
            git push
          fi
      - name: 'Post result comment on PR'
        if: "needs.parse-command.outputs.pr_number != ''"
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
        run: |
          # Remove the acknowledgement comment before posting the result
          COMMENT_ID=$(gh pr view "${{ needs.parse-command.outputs.pr_number }}" \
            --json comments \
            --jq '.comments[] | select(.body | contains("<!-- baseline-update-ack -->")) | .url' \
            | grep -oE '[0-9]+$' | head -n 1)
          if [ -n "$COMMENT_ID" ]; then
            gh api -X DELETE "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}"
          fi
          gh pr comment "${{ needs.parse-command.outputs.pr_number }}" --body \
            "✅ **Baselines updated** (\`${{ needs.parse-command.outputs.test_type }}\`).
          Fresh per-platform baseline files have been committed to this branch for:
          - \`gemini-cli-ubuntu-16-core\`
          - \`macos-latest\`
          - \`gemini-cli-windows-16-core\`
          The nightly tests will now compare against these values.
          <!-- baseline-update-result -->"
+2
View File
@@ -3,6 +3,8 @@
.env~ .env~
# gemini-cli settings # gemini-cli settings
.tmp-perf-baselines.json
.tmp-memory-baselines.json
# We want to keep the .gemini in the root of the repo and ignore any .gemini # We want to keep the .gemini in the root of the repo and ignore any .gemini
# in subdirectories. In our root .gemini we want to allow for version control # in subdirectories. In our root .gemini we want to allow for version control
# for subcommands. # for subcommands.
+110
View File
@@ -0,0 +1,110 @@
# Performance & Memory Testing Infrastructure
## Overview
Gemini CLI includes a performance and memory regression testing pipeline. To
reduce noise and yield accurate results, the harness applies:
- **IQR Outlier Filtering**: Discards anomalous samples before evaluation.
- **Median Sampling**: Runs each scenario `N` times and compares the median of
  the remaining samples.
- **Warmup Runs**: Discards the first samples to mitigate JIT compilation
  artifacts.
- **Tolerance Boundary**: A default 15% tolerance prevents failures on minor
  run-to-run variation.
---
## Baseline Management
There are two core strategies for calibrating tolerances on performance
benchmarks:
- **Approach A: Normalize for Testing Servers**: Tests run directly on the
automated cloud servers, and those scores are recorded as official, static
baselines.
- **Approach B: Machine-Agnostic Daily Comparisons**: Static baseline files are
ignored. Every night, the test is run against today's and yesterday's code on
the exact same server.
### Recommended Strategy: GitHub Action + Approach A
#### Local Development & PR Checks
- **Local Testing**: If you are a developer wanting to quickly test your code
  changes for performance or memory impact, run the standard local perf or
  memory tests without arguments. The wrapper automatically stashes
  uncommitted changes, regenerates baselines from the latest `main` branch
  into untracked temporary files, and reports a comparison immediately.
- **PR Merges**: If your changes intentionally require new baseline values,
  trigger the GitHub Action to recalibrate the baselines when merging your
  PR, so that subsequent nightly audits compare against the updated values.
#### Nightly Build Health Audits
- Strict Approach A procedures run daily across platforms on dedicated
  environments, avoiding the "boiling frog" problem where micro-regressions
  quietly accumulate over time.
---
## Running Tests
### Performance CPU Tests
```bash
# Run tests (compare against committed baselines)
npm run test:perf
# Verbose output
VERBOSE=true npm run test:perf
# Keep test artifacts for debugging
KEEP_OUTPUT=true npm run test:perf
```
### Memory Tests
```bash
# Run memory tests (compare against local main baselines)
npm run test:memory
```
---
## Architecture & Configuration
### Performance Tests Directory Tree
- `perf-tests/baselines.json`: Committed baseline values
- `perf-tests/globalSetup.ts`: Test environment setup
- `perf-tests/perf-usage.test.ts`: Test scenarios
- `perf-tests/perf.*.responses`: Fake API responses per scenario
### Memory Tests Directory Tree
- `memory-tests/baselines.json`: Committed memory values
- `memory-tests/memory-usage.test.ts`: Memory test scenarios
---
## CI Integration
These tests are excluded from `preflight` and are intended to run only in the
nightly audits:
```yaml
- name: Performance regression tests
run: npm run test:perf
```
---
## Adding New Scenarios
1. Add a fake response file: `perf.<scenario-name>.responses` or
`memory.<scenario-name>.responses`.
2. Add a test case in `perf-usage.test.ts` or `memory-usage.test.ts` applying
`harness.runScenario()`.
View File
+8 -2
View File
@@ -5,7 +5,11 @@
*/ */
import { describe, it, beforeAll, afterAll, afterEach } from 'vitest'; import { describe, it, beforeAll, afterAll, afterEach } from 'vitest';
import { TestRig, MemoryTestHarness } from '@google/gemini-cli-test-utils'; import {
TestRig,
MemoryTestHarness,
resolveMemoryBaselinesPath,
} from '@google/gemini-cli-test-utils';
import { join, dirname } from 'node:path'; import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url'; import { fileURLToPath } from 'node:url';
import { import {
@@ -19,7 +23,8 @@ import {
import { randomUUID } from 'node:crypto'; import { randomUUID } from 'node:crypto';
const __dirname = dirname(fileURLToPath(import.meta.url)); const __dirname = dirname(fileURLToPath(import.meta.url));
const BASELINES_PATH = join(__dirname, 'baselines.json'); const MACHINE_FAMILY = process.env['MEMORY_MACHINE_FAMILY'];
const BASELINES_PATH = resolveMemoryBaselinesPath(__dirname, MACHINE_FAMILY);
const UPDATE_BASELINES = process.env['UPDATE_MEMORY_BASELINES'] === 'true'; const UPDATE_BASELINES = process.env['UPDATE_MEMORY_BASELINES'] === 'true';
const TOLERANCE_PERCENT = 10; const TOLERANCE_PERCENT = 10;
@@ -37,6 +42,7 @@ describe('Memory Usage Tests', () => {
gcCycles: 3, gcCycles: 3,
gcDelayMs: 100, gcDelayMs: 100,
sampleCount: 3, sampleCount: 3,
machineFamily: MACHINE_FAMILY,
}); });
}); });
+2 -2
View File
@@ -51,9 +51,9 @@
"test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman", "test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman",
"test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none", "test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none",
"test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests", "test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests",
"test:memory": "vitest run --root ./memory-tests", "test:memory": "node scripts/run-perf-tests.js memory",
"test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests", "test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests",
"test:perf": "vitest run --root ./perf-tests", "test:perf": "node scripts/run-perf-tests.js perf",
"test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests", "test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests",
"test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests", "test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests",
"test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests", "test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests",
+1 -1
View File
@@ -1,6 +1,6 @@
/** /**
* @license * @license
* Copyright 2025 Google LLC * Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@@ -5,6 +5,7 @@
*/ */
import { readFileSync, writeFileSync, existsSync } from 'node:fs'; import { readFileSync, writeFileSync, existsSync } from 'node:fs';
import { join } from 'node:path';
/** /**
* Baseline entry for a single memory test scenario. * Baseline entry for a single memory test scenario.
@@ -77,3 +78,25 @@ export function updateBaseline(
}; };
saveBaselines(path, baselines); saveBaselines(path, baselines);
} }
/**
 * Resolve the path to the correct memory baselines JSON file.
 *
 * When `machineFamily` is given, the per-machine file
 * `<testRootDir>/baselines/<machineFamily>.json` is returned. That file may
 * not exist yet; the harness hard-fails at assertion time if it is missing.
 * Without `machineFamily`, the legacy generic `<testRootDir>/baselines.json`
 * (used for local development) is returned instead.
 *
 * @param testRootDir - Absolute path to the directory containing the test root
 *   (e.g. `__dirname` inside `memory-tests/`).
 * @param machineFamily - Optional CI runner label (e.g. `'gemini-cli-ubuntu-16-core'`).
 */
export function resolveMemoryBaselinesPath(
  testRootDir: string,
  machineFamily?: string,
): string {
  return machineFamily
    ? join(testRootDir, 'baselines', `${machineFamily}.json`)
    : join(testRootDir, 'baselines.json');
}
+38 -1
View File
@@ -6,6 +6,8 @@
import v8 from 'node:v8'; import v8 from 'node:v8';
import { setTimeout as sleep } from 'node:timers/promises'; import { setTimeout as sleep } from 'node:timers/promises';
import { mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { loadBaselines, updateBaseline } from './memory-baselines.js'; import { loadBaselines, updateBaseline } from './memory-baselines.js';
import type { MemoryBaseline, MemoryBaselineFile } from './memory-baselines.js'; import type { MemoryBaseline, MemoryBaselineFile } from './memory-baselines.js';
@@ -66,6 +68,14 @@ export interface MemoryTestHarnessOptions {
sampleCount?: number; sampleCount?: number;
/** Pause in ms between samples. Default: 50 */ /** Pause in ms between samples. Default: 50 */
samplePauseMs?: number; samplePauseMs?: number;
/**
* The CI machine family (e.g. 'gemini-cli-ubuntu-16-core').
* When set, baselines are loaded from and saved to
* `<dir>/baselines/<machineFamily>.json`. If the file does not exist and
* UPDATE_MEMORY_BASELINES is not set, tests hard-fail with an actionable
* message instead of silently falling back.
*/
machineFamily?: string;
} }
/** /**
@@ -85,6 +95,7 @@ export class MemoryTestHarness {
private readonly gcDelayMs: number; private readonly gcDelayMs: number;
private readonly sampleCount: number; private readonly sampleCount: number;
private readonly samplePauseMs: number; private readonly samplePauseMs: number;
private readonly machineFamily?: string;
private allResults: MemoryTestResult[] = []; private allResults: MemoryTestResult[] = [];
constructor(options: MemoryTestHarnessOptions) { constructor(options: MemoryTestHarnessOptions) {
@@ -94,6 +105,7 @@ export class MemoryTestHarness {
this.gcDelayMs = options.gcDelayMs ?? 100; this.gcDelayMs = options.gcDelayMs ?? 100;
this.sampleCount = options.sampleCount ?? 3; this.sampleCount = options.sampleCount ?? 3;
this.samplePauseMs = options.samplePauseMs ?? 50; this.samplePauseMs = options.samplePauseMs ?? 50;
this.machineFamily = options.machineFamily;
this.baselines = loadBaselines(this.baselinesPath); this.baselines = loadBaselines(this.baselinesPath);
} }
@@ -240,6 +252,16 @@ export class MemoryTestHarness {
const tolerance = tolerancePercent ?? this.defaultTolerancePercent; const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
if (!result.baseline) { if (!result.baseline) {
if (this.machineFamily) {
throw new Error(
`No baseline found for scenario "${result.scenarioName}" on machine family "${this.machineFamily}".\n` +
` Expected file: ${this.baselinesPath}\n` +
` To create it, trigger the 'Update Baselines' workflow:\n` +
` .github/workflows/update-baselines.yml\n` +
` Or locally:\n` +
` UPDATE_MEMORY_BASELINES=true MEMORY_MACHINE_FAMILY=${this.machineFamily} npm run test:memory`,
);
}
console.warn( console.warn(
`⚠ No baseline found for "${result.scenarioName}". ` + `⚠ No baseline found for "${result.scenarioName}". ` +
`Run with UPDATE_MEMORY_BASELINES=true to create one. ` + `Run with UPDATE_MEMORY_BASELINES=true to create one. ` +
@@ -268,9 +290,21 @@ export class MemoryTestHarness {
/** /**
* Update the baseline for a scenario with the current measured values. * Update the baseline for a scenario with the current measured values.
* When `machineFamily` is set, writes to `baselines/<machineFamily>.json`
* (creating the directory if needed). Otherwise writes to `baselinesPath`.
*/ */
updateScenarioBaseline(result: MemoryTestResult): void { updateScenarioBaseline(result: MemoryTestResult): void {
updateBaseline(this.baselinesPath, result.scenarioName, { const targetPath = this.machineFamily
? join(
dirname(this.baselinesPath),
'baselines',
`${this.machineFamily}.json`,
)
: this.baselinesPath;
if (this.machineFamily) {
mkdirSync(dirname(targetPath), { recursive: true });
}
updateBaseline(targetPath, result.scenarioName, {
heapUsedBytes: result.finalHeapUsed, heapUsedBytes: result.finalHeapUsed,
heapTotalBytes: heapTotalBytes:
result.snapshots[result.snapshots.length - 1]?.heapTotal ?? 0, result.snapshots[result.snapshots.length - 1]?.heapTotal ?? 0,
@@ -391,6 +425,9 @@ export class MemoryTestHarness {
lines.push(''); lines.push('');
lines.push('═══════════════════════════════════════════════════'); lines.push('═══════════════════════════════════════════════════');
lines.push(' MEMORY USAGE TEST REPORT'); lines.push(' MEMORY USAGE TEST REPORT');
if (this.machineFamily) {
lines.push(` Machine family: ${this.machineFamily}`);
}
lines.push('═══════════════════════════════════════════════════'); lines.push('═══════════════════════════════════════════════════');
lines.push(''); lines.push('');
+67 -3
View File
@@ -6,7 +6,8 @@
import { performance } from 'node:perf_hooks'; import { performance } from 'node:perf_hooks';
import { setTimeout as sleep } from 'node:timers/promises'; import { setTimeout as sleep } from 'node:timers/promises';
import { readFileSync, writeFileSync, existsSync } from 'node:fs'; import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
/** Configuration for asciichart plot function. */ /** Configuration for asciichart plot function. */
interface PlotConfig { interface PlotConfig {
@@ -83,6 +84,14 @@ export interface PerfTestHarnessOptions {
warmupCount?: number; warmupCount?: number;
/** Pause in ms between samples. Default: 100 */ /** Pause in ms between samples. Default: 100 */
samplePauseMs?: number; samplePauseMs?: number;
/**
* The CI machine family (e.g. 'gemini-cli-ubuntu-16-core').
* When set, baselines are loaded from and saved to
* `<dir>/baselines/<machineFamily>.json`. If the file does not exist and
* UPDATE_PERF_BASELINES is not set, tests hard-fail with an actionable
* message instead of silently falling back.
*/
machineFamily?: string;
} }
/** /**
@@ -114,6 +123,7 @@ export class PerfTestHarness {
private readonly sampleCount: number; private readonly sampleCount: number;
private readonly warmupCount: number; private readonly warmupCount: number;
private readonly samplePauseMs: number; private readonly samplePauseMs: number;
private readonly machineFamily?: string;
private allResults: PerfTestResult[] = []; private allResults: PerfTestResult[] = [];
private activeTimers: Map<string, ActiveTimer> = new Map(); private activeTimers: Map<string, ActiveTimer> = new Map();
@@ -124,6 +134,7 @@ export class PerfTestHarness {
this.sampleCount = options.sampleCount ?? 5; this.sampleCount = options.sampleCount ?? 5;
this.warmupCount = options.warmupCount ?? 1; this.warmupCount = options.warmupCount ?? 1;
this.samplePauseMs = options.samplePauseMs ?? 100; this.samplePauseMs = options.samplePauseMs ?? 100;
this.machineFamily = options.machineFamily;
this.baselines = loadPerfBaselines(this.baselinesPath); this.baselines = loadPerfBaselines(this.baselinesPath);
} }
@@ -284,6 +295,18 @@ export class PerfTestHarness {
const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent; const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent;
if (!result.baseline) { if (!result.baseline) {
if (this.machineFamily) {
// In CI with a declared machine family: hard-fail so the problem is
// immediately visible, rather than silently skipping the assertion.
throw new Error(
`No baseline found for scenario "${result.scenarioName}" on machine family "${this.machineFamily}".\n` +
` Expected file: ${this.baselinesPath}\n` +
` To create it, trigger the 'Update Baselines' workflow:\n` +
` .github/workflows/update-baselines.yml\n` +
` Or locally:\n` +
` UPDATE_PERF_BASELINES=true PERF_MACHINE_FAMILY=${this.machineFamily} npm run test:perf`,
);
}
console.warn( console.warn(
`⚠ No baseline found for "${result.scenarioName}". ` + `⚠ No baseline found for "${result.scenarioName}". ` +
`Run with UPDATE_PERF_BASELINES=true to create one. ` + `Run with UPDATE_PERF_BASELINES=true to create one. ` +
@@ -321,16 +344,30 @@ export class PerfTestHarness {
/** /**
* Update the baseline for a scenario with the current measured values. * Update the baseline for a scenario with the current measured values.
* When `machineFamily` is set, writes to `baselines/<machineFamily>.json`
* (creating the directory if needed). Otherwise writes to `baselinesPath`.
*/ */
updateScenarioBaseline(result: PerfTestResult): void { updateScenarioBaseline(result: PerfTestResult): void {
updatePerfBaseline(this.baselinesPath, result.scenarioName, { const targetPath = this.machineFamily
? join(
dirname(this.baselinesPath),
'baselines',
`${this.machineFamily}.json`,
)
: this.baselinesPath;
// Ensure the baselines/ subdirectory exists
if (this.machineFamily) {
mkdirSync(dirname(targetPath), { recursive: true });
}
updatePerfBaseline(targetPath, result.scenarioName, {
wallClockMs: result.median.wallClockMs, wallClockMs: result.median.wallClockMs,
cpuTotalUs: result.median.cpuTotalUs, cpuTotalUs: result.median.cpuTotalUs,
}); });
// Reload baselines after update // Reload baselines after update
this.baselines = loadPerfBaselines(this.baselinesPath); this.baselines = loadPerfBaselines(this.baselinesPath);
console.log( console.log(
`Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms`, `Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms` +
(this.machineFamily ? ` [${this.machineFamily}]` : ''),
); );
} }
@@ -344,6 +381,9 @@ export class PerfTestHarness {
lines.push(''); lines.push('');
lines.push('═══════════════════════════════════════════════════'); lines.push('═══════════════════════════════════════════════════');
lines.push(' PERFORMANCE TEST REPORT'); lines.push(' PERFORMANCE TEST REPORT');
if (this.machineFamily) {
lines.push(` Machine family: ${this.machineFamily}`);
}
lines.push('═══════════════════════════════════════════════════'); lines.push('═══════════════════════════════════════════════════');
lines.push(''); lines.push('');
@@ -484,6 +524,30 @@ export class PerfTestHarness {
} }
} }
// ─── Baseline path resolution ────────────────────────────────────────

/**
 * Resolve the path to the correct perf baselines JSON file.
 *
 * When `machineFamily` is given, the per-machine file
 * `<testRootDir>/baselines/<machineFamily>.json` is returned. That file may
 * not exist yet; the harness hard-fails at assertion time if it is missing.
 * Without `machineFamily`, the legacy generic `<testRootDir>/baselines.json`
 * (used for local development) is returned instead.
 *
 * @param testRootDir - Absolute path to the directory containing the test root
 *   (e.g. `__dirname` inside `perf-tests/`).
 * @param machineFamily - Optional CI runner label (e.g. `'gemini-cli-ubuntu-16-core'`).
 */
export function resolvePerfBaselinesPath(
  testRootDir: string,
  machineFamily?: string,
): string {
  return machineFamily
    ? join(testRootDir, 'baselines', `${machineFamily}.json`)
    : join(testRootDir, 'baselines.json');
}
// ─── Baseline management ───────────────────────────────────────────── // ─── Baseline management ─────────────────────────────────────────────
/** /**
-121
View File
@@ -1,121 +0,0 @@
# CPU Performance Integration Test Harness
## Overview
This directory contains performance/CPU integration tests for the Gemini CLI.
These tests measure wall-clock time, CPU usage, and event loop responsiveness to
detect regressions across key scenarios.
CPU performance is inherently noisy, especially in CI. The harness addresses
this with:
- **IQR outlier filtering** — discards anomalous samples
- **Median sampling** — takes N runs, reports the median after filtering
- **Warmup runs** — discards the first run to mitigate JIT compilation noise
- **15% default tolerance** — tolerates minor run-to-run variation without failing
## Running
```bash
# Run tests (compare against committed baselines)
npm run test:perf
# Update baselines (after intentional changes)
npm run test:perf:update-baselines
# Verbose output
VERBOSE=true npm run test:perf
# Keep test artifacts for debugging
KEEP_OUTPUT=true npm run test:perf
```
## How It Works
### Measurement Primitives
The `PerfTestHarness` class (in `packages/test-utils`) provides:
- **`performance.now()`** — high-resolution wall-clock timing
- **`process.cpuUsage()`** — user + system CPU microseconds (delta between
start/stop)
- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram
(p50/p95/p99/max)
### Noise Reduction
1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts
2. **Multiple samples**: Each scenario runs N times (default 5)
3. **IQR filtering**: Samples outside Q1−1.5×IQR and Q3+1.5×IQR are discarded
4. **Median**: The median of remaining samples is used for comparison
### Baseline Management
Baselines are stored in `baselines.json` in this directory. Each scenario has:
```json
{
"cold-startup-time": {
"wallClockMs": 1234.5,
"cpuTotalUs": 567890,
"eventLoopDelayP99Ms": 12.3,
"timestamp": "2026-04-08T..."
}
}
```
Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance).
To recalibrate after intentional changes:
```bash
npm run test:perf:update-baselines
# then commit baselines.json
```
### Report Output
After all tests, the harness prints an ASCII summary:
```
═══════════════════════════════════════════════════
PERFORMANCE TEST REPORT
═══════════════════════════════════════════════════
cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅
idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅
skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅
```
## Architecture
```
perf-tests/
├── README.md ← you are here
├── baselines.json ← committed baseline values
├── globalSetup.ts ← test environment setup
├── perf-usage.test.ts ← test scenarios
├── perf.*.responses ← fake API responses per scenario
├── tsconfig.json ← TypeScript config
└── vitest.config.ts ← vitest config (serial, isolated)
packages/test-utils/src/
├── perf-test-harness.ts ← PerfTestHarness class
└── index.ts ← re-exports
```
## CI Integration
These tests are **excluded from `preflight`** and designed for nightly CI:
```yaml
- name: Performance regression tests
run: npm run test:perf
```
## Adding a New Scenario
1. Add a fake response file: `perf.<scenario-name>.responses`
2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()`
3. Run `npm run test:perf:update-baselines` to establish initial baseline
4. Commit the updated `baselines.json`
View File
+8 -2
View File
@@ -5,13 +5,18 @@
*/ */
import { describe, it, beforeAll, afterAll } from 'vitest'; import { describe, it, beforeAll, afterAll } from 'vitest';
import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils'; import {
TestRig,
PerfTestHarness,
resolvePerfBaselinesPath,
} from '@google/gemini-cli-test-utils';
import { join, dirname } from 'node:path'; import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url'; import { fileURLToPath } from 'node:url';
import { existsSync, readFileSync } from 'node:fs'; import { existsSync, readFileSync } from 'node:fs';
const __dirname = dirname(fileURLToPath(import.meta.url)); const __dirname = dirname(fileURLToPath(import.meta.url));
const BASELINES_PATH = join(__dirname, 'baselines.json'); const MACHINE_FAMILY = process.env['PERF_MACHINE_FAMILY'];
const BASELINES_PATH = resolvePerfBaselinesPath(__dirname, MACHINE_FAMILY);
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true'; const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
const TOLERANCE_PERCENT = 15; const TOLERANCE_PERCENT = 15;
@@ -28,6 +33,7 @@ describe('CPU Performance Tests', () => {
defaultTolerancePercent: TOLERANCE_PERCENT, defaultTolerancePercent: TOLERANCE_PERCENT,
sampleCount: SAMPLE_COUNT, sampleCount: SAMPLE_COUNT,
warmupCount: WARMUP_COUNT, warmupCount: WARMUP_COUNT,
machineFamily: MACHINE_FAMILY,
}); });
}); });
+2
View File
@@ -27,6 +27,8 @@ const root = join(__dirname, '..');
// remove npm install/build artifacts // remove npm install/build artifacts
rmSync(join(root, 'node_modules'), { recursive: true, force: true }); rmSync(join(root, 'node_modules'), { recursive: true, force: true });
rmSync(join(root, 'bundle'), { recursive: true, force: true }); rmSync(join(root, 'bundle'), { recursive: true, force: true });
rmSync(join(root, '.tmp-perf-baselines.json'), { force: true });
rmSync(join(root, '.tmp-memory-baselines.json'), { force: true });
rmSync(join(root, 'packages/cli/src/generated/'), { rmSync(join(root, 'packages/cli/src/generated/'), {
recursive: true, recursive: true,
force: true, force: true,
+118
View File
@@ -0,0 +1,118 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { execSync } from 'node:child_process';
import path from 'node:path';
const type = process.argv[2]; // 'perf' or 'memory'
const args = process.argv.slice(3);
if (type !== 'perf' && type !== 'memory') {
console.error('Invalid test type. Must be "perf" or "memory".');
process.exit(1);
}
const isLocal = !process.env.CI && !process.env.GITHUB_ACTIONS;
const noOptions = args.length === 0;
const testDir = type === 'perf' ? './perf-tests' : './memory-tests';
const updateEnv =
type === 'perf'
? 'UPDATE_PERF_BASELINES=true'
: 'UPDATE_MEMORY_BASELINES=true';
const tempBaselinesPath = path.resolve(
process.cwd(),
`.tmp-${type}-baselines.json`,
);
if (isLocal && noOptions) {
console.log(
`[Auto-Baseline] Detected local run without options for ${type} tests.`,
);
console.log('[Auto-Baseline] Updating baselines from main branch first...');
let originalBranch = '';
let isDirty = false;
try {
originalBranch = execSync('git rev-parse --abbrev-ref HEAD', {
encoding: 'utf-8',
}).trim();
const status = execSync('git status --porcelain', {
encoding: 'utf-8',
}).trim();
isDirty = status !== '';
if (isDirty) {
console.log('[Auto-Baseline] Stashing current changes...');
execSync('git stash push --include-untracked -m "temp-perf-test-run"');
}
console.log('[Auto-Baseline] Switching to main branch...');
execSync('git checkout main', { stdio: 'inherit' });
try {
console.log(
'[Auto-Baseline] Pulling latest changes for main from origin...',
);
execSync('git pull origin main', { stdio: 'inherit' });
} catch {
console.warn(
'[Auto-Baseline] Warning: git pull failed. Proceeding with local main branch.',
);
}
console.log(
`[Auto-Baseline] Running update baselines for ${type} tests on main...`,
);
execSync(
`npx cross-env ${updateEnv} TEMP_BASELINES_PATH=${tempBaselinesPath} npx vitest run --root ${testDir}`,
{ stdio: 'inherit' },
);
} catch (err) {
console.error(
'[Auto-Baseline] Error during main-branch baseline update:',
err,
);
} finally {
if (originalBranch) {
console.log(
`[Auto-Baseline] Returning to original branch: ${originalBranch}...`,
);
try {
execSync(`git checkout ${originalBranch}`, { stdio: 'inherit' });
if (isDirty) {
console.log('[Auto-Baseline] Restoring stashed changes...');
execSync('git stash pop', { stdio: 'inherit' });
}
} catch {
console.error(
'[Auto-Baseline] Critical error while trying to restore original branch state.',
);
}
}
}
console.log(
`[Auto-Baseline] Running tests on branch ${originalBranch} against updated baselines...`,
);
try {
execSync(
`npx cross-env TEMP_BASELINES_PATH=${tempBaselinesPath} npx vitest run --root ${testDir}`,
{ stdio: 'inherit' },
);
} catch {
process.exit(1);
}
} else {
// Just run standard tests directly
const command = `npx vitest run --root ${testDir} ${args.join(' ')}`;
console.log(`[Standard] Running tests: ${command}`);
try {
execSync(command, { stdio: 'inherit' });
} catch {
process.exit(1);
}
}