test(perf): overhaul performance and memory baseline management

Comprehensive automation upgrades for performance and memory baselines. Includes GitHub Actions workflows for remote updates, automatic local comparisons against main, and git-ignored temporary baselines.

- Added update-baselines.yml GitHub Action to automate remote baseline updates in CI.
- Created scripts/run-perf-tests.js to wrap performance runs, stashing uncommitted changes and generating main-branch baselines locally when run without arguments.
- Enhanced PerfTestHarness and MemoryTestHarness to support per-machine-family baselines and tolerance assertions.
- Updated test files to read the TEMP_BASELINES_PATH environment variable, keeping tracked files clean during local runs.
- Added docs/performance-and-memory-testing.md as central documentation for the testing strategy.
- Removed the now-redundant perf-tests/README.md and memory-tests/README.md.
- Registered temporary baseline outputs in .gitignore and updated scripts/clean.js to remove them on npm run clean.
This commit is contained in:
Sri Pasumarthi
2026-04-16 16:10:31 -07:00
parent daf5006237
commit 6355e2d8a1
17 changed files with 650 additions and 136 deletions
+14 -2
View File
@@ -10,9 +10,21 @@ permissions:
jobs: jobs:
memory-test: memory-test:
name: 'Run Memory Usage Tests' name: 'Run Memory Usage Tests (${{ matrix.machine_family }})'
runs-on: 'gemini-cli-ubuntu-16-core'
if: "github.repository == 'google-gemini/gemini-cli'" if: "github.repository == 'google-gemini/gemini-cli'"
strategy:
fail-fast: false
matrix:
include:
- runs_on: 'gemini-cli-ubuntu-16-core'
machine_family: 'gemini-cli-ubuntu-16-core'
- runs_on: 'macos-latest'
machine_family: 'macos-latest'
- runs_on: 'gemini-cli-windows-16-core'
machine_family: 'gemini-cli-windows-16-core'
runs-on: '${{ matrix.runs_on }}'
env:
MEMORY_MACHINE_FAMILY: '${{ matrix.machine_family }}'
steps: steps:
- name: 'Checkout' - name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
+14 -2
View File
@@ -10,9 +10,21 @@ permissions:
jobs: jobs:
perf-test: perf-test:
name: 'Run Performance Usage Tests' name: 'Run Performance Tests (${{ matrix.machine_family }})'
runs-on: 'gemini-cli-ubuntu-16-core'
if: "github.repository == 'google-gemini/gemini-cli'" if: "github.repository == 'google-gemini/gemini-cli'"
strategy:
fail-fast: false
matrix:
include:
- runs_on: 'gemini-cli-ubuntu-16-core'
machine_family: 'gemini-cli-ubuntu-16-core'
- runs_on: 'macos-latest'
machine_family: 'macos-latest'
- runs_on: 'gemini-cli-windows-16-core'
machine_family: 'gemini-cli-windows-16-core'
runs-on: '${{ matrix.runs_on }}'
env:
PERF_MACHINE_FAMILY: '${{ matrix.machine_family }}'
steps: steps:
- name: 'Checkout' - name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
+243
View File
@@ -0,0 +1,243 @@
# Copyright 2026 Google LLC
# SPDX-License-Identifier: Apache-2.0
#
# Update Perf/Memory Baselines
#
# Triggered by:
#   1. A PR comment starting with one of:
#        /run perf     — updates only perf baselines
#        /run mem      — updates only memory baselines
#        /run perf+mem — updates both (default)
#   2. Manual workflow_dispatch from the Actions tab.
#
# Both paths are gated behind the 'perf-approvers' GitHub environment,
# which requires approval from the designated approvers group before the
# matrix runners are provisioned.
#
# After all per-platform runs complete, the updated baseline JSON files
# are committed back to the triggering branch automatically.

name: 'Update Perf/Memory Baselines'

on:
  issue_comment:
    types: ['created']
  workflow_dispatch:
    inputs:
      test_type:
        description: 'Which baselines to update'
        required: true
        default: 'perf+mem'
        type: 'choice'
        options:
          - 'perf'
          - 'mem'
          - 'perf+mem'
      ref:
        description: 'Branch/SHA to checkout and update baselines on (default: main)'
        required: false
        default: 'main'

permissions:
  contents: 'write' # push the updated baseline commit
  pull-requests: 'write' # post the result comment
  issues: 'read'

jobs:
  # ── 1. Parse slash command / workflow_dispatch ──────────────────────────
  parse-command:
    name: 'Parse Command'
    runs-on: 'gemini-cli-ubuntu-16-core'
    if: |
      github.repository == 'google-gemini/gemini-cli' && (
        github.event_name == 'workflow_dispatch' || (
          github.event_name == 'issue_comment' &&
          github.event.issue.pull_request != null &&
          (
            startsWith(github.event.comment.body, '/run perf+mem') ||
            startsWith(github.event.comment.body, '/run perf') ||
            startsWith(github.event.comment.body, '/run mem')
          )
        )
      )
    outputs:
      test_type: '${{ steps.parse.outputs.test_type }}'
      ref: '${{ steps.parse.outputs.ref }}'
      pr_number: '${{ steps.parse.outputs.pr_number }}'
    steps:
      - name: 'Parse inputs'
        id: 'parse'
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
          # This job never checks out sources, so gh needs the repo explicitly.
          GH_REPO: '${{ github.repository }}'
          COMMENT_BODY: '${{ github.event.comment.body }}'
          DISPATCH_TEST_TYPE: '${{ inputs.test_type }}'
          DISPATCH_REF: '${{ inputs.ref }}'
          PR_NUMBER: '${{ github.event.issue.number }}'
          EVENT_NAME: '${{ github.event_name }}'
        run: |
          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            TEST_TYPE="${DISPATCH_TEST_TYPE:-perf+mem}"
            REF="${DISPATCH_REF:-main}"
            echo "pr_number=" >> "$GITHUB_OUTPUT"
          else
            # Slash command: determine test_type from comment
            if echo "$COMMENT_BODY" | grep -q "^/run perf+mem"; then
              TEST_TYPE="perf+mem"
            elif echo "$COMMENT_BODY" | grep -q "^/run perf"; then
              TEST_TYPE="perf"
            else
              TEST_TYPE="mem"
            fi
            # Get the head branch *name* of the PR (not a SHA) so the
            # baseline commit can later be pushed back to that branch.
            REF=$(gh pr view "$PR_NUMBER" --json headRefName --jq '.headRefName')
            echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
          fi
          echo "test_type=${TEST_TYPE}" >> "$GITHUB_OUTPUT"
          echo "ref=${REF}" >> "$GITHUB_OUTPUT"
      - name: 'Post acknowledgement comment on PR'
        if: "steps.parse.outputs.pr_number != ''"
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
          # No checkout in this job — point gh at the repo explicitly.
          GH_REPO: '${{ github.repository }}'
        run: |
          gh pr comment "${{ steps.parse.outputs.pr_number }}" --body \
            "⏳ **Baseline update requested** (\`${{ steps.parse.outputs.test_type }}\`).
          A member of the \`perf-approvers\` group must approve this workflow before the runners start.
          <!-- baseline-update-ack -->"

  # ── 2. Approval gate (perf-approvers environment) ──────────────────────
  await-approval:
    name: 'Await perf-approvers Approval'
    needs: 'parse-command'
    # This environment requires manual approval from the perf-approvers group
    # before GitHub provisions any of the downstream runners.
    environment: 'perf-approvers'
    runs-on: 'gemini-cli-ubuntu-16-core'
    steps:
      - name: 'Approved'
        run: 'echo "Approved by perf-approvers — launching baseline update matrix."'

  # ── 3. Run tests to capture fresh baselines on each platform ───────────
  update-baselines:
    name: 'Update Baselines (${{ matrix.machine_family }})'
    # parse-command must be listed directly: GitHub Actions only exposes
    # outputs of jobs named in `needs`, not of transitive dependencies.
    needs:
      - 'parse-command'
      - 'await-approval'
    strategy:
      fail-fast: false
      matrix:
        include:
          - runs_on: 'gemini-cli-ubuntu-16-core'
            machine_family: 'gemini-cli-ubuntu-16-core'
          - runs_on: 'macos-latest'
            machine_family: 'macos-latest'
          - runs_on: 'gemini-cli-windows-16-core'
            machine_family: 'gemini-cli-windows-16-core'
    runs-on: '${{ matrix.runs_on }}'
    steps:
      - name: 'Checkout'
        uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
        with:
          ref: '${{ needs.parse-command.outputs.ref }}'
          # Need full history so we can push back
          fetch-depth: 0
      - name: 'Set up Node.js'
        uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
        with:
          node-version-file: '.nvmrc'
          cache: 'npm'
      - name: 'Install dependencies'
        run: 'npm ci'
      - name: 'Build project'
        run: 'npm run build'
      - name: 'Update Perf Baselines'
        if: "contains(needs.parse-command.outputs.test_type, 'perf')"
        run: 'npm run test:perf:update-baselines'
        env:
          PERF_MACHINE_FAMILY: '${{ matrix.machine_family }}'
      - name: 'Update Memory Baselines'
        if: "contains(needs.parse-command.outputs.test_type, 'mem')"
        run: 'npm run test:memory:update-baselines'
        env:
          MEMORY_MACHINE_FAMILY: '${{ matrix.machine_family }}'
      - name: 'Upload updated baseline files'
        uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
        with:
          name: 'baselines-${{ matrix.machine_family }}'
          # Upload the entire baselines/ subdirectories from both test roots
          path: |
            perf-tests/baselines/
            memory-tests/baselines/
          if-no-files-found: 'warn'

  # ── 4. Gather artifacts and commit everything back to the branch ────────
  commit-baselines:
    name: 'Commit Updated Baselines'
    needs:
      - 'parse-command'
      - 'update-baselines'
    runs-on: 'gemini-cli-ubuntu-16-core'
    steps:
      - name: 'Checkout'
        uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
        with:
          ref: '${{ needs.parse-command.outputs.ref }}'
          fetch-depth: 0
      - name: 'Download all baseline artifacts'
        uses: 'actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093' # ratchet:actions/download-artifact@v4
        with:
          # Merge each per-platform artifact into the working tree so the
          # extracted paths mirror the test directory layout.
          pattern: 'baselines-*'
          merge-multiple: true
          path: '.'
      - name: 'Commit and push'
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          # Stage only the per-platform baseline files (not the generic ones)
          git add perf-tests/baselines/ memory-tests/baselines/ || true
          if git diff --cached --quiet; then
            echo "No baseline files changed — nothing to commit."
          else
            git commit -m "chore: update ${{ needs.parse-command.outputs.test_type }} baselines [skip ci]

          Updated by 'Update Perf/Memory Baselines' workflow run #${{ github.run_id }}.
          Platforms: gemini-cli-ubuntu-16-core, macos-latest, gemini-cli-windows-16-core"
            git push
          fi
      - name: 'Post result comment on PR'
        if: "needs.parse-command.outputs.pr_number != ''"
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
        run: |
          # Remove the acknowledgement comment before posting the result
          COMMENT_ID=$(gh pr view "${{ needs.parse-command.outputs.pr_number }}" \
            --json comments \
            --jq '.comments[] | select(.body | contains("<!-- baseline-update-ack -->")) | .url' \
            | grep -oE '[0-9]+$' | head -n 1)
          if [ -n "$COMMENT_ID" ]; then
            gh api -X DELETE "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}"
          fi
          gh pr comment "${{ needs.parse-command.outputs.pr_number }}" --body \
            "✅ **Baselines updated** (\`${{ needs.parse-command.outputs.test_type }}\`).
          Fresh per-platform baseline files have been committed to this branch for:
          - \`gemini-cli-ubuntu-16-core\`
          - \`macos-latest\`
          - \`gemini-cli-windows-16-core\`
          The nightly tests will now compare against these values.
          <!-- baseline-update-result -->"
+2
View File
@@ -3,6 +3,8 @@
.env~ .env~
# gemini-cli settings # gemini-cli settings
.tmp-perf-baselines.json
.tmp-memory-baselines.json
# We want to keep the .gemini in the root of the repo and ignore any .gemini # We want to keep the .gemini in the root of the repo and ignore any .gemini
# in subdirectories. In our root .gemini we want to allow for version control # in subdirectories. In our root .gemini we want to allow for version control
# for subcommands. # for subcommands.
+110
View File
@@ -0,0 +1,110 @@
# Performance & Memory Testing Infrastructure
## Overview
Gemini CLI includes a performance and memory regression testing pipeline. To
reduce noise and yield accurate results, the harness applies:
- **IQR Outlier Filtering**: Discards anomalous samples before evaluation.
- **Median Sampling**: Runs each scenario `N` times and compares the median of
  the remaining samples.
- **Warmup Runs**: Discards the first samples to mitigate JIT compilation
  artifacts.
- **Tolerance Boundary**: A default 15% tolerance prevents failures on minor
  run-to-run variation.
---
## Baseline Management
There are two core strategies for calibrating tolerances on performance
benchmarks:
- **Approach A: Normalize for Testing Servers**: Tests run directly on the
automated cloud servers, and those scores are recorded as official, static
baselines.
- **Approach B: Machine-Agnostic Daily Comparisons**: Static baseline files are
ignored. Every night, the test is run against today's and yesterday's code on
the exact same server.
### Recommended Strategy: GitHub Action + Approach A
#### Local Development & PR Checks
- **Local Testing**: If you are a developer wanting to quickly test your code
  changes for performance or memory impact, run the standard local perf or
  memory tests without arguments. The wrapper automatically stashes
  uncommitted changes, regenerates baselines from the latest `main` branch
  into untracked temporary files, and reports a comparison immediately.
- **PR Merges**: If your changes intentionally require new baseline values,
  trigger the GitHub Action to recalibrate the baselines when merging your
  PR, so that subsequent nightly audits compare against the updated values.
#### Nightly Build Health Audits
- Strict Approach A procedures run daily across platforms on dedicated
  environments, avoiding the "boiling frog" problem where micro-regressions
  quietly accumulate over time.
---
## Running Tests
### Performance CPU Tests
```bash
# Run tests (compare against committed baselines)
npm run test:perf
# Verbose output
VERBOSE=true npm run test:perf
# Keep test artifacts for debugging
KEEP_OUTPUT=true npm run test:perf
```
### Memory Tests
```bash
# Run memory tests (compare against local main baselines)
npm run test:memory
```
---
## Architecture & Configuration
### Performance Tests Directory Tree
- `perf-tests/baselines.json`: Committed baseline values
- `perf-tests/globalSetup.ts`: Test environment setup
- `perf-tests/perf-usage.test.ts`: Test scenarios
- `perf-tests/perf.*.responses`: Fake API responses per scenario
### Memory Tests Directory Tree
- `memory-tests/baselines.json`: Committed memory values
- `memory-tests/memory-usage.test.ts`: Memory test scenarios
---
## CI Integration
These tests are excluded from `preflight` and are intended to run only in the
nightly audits:
```yaml
- name: Performance regression tests
run: npm run test:perf
```
---
## Adding New Scenarios
1. Add a fake response file: `perf.<scenario-name>.responses` or
`memory.<scenario-name>.responses`.
2. Add a test case in `perf-usage.test.ts` or `memory-usage.test.ts` applying
`harness.runScenario()`.
View File
+8 -2
View File
@@ -5,7 +5,11 @@
*/ */
import { describe, it, beforeAll, afterAll, afterEach } from 'vitest'; import { describe, it, beforeAll, afterAll, afterEach } from 'vitest';
import { TestRig, MemoryTestHarness } from '@google/gemini-cli-test-utils'; import {
TestRig,
MemoryTestHarness,
resolveMemoryBaselinesPath,
} from '@google/gemini-cli-test-utils';
import { join, dirname } from 'node:path'; import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url'; import { fileURLToPath } from 'node:url';
import { import {
@@ -19,7 +23,8 @@ import {
import { randomUUID } from 'node:crypto'; import { randomUUID } from 'node:crypto';
const __dirname = dirname(fileURLToPath(import.meta.url)); const __dirname = dirname(fileURLToPath(import.meta.url));
const BASELINES_PATH = join(__dirname, 'baselines.json'); const MACHINE_FAMILY = process.env['MEMORY_MACHINE_FAMILY'];
const BASELINES_PATH = resolveMemoryBaselinesPath(__dirname, MACHINE_FAMILY);
const UPDATE_BASELINES = process.env['UPDATE_MEMORY_BASELINES'] === 'true'; const UPDATE_BASELINES = process.env['UPDATE_MEMORY_BASELINES'] === 'true';
const TOLERANCE_PERCENT = 10; const TOLERANCE_PERCENT = 10;
@@ -37,6 +42,7 @@ describe('Memory Usage Tests', () => {
gcCycles: 3, gcCycles: 3,
gcDelayMs: 100, gcDelayMs: 100,
sampleCount: 3, sampleCount: 3,
machineFamily: MACHINE_FAMILY,
}); });
}); });
+2 -2
View File
@@ -51,9 +51,9 @@
"test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman", "test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman",
"test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none", "test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none",
"test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests", "test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests",
"test:memory": "vitest run --root ./memory-tests", "test:memory": "node scripts/run-perf-tests.js memory",
"test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests", "test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests",
"test:perf": "vitest run --root ./perf-tests", "test:perf": "node scripts/run-perf-tests.js perf",
"test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests", "test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests",
"test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests", "test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests",
"test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests", "test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests",
+1 -1
View File
@@ -1,6 +1,6 @@
/** /**
* @license * @license
* Copyright 2025 Google LLC * Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@@ -5,6 +5,7 @@
*/ */
import { readFileSync, writeFileSync, existsSync } from 'node:fs'; import { readFileSync, writeFileSync, existsSync } from 'node:fs';
import { join } from 'node:path';
/** /**
* Baseline entry for a single memory test scenario. * Baseline entry for a single memory test scenario.
@@ -77,3 +78,25 @@ export function updateBaseline(
}; };
saveBaselines(path, baselines); saveBaselines(path, baselines);
} }
/**
 * Resolve the path to the correct memory baselines JSON file.
 *
 * When `machineFamily` is given, the per-machine file
 * `<testRootDir>/baselines/<machineFamily>.json` is returned. That file may
 * not exist yet; the harness hard-fails at assertion time if it is missing.
 * Without `machineFamily`, the legacy generic `<testRootDir>/baselines.json`
 * (used for local development) is returned instead.
 *
 * @param testRootDir - Absolute path to the directory containing the test root
 *   (e.g. `__dirname` inside `memory-tests/`).
 * @param machineFamily - Optional CI runner label (e.g. `'gemini-cli-ubuntu-16-core'`).
 */
export function resolveMemoryBaselinesPath(
  testRootDir: string,
  machineFamily?: string,
): string {
  return machineFamily
    ? join(testRootDir, 'baselines', `${machineFamily}.json`)
    : join(testRootDir, 'baselines.json');
}
+38 -1
View File
@@ -6,6 +6,8 @@
import v8 from 'node:v8'; import v8 from 'node:v8';
import { setTimeout as sleep } from 'node:timers/promises'; import { setTimeout as sleep } from 'node:timers/promises';
import { mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { loadBaselines, updateBaseline } from './memory-baselines.js'; import { loadBaselines, updateBaseline } from './memory-baselines.js';
import type { MemoryBaseline, MemoryBaselineFile } from './memory-baselines.js'; import type { MemoryBaseline, MemoryBaselineFile } from './memory-baselines.js';
@@ -66,6 +68,14 @@ export interface MemoryTestHarnessOptions {
sampleCount?: number; sampleCount?: number;
/** Pause in ms between samples. Default: 50 */ /** Pause in ms between samples. Default: 50 */
samplePauseMs?: number; samplePauseMs?: number;
/**
* The CI machine family (e.g. 'gemini-cli-ubuntu-16-core').
* When set, baselines are loaded from and saved to
* `<dir>/baselines/<machineFamily>.json`. If the file does not exist and
* UPDATE_MEMORY_BASELINES is not set, tests hard-fail with an actionable
* message instead of silently falling back.
*/
machineFamily?: string;
} }
/** /**
@@ -85,6 +95,7 @@ export class MemoryTestHarness {
private readonly gcDelayMs: number; private readonly gcDelayMs: number;
private readonly sampleCount: number; private readonly sampleCount: number;
private readonly samplePauseMs: number; private readonly samplePauseMs: number;
private readonly machineFamily?: string;
private allResults: MemoryTestResult[] = []; private allResults: MemoryTestResult[] = [];
constructor(options: MemoryTestHarnessOptions) { constructor(options: MemoryTestHarnessOptions) {
@@ -94,6 +105,7 @@ export class MemoryTestHarness {
this.gcDelayMs = options.gcDelayMs ?? 100; this.gcDelayMs = options.gcDelayMs ?? 100;
this.sampleCount = options.sampleCount ?? 3; this.sampleCount = options.sampleCount ?? 3;
this.samplePauseMs = options.samplePauseMs ?? 50; this.samplePauseMs = options.samplePauseMs ?? 50;
this.machineFamily = options.machineFamily;
this.baselines = loadBaselines(this.baselinesPath); this.baselines = loadBaselines(this.baselinesPath);
} }
@@ -240,6 +252,16 @@ export class MemoryTestHarness {
const tolerance = tolerancePercent ?? this.defaultTolerancePercent; const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
if (!result.baseline) { if (!result.baseline) {
if (this.machineFamily) {
throw new Error(
`No baseline found for scenario "${result.scenarioName}" on machine family "${this.machineFamily}".\n` +
` Expected file: ${this.baselinesPath}\n` +
` To create it, trigger the 'Update Baselines' workflow:\n` +
` .github/workflows/update-baselines.yml\n` +
` Or locally:\n` +
` UPDATE_MEMORY_BASELINES=true MEMORY_MACHINE_FAMILY=${this.machineFamily} npm run test:memory`,
);
}
console.warn( console.warn(
`⚠ No baseline found for "${result.scenarioName}". ` + `⚠ No baseline found for "${result.scenarioName}". ` +
`Run with UPDATE_MEMORY_BASELINES=true to create one. ` + `Run with UPDATE_MEMORY_BASELINES=true to create one. ` +
@@ -268,9 +290,21 @@ export class MemoryTestHarness {
/** /**
* Update the baseline for a scenario with the current measured values. * Update the baseline for a scenario with the current measured values.
* When `machineFamily` is set, writes to `baselines/<machineFamily>.json`
* (creating the directory if needed). Otherwise writes to `baselinesPath`.
*/ */
updateScenarioBaseline(result: MemoryTestResult): void { updateScenarioBaseline(result: MemoryTestResult): void {
updateBaseline(this.baselinesPath, result.scenarioName, { const targetPath = this.machineFamily
? join(
dirname(this.baselinesPath),
'baselines',
`${this.machineFamily}.json`,
)
: this.baselinesPath;
if (this.machineFamily) {
mkdirSync(dirname(targetPath), { recursive: true });
}
updateBaseline(targetPath, result.scenarioName, {
heapUsedBytes: result.finalHeapUsed, heapUsedBytes: result.finalHeapUsed,
heapTotalBytes: heapTotalBytes:
result.snapshots[result.snapshots.length - 1]?.heapTotal ?? 0, result.snapshots[result.snapshots.length - 1]?.heapTotal ?? 0,
@@ -391,6 +425,9 @@ export class MemoryTestHarness {
lines.push(''); lines.push('');
lines.push('═══════════════════════════════════════════════════'); lines.push('═══════════════════════════════════════════════════');
lines.push(' MEMORY USAGE TEST REPORT'); lines.push(' MEMORY USAGE TEST REPORT');
if (this.machineFamily) {
lines.push(` Machine family: ${this.machineFamily}`);
}
lines.push('═══════════════════════════════════════════════════'); lines.push('═══════════════════════════════════════════════════');
lines.push(''); lines.push('');
+67 -3
View File
@@ -6,7 +6,8 @@
import { performance } from 'node:perf_hooks'; import { performance } from 'node:perf_hooks';
import { setTimeout as sleep } from 'node:timers/promises'; import { setTimeout as sleep } from 'node:timers/promises';
import { readFileSync, writeFileSync, existsSync } from 'node:fs'; import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
/** Configuration for asciichart plot function. */ /** Configuration for asciichart plot function. */
interface PlotConfig { interface PlotConfig {
@@ -83,6 +84,14 @@ export interface PerfTestHarnessOptions {
warmupCount?: number; warmupCount?: number;
/** Pause in ms between samples. Default: 100 */ /** Pause in ms between samples. Default: 100 */
samplePauseMs?: number; samplePauseMs?: number;
/**
* The CI machine family (e.g. 'gemini-cli-ubuntu-16-core').
* When set, baselines are loaded from and saved to
* `<dir>/baselines/<machineFamily>.json`. If the file does not exist and
* UPDATE_PERF_BASELINES is not set, tests hard-fail with an actionable
* message instead of silently falling back.
*/
machineFamily?: string;
} }
/** /**
@@ -114,6 +123,7 @@ export class PerfTestHarness {
private readonly sampleCount: number; private readonly sampleCount: number;
private readonly warmupCount: number; private readonly warmupCount: number;
private readonly samplePauseMs: number; private readonly samplePauseMs: number;
private readonly machineFamily?: string;
private allResults: PerfTestResult[] = []; private allResults: PerfTestResult[] = [];
private activeTimers: Map<string, ActiveTimer> = new Map(); private activeTimers: Map<string, ActiveTimer> = new Map();
@@ -124,6 +134,7 @@ export class PerfTestHarness {
this.sampleCount = options.sampleCount ?? 5; this.sampleCount = options.sampleCount ?? 5;
this.warmupCount = options.warmupCount ?? 1; this.warmupCount = options.warmupCount ?? 1;
this.samplePauseMs = options.samplePauseMs ?? 100; this.samplePauseMs = options.samplePauseMs ?? 100;
this.machineFamily = options.machineFamily;
this.baselines = loadPerfBaselines(this.baselinesPath); this.baselines = loadPerfBaselines(this.baselinesPath);
} }
@@ -284,6 +295,18 @@ export class PerfTestHarness {
const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent; const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent;
if (!result.baseline) { if (!result.baseline) {
if (this.machineFamily) {
// In CI with a declared machine family: hard-fail so the problem is
// immediately visible, rather than silently skipping the assertion.
throw new Error(
`No baseline found for scenario "${result.scenarioName}" on machine family "${this.machineFamily}".\n` +
` Expected file: ${this.baselinesPath}\n` +
` To create it, trigger the 'Update Baselines' workflow:\n` +
` .github/workflows/update-baselines.yml\n` +
` Or locally:\n` +
` UPDATE_PERF_BASELINES=true PERF_MACHINE_FAMILY=${this.machineFamily} npm run test:perf`,
);
}
console.warn( console.warn(
`⚠ No baseline found for "${result.scenarioName}". ` + `⚠ No baseline found for "${result.scenarioName}". ` +
`Run with UPDATE_PERF_BASELINES=true to create one. ` + `Run with UPDATE_PERF_BASELINES=true to create one. ` +
@@ -321,16 +344,30 @@ export class PerfTestHarness {
/** /**
* Update the baseline for a scenario with the current measured values. * Update the baseline for a scenario with the current measured values.
* When `machineFamily` is set, writes to `baselines/<machineFamily>.json`
* (creating the directory if needed). Otherwise writes to `baselinesPath`.
*/ */
updateScenarioBaseline(result: PerfTestResult): void { updateScenarioBaseline(result: PerfTestResult): void {
updatePerfBaseline(this.baselinesPath, result.scenarioName, { const targetPath = this.machineFamily
? join(
dirname(this.baselinesPath),
'baselines',
`${this.machineFamily}.json`,
)
: this.baselinesPath;
// Ensure the baselines/ subdirectory exists
if (this.machineFamily) {
mkdirSync(dirname(targetPath), { recursive: true });
}
updatePerfBaseline(targetPath, result.scenarioName, {
wallClockMs: result.median.wallClockMs, wallClockMs: result.median.wallClockMs,
cpuTotalUs: result.median.cpuTotalUs, cpuTotalUs: result.median.cpuTotalUs,
}); });
// Reload baselines after update // Reload baselines after update
this.baselines = loadPerfBaselines(this.baselinesPath); this.baselines = loadPerfBaselines(this.baselinesPath);
console.log( console.log(
`Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms`, `Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms` +
(this.machineFamily ? ` [${this.machineFamily}]` : ''),
); );
} }
@@ -344,6 +381,9 @@ export class PerfTestHarness {
lines.push(''); lines.push('');
lines.push('═══════════════════════════════════════════════════'); lines.push('═══════════════════════════════════════════════════');
lines.push(' PERFORMANCE TEST REPORT'); lines.push(' PERFORMANCE TEST REPORT');
if (this.machineFamily) {
lines.push(` Machine family: ${this.machineFamily}`);
}
lines.push('═══════════════════════════════════════════════════'); lines.push('═══════════════════════════════════════════════════');
lines.push(''); lines.push('');
@@ -484,6 +524,30 @@ export class PerfTestHarness {
} }
} }
// ─── Baseline path resolution ────────────────────────────────────────

/**
 * Resolve the path to the correct perf baselines JSON file.
 *
 * When `machineFamily` is given, the per-machine file
 * `<testRootDir>/baselines/<machineFamily>.json` is returned. That file may
 * not exist yet; the harness hard-fails at assertion time if it is missing.
 * Without `machineFamily`, the legacy generic `<testRootDir>/baselines.json`
 * (used for local development) is returned instead.
 *
 * @param testRootDir - Absolute path to the directory containing the test root
 *   (e.g. `__dirname` inside `perf-tests/`).
 * @param machineFamily - Optional CI runner label (e.g. `'gemini-cli-ubuntu-16-core'`).
 */
export function resolvePerfBaselinesPath(
  testRootDir: string,
  machineFamily?: string,
): string {
  return machineFamily
    ? join(testRootDir, 'baselines', `${machineFamily}.json`)
    : join(testRootDir, 'baselines.json');
}
// ─── Baseline management ───────────────────────────────────────────── // ─── Baseline management ─────────────────────────────────────────────
/** /**
-121
View File
@@ -1,121 +0,0 @@
# CPU Performance Integration Test Harness
## Overview
This directory contains performance/CPU integration tests for the Gemini CLI.
These tests measure wall-clock time, CPU usage, and event loop responsiveness to
detect regressions across key scenarios.
CPU performance is inherently noisy, especially in CI. The harness addresses
this with:
- **IQR outlier filtering** — discards anomalous samples
- **Median sampling** — takes N runs, reports the median after filtering
- **Warmup runs** — discards the first run to mitigate JIT compilation noise
- **15% default tolerance** — tolerates minor run-to-run variation without failing
## Running
```bash
# Run tests (compare against committed baselines)
npm run test:perf
# Update baselines (after intentional changes)
npm run test:perf:update-baselines
# Verbose output
VERBOSE=true npm run test:perf
# Keep test artifacts for debugging
KEEP_OUTPUT=true npm run test:perf
```
## How It Works
### Measurement Primitives
The `PerfTestHarness` class (in `packages/test-utils`) provides:
- **`performance.now()`** — high-resolution wall-clock timing
- **`process.cpuUsage()`** — user + system CPU microseconds (delta between
start/stop)
- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram
(p50/p95/p99/max)
### Noise Reduction
1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts
2. **Multiple samples**: Each scenario runs N times (default 5)
3. **IQR filtering**: Samples outside Q1−1.5×IQR and Q3+1.5×IQR are discarded
4. **Median**: The median of remaining samples is used for comparison
### Baseline Management
Baselines are stored in `baselines.json` in this directory. Each scenario has:
```json
{
"cold-startup-time": {
"wallClockMs": 1234.5,
"cpuTotalUs": 567890,
"eventLoopDelayP99Ms": 12.3,
"timestamp": "2026-04-08T..."
}
}
```
Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance).
To recalibrate after intentional changes:
```bash
npm run test:perf:update-baselines
# then commit baselines.json
```
### Report Output
After all tests, the harness prints an ASCII summary:
```
═══════════════════════════════════════════════════
PERFORMANCE TEST REPORT
═══════════════════════════════════════════════════
cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅
idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅
skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅
```
## Architecture
```
perf-tests/
├── README.md ← you are here
├── baselines.json ← committed baseline values
├── globalSetup.ts ← test environment setup
├── perf-usage.test.ts ← test scenarios
├── perf.*.responses ← fake API responses per scenario
├── tsconfig.json ← TypeScript config
└── vitest.config.ts ← vitest config (serial, isolated)
packages/test-utils/src/
├── perf-test-harness.ts ← PerfTestHarness class
└── index.ts ← re-exports
```
## CI Integration
These tests are **excluded from `preflight`** and designed for nightly CI:
```yaml
- name: Performance regression tests
run: npm run test:perf
```
## Adding a New Scenario
1. Add a fake response file: `perf.<scenario-name>.responses`
2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()`
3. Run `npm run test:perf:update-baselines` to establish initial baseline
4. Commit the updated `baselines.json`
View File
+8 -2
View File
@@ -5,13 +5,18 @@
*/ */
import { describe, it, beforeAll, afterAll } from 'vitest'; import { describe, it, beforeAll, afterAll } from 'vitest';
import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils'; import {
TestRig,
PerfTestHarness,
resolvePerfBaselinesPath,
} from '@google/gemini-cli-test-utils';
import { join, dirname } from 'node:path'; import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url'; import { fileURLToPath } from 'node:url';
import { existsSync, readFileSync } from 'node:fs'; import { existsSync, readFileSync } from 'node:fs';
const __dirname = dirname(fileURLToPath(import.meta.url)); const __dirname = dirname(fileURLToPath(import.meta.url));
const BASELINES_PATH = join(__dirname, 'baselines.json'); const MACHINE_FAMILY = process.env['PERF_MACHINE_FAMILY'];
const BASELINES_PATH = resolvePerfBaselinesPath(__dirname, MACHINE_FAMILY);
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true'; const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
const TOLERANCE_PERCENT = 15; const TOLERANCE_PERCENT = 15;
@@ -28,6 +33,7 @@ describe('CPU Performance Tests', () => {
defaultTolerancePercent: TOLERANCE_PERCENT, defaultTolerancePercent: TOLERANCE_PERCENT,
sampleCount: SAMPLE_COUNT, sampleCount: SAMPLE_COUNT,
warmupCount: WARMUP_COUNT, warmupCount: WARMUP_COUNT,
machineFamily: MACHINE_FAMILY,
}); });
}); });
+2
View File
@@ -27,6 +27,8 @@ const root = join(__dirname, '..');
// remove npm install/build artifacts // remove npm install/build artifacts
rmSync(join(root, 'node_modules'), { recursive: true, force: true }); rmSync(join(root, 'node_modules'), { recursive: true, force: true });
rmSync(join(root, 'bundle'), { recursive: true, force: true }); rmSync(join(root, 'bundle'), { recursive: true, force: true });
rmSync(join(root, '.tmp-perf-baselines.json'), { force: true });
rmSync(join(root, '.tmp-memory-baselines.json'), { force: true });
rmSync(join(root, 'packages/cli/src/generated/'), { rmSync(join(root, 'packages/cli/src/generated/'), {
recursive: true, recursive: true,
force: true, force: true,
+118
View File
@@ -0,0 +1,118 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { execSync } from 'node:child_process';
import path from 'node:path';
const type = process.argv[2]; // 'perf' or 'memory'
const args = process.argv.slice(3);
if (type !== 'perf' && type !== 'memory') {
console.error('Invalid test type. Must be "perf" or "memory".');
process.exit(1);
}
const isLocal = !process.env.CI && !process.env.GITHUB_ACTIONS;
const noOptions = args.length === 0;
const testDir = type === 'perf' ? './perf-tests' : './memory-tests';
const updateEnv =
type === 'perf'
? 'UPDATE_PERF_BASELINES=true'
: 'UPDATE_MEMORY_BASELINES=true';
const tempBaselinesPath = path.resolve(
process.cwd(),
`.tmp-${type}-baselines.json`,
);
if (isLocal && noOptions) {
console.log(
`[Auto-Baseline] Detected local run without options for ${type} tests.`,
);
console.log('[Auto-Baseline] Updating baselines from main branch first...');
let originalBranch = '';
let isDirty = false;
try {
originalBranch = execSync('git rev-parse --abbrev-ref HEAD', {
encoding: 'utf-8',
}).trim();
const status = execSync('git status --porcelain', {
encoding: 'utf-8',
}).trim();
isDirty = status !== '';
if (isDirty) {
console.log('[Auto-Baseline] Stashing current changes...');
execSync('git stash push --include-untracked -m "temp-perf-test-run"');
}
console.log('[Auto-Baseline] Switching to main branch...');
execSync('git checkout main', { stdio: 'inherit' });
try {
console.log(
'[Auto-Baseline] Pulling latest changes for main from origin...',
);
execSync('git pull origin main', { stdio: 'inherit' });
} catch {
console.warn(
'[Auto-Baseline] Warning: git pull failed. Proceeding with local main branch.',
);
}
console.log(
`[Auto-Baseline] Running update baselines for ${type} tests on main...`,
);
execSync(
`npx cross-env ${updateEnv} TEMP_BASELINES_PATH=${tempBaselinesPath} npx vitest run --root ${testDir}`,
{ stdio: 'inherit' },
);
} catch (err) {
console.error(
'[Auto-Baseline] Error during main-branch baseline update:',
err,
);
} finally {
if (originalBranch) {
console.log(
`[Auto-Baseline] Returning to original branch: ${originalBranch}...`,
);
try {
execSync(`git checkout ${originalBranch}`, { stdio: 'inherit' });
if (isDirty) {
console.log('[Auto-Baseline] Restoring stashed changes...');
execSync('git stash pop', { stdio: 'inherit' });
}
} catch {
console.error(
'[Auto-Baseline] Critical error while trying to restore original branch state.',
);
}
}
}
console.log(
`[Auto-Baseline] Running tests on branch ${originalBranch} against updated baselines...`,
);
try {
execSync(
`npx cross-env TEMP_BASELINES_PATH=${tempBaselinesPath} npx vitest run --root ${testDir}`,
{ stdio: 'inherit' },
);
} catch {
process.exit(1);
}
} else {
// Just run standard tests directly
const command = `npx vitest run --root ${testDir} ${args.join(' ')}`;
console.log(`[Standard] Running tests: ${command}`);
try {
execSync(command, { stdio: 'inherit' });
} catch {
process.exit(1);
}
}