diff --git a/.github/workflows/perf-nightly.yml b/.github/workflows/perf-nightly.yml new file mode 100644 index 0000000000..3749df231a --- /dev/null +++ b/.github/workflows/perf-nightly.yml @@ -0,0 +1,33 @@ +name: 'Performance Tests: Nightly' + +on: + schedule: + - cron: '0 3 * * *' # Runs at 3 AM every day + workflow_dispatch: # Allow manual trigger + +permissions: + contents: 'read' + +jobs: + perf-test: + name: 'Run Performance Usage Tests' + runs-on: 'gemini-cli-ubuntu-16-core' + if: "github.repository == 'google-gemini/gemini-cli'" + steps: + - name: 'Checkout' + uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 + + - name: 'Set up Node.js' + uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4 + with: + node-version-file: '.nvmrc' + cache: 'npm' + + - name: 'Install dependencies' + run: 'npm ci' + + - name: 'Build project' + run: 'npm run build' + + - name: 'Run Performance Tests' + run: 'npm run test:perf' diff --git a/.gitignore b/.gitignore index b6e3804ab5..85902b4a7c 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ packages/cli/src/generated/ packages/core/src/generated/ packages/devtools/src/_client-assets.ts .integration-tests/ +.perf-tests/ packages/vscode-ide-companion/*.vsix packages/cli/download-ripgrep*/ diff --git a/GEMINI.md b/GEMINI.md index 60824972d3..4acdfc08be 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -44,8 +44,13 @@ powerful tool for developers. - **Test Commands:** - **Unit (All):** `npm run test` - **Integration (E2E):** `npm run test:e2e` + - > **NOTE**: Please run the memory and perf tests locally **only if** you are + > implementing changes related to those test areas. Otherwise skip these + > tests locally and rely on CI to run them on nightly builds. - **Memory (Nightly):** `npm run test:memory` (Runs memory regression tests against baselines. Excluded from `preflight`, run nightly.) 
+ - **Performance (Nightly):** `npm run test:perf` (Runs CPU performance + regression tests against baselines. Excluded from `preflight`, run nightly.) - **Workspace-Specific:** `npm test -w -- ` (Note: `` must be relative to the workspace root, e.g., `-w @google/gemini-cli-core -- src/routing/modelRouterService.test.ts`) diff --git a/docs/integration-tests.md b/docs/integration-tests.md index bfed813ebc..ddd4eb9c73 100644 --- a/docs/integration-tests.md +++ b/docs/integration-tests.md @@ -157,6 +157,48 @@ The harness (`MemoryTestHarness` in `packages/test-utils`): - Compares against baselines with a 10% tolerance. - Can analyze sustained leaks across 3 snapshots using `analyzeSnapshots()`. +## Performance regression tests + +Performance regression tests are designed to detect wall-clock time, CPU usage, +and event loop delay regressions across key CLI scenarios. They are located in +the `perf-tests` directory. + +These tests are distinct from standard integration tests because they measure +performance metrics and compare it against committed baselines. + +### Running performance tests + +Performance tests are not run as part of the default `npm run test` or +`npm run test:e2e` commands. They are run nightly in CI but can be run manually: + +```bash +npm run test:perf +``` + +### Updating baselines + +If you intentionally change behavior that affects performance, you may need to +update the baselines. Set the `UPDATE_PERF_BASELINES` environment variable to +`true`: + +```bash +UPDATE_PERF_BASELINES=true npm run test:perf +``` + +This will run the tests multiple times (with warmup), apply IQR outlier +filtering, and overwrite `perf-tests/baselines.json`. You should review the +changes and commit the updated baseline file. + +### How it works + +The harness (`PerfTestHarness` in `packages/test-utils`): + +- Measures wall-clock time using `performance.now()`. +- Measures CPU usage using `process.cpuUsage()`. 
+- Monitors event loop delay using `perf_hooks.monitorEventLoopDelay()`. +- Applies IQR (Interquartile Range) filtering to remove outlier samples. +- Compares against baselines with a 15% tolerance. + ## Diagnostics The integration test runner provides several options for diagnostics to help diff --git a/integration-tests/globalSetup.ts b/integration-tests/globalSetup.ts index 9dad51f9b3..4a15d03255 100644 --- a/integration-tests/globalSetup.ts +++ b/integration-tests/globalSetup.ts @@ -14,6 +14,7 @@ import { join, dirname, extname } from 'node:path'; import { fileURLToPath } from 'node:url'; import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js'; import { disableMouseTracking } from '@google/gemini-cli-core'; +import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js'; import { createServer, type Server } from 'node:http'; const __dirname = dirname(fileURLToPath(import.meta.url)); @@ -88,15 +89,8 @@ export async function setup() { runDir = join(integrationTestsDir, `${Date.now()}`); await mkdir(runDir, { recursive: true }); - // Set the home directory to the test run directory to avoid conflicts - // with the user's local config. - process.env['HOME'] = runDir; - if (process.platform === 'win32') { - process.env['USERPROFILE'] = runDir; - } - // We also need to set the config dir explicitly, since the code might - // construct the path before the HOME env var is set. 
- process.env['GEMINI_CONFIG_DIR'] = join(runDir, '.gemini'); + // Isolate environment variables + isolateTestEnv(runDir); // Download ripgrep to avoid race conditions in parallel tests const available = await canUseRipgrep(); @@ -127,10 +121,6 @@ export async function setup() { } process.env['INTEGRATION_TEST_FILE_DIR'] = runDir; - process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true'; - // Force file storage to avoid keychain prompts/hangs in CI, especially on macOS - process.env['GEMINI_FORCE_FILE_STORAGE'] = 'true'; - process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log'); if (process.env['KEEP_OUTPUT']) { console.log(`Keeping output for test run in: ${runDir}`); diff --git a/package-lock.json b/package-lock.json index 2d3e670b74..3a22da1337 100644 --- a/package-lock.json +++ b/package-lock.json @@ -36,6 +36,7 @@ "@types/ws": "^8.18.1", "@vitest/coverage-v8": "^3.1.1", "@vitest/eslint-plugin": "^1.3.4", + "asciichart": "^1.5.25", "cross-env": "^7.0.3", "depcheck": "^1.4.7", "domexception": "^4.0.0", diff --git a/package.json b/package.json index f531b41dbc..77801eaa7b 100644 --- a/package.json +++ b/package.json @@ -53,6 +53,8 @@ "test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests", "test:memory": "vitest run --root ./memory-tests", "test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests", + "test:perf": "vitest run --root ./perf-tests", + "test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests", "test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests", "test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests", "lint": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" eslint . 
--cache --max-warnings 0", @@ -105,6 +107,7 @@ "@types/ws": "^8.18.1", "@vitest/coverage-v8": "^3.1.1", "@vitest/eslint-plugin": "^1.3.4", + "asciichart": "^1.5.25", "cross-env": "^7.0.3", "depcheck": "^1.4.7", "domexception": "^4.0.0", diff --git a/packages/test-utils/src/env-setup.ts b/packages/test-utils/src/env-setup.ts new file mode 100644 index 0000000000..1c5ffd0d21 --- /dev/null +++ b/packages/test-utils/src/env-setup.ts @@ -0,0 +1,35 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { join } from 'node:path'; + +/** + * Isolate the test environment by setting environment variables + * to point to a temporary run directory. + * + * @param runDir - The temporary directory for this test run. + */ +export function isolateTestEnv(runDir: string): void { + // Set the home directory to the test run directory to avoid conflicts + // with the user's local config. + process.env['HOME'] = runDir; + if (process.platform === 'win32') { + process.env['USERPROFILE'] = runDir; + } + + // We also need to set the config dir explicitly, since the code might + // construct the path before the HOME env var is set. 
+ process.env['GEMINI_CONFIG_DIR'] = join(runDir, '.gemini'); + + // Force file storage to avoid keychain prompts/hangs in CI, especially on macOS + process.env['GEMINI_FORCE_FILE_STORAGE'] = 'true'; + + // Mark as integration test + process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true'; + + // Isolate telemetry log + process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log'); +} diff --git a/packages/test-utils/src/index.ts b/packages/test-utils/src/index.ts index 49eaec66d3..e851e7ab8d 100644 --- a/packages/test-utils/src/index.ts +++ b/packages/test-utils/src/index.ts @@ -8,6 +8,8 @@ export * from './file-system-test-helpers.js'; export * from './fixtures/agents.js'; export * from './memory-baselines.js'; export * from './memory-test-harness.js'; +export * from './perf-test-harness.js'; export * from './mock-utils.js'; export * from './test-mcp-server.js'; export * from './test-rig.js'; +export * from './env-setup.js'; diff --git a/packages/test-utils/src/perf-test-harness.ts b/packages/test-utils/src/perf-test-harness.ts new file mode 100644 index 0000000000..c4625077be --- /dev/null +++ b/packages/test-utils/src/perf-test-harness.ts @@ -0,0 +1,546 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { performance } from 'node:perf_hooks'; +import { setTimeout as sleep } from 'node:timers/promises'; +import { readFileSync, writeFileSync, existsSync } from 'node:fs'; + +/** Configuration for asciichart plot function. */ +interface PlotConfig { + height?: number; + format?: (x: number) => string; +} + +/** Type for the asciichart plot function. */ +type PlotFn = (series: number[], config?: PlotConfig) => string; + +/** + * Baseline entry for a single performance test scenario. + */ +export interface PerfBaseline { + wallClockMs: number; + cpuTotalUs: number; + eventLoopDelayP99Ms: number; + timestamp: string; +} + +/** + * Top-level structure of the perf baselines JSON file. 
+ */ +export interface PerfBaselineFile { + version: number; + updatedAt: string; + scenarios: Record<string, PerfBaseline>; +} + +/** + * A single performance snapshot at a point in time. + */ +export interface PerfSnapshot { + timestamp: number; + label: string; + wallClockMs: number; + cpuUserUs: number; + cpuSystemUs: number; + cpuTotalUs: number; + eventLoopDelayP50Ms: number; + eventLoopDelayP95Ms: number; + eventLoopDelayP99Ms: number; + eventLoopDelayMaxMs: number; +} + +/** + * Result from running a performance test scenario. + */ +export interface PerfTestResult { + scenarioName: string; + samples: PerfSnapshot[]; + filteredSamples: PerfSnapshot[]; + median: PerfSnapshot; + baseline: PerfBaseline | undefined; + withinTolerance: boolean; + deltaPercent: number; + cpuDeltaPercent: number; +} + +/** + * Options for the PerfTestHarness. + */ +export interface PerfTestHarnessOptions { + /** Path to the baselines JSON file */ + baselinesPath: string; + /** Default tolerance percentage (0-100). Default: 15 */ + defaultTolerancePercent?: number; + /** Default CPU tolerance percentage (0-100). Optional */ + defaultCpuTolerancePercent?: number; + /** Number of samples per scenario. Default: 5 */ + sampleCount?: number; + /** Number of warmup runs to discard. Default: 1 */ + warmupCount?: number; + /** Pause in ms between samples. Default: 100 */ + samplePauseMs?: number; +} + +/** + * Active timer state tracked internally. + */ +interface ActiveTimer { + label: string; + startTime: number; + startCpuUsage: NodeJS.CpuUsage; +} + +/** + * PerfTestHarness provides infrastructure for running CPU performance tests. 
 + * + * It handles: + * - High-resolution wall-clock timing via performance.now() + * - CPU usage measurement via process.cpuUsage() + * - Event loop delay monitoring via perf_hooks.monitorEventLoopDelay() + * - IQR outlier filtering for noise reduction + * - Warmup runs to avoid JIT compilation noise + * - Comparing against baselines with configurable tolerance + * - Generating ASCII chart reports + */ +export class PerfTestHarness { + private baselines: PerfBaselineFile; + private readonly baselinesPath: string; + private readonly defaultTolerancePercent: number; + private readonly defaultCpuTolerancePercent?: number; + private readonly sampleCount: number; + private readonly warmupCount: number; + private readonly samplePauseMs: number; + private allResults: PerfTestResult[] = []; + private activeTimers: Map<string, ActiveTimer> = new Map(); + + constructor(options: PerfTestHarnessOptions) { + this.baselinesPath = options.baselinesPath; + this.defaultTolerancePercent = options.defaultTolerancePercent ?? 15; + this.defaultCpuTolerancePercent = options.defaultCpuTolerancePercent; + this.sampleCount = options.sampleCount ?? 5; + this.warmupCount = options.warmupCount ?? 1; + this.samplePauseMs = options.samplePauseMs ?? 100; + this.baselines = loadPerfBaselines(this.baselinesPath); + } + + /** + * Start a high-resolution timer with CPU tracking. + */ + startTimer(label: string): void { + this.activeTimers.set(label, { + label, + startTime: performance.now(), + startCpuUsage: process.cpuUsage(), + }); + } + + /** + * Stop a timer and return the snapshot. 
+ */ + stopTimer(label: string): PerfSnapshot { + const timer = this.activeTimers.get(label); + if (!timer) { + throw new Error(`No active timer found for label "${label}"`); + } + + const wallClockMs = performance.now() - timer.startTime; + const cpuDelta = process.cpuUsage(timer.startCpuUsage); + this.activeTimers.delete(label); + + return { + timestamp: Date.now(), + label, + wallClockMs, + cpuUserUs: cpuDelta.user, + cpuSystemUs: cpuDelta.system, + cpuTotalUs: cpuDelta.user + cpuDelta.system, + eventLoopDelayP50Ms: 0, + eventLoopDelayP95Ms: 0, + eventLoopDelayP99Ms: 0, + eventLoopDelayMaxMs: 0, + }; + } + + /** + * Measure a function's wall-clock time and CPU usage. + * Returns the snapshot with timing data. + */ + async measure(label: string, fn: () => Promise<void>): Promise<PerfSnapshot> { + this.startTimer(label); + await fn(); + return this.stopTimer(label); + } + + /** + * Measure a function with event loop delay monitoring. + * Uses perf_hooks.monitorEventLoopDelay() for histogram data. + */ + async measureWithEventLoop( + label: string, + fn: () => Promise<void>, + ): Promise<PerfSnapshot> { + // monitorEventLoopDelay is available in Node.js 12+ + const { monitorEventLoopDelay } = await import('node:perf_hooks'); + const histogram = monitorEventLoopDelay({ resolution: 10 }); + histogram.enable(); + + this.startTimer(label); + await fn(); + const snapshot = this.stopTimer(label); + + histogram.disable(); + + // Convert from nanoseconds to milliseconds + snapshot.eventLoopDelayP50Ms = histogram.percentile(50) / 1e6; + snapshot.eventLoopDelayP95Ms = histogram.percentile(95) / 1e6; + snapshot.eventLoopDelayP99Ms = histogram.percentile(99) / 1e6; + snapshot.eventLoopDelayMaxMs = histogram.max / 1e6; + + return snapshot; + } + + /** + * Run a scenario multiple times with warmup, outlier filtering, and baseline comparison. + * + * @param name - Scenario name (must match baseline key) + * @param fn - Async function that executes one sample of the scenario. 
+ * Must return a PerfSnapshot with measured values. + * @param tolerancePercent - Override default tolerance for this scenario + */ + async runScenario( + name: string, + fn: () => Promise<PerfSnapshot>, + tolerancePercent?: number, + ): Promise<PerfTestResult> { + const tolerance = tolerancePercent ?? this.defaultTolerancePercent; + const totalRuns = this.warmupCount + this.sampleCount; + const allSnapshots: PerfSnapshot[] = []; + + for (let i = 0; i < totalRuns; i++) { + const isWarmup = i < this.warmupCount; + const snapshot = await fn(); + snapshot.label = isWarmup + ? `warmup-${i}` + : `sample-${i - this.warmupCount}`; + + if (!isWarmup) { + allSnapshots.push(snapshot); + } + + // Brief pause between samples + await sleep(this.samplePauseMs); + } + + // Apply IQR outlier filtering on wall-clock time + const filteredSnapshots = this.filterOutliers(allSnapshots, 'wallClockMs'); + + // Get median of filtered samples + const median = this.getMedianSnapshot(filteredSnapshots); + median.label = 'median'; + + // Get baseline + const baseline = this.baselines.scenarios[name]; + + // Determine if within tolerance + let deltaPercent = 0; + let cpuDeltaPercent = 0; + let withinTolerance = true; + + if (baseline) { + deltaPercent = + ((median.wallClockMs - baseline.wallClockMs) / baseline.wallClockMs) * + 100; + cpuDeltaPercent = + ((median.cpuTotalUs - baseline.cpuTotalUs) / baseline.cpuTotalUs) * 100; + withinTolerance = deltaPercent <= tolerance; + } + + const result: PerfTestResult = { + scenarioName: name, + samples: allSnapshots, + filteredSamples: filteredSnapshots, + median, + baseline, + withinTolerance, + deltaPercent, + cpuDeltaPercent, + }; + + this.allResults.push(result); + return result; + } + + /** + * Assert that a scenario result is within the baseline tolerance. + */ + assertWithinBaseline( + result: PerfTestResult, + tolerancePercent?: number, + cpuTolerancePercent?: number, + ): void { + const tolerance = tolerancePercent ?? 
this.defaultTolerancePercent; + const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent; + + if (!result.baseline) { + console.warn( + `⚠ No baseline found for "${result.scenarioName}". ` + + `Run with UPDATE_PERF_BASELINES=true to create one. ` + + `Measured: ${result.median.wallClockMs.toFixed(1)} ms wall-clock.`, + ); + return; + } + + const deltaPercent = + ((result.median.wallClockMs - result.baseline.wallClockMs) / + result.baseline.wallClockMs) * + 100; + + if (deltaPercent > tolerance) { + throw new Error( + `Performance regression detected for "${result.scenarioName}"!\n` + + ` Measured: ${result.median.wallClockMs.toFixed(1)} ms wall-clock\n` + + ` Baseline: ${result.baseline.wallClockMs.toFixed(1)} ms wall-clock\n` + + ` Delta: ${deltaPercent.toFixed(1)}% (tolerance: ${tolerance}%)\n` + + ` CPU total: ${formatUs(result.median.cpuTotalUs)}\n` + + ` EL p99: ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms\n` + + ` Samples: ${result.samples.length} (${result.filteredSamples.length} after IQR filter)`, + ); + } + + if (cpuTolerance !== undefined && result.cpuDeltaPercent > cpuTolerance) { + throw new Error( + `CPU usage regression detected for "${result.scenarioName}"!\n` + + ` Measured: ${formatUs(result.median.cpuTotalUs)}\n` + + ` Baseline: ${formatUs(result.baseline.cpuTotalUs)}\n` + + ` Delta: ${result.cpuDeltaPercent.toFixed(1)}% (tolerance: ${cpuTolerance}%)\n` + + ` Wall-clock: ${result.median.wallClockMs.toFixed(1)} ms\n` + + ` EL p99: ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms`, + ); + } + } + + /** + * Update the baseline for a scenario with the current measured values. 
+ */ + updateScenarioBaseline(result: PerfTestResult): void { + updatePerfBaseline(this.baselinesPath, result.scenarioName, { + wallClockMs: result.median.wallClockMs, + cpuTotalUs: result.median.cpuTotalUs, + eventLoopDelayP99Ms: result.median.eventLoopDelayP99Ms, + }); + // Reload baselines after update + this.baselines = loadPerfBaselines(this.baselinesPath); + console.log( + `Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms`, + ); + } + + /** + * Generate an ASCII report with summary table and charts. + */ + async generateReport(results?: PerfTestResult[]): Promise<string> { + const resultsToReport = results ?? this.allResults; + const lines: string[] = []; + + lines.push(''); + lines.push('═══════════════════════════════════════════════════'); + lines.push(' PERFORMANCE TEST REPORT'); + lines.push('═══════════════════════════════════════════════════'); + lines.push(''); + + for (const result of resultsToReport) { + const measured = `${result.median.wallClockMs.toFixed(1)} ms`; + const baseline = result.baseline + ? `${result.baseline.wallClockMs.toFixed(1)} ms` + : 'N/A'; + const delta = result.baseline + ? `${result.deltaPercent >= 0 ? '+' : ''}${result.deltaPercent.toFixed(1)}%` + : 'N/A'; + const status = !result.baseline + ? 'NEW' + : result.withinTolerance + ? 
'✅' + : '❌'; + + lines.push( + `${result.scenarioName}: ${measured} (Baseline: ${baseline}, Delta: ${delta}) ${status}`, + ); + + // Show CPU breakdown + const cpuMs = `${(result.median.cpuTotalUs / 1000).toFixed(1)} ms`; + lines.push( + ` CPU: ${cpuMs} (user: ${formatUs(result.median.cpuUserUs)}, system: ${formatUs(result.median.cpuSystemUs)})`, + ); + + if (result.median.eventLoopDelayP99Ms > 0) { + lines.push( + ` Event loop: p50=${result.median.eventLoopDelayP50Ms.toFixed(1)}ms p95=${result.median.eventLoopDelayP95Ms.toFixed(1)}ms p99=${result.median.eventLoopDelayP99Ms.toFixed(1)}ms max=${result.median.eventLoopDelayMaxMs.toFixed(1)}ms`, + ); + } + + lines.push( + ` Samples: ${result.samples.length} → ${result.filteredSamples.length} after IQR filter`, + ); + } + lines.push(''); + + // Generate ASCII chart for wall-clock per scenario + try { + // @ts-expect-error - asciichart may not have types + const asciichart = (await import('asciichart')) as { + default?: { plot?: PlotFn }; + plot?: PlotFn; + }; + const plot: PlotFn | undefined = + asciichart.default?.plot ?? 
asciichart.plot; + + for (const result of resultsToReport) { + if (result.filteredSamples.length > 2) { + lines.push(`📈 Wall-clock trend: ${result.scenarioName}`); + lines.push('─'.repeat(60)); + + const wallClockData = result.filteredSamples.map( + (s) => s.wallClockMs, + ); + + if (plot) { + const chart = plot(wallClockData, { + height: 8, + format: (x: number) => `${x.toFixed(0)} ms`.padStart(10), + }); + lines.push(chart); + } + + const labels = result.filteredSamples.map((s) => s.label); + lines.push(' ' + labels.join(' → ')); + lines.push(''); + } + } + } catch { + lines.push( + '(asciichart not available — install with: npm install --save-dev asciichart)', + ); + lines.push(''); + } + + lines.push('═══════════════════════════════════════════════════'); + lines.push(''); + + const report = lines.join('\n'); + console.log(report); + return report; + } + + /** + * Filter outliers using the Interquartile Range (IQR) method. + * Removes samples where the given metric falls outside Q1 - 1.5*IQR or Q3 + 1.5*IQR. + */ + private filterOutliers( + snapshots: PerfSnapshot[], + metric: keyof PerfSnapshot, + ): PerfSnapshot[] { + if (snapshots.length < 4) { + // Not enough data for meaningful IQR filtering + return [...snapshots]; + } + + const sorted = [...snapshots].sort( + (a, b) => (a[metric] as number) - (b[metric] as number), + ); + const q1Idx = Math.floor(sorted.length * 0.25); + const q3Idx = Math.floor(sorted.length * 0.75); + + const q1 = sorted[q1Idx]![metric] as number; + const q3 = sorted[q3Idx]![metric] as number; + const iqr = q3 - q1; + const lowerBound = q1 - 1.5 * iqr; + const upperBound = q3 + 1.5 * iqr; + + return snapshots.filter((s) => { + const val = s[metric] as number; + return val >= lowerBound && val <= upperBound; + }); + } + + /** + * Get the median snapshot by wall-clock time from a sorted list. 
+ */ + private getMedianSnapshot(snapshots: PerfSnapshot[]): PerfSnapshot { + if (snapshots.length === 0) { + throw new Error('Cannot compute median of empty snapshot list'); + } + + const sorted = [...snapshots].sort((a, b) => a.wallClockMs - b.wallClockMs); + const medianIdx = Math.floor(sorted.length / 2); + return { ...sorted[medianIdx]! }; + } +} + +// ─── Baseline management ───────────────────────────────────────────── + +/** + * Load perf baselines from a JSON file. + */ +export function loadPerfBaselines(path: string): PerfBaselineFile { + if (!existsSync(path)) { + return { + version: 1, + updatedAt: new Date().toISOString(), + scenarios: {}, + }; + } + + const content = readFileSync(path, 'utf-8'); + return JSON.parse(content) as PerfBaselineFile; +} + +/** + * Save perf baselines to a JSON file. + */ +export function savePerfBaselines( + path: string, + baselines: PerfBaselineFile, +): void { + baselines.updatedAt = new Date().toISOString(); + writeFileSync(path, JSON.stringify(baselines, null, 2) + '\n'); +} + +/** + * Update (or create) a single scenario baseline in the file. + */ +export function updatePerfBaseline( + path: string, + scenarioName: string, + measured: { + wallClockMs: number; + cpuTotalUs: number; + eventLoopDelayP99Ms: number; + }, +): void { + const baselines = loadPerfBaselines(path); + baselines.scenarios[scenarioName] = { + wallClockMs: measured.wallClockMs, + cpuTotalUs: measured.cpuTotalUs, + eventLoopDelayP99Ms: measured.eventLoopDelayP99Ms, + timestamp: new Date().toISOString(), + }; + savePerfBaselines(path, baselines); +} + +// ─── Helpers ───────────────────────────────────────────────────────── + +/** + * Format microseconds as a human-readable string. 
+ */ +function formatUs(us: number): string { + if (us > 1_000_000) { + return `${(us / 1_000_000).toFixed(2)} s`; + } + if (us > 1_000) { + return `${(us / 1_000).toFixed(1)} ms`; + } + return `${us} μs`; +} diff --git a/perf-tests/README.md b/perf-tests/README.md new file mode 100644 index 0000000000..c8e9e448c1 --- /dev/null +++ b/perf-tests/README.md @@ -0,0 +1,121 @@ +# CPU Performance Integration Test Harness + +## Overview + +This directory contains performance/CPU integration tests for the Gemini CLI. +These tests measure wall-clock time, CPU usage, and event loop responsiveness to +detect regressions across key scenarios. + +CPU performance is inherently noisy, especially in CI. The harness addresses +this with: + +- **IQR outlier filtering** — discards anomalous samples +- **Median sampling** — takes N runs, reports the median after filtering +- **Warmup runs** — discards the first run to mitigate JIT compilation noise +- **15% default tolerance** — won't panic at slight regressions + +## Running + +```bash +# Run tests (compare against committed baselines) +npm run test:perf + +# Update baselines (after intentional changes) +npm run test:perf:update-baselines + +# Verbose output +VERBOSE=true npm run test:perf + +# Keep test artifacts for debugging +KEEP_OUTPUT=true npm run test:perf +``` + +## How It Works + +### Measurement Primitives + +The `PerfTestHarness` class (in `packages/test-utils`) provides: + +- **`performance.now()`** — high-resolution wall-clock timing +- **`process.cpuUsage()`** — user + system CPU microseconds (delta between + start/stop) +- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram + (p50/p95/p99/max) + +### Noise Reduction + +1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts +2. **Multiple samples**: Each scenario runs N times (default 5) +3. **IQR filtering**: Samples outside Q1−1.5×IQR and Q3+1.5×IQR are discarded +4. 
**Median**: The median of remaining samples is used for comparison + +### Baseline Management + +Baselines are stored in `baselines.json` in this directory. Each scenario has: + +```json +{ + "cold-startup-time": { + "wallClockMs": 1234.5, + "cpuTotalUs": 567890, + "eventLoopDelayP99Ms": 12.3, + "timestamp": "2026-04-08T..." + } +} +``` + +Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance). + +To recalibrate after intentional changes: + +```bash +npm run test:perf:update-baselines +# then commit baselines.json +``` + +### Report Output + +After all tests, the harness prints an ASCII summary: + +``` +═══════════════════════════════════════════════════ + PERFORMANCE TEST REPORT +═══════════════════════════════════════════════════ + +cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅ +idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅ +skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅ +``` + +## Architecture + +``` +perf-tests/ +├── README.md ← you are here +├── baselines.json ← committed baseline values +├── globalSetup.ts ← test environment setup +├── perf-usage.test.ts ← test scenarios +├── perf.*.responses ← fake API responses per scenario +├── tsconfig.json ← TypeScript config +└── vitest.config.ts ← vitest config (serial, isolated) + +packages/test-utils/src/ +├── perf-test-harness.ts ← PerfTestHarness class +└── index.ts ← re-exports +``` + +## CI Integration + +These tests are **excluded from `preflight`** and designed for nightly CI: + +```yaml +- name: Performance regression tests + run: npm run test:perf +``` + +## Adding a New Scenario + +1. Add a fake response file: `perf..responses` +2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()` +3. Run `npm run test:perf:update-baselines` to establish initial baseline +4. 
Commit the updated `baselines.json` diff --git a/perf-tests/baselines.json b/perf-tests/baselines.json new file mode 100644 index 0000000000..a6bad73574 --- /dev/null +++ b/perf-tests/baselines.json @@ -0,0 +1,24 @@ +{ + "version": 1, + "updatedAt": "2026-04-08T18:51:29.839Z", + "scenarios": { + "cold-startup-time": { + "wallClockMs": 1333.4230420000004, + "cpuTotalUs": 1711, + "eventLoopDelayP99Ms": 0, + "timestamp": "2026-04-08T18:50:58.124Z" + }, + "idle-cpu-usage": { + "wallClockMs": 5001.926125, + "cpuTotalUs": 128518, + "eventLoopDelayP99Ms": 12.705791, + "timestamp": "2026-04-08T18:51:23.938Z" + }, + "skill-loading-time": { + "wallClockMs": 1372.4463749999995, + "cpuTotalUs": 1550, + "eventLoopDelayP99Ms": 0, + "timestamp": "2026-04-08T18:51:29.839Z" + } + } +} diff --git a/perf-tests/globalSetup.ts b/perf-tests/globalSetup.ts new file mode 100644 index 0000000000..77447bd2ba --- /dev/null +++ b/perf-tests/globalSetup.ts @@ -0,0 +1,67 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { mkdir, readdir, rm } from 'node:fs/promises'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js'; +import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const rootDir = join(__dirname, '..'); +const perfTestsDir = join(rootDir, '.perf-tests'); +const KEEP_RUNS_COUNT = 5; +let runDir = ''; + +export async function setup() { + runDir = join(perfTestsDir, `${Date.now()}`); + await mkdir(runDir, { recursive: true }); + + // Isolate environment variables + isolateTestEnv(runDir); + + // Download ripgrep to avoid race conditions + const available = await canUseRipgrep(); + if (!available) { + throw new Error('Failed to download ripgrep binary'); + } + + // Clean up old test runs, keeping the latest few for debugging + try { + const testRuns 
= await readdir(perfTestsDir); + if (testRuns.length > KEEP_RUNS_COUNT) { + const oldRuns = testRuns + .sort() + .slice(0, testRuns.length - KEEP_RUNS_COUNT); + await Promise.all( + oldRuns.map((oldRun) => + rm(join(perfTestsDir, oldRun), { + recursive: true, + force: true, + }), + ), + ); + } + } catch (e) { + console.error('Error cleaning up old perf test runs:', e); + } + + process.env['INTEGRATION_TEST_FILE_DIR'] = runDir; + process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false'; + + console.log(`\nPerf test output directory: ${runDir}`); +} + +export async function teardown() { + // Cleanup unless KEEP_OUTPUT is set + if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) { + try { + await rm(runDir, { recursive: true, force: true }); + } catch (e) { + console.warn('Failed to clean up perf test directory:', e); + } + } +} diff --git a/perf-tests/perf-usage.test.ts b/perf-tests/perf-usage.test.ts new file mode 100644 index 0000000000..3f92cd9f91 --- /dev/null +++ b/perf-tests/perf-usage.test.ts @@ -0,0 +1,153 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, beforeAll, afterAll } from 'vitest'; +import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const BASELINES_PATH = join(__dirname, 'baselines.json'); +const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true'; +const TOLERANCE_PERCENT = 15; + +// Use fewer samples locally for faster iteration, more in CI +const SAMPLE_COUNT = process.env['CI'] ? 
5 : 3; +const WARMUP_COUNT = 1; + +describe('CPU Performance Tests', () => { + let harness: PerfTestHarness; + + beforeAll(() => { + harness = new PerfTestHarness({ + baselinesPath: BASELINES_PATH, + defaultTolerancePercent: TOLERANCE_PERCENT, + sampleCount: SAMPLE_COUNT, + warmupCount: WARMUP_COUNT, + }); + }); + + afterAll(async () => { + // Generate the summary report after all tests + await harness.generateReport(); + }); + + it('cold-startup-time: startup completes within baseline', async () => { + const result = await harness.runScenario('cold-startup-time', async () => { + const rig = new TestRig(); + try { + rig.setup('perf-cold-startup', { + fakeResponsesPath: join(__dirname, 'perf.cold-startup.responses'), + }); + + return await harness.measure('cold-startup', async () => { + await rig.run({ + args: ['hello'], + timeout: 120000, + env: { GEMINI_API_KEY: 'fake-perf-test-key' }, + }); + }); + } finally { + await rig.cleanup(); + } + }); + + if (UPDATE_BASELINES) { + harness.updateScenarioBaseline(result); + } else { + harness.assertWithinBaseline(result); + } + }); + + it('idle-cpu-usage: CPU stays low when idle', async () => { + const IDLE_OBSERVATION_MS = 5000; + + const result = await harness.runScenario('idle-cpu-usage', async () => { + const rig = new TestRig(); + try { + rig.setup('perf-idle-cpu', { + fakeResponsesPath: join(__dirname, 'perf.idle-cpu.responses'), + }); + + // First, run a prompt to get the CLI into idle state + await rig.run({ + args: ['hello'], + timeout: 120000, + env: { GEMINI_API_KEY: 'fake-perf-test-key' }, + }); + + // Now measure CPU during idle period in the test process + return await harness.measureWithEventLoop('idle-cpu', async () => { + // Simulate idle period — just wait + const { setTimeout: sleep } = await import('node:timers/promises'); + await sleep(IDLE_OBSERVATION_MS); + }); + } finally { + await rig.cleanup(); + } + }); + + if (UPDATE_BASELINES) { + harness.updateScenarioBaseline(result); + } else { + 
harness.assertWithinBaseline(result); + } + }); + + it('skill-loading-time: startup with many skills within baseline', async () => { + const SKILL_COUNT = 20; + + const result = await harness.runScenario('skill-loading-time', async () => { + const rig = new TestRig(); + try { + rig.setup('perf-skill-loading', { + fakeResponsesPath: join(__dirname, 'perf.skill-loading.responses'), + }); + + // Create many skill directories with SKILL.md files + for (let i = 0; i < SKILL_COUNT; i++) { + const skillDir = `.gemini/skills/perf-skill-${i}`; + rig.mkdir(skillDir); + rig.createFile( + `${skillDir}/SKILL.md`, + [ + '---', + `name: perf-skill-${i}`, + `description: Performance test skill number ${i}`, + `activation: manual`, + '---', + '', + `# Performance Test Skill ${i}`, + '', + `This is a test skill for measuring skill loading performance.`, + `It contains some content to simulate real-world skill files.`, + '', + `## Usage`, + '', + `Use this skill by activating it with @perf-skill-${i}.`, + ].join('\n'), + ); + } + + return await harness.measure('skill-loading', async () => { + await rig.run({ + args: ['hello'], + timeout: 120000, + env: { GEMINI_API_KEY: 'fake-perf-test-key' }, + }); + }); + } finally { + await rig.cleanup(); + } + }); + + if (UPDATE_BASELINES) { + harness.updateScenarioBaseline(result); + } else { + harness.assertWithinBaseline(result); + } + }); +}); diff --git a/perf-tests/perf.cold-startup.responses b/perf-tests/perf.cold-startup.responses new file mode 100644 index 0000000000..7a5703e3d2 --- /dev/null +++ b/perf-tests/perf.cold-startup.responses @@ -0,0 +1,2 @@ +{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}} +{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help. 
What would you like to work on?"}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":12,"totalTokenCount":17,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]} diff --git a/perf-tests/perf.idle-cpu.responses b/perf-tests/perf.idle-cpu.responses new file mode 100644 index 0000000000..a0d05086d2 --- /dev/null +++ b/perf-tests/perf.idle-cpu.responses @@ -0,0 +1,2 @@ +{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}} +{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":8,"totalTokenCount":13,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]} diff --git a/perf-tests/perf.skill-loading.responses b/perf-tests/perf.skill-loading.responses new file mode 100644 index 0000000000..eb6c96fe9c --- /dev/null +++ b/perf-tests/perf.skill-loading.responses @@ -0,0 +1,2 @@ +{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}} +{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! 
I'm ready to assist you with your project."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":10,"totalTokenCount":15,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]} diff --git a/perf-tests/tsconfig.json b/perf-tests/tsconfig.json new file mode 100644 index 0000000000..7f2c199703 --- /dev/null +++ b/perf-tests/tsconfig.json @@ -0,0 +1,12 @@ +{ + "extends": "../tsconfig.json", + "compilerOptions": { + "noEmit": true, + "allowJs": true + }, + "include": ["**/*.ts"], + "references": [ + { "path": "../packages/core" }, + { "path": "../packages/test-utils" } + ] +} diff --git a/perf-tests/vitest.config.ts b/perf-tests/vitest.config.ts new file mode 100644 index 0000000000..e9baeec0bf --- /dev/null +++ b/perf-tests/vitest.config.ts @@ -0,0 +1,27 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + testTimeout: 600000, // 10 minutes — performance profiling needs time for multiple samples + globalSetup: './globalSetup.ts', + reporters: ['default'], + include: ['**/*.test.ts'], + retry: 0, // No retries — noise is handled by IQR filtering and tolerance + fileParallelism: false, // Must run serially to avoid CPU contention + pool: 'forks', + poolOptions: { + forks: { + singleFork: true, // Single process for accurate per-test CPU readings + }, + }, + env: { + GEMINI_TEST_TYPE: 'perf', + }, + }, +});