feat(test-utils): add CPU performance integration test harness (#24951)
perf-tests/README.md
@@ -0,0 +1,121 @@

# CPU Performance Integration Test Harness

## Overview

This directory contains performance/CPU integration tests for the Gemini CLI.
These tests measure wall-clock time, CPU usage, and event loop responsiveness to
detect regressions across key scenarios.

CPU performance is inherently noisy, especially in CI. The harness addresses
this with:

- **IQR outlier filtering** — discards anomalous samples
- **Median sampling** — takes N runs, reports the median after filtering
- **Warmup runs** — discards the first run to mitigate JIT compilation noise
- **15% default tolerance** — absorbs small run-to-run variation instead of failing

## Running

```bash
# Run tests (compare against committed baselines)
npm run test:perf

# Update baselines (after intentional changes)
npm run test:perf:update-baselines

# Verbose output
VERBOSE=true npm run test:perf

# Keep test artifacts for debugging
KEEP_OUTPUT=true npm run test:perf
```

## How It Works

### Measurement Primitives

The `PerfTestHarness` class (in `packages/test-utils`) provides:

- **`performance.now()`** — high-resolution wall-clock timing
- **`process.cpuUsage()`** — user + system CPU microseconds (delta between
  start/stop)
- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram
  (p50/p95/p99/max)

The sketch below shows how a single sample combines them.
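
A minimal sketch, assuming an illustrative `measureOnce` helper (not the
harness API; the real class layers warmup, sampling, and filtering on top):

```ts
import { performance, monitorEventLoopDelay } from 'node:perf_hooks';

// Measure one run: wall-clock time, CPU delta, and event loop delay p99.
async function measureOnce(fn: () => Promise<void>) {
  const histogram = monitorEventLoopDelay({ resolution: 10 });
  histogram.enable();
  const cpuStart = process.cpuUsage();
  const wallStart = performance.now();

  await fn();

  const wallClockMs = performance.now() - wallStart;
  const cpu = process.cpuUsage(cpuStart); // delta relative to cpuStart
  histogram.disable();

  return {
    wallClockMs,
    cpuTotalUs: cpu.user + cpu.system, // microseconds
    eventLoopDelayP99Ms: histogram.percentile(99) / 1e6, // ns -> ms
  };
}
```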

### Noise Reduction

1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts
2. **Multiple samples**: Each scenario runs N times (default 5)
3. **IQR filtering**: Samples below Q1−1.5×IQR or above Q3+1.5×IQR are discarded
4. **Median**: The median of the remaining samples is used for comparison (see
   the sketch after this list)
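
A sketch of steps 3 and 4 in TypeScript, assuming a simple index-based
quartile estimate (function names are illustrative, not the harness API):

```ts
// Median of an already-sorted array.
function median(sorted: number[]): number {
  const mid = Math.floor(sorted.length / 2);
  return sorted.length % 2 === 0
    ? (sorted[mid - 1] + sorted[mid]) / 2
    : sorted[mid];
}

// Drop samples outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], then take the median.
function robustMedian(samples: number[]): number {
  const sorted = [...samples].sort((a, b) => a - b);
  // Simple quartile estimate; the real harness may interpolate differently.
  const q1 = sorted[Math.floor(sorted.length / 4)];
  const q3 = sorted[Math.floor((sorted.length * 3) / 4)];
  const iqr = q3 - q1;
  const kept = sorted.filter((s) => s >= q1 - 1.5 * iqr && s <= q3 + 1.5 * iqr);
  return median(kept);
}
```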

### Baseline Management

Baselines are stored in `baselines.json` in this directory. Each scenario has:

```json
{
  "cold-startup-time": {
    "wallClockMs": 1234.5,
    "cpuTotalUs": 567890,
    "eventLoopDelayP99Ms": 12.3,
    "timestamp": "2026-04-08T..."
  }
}
```

Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance).
For example, a baseline of 1200 ms allows measurements up to 1380 ms.
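
The comparison itself reduces to one multiplication. A sketch of the pass/fail
rule (illustrative only; the harness's `assertWithinBaseline` presumably also
handles missing baselines and feeds the report):

```ts
// A measured value passes if it does not exceed the baseline by more
// than tolerancePercent.
function isWithinBaseline(
  measured: number,
  baseline: number,
  tolerancePercent = 15,
): boolean {
  return measured <= baseline * (1 + tolerancePercent / 100);
}

isWithinBaseline(1234.5, 1200.0); // true:  +2.9%, under the 15% limit
isWithinBaseline(1450.0, 1200.0); // false: +20.8%, over the limit
```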

To recalibrate after intentional changes:

```bash
npm run test:perf:update-baselines
# then commit baselines.json
```

### Report Output

After all tests, the harness prints an ASCII summary:

```
═══════════════════════════════════════════════════
PERFORMANCE TEST REPORT
═══════════════════════════════════════════════════

cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅
idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅
skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅
```

## Architecture

```
perf-tests/
├── README.md          ← you are here
├── baselines.json     ← committed baseline values
├── globalSetup.ts     ← test environment setup
├── perf-usage.test.ts ← test scenarios
├── perf.*.responses   ← fake API responses per scenario
├── tsconfig.json      ← TypeScript config
└── vitest.config.ts   ← vitest config (serial, isolated)

packages/test-utils/src/
├── perf-test-harness.ts ← PerfTestHarness class
└── index.ts             ← re-exports
```

## CI Integration

These tests are **excluded from `preflight`** and designed for nightly CI:

```yaml
- name: Performance regression tests
  run: npm run test:perf
```

## Adding a New Scenario

1. Add a fake response file: `perf.<scenario-name>.responses`
2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()` (see
   the sketch after this list)
3. Run `npm run test:perf:update-baselines` to establish the initial baseline
4. Commit the updated `baselines.json`
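
A minimal sketch of step 2, modeled on the existing scenarios in
`perf-usage.test.ts`; the name `my-scenario` and its response file are
placeholders:

```ts
it('my-scenario: completes within baseline', async () => {
  const result = await harness.runScenario('my-scenario', async () => {
    const rig = new TestRig();
    try {
      rig.setup('perf-my-scenario', {
        fakeResponsesPath: join(__dirname, 'perf.my-scenario.responses'),
      });
      // Measure only the CLI invocation itself.
      return await harness.measure('my-scenario', async () => {
        await rig.run({
          args: ['hello'],
          timeout: 120000,
          env: { GEMINI_API_KEY: 'fake-perf-test-key' },
        });
      });
    } finally {
      await rig.cleanup();
    }
  });

  if (UPDATE_BASELINES) {
    harness.updateScenarioBaseline(result);
  } else {
    harness.assertWithinBaseline(result);
  }
});
```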

perf-tests/baselines.json
@@ -0,0 +1,24 @@

{
  "version": 1,
  "updatedAt": "2026-04-08T18:51:29.839Z",
  "scenarios": {
    "cold-startup-time": {
      "wallClockMs": 1333.4230420000004,
      "cpuTotalUs": 1711,
      "eventLoopDelayP99Ms": 0,
      "timestamp": "2026-04-08T18:50:58.124Z"
    },
    "idle-cpu-usage": {
      "wallClockMs": 5001.926125,
      "cpuTotalUs": 128518,
      "eventLoopDelayP99Ms": 12.705791,
      "timestamp": "2026-04-08T18:51:23.938Z"
    },
    "skill-loading-time": {
      "wallClockMs": 1372.4463749999995,
      "cpuTotalUs": 1550,
      "eventLoopDelayP99Ms": 0,
      "timestamp": "2026-04-08T18:51:29.839Z"
    }
  }
}

perf-tests/globalSetup.ts
@@ -0,0 +1,67 @@

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { mkdir, readdir, rm } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js';

const __dirname = dirname(fileURLToPath(import.meta.url));
const rootDir = join(__dirname, '..');
const perfTestsDir = join(rootDir, '.perf-tests');
const KEEP_RUNS_COUNT = 5;
let runDir = '';

export async function setup() {
  runDir = join(perfTestsDir, `${Date.now()}`);
  await mkdir(runDir, { recursive: true });

  // Isolate environment variables
  isolateTestEnv(runDir);

  // Download ripgrep to avoid race conditions
  const available = await canUseRipgrep();
  if (!available) {
    throw new Error('Failed to download ripgrep binary');
  }

  // Clean up old test runs, keeping the latest few for debugging
  try {
    const testRuns = await readdir(perfTestsDir);
    if (testRuns.length > KEEP_RUNS_COUNT) {
      const oldRuns = testRuns
        .sort()
        .slice(0, testRuns.length - KEEP_RUNS_COUNT);
      await Promise.all(
        oldRuns.map((oldRun) =>
          rm(join(perfTestsDir, oldRun), {
            recursive: true,
            force: true,
          }),
        ),
      );
    }
  } catch (e) {
    console.error('Error cleaning up old perf test runs:', e);
  }

  process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
  process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';

  console.log(`\nPerf test output directory: ${runDir}`);
}

export async function teardown() {
  // Cleanup unless KEEP_OUTPUT is set
  if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
    try {
      await rm(runDir, { recursive: true, force: true });
    } catch (e) {
      console.warn('Failed to clean up perf test directory:', e);
    }
  }
}

perf-tests/perf-usage.test.ts
@@ -0,0 +1,153 @@

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, it, beforeAll, afterAll } from 'vitest';
import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const BASELINES_PATH = join(__dirname, 'baselines.json');
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
const TOLERANCE_PERCENT = 15;

// Use fewer samples locally for faster iteration, more in CI
const SAMPLE_COUNT = process.env['CI'] ? 5 : 3;
const WARMUP_COUNT = 1;

describe('CPU Performance Tests', () => {
  let harness: PerfTestHarness;

  beforeAll(() => {
    harness = new PerfTestHarness({
      baselinesPath: BASELINES_PATH,
      defaultTolerancePercent: TOLERANCE_PERCENT,
      sampleCount: SAMPLE_COUNT,
      warmupCount: WARMUP_COUNT,
    });
  });

  afterAll(async () => {
    // Generate the summary report after all tests
    await harness.generateReport();
  });

  it('cold-startup-time: startup completes within baseline', async () => {
    const result = await harness.runScenario('cold-startup-time', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-cold-startup', {
          fakeResponsesPath: join(__dirname, 'perf.cold-startup.responses'),
        });

        return await harness.measure('cold-startup', async () => {
          await rig.run({
            args: ['hello'],
            timeout: 120000,
            env: { GEMINI_API_KEY: 'fake-perf-test-key' },
          });
        });
      } finally {
        await rig.cleanup();
      }
    });

    if (UPDATE_BASELINES) {
      harness.updateScenarioBaseline(result);
    } else {
      harness.assertWithinBaseline(result);
    }
  });

  it('idle-cpu-usage: CPU stays low when idle', async () => {
    const IDLE_OBSERVATION_MS = 5000;

    const result = await harness.runScenario('idle-cpu-usage', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-idle-cpu', {
          fakeResponsesPath: join(__dirname, 'perf.idle-cpu.responses'),
        });

        // First, run a prompt to get the CLI into idle state
        await rig.run({
          args: ['hello'],
          timeout: 120000,
          env: { GEMINI_API_KEY: 'fake-perf-test-key' },
        });

        // Now measure CPU during idle period in the test process
        return await harness.measureWithEventLoop('idle-cpu', async () => {
          // Simulate idle period — just wait
          const { setTimeout: sleep } = await import('node:timers/promises');
          await sleep(IDLE_OBSERVATION_MS);
        });
      } finally {
        await rig.cleanup();
      }
    });

    if (UPDATE_BASELINES) {
      harness.updateScenarioBaseline(result);
    } else {
      harness.assertWithinBaseline(result);
    }
  });

  it('skill-loading-time: startup with many skills within baseline', async () => {
    const SKILL_COUNT = 20;

    const result = await harness.runScenario('skill-loading-time', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-skill-loading', {
          fakeResponsesPath: join(__dirname, 'perf.skill-loading.responses'),
        });

        // Create many skill directories with SKILL.md files
        for (let i = 0; i < SKILL_COUNT; i++) {
          const skillDir = `.gemini/skills/perf-skill-${i}`;
          rig.mkdir(skillDir);
          rig.createFile(
            `${skillDir}/SKILL.md`,
            [
              '---',
              `name: perf-skill-${i}`,
              `description: Performance test skill number ${i}`,
              `activation: manual`,
              '---',
              '',
              `# Performance Test Skill ${i}`,
              '',
              `This is a test skill for measuring skill loading performance.`,
              `It contains some content to simulate real-world skill files.`,
              '',
              `## Usage`,
              '',
              `Use this skill by activating it with @perf-skill-${i}.`,
            ].join('\n'),
          );
        }

        return await harness.measure('skill-loading', async () => {
          await rig.run({
            args: ['hello'],
            timeout: 120000,
            env: { GEMINI_API_KEY: 'fake-perf-test-key' },
          });
        });
      } finally {
        await rig.cleanup();
      }
    });

    if (UPDATE_BASELINES) {
      harness.updateScenarioBaseline(result);
    } else {
      harness.assertWithinBaseline(result);
    }
  });
});

perf-tests/perf.cold-startup.responses
@@ -0,0 +1,2 @@

{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help. What would you like to work on?"}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":12,"totalTokenCount":17,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

perf-tests/perf.idle-cpu.responses
@@ -0,0 +1,2 @@

{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":8,"totalTokenCount":13,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

perf-tests/perf.skill-loading.responses
@@ -0,0 +1,2 @@

{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to assist you with your project."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":10,"totalTokenCount":15,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

perf-tests/tsconfig.json
@@ -0,0 +1,12 @@

{
  "extends": "../tsconfig.json",
  "compilerOptions": {
    "noEmit": true,
    "allowJs": true
  },
  "include": ["**/*.ts"],
  "references": [
    { "path": "../packages/core" },
    { "path": "../packages/test-utils" }
  ]
}

perf-tests/vitest.config.ts
@@ -0,0 +1,27 @@

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { defineConfig } from 'vitest/config';

export default defineConfig({
  test: {
    testTimeout: 600000, // 10 minutes — performance profiling needs time for multiple samples
    globalSetup: './globalSetup.ts',
    reporters: ['default'],
    include: ['**/*.test.ts'],
    retry: 0, // No retries — noise is handled by IQR filtering and tolerance
    fileParallelism: false, // Must run serially to avoid CPU contention
    pool: 'forks',
    poolOptions: {
      forks: {
        singleFork: true, // Single process for accurate per-test CPU readings
      },
    },
    env: {
      GEMINI_TEST_TYPE: 'perf',
    },
  },
});