feat(test-utils): add CPU performance integration test harness (#24951)

2026-04-21 10:34:35 -07:00 · 2026-04-08 14:50:29 -07:00
parent 15f7b24312
commit c7b920717f
19 changed files with 1081 additions and 13 deletions
@@ -0,0 +1,35 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { join } from 'node:path';
+
+/**
+ * Isolate the test environment by setting environment variables
+ * to point to a temporary run directory.
+ *
+ * @param runDir - The temporary directory for this test run.
+ */
+export function isolateTestEnv(runDir: string): void {
+  // Set the home directory to the test run directory to avoid conflicts
+  // with the user's local config.
+  process.env['HOME'] = runDir;
+  if (process.platform === 'win32') {
+    process.env['USERPROFILE'] = runDir;
+  }
+
+  // We also need to set the config dir explicitly, since the code might
+  // construct the path before the HOME env var is set.
+  process.env['GEMINI_CONFIG_DIR'] = join(runDir, '.gemini');
+
+  // Force file storage to avoid keychain prompts/hangs in CI, especially on macOS
+  process.env['GEMINI_FORCE_FILE_STORAGE'] = 'true';
+
+  // Mark as integration test
+  process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
+
+  // Isolate telemetry log
+  process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
+}
@@ -8,6 +8,8 @@ export * from './file-system-test-helpers.js';
 export * from './fixtures/agents.js';
 export * from './memory-baselines.js';
 export * from './memory-test-harness.js';
+export * from './perf-test-harness.js';
 export * from './mock-utils.js';
 export * from './test-mcp-server.js';
 export * from './test-rig.js';
+export * from './env-setup.js';
@@ -0,0 +1,546 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { performance } from 'node:perf_hooks';
+import { setTimeout as sleep } from 'node:timers/promises';
+import { readFileSync, writeFileSync, existsSync } from 'node:fs';
+
+/** Configuration for asciichart plot function. */
+interface PlotConfig {
+  height?: number;
+  format?: (x: number) => string;
+}
+
+/** Type for the asciichart plot function. */
+type PlotFn = (series: number[], config?: PlotConfig) => string;
+
+/**
+ * Baseline entry for a single performance test scenario.
+ */
+export interface PerfBaseline {
+  wallClockMs: number;
+  cpuTotalUs: number;
+  eventLoopDelayP99Ms: number;
+  timestamp: string;
+}
+
+/**
+ * Top-level structure of the perf baselines JSON file.
+ */
+export interface PerfBaselineFile {
+  version: number;
+  updatedAt: string;
+  scenarios: Record<string, PerfBaseline>;
+}
+
+/**
+ * A single performance snapshot at a point in time.
+ */
+export interface PerfSnapshot {
+  timestamp: number;
+  label: string;
+  wallClockMs: number;
+  cpuUserUs: number;
+  cpuSystemUs: number;
+  cpuTotalUs: number;
+  eventLoopDelayP50Ms: number;
+  eventLoopDelayP95Ms: number;
+  eventLoopDelayP99Ms: number;
+  eventLoopDelayMaxMs: number;
+}
+
+/**
+ * Result from running a performance test scenario.
+ */
+export interface PerfTestResult {
+  scenarioName: string;
+  samples: PerfSnapshot[];
+  filteredSamples: PerfSnapshot[];
+  median: PerfSnapshot;
+  baseline: PerfBaseline | undefined;
+  withinTolerance: boolean;
+  deltaPercent: number;
+  cpuDeltaPercent: number;
+}
+
+/**
+ * Options for the PerfTestHarness.
+ */
+export interface PerfTestHarnessOptions {
+  /** Path to the baselines JSON file */
+  baselinesPath: string;
+  /** Default tolerance percentage (0-100). Default: 15 */
+  defaultTolerancePercent?: number;
+  /** Default CPU tolerance percentage (0-100). Optional */
+  defaultCpuTolerancePercent?: number;
+  /** Number of samples per scenario. Default: 5 */
+  sampleCount?: number;
+  /** Number of warmup runs to discard. Default: 1 */
+  warmupCount?: number;
+  /** Pause in ms between samples. Default: 100 */
+  samplePauseMs?: number;
+}
+
+/**
+ * Active timer state tracked internally.
+ */
+interface ActiveTimer {
+  label: string;
+  startTime: number;
+  startCpuUsage: NodeJS.CpuUsage;
+}
+
+/**
+ * PerfTestHarness provides infrastructure for running CPU performance tests.
+ *
+ * It handles:
+ * - High-resolution wall-clock timing via performance.now()
+ * - CPU usage measurement via process.cpuUsage()
+ * - Event loop delay monitoring via perf_hooks.monitorEventLoopDelay()
+ * - IQR outlier filtering for noise reduction
+ * - Warmup runs to avoid JIT compilation noise
+ * - Comparing against baselines with configurable tolerance
+ * - Generating ASCII chart reports
+ */
+export class PerfTestHarness {
+  private baselines: PerfBaselineFile;
+  private readonly baselinesPath: string;
+  private readonly defaultTolerancePercent: number;
+  private readonly defaultCpuTolerancePercent?: number;
+  private readonly sampleCount: number;
+  private readonly warmupCount: number;
+  private readonly samplePauseMs: number;
+  private allResults: PerfTestResult[] = [];
+  private activeTimers: Map<string, ActiveTimer> = new Map();
+
+  constructor(options: PerfTestHarnessOptions) {
+    this.baselinesPath = options.baselinesPath;
+    this.defaultTolerancePercent = options.defaultTolerancePercent ?? 15;
+    this.defaultCpuTolerancePercent = options.defaultCpuTolerancePercent;
+    this.sampleCount = options.sampleCount ?? 5;
+    this.warmupCount = options.warmupCount ?? 1;
+    this.samplePauseMs = options.samplePauseMs ?? 100;
+    this.baselines = loadPerfBaselines(this.baselinesPath);
+  }
+
+  /**
+   * Start a high-resolution timer with CPU tracking.
+   */
+  startTimer(label: string): void {
+    this.activeTimers.set(label, {
+      label,
+      startTime: performance.now(),
+      startCpuUsage: process.cpuUsage(),
+    });
+  }
+
+  /**
+   * Stop a timer and return the snapshot.
+   */
+  stopTimer(label: string): PerfSnapshot {
+    const timer = this.activeTimers.get(label);
+    if (!timer) {
+      throw new Error(`No active timer found for label "${label}"`);
+    }
+
+    const wallClockMs = performance.now() - timer.startTime;
+    const cpuDelta = process.cpuUsage(timer.startCpuUsage);
+    this.activeTimers.delete(label);
+
+    return {
+      timestamp: Date.now(),
+      label,
+      wallClockMs,
+      cpuUserUs: cpuDelta.user,
+      cpuSystemUs: cpuDelta.system,
+      cpuTotalUs: cpuDelta.user + cpuDelta.system,
+      eventLoopDelayP50Ms: 0,
+      eventLoopDelayP95Ms: 0,
+      eventLoopDelayP99Ms: 0,
+      eventLoopDelayMaxMs: 0,
+    };
+  }
+
+  /**
+   * Measure a function's wall-clock time and CPU usage.
+   * Returns the snapshot with timing data.
+   */
+  async measure(label: string, fn: () => Promise<void>): Promise<PerfSnapshot> {
+    this.startTimer(label);
+    await fn();
+    return this.stopTimer(label);
+  }
+
+  /**
+   * Measure a function with event loop delay monitoring.
+   * Uses perf_hooks.monitorEventLoopDelay() for histogram data.
+   */
+  async measureWithEventLoop(
+    label: string,
+    fn: () => Promise<void>,
+  ): Promise<PerfSnapshot> {
+    // monitorEventLoopDelay is available in Node.js 12+
+    const { monitorEventLoopDelay } = await import('node:perf_hooks');
+    const histogram = monitorEventLoopDelay({ resolution: 10 });
+    histogram.enable();
+
+    this.startTimer(label);
+    await fn();
+    const snapshot = this.stopTimer(label);
+
+    histogram.disable();
+
+    // Convert from nanoseconds to milliseconds
+    snapshot.eventLoopDelayP50Ms = histogram.percentile(50) / 1e6;
+    snapshot.eventLoopDelayP95Ms = histogram.percentile(95) / 1e6;
+    snapshot.eventLoopDelayP99Ms = histogram.percentile(99) / 1e6;
+    snapshot.eventLoopDelayMaxMs = histogram.max / 1e6;
+
+    return snapshot;
+  }
+
+  /**
+   * Run a scenario multiple times with warmup, outlier filtering, and baseline comparison.
+   *
+   * @param name - Scenario name (must match baseline key)
+   * @param fn - Async function that executes one sample of the scenario.
+   *             Must return a PerfSnapshot with measured values.
+   * @param tolerancePercent - Override default tolerance for this scenario
+   */
+  async runScenario(
+    name: string,
+    fn: () => Promise<PerfSnapshot>,
+    tolerancePercent?: number,
+  ): Promise<PerfTestResult> {
+    const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
+    const totalRuns = this.warmupCount + this.sampleCount;
+    const allSnapshots: PerfSnapshot[] = [];
+
+    for (let i = 0; i < totalRuns; i++) {
+      const isWarmup = i < this.warmupCount;
+      const snapshot = await fn();
+      snapshot.label = isWarmup
+        ? `warmup-${i}`
+        : `sample-${i - this.warmupCount}`;
+
+      if (!isWarmup) {
+        allSnapshots.push(snapshot);
+      }
+
+      // Brief pause between samples
+      await sleep(this.samplePauseMs);
+    }
+
+    // Apply IQR outlier filtering on wall-clock time
+    const filteredSnapshots = this.filterOutliers(allSnapshots, 'wallClockMs');
+
+    // Get median of filtered samples
+    const median = this.getMedianSnapshot(filteredSnapshots);
+    median.label = 'median';
+
+    // Get baseline
+    const baseline = this.baselines.scenarios[name];
+
+    // Determine if within tolerance
+    let deltaPercent = 0;
+    let cpuDeltaPercent = 0;
+    let withinTolerance = true;
+
+    if (baseline) {
+      deltaPercent =
+        ((median.wallClockMs - baseline.wallClockMs) / baseline.wallClockMs) *
+        100;
+      cpuDeltaPercent =
+        ((median.cpuTotalUs - baseline.cpuTotalUs) / baseline.cpuTotalUs) * 100;
+      withinTolerance = deltaPercent <= tolerance;
+    }
+
+    const result: PerfTestResult = {
+      scenarioName: name,
+      samples: allSnapshots,
+      filteredSamples: filteredSnapshots,
+      median,
+      baseline,
+      withinTolerance,
+      deltaPercent,
+      cpuDeltaPercent,
+    };
+
+    this.allResults.push(result);
+    return result;
+  }
+
+  /**
+   * Assert that a scenario result is within the baseline tolerance.
+   */
+  assertWithinBaseline(
+    result: PerfTestResult,
+    tolerancePercent?: number,
+    cpuTolerancePercent?: number,
+  ): void {
+    const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
+    const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent;
+
+    if (!result.baseline) {
+      console.warn(
+        `⚠ No baseline found for "${result.scenarioName}". ` +
+          `Run with UPDATE_PERF_BASELINES=true to create one. ` +
+          `Measured: ${result.median.wallClockMs.toFixed(1)} ms wall-clock.`,
+      );
+      return;
+    }
+
+    const deltaPercent =
+      ((result.median.wallClockMs - result.baseline.wallClockMs) /
+        result.baseline.wallClockMs) *
+      100;
+
+    if (deltaPercent > tolerance) {
+      throw new Error(
+        `Performance regression detected for "${result.scenarioName}"!\n` +
+          `  Measured:    ${result.median.wallClockMs.toFixed(1)} ms wall-clock\n` +
+          `  Baseline:    ${result.baseline.wallClockMs.toFixed(1)} ms wall-clock\n` +
+          `  Delta:       ${deltaPercent.toFixed(1)}% (tolerance: ${tolerance}%)\n` +
+          `  CPU total:   ${formatUs(result.median.cpuTotalUs)}\n` +
+          `  EL p99:      ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms\n` +
+          `  Samples:     ${result.samples.length} (${result.filteredSamples.length} after IQR filter)`,
+      );
+    }
+
+    if (cpuTolerance !== undefined && result.cpuDeltaPercent > cpuTolerance) {
+      throw new Error(
+        `CPU usage regression detected for "${result.scenarioName}"!\n` +
+          `  Measured:    ${formatUs(result.median.cpuTotalUs)}\n` +
+          `  Baseline:    ${formatUs(result.baseline.cpuTotalUs)}\n` +
+          `  Delta:       ${result.cpuDeltaPercent.toFixed(1)}% (tolerance: ${cpuTolerance}%)\n` +
+          `  Wall-clock:  ${result.median.wallClockMs.toFixed(1)} ms\n` +
+          `  EL p99:      ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms`,
+      );
+    }
+  }
+
+  /**
+   * Update the baseline for a scenario with the current measured values.
+   */
+  updateScenarioBaseline(result: PerfTestResult): void {
+    updatePerfBaseline(this.baselinesPath, result.scenarioName, {
+      wallClockMs: result.median.wallClockMs,
+      cpuTotalUs: result.median.cpuTotalUs,
+      eventLoopDelayP99Ms: result.median.eventLoopDelayP99Ms,
+    });
+    // Reload baselines after update
+    this.baselines = loadPerfBaselines(this.baselinesPath);
+    console.log(
+      `Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms`,
+    );
+  }
+
+  /**
+   * Generate an ASCII report with summary table and charts.
+   */
+  async generateReport(results?: PerfTestResult[]): Promise<string> {
+    const resultsToReport = results ?? this.allResults;
+    const lines: string[] = [];
+
+    lines.push('');
+    lines.push('═══════════════════════════════════════════════════');
+    lines.push('         PERFORMANCE TEST REPORT');
+    lines.push('═══════════════════════════════════════════════════');
+    lines.push('');
+
+    for (const result of resultsToReport) {
+      const measured = `${result.median.wallClockMs.toFixed(1)} ms`;
+      const baseline = result.baseline
+        ? `${result.baseline.wallClockMs.toFixed(1)} ms`
+        : 'N/A';
+      const delta = result.baseline
+        ? `${result.deltaPercent >= 0 ? '+' : ''}${result.deltaPercent.toFixed(1)}%`
+        : 'N/A';
+      const status = !result.baseline
+        ? 'NEW'
+        : result.withinTolerance
+          ? '✅'
+          : '❌';
+
+      lines.push(
+        `${result.scenarioName}: ${measured} (Baseline: ${baseline}, Delta: ${delta}) ${status}`,
+      );
+
+      // Show CPU breakdown
+      const cpuMs = `${(result.median.cpuTotalUs / 1000).toFixed(1)} ms`;
+      lines.push(
+        `  CPU: ${cpuMs} (user: ${formatUs(result.median.cpuUserUs)}, system: ${formatUs(result.median.cpuSystemUs)})`,
+      );
+
+      if (result.median.eventLoopDelayP99Ms > 0) {
+        lines.push(
+          `  Event loop: p50=${result.median.eventLoopDelayP50Ms.toFixed(1)}ms p95=${result.median.eventLoopDelayP95Ms.toFixed(1)}ms p99=${result.median.eventLoopDelayP99Ms.toFixed(1)}ms max=${result.median.eventLoopDelayMaxMs.toFixed(1)}ms`,
+        );
+      }
+
+      lines.push(
+        `  Samples: ${result.samples.length} → ${result.filteredSamples.length} after IQR filter`,
+      );
+    }
+    lines.push('');
+
+    // Generate ASCII chart for wall-clock per scenario
+    try {
+      // @ts-expect-error - asciichart may not have types
+      const asciichart = (await import('asciichart')) as {
+        default?: { plot?: PlotFn };
+        plot?: PlotFn;
+      };
+      const plot: PlotFn | undefined =
+        asciichart.default?.plot ?? asciichart.plot;
+
+      for (const result of resultsToReport) {
+        if (result.filteredSamples.length > 2) {
+          lines.push(`📈 Wall-clock trend: ${result.scenarioName}`);
+          lines.push('─'.repeat(60));
+
+          const wallClockData = result.filteredSamples.map(
+            (s) => s.wallClockMs,
+          );
+
+          if (plot) {
+            const chart = plot(wallClockData, {
+              height: 8,
+              format: (x: number) => `${x.toFixed(0)} ms`.padStart(10),
+            });
+            lines.push(chart);
+          }
+
+          const labels = result.filteredSamples.map((s) => s.label);
+          lines.push('  ' + labels.join(' → '));
+          lines.push('');
+        }
+      }
+    } catch {
+      lines.push(
+        '(asciichart not available — install with: npm install --save-dev asciichart)',
+      );
+      lines.push('');
+    }
+
+    lines.push('═══════════════════════════════════════════════════');
+    lines.push('');
+
+    const report = lines.join('\n');
+    console.log(report);
+    return report;
+  }
+
+  /**
+   * Filter outliers using the Interquartile Range (IQR) method.
+   * Removes samples where the given metric falls outside Q1 - 1.5*IQR or Q3 + 1.5*IQR.
+   */
+  private filterOutliers(
+    snapshots: PerfSnapshot[],
+    metric: keyof PerfSnapshot,
+  ): PerfSnapshot[] {
+    if (snapshots.length < 4) {
+      // Not enough data for meaningful IQR filtering
+      return [...snapshots];
+    }
+
+    const sorted = [...snapshots].sort(
+      (a, b) => (a[metric] as number) - (b[metric] as number),
+    );
+    const q1Idx = Math.floor(sorted.length * 0.25);
+    const q3Idx = Math.floor(sorted.length * 0.75);
+
+    const q1 = sorted[q1Idx]![metric] as number;
+    const q3 = sorted[q3Idx]![metric] as number;
+    const iqr = q3 - q1;
+    const lowerBound = q1 - 1.5 * iqr;
+    const upperBound = q3 + 1.5 * iqr;
+
+    return snapshots.filter((s) => {
+      const val = s[metric] as number;
+      return val >= lowerBound && val <= upperBound;
+    });
+  }
+
+  /**
+   * Get the median snapshot by wall-clock time from a sorted list.
+   */
+  private getMedianSnapshot(snapshots: PerfSnapshot[]): PerfSnapshot {
+    if (snapshots.length === 0) {
+      throw new Error('Cannot compute median of empty snapshot list');
+    }
+
+    const sorted = [...snapshots].sort((a, b) => a.wallClockMs - b.wallClockMs);
+    const medianIdx = Math.floor(sorted.length / 2);
+    return { ...sorted[medianIdx]! };
+  }
+}
+
+// ─── Baseline management ─────────────────────────────────────────────
+
+/**
+ * Load perf baselines from a JSON file.
+ */
+export function loadPerfBaselines(path: string): PerfBaselineFile {
+  if (!existsSync(path)) {
+    return {
+      version: 1,
+      updatedAt: new Date().toISOString(),
+      scenarios: {},
+    };
+  }
+
+  const content = readFileSync(path, 'utf-8');
+  return JSON.parse(content) as PerfBaselineFile;
+}
+
+/**
+ * Save perf baselines to a JSON file.
+ */
+export function savePerfBaselines(
+  path: string,
+  baselines: PerfBaselineFile,
+): void {
+  baselines.updatedAt = new Date().toISOString();
+  writeFileSync(path, JSON.stringify(baselines, null, 2) + '\n');
+}
+
+/**
+ * Update (or create) a single scenario baseline in the file.
+ */
+export function updatePerfBaseline(
+  path: string,
+  scenarioName: string,
+  measured: {
+    wallClockMs: number;
+    cpuTotalUs: number;
+    eventLoopDelayP99Ms: number;
+  },
+): void {
+  const baselines = loadPerfBaselines(path);
+  baselines.scenarios[scenarioName] = {
+    wallClockMs: measured.wallClockMs,
+    cpuTotalUs: measured.cpuTotalUs,
+    eventLoopDelayP99Ms: measured.eventLoopDelayP99Ms,
+    timestamp: new Date().toISOString(),
+  };
+  savePerfBaselines(path, baselines);
+}
+
+// ─── Helpers ─────────────────────────────────────────────────────────
+
+/**
+ * Format microseconds as a human-readable string.
+ */
+function formatUs(us: number): string {
+  if (us > 1_000_000) {
+    return `${(us / 1_000_000).toFixed(2)} s`;
+  }
+  if (us > 1_000) {
+    return `${(us / 1_000).toFixed(1)} ms`;
+  }
+  return `${us} μs`;
+}