feat(test-utils): add CPU performance integration test harness (#24951)

Sri Pasumarthi
2026-04-08 14:50:29 -07:00
committed by GitHub
parent 5dc9a7a0b7
commit b46eedda81
19 changed files with 1081 additions and 13 deletions
@@ -0,0 +1,121 @@
# CPU Performance Integration Test Harness
## Overview
This directory contains performance/CPU integration tests for the Gemini CLI.
These tests measure wall-clock time, CPU usage, and event loop responsiveness to
detect regressions across key scenarios.
CPU performance is inherently noisy, especially in CI. The harness addresses
this with:
- **IQR outlier filtering** — discards anomalous samples
- **Median sampling** — takes N runs, reports the median after filtering
- **Warmup runs** — discards the first run to mitigate JIT compilation noise
- **15% default tolerance** — small run-to-run variation does not fail the build
## Running
```bash
# Run tests (compare against committed baselines)
npm run test:perf
# Update baselines (after intentional changes)
npm run test:perf:update-baselines
# Verbose output
VERBOSE=true npm run test:perf
# Keep test artifacts for debugging
KEEP_OUTPUT=true npm run test:perf
```
## How It Works
### Measurement Primitives
The `PerfTestHarness` class (in `packages/test-utils`) provides:
- **`performance.now()`** — high-resolution wall-clock timing
- **`process.cpuUsage()`** — user + system CPU microseconds (delta between
start/stop)
- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram
(p50/p95/p99/max)
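Per sample, these combine roughly as follows. This is a minimal sketch, not the actual harness internals; `measureOnce` is a hypothetical name:

```ts
import { monitorEventLoopDelay } from 'node:perf_hooks';

// Hypothetical sketch of a single sample; the real PerfTestHarness
// wraps this logic with warmup, repetition, and filtering.
async function measureOnce(fn: () => Promise<void>) {
  const histogram = monitorEventLoopDelay({ resolution: 10 });
  histogram.enable();
  const cpuStart = process.cpuUsage(); // cumulative user/system µs so far
  const wallStart = performance.now();

  await fn();

  const wallClockMs = performance.now() - wallStart;
  const cpu = process.cpuUsage(cpuStart); // delta relative to cpuStart
  histogram.disable();

  return {
    wallClockMs,
    cpuTotalUs: cpu.user + cpu.system,
    eventLoopDelayP99Ms: histogram.percentile(99) / 1e6, // histogram is in ns
  };
}
```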
### Noise Reduction
1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts
2. **Multiple samples**: Each scenario runs N times (default 5)
3. **IQR filtering**: Samples outside Q1 − 1.5×IQR and Q3 + 1.5×IQR are discarded
4. **Median**: The median of remaining samples is used for comparison
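Steps 3–4 amount to a helper like the following (hypothetical code, shown for clarity; the real filtering lives inside `PerfTestHarness`):

```ts
// Discard samples outside [Q1 − 1.5×IQR, Q3 + 1.5×IQR], then take the median.
// Uses a simple nearest-rank quantile, which is adequate for small N.
function medianAfterIqrFilter(samples: number[]): number {
  const sorted = [...samples].sort((a, b) => a - b);
  const q = (p: number) => sorted[Math.floor((sorted.length - 1) * p)];
  const q1 = q(0.25);
  const q3 = q(0.75);
  const iqr = q3 - q1;
  const kept = sorted.filter((s) => s >= q1 - 1.5 * iqr && s <= q3 + 1.5 * iqr);
  const mid = Math.floor(kept.length / 2);
  return kept.length % 2 ? kept[mid] : (kept[mid - 1] + kept[mid]) / 2;
}
```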
### Baseline Management
Baselines are stored in `baselines.json` in this directory. Each scenario has:
```json
{
"cold-startup-time": {
"wallClockMs": 1234.5,
"cpuTotalUs": 567890,
"eventLoopDelayP99Ms": 12.3,
"timestamp": "2026-04-08T..."
}
}
```
Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance).
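The rule is roughly equivalent to this check (a sketch; `assertWithinTolerance` is an illustrative name, and the harness's `assertWithinBaseline` applies the same idea per metric):

```ts
// Fail when a measured metric exceeds baseline plus tolerance.
function assertWithinTolerance(
  measured: number,
  baseline: number,
  tolerancePercent = 15,
): void {
  const limit = baseline * (1 + tolerancePercent / 100);
  if (measured > limit) {
    throw new Error(
      `Perf regression: ${measured.toFixed(1)} > allowed ${limit.toFixed(1)} ` +
        `(baseline ${baseline.toFixed(1)}, tolerance ${tolerancePercent}%)`,
    );
  }
}
```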
To recalibrate after intentional changes:
```bash
npm run test:perf:update-baselines
# then commit baselines.json
```
### Report Output
After all tests complete, the harness prints a plain-text summary:
```
═══════════════════════════════════════════════════
PERFORMANCE TEST REPORT
═══════════════════════════════════════════════════
cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅
idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅
skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅
```
## Architecture
```
perf-tests/
├── README.md ← you are here
├── baselines.json ← committed baseline values
├── globalSetup.ts ← test environment setup
├── perf-usage.test.ts ← test scenarios
├── perf.*.responses ← fake API responses per scenario
├── tsconfig.json ← TypeScript config
└── vitest.config.ts ← vitest config (serial, isolated)
packages/test-utils/src/
├── perf-test-harness.ts ← PerfTestHarness class
└── index.ts ← re-exports
```
## CI Integration
These tests are **excluded from `preflight`** and designed for nightly CI:
```yaml
- name: Performance regression tests
run: npm run test:perf
```
## Adding a New Scenario
1. Add a fake response file: `perf.<scenario-name>.responses`
2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()`
3. Run `npm run test:perf:update-baselines` to establish initial baseline
4. Commit the updated `baselines.json`
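
A new test case typically mirrors the existing ones in `perf-usage.test.ts`; here is a sketch using a hypothetical `my-scenario` name:

```ts
it('my-scenario: completes within baseline', async () => {
  const result = await harness.runScenario('my-scenario', async () => {
    const rig = new TestRig();
    try {
      rig.setup('perf-my-scenario', {
        fakeResponsesPath: join(__dirname, 'perf.my-scenario.responses'),
      });
      // Measure only the CLI invocation, not rig setup/teardown.
      return await harness.measure('my-scenario', async () => {
        await rig.run({
          args: ['hello'],
          timeout: 120000,
          env: { GEMINI_API_KEY: 'fake-perf-test-key' },
        });
      });
    } finally {
      await rig.cleanup();
    }
  });
  if (UPDATE_BASELINES) {
    harness.updateScenarioBaseline(result);
  } else {
    harness.assertWithinBaseline(result);
  }
});
```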
@@ -0,0 +1,24 @@
{
"version": 1,
"updatedAt": "2026-04-08T18:51:29.839Z",
"scenarios": {
"cold-startup-time": {
"wallClockMs": 1333.4230420000004,
"cpuTotalUs": 1711,
"eventLoopDelayP99Ms": 0,
"timestamp": "2026-04-08T18:50:58.124Z"
},
"idle-cpu-usage": {
"wallClockMs": 5001.926125,
"cpuTotalUs": 128518,
"eventLoopDelayP99Ms": 12.705791,
"timestamp": "2026-04-08T18:51:23.938Z"
},
"skill-loading-time": {
"wallClockMs": 1372.4463749999995,
"cpuTotalUs": 1550,
"eventLoopDelayP99Ms": 0,
"timestamp": "2026-04-08T18:51:29.839Z"
}
}
}
@@ -0,0 +1,67 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { mkdir, readdir, rm } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const rootDir = join(__dirname, '..');
const perfTestsDir = join(rootDir, '.perf-tests');
const KEEP_RUNS_COUNT = 5;
let runDir = '';
export async function setup() {
runDir = join(perfTestsDir, `${Date.now()}`);
await mkdir(runDir, { recursive: true });
// Isolate environment variables
isolateTestEnv(runDir);
// Download ripgrep to avoid race conditions
const available = await canUseRipgrep();
if (!available) {
throw new Error('Failed to download ripgrep binary');
}
// Clean up old test runs, keeping the latest few for debugging
try {
const testRuns = await readdir(perfTestsDir);
if (testRuns.length > KEEP_RUNS_COUNT) {
const oldRuns = testRuns
.sort()
.slice(0, testRuns.length - KEEP_RUNS_COUNT);
await Promise.all(
oldRuns.map((oldRun) =>
rm(join(perfTestsDir, oldRun), {
recursive: true,
force: true,
}),
),
);
}
} catch (e) {
console.error('Error cleaning up old perf test runs:', e);
}
process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';
console.log(`\nPerf test output directory: ${runDir}`);
}
export async function teardown() {
// Cleanup unless KEEP_OUTPUT is set
if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
try {
await rm(runDir, { recursive: true, force: true });
} catch (e) {
console.warn('Failed to clean up perf test directory:', e);
}
}
}
@@ -0,0 +1,153 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, beforeAll, afterAll } from 'vitest';
import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
const __dirname = dirname(fileURLToPath(import.meta.url));
const BASELINES_PATH = join(__dirname, 'baselines.json');
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
const TOLERANCE_PERCENT = 15;
// Use fewer samples locally for faster iteration, more in CI
const SAMPLE_COUNT = process.env['CI'] ? 5 : 3;
const WARMUP_COUNT = 1;
describe('CPU Performance Tests', () => {
let harness: PerfTestHarness;
beforeAll(() => {
harness = new PerfTestHarness({
baselinesPath: BASELINES_PATH,
defaultTolerancePercent: TOLERANCE_PERCENT,
sampleCount: SAMPLE_COUNT,
warmupCount: WARMUP_COUNT,
});
});
afterAll(async () => {
// Generate the summary report after all tests
await harness.generateReport();
});
it('cold-startup-time: startup completes within baseline', async () => {
const result = await harness.runScenario('cold-startup-time', async () => {
const rig = new TestRig();
try {
rig.setup('perf-cold-startup', {
fakeResponsesPath: join(__dirname, 'perf.cold-startup.responses'),
});
return await harness.measure('cold-startup', async () => {
await rig.run({
args: ['hello'],
timeout: 120000,
env: { GEMINI_API_KEY: 'fake-perf-test-key' },
});
});
} finally {
await rig.cleanup();
}
});
if (UPDATE_BASELINES) {
harness.updateScenarioBaseline(result);
} else {
harness.assertWithinBaseline(result);
}
});
it('idle-cpu-usage: CPU stays low when idle', async () => {
const IDLE_OBSERVATION_MS = 5000;
const result = await harness.runScenario('idle-cpu-usage', async () => {
const rig = new TestRig();
try {
rig.setup('perf-idle-cpu', {
fakeResponsesPath: join(__dirname, 'perf.idle-cpu.responses'),
});
// First, run a prompt to get the CLI into idle state
await rig.run({
args: ['hello'],
timeout: 120000,
env: { GEMINI_API_KEY: 'fake-perf-test-key' },
});
// Now measure CPU during idle period in the test process
return await harness.measureWithEventLoop('idle-cpu', async () => {
// Simulate idle period — just wait
const { setTimeout: sleep } = await import('node:timers/promises');
await sleep(IDLE_OBSERVATION_MS);
});
} finally {
await rig.cleanup();
}
});
if (UPDATE_BASELINES) {
harness.updateScenarioBaseline(result);
} else {
harness.assertWithinBaseline(result);
}
});
it('skill-loading-time: startup with many skills within baseline', async () => {
const SKILL_COUNT = 20;
const result = await harness.runScenario('skill-loading-time', async () => {
const rig = new TestRig();
try {
rig.setup('perf-skill-loading', {
fakeResponsesPath: join(__dirname, 'perf.skill-loading.responses'),
});
// Create many skill directories with SKILL.md files
for (let i = 0; i < SKILL_COUNT; i++) {
const skillDir = `.gemini/skills/perf-skill-${i}`;
rig.mkdir(skillDir);
rig.createFile(
`${skillDir}/SKILL.md`,
[
'---',
`name: perf-skill-${i}`,
`description: Performance test skill number ${i}`,
`activation: manual`,
'---',
'',
`# Performance Test Skill ${i}`,
'',
`This is a test skill for measuring skill loading performance.`,
`It contains some content to simulate real-world skill files.`,
'',
`## Usage`,
'',
`Use this skill by activating it with @perf-skill-${i}.`,
].join('\n'),
);
}
return await harness.measure('skill-loading', async () => {
await rig.run({
args: ['hello'],
timeout: 120000,
env: { GEMINI_API_KEY: 'fake-perf-test-key' },
});
});
} finally {
await rig.cleanup();
}
});
if (UPDATE_BASELINES) {
harness.updateScenarioBaseline(result);
} else {
harness.assertWithinBaseline(result);
}
});
});
@@ -0,0 +1,2 @@
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help. What would you like to work on?"}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":12,"totalTokenCount":17,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}
@@ -0,0 +1,2 @@
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":8,"totalTokenCount":13,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}
@@ -0,0 +1,2 @@
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to assist you with your project."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":10,"totalTokenCount":15,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}
+12
View File
@@ -0,0 +1,12 @@
{
"extends": "../tsconfig.json",
"compilerOptions": {
"noEmit": true,
"allowJs": true
},
"include": ["**/*.ts"],
"references": [
{ "path": "../packages/core" },
{ "path": "../packages/test-utils" }
]
}
@@ -0,0 +1,27 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { defineConfig } from 'vitest/config';
export default defineConfig({
test: {
testTimeout: 600000, // 10 minutes — performance profiling needs time for multiple samples
globalSetup: './globalSetup.ts',
reporters: ['default'],
include: ['**/*.test.ts'],
retry: 0, // No retries — noise is handled by IQR filtering and tolerance
fileParallelism: false, // Must run serially to avoid CPU contention
pool: 'forks',
poolOptions: {
forks: {
singleFork: true, // Single process for accurate per-test CPU readings
},
},
env: {
GEMINI_TEST_TYPE: 'perf',
},
},
});