feat(test-utils): add CPU performance integration test harness (#24951)
perf-tests/README.md
@@ -0,0 +1,121 @@

# CPU Performance Integration Test Harness

## Overview

This directory contains performance/CPU integration tests for the Gemini CLI.
These tests measure wall-clock time, CPU usage, and event loop responsiveness to
detect regressions across key scenarios.

CPU performance is inherently noisy, especially in CI. The harness addresses
this with:

- **IQR outlier filtering** — discards anomalous samples
- **Median sampling** — takes N runs, reports the median after filtering
- **Warmup runs** — discards the first run to mitigate JIT compilation noise
- **15% default tolerance** — absorbs small run-to-run variation instead of failing

## Running

```bash
# Run tests (compare against committed baselines)
npm run test:perf

# Update baselines (after intentional changes)
npm run test:perf:update-baselines

# Verbose output
VERBOSE=true npm run test:perf

# Keep test artifacts for debugging
KEEP_OUTPUT=true npm run test:perf
```

## How It Works

### Measurement Primitives

The `PerfTestHarness` class (in `packages/test-utils`) provides:

- **`performance.now()`** — high-resolution wall-clock timing
- **`process.cpuUsage()`** — user + system CPU microseconds (delta between
  start/stop)
- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram
  (p50/p95/p99/max)

The sketch below shows how a single sample combines them.
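
A minimal sketch, assuming an illustrative `measureOnce` helper (not the
harness API; the real class layers warmup, sampling, and filtering on top):

```ts
import { performance, monitorEventLoopDelay } from 'node:perf_hooks';

// Measure one run: wall-clock time, CPU delta, and event loop delay p99.
async function measureOnce(fn: () => Promise<void>) {
  const histogram = monitorEventLoopDelay({ resolution: 10 });
  histogram.enable();
  const cpuStart = process.cpuUsage();
  const wallStart = performance.now();

  await fn();

  const wallClockMs = performance.now() - wallStart;
  const cpu = process.cpuUsage(cpuStart); // delta relative to cpuStart
  histogram.disable();

  return {
    wallClockMs,
    cpuTotalUs: cpu.user + cpu.system, // microseconds
    eventLoopDelayP99Ms: histogram.percentile(99) / 1e6, // ns -> ms
  };
}
```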

### Noise Reduction

1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts
2. **Multiple samples**: Each scenario runs N times (default 5)
3. **IQR filtering**: Samples below Q1−1.5×IQR or above Q3+1.5×IQR are discarded
4. **Median**: The median of the remaining samples is used for comparison (see
   the sketch after this list)
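
A sketch of steps 3 and 4 in TypeScript, assuming a simple index-based
quartile estimate (function names are illustrative, not the harness API):

```ts
// Median of an already-sorted array.
function median(sorted: number[]): number {
  const mid = Math.floor(sorted.length / 2);
  return sorted.length % 2 === 0
    ? (sorted[mid - 1] + sorted[mid]) / 2
    : sorted[mid];
}

// Drop samples outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], then take the median.
function robustMedian(samples: number[]): number {
  const sorted = [...samples].sort((a, b) => a - b);
  // Simple quartile estimate; the real harness may interpolate differently.
  const q1 = sorted[Math.floor(sorted.length / 4)];
  const q3 = sorted[Math.floor((sorted.length * 3) / 4)];
  const iqr = q3 - q1;
  const kept = sorted.filter((s) => s >= q1 - 1.5 * iqr && s <= q3 + 1.5 * iqr);
  return median(kept);
}
```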

### Baseline Management

Baselines are stored in `baselines.json` in this directory. Each scenario has:

```json
{
  "cold-startup-time": {
    "wallClockMs": 1234.5,
    "cpuTotalUs": 567890,
    "eventLoopDelayP99Ms": 12.3,
    "timestamp": "2026-04-08T..."
  }
}
```

Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance).
For example, a baseline of 1200 ms allows measurements up to 1380 ms.
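
The comparison itself reduces to one multiplication. A sketch of the pass/fail
rule (illustrative only; the harness's `assertWithinBaseline` presumably also
handles missing baselines and feeds the report):

```ts
// A measured value passes if it does not exceed the baseline by more
// than tolerancePercent.
function isWithinBaseline(
  measured: number,
  baseline: number,
  tolerancePercent = 15,
): boolean {
  return measured <= baseline * (1 + tolerancePercent / 100);
}

isWithinBaseline(1234.5, 1200.0); // true:  +2.9%, under the 15% limit
isWithinBaseline(1450.0, 1200.0); // false: +20.8%, over the limit
```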

To recalibrate after intentional changes:

```bash
npm run test:perf:update-baselines
# then commit baselines.json
```

### Report Output

After all tests, the harness prints an ASCII summary:

```
═══════════════════════════════════════════════════
PERFORMANCE TEST REPORT
═══════════════════════════════════════════════════

cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅
idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅
skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅
```

## Architecture

```
perf-tests/
├── README.md          ← you are here
├── baselines.json     ← committed baseline values
├── globalSetup.ts     ← test environment setup
├── perf-usage.test.ts ← test scenarios
├── perf.*.responses   ← fake API responses per scenario
├── tsconfig.json      ← TypeScript config
└── vitest.config.ts   ← vitest config (serial, isolated)

packages/test-utils/src/
├── perf-test-harness.ts ← PerfTestHarness class
└── index.ts             ← re-exports
```

## CI Integration

These tests are **excluded from `preflight`** and designed for nightly CI:

```yaml
- name: Performance regression tests
  run: npm run test:perf
```

## Adding a New Scenario

1. Add a fake response file: `perf.<scenario-name>.responses`
2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()` (see
   the sketch after this list)
3. Run `npm run test:perf:update-baselines` to establish the initial baseline
4. Commit the updated `baselines.json`
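
A minimal sketch of step 2, modeled on the existing scenarios in
`perf-usage.test.ts`; the name `my-scenario` and its response file are
placeholders:

```ts
it('my-scenario: completes within baseline', async () => {
  const result = await harness.runScenario('my-scenario', async () => {
    const rig = new TestRig();
    try {
      rig.setup('perf-my-scenario', {
        fakeResponsesPath: join(__dirname, 'perf.my-scenario.responses'),
      });
      // Measure only the CLI invocation itself.
      return await harness.measure('my-scenario', async () => {
        await rig.run({
          args: ['hello'],
          timeout: 120000,
          env: { GEMINI_API_KEY: 'fake-perf-test-key' },
        });
      });
    } finally {
      await rig.cleanup();
    }
  });

  if (UPDATE_BASELINES) {
    harness.updateScenarioBaseline(result);
  } else {
    harness.assertWithinBaseline(result);
  }
});
```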

perf-tests/baselines.json
@@ -0,0 +1,24 @@

{
  "version": 1,
  "updatedAt": "2026-04-08T18:51:29.839Z",
  "scenarios": {
    "cold-startup-time": {
      "wallClockMs": 1333.4230420000004,
      "cpuTotalUs": 1711,
      "eventLoopDelayP99Ms": 0,
      "timestamp": "2026-04-08T18:50:58.124Z"
    },
    "idle-cpu-usage": {
      "wallClockMs": 5001.926125,
      "cpuTotalUs": 128518,
      "eventLoopDelayP99Ms": 12.705791,
      "timestamp": "2026-04-08T18:51:23.938Z"
    },
    "skill-loading-time": {
      "wallClockMs": 1372.4463749999995,
      "cpuTotalUs": 1550,
      "eventLoopDelayP99Ms": 0,
      "timestamp": "2026-04-08T18:51:29.839Z"
    }
  }
}

perf-tests/globalSetup.ts
@@ -0,0 +1,67 @@

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { mkdir, readdir, rm } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js';

const __dirname = dirname(fileURLToPath(import.meta.url));
const rootDir = join(__dirname, '..');
const perfTestsDir = join(rootDir, '.perf-tests');
const KEEP_RUNS_COUNT = 5;
let runDir = '';

export async function setup() {
  runDir = join(perfTestsDir, `${Date.now()}`);
  await mkdir(runDir, { recursive: true });

  // Isolate environment variables
  isolateTestEnv(runDir);

  // Download ripgrep to avoid race conditions
  const available = await canUseRipgrep();
  if (!available) {
    throw new Error('Failed to download ripgrep binary');
  }

  // Clean up old test runs, keeping the latest few for debugging
  try {
    const testRuns = await readdir(perfTestsDir);
    if (testRuns.length > KEEP_RUNS_COUNT) {
      const oldRuns = testRuns
        .sort()
        .slice(0, testRuns.length - KEEP_RUNS_COUNT);
      await Promise.all(
        oldRuns.map((oldRun) =>
          rm(join(perfTestsDir, oldRun), {
            recursive: true,
            force: true,
          }),
        ),
      );
    }
  } catch (e) {
    console.error('Error cleaning up old perf test runs:', e);
  }

  process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
  process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';

  console.log(`\nPerf test output directory: ${runDir}`);
}

export async function teardown() {
  // Cleanup unless KEEP_OUTPUT is set
  if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
    try {
      await rm(runDir, { recursive: true, force: true });
    } catch (e) {
      console.warn('Failed to clean up perf test directory:', e);
    }
  }
}

perf-tests/perf-usage.test.ts
@@ -0,0 +1,153 @@

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, it, beforeAll, afterAll } from 'vitest';
import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const BASELINES_PATH = join(__dirname, 'baselines.json');
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
const TOLERANCE_PERCENT = 15;

// Use fewer samples locally for faster iteration, more in CI
const SAMPLE_COUNT = process.env['CI'] ? 5 : 3;
const WARMUP_COUNT = 1;

describe('CPU Performance Tests', () => {
  let harness: PerfTestHarness;

  beforeAll(() => {
    harness = new PerfTestHarness({
      baselinesPath: BASELINES_PATH,
      defaultTolerancePercent: TOLERANCE_PERCENT,
      sampleCount: SAMPLE_COUNT,
      warmupCount: WARMUP_COUNT,
    });
  });

  afterAll(async () => {
    // Generate the summary report after all tests
    await harness.generateReport();
  });

  it('cold-startup-time: startup completes within baseline', async () => {
    const result = await harness.runScenario('cold-startup-time', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-cold-startup', {
          fakeResponsesPath: join(__dirname, 'perf.cold-startup.responses'),
        });

        return await harness.measure('cold-startup', async () => {
          await rig.run({
            args: ['hello'],
            timeout: 120000,
            env: { GEMINI_API_KEY: 'fake-perf-test-key' },
          });
        });
      } finally {
        await rig.cleanup();
      }
    });

    if (UPDATE_BASELINES) {
      harness.updateScenarioBaseline(result);
    } else {
      harness.assertWithinBaseline(result);
    }
  });

  it('idle-cpu-usage: CPU stays low when idle', async () => {
    const IDLE_OBSERVATION_MS = 5000;

    const result = await harness.runScenario('idle-cpu-usage', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-idle-cpu', {
          fakeResponsesPath: join(__dirname, 'perf.idle-cpu.responses'),
        });

        // First, run a prompt to get the CLI into idle state
        await rig.run({
          args: ['hello'],
          timeout: 120000,
          env: { GEMINI_API_KEY: 'fake-perf-test-key' },
        });

        // Now measure CPU during idle period in the test process
        return await harness.measureWithEventLoop('idle-cpu', async () => {
          // Simulate idle period — just wait
          const { setTimeout: sleep } = await import('node:timers/promises');
          await sleep(IDLE_OBSERVATION_MS);
        });
      } finally {
        await rig.cleanup();
      }
    });

    if (UPDATE_BASELINES) {
      harness.updateScenarioBaseline(result);
    } else {
      harness.assertWithinBaseline(result);
    }
  });

  it('skill-loading-time: startup with many skills within baseline', async () => {
    const SKILL_COUNT = 20;

    const result = await harness.runScenario('skill-loading-time', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-skill-loading', {
          fakeResponsesPath: join(__dirname, 'perf.skill-loading.responses'),
        });

        // Create many skill directories with SKILL.md files
        for (let i = 0; i < SKILL_COUNT; i++) {
          const skillDir = `.gemini/skills/perf-skill-${i}`;
          rig.mkdir(skillDir);
          rig.createFile(
            `${skillDir}/SKILL.md`,
            [
              '---',
              `name: perf-skill-${i}`,
              `description: Performance test skill number ${i}`,
              `activation: manual`,
              '---',
              '',
              `# Performance Test Skill ${i}`,
              '',
              `This is a test skill for measuring skill loading performance.`,
              `It contains some content to simulate real-world skill files.`,
              '',
              `## Usage`,
              '',
              `Use this skill by activating it with @perf-skill-${i}.`,
            ].join('\n'),
          );
        }

        return await harness.measure('skill-loading', async () => {
          await rig.run({
            args: ['hello'],
            timeout: 120000,
            env: { GEMINI_API_KEY: 'fake-perf-test-key' },
          });
        });
      } finally {
        await rig.cleanup();
      }
    });

    if (UPDATE_BASELINES) {
      harness.updateScenarioBaseline(result);
    } else {
      harness.assertWithinBaseline(result);
    }
  });
});

perf-tests/perf.cold-startup.responses
@@ -0,0 +1,2 @@

{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help. What would you like to work on?"}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":12,"totalTokenCount":17,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

perf-tests/perf.idle-cpu.responses
@@ -0,0 +1,2 @@

{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":8,"totalTokenCount":13,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

perf-tests/perf.skill-loading.responses
@@ -0,0 +1,2 @@

{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to assist you with your project."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":10,"totalTokenCount":15,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

perf-tests/tsconfig.json
@@ -0,0 +1,12 @@

{
  "extends": "../tsconfig.json",
  "compilerOptions": {
    "noEmit": true,
    "allowJs": true
  },
  "include": ["**/*.ts"],
  "references": [
    { "path": "../packages/core" },
    { "path": "../packages/test-utils" }
  ]
}

perf-tests/vitest.config.ts
@@ -0,0 +1,27 @@

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { defineConfig } from 'vitest/config';

export default defineConfig({
  test: {
    testTimeout: 600000, // 10 minutes — performance profiling needs time for multiple samples
    globalSetup: './globalSetup.ts',
    reporters: ['default'],
    include: ['**/*.test.ts'],
    retry: 0, // No retries — noise is handled by IQR filtering and tolerance
    fileParallelism: false, // Must run serially to avoid CPU contention
    pool: 'forks',
    poolOptions: {
      forks: {
        singleFork: true, // Single process for accurate per-test CPU readings
      },
    },
    env: {
      GEMINI_TEST_TYPE: 'perf',
    },
  },
});