feat(test): refactor the memory usage test to use metrics from CLI process instead of test runner (#25708)

This commit is contained in:
cynthialong0-0
2026-04-21 10:06:22 -07:00
committed by GitHub
parent 2c14954010
commit aee2cde1a3
5 changed files with 284 additions and 223 deletions
+12 -12
View File
@@ -10,10 +10,10 @@ import { readFileSync, writeFileSync, existsSync } from 'node:fs';
* Baseline entry for a single memory test scenario.
*/
export interface MemoryBaseline {
heapUsedBytes: number;
heapTotalBytes: number;
rssBytes: number;
externalBytes: number;
heapUsedMB: number;
heapTotalMB: number;
rssMB: number;
externalMB: number;
timestamp: string;
}
@@ -61,18 +61,18 @@ export function updateBaseline(
path: string,
scenarioName: string,
measured: {
heapUsedBytes: number;
heapTotalBytes: number;
rssBytes: number;
externalBytes: number;
heapUsedMB: number;
heapTotalMB: number;
rssMB: number;
externalMB: number;
},
): void {
const baselines = loadBaselines(path);
baselines.scenarios[scenarioName] = {
heapUsedBytes: measured.heapUsedBytes,
heapTotalBytes: measured.heapTotalBytes,
rssBytes: measured.rssBytes,
externalBytes: measured.externalBytes,
heapUsedMB: measured.heapUsedMB,
heapTotalMB: measured.heapTotalMB,
rssMB: measured.rssMB,
externalMB: measured.externalMB,
timestamp: new Date().toISOString(),
};
saveBaselines(path, baselines);
+78 -166
View File
@@ -4,10 +4,9 @@
* SPDX-License-Identifier: Apache-2.0
*/
import v8 from 'node:v8';
import { setTimeout as sleep } from 'node:timers/promises';
import { loadBaselines, updateBaseline } from './memory-baselines.js';
import type { MemoryBaseline, MemoryBaselineFile } from './memory-baselines.js';
import type { TestRig } from './test-rig.js';
/** Configuration for asciichart plot function. */
interface PlotConfig {
@@ -28,9 +27,6 @@ export interface MemorySnapshot {
heapTotal: number;
rss: number;
external: number;
arrayBuffers: number;
heapSizeLimit: number;
heapSpaces: any[];
}
/**
@@ -64,16 +60,13 @@ export interface MemoryTestHarnessOptions {
gcDelayMs?: number;
/** Number of samples to take for median calculation. Default: 3 */
sampleCount?: number;
/** Pause in ms between samples. Default: 50 */
samplePauseMs?: number;
}
/**
* MemoryTestHarness provides infrastructure for running memory usage tests.
*
* It handles:
* - Forcing V8 garbage collection to reduce noise
* - Taking V8 heap snapshots for accurate memory measurement
* - Extracting memory metrics from CLI process telemetry
* - Comparing against baselines with configurable tolerance
* - Generating ASCII chart reports of memory trends
*/
@@ -81,88 +74,45 @@ export class MemoryTestHarness {
private baselines: MemoryBaselineFile;
private readonly baselinesPath: string;
private readonly defaultTolerancePercent: number;
private readonly gcCycles: number;
private readonly gcDelayMs: number;
private readonly sampleCount: number;
private readonly samplePauseMs: number;
private allResults: MemoryTestResult[] = [];
constructor(options: MemoryTestHarnessOptions) {
this.baselinesPath = options.baselinesPath;
this.defaultTolerancePercent = options.defaultTolerancePercent ?? 10;
this.gcCycles = options.gcCycles ?? 3;
this.gcDelayMs = options.gcDelayMs ?? 100;
this.sampleCount = options.sampleCount ?? 3;
this.samplePauseMs = options.samplePauseMs ?? 50;
this.baselines = loadBaselines(this.baselinesPath);
}
/**
* Force garbage collection multiple times and take a V8 heap snapshot.
* Forces GC multiple times with delays to allow weak references and
* FinalizationRegistry callbacks to run, reducing measurement noise.
* Extract memory snapshot from TestRig telemetry.
*/
async takeSnapshot(label: string = 'snapshot'): Promise<MemorySnapshot> {
await this.forceGC();
const memUsage = process.memoryUsage();
const heapStats = v8.getHeapStatistics();
return {
timestamp: Date.now(),
label,
heapUsed: memUsage.heapUsed,
heapTotal: memUsage.heapTotal,
rss: memUsage.rss,
external: memUsage.external,
arrayBuffers: memUsage.arrayBuffers,
heapSizeLimit: heapStats.heap_size_limit,
heapSpaces: v8.getHeapSpaceStatistics(),
};
}
/**
* Take multiple snapshot samples and return the median to reduce noise.
*/
async takeMedianSnapshot(
label: string = 'median',
count?: number,
async takeSnapshot(
rig: TestRig,
label: string = 'snapshot',
strategy: 'peak' | 'last' = 'last',
): Promise<MemorySnapshot> {
const samples: MemorySnapshot[] = [];
const numSamples = count ?? this.sampleCount;
for (let i = 0; i < numSamples; i++) {
samples.push(await this.takeSnapshot(`${label}_sample_${i}`));
if (i < numSamples - 1) {
await sleep(this.samplePauseMs);
}
}
// Sort by heapUsed and take the median
samples.sort((a, b) => a.heapUsed - b.heapUsed);
const medianIdx = Math.floor(samples.length / 2);
const median = samples[medianIdx]!;
const metrics = rig.readMemoryMetrics(strategy);
return {
...median,
timestamp: metrics.timestamp,
label,
timestamp: Date.now(),
heapUsed: metrics.heapUsed,
heapTotal: metrics.heapTotal,
rss: metrics.rss,
external: metrics.external,
};
}
/**
* Run a memory test scenario.
*
* Takes before/after snapshots around the scenario function, collects
* intermediate snapshots if the scenario provides them, and compares
* the result against the stored baseline.
*
* @param rig - The TestRig instance running the CLI
* @param name - Scenario name (must match baseline key)
* @param fn - Async function that executes the scenario. Receives a
* `recordSnapshot` callback for recording intermediate snapshots.
* @param tolerancePercent - Override default tolerance for this scenario
*/
async runScenario(
rig: TestRig,
name: string,
fn: (
recordSnapshot: (label: string) => Promise<MemorySnapshot>,
@@ -172,27 +122,49 @@ export class MemoryTestHarness {
const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
const snapshots: MemorySnapshot[] = [];
// Record initial snapshot
const beforeSnap = await this.takeSnapshot(rig, 'before');
snapshots.push(beforeSnap);
// Record a callback for intermediate snapshots
const recordSnapshot = async (label: string): Promise<MemorySnapshot> => {
const snap = await this.takeMedianSnapshot(label);
// Small delay to allow telemetry to flush if needed
await rig.waitForTelemetryReady();
const snap = await this.takeSnapshot(rig, label);
snapshots.push(snap);
return snap;
};
// Before snapshot
const beforeSnap = await this.takeMedianSnapshot('before');
snapshots.push(beforeSnap);
// Run the scenario
await fn(recordSnapshot);
// After snapshot (median of multiple samples)
const afterSnap = await this.takeMedianSnapshot('after');
// Final wait for telemetry to ensure everything is flushed
await rig.waitForTelemetryReady();
// After snapshot
const afterSnap = await this.takeSnapshot(rig, 'after');
snapshots.push(afterSnap);
// Calculate peak values
const peakHeapUsed = Math.max(...snapshots.map((s) => s.heapUsed));
const peakRss = Math.max(...snapshots.map((s) => s.rss));
// Calculate peak values from ALL snapshots seen during the scenario
const allSnapshots = rig.readAllMemorySnapshots();
const scenarioSnapshots = allSnapshots.filter(
(s) =>
s.timestamp >= beforeSnap.timestamp &&
s.timestamp <= afterSnap.timestamp,
);
const peakHeapUsed = Math.max(
...scenarioSnapshots.map((s) => s.heapUsed),
...snapshots.map((s) => s.heapUsed),
);
const peakRss = Math.max(
...scenarioSnapshots.map((s) => s.rss),
...snapshots.map((s) => s.rss),
);
const peakExternal = Math.max(
...scenarioSnapshots.map((s) => s.external),
...snapshots.map((s) => s.external),
);
// Get baseline
const baseline = this.baselines.scenarios[name];
@@ -202,15 +174,12 @@ export class MemoryTestHarness {
let withinTolerance = true;
if (baseline) {
const measuredMB = afterSnap.heapUsed / (1024 * 1024);
deltaPercent =
((afterSnap.heapUsed - baseline.heapUsedBytes) /
baseline.heapUsedBytes) *
100;
((measuredMB - baseline.heapUsedMB) / baseline.heapUsedMB) * 100;
withinTolerance = deltaPercent <= tolerance;
}
const peakExternal = Math.max(...snapshots.map((s) => s.external));
const result: MemoryTestResult = {
scenarioName: name,
snapshots,
@@ -248,16 +217,16 @@ export class MemoryTestHarness {
return; // Don't fail if no baseline exists yet
}
const measuredMB = result.finalHeapUsed / (1024 * 1024);
const deltaPercent =
((result.finalHeapUsed - result.baseline.heapUsedBytes) /
result.baseline.heapUsedBytes) *
((measuredMB - result.baseline.heapUsedMB) / result.baseline.heapUsedMB) *
100;
if (deltaPercent > tolerance) {
throw new Error(
`Memory regression detected for "${result.scenarioName}"!\n` +
` Measured: ${formatMB(result.finalHeapUsed)} heap used\n` +
` Baseline: ${formatMB(result.baseline.heapUsedBytes)} heap used\n` +
` Baseline: ${result.baseline.heapUsedMB.toFixed(1)} MB heap used\n` +
` Delta: ${deltaPercent.toFixed(1)}% (tolerance: ${tolerance}%)\n` +
` Peak heap: ${formatMB(result.peakHeapUsed)}\n` +
` Peak RSS: ${formatMB(result.peakRss)}\n` +
@@ -270,20 +239,22 @@ export class MemoryTestHarness {
* Update the baseline for a scenario with the current measured values.
*/
updateScenarioBaseline(result: MemoryTestResult): void {
const lastSnapshot = result.snapshots[result.snapshots.length - 1];
updateBaseline(this.baselinesPath, result.scenarioName, {
heapUsedBytes: result.finalHeapUsed,
heapTotalBytes:
result.snapshots[result.snapshots.length - 1]?.heapTotal ?? 0,
rssBytes: result.finalRss,
externalBytes: result.finalExternal,
heapUsedMB: Number((result.finalHeapUsed / (1024 * 1024)).toFixed(1)),
heapTotalMB: Number(
((lastSnapshot?.heapTotal ?? 0) / (1024 * 1024)).toFixed(1),
),
rssMB: Number((result.finalRss / (1024 * 1024)).toFixed(1)),
externalMB: Number((result.finalExternal / (1024 * 1024)).toFixed(1)),
});
// Reload baselines after update
this.baselines = loadBaselines(this.baselinesPath);
}
/**
* Analyze snapshots to detect sustained leaks across 3 snapshots.
* A leak is flagged if growth is observed in both phases for any heap space.
* Analyze snapshots to detect sustained leaks.
* A leak is flagged if growth is observed in both phases.
*/
analyzeSnapshots(
snapshots: MemorySnapshot[],
@@ -297,55 +268,20 @@ export class MemoryTestHarness {
const snap2 = snapshots[snapshots.length - 2];
const snap3 = snapshots[snapshots.length - 1];
if (!snap1 || !snap2 || !snap3) {
return { leaked: false, message: 'Missing snapshots' };
}
const growth1 = snap2.heapUsed - snap1.heapUsed;
const growth2 = snap3.heapUsed - snap2.heapUsed;
const spaceNames = new Set<string>();
snap1.heapSpaces.forEach((s: any) => spaceNames.add(s.space_name));
snap2.heapSpaces.forEach((s: any) => spaceNames.add(s.space_name));
snap3.heapSpaces.forEach((s: any) => spaceNames.add(s.space_name));
const leaked = growth1 > thresholdBytes && growth2 > thresholdBytes;
let message = leaked
? `Memory bloat detected: sustained growth (${formatMB(growth1)} -> ${formatMB(growth2)})`
: `No sustained growth detected above threshold.`;
let hasSustainedGrowth = false;
const growthDetails: string[] = [];
for (const name of spaceNames) {
const size1 =
snap1.heapSpaces.find((s: any) => s.space_name === name)
?.space_used_size ?? 0;
const size2 =
snap2.heapSpaces.find((s: any) => s.space_name === name)
?.space_used_size ?? 0;
const size3 =
snap3.heapSpaces.find((s: any) => s.space_name === name)
?.space_used_size ?? 0;
const growth1 = size2 - size1;
const growth2 = size3 - size2;
if (growth1 > thresholdBytes && growth2 > thresholdBytes) {
hasSustainedGrowth = true;
growthDetails.push(
`${name}: sustained growth (${formatMB(growth1)} -> ${formatMB(growth2)})`,
);
}
}
let message = '';
if (hasSustainedGrowth) {
message =
`Memory bloat detected in heap spaces:\n ` +
growthDetails.join('\n ');
} else {
message = `No sustained growth detected in any heap space above threshold.`;
}
return { leaked: hasSustainedGrowth, message };
return { leaked, message };
}
/**
* Assert that memory returns to a baseline level after a peak.
* Useful for verifying that large tool outputs are not retained.
* Useful for verifying that large tool outputs or history are not retained.
*/
assertMemoryReturnsToBaseline(
snapshots: MemorySnapshot[],
@@ -355,26 +291,22 @@ export class MemoryTestHarness {
throw new Error('Need at least 3 snapshots to check return to baseline');
}
const baseline = snapshots[0]; // Assume first is baseline
const peak = snapshots.reduce(
(max, s) => (s.heapUsed > max.heapUsed ? s : max),
snapshots[0],
);
const final = snapshots[snapshots.length - 1];
if (!baseline || !peak || !final) {
throw new Error('Missing snapshots for return to baseline check');
// Find the first non-zero snapshot as baseline
const baseline = snapshots.find((s) => s.heapUsed > 0);
if (!baseline) {
return; // No memory reported yet
}
const final = snapshots[snapshots.length - 1]!;
const tolerance = baseline.heapUsed * (tolerancePercent / 100);
const delta = final.heapUsed - baseline.heapUsed;
if (delta > tolerance) {
throw new Error(
`Memory did not return to baseline!\n` +
` Baseline: ${formatMB(baseline.heapUsed)}\n` +
` Peak: ${formatMB(peak.heapUsed)}\n` +
` Final: ${formatMB(final.heapUsed)}\n` +
` Baseline: ${formatMB(baseline.heapUsed)} (${baseline.label})\n` +
` Final: ${formatMB(final.heapUsed)} (${final.label})\n` +
` Delta: ${formatMB(delta)} (tolerance: ${formatMB(tolerance)})`,
);
}
@@ -397,7 +329,7 @@ export class MemoryTestHarness {
for (const result of resultsToReport) {
const measured = formatMB(result.finalHeapUsed);
const baseline = result.baseline
? formatMB(result.baseline.heapUsedBytes)
? `${result.baseline.heapUsedMB.toFixed(1)} MB`
: 'N/A';
const delta = result.baseline
? `${result.deltaPercent >= 0 ? '+' : ''}${result.deltaPercent.toFixed(1)}%`
@@ -461,26 +393,6 @@ export class MemoryTestHarness {
console.log(report);
return report;
}
/**
 * Force V8 garbage collection.
 * Runs multiple GC cycles with delays to allow weak references
 * and FinalizationRegistry callbacks to run.
 *
 * Requires the test runner to be started with `--expose-gc`; otherwise
 * `globalThis.gc` is undefined and this method throws rather than
 * silently producing noisy measurements.
 */
private async forceGC(): Promise<void> {
if (typeof globalThis.gc !== 'function') {
throw new Error(
'global.gc() not available. Run with --expose-gc for accurate measurements.',
);
}
// Run gcCycles collections; sleep between cycles (but not after the
// last one) so finalizers scheduled by a cycle can run before the next.
for (let i = 0; i < this.gcCycles; i++) {
globalThis.gc();
if (i < this.gcCycles - 1) {
await sleep(this.gcDelayMs);
}
}
}
}
/**
+128 -1
View File
@@ -1475,7 +1475,7 @@ export class TestRig {
readMetric(metricName: string): TelemetryMetric | null {
const logs = this._readAndParseTelemetryLog();
for (const logData of logs) {
if (logData.scopeMetrics) {
if (logData && logData.scopeMetrics) {
for (const scopeMetric of logData.scopeMetrics) {
for (const metric of scopeMetric.metrics) {
if (metric.descriptor.name === `gemini_cli.${metricName}`) {
@@ -1488,6 +1488,133 @@ export class TestRig {
return null;
}
/**
 * Aggregate the CLI process's telemetry memory snapshots into a single
 * measurement.
 *
 * @param strategy - 'last' returns the most recent snapshot;
 *   'peak' (default) returns the snapshot with the highest RSS, falling
 *   back to the highest heapUsed when no snapshot reported a non-zero RSS.
 * @returns One snapshot (timestamp plus heap/RSS/external byte counts).
 *   When no telemetry snapshots exist yet, returns all-zero values with
 *   the current wall-clock time so callers get a well-formed result.
 */
readMemoryMetrics(strategy: 'peak' | 'last' = 'peak'): {
timestamp: number;
heapUsed: number;
heapTotal: number;
rss: number;
external: number;
} {
const snapshots = this._getMemorySnapshots();
// No telemetry emitted yet: return a zeroed placeholder rather than
// throwing, so early-in-test reads are harmless.
if (snapshots.length === 0) {
return {
timestamp: Date.now(),
heapUsed: 0,
heapTotal: 0,
rss: 0,
external: 0,
};
}
if (strategy === 'last') {
// Snapshots are sorted by timestamp (see _getMemorySnapshots), so the
// final element is the most recent measurement.
const last = snapshots[snapshots.length - 1];
return {
timestamp: last.timestamp,
heapUsed: last.heapUsed,
heapTotal: last.heapTotal,
rss: last.rss,
external: last.external,
};
}
// Find the snapshot with the highest RSS
let peak = snapshots[0];
for (const snapshot of snapshots) {
if (snapshot.rss > peak.rss) {
peak = snapshot;
}
}
// Fallback: if we didn't find any RSS but found heap, use the max heap
// (peak.rss === 0 here implies every snapshot reported rss === 0).
if (peak.rss === 0) {
for (const snapshot of snapshots) {
if (snapshot.heapUsed > peak.heapUsed) {
peak = snapshot;
}
}
}
return {
timestamp: peak.timestamp,
heapUsed: peak.heapUsed,
heapTotal: peak.heapTotal,
rss: peak.rss,
external: peak.external,
};
}
/**
 * Return every memory snapshot parsed from the telemetry log, sorted by
 * timestamp ascending. Thin public accessor over _getMemorySnapshots so
 * callers (e.g. the memory test harness) can compute peaks over a window.
 */
readAllMemorySnapshots(): {
timestamp: number;
heapUsed: number;
heapTotal: number;
rss: number;
external: number;
}[] {
return this._getMemorySnapshots();
}
/**
 * Parse the telemetry log and reassemble `gemini_cli.memory.usage` data
 * points into per-moment snapshots.
 *
 * Data points arrive one memory_type at a time ('heap_used', 'heap_total',
 * 'rss', 'external'); points sharing the same session id, component, and
 * startTime are merged into a single snapshot keyed on that tuple.
 * Missing types for a given moment stay 0.
 *
 * NOTE(review): assumes dp.startTime is an OTLP [seconds, nanos] pair and
 * that dp.value carries `max`/`sum` aggregations — verify against the
 * telemetry exporter's actual JSON shape.
 *
 * @returns Snapshots sorted by timestamp ascending (millisecond epoch).
 */
private _getMemorySnapshots(): {
timestamp: number;
heapUsed: number;
heapTotal: number;
rss: number;
external: number;
}[] {
// Accumulator keyed by "<session>-<component>-<seconds>-<nanos>" so the
// four memory_type data points for one instant land in one snapshot.
const snapshots: Record<
string,
{
timestamp: number;
heapUsed: number;
heapTotal: number;
rss: number;
external: number;
}
> = {};
const logs = this._readAndParseTelemetryLog();
for (const logData of logs) {
if (logData && logData.scopeMetrics) {
for (const scopeMetric of logData.scopeMetrics) {
for (const metric of scopeMetric.metrics) {
if (metric.descriptor.name === 'gemini_cli.memory.usage') {
for (const dp of metric.dataPoints) {
const sessionId =
(dp.attributes?.['session.id'] as string) || 'unknown';
const component =
(dp.attributes?.['component'] as string) || 'unknown';
const seconds = dp.startTime?.[0] || 0;
const nanos = dp.startTime?.[1] || 0;
const timeKey = `${sessionId}-${component}-${seconds}-${nanos}`;
if (!snapshots[timeKey]) {
snapshots[timeKey] = {
// Convert [seconds, nanos] to epoch milliseconds.
timestamp: seconds * 1000 + Math.floor(nanos / 1000000),
rss: 0,
heapUsed: 0,
heapTotal: 0,
external: 0,
};
}
const type = dp.attributes?.['memory_type'];
// Prefer the max aggregation when present, else sum, else 0.
const value = dp.value?.max ?? dp.value?.sum ?? 0;
if (type === 'heap_used') snapshots[timeKey].heapUsed = value;
else if (type === 'heap_total')
snapshots[timeKey].heapTotal = value;
else if (type === 'rss') snapshots[timeKey].rss = value;
else if (type === 'external')
snapshots[timeKey].external = value;
}
}
}
}
}
}
return Object.values(snapshots).sort((a, b) => a.timestamp - b.timestamp);
}
async runInteractive(options?: {
args?: string | string[];
approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';