feat(test): refactor the memory usage test to use metrics from CLI process instead of test runner (#25708)

This commit is contained in:
cynthialong0-0
2026-04-21 10:06:22 -07:00
committed by GitHub
parent 2c14954010
commit aee2cde1a3
5 changed files with 284 additions and 223 deletions
+78 -166
View File
@@ -4,10 +4,9 @@
* SPDX-License-Identifier: Apache-2.0
*/
import v8 from 'node:v8';
import { setTimeout as sleep } from 'node:timers/promises';
import { loadBaselines, updateBaseline } from './memory-baselines.js';
import type { MemoryBaseline, MemoryBaselineFile } from './memory-baselines.js';
import type { TestRig } from './test-rig.js';
/** Configuration for asciichart plot function. */
interface PlotConfig {
@@ -28,9 +27,6 @@ export interface MemorySnapshot {
heapTotal: number;
rss: number;
external: number;
arrayBuffers: number;
heapSizeLimit: number;
heapSpaces: any[];
}
/**
@@ -64,16 +60,13 @@ export interface MemoryTestHarnessOptions {
gcDelayMs?: number;
/** Number of samples to take for median calculation. Default: 3 */
sampleCount?: number;
/** Pause in ms between samples. Default: 50 */
samplePauseMs?: number;
}
/**
* MemoryTestHarness provides infrastructure for running memory usage tests.
*
* It handles:
* - Forcing V8 garbage collection to reduce noise
* - Taking V8 heap snapshots for accurate memory measurement
* - Extracting memory metrics from CLI process telemetry
* - Comparing against baselines with configurable tolerance
* - Generating ASCII chart reports of memory trends
*/
@@ -81,88 +74,45 @@ export class MemoryTestHarness {
private baselines: MemoryBaselineFile;
private readonly baselinesPath: string;
private readonly defaultTolerancePercent: number;
private readonly gcCycles: number;
private readonly gcDelayMs: number;
private readonly sampleCount: number;
private readonly samplePauseMs: number;
private allResults: MemoryTestResult[] = [];
constructor(options: MemoryTestHarnessOptions) {
this.baselinesPath = options.baselinesPath;
this.defaultTolerancePercent = options.defaultTolerancePercent ?? 10;
this.gcCycles = options.gcCycles ?? 3;
this.gcDelayMs = options.gcDelayMs ?? 100;
this.sampleCount = options.sampleCount ?? 3;
this.samplePauseMs = options.samplePauseMs ?? 50;
this.baselines = loadBaselines(this.baselinesPath);
}
/**
* Force garbage collection multiple times and take a V8 heap snapshot.
* Forces GC multiple times with delays to allow weak references and
* FinalizationRegistry callbacks to run, reducing measurement noise.
* Extract memory snapshot from TestRig telemetry.
*/
async takeSnapshot(label: string = 'snapshot'): Promise<MemorySnapshot> {
await this.forceGC();
const memUsage = process.memoryUsage();
const heapStats = v8.getHeapStatistics();
return {
timestamp: Date.now(),
label,
heapUsed: memUsage.heapUsed,
heapTotal: memUsage.heapTotal,
rss: memUsage.rss,
external: memUsage.external,
arrayBuffers: memUsage.arrayBuffers,
heapSizeLimit: heapStats.heap_size_limit,
heapSpaces: v8.getHeapSpaceStatistics(),
};
}
/**
* Take multiple snapshot samples and return the median to reduce noise.
*/
async takeMedianSnapshot(
label: string = 'median',
count?: number,
async takeSnapshot(
rig: TestRig,
label: string = 'snapshot',
strategy: 'peak' | 'last' = 'last',
): Promise<MemorySnapshot> {
const samples: MemorySnapshot[] = [];
const numSamples = count ?? this.sampleCount;
for (let i = 0; i < numSamples; i++) {
samples.push(await this.takeSnapshot(`${label}_sample_${i}`));
if (i < numSamples - 1) {
await sleep(this.samplePauseMs);
}
}
// Sort by heapUsed and take the median
samples.sort((a, b) => a.heapUsed - b.heapUsed);
const medianIdx = Math.floor(samples.length / 2);
const median = samples[medianIdx]!;
const metrics = rig.readMemoryMetrics(strategy);
return {
...median,
timestamp: metrics.timestamp,
label,
timestamp: Date.now(),
heapUsed: metrics.heapUsed,
heapTotal: metrics.heapTotal,
rss: metrics.rss,
external: metrics.external,
};
}
/**
* Run a memory test scenario.
*
* Takes before/after snapshots around the scenario function, collects
* intermediate snapshots if the scenario provides them, and compares
* the result against the stored baseline.
*
* @param rig - The TestRig instance running the CLI
* @param name - Scenario name (must match baseline key)
* @param fn - Async function that executes the scenario. Receives a
* `recordSnapshot` callback for recording intermediate snapshots.
* @param tolerancePercent - Override default tolerance for this scenario
*/
async runScenario(
rig: TestRig,
name: string,
fn: (
recordSnapshot: (label: string) => Promise<MemorySnapshot>,
@@ -172,27 +122,49 @@ export class MemoryTestHarness {
const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
const snapshots: MemorySnapshot[] = [];
// Record initial snapshot
const beforeSnap = await this.takeSnapshot(rig, 'before');
snapshots.push(beforeSnap);
// Record a callback for intermediate snapshots
const recordSnapshot = async (label: string): Promise<MemorySnapshot> => {
const snap = await this.takeMedianSnapshot(label);
// Small delay to allow telemetry to flush if needed
await rig.waitForTelemetryReady();
const snap = await this.takeSnapshot(rig, label);
snapshots.push(snap);
return snap;
};
// Before snapshot
const beforeSnap = await this.takeMedianSnapshot('before');
snapshots.push(beforeSnap);
// Run the scenario
await fn(recordSnapshot);
// After snapshot (median of multiple samples)
const afterSnap = await this.takeMedianSnapshot('after');
// Final wait for telemetry to ensure everything is flushed
await rig.waitForTelemetryReady();
// After snapshot
const afterSnap = await this.takeSnapshot(rig, 'after');
snapshots.push(afterSnap);
// Calculate peak values
const peakHeapUsed = Math.max(...snapshots.map((s) => s.heapUsed));
const peakRss = Math.max(...snapshots.map((s) => s.rss));
// Calculate peak values from ALL snapshots seen during the scenario
const allSnapshots = rig.readAllMemorySnapshots();
const scenarioSnapshots = allSnapshots.filter(
(s) =>
s.timestamp >= beforeSnap.timestamp &&
s.timestamp <= afterSnap.timestamp,
);
const peakHeapUsed = Math.max(
...scenarioSnapshots.map((s) => s.heapUsed),
...snapshots.map((s) => s.heapUsed),
);
const peakRss = Math.max(
...scenarioSnapshots.map((s) => s.rss),
...snapshots.map((s) => s.rss),
);
const peakExternal = Math.max(
...scenarioSnapshots.map((s) => s.external),
...snapshots.map((s) => s.external),
);
// Get baseline
const baseline = this.baselines.scenarios[name];
@@ -202,15 +174,12 @@ export class MemoryTestHarness {
let withinTolerance = true;
if (baseline) {
const measuredMB = afterSnap.heapUsed / (1024 * 1024);
deltaPercent =
((afterSnap.heapUsed - baseline.heapUsedBytes) /
baseline.heapUsedBytes) *
100;
((measuredMB - baseline.heapUsedMB) / baseline.heapUsedMB) * 100;
withinTolerance = deltaPercent <= tolerance;
}
const peakExternal = Math.max(...snapshots.map((s) => s.external));
const result: MemoryTestResult = {
scenarioName: name,
snapshots,
@@ -248,16 +217,16 @@ export class MemoryTestHarness {
return; // Don't fail if no baseline exists yet
}
const measuredMB = result.finalHeapUsed / (1024 * 1024);
const deltaPercent =
((result.finalHeapUsed - result.baseline.heapUsedBytes) /
result.baseline.heapUsedBytes) *
((measuredMB - result.baseline.heapUsedMB) / result.baseline.heapUsedMB) *
100;
if (deltaPercent > tolerance) {
throw new Error(
`Memory regression detected for "${result.scenarioName}"!\n` +
` Measured: ${formatMB(result.finalHeapUsed)} heap used\n` +
` Baseline: ${formatMB(result.baseline.heapUsedBytes)} heap used\n` +
` Baseline: ${result.baseline.heapUsedMB.toFixed(1)} MB heap used\n` +
` Delta: ${deltaPercent.toFixed(1)}% (tolerance: ${tolerance}%)\n` +
` Peak heap: ${formatMB(result.peakHeapUsed)}\n` +
` Peak RSS: ${formatMB(result.peakRss)}\n` +
@@ -270,20 +239,22 @@ export class MemoryTestHarness {
* Update the baseline for a scenario with the current measured values.
*/
updateScenarioBaseline(result: MemoryTestResult): void {
const lastSnapshot = result.snapshots[result.snapshots.length - 1];
updateBaseline(this.baselinesPath, result.scenarioName, {
heapUsedBytes: result.finalHeapUsed,
heapTotalBytes:
result.snapshots[result.snapshots.length - 1]?.heapTotal ?? 0,
rssBytes: result.finalRss,
externalBytes: result.finalExternal,
heapUsedMB: Number((result.finalHeapUsed / (1024 * 1024)).toFixed(1)),
heapTotalMB: Number(
((lastSnapshot?.heapTotal ?? 0) / (1024 * 1024)).toFixed(1),
),
rssMB: Number((result.finalRss / (1024 * 1024)).toFixed(1)),
externalMB: Number((result.finalExternal / (1024 * 1024)).toFixed(1)),
});
// Reload baselines after update
this.baselines = loadBaselines(this.baselinesPath);
}
/**
* Analyze snapshots to detect sustained leaks across 3 snapshots.
* A leak is flagged if growth is observed in both phases for any heap space.
* Analyze snapshots to detect sustained leaks.
* A leak is flagged if growth is observed in both phases.
*/
analyzeSnapshots(
snapshots: MemorySnapshot[],
@@ -297,55 +268,20 @@ export class MemoryTestHarness {
const snap2 = snapshots[snapshots.length - 2];
const snap3 = snapshots[snapshots.length - 1];
if (!snap1 || !snap2 || !snap3) {
return { leaked: false, message: 'Missing snapshots' };
}
const growth1 = snap2.heapUsed - snap1.heapUsed;
const growth2 = snap3.heapUsed - snap2.heapUsed;
const spaceNames = new Set<string>();
snap1.heapSpaces.forEach((s: any) => spaceNames.add(s.space_name));
snap2.heapSpaces.forEach((s: any) => spaceNames.add(s.space_name));
snap3.heapSpaces.forEach((s: any) => spaceNames.add(s.space_name));
const leaked = growth1 > thresholdBytes && growth2 > thresholdBytes;
let message = leaked
? `Memory bloat detected: sustained growth (${formatMB(growth1)} -> ${formatMB(growth2)})`
: `No sustained growth detected above threshold.`;
let hasSustainedGrowth = false;
const growthDetails: string[] = [];
for (const name of spaceNames) {
const size1 =
snap1.heapSpaces.find((s: any) => s.space_name === name)
?.space_used_size ?? 0;
const size2 =
snap2.heapSpaces.find((s: any) => s.space_name === name)
?.space_used_size ?? 0;
const size3 =
snap3.heapSpaces.find((s: any) => s.space_name === name)
?.space_used_size ?? 0;
const growth1 = size2 - size1;
const growth2 = size3 - size2;
if (growth1 > thresholdBytes && growth2 > thresholdBytes) {
hasSustainedGrowth = true;
growthDetails.push(
`${name}: sustained growth (${formatMB(growth1)} -> ${formatMB(growth2)})`,
);
}
}
let message = '';
if (hasSustainedGrowth) {
message =
`Memory bloat detected in heap spaces:\n ` +
growthDetails.join('\n ');
} else {
message = `No sustained growth detected in any heap space above threshold.`;
}
return { leaked: hasSustainedGrowth, message };
return { leaked, message };
}
/**
* Assert that memory returns to a baseline level after a peak.
* Useful for verifying that large tool outputs are not retained.
* Useful for verifying that large tool outputs or history are not retained.
*/
assertMemoryReturnsToBaseline(
snapshots: MemorySnapshot[],
@@ -355,26 +291,22 @@ export class MemoryTestHarness {
throw new Error('Need at least 3 snapshots to check return to baseline');
}
const baseline = snapshots[0]; // Assume first is baseline
const peak = snapshots.reduce(
(max, s) => (s.heapUsed > max.heapUsed ? s : max),
snapshots[0],
);
const final = snapshots[snapshots.length - 1];
if (!baseline || !peak || !final) {
throw new Error('Missing snapshots for return to baseline check');
// Find the first non-zero snapshot as baseline
const baseline = snapshots.find((s) => s.heapUsed > 0);
if (!baseline) {
return; // No memory reported yet
}
const final = snapshots[snapshots.length - 1]!;
const tolerance = baseline.heapUsed * (tolerancePercent / 100);
const delta = final.heapUsed - baseline.heapUsed;
if (delta > tolerance) {
throw new Error(
`Memory did not return to baseline!\n` +
` Baseline: ${formatMB(baseline.heapUsed)}\n` +
` Peak: ${formatMB(peak.heapUsed)}\n` +
` Final: ${formatMB(final.heapUsed)}\n` +
` Baseline: ${formatMB(baseline.heapUsed)} (${baseline.label})\n` +
` Final: ${formatMB(final.heapUsed)} (${final.label})\n` +
` Delta: ${formatMB(delta)} (tolerance: ${formatMB(tolerance)})`,
);
}
@@ -397,7 +329,7 @@ export class MemoryTestHarness {
for (const result of resultsToReport) {
const measured = formatMB(result.finalHeapUsed);
const baseline = result.baseline
? formatMB(result.baseline.heapUsedBytes)
? `${result.baseline.heapUsedMB.toFixed(1)} MB`
: 'N/A';
const delta = result.baseline
? `${result.deltaPercent >= 0 ? '+' : ''}${result.deltaPercent.toFixed(1)}%`
@@ -461,26 +393,6 @@ export class MemoryTestHarness {
console.log(report);
return report;
}
/**
* Force V8 garbage collection.
* Runs multiple GC cycles with delays to allow weak references
* and FinalizationRegistry callbacks to run.
*/
private async forceGC(): Promise<void> {
if (typeof globalThis.gc !== 'function') {
throw new Error(
'global.gc() not available. Run with --expose-gc for accurate measurements.',
);
}
for (let i = 0; i < this.gcCycles; i++) {
globalThis.gc();
if (i < this.gcCycles - 1) {
await sleep(this.gcDelayMs);
}
}
}
}
/**