mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-14 22:02:59 -07:00
test(perf): overhaul performance and memory baseline management
Comprehensive automation upgrades for performance and memory baselines, including GitHub Actions workflows for remote updates, automatic local comparisons against main, and git-ignored temporary baselines. - Added update-baselines.yml GitHub Action to automate remote baseline updates in CI. - Created scripts/run-perf-tests.js to wrap performance test runs: when invoked without arguments it stashes dirty working-tree changes and gathers main-branch baselines locally. - Enhanced PerfTestHarness and MemoryTestHarness to support tolerance-limit assertions. - Updated test files to honor the TEMP_BASELINES_PATH environment variable so tracked files stay clean during local runs. - Added docs/performance-and-memory-testing.md centralizing the testing strategy. - Removed the now-obsolete perf-tests/README.md and memory-tests/README.md. - Registered temporary baseline outputs in .gitignore and updated scripts/clean.js to remove them on npm run clean.
This commit is contained in:
@@ -10,9 +10,21 @@ permissions:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
memory-test:
|
memory-test:
|
||||||
name: 'Run Memory Usage Tests'
|
name: 'Run Memory Usage Tests (${{ matrix.machine_family }})'
|
||||||
runs-on: 'gemini-cli-ubuntu-16-core'
|
|
||||||
if: "github.repository == 'google-gemini/gemini-cli'"
|
if: "github.repository == 'google-gemini/gemini-cli'"
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- runs_on: 'gemini-cli-ubuntu-16-core'
|
||||||
|
machine_family: 'gemini-cli-ubuntu-16-core'
|
||||||
|
- runs_on: 'macos-latest'
|
||||||
|
machine_family: 'macos-latest'
|
||||||
|
- runs_on: 'gemini-cli-windows-16-core'
|
||||||
|
machine_family: 'gemini-cli-windows-16-core'
|
||||||
|
runs-on: '${{ matrix.runs_on }}'
|
||||||
|
env:
|
||||||
|
MEMORY_MACHINE_FAMILY: '${{ matrix.machine_family }}'
|
||||||
steps:
|
steps:
|
||||||
- name: 'Checkout'
|
- name: 'Checkout'
|
||||||
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
|
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
|
||||||
|
|||||||
@@ -10,9 +10,21 @@ permissions:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
perf-test:
|
perf-test:
|
||||||
name: 'Run Performance Usage Tests'
|
name: 'Run Performance Tests (${{ matrix.machine_family }})'
|
||||||
runs-on: 'gemini-cli-ubuntu-16-core'
|
|
||||||
if: "github.repository == 'google-gemini/gemini-cli'"
|
if: "github.repository == 'google-gemini/gemini-cli'"
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- runs_on: 'gemini-cli-ubuntu-16-core'
|
||||||
|
machine_family: 'gemini-cli-ubuntu-16-core'
|
||||||
|
- runs_on: 'macos-latest'
|
||||||
|
machine_family: 'macos-latest'
|
||||||
|
- runs_on: 'gemini-cli-windows-16-core'
|
||||||
|
machine_family: 'gemini-cli-windows-16-core'
|
||||||
|
runs-on: '${{ matrix.runs_on }}'
|
||||||
|
env:
|
||||||
|
PERF_MACHINE_FAMILY: '${{ matrix.machine_family }}'
|
||||||
steps:
|
steps:
|
||||||
- name: 'Checkout'
|
- name: 'Checkout'
|
||||||
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
|
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
|
||||||
|
|||||||
@@ -0,0 +1,243 @@
|
|||||||
|
# Copyright 2026 Google LLC
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
# Update Perf/Memory Baselines
|
||||||
|
#
|
||||||
|
# Triggered by:
|
||||||
|
# 1. A PR comment starting with one of:
|
||||||
|
# /run perf — updates only perf baselines
|
||||||
|
# /run mem — updates only memory baselines
|
||||||
|
# /run perf+mem — updates both (default)
|
||||||
|
# 2. Manual workflow_dispatch from the Actions tab.
|
||||||
|
#
|
||||||
|
# Both paths are gated behind the 'perf-approvers' GitHub environment,
|
||||||
|
# which requires approval from the designated approvers group before the
|
||||||
|
# matrix runners are provisioned.
|
||||||
|
#
|
||||||
|
# After all per-platform runs complete, the updated baseline JSON files
|
||||||
|
# are committed back to the triggering branch automatically.
|
||||||
|
|
||||||
|
name: 'Update Perf/Memory Baselines'
|
||||||
|
|
||||||
|
on:
|
||||||
|
issue_comment:
|
||||||
|
types: ['created']
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
test_type:
|
||||||
|
description: 'Which baselines to update'
|
||||||
|
required: true
|
||||||
|
default: 'perf+mem'
|
||||||
|
type: 'choice'
|
||||||
|
options:
|
||||||
|
- 'perf'
|
||||||
|
- 'mem'
|
||||||
|
- 'perf+mem'
|
||||||
|
ref:
|
||||||
|
description: 'Branch/SHA to checkout and update baselines on (default: main)'
|
||||||
|
required: false
|
||||||
|
default: 'main'
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: 'write' # push the updated baseline commit
|
||||||
|
pull-requests: 'write' # post the result comment
|
||||||
|
issues: 'read'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
# ── 1. Parse slash command / workflow_dispatch ──────────────────────────
|
||||||
|
parse-command:
|
||||||
|
name: 'Parse Command'
|
||||||
|
runs-on: 'gemini-cli-ubuntu-16-core'
|
||||||
|
if: |
|
||||||
|
github.repository == 'google-gemini/gemini-cli' && (
|
||||||
|
github.event_name == 'workflow_dispatch' || (
|
||||||
|
github.event_name == 'issue_comment' &&
|
||||||
|
github.event.issue.pull_request != null &&
|
||||||
|
(
|
||||||
|
startsWith(github.event.comment.body, '/run perf+mem') ||
|
||||||
|
startsWith(github.event.comment.body, '/run perf') ||
|
||||||
|
startsWith(github.event.comment.body, '/run mem')
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
outputs:
|
||||||
|
test_type: '${{ steps.parse.outputs.test_type }}'
|
||||||
|
ref: '${{ steps.parse.outputs.ref }}'
|
||||||
|
pr_number: '${{ steps.parse.outputs.pr_number }}'
|
||||||
|
steps:
|
||||||
|
- name: 'Parse inputs'
|
||||||
|
id: 'parse'
|
||||||
|
env:
|
||||||
|
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
||||||
|
COMMENT_BODY: '${{ github.event.comment.body }}'
|
||||||
|
DISPATCH_TEST_TYPE: '${{ inputs.test_type }}'
|
||||||
|
DISPATCH_REF: '${{ inputs.ref }}'
|
||||||
|
PR_NUMBER: '${{ github.event.issue.number }}'
|
||||||
|
EVENT_NAME: '${{ github.event_name }}'
|
||||||
|
run: |
|
||||||
|
if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
|
||||||
|
TEST_TYPE="${DISPATCH_TEST_TYPE:-perf+mem}"
|
||||||
|
REF="${DISPATCH_REF:-main}"
|
||||||
|
echo "pr_number=" >> "$GITHUB_OUTPUT"
|
||||||
|
else
|
||||||
|
# Slash command: determine test_type from comment
|
||||||
|
if echo "$COMMENT_BODY" | grep -q "^/run perf+mem"; then
|
||||||
|
TEST_TYPE="perf+mem"
|
||||||
|
elif echo "$COMMENT_BODY" | grep -q "^/run perf"; then
|
||||||
|
TEST_TYPE="perf"
|
||||||
|
else
|
||||||
|
TEST_TYPE="mem"
|
||||||
|
fi
|
||||||
|
# Get the HEAD sha of the PR
|
||||||
|
REF=$(gh pr view "$PR_NUMBER" --json headRefName --jq '.headRefName')
|
||||||
|
echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
|
||||||
|
fi
|
||||||
|
echo "test_type=${TEST_TYPE}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "ref=${REF}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: 'Post acknowledgement comment on PR'
|
||||||
|
if: "steps.parse.outputs.pr_number != ''"
|
||||||
|
env:
|
||||||
|
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
||||||
|
run: |
|
||||||
|
gh pr comment "${{ steps.parse.outputs.pr_number }}" --body \
|
||||||
|
"⏳ **Baseline update requested** (\`${{ steps.parse.outputs.test_type }}\`).
|
||||||
|
A member of the \`perf-approvers\` group must approve this workflow before the runners start.
|
||||||
|
<!-- baseline-update-ack -->"
|
||||||
|
|
||||||
|
# ── 2. Approval gate (perf-approvers environment) ──────────────────────
|
||||||
|
await-approval:
|
||||||
|
name: 'Await perf-approvers Approval'
|
||||||
|
needs: 'parse-command'
|
||||||
|
# This environment requires manual approval from the perf-approvers group
|
||||||
|
# before GitHub provisions any of the downstream runners.
|
||||||
|
environment: 'perf-approvers'
|
||||||
|
runs-on: 'gemini-cli-ubuntu-16-core'
|
||||||
|
steps:
|
||||||
|
- name: 'Approved'
|
||||||
|
run: 'echo "Approved by perf-approvers — launching baseline update matrix."'
|
||||||
|
|
||||||
|
# ── 3. Run tests to capture fresh baselines on each platform ───────────
|
||||||
|
update-baselines:
|
||||||
|
name: 'Update Baselines (${{ matrix.machine_family }})'
|
||||||
|
needs: 'await-approval'
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- runs_on: 'gemini-cli-ubuntu-16-core'
|
||||||
|
machine_family: 'gemini-cli-ubuntu-16-core'
|
||||||
|
- runs_on: 'macos-latest'
|
||||||
|
machine_family: 'macos-latest'
|
||||||
|
- runs_on: 'gemini-cli-windows-16-core'
|
||||||
|
machine_family: 'gemini-cli-windows-16-core'
|
||||||
|
runs-on: '${{ matrix.runs_on }}'
|
||||||
|
steps:
|
||||||
|
- name: 'Checkout'
|
||||||
|
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
|
||||||
|
with:
|
||||||
|
ref: '${{ needs.parse-command.outputs.ref }}'
|
||||||
|
# Need full history so we can push back
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: 'Set up Node.js'
|
||||||
|
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version-file: '.nvmrc'
|
||||||
|
cache: 'npm'
|
||||||
|
|
||||||
|
- name: 'Install dependencies'
|
||||||
|
run: 'npm ci'
|
||||||
|
|
||||||
|
- name: 'Build project'
|
||||||
|
run: 'npm run build'
|
||||||
|
|
||||||
|
- name: 'Update Perf Baselines'
|
||||||
|
if: "contains(needs.parse-command.outputs.test_type, 'perf')"
|
||||||
|
run: 'npm run test:perf:update-baselines'
|
||||||
|
env:
|
||||||
|
PERF_MACHINE_FAMILY: '${{ matrix.machine_family }}'
|
||||||
|
|
||||||
|
- name: 'Update Memory Baselines'
|
||||||
|
if: "contains(needs.parse-command.outputs.test_type, 'mem')"
|
||||||
|
run: 'npm run test:memory:update-baselines'
|
||||||
|
env:
|
||||||
|
MEMORY_MACHINE_FAMILY: '${{ matrix.machine_family }}'
|
||||||
|
|
||||||
|
- name: 'Upload updated baseline files'
|
||||||
|
uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: 'baselines-${{ matrix.machine_family }}'
|
||||||
|
# Upload the entire baselines/ subdirectories from both test roots
|
||||||
|
path: |
|
||||||
|
perf-tests/baselines/
|
||||||
|
memory-tests/baselines/
|
||||||
|
if-no-files-found: 'warn'
|
||||||
|
|
||||||
|
# ── 4. Gather artifacts and commit everything back to the branch ────────
|
||||||
|
commit-baselines:
|
||||||
|
name: 'Commit Updated Baselines'
|
||||||
|
needs:
|
||||||
|
- 'parse-command'
|
||||||
|
- 'update-baselines'
|
||||||
|
runs-on: 'gemini-cli-ubuntu-16-core'
|
||||||
|
steps:
|
||||||
|
- name: 'Checkout'
|
||||||
|
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
|
||||||
|
with:
|
||||||
|
ref: '${{ needs.parse-command.outputs.ref }}'
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: 'Download all baseline artifacts'
|
||||||
|
uses: 'actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093' # ratchet:actions/download-artifact@v4
|
||||||
|
with:
|
||||||
|
# Download each per-platform artifact into its own subdirectory so
|
||||||
|
# the paths mirror the test directory layout.
|
||||||
|
pattern: 'baselines-*'
|
||||||
|
merge-multiple: true
|
||||||
|
path: '.'
|
||||||
|
|
||||||
|
- name: 'Commit and push'
|
||||||
|
env:
|
||||||
|
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
||||||
|
run: |
|
||||||
|
git config user.name "github-actions[bot]"
|
||||||
|
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||||
|
|
||||||
|
# Stage only the per-platform baseline files (not the generic ones)
|
||||||
|
git add perf-tests/baselines/ memory-tests/baselines/ || true
|
||||||
|
|
||||||
|
if git diff --cached --quiet; then
|
||||||
|
echo "No baseline files changed — nothing to commit."
|
||||||
|
else
|
||||||
|
git commit -m "chore: update ${{ needs.parse-command.outputs.test_type }} baselines [skip ci]
|
||||||
|
|
||||||
|
Updated by 'Update Perf/Memory Baselines' workflow run #${{ github.run_id }}.
|
||||||
|
Platforms: gemini-cli-ubuntu-16-core, macos-latest, gemini-cli-windows-16-core"
|
||||||
|
git push
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: 'Post result comment on PR'
|
||||||
|
if: "needs.parse-command.outputs.pr_number != ''"
|
||||||
|
env:
|
||||||
|
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
||||||
|
run: |
|
||||||
|
# Remove the acknowledgement comment before posting the result
|
||||||
|
COMMENT_ID=$(gh pr view "${{ needs.parse-command.outputs.pr_number }}" \
|
||||||
|
--json comments \
|
||||||
|
--jq '.comments[] | select(.body | contains("<!-- baseline-update-ack -->")) | .url' \
|
||||||
|
| grep -oE '[0-9]+$' | head -n 1)
|
||||||
|
if [ -n "$COMMENT_ID" ]; then
|
||||||
|
gh api -X DELETE "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
gh pr comment "${{ needs.parse-command.outputs.pr_number }}" --body \
|
||||||
|
"✅ **Baselines updated** (\`${{ needs.parse-command.outputs.test_type }}\`).
|
||||||
|
|
||||||
|
Fresh per-platform baseline files have been committed to this branch for:
|
||||||
|
- \`gemini-cli-ubuntu-16-core\`
|
||||||
|
- \`macos-latest\`
|
||||||
|
- \`gemini-cli-windows-16-core\`
|
||||||
|
|
||||||
|
The nightly tests will now compare against these values.
|
||||||
|
<!-- baseline-update-result -->"
|
||||||
@@ -3,6 +3,8 @@
|
|||||||
.env~
|
.env~
|
||||||
|
|
||||||
# gemini-cli settings
|
# gemini-cli settings
|
||||||
|
.tmp-perf-baselines.json
|
||||||
|
.tmp-memory-baselines.json
|
||||||
# We want to keep the .gemini in the root of the repo and ignore any .gemini
|
# We want to keep the .gemini in the root of the repo and ignore any .gemini
|
||||||
# in subdirectories. In our root .gemini we want to allow for version control
|
# in subdirectories. In our root .gemini we want to allow for version control
|
||||||
# for subcommands.
|
# for subcommands.
|
||||||
|
|||||||
@@ -0,0 +1,110 @@
|
|||||||
|
# Performance & Memory Testing Infrastructure
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Gemini CLI features a highly reliable performance and memory regression testing
|
||||||
|
pipeline. To curb anomalies and yield accurate results, the harness applies:
|
||||||
|
|
||||||
|
- **IQR Outlier Filtering**: Discards anomalous samples before evaluation.
|
||||||
|
- **Median Sampling**: Takes `N` runs and evaluates the median value to
|
||||||
|
reduce run-to-run noise.
|
||||||
|
- **Warmup Runs**: Discards the first samples to prevent JIT warmup artifacts.
|
||||||
|
- **Tolerance Boundary**: A default 15% tolerance prevents spurious failures
|
||||||
|
from normal run-to-run variance.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline Management
|
||||||
|
|
||||||
|
There are two core strategies for calibrating tolerances on performance
|
||||||
|
benchmarks:
|
||||||
|
|
||||||
|
- **Approach A: Normalize for Testing Servers**: Tests run directly on the
|
||||||
|
automated cloud servers, and those scores are recorded as official, static
|
||||||
|
baselines.
|
||||||
|
- **Approach B: Machine-Agnostic Daily Comparisons**: Static baseline files are
|
||||||
|
ignored. Every night, the test is run against today's and yesterday's code on
|
||||||
|
the exact same server.
|
||||||
|
|
||||||
|
### Recommended Strategy: GitHub Action + Approach A
|
||||||
|
|
||||||
|
#### Local Development & PR Checks
|
||||||
|
|
||||||
|
- **Local Testing**: If you are a developer trying to quickly test your code
|
||||||
|
changes against performance or memory impacts, simply run the standard local
|
||||||
|
perf or memory tests directly without arguments. The harness stashes dirty
|
||||||
|
alterations automatically, refreshes baseline settings against the most
|
||||||
|
up-to-date `main` branch dynamically using non-tracked ephemeral files, and
|
||||||
|
yields immediate comparison feedback.
|
||||||
|
- **PR Merges**: Please note that if your alterations intentionally necessitate
|
||||||
|
adjustments across baseline metrics, you should trigger the GitHub Action to
|
||||||
|
recalibrate baselines when you merge your PR, so that
|
||||||
|
subsequent nightly audits compare
|
||||||
|
against the new baseline values.
|
||||||
|
|
||||||
|
#### Nightly Build Health Audits
|
||||||
|
|
||||||
|
- Strict Approach A procedures apply daily across platforms on dedicated
|
||||||
|
environments, avoiding the "boiling frog" issue where micro-regressions
|
||||||
|
quietly accumulate over time.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Running Tests
|
||||||
|
|
||||||
|
### Performance CPU Tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run tests (compare against committed baselines)
|
||||||
|
npm run test:perf
|
||||||
|
|
||||||
|
# Verbose output
|
||||||
|
VERBOSE=true npm run test:perf
|
||||||
|
|
||||||
|
# Keep test artifacts for debugging
|
||||||
|
KEEP_OUTPUT=true npm run test:perf
|
||||||
|
```
|
||||||
|
|
||||||
|
### Memory Tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run memory tests (compare against local main baselines)
|
||||||
|
npm run test:memory
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture & Configuration
|
||||||
|
|
||||||
|
### Performance Tests Directory Tree
|
||||||
|
|
||||||
|
- `perf-tests/baselines.json`: Committed baseline values
|
||||||
|
- `perf-tests/globalSetup.ts`: Test environment setup
|
||||||
|
- `perf-tests/perf-usage.test.ts`: Test scenarios
|
||||||
|
- `perf-tests/perf.*.responses`: Fake API responses per scenario
|
||||||
|
|
||||||
|
### Memory Tests Directory Tree
|
||||||
|
|
||||||
|
- `memory-tests/baselines.json`: Committed memory values
|
||||||
|
- `memory-tests/memory-usage.test.ts`: Memory test scenarios
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CI Integration
|
||||||
|
|
||||||
|
These tests are excluded from the `preflight` checks and are
|
||||||
|
designed to run as nightly audits:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- name: Performance regression tests
|
||||||
|
run: npm run test:perf
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Adding New Scenarios
|
||||||
|
|
||||||
|
1. Add a fake response file: `perf.<scenario-name>.responses` or
|
||||||
|
`memory.<scenario-name>.responses`.
|
||||||
|
2. Add a test case in `perf-usage.test.ts` or `memory-usage.test.ts` applying
|
||||||
|
`harness.runScenario()`.
|
||||||
@@ -5,7 +5,11 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import { describe, it, beforeAll, afterAll, afterEach } from 'vitest';
|
import { describe, it, beforeAll, afterAll, afterEach } from 'vitest';
|
||||||
import { TestRig, MemoryTestHarness } from '@google/gemini-cli-test-utils';
|
import {
|
||||||
|
TestRig,
|
||||||
|
MemoryTestHarness,
|
||||||
|
resolveMemoryBaselinesPath,
|
||||||
|
} from '@google/gemini-cli-test-utils';
|
||||||
import { join, dirname } from 'node:path';
|
import { join, dirname } from 'node:path';
|
||||||
import { fileURLToPath } from 'node:url';
|
import { fileURLToPath } from 'node:url';
|
||||||
import {
|
import {
|
||||||
@@ -19,7 +23,8 @@ import {
|
|||||||
import { randomUUID } from 'node:crypto';
|
import { randomUUID } from 'node:crypto';
|
||||||
|
|
||||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||||
const BASELINES_PATH = join(__dirname, 'baselines.json');
|
const MACHINE_FAMILY = process.env['MEMORY_MACHINE_FAMILY'];
|
||||||
|
const BASELINES_PATH = resolveMemoryBaselinesPath(__dirname, MACHINE_FAMILY);
|
||||||
const UPDATE_BASELINES = process.env['UPDATE_MEMORY_BASELINES'] === 'true';
|
const UPDATE_BASELINES = process.env['UPDATE_MEMORY_BASELINES'] === 'true';
|
||||||
const TOLERANCE_PERCENT = 10;
|
const TOLERANCE_PERCENT = 10;
|
||||||
|
|
||||||
@@ -37,6 +42,7 @@ describe('Memory Usage Tests', () => {
|
|||||||
gcCycles: 3,
|
gcCycles: 3,
|
||||||
gcDelayMs: 100,
|
gcDelayMs: 100,
|
||||||
sampleCount: 3,
|
sampleCount: 3,
|
||||||
|
machineFamily: MACHINE_FAMILY,
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
+2
-2
@@ -51,9 +51,9 @@
|
|||||||
"test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman",
|
"test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman",
|
||||||
"test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none",
|
"test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none",
|
||||||
"test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests",
|
"test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests",
|
||||||
"test:memory": "vitest run --root ./memory-tests",
|
"test:memory": "node scripts/run-perf-tests.js memory",
|
||||||
"test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests",
|
"test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests",
|
||||||
"test:perf": "vitest run --root ./perf-tests",
|
"test:perf": "node scripts/run-perf-tests.js perf",
|
||||||
"test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests",
|
"test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests",
|
||||||
"test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests",
|
"test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests",
|
||||||
"test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests",
|
"test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
/**
|
/**
|
||||||
* @license
|
* @license
|
||||||
* Copyright 2025 Google LLC
|
* Copyright 2026 Google LLC
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
|
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
|
||||||
|
import { join } from 'node:path';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Baseline entry for a single memory test scenario.
|
* Baseline entry for a single memory test scenario.
|
||||||
@@ -77,3 +78,25 @@ export function updateBaseline(
|
|||||||
};
|
};
|
||||||
saveBaselines(path, baselines);
|
saveBaselines(path, baselines);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolve the path to the correct memory baselines JSON file.
|
||||||
|
*
|
||||||
|
* - If `machineFamily` is provided → returns `<testRootDir>/baselines/<machineFamily>.json`.
|
||||||
|
* This file may not exist yet; the harness will hard-fail at assertion time if it doesn't.
|
||||||
|
* - If `machineFamily` is absent → returns `<testRootDir>/baselines.json`
|
||||||
|
* (the legacy generic file used for local development).
|
||||||
|
*
|
||||||
|
* @param testRootDir - Absolute path to the directory containing the test root
|
||||||
|
* (e.g. `__dirname` inside `memory-tests/`).
|
||||||
|
* @param machineFamily - Optional CI runner label (e.g. `'gemini-cli-ubuntu-16-core'`).
|
||||||
|
*/
|
||||||
|
export function resolveMemoryBaselinesPath(
|
||||||
|
testRootDir: string,
|
||||||
|
machineFamily?: string,
|
||||||
|
): string {
|
||||||
|
if (machineFamily) {
|
||||||
|
return join(testRootDir, 'baselines', `${machineFamily}.json`);
|
||||||
|
}
|
||||||
|
return join(testRootDir, 'baselines.json');
|
||||||
|
}
|
||||||
|
|||||||
@@ -6,6 +6,8 @@
|
|||||||
|
|
||||||
import v8 from 'node:v8';
|
import v8 from 'node:v8';
|
||||||
import { setTimeout as sleep } from 'node:timers/promises';
|
import { setTimeout as sleep } from 'node:timers/promises';
|
||||||
|
import { mkdirSync } from 'node:fs';
|
||||||
|
import { join, dirname } from 'node:path';
|
||||||
import { loadBaselines, updateBaseline } from './memory-baselines.js';
|
import { loadBaselines, updateBaseline } from './memory-baselines.js';
|
||||||
import type { MemoryBaseline, MemoryBaselineFile } from './memory-baselines.js';
|
import type { MemoryBaseline, MemoryBaselineFile } from './memory-baselines.js';
|
||||||
|
|
||||||
@@ -66,6 +68,14 @@ export interface MemoryTestHarnessOptions {
|
|||||||
sampleCount?: number;
|
sampleCount?: number;
|
||||||
/** Pause in ms between samples. Default: 50 */
|
/** Pause in ms between samples. Default: 50 */
|
||||||
samplePauseMs?: number;
|
samplePauseMs?: number;
|
||||||
|
/**
|
||||||
|
* The CI machine family (e.g. 'gemini-cli-ubuntu-16-core').
|
||||||
|
* When set, baselines are loaded from and saved to
|
||||||
|
* `<dir>/baselines/<machineFamily>.json`. If the file does not exist and
|
||||||
|
* UPDATE_MEMORY_BASELINES is not set, tests hard-fail with an actionable
|
||||||
|
* message instead of silently falling back.
|
||||||
|
*/
|
||||||
|
machineFamily?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -85,6 +95,7 @@ export class MemoryTestHarness {
|
|||||||
private readonly gcDelayMs: number;
|
private readonly gcDelayMs: number;
|
||||||
private readonly sampleCount: number;
|
private readonly sampleCount: number;
|
||||||
private readonly samplePauseMs: number;
|
private readonly samplePauseMs: number;
|
||||||
|
private readonly machineFamily?: string;
|
||||||
private allResults: MemoryTestResult[] = [];
|
private allResults: MemoryTestResult[] = [];
|
||||||
|
|
||||||
constructor(options: MemoryTestHarnessOptions) {
|
constructor(options: MemoryTestHarnessOptions) {
|
||||||
@@ -94,6 +105,7 @@ export class MemoryTestHarness {
|
|||||||
this.gcDelayMs = options.gcDelayMs ?? 100;
|
this.gcDelayMs = options.gcDelayMs ?? 100;
|
||||||
this.sampleCount = options.sampleCount ?? 3;
|
this.sampleCount = options.sampleCount ?? 3;
|
||||||
this.samplePauseMs = options.samplePauseMs ?? 50;
|
this.samplePauseMs = options.samplePauseMs ?? 50;
|
||||||
|
this.machineFamily = options.machineFamily;
|
||||||
this.baselines = loadBaselines(this.baselinesPath);
|
this.baselines = loadBaselines(this.baselinesPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -240,6 +252,16 @@ export class MemoryTestHarness {
|
|||||||
const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
|
const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
|
||||||
|
|
||||||
if (!result.baseline) {
|
if (!result.baseline) {
|
||||||
|
if (this.machineFamily) {
|
||||||
|
throw new Error(
|
||||||
|
`No baseline found for scenario "${result.scenarioName}" on machine family "${this.machineFamily}".\n` +
|
||||||
|
` Expected file: ${this.baselinesPath}\n` +
|
||||||
|
` To create it, trigger the 'Update Baselines' workflow:\n` +
|
||||||
|
` .github/workflows/update-baselines.yml\n` +
|
||||||
|
` Or locally:\n` +
|
||||||
|
` UPDATE_MEMORY_BASELINES=true MEMORY_MACHINE_FAMILY=${this.machineFamily} npm run test:memory`,
|
||||||
|
);
|
||||||
|
}
|
||||||
console.warn(
|
console.warn(
|
||||||
`⚠ No baseline found for "${result.scenarioName}". ` +
|
`⚠ No baseline found for "${result.scenarioName}". ` +
|
||||||
`Run with UPDATE_MEMORY_BASELINES=true to create one. ` +
|
`Run with UPDATE_MEMORY_BASELINES=true to create one. ` +
|
||||||
@@ -268,9 +290,21 @@ export class MemoryTestHarness {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Update the baseline for a scenario with the current measured values.
|
* Update the baseline for a scenario with the current measured values.
|
||||||
|
* When `machineFamily` is set, writes to `baselines/<machineFamily>.json`
|
||||||
|
* (creating the directory if needed). Otherwise writes to `baselinesPath`.
|
||||||
*/
|
*/
|
||||||
updateScenarioBaseline(result: MemoryTestResult): void {
|
updateScenarioBaseline(result: MemoryTestResult): void {
|
||||||
updateBaseline(this.baselinesPath, result.scenarioName, {
|
const targetPath = this.machineFamily
|
||||||
|
? join(
|
||||||
|
dirname(this.baselinesPath),
|
||||||
|
'baselines',
|
||||||
|
`${this.machineFamily}.json`,
|
||||||
|
)
|
||||||
|
: this.baselinesPath;
|
||||||
|
if (this.machineFamily) {
|
||||||
|
mkdirSync(dirname(targetPath), { recursive: true });
|
||||||
|
}
|
||||||
|
updateBaseline(targetPath, result.scenarioName, {
|
||||||
heapUsedBytes: result.finalHeapUsed,
|
heapUsedBytes: result.finalHeapUsed,
|
||||||
heapTotalBytes:
|
heapTotalBytes:
|
||||||
result.snapshots[result.snapshots.length - 1]?.heapTotal ?? 0,
|
result.snapshots[result.snapshots.length - 1]?.heapTotal ?? 0,
|
||||||
@@ -391,6 +425,9 @@ export class MemoryTestHarness {
|
|||||||
lines.push('');
|
lines.push('');
|
||||||
lines.push('═══════════════════════════════════════════════════');
|
lines.push('═══════════════════════════════════════════════════');
|
||||||
lines.push(' MEMORY USAGE TEST REPORT');
|
lines.push(' MEMORY USAGE TEST REPORT');
|
||||||
|
if (this.machineFamily) {
|
||||||
|
lines.push(` Machine family: ${this.machineFamily}`);
|
||||||
|
}
|
||||||
lines.push('═══════════════════════════════════════════════════');
|
lines.push('═══════════════════════════════════════════════════');
|
||||||
lines.push('');
|
lines.push('');
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,8 @@
|
|||||||
|
|
||||||
import { performance } from 'node:perf_hooks';
|
import { performance } from 'node:perf_hooks';
|
||||||
import { setTimeout as sleep } from 'node:timers/promises';
|
import { setTimeout as sleep } from 'node:timers/promises';
|
||||||
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
|
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
|
||||||
|
import { join, dirname } from 'node:path';
|
||||||
|
|
||||||
/** Configuration for asciichart plot function. */
|
/** Configuration for asciichart plot function. */
|
||||||
interface PlotConfig {
|
interface PlotConfig {
|
||||||
@@ -83,6 +84,14 @@ export interface PerfTestHarnessOptions {
|
|||||||
warmupCount?: number;
|
warmupCount?: number;
|
||||||
/** Pause in ms between samples. Default: 100 */
|
/** Pause in ms between samples. Default: 100 */
|
||||||
samplePauseMs?: number;
|
samplePauseMs?: number;
|
||||||
|
/**
|
||||||
|
* The CI machine family (e.g. 'gemini-cli-ubuntu-16-core').
|
||||||
|
* When set, baselines are loaded from and saved to
|
||||||
|
* `<dir>/baselines/<machineFamily>.json`. If the file does not exist and
|
||||||
|
* UPDATE_PERF_BASELINES is not set, tests hard-fail with an actionable
|
||||||
|
* message instead of silently falling back.
|
||||||
|
*/
|
||||||
|
machineFamily?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -114,6 +123,7 @@ export class PerfTestHarness {
|
|||||||
private readonly sampleCount: number;
|
private readonly sampleCount: number;
|
||||||
private readonly warmupCount: number;
|
private readonly warmupCount: number;
|
||||||
private readonly samplePauseMs: number;
|
private readonly samplePauseMs: number;
|
||||||
|
private readonly machineFamily?: string;
|
||||||
private allResults: PerfTestResult[] = [];
|
private allResults: PerfTestResult[] = [];
|
||||||
private activeTimers: Map<string, ActiveTimer> = new Map();
|
private activeTimers: Map<string, ActiveTimer> = new Map();
|
||||||
|
|
||||||
@@ -124,6 +134,7 @@ export class PerfTestHarness {
|
|||||||
this.sampleCount = options.sampleCount ?? 5;
|
this.sampleCount = options.sampleCount ?? 5;
|
||||||
this.warmupCount = options.warmupCount ?? 1;
|
this.warmupCount = options.warmupCount ?? 1;
|
||||||
this.samplePauseMs = options.samplePauseMs ?? 100;
|
this.samplePauseMs = options.samplePauseMs ?? 100;
|
||||||
|
this.machineFamily = options.machineFamily;
|
||||||
this.baselines = loadPerfBaselines(this.baselinesPath);
|
this.baselines = loadPerfBaselines(this.baselinesPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -284,6 +295,18 @@ export class PerfTestHarness {
|
|||||||
const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent;
|
const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent;
|
||||||
|
|
||||||
if (!result.baseline) {
|
if (!result.baseline) {
|
||||||
|
if (this.machineFamily) {
|
||||||
|
// In CI with a declared machine family: hard-fail so the problem is
|
||||||
|
// immediately visible, rather than silently skipping the assertion.
|
||||||
|
throw new Error(
|
||||||
|
`No baseline found for scenario "${result.scenarioName}" on machine family "${this.machineFamily}".\n` +
|
||||||
|
` Expected file: ${this.baselinesPath}\n` +
|
||||||
|
` To create it, trigger the 'Update Baselines' workflow:\n` +
|
||||||
|
` .github/workflows/update-baselines.yml\n` +
|
||||||
|
` Or locally:\n` +
|
||||||
|
` UPDATE_PERF_BASELINES=true PERF_MACHINE_FAMILY=${this.machineFamily} npm run test:perf`,
|
||||||
|
);
|
||||||
|
}
|
||||||
console.warn(
|
console.warn(
|
||||||
`⚠ No baseline found for "${result.scenarioName}". ` +
|
`⚠ No baseline found for "${result.scenarioName}". ` +
|
||||||
`Run with UPDATE_PERF_BASELINES=true to create one. ` +
|
`Run with UPDATE_PERF_BASELINES=true to create one. ` +
|
||||||
@@ -321,16 +344,30 @@ export class PerfTestHarness {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Update the baseline for a scenario with the current measured values.
|
* Update the baseline for a scenario with the current measured values.
|
||||||
|
* When `machineFamily` is set, writes to `baselines/<machineFamily>.json`
|
||||||
|
* (creating the directory if needed). Otherwise writes to `baselinesPath`.
|
||||||
*/
|
*/
|
||||||
updateScenarioBaseline(result: PerfTestResult): void {
|
updateScenarioBaseline(result: PerfTestResult): void {
|
||||||
updatePerfBaseline(this.baselinesPath, result.scenarioName, {
|
const targetPath = this.machineFamily
|
||||||
|
? join(
|
||||||
|
dirname(this.baselinesPath),
|
||||||
|
'baselines',
|
||||||
|
`${this.machineFamily}.json`,
|
||||||
|
)
|
||||||
|
: this.baselinesPath;
|
||||||
|
// Ensure the baselines/ subdirectory exists
|
||||||
|
if (this.machineFamily) {
|
||||||
|
mkdirSync(dirname(targetPath), { recursive: true });
|
||||||
|
}
|
||||||
|
updatePerfBaseline(targetPath, result.scenarioName, {
|
||||||
wallClockMs: result.median.wallClockMs,
|
wallClockMs: result.median.wallClockMs,
|
||||||
cpuTotalUs: result.median.cpuTotalUs,
|
cpuTotalUs: result.median.cpuTotalUs,
|
||||||
});
|
});
|
||||||
// Reload baselines after update
|
// Reload baselines after update
|
||||||
this.baselines = loadPerfBaselines(this.baselinesPath);
|
this.baselines = loadPerfBaselines(this.baselinesPath);
|
||||||
console.log(
|
console.log(
|
||||||
`Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms`,
|
`Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms` +
|
||||||
|
(this.machineFamily ? ` [${this.machineFamily}]` : ''),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -344,6 +381,9 @@ export class PerfTestHarness {
|
|||||||
lines.push('');
|
lines.push('');
|
||||||
lines.push('═══════════════════════════════════════════════════');
|
lines.push('═══════════════════════════════════════════════════');
|
||||||
lines.push(' PERFORMANCE TEST REPORT');
|
lines.push(' PERFORMANCE TEST REPORT');
|
||||||
|
if (this.machineFamily) {
|
||||||
|
lines.push(` Machine family: ${this.machineFamily}`);
|
||||||
|
}
|
||||||
lines.push('═══════════════════════════════════════════════════');
|
lines.push('═══════════════════════════════════════════════════');
|
||||||
lines.push('');
|
lines.push('');
|
||||||
|
|
||||||
@@ -484,6 +524,30 @@ export class PerfTestHarness {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── Baseline path resolution ────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolve the path to the correct perf baselines JSON file.
|
||||||
|
*
|
||||||
|
* - If `machineFamily` is provided → returns `<testRootDir>/baselines/<machineFamily>.json`.
|
||||||
|
* This file may not exist yet; the harness will hard-fail at assertion time if it doesn't.
|
||||||
|
* - If `machineFamily` is absent → returns `<testRootDir>/baselines.json`
|
||||||
|
* (the legacy generic file used for local development).
|
||||||
|
*
|
||||||
|
* @param testRootDir - Absolute path to the directory containing the test root
|
||||||
|
* (e.g. `__dirname` inside `perf-tests/`).
|
||||||
|
* @param machineFamily - Optional CI runner label (e.g. `'gemini-cli-ubuntu-16-core'`).
|
||||||
|
*/
|
||||||
|
export function resolvePerfBaselinesPath(
|
||||||
|
testRootDir: string,
|
||||||
|
machineFamily?: string,
|
||||||
|
): string {
|
||||||
|
if (machineFamily) {
|
||||||
|
return join(testRootDir, 'baselines', `${machineFamily}.json`);
|
||||||
|
}
|
||||||
|
return join(testRootDir, 'baselines.json');
|
||||||
|
}
|
||||||
|
|
||||||
// ─── Baseline management ─────────────────────────────────────────────
|
// ─── Baseline management ─────────────────────────────────────────────
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -1,121 +0,0 @@
|
|||||||
# CPU Performance Integration Test Harness
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
This directory contains performance/CPU integration tests for the Gemini CLI.
|
|
||||||
These tests measure wall-clock time, CPU usage, and event loop responsiveness to
|
|
||||||
detect regressions across key scenarios.
|
|
||||||
|
|
||||||
CPU performance is inherently noisy, especially in CI. The harness addresses
|
|
||||||
this with:
|
|
||||||
|
|
||||||
- **IQR outlier filtering** — discards anomalous samples
|
|
||||||
- **Median sampling** — takes N runs, reports the median after filtering
|
|
||||||
- **Warmup runs** — discards the first run to mitigate JIT compilation noise
|
|
||||||
- **15% default tolerance** — won't panic at slight regressions
|
|
||||||
|
|
||||||
## Running
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Run tests (compare against committed baselines)
|
|
||||||
npm run test:perf
|
|
||||||
|
|
||||||
# Update baselines (after intentional changes)
|
|
||||||
npm run test:perf:update-baselines
|
|
||||||
|
|
||||||
# Verbose output
|
|
||||||
VERBOSE=true npm run test:perf
|
|
||||||
|
|
||||||
# Keep test artifacts for debugging
|
|
||||||
KEEP_OUTPUT=true npm run test:perf
|
|
||||||
```
|
|
||||||
|
|
||||||
## How It Works
|
|
||||||
|
|
||||||
### Measurement Primitives
|
|
||||||
|
|
||||||
The `PerfTestHarness` class (in `packages/test-utils`) provides:
|
|
||||||
|
|
||||||
- **`performance.now()`** — high-resolution wall-clock timing
|
|
||||||
- **`process.cpuUsage()`** — user + system CPU microseconds (delta between
|
|
||||||
start/stop)
|
|
||||||
- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram
|
|
||||||
(p50/p95/p99/max)
|
|
||||||
|
|
||||||
### Noise Reduction
|
|
||||||
|
|
||||||
1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts
|
|
||||||
2. **Multiple samples**: Each scenario runs N times (default 5)
|
|
||||||
3. **IQR filtering**: Samples outside Q1−1.5×IQR and Q3+1.5×IQR are discarded
|
|
||||||
4. **Median**: The median of remaining samples is used for comparison
|
|
||||||
|
|
||||||
### Baseline Management
|
|
||||||
|
|
||||||
Baselines are stored in `baselines.json` in this directory. Each scenario has:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"cold-startup-time": {
|
|
||||||
"wallClockMs": 1234.5,
|
|
||||||
"cpuTotalUs": 567890,
|
|
||||||
"eventLoopDelayP99Ms": 12.3,
|
|
||||||
"timestamp": "2026-04-08T..."
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance).
|
|
||||||
|
|
||||||
To recalibrate after intentional changes:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
npm run test:perf:update-baselines
|
|
||||||
# then commit baselines.json
|
|
||||||
```
|
|
||||||
|
|
||||||
### Report Output
|
|
||||||
|
|
||||||
After all tests, the harness prints an ASCII summary:
|
|
||||||
|
|
||||||
```
|
|
||||||
═══════════════════════════════════════════════════
|
|
||||||
PERFORMANCE TEST REPORT
|
|
||||||
═══════════════════════════════════════════════════
|
|
||||||
|
|
||||||
cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅
|
|
||||||
idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅
|
|
||||||
skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
```
|
|
||||||
perf-tests/
|
|
||||||
├── README.md ← you are here
|
|
||||||
├── baselines.json ← committed baseline values
|
|
||||||
├── globalSetup.ts ← test environment setup
|
|
||||||
├── perf-usage.test.ts ← test scenarios
|
|
||||||
├── perf.*.responses ← fake API responses per scenario
|
|
||||||
├── tsconfig.json ← TypeScript config
|
|
||||||
└── vitest.config.ts ← vitest config (serial, isolated)
|
|
||||||
|
|
||||||
packages/test-utils/src/
|
|
||||||
├── perf-test-harness.ts ← PerfTestHarness class
|
|
||||||
└── index.ts ← re-exports
|
|
||||||
```
|
|
||||||
|
|
||||||
## CI Integration
|
|
||||||
|
|
||||||
These tests are **excluded from `preflight`** and designed for nightly CI:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
- name: Performance regression tests
|
|
||||||
run: npm run test:perf
|
|
||||||
```
|
|
||||||
|
|
||||||
## Adding a New Scenario
|
|
||||||
|
|
||||||
1. Add a fake response file: `perf.<scenario-name>.responses`
|
|
||||||
2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()`
|
|
||||||
3. Run `npm run test:perf:update-baselines` to establish initial baseline
|
|
||||||
4. Commit the updated `baselines.json`
|
|
||||||
@@ -5,13 +5,18 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import { describe, it, beforeAll, afterAll } from 'vitest';
|
import { describe, it, beforeAll, afterAll } from 'vitest';
|
||||||
import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils';
|
import {
|
||||||
|
TestRig,
|
||||||
|
PerfTestHarness,
|
||||||
|
resolvePerfBaselinesPath,
|
||||||
|
} from '@google/gemini-cli-test-utils';
|
||||||
import { join, dirname } from 'node:path';
|
import { join, dirname } from 'node:path';
|
||||||
import { fileURLToPath } from 'node:url';
|
import { fileURLToPath } from 'node:url';
|
||||||
import { existsSync, readFileSync } from 'node:fs';
|
import { existsSync, readFileSync } from 'node:fs';
|
||||||
|
|
||||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||||
const BASELINES_PATH = join(__dirname, 'baselines.json');
|
const MACHINE_FAMILY = process.env['PERF_MACHINE_FAMILY'];
|
||||||
|
const BASELINES_PATH = resolvePerfBaselinesPath(__dirname, MACHINE_FAMILY);
|
||||||
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
|
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
|
||||||
const TOLERANCE_PERCENT = 15;
|
const TOLERANCE_PERCENT = 15;
|
||||||
|
|
||||||
@@ -28,6 +33,7 @@ describe('CPU Performance Tests', () => {
|
|||||||
defaultTolerancePercent: TOLERANCE_PERCENT,
|
defaultTolerancePercent: TOLERANCE_PERCENT,
|
||||||
sampleCount: SAMPLE_COUNT,
|
sampleCount: SAMPLE_COUNT,
|
||||||
warmupCount: WARMUP_COUNT,
|
warmupCount: WARMUP_COUNT,
|
||||||
|
machineFamily: MACHINE_FAMILY,
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -27,6 +27,8 @@ const root = join(__dirname, '..');
|
|||||||
// remove npm install/build artifacts
|
// remove npm install/build artifacts
|
||||||
rmSync(join(root, 'node_modules'), { recursive: true, force: true });
|
rmSync(join(root, 'node_modules'), { recursive: true, force: true });
|
||||||
rmSync(join(root, 'bundle'), { recursive: true, force: true });
|
rmSync(join(root, 'bundle'), { recursive: true, force: true });
|
||||||
|
rmSync(join(root, '.tmp-perf-baselines.json'), { force: true });
|
||||||
|
rmSync(join(root, '.tmp-memory-baselines.json'), { force: true });
|
||||||
rmSync(join(root, 'packages/cli/src/generated/'), {
|
rmSync(join(root, 'packages/cli/src/generated/'), {
|
||||||
recursive: true,
|
recursive: true,
|
||||||
force: true,
|
force: true,
|
||||||
|
|||||||
@@ -0,0 +1,118 @@
|
|||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { execSync } from 'node:child_process';
|
||||||
|
import path from 'node:path';
|
||||||
|
|
||||||
|
const type = process.argv[2]; // 'perf' or 'memory'
|
||||||
|
const args = process.argv.slice(3);
|
||||||
|
|
||||||
|
if (type !== 'perf' && type !== 'memory') {
|
||||||
|
console.error('Invalid test type. Must be "perf" or "memory".');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const isLocal = !process.env.CI && !process.env.GITHUB_ACTIONS;
|
||||||
|
const noOptions = args.length === 0;
|
||||||
|
const testDir = type === 'perf' ? './perf-tests' : './memory-tests';
|
||||||
|
const updateEnv =
|
||||||
|
type === 'perf'
|
||||||
|
? 'UPDATE_PERF_BASELINES=true'
|
||||||
|
: 'UPDATE_MEMORY_BASELINES=true';
|
||||||
|
const tempBaselinesPath = path.resolve(
|
||||||
|
process.cwd(),
|
||||||
|
`.tmp-${type}-baselines.json`,
|
||||||
|
);
|
||||||
|
|
||||||
|
if (isLocal && noOptions) {
|
||||||
|
console.log(
|
||||||
|
`[Auto-Baseline] Detected local run without options for ${type} tests.`,
|
||||||
|
);
|
||||||
|
console.log('[Auto-Baseline] Updating baselines from main branch first...');
|
||||||
|
|
||||||
|
let originalBranch = '';
|
||||||
|
let isDirty = false;
|
||||||
|
|
||||||
|
try {
|
||||||
|
originalBranch = execSync('git rev-parse --abbrev-ref HEAD', {
|
||||||
|
encoding: 'utf-8',
|
||||||
|
}).trim();
|
||||||
|
const status = execSync('git status --porcelain', {
|
||||||
|
encoding: 'utf-8',
|
||||||
|
}).trim();
|
||||||
|
isDirty = status !== '';
|
||||||
|
|
||||||
|
if (isDirty) {
|
||||||
|
console.log('[Auto-Baseline] Stashing current changes...');
|
||||||
|
execSync('git stash push --include-untracked -m "temp-perf-test-run"');
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('[Auto-Baseline] Switching to main branch...');
|
||||||
|
execSync('git checkout main', { stdio: 'inherit' });
|
||||||
|
|
||||||
|
try {
|
||||||
|
console.log(
|
||||||
|
'[Auto-Baseline] Pulling latest changes for main from origin...',
|
||||||
|
);
|
||||||
|
execSync('git pull origin main', { stdio: 'inherit' });
|
||||||
|
} catch {
|
||||||
|
console.warn(
|
||||||
|
'[Auto-Baseline] Warning: git pull failed. Proceeding with local main branch.',
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
`[Auto-Baseline] Running update baselines for ${type} tests on main...`,
|
||||||
|
);
|
||||||
|
execSync(
|
||||||
|
`npx cross-env ${updateEnv} TEMP_BASELINES_PATH=${tempBaselinesPath} npx vitest run --root ${testDir}`,
|
||||||
|
{ stdio: 'inherit' },
|
||||||
|
);
|
||||||
|
} catch (err) {
|
||||||
|
console.error(
|
||||||
|
'[Auto-Baseline] Error during main-branch baseline update:',
|
||||||
|
err,
|
||||||
|
);
|
||||||
|
} finally {
|
||||||
|
if (originalBranch) {
|
||||||
|
console.log(
|
||||||
|
`[Auto-Baseline] Returning to original branch: ${originalBranch}...`,
|
||||||
|
);
|
||||||
|
try {
|
||||||
|
execSync(`git checkout ${originalBranch}`, { stdio: 'inherit' });
|
||||||
|
if (isDirty) {
|
||||||
|
console.log('[Auto-Baseline] Restoring stashed changes...');
|
||||||
|
execSync('git stash pop', { stdio: 'inherit' });
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
console.error(
|
||||||
|
'[Auto-Baseline] Critical error while trying to restore original branch state.',
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
`[Auto-Baseline] Running tests on branch ${originalBranch} against updated baselines...`,
|
||||||
|
);
|
||||||
|
try {
|
||||||
|
execSync(
|
||||||
|
`npx cross-env TEMP_BASELINES_PATH=${tempBaselinesPath} npx vitest run --root ${testDir}`,
|
||||||
|
{ stdio: 'inherit' },
|
||||||
|
);
|
||||||
|
} catch {
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Just run standard tests directly
|
||||||
|
const command = `npx vitest run --root ${testDir} ${args.join(' ')}`;
|
||||||
|
console.log(`[Standard] Running tests: ${command}`);
|
||||||
|
try {
|
||||||
|
execSync(command, { stdio: 'inherit' });
|
||||||
|
} catch {
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user