mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-06-10 11:12:35 -07:00
feat(optimization): implement manifest-driven extraction pipeline
- Implement `extract.ts` with robust character-aware parsing for snippets and tools.
- Consolidate research dependencies by moving `@ax-llm/ax` to root `optionalDependencies`.
- Relocate evaluation logic from `packages/core` to `scripts/optimization/lib/evals` to keep the production core lean.
- Add `optimization_targets` to `data/manifest.json` as the single source of truth for the pipeline.
- Implement comprehensive unit tests for extraction and variable masking with 100% pass rate.
- Update global config and linting rules to support the new optimization infrastructure.
This commit is contained in:
@@ -62,3 +62,4 @@ gemini-debug.log
|
||||
.gemini-clipboard/
|
||||
.eslintcache
|
||||
evals/logs/
|
||||
data/optimization/
|
||||
|
||||
@@ -66,6 +66,14 @@
|
||||
},
|
||||
"file_descriptions": {
|
||||
"data/tool_alignment.jsonl": "Ensures the model selects the correct built-in tool over generic shell commands and optimizes for brevity."
|
||||
},
|
||||
"optimization_targets": {
|
||||
"snippets": [
|
||||
"renderCoreMandates",
|
||||
"renderPrimaryWorkflows",
|
||||
"renderOperationalGuidelines",
|
||||
"renderGitRepo"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+5
-1
@@ -68,7 +68,11 @@ export default tseslint.config(
|
||||
},
|
||||
{
|
||||
// Rules for packages/*/src (TS/TSX)
|
||||
files: ['packages/*/src/**/*.{ts,tsx}', 'data/**/*.ts'],
|
||||
files: [
|
||||
'packages/*/src/**/*.{ts,tsx}',
|
||||
'data/**/*.ts',
|
||||
'scripts/optimization/**/*.ts',
|
||||
],
|
||||
plugins: {
|
||||
import: importPlugin,
|
||||
},
|
||||
|
||||
Generated
+48
-1
@@ -73,6 +73,7 @@
|
||||
"node": ">=20.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@ax-llm/ax": "^19.0.11",
|
||||
"@lydell/node-pty": "1.1.0",
|
||||
"@lydell/node-pty-darwin-arm64": "1.1.0",
|
||||
"@lydell/node-pty-darwin-x64": "1.1.0",
|
||||
@@ -179,6 +180,21 @@
|
||||
"node": ">=6.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@ax-llm/ax": {
|
||||
"version": "19.0.11",
|
||||
"resolved": "https://registry.npmjs.org/@ax-llm/ax/-/ax-19.0.11.tgz",
|
||||
"integrity": "sha512-U3ZYzBrmMDTDst32jxgH873gC4c75aYjzdCZgwQWy+CwSDL2SskwQX2kZAWGDmmSzs8BxskleoASzQUXuqRLfQ==",
|
||||
"hasInstallScript": true,
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"@opentelemetry/api": "^1.9.0",
|
||||
"dayjs": "^1.11.13"
|
||||
},
|
||||
"bin": {
|
||||
"ax": "cli/index.mjs"
|
||||
}
|
||||
},
|
||||
"node_modules/@azu/format-text": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/@azu/format-text/-/format-text-1.0.2.tgz",
|
||||
@@ -2292,6 +2308,7 @@
|
||||
"integrity": "sha512-t54CUOsFMappY1Jbzb7fetWeO0n6K0k/4+/ZpkS+3Joz8I4VcvY9OiEBFRYISqaI2fq5sCiPtAjRDOzVYG8m+Q==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@octokit/auth-token": "^6.0.0",
|
||||
"@octokit/graphql": "^9.0.2",
|
||||
@@ -2472,6 +2489,7 @@
|
||||
"resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz",
|
||||
"integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==",
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=8.0.0"
|
||||
}
|
||||
@@ -2521,6 +2539,7 @@
|
||||
"resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.5.0.tgz",
|
||||
"integrity": "sha512-ka4H8OM6+DlUhSAZpONu0cPBtPPTQKxbxVzC4CzVx5+K4JnroJVBtDzLAMx4/3CDTJXRvVFhpFjtl4SaiTNoyQ==",
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@opentelemetry/semantic-conventions": "^1.29.0"
|
||||
},
|
||||
@@ -2895,6 +2914,7 @@
|
||||
"resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.5.0.tgz",
|
||||
"integrity": "sha512-F8W52ApePshpoSrfsSk1H2yJn9aKjCrbpQF1M9Qii0GHzbfVeFUB+rc3X4aggyZD8x9Gu3Slua+s6krmq6Dt8g==",
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@opentelemetry/core": "2.5.0",
|
||||
"@opentelemetry/semantic-conventions": "^1.29.0"
|
||||
@@ -2928,6 +2948,7 @@
|
||||
"resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-2.5.0.tgz",
|
||||
"integrity": "sha512-BeJLtU+f5Gf905cJX9vXFQorAr6TAfK3SPvTFqP+scfIpDQEJfRaGJWta7sJgP+m4dNtBf9y3yvBKVAZZtJQVA==",
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@opentelemetry/core": "2.5.0",
|
||||
"@opentelemetry/resources": "2.5.0"
|
||||
@@ -2982,6 +3003,7 @@
|
||||
"resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.5.0.tgz",
|
||||
"integrity": "sha512-VzRf8LzotASEyNDUxTdaJ9IRJ1/h692WyArDBInf5puLCjxbICD6XkHgpuudis56EndyS7LYFmtTMny6UABNdQ==",
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@opentelemetry/core": "2.5.0",
|
||||
"@opentelemetry/resources": "2.5.0",
|
||||
@@ -4178,6 +4200,7 @@
|
||||
"integrity": "sha512-6mDvHUFSjyT2B2yeNx2nUgMxh9LtOWvkhIU3uePn2I2oyNymUAX1NIsdgviM4CH+JSrp2D2hsMvJOkxY+0wNRA==",
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"csstype": "^3.0.2"
|
||||
}
|
||||
@@ -4451,6 +4474,7 @@
|
||||
"integrity": "sha512-klQbnPAAiGYFyI02+znpBRLyjL4/BrBd0nyWkdC0s/6xFLkXYQ8OoRrSkqacS1ddVxf/LDyODIKbQ5TgKAf/Fg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@typescript-eslint/scope-manager": "8.56.1",
|
||||
"@typescript-eslint/types": "8.56.1",
|
||||
@@ -5298,6 +5322,7 @@
|
||||
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
|
||||
"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"acorn": "bin/acorn"
|
||||
},
|
||||
@@ -6879,6 +6904,13 @@
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/dayjs": {
|
||||
"version": "1.11.19",
|
||||
"resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.11.19.tgz",
|
||||
"integrity": "sha512-t5EcLVS6QPBNqM2z8fakk/NKel+Xzshgt8FFKAn+qwlD1pzZWxh0nVCrvFK7ZDb6XucZeF9z8C7CBWTRIVApAw==",
|
||||
"license": "MIT",
|
||||
"optional": true
|
||||
},
|
||||
"node_modules/debug": {
|
||||
"version": "4.4.3",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
|
||||
@@ -7901,6 +7933,7 @@
|
||||
"integrity": "sha512-VmQ+sifHUbI/IcSopBCF/HO3YiHQx/AVd3UVyYL6weuwW+HvON9VYn5l6Zl1WZzPWXPNZrSQpxwkkZ/VuvJZzg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@eslint-community/eslint-utils": "^4.8.0",
|
||||
"@eslint-community/regexpp": "^4.12.1",
|
||||
@@ -8533,6 +8566,7 @@
|
||||
"resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz",
|
||||
"integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"accepts": "^2.0.0",
|
||||
"body-parser": "^2.2.1",
|
||||
@@ -9847,6 +9881,7 @@
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.2.tgz",
|
||||
"integrity": "sha512-gJnaDHXKDayjt8ue0n8Gs0A007yKXj4Xzb8+cNjZeYsSzzwKc0Lr+OZgYwVfB0pHfUs17EPoLvrOsEaJ9mj+Tg==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=16.9.0"
|
||||
}
|
||||
@@ -10126,6 +10161,7 @@
|
||||
"resolved": "https://registry.npmjs.org/@jrichman/ink/-/ink-6.4.11.tgz",
|
||||
"integrity": "sha512-93LQlzT7vvZ1XJcmOMwN4s+6W334QegendeHOMnEJBlhnpIzr8bws6/aOEHG8ZCuVD/vNeeea5m1msHIdAY6ig==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@alcalzone/ansi-tokenize": "^0.2.1",
|
||||
"ansi-escapes": "^7.0.0",
|
||||
@@ -13808,6 +13844,7 @@
|
||||
"resolved": "https://registry.npmjs.org/react/-/react-19.2.4.tgz",
|
||||
"integrity": "sha512-9nfp2hYpCwOjAN+8TZFGhtWEwgvWHXqESH8qT89AT/lWklpLON22Lc8pEtnpsZz7VmawabSU0gCjnj8aC0euHQ==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
@@ -13818,6 +13855,7 @@
|
||||
"integrity": "sha512-ePrwPfxAnB+7hgnEr8vpKxL9cmnp7F322t8oqcPshbIQQhDKgFDW4tjhF2wjVbdXF9O/nyuy3sQWd9JGpiLPvA==",
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"shell-quote": "^1.6.1",
|
||||
"ws": "^7"
|
||||
@@ -15906,6 +15944,7 @@
|
||||
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz",
|
||||
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
@@ -16129,7 +16168,8 @@
|
||||
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
|
||||
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
|
||||
"dev": true,
|
||||
"license": "0BSD"
|
||||
"license": "0BSD",
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/tsx": {
|
||||
"version": "4.20.3",
|
||||
@@ -16137,6 +16177,7 @@
|
||||
"integrity": "sha512-qjbnuR9Tr+FJOMBqJCW5ehvIo/buZq7vH7qD7JziU98h6l3qGy0a/yPFjwO+y0/T7GFpNgNAvEcPPVfyT8rrPQ==",
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"esbuild": "~0.25.0",
|
||||
"get-tsconfig": "^4.7.5"
|
||||
@@ -16296,6 +16337,7 @@
|
||||
"integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==",
|
||||
"devOptional": true,
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
@@ -16519,6 +16561,7 @@
|
||||
"resolved": "https://registry.npmjs.org/vite/-/vite-7.2.2.tgz",
|
||||
"integrity": "sha512-BxAKBWmIbrDgrokdGZH1IgkIk/5mMHDreLDmCJ0qpyJaAteP8NvMhkwr/ZCQNqNH97bw/dANTE9PDzqwJghfMQ==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"esbuild": "^0.25.0",
|
||||
"fdir": "^6.5.0",
|
||||
@@ -16632,6 +16675,7 @@
|
||||
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz",
|
||||
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
@@ -16644,6 +16688,7 @@
|
||||
"resolved": "https://registry.npmjs.org/vitest/-/vitest-3.2.4.tgz",
|
||||
"integrity": "sha512-LUCP5ev3GURDysTWiP47wRRUpLKMOfPh+yKTx3kVIEiu5KOMeqzpnYNsKyOoVrULivR8tLcks4+lga33Whn90A==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@types/chai": "^5.2.2",
|
||||
"@vitest/expect": "3.2.4",
|
||||
@@ -17288,6 +17333,7 @@
|
||||
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
|
||||
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/colinhacks"
|
||||
}
|
||||
@@ -17687,6 +17733,7 @@
|
||||
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz",
|
||||
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
|
||||
+5
-2
@@ -62,8 +62,10 @@
|
||||
"release:version": "node scripts/version.js",
|
||||
"telemetry": "node scripts/telemetry.js",
|
||||
"data:validate": "tsx scripts/validate-data.ts",
|
||||
"data:format": "prettier --write 'data/**/*.ts' 'data/*.json' 'scripts/validate-data.ts' 'packages/core/src/evals/**/*.ts'",
|
||||
"data:lint": "eslint 'data/**/*.ts' 'scripts/validate-data.ts' 'packages/core/src/evals/**/*.ts'",
|
||||
"data:format": "prettier --write 'data/*.json' 'scripts/validate-data.ts' 'scripts/optimization/**/*.ts'",
|
||||
"data:lint": "eslint 'scripts/validate-data.ts' 'scripts/optimization/**/*.ts'",
|
||||
"optimize": "tsx scripts/optimization/run.ts",
|
||||
"optimize:extract": "tsx scripts/optimization/extract.ts",
|
||||
"check:lockfile": "node scripts/check-lockfile.js",
|
||||
"clean": "node scripts/clean.js",
|
||||
"pre-commit": "node scripts/pre-commit.js"
|
||||
@@ -145,6 +147,7 @@
|
||||
"simple-git": "^3.28.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@ax-llm/ax": "^19.0.11",
|
||||
"@lydell/node-pty": "1.1.0",
|
||||
"@lydell/node-pty-darwin-arm64": "1.1.0",
|
||||
"@lydell/node-pty-darwin-x64": "1.1.0",
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { evaluateTokenFrugality } from './tokenFrugality.js';
|
||||
import { MetricObjective, OptimizationDirection } from '../types.js';
|
||||
|
||||
// Unit tests for the character-count frugality metric.
describe('evaluateTokenFrugality', () => {
  it('should return the raw character count as the score', () => {
    const { score, objective, direction, reason } = evaluateTokenFrugality({
      output_text: 'Hello',
    });

    expect(score).toBe(5);
    expect(objective).toBe(MetricObjective.FRUGALITY);
    expect(direction).toBe(OptimizationDirection.MINIMIZE);
    expect(reason).toContain('contains 5 characters');
  });

  it('should flag if response is succinct (under threshold)', () => {
    const { metadata, reason } = evaluateTokenFrugality({
      output_text: 'Short',
    });

    expect(metadata?.['isOverThreshold']).toBe(false);
    expect(reason).toContain('Succinct response');
  });

  it('should flag if response exceeds chatter threshold', () => {
    // 50 characters of filler text.
    const longText = 'a'.repeat(50);
    const { metadata, reason } = evaluateTokenFrugality({
      output_text: longText,
    });

    expect(metadata?.['isOverThreshold']).toBe(true);
    expect(reason).toContain('Exceeds threshold');
  });

  it('should handle missing output text as 0 chars', () => {
    // No `output_text` key at all.
    const { score, reason } = evaluateTokenFrugality({});

    expect(score).toBe(0);
    expect(reason).toContain('contains 0 characters');
  });
});
|
||||
@@ -1,49 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { debugLogger } from '../../utils/debugLogger.js';
|
||||
import { DEFAULT_EVAL_CONFIG } from '../config.js';
|
||||
import { MetricObjective, OptimizationDirection } from '../types.js';
|
||||
import type { MetricResult } from '../types.js';
|
||||
|
||||
/**
 * Scores how frugal a model response is by counting the characters of its
 * text output ("chatter").
 *
 * The score is the raw character count and the result is tagged with
 * `OptimizationDirection.MINIMIZE`, so the consumer of the result decides how
 * to treat it; no penalty is applied here.
 *
 * @param prediction Model output; a missing `output_text` counts as 0 chars.
 * @param config Frugality settings; only `chattyThresholdChars` is read.
 * @returns The metric result with the count, the threshold and an
 *   `isOverThreshold` flag in `metadata`.
 */
export function evaluateTokenFrugality(
  prediction: { output_text?: string },
  config = DEFAULT_EVAL_CONFIG.objectives.frugality,
): MetricResult {
  const charCount = (prediction.output_text ?? '').length;

  debugLogger.debug(
    `[Eval:Frugality] Measuring output text length: ${charCount} chars.`,
  );

  const threshold = config.chattyThresholdChars;
  const isOverThreshold = charCount > threshold;

  // The suffix tells the reader on which side of the threshold the count fell.
  const verdict = isOverThreshold
    ? ` (Exceeds threshold of ${threshold})`
    : ' (Succinct response)';

  return {
    score: charCount,
    objective: MetricObjective.FRUGALITY,
    direction: OptimizationDirection.MINIMIZE,
    reason: `Response contains ${charCount} characters of non-tool text.${verdict}`,
    metadata: {
      charCount,
      threshold,
      isOverThreshold,
    },
  };
}
|
||||
@@ -0,0 +1,99 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
||||
import * as fs from 'node:fs';
|
||||
import { runExtraction } from './extract.js';
|
||||
|
||||
vi.mock('node:fs');
|
||||
|
||||
// Tests for runExtraction() against an in-memory file system (node:fs is mocked).
describe('extraction script', () => {
  // Minimal manifest: one snippet target and one tool.
  const manifestFixture = {
    data_inventory: {
      optimization_targets: {
        snippets: ['renderCoreMandates'],
      },
      tools: {
        read_file: {},
      },
    },
  };

  // Stand-in for snippets.ts: only the last template literal should be captured.
  const snippetsFixture = `
export function renderCoreMandates(options: any): string {
  const foo = "Ignore me";
  return \`# Core Mandate Instruction \${USER_VAR}\`.trim();
}
`;

  // Stand-in for gemini-3.ts: the tool key sits at a 2-space indent.
  const gemini3Fixture = `
  read_file: {
    description: 'Read file description.',
  },
`;

  // Stand-in for dynamic-declaration-helpers.ts.
  const helpersFixture = `
return \`This tool executes a given shell command as \\\`bash -c <command>\\\`. \${backgroundInstructions}\`;
name: EXIT_PLAN_MODE_TOOL_NAME,
description: 'Exit Plan Mode.',
name: ACTIVATE_SKILL_TOOL_NAME,
description: \`Activate skill.\`,
`;

  // Checked in order; the first path fragment that matches wins.
  const virtualFiles: ReadonlyArray<readonly [string, string]> = [
    ['manifest.json', JSON.stringify(manifestFixture)],
    ['snippets.ts', snippetsFixture],
    ['gemini-3.ts', gemini3Fixture],
    ['dynamic-declaration-helpers.ts', helpersFixture],
  ];

  beforeEach(() => {
    vi.clearAllMocks();
    vi.mocked(fs.existsSync).mockReturnValue(true);
    vi.mocked(fs.readFileSync).mockImplementation((path) => {
      if (typeof path !== 'string') return '';
      const hit = virtualFiles.find(([fragment]) => path.includes(fragment));
      return hit ? hit[1] : '';
    });
  });

  it('should extract snippets correctly (Step 1)', async () => {
    const extracted = await runExtraction();
    const snippet = extracted.find(
      (t) => t.id === 'snippets:renderCoreMandates',
    );

    expect(snippet).toBeDefined();
    expect(snippet?.originalText).toBe(
      '# Core Mandate Instruction ${USER_VAR}',
    );
    expect(snippet?.maskedText).toContain('[[GCLI_VAR_0]]');
  });

  it('should extract tools correctly (Step 2)', async () => {
    const extracted = await runExtraction();
    const tool = extracted.find(
      (t) => t.id === 'gemini3:read_file:description',
    );

    expect(tool).toBeDefined();
    expect(tool?.originalText).toBe('Read file description.');
  });

  it('should extract dynamic helpers correctly (Step 3)', async () => {
    const extracted = await runExtraction();

    const shell = extracted.find((t) => t.id === 'shell:darwin:description');
    expect(shell).toBeDefined();
    expect(shell?.maskedText).toContain('[[GCLI_VAR_0]]');

    const exitPlan = extracted.find(
      (t) => t.id === 'exit_plan_mode:description',
    );
    expect(exitPlan?.originalText).toBe('Exit Plan Mode.');
  });

  it('should write targets.json to the correct directory', async () => {
    await runExtraction();

    expect(fs.writeFileSync).toHaveBeenCalledWith(
      expect.stringContaining('targets.json'),
      expect.any(String),
    );
  });
});
|
||||
@@ -0,0 +1,175 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import * as fs from 'node:fs';
|
||||
import * as path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { maskVariables } from './lib/masking.js';
|
||||
|
||||
/**
 * One piece of prompt text pulled out of the source tree for optimization.
 */
export interface OptimizationTarget {
  /** Identifier of the target, e.g. `snippets:<name>` or `gemini3:<tool>:description`. */
  id: string;

  /** Path of the file the text was extracted from. */
  sourceFile: string;

  /** The extracted text, trimmed, exactly as it appears in the source. */
  originalText: string;

  /** `originalText` after being passed through `maskVariables`. */
  maskedText: string;

  /**
   * Mapping returned by `maskVariables` alongside `maskedText`.
   * NOTE(review): presumably placeholder -> original expression; confirm
   * against `lib/masking`.
   */
  maskMap: Record<string, string>;
}
|
||||
|
||||
/**
 * Finds the first `{ ... }` block that opens at or after `startIdx` and
 * returns the indices of its opening and matching closing brace.
 *
 * Braces are only counted in code position. The scanner skips:
 *  - `'...'` and `"..."` string literals (honoring backslash escapes),
 *  - `// ...` and `/* ... *\/` comments,
 *  - the text of template literals, while still scanning the code inside
 *    `${ ... }` interpolations (including nested template literals).
 *
 * Limitation: regular-expression literals are not recognized, so quotes or
 * braces inside a regex literal are treated as ordinary code characters.
 *
 * @param content Source text to scan.
 * @param startIdx Index from which to look for the opening `{`.
 * @returns `{ start, end }` (both inclusive brace indices), or `null` when
 *   there is no `{` after `startIdx` or the block is never closed.
 */
function findBlockBounds(
  content: string,
  startIdx: number,
): { start: number; end: number } | null {
  const blockStart = content.indexOf('{', startIdx);
  if (blockStart === -1) return null;

  // Context stack. TEMPLATE marks "inside template-literal text"; any value
  // >= 0 marks "inside a `${ ... }` interpolation" and holds the brace depth
  // local to that interpolation. An empty stack means top-level code.
  const TEMPLATE = -1;
  const frames: number[] = [];
  let depth = 0;
  let i = blockStart;

  while (i < content.length) {
    const ch = content[i];
    const next = content[i + 1];
    const top = frames.length > 0 ? frames[frames.length - 1] : undefined;

    if (top === TEMPLATE) {
      if (ch === '\\') {
        // Escaped character inside the template text (e.g. \` or \$).
        i += 2;
        continue;
      }
      if (ch === '`') {
        frames.pop();
      } else if (ch === '$' && next === '{') {
        frames.push(0);
        i += 2;
        continue;
      }
      i++;
      continue;
    }

    // From here on we are in code position (top level or an interpolation).
    if (ch === '/' && next === '/') {
      const eol = content.indexOf('\n', i + 2);
      if (eol === -1) return null; // Comment runs to EOF: block never closes.
      i = eol + 1;
      continue;
    }
    if (ch === '/' && next === '*') {
      const close = content.indexOf('*/', i + 2);
      if (close === -1) return null;
      i = close + 2;
      continue;
    }
    if (ch === "'" || ch === '"') {
      // Skip to the matching quote. An unterminated string is cut off at the
      // end of the line so one stray quote cannot swallow the rest of the file.
      let j = i + 1;
      while (j < content.length && content[j] !== ch && content[j] !== '\n') {
        j += content[j] === '\\' ? 2 : 1;
      }
      i = j + 1;
      continue;
    }
    if (ch === '`') {
      frames.push(TEMPLATE);
      i++;
      continue;
    }

    if (ch === '{') {
      if (top === undefined) depth++;
      else frames[frames.length - 1] = top + 1;
    } else if (ch === '}') {
      if (top === undefined) {
        depth--;
        if (depth === 0) return { start: blockStart, end: i };
      } else if (top === 0) {
        // Closes the `${` that opened this interpolation: back to template text.
        frames.pop();
      } else {
        frames[frames.length - 1] = top - 1;
      }
    }
    i++;
  }

  return null;
}
|
||||
|
||||
/**
 * Main extraction function.
 *
 * Reads `data/manifest.json` and collects optimization targets from three
 * places (all paths are resolved against the current working directory):
 *  1. Snippets: for every name in `data_inventory.optimization_targets.snippets`,
 *     the LAST template literal inside `export function <name>` in snippets.ts.
 *  2. Tools: for every key of `data_inventory.tools`, the first `description:`
 *     string found inside that tool's block in gemini-3.ts.
 *  3. Dynamic helpers: a fixed list of regex-matched descriptions in
 *     dynamic-declaration-helpers.ts.
 *
 * Source files that do not exist and targets that cannot be located are
 * skipped silently. Every extracted text is trimmed and passed through
 * `maskVariables`. The result is written to `data/optimization/targets.json`.
 *
 * @returns The extracted targets, in the order listed above.
 * @throws If the manifest cannot be read or is not valid JSON, or if the
 *   output file cannot be written.
 */
export async function runExtraction(): Promise<OptimizationTarget[]> {
  // Only the fields read below are described; the JSON itself is not validated.
  const manifest = JSON.parse(
    fs.readFileSync('data/manifest.json', 'utf8'),
  ) as {
    data_inventory?: {
      optimization_targets?: { snippets?: string[] };
      tools?: Record<string, unknown>;
    };
  };
  const targets: OptimizationTarget[] = [];

  // Masks `text` and appends it to the result list.
  const addTarget = (id: string, sourceFile: string, text: string): void => {
    const { maskedText, maskMap } = maskVariables(text);
    targets.push({ id, sourceFile, originalText: text, maskedText, maskMap });
  };

  // 1. Snippets
  const snippetNames =
    manifest.data_inventory?.optimization_targets?.snippets ?? [];
  const snippetsPath = 'packages/core/src/prompts/snippets.ts';
  if (fs.existsSync(snippetsPath)) {
    const content = fs.readFileSync(snippetsPath, 'utf8');
    for (const name of snippetNames) {
      const startIdx = content.indexOf(`export function ${name}`);
      if (startIdx === -1) continue;

      const bounds = findBlockBounds(content, startIdx);
      if (!bounds) continue;

      const body = content.substring(bounds.start, bounds.end + 1);
      // Capture the LAST template literal
      const tickMatches = [...body.matchAll(/`((?:[^`\\]|\\.)*)`/g)];
      const lastLiteral = tickMatches[tickMatches.length - 1];
      if (lastLiteral === undefined) continue;

      addTarget(`snippets:${name}`, snippetsPath, (lastLiteral[1] ?? '').trim());
    }
  }

  // 2. Tools
  const toolNames = Object.keys(manifest.data_inventory?.tools ?? {});
  const gemini3Path =
    'packages/core/src/tools/definitions/model-family-sets/gemini-3.ts';
  if (fs.existsSync(gemini3Path)) {
    const content = fs.readFileSync(gemini3Path, 'utf8');
    // Matches a template, single-quoted or double-quoted description. The
    // quoted alternatives allow backslash escapes so `'It\'s'` is not cut
    // short; the captured text is the raw source text, escapes included.
    const descRegex =
      /description:\s*(?:`((?:[^`\\]|\\.)*)`|'((?:[^'\\]|\\.)*)'|"((?:[^"\\]|\\.)*)")/;

    for (const name of toolNames) {
      // Tool names come from JSON keys: escape them before building a RegExp.
      const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      // Find tool key (2-space indent)
      const toolRegex = new RegExp(`^\\s{2}${escapedName}:\\s*\\{`, 'm');
      const match = toolRegex.exec(content);
      if (!match) continue;

      const bounds = findBlockBounds(content, match.index);
      if (!bounds) continue;

      const toolBlock = content.substring(match.index, bounds.end + 1);
      const descMatch = descRegex.exec(toolBlock);
      // `??` (not `||`) so an empty capture does not fall through to an
      // undefined group; an empty description is skipped instead of crashing.
      const text = (
        descMatch?.[1] ??
        descMatch?.[2] ??
        descMatch?.[3] ??
        ''
      ).trim();
      if (text === '') continue;

      addTarget(`gemini3:${name}:description`, gemini3Path, text);
    }
  }

  // 3. Dynamic Helpers
  const helpersPath =
    'packages/core/src/tools/definitions/dynamic-declaration-helpers.ts';
  if (fs.existsSync(helpersPath)) {
    const content = fs.readFileSync(helpersPath, 'utf8');
    const specs = [
      {
        id: 'shell:darwin:description',
        regex:
          /return `This tool executes a given shell command as \\`bash -c <command>\\`. ([\s\S]*?)`;/,
      },
      {
        id: 'shell:win32:description',
        regex:
          /return `This tool executes a given shell command as \\`powershell\.exe -NoProfile -Command <command>\\`. ([\s\S]*?)`;/,
      },
      {
        id: 'exit_plan_mode:description',
        regex:
          /name: EXIT_PLAN_MODE_TOOL_NAME,[\s\S]*?description:\s*'([^']*)',/,
      },
      {
        id: 'activate_skill:description',
        regex:
          /name: ACTIVATE_SKILL_TOOL_NAME,[\s\S]*?description:\s*`((?:[^`\\]|\\.)*)`,/,
      },
    ];
    for (const s of specs) {
      const captured = s.regex.exec(content)?.[1];
      if (!captured) continue;
      addTarget(s.id, helpersPath, captured.trim());
    }
  }

  const outputDir = 'data/optimization';
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(outputDir, { recursive: true });

  fs.writeFileSync(
    path.join(outputDir, 'targets.json'),
    JSON.stringify(targets, null, 2),
  );
  return targets;
}
|
||||
|
||||
// CLI Entrypoint
// Run the extraction only when this module is the script Node was started
// with (argv[1], symlinks resolved), not when it is imported (e.g. by tests).
const isMain =
  Boolean(process.argv[1]) &&
  fileURLToPath(import.meta.url) === fs.realpathSync(process.argv[1]);
if (isMain) {
  runExtraction()
    // eslint-disable-next-line no-console
    .then((t) => console.log(`✅ Extracted ${t.length} targets.`))
    .catch((err: unknown) => {
      // eslint-disable-next-line no-console
      console.error(err);
      // Report the failure to the caller (npm script / CI) instead of
      // exiting with status 0.
      process.exitCode = 1;
    });
}
|
||||
@@ -4,17 +4,10 @@
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { OptimizationDirection } from './types.js';
|
||||
|
||||
/**
|
||||
* Configuration for the Tool Alignment objective (The Accuracy Dimension).
|
||||
*/
|
||||
export interface AlignmentConfig {
|
||||
/**
|
||||
* Whether to increase or decrease the alignment score.
|
||||
*/
|
||||
direction: OptimizationDirection.MAXIMIZE;
|
||||
|
||||
/**
|
||||
* The relative importance of accuracy vs other objectives in the Pareto frontier.
|
||||
*/
|
||||
@@ -42,28 +35,37 @@ export interface AlignmentConfig {
|
||||
}
|
||||
|
||||
/**
|
||||
* Configuration for the Token Frugality objective (The Density Dimension).
|
||||
* Configuration for the Brevity objective (The Density Dimension).
|
||||
* Uses a word-count step-function to provide high-contrast signal for GEPA.
|
||||
*/
|
||||
export interface FrugalityConfig {
|
||||
/**
|
||||
* Whether to increase or decrease the token count.
|
||||
*/
|
||||
direction: OptimizationDirection.MINIMIZE;
|
||||
|
||||
export interface BrevityConfig {
|
||||
/**
|
||||
* Importance of brevity relative to accuracy.
|
||||
*/
|
||||
weight: number;
|
||||
|
||||
/**
|
||||
* The 'conversational budget' - max chars of non-tool text allowed before penalty.
|
||||
* TIER 1: Response is perfectly succinct (e.g., <= 10 words).
|
||||
*/
|
||||
chattyThresholdChars: number;
|
||||
succinctThresholdWords: number;
|
||||
succinctScore: number; // 1.0
|
||||
|
||||
/**
|
||||
* Amount subtracted from the functional score if the model is too verbose.
|
||||
* TIER 2: Response is acceptable but slightly verbose (e.g., <= 25 words).
|
||||
*/
|
||||
chattyPenalty: number;
|
||||
acceptableThresholdWords: number;
|
||||
acceptableScore: number; // 0.7
|
||||
|
||||
/**
|
||||
* TIER 3: Response is verbose (e.g., <= 50 words).
|
||||
*/
|
||||
verboseThresholdWords: number;
|
||||
verboseScore: number; // 0.4
|
||||
|
||||
/**
|
||||
* TIER 4: Response is very heavy (e.g., > 50 words).
|
||||
*/
|
||||
heavyScore: number; // 0.1
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -72,29 +74,33 @@ export interface FrugalityConfig {
|
||||
export interface EvalConfig {
|
||||
objectives: {
|
||||
alignment: AlignmentConfig;
|
||||
frugality: FrugalityConfig;
|
||||
brevity: BrevityConfig;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Default weights and thresholds for the Genetic-Pareto (GEPA) engine.
|
||||
* These constants drive the 'Selection Pressure' that evolves the prompt.
|
||||
* GEPA always MAXIMIZES, so higher scores represent better performance.
|
||||
*/
|
||||
export const DEFAULT_EVAL_CONFIG: EvalConfig = {
|
||||
objectives: {
|
||||
alignment: {
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
weight: 1.0, // PRIMARY: Accuracy cannot be sacrificed.
|
||||
hardFailureScore: 0.0,
|
||||
invalidResponseScore: 0.1,
|
||||
toolNameMatchOnlyScore: 0.4,
|
||||
functionalSuccessScore: 1.0,
|
||||
},
|
||||
frugality: {
|
||||
direction: OptimizationDirection.MINIMIZE,
|
||||
brevity: {
|
||||
weight: 0.6, // SECONDARY: Reward brevity once accuracy is high.
|
||||
chattyThresholdChars: 30, // Budget for 'I have updated the file' etc.
|
||||
chattyPenalty: 0.2, // Penalty creates a 'Reward Gap' for concise models.
|
||||
succinctThresholdWords: 10,
|
||||
succinctScore: 1.0,
|
||||
acceptableThresholdWords: 25,
|
||||
acceptableScore: 0.7,
|
||||
verboseThresholdWords: 50,
|
||||
verboseScore: 0.4,
|
||||
heavyScore: 0.1, // Never hard-zero brevity to allow gradient improvement.
|
||||
},
|
||||
},
|
||||
};
|
||||
@@ -0,0 +1,54 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { evaluateBrevity } from './brevityMetric.js';
|
||||
|
||||
// Table-driven tests: one row per tier of the word-count step function.
describe('evaluateBrevity 4-tier step-function', () => {
  const cases: ReadonlyArray<{
    title: string;
    prediction: { output_text?: string };
    score: number;
    tier: string;
  }> = [
    {
      title: 'should return 1.0 for a succinct response (<= 10 words)',
      // 8 words
      prediction: { output_text: 'I have updated the file for you now.' },
      score: 1.0,
      tier: 'succinct',
    },
    {
      title: 'should return 0.7 for an acceptable response (11-25 words)',
      // 15 words
      prediction: {
        output_text:
          'I have successfully updated the file. Everything looks good to proceed with the next step.',
      },
      score: 0.7,
      tier: 'acceptable',
    },
    {
      title: 'should return 0.4 for a verbose response (26-50 words)',
      // 29 words
      prediction: {
        output_text:
          'Certainly! I would be more than happy to assist you with that request. I am now proceeding to surgically update the file using the replace tool to ensure accuracy.',
      },
      score: 0.4,
      tier: 'verbose',
    },
    {
      title: 'should return 0.1 for a heavy response (> 50 words)',
      // 56 words
      prediction: {
        output_text:
          'Certainly! I would be more than happy to assist you with that request. I am now proceeding to surgically update the file using the replace tool to ensure accuracy. I will then verify the changes and let you know when I am finished with the task so we can move to the next stage of implementation.',
      },
      score: 0.1,
      tier: 'heavy',
    },
    {
      title: 'should handle missing output text as succinct (0 words)',
      // No `output_text` key at all.
      prediction: {},
      score: 1.0,
      tier: 'succinct',
    },
  ];

  for (const { title, prediction, score, tier } of cases) {
    it(title, () => {
      const result = evaluateBrevity(prediction);
      expect(result.score).toBe(score);
      expect(result.metadata?.tier).toBe(tier);
    });
  }
});
|
||||
@@ -0,0 +1,62 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js';
|
||||
import { DEFAULT_EVAL_CONFIG } from '../config.js';
|
||||
import { MetricObjective } from '../types.js';
|
||||
import type { MetricResult } from '../types.js';
|
||||
|
||||
/**
 * Evaluates the brevity of a model's response using a tiered 4-step word-count function.
 * Focuses on rewarding succinctness and providing a non-zero gradient for verbose models.
 *
 * The word count of the trimmed `output_text` is compared against the three
 * thresholds in `config`, smallest first; the first threshold it does not
 * exceed selects the score, and anything above the last threshold gets
 * `heavyScore`.
 *
 * @param prediction Model output; a missing or whitespace-only `output_text`
 *   counts as 0 words.
 * @param config Brevity thresholds and per-tier scores.
 * @returns The metric result; `metadata.tier` names the branch that was
 *   taken (`succinct`, `acceptable`, `verbose` or `heavy`).
 */
export function evaluateBrevity(
  prediction: { output_text?: string },
  config = DEFAULT_EVAL_CONFIG.objectives.brevity,
): MetricResult {
  const chatter = (prediction.output_text ?? '').trim();

  // Simple word count: split on runs of whitespace. The text is trimmed, so
  // no empty tokens appear; the empty string is special-cased because
  // ''.split(...) yields [''].
  const wordCount = chatter === '' ? 0 : chatter.split(/\s+/).length;

  debugLogger.debug(
    `[Eval:Brevity] Measuring output text word count: ${wordCount} words.`,
  );

  let score: number;
  let reason: string;
  // The tier is recorded in the branch that picks the score, so the label
  // stays correct when a caller passes non-default score values.
  let tier: 'succinct' | 'acceptable' | 'verbose' | 'heavy';

  if (wordCount <= config.succinctThresholdWords) {
    tier = 'succinct';
    score = config.succinctScore;
    reason = `Succinct: Response is within ${config.succinctThresholdWords} words.`;
  } else if (wordCount <= config.acceptableThresholdWords) {
    tier = 'acceptable';
    score = config.acceptableScore;
    reason = `Acceptable: Response is slightly verbose (${wordCount} words), exceeding ${config.succinctThresholdWords} words.`;
  } else if (wordCount <= config.verboseThresholdWords) {
    tier = 'verbose';
    score = config.verboseScore;
    reason = `Verbose: Response contains ${wordCount} words, exceeding acceptable limit of ${config.acceptableThresholdWords} words.`;
  } else {
    tier = 'heavy';
    score = config.heavyScore;
    reason = `Heavy: Response is excessively verbose (${wordCount} words).`;
  }

  return {
    score,
    objective: MetricObjective.BREVITY,
    reason,
    metadata: {
      wordCount,
      tier,
    },
  };
}
|
||||
+1
-2
@@ -6,7 +6,7 @@
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { evaluateToolAlignment } from './toolAlignment.js';
|
||||
import { MetricObjective, OptimizationDirection } from '../types.js';
|
||||
import { MetricObjective } from '../types.js';
|
||||
import type { Scenario } from '../schema.js';
|
||||
|
||||
describe('evaluateToolAlignment', () => {
|
||||
@@ -36,7 +36,6 @@ describe('evaluateToolAlignment', () => {
|
||||
const result = evaluateToolAlignment(prediction, mockScenario);
|
||||
expect(result.score).toBe(1.0);
|
||||
expect(result.objective).toBe(MetricObjective.ALIGNMENT);
|
||||
expect(result.direction).toBe(OptimizationDirection.MAXIMIZE);
|
||||
expect(result.reason).toContain('Functional Success');
|
||||
});
|
||||
|
||||
+2
-7
@@ -4,10 +4,10 @@
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { debugLogger } from '../../utils/debugLogger.js';
|
||||
import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js';
|
||||
import type { Scenario, ToolCall } from '../schema.js';
|
||||
import { DEFAULT_EVAL_CONFIG } from '../config.js';
|
||||
import { MetricObjective, OptimizationDirection } from '../types.js';
|
||||
import { MetricObjective } from '../types.js';
|
||||
import type { MetricResult } from '../types.js';
|
||||
|
||||
/**
|
||||
@@ -41,7 +41,6 @@ export function evaluateToolAlignment(
|
||||
return {
|
||||
score: config.hardFailureScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason: `Hard Failure: ${negative.reason}`,
|
||||
metadata: {
|
||||
matchedNegativeReason: negative.reason,
|
||||
@@ -59,7 +58,6 @@ export function evaluateToolAlignment(
|
||||
return {
|
||||
score: config.invalidResponseScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason: 'Model failed to produce any tool calls.',
|
||||
};
|
||||
}
|
||||
@@ -79,7 +77,6 @@ export function evaluateToolAlignment(
|
||||
return {
|
||||
score: config.invalidResponseScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason: 'Model selected the wrong tool(s).',
|
||||
};
|
||||
}
|
||||
@@ -100,7 +97,6 @@ export function evaluateToolAlignment(
|
||||
return {
|
||||
score: config.toolNameMatchOnlyScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason: 'Correct tool selected, but arguments are incorrect or missing.',
|
||||
};
|
||||
}
|
||||
@@ -112,7 +108,6 @@ export function evaluateToolAlignment(
|
||||
return {
|
||||
score: config.functionalSuccessScore,
|
||||
objective: MetricObjective.ALIGNMENT,
|
||||
direction: OptimizationDirection.MAXIMIZE,
|
||||
reason:
|
||||
'Functional Success: Tool and arguments align perfectly with golden scenario.',
|
||||
};
|
||||
@@ -4,20 +4,12 @@
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
/**
|
||||
* Defines whether an objective should be increased or decreased during optimization.
|
||||
*/
|
||||
export enum OptimizationDirection {
|
||||
MINIMIZE = 'minimize',
|
||||
MAXIMIZE = 'maximize',
|
||||
}
|
||||
|
||||
/**
|
||||
* The specific dimensions being measured by the evaluation pipeline.
|
||||
*/
|
||||
export enum MetricObjective {
|
||||
ALIGNMENT = 'alignment',
|
||||
FRUGALITY = 'frugality',
|
||||
BREVITY = 'brevity',
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -27,6 +19,7 @@ export enum MetricObjective {
|
||||
export interface MetricResult {
|
||||
/**
|
||||
* The numeric score calculated by the metric.
|
||||
* All metrics must provide a value where HIGHER is BETTER.
|
||||
*/
|
||||
score: number;
|
||||
|
||||
@@ -35,11 +28,6 @@ export interface MetricResult {
|
||||
*/
|
||||
objective: MetricObjective;
|
||||
|
||||
/**
|
||||
* Whether the goal is to increase or decrease this specific score.
|
||||
*/
|
||||
direction: OptimizationDirection;
|
||||
|
||||
/**
|
||||
* A human-readable (and optimizer-reflective) reason for the score.
|
||||
*/
|
||||
@@ -0,0 +1,41 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { maskVariables, unmaskVariables } from './masking.js';
|
||||
|
||||
describe('optimization masking utility', () => {
  it('should mask unique template variables with indexed tokens', () => {
    const template =
      'Use ${TOOL_A} to read ${FILE_PATH}. ${TOOL_A} is efficient.';
    const masked = maskVariables(template);

    for (const expectedToken of ['[[GCLI_VAR_0]]', '[[GCLI_VAR_1]]']) {
      expect(masked.maskedText).toContain(expectedToken);
    }

    // A variable that appears twice must be replaced by one shared token.
    const [toolAToken] = Object.entries(masked.maskMap).find(
      ([, variable]) => variable === '${TOOL_A}',
    )!;
    const occurrences = masked.maskedText.split(toolAToken).length - 1;
    expect(occurrences).toBe(2);
    expect(masked.maskedText).not.toContain('${TOOL_A}');
  });

  it('should perfectly restore original text during unmasking', () => {
    const original = 'Update ${OLD_STR} with ${NEW_STR} in ${FILE_PATH}.';
    const masked = maskVariables(original);

    // Mask followed by unmask must be an identity transformation.
    expect(unmaskVariables(masked.maskedText, masked.maskMap)).toBe(original);
  });

  it('should handle text with no variables', () => {
    const plain = 'Static text with no placeholders.';
    const masked = maskVariables(plain);

    expect(masked.maskedText).toBe(plain);
    expect(Object.keys(masked.maskMap).length).toBe(0);
  });
});
|
||||
@@ -0,0 +1,61 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
/**
|
||||
* Utility to protect TypeScript template variables from being "optimized" by the LLM.
|
||||
* Replaces ${VAR} with unique stable tokens and allows for perfect restoration.
|
||||
*/
|
||||
|
||||
/**
 * Result of {@link maskVariables}.
 */
export interface MaskResult {
  /** The input text with every matched `${...}` variable replaced by a token. */
  maskedText: string;
  /** Maps each generated token back to the exact `${...}` text it replaced. */
  maskMap: Record<string, string>;
}

const MASK_PREFIX = '[[GCLI_VAR_';
const MASK_SUFFIX = ']]';

/**
 * Replaces all instances of ${VARIABLE_NAME} with indexed tokens.
 *
 * A variable is `${` followed by one or more letters, digits, underscores or
 * dots and a closing `}`, so SCREAMING_SNAKE_CASE, camelCase and dotted paths
 * are all matched. Tokens are numbered in order of first appearance, and every
 * occurrence of the same variable receives the same token.
 *
 * If the input already contains text identical to a candidate token (for
 * example a literal `[[GCLI_VAR_0]]`), that index is skipped so that
 * unmasking cannot mistake the pre-existing text for a masked variable.
 *
 * @param text - Template text that may contain `${...}` variables.
 * @returns The masked text and the token-to-variable map needed to restore it.
 */
export function maskVariables(text: string): MaskResult {
  const variableRegex = /\${[a-zA-Z0-9_.]+}/g;
  const maskMap: Record<string, string> = {};
  const tokenByVariable = new Map<string, string>();
  let nextIndex = 0;

  // Returns the next indexed token that does not already occur in the input.
  const allocateToken = (): string => {
    let candidate = `${MASK_PREFIX}${nextIndex++}${MASK_SUFFIX}`;
    while (text.includes(candidate)) {
      candidate = `${MASK_PREFIX}${nextIndex++}${MASK_SUFFIX}`;
    }
    return candidate;
  };

  // Single pass over the text: the first sighting of a variable allocates its
  // token, later sightings reuse it.
  const maskedText = text.replace(variableRegex, (variable) => {
    let token = tokenByVariable.get(variable);
    if (token === undefined) {
      token = allocateToken();
      tokenByVariable.set(variable, token);
      maskMap[token] = variable;
    }
    return token;
  });

  return { maskedText, maskMap };
}
|
||||
|
||||
/**
 * Restores original ${VARIABLE_NAME} patterns using the provided mask map.
 *
 * All tokens are substituted in a single pass over `text`, so text inserted
 * for one token is never re-scanned and rewritten by another entry of the
 * map. Empty-string keys are ignored because they cannot identify a token.
 *
 * @param text - Text containing mask tokens (keys of `maskMap`).
 * @param maskMap - Maps each token to the text it should be replaced with.
 * @returns `text` with every occurrence of every token replaced by its mapped
 *   value; returned unchanged when the map has no usable tokens.
 */
export function unmaskVariables(
  text: string,
  maskMap: Record<string, string>,
): string {
  const tokens = Object.keys(maskMap).filter((token) => token !== '');
  if (tokens.length === 0) {
    return text;
  }

  // Longest tokens first so that, if one token is a prefix of another, the
  // alternation prefers the longer match.
  const alternation = tokens
    .sort((a, b) => b.length - a.length)
    .map((token) => token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('|');

  // A function replacer keeps `$` sequences in the restored text literal.
  return text.replace(
    new RegExp(alternation, 'g'),
    (token) => maskMap[token] ?? token,
  );
}
|
||||
@@ -10,7 +10,10 @@ export default defineConfig({
|
||||
test: {
|
||||
globals: true,
|
||||
environment: 'node',
|
||||
include: ['scripts/tests/**/*.test.{js,ts}'],
|
||||
include: [
|
||||
'scripts/tests/**/*.test.{js,ts}',
|
||||
'scripts/optimization/**/*.test.ts',
|
||||
],
|
||||
setupFiles: ['scripts/tests/test-setup.ts'],
|
||||
coverage: {
|
||||
provider: 'v8',
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
import * as fs from 'node:fs';
|
||||
import * as path from 'node:path';
|
||||
import type { Scenario } from '../packages/core/src/evals/schema.ts';
|
||||
import type { Scenario } from './optimization/lib/evals/schema.ts';
|
||||
|
||||
const MANIFEST_FILE = 'data/manifest.json';
|
||||
const DEFAULT_DATA_DIR = 'data';
|
||||
|
||||
Reference in New Issue
Block a user