diff --git a/.gitignore b/.gitignore index a2a6553cd3..bd2305f845 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,4 @@ gemini-debug.log .gemini-clipboard/ .eslintcache evals/logs/ +data/optimization/ diff --git a/data/manifest.json b/data/manifest.json index c8bd9d558c..65f51f488a 100644 --- a/data/manifest.json +++ b/data/manifest.json @@ -66,6 +66,14 @@ }, "file_descriptions": { "data/tool_alignment.jsonl": "Ensures the model selects the correct built-in tool over generic shell commands and optimizes for brevity." + }, + "optimization_targets": { + "snippets": [ + "renderCoreMandates", + "renderPrimaryWorkflows", + "renderOperationalGuidelines", + "renderGitRepo" + ] } } } diff --git a/eslint.config.js b/eslint.config.js index d7c9fa2487..2a66d96235 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -68,7 +68,11 @@ export default tseslint.config( }, { // Rules for packages/*/src (TS/TSX) - files: ['packages/*/src/**/*.{ts,tsx}', 'data/**/*.ts'], + files: [ + 'packages/*/src/**/*.{ts,tsx}', + 'data/**/*.ts', + 'scripts/optimization/**/*.ts', + ], plugins: { import: importPlugin, }, diff --git a/package-lock.json b/package-lock.json index 85448711c7..5e2c73b579 100644 --- a/package-lock.json +++ b/package-lock.json @@ -73,6 +73,7 @@ "node": ">=20.0.0" }, "optionalDependencies": { + "@ax-llm/ax": "^19.0.11", "@lydell/node-pty": "1.1.0", "@lydell/node-pty-darwin-arm64": "1.1.0", "@lydell/node-pty-darwin-x64": "1.1.0", @@ -179,6 +180,21 @@ "node": ">=6.0.0" } }, + "node_modules/@ax-llm/ax": { + "version": "19.0.11", + "resolved": "https://registry.npmjs.org/@ax-llm/ax/-/ax-19.0.11.tgz", + "integrity": "sha512-U3ZYzBrmMDTDst32jxgH873gC4c75aYjzdCZgwQWy+CwSDL2SskwQX2kZAWGDmmSzs8BxskleoASzQUXuqRLfQ==", + "hasInstallScript": true, + "license": "Apache-2.0", + "optional": true, + "dependencies": { + "@opentelemetry/api": "^1.9.0", + "dayjs": "^1.11.13" + }, + "bin": { + "ax": "cli/index.mjs" + } + }, "node_modules/@azu/format-text": { "version": "1.0.2", 
"resolved": "https://registry.npmjs.org/@azu/format-text/-/format-text-1.0.2.tgz", @@ -2292,6 +2308,7 @@ "integrity": "sha512-t54CUOsFMappY1Jbzb7fetWeO0n6K0k/4+/ZpkS+3Joz8I4VcvY9OiEBFRYISqaI2fq5sCiPtAjRDOzVYG8m+Q==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@octokit/auth-token": "^6.0.0", "@octokit/graphql": "^9.0.2", @@ -2472,6 +2489,7 @@ "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", "license": "Apache-2.0", + "peer": true, "engines": { "node": ">=8.0.0" } @@ -2521,6 +2539,7 @@ "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.5.0.tgz", "integrity": "sha512-ka4H8OM6+DlUhSAZpONu0cPBtPPTQKxbxVzC4CzVx5+K4JnroJVBtDzLAMx4/3CDTJXRvVFhpFjtl4SaiTNoyQ==", "license": "Apache-2.0", + "peer": true, "dependencies": { "@opentelemetry/semantic-conventions": "^1.29.0" }, @@ -2895,6 +2914,7 @@ "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.5.0.tgz", "integrity": "sha512-F8W52ApePshpoSrfsSk1H2yJn9aKjCrbpQF1M9Qii0GHzbfVeFUB+rc3X4aggyZD8x9Gu3Slua+s6krmq6Dt8g==", "license": "Apache-2.0", + "peer": true, "dependencies": { "@opentelemetry/core": "2.5.0", "@opentelemetry/semantic-conventions": "^1.29.0" @@ -2928,6 +2948,7 @@ "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-2.5.0.tgz", "integrity": "sha512-BeJLtU+f5Gf905cJX9vXFQorAr6TAfK3SPvTFqP+scfIpDQEJfRaGJWta7sJgP+m4dNtBf9y3yvBKVAZZtJQVA==", "license": "Apache-2.0", + "peer": true, "dependencies": { "@opentelemetry/core": "2.5.0", "@opentelemetry/resources": "2.5.0" @@ -2982,6 +3003,7 @@ "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.5.0.tgz", "integrity": "sha512-VzRf8LzotASEyNDUxTdaJ9IRJ1/h692WyArDBInf5puLCjxbICD6XkHgpuudis56EndyS7LYFmtTMny6UABNdQ==", "license": "Apache-2.0", + "peer": true, "dependencies": { "@opentelemetry/core": "2.5.0", 
"@opentelemetry/resources": "2.5.0", @@ -4178,6 +4200,7 @@ "integrity": "sha512-6mDvHUFSjyT2B2yeNx2nUgMxh9LtOWvkhIU3uePn2I2oyNymUAX1NIsdgviM4CH+JSrp2D2hsMvJOkxY+0wNRA==", "devOptional": true, "license": "MIT", + "peer": true, "dependencies": { "csstype": "^3.0.2" } @@ -4451,6 +4474,7 @@ "integrity": "sha512-klQbnPAAiGYFyI02+znpBRLyjL4/BrBd0nyWkdC0s/6xFLkXYQ8OoRrSkqacS1ddVxf/LDyODIKbQ5TgKAf/Fg==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.56.1", "@typescript-eslint/types": "8.56.1", @@ -5298,6 +5322,7 @@ "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "license": "MIT", + "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -6879,6 +6904,13 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/dayjs": { + "version": "1.11.19", + "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.11.19.tgz", + "integrity": "sha512-t5EcLVS6QPBNqM2z8fakk/NKel+Xzshgt8FFKAn+qwlD1pzZWxh0nVCrvFK7ZDb6XucZeF9z8C7CBWTRIVApAw==", + "license": "MIT", + "optional": true + }, "node_modules/debug": { "version": "4.4.3", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", @@ -7901,6 +7933,7 @@ "integrity": "sha512-VmQ+sifHUbI/IcSopBCF/HO3YiHQx/AVd3UVyYL6weuwW+HvON9VYn5l6Zl1WZzPWXPNZrSQpxwkkZ/VuvJZzg==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -8533,6 +8566,7 @@ "resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz", "integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==", "license": "MIT", + "peer": true, "dependencies": { "accepts": "^2.0.0", "body-parser": "^2.2.1", @@ -9847,6 +9881,7 @@ "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.2.tgz", "integrity": 
"sha512-gJnaDHXKDayjt8ue0n8Gs0A007yKXj4Xzb8+cNjZeYsSzzwKc0Lr+OZgYwVfB0pHfUs17EPoLvrOsEaJ9mj+Tg==", "license": "MIT", + "peer": true, "engines": { "node": ">=16.9.0" } @@ -10126,6 +10161,7 @@ "resolved": "https://registry.npmjs.org/@jrichman/ink/-/ink-6.4.11.tgz", "integrity": "sha512-93LQlzT7vvZ1XJcmOMwN4s+6W334QegendeHOMnEJBlhnpIzr8bws6/aOEHG8ZCuVD/vNeeea5m1msHIdAY6ig==", "license": "MIT", + "peer": true, "dependencies": { "@alcalzone/ansi-tokenize": "^0.2.1", "ansi-escapes": "^7.0.0", @@ -13808,6 +13844,7 @@ "resolved": "https://registry.npmjs.org/react/-/react-19.2.4.tgz", "integrity": "sha512-9nfp2hYpCwOjAN+8TZFGhtWEwgvWHXqESH8qT89AT/lWklpLON22Lc8pEtnpsZz7VmawabSU0gCjnj8aC0euHQ==", "license": "MIT", + "peer": true, "engines": { "node": ">=0.10.0" } @@ -13818,6 +13855,7 @@ "integrity": "sha512-ePrwPfxAnB+7hgnEr8vpKxL9cmnp7F322t8oqcPshbIQQhDKgFDW4tjhF2wjVbdXF9O/nyuy3sQWd9JGpiLPvA==", "devOptional": true, "license": "MIT", + "peer": true, "dependencies": { "shell-quote": "^1.6.1", "ws": "^7" @@ -15906,6 +15944,7 @@ "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, @@ -16129,7 +16168,8 @@ "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", "dev": true, - "license": "0BSD" + "license": "0BSD", + "peer": true }, "node_modules/tsx": { "version": "4.20.3", @@ -16137,6 +16177,7 @@ "integrity": "sha512-qjbnuR9Tr+FJOMBqJCW5ehvIo/buZq7vH7qD7JziU98h6l3qGy0a/yPFjwO+y0/T7GFpNgNAvEcPPVfyT8rrPQ==", "devOptional": true, "license": "MIT", + "peer": true, "dependencies": { "esbuild": "~0.25.0", "get-tsconfig": "^4.7.5" @@ -16296,6 +16337,7 @@ "integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==", 
"devOptional": true, "license": "Apache-2.0", + "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -16519,6 +16561,7 @@ "resolved": "https://registry.npmjs.org/vite/-/vite-7.2.2.tgz", "integrity": "sha512-BxAKBWmIbrDgrokdGZH1IgkIk/5mMHDreLDmCJ0qpyJaAteP8NvMhkwr/ZCQNqNH97bw/dANTE9PDzqwJghfMQ==", "license": "MIT", + "peer": true, "dependencies": { "esbuild": "^0.25.0", "fdir": "^6.5.0", @@ -16632,6 +16675,7 @@ "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, @@ -16644,6 +16688,7 @@ "resolved": "https://registry.npmjs.org/vitest/-/vitest-3.2.4.tgz", "integrity": "sha512-LUCP5ev3GURDysTWiP47wRRUpLKMOfPh+yKTx3kVIEiu5KOMeqzpnYNsKyOoVrULivR8tLcks4+lga33Whn90A==", "license": "MIT", + "peer": true, "dependencies": { "@types/chai": "^5.2.2", "@vitest/expect": "3.2.4", @@ -17288,6 +17333,7 @@ "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", "license": "MIT", + "peer": true, "funding": { "url": "https://github.com/sponsors/colinhacks" } @@ -17687,6 +17733,7 @@ "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, diff --git a/package.json b/package.json index d2e0900914..eef127a756 100644 --- a/package.json +++ b/package.json @@ -62,8 +62,10 @@ "release:version": "node scripts/version.js", "telemetry": "node scripts/telemetry.js", "data:validate": "tsx scripts/validate-data.ts", - "data:format": "prettier --write 'data/**/*.ts' 'data/*.json' 'scripts/validate-data.ts' 'packages/core/src/evals/**/*.ts'", - "data:lint": "eslint 'data/**/*.ts' 
'scripts/validate-data.ts' 'packages/core/src/evals/**/*.ts'", + "data:format": "prettier --write 'data/*.json' 'scripts/validate-data.ts' 'scripts/optimization/**/*.ts'", + "data:lint": "eslint 'scripts/validate-data.ts' 'scripts/optimization/**/*.ts'", + "optimize": "tsx scripts/optimization/run.ts", + "optimize:extract": "tsx scripts/optimization/extract.ts", "check:lockfile": "node scripts/check-lockfile.js", "clean": "node scripts/clean.js", "pre-commit": "node scripts/pre-commit.js" @@ -145,6 +147,7 @@ "simple-git": "^3.28.0" }, "optionalDependencies": { + "@ax-llm/ax": "^19.0.11", "@lydell/node-pty": "1.1.0", "@lydell/node-pty-darwin-arm64": "1.1.0", "@lydell/node-pty-darwin-x64": "1.1.0", diff --git a/packages/core/src/evals/metrics/tokenFrugality.test.ts b/packages/core/src/evals/metrics/tokenFrugality.test.ts deleted file mode 100644 index 599c23fdd4..0000000000 --- a/packages/core/src/evals/metrics/tokenFrugality.test.ts +++ /dev/null @@ -1,41 +0,0 @@ -/** - * @license - * Copyright 2026 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { describe, it, expect } from 'vitest'; -import { evaluateTokenFrugality } from './tokenFrugality.js'; -import { MetricObjective, OptimizationDirection } from '../types.js'; - -describe('evaluateTokenFrugality', () => { - it('should return the raw character count as the score', () => { - const prediction = { output_text: 'Hello' }; - const result = evaluateTokenFrugality(prediction); - expect(result.score).toBe(5); - expect(result.objective).toBe(MetricObjective.FRUGALITY); - expect(result.direction).toBe(OptimizationDirection.MINIMIZE); - expect(result.reason).toContain('contains 5 characters'); - }); - - it('should flag if response is succinct (under threshold)', () => { - const prediction = { output_text: 'Short' }; - const result = evaluateTokenFrugality(prediction); - expect(result.metadata?.['isOverThreshold']).toBe(false); - expect(result.reason).toContain('Succinct response'); - }); - - it('should 
flag if response exceeds chatter threshold', () => { - const prediction = { output_text: 'a'.repeat(50) }; - const result = evaluateTokenFrugality(prediction); - expect(result.metadata?.['isOverThreshold']).toBe(true); - expect(result.reason).toContain('Exceeds threshold'); - }); - - it('should handle missing output text as 0 chars', () => { - const prediction = {}; - const result = evaluateTokenFrugality(prediction); - expect(result.score).toBe(0); - expect(result.reason).toContain('contains 0 characters'); - }); -}); diff --git a/packages/core/src/evals/metrics/tokenFrugality.ts b/packages/core/src/evals/metrics/tokenFrugality.ts deleted file mode 100644 index 683f972f57..0000000000 --- a/packages/core/src/evals/metrics/tokenFrugality.ts +++ /dev/null @@ -1,49 +0,0 @@ -/** - * @license - * Copyright 2026 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { debugLogger } from '../../utils/debugLogger.js'; -import { DEFAULT_EVAL_CONFIG } from '../config.js'; -import { MetricObjective, OptimizationDirection } from '../types.js'; -import type { MetricResult } from '../types.js'; - -/** - * Evaluates the frugality of a model's response by measuring total character count. - * Focuses on reducing conversational noise ("chatter"). - */ -export function evaluateTokenFrugality( - prediction: { output_text?: string }, - config = DEFAULT_EVAL_CONFIG.objectives.frugality, -): MetricResult { - const chatter = prediction.output_text ?? ''; - const chatterLength = chatter.length; - - debugLogger.debug( - `[Eval:Frugality] Measuring output text length: ${chatterLength} chars.`, - ); - - // In Genetic-Pareto, the raw score (length) is the value to be MINIMIZED. - // We provide the raw count as the score, and the direction tells the optimizer how to handle it. 
- - let reason = `Response contains ${chatterLength} characters of non-tool text.`; - - if (chatterLength > config.chattyThresholdChars) { - reason += ` (Exceeds threshold of ${config.chattyThresholdChars})`; - } else { - reason += ' (Succinct response)'; - } - - return { - score: chatterLength, - objective: MetricObjective.FRUGALITY, - direction: OptimizationDirection.MINIMIZE, - reason, - metadata: { - charCount: chatterLength, - threshold: config.chattyThresholdChars, - isOverThreshold: chatterLength > config.chattyThresholdChars, - }, - }; -} diff --git a/scripts/optimization/extract.test.ts b/scripts/optimization/extract.test.ts new file mode 100644 index 0000000000..1223b19e4f --- /dev/null +++ b/scripts/optimization/extract.test.ts @@ -0,0 +1,99 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import * as fs from 'node:fs'; +import { runExtraction } from './extract.js'; + +vi.mock('node:fs'); + +describe('extraction script', () => { + const mockManifest = { + data_inventory: { + optimization_targets: { + snippets: ['renderCoreMandates'], + }, + tools: { + read_file: {}, + }, + }, + }; + + beforeEach(() => { + vi.clearAllMocks(); + vi.mocked(fs.existsSync).mockReturnValue(true); + vi.mocked(fs.readFileSync).mockImplementation((path) => { + if (typeof path !== 'string') return ''; + if (path.includes('manifest.json')) return JSON.stringify(mockManifest); + + // Mock snippets.ts + if (path.includes('snippets.ts')) { + return ` + export function renderCoreMandates(options: any): string { + const foo = "Ignore me"; + return \`# Core Mandate Instruction \${USER_VAR}\`.trim(); + } + `; + } + + // Mock gemini-3.ts + if (path.includes('gemini-3.ts')) { + return ` + read_file: { + description: 'Read file description.', + }, +`; + } + + // Mock dynamic helpers + if (path.includes('dynamic-declaration-helpers.ts')) { + return ` + return \`This tool executes 
a given shell command as \\\`bash -c \\\`. \${backgroundInstructions}\`; + name: EXIT_PLAN_MODE_TOOL_NAME, + description: 'Exit Plan Mode.', + name: ACTIVATE_SKILL_TOOL_NAME, + description: \`Activate skill.\`, + `; + } + return ''; + }); + }); + + it('should extract snippets correctly (Step 1)', async () => { + const targets = await runExtraction(); + const snippet = targets.find((t) => t.id === 'snippets:renderCoreMandates'); + expect(snippet).toBeDefined(); + expect(snippet?.originalText).toBe( + '# Core Mandate Instruction ${USER_VAR}', + ); + expect(snippet?.maskedText).toContain('[[GCLI_VAR_0]]'); + }); + + it('should extract tools correctly (Step 2)', async () => { + const targets = await runExtraction(); + const tool = targets.find((t) => t.id === 'gemini3:read_file:description'); + expect(tool).toBeDefined(); + expect(tool?.originalText).toBe('Read file description.'); + }); + + it('should extract dynamic helpers correctly (Step 3)', async () => { + const targets = await runExtraction(); + const shell = targets.find((t) => t.id === 'shell:darwin:description'); + expect(shell).toBeDefined(); + expect(shell?.maskedText).toContain('[[GCLI_VAR_0]]'); + + const exitPlan = targets.find((t) => t.id === 'exit_plan_mode:description'); + expect(exitPlan?.originalText).toBe('Exit Plan Mode.'); + }); + + it('should write targets.json to the correct directory', async () => { + await runExtraction(); + expect(fs.writeFileSync).toHaveBeenCalledWith( + expect.stringContaining('targets.json'), + expect.any(String), + ); + }); +}); diff --git a/scripts/optimization/extract.ts b/scripts/optimization/extract.ts new file mode 100644 index 0000000000..c28f44226f --- /dev/null +++ b/scripts/optimization/extract.ts @@ -0,0 +1,175 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { maskVariables } from 
'./lib/masking.js'; + +export interface OptimizationTarget { + id: string; + sourceFile: string; + originalText: string; + maskedText: string; + maskMap: Record; +} + +/** + * Robustly finds a block bounded by { } using character scanning. + */ +function findBlockBounds( + content: string, + startIdx: number, +): { start: number; end: number } | null { + const blockStart = content.indexOf('{', startIdx); + if (blockStart === -1) return null; + + let braceCount = 0; + for (let i = blockStart; i < content.length; i++) { + if (content[i] === '{') braceCount++; + if (content[i] === '}') braceCount--; + if (braceCount === 0) { + return { start: blockStart, end: i }; + } + } + return null; +} + +/** + * Main extraction function. + */ +export async function runExtraction() { + const manifest = JSON.parse(fs.readFileSync('data/manifest.json', 'utf8')); + const targets: OptimizationTarget[] = []; + + // 1. Snippets + const snippetNames = + manifest.data_inventory?.optimization_targets?.snippets || []; + const snippetsPath = 'packages/core/src/prompts/snippets.ts'; + if (fs.existsSync(snippetsPath)) { + const content = fs.readFileSync(snippetsPath, 'utf8'); + for (const name of snippetNames) { + const startIdx = content.indexOf(`export function ${name}`); + if (startIdx === -1) continue; + + const bounds = findBlockBounds(content, startIdx); + if (!bounds) continue; + + const body = content.substring(bounds.start, bounds.end + 1); + // Capture the LAST template literal + const tickMatches = [...body.matchAll(/`((?:[^`\\]|\\.)*)`/g)]; + if (tickMatches.length > 0) { + const text = tickMatches[tickMatches.length - 1][1].trim(); + const { maskedText, maskMap } = maskVariables(text); + targets.push({ + id: `snippets:${name}`, + sourceFile: snippetsPath, + originalText: text, + maskedText, + maskMap, + }); + } + } + } + + // 2. 
Tools + const toolNames = Object.keys(manifest.data_inventory?.tools || {}); + const gemini3Path = + 'packages/core/src/tools/definitions/model-family-sets/gemini-3.ts'; + if (fs.existsSync(gemini3Path)) { + const content = fs.readFileSync(gemini3Path, 'utf8'); + for (const name of toolNames) { + // Find tool key (2-space indent) + const toolRegex = new RegExp(`^\\s{2}${name}:\\s*\\{`, 'm'); + const match = toolRegex.exec(content); + if (!match) continue; + + const bounds = findBlockBounds(content, match.index); + if (!bounds) continue; + + const toolBlock = content.substring(match.index, bounds.end + 1); + const descRegex = + /description:\s*(?:`((?:[^`\\]|\\.)*)`|'([^']*)'|"([^"]*)")/g; + const descMatch = descRegex.exec(toolBlock); + + if (descMatch) { + const text = (descMatch[1] || descMatch[2] || descMatch[3]).trim(); + const { maskedText, maskMap } = maskVariables(text); + targets.push({ + id: `gemini3:${name}:description`, + sourceFile: gemini3Path, + originalText: text, + maskedText, + maskMap, + }); + } + } + } + + // 3. Dynamic Helpers + const helpersPath = + 'packages/core/src/tools/definitions/dynamic-declaration-helpers.ts'; + if (fs.existsSync(helpersPath)) { + const content = fs.readFileSync(helpersPath, 'utf8'); + const specs = [ + { + id: 'shell:darwin:description', + regex: + /return `This tool executes a given shell command as \\`bash -c \\`. ([\s\S]*?)`;/, + }, + { + id: 'shell:win32:description', + regex: + /return `This tool executes a given shell command as \\`powershell\.exe -NoProfile -Command \\`. 
([\s\S]*?)`;/, + }, + { + id: 'exit_plan_mode:description', + regex: + /name: EXIT_PLAN_MODE_TOOL_NAME,[\s\S]*?description:\s*'([^']*)',/, + }, + { + id: 'activate_skill:description', + regex: + /name: ACTIVATE_SKILL_TOOL_NAME,[\s\S]*?description:\s*`((?:[^`\\]|\\.)*)`,/, + }, + ]; + for (const s of specs) { + const m = s.regex.exec(content); + if (m && m[1]) { + const text = m[1].trim(); + const { maskedText, maskMap } = maskVariables(text); + targets.push({ + id: s.id, + sourceFile: helpersPath, + originalText: text, + maskedText, + maskMap, + }); + } + } + } + + const outputDir = 'data/optimization'; + if (!fs.existsSync(outputDir)) fs.mkdirSync(outputDir, { recursive: true }); + + fs.writeFileSync( + path.join(outputDir, 'targets.json'), + JSON.stringify(targets, null, 2), + ); + return targets; +} + +// CLI Entrypoint +const isMain = + process.argv[1] && + fileURLToPath(import.meta.url) === fs.realpathSync(process.argv[1]); +if (isMain) { + runExtraction() + // eslint-disable-next-line no-console + .then((t) => console.log(`✅ Extracted ${t.length} targets.`)) + // eslint-disable-next-line no-console + .catch(console.error); +} diff --git a/packages/core/src/evals/config.ts b/scripts/optimization/lib/evals/config.ts similarity index 61% rename from packages/core/src/evals/config.ts rename to scripts/optimization/lib/evals/config.ts index c45d2089e1..dad9fc0390 100644 --- a/packages/core/src/evals/config.ts +++ b/scripts/optimization/lib/evals/config.ts @@ -4,17 +4,10 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { OptimizationDirection } from './types.js'; - /** * Configuration for the Tool Alignment objective (The Accuracy Dimension). */ export interface AlignmentConfig { - /** - * Whether to increase or decrease the alignment score. - */ - direction: OptimizationDirection.MAXIMIZE; - /** * The relative importance of accuracy vs other objectives in the Pareto frontier. 
*/ @@ -42,28 +35,37 @@ export interface AlignmentConfig { } /** - * Configuration for the Token Frugality objective (The Density Dimension). + * Configuration for the Brevity objective (The Density Dimension). + * Uses a word-count step-function to provide high-contrast signal for GEPA. */ -export interface FrugalityConfig { - /** - * Whether to increase or decrease the token count. - */ - direction: OptimizationDirection.MINIMIZE; - +export interface BrevityConfig { /** * Importance of brevity relative to accuracy. */ weight: number; /** - * The 'conversational budget' - max chars of non-tool text allowed before penalty. + * TIER 1: Response is perfectly succinct (e.g., <= 10 words). */ - chattyThresholdChars: number; + succinctThresholdWords: number; + succinctScore: number; // 1.0 /** - * Amount subtracted from the functional score if the model is too verbose. + * TIER 2: Response is acceptable but slightly verbose (e.g., <= 25 words). */ - chattyPenalty: number; + acceptableThresholdWords: number; + acceptableScore: number; // 0.7 + + /** + * TIER 3: Response is verbose (e.g., <= 50 words). + */ + verboseThresholdWords: number; + verboseScore: number; // 0.4 + + /** + * TIER 4: Response is very heavy (e.g., > 50 words). + */ + heavyScore: number; // 0.1 } /** @@ -72,29 +74,33 @@ export interface FrugalityConfig { export interface EvalConfig { objectives: { alignment: AlignmentConfig; - frugality: FrugalityConfig; + brevity: BrevityConfig; }; } /** * Default weights and thresholds for the Genetic-Pareto (GEPA) engine. * These constants drive the 'Selection Pressure' that evolves the prompt. + * GEPA always MAXIMIZES, so higher scores represent better performance. */ export const DEFAULT_EVAL_CONFIG: EvalConfig = { objectives: { alignment: { - direction: OptimizationDirection.MAXIMIZE, weight: 1.0, // PRIMARY: Accuracy cannot be sacrificed. 
hardFailureScore: 0.0, invalidResponseScore: 0.1, toolNameMatchOnlyScore: 0.4, functionalSuccessScore: 1.0, }, - frugality: { - direction: OptimizationDirection.MINIMIZE, + brevity: { weight: 0.6, // SECONDARY: Reward brevity once accuracy is high. - chattyThresholdChars: 30, // Budget for 'I have updated the file' etc. - chattyPenalty: 0.2, // Penalty creates a 'Reward Gap' for concise models. + succinctThresholdWords: 10, + succinctScore: 1.0, + acceptableThresholdWords: 25, + acceptableScore: 0.7, + verboseThresholdWords: 50, + verboseScore: 0.4, + heavyScore: 0.1, // Never hard-zero brevity to allow gradient improvement. }, }, }; diff --git a/scripts/optimization/lib/evals/metrics/brevityMetric.test.ts b/scripts/optimization/lib/evals/metrics/brevityMetric.test.ts new file mode 100644 index 0000000000..350f2f0221 --- /dev/null +++ b/scripts/optimization/lib/evals/metrics/brevityMetric.test.ts @@ -0,0 +1,54 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect } from 'vitest'; +import { evaluateBrevity } from './brevityMetric.js'; + +describe('evaluateBrevity 4-tier step-function', () => { + it('should return 1.0 for a succinct response (<= 10 words)', () => { + const prediction = { output_text: 'I have updated the file for you now.' }; // 8 words + const result = evaluateBrevity(prediction); + expect(result.score).toBe(1.0); + expect(result.metadata?.tier).toBe('succinct'); + }); + + it('should return 0.7 for an acceptable response (11-25 words)', () => { + const text = + 'I have successfully updated the file. Everything looks good to proceed with the next step.'; + // 16 words + const prediction = { output_text: text }; + const result = evaluateBrevity(prediction); + expect(result.score).toBe(0.7); + expect(result.metadata?.tier).toBe('acceptable'); + }); + + it('should return 0.4 for a verbose response (26-50 words)', () => { + const text = + 'Certainly! 
I would be more than happy to assist you with that request. I am now proceeding to surgically update the file using the replace tool to ensure accuracy.'; + // 29 words + const prediction = { output_text: text }; + const result = evaluateBrevity(prediction); + expect(result.score).toBe(0.4); + expect(result.metadata?.tier).toBe('verbose'); + }); + + it('should return 0.1 for a heavy response (> 50 words)', () => { + const text = + 'Certainly! I would be more than happy to assist you with that request. I am now proceeding to surgically update the file using the replace tool to ensure accuracy. I will then verify the changes and let you know when I am finished with the task so we can move to the next stage of implementation.'; + // 53 words + const prediction = { output_text: text }; + const result = evaluateBrevity(prediction); + expect(result.score).toBe(0.1); + expect(result.metadata?.tier).toBe('heavy'); + }); + + it('should handle missing output text as succinct (0 words)', () => { + const prediction = {}; + const result = evaluateBrevity(prediction); + expect(result.score).toBe(1.0); + expect(result.metadata?.tier).toBe('succinct'); + }); +}); diff --git a/scripts/optimization/lib/evals/metrics/brevityMetric.ts b/scripts/optimization/lib/evals/metrics/brevityMetric.ts new file mode 100644 index 0000000000..2ccd128275 --- /dev/null +++ b/scripts/optimization/lib/evals/metrics/brevityMetric.ts @@ -0,0 +1,62 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js'; +import { DEFAULT_EVAL_CONFIG } from '../config.js'; +import { MetricObjective } from '../types.js'; +import type { MetricResult } from '../types.js'; + +/** + * Evaluates the brevity of a model's response using a tiered 4-step word-count function. + * Focuses on rewarding succinctness and providing a non-zero gradient for verbose models. 
+ */ +export function evaluateBrevity( + prediction: { output_text?: string }, + config = DEFAULT_EVAL_CONFIG.objectives.brevity, +): MetricResult { + const chatter = (prediction.output_text ?? '').trim(); + + // Simple word count: split by whitespace and filter out empty strings + const wordCount = chatter === '' ? 0 : chatter.split(/\s+/).length; + + debugLogger.debug( + `[Eval:Brevity] Measuring output text word count: ${wordCount} words.`, + ); + + let score: number; + let reason: string; + + if (wordCount <= config.succinctThresholdWords) { + score = config.succinctScore; + reason = `Succinct: Response is within ${config.succinctThresholdWords} words.`; + } else if (wordCount <= config.acceptableThresholdWords) { + score = config.acceptableScore; + reason = `Acceptable: Response is slightly verbose (${wordCount} words), exceeding ${config.succinctThresholdWords} words.`; + } else if (wordCount <= config.verboseThresholdWords) { + score = config.verboseScore; + reason = `Verbose: Response contains ${wordCount} words, exceeding acceptable limit of ${config.acceptableThresholdWords} words.`; + } else { + score = config.heavyScore; + reason = `Heavy: Response is excessively verbose (${wordCount} words).`; + } + + return { + score, + objective: MetricObjective.BREVITY, + reason, + metadata: { + wordCount, + tier: + score === 1.0 + ? 'succinct' + : score === 0.7 + ? 'acceptable' + : score === 0.4 + ? 
'verbose' + : 'heavy', + }, + }; +} diff --git a/packages/core/src/evals/metrics/toolAlignment.test.ts b/scripts/optimization/lib/evals/metrics/toolAlignment.test.ts similarity index 95% rename from packages/core/src/evals/metrics/toolAlignment.test.ts rename to scripts/optimization/lib/evals/metrics/toolAlignment.test.ts index 9f624e8dc0..b026d97fe7 100644 --- a/packages/core/src/evals/metrics/toolAlignment.test.ts +++ b/scripts/optimization/lib/evals/metrics/toolAlignment.test.ts @@ -6,7 +6,7 @@ import { describe, it, expect } from 'vitest'; import { evaluateToolAlignment } from './toolAlignment.js'; -import { MetricObjective, OptimizationDirection } from '../types.js'; +import { MetricObjective } from '../types.js'; import type { Scenario } from '../schema.js'; describe('evaluateToolAlignment', () => { @@ -36,7 +36,6 @@ describe('evaluateToolAlignment', () => { const result = evaluateToolAlignment(prediction, mockScenario); expect(result.score).toBe(1.0); expect(result.objective).toBe(MetricObjective.ALIGNMENT); - expect(result.direction).toBe(OptimizationDirection.MAXIMIZE); expect(result.reason).toContain('Functional Success'); }); diff --git a/packages/core/src/evals/metrics/toolAlignment.ts b/scripts/optimization/lib/evals/metrics/toolAlignment.ts similarity index 90% rename from packages/core/src/evals/metrics/toolAlignment.ts rename to scripts/optimization/lib/evals/metrics/toolAlignment.ts index d48f4dcce9..332ff77a59 100644 --- a/packages/core/src/evals/metrics/toolAlignment.ts +++ b/scripts/optimization/lib/evals/metrics/toolAlignment.ts @@ -4,10 +4,10 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { debugLogger } from '../../utils/debugLogger.js'; +import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js'; import type { Scenario, ToolCall } from '../schema.js'; import { DEFAULT_EVAL_CONFIG } from '../config.js'; -import { MetricObjective, OptimizationDirection } from '../types.js'; +import { MetricObjective } from 
'../types.js'; import type { MetricResult } from '../types.js'; /** @@ -41,7 +41,6 @@ export function evaluateToolAlignment( return { score: config.hardFailureScore, objective: MetricObjective.ALIGNMENT, - direction: OptimizationDirection.MAXIMIZE, reason: `Hard Failure: ${negative.reason}`, metadata: { matchedNegativeReason: negative.reason, @@ -59,7 +58,6 @@ export function evaluateToolAlignment( return { score: config.invalidResponseScore, objective: MetricObjective.ALIGNMENT, - direction: OptimizationDirection.MAXIMIZE, reason: 'Model failed to produce any tool calls.', }; } @@ -79,7 +77,6 @@ export function evaluateToolAlignment( return { score: config.invalidResponseScore, objective: MetricObjective.ALIGNMENT, - direction: OptimizationDirection.MAXIMIZE, reason: 'Model selected the wrong tool(s).', }; } @@ -100,7 +97,6 @@ export function evaluateToolAlignment( return { score: config.toolNameMatchOnlyScore, objective: MetricObjective.ALIGNMENT, - direction: OptimizationDirection.MAXIMIZE, reason: 'Correct tool selected, but arguments are incorrect or missing.', }; } @@ -112,7 +108,6 @@ export function evaluateToolAlignment( return { score: config.functionalSuccessScore, objective: MetricObjective.ALIGNMENT, - direction: OptimizationDirection.MAXIMIZE, reason: 'Functional Success: Tool and arguments align perfectly with golden scenario.', }; diff --git a/packages/core/src/evals/schema.ts b/scripts/optimization/lib/evals/schema.ts similarity index 100% rename from packages/core/src/evals/schema.ts rename to scripts/optimization/lib/evals/schema.ts diff --git a/packages/core/src/evals/types.ts b/scripts/optimization/lib/evals/types.ts similarity index 70% rename from packages/core/src/evals/types.ts rename to scripts/optimization/lib/evals/types.ts index f918588a71..a2267e9d66 100644 --- a/packages/core/src/evals/types.ts +++ b/scripts/optimization/lib/evals/types.ts @@ -4,20 +4,12 @@ * SPDX-License-Identifier: Apache-2.0 */ -/** - * Defines whether an objective 
should be increased or decreased during optimization. - */ -export enum OptimizationDirection { - MINIMIZE = 'minimize', - MAXIMIZE = 'maximize', -} - /** * The specific dimensions being measured by the evaluation pipeline. */ export enum MetricObjective { ALIGNMENT = 'alignment', - FRUGALITY = 'frugality', + BREVITY = 'brevity', } /** @@ -27,6 +19,7 @@ export enum MetricObjective { export interface MetricResult { /** * The numeric score calculated by the metric. + * All metrics must provide a value where HIGHER is BETTER. */ score: number; @@ -35,11 +28,6 @@ export interface MetricResult { */ objective: MetricObjective; - /** - * Whether the goal is to increase or decrease this specific score. - */ - direction: OptimizationDirection; - /** * A human-readable (and optimizer-reflective) reason for the score. */ diff --git a/scripts/optimization/lib/masking.test.ts b/scripts/optimization/lib/masking.test.ts new file mode 100644 index 0000000000..1bedb294ee --- /dev/null +++ b/scripts/optimization/lib/masking.test.ts @@ -0,0 +1,41 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect } from 'vitest'; +import { maskVariables, unmaskVariables } from './masking.js'; + +describe('optimization masking utility', () => { + it('should mask unique template variables with indexed tokens', () => { + const input = 'Use ${TOOL_A} to read ${FILE_PATH}. 
/**
 * Utility to protect TypeScript template variables from being "optimized" by the LLM.
 * Replaces ${VAR} with unique stable tokens and allows for perfect restoration.
 */

/** Output of {@link maskVariables}. */
export interface MaskResult {
  /** The input text with every `${...}` placeholder replaced by a token. */
  maskedText: string;
  /** Maps each generated token back to the placeholder it replaced. */
  maskMap: Record<string, string>;
}

const MASK_PREFIX = '[[GCLI_VAR_';
const MASK_SUFFIX = ']]';

// Matches placeholders such as ${variableName}, ${VARIABLE_NAME} or ${a.b}.
const VARIABLE_PATTERN = /\$\{[a-zA-Z0-9_.]+\}/g;

// Characters that carry special meaning inside a regular expression.
const REGEX_SPECIAL_CHARS = /[.*+?^${}()|[\]\\]/g;

/**
 * Replaces all instances of ${VARIABLE_NAME} with indexed tokens.
 * Supports both SCREAMING_SNAKE_CASE and camelCase variables.
 *
 * Tokens are numbered in order of each variable's first appearance, and every
 * occurrence of the same variable receives the same token.
 *
 * @param text - Text that may contain `${...}` placeholders.
 * @returns The masked text and the token-to-placeholder map needed to undo it.
 */
export function maskVariables(text: string): MaskResult {
  const maskMap: Record<string, string> = {};
  const tokenByVariable = new Map<string, string>();
  let nextIndex = 0;

  const nextToken = (): string =>
    `${MASK_PREFIX}${nextIndex++}${MASK_SUFFIX}`;

  const maskedText = text.replace(VARIABLE_PATTERN, (variable: string) => {
    const known = tokenByVariable.get(variable);
    if (known !== undefined) {
      return known;
    }

    // Skip any token that already occurs literally in the input; reusing it
    // would make unmasking rewrite text that was never a placeholder.
    let token = nextToken();
    while (text.includes(token)) {
      token = nextToken();
    }

    tokenByVariable.set(variable, token);
    maskMap[token] = variable;
    return token;
  });

  return { maskedText, maskMap };
}

/**
 * Restores original ${VARIABLE_NAME} patterns using the provided mask map.
 *
 * @param text - Text containing tokens produced by {@link maskVariables}.
 * @param maskMap - Token-to-placeholder map returned alongside the masked text.
 * @returns The text with every known token replaced by its placeholder; text
 *   that matches no key of `maskMap` is left untouched.
 */
export function unmaskVariables(
  text: string,
  maskMap: Record<string, string>,
): string {
  const tokens = Object.keys(maskMap);
  if (tokens.length === 0) {
    // An empty alternation would match at every position.
    return text;
  }

  // Longest keys first so a key that is a prefix of another never wins.
  const alternation = tokens
    .sort((a, b) => b.length - a.length)
    .map((token) => token.replace(REGEX_SPECIAL_CHARS, '\\$&'))
    .join('|');

  // Single pass: restored placeholders are never re-scanned, and the callback
  // form keeps `$` in the placeholder from being read as a replacement pattern.
  return text.replace(
    new RegExp(alternation, 'g'),
    (token: string) => maskMap[token] ?? token,
  );
}
'../packages/core/src/evals/schema.ts'; +import type { Scenario } from './optimization/lib/evals/schema.ts'; const MANIFEST_FILE = 'data/manifest.json'; const DEFAULT_DATA_DIR = 'data';