feat(optimization): implement manifest-driven extraction pipeline

- Implement `extract.ts` with robust character-aware parsing for snippets and tools.
- Consolidate research dependencies by moving `@ax-llm/ax` to root `optionalDependencies`.
- Relocate evaluation logic from `packages/core` to `scripts/optimization/lib/evals` to keep the production core lean.
- Add `optimization_targets` to `data/manifest.json` as the single source of truth for the pipeline.
- Implement comprehensive unit tests for extraction and variable masking with 100% pass rate.
- Update global config and linting rules to support the new optimization infrastructure.
2026-06-08 10:02:59 -07:00 · 2026-03-04 14:25:17 -08:00
parent 6c94c4d9ca
commit 59d377e5e0
20 changed files with 599 additions and 143 deletions
@@ -0,0 +1,99 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import * as fs from 'node:fs';
+import { runExtraction } from './extract.js';
+
+vi.mock('node:fs');
+
+describe('extraction script', () => {
+  const mockManifest = {
+    data_inventory: {
+      optimization_targets: {
+        snippets: ['renderCoreMandates'],
+      },
+      tools: {
+        read_file: {},
+      },
+    },
+  };
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+    vi.mocked(fs.existsSync).mockReturnValue(true);
+    vi.mocked(fs.readFileSync).mockImplementation((path) => {
+      if (typeof path !== 'string') return '';
+      if (path.includes('manifest.json')) return JSON.stringify(mockManifest);
+
+      // Mock snippets.ts: the double-quoted "Ignore me" string must be skipped; only the last template literal is the target.
+      if (path.includes('snippets.ts')) {
+        return `
+          export function renderCoreMandates(options: any): string {
+            const foo = "Ignore me";
+            return \`# Core Mandate Instruction \${USER_VAR}\`.trim();
+          }
+        `;
+      }
+
+      // Mock gemini-3.ts: one tool entry keyed at a 2-space indent with a single-quoted description.
+      if (path.includes('gemini-3.ts')) {
+        return `
+  read_file: {
+    description: 'Read file description.',
+  },
+`;
+      }
+
+      // Mock dynamic helpers: the bash shell description (with an interpolated variable) plus the exit-plan-mode and activate-skill entries.
+      if (path.includes('dynamic-declaration-helpers.ts')) {
+        return `
+          return \`This tool executes a given shell command as \\\`bash -c <command>\\\`. \${backgroundInstructions}\`;
+          name: EXIT_PLAN_MODE_TOOL_NAME,
+          description: 'Exit Plan Mode.',
+          name: ACTIVATE_SKILL_TOOL_NAME,
+          description: \`Activate skill.\`,
+        `;
+      }
+      return '';
+    });
+  });
+
+  it('should extract snippets correctly (Step 1)', async () => {
+    const targets = await runExtraction();
+    const snippet = targets.find((t) => t.id === 'snippets:renderCoreMandates');
+    expect(snippet).toBeDefined();
+    expect(snippet?.originalText).toBe(
+      '# Core Mandate Instruction ${USER_VAR}',
+    );
+    expect(snippet?.maskedText).toContain('[[GCLI_VAR_0]]');
+  });
+
+  it('should extract tools correctly (Step 2)', async () => {
+    const targets = await runExtraction();
+    const tool = targets.find((t) => t.id === 'gemini3:read_file:description');
+    expect(tool).toBeDefined();
+    expect(tool?.originalText).toBe('Read file description.');
+  });
+
+  it('should extract dynamic helpers correctly (Step 3)', async () => {
+    const targets = await runExtraction();
+    const shell = targets.find((t) => t.id === 'shell:darwin:description');
+    expect(shell).toBeDefined();
+    expect(shell?.maskedText).toContain('[[GCLI_VAR_0]]');
+
+    const exitPlan = targets.find((t) => t.id === 'exit_plan_mode:description');
+    expect(exitPlan?.originalText).toBe('Exit Plan Mode.');
+  });
+
+  it('should write targets.json to the correct directory', async () => {
+    await runExtraction();
+    expect(fs.writeFileSync).toHaveBeenCalledWith(
+      expect.stringContaining('targets.json'),
+      expect.any(String),
+    );
+  });
+});
@@ -0,0 +1,175 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { maskVariables } from './lib/masking.js';
+
+export interface OptimizationTarget {
+  id: string; // Unique target key, e.g. `snippets:<name>` or `gemini3:<tool>:description`.
+  sourceFile: string; // Relative path of the file the text was extracted from.
+  originalText: string; // Trimmed text exactly as captured from the source file.
+  maskedText: string; // `originalText` after `maskVariables` replaced `${...}` placeholders with tokens.
+  maskMap: Record<string, string>; // Token -> original placeholder, as returned by `maskVariables`.
+}
+
+/**
+ * Finds the `{ ... }` block that begins at the first `{` at or after `startIdx`.
+ *
+ * The scan is lexically aware: braces that appear inside single- or
+ * double-quoted strings, template-literal text, `//` line comments and block
+ * comments are ignored, while `${ ... }` interpolations inside template
+ * literals are scanned as code. Regular-expression literals are NOT
+ * recognised; an unterminated quote is abandoned at the end of its line so a
+ * stray apostrophe cannot swallow the rest of the file.
+ *
+ * @param content Full source text to scan.
+ * @param startIdx Index from which to look for the opening brace.
+ * @returns Indices of the opening and matching closing brace (both
+ *   inclusive), or `null` when there is no opening brace or it is never closed.
+ */
+function findBlockBounds(
+  content: string,
+  startIdx: number,
+): { start: number; end: number } | null {
+  const blockStart = content.indexOf('{', startIdx);
+  if (blockStart === -1) return null;
+
+  let depth = 0;
+  // Brace depth recorded each time a `${` interpolation is entered, so the
+  // matching `}` can switch the scanner back to template-text mode.
+  const interpolationDepths: number[] = [];
+  let inTemplateText = false;
+
+  for (let i = blockStart; i < content.length; i++) {
+    const ch = content[i];
+
+    if (inTemplateText) {
+      if (ch === '\\') {
+        i++; // Skip the escaped character (e.g. \` or \$).
+      } else if (ch === '`') {
+        inTemplateText = false;
+      } else if (ch === '$' && content[i + 1] === '{') {
+        interpolationDepths.push(depth);
+        depth++;
+        inTemplateText = false;
+        i++; // Consume the `{` of `${`.
+      }
+      continue;
+    }
+
+    if (ch === '/' && content[i + 1] === '/') {
+      const lineEnd = content.indexOf('\n', i);
+      if (lineEnd === -1) return null; // Comment runs to EOF: block never closes.
+      i = lineEnd;
+      continue;
+    }
+
+    if (ch === '/' && content[i + 1] === '*') {
+      const commentEnd = content.indexOf('*/', i + 2);
+      if (commentEnd === -1) return null;
+      i = commentEnd + 1;
+      continue;
+    }
+
+    if (ch === "'" || ch === '"') {
+      let j = i + 1;
+      while (j < content.length && content[j] !== ch && content[j] !== '\n') {
+        if (content[j] === '\\') j++; // Skip the escaped character.
+        j++;
+      }
+      i = j;
+      continue;
+    }
+
+    if (ch === '`') {
+      inTemplateText = true;
+      continue;
+    }
+
+    if (ch === '{') {
+      depth++;
+      continue;
+    }
+
+    if (ch === '}') {
+      depth--;
+      const openInterpolations = interpolationDepths.length;
+      if (
+        openInterpolations > 0 &&
+        interpolationDepths[openInterpolations - 1] === depth
+      ) {
+        // This brace closes a `${ ... }`; resume scanning template text.
+        interpolationDepths.pop();
+        inTemplateText = true;
+        continue;
+      }
+      if (depth === 0) {
+        return { start: blockStart, end: i };
+      }
+    }
+  }
+  return null;
+}
+
+/**
+ * Main extraction function.
+ *
+ * Reads the manifest at `data/manifest.json`, extracts prompt text from three
+ * source files and writes the result to `data/optimization/targets.json`:
+ *  1. Snippets: for every name in `data_inventory.optimization_targets.snippets`,
+ *     the last template literal inside `export function <name>`.
+ *  2. Tools: for every key of `data_inventory.tools`, the first `description`
+ *     inside the matching tool entry of `gemini-3.ts`.
+ *  3. Dynamic helpers: a fixed list of descriptions located by regex.
+ * Source files that do not exist and targets that cannot be located (or whose
+ * text is empty) are skipped silently. All paths are relative to the current
+ * working directory.
+ *
+ * @returns The extracted targets (the same array that is written to disk).
+ * @throws If the manifest cannot be read or is not valid JSON, or if the
+ *   output file cannot be written.
+ */
+export async function runExtraction(): Promise<OptimizationTarget[]> {
+  const manifest = JSON.parse(fs.readFileSync('data/manifest.json', 'utf8'));
+  const targets: OptimizationTarget[] = [];
+
+  // Masks the text and records the target; shared by all three steps.
+  const addTarget = (id: string, sourceFile: string, text: string): void => {
+    const { maskedText, maskMap } = maskVariables(text);
+    targets.push({ id, sourceFile, originalText: text, maskedText, maskMap });
+  };
+
+  // 1. Snippets
+  const snippetNames: string[] =
+    manifest.data_inventory?.optimization_targets?.snippets || [];
+  const snippetsPath = 'packages/core/src/prompts/snippets.ts';
+  if (fs.existsSync(snippetsPath)) {
+    const content = fs.readFileSync(snippetsPath, 'utf8');
+    for (const name of snippetNames) {
+      const startIdx = content.indexOf(`export function ${name}`);
+      if (startIdx === -1) continue;
+
+      const bounds = findBlockBounds(content, startIdx);
+      if (!bounds) continue;
+
+      const body = content.substring(bounds.start, bounds.end + 1);
+      // Capture the LAST template literal
+      const tickMatches = [...body.matchAll(/`((?:[^`\\]|\\.)*)`/g)];
+      if (tickMatches.length > 0) {
+        const text = tickMatches[tickMatches.length - 1][1].trim();
+        addTarget(`snippets:${name}`, snippetsPath, text);
+      }
+    }
+  }
+
+  // 2. Tools
+  const toolNames = Object.keys(manifest.data_inventory?.tools || {});
+  const gemini3Path =
+    'packages/core/src/tools/definitions/model-family-sets/gemini-3.ts';
+  if (fs.existsSync(gemini3Path)) {
+    const content = fs.readFileSync(gemini3Path, 'utf8');
+    // Matches a description written as a template literal, a single-quoted
+    // string or a double-quoted string (capture groups 1, 2 and 3).
+    const descRegex =
+      /description:\s*(?:`((?:[^`\\]|\\.)*)`|'([^']*)'|"([^"]*)")/;
+    for (const name of toolNames) {
+      // Manifest keys are data, not patterns: escape regex metacharacters.
+      const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+      // Find tool key (2-space indent)
+      const toolRegex = new RegExp(`^\\s{2}${escapedName}:\\s*\\{`, 'm');
+      const match = toolRegex.exec(content);
+      if (!match) continue;
+
+      const bounds = findBlockBounds(content, match.index);
+      if (!bounds) continue;
+
+      const toolBlock = content.substring(match.index, bounds.end + 1);
+      const descMatch = descRegex.exec(toolBlock);
+      // `??` picks whichever quote style matched; an empty description is
+      // skipped (as in step 3) rather than crashing on `undefined.trim()`.
+      const rawText = descMatch?.[1] ?? descMatch?.[2] ?? descMatch?.[3];
+      if (!rawText) continue;
+
+      addTarget(`gemini3:${name}:description`, gemini3Path, rawText.trim());
+    }
+  }
+
+  // 3. Dynamic Helpers
+  const helpersPath =
+    'packages/core/src/tools/definitions/dynamic-declaration-helpers.ts';
+  if (fs.existsSync(helpersPath)) {
+    const content = fs.readFileSync(helpersPath, 'utf8');
+    const specs = [
+      {
+        id: 'shell:darwin:description',
+        regex:
+          /return `This tool executes a given shell command as \\`bash -c <command>\\`. ([\s\S]*?)`;/,
+      },
+      {
+        id: 'shell:win32:description',
+        regex:
+          /return `This tool executes a given shell command as \\`powershell\.exe -NoProfile -Command <command>\\`. ([\s\S]*?)`;/,
+      },
+      {
+        id: 'exit_plan_mode:description',
+        regex:
+          /name: EXIT_PLAN_MODE_TOOL_NAME,[\s\S]*?description:\s*'([^']*)',/,
+      },
+      {
+        id: 'activate_skill:description',
+        regex:
+          /name: ACTIVATE_SKILL_TOOL_NAME,[\s\S]*?description:\s*`((?:[^`\\]|\\.)*)`,/,
+      },
+    ];
+    for (const s of specs) {
+      const m = s.regex.exec(content);
+      if (m && m[1]) {
+        addTarget(s.id, helpersPath, m[1].trim());
+      }
+    }
+  }
+
+  const outputDir = 'data/optimization';
+  if (!fs.existsSync(outputDir)) fs.mkdirSync(outputDir, { recursive: true });
+
+  fs.writeFileSync(
+    path.join(outputDir, 'targets.json'),
+    JSON.stringify(targets, null, 2),
+  );
+  return targets;
+}
+
+// CLI Entrypoint: run only when this module is the script Node was launched
+// with, not when it is imported by another module.
+const isMain =
+  process.argv[1] &&
+  fileURLToPath(import.meta.url) === fs.realpathSync(process.argv[1]);
+if (isMain) {
+  runExtraction()
+    // eslint-disable-next-line no-console
+    .then((t) => console.log(`✅ Extracted ${t.length} targets.`))
+    .catch((err: unknown) => {
+      // eslint-disable-next-line no-console
+      console.error(err);
+      // Report failure to the caller (CI, npm scripts) instead of exiting 0.
+      process.exitCode = 1;
+    });
+}
@@ -0,0 +1,106 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Configuration for the Tool Alignment objective (The Accuracy Dimension).
+ * Each `*Score` field is the value `evaluateToolAlignment` returns for the
+ * corresponding outcome; the values in parentheses are the defaults from
+ * `DEFAULT_EVAL_CONFIG`.
+ */
+export interface AlignmentConfig {
+  /**
+   * The relative importance of accuracy vs other objectives in the Pareto frontier.
+   */
+  weight: number;
+
+  /**
+   * Strongest negative signal (0.0): used when model falls into a known shell trap,
+   * i.e. the prediction contains every tool call of one of the scenario's negatives.
+   */
+  hardFailureScore: number;
+
+  /**
+   * Neutral negative signal (0.1): used when model fails to produce a valid tool call.
+   * Also returned when the predicted tool names do not cover the expected ones.
+   */
+  invalidResponseScore: number;
+
+  /**
+   * Partial positive signal (0.4): model chose the right tool but hallucinated arguments.
+   */
+  toolNameMatchOnlyScore: number;
+
+  /**
+   * Maximum positive signal (1.0): model matched the golden signature perfectly.
+   */
+  functionalSuccessScore: number;
+}
+
+/**
+ * Configuration for the Brevity objective (The Density Dimension).
+ * Uses a word-count step-function to provide high-contrast signal for GEPA.
+ * Thresholds are inclusive upper bounds: `evaluateBrevity` compares the word
+ * count with `<=` and picks the first tier that fits.
+ */
+export interface BrevityConfig {
+  /**
+   * Importance of brevity relative to accuracy.
+   */
+  weight: number;
+
+  /**
+   * TIER 1: Response is perfectly succinct (e.g., <= 10 words).
+   */
+  succinctThresholdWords: number;
+  succinctScore: number; // Score for tier 1 (default 1.0)
+
+  /**
+   * TIER 2: Response is acceptable but slightly verbose (e.g., <= 25 words).
+   */
+  acceptableThresholdWords: number;
+  acceptableScore: number; // Score for tier 2 (default 0.7)
+
+  /**
+   * TIER 3: Response is verbose (e.g., <= 50 words).
+   */
+  verboseThresholdWords: number;
+  verboseScore: number; // Score for tier 3 (default 0.4)
+
+  /**
+   * TIER 4: Response is very heavy (e.g., > 50 words).
+   * There is no threshold field: anything above `verboseThresholdWords` lands here.
+   */
+  heavyScore: number; // Score for tier 4 (default 0.1)
+}
+
+/**
+ * Global evaluation configuration for multi-objective optimization.
+ * Holds one configuration object per objective: `alignment` (tool-call
+ * accuracy) and `brevity` (response word count).
+ */
+export interface EvalConfig {
+  objectives: {
+    alignment: AlignmentConfig;
+    brevity: BrevityConfig;
+  };
+}
+
+/**
+ * Default weights and thresholds for the Genetic-Pareto (GEPA) engine.
+ * These constants drive the 'Selection Pressure' that evolves the prompt.
+ * GEPA always MAXIMIZES, so higher scores represent better performance.
+ *
+ * `objectives.alignment` and `objectives.brevity` are the default `config`
+ * arguments of `evaluateToolAlignment` and `evaluateBrevity` respectively.
+ */
+export const DEFAULT_EVAL_CONFIG: EvalConfig = {
+  objectives: {
+    alignment: {
+      weight: 1.0, // PRIMARY: Accuracy cannot be sacrificed.
+      hardFailureScore: 0.0,
+      invalidResponseScore: 0.1,
+      toolNameMatchOnlyScore: 0.4,
+      functionalSuccessScore: 1.0,
+    },
+    brevity: {
+      weight: 0.6, // SECONDARY: Reward brevity once accuracy is high.
+      succinctThresholdWords: 10,
+      succinctScore: 1.0,
+      acceptableThresholdWords: 25,
+      acceptableScore: 0.7,
+      verboseThresholdWords: 50,
+      verboseScore: 0.4,
+      heavyScore: 0.1, // Never hard-zero brevity to allow gradient improvement.
+    },
+  },
+};
@@ -0,0 +1,54 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { evaluateBrevity } from './brevityMetric.js';
+
+describe('evaluateBrevity 4-tier step-function', () => {
+  it('should return 1.0 for a succinct response (<= 10 words)', () => {
+    const prediction = { output_text: 'I have updated the file for you now.' }; // 8 words
+    const result = evaluateBrevity(prediction);
+    expect(result.score).toBe(1.0);
+    expect(result.metadata?.tier).toBe('succinct');
+  });
+
+  it('should return 0.7 for an acceptable response (11-25 words)', () => {
+    const text =
+      'I have successfully updated the file. Everything looks good to proceed with the next step.';
+    // 15 words (whitespace-separated, the same way evaluateBrevity counts)
+    const prediction = { output_text: text };
+    const result = evaluateBrevity(prediction);
+    expect(result.score).toBe(0.7);
+    expect(result.metadata?.tier).toBe('acceptable');
+  });
+
+  it('should return 0.4 for a verbose response (26-50 words)', () => {
+    const text =
+      'Certainly! I would be more than happy to assist you with that request. I am now proceeding to surgically update the file using the replace tool to ensure accuracy.';
+    // 29 words
+    const prediction = { output_text: text };
+    const result = evaluateBrevity(prediction);
+    expect(result.score).toBe(0.4);
+    expect(result.metadata?.tier).toBe('verbose');
+  });
+
+  it('should return 0.1 for a heavy response (> 50 words)', () => {
+    const text =
+      'Certainly! I would be more than happy to assist you with that request. I am now proceeding to surgically update the file using the replace tool to ensure accuracy. I will then verify the changes and let you know when I am finished with the task so we can move to the next stage of implementation.';
+    // 56 words
+    const prediction = { output_text: text };
+    const result = evaluateBrevity(prediction);
+    expect(result.score).toBe(0.1);
+    expect(result.metadata?.tier).toBe('heavy');
+  });
+
+  it('should handle missing output text as succinct (0 words)', () => {
+    const prediction = {};
+    const result = evaluateBrevity(prediction);
+    expect(result.score).toBe(1.0);
+    expect(result.metadata?.tier).toBe('succinct');
+  });
+});
@@ -0,0 +1,62 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js';
+import { DEFAULT_EVAL_CONFIG } from '../config.js';
+import { MetricObjective } from '../types.js';
+import type { MetricResult } from '../types.js';
+
+/**
+ * Evaluates the brevity of a model's response using a tiered 4-step word-count function.
+ * Focuses on rewarding succinctness and providing a non-zero gradient for verbose models.
+ *
+ * The word count is compared against the inclusive thresholds in `config`,
+ * lowest first; the first tier that fits supplies the score. The tier label
+ * in `metadata.tier` is taken from the branch that matched, so it stays
+ * correct when a caller passes custom scores.
+ *
+ * @param prediction Model output; a missing `output_text` counts as 0 words.
+ * @param config Thresholds and scores; defaults to the brevity section of
+ *   `DEFAULT_EVAL_CONFIG`.
+ * @returns A brevity `MetricResult` whose metadata carries `wordCount` and `tier`.
+ */
+export function evaluateBrevity(
+  prediction: { output_text?: string },
+  config = DEFAULT_EVAL_CONFIG.objectives.brevity,
+): MetricResult {
+  const chatter = (prediction.output_text ?? '').trim();
+
+  // Simple word count: the text is already trimmed, so splitting on runs of
+  // whitespace never yields empty strings; empty text counts as zero words.
+  const wordCount = chatter === '' ? 0 : chatter.split(/\s+/).length;
+
+  debugLogger.debug(
+    `[Eval:Brevity] Measuring output text word count: ${wordCount} words.`,
+  );
+
+  let score: number;
+  let reason: string;
+  let tier: 'succinct' | 'acceptable' | 'verbose' | 'heavy';
+
+  if (wordCount <= config.succinctThresholdWords) {
+    tier = 'succinct';
+    score = config.succinctScore;
+    reason = `Succinct: Response is within ${config.succinctThresholdWords} words.`;
+  } else if (wordCount <= config.acceptableThresholdWords) {
+    tier = 'acceptable';
+    score = config.acceptableScore;
+    reason = `Acceptable: Response is slightly verbose (${wordCount} words), exceeding ${config.succinctThresholdWords} words.`;
+  } else if (wordCount <= config.verboseThresholdWords) {
+    tier = 'verbose';
+    score = config.verboseScore;
+    reason = `Verbose: Response contains ${wordCount} words, exceeding acceptable limit of ${config.acceptableThresholdWords} words.`;
+  } else {
+    tier = 'heavy';
+    score = config.heavyScore;
+    reason = `Heavy: Response is excessively verbose (${wordCount} words).`;
+  }
+
+  return {
+    score,
+    objective: MetricObjective.BREVITY,
+    reason,
+    metadata: {
+      wordCount,
+      tier,
+    },
+  };
+}
@@ -0,0 +1,83 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { evaluateToolAlignment } from './toolAlignment.js';
+import { MetricObjective } from '../types.js';
+import type { Scenario } from '../schema.js';
+
+describe('evaluateToolAlignment', () => {
+  const mockScenario: Scenario = {
+    id: 'test-scenario',
+    metadata: { tags: ['test'], created_at: '2026-03-02' },
+    input: { user_query: 'test query' },
+    expected: {
+      tool_calls: [{ name: 'read_file', arguments: { file_path: 'test.ts' } }],
+      rationale: 'Testing alignment',
+    },
+    negatives: [
+      {
+        tool_calls: [
+          { name: 'run_shell_command', arguments: { command: 'cat test.ts' } },
+        ],
+        reason: 'Avoid shell',
+        severity: 'high',
+      },
+    ],
+  };
+
+  it('should return 1.0 for a perfect match', () => {
+    const prediction = {
+      tool_calls: [{ name: 'read_file', arguments: { file_path: 'test.ts' } }],
+    };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(1.0);
+    expect(result.objective).toBe(MetricObjective.ALIGNMENT);
+    expect(result.reason).toContain('Functional Success');
+  });
+
+  it('should return 0.0 for a hard failure (negative match)', () => {
+    const prediction = {
+      tool_calls: [
+        { name: 'run_shell_command', arguments: { command: 'cat test.ts' } },
+      ],
+    };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(0.0); // hardFailureScore: prediction reproduces the scenario's negative example
+    expect(result.reason).toContain('Hard Failure');
+    expect(result.metadata?.['matchedNegativeReason']).toBe('Avoid shell');
+  });
+
+  it('should return 0.1 for an incorrect tool selection', () => {
+    const prediction = {
+      tool_calls: [
+        {
+          name: 'write_file',
+          arguments: { file_path: 'test.ts', content: 'test' },
+        },
+      ],
+    };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(0.1); // a wrong tool name is scored with invalidResponseScore
+    expect(result.reason).toContain('wrong tool');
+  });
+
+  it('should return 0.4 for correct tool but wrong arguments', () => {
+    const prediction = {
+      tool_calls: [{ name: 'read_file', arguments: { file_path: 'wrong.ts' } }],
+    };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(0.4); // toolNameMatchOnlyScore
+    expect(result.reason).toContain('arguments are incorrect');
+  });
+
+  it('should return 0.1 for an empty tool call list', () => {
+    const prediction = { tool_calls: [] };
+    const result = evaluateToolAlignment(prediction, mockScenario);
+    expect(result.score).toBe(0.1); // invalidResponseScore: no tool calls at all
+    expect(result.reason).toContain('failed to produce any tool calls');
+  });
+});
@@ -0,0 +1,124 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js';
+import type { Scenario, ToolCall } from '../schema.js';
+import { DEFAULT_EVAL_CONFIG } from '../config.js';
+import { MetricObjective } from '../types.js';
+import type { MetricResult } from '../types.js';
+
+/**
+ * Scores a model's predicted tool calls against a golden scenario.
+ *
+ * Checks run in this order and the first one that applies decides the score:
+ *  1. Hard failure: the prediction contains every call of a (non-empty) negative.
+ *  2. Invalid response: the prediction has no tool calls.
+ *  3. Wrong tool: some expected tool name is absent from the prediction.
+ *  4. Partial success: names are present but some expected arguments differ.
+ *  5. Functional success: every expected call is matched by name and arguments.
+ *
+ * @param prediction Tool calls produced by the model.
+ * @param example Scenario holding the expected calls and the negatives.
+ * @param config Score table; defaults to the alignment section of `DEFAULT_EVAL_CONFIG`.
+ * @returns An alignment `MetricResult`; hard failures also carry the matched
+ *   negative's reason and severity in `metadata`.
+ */
+export function evaluateToolAlignment(
+  prediction: { tool_calls: ToolCall[] },
+  example: Scenario,
+  config = DEFAULT_EVAL_CONFIG.objectives.alignment,
+): MetricResult {
+  const predicted = prediction.tool_calls;
+  const scenarioId = example.id;
+  const objective = MetricObjective.ALIGNMENT;
+
+  // True when some predicted call has the wanted name.
+  const hasCallNamed = (wanted: ToolCall): boolean =>
+    predicted.some((candidate: ToolCall) => candidate.name === wanted.name);
+
+  // True when some predicted call has the wanted name AND matching arguments.
+  const hasExactCall = (wanted: ToolCall): boolean =>
+    predicted.some(
+      (candidate: ToolCall) =>
+        candidate.name === wanted.name &&
+        areArgsMatching(wanted.arguments, candidate.arguments),
+    );
+
+  debugLogger.debug(`[Eval:${scenarioId}] Evaluating tool alignment...`);
+
+  // Step 1: explicit negatives take precedence over everything else.
+  const trap = example.negatives.find(
+    (negative) =>
+      negative.tool_calls.length > 0 &&
+      negative.tool_calls.every((negCall: ToolCall) => hasExactCall(negCall)),
+  );
+  if (trap) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Hard Failure: Matched negative pattern.`,
+    );
+    return {
+      score: config.hardFailureScore,
+      objective,
+      reason: `Hard Failure: ${trap.reason}`,
+      metadata: {
+        matchedNegativeReason: trap.reason,
+        severity: trap.severity,
+      },
+    };
+  }
+
+  // Step 2: nothing to compare when the model produced no calls.
+  if (predicted.length === 0) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Invalid Response: No tool calls found.`,
+    );
+    return {
+      score: config.invalidResponseScore,
+      objective,
+      reason: 'Model failed to produce any tool calls.',
+    };
+  }
+
+  const wantedCalls = example.expected.tool_calls;
+
+  // Step 3: every expected tool name has to show up in the prediction.
+  if (!wantedCalls.every((exp: ToolCall) => hasCallNamed(exp))) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Failure: Incorrect tool selection.`,
+    );
+    return {
+      score: config.invalidResponseScore,
+      objective,
+      reason: 'Model selected the wrong tool(s).',
+    };
+  }
+
+  // Step 4: names are right; now the arguments must match as well.
+  if (!wantedCalls.every((exp: ToolCall) => hasExactCall(exp))) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Partial Success: Right tool, wrong arguments.`,
+    );
+    return {
+      score: config.toolNameMatchOnlyScore,
+      objective,
+      reason: 'Correct tool selected, but arguments are incorrect or missing.',
+    };
+  }
+
+  // Step 5: full match.
+  debugLogger.debug(
+    `[Eval:${scenarioId}] Perfect Functional Alignment achieved.`,
+  );
+  return {
+    score: config.functionalSuccessScore,
+    objective,
+    reason:
+      'Functional Success: Tool and arguments align perfectly with golden scenario.',
+  };
+}
+
+/**
+ * Deep equality check for tool arguments.
+ *
+ * Both values are serialized to JSON with object keys sorted at every level,
+ * so two argument objects that differ only in key order compare equal. Array
+ * order still matters, and (as with plain `JSON.stringify`) properties whose
+ * value is `undefined` are ignored.
+ *
+ * @param expected Arguments from the scenario.
+ * @param predicted Arguments produced by the model.
+ * @returns `true` when both serialize to the same canonical JSON.
+ */
+function areArgsMatching(
+  expected: Record<string, unknown>,
+  predicted: Record<string, unknown>,
+): boolean {
+  return canonicalJson(expected) === canonicalJson(predicted);
+}
+
+/**
+ * Serializes a value to JSON with the keys of every non-array object sorted.
+ */
+function canonicalJson(value: unknown): string {
+  return JSON.stringify(value, (_key: string, current: unknown) => {
+    if (
+      current === null ||
+      typeof current !== 'object' ||
+      Array.isArray(current)
+    ) {
+      return current;
+    }
+    const source = current as Record<string, unknown>;
+    // The replacer is applied recursively to the returned object's values,
+    // so nested objects are sorted as well.
+    return Object.fromEntries(
+      Object.keys(source)
+        .sort()
+        .map((key) => [key, source[key]]),
+    );
+  });
+}
@@ -0,0 +1,49 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * The core data interface for the Tool Alignment Dataset.
+ * Designed to be extensible for custom error reports and metrics.
+ */
+
+export interface ToolCall {
+  name: string; // Tool identifier; evaluateToolAlignment compares it with strict equality.
+  arguments: Record<string, unknown>; // Tool arguments; evaluateToolAlignment compares them via their JSON serialization.
+}
+
+export interface NegativeExample {
+  id?: string;
+  tool_calls: ToolCall[]; // A prediction containing ALL of these calls is scored as a hard failure; an empty list never matches
+  output_text?: string; // For "too chatty" or "hallucination" failures (not read by evaluateToolAlignment)
+  reason: string; // e.g., "Defaulted to shell 'cat'", "Included conversational filler"
+  severity: 'low' | 'medium' | 'high'; // Helps the optimizer prioritize fixes
+}
+
+export interface Scenario {
+  id: string; // Unique identifier (e.g., 'read_file-01'); used as the tag in evaluation debug logs
+  metadata: {
+    tags: string[]; // e.g., ['tool-alignment', 'shell-avoidance']
+    created_at: string;
+    platform?: 'darwin' | 'linux' | 'win32'; // To handle platform-specific shell variations
+    model_info?: {
+      // Placeholder for future tracking
+      name?: string;
+      version?: string;
+    };
+  };
+  input: {
+    user_query: string;
+    context?: {
+      current_file?: string;
+      directory_structure?: string[];
+    };
+  };
+  expected: {
+    tool_calls: ToolCall[]; // Every one of these must be matched (name and arguments) for the full score
+    rationale: string; // Why this is the 'Golden' choice
+  };
+  negatives: NegativeExample[]; // Array of multiple failure modes; they are checked before `expected`
+}
@@ -0,0 +1,40 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * The specific dimensions being measured by the evaluation pipeline.
+ * `ALIGNMENT` is reported by `evaluateToolAlignment` and `BREVITY` by
+ * `evaluateBrevity`.
+ */
+export enum MetricObjective {
+  ALIGNMENT = 'alignment',
+  BREVITY = 'brevity',
+}
+
+/**
+ * Standardized result for any metric calculation.
+ * Designed for consumption by the Genetic-Pareto (GEPA) multi-objective function.
+ */
+export interface MetricResult {
+  /**
+   * The numeric score calculated by the metric.
+   * All metrics must provide a value where HIGHER is BETTER.
+   */
+  score: number;
+
+  /**
+   * The specific objective this result corresponds to.
+   */
+  objective: MetricObjective;
+
+  /**
+   * A human-readable (and optimizer-reflective) reason for the score.
+   */
+  reason: string;
+
+  /**
+   * Additional data points. The brevity metric reports `wordCount` and `tier`;
+   * the alignment metric reports `matchedNegativeReason` and `severity` on a
+   * hard failure and omits `metadata` otherwise.
+   */
+  metadata?: Record<string, unknown>;
+}
@@ -0,0 +1,41 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { maskVariables, unmaskVariables } from './masking.js';
+
+describe('optimization masking utility', () => {
+  it('should mask unique template variables with indexed tokens', () => {
+    const input = 'Use ${TOOL_A} to read ${FILE_PATH}. ${TOOL_A} is efficient.';
+    const { maskedText, maskMap } = maskVariables(input);
+
+    expect(maskedText).toContain('[[GCLI_VAR_0]]');
+    expect(maskedText).toContain('[[GCLI_VAR_1]]');
+    // Ensure all occurrences of the same variable are replaced with the same token
+    const toolAToken = Object.keys(maskMap).find(
+      (key) => maskMap[key] === '${TOOL_A}',
+    )!;
+    const count = maskedText.split(toolAToken).length - 1; // number of times the token occurs
+    expect(count).toBe(2); // ${TOOL_A} appears twice in the input
+    expect(maskedText).not.toContain('${TOOL_A}');
+  });
+
+  it('should perfectly restore original text during unmasking', () => {
+    const original = 'Update ${OLD_STR} with ${NEW_STR} in ${FILE_PATH}.';
+    const { maskedText, maskMap } = maskVariables(original);
+    const restored = unmaskVariables(maskedText, maskMap); // round trip: mask, then unmask
+
+    expect(restored).toBe(original);
+  });
+
+  it('should handle text with no variables', () => {
+    const input = 'Static text with no placeholders.';
+    const { maskedText, maskMap } = maskVariables(input);
+
+    expect(maskedText).toBe(input);
+    expect(Object.keys(maskMap).length).toBe(0);
+  });
+});
@@ -0,0 +1,61 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Utility to protect TypeScript template variables from being "optimized" by the LLM.
+ * Replaces ${VAR} with unique stable tokens and allows for perfect restoration.
+ */
+
+/** Result of masking: the rewritten text plus the token -> placeholder table. */
+export interface MaskResult {
+  maskedText: string;
+  maskMap: Record<string, string>;
+}
+
+const MASK_PREFIX = '[[GCLI_VAR_';
+const MASK_SUFFIX = ']]';
+
+/**
+ * Swaps every `${name}` placeholder in `text` for an indexed token.
+ *
+ * A placeholder is `${` + one or more letters, digits, `_` or `.` + `}`.
+ * Tokens are numbered in order of first appearance and every occurrence of
+ * the same placeholder receives the same token.
+ *
+ * @param text Text that may contain `${...}` placeholders.
+ * @returns The masked text and a map from each token to its placeholder.
+ */
+export function maskVariables(text: string): MaskResult {
+  const placeholderPattern = /\${[a-zA-Z0-9_.]+}/g;
+  // A Set keeps first-seen order while dropping repeated placeholders.
+  const distinctPlaceholders = [
+    ...new Set(text.match(placeholderPattern) ?? []),
+  ];
+
+  const maskMap: Record<string, string> = {};
+  let maskedText = text;
+
+  for (const [position, placeholder] of distinctPlaceholders.entries()) {
+    const token = MASK_PREFIX + position + MASK_SUFFIX;
+    maskMap[token] = placeholder;
+    // split/join replaces every literal occurrence without regex escaping.
+    maskedText = maskedText.split(placeholder).join(token);
+  }
+
+  return { maskedText, maskMap };
+}
+
+/**
+ * Reverses `maskVariables`: every token found in `text` is replaced by the
+ * placeholder recorded for it in `maskMap`.
+ *
+ * @param text Text containing mask tokens.
+ * @param maskMap Map from token to the original `${...}` placeholder.
+ * @returns The text with all known tokens restored.
+ */
+export function unmaskVariables(
+  text: string,
+  maskMap: Record<string, string>,
+): string {
+  // Longest tokens are substituted first so a shorter token can never
+  // overwrite part of a longer one.
+  const tokensLongestFirst = Object.keys(maskMap).sort(
+    (left, right) => right.length - left.length,
+  );
+
+  return tokensLongestFirst.reduce(
+    (restored, token) => restored.split(token).join(maskMap[token]),
+    text,
+  );
+}