feat(optimization): consolidate extraction pipeline and metrics

- Flatten directory structure by moving masking and evals to scripts root.
- Merge evaluation metrics into scripts/optimization/evals.
- Restore and verify extraction tests for the new structure.
2026-06-15 13:57:45 -07:00 · 2026-03-04 15:14:45 -08:00
parent 59d377e5e0
commit 9bdf5d5995
10 changed files with 1 additions and 1 deletions
@@ -0,0 +1,106 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Configuration for the Tool Alignment objective (The Accuracy Dimension).
+ *
+ * Each `*Score` field is the value `evaluateToolAlignment` returns for one
+ * outcome class. The numbers quoted in the field docs are the defaults from
+ * `DEFAULT_EVAL_CONFIG`, not values enforced by this type.
+ */
+export interface AlignmentConfig {
+  /**
+   * The relative importance of accuracy vs other objectives in the Pareto frontier.
+   * NOTE(review): not read by the alignment metric itself; presumably consumed
+   * by whatever combines the objectives. Confirm against the caller.
+   */
+  weight: number;
+
+  /**
+   * Strongest negative signal (0.0): used when model falls into a known shell trap,
+   * i.e. the prediction contains every tool call of one of the scenario's negatives.
+   */
+  hardFailureScore: number;
+
+  /**
+   * Neutral negative signal (0.1): used when model fails to produce a valid tool call.
+   * Returned both when the prediction has no tool calls at all and when an
+   * expected tool name is absent from the prediction (wrong tool selected).
+   */
+  invalidResponseScore: number;
+
+  /**
+   * Partial positive signal (0.4): model chose the right tool but hallucinated arguments.
+   */
+  toolNameMatchOnlyScore: number;
+
+  /**
+   * Maximum positive signal (1.0): model matched the golden signature perfectly.
+   */
+  functionalSuccessScore: number;
+}
+
+/**
+ * Configuration for the Brevity objective (The Density Dimension).
+ * Uses a word-count step-function to provide high-contrast signal for GEPA.
+ *
+ * The `*ThresholdWords` fields are inclusive upper bounds: `evaluateBrevity`
+ * compares with `<=` and tests the tiers from smallest threshold to largest,
+ * returning the paired `*Score` of the first tier that fits.
+ */
+export interface BrevityConfig {
+  /**
+   * Importance of brevity relative to accuracy.
+   */
+  weight: number;
+
+  /**
+   * TIER 1: Response is perfectly succinct (e.g., <= 10 words).
+   * `succinctScore` is returned when the word count is within the threshold.
+   */
+  succinctThresholdWords: number;
+  succinctScore: number; // default: 1.0
+
+  /**
+   * TIER 2: Response is acceptable but slightly verbose (e.g., <= 25 words).
+   * `acceptableScore` is returned when tier 1 did not match and this one does.
+   */
+  acceptableThresholdWords: number;
+  acceptableScore: number; // default: 0.7
+
+  /**
+   * TIER 3: Response is verbose (e.g., <= 50 words).
+   * `verboseScore` is returned when tiers 1-2 did not match and this one does.
+   */
+  verboseThresholdWords: number;
+  verboseScore: number; // default: 0.4
+
+  /**
+   * TIER 4: Response is very heavy (e.g., > 50 words).
+   * Has no threshold of its own: it is the fallback for anything above
+   * `verboseThresholdWords`.
+   */
+  heavyScore: number; // default: 0.1
+}
+
+/**
+ * Global evaluation configuration for multi-objective optimization.
+ * Groups one configuration object per measured objective.
+ */
+export interface EvalConfig {
+  objectives: {
+    /** Scores used by the tool-alignment (accuracy) metric. */
+    alignment: AlignmentConfig;
+    /** Thresholds and scores used by the brevity (word-count) metric. */
+    brevity: BrevityConfig;
+  };
+}
+
+/**
+ * Default weights and thresholds for the Genetic-Pareto (GEPA) engine.
+ * These constants drive the 'Selection Pressure' that evolves the prompt.
+ * GEPA always MAXIMIZES, so higher scores represent better performance.
+ *
+ * The metric functions use the matching `objectives.*` section as their
+ * default `config` argument, so changing a value here changes their
+ * default scoring.
+ */
+export const DEFAULT_EVAL_CONFIG: EvalConfig = {
+  objectives: {
+    alignment: {
+      weight: 1.0, // PRIMARY: Accuracy cannot be sacrificed.
+      hardFailureScore: 0.0, // Prediction matched a known negative pattern.
+      invalidResponseScore: 0.1, // No tool calls, or an expected tool is missing.
+      toolNameMatchOnlyScore: 0.4, // Right tool name(s), wrong arguments.
+      functionalSuccessScore: 1.0, // Names and arguments match the golden calls.
+    },
+    brevity: {
+      weight: 0.6, // SECONDARY: Reward brevity once accuracy is high.
+      // Word thresholds are inclusive upper bounds (compared with `<=`).
+      succinctThresholdWords: 10,
+      succinctScore: 1.0,
+      acceptableThresholdWords: 25,
+      acceptableScore: 0.7,
+      verboseThresholdWords: 50,
+      verboseScore: 0.4,
+      heavyScore: 0.1, // Never hard-zero brevity to allow gradient improvement.
+    },
+  },
+};
@@ -0,0 +1,54 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { evaluateBrevity } from './brevityMetric.js';
+
+// Exercises each tier of the default brevity configuration. Word counts are
+// asserted through `metadata.wordCount` rather than only stated in comments,
+// so a fixture edit that silently changes the count fails the test.
+describe('evaluateBrevity 4-tier step-function', () => {
+  it('should return 1.0 for a succinct response (<= 10 words)', () => {
+    const prediction = { output_text: 'I have updated the file for you now.' }; // 8 words
+    const result = evaluateBrevity(prediction);
+    expect(result.metadata?.wordCount).toBe(8);
+    expect(result.score).toBe(1.0);
+    expect(result.metadata?.tier).toBe('succinct');
+  });
+
+  it('should return 0.7 for an acceptable response (11-25 words)', () => {
+    const text =
+      'I have successfully updated the file. Everything looks good to proceed with the next step.';
+    // 15 words
+    const prediction = { output_text: text };
+    const result = evaluateBrevity(prediction);
+    expect(result.metadata?.wordCount).toBe(15);
+    expect(result.score).toBe(0.7);
+    expect(result.metadata?.tier).toBe('acceptable');
+  });
+
+  it('should return 0.4 for a verbose response (26-50 words)', () => {
+    const text =
+      'Certainly! I would be more than happy to assist you with that request. I am now proceeding to surgically update the file using the replace tool to ensure accuracy.';
+    // 29 words
+    const prediction = { output_text: text };
+    const result = evaluateBrevity(prediction);
+    expect(result.metadata?.wordCount).toBe(29);
+    expect(result.score).toBe(0.4);
+    expect(result.metadata?.tier).toBe('verbose');
+  });
+
+  it('should return 0.1 for a heavy response (> 50 words)', () => {
+    const text =
+      'Certainly! I would be more than happy to assist you with that request. I am now proceeding to surgically update the file using the replace tool to ensure accuracy. I will then verify the changes and let you know when I am finished with the task so we can move to the next stage of implementation.';
+    // 56 words
+    const prediction = { output_text: text };
+    const result = evaluateBrevity(prediction);
+    expect(result.metadata?.wordCount).toBe(56);
+    expect(result.score).toBe(0.1);
+    expect(result.metadata?.tier).toBe('heavy');
+  });
+
+  it('should handle missing output text as succinct (0 words)', () => {
+    const prediction = {};
+    const result = evaluateBrevity(prediction);
+    expect(result.metadata?.wordCount).toBe(0);
+    expect(result.score).toBe(1.0);
+    expect(result.metadata?.tier).toBe('succinct');
+  });
+});
@@ -0,0 +1,62 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js';
+import { DEFAULT_EVAL_CONFIG } from '../config.js';
+import { MetricObjective } from '../types.js';
+import type { MetricResult } from '../types.js';
+
+/**
+ * Evaluates the brevity of a model's response using a tiered 4-step word-count function.
+ * Focuses on rewarding succinctness and providing a non-zero gradient for verbose models.
+ *
+ * Words are the whitespace-separated tokens of the trimmed `output_text`. A
+ * missing or blank text counts as 0 words and therefore lands in the succinct
+ * tier. Thresholds are inclusive upper bounds, tested smallest first.
+ *
+ * @param prediction - Model output; only `output_text` is inspected.
+ * @param config - Thresholds and per-tier scores. Defaults to the brevity
+ *   section of `DEFAULT_EVAL_CONFIG`.
+ * @returns A `MetricResult` for the BREVITY objective whose `metadata` holds
+ *   the measured `wordCount` and the name of the selected `tier`.
+ */
+export function evaluateBrevity(
+  prediction: { output_text?: string },
+  config = DEFAULT_EVAL_CONFIG.objectives.brevity,
+): MetricResult {
+  const chatter = (prediction.output_text ?? '').trim();
+
+  // Simple word count: the text is already trimmed, so splitting on runs of
+  // whitespace cannot yield empty tokens; only the empty string is special-cased.
+  const wordCount = chatter === '' ? 0 : chatter.split(/\s+/).length;
+
+  debugLogger.debug(
+    `[Eval:Brevity] Measuring output text word count: ${wordCount} words.`,
+  );
+
+  let score: number;
+  let reason: string;
+  // The tier is recorded in the branch that was taken. Deriving it afterwards
+  // from the numeric score only works for the default scores and mislabels
+  // the tier as soon as a caller passes a config with different values.
+  let tier: 'succinct' | 'acceptable' | 'verbose' | 'heavy';
+
+  if (wordCount <= config.succinctThresholdWords) {
+    tier = 'succinct';
+    score = config.succinctScore;
+    reason = `Succinct: Response is within ${config.succinctThresholdWords} words.`;
+  } else if (wordCount <= config.acceptableThresholdWords) {
+    tier = 'acceptable';
+    score = config.acceptableScore;
+    reason = `Acceptable: Response is slightly verbose (${wordCount} words), exceeding ${config.succinctThresholdWords} words.`;
+  } else if (wordCount <= config.verboseThresholdWords) {
+    tier = 'verbose';
+    score = config.verboseScore;
+    reason = `Verbose: Response contains ${wordCount} words, exceeding acceptable limit of ${config.acceptableThresholdWords} words.`;
+  } else {
+    tier = 'heavy';
+    score = config.heavyScore;
+    reason = `Heavy: Response is excessively verbose (${wordCount} words).`;
+  }
+
+  return {
+    score,
+    objective: MetricObjective.BREVITY,
+    reason,
+    metadata: {
+      wordCount,
+      tier,
+    },
+  };
+}
@@ -0,0 +1,83 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { evaluateToolAlignment } from './toolAlignment.js';
+import { MetricObjective } from '../types.js';
+import type { Scenario } from '../schema.js';
+
+// Covers every outcome class of the alignment metric against one scenario:
+// the golden call is `read_file(test.ts)`, the trap is a shell `cat`.
+describe('evaluateToolAlignment', () => {
+  // Small builders so the golden call, the trap, and the predictions are
+  // written once and cannot drift apart.
+  const readFileCall = (filePath: string) => ({
+    name: 'read_file',
+    arguments: { file_path: filePath },
+  });
+  const shellCatCall = () => ({
+    name: 'run_shell_command',
+    arguments: { command: 'cat test.ts' },
+  });
+
+  const mockScenario: Scenario = {
+    id: 'test-scenario',
+    metadata: { tags: ['test'], created_at: '2026-03-02' },
+    input: { user_query: 'test query' },
+    expected: {
+      tool_calls: [readFileCall('test.ts')],
+      rationale: 'Testing alignment',
+    },
+    negatives: [
+      {
+        tool_calls: [shellCatCall()],
+        reason: 'Avoid shell',
+        severity: 'high',
+      },
+    ],
+  };
+
+  it('should return 1.0 for a perfect match', () => {
+    const outcome = evaluateToolAlignment(
+      { tool_calls: [readFileCall('test.ts')] },
+      mockScenario,
+    );
+    expect(outcome.score).toBe(1.0);
+    expect(outcome.objective).toBe(MetricObjective.ALIGNMENT);
+    expect(outcome.reason).toContain('Functional Success');
+  });
+
+  it('should return 0.0 for a hard failure (negative match)', () => {
+    const outcome = evaluateToolAlignment(
+      { tool_calls: [shellCatCall()] },
+      mockScenario,
+    );
+    expect(outcome.score).toBe(0.0);
+    expect(outcome.reason).toContain('Hard Failure');
+    expect(outcome.metadata?.['matchedNegativeReason']).toBe('Avoid shell');
+  });
+
+  it('should return 0.1 for an incorrect tool selection', () => {
+    const writeFileCall = {
+      name: 'write_file',
+      arguments: { file_path: 'test.ts', content: 'test' },
+    };
+    const outcome = evaluateToolAlignment(
+      { tool_calls: [writeFileCall] },
+      mockScenario,
+    );
+    expect(outcome.score).toBe(0.1);
+    expect(outcome.reason).toContain('wrong tool');
+  });
+
+  it('should return 0.4 for correct tool but wrong arguments', () => {
+    const outcome = evaluateToolAlignment(
+      { tool_calls: [readFileCall('wrong.ts')] },
+      mockScenario,
+    );
+    expect(outcome.score).toBe(0.4);
+    expect(outcome.reason).toContain('arguments are incorrect');
+  });
+
+  it('should return 0.1 for an empty tool call list', () => {
+    const outcome = evaluateToolAlignment({ tool_calls: [] }, mockScenario);
+    expect(outcome.score).toBe(0.1);
+    expect(outcome.reason).toContain('failed to produce any tool calls');
+  });
+});
@@ -0,0 +1,124 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { debugLogger } from '../../../../../packages/core/src/utils/debugLogger.js';
+import type { Scenario, ToolCall } from '../schema.js';
+import { DEFAULT_EVAL_CONFIG } from '../config.js';
+import { MetricObjective } from '../types.js';
+import type { MetricResult } from '../types.js';
+
+/**
+ * Scores a model's predicted tool calls against a golden scenario.
+ *
+ * The checks run in priority order and the first one that applies decides
+ * the result:
+ *   1. a non-empty negative example whose calls are all present in the
+ *      prediction (name and arguments)  -> `hardFailureScore`
+ *   2. the prediction contains no tool calls -> `invalidResponseScore`
+ *   3. an expected tool name is absent       -> `invalidResponseScore`
+ *   4. an expected call has no prediction with matching name and arguments
+ *                                            -> `toolNameMatchOnlyScore`
+ *   5. otherwise                             -> `functionalSuccessScore`
+ *
+ * @param prediction - The tool calls produced by the model.
+ * @param example - Scenario providing the expected calls and the negatives.
+ * @param config - Score per outcome class. Defaults to the alignment section
+ *   of `DEFAULT_EVAL_CONFIG`.
+ * @returns A `MetricResult` for the ALIGNMENT objective. `metadata` is only
+ *   set on a hard failure, where it carries the matched negative's reason
+ *   and severity.
+ */
+export function evaluateToolAlignment(
+  prediction: { tool_calls: ToolCall[] },
+  example: Scenario,
+  config = DEFAULT_EVAL_CONFIG.objectives.alignment,
+): MetricResult {
+  const predictedCalls = prediction.tool_calls;
+  const scenarioId = example.id;
+
+  // True when some predicted call uses the given tool name.
+  const predictionUsesTool = (toolName: string): boolean =>
+    predictedCalls.some((candidate) => candidate.name === toolName);
+
+  // True when some predicted call matches the wanted call in name AND arguments.
+  const predictionContains = (wanted: ToolCall): boolean =>
+    predictedCalls.some(
+      (candidate) =>
+        candidate.name === wanted.name &&
+        areArgsMatching(wanted.arguments, candidate.arguments),
+    );
+
+  debugLogger.debug(`[Eval:${scenarioId}] Evaluating tool alignment...`);
+
+  // 1. Hard failures: the first negative (in declaration order) that is fully
+  //    reproduced by the prediction wins. Negatives without calls never match.
+  const trippedNegative = example.negatives.find(
+    (negative) =>
+      negative.tool_calls.length > 0 &&
+      negative.tool_calls.every((negCall) => predictionContains(negCall)),
+  );
+  if (trippedNegative !== undefined) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Hard Failure: Matched negative pattern.`,
+    );
+    return {
+      score: config.hardFailureScore,
+      objective: MetricObjective.ALIGNMENT,
+      reason: `Hard Failure: ${trippedNegative.reason}`,
+      metadata: {
+        matchedNegativeReason: trippedNegative.reason,
+        severity: trippedNegative.severity,
+      },
+    };
+  }
+
+  // 2. Structural check: nothing to compare when no call was produced.
+  if (predictedCalls.length === 0) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Invalid Response: No tool calls found.`,
+    );
+    return {
+      score: config.invalidResponseScore,
+      objective: MetricObjective.ALIGNMENT,
+      reason: 'Model failed to produce any tool calls.',
+    };
+  }
+
+  // 3. Functional alignment: every expected tool name must appear...
+  const goldenCalls = example.expected.tool_calls;
+  const everyToolPresent = goldenCalls.every((golden) =>
+    predictionUsesTool(golden.name),
+  );
+  if (!everyToolPresent) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Failure: Incorrect tool selection.`,
+    );
+    return {
+      score: config.invalidResponseScore,
+      objective: MetricObjective.ALIGNMENT,
+      reason: 'Model selected the wrong tool(s).',
+    };
+  }
+
+  // ...and every expected call must also be matched on its arguments.
+  const everyCallExact = goldenCalls.every((golden) =>
+    predictionContains(golden),
+  );
+  if (!everyCallExact) {
+    debugLogger.debug(
+      `[Eval:${scenarioId}] Partial Success: Right tool, wrong arguments.`,
+    );
+    return {
+      score: config.toolNameMatchOnlyScore,
+      objective: MetricObjective.ALIGNMENT,
+      reason: 'Correct tool selected, but arguments are incorrect or missing.',
+    };
+  }
+
+  // 4. Perfect success: names and arguments line up with the golden calls.
+  debugLogger.debug(
+    `[Eval:${scenarioId}] Perfect Functional Alignment achieved.`,
+  );
+  return {
+    score: config.functionalSuccessScore,
+    objective: MetricObjective.ALIGNMENT,
+    reason:
+      'Functional Success: Tool and arguments align perfectly with golden scenario.',
+  };
+}
+
+/**
+ * Deep equality check for tool arguments.
+ *
+ * Both sides are serialized with `JSON.stringify`, but object keys are sorted
+ * first so that two argument objects with the same content compare equal
+ * regardless of the order in which their keys were written. Array order still
+ * matters, and values JSON cannot represent (e.g. `undefined` properties) are
+ * treated exactly as `JSON.stringify` treats them.
+ *
+ * @param expected - Arguments of the reference call.
+ * @param predicted - Arguments of the call produced by the model.
+ * @returns `true` when both serialize to the same canonical JSON text.
+ */
+function areArgsMatching(
+  expected: Record<string, unknown>,
+  predicted: Record<string, unknown>,
+): boolean {
+  // Replacer that rebuilds every plain object with its keys in sorted order.
+  // JSON.stringify invokes it for nested values too, so the ordering is
+  // applied at every depth.
+  const withSortedKeys = (_key: string, value: unknown): unknown => {
+    if (value === null || typeof value !== 'object' || Array.isArray(value)) {
+      return value;
+    }
+    const source = value as Record<string, unknown>;
+    // Null prototype so that a literal "__proto__" key is copied as data
+    // instead of replacing the prototype of the copy.
+    const sorted = Object.create(null) as Record<string, unknown>;
+    for (const key of Object.keys(source).sort()) {
+      sorted[key] = source[key];
+    }
+    return sorted;
+  };
+
+  return (
+    JSON.stringify(expected, withSortedKeys) ===
+    JSON.stringify(predicted, withSortedKeys)
+  );
+}
@@ -0,0 +1,49 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * The core data interface for the Tool Alignment Dataset.
+ * Designed to be extensible for custom error reports and metrics.
+ */
+
+/**
+ * A single tool invocation: the tool's name plus its argument object.
+ * Used both for golden/negative calls in a scenario and for model predictions.
+ */
+export interface ToolCall {
+  /** Name of the tool being invoked (e.g., 'read_file'). */
+  name: string;
+  /** Arguments passed to the tool, keyed by parameter name. */
+  arguments: Record<string, unknown>;
+}
+
+/**
+ * A known-bad response pattern for a scenario.
+ * The alignment metric treats a prediction that contains every call in
+ * `tool_calls` as a hard failure and reports `reason` and `severity`.
+ */
+export interface NegativeExample {
+  /** Optional identifier for this negative example. */
+  id?: string;
+  /** The tool calls that together make up the bad pattern. */
+  tool_calls: ToolCall[];
+  output_text?: string; // For "too chatty" or "hallucination" failures
+  reason: string; // e.g., "Defaulted to shell 'cat'", "Included conversational filler"
+  severity: 'low' | 'medium' | 'high'; // Helps the optimizer prioritize fixes
+}
+
+/**
+ * One dataset entry: a user query, the golden tool calls that answer it,
+ * and the failure modes the model should avoid.
+ */
+export interface Scenario {
+  id: string; // Unique identifier (e.g., 'read_file-01')
+  /** Descriptive information about the scenario itself. */
+  metadata: {
+    tags: string[]; // e.g., ['tool-alignment', 'shell-avoidance']
+    created_at: string; // Creation date as a string (e.g., '2026-03-02')
+    platform?: 'darwin' | 'linux' | 'win32'; // To handle platform-specific shell variations
+    model_info?: {
+      // Placeholder for future tracking
+      name?: string;
+      version?: string;
+    };
+  };
+  /** What the model is given. */
+  input: {
+    user_query: string;
+    context?: {
+      current_file?: string;
+      directory_structure?: string[];
+    };
+  };
+  /** The golden answer the prediction is compared against. */
+  expected: {
+    tool_calls: ToolCall[];
+    rationale: string; // Why this is the 'Golden' choice
+  };
+  negatives: NegativeExample[]; // Array of multiple failure modes
+}
@@ -0,0 +1,40 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * The specific dimensions being measured by the evaluation pipeline.
+ * Every `MetricResult` is tagged with one of these members.
+ */
+export enum MetricObjective {
+  /** Tool-call accuracy against the golden scenario. */
+  ALIGNMENT = 'alignment',
+  /** Word-count based succinctness of the response text. */
+  BREVITY = 'brevity',
+}
+
+/**
+ * Standardized result for any metric calculation.
+ * Designed for consumption by the Genetic-Pareto (GEPA) multi-objective function.
+ */
+export interface MetricResult {
+  /**
+   * The numeric score calculated by the metric.
+   * All metrics must provide a value where HIGHER is BETTER.
+   */
+  score: number;
+
+  /**
+   * The specific objective this result corresponds to.
+   */
+  objective: MetricObjective;
+
+  /**
+   * A human-readable (and optimizer-reflective) reason for the score.
+   */
+  reason: string;
+
+  /**
+   * Additional data points (e.g., word count and tier for brevity, the
+   * matched negative's reason and severity for an alignment hard failure).
+   * Optional: metrics may omit it when they have nothing extra to report.
+   */
+  metadata?: Record<string, unknown>;
+}