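Update context-manager eval tests for the node-based sidecar API (renamed processors/triggers, removed gcBackstop and three frozen cases); use 'summarizer-default' model key in NodeDistillationProcessor.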

2026-05-19 08:14:35 -07:00 · 2026-04-10 17:11:58 -07:00
parent 38830bfe81
commit 1fa422a7c6
2 changed files with 62 additions and 301 deletions
@@ -60,7 +60,7 @@ async function runContextManagerEval(
 }> {
  const harness = await SimulationHarness.create(
    sidecarConfig,
-    config.getBaseLlmClient(),
+    config.getBaseLlmClient() as any,
    path.join(process.cwd(), 'harness-tmp'),
  );

@@ -70,7 +70,6 @@ async function runContextManagerEval(
  }

  // Ensure background tasks (like StateSnapshotProcessor) have finished
-  await harness.waitForIdle();

  // 2. Pick a question based on seed
  const questionIndex = seed % scenario.questions.length;
@@ -82,12 +81,12 @@ async function runContextManagerEval(

  // We can't easily get the episode IDs from the projected Content[],
  // but we can look at the working buffer instead.
-  const workingBuffer = harness.contextManager.getWorkingBufferView();
+  const workingBuffer = harness.contextManager.getNodes();
  console.log('--- WORKING BUFFER EPISODES START ---');
-  workingBuffer.forEach((ep, i) => {
-    console.log(`[Ep ${i}] ID: ${ep.id}, Type: ${ep.trigger.type}`);
-    if (ep.trigger.type === 'USER_PROMPT') {
-      console.log(`  Text: ${ep.trigger.semanticParts[0]?.text?.slice(0, 50)}`);
+  workingBuffer.forEach((node: any, i: number) => {
+    console.log(`[Node ${i}] ID: ${node.id}, Type: ${node.type}`);
+    if (node.type === 'USER_PROMPT') {
+      console.log(`  Text: ${node.semanticParts?.[0]?.text?.slice(0, 50)}`);
    }
  });
  console.log('--- WORKING BUFFER EPISODES END ---');
@@ -95,16 +94,16 @@ async function runContextManagerEval(
  console.log('--- COMPRESSED HISTORY START ---');
  compressedHistory.forEach((msg, i) => {
    console.log(`[${i}] Role: ${msg.role}`);
-    msg.parts.forEach((part, j) => {
+    (msg.parts || []).forEach((part, j) => {
      if ('text' in part) {
        console.log(
          `  Part ${j} (text): ${part.text?.slice(0, 100)}${part.text && part.text.length > 100 ? '...' : ''}`,
        );
      } else if ('functionCall' in part) {
-        console.log(`  Part ${j} (functionCall): ${part.functionCall.name}`);
+        console.log(`  Part ${j} (functionCall): ${part.functionCall?.name}`);
      } else if ('functionResponse' in part) {
        console.log(
-          `  Part ${j} (functionResponse): ${part.functionResponse.name}`,
+          `  Part ${j} (functionResponse): ${part.functionResponse?.name}`,
        );
      } else {
        console.log(`  Part ${j} (other): ${Object.keys(part).join(', ')}`);
@@ -144,8 +143,8 @@ async function runContextManagerEval(
      .includes(question.expectedSubstring.toLowerCase());
  }

-  const finalTokens = harness.env.tokenCalculator.calculateEpisodeListTokens(
-    harness.contextManager.getWorkingBufferView(),
+  const finalTokens = harness.env.tokenCalculator.calculateConcreteListTokens(
+    harness.contextManager.getNodes(),
  );

  return { pass, response: responseText, question, tokens: finalTokens };
@@ -206,7 +205,7 @@ Respond with ONLY "PASS" or "FAIL".`,
 function calculateScenarioTokens(scenario: Scenario): number {
  let totalChars = 0;
  for (const message of scenario.history) {
-    for (const part of message.parts) {
+    for (const part of message.parts || []) {
      if ('text' in part && part.text) {
        totalChars += part.text.length;
      }
@@ -228,6 +227,7 @@ function generateRandomSidecarConfig(
  const max = Math.floor(retained * (1.1 + Math.random() * 2.0)); // 110% to 300% buffer

  const useSquashing = Math.random() > 0.5;
+  const useRollingSummary = Math.random() > 0.5;
  const useSnapshot = Math.random() > 0.5;

  const processors: any[] = [
@@ -241,23 +241,40 @@ function generateRandomSidecarConfig(
    },
    { processorId: 'BlobDegradationProcessor', options: {} },
    {
-      processorId: 'SemanticCompressionProcessor',
+      processorId: 'NodeDistillationProcessor',
      options: {
        nodeThresholdTokens: Math.floor(retained * (0.1 + Math.random() * 0.3)),
      },
    },
-    { processorId: 'EmergencyTruncationProcessor', options: {} },
+    {
+      processorId: 'NodeTruncationProcessor',
+      options: {
+        maxTokensPerNode: Math.floor(retained * (0.1 + Math.random() * 0.5)),
+      },
+    },
  ];

  const backgroundProcessors: any[] = [];
  if (useSquashing) {
    backgroundProcessors.push({
-      processorId: 'HistorySquashingProcessor',
+      processorId: 'HistoryTruncationProcessor',
      options: {
-        maxTokensPerNode: Math.floor(retained * (0.05 + Math.random() * 0.15)),
+        target: 'freeNTokens',
+        freeTokensTarget: Math.floor(retained * (0.05 + Math.random() * 0.15)),
      },
    });
  }
+
+  if (useRollingSummary) {
+    backgroundProcessors.push({
+      processorId: 'RollingSummaryProcessor',
+      options: {
+        target: 'freeNTokens',
+        freeTokensTarget: Math.floor(retained * (0.05 + Math.random() * 0.15)),
+      },
+    });
+  }
+
  if (useSnapshot) {
    backgroundProcessors.push({
      processorId: 'StateSnapshotProcessor',
@@ -265,27 +282,36 @@ function generateRandomSidecarConfig(
    });
  }

+  const workers: any[] = [];
+  if (useSnapshot && Math.random() > 0.5) {
+    workers.push({
+      workerId: 'StateSnapshotWorker',
+      options: {
+        type: Math.random() > 0.5 ? 'accumulate' : 'point-in-time',
+      },
+    });
+  }
+
  return {
    budget: { retainedTokens: retained, maxTokens: max },
-    gcBackstop: {
-      strategy: 'truncate', // Hardcoded since compress/rollingSummarizer are currently unimplemented
-      target: 'incremental',
-      freeTokensTarget: Math.floor(retained * 0.1),
-    },
    pipelines: [
      {
        name: 'Immediate Sanitization',
-        triggers: ['on_turn'],
-        execution: 'blocking',
+        triggers: ['new_message'],
        processors,
      },
      {
        name: 'Deep Background Compression',
-        triggers: [{ type: 'timer', intervalMs: 100 }, 'budget_exceeded'],
-        execution: 'background',
+        triggers: [
+          Math.random() > 0.5
+            ? { type: 'timer', intervalMs: 100 }
+            : 'gc_backstop',
+          'retained_exceeded',
+        ],
        processors: backgroundProcessors,
      },
    ],
+    workers: workers.length > 0 ? workers : undefined,
  };
 }

@@ -296,7 +322,7 @@ describe('ContextManager Evaluation Suite', () => {
   * The "Explorer" test.
   * Set RUN_EXPLORER=1 to run many iterations and find failures.
   */
-  if (process.env.RUN_EXPLORER) {
+  if (process.env['RUN_EXPLORER']) {
    componentEvalTest('ALWAYS_PASSES', {
      suiteName: 'context-manager',
      suiteType: 'component-level',
@@ -379,16 +405,10 @@ describe('ContextManager Evaluation Suite', () => {
          retainedTokens: 3737,
          maxTokens: 11280,
        },
-        gcBackstop: {
-          strategy: 'truncate',
-          target: 'incremental',
-          freeTokensTarget: 373,
-        },
        pipelines: [
          {
            name: 'Immediate Sanitization',
-            triggers: ['on_turn'],
-            execution: 'blocking',
+            triggers: ['new_message'],
            processors: [
              {
                processorId: 'ToolMaskingProcessor',
@@ -401,14 +421,14 @@ describe('ContextManager Evaluation Suite', () => {
                options: {},
              },
              {
-                processorId: 'SemanticCompressionProcessor',
+                processorId: 'NodeDistillationProcessor',
                options: {
                  nodeThresholdTokens: 733,
                },
              },
              {
-                processorId: 'EmergencyTruncationProcessor',
-                options: {},
+                processorId: 'NodeTruncationProcessor',
+                options: { maxTokensPerNode: 1000 },
              },
            ],
          },
@@ -419,14 +439,14 @@ describe('ContextManager Evaluation Suite', () => {
                type: 'timer',
                intervalMs: 100,
              },
-              'budget_exceeded',
+              'retained_exceeded',
            ],
-            execution: 'background',
            processors: [
              {
-                processorId: 'HistorySquashingProcessor',
+                processorId: 'HistoryTruncationProcessor',
                options: {
-                  maxTokensPerNode: 327,
+                  target: 'freeNTokens',
+                  freeTokensTarget: 327,
                },
              },
            ],
@@ -446,263 +466,4 @@ describe('ContextManager Evaluation Suite', () => {
      ).toBe(true);
    },
  });
-
-  componentEvalTest('ALWAYS_PASSES', {
-    suiteName: 'context-manager',
-    suiteType: 'component-level',
-    name: 'ContextManager Frozen Case - final-name (Budget: 5321)',
-    configOverrides: { model: EVAL_MODEL },
-    assert: async (config: Config) => {
-      const scenario = getScenario('scenario-c-compiler');
-      const sidecarConfig: SidecarConfig = {
-        budget: {
-          retainedTokens: 5321,
-          maxTokens: 11398,
-        },
-        gcBackstop: {
-          strategy: 'truncate',
-          target: 'incremental',
-          freeTokensTarget: 532,
-        },
-        pipelines: [
-          {
-            name: 'Immediate Sanitization',
-            triggers: ['on_turn'],
-            execution: 'blocking',
-            processors: [
-              {
-                processorId: 'ToolMaskingProcessor',
-                options: {
-                  stringLengthThresholdTokens: 2952,
-                },
-              },
-              {
-                processorId: 'BlobDegradationProcessor',
-                options: {},
-              },
-              {
-                processorId: 'SemanticCompressionProcessor',
-                options: {
-                  nodeThresholdTokens: 1632,
-                },
-              },
-              {
-                processorId: 'EmergencyTruncationProcessor',
-                options: {},
-              },
-            ],
-          },
-          {
-            name: 'Deep Background Compression',
-            triggers: [
-              {
-                type: 'timer',
-                intervalMs: 100,
-              },
-              'budget_exceeded',
-            ],
-            execution: 'background',
-            processors: [
-              {
-                processorId: 'HistorySquashingProcessor',
-                options: {
-                  maxTokensPerNode: 355,
-                },
-              },
-              {
-                processorId: 'StateSnapshotProcessor',
-                options: {},
-              },
-            ],
-          },
-        ],
-      };
-      const seed = 826619;
-      const result = await runContextManagerEval(
-        config,
-        sidecarConfig,
-        seed,
-        scenario,
-      );
-      expect(
-        result.pass,
-        `Recall failed for final-name. Response: ${result.response}`,
-      ).toBe(true);
-    },
-  });
-
-  componentEvalTest('ALWAYS_PASSES', {
-    suiteName: 'context-manager',
-    suiteType: 'component-level',
-    name: 'ContextManager Frozen Case - secret-beta (Budget: 1314)',
-    configOverrides: { model: EVAL_MODEL },
-    assert: async (config: Config) => {
-      const scenario = getScenario('scenario-c-compiler');
-      const sidecarConfig: SidecarConfig = {
-        budget: {
-          retainedTokens: 1314,
-          maxTokens: 2067,
-        },
-        gcBackstop: {
-          strategy: 'truncate',
-          target: 'incremental',
-          freeTokensTarget: 131,
-        },
-        pipelines: [
-          {
-            name: 'Immediate Sanitization',
-            triggers: ['on_turn'],
-            execution: 'blocking',
-            processors: [
-              {
-                processorId: 'ToolMaskingProcessor',
-                options: {
-                  stringLengthThresholdTokens: 738,
-                },
-              },
-              {
-                processorId: 'BlobDegradationProcessor',
-                options: {},
-              },
-              {
-                processorId: 'SemanticCompressionProcessor',
-                options: {
-                  nodeThresholdTokens: 195,
-                },
-              },
-              {
-                processorId: 'EmergencyTruncationProcessor',
-                options: {},
-              },
-            ],
-          },
-          {
-            name: 'Deep Background Compression',
-            triggers: [
-              {
-                type: 'timer',
-                intervalMs: 100,
-              },
-              'budget_exceeded',
-            ],
-            execution: 'background',
-            processors: [
-              {
-                processorId: 'HistorySquashingProcessor',
-                options: {
-                  maxTokensPerNode: 108,
-                },
-              },
-              {
-                processorId: 'StateSnapshotProcessor',
-                options: {},
-              },
-            ],
-          },
-        ],
-      };
-      const seed = 475216;
-      const result = await runContextManagerEval(
-        config,
-        sidecarConfig,
-        seed,
-        scenario,
-      );
-      expect(
-        result.pass,
-        `Recall failed for secret-beta. Response: ${result.response}`,
-      ).toBe(true);
-    },
-  });
-
-  componentEvalTest('ALWAYS_PASSES', {
-    suiteName: 'context-manager',
-    suiteType: 'component-level',
-    name: 'ContextManager Frozen Case - codegen-header (Budget: 2585)',
-    configOverrides: { model: EVAL_MODEL },
-    timeout: 240000, // 4 minutes to allow Gemini 2.5 Pro to compress even with rate limits/load shedding
-    assert: async (config: Config) => {
-      const scenario = getScenario('scenario-c-compiler');
-      const targetQuestion = scenario.questions.find(
-        (q) => q.id === 'codegen-header',
-      );
-      if (targetQuestion) {
-        targetQuestion.expectedSubstring = 'Generated by GigaC';
-      }
-      const sidecarConfig: SidecarConfig = {
-        budget: {
-          retainedTokens: 2585,
-          maxTokens: 4196,
-        },
-        gcBackstop: {
-          strategy: 'truncate',
-          target: 'incremental',
-          freeTokensTarget: 258,
-        },
-        pipelines: [
-          {
-            name: 'Immediate Sanitization',
-            triggers: ['on_turn'],
-            execution: 'blocking',
-            processors: [
-              {
-                processorId: 'ToolMaskingProcessor',
-                options: {
-                  stringLengthThresholdTokens: 720,
-                },
-              },
-              {
-                processorId: 'BlobDegradationProcessor',
-                options: {},
-              },
-              {
-                processorId: 'SemanticCompressionProcessor',
-                options: {
-                  nodeThresholdTokens: 336,
-                },
-              },
-              {
-                processorId: 'EmergencyTruncationProcessor',
-                options: {},
-              },
-            ],
-          },
-          {
-            name: 'Deep Background Compression',
-            triggers: [
-              {
-                type: 'timer',
-                intervalMs: 100,
-              },
-              'budget_exceeded',
-            ],
-            execution: 'background',
-            processors: [
-              {
-                processorId: 'HistorySquashingProcessor',
-                options: {
-                  maxTokensPerNode: 237,
-                },
-              },
-              {
-                processorId: 'StateSnapshotProcessor',
-                options: {},
-              },
-            ],
-          },
-        ],
-      };
-      const seed = 715277;
-      const result = await runContextManagerEval(
-        config,
-        sidecarConfig,
-        seed,
-        scenario,
-      );
-      expect(
-        result.pass,
-        `Recall failed for codegen-header. Response: ${result.response}`,
-      ).toBe(true);
-    },
-  });
 });
@@ -55,7 +55,7 @@ export class NodeDistillationProcessor implements ContextProcessor {
    try {
      const response = await this.env.llmClient.generateContent({
        role: LlmRole.UTILITY_COMPRESSOR,
-        modelConfigKey: { model: 'default' },
+        modelConfigKey: { model: 'summarizer-default' },
        promptId: this.env.promptId,
        abortSignal: new AbortController().signal,
        contents: [