gemini-cli/evals/auto_distillation.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import path from 'node:path';
import fs from 'node:fs';
import { appEvalTest } from './app-test-helper.js';
import {
  userText,
  mockGenerateContentStreamText,
} from '@google/gemini-cli-core';

describe('Auto-Distillation Behavioral Evals', () => {
  appEvalTest('USUALLY_PASSES', {
    name: 'Agent successfully navigates truncated output using the structural map to extract a secret',
    timeout: 180000,
    configOverrides: { autoDistillation: true },
    setup: async (rig) => {
      const testDir = rig.getTestDir();

      const mockData: any = {
        system_info: {
          version: '1.0.0',
          uptime: 999999,
          environment: 'production',
        },
        active_sessions: [],
        quarantined_payloads: [
          { id: 'Subject-01', status: 'cleared' },
          {
            id: 'Subject-89',
            secret_token: 'the_cake_is_a_lie',
            status: 'held_for_review',
          },
          { id: 'Subject-99', status: 'cleared' },
        ],
        archived_metrics: [],
      };

      for (let i = 0; i < 300; i++) {
        mockData.active_sessions.push({
          session_id: `sess_${i.toString().padStart(4, '0')}`,
          duration_ms: Math.floor(Math.random() * 10000),
          bytes_transferred: Math.floor(Math.random() * 50000),
        });
      }

      for (let i = 0; i < 2000; i++) {
        mockData.archived_metrics.push({
          timestamp: Date.now() - i * 1000,
          cpu_load: parseFloat(Math.random().toFixed(4)),
          mem_usage: parseFloat(Math.random().toFixed(4)),
        });
      }

      const massiveString = JSON.stringify(mockData, null, 2);

      fs.writeFileSync(
        path.join(testDir, 'server_state_dump.json'),
        massiveString,
      );
    },
    script: [
      userText(
        'We have a critical error in production. Are you ready to help?',
      ),
      mockGenerateContentStreamText(
        'I am ready. Please provide the details of the error.',
      ),
    ],
    prompt: `My application crashed with: "FATAL: Subject-89 held for review in quarantine". \n\nPlease run \`cat server_state_dump.json\` to investigate. The file is massive, so your tool output will be automatically truncated and you will receive a structural map instead. Use that structural map to determine the right command to extract the \`secret_token\` for Subject-89. Please state the exact secret token when you find it.`,
    assert: async (rig) => {
      await rig.drainBreakpointsUntilIdle(undefined, 60000);

      const finalOutput = rig.getStaticOutput();

      // Ensure the agent correctly extracted the secret token after navigating the distilled output
      expect(finalOutput).toContain('the_cake_is_a_lie');
    },
  });
});