gemini-cli/evals/workflows/dedup_refresh.eval.ts

/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { evalTest } from '../test-helper.js';
import fs from 'node:fs/promises';
import path from 'node:path';
import yaml from 'js-yaml';
import { WORKFLOW_TARGET_MODELS } from './constants.js';

// Load the scheduled issue-dedup workflow and pull the prompt and settings out
// of its `gemini_refresh_embeddings` step, so the eval runs with the values
// defined in the workflow file rather than a hand-maintained copy.
// NOTE(review): the path is resolved against process.cwd(), so this presumably
// expects the eval to be launched from the repository root — confirm.
const workflowPath = path.join(
  process.cwd(),
  '.github/workflows/gemini-scheduled-issue-dedup.yml',
);
const workflowContent = await fs.readFile(workflowPath, 'utf8');

// The workflow has no local type definition, so it is traversed as untyped
// data. Every level is optional-chained because `yaml.load` yields `undefined`
// for an empty document and the job/step layout is not guaranteed.
const workflowData = yaml.load(workflowContent) as any;
const geminiStep = workflowData?.jobs?.['refresh-embeddings']?.steps?.find(
  (step: any) => step?.id === 'gemini_refresh_embeddings',
);

// Fail early, and separately from the prompt check below, so a renamed job or
// step is reported as such instead of as a missing prompt.
if (!geminiStep) {
  throw new Error(
    `Could not find step 'gemini_refresh_embeddings' in job 'refresh-embeddings' of ${workflowPath}.`,
  );
}

const REFRESH_PROMPT_TEMPLATE = geminiStep.with?.prompt;

// The prompt is later used with String.prototype.replace, so it must be a
// non-empty string; a truthy non-string value would otherwise only fail later
// with an unrelated TypeError.
if (typeof REFRESH_PROMPT_TEMPLATE !== 'string' || !REFRESH_PROMPT_TEMPLATE) {
  throw new Error(
    `Could not extract prompt from dedup refresh workflow (${workflowPath}).`,
  );
}

// `settings` is stored in the workflow as a JSON string; a missing or empty
// value falls back to an empty object. Parse failures are rethrown with the
// workflow location so the broken field is easy to find.
const ORIGINAL_SETTINGS = (() => {
  try {
    return JSON.parse(geminiStep.with?.settings || '{}');
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(
      `Could not parse settings JSON from dedup refresh workflow (${workflowPath}): ${reason}`,
    );
  }
})();

// Absolute path to the mock MCP server script, resolved against the current
// working directory.
const mockMcpPath = path.resolve('evals', 'mocks', 'dedup_mcp.ts');

/**
 * Builds the prompt passed to the CLI by replacing every GitHub Actions
 * `github.repository` expression in the workflow's prompt template with a
 * fixed repository slug.
 *
 * The expression delimiters are escaped explicitly and whitespace inside them
 * is optional, so `${{ github.repository }}` and `${{github.repository}}` are
 * both substituted. Without this, a reformatted workflow would leave the raw
 * placeholder in the prompt.
 *
 * @returns The prompt template with all repository expressions resolved.
 */
const createPrompt = (): string => {
  return REFRESH_PROMPT_TEMPLATE.replace(
    /\$\{\{\s*github\.repository\s*\}\}/g,
    'google-gemini/gemini-cli',
  );
};

// Settings for the CLI under test: the workflow's settings with `mcpServers`
// replaced by a single `issue_deduplication` entry that launches the mock
// server script through `npx tsx`.
const REFRESH_SETTINGS = (() => {
  const settings = {
    ...ORIGINAL_SETTINGS,
    mcpServers: {
      issue_deduplication: { command: 'npx', args: ['tsx', mockMcpPath] },
    },
  };

  // Drop a truthy `telemetry` entry inherited from the workflow settings.
  // NOTE(review): presumably so it cannot override the eval rig's own
  // telemetry configuration — confirm.
  if (settings.telemetry) {
    delete settings.telemetry;
  }

  return settings;
})();

// Eval: given the workflow's refresh prompt and the mock MCP server, the agent
// is expected to invoke the `refresh` tool exactly once, successfully.
describe('dedup_refresh_agent', () => {
  const cliArgs = ['--output-format', 'json', '--prompt', createPrompt()];

  evalTest('USUALLY_PASSES', {
    name: 'should call refresh tool',
    prompt: cliArgs,
    approvalMode: 'yolo',
    params: { settings: REFRESH_SETTINGS },
    targetModels: WORKFLOW_TARGET_MODELS,
    assert: async (rig: any, result) => {
      // The CLI was run with `--output-format json`, so `result` is parsed as
      // a JSON document carrying run statistics.
      const { stats } = JSON.parse(result);
      expect(stats).toBeDefined();

      // High-level check: per-tool stats show one successful `refresh` call.
      const refreshStats = stats.tools.byName.refresh;
      expect(refreshStats).toBeDefined();
      expect(refreshStats.count).toBe(1);
      expect(refreshStats.success).toBe(1);

      // Cross-check against the rig's tool logs, which remain available for
      // deeper argument inspection if it is ever needed.
      const loggedRefresh = rig
        .readToolLogs()
        .find((entry: any) => entry.toolRequest.name === 'refresh');
      expect(loggedRefresh).toBeDefined();
    },
  });
});