Files
gemini-cli/evals/workflows/dedup_refresh.eval.ts

84 lines
2.4 KiB
TypeScript

/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from '../test-helper.js';
import fs from 'node:fs/promises';
import path from 'node:path';
import yaml from 'js-yaml';
import { WORKFLOW_TARGET_MODELS } from './constants.js';
// Load the scheduled issue-dedup workflow and pull the prompt and settings
// out of the `with:` inputs of its `gemini_refresh_embeddings` step, so the
// eval below is driven by the values stored in the workflow file itself.
const workflowPath = path.join(
  process.cwd(),
  '.github/workflows/gemini-scheduled-issue-dedup.yml',
);
const workflowContent = await fs.readFile(workflowPath, 'utf8');
// js-yaml returns `undefined` for an empty document, so every lookup below is
// guarded. A malformed workflow then reaches the explicit error further down
// instead of failing with an unrelated TypeError.
const workflowData = yaml.load(workflowContent) as any;
const refreshSteps = workflowData?.jobs?.['refresh-embeddings']?.steps;
const geminiStep = Array.isArray(refreshSteps)
  ? refreshSteps.find(
      (step: any) => step?.id === 'gemini_refresh_embeddings',
    )
  : undefined;
const REFRESH_PROMPT_TEMPLATE = geminiStep?.with?.prompt;
// Validate the prompt before parsing the settings so that a missing job or
// step is reported with this message. The prompt must be a non-empty string
// because `createPrompt` calls `.replace` on it.
if (typeof REFRESH_PROMPT_TEMPLATE !== 'string' || !REFRESH_PROMPT_TEMPLATE) {
  throw new Error('Could not extract prompt from dedup refresh workflow.');
}
// The step's `settings` input is parsed as JSON; when it is absent or empty
// an empty object is used instead.
const ORIGINAL_SETTINGS = JSON.parse(geminiStep?.with?.settings || '{}');
// Absolute path (relative to the current working directory) of the mock MCP
// server script that is wired into the settings used by this eval.
const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts');
/**
 * Builds the prompt passed to the CLI by substituting every
 * `${{ github.repository }}` expression in the workflow's prompt template
 * with a fixed repository slug.
 *
 * @returns The prompt template with the repository expression filled in.
 */
const createPrompt = () => {
  // Matches the literal GitHub Actions expression as written in the workflow.
  const repositoryExpression = /\${{ github\.repository }}/g;
  const repositorySlug = 'google-gemini/gemini-cli';
  return REFRESH_PROMPT_TEMPLATE.replace(repositoryExpression, repositorySlug);
};
// Launch configuration for the mock MCP server script: run it with `tsx`
// through `npx`.
const mockDedupServer = {
  command: 'npx',
  args: ['tsx', mockMcpPath],
};
// Settings for the eval run: everything from the workflow's settings, with
// `mcpServers` replaced so that `issue_deduplication` points at the mock.
const REFRESH_SETTINGS = {
  ...ORIGINAL_SETTINGS,
  mcpServers: { issue_deduplication: mockDedupServer },
};
// Drop any (truthy) telemetry configuration inherited from the workflow.
if (REFRESH_SETTINGS.telemetry) {
  delete REFRESH_SETTINGS.telemetry;
}
// Eval: run the refresh prompt with JSON output and check that the `refresh`
// tool was invoked exactly once and succeeded.
describe('dedup_refresh_agent', () => {
  evalTest('USUALLY_PASSES', {
    name: 'should call refresh tool',
    prompt: ['--output-format', 'json', '--prompt', createPrompt()],
    approvalMode: 'yolo',
    params: { settings: REFRESH_SETTINGS },
    targetModels: WORKFLOW_TARGET_MODELS,
    assert: async (rig: any, result) => {
      // `result` is the CLI's JSON output; its `stats` section carries the
      // per-tool counters.
      const { stats } = JSON.parse(result);
      expect(stats).toBeDefined();

      const refreshStats = stats.tools.byName.refresh;
      expect(refreshStats).toBeDefined();
      expect(refreshStats.count).toBe(1);
      expect(refreshStats.success).toBe(1);

      // The stats above verify the high-level goal; the tool logs are also
      // checked so the recorded request is available for deeper inspection.
      const refreshCall = rig
        .readToolLogs()
        .find((entry: any) => entry.toolRequest.name === 'refresh');
      expect(refreshCall).toBeDefined();
    },
  });
});