refactor(evals): isolate workflow evals and target specific models

2026-04-26 04:54:25 -07:00 · 2026-02-03 21:02:55 -05:00
parent a3b7f5d5e1
commit ff4e816a70
6 changed files with 1247 additions and 11 deletions
@@ -0,0 +1,83 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from '../test-helper.js';
+import fs from 'node:fs/promises';
+import path from 'node:path';
+import yaml from 'js-yaml';
+import { WORKFLOW_TARGET_MODELS } from './constants.js';
+
+// Pull the prompt and the CLI settings out of the dedup workflow definition so
+// the eval uses the values stored in that file. Both paths below are resolved
+// against the current working directory.
+const workflowPath = path.join(
+  process.cwd(),
+  '.github/workflows/gemini-scheduled-issue-dedup.yml',
+);
+const workflowData = yaml.load(await fs.readFile(workflowPath, 'utf8')) as any;
+
+// Find the step with id 'gemini_refresh_embeddings' inside the
+// 'refresh-embeddings' job; every lookup tolerates a missing level.
+const refreshJob = workflowData.jobs?.['refresh-embeddings'];
+const geminiStep = refreshJob?.steps?.find(
+  (candidate: any) => candidate.id === 'gemini_refresh_embeddings',
+);
+const stepInputs = geminiStep?.with;
+
+const REFRESH_PROMPT_TEMPLATE = stepInputs?.prompt;
+// `settings` is parsed as JSON text; a missing or empty value parses as `{}`.
+const ORIGINAL_SETTINGS = JSON.parse(stepInputs?.settings || '{}');
+
+// Without a prompt there is nothing to evaluate, so fail at module load.
+if (!REFRESH_PROMPT_TEMPLATE) {
+  throw new Error('Could not extract prompt from dedup refresh workflow.');
+}
+
+// Script launched (via `npx tsx`) as the issue_deduplication MCP server.
+const mockMcpPath = path.join(process.cwd(), 'evals/mocks/dedup_mcp.ts');
+
+/**
+ * Builds the prompt passed to the CLI from the workflow's prompt template.
+ *
+ * Every `${{ github.repository }}` expression in the template is replaced with
+ * the literal repository slug `google-gemini/gemini-cli`.
+ *
+ * @returns The template text with the repository expression filled in.
+ */
+const createPrompt = () => {
+  // The braces are escaped explicitly instead of relying on legacy regex
+  // leniency for a bare `{`, and whitespace inside the expression is optional
+  // so `${{github.repository}}` is substituted as well.
+  return REFRESH_PROMPT_TEMPLATE.replace(
+    /\$\{\{\s*github\.repository\s*\}\}/g,
+    'google-gemini/gemini-cli',
+  );
+};
+
+// Settings handed to the CLI for this eval: everything from the workflow's
+// settings, except that `mcpServers` is replaced wholesale by a single
+// `issue_deduplication` entry that runs the mock script through `npx tsx`.
+const mockDedupServer = {
+  command: 'npx',
+  args: ['tsx', mockMcpPath],
+};
+const REFRESH_SETTINGS = {
+  ...ORIGINAL_SETTINGS,
+  mcpServers: { issue_deduplication: mockDedupServer },
+};
+
+// Drop a (truthy) `telemetry` entry carried over from the workflow settings.
+if (REFRESH_SETTINGS.telemetry) {
+  delete REFRESH_SETTINGS.telemetry;
+}
+
+// Eval: running the refresh prompt should produce exactly one successful call
+// to the `refresh` tool.
+describe('dedup_refresh_agent', () => {
+  const cliArgs = ['--output-format', 'json', '--prompt', createPrompt()];
+
+  evalTest('USUALLY_PASSES', {
+    name: 'should call refresh tool',
+    prompt: cliArgs,
+    approvalMode: 'yolo',
+    params: { settings: REFRESH_SETTINGS },
+    targetModels: WORKFLOW_TARGET_MODELS,
+    assert: async (rig: any, result) => {
+      // `result` is parsed as JSON (the run requests `--output-format json`).
+      const { stats } = JSON.parse(result);
+      expect(stats).toBeDefined();
+
+      // High-level check: per-tool stats report one successful `refresh` call.
+      const refreshStats = stats.tools.byName.refresh;
+      expect(refreshStats).toBeDefined();
+      expect(refreshStats.count).toBe(1);
+      expect(refreshStats.success).toBe(1);
+
+      // The tool logs must also contain a request named `refresh`; they remain
+      // available here if argument-level inspection is ever needed.
+      const loggedRefresh = rig
+        .readToolLogs()
+        .find((entry: any) => entry.toolRequest.name === 'refresh');
+      expect(loggedRefresh).toBeDefined();
+    },
+  });
+});