improve(core): require recurrence evidence before extracting skills (#25147)

2026-07-22 15:51:18 -07:00 · 2026-04-15 11:45:31 -07:00
parent 5333e5ab20
commit 485f3d92d8
4 changed files with 502 additions and 22 deletions
@@ -16,6 +16,7 @@ import fs from 'node:fs';
 import path from 'node:path';
 import os from 'node:os';
 import { randomUUID } from 'node:crypto';
+import { vi } from 'vitest';
 import {
  Config,
  type ConfigParameters,
@@ -52,6 +53,7 @@ export interface ComponentEvalCase extends BaseEvalCase {
 export class ComponentRig {
  public config: Config | undefined;
  public testDir: string;
+  public homeDir: string;
  public sessionId: string;

  constructor(
@@ -61,6 +63,9 @@ export class ComponentRig {
    this.testDir = fs.mkdtempSync(
      path.join(os.tmpdir(), `gemini-component-rig-${uniqueId.slice(0, 8)}-`),
    );
+    this.homeDir = fs.mkdtempSync(
+      path.join(os.tmpdir(), `gemini-component-home-${uniqueId.slice(0, 8)}-`),
+    );
    this.sessionId = `test-session-${uniqueId}`;
  }

@@ -89,12 +94,23 @@ export class ComponentRig {
    this.config = makeFakeConfig(configParams);
    await this.config.initialize();

-    // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient
+    // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient.
+    // This must happen BEFORE stubbing GEMINI_CLI_HOME because OAuth credential
+    // lookup resolves through homedir() → GEMINI_CLI_HOME.
    await this.config.refreshAuth(AuthType.USE_GEMINI);
+
+    // Isolate storage paths (session files, skills, extraction state) by
+    // pointing GEMINI_CLI_HOME at a per-test temp directory.  Storage resolves
+    // global paths through `homedir()` which reads this env var.  This is set
+    // after auth so credential lookup uses the real home directory.
+    vi.stubEnv('GEMINI_CLI_HOME', this.homeDir);
  }

  async cleanup() {
+    // Tear down inside try/finally: if dispose() rejects, the GEMINI_CLI_HOME
+    // stub must still be removed and both temp directories deleted, otherwise
+    // they leak into whatever runs next. The rejection still propagates.
+    try {
+      await this.config?.dispose();
+    } finally {
+      vi.unstubAllEnvs();
+      fs.rmSync(this.testDir, { recursive: true, force: true });
+      fs.rmSync(this.homeDir, { recursive: true, force: true });
+    }
  }
 }

@@ -0,0 +1,341 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import fsp from 'node:fs/promises';
+import path from 'node:path';
+import { describe, expect } from 'vitest';
+import {
+  type Config,
+  ApprovalMode,
+  SESSION_FILE_PREFIX,
+  getProjectHash,
+  startMemoryService,
+} from '@google/gemini-cli-core';
+import { componentEvalTest } from './component-test-helper.js';
+
+/** Description of one fake prior chat session that `seedSessions` writes to disk. */
+interface SeedSession {
+  /** Stored as the conversation's `sessionId`; its first 8 characters also end the filename. */
+  sessionId: string;
+  /** Stored verbatim as the conversation's `summary`. */
+  summary: string;
+  /** User prompts in conversation order; `buildMessages` pairs each with a canned reply. */
+  userTurns: string[];
+  /** Minutes before "now" used for the timestamp embedded in the session filename. */
+  timestampOffsetMinutes: number;
+}
+
+/** One entry of a seeded conversation's `messages` array, as produced by `buildMessages`. */
+interface MessageRecord {
+  /** `u<n>` for a user turn, `a<n>` for its paired reply (n is 1-based). */
+  id: string;
+  /** ISO-8601 timestamp string. */
+  timestamp: string;
+  /** `'user'` or `'gemini'` for the records built in this file. */
+  type: string;
+  /** Message parts; records built in this file carry exactly one text part. */
+  content: Array<{ text: string }>;
+}
+
+// Workspace fixture shared by every eval in this file (passed as `files`).
+// Maps a relative file name to its contents: a minimal package.json whose
+// scripts only echo their own name, and a short README.
+// NOTE(review): presumably componentEvalTest materializes these into the test
+// workspace before `setup` runs — confirm in component-test-helper.
+const WORKSPACE_FILES = {
+  'package.json': JSON.stringify(
+    {
+      name: 'skill-extraction-eval',
+      private: true,
+      scripts: {
+        build: 'echo build',
+        lint: 'echo lint',
+        test: 'echo test',
+      },
+    },
+    null,
+    2,
+  ),
+  'README.md': `# Skill Extraction Eval
+
+This workspace exists to exercise background skill extraction from prior chats.
+`,
+};
+
+/**
+ * Expands a list of user prompts into an alternating user/gemini transcript.
+ *
+ * Turn `k` (1-based) yields a user record with id `u<k>` carrying the prompt
+ * text, immediately followed by a gemini record with id `a<k>` whose text is
+ * `Acknowledged: <k>`. Every record shares a single timestamp taken six hours
+ * before the call.
+ *
+ * @param userTurns - Prompt texts, in conversation order.
+ * @returns Two records per prompt, user record first.
+ */
+function buildMessages(userTurns: string[]): MessageRecord[] {
+  const SIX_HOURS_MS = 6 * 60 * 60 * 1000;
+  const sharedTimestamp = new Date(Date.now() - SIX_HOURS_MS).toISOString();
+
+  const transcript: MessageRecord[] = [];
+  userTurns.forEach((prompt, position) => {
+    const turnNumber = position + 1;
+    const userRecord: MessageRecord = {
+      id: `u${turnNumber}`,
+      timestamp: sharedTimestamp,
+      type: 'user',
+      content: [{ text: prompt }],
+    };
+    const replyRecord: MessageRecord = {
+      id: `a${turnNumber}`,
+      timestamp: sharedTimestamp,
+      type: 'gemini',
+      content: [{ text: `Acknowledged: ${turnNumber}` }],
+    };
+    transcript.push(userRecord, replyRecord);
+  });
+  return transcript;
+}
+
+/**
+ * Writes one pretty-printed conversation JSON file per seed session into
+ * `<project temp dir>/chats`, creating that directory if needed.
+ *
+ * Each filename is `SESSION_FILE_PREFIX`, then a minute-resolution UTC stamp
+ * (now minus the seed's offset, colons replaced by dashes), then the first
+ * 8 characters of the session id. Inside the file, `startTime` is always
+ * seven hours ago and `lastUpdated` four hours ago, independent of the offset.
+ *
+ * @param config - Supplies the storage paths and project root.
+ * @param sessions - Seeds to write, processed sequentially in order.
+ */
+async function seedSessions(
+  config: Config,
+  sessions: SeedSession[],
+): Promise<void> {
+  const MINUTE_MS = 60 * 1000;
+  const HOUR_MS = 60 * 60 * 1000;
+
+  const chatsDir = path.join(config.storage.getProjectTempDir(), 'chats');
+  await fsp.mkdir(chatsDir, { recursive: true });
+
+  const projectRoot = config.storage.getProjectRoot();
+
+  for (const seed of sessions) {
+    // ISO string cut to `YYYY-MM-DDTHH:MM`, then made filename-safe.
+    const createdAt = new Date(
+      Date.now() - seed.timestampOffsetMinutes * MINUTE_MS,
+    );
+    const fileStamp = createdAt.toISOString().slice(0, 16).replace(/:/g, '-');
+    const shortId = seed.sessionId.slice(0, 8);
+    const filename = `${SESSION_FILE_PREFIX}${fileStamp}-${shortId}.json`;
+
+    const payload = JSON.stringify(
+      {
+        sessionId: seed.sessionId,
+        projectHash: getProjectHash(projectRoot),
+        summary: seed.summary,
+        startTime: new Date(Date.now() - 7 * HOUR_MS).toISOString(),
+        lastUpdated: new Date(Date.now() - 4 * HOUR_MS).toISOString(),
+        messages: buildMessages(seed.userTurns),
+      },
+      null,
+      2,
+    );
+
+    await fsp.writeFile(path.join(chatsDir, filename), payload);
+  }
+}
+
+/**
+ * Awaits `startMemoryService(config)`, then loads `.extraction-state.json`
+ * from the project memory temp directory and returns its runs in a
+ * normalized shape together with the project skills directory path.
+ *
+ * Normalization replaces any run's `sessionIds` or `skillsCreated` that is
+ * not an array with an empty array.
+ *
+ * @param config - Supplies the storage paths and is handed to the service.
+ * @returns The normalized run list and the skills directory path.
+ * @throws Error when the state file has no non-empty `runs` array. File-read
+ *   and JSON-parse failures propagate unchanged.
+ */
+async function runExtractionAndReadState(config: Config): Promise<{
+  state: { runs: Array<{ sessionIds: string[]; skillsCreated: string[] }> };
+  skillsDir: string;
+}> {
+  await startMemoryService(config);
+
+  const memoryDir = config.storage.getProjectMemoryTempDir();
+  const skillsDir = config.storage.getProjectSkillsMemoryDir();
+
+  const fileText = await fsp.readFile(
+    path.join(memoryDir, '.extraction-state.json'),
+    'utf-8',
+  );
+  const parsed = JSON.parse(fileText) as {
+    runs?: Array<{ sessionIds?: string[]; skillsCreated?: string[] }>;
+  };
+
+  const recordedRuns = parsed.runs;
+  if (!Array.isArray(recordedRuns) || recordedRuns.length === 0) {
+    throw new Error('Skill extraction finished without writing any run state');
+  }
+
+  // Anything that is not an array collapses to an empty list.
+  const orEmpty = (value: string[] | undefined): string[] =>
+    Array.isArray(value) ? value : [];
+
+  const runs = recordedRuns.map((run) => ({
+    sessionIds: orEmpty(run.sessionIds),
+    skillsCreated: orEmpty(run.skillsCreated),
+  }));
+
+  return { state: { runs }, skillsDir };
+}
+
+/**
+ * Runs `op` and resolves to its result, or to `undefined` when it fails with
+ * a Node.js `ENOENT` ("no such file or directory") error. Any other failure
+ * is rethrown unchanged.
+ */
+async function undefinedIfMissing<T>(
+  op: () => Promise<T>,
+): Promise<T | undefined> {
+  try {
+    return await op();
+  } catch (err: unknown) {
+    const code =
+      typeof err === 'object' && err !== null
+        ? (err as { code?: unknown }).code
+        : undefined;
+    if (code === 'ENOENT') {
+      return undefined;
+    }
+    throw err;
+  }
+}
+
+/**
+ * Reads the `SKILL.md` of every immediate subdirectory of `skillsDir`.
+ *
+ * Only absence is tolerated: a missing `skillsDir` yields `[]`, and a
+ * subdirectory without a `SKILL.md` is skipped. Every other failure
+ * (permissions, `SKILL.md` being a directory, ...) is rethrown, so an
+ * unreadable skills tree cannot be mistaken for "no skills were created" by
+ * an assertion that expects an empty result.
+ *
+ * @param skillsDir - Directory holding one subdirectory per skill.
+ * @returns The contents of each `SKILL.md` that exists, in directory-listing order.
+ * @throws Any filesystem error other than `ENOENT`.
+ */
+async function readSkillBodies(skillsDir: string): Promise<string[]> {
+  const entries = await undefinedIfMissing(() =>
+    fsp.readdir(skillsDir, { withFileTypes: true }),
+  );
+  if (entries === undefined) {
+    return [];
+  }
+
+  const bodies = await Promise.all(
+    entries
+      .filter((entry) => entry.isDirectory())
+      .map((entry) =>
+        undefinedIfMissing(() =>
+          fsp.readFile(path.join(skillsDir, entry.name, 'SKILL.md'), 'utf-8'),
+        ),
+      ),
+  );
+  return bodies.filter((body): body is string => body !== undefined);
+}
+
+/**
+ * Shared configOverrides for all skill extraction component evals; each test
+ * in this file passes this same object as its `configOverrides`.
+ * - experimentalMemoryManager: enables the memory extraction pipeline.
+ * - approvalMode: YOLO auto-approves tool calls (write_file, read_file) so the
+ *   background agent can execute without interactive confirmation.
+ */
+const EXTRACTION_CONFIG_OVERRIDES = {
+  experimentalMemoryManager: true,
+  approvalMode: ApprovalMode.YOLO,
+};
+
+describe('Skill Extraction', () => {
+  // Negative case: the two seeded sessions have similar-looking summaries
+  // (both about staging login failures), but their user turns describe two
+  // unrelated incidents and repeatedly say the fixes are one-off. The run
+  // must record both sessions and create no skill.
+  componentEvalTest('USUALLY_PASSES', {
+    suiteName: 'skill-extraction',
+    suiteType: 'component-level',
+    name: 'ignores one-off incidents even when session summaries look similar',
+    files: WORKSPACE_FILES,
+    timeout: 180000,
+    configOverrides: EXTRACTION_CONFIG_OVERRIDES,
+    setup: async (config) => {
+      await seedSessions(config, [
+        {
+          sessionId: 'incident-login-redirect',
+          summary: 'Debug login redirect loop in staging',
+          timestampOffsetMinutes: 420,
+          userTurns: [
+            'We only need a one-off fix for incident INC-4412 on branch hotfix/login-loop.',
+            'The exact failing string is ERR_REDIRECT_4412 and this workaround is incident-specific.',
+            'Patch packages/auth/src/redirect.ts just for this branch and do not generalize it.',
+            'The thing that worked was deleting the stale staging cookie before retrying.',
+            'This is not a normal workflow and should not become a reusable instruction.',
+            'It only reproduced against the 2026-04-08 staging rollout.',
+            'After the cookie clear, the branch-specific redirect logic passed.',
+            'Do not turn this incident writeup into a standing process.',
+            'Yes, the hotfix worked for this exact redirect-loop incident.',
+            'Close out INC-4412 once the staging login succeeds again.',
+          ],
+        },
+        {
+          sessionId: 'incident-login-timeout',
+          summary: 'Debug login callback timeout in staging',
+          timestampOffsetMinutes: 360,
+          userTurns: [
+            'This is another one-off staging incident, this time TICKET-991 for callback timeout.',
+            'The exact failing string is ERR_CALLBACK_TIMEOUT_991 and it is unrelated to the redirect loop.',
+            'The temporary fix was rotating the staging secret and deleting a bad feature-flag row.',
+            'Do not write a generic login-debugging playbook from this.',
+            'This only applied to the callback timeout during the April rollout.',
+            'The successful fix was specific to the stale secret in staging.',
+            'It does not define a durable repo workflow for future tasks.',
+            'After rotating the secret, the callback timeout stopped reproducing.',
+            'Treat this as incident response only, not a reusable skill.',
+            'Once staging passed again, we closed TICKET-991.',
+          ],
+        },
+      ]);
+    },
+    assert: async (config) => {
+      const { state, skillsDir } = await runExtractionAndReadState(config);
+      const skillBodies = await readSkillBodies(skillsDir);
+
+      // Exactly one run that saw both seeded sessions...
+      expect(state.runs).toHaveLength(1);
+      expect(state.runs[0].sessionIds).toHaveLength(2);
+      // ...and produced nothing, both in the run state and on disk.
+      expect(state.runs[0].skillsCreated).toEqual([]);
+      expect(skillBodies).toEqual([]);
+    },
+  });
+
+  // Positive case: both seeded sessions describe the same three-command
+  // settings-docs regeneration sequence and call it recurring. The run must
+  // create at least one skill whose text names all three commands and has
+  // "When to Use" and "Verification" content.
+  componentEvalTest('USUALLY_PASSES', {
+    suiteName: 'skill-extraction',
+    suiteType: 'component-level',
+    name: 'extracts a repeated project-specific workflow into a skill',
+    files: WORKSPACE_FILES,
+    timeout: 180000,
+    configOverrides: EXTRACTION_CONFIG_OVERRIDES,
+    setup: async (config) => {
+      await seedSessions(config, [
+        {
+          sessionId: 'settings-docs-regen-1',
+          summary: 'Update settings docs after adding a config option',
+          timestampOffsetMinutes: 420,
+          userTurns: [
+            'When we add a new config option, we have to regenerate the settings docs in a specific order.',
+            'The sequence that worked was npm run predocs:settings, npm run schema:settings, then npm run docs:settings.',
+            'Do not hand-edit generated settings docs.',
+            'If predocs is skipped, the generated schema docs miss the new defaults.',
+            'Update the source first, then run that generation sequence.',
+            'After regenerating, verify the schema output and docs changed together.',
+            'We used this same sequence the last time we touched settings docs.',
+            'That ordered workflow passed and produced the expected generated files.',
+            'Please keep the exact command order because reversing it breaks the output.',
+            'Yes, the generated settings docs were correct after those three commands.',
+          ],
+        },
+        {
+          sessionId: 'settings-docs-regen-2',
+          summary: 'Regenerate settings schema docs for another new setting',
+          timestampOffsetMinutes: 360,
+          userTurns: [
+            'We are touching another setting, so follow the same settings-doc regeneration workflow again.',
+            'Run npm run predocs:settings before npm run schema:settings and npm run docs:settings.',
+            'The project keeps generated settings docs in sync through those commands, not manual edits.',
+            'Skipping predocs caused stale defaults in the generated output before.',
+            'Change the source, then execute the same three commands in order.',
+            'Verify both the schema artifact and docs update together after regeneration.',
+            'This is the recurring workflow we use whenever a setting changes.',
+            'The exact order worked again on this second settings update.',
+            'Please preserve that ordering constraint for future settings changes.',
+            'Confirmed: the settings docs regenerated correctly with the same command sequence.',
+          ],
+        },
+      ]);
+    },
+    assert: async (config) => {
+      const { state, skillsDir } = await runExtractionAndReadState(config);
+      const skillBodies = await readSkillBodies(skillsDir);
+      // Checks below look at all skills together, not at one specific file.
+      const combinedSkills = skillBodies.join('\n\n');
+
+      // Exactly one run that saw both seeded sessions and created a skill.
+      expect(state.runs).toHaveLength(1);
+      expect(state.runs[0].sessionIds).toHaveLength(2);
+      expect(state.runs[0].skillsCreated.length).toBeGreaterThanOrEqual(1);
+      expect(skillBodies.length).toBeGreaterThanOrEqual(1);
+      // The skill text must carry the three commands verbatim.
+      expect(combinedSkills).toContain('npm run predocs:settings');
+      expect(combinedSkills).toContain('npm run schema:settings');
+      expect(combinedSkills).toContain('npm run docs:settings');
+      // Case-insensitive checks for the expected section wording.
+      expect(combinedSkills).toMatch(/When to Use/i);
+      expect(combinedSkills).toMatch(/Verification/i);
+    },
+  });
+
+  // Positive case: both seeded sessions walk through the same
+  // check -> migrate -> validate database workflow (with rollback on failed
+  // validation) for two schema versions. The run must create at least one
+  // skill naming the three commands, mentioning rollback, and containing
+  // "When to Use" wording.
+  componentEvalTest('USUALLY_PASSES', {
+    suiteName: 'skill-extraction',
+    suiteType: 'component-level',
+    name: 'extracts a repeated multi-step migration workflow with ordering constraints',
+    files: WORKSPACE_FILES,
+    timeout: 180000,
+    configOverrides: EXTRACTION_CONFIG_OVERRIDES,
+    setup: async (config) => {
+      await seedSessions(config, [
+        {
+          sessionId: 'db-migration-v12',
+          summary: 'Run database migration for v12 schema update',
+          timestampOffsetMinutes: 420,
+          userTurns: [
+            'Every time we change the database schema we follow a specific migration workflow.',
+            'First run npm run db:check to verify no pending migrations conflict.',
+            'Then run npm run db:migrate to apply the new migration files.',
+            'After migration, always run npm run db:validate to confirm schema integrity.',
+            'If db:validate fails, immediately run npm run db:rollback before anything else.',
+            'Never skip db:check — last time we did, two migrations collided and corrupted the index.',
+            'The ordering is critical: check, migrate, validate. Reversing migrate and validate caused silent data loss before.',
+            'This v12 migration passed after following that exact sequence.',
+            'We use this same three-step workflow every time the schema changes.',
+            'Confirmed: db:check, db:migrate, db:validate completed successfully for v12.',
+          ],
+        },
+        {
+          sessionId: 'db-migration-v13',
+          summary: 'Run database migration for v13 schema update',
+          timestampOffsetMinutes: 360,
+          userTurns: [
+            'New schema change for v13, following the same database migration workflow as before.',
+            'Start with npm run db:check to ensure no conflicting pending migrations.',
+            'Then npm run db:migrate to apply the v13 migration files.',
+            'Then npm run db:validate to confirm the schema is consistent.',
+            'If validation fails, run npm run db:rollback immediately — do not attempt manual fixes.',
+            'We learned the hard way that skipping db:check causes index corruption.',
+            'The check-migrate-validate order is mandatory for every schema change.',
+            'This is the same recurring workflow we used for v12 and earlier migrations.',
+            'The v13 migration passed with the same three-step sequence.',
+            'Confirmed: the standard db migration workflow succeeded again for v13.',
+          ],
+        },
+      ]);
+    },
+    assert: async (config) => {
+      const { state, skillsDir } = await runExtractionAndReadState(config);
+      const skillBodies = await readSkillBodies(skillsDir);
+      // Checks below look at all skills together, not at one specific file.
+      const combinedSkills = skillBodies.join('\n\n');
+
+      // Exactly one run that saw both seeded sessions and created a skill.
+      expect(state.runs).toHaveLength(1);
+      expect(state.runs[0].sessionIds).toHaveLength(2);
+      expect(state.runs[0].skillsCreated.length).toBeGreaterThanOrEqual(1);
+      expect(skillBodies.length).toBeGreaterThanOrEqual(1);
+      // The skill text must carry the three commands verbatim.
+      expect(combinedSkills).toContain('npm run db:check');
+      expect(combinedSkills).toContain('npm run db:migrate');
+      expect(combinedSkills).toContain('npm run db:validate');
+      // Case-insensitive checks for the failure path and section wording.
+      expect(combinedSkills).toMatch(/rollback/i);
+      expect(combinedSkills).toMatch(/When to Use/i);
+    },
+  });
+});