mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-16 16:21:27 -07:00
improve(core): require recurrence evidence before extracting skills (#25147)
This commit is contained in:
@@ -16,6 +16,7 @@ import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import os from 'node:os';
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import { vi } from 'vitest';
|
||||
import {
|
||||
Config,
|
||||
type ConfigParameters,
|
||||
@@ -52,6 +53,7 @@ export interface ComponentEvalCase extends BaseEvalCase {
|
||||
export class ComponentRig {
|
||||
public config: Config | undefined;
|
||||
public testDir: string;
|
||||
public homeDir: string;
|
||||
public sessionId: string;
|
||||
|
||||
constructor(
|
||||
@@ -61,6 +63,9 @@ export class ComponentRig {
|
||||
this.testDir = fs.mkdtempSync(
|
||||
path.join(os.tmpdir(), `gemini-component-rig-${uniqueId.slice(0, 8)}-`),
|
||||
);
|
||||
this.homeDir = fs.mkdtempSync(
|
||||
path.join(os.tmpdir(), `gemini-component-home-${uniqueId.slice(0, 8)}-`),
|
||||
);
|
||||
this.sessionId = `test-session-${uniqueId}`;
|
||||
}
|
||||
|
||||
@@ -89,12 +94,23 @@ export class ComponentRig {
|
||||
this.config = makeFakeConfig(configParams);
|
||||
await this.config.initialize();
|
||||
|
||||
// Refresh auth using USE_GEMINI to initialize the real BaseLlmClient
|
||||
// Refresh auth using USE_GEMINI to initialize the real BaseLlmClient.
|
||||
// This must happen BEFORE stubbing GEMINI_CLI_HOME because OAuth credential
|
||||
// lookup resolves through homedir() → GEMINI_CLI_HOME.
|
||||
await this.config.refreshAuth(AuthType.USE_GEMINI);
|
||||
|
||||
// Isolate storage paths (session files, skills, extraction state) by
|
||||
// pointing GEMINI_CLI_HOME at a per-test temp directory. Storage resolves
|
||||
// global paths through `homedir()` which reads this env var. This is set
|
||||
// after auth so credential lookup uses the real home directory.
|
||||
vi.stubEnv('GEMINI_CLI_HOME', this.homeDir);
|
||||
}
|
||||
|
||||
async cleanup() {
|
||||
await this.config?.dispose();
|
||||
vi.unstubAllEnvs();
|
||||
fs.rmSync(this.testDir, { recursive: true, force: true });
|
||||
fs.rmSync(this.homeDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
341
evals/skill_extraction.eval.ts
Normal file
341
evals/skill_extraction.eval.ts
Normal file
@@ -0,0 +1,341 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import fsp from 'node:fs/promises';
|
||||
import path from 'node:path';
|
||||
import { describe, expect } from 'vitest';
|
||||
import {
|
||||
type Config,
|
||||
ApprovalMode,
|
||||
SESSION_FILE_PREFIX,
|
||||
getProjectHash,
|
||||
startMemoryService,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { componentEvalTest } from './component-test-helper.js';
|
||||
|
||||
/**
 * Description of one synthetic prior chat session that `seedSessions` writes
 * to disk before the extraction agent runs.
 */
interface SeedSession {
  /** Stored as the conversation's `sessionId`; its first 8 characters also appear in the file name. */
  sessionId: string;
  /** Stored verbatim as the conversation's `summary`. */
  summary: string;
  /** User messages, in order; `buildMessages` pairs each with a canned reply. */
  userTurns: string[];
  /** How many minutes before "now" the timestamp embedded in the file name should be. */
  timestampOffsetMinutes: number;
}
|
||||
|
||||
/**
 * One message entry inside a seeded conversation file, in the shape that
 * `buildMessages` produces.
 */
interface MessageRecord {
  /** Message identifier (`buildMessages` uses `u<n>` for user turns and `a<n>` for replies). */
  id: string;
  /** ISO-8601 timestamp string. */
  timestamp: string;
  /** Author kind (`buildMessages` emits `'user'` and `'gemini'`). */
  type: string;
  /** Message parts; each part carries its text. */
  content: { text: string }[];
}
|
||||
|
||||
/**
 * Files that make up the throwaway workspace used by every eval case in this
 * file: a minimal `package.json` whose scripts only echo, and a short README.
 */
const WORKSPACE_FILES = {
  'package.json': JSON.stringify(
    {
      name: 'skill-extraction-eval',
      private: true,
      scripts: { build: 'echo build', lint: 'echo lint', test: 'echo test' },
    },
    null,
    2,
  ),
  // Joined line-by-line; the trailing empty entry yields the final newline.
  'README.md': [
    '# Skill Extraction Eval',
    '',
    'This workspace exists to exercise background skill extraction from prior chats.',
    '',
  ].join('\n'),
};
|
||||
|
||||
/**
 * Expands a list of user utterances into an alternating user/assistant
 * transcript.
 *
 * Each user turn `n` (1-based) becomes a `u<n>` message of type `'user'`
 * followed by an `a<n>` message of type `'gemini'` whose text is
 * `Acknowledged: <n>`. Every message carries the same timestamp, fixed at six
 * hours before the call.
 *
 * @param userTurns - User message texts, in conversation order.
 * @returns Two records per user turn, in order; empty when `userTurns` is empty.
 */
function buildMessages(userTurns: string[]): MessageRecord[] {
  const sixHoursMs = 6 * 60 * 60 * 1000;
  const sharedTimestamp = new Date(Date.now() - sixHoursMs).toISOString();

  const transcript: MessageRecord[] = [];
  userTurns.forEach((turnText, turnIndex) => {
    const ordinal = turnIndex + 1;
    transcript.push(
      {
        id: `u${ordinal}`,
        timestamp: sharedTimestamp,
        type: 'user',
        content: [{ text: turnText }],
      },
      {
        id: `a${ordinal}`,
        timestamp: sharedTimestamp,
        type: 'gemini',
        content: [{ text: `Acknowledged: ${ordinal}` }],
      },
    );
  });
  return transcript;
}
|
||||
|
||||
/**
 * Writes one conversation JSON file per seed into `<project temp dir>/chats`,
 * creating the directory if needed.
 *
 * File names have the form
 * `<SESSION_FILE_PREFIX><YYYY-MM-DDTHH-mm>-<first 8 chars of sessionId>.json`,
 * where the timestamp is "now" minus the seed's `timestampOffsetMinutes`.
 * Each file records the session id, the project hash, the summary, a start
 * time seven hours ago, a last-updated time four hours ago, and the messages
 * produced by `buildMessages`.
 *
 * @param config - Supplies the project temp dir and project root via `config.storage`.
 * @param sessions - Seeds to write, in order.
 * @throws Error if two seeds resolve to the same file name (same minute and
 *   same 8-character id prefix), which would otherwise silently overwrite the
 *   earlier seed and leave fewer sessions on disk than requested.
 */
async function seedSessions(
  config: Config,
  sessions: SeedSession[],
): Promise<void> {
  const chatsDir = path.join(config.storage.getProjectTempDir(), 'chats');
  await fsp.mkdir(chatsDir, { recursive: true });

  // Loop-invariant values: hash the project root once and use a single "now"
  // so every seed in this batch is offset from the same instant.
  const projectHash = getProjectHash(config.storage.getProjectRoot());
  const now = Date.now();
  const hourMs = 60 * 60 * 1000;
  const writtenFilenames = new Set<string>();

  for (const session of sessions) {
    // ISO string truncated to minutes, with ':' replaced so it is file-name safe.
    const timestamp = new Date(now - session.timestampOffsetMinutes * 60 * 1000)
      .toISOString()
      .slice(0, 16)
      .replace(/:/g, '-');
    const filename = `${SESSION_FILE_PREFIX}${timestamp}-${session.sessionId.slice(0, 8)}.json`;

    if (writtenFilenames.has(filename)) {
      throw new Error(
        `Seed session "${session.sessionId}" collides with an earlier seed on file name ${filename}; ` +
          'use a different timestampOffsetMinutes or sessionId prefix',
      );
    }
    writtenFilenames.add(filename);

    const conversation = {
      sessionId: session.sessionId,
      projectHash,
      summary: session.summary,
      startTime: new Date(now - 7 * hourMs).toISOString(),
      lastUpdated: new Date(now - 4 * hourMs).toISOString(),
      messages: buildMessages(session.userTurns),
    };

    await fsp.writeFile(
      path.join(chatsDir, filename),
      JSON.stringify(conversation, null, 2),
    );
  }
}
|
||||
|
||||
/**
 * Awaits `startMemoryService(config)` and then reads the
 * `.extraction-state.json` file from the project memory temp directory.
 *
 * The parsed state is normalised so callers can rely on array fields: a run
 * entry that is not an object, or whose `sessionIds` / `skillsCreated` is not
 * an array, yields an empty array for that field.
 *
 * @param config - Supplies the memory and skills directories via `config.storage`.
 * @returns The normalised run list together with the project skills directory.
 * @throws Error if the state file cannot be read, or if it contains no
 *   non-empty `runs` array (including when the JSON is not an object).
 * @throws SyntaxError if the state file is not valid JSON.
 */
async function runExtractionAndReadState(config: Config): Promise<{
  state: { runs: Array<{ sessionIds: string[]; skillsCreated: string[] }> };
  skillsDir: string;
}> {
  await startMemoryService(config);

  const memoryDir = config.storage.getProjectMemoryTempDir();
  const skillsDir = config.storage.getProjectSkillsMemoryDir();
  const statePath = path.join(memoryDir, '.extraction-state.json');

  let raw: string;
  try {
    raw = await fsp.readFile(statePath, 'utf-8');
  } catch (error: unknown) {
    // Surface where the state was expected instead of a bare ENOENT.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(
      `Skill extraction did not leave a readable state file at ${statePath}: ${reason}`,
    );
  }

  // JSON.parse may legally return null or a primitive; narrow before use.
  const parsed: unknown = JSON.parse(raw);
  const runs: unknown =
    typeof parsed === 'object' && parsed !== null
      ? (parsed as { runs?: unknown }).runs
      : undefined;
  if (!Array.isArray(runs) || runs.length === 0) {
    throw new Error('Skill extraction finished without writing any run state');
  }

  // Element types are not validated; only array-ness is checked, as before.
  const asStringArray = (value: unknown): string[] =>
    Array.isArray(value) ? (value as string[]) : [];

  return {
    state: {
      runs: runs.map((run: unknown) => {
        const fields =
          typeof run === 'object' && run !== null
            ? (run as { sessionIds?: unknown; skillsCreated?: unknown })
            : {};
        return {
          sessionIds: asStringArray(fields.sessionIds),
          skillsCreated: asStringArray(fields.skillsCreated),
        };
      }),
    },
    skillsDir,
  };
}
|
||||
|
||||
/**
 * Resolves to the value of `operation`, or to `undefined` when it rejects
 * with a file-system "not found" error (`code === 'ENOENT'`). Any other
 * rejection is re-thrown.
 */
async function ignoreMissing<T>(operation: Promise<T>): Promise<T | undefined> {
  try {
    return await operation;
  } catch (error: unknown) {
    const code =
      typeof error === 'object' && error !== null
        ? (error as { code?: unknown }).code
        : undefined;
    if (code === 'ENOENT') {
      return undefined;
    }
    throw error;
  }
}

/**
 * Reads the `SKILL.md` of every immediate subdirectory of `skillsDir`.
 *
 * A missing `skillsDir` means no skills were written and yields `[]`.
 * A subdirectory without a `SKILL.md` is skipped on its own rather than
 * discarding every other body. Any other failure (permissions, I/O) is
 * propagated so it cannot masquerade as "no skills were created".
 *
 * @param skillsDir - Directory that holds one subdirectory per skill.
 * @returns The skill file contents, in directory-listing order.
 * @throws The underlying file-system error for anything other than ENOENT.
 */
async function readSkillBodies(skillsDir: string): Promise<string[]> {
  const entries = await ignoreMissing(
    fsp.readdir(skillsDir, { withFileTypes: true }),
  );
  if (entries === undefined) {
    return [];
  }

  const bodies = await Promise.all(
    entries
      .filter((entry) => entry.isDirectory())
      .map((entry) =>
        ignoreMissing(
          fsp.readFile(path.join(skillsDir, entry.name, 'SKILL.md'), 'utf-8'),
        ),
      ),
  );

  // Compare against undefined explicitly so an empty SKILL.md ('') is kept.
  return bodies.filter((body): body is string => body !== undefined);
}
|
||||
|
||||
/**
 * Config overrides applied to every skill-extraction component eval below.
 *
 * - `experimentalMemoryManager` turns on the memory extraction pipeline.
 * - `approvalMode` is YOLO so tool calls (write_file, read_file) are
 *   auto-approved and the background agent never waits for interactive
 *   confirmation.
 */
const EXTRACTION_CONFIG_OVERRIDES = {
  experimentalMemoryManager: true,
  approvalMode: ApprovalMode.YOLO,
};
|
||||
|
||||
describe('Skill Extraction', () => {
  // Two unrelated staging incidents whose summaries look alike; neither
  // describes a workflow that should be captured as a skill.
  const oneOffIncidentSessions: SeedSession[] = [
    {
      sessionId: 'incident-login-redirect',
      summary: 'Debug login redirect loop in staging',
      timestampOffsetMinutes: 420,
      userTurns: [
        'We only need a one-off fix for incident INC-4412 on branch hotfix/login-loop.',
        'The exact failing string is ERR_REDIRECT_4412 and this workaround is incident-specific.',
        'Patch packages/auth/src/redirect.ts just for this branch and do not generalize it.',
        'The thing that worked was deleting the stale staging cookie before retrying.',
        'This is not a normal workflow and should not become a reusable instruction.',
        'It only reproduced against the 2026-04-08 staging rollout.',
        'After the cookie clear, the branch-specific redirect logic passed.',
        'Do not turn this incident writeup into a standing process.',
        'Yes, the hotfix worked for this exact redirect-loop incident.',
        'Close out INC-4412 once the staging login succeeds again.',
      ],
    },
    {
      sessionId: 'incident-login-timeout',
      summary: 'Debug login callback timeout in staging',
      timestampOffsetMinutes: 360,
      userTurns: [
        'This is another one-off staging incident, this time TICKET-991 for callback timeout.',
        'The exact failing string is ERR_CALLBACK_TIMEOUT_991 and it is unrelated to the redirect loop.',
        'The temporary fix was rotating the staging secret and deleting a bad feature-flag row.',
        'Do not write a generic login-debugging playbook from this.',
        'This only applied to the callback timeout during the April rollout.',
        'The successful fix was specific to the stale secret in staging.',
        'It does not define a durable repo workflow for future tasks.',
        'After rotating the secret, the callback timeout stopped reproducing.',
        'Treat this as incident response only, not a reusable skill.',
        'Once staging passed again, we closed TICKET-991.',
      ],
    },
  ];

  // The same ordered settings-docs regeneration workflow, seen in two sessions.
  const settingsDocsSessions: SeedSession[] = [
    {
      sessionId: 'settings-docs-regen-1',
      summary: 'Update settings docs after adding a config option',
      timestampOffsetMinutes: 420,
      userTurns: [
        'When we add a new config option, we have to regenerate the settings docs in a specific order.',
        'The sequence that worked was npm run predocs:settings, npm run schema:settings, then npm run docs:settings.',
        'Do not hand-edit generated settings docs.',
        'If predocs is skipped, the generated schema docs miss the new defaults.',
        'Update the source first, then run that generation sequence.',
        'After regenerating, verify the schema output and docs changed together.',
        'We used this same sequence the last time we touched settings docs.',
        'That ordered workflow passed and produced the expected generated files.',
        'Please keep the exact command order because reversing it breaks the output.',
        'Yes, the generated settings docs were correct after those three commands.',
      ],
    },
    {
      sessionId: 'settings-docs-regen-2',
      summary: 'Regenerate settings schema docs for another new setting',
      timestampOffsetMinutes: 360,
      userTurns: [
        'We are touching another setting, so follow the same settings-doc regeneration workflow again.',
        'Run npm run predocs:settings before npm run schema:settings and npm run docs:settings.',
        'The project keeps generated settings docs in sync through those commands, not manual edits.',
        'Skipping predocs caused stale defaults in the generated output before.',
        'Change the source, then execute the same three commands in order.',
        'Verify both the schema artifact and docs update together after regeneration.',
        'This is the recurring workflow we use whenever a setting changes.',
        'The exact order worked again on this second settings update.',
        'Please preserve that ordering constraint for future settings changes.',
        'Confirmed: the settings docs regenerated correctly with the same command sequence.',
      ],
    },
  ];

  // The same check/migrate/validate database workflow, seen in two sessions.
  const dbMigrationSessions: SeedSession[] = [
    {
      sessionId: 'db-migration-v12',
      summary: 'Run database migration for v12 schema update',
      timestampOffsetMinutes: 420,
      userTurns: [
        'Every time we change the database schema we follow a specific migration workflow.',
        'First run npm run db:check to verify no pending migrations conflict.',
        'Then run npm run db:migrate to apply the new migration files.',
        'After migration, always run npm run db:validate to confirm schema integrity.',
        'If db:validate fails, immediately run npm run db:rollback before anything else.',
        'Never skip db:check — last time we did, two migrations collided and corrupted the index.',
        'The ordering is critical: check, migrate, validate. Reversing migrate and validate caused silent data loss before.',
        'This v12 migration passed after following that exact sequence.',
        'We use this same three-step workflow every time the schema changes.',
        'Confirmed: db:check, db:migrate, db:validate completed successfully for v12.',
      ],
    },
    {
      sessionId: 'db-migration-v13',
      summary: 'Run database migration for v13 schema update',
      timestampOffsetMinutes: 360,
      userTurns: [
        'New schema change for v13, following the same database migration workflow as before.',
        'Start with npm run db:check to ensure no conflicting pending migrations.',
        'Then npm run db:migrate to apply the v13 migration files.',
        'Then npm run db:validate to confirm the schema is consistent.',
        'If validation fails, run npm run db:rollback immediately — do not attempt manual fixes.',
        'We learned the hard way that skipping db:check causes index corruption.',
        'The check-migrate-validate order is mandatory for every schema change.',
        'This is the same recurring workflow we used for v12 and earlier migrations.',
        'The v13 migration passed with the same three-step sequence.',
        'Confirmed: the standard db migration workflow succeeded again for v13.',
      ],
    },
  ];

  /**
   * Runs extraction, then gathers the recorded runs and every SKILL.md body
   * (also joined into one string for substring assertions).
   */
  const extractAndCollect = async (config: Config) => {
    const { state, skillsDir } = await runExtractionAndReadState(config);
    const skillBodies = await readSkillBodies(skillsDir);
    return {
      runs: state.runs,
      skillBodies,
      combinedSkills: skillBodies.join('\n\n'),
    };
  };

  componentEvalTest('USUALLY_PASSES', {
    suiteName: 'skill-extraction',
    suiteType: 'component-level',
    name: 'ignores one-off incidents even when session summaries look similar',
    files: WORKSPACE_FILES,
    timeout: 180000,
    configOverrides: EXTRACTION_CONFIG_OVERRIDES,
    setup: (config) => seedSessions(config, oneOffIncidentSessions),
    assert: async (config) => {
      const { runs, skillBodies } = await extractAndCollect(config);

      expect(runs).toHaveLength(1);
      const [onlyRun] = runs;
      expect(onlyRun.sessionIds).toHaveLength(2);
      expect(onlyRun.skillsCreated).toEqual([]);
      expect(skillBodies).toEqual([]);
    },
  });

  componentEvalTest('USUALLY_PASSES', {
    suiteName: 'skill-extraction',
    suiteType: 'component-level',
    name: 'extracts a repeated project-specific workflow into a skill',
    files: WORKSPACE_FILES,
    timeout: 180000,
    configOverrides: EXTRACTION_CONFIG_OVERRIDES,
    setup: (config) => seedSessions(config, settingsDocsSessions),
    assert: async (config) => {
      const { runs, skillBodies, combinedSkills } =
        await extractAndCollect(config);

      expect(runs).toHaveLength(1);
      const [onlyRun] = runs;
      expect(onlyRun.sessionIds).toHaveLength(2);
      expect(onlyRun.skillsCreated.length).toBeGreaterThanOrEqual(1);
      expect(skillBodies.length).toBeGreaterThanOrEqual(1);
      for (const command of [
        'npm run predocs:settings',
        'npm run schema:settings',
        'npm run docs:settings',
      ]) {
        expect(combinedSkills).toContain(command);
      }
      for (const heading of [/When to Use/i, /Verification/i]) {
        expect(combinedSkills).toMatch(heading);
      }
    },
  });

  componentEvalTest('USUALLY_PASSES', {
    suiteName: 'skill-extraction',
    suiteType: 'component-level',
    name: 'extracts a repeated multi-step migration workflow with ordering constraints',
    files: WORKSPACE_FILES,
    timeout: 180000,
    configOverrides: EXTRACTION_CONFIG_OVERRIDES,
    setup: (config) => seedSessions(config, dbMigrationSessions),
    assert: async (config) => {
      const { runs, skillBodies, combinedSkills } =
        await extractAndCollect(config);

      expect(runs).toHaveLength(1);
      const [onlyRun] = runs;
      expect(onlyRun.sessionIds).toHaveLength(2);
      expect(onlyRun.skillsCreated.length).toBeGreaterThanOrEqual(1);
      expect(skillBodies.length).toBeGreaterThanOrEqual(1);
      for (const command of [
        'npm run db:check',
        'npm run db:migrate',
        'npm run db:validate',
      ]) {
        expect(combinedSkills).toContain(command);
      }
      for (const pattern of [/rollback/i, /When to Use/i]) {
        expect(combinedSkills).toMatch(pattern);
      }
    },
  });
});
|
||||
90
packages/core/src/agents/skill-extraction-agent.test.ts
Normal file
90
packages/core/src/agents/skill-extraction-agent.test.ts
Normal file
@@ -0,0 +1,90 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { SkillExtractionAgent } from './skill-extraction-agent.js';
|
||||
import {
|
||||
EDIT_TOOL_NAME,
|
||||
GLOB_TOOL_NAME,
|
||||
GREP_TOOL_NAME,
|
||||
LS_TOOL_NAME,
|
||||
READ_FILE_TOOL_NAME,
|
||||
WRITE_FILE_TOOL_NAME,
|
||||
} from '../tools/tool-names.js';
|
||||
import { PREVIEW_GEMINI_FLASH_MODEL } from '../config/models.js';
|
||||
|
||||
describe('SkillExtractionAgent', () => {
  // Fixed inputs; the assertions below check that they are echoed into the
  // generated query and that the prompt text carries the expected guardrails.
  const skillsDir = '/tmp/skills';
  const sessionIndex =
    '[NEW] Debug login flow (12 user msgs) — /tmp/chats/session-1.json';
  const existingSkillsSummary =
    '## Workspace Skills (.gemini/skills — do NOT duplicate)\n- **existing-skill**: Existing description';

  const agent = SkillExtractionAgent(
    skillsDir,
    sessionIndex,
    existingSkillsSummary,
  );

  it('should expose expected metadata, model, and tools', () => {
    const requiredTools = [
      READ_FILE_TOOL_NAME,
      WRITE_FILE_TOOL_NAME,
      EDIT_TOOL_NAME,
      LS_TOOL_NAME,
      GLOB_TOOL_NAME,
      GREP_TOOL_NAME,
    ];

    expect(agent.kind).toBe('local');
    expect(agent.name).toBe('confucius');
    expect(agent.displayName).toBe('Skill Extractor');
    expect(agent.modelConfig.model).toBe(PREVIEW_GEMINI_FLASH_MODEL);
    expect(agent.toolConfig?.tools).toEqual(
      expect.arrayContaining(requiredTools),
    );
  });

  it('should default to no skill unless recurrence and durability are proven', () => {
    const prompt = agent.promptConfig.systemPrompt;

    for (const fragment of [
      'Default to NO SKILL.',
      'strong evidence this will recur for future agents in this repo/workflow',
      'broader than a single incident',
      'A skill MUST meet ALL of these criteria:',
      'Future agents in this repo/workflow are likely to need it',
    ]) {
      expect(prompt).toContain(fragment);
    }
  });

  it('should explicitly reject one-off incidents and single-session preferences', () => {
    const prompt = agent.promptConfig.systemPrompt;

    for (const fragment of [
      'Single-session preferences',
      'One-off incidents',
      'Output-style preferences',
      'cannot survive renaming the specific',
    ]) {
      expect(prompt).toContain(fragment);
    }
  });

  it('should warn that session summaries are user-intent summaries, not workflow evidence', () => {
    const query = agent.promptConfig.query ?? '';

    for (const fragment of [
      existingSkillsSummary,
      sessionIndex,
      'The summary is a user-intent summary, not a workflow summary.',
      'The session summaries describe user intent, not workflow details.',
      'Only write a skill if the evidence shows a durable, recurring workflow',
      'If recurrence or future reuse is unclear, create no skill and explain why.',
    ]) {
      expect(query).toContain(fragment);
    }
  });
});
|
||||
@@ -36,7 +36,7 @@ function buildSystemPrompt(skillsDir: string): string {
|
||||
'- solve similar tasks with fewer tool calls and fewer reasoning tokens',
|
||||
'- reuse proven workflows and verification checklists',
|
||||
'- avoid known failure modes and landmines',
|
||||
'- anticipate user preferences without being reminded',
|
||||
'- capture durable workflow constraints that future agents are likely to encounter again',
|
||||
'',
|
||||
'============================================================',
|
||||
'SAFETY AND HYGIENE (STRICT)',
|
||||
@@ -59,6 +59,10 @@ function buildSystemPrompt(skillsDir: string): string {
|
||||
'1. "Is this something a competent agent would NOT already know?" If no, STOP.',
|
||||
'2. "Does an existing skill (listed below) already cover this?" If yes, STOP.',
|
||||
'3. "Can I write a concrete, step-by-step procedure?" If no, STOP.',
|
||||
'4. "Is there strong evidence this will recur for future agents in this repo/workflow?" If no, STOP.',
|
||||
'5. "Is this broader than a single incident (one bug, one ticket, one branch, one date, one exact error)?" If no, STOP.',
|
||||
'',
|
||||
'Default to NO SKILL.',
|
||||
'',
|
||||
'Do NOT create skills for:',
|
||||
'',
|
||||
@@ -67,6 +71,10 @@ function buildSystemPrompt(skillsDir: string): string {
|
||||
'- **Pure Q&A**: The user asked "how does X work?" and got an answer. No procedure.',
|
||||
'- **Brainstorming/design**: Discussion of how to build something, without a validated',
|
||||
' implementation that produced a reusable procedure.',
|
||||
'- **Single-session preferences**: User-specific style/output preferences or workflow',
|
||||
' preferences mentioned only once.',
|
||||
'- **One-off incidents**: Debugging or incident response tied to a single bug, ticket,',
|
||||
' branch, date, or exact error string.',
|
||||
'- **Anything already covered by an existing skill** (global, workspace, builtin, or',
|
||||
' previously extracted). Check the "Existing Skills" section carefully.',
|
||||
'',
|
||||
@@ -74,31 +82,40 @@ function buildSystemPrompt(skillsDir: string): string {
|
||||
'WHAT COUNTS AS A SKILL',
|
||||
'============================================================',
|
||||
'',
|
||||
'A skill MUST meet BOTH of these criteria:',
|
||||
'A skill MUST meet ALL of these criteria:',
|
||||
'',
|
||||
'1. **Procedural and concrete**: It can be expressed as numbered steps with specific',
|
||||
' commands, paths, or code patterns. If you can only write vague guidance, it is NOT',
|
||||
' a skill. "Be careful with X" is advice, not a skill.',
|
||||
'',
|
||||
'2. **Non-obvious and project-specific**: A competent agent would NOT already know this.',
|
||||
' It encodes project-specific knowledge, non-obvious ordering constraints, or',
|
||||
' hard-won failure shields that cannot be inferred from the codebase alone.',
|
||||
'2. **Durable and reusable**: Future agents in this repo/workflow are likely to need it',
|
||||
' again. If it only solved one incident, it is NOT a skill.',
|
||||
'',
|
||||
'Confidence tiers (prefer higher tiers):',
|
||||
'3. **Evidence-backed and project-specific**: It encodes project-specific knowledge,',
|
||||
' repeated operational constraints, or hard-won failure shields supported by session',
|
||||
' evidence. Do not assume something is non-obvious just because it sounds detailed.',
|
||||
'',
|
||||
'**High confidence** — create the skill:',
|
||||
'- The same workflow appeared in multiple sessions (cross-session repetition)',
|
||||
'- A multi-step procedure was validated (tests passed, user confirmed success)',
|
||||
'Confidence tiers:',
|
||||
'',
|
||||
'**Medium confidence** — create the skill if it is clearly project-specific:',
|
||||
'- A project-specific build/test/deploy/release procedure was established',
|
||||
'- A non-obvious ordering constraint or prerequisite was discovered',
|
||||
'- A failure mode was hit and a concrete fix was found and verified',
|
||||
'**High confidence** — create the skill only when recurrence/durability is clear:',
|
||||
'- The same workflow appeared in multiple sessions (cross-session repetition), OR it is',
|
||||
' a stable recurring repo workflow (for example setup/build/test/deploy/release) with a',
|
||||
' clear future trigger',
|
||||
'- The workflow was validated (tests passed, user confirmed success, or the same fix',
|
||||
' worked repeatedly)',
|
||||
'- The skill can be named without referencing a specific incident, bug, branch, or date',
|
||||
'',
|
||||
'**Medium confidence** — usually do NOT create the skill yet:',
|
||||
'- A project-specific procedure appeared once and seems useful, but recurrence is not yet',
|
||||
' clear',
|
||||
'- A verified fix exists, but it is still tied to one incident',
|
||||
'- A user correction changed the approach once, but durability is uncertain',
|
||||
'',
|
||||
'**Low confidence** — do NOT create the skill:',
|
||||
'- A one-off debugging session with no reusable procedure',
|
||||
'- Generic workflows any agent could figure out from the codebase',
|
||||
'- A code review or investigation with no durable takeaway',
|
||||
'- Output-style preferences that do not materially change procedure',
|
||||
'',
|
||||
'Aim for 0-2 skills per run. Quality over quantity.',
|
||||
'',
|
||||
@@ -117,8 +134,10 @@ function buildSystemPrompt(skillsDir: string): string {
|
||||
'',
|
||||
'What to look for:',
|
||||
'',
|
||||
'- User corrections: "No, do it this way" -> preference signal',
|
||||
'- User corrections that change procedure in a durable way, especially when repeated',
|
||||
' across sessions',
|
||||
'- Repeated patterns across sessions: same commands, same file paths, same workflow',
|
||||
'- Stable recurring repo lifecycle workflows with clear future triggers',
|
||||
'- Failed attempts followed by successful ones -> failure shield',
|
||||
'- Multi-step procedures that were validated (tests passed, user confirmed)',
|
||||
'- User interruptions: "Stop, you need to X first" -> ordering constraint',
|
||||
@@ -129,6 +148,8 @@ function buildSystemPrompt(skillsDir: string): string {
|
||||
'- Tool outputs that are just data (file contents, search results)',
|
||||
'- Speculative plans that were never executed',
|
||||
"- Temporary context (current branch name, today's date, specific error IDs)",
|
||||
'- Similar session summaries without matching workflow evidence',
|
||||
'- One-off artifact names: bug IDs, branch names, timestamps, exact incident strings',
|
||||
'',
|
||||
'============================================================',
|
||||
'SKILL FORMAT',
|
||||
@@ -214,7 +235,10 @@ function buildSystemPrompt(skillsDir: string): string {
|
||||
'- Keep scopes distinct. Avoid overlapping "do-everything" skills.',
|
||||
'- Every skill MUST have: triggers, procedure, at least one pitfall or verification step.',
|
||||
'- If you cannot write a reliable procedure (too many unknowns), do NOT create the skill.',
|
||||
'- Do not create skills for generic advice that any competent agent would already know.',
|
||||
'- If the candidate is tied to one incident or cannot survive renaming the specific',
|
||||
' bug/ticket, do NOT create it.',
|
||||
'- Do not create skills for generic advice, output-style preferences, or ephemeral',
|
||||
' choices that any competent agent would already know or adapt to on the fly.',
|
||||
'- Prefer fewer, higher-quality skills. 0-2 skills per run is typical. 3+ is unusual.',
|
||||
'',
|
||||
'============================================================',
|
||||
@@ -224,17 +248,23 @@ function buildSystemPrompt(skillsDir: string): string {
|
||||
`1. Use list_directory on ${skillsDir} to see existing skills.`,
|
||||
'2. If skills exist, read their SKILL.md files to understand what is already captured.',
|
||||
'3. Scan the session index provided in the query. Look for [NEW] sessions whose summaries',
|
||||
' suggest workflows that ALSO appear in other sessions (either [NEW] or [old]).',
|
||||
'4. Apply the minimum signal gate. If no repeated patterns are visible, report that and finish.',
|
||||
' hint at workflows that ALSO appear in other sessions (either [NEW] or [old]) or at a',
|
||||
' stable recurring repo workflow. Remember: summary similarity alone is NOT enough.',
|
||||
'4. Apply the minimum signal gate. If recurrence or durability is not visible, report that',
|
||||
' no skill should be created and finish.',
|
||||
'5. For promising patterns, use read_file on the session file paths to inspect the full',
|
||||
' conversation. Confirm the workflow was actually repeated and validated.',
|
||||
'6. For each confirmed skill, verify it meets ALL criteria (repeatable, procedural, high-leverage).',
|
||||
' conversation. Confirm the workflow was actually repeated and validated. Read at least',
|
||||
' two sessions unless the candidate is clearly a stable recurring repo lifecycle workflow.',
|
||||
'6. For each candidate, verify it meets ALL criteria. Before writing, make sure you can',
|
||||
' state: future trigger, evidence sessions, recurrence signal, validation signal, and',
|
||||
' why it is not generic.',
|
||||
'7. Write new SKILL.md files or update existing ones in your directory using write_file.',
|
||||
' For skills that live OUTSIDE your directory, write a .patch file instead (see UPDATING EXISTING SKILLS).',
|
||||
'8. Write COMPLETE files — never partially update a SKILL.md.',
|
||||
'',
|
||||
'IMPORTANT: Do NOT read every session. Only read sessions whose summaries suggest a',
|
||||
'repeated pattern worth investigating. Most runs should read 0-3 sessions and create 0 skills.',
|
||||
'repeated pattern or a stable recurring repo workflow worth investigating. Most runs',
|
||||
'should read 0-3 sessions and create 0 skills.',
|
||||
'Do not explore the codebase. Work only with the session index, session files, and the skills directory.',
|
||||
].join('\n');
|
||||
}
|
||||
@@ -301,6 +331,9 @@ export const SkillExtractionAgent = (
|
||||
'Below is an index of past conversation sessions. Each line shows:',
|
||||
'[NEW] or [old] status, a 1-line summary, message count, and the file path.',
|
||||
'',
|
||||
'The summary is a user-intent summary, not a workflow summary.',
|
||||
'Matching summary text alone is never enough evidence for a reusable skill.',
|
||||
'',
|
||||
'[NEW] = not yet processed for skill extraction (focus on these)',
|
||||
'[old] = previously processed (read only if a [NEW] session hints at a repeated pattern)',
|
||||
'',
|
||||
@@ -319,7 +352,7 @@ export const SkillExtractionAgent = (
|
||||
|
||||
return {
|
||||
systemPrompt: buildSystemPrompt(skillsDir),
|
||||
query: `${initialContext}\n\nAnalyze the session index above. Read sessions that suggest repeated workflows using read_file. Extract reusable skills to ${skillsDir}/.`,
|
||||
query: `${initialContext}\n\nAnalyze the session index above. The session summaries describe user intent, not workflow details. Read sessions that suggest repeated workflows using read_file. Only write a skill if the evidence shows a durable, recurring workflow or a stable recurring repo procedure. If recurrence or future reuse is unclear, create no skill and explain why.`,
|
||||
};
|
||||
},
|
||||
runConfig: {
|
||||
|
||||
Reference in New Issue
Block a user