diff --git a/.github/workflows/chained_e2e.yml b/.github/workflows/chained_e2e.yml index bd276a3853..4a5de8bf7c 100644 --- a/.github/workflows/chained_e2e.yml +++ b/.github/workflows/chained_e2e.yml @@ -337,6 +337,7 @@ jobs: if: "${{ steps.check_evals.outputs.should_run == 'true' }}" env: GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' + GEMINI_CLI_TRUST_WORKSPACE: true GEMINI_MODEL: 'gemini-3-pro-preview' # Only run always passes behavioral tests. EVAL_SUITE_TYPE: 'behavioral' diff --git a/.github/workflows/evals-nightly.yml b/.github/workflows/evals-nightly.yml index fbb770ac84..1fe61971fe 100644 --- a/.github/workflows/evals-nightly.yml +++ b/.github/workflows/evals-nightly.yml @@ -66,6 +66,7 @@ jobs: continue-on-error: true env: GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' + GEMINI_CLI_TRUST_WORKSPACE: true GEMINI_MODEL: '${{ matrix.model }}' RUN_EVALS: 'true' EVAL_SUITE_TYPE: "${{ github.event.inputs.suite_type || 'behavioral' }}" diff --git a/evals/save_memory.eval.ts b/evals/save_memory.eval.ts index b31167fb4a..f49624419b 100644 --- a/evals/save_memory.eval.ts +++ b/evals/save_memory.eval.ts @@ -5,12 +5,78 @@ */ import { describe, expect } from 'vitest'; +import fs from 'node:fs'; +import path from 'node:path'; +import { + loadConversationRecord, + SESSION_FILE_PREFIX, +} from '@google/gemini-cli-core'; import { evalTest, assertModelHasOutput, checkModelOutputContent, } from './test-helper.js'; +function findDir(base: string, name: string): string | null { + if (!fs.existsSync(base)) return null; + const files = fs.readdirSync(base); + for (const file of files) { + const fullPath = path.join(base, file); + if (fs.statSync(fullPath).isDirectory()) { + if (file === name) return fullPath; + const found = findDir(fullPath, name); + if (found) return found; + } + } + return null; +} + +async function loadLatestSessionRecord(homeDir: string, sessionId: string) { + const chatsDir = findDir(path.join(homeDir, '.gemini'), 'chats'); + if (!chatsDir) { + throw new Error('Could not find chats directory for eval session logs'); + } + + const candidates = fs + .readdirSync(chatsDir) + .filter( + (file) => + file.startsWith(SESSION_FILE_PREFIX) && + (file.endsWith('.json') || file.endsWith('.jsonl')), + ); + + const matchingRecords = []; + for (const file of candidates) { + const filePath = path.join(chatsDir, file); + const record = await loadConversationRecord(filePath); + if (record?.sessionId === sessionId) { + matchingRecords.push(record); + } + } + + matchingRecords.sort( + (a, b) => Date.parse(b.lastUpdated) - Date.parse(a.lastUpdated), + ); + return matchingRecords[0] ?? null; +} + +async function waitForSessionScratchpad( + homeDir: string, + sessionId: string, + timeoutMs = 30000, +) { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + const record = await loadLatestSessionRecord(homeDir, sessionId); + if (record?.memoryScratchpad) { + return record; + } + await new Promise((resolve) => setTimeout(resolve, 1000)); + } + + return loadLatestSessionRecord(homeDir, sessionId); +} + describe('save_memory', () => { const TEST_PREFIX = 'Save memory test: '; const rememberingFavoriteColor = "Agent remembers user's favorite color"; @@ -569,6 +635,103 @@ describe('save_memory', () => { }, }); + const memoryV2SessionScratchpad = + 'Session summary persists memory scratchpad for memory-saving sessions'; + evalTest('USUALLY_PASSES', { + suiteName: 'default', + suiteType: 'behavioral', + name: memoryV2SessionScratchpad, + sessionId: 'memory-scratchpad-eval', + params: { + settings: { + experimental: { memoryV2: true }, + }, + }, + messages: [ + { + id: 'msg-1', + type: 'user', + content: [ + { + text: 'Across all my projects, I prefer Vitest over Jest for testing.', + }, + ], + timestamp: '2026-01-01T00:00:00Z', + }, + { + id: 'msg-2', + type: 'gemini', + content: [{ text: 'Noted. What else should I keep in mind?' }], + timestamp: '2026-01-01T00:00:05Z', + }, + { + id: 'msg-3', + type: 'user', + content: [ + { + text: 'For this repo I was debugging a flaky API test earlier, but that was just transient context.', + }, + ], + timestamp: '2026-01-01T00:01:00Z', + }, + { + id: 'msg-4', + type: 'gemini', + content: [ + { text: 'Understood. I will only save the durable preference.' }, + ], + timestamp: '2026-01-01T00:01:05Z', + }, + ], + prompt: + 'Please save any persistent preferences or facts about me from our conversation to memory.', + assert: async (rig, result) => { + await rig.waitForToolCall('write_file').catch(() => {}); + const writeCalls = rig + .readToolLogs() + .filter((log) => + ['write_file', 'replace'].includes(log.toolRequest.name), + ); + + expect( + writeCalls.length, + 'Expected memoryV2 save flow to edit a markdown memory file', + ).toBeGreaterThan(0); + + await rig.run({ + args: ['--list-sessions'], + approvalMode: 'yolo', + timeout: 120000, + }); + + const record = await waitForSessionScratchpad( + rig.homeDir!, + 'memory-scratchpad-eval', + ); + expect( + record?.memoryScratchpad, + 'Expected the resumed session log to contain a memoryScratchpad after session summary generation', + ).toBeDefined(); + expect(record?.memoryScratchpad?.version).toBe(1); + expect( + record?.memoryScratchpad?.toolSequence?.some((toolName) => + ['write_file', 'replace'].includes(toolName), + ), + 'Expected memoryScratchpad.toolSequence to include the markdown editing tool used for memory persistence', + ).toBe(true); + expect( + record?.memoryScratchpad?.touchedPaths?.length, + 'Expected memoryScratchpad to capture at least one touched path', + ).toBeGreaterThan(0); + expect( + record?.memoryScratchpad?.workflowSummary, + 'Expected memoryScratchpad.workflowSummary to be populated', + ).toMatch(/write_file|replace/i); + + assertModelHasOutput(result); + }, + }); + const memoryV2RoutesUserProject = 'Agent routes personal-to-user project notes to user-project memory'; evalTest('USUALLY_PASSES', { diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts index 2c80146523..d30a498a8f 100644 --- a/evals/skill_extraction.eval.ts +++ b/evals/skill_extraction.eval.ts @@ -6,21 +6,30 @@ import fsp from 'node:fs/promises'; import path from 'node:path'; -import { describe, expect } from 'vitest'; +import { describe, expect, it } from 'vitest'; import { type Config, ApprovalMode, + type MemoryScratchpad, SESSION_FILE_PREFIX, getProjectHash, startMemoryService, } from '@google/gemini-cli-core'; -import { componentEvalTest } from './component-test-helper.js'; +import { ComponentRig, componentEvalTest } from './component-test-helper.js'; +import { + average, + averageNullable, + countMatchingIds, + roundStat, +} from './statistics-helper.js'; +import { prepareWorkspace } from './test-helper.js'; interface SeedSession { sessionId: string; summary: string; userTurns: string[]; timestampOffsetMinutes: number; + memoryScratchpad?: MemoryScratchpad; } interface MessageRecord { @@ -30,6 +39,81 @@ interface MessageRecord { content: Array<{ text: string }>; } +interface SessionVersion { + sessionId: string; + lastUpdated: string; +} + +interface ExtractionRunSnapshot { + sessionIds: string[]; + skillsCreated: string[]; + candidateSessions: SessionVersion[]; + processedSessions: SessionVersion[]; + turnCount?: number; + durationMs?: number; + terminateReason?: string; +} + +interface ExtractionOutcome { + state: { runs: ExtractionRunSnapshot[] }; + skillsDir: string; + skillBodies: string[]; +} + +interface SkillQualitySignal { + label: string; + pattern: RegExp; +} + +interface ScratchpadRunMetrics { + turnCount: number | null; + durationMs: number | null; + terminateReason: string | null; + skillsCreated: number; + candidateSessions: number; + processedSessions: number; + relevantReads: number; + distractorReads: number; + totalReads: number; + recall: number; + precision: number; + signalScore: number; + skillQualityScore: number; + skillQualityMax: number; + skillQualityRatio: number; + missingQualitySignals: string[]; +} + +interface ScratchpadStatsTrial { + trial: number; + baseline: ScratchpadRunMetrics; + enhanced: ScratchpadRunMetrics; +} + +interface ScratchpadStatsAggregate { + turnCountAvg: number | null; + durationMsAvg: number | null; + recallAvg: number; + precisionAvg: number; + signalScoreAvg: number; + relevantReadsAvg: number; + distractorReadsAvg: number; + skillsCreatedAvg: number; + skillQualityScoreAvg: number; + skillQualityRatioAvg: number; +} + +interface ScratchpadStatsReport { + generatedAt: string; + trials: number; + aggregate: { + baseline: ScratchpadStatsAggregate; + enhanced: ScratchpadStatsAggregate; + }; + deltas: ScratchpadStatsAggregate; + results: ScratchpadStatsTrial[]; +} + const WORKSPACE_FILES = { 'package.json': JSON.stringify( { @@ -68,6 +152,143 @@ function buildMessages(userTurns: string[]): MessageRecord[] { ]); } +function padTurns(turns: string[]): string[] { + if (turns.length >= 10) { + return turns; + } + + const padded = [...turns]; + for (let i = turns.length; i < 10; i++) { + padded.push(`${turns[i % turns.length]} (repeat ${i + 1})`); + } + return padded; +} + +function createScratchpad( + workflowSummary: string, + touchedPaths: string[], + validationStatus: MemoryScratchpad['validationStatus'] = 'passed', +): MemoryScratchpad { + return { + version: 1, + workflowSummary, + toolSequence: ['run_shell_command'], + touchedPaths, + validationStatus, + }; +} + +function createWorkflowComparisonSessions(withScratchpad: boolean): { + sessions: SeedSession[]; + relevantSessionIds: string[]; + distractorSessionIds: string[]; +} { + const relevantWorkflowSummary = + 'run_shell_command -> run_shell_command | paths packages/cli/src/config/settings.ts, docs/settings.md | validated'; + + const relevantScratchpad = withScratchpad + ? createScratchpad(relevantWorkflowSummary, [ + 'packages/cli/src/config/settings.ts', + 'docs/settings.md', + ]) + : undefined; + + const sessions: SeedSession[] = [ + { + sessionId: 'hidden-settings-workflow-a', + summary: 'Prepare release notes for settings launch', + timestampOffsetMinutes: 420, + memoryScratchpad: relevantScratchpad, + userTurns: padTurns([ + 'When we add a new setting, the durable workflow is to regenerate the settings docs instead of editing them by hand.', + 'The sequence that worked was npm run predocs:settings, npm run schema:settings, then npm run docs:settings.', + 'Skipping predocs leaves stale defaults in the generated docs.', + 'We verify the workflow by checking that both the schema output and docs update together.', + 'This exact command order is the recurring workflow we use for settings changes.', + ]), + }, + { + sessionId: 'hidden-settings-workflow-b', + summary: 'Investigate CI drift in generated config reference', + timestampOffsetMinutes: 390, + memoryScratchpad: relevantScratchpad, + userTurns: padTurns([ + 'The config reference drift was fixed by rerunning the standard settings regeneration workflow.', + 'We again used npm run predocs:settings before npm run schema:settings and npm run docs:settings.', + 'The recurring rule is never to hand-edit generated settings docs.', + 'The validation step is to confirm the schema artifact and docs changed together after regeneration.', + 'This is the same recurring workflow we use every time a setting changes.', + ]), + }, + { + sessionId: 'distractor-release-notes', + summary: 'Prepare release notes for auth launch', + timestampOffsetMinutes: 360, + memoryScratchpad: undefined, + userTurns: padTurns([ + 'This release-notes task was one-off and just needed manual wording updates.', + 'I edited CHANGELOG.md and docs/release-notes.md directly.', + 'There was no reusable command sequence here beyond proofreading the copy.', + 'This task should not become a standing workflow.', + 'Once the wording landed, we were done.', + ]), + }, + { + sessionId: 'distractor-ci-snapshots', + summary: 'Investigate CI drift in auth snapshots', + timestampOffsetMinutes: 330, + memoryScratchpad: undefined, + userTurns: padTurns([ + 'This auth snapshot issue was specific to a flaky test in CI.', + 'The only commands we ran were npm test -- auth and an isolated snapshot update.', + 'It was not the recurring settings-doc workflow.', + 'Once the flaky snapshot passed, there was no broader reusable procedure.', + 'Treat this as a one-off CI cleanup.', + ]), + }, + { + sessionId: 'distractor-onboarding-docs', + summary: 'Refresh onboarding documentation copy', + timestampOffsetMinutes: 300, + memoryScratchpad: undefined, + userTurns: padTurns([ + 'This was just a docs wording cleanup in docs/onboarding.md.', + 'No command sequence was involved.', + 'We manually edited the copy and reviewed it.', + 'There is no recurring operational workflow to capture here.', + 'This should stay a one-off docs edit.', + ]), + }, + { + sessionId: 'distractor-deploy-copy', + summary: 'Adjust deployment checklist wording', + timestampOffsetMinutes: 270, + memoryScratchpad: undefined, + userTurns: padTurns([ + 'This was a wording-only change to docs/deploy.md.', + 'We did not run a reusable command sequence.', + 'It should not become a skill.', + 'The edit was only for this deploy checklist cleanup.', + 'After the copy change, the task was complete.', + ]), + }, + ]; + + return { + sessions, + relevantSessionIds: [ + 'hidden-settings-workflow-a', + 'hidden-settings-workflow-b', + ], + distractorSessionIds: [ + 'distractor-release-notes', + 'distractor-ci-snapshots', + 'distractor-onboarding-docs', + 'distractor-deploy-copy', + ], + }; +} + async function seedSessions( config: Config, sessions: SeedSession[], @@ -78,9 +299,10 @@ async function seedSessions( const projectRoot = config.storage.getProjectRoot(); for (const session of sessions) { - const timestamp = new Date( + const sessionTimestamp = new Date( Date.now() - session.timestampOffsetMinutes * 60 * 1000, - ) + ); + const timestamp = sessionTimestamp .toISOString() .slice(0, 16) .replace(/:/g, '-'); @@ -89,8 +311,9 @@ async function seedSessions( sessionId: session.sessionId, projectHash: getProjectHash(projectRoot), summary: session.summary, + memoryScratchpad: session.memoryScratchpad, startTime: new Date(Date.now() - 7 * 60 * 60 * 1000).toISOString(), - lastUpdated: new Date(Date.now() - 4 * 60 * 60 * 1000).toISOString(), + lastUpdated: sessionTimestamp.toISOString(), messages: buildMessages(session.userTurns), }; @@ -101,10 +324,9 @@ async function seedSessions( } } -async function runExtractionAndReadState(config: Config): Promise<{ - state: { runs: Array<{ sessionIds: string[]; skillsCreated: string[] }> }; - skillsDir: string; -}> { +async function runExtractionAndReadState( + config: Config, +): Promise { await startMemoryService(config); const memoryDir = config.storage.getProjectMemoryTempDir(); @@ -113,7 +335,15 @@ async function runExtractionAndReadState(config: Config): Promise<{ const raw = await fsp.readFile(statePath, 'utf-8'); const state = JSON.parse(raw) as { - runs?: Array<{ sessionIds?: string[]; skillsCreated?: string[] }>; + runs?: Array<{ + sessionIds?: string[]; + skillsCreated?: string[]; + candidateSessions?: SessionVersion[]; + processedSessions?: SessionVersion[]; + turnCount?: number; + durationMs?: number; + terminateReason?: string; + }>; }; if (!Array.isArray(state.runs) || state.runs.length === 0) { throw new Error('Skill extraction finished without writing any run state'); @@ -126,27 +356,292 @@ async function runExtractionAndReadState(config: Config): Promise<{ skillsCreated: Array.isArray(run.skillsCreated) ? run.skillsCreated : [], + candidateSessions: Array.isArray(run.candidateSessions) + ? run.candidateSessions + : [], + processedSessions: Array.isArray(run.processedSessions) + ? run.processedSessions + : [], + turnCount: + typeof run.turnCount === 'number' ? run.turnCount : undefined, + durationMs: + typeof run.durationMs === 'number' ? run.durationMs : undefined, + terminateReason: + typeof run.terminateReason === 'string' + ? run.terminateReason + : undefined, })), }, skillsDir, + skillBodies: await readSkillBodies(skillsDir), }; } +async function summarizeScratchpadRun( + outcome: ExtractionOutcome, + run: ExtractionRunSnapshot, + scenario: ReturnType, +): Promise { + const relevantReads = countMatchingIds( + run.processedSessions, + scenario.relevantSessionIds, + ); + const distractorReads = countMatchingIds( + run.processedSessions, + scenario.distractorSessionIds, + ); + const totalReads = run.processedSessions.length; + const quality = scoreSkillQuality( + outcome.skillBodies, + SETTINGS_SKILL_QUALITY_SIGNALS, + ); + + return { + turnCount: run.turnCount ?? null, + durationMs: run.durationMs ?? null, + terminateReason: run.terminateReason ?? null, + skillsCreated: run.skillsCreated.length, + candidateSessions: run.candidateSessions.length, + processedSessions: totalReads, + relevantReads, + distractorReads, + totalReads, + recall: relevantReads / scenario.relevantSessionIds.length, + precision: totalReads === 0 ? 0 : relevantReads / totalReads, + signalScore: relevantReads - distractorReads, + skillQualityScore: quality.score, + skillQualityMax: quality.maxScore, + skillQualityRatio: + quality.maxScore === 0 ? 0 : quality.score / quality.maxScore, + missingQualitySignals: quality.missing, + }; +} + +function averageScratchpadRuns( + runs: ScratchpadRunMetrics[], +): ScratchpadStatsAggregate { + return { + turnCountAvg: roundStat(averageNullable(runs.map((run) => run.turnCount))), + durationMsAvg: roundStat( + averageNullable(runs.map((run) => run.durationMs)), + ), + recallAvg: roundStat(average(runs.map((run) => run.recall))) ?? 0, + precisionAvg: roundStat(average(runs.map((run) => run.precision))) ?? 0, + signalScoreAvg: roundStat(average(runs.map((run) => run.signalScore))) ?? 0, + relevantReadsAvg: + roundStat(average(runs.map((run) => run.relevantReads))) ?? 0, + distractorReadsAvg: + roundStat(average(runs.map((run) => run.distractorReads))) ?? 0, + skillsCreatedAvg: + roundStat(average(runs.map((run) => run.skillsCreated))) ?? 0, + skillQualityScoreAvg: + roundStat(average(runs.map((run) => run.skillQualityScore))) ?? 0, + skillQualityRatioAvg: + roundStat(average(runs.map((run) => run.skillQualityRatio))) ?? 0, + }; +} + +function diffScratchpadAggregates( + baseline: ScratchpadStatsAggregate, + enhanced: ScratchpadStatsAggregate, +): ScratchpadStatsAggregate { + return { + turnCountAvg: + baseline.turnCountAvg === null || enhanced.turnCountAvg === null + ? null + : roundStat(enhanced.turnCountAvg - baseline.turnCountAvg), + durationMsAvg: + baseline.durationMsAvg === null || enhanced.durationMsAvg === null + ? null + : roundStat(enhanced.durationMsAvg - baseline.durationMsAvg), + recallAvg: roundStat(enhanced.recallAvg - baseline.recallAvg) ?? 0, + precisionAvg: roundStat(enhanced.precisionAvg - baseline.precisionAvg) ?? 0, + signalScoreAvg: + roundStat(enhanced.signalScoreAvg - baseline.signalScoreAvg) ?? 0, + relevantReadsAvg: + roundStat(enhanced.relevantReadsAvg - baseline.relevantReadsAvg) ?? 0, + distractorReadsAvg: + roundStat(enhanced.distractorReadsAvg - baseline.distractorReadsAvg) ?? 0, + skillsCreatedAvg: + roundStat(enhanced.skillsCreatedAvg - baseline.skillsCreatedAvg) ?? 0, + skillQualityScoreAvg: + roundStat( + enhanced.skillQualityScoreAvg - baseline.skillQualityScoreAvg, + ) ?? 0, + skillQualityRatioAvg: + roundStat( + enhanced.skillQualityRatioAvg - baseline.skillQualityRatioAvg, + ) ?? 0, + }; +} + +async function runScenarioWithFreshRig( + sessions: SeedSession[], +): Promise { + const rig = new ComponentRig({ + configOverrides: EXTRACTION_CONFIG_OVERRIDES, + }); + try { + await rig.initialize(); + await prepareWorkspace(rig.testDir, rig.testDir, WORKSPACE_FILES); + await seedSessions(rig.config!, sessions); + return await runExtractionAndReadState(rig.config!); + } finally { + await rig.cleanup(); + } +} + +async function runScratchpadStatsTrial( + trial: number, +): Promise { + const baselineScenario = createWorkflowComparisonSessions(false); + const enhancedScenario = createWorkflowComparisonSessions(true); + + const baselineOutcome = await runScenarioWithFreshRig( + baselineScenario.sessions, + ); + const enhancedOutcome = await runScenarioWithFreshRig( + enhancedScenario.sessions, + ); + + const baselineRun = baselineOutcome.state.runs.at(-1); + const enhancedRun = enhancedOutcome.state.runs.at(-1); + if (!baselineRun || !enhancedRun) { + throw new Error('Expected both baseline and scratchpad runs to exist'); + } + + expectSuccessfulExtractionRun(baselineRun); + expectSuccessfulExtractionRun(enhancedRun); + + return { + trial, + baseline: await summarizeScratchpadRun( + baselineOutcome, + baselineRun, + baselineScenario, + ), + enhanced: await summarizeScratchpadRun( + enhancedOutcome, + enhancedRun, + enhancedScenario, + ), + }; +} + +async function runScratchpadStatsReport( + trials: number, +): Promise { + const results: ScratchpadStatsTrial[] = []; + + for (let trial = 1; trial <= trials; trial++) { + results.push(await runScratchpadStatsTrial(trial)); + } + + const baseline = averageScratchpadRuns( + results.map((result) => result.baseline), + ); + const enhanced = averageScratchpadRuns( + results.map((result) => result.enhanced), + ); + + return { + generatedAt: new Date().toISOString(), + trials, + aggregate: { + baseline, + enhanced, + }, + deltas: diffScratchpadAggregates(baseline, enhanced), + results, + }; +} + +async function writeScratchpadStatsReport( + report: ScratchpadStatsReport, +): Promise { + const outputPath = path.resolve( + process.cwd(), + 'evals/logs/skill_extraction_scratchpad_stats.json', + ); + await fsp.mkdir(path.dirname(outputPath), { recursive: true }); + await fsp.writeFile(outputPath, `${JSON.stringify(report, null, 2)}\n`); + return outputPath; +} + async function readSkillBodies(skillsDir: string): Promise { + const bodies: string[] = []; + try { const entries = await fsp.readdir(skillsDir, { withFileTypes: true }); - const skillDirs = entries.filter((entry) => entry.isDirectory()); - const bodies = await Promise.all( - skillDirs.map((entry) => - fsp.readFile(path.join(skillsDir, entry.name, 'SKILL.md'), 'utf-8'), - ), - ); + for (const entry of entries) { + if (!entry.isDirectory()) { + continue; + } + + try { + bodies.push( + await fsp.readFile( + path.join(skillsDir, entry.name, 'SKILL.md'), + 'utf-8', + ), + ); + } catch { + // Ignore incomplete skill directories so one bad artifact does not hide + // valid skills created in the same eval run. + } + } return bodies; } catch { return []; } } +function expectSuccessfulExtractionRun(run: ExtractionRunSnapshot): void { + expect(run.turnCount).toBeGreaterThan(0); + expect(run.turnCount).toBeLessThanOrEqual(30); + expect(run.durationMs).toBeGreaterThan(0); + expect(run.terminateReason).toBe('GOAL'); +} + +function scoreSkillQuality( + skillBodies: string[], + signals: SkillQualitySignal[], +): { score: number; maxScore: number; missing: string[] } { + const combined = skillBodies.join('\n\n'); + const matched = signals.filter((signal) => signal.pattern.test(combined)); + + return { + score: matched.length, + maxScore: signals.length, + missing: signals + .filter((signal) => !signal.pattern.test(combined)) + .map((signal) => signal.label), + }; +} + +const SETTINGS_SKILL_QUALITY_SIGNALS: SkillQualitySignal[] = [ + { label: 'predocs command', pattern: /npm run predocs:settings/i }, + { label: 'schema command', pattern: /npm run schema:settings/i }, + { label: 'docs command', pattern: /npm run docs:settings/i }, + { label: 'verification guidance', pattern: /verif(?:y|ication)/i }, + { + label: 'generated docs warning or ordering constraint', + pattern: + /do not hand-edit|manual edits|exact command order|preserve.*order/i, + }, +]; + +const DB_MIGRATION_SKILL_QUALITY_SIGNALS: SkillQualitySignal[] = [ + { label: 'db check command', pattern: /npm run db:check/i }, + { label: 'db migrate command', pattern: /npm run db:migrate/i }, + { label: 'db validate command', pattern: /npm run db:validate/i }, + { label: 'rollback guidance', pattern: /npm run db:rollback|rollback/i }, + { + label: 'ordering constraint', + pattern: /check.*migrate.*validate|ordering is critical|mandatory/i, + }, +]; + /** * Shared configOverrides for all skill extraction component evals. * - experimentalAutoMemory: enables the Auto Memory skill extraction pipeline. @@ -158,6 +653,16 @@ const EXTRACTION_CONFIG_OVERRIDES = { approvalMode: ApprovalMode.YOLO, }; +function parseScratchpadStatsTrials(): number { + const configured = Number.parseInt( + process.env['SCRATCHPAD_STATS_TRIALS'] ?? '8', + 10, + ); + return Number.isFinite(configured) && configured > 0 ? configured : 8; +} + +const SCRATCHPAD_STATS_TRIALS = parseScratchpadStatsTrials(); + describe('Skill Extraction', () => { componentEvalTest('USUALLY_PASSES', { suiteName: 'skill-extraction', @@ -264,15 +769,24 @@ describe('Skill Extraction', () => { const { state, skillsDir } = await runExtractionAndReadState(config); const skillBodies = await readSkillBodies(skillsDir); const combinedSkills = skillBodies.join('\n\n'); + const quality = scoreSkillQuality( + skillBodies, + SETTINGS_SKILL_QUALITY_SIGNALS, + ); expect(state.runs).toHaveLength(1); expect(state.runs[0].sessionIds).toHaveLength(2); + expectSuccessfulExtractionRun(state.runs[0]); expect(state.runs[0].skillsCreated.length).toBeGreaterThanOrEqual(1); expect(skillBodies.length).toBeGreaterThanOrEqual(1); expect(combinedSkills).toContain('npm run predocs:settings'); expect(combinedSkills).toContain('npm run schema:settings'); expect(combinedSkills).toContain('npm run docs:settings'); - expect(combinedSkills).toMatch(/Verification/i); + expect(combinedSkills).toMatch(/verif(?:y|ication)/i); + expect( + quality.score, + `missing quality signals: ${quality.missing.join(', ')}`, + ).toBeGreaterThanOrEqual(4); // Verify the extraction agent activated skill-creator for design guidance. expect(config.getSkillManager().isSkillActive('skill-creator')).toBe( @@ -281,6 +795,96 @@ describe('Skill Extraction', () => { }, }); + componentEvalTest('USUALLY_PASSES', { + suiteName: 'skill-extraction', + suiteType: 'component-level', + name: 'memory scratchpad improves repeated-workflow recall versus summary-only index', + files: WORKSPACE_FILES, + timeout: 360000, + configOverrides: EXTRACTION_CONFIG_OVERRIDES, + assert: async () => { + const baselineScenario = createWorkflowComparisonSessions(false); + const enhancedScenario = createWorkflowComparisonSessions(true); + + const baselineOutcome = await runScenarioWithFreshRig( + baselineScenario.sessions, + ); + const enhancedOutcome = await runScenarioWithFreshRig( + enhancedScenario.sessions, + ); + + const baselineRun = baselineOutcome.state.runs.at(-1); + const enhancedRun = enhancedOutcome.state.runs.at(-1); + if (!baselineRun || !enhancedRun) { + throw new Error('Expected both baseline and scratchpad runs to exist'); + } + + expectSuccessfulExtractionRun(baselineRun); + expectSuccessfulExtractionRun(enhancedRun); + + const baselineRelevantReads = countMatchingIds( + baselineRun.processedSessions, + baselineScenario.relevantSessionIds, + ); + const enhancedRelevantReads = countMatchingIds( + enhancedRun.processedSessions, + enhancedScenario.relevantSessionIds, + ); + const baselineDistractorReads = countMatchingIds( + baselineRun.processedSessions, + baselineScenario.distractorSessionIds, + ); + const enhancedDistractorReads = countMatchingIds( + enhancedRun.processedSessions, + enhancedScenario.distractorSessionIds, + ); + const baselineSignalScore = + baselineRelevantReads - baselineDistractorReads; + const enhancedSignalScore = + enhancedRelevantReads - enhancedDistractorReads; + + expect(enhancedRun.candidateSessions).toHaveLength( + enhancedScenario.sessions.length, + ); + expect(enhancedRelevantReads).toBeGreaterThanOrEqual(2); + expect(enhancedRelevantReads).toBeGreaterThanOrEqual( + baselineRelevantReads, + ); + expect(enhancedDistractorReads).toBeLessThanOrEqual( + baselineDistractorReads, + ); + expect(enhancedSignalScore).toBeGreaterThan(baselineSignalScore); + }, + }); + + if (process.env['RUN_SCRATCHPAD_STATS'] === '1') { + componentEvalTest('USUALLY_PASSES', { + suiteName: 'skill-extraction', + suiteType: 'component-level', + name: 'reports memory scratchpad retrieval statistics', + timeout: Math.max(360000, SCRATCHPAD_STATS_TRIALS * 150000), + configOverrides: EXTRACTION_CONFIG_OVERRIDES, + assert: async () => { + const report = await runScratchpadStatsReport(SCRATCHPAD_STATS_TRIALS); + const outputPath = await writeScratchpadStatsReport(report); + + console.info( + `Wrote scratchpad stats report to ${outputPath}\n${JSON.stringify( + report.aggregate, + null, + 2, + )}`, + ); + + expect(report.results).toHaveLength(SCRATCHPAD_STATS_TRIALS); + expect(report.aggregate.baseline.recallAvg).toBeGreaterThan(0); + expect(report.aggregate.enhanced.recallAvg).toBeGreaterThan(0); + }, + }); + } else { + it.skip('reports memory scratchpad retrieval statistics', () => {}); + } + componentEvalTest('USUALLY_PASSES', { suiteName: 'skill-extraction', suiteType: 'component-level', @@ -330,15 +934,24 @@ describe('Skill Extraction', () => { const { state, skillsDir } = await runExtractionAndReadState(config); const skillBodies = await readSkillBodies(skillsDir); const combinedSkills = skillBodies.join('\n\n'); + const quality = scoreSkillQuality( + skillBodies, + DB_MIGRATION_SKILL_QUALITY_SIGNALS, + ); expect(state.runs).toHaveLength(1); expect(state.runs[0].sessionIds).toHaveLength(2); + expectSuccessfulExtractionRun(state.runs[0]); expect(state.runs[0].skillsCreated.length).toBeGreaterThanOrEqual(1); expect(skillBodies.length).toBeGreaterThanOrEqual(1); expect(combinedSkills).toContain('npm run db:check'); expect(combinedSkills).toContain('npm run db:migrate'); expect(combinedSkills).toContain('npm run db:validate'); expect(combinedSkills).toMatch(/rollback/i); + expect( + quality.score, + `missing quality signals: ${quality.missing.join(', ')}`, + ).toBeGreaterThanOrEqual(4); // Verify the extraction agent activated skill-creator for design guidance. expect(config.getSkillManager().isSkillActive('skill-creator')).toBe( diff --git a/evals/statistics-helper.ts b/evals/statistics-helper.ts new file mode 100644 index 0000000000..dff3baeb19 --- /dev/null +++ b/evals/statistics-helper.ts @@ -0,0 +1,26 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +export function countMatchingIds( + items: T[], + expectedIds: string[], +): number { + const expected = new Set(expectedIds); + return items.filter((item) => expected.has(item.sessionId)).length; +} + +export function roundStat(value: number | null): number | null { + return value === null ? null : Number(value.toFixed(4)); +} + +export function average(values: number[]): number { + return values.reduce((sum, value) => sum + value, 0) / values.length; +} + +export function averageNullable(values: Array): number | null { + const numericValues = values.filter((value) => value !== null); + return numericValues.length === 0 ? null : average(numericValues); +} diff --git a/packages/core/src/agents/local-executor.ts b/packages/core/src/agents/local-executor.ts index 478521cd9e..ca856d8b8e 100644 --- a/packages/core/src/agents/local-executor.ts +++ b/packages/core/src/agents/local-executor.ts @@ -779,6 +779,8 @@ export class LocalAgentExecutor { return { result: finalResult || 'Task completed.', terminate_reason: terminateReason, + turn_count: turnCounter, + duration_ms: Date.now() - startTime, }; } @@ -786,6 +788,8 @@ export class LocalAgentExecutor { result: finalResult || 'Agent execution was terminated before completion.', terminate_reason: terminateReason, + turn_count: turnCounter, + duration_ms: Date.now() - startTime, }; } catch (error) { // Check if the error is an AbortError caused by our internal timeout. @@ -826,6 +830,8 @@ export class LocalAgentExecutor { return { result: finalResult, terminate_reason: terminateReason, + turn_count: turnCounter, + duration_ms: Date.now() - startTime, }; } } @@ -840,6 +846,8 @@ export class LocalAgentExecutor { return { result: finalResult, terminate_reason: terminateReason, + turn_count: turnCounter, + duration_ms: Date.now() - startTime, }; } diff --git a/packages/core/src/agents/skill-extraction-agent.test.ts b/packages/core/src/agents/skill-extraction-agent.test.ts index a67c7db270..280cbc33e3 100644 --- a/packages/core/src/agents/skill-extraction-agent.test.ts +++ b/packages/core/src/agents/skill-extraction-agent.test.ts @@ -74,12 +74,14 @@ describe('SkillExtractionAgent', () => { expect(query).toContain(existingSkillsSummary); expect(query).toContain(sessionIndex); + expect(query).toContain('optional workflow hint'); expect(query).toContain( - 'The summary is a user-intent summary, not a workflow summary.', + 'workflow hints alone is never enough evidence for a reusable skill.', ); expect(query).toContain( - 'The session summaries describe user intent, not workflow details.', + 'Session summaries describe user intent; optional workflow hints describe likely procedural traces.', ); + expect(query).toContain('Use workflow hints for routing'); expect(query).toContain( 'Only write a skill if the evidence shows a durable, recurring workflow', ); diff --git a/packages/core/src/agents/skill-extraction-agent.ts b/packages/core/src/agents/skill-extraction-agent.ts index 4aa18af388..eea2a4727d 100644 --- a/packages/core/src/agents/skill-extraction-agent.ts +++ b/packages/core/src/agents/skill-extraction-agent.ts @@ -303,10 +303,11 @@ export const SkillExtractionAgent = ( '# Session Index', '', 'Below is an index of past conversation sessions. Each line shows:', - '[NEW] or [old] status, a 1-line summary, message count, and the file path.', + '[NEW] or [old] status, a 1-line user-intent summary, optional workflow hint, message count, and the file path.', '', - 'The summary is a user-intent summary, not a workflow summary.', - 'Matching summary text alone is never enough evidence for a reusable skill.', + 'Some lines may include "| workflow: ..."; this is a compact workflow hint from session metadata.', + 'Use workflow hints to prioritize which sessions to read and to group likely recurring workflows.', + 'Matching summary text or workflow hints alone is never enough evidence for a reusable skill.', '', '[NEW] = not yet processed for skill extraction (focus on these)', '[old] = previously processed (read only if a [NEW] session hints at a repeated pattern)', @@ -326,7 +327,7 @@ export const SkillExtractionAgent = ( return { systemPrompt: buildSystemPrompt(skillsDir), - query: `${initialContext}\n\nAnalyze the session index above. The session summaries describe user intent, not workflow details. Read sessions that suggest repeated workflows using read_file. Only write a skill if the evidence shows a durable, recurring workflow or a stable recurring repo procedure. If recurrence or future reuse is unclear, create no skill and explain why.`, + query: `${initialContext}\n\nAnalyze the session index above. Session summaries describe user intent; optional workflow hints describe likely procedural traces. Use workflow hints for routing, then read sessions that suggest repeated workflows using read_file to verify recurrence from transcript evidence. Only write a skill if the evidence shows a durable, recurring workflow or a stable recurring repo procedure. If recurrence or future reuse is unclear, create no skill and explain why.`, }; }, runConfig: { diff --git a/packages/core/src/agents/types.ts b/packages/core/src/agents/types.ts index 6cf30bcfb4..732dec1809 100644 --- a/packages/core/src/agents/types.ts +++ b/packages/core/src/agents/types.ts @@ -36,6 +36,8 @@ export enum AgentTerminateMode { export interface OutputObject { result: string; terminate_reason: AgentTerminateMode; + turn_count?: number; + duration_ms?: number; } /** diff --git a/packages/core/src/services/chatRecordingService.ts b/packages/core/src/services/chatRecordingService.ts index b3cfb97527..5791184d15 100644 --- a/packages/core/src/services/chatRecordingService.ts +++ b/packages/core/src/services/chatRecordingService.ts @@ -112,6 +112,7 @@ export async function loadConversationRecord( userMessageCount?: number; firstUserMessage?: string; hasUserOrAssistantMessage?: boolean; + memoryScratchpadIsStale?: boolean; }) | null > { @@ -133,6 +134,8 @@ export async function loadConversationRecord( string, { isUser: boolean; isUserOrAssistant: boolean } >(); + let isTrackingMemoryScratchpadFreshness = false; + let memoryScratchpadIsStale = false; let firstUserMessageStr: string | undefined; for await (const line of rl) { @@ -140,6 +143,9 @@ export async function loadConversationRecord( try { const record = JSON.parse(line) as unknown; if (isRewindRecord(record)) { + if (isTrackingMemoryScratchpadFreshness) { + memoryScratchpadIsStale = true; + } const rewindId = record.$rewindTo; if (options?.metadataOnly) { const idx = messageIds.indexOf(rewindId); @@ -168,6 +174,9 @@ export async function loadConversationRecord( } } } else if (isMessageRecord(record)) { + if (isTrackingMemoryScratchpadFreshness) { + memoryScratchpadIsStale = true; + } const id = record.id; const isUser = hasProperty(record, 'type') && record.type === 'user'; const isUserOrAssistant = @@ -206,6 +215,12 @@ export async function loadConversationRecord( } } } else if (isMetadataUpdateRecord(record)) { + if (hasProperty(record.$set, 'memoryScratchpad')) { + isTrackingMemoryScratchpadFreshness = Boolean( + record.$set.memoryScratchpad, + ); + memoryScratchpadIsStale = false; + } // Metadata update metadata = { ...metadata, @@ -257,6 +272,7 @@ export async function loadConversationRecord( startTime: metadata.startTime || new Date().toISOString(), lastUpdated: metadata.lastUpdated || new Date().toISOString(), summary: metadata.summary, + memoryScratchpad: metadata.memoryScratchpad, directories: metadata.directories, kind: metadata.kind, messages: options?.metadataOnly ? [] : loadedMessages, @@ -267,6 +283,9 @@ export async function loadConversationRecord( options?.metadataOnly && metadataMessages.length > 0 ? metadataMessages.filter((m) => m.type === 'user').length : userMessageCount, + memoryScratchpadIsStale: isTrackingMemoryScratchpadFreshness + ? memoryScratchpadIsStale + : undefined, firstUserMessage: fallbackFirstUserMessage, hasUserOrAssistantMessage: options?.metadataOnly && metadataMessages.length > 0 @@ -332,6 +351,13 @@ export class ChatRecordingService { for (const msg of this.cachedConversation.messages) { this.appendRecord(msg); } + if (this.cachedConversation.memoryScratchpad) { + this.appendRecord({ + $set: { + memoryScratchpad: this.cachedConversation.memoryScratchpad, + }, + }); + } } // Update the session ID in the existing file diff --git a/packages/core/src/services/chatRecordingTypes.ts b/packages/core/src/services/chatRecordingTypes.ts index 2ddc218bdc..ae5dca8026 100644 --- a/packages/core/src/services/chatRecordingTypes.ts +++ b/packages/core/src/services/chatRecordingTypes.ts @@ -25,6 +25,19 @@ export interface TokensSummary { total: number; // totalTokenCount } +export type MemoryValidationStatus = 'passed' | 'failed' | 'unknown'; + +/** + * Lightweight workflow metadata attached to a session for memory extraction. + */ +export interface MemoryScratchpad { + version: 1; + workflowSummary?: string; + toolSequence?: string[]; + touchedPaths?: string[]; + validationStatus?: MemoryValidationStatus; +} + /** * Base fields common to all messages. */ @@ -83,6 +96,7 @@ export interface ConversationRecord { lastUpdated: string; messages: MessageRecord[]; summary?: string; + memoryScratchpad?: MemoryScratchpad; /** Workspace directories added during the session via /dir add */ directories?: string[]; /** The kind of conversation (main agent or subagent) */ @@ -120,6 +134,7 @@ export interface PartialMetadataRecord { startTime?: string; lastUpdated?: string; summary?: string; + memoryScratchpad?: MemoryScratchpad; directories?: string[]; kind?: 'main' | 'subagent'; } diff --git a/packages/core/src/services/memoryService.test.ts b/packages/core/src/services/memoryService.test.ts index f0b191667b..86a7885295 100644 --- a/packages/core/src/services/memoryService.test.ts +++ b/packages/core/src/services/memoryService.test.ts @@ -127,6 +127,7 @@ async function writeConversationJsonl( startTime: conversation.startTime, lastUpdated: conversation.lastUpdated, summary: conversation.summary, + memoryScratchpad: conversation.memoryScratchpad, directories: conversation.directories, kind: conversation.kind, }; @@ -565,7 +566,7 @@ describe('memoryService', () => { ); }); - it('records only sessions whose read_file calls succeed as processed', async () => { + it('records only sessions whose read_file completed successfully as processed', async () => { const { startMemoryService, readExtractionState } = await import( './memoryService.js' ); @@ -595,17 +596,69 @@ describe('memoryService', () => { messageCount: 20, lastUpdated: '2025-01-01T01:00:00Z', }); + const failedConversation = createConversation({ + sessionId: 'failed-session', + summary: 'read_file errors on this one', + messageCount: 20, + lastUpdated: '2025-01-03T01:00:00Z', + }); + const rejectedConversation = createConversation({ + sessionId: 'rejected-session', + summary: 'read_file was rejected for this one', + messageCount: 20, + lastUpdated: '2025-01-02T02:00:00Z', + }); + const mismatchedEndConversation = createConversation({ + sessionId: 'mismatched-end-session', + summary: 'read_file start with a mismatched tool end', + messageCount: 20, + lastUpdated: '2025-01-02T03:00:00Z', + }); + const mismatchedErrorConversation = createConversation({ + sessionId: 'mismatched-error-session', + summary: 'read_file recovers after a mismatched tool error', + messageCount: 20, + lastUpdated: '2025-01-02T04:00:00Z', + }); const openedPath = path.join( chatsDir, `${SESSION_FILE_PREFIX}2025-01-02T00-00-opened.jsonl`, ); - const skippedPath = path.join( + const failedPath = path.join( chatsDir, - `${SESSION_FILE_PREFIX}2025-01-01T00-00-skipped.jsonl`, + `${SESSION_FILE_PREFIX}2025-01-03T00-00-failed.jsonl`, + ); + const rejectedPath = path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-02T00-00-rejected.jsonl`, + ); + const mismatchedEndPath = path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-02T00-00-mismatched-end.jsonl`, + ); + const mismatchedErrorPath = path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-02T00-00-mismatched-error.jsonl`, ); await writeConversationJsonl(openedPath, openedConversation); - await writeConversationJsonl(skippedPath, skippedConversation); + await writeConversationJsonl(failedPath, failedConversation); + await writeConversationJsonl(rejectedPath, rejectedConversation); + await writeConversationJsonl( + mismatchedEndPath, + mismatchedEndConversation, + ); + await writeConversationJsonl( + mismatchedErrorPath, + mismatchedErrorConversation, + ); + await writeConversationJsonl( + path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-01T00-00-skipped.jsonl`, + ), + skippedConversation, + ); vi.mocked(LocalAgentExecutor.create).mockImplementationOnce( async (_definition, _context, onActivity) => @@ -624,21 +677,21 @@ describe('memoryService', () => { onActivity?.({ isSubagentActivityEvent: true, agentName: 'Skill Extractor', - type: 'TOOL_CALL_START', + type: 'TOOL_CALL_END', data: { name: 'read_file', - args: { file_path: skippedPath }, - callId: 'call-skipped', + id: 'call-opened', + data: {}, }, }); onActivity?.({ isSubagentActivityEvent: true, agentName: 'Skill Extractor', - type: 'ERROR', + type: 'TOOL_CALL_START', data: { name: 'read_file', - callId: 'call-skipped', - error: 'access denied', + args: { file_path: failedPath }, + callId: 'call-failed', }, }); onActivity?.({ @@ -647,8 +700,28 @@ describe('memoryService', () => { type: 'TOOL_CALL_END', data: { name: 'read_file', - id: 'call-opened', - data: { content: 'Read this one' }, + id: 'call-failed', + data: { isError: true }, + }, + }); + onActivity?.({ + isSubagentActivityEvent: true, + agentName: 'Skill Extractor', + type: 'TOOL_CALL_START', + data: { + name: 'read_file', + args: { file_path: rejectedPath }, + callId: 'call-rejected', + }, + }); + onActivity?.({ + isSubagentActivityEvent: true, + agentName: 'Skill Extractor', + type: 'ERROR', + data: { + name: 'read_file', + callId: 'call-rejected', + error: 'User rejected this operation.', }, }); onActivity?.({ @@ -661,6 +734,56 @@ describe('memoryService', () => { callId: 'call-unrelated', }, }); + onActivity?.({ + isSubagentActivityEvent: true, + agentName: 'Skill Extractor', + type: 'TOOL_CALL_START', + data: { + name: 'read_file', + args: { file_path: mismatchedEndPath }, + callId: 'call-mismatched-end', + }, + }); + onActivity?.({ + isSubagentActivityEvent: true, + agentName: 'Skill Extractor', + type: 'TOOL_CALL_END', + data: { + name: 'write_file', + id: 'call-mismatched-end', + data: {}, + }, + }); + onActivity?.({ + isSubagentActivityEvent: true, + agentName: 'Skill Extractor', + type: 'TOOL_CALL_START', + data: { + name: 'read_file', + args: { file_path: mismatchedErrorPath }, + callId: 'call-mismatched-error', + }, + }); + onActivity?.({ + isSubagentActivityEvent: true, + agentName: 'Skill Extractor', + type: 'ERROR', + data: { + name: 'write_file', + callId: 'call-mismatched-error', + error: 'Different tool failed.', + }, + }); + onActivity?.({ + isSubagentActivityEvent: true, + agentName: 'Skill Extractor', + type: 'TOOL_CALL_END', + data: { + name: 'read_file', + id: 'call-mismatched-error', + data: {}, + }, + }); return undefined; }), }) as never, @@ -691,6 +814,22 @@ describe('memoryService', () => { ); expect(state.runs).toHaveLength(1); expect(state.runs[0].candidateSessions).toEqual([ + { + sessionId: 'failed-session', + lastUpdated: '2025-01-03T01:00:00Z', + }, + { + sessionId: 'mismatched-error-session', + lastUpdated: '2025-01-02T04:00:00Z', + }, + { + sessionId: 'mismatched-end-session', + lastUpdated: '2025-01-02T03:00:00Z', + }, + { + sessionId: 'rejected-session', + lastUpdated: '2025-01-02T02:00:00Z', + }, { sessionId: 'opened-session', lastUpdated: '2025-01-02T01:00:00Z', @@ -701,12 +840,19 @@ describe('memoryService', () => { }, ]); expect(state.runs[0].processedSessions).toEqual([ + { + sessionId: 'mismatched-error-session', + lastUpdated: '2025-01-02T04:00:00Z', + }, { sessionId: 'opened-session', lastUpdated: '2025-01-02T01:00:00Z', }, ]); - expect(state.runs[0].sessionIds).toEqual(['opened-session']); + expect(state.runs[0].sessionIds).toEqual([ + 'mismatched-error-session', + 'opened-session', + ]); }); }); @@ -902,6 +1048,178 @@ describe('memoryService', () => { expect(result.sessionIndex).toContain(path.join(chatsDir, fileName)); }); + it('falls back to scratchpad workflow summary when summary is missing', async () => { + const { buildSessionIndex } = await import('./memoryService.js'); + + const conversation = createConversation({ + sessionId: 'scratchpad-only', + summary: undefined, + memoryScratchpad: { + version: 1, + workflowSummary: + 'read_file -> edit | paths packages/core/src/services/memoryService.ts | validated', + }, + messageCount: 20, + }); + await writeConversationJsonl( + path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-01T00-00-scratch01.jsonl`, + ), + conversation, + ); + + const result = await buildSessionIndex(chatsDir, { runs: [] }); + + expect(result.sessionIndex).toContain('read_file -> edit'); + expect(result.sessionIndex).not.toContain('(no summary)'); + }); + + it('ignores malformed scratchpad workflow summaries while indexing sessions', async () => { + const { buildSessionIndex } = await import('./memoryService.js'); + + const malformedConversation = createConversation({ + sessionId: 'malformed-scratchpad', + summary: undefined, + memoryScratchpad: { + version: 1, + workflowSummary: 123, + } as unknown as ConversationRecord['memoryScratchpad'], + messageCount: 20, + }); + await writeConversationJsonl( + path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-01T00-00-badpad.jsonl`, + ), + malformedConversation, + ); + + const validConversation = createConversation({ + sessionId: 'valid-session', + summary: 'Still indexes other sessions', + messageCount: 20, + }); + await writeConversationJsonl( + path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-01T00-00-valid.jsonl`, + ), + validConversation, + ); + + const result = await buildSessionIndex(chatsDir, { runs: [] }); + + expect(result.sessionIndex).toContain('(no summary)'); + expect(result.sessionIndex).toContain('Still indexes other sessions'); + expect(result.sessionIndex).not.toContain('123'); + }); + + it('appends workflow summary when both summary and scratchpad are present', async () => { + const { buildSessionIndex } = await import('./memoryService.js'); + + const conversation = createConversation({ + sessionId: 'summary-and-scratchpad', + summary: 'Fix session scanning', + memoryScratchpad: { + version: 1, + workflowSummary: + 'read_file -> edit | paths packages/core/src/services/sessionSummaryUtils.ts', + }, + messageCount: 20, + }); + await writeConversationJsonl( + path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-01T00-00-scratch02.jsonl`, + ), + conversation, + ); + + const result = await buildSessionIndex(chatsDir, { runs: [] }); + + expect(result.sessionIndex).toContain('Fix session scanning | workflow:'); + expect(result.sessionIndex).toContain('sessionSummaryUtils.ts'); + }); + + it('omits stale scratchpad workflow summaries from resumed JSONL sessions', async () => { + const { buildSessionIndex } = await import('./memoryService.js'); + + const conversation = createConversation({ + sessionId: 'stale-scratchpad', + summary: 'Resume memory work', + messageCount: 20, + lastUpdated: '2025-01-01T01:00:00Z', + }); + const filePath = path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-01T00-00-stale001.jsonl`, + ); + await writeConversationJsonl(filePath, conversation); + await fs.appendFile( + filePath, + `${JSON.stringify({ + $set: { + memoryScratchpad: { + version: 1, + workflowSummary: 'stale_workflow | paths stale.ts', + }, + }, + })}\n`, + ); + await fs.appendFile( + filePath, + [ + JSON.stringify({ + id: 'resumed-user-message', + timestamp: '2025-01-02T01:00:00Z', + type: 'user', + content: [{ text: 'Continue after the scratchpad was written' }], + }), + JSON.stringify({ + $set: { lastUpdated: '2025-01-02T01:00:01Z' }, + }), + ].join('\n') + '\n', + ); + + const result = await buildSessionIndex(chatsDir, { runs: [] }); + + expect(result.sessionIndex).toContain('Resume memory work'); + expect(result.sessionIndex).not.toContain('stale_workflow'); + expect(result.sessionIndex).not.toContain('stale.ts'); + }); + + it('sanitizes shell command workflow summaries before indexing sessions', async () => { + const { buildSessionIndex } = await import('./memoryService.js'); + + const conversation = createConversation({ + sessionId: 'raw-shell-scratchpad', + summary: 'Investigate API migration', + memoryScratchpad: { + version: 1, + workflowSummary: + 'run_shell_command: curl https://api.example.com -H "Authorization: Bearer sk-secret-token" -> read_file | paths package.json', + }, + messageCount: 20, + }); + await writeConversationJsonl( + path.join( + chatsDir, + `${SESSION_FILE_PREFIX}2025-01-01T00-00-shellraw.jsonl`, + ), + conversation, + ); + + const result = await buildSessionIndex(chatsDir, { runs: [] }); + + expect(result.sessionIndex).toContain( + 'workflow: run_shell_command: curl -> read_file | paths package.json', + ); + expect(result.sessionIndex).not.toContain('Authorization'); + expect(result.sessionIndex).not.toContain('sk-secret-token'); + expect(result.sessionIndex).not.toContain('https://api.example.com'); + }); + it('filters out subagent sessions', async () => { const { buildSessionIndex } = await import('./memoryService.js'); @@ -1176,6 +1494,9 @@ describe('memoryService', () => { }, ], skillsCreated: ['debug-helper', 'test-gen'], + turnCount: 4, + durationMs: 1875, + terminateReason: 'GOAL', }, ], }; @@ -1202,6 +1523,9 @@ describe('memoryService', () => { ]); expect(result.runs[0].sessionIds).toEqual(['s1']); expect(result.runs[0].runAt).toBe('2025-06-01T00:00:00Z'); + expect(result.runs[0].turnCount).toBe(4); + expect(result.runs[0].durationMs).toBe(1875); + expect(result.runs[0].terminateReason).toBe('GOAL'); }); it('writeExtractionState + readExtractionState roundtrips runs correctly', async () => { @@ -1235,11 +1559,17 @@ describe('memoryService', () => { }, ], skillsCreated: ['skill-x'], + turnCount: 3, + durationMs: 2400, + terminateReason: 'GOAL', }, { runAt: '2025-01-02T00:00:00Z', sessionIds: ['c'], skillsCreated: [], + turnCount: 1, + durationMs: 900, + terminateReason: 'GOAL', }, ]; const state: ExtractionState = { runs }; diff --git a/packages/core/src/services/memoryService.ts b/packages/core/src/services/memoryService.ts index 4fdb51e50b..5ea27ac38e 100644 --- a/packages/core/src/services/memoryService.ts +++ b/packages/core/src/services/memoryService.ts @@ -14,6 +14,7 @@ import { SESSION_FILE_PREFIX, loadConversationRecord, type ConversationRecord, + type MemoryScratchpad, } from './chatRecordingService.js'; import { debugLogger } from '../utils/debugLogger.js'; import { coreEvents } from '../utils/events.js'; @@ -22,7 +23,10 @@ import { FRONTMATTER_REGEX, parseFrontmatter } from '../skills/skillLoader.js'; import { LocalAgentExecutor } from '../agents/local-executor.js'; import { SkillExtractionAgent } from '../agents/skill-extraction-agent.js'; import { getModelConfigAlias } from '../agents/registry.js'; -import type { SubagentActivityEvent } from '../agents/types.js'; +import { + isToolActivityError, + type SubagentActivityEvent, +} from '../agents/types.js'; import { ExecutionLifecycleService } from './executionLifecycleService.js'; import { PromptRegistry } from '../prompts/prompt-registry.js'; import { ResourceRegistry } from '../resources/resource-registry.js'; @@ -36,6 +40,7 @@ import { applyParsedSkillPatches, hasParsedPatchHunks, } from './memoryPatchUtils.js'; +import { sanitizeWorkflowSummaryForScratchpad } from './sessionScratchpadUtils.js'; const LOCK_FILENAME = '.extraction.lock'; const STATE_FILENAME = '.extraction-state.json'; @@ -53,20 +58,6 @@ interface LockInfo { startedAt: string; } -function hasProperty( - obj: unknown, - prop: T, -): obj is { [key in T]: unknown } { - return obj !== null && typeof obj === 'object' && prop in obj; -} - -function isStringProperty( - obj: unknown, - prop: T, -): obj is { [key in T]: string } { - return hasProperty(obj, prop) && typeof obj[prop] === 'string'; -} - interface SessionVersion { sessionId: string; lastUpdated: string; @@ -75,6 +66,7 @@ interface SessionVersion { interface IndexedSession extends SessionVersion { filePath: string; summary?: string; + memoryScratchpad?: MemoryScratchpad; userMessageCount: number; } @@ -87,6 +79,9 @@ export interface ExtractionRun { candidateSessions?: SessionVersion[]; processedSessions?: SessionVersion[]; skillsCreated: string[]; + turnCount?: number; + durationMs?: number; + terminateReason?: string; } /** @@ -153,12 +148,25 @@ function normalizeStringArray(value: unknown): string[] { return value.filter((item): item is string => typeof item === 'string'); } +function normalizeOptionalNumber(value: unknown): number | undefined { + return typeof value === 'number' && Number.isFinite(value) + ? value + : undefined; +} + +function normalizeOptionalString(value: unknown): string | undefined { + return typeof value === 'string' ? value : undefined; +} + function isExtractionRunLike(value: unknown): value is { runAt: string; sessionIds?: unknown; candidateSessions?: unknown; processedSessions?: unknown; skillsCreated: unknown; + turnCount?: unknown; + durationMs?: unknown; + terminateReason?: unknown; } { return ( typeof value === 'object' && @@ -198,6 +206,9 @@ function buildExtractionRun(value: unknown): ExtractionRun | null { processedSessions: processedSessions.length > 0 ? processedSessions : undefined, skillsCreated: normalizeStringArray(value.skillsCreated), + turnCount: normalizeOptionalNumber(value.turnCount), + durationMs: normalizeOptionalNumber(value.durationMs), + terminateReason: normalizeOptionalString(value.terminateReason), }; } @@ -291,7 +302,7 @@ function shouldReplaceIndexedSession( return compareIndexedSessions(candidate, existing) < 0; } -function isReadFileStartActivity( +function isReadFileActivity( activity: SubagentActivityEvent, ): activity is SubagentActivityEvent & { data: { name: string; args?: { file_path?: unknown }; callId?: unknown }; @@ -302,11 +313,36 @@ function isReadFileStartActivity( ); } -function getResolvedReadFilePath( +function getReadFileCallId(activity: SubagentActivityEvent): string | null { + if (isReadFileActivity(activity)) { + const { callId } = activity.data; + return typeof callId === 'string' ? callId : null; + } + + if ( + activity.type === 'TOOL_CALL_END' && + activity.data['name'] === READ_FILE_TOOL_NAME + ) { + const id = activity.data['id']; + return typeof id === 'string' ? id : null; + } + + if ( + activity.type === 'ERROR' && + activity.data['name'] === READ_FILE_TOOL_NAME + ) { + const callId = activity.data['callId']; + return typeof callId === 'string' ? callId : null; + } + + return null; +} + +function getResolvedActivityFilePath( config: Config, activity: SubagentActivityEvent, ): string | null { - if (!isReadFileStartActivity(activity)) { + if (!isReadFileActivity(activity)) { return null; } @@ -320,48 +356,11 @@ function getResolvedReadFilePath( return null; } - return path.resolve(config.getTargetDir(), args.file_path); -} - -function getReadFileStartCallId( - activity: SubagentActivityEvent, -): string | null { - if ( - !isReadFileStartActivity(activity) || - !isStringProperty(activity.data, 'callId') - ) { - return null; - } - - return activity.data.callId; -} - -function getCompletedReadFileCallId( - activity: SubagentActivityEvent, -): string | null { - if ( - activity.type !== 'TOOL_CALL_END' || - activity.data['name'] !== READ_FILE_TOOL_NAME || - !isStringProperty(activity.data, 'id') - ) { - return null; - } - - return activity.data['id']; -} - -function getFailedReadFileCallId( - activity: SubagentActivityEvent, -): string | null { - if ( - activity.type !== 'ERROR' || - activity.data['name'] !== READ_FILE_TOOL_NAME || - !isStringProperty(activity.data, 'callId') - ) { - return null; - } - - return activity.data['callId']; + const targetDir = + 'getTargetDir' in config && typeof config.getTargetDir === 'function' + ? config.getTargetDir() + : process.cwd(); + return path.resolve(targetDir, args.file_path); } function getUserMessageCount( @@ -580,6 +579,10 @@ async function scanEligibleSessions( lastUpdated: conversation.lastUpdated, filePath, summary: conversation.summary, + memoryScratchpad: + conversation.memoryScratchpadIsStale === true + ? undefined + : conversation.memoryScratchpad, userMessageCount: getUserMessageCount(conversation), }; @@ -595,6 +598,28 @@ async function scanEligibleSessions( return Array.from(latestBySessionId.values()).sort(compareIndexedSessions); } +function formatSessionHeadline(session: IndexedSession): string { + const rawWorkflowSummary = session.memoryScratchpad?.workflowSummary; + const sanitizedWorkflowSummary = + typeof rawWorkflowSummary === 'string' + ? sanitizeWorkflowSummaryForScratchpad(rawWorkflowSummary) + : undefined; + const workflowSummary = sanitizedWorkflowSummary?.trim() + ? sanitizedWorkflowSummary + : undefined; + const summary = session.summary ?? workflowSummary ?? '(no summary)'; + + if ( + session.summary && + workflowSummary && + workflowSummary !== session.summary + ) { + return `${summary} | workflow: ${workflowSummary}`; + } + + return summary; +} + /** * Builds a session index for the extraction agent: a compact listing of all * eligible sessions with their summary, file path, and new/previously-processed status. @@ -651,8 +676,7 @@ export async function buildSessionIndex( const status = candidateSessionIds.has(getSessionVersionKey(session)) ? '[NEW]' : '[old]'; - const summary = session.summary ?? '(no summary)'; - return `${status} ${summary} (${session.userMessageCount} user msgs) — ${session.filePath}`; + return `${status} ${formatSessionHeadline(session)} (${session.userMessageCount} user msgs) — ${session.filePath}`; }, ); @@ -999,18 +1023,19 @@ export async function startMemoryService(config: Config): Promise { session, ]), ); + const pendingReadFileSessions = new Map(); const processedSessionKeys = new Set(); - const pendingReadFileSessions = new Map(); // Create and run the extraction agent const executor = await LocalAgentExecutor.create( agentDefinition, context, (activity) => { - const readFileCallId = getReadFileStartCallId(activity); - if (readFileCallId) { - const resolvedPath = getResolvedReadFilePath(config, activity); - if (!resolvedPath) { + const readFileCallId = getReadFileCallId(activity); + + if (activity.type === 'TOOL_CALL_START') { + const resolvedPath = getResolvedActivityFilePath(config, activity); + if (!resolvedPath || !readFileCallId) { return; } @@ -1019,35 +1044,31 @@ export async function startMemoryService(config: Config): Promise { return; } - pendingReadFileSessions.set( - readFileCallId, - getSessionVersionKey(session), - ); + pendingReadFileSessions.set(readFileCallId, session); return; } - const completedReadFileCallId = getCompletedReadFileCallId(activity); - if (completedReadFileCallId) { - const sessionKey = pendingReadFileSessions.get( - completedReadFileCallId, - ); - if (!sessionKey) { - return; - } - - processedSessionKeys.add(sessionKey); - pendingReadFileSessions.delete(completedReadFileCallId); + if (!readFileCallId) { return; } - const failedReadFileCallId = getFailedReadFileCallId(activity); - if (failedReadFileCallId) { - pendingReadFileSessions.delete(failedReadFileCallId); + const session = pendingReadFileSessions.get(readFileCallId); + if (!session) { + return; + } + + pendingReadFileSessions.delete(readFileCallId); + + if ( + activity.type === 'TOOL_CALL_END' && + !isToolActivityError(activity.data['data']) + ) { + processedSessionKeys.add(getSessionVersionKey(session)); } }, ); - await executor.run( + const executorResult = await executor.run( { request: 'Extract skills from the provided sessions.' }, abortController.signal, ); @@ -1107,6 +1128,11 @@ export async function startMemoryService(config: Config): Promise { })), processedSessions, skillsCreated, + turnCount: normalizeOptionalNumber(executorResult?.turn_count), + durationMs: normalizeOptionalNumber(executorResult?.duration_ms), + terminateReason: normalizeOptionalString( + executorResult?.terminate_reason, + ), }; const updatedState: ExtractionState = { runs: [...state.runs, run], diff --git a/packages/core/src/services/sessionScratchpadUtils.test.ts b/packages/core/src/services/sessionScratchpadUtils.test.ts new file mode 100644 index 0000000000..4137ded941 --- /dev/null +++ b/packages/core/src/services/sessionScratchpadUtils.test.ts @@ -0,0 +1,45 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect, it } from 'vitest'; +import { SHELL_TOOL_NAME } from '../tools/definitions/base-declarations.js'; +import { + sanitizeWorkflowSummaryForScratchpad, + summarizeShellCommandForScratchpad, +} from './sessionScratchpadUtils.js'; + +describe('sessionScratchpadUtils', () => { + describe('summarizeShellCommandForScratchpad', () => { + it('summarizes quoted and assignment-prefixed shell commands', () => { + expect(summarizeShellCommandForScratchpad('"npm" run test')).toBe('npm'); + expect( + summarizeShellCommandForScratchpad( + 'DATABASE_URL=postgres://user:password@example/db pnpm test', + ), + ).toBe('pnpm'); + }); + + it('handles adversarial unterminated quoted input without exposing arguments', () => { + const adversarialCommand = `"${'\\"!'.repeat(10_000)}`; + + expect(summarizeShellCommandForScratchpad(adversarialCommand)).toBe( + 'shell', + ); + }); + }); + + describe('sanitizeWorkflowSummaryForScratchpad', () => { + it('sanitizes adversarial shell commands in workflow summaries', () => { + const adversarialCommand = `"${'\\"!'.repeat(10_000)}`; + + expect( + sanitizeWorkflowSummaryForScratchpad( + `${SHELL_TOOL_NAME}: ${adversarialCommand} -> read_file`, + ), + ).toBe(`${SHELL_TOOL_NAME}: shell -> read_file`); + }); + }); +}); diff --git a/packages/core/src/services/sessionScratchpadUtils.ts b/packages/core/src/services/sessionScratchpadUtils.ts new file mode 100644 index 0000000000..bde7f22c40 --- /dev/null +++ b/packages/core/src/services/sessionScratchpadUtils.ts @@ -0,0 +1,155 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { SHELL_TOOL_NAME } from '../tools/definitions/base-declarations.js'; + +const WORKFLOW_PART_SEPARATOR = ' | '; +const TOOL_SEQUENCE_SEPARATOR = ' -> '; +const SHELL_ASSIGNMENT_REGEX = /^[A-Za-z_][A-Za-z0-9_]*=/; +const SAFE_COMMAND_NAME_REGEX = /^[A-Za-z0-9_.@+-]+$/; +const SAFE_TOOL_SEQUENCE_ENTRY_REGEX = /^[A-Za-z_][A-Za-z0-9_:.]*$/; + +function tokenizeShellCommand(command: string): string[] { + const tokens: string[] = []; + let currentToken = ''; + let quote: '"' | "'" | '`' | undefined; + + for (let i = 0; i < command.length; i++) { + const char = command[i]; + + if (quote) { + if (char === quote) { + quote = undefined; + continue; + } + + if (quote === '"' && char === '\\' && i + 1 < command.length) { + currentToken += command[i + 1]; + i++; + continue; + } + + currentToken += char; + continue; + } + + if (char === ' ' || char === '\t' || char === '\n' || char === '\r') { + if (currentToken) { + tokens.push(currentToken); + currentToken = ''; + } + continue; + } + + if (char === '"' || char === "'" || char === '`') { + quote = char; + continue; + } + + currentToken += char; + } + + if (currentToken) { + tokens.push(currentToken); + } + + return tokens; +} + +function getSafeCommandName(token: string): string | undefined { + if (!token || SHELL_ASSIGNMENT_REGEX.test(token)) { + return undefined; + } + + const pathParts = token.split(/[/\\]/).filter(Boolean); + const basename = pathParts[pathParts.length - 1] ?? token; + if (!basename || basename.includes('://')) { + return 'shell'; + } + + return SAFE_COMMAND_NAME_REGEX.test(basename) ? basename : 'shell'; +} + +export function summarizeShellCommandForScratchpad( + command: string, +): string | undefined { + const normalized = command.replace(/\s+/g, ' ').trim(); + if (normalized.length === 0) { + return undefined; + } + + for (const token of tokenizeShellCommand(normalized)) { + const commandName = getSafeCommandName(token); + if (commandName) { + return commandName; + } + } + + return undefined; +} + +function sanitizeWorkflowToolSequenceEntry(entry: string): string | undefined { + const trimmed = entry.trim(); + if (!trimmed) { + return undefined; + } + + const shellPrefix = `${SHELL_TOOL_NAME}:`; + if (trimmed.startsWith(shellPrefix)) { + const command = trimmed.slice(shellPrefix.length).trim(); + const commandSummary = summarizeShellCommandForScratchpad(command); + return commandSummary + ? `${SHELL_TOOL_NAME}: ${commandSummary}` + : SHELL_TOOL_NAME; + } + + if ( + trimmed === SHELL_TOOL_NAME || + SAFE_TOOL_SEQUENCE_ENTRY_REGEX.test(trimmed) + ) { + return trimmed; + } + + return undefined; +} + +export function sanitizeWorkflowSummaryForScratchpad(summary: string): string { + const normalized = summary.replace(/\s+/g, ' ').trim(); + if (!normalized.includes(`${SHELL_TOOL_NAME}:`)) { + return normalized; + } + + const sanitizedParts: string[] = []; + for (const part of normalized.split(WORKFLOW_PART_SEPARATOR)) { + const trimmed = part.trim(); + if (!trimmed) { + continue; + } + + if (trimmed.includes(`${SHELL_TOOL_NAME}:`)) { + const sanitizedToolSequence = trimmed + .split(TOOL_SEQUENCE_SEPARATOR) + .map(sanitizeWorkflowToolSequenceEntry) + .filter((entry): entry is string => Boolean(entry)); + if (sanitizedToolSequence.length > 0) { + sanitizedParts.push( + sanitizedToolSequence.join(TOOL_SEQUENCE_SEPARATOR), + ); + } + continue; + } + + if ( + trimmed.startsWith('paths ') || + trimmed === 'validated' || + trimmed === 'validation failed' + ) { + sanitizedParts.push(trimmed); + } + } + + return sanitizedParts.join(WORKFLOW_PART_SEPARATOR); +} diff --git a/packages/core/src/services/sessionSummaryUtils.test.ts b/packages/core/src/services/sessionSummaryUtils.test.ts index fa1a47a14f..815f2e8d68 100644 --- a/packages/core/src/services/sessionSummaryUtils.test.ts +++ b/packages/core/src/services/sessionSummaryUtils.test.ts @@ -9,6 +9,8 @@ import { generateSummary, getPreviousSession } from './sessionSummaryUtils.js'; import type { Config } from '../config/config.js'; import type { ContentGenerator } from '../core/contentGenerator.js'; import * as chatRecordingService from './chatRecordingService.js'; +import type { ConversationRecord } from './chatRecordingService.js'; +import { CoreToolCallStatus } from '../scheduler/types.js'; import * as fs from 'node:fs/promises'; import * as path from 'node:path'; import * as os from 'node:os'; @@ -37,25 +39,33 @@ vi.mock('./chatRecordingService.js', async () => { interface SessionFixture { summary?: string; + memoryScratchpad?: unknown; sessionId?: string; startTime?: string; lastUpdated?: string; + kind?: ConversationRecord['kind']; + messages?: ConversationRecord['messages']; userMessageCount: number; } function buildLegacySessionJson(fixture: SessionFixture): string { + const messages = + fixture.messages ?? + Array.from({ length: fixture.userMessageCount }, (_, i) => ({ + id: String(i + 1), + timestamp: '2024-01-01T00:00:00Z', + type: 'user', + content: [{ text: `Message ${i + 1}` }], + })); return JSON.stringify({ sessionId: fixture.sessionId ?? 'session-id', projectHash: 'abc123', startTime: fixture.startTime ?? '2024-01-01T00:00:00Z', lastUpdated: fixture.lastUpdated ?? '2024-01-01T00:00:00Z', summary: fixture.summary, - messages: Array.from({ length: fixture.userMessageCount }, (_, i) => ({ - id: String(i + 1), - timestamp: '2024-01-01T00:00:00Z', - type: 'user', - content: [{ text: `Message ${i + 1}` }], - })), + memoryScratchpad: fixture.memoryScratchpad, + ...(fixture.kind ? { kind: fixture.kind } : {}), + messages, }); } @@ -66,17 +76,22 @@ function buildJsonlSession(fixture: SessionFixture): string { startTime: fixture.startTime ?? '2024-01-01T00:00:00Z', lastUpdated: fixture.lastUpdated ?? '2024-01-01T00:00:00Z', ...(fixture.summary !== undefined ? { summary: fixture.summary } : {}), + ...(fixture.memoryScratchpad !== undefined + ? { memoryScratchpad: fixture.memoryScratchpad } + : {}), + ...(fixture.kind ? { kind: fixture.kind } : {}), }; + const messages = + fixture.messages ?? + Array.from({ length: fixture.userMessageCount }, (_, i) => ({ + id: String(i + 1), + timestamp: '2024-01-01T00:00:00Z', + type: 'user', + content: [{ text: `Message ${i + 1}` }], + })); const lines: string[] = [JSON.stringify(metadata)]; - for (let i = 0; i < fixture.userMessageCount; i++) { - lines.push( - JSON.stringify({ - id: String(i + 1), - timestamp: '2024-01-01T00:00:00Z', - type: 'user', - content: [{ text: `Message ${i + 1}` }], - }), - ); + for (const message of messages) { + lines.push(JSON.stringify(message)); } return lines.join('\n') + '\n'; } @@ -119,6 +134,7 @@ describe('sessionSummaryUtils', () => { mockConfig = { getContentGenerator: vi.fn().mockReturnValue(mockContentGenerator), + getProjectRoot: vi.fn().mockReturnValue(projectTempDir), getSessionId: vi.fn().mockReturnValue('current-session'), storage: { getProjectTempDir: vi.fn().mockReturnValue(projectTempDir), @@ -157,13 +173,50 @@ describe('sessionSummaryUtils', () => { expect(result).toBeNull(); }); - it('should return null if most recent session already has summary', async () => { + it('should return null if most recent session already has summary metadata', async () => { await writeSession( chatsDir, 'session-2024-01-01T10-00-abc12345.json', buildLegacySessionJson({ userMessageCount: 5, summary: 'Existing summary', + memoryScratchpad: { + version: 1, + workflowSummary: 'read_file -> edit', + }, + }), + ); + + const result = await getPreviousSession(mockConfig); + + expect(result).toBeNull(); + }); + + it('should return path if most recent session has summary but no scratchpad', async () => { + const filePath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-abc12345.json', + buildLegacySessionJson({ + userMessageCount: 5, + summary: 'Existing summary', + }), + ); + + const result = await getPreviousSession(mockConfig); + + expect(result).toBe(filePath); + }); + + it('should return null if most recent session has scratchpad but no summary', async () => { + await writeSession( + chatsDir, + 'session-2024-01-01T10-00-abc12345.json', + buildLegacySessionJson({ + userMessageCount: 5, + memoryScratchpad: { + version: 1, + workflowSummary: 'read_file -> edit', + }, }), ); @@ -302,6 +355,36 @@ describe('sessionSummaryUtils', () => { metadataOnly: true, }); }); + + it('should skip subagent sessions when backfilling scratchpads', async () => { + const mainPath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-main0001.jsonl', + buildJsonlSession({ + sessionId: 'main-session', + userMessageCount: 2, + lastUpdated: '2024-01-01T10:00:00Z', + summary: 'Main session summary', + }), + ); + await setSessionMtime(mainPath, '2024-01-01T10:00:00Z'); + + await writeSession( + chatsDir, + 'session-2024-01-02T10-00-sub00001.jsonl', + buildJsonlSession({ + sessionId: 'subagent-session', + userMessageCount: 2, + lastUpdated: '2024-01-02T10:00:00Z', + summary: 'Subagent summary', + kind: 'subagent', + }), + ); + + const result = await getPreviousSession(mockConfig); + + expect(result).toBe(mainPath); + }); }); describe('generateSummary', () => { @@ -324,6 +407,7 @@ describe('sessionSummaryUtils', () => { expect(mockGenerateSummary).toHaveBeenCalledTimes(1); const written = JSON.parse(await fs.readFile(filePath, 'utf-8')); expect(written.summary).toBe('Add dark mode to the app'); + expect(written.memoryScratchpad).toEqual({ version: 1 }); expect(written.lastUpdated).toBe(lastUpdated); }); @@ -356,10 +440,160 @@ describe('sessionSummaryUtils', () => { expect(lastRecord).toEqual({ $set: { summary: 'Add dark mode to the app', + memoryScratchpad: { + version: 1, + }, }, }); }); + it('should backfill scratchpad without regenerating summary', async () => { + const filePath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-backfill.jsonl', + buildJsonlSession({ + userMessageCount: 2, + summary: 'Existing summary', + }), + ); + + await generateSummary(mockConfig); + + expect(mockGenerateSummary).not.toHaveBeenCalled(); + const lines = (await fs.readFile(filePath, 'utf-8')) + .split('\n') + .filter(Boolean); + const lastRecord = JSON.parse(lines[lines.length - 1]); + expect(lastRecord).toEqual({ + $set: { + memoryScratchpad: { + version: 1, + }, + }, + }); + }); + + it('should not retry summary generation after writing a scratchpad fallback', async () => { + const filePath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-summary-fallback.jsonl', + buildJsonlSession({ + sessionId: 'summary-fallback-session', + userMessageCount: 2, + messages: [ + { + id: 'u1', + timestamp: '2024-01-01T00:00:00Z', + type: 'user', + content: [{ text: 'Read package metadata' }], + }, + { + id: 'g1', + timestamp: '2024-01-01T00:00:01Z', + type: 'gemini', + content: [{ text: 'Reading package.json' }], + toolCalls: [ + { + id: 'tool-1', + name: 'read_file', + args: { file_path: 'package.json' }, + status: CoreToolCallStatus.Success, + timestamp: '2024-01-01T00:00:01Z', + }, + ], + }, + { + id: 'u2', + timestamp: '2024-01-01T00:00:02Z', + type: 'user', + content: [{ text: 'Done' }], + }, + ], + }), + ); + mockGenerateSummary.mockResolvedValue(undefined); + + await generateSummary(mockConfig); + await generateSummary(mockConfig); + + expect(mockGenerateSummary).toHaveBeenCalledTimes(1); + const savedConversation = + await chatRecordingService.loadConversationRecord(filePath); + expect(savedConversation?.summary).toBeUndefined(); + expect(savedConversation?.memoryScratchpad).toEqual({ + version: 1, + workflowSummary: 'read_file | paths package.json', + toolSequence: ['read_file'], + touchedPaths: ['package.json'], + }); + }); + + it('should refresh stale scratchpads when messages were appended after metadata', async () => { + const filePath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-resumed1.jsonl', + buildJsonlSession({ + sessionId: 'resumed-session', + userMessageCount: 2, + summary: 'Existing summary', + lastUpdated: '2024-01-01T10:00:00Z', + }), + ); + await fs.appendFile( + filePath, + `${JSON.stringify({ + $set: { + memoryScratchpad: { + version: 1, + workflowSummary: 'read_file', + toolSequence: ['read_file'], + }, + }, + })}\n`, + ); + await fs.appendFile( + filePath, + [ + JSON.stringify({ + id: 'u-resumed', + timestamp: '2024-01-02T00:00:00Z', + type: 'user', + content: [{ text: 'Update src/app.ts' }], + }), + JSON.stringify({ + id: 'g-resumed', + timestamp: '2024-01-02T00:00:01Z', + type: 'gemini', + content: [{ text: 'Editing file' }], + toolCalls: [ + { + id: 'tool-resumed', + name: 'replace', + args: { file_path: 'src/app.ts' }, + status: CoreToolCallStatus.Success, + timestamp: '2024-01-02T00:00:01Z', + }, + ], + }), + JSON.stringify({ + $set: { lastUpdated: '2024-01-02T00:00:02Z' }, + }), + ].join('\n') + '\n', + ); + + await generateSummary(mockConfig); + + expect(mockGenerateSummary).not.toHaveBeenCalled(); + const savedConversation = + await chatRecordingService.loadConversationRecord(filePath); + expect(savedConversation?.memoryScratchpad).toEqual({ + version: 1, + workflowSummary: 'replace | paths src/app.ts', + toolSequence: ['replace'], + touchedPaths: ['src/app.ts'], + }); + }); + it('should preserve a newer JSONL lastUpdated written concurrently', async () => { const initialLastUpdated = '2024-01-01T10:00:00Z'; const newerLastUpdated = '2024-01-02T12:34:56Z'; @@ -411,6 +645,7 @@ describe('sessionSummaryUtils', () => { const savedConversation = await chatRecordingService.loadConversationRecord(filePath); expect(savedConversation?.summary).toBe('Add dark mode to the app'); + expect(savedConversation?.memoryScratchpad).toEqual({ version: 1 }); expect(savedConversation?.lastUpdated).toBe(newerLastUpdated); const lines = (await fs.readFile(filePath, 'utf-8')) @@ -420,6 +655,9 @@ describe('sessionSummaryUtils', () => { expect(lastRecord).toEqual({ $set: { summary: 'Add dark mode to the app', + memoryScratchpad: { + version: 1, + }, }, }); }); @@ -454,6 +692,9 @@ describe('sessionSummaryUtils', () => { expect(JSON.parse(previousLines[previousLines.length - 1])).toEqual({ $set: { summary: 'Add dark mode to the app', + memoryScratchpad: { + version: 1, + }, }, }); @@ -462,5 +703,312 @@ describe('sessionSummaryUtils', () => { .filter(Boolean); expect(currentLines).toHaveLength(2); }); + + it('should preserve repo-root file names in scratchpad touched paths', async () => { + const filePath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-rootpath.jsonl', + buildJsonlSession({ + sessionId: 'root-path-session', + userMessageCount: 2, + summary: 'Existing summary', + messages: [ + { + id: 'u1', + timestamp: '2024-01-01T00:00:00Z', + type: 'user', + content: [{ text: 'Inspect package.json' }], + }, + { + id: 'g1', + timestamp: '2024-01-01T00:00:01Z', + type: 'gemini', + content: [{ text: 'Reading files' }], + toolCalls: [ + { + id: 'tool-1', + name: 'read_file', + args: { file_path: 'package.json' }, + status: CoreToolCallStatus.Success, + timestamp: '2024-01-01T00:00:01Z', + }, + ], + }, + { + id: 'u2', + timestamp: '2024-01-01T00:00:02Z', + type: 'user', + content: [{ text: 'Done' }], + }, + ], + }), + ); + + await generateSummary(mockConfig); + + const savedConversation = + await chatRecordingService.loadConversationRecord(filePath); + expect(savedConversation?.memoryScratchpad).toEqual({ + version: 1, + workflowSummary: 'read_file | paths package.json', + toolSequence: ['read_file'], + touchedPaths: ['package.json'], + }); + }); + + it('should summarize shell commands without raw arguments in scratchpad tool sequence', async () => { + const filePath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-shellcmd.jsonl', + buildJsonlSession({ + sessionId: 'shell-command-session', + userMessageCount: 2, + summary: 'Existing summary', + messages: [ + { + id: 'u1', + timestamp: '2024-01-01T00:00:00Z', + type: 'user', + content: [{ text: 'Run the migration and regenerate docs' }], + }, + { + id: 'g1', + timestamp: '2024-01-01T00:00:01Z', + type: 'gemini', + content: [{ text: 'Running commands' }], + toolCalls: [ + { + id: 'tool-1', + name: 'run_shell_command', + args: { + command: + 'curl https://api.example.com -H "Authorization: Bearer sk-secret-token"', + }, + status: CoreToolCallStatus.Success, + timestamp: '2024-01-01T00:00:01Z', + }, + { + id: 'tool-2', + name: 'run_shell_command', + args: { + command: + 'DATABASE_URL=postgresql://user:password@localhost/db npm run migrate -- --name add-users', + }, + status: CoreToolCallStatus.Success, + timestamp: '2024-01-01T00:00:02Z', + }, + ], + }, + { + id: 'u2', + timestamp: '2024-01-01T00:00:03Z', + type: 'user', + content: [{ text: 'Done' }], + }, + ], + }), + ); + + await generateSummary(mockConfig); + + const savedConversation = + await chatRecordingService.loadConversationRecord(filePath); + expect(savedConversation?.memoryScratchpad).toEqual({ + version: 1, + workflowSummary: 'run_shell_command: curl -> run_shell_command: npm', + toolSequence: ['run_shell_command: curl', 'run_shell_command: npm'], + }); + expect( + savedConversation?.memoryScratchpad?.workflowSummary, + ).not.toContain('Authorization'); + expect( + savedConversation?.memoryScratchpad?.workflowSummary, + ).not.toContain('sk-secret-token'); + expect( + savedConversation?.memoryScratchpad?.workflowSummary, + ).not.toContain('password'); + expect( + savedConversation?.memoryScratchpad?.workflowSummary, + ).not.toContain('add-users'); + }); + + it('should not classify validation substrings as validation tools', async () => { + const filePath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-validation-substring.jsonl', + buildJsonlSession({ + sessionId: 'validation-substring-session', + userMessageCount: 2, + summary: 'Existing summary', + messages: [ + { + id: 'u1', + timestamp: '2024-01-01T00:00:00Z', + type: 'user', + content: [{ text: 'Run the contest helper' }], + }, + { + id: 'g1', + timestamp: '2024-01-01T00:00:01Z', + type: 'gemini', + content: [{ text: 'Running helper' }], + toolCalls: [ + { + id: 'tool-1', + name: 'contest_runner', + args: {}, + status: CoreToolCallStatus.Success, + timestamp: '2024-01-01T00:00:01Z', + }, + ], + }, + { + id: 'u2', + timestamp: '2024-01-01T00:00:02Z', + type: 'user', + content: [{ text: 'Done' }], + }, + ], + }), + ); + + await generateSummary(mockConfig); + + const savedConversation = + await chatRecordingService.loadConversationRecord(filePath); + expect(savedConversation?.memoryScratchpad).toEqual({ + version: 1, + workflowSummary: 'contest_runner', + toolSequence: ['contest_runner'], + }); + }); + + it('should cap nested path extraction depth', async () => { + const filePath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-deep-paths.jsonl', + buildJsonlSession({ + sessionId: 'deep-paths-session', + userMessageCount: 2, + summary: 'Existing summary', + messages: [ + { + id: 'u1', + timestamp: '2024-01-01T00:00:00Z', + type: 'user', + content: [{ text: 'Edit shallow and deeply nested files' }], + }, + { + id: 'g1', + timestamp: '2024-01-01T00:00:01Z', + type: 'gemini', + content: [{ text: 'Editing files' }], + toolCalls: [ + { + id: 'tool-1', + name: 'replace', + args: { + file_path: 'src/shallow.ts', + level1: { + level2: { + level3: { + level4: { + level5: { + level6: { + level7: { + file_path: 'src/deep.ts', + }, + }, + }, + }, + }, + }, + }, + }, + status: CoreToolCallStatus.Success, + timestamp: '2024-01-01T00:00:01Z', + }, + ], + }, + { + id: 'u2', + timestamp: '2024-01-01T00:00:02Z', + type: 'user', + content: [{ text: 'Done' }], + }, + ], + }), + ); + + await generateSummary(mockConfig); + + const savedConversation = + await chatRecordingService.loadConversationRecord(filePath); + expect(savedConversation?.memoryScratchpad).toEqual({ + version: 1, + workflowSummary: 'replace | paths src/shallow.ts', + toolSequence: ['replace'], + touchedPaths: ['src/shallow.ts'], + }); + }); + + it('should use the latest validation result in scratchpad metadata', async () => { + const filePath = await writeSession( + chatsDir, + 'session-2024-01-01T10-00-validation.jsonl', + buildJsonlSession({ + sessionId: 'validation-session', + userMessageCount: 2, + summary: 'Existing summary', + messages: [ + { + id: 'u1', + timestamp: '2024-01-01T00:00:00Z', + type: 'user', + content: [{ text: 'Fix the tests' }], + }, + { + id: 'g1', + timestamp: '2024-01-01T00:00:01Z', + type: 'gemini', + content: [{ text: 'Running tests' }], + toolCalls: [ + { + id: 'tool-1', + name: 'run_shell_command', + args: { command: 'npm test' }, + status: CoreToolCallStatus.Error, + timestamp: '2024-01-01T00:00:01Z', + }, + { + id: 'tool-2', + name: 'run_shell_command', + args: { command: 'npm test' }, + status: CoreToolCallStatus.Success, + timestamp: '2024-01-01T00:00:02Z', + }, + ], + }, + { + id: 'u2', + timestamp: '2024-01-01T00:00:03Z', + type: 'user', + content: [{ text: 'Done' }], + }, + ], + }), + ); + + await generateSummary(mockConfig); + + const savedConversation = + await chatRecordingService.loadConversationRecord(filePath); + expect(savedConversation?.memoryScratchpad).toEqual({ + version: 1, + workflowSummary: 'run_shell_command: npm | validated', + toolSequence: ['run_shell_command: npm'], + validationStatus: 'passed', + }); + }); }); }); diff --git a/packages/core/src/services/sessionSummaryUtils.ts b/packages/core/src/services/sessionSummaryUtils.ts index 592a0b42bf..ac336a3bc1 100644 --- a/packages/core/src/services/sessionSummaryUtils.ts +++ b/packages/core/src/services/sessionSummaryUtils.ts @@ -12,15 +12,29 @@ import { SESSION_FILE_PREFIX, loadConversationRecord, type ConversationRecord, + type MemoryScratchpad, + type ToolCallRecord, } from './chatRecordingService.js'; +import { CoreToolCallStatus } from '../scheduler/types.js'; +import { SHELL_TOOL_NAME } from '../tools/definitions/base-declarations.js'; +import { summarizeShellCommandForScratchpad } from './sessionScratchpadUtils.js'; import fs from 'node:fs/promises'; import path from 'node:path'; const MIN_MESSAGES_FOR_SUMMARY = 1; +const MAX_SCRATCHPAD_TOOLS = 6; +const MAX_SCRATCHPAD_PATHS = 4; +const MAX_SCRATCHPAD_PATH_DEPTH = 6; +const MAX_WORKFLOW_SUMMARY_LENGTH = 160; +const VALIDATION_COMMAND_REGEX = + /\b(test|tests|vitest|jest|pytest|cargo test|npm test|pnpm test|yarn test|bun test|lint|build|check|typecheck)\b/i; +const PATH_KEY_REGEX = /(path|file|dir|directory|cwd|root)/i; +const VALIDATION_TOOL_REGEX = /\b(test|lint|build|check|typecheck)\b/i; type LoadedSession = ConversationRecord & { messageCount?: number; userMessageCount?: number; + memoryScratchpadIsStale?: boolean; }; interface SessionFileCandidate { @@ -72,6 +86,238 @@ function getSessionTimestampMs(session: LoadedSession): number { return Number.isNaN(parsed) ? 0 : parsed; } +function normalizeToolName(name: string): string { + const trimmed = name.trim(); + return trimmed.length > 0 ? trimmed : 'unknown_tool'; +} + +function pushUniqueLimited( + target: string[], + value: string, + limit: number, +): void { + if (!value || target.includes(value) || target.length >= limit) { + return; + } + target.push(value); +} + +function normalizePathCandidate( + candidate: string, + projectRoot: string, +): string | null { + const trimmed = candidate.trim(); + if ( + trimmed.length === 0 || + trimmed.length > 240 || + trimmed.includes('\n') || + (!trimmed.includes('/') && + !trimmed.includes('\\') && + !trimmed.startsWith('.') && + path.extname(trimmed).length === 0) + ) { + return null; + } + + let normalized = trimmed.replace(/\\/g, '/'); + if (path.isAbsolute(trimmed)) { + const relative = path.relative(projectRoot, trimmed); + normalized = + relative && !relative.startsWith('..') && !path.isAbsolute(relative) + ? relative.replace(/\\/g, '/') + : path.basename(trimmed); + } + + if (normalized.length > 120) { + normalized = normalized.split('/').slice(-3).join('/'); + } + + return normalized.length > 0 ? normalized : null; +} + +function collectPathsFromValue( + value: unknown, + projectRoot: string, + paths: string[], + keyHint?: string, + depth = 0, +): void { + if ( + paths.length >= MAX_SCRATCHPAD_PATHS || + depth > MAX_SCRATCHPAD_PATH_DEPTH + ) { + return; + } + + if (typeof value === 'string') { + if (!keyHint || !PATH_KEY_REGEX.test(keyHint)) { + return; + } + + const normalized = normalizePathCandidate(value, projectRoot); + if (normalized) { + pushUniqueLimited(paths, normalized, MAX_SCRATCHPAD_PATHS); + } + return; + } + + if (Array.isArray(value)) { + for (const item of value) { + collectPathsFromValue(item, projectRoot, paths, keyHint, depth + 1); + if (paths.length >= MAX_SCRATCHPAD_PATHS) { + return; + } + } + return; + } + + if (typeof value !== 'object' || value === null) { + return; + } + + for (const [key, nestedValue] of Object.entries(value)) { + collectPathsFromValue(nestedValue, projectRoot, paths, key, depth + 1); + if (paths.length >= MAX_SCRATCHPAD_PATHS) { + return; + } + } +} + +function getToolCallCommand(toolCall: ToolCallRecord): string | undefined { + for (const key of ['command', 'cmd', 'script']) { + const value = toolCall.args[key]; + if (typeof value === 'string' && value.trim().length > 0) { + return value; + } + } + return undefined; +} + +function getToolSequenceEntry(toolCall: ToolCallRecord): string { + const toolName = normalizeToolName(toolCall.name); + if (toolName !== SHELL_TOOL_NAME) { + return toolName; + } + + const command = getToolCallCommand(toolCall); + const commandSummary = command + ? summarizeShellCommandForScratchpad(command) + : undefined; + return commandSummary ? `${toolName}: ${commandSummary}` : toolName; +} + +function getValidationStatusForToolCall( + toolCall: ToolCallRecord, +): MemoryScratchpad['validationStatus'] | undefined { + const command = getToolCallCommand(toolCall); + const isValidationTool = + VALIDATION_TOOL_REGEX.test(toolCall.name) || + (command ? VALIDATION_COMMAND_REGEX.test(command) : false); + if (!isValidationTool) { + return undefined; + } + + if (toolCall.status === CoreToolCallStatus.Success) { + return 'passed'; + } + if ( + toolCall.status === CoreToolCallStatus.Error || + toolCall.status === CoreToolCallStatus.Cancelled + ) { + return 'failed'; + } + return 'unknown'; +} + +function buildWorkflowSummary( + toolSequence: string[], + touchedPaths: string[], + validationStatus?: MemoryScratchpad['validationStatus'], +): string | undefined { + const parts: string[] = []; + + if (toolSequence.length > 0) { + parts.push(toolSequence.join(' -> ')); + } + if (touchedPaths.length > 0) { + parts.push(`paths ${touchedPaths.join(', ')}`); + } + if (validationStatus === 'passed') { + parts.push('validated'); + } else if (validationStatus === 'failed') { + parts.push('validation failed'); + } + + if (parts.length === 0) { + return undefined; + } + + const summary = parts.join(' | '); + if (summary.length === 0) { + return undefined; + } + return summary.length > MAX_WORKFLOW_SUMMARY_LENGTH + ? `${summary.slice(0, MAX_WORKFLOW_SUMMARY_LENGTH - 3)}...` + : summary; +} + +function buildMemoryScratchpad( + messages: ConversationRecord['messages'], + projectRoot: string, +): MemoryScratchpad { + const toolSequence: string[] = []; + const touchedPaths: string[] = []; + let validationStatus: MemoryScratchpad['validationStatus']; + + for (const message of messages) { + if (message.type !== 'gemini' || !message.toolCalls) { + continue; + } + + for (const toolCall of message.toolCalls) { + pushUniqueLimited( + toolSequence, + getToolSequenceEntry(toolCall), + MAX_SCRATCHPAD_TOOLS, + ); + collectPathsFromValue(toolCall.args, projectRoot, touchedPaths); + + const toolValidationStatus = getValidationStatusForToolCall(toolCall); + if (toolValidationStatus) { + validationStatus = toolValidationStatus; + } + } + } + + const workflowSummary = buildWorkflowSummary( + toolSequence, + touchedPaths, + validationStatus, + ); + + return { + version: 1, + ...(workflowSummary ? { workflowSummary } : {}), + ...(toolSequence.length > 0 ? { toolSequence } : {}), + ...(touchedPaths.length > 0 ? { touchedPaths } : {}), + ...(validationStatus ? { validationStatus } : {}), + }; +} + +function hasCurrentMemoryScratchpad(session: LoadedSession): boolean { + return Boolean( + session.memoryScratchpad && session.memoryScratchpadIsStale !== true, + ); +} + +function hasSessionSummaryMetadata(session: LoadedSession): boolean { + return hasCurrentMemoryScratchpad(session); +} + +function getLoadedMessageCount(session: LoadedSession): number { + return session.messageCount ?? session.messages.length; +} + /** * Generates and saves a summary for a session file. */ @@ -85,10 +331,11 @@ async function generateAndSaveSummary( return; } - // Skip if summary already exists - if (conversation.summary) { + // Skip if workflow metadata already exists; memory extraction can use the + // scratchpad even when summary generation was unavailable. + if (hasSessionSummaryMetadata(conversation)) { debugLogger.debug( - `[SessionSummary] Summary already exists for ${sessionPath}, skipping`, + `[SessionSummary] Summary metadata already exists for ${sessionPath}, skipping`, ); return; } @@ -101,29 +348,31 @@ async function generateAndSaveSummary( return; } - // Create summary service - const contentGenerator = config.getContentGenerator(); - if (!contentGenerator) { - debugLogger.debug( - '[SessionSummary] Content generator not available, skipping summary generation', - ); - return; - } - const baseLlmClient = new BaseLlmClient(contentGenerator, config); - const summaryService = new SessionSummaryService(baseLlmClient); - - // Generate summary - const summary = await summaryService.generateSummary({ - messages: conversation.messages, - }); - + let summary = conversation.summary; if (!summary) { - debugLogger.warn( - `[SessionSummary] Failed to generate summary for ${sessionPath}`, - ); - return; + const contentGenerator = config.getContentGenerator(); + if (!contentGenerator) { + debugLogger.debug( + '[SessionSummary] Content generator not available, skipping summary generation', + ); + } else { + const baseLlmClient = new BaseLlmClient(contentGenerator, config); + const summaryService = new SessionSummaryService(baseLlmClient); + summary = + (await summaryService.generateSummary({ + messages: conversation.messages, + })) ?? undefined; + + if (!summary) { + debugLogger.warn( + `[SessionSummary] Failed to generate summary for ${sessionPath}`, + ); + } + } } + let scratchpadSourceConversation = conversation; + // Re-read the file before writing to handle race conditions. For JSONL we // only need the metadata; for legacy JSON we need the full record so we can // round-trip the messages back to disk. @@ -136,18 +385,53 @@ async function generateAndSaveSummary( return; } - // Check if summary was added by another process - if (freshConversation.summary) { + // Check if summary metadata was added by another process + if (hasSessionSummaryMetadata(freshConversation)) { debugLogger.debug( - `[SessionSummary] Summary was added by another process for ${sessionPath}`, + `[SessionSummary] Summary metadata was added by another process for ${sessionPath}`, ); return; } + if ( + !hasCurrentMemoryScratchpad(freshConversation) && + (getLoadedMessageCount(freshConversation) !== + getLoadedMessageCount(conversation) || + freshConversation.lastUpdated !== conversation.lastUpdated) + ) { + const latestConversation = await loadConversationRecord(sessionPath); + if (!latestConversation) { + debugLogger.debug(`[SessionSummary] Could not re-read ${sessionPath}`); + return; + } + if (hasSessionSummaryMetadata(latestConversation)) { + debugLogger.debug( + `[SessionSummary] Summary metadata was added by another process for ${sessionPath}`, + ); + return; + } + scratchpadSourceConversation = latestConversation; + } + + const metadataUpdate: Partial = {}; + if (!freshConversation.summary && summary) { + metadataUpdate.summary = summary; + } + if (!hasCurrentMemoryScratchpad(freshConversation)) { + metadataUpdate.memoryScratchpad = buildMemoryScratchpad( + scratchpadSourceConversation.messages, + config.getProjectRoot(), + ); + } + + if (Object.keys(metadataUpdate).length === 0) { + return; + } + if (isJsonl) { await fs.appendFile( sessionPath, - `${JSON.stringify({ $set: { summary } })}\n`, + `${JSON.stringify({ $set: metadataUpdate })}\n`, ); } else { const lastUpdated = freshConversation.lastUpdated; @@ -156,7 +440,7 @@ async function generateAndSaveSummary( JSON.stringify( { ...freshConversation, - summary, + ...metadataUpdate, lastUpdated, }, null, @@ -165,13 +449,13 @@ async function generateAndSaveSummary( ); } debugLogger.debug( - `[SessionSummary] Saved summary for ${sessionPath}: "${summary}"`, + `[SessionSummary] Saved summary metadata for ${sessionPath}${summary ? `: "${summary}"` : ''}`, ); } /** - * Finds the most recently updated previous session that still needs a summary. - * Returns the path if it needs a summary, null otherwise. + * Finds the most recently updated previous session that still needs workflow metadata. + * Returns the path if it needs a scratchpad, null otherwise. */ export async function getPreviousSession( config: Config, @@ -217,7 +501,8 @@ export async function getPreviousSession( }); if (!conversation) continue; if (conversation.sessionId === config.getSessionId()) continue; - if (conversation.summary) continue; + if (conversation.kind === 'subagent') continue; + if (hasSessionSummaryMetadata(conversation)) continue; // Only generate summaries for sessions with more than 1 user message. // `loadConversationRecord` populates `userMessageCount` in metadataOnly @@ -264,7 +549,7 @@ export async function getPreviousSession( } /** - * Generates summary for the previous session if it lacks one. + * Generates summary metadata for the previous session if it lacks a scratchpad. * This is designed to be called fire-and-forget on startup. */ export async function generateSummary(config: Config): Promise {