feat(simulator): implement episodic memory with background compression

- Add session_notes field to simulator JSON response schema.
- Capture and persist factual session notes in sessionMemory.
- Inject episodic memory into system prompt for off-screen context.
- Add background LLM process to compress memory once it reaches 5 items.
- Add unit tests for note capture, prompt injection, and compression merge.
2026-05-20 08:42:39 -07:00 · 2026-05-04 03:46:28 +00:00
parent 6f48261306
commit 44744334c6
2 changed files with 197 additions and 3 deletions
@@ -228,4 +228,116 @@ describe('UserSimulator', () => {
    simulator.stop();
    vi.useRealTimers();
  });
+
+  // Verifies that a session_notes value returned on one tick is listed, numbered,
+  // under the session-memory heading of the prompt sent on the following tick.
+  it('should capture session notes and inject them into subsequent prompts', async () => {
+    const simulator = new UserSimulator(
+      mockConfig,
+      mockGetScreen,
+      mockStdinBuffer,
+    );
+    mockGetScreen.mockReturnValue('> Prompt 1');
+    // The first response carries a session note alongside the action.
+    mockContentGenerator.generateContent.mockResolvedValueOnce({
+      text: JSON.stringify({
+        action: 'ls\r',
+        session_notes: 'I listed the directory contents.',
+      }),
+    });
+
+    vi.useFakeTimers();
+    simulator.start();
+
+    // First tick: captures note
+    await vi.advanceTimersByTimeAsync(2000);
+    expect(mockContentGenerator.generateContent).toHaveBeenCalledTimes(1);
+
+    // Second tick: different screen to avoid skip
+    mockGetScreen.mockReturnValue('> Prompt 2');
+    // The second response has no session_notes; only the first note should be in memory.
+    mockContentGenerator.generateContent.mockResolvedValueOnce({
+      text: JSON.stringify({ action: 'pwd\r' }),
+    });
+    await vi.advanceTimersByTimeAsync(2000);
+
+    expect(mockContentGenerator.generateContent).toHaveBeenCalledTimes(2);
+    // Pull the prompt text out of the request passed to the second generateContent call.
+    const secondCall = mockContentGenerator.generateContent.mock.calls[1];
+    const prompt = secondCall[0].contents[0].parts[0].text;
+
+    // The memory section must be present and contain the note as item 1.
+    expect(prompt).toContain(
+      "Your Session Memory (Key facts you've recorded):",
+    );
+    expect(prompt).toContain('1. I listed the directory contents.');
+
+    simulator.stop();
+    vi.useRealTimers();
+  });
+
+  // Verifies that recording five session notes issues a 'simulator-compression'
+  // request and that a later prompt lists the compressed summary as item 1.
+  it('should trigger background compression when memory exceeds threshold and merge correctly', async () => {
+    const simulator = new UserSimulator(
+      mockConfig,
+      mockGetScreen,
+      mockStdinBuffer,
+    );
+
+    // sessionMemory is private, so rather than seeding it through a cast we
+    // drive five ticks that each return a note (Note 0 through Note 4).
+    vi.useFakeTimers();
+    simulator.start();
+
+    for (let i = 0; i < 5; i++) {
+      // A distinct screen per iteration, with one queued response carrying a note.
+      mockGetScreen.mockReturnValue(`> Prompt ${i}`);
+      mockContentGenerator.generateContent.mockResolvedValueOnce({
+        text: JSON.stringify({
+          action: 'wait\r',
+          session_notes: `Note ${i}`,
+        }),
+      });
+      await vi.advanceTimersByTimeAsync(2000);
+    }
+
+    // NOTE(review): this count covers every generateContent call on the mock,
+    // including any compression request — confirm it still holds if the
+    // compression request is issued within the fifth tick.
+    expect(mockContentGenerator.generateContent).toHaveBeenCalledTimes(5);
+
+    // With five notes recorded, compression should already have been requested.
+    // Compression requests are identified by the 'simulator-compression' prompt
+    // ID passed as the second argument to generateContent.
+    const compressionCall =
+      mockContentGenerator.generateContent.mock.calls.find(
+        (call) => call[1] === 'simulator-compression',
+      );
+    expect(compressionCall).toBeDefined();
+    expect(compressionCall[0].contents[0].parts[0].text).toContain(
+      'Summarize the following chronological session notes',
+    );
+
+    // From here on, answer compression requests with a fixed summary and every
+    // other request with a plain action.
+    // NOTE(review): the compression request asserted above was issued before
+    // this override is installed, so its response comes from whatever mock was
+    // active at that time (presumably the beforeEach default) — confirm that the
+    // summary asserted below is really produced by this implementation.
+    mockContentGenerator.generateContent.mockImplementation(async (req, id) => {
+      if (id === 'simulator-compression') {
+        return { text: 'Compressed Summary' };
+      }
+      return { text: JSON.stringify({ action: 'y\r' }) };
+    });
+
+    // Advance time to allow the background task to complete
+    await vi.advanceTimersByTimeAsync(1000);
+
+    // Trigger one more tick to see if the compressed memory is used
+    mockGetScreen.mockReturnValue('> Final Prompt');
+    await vi.advanceTimersByTimeAsync(2000);
+
+    // Locate the simulator prompt that was built from the final screen contents.
+    const finalCall = mockContentGenerator.generateContent.mock.calls.find(
+      (call) =>
+        call[0].contents[0].parts[0].text.includes('> Final Prompt') &&
+        call[1] === 'simulator-prompt',
+    );
+
+    const finalPrompt = finalCall[0].contents[0].parts[0].text;
+    expect(finalPrompt).toContain('1. Compressed Summary');
+    // Notes recorded after the compression snapshot may follow the summary,
+    // depending on timing; only the summary's presence as item 1 is asserted.
+
+    simulator.stop();
+    vi.useRealTimers();
+  });
 });
@@ -23,6 +23,7 @@ interface SimulatorResponse {
  thought?: string;
  used_knowledge?: boolean;
  new_rule?: string;
+  session_notes?: string;
 }

 export class UserSimulator {
@@ -30,13 +31,15 @@ export class UserSimulator {
  private timer: NodeJS.Timeout | null = null;
  private lastStateKey = '';
  private isProcessing = false;
+  private isCompressingMemory = false;
+  private staleCycleCount = 0;
  private interactionsFile: string | null = null;

  private knowledgeBase = '';
  private editableKnowledgeFile: string | null = null;
  private actionHistory: string[] = [];
+  private sessionMemory: string[] = [];
  private pendingToolCalls: ToolCall[] = [];
-  private staleCycleCount = 0;
  private messageBusHandler: ((msg: ToolCallsUpdateMessage) => void) | null =
    null;

@@ -189,6 +192,12 @@ export class UserSimulator {
 Ignore any 'Responding' indicators, spinners, or timers. You MUST provide a response (e.g., 'y\\r', '2\\r') to unblock the tool execution NOW.\n`
          : '';

+      const sessionInstruction =
+        this.sessionMemory.length > 0
+          ? `\nYour Session Memory (Key facts you've recorded):
+${this.sessionMemory.map((m, i) => `${i + 1}. ${m}`).join('\n')}\n`
+          : '';
+
      const prompt = `You are evaluating a CLI agent by simulating a user sitting at the terminal.
 Look carefully at the screen and determine the CLI's current state:

@@ -211,15 +220,17 @@ CRITICAL RULES:
 - RULE 1: If there is a clear confirmation prompt (e.g. "[Y/n]", "1) Allow Once") or an input cursor (">"), YOU MUST RESPOND (State 2 or 3). Detect these states aggressively. Only <WAIT> (Rule 1 fallback) if the agent is truly mid-process with no interactive markers visible.
 - RULE 2: If there is an "Action Required" or confirmation prompt on the screen, YOU MUST HANDLE IT (State 2). This takes precedence over everything else.
 - RULE 3: If prompted to allow execution of a command with options like 'Allow once' and 'Allow for this session', you MUST choose the option for 'Allow for this session' (typically by sending '2\\r').
- RULE 4: You MUST output a strictly formatted JSON object with no markdown wrappers or extra text.
+- RULE 4: Use the "session_notes" field to record important facts that are scrolling off the screen (e.g., test results, proposed plans, file names, errors). Keep notes extremely brief. DO NOT record transient states like "Agent is thinking". This memory helps you maintain context across the session.
+- RULE 5: You MUST output a strictly formatted JSON object with no markdown wrappers or extra text.

 JSON FORMAT:
 {
  "action": "<The exact raw characters to send, <WAIT>, or <DONE>>",
+  "session_notes": "<Brief factual note to remember for future turns, if applicable>",
  "used_knowledge": <true if you used the User Knowledge Base below to answer this prompt, false otherwise>,
  "new_rule": "<If used_knowledge is false and action is not <WAIT> or <DONE>, formulate a single, clear, reusable one-line rule combining the question and your answer without using option numbers (e.g. 1, 2) that might change. For example: 'If asked to allow pip execution, always allow it.' or 'Automatically accept edits for snake game implementation.'>"
 }
-${goalInstruction}${knowledgeInstruction}${historyInstruction}${pendingToolInstruction}
+${goalInstruction}${knowledgeInstruction}${sessionInstruction}${historyInstruction}${pendingToolInstruction}

 Here is the current terminal screen output:

@@ -271,6 +282,16 @@ ${strippedScreen}
        // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
        parsedJson = JSON.parse(cleanJson) as SimulatorResponse;
        responseText = parsedJson.action || '';
+
+        if (parsedJson.session_notes) {
+          this.sessionMemory.push(parsedJson.session_notes);
+          if (this.interactionsFile) {
+            fs.appendFileSync(
+              this.interactionsFile,
+              `[LOG] [SIMULATOR] Recorded session note: ${JSON.stringify(parsedJson.session_notes)}\n\n`,
+            );
+          }
+        }
      } catch (err) {
        debugLogger.error('Failed to parse simulator response as JSON', err);
        const text = (response.text || '').trim();
@@ -394,10 +415,71 @@ ${strippedScreen}
          );
        }
      }
+
+      if (this.sessionMemory.length >= 5 && !this.isCompressingMemory) {
+        // Trigger background compression (do not await)
+        this.compressMemory().catch((err) => {
+          debugLogger.error('Failed to compress simulator memory', err);
+        });
+      }
    } catch (e: unknown) {
      debugLogger.error('UserSimulator tick failed', e);
    } finally {
      this.isProcessing = false;
    }
  }
+
+  /**
+   * Condenses the accumulated session notes into a single summary entry by
+   * asking the content generator to summarize them.
+   *
+   * `isCompressingMemory` is set for the duration of the call and is always
+   * cleared on exit. Memory is left untouched when no content generator is
+   * available or when the response text is empty. Errors from the generator
+   * or from writing the interactions file are not caught here; they propagate
+   * to the caller after the flag has been cleared.
+   */
+  private async compressMemory() {
+    this.isCompressingMemory = true;
+    try {
+      const contentGenerator = this.config.getContentGenerator();
+      if (!contentGenerator) return;
+
+      // Snapshot the notes being summarized so that notes pushed while the
+      // request is in flight can be told apart afterwards.
+      const memoryToCompress = [...this.sessionMemory];
+      const prompt = `Summarize the following chronological session notes into a single, concise list of key facts, preserving specific technical details like file paths, proposed plans, and test results. Drop transient or obsolete observations.
+Notes:
+${memoryToCompress.map((m, i) => `${i + 1}. ${m}`).join('\n')}`;
+
+      // Resolve the model starting from PREVIEW_GEMINI_FLASH_MODEL. Preview
+      // access defaults to true when the config does not provide
+      // getHasAccessToPreviewModel or it returns null/undefined.
+      const model = resolveModel(
+        PREVIEW_GEMINI_FLASH_MODEL,
+        false, // useGemini3_1
+        false, // useGemini3_1FlashLite
+        false, // useCustomToolModel
+        this.config.getHasAccessToPreviewModel?.() ?? true,
+        this.config,
+      );
+
+      // Single-turn request tagged with the 'simulator-compression' prompt ID.
+      const response = await contentGenerator.generateContent(
+        {
+          model,
+          contents: [
+            {
+              role: 'user',
+              parts: [{ text: prompt }],
+            },
+          ],
+        },
+        'simulator-compression',
+        LlmRole.UTILITY_SIMULATOR,
+      );
+
+      // A missing or whitespace-only response is ignored and memory is kept as is.
+      const summary = response.text?.trim();
+      if (summary) {
+        debugLogger.log(`[SIMULATOR] Memory compressed. Summary: ${summary}`);
+        if (this.interactionsFile) {
+          fs.appendFileSync(
+            this.interactionsFile,
+            `[LOG] [SIMULATOR] Memory compressed. Summary: ${summary}\n\n`,
+          );
+        }
+
+        // Replace the older items with the new summary string, while preserving any new notes
+        // that arrived while the compression was running.
+        // Everything past the snapshot length is treated as new; this assumes
+        // notes are only ever appended while compression runs (TODO confirm no
+        // other code reorders or removes entries).
+        const newNotes = this.sessionMemory.slice(memoryToCompress.length);
+        this.sessionMemory = [summary, ...newNotes];
+      }
+    } finally {
+      this.isCompressingMemory = false;
+    }
+  }
 }