Improving memory tool instructions and eval testing (#18091)

2026-04-25 20:44:46 -07:00 · 2026-02-05 10:07:47 -08:00
parent 4a6e3eb646
commit 5b9ea35b63
12 changed files with 538 additions and 321 deletions
@@ -5,7 +5,12 @@
 */

 import { describe, it, expect, beforeEach, afterEach } from 'vitest';
-import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
+import {
+  TestRig,
+  printDebugInfo,
+  assertModelHasOutput,
+  checkModelOutputContent,
+} from './test-helper.js';
 import { getShellConfiguration } from '../packages/core/src/utils/shell-utils.js';

 const { shell } = getShellConfiguration();
@@ -115,13 +120,11 @@ describe('run_shell_command', () => {
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();

-    // Validate model output - will throw if no output, warn if missing expected content
-    // Model often reports exit code instead of showing output
-    validateModelOutput(
-      result,
-      ['hello-world', 'exit code 0'],
-      'Shell command test',
-    );
+    assertModelHasOutput(result);
+    checkModelOutputContent(result, {
+      expectedContent: ['hello-world', 'exit code 0'],
+      testName: 'Shell command test',
+    });
  });

  it('should be able to run a shell command via stdin', async () => {
@@ -149,8 +152,11 @@ describe('run_shell_command', () => {
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();

-    // Validate model output - will throw if no output, warn if missing expected content
-    validateModelOutput(result, 'test-stdin', 'Shell command stdin test');
+    assertModelHasOutput(result);
+    checkModelOutputContent(result, {
+      expectedContent: 'test-stdin',
+      testName: 'Shell command stdin test',
+    });
  });

  it.skip('should run allowed sub-command in non-interactive mode', async () => {
@@ -494,12 +500,11 @@ describe('run_shell_command', () => {
      )[0];
    expect(toolCall.toolRequest.success).toBe(true);

-    // Validate model output - will throw if no output, warn if missing expected content
-    validateModelOutput(
-      result,
-      'test-allow-all',
-      'Shell command stdin allow all',
-    );
+    assertModelHasOutput(result);
+    checkModelOutputContent(result, {
+      expectedContent: 'test-allow-all',
+      testName: 'Shell command stdin allow all',
+    });
  });

  it('should propagate environment variables to the child process', async () => {
@@ -528,7 +533,11 @@ describe('run_shell_command', () => {
        foundToolCall,
        'Expected to find a run_shell_command tool call',
      ).toBeTruthy();
-      validateModelOutput(result, varValue, 'Env var propagation test');
+      assertModelHasOutput(result);
+      checkModelOutputContent(result, {
+        expectedContent: varValue,
+        testName: 'Env var propagation test',
+      });
      expect(result).toContain(varValue);
    } finally {
      delete process.env[varName];
@@ -558,7 +567,11 @@ describe('run_shell_command', () => {
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();

-    validateModelOutput(result, fileName, 'Platform-specific listing test');
+    assertModelHasOutput(result);
+    checkModelOutputContent(result, {
+      expectedContent: fileName,
+      testName: 'Platform-specific listing test',
+    });
    expect(result).toContain(fileName);
  });