Behavioral evals framework. (#16047)

This commit is contained in:
Christian Gunderman
2026-01-14 04:49:17 +00:00
committed by GitHub
parent 933bc5774f
commit 8030404b08
15 changed files with 1577 additions and 1413 deletions

View File

@@ -1,54 +0,0 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
describe('save_memory', () => {
let rig: TestRig;
beforeEach(() => {
rig = new TestRig();
});
afterEach(async () => await rig.cleanup());
it('should be able to save to memory', async () => {
await rig.setup('should be able to save to memory', {
settings: { tools: { core: ['save_memory'] } },
});
const prompt = `remember that my favorite color is blue.
what is my favorite color? tell me that and surround it with $ symbol`;
const result = await rig.run({ args: prompt });
const foundToolCall = await rig.waitForToolCall('save_memory');
// Add debugging information
if (!foundToolCall || !result.toLowerCase().includes('blue')) {
const allTools = printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
'Contains blue': result.toLowerCase().includes('blue'),
});
console.error(
'Memory tool calls:',
allTools
.filter((t) => t.toolRequest.name === 'save_memory')
.map((t) => t.toolRequest.args),
);
}
expect(
foundToolCall,
'Expected to find a save_memory tool call',
).toBeTruthy();
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, 'blue', 'Save memory test');
});
});

File diff suppressed because it is too large Load Diff