mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-18 07:43:00 -07:00
Context manager fuzzing test.
This commit is contained in:
+334
-200
@@ -51,6 +51,7 @@ async function runContextManagerEval(
|
||||
sidecarConfig: SidecarConfig,
|
||||
seed: number,
|
||||
scenario: Scenario,
|
||||
judge: boolean = true,
|
||||
): Promise<{
|
||||
pass: boolean;
|
||||
response: string;
|
||||
@@ -60,6 +61,7 @@ async function runContextManagerEval(
|
||||
const harness = await SimulationHarness.create(
|
||||
sidecarConfig,
|
||||
config.getBaseLlmClient(),
|
||||
path.join(process.cwd(), 'harness-tmp'),
|
||||
);
|
||||
|
||||
// 1. Feed trajectory
|
||||
@@ -78,9 +80,38 @@ async function runContextManagerEval(
|
||||
const compressedHistory =
|
||||
await harness.contextManager.projectCompressedHistory();
|
||||
|
||||
console.log('--- COMPRESSED HISTORY ---');
|
||||
console.log(JSON.stringify(compressedHistory, null, 2));
|
||||
console.log('--- END COMPRESSED HISTORY ---');
|
||||
// We can't easily get the episode IDs from the projected Content[],
|
||||
// but we can look at the working buffer instead.
|
||||
const workingBuffer = harness.contextManager.getWorkingBufferView();
|
||||
console.log('--- WORKING BUFFER EPISODES START ---');
|
||||
workingBuffer.forEach((ep, i) => {
|
||||
console.log(`[Ep ${i}] ID: ${ep.id}, Type: ${ep.trigger.type}`);
|
||||
if (ep.trigger.type === 'USER_PROMPT') {
|
||||
console.log(` Text: ${ep.trigger.semanticParts[0]?.text?.slice(0, 50)}`);
|
||||
}
|
||||
});
|
||||
console.log('--- WORKING BUFFER EPISODES END ---');
|
||||
|
||||
console.log('--- COMPRESSED HISTORY START ---');
|
||||
compressedHistory.forEach((msg, i) => {
|
||||
console.log(`[${i}] Role: ${msg.role}`);
|
||||
msg.parts.forEach((part, j) => {
|
||||
if ('text' in part) {
|
||||
console.log(
|
||||
` Part ${j} (text): ${part.text?.slice(0, 100)}${part.text && part.text.length > 100 ? '...' : ''}`,
|
||||
);
|
||||
} else if ('functionCall' in part) {
|
||||
console.log(` Part ${j} (functionCall): ${part.functionCall.name}`);
|
||||
} else if ('functionResponse' in part) {
|
||||
console.log(
|
||||
` Part ${j} (functionResponse): ${part.functionResponse.name}`,
|
||||
);
|
||||
} else {
|
||||
console.log(` Part ${j} (other): ${Object.keys(part).join(', ')}`);
|
||||
}
|
||||
});
|
||||
});
|
||||
console.log('--- COMPRESSED HISTORY END ---');
|
||||
|
||||
// 4. Ask the question
|
||||
const evalPrompt: Content = {
|
||||
@@ -103,32 +134,98 @@ async function runContextManagerEval(
|
||||
);
|
||||
|
||||
const responseText = getResponseText(response) ?? '';
|
||||
const pass = responseText
|
||||
.toLowerCase()
|
||||
.includes(question.expectedSubstring.toLowerCase());
|
||||
let pass = false;
|
||||
if (judge) {
|
||||
pass = await judgeResponse(config, question, responseText);
|
||||
} else {
|
||||
// Naive string check for potential failure identification
|
||||
pass = responseText
|
||||
.toLowerCase()
|
||||
.includes(question.expectedSubstring.toLowerCase());
|
||||
}
|
||||
|
||||
const finalTokens = harness.env.tokenCalculator.calculateEpisodeListTokens(
|
||||
harness.contextManager.getWorkingBufferView(),
|
||||
);
|
||||
|
||||
console.log(`Retained Budget: ${sidecarConfig.budget.retainedTokens}`);
|
||||
console.log(`Final Tokens: ${finalTokens}`);
|
||||
|
||||
return { pass, response: responseText, question, tokens: finalTokens };
|
||||
}
|
||||
|
||||
function generateRandomSidecarConfig(seed: number): SidecarConfig {
|
||||
/**
|
||||
* Uses an LLM to judge whether the response matches the expected information.
|
||||
*/
|
||||
async function judgeResponse(
|
||||
config: Config,
|
||||
question: ScenarioQuestion,
|
||||
actualResponse: string,
|
||||
): Promise<boolean> {
|
||||
const lowerResponse = actualResponse.toLowerCase();
|
||||
const lowerExpected = question.expectedSubstring.toLowerCase();
|
||||
|
||||
// Fast path: direct substring match
|
||||
if (lowerResponse.includes(lowerExpected)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const judgePrompt: Content = {
|
||||
role: 'user',
|
||||
parts: [
|
||||
{
|
||||
text: `Evaluate if the AI's response correctly answers the question by explicitly containing the expected information.
|
||||
|
||||
Question: ${question.prompt}
|
||||
Expected Information: ${question.expectedSubstring}
|
||||
AI's Response: ${actualResponse}
|
||||
|
||||
CRITICAL RULES FOR JUDGING:
|
||||
1. The AI's response MUST explicitly answer the question in the prompt and the provided details must match those in 'Expected Information'.
|
||||
2. Do not infer or assume knowledge. Vague, partial, or generalized answers MUST fail.
|
||||
3. If the AI hallucinates, states it cannot find the information, or provides an incomplete answer, it MUST fail.
|
||||
4. Expected information may contain extra details that are not required to answer the question that aren't in the AI's Response. For example: "the answer is X. It was previously Y.". These are still considered to be passing cases.
|
||||
|
||||
Does the AI's response explicitly and completely satisfy the expected information?
|
||||
Respond with ONLY "PASS" or "FAIL".`,
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const response = await config.getContentGenerator().generateContent(
|
||||
{
|
||||
model: EVAL_MODEL,
|
||||
config: { temperature: 0 },
|
||||
contents: [judgePrompt],
|
||||
},
|
||||
'eval-judge',
|
||||
LlmRole.UTILITY_TOOL,
|
||||
);
|
||||
|
||||
const judgeText = getResponseText(response)?.trim().toUpperCase() ?? '';
|
||||
return judgeText === 'PASS';
|
||||
}
|
||||
|
||||
function calculateScenarioTokens(scenario: Scenario): number {
|
||||
let totalChars = 0;
|
||||
for (const message of scenario.history) {
|
||||
for (const part of message.parts) {
|
||||
if ('text' in part && part.text) {
|
||||
totalChars += part.text.length;
|
||||
}
|
||||
}
|
||||
}
|
||||
// The SimulationHarness uses 4 chars per token.
|
||||
return Math.ceil(totalChars / 4);
|
||||
}
|
||||
|
||||
function generateRandomSidecarConfig(
|
||||
seed: number,
|
||||
maxRetained: number,
|
||||
): SidecarConfig {
|
||||
// Simple LCG or similar for deterministic randomness from seed if needed,
|
||||
// but for a fuzzer we can just use Math.random and log the seed.
|
||||
const retained = 500 + Math.floor(Math.random() * 9500); // 500 to 10000
|
||||
const max = Math.floor(retained * (1.2 + Math.random() * 0.8)); // 20% to 100% buffer
|
||||
|
||||
const strategies: Array<'truncate' | 'compress' | 'rollingSummarizer'> = [
|
||||
'truncate',
|
||||
'compress',
|
||||
'rollingSummarizer',
|
||||
];
|
||||
const strategy = strategies[Math.floor(Math.random() * strategies.length)];
|
||||
const minRetained = Math.min(1000, maxRetained);
|
||||
const retained =
|
||||
minRetained + Math.floor(Math.random() * (maxRetained - minRetained));
|
||||
const max = Math.floor(retained * (1.1 + Math.random() * 2.0)); // 110% to 300% buffer
|
||||
|
||||
const useSquashing = Math.random() > 0.5;
|
||||
const useSnapshot = Math.random() > 0.5;
|
||||
@@ -137,13 +234,17 @@ function generateRandomSidecarConfig(seed: number): SidecarConfig {
|
||||
{
|
||||
processorId: 'ToolMaskingProcessor',
|
||||
options: {
|
||||
stringLengthThresholdTokens: 2000 + Math.floor(Math.random() * 8000),
|
||||
stringLengthThresholdTokens: Math.floor(
|
||||
retained * (0.2 + Math.random() * 0.5),
|
||||
),
|
||||
},
|
||||
},
|
||||
{ processorId: 'BlobDegradationProcessor', options: {} },
|
||||
{
|
||||
processorId: 'SemanticCompressionProcessor',
|
||||
options: { nodeThresholdTokens: 1000 + Math.floor(Math.random() * 4000) },
|
||||
options: {
|
||||
nodeThresholdTokens: Math.floor(retained * (0.1 + Math.random() * 0.3)),
|
||||
},
|
||||
},
|
||||
{ processorId: 'EmergencyTruncationProcessor', options: {} },
|
||||
];
|
||||
@@ -152,7 +253,9 @@ function generateRandomSidecarConfig(seed: number): SidecarConfig {
|
||||
if (useSquashing) {
|
||||
backgroundProcessors.push({
|
||||
processorId: 'HistorySquashingProcessor',
|
||||
options: { maxTokensPerNode: 500 + Math.floor(Math.random() * 2500) },
|
||||
options: {
|
||||
maxTokensPerNode: Math.floor(retained * (0.05 + Math.random() * 0.15)),
|
||||
},
|
||||
});
|
||||
}
|
||||
if (useSnapshot) {
|
||||
@@ -165,7 +268,7 @@ function generateRandomSidecarConfig(seed: number): SidecarConfig {
|
||||
return {
|
||||
budget: { retainedTokens: retained, maxTokens: max },
|
||||
gcBackstop: {
|
||||
strategy,
|
||||
strategy: 'truncate', // Hardcoded since compress/rollingSummarizer are currently unimplemented
|
||||
target: 'incremental',
|
||||
freeTokensTarget: Math.floor(retained * 0.1),
|
||||
},
|
||||
@@ -201,17 +304,34 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
configOverrides: { model: EVAL_MODEL },
|
||||
timeout: 1200000, // 20 minutes
|
||||
assert: async (config: Config) => {
|
||||
console.log('Starting ContextManager explorer loop...');
|
||||
const scenarioTokens = calculateScenarioTokens(scenario);
|
||||
console.log(
|
||||
`Starting ContextManager explorer loop for scenario of size ${scenarioTokens} tokens...`,
|
||||
);
|
||||
for (let i = 0; i < 50; i++) {
|
||||
const seed = Math.floor(Math.random() * 1000000);
|
||||
const sidecarConfig = generateRandomSidecarConfig(seed);
|
||||
const sidecarConfig = generateRandomSidecarConfig(
|
||||
seed,
|
||||
scenarioTokens,
|
||||
);
|
||||
|
||||
const result = await runContextManagerEval(
|
||||
config,
|
||||
sidecarConfig,
|
||||
seed,
|
||||
scenario,
|
||||
false, // Optimistic string check
|
||||
);
|
||||
|
||||
if (!result.pass) {
|
||||
// Potential failure. Confirm with LLM judge.
|
||||
result.pass = await judgeResponse(
|
||||
config,
|
||||
result.question,
|
||||
result.response,
|
||||
);
|
||||
}
|
||||
|
||||
if (!result.pass) {
|
||||
console.log('!!! FAILURE FOUND !!!');
|
||||
console.log(`Seed: ${seed}`);
|
||||
@@ -250,92 +370,19 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
componentEvalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'context-manager',
|
||||
suiteType: 'component-level',
|
||||
name: 'ContextManager Baseline - Generous Budget (Full Recall)',
|
||||
configOverrides: { model: EVAL_MODEL },
|
||||
assert: async (config: Config) => {
|
||||
const budget = { retained: 100000, max: 200000 };
|
||||
const sidecarConfig: SidecarConfig = {
|
||||
budget: { retainedTokens: budget.retained, maxTokens: budget.max },
|
||||
gcBackstop: { strategy: 'truncate', target: 'incremental' },
|
||||
pipelines: [],
|
||||
};
|
||||
const seed = 42;
|
||||
const result = await runContextManagerEval(
|
||||
config,
|
||||
sidecarConfig,
|
||||
seed,
|
||||
scenario,
|
||||
);
|
||||
expect(
|
||||
result.pass,
|
||||
`Baseline recall failed for ${result.question.id}. Response: ${result.response}`,
|
||||
).toBe(true);
|
||||
},
|
||||
});
|
||||
|
||||
componentEvalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'context-manager',
|
||||
suiteType: 'component-level',
|
||||
name: 'ContextManager - Recall spatial lexer path (Limited Budget)',
|
||||
configOverrides: { model: EVAL_MODEL },
|
||||
assert: async (config: Config) => {
|
||||
const sidecarConfig: SidecarConfig = {
|
||||
budget: { retainedTokens: 5000, maxTokens: 8000 },
|
||||
gcBackstop: {
|
||||
strategy: 'truncate',
|
||||
target: 'incremental',
|
||||
freeTokensTarget: 500,
|
||||
},
|
||||
pipelines: [
|
||||
{
|
||||
name: 'Immediate Sanitization',
|
||||
triggers: ['on_turn'],
|
||||
execution: 'blocking',
|
||||
processors: [
|
||||
{
|
||||
processorId: 'ToolMaskingProcessor',
|
||||
options: { stringLengthThresholdTokens: 8000 },
|
||||
},
|
||||
{ processorId: 'BlobDegradationProcessor', options: {} },
|
||||
{
|
||||
processorId: 'SemanticCompressionProcessor',
|
||||
options: { nodeThresholdTokens: 5000 },
|
||||
},
|
||||
{ processorId: 'EmergencyTruncationProcessor', options: {} },
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
const seed = 12; // Deterministically pick spatial-lexer-path question
|
||||
const result = await runContextManagerEval(
|
||||
config,
|
||||
sidecarConfig,
|
||||
seed,
|
||||
scenario,
|
||||
);
|
||||
expect(
|
||||
result.pass,
|
||||
`Recall failed for spatial path. Response: ${result.response}`,
|
||||
).toBe(true);
|
||||
},
|
||||
});
|
||||
|
||||
componentEvalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'context-manager',
|
||||
suiteType: 'component-level',
|
||||
name: 'ContextManager Frozen Case - secret-beta (Budget: 2564)',
|
||||
name: 'ContextManager Frozen Case - lexer-constraint (Budget: 3737)',
|
||||
configOverrides: { model: EVAL_MODEL },
|
||||
assert: async (config: Config) => {
|
||||
const scenario = getScenario('scenario-c-compiler');
|
||||
const sidecarConfig: SidecarConfig = {
|
||||
budget: {
|
||||
retainedTokens: 2564,
|
||||
maxTokens: 3855,
|
||||
retainedTokens: 3737,
|
||||
maxTokens: 11280,
|
||||
},
|
||||
gcBackstop: {
|
||||
strategy: 'rollingSummarizer',
|
||||
strategy: 'truncate',
|
||||
target: 'incremental',
|
||||
freeTokensTarget: 256,
|
||||
freeTokensTarget: 373,
|
||||
},
|
||||
pipelines: [
|
||||
{
|
||||
@@ -346,7 +393,7 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
{
|
||||
processorId: 'ToolMaskingProcessor',
|
||||
options: {
|
||||
stringLengthThresholdTokens: 3379,
|
||||
stringLengthThresholdTokens: 2442,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -356,7 +403,7 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
{
|
||||
processorId: 'SemanticCompressionProcessor',
|
||||
options: {
|
||||
nodeThresholdTokens: 2456,
|
||||
nodeThresholdTokens: 733,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -379,7 +426,87 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
{
|
||||
processorId: 'HistorySquashingProcessor',
|
||||
options: {
|
||||
maxTokensPerNode: 2588,
|
||||
maxTokensPerNode: 327,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
const seed = 288421;
|
||||
const result = await runContextManagerEval(
|
||||
config,
|
||||
sidecarConfig,
|
||||
seed,
|
||||
scenario,
|
||||
);
|
||||
expect(
|
||||
result.pass,
|
||||
`Recall failed for lexer-constraint. Response: ${result.response}`,
|
||||
).toBe(true);
|
||||
},
|
||||
});
|
||||
|
||||
componentEvalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'context-manager',
|
||||
suiteType: 'component-level',
|
||||
name: 'ContextManager Frozen Case - final-name (Budget: 5321)',
|
||||
configOverrides: { model: EVAL_MODEL },
|
||||
assert: async (config: Config) => {
|
||||
const scenario = getScenario('scenario-c-compiler');
|
||||
const sidecarConfig: SidecarConfig = {
|
||||
budget: {
|
||||
retainedTokens: 5321,
|
||||
maxTokens: 11398,
|
||||
},
|
||||
gcBackstop: {
|
||||
strategy: 'truncate',
|
||||
target: 'incremental',
|
||||
freeTokensTarget: 532,
|
||||
},
|
||||
pipelines: [
|
||||
{
|
||||
name: 'Immediate Sanitization',
|
||||
triggers: ['on_turn'],
|
||||
execution: 'blocking',
|
||||
processors: [
|
||||
{
|
||||
processorId: 'ToolMaskingProcessor',
|
||||
options: {
|
||||
stringLengthThresholdTokens: 2952,
|
||||
},
|
||||
},
|
||||
{
|
||||
processorId: 'BlobDegradationProcessor',
|
||||
options: {},
|
||||
},
|
||||
{
|
||||
processorId: 'SemanticCompressionProcessor',
|
||||
options: {
|
||||
nodeThresholdTokens: 1632,
|
||||
},
|
||||
},
|
||||
{
|
||||
processorId: 'EmergencyTruncationProcessor',
|
||||
options: {},
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'Deep Background Compression',
|
||||
triggers: [
|
||||
{
|
||||
type: 'timer',
|
||||
intervalMs: 100,
|
||||
},
|
||||
'budget_exceeded',
|
||||
],
|
||||
execution: 'background',
|
||||
processors: [
|
||||
{
|
||||
processorId: 'HistorySquashingProcessor',
|
||||
options: {
|
||||
maxTokensPerNode: 355,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -390,7 +517,91 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
},
|
||||
],
|
||||
};
|
||||
const seed = 248506;
|
||||
const seed = 826619;
|
||||
const result = await runContextManagerEval(
|
||||
config,
|
||||
sidecarConfig,
|
||||
seed,
|
||||
scenario,
|
||||
);
|
||||
expect(
|
||||
result.pass,
|
||||
`Recall failed for final-name. Response: ${result.response}`,
|
||||
).toBe(true);
|
||||
},
|
||||
});
|
||||
|
||||
componentEvalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'context-manager',
|
||||
suiteType: 'component-level',
|
||||
name: 'ContextManager Frozen Case - secret-beta (Budget: 1314)',
|
||||
configOverrides: { model: EVAL_MODEL },
|
||||
assert: async (config: Config) => {
|
||||
const scenario = getScenario('scenario-c-compiler');
|
||||
const sidecarConfig: SidecarConfig = {
|
||||
budget: {
|
||||
retainedTokens: 1314,
|
||||
maxTokens: 2067,
|
||||
},
|
||||
gcBackstop: {
|
||||
strategy: 'truncate',
|
||||
target: 'incremental',
|
||||
freeTokensTarget: 131,
|
||||
},
|
||||
pipelines: [
|
||||
{
|
||||
name: 'Immediate Sanitization',
|
||||
triggers: ['on_turn'],
|
||||
execution: 'blocking',
|
||||
processors: [
|
||||
{
|
||||
processorId: 'ToolMaskingProcessor',
|
||||
options: {
|
||||
stringLengthThresholdTokens: 738,
|
||||
},
|
||||
},
|
||||
{
|
||||
processorId: 'BlobDegradationProcessor',
|
||||
options: {},
|
||||
},
|
||||
{
|
||||
processorId: 'SemanticCompressionProcessor',
|
||||
options: {
|
||||
nodeThresholdTokens: 195,
|
||||
},
|
||||
},
|
||||
{
|
||||
processorId: 'EmergencyTruncationProcessor',
|
||||
options: {},
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'Deep Background Compression',
|
||||
triggers: [
|
||||
{
|
||||
type: 'timer',
|
||||
intervalMs: 100,
|
||||
},
|
||||
'budget_exceeded',
|
||||
],
|
||||
execution: 'background',
|
||||
processors: [
|
||||
{
|
||||
processorId: 'HistorySquashingProcessor',
|
||||
options: {
|
||||
maxTokensPerNode: 108,
|
||||
},
|
||||
},
|
||||
{
|
||||
processorId: 'StateSnapshotProcessor',
|
||||
options: {},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
const seed = 475216;
|
||||
const result = await runContextManagerEval(
|
||||
config,
|
||||
sidecarConfig,
|
||||
@@ -407,19 +618,26 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
componentEvalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'context-manager',
|
||||
suiteType: 'component-level',
|
||||
name: 'ContextManager Frozen Case - codegen-header (Budget: 1820)',
|
||||
name: 'ContextManager Frozen Case - codegen-header (Budget: 2585)',
|
||||
configOverrides: { model: EVAL_MODEL },
|
||||
timeout: 240000, // 4 minutes to allow Gemini 2.5 Pro to compress even with rate limits/load shedding
|
||||
assert: async (config: Config) => {
|
||||
const scenario = getScenario('scenario-c-compiler');
|
||||
const targetQuestion = scenario.questions.find(
|
||||
(q) => q.id === 'codegen-header',
|
||||
);
|
||||
if (targetQuestion) {
|
||||
targetQuestion.expectedSubstring = 'Generated by GigaC';
|
||||
}
|
||||
const sidecarConfig: SidecarConfig = {
|
||||
budget: {
|
||||
retainedTokens: 1820,
|
||||
maxTokens: 3353,
|
||||
retainedTokens: 2585,
|
||||
maxTokens: 4196,
|
||||
},
|
||||
gcBackstop: {
|
||||
strategy: 'rollingSummarizer',
|
||||
strategy: 'truncate',
|
||||
target: 'incremental',
|
||||
freeTokensTarget: 182,
|
||||
freeTokensTarget: 258,
|
||||
},
|
||||
pipelines: [
|
||||
{
|
||||
@@ -430,7 +648,7 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
{
|
||||
processorId: 'ToolMaskingProcessor',
|
||||
options: {
|
||||
stringLengthThresholdTokens: 5375,
|
||||
stringLengthThresholdTokens: 720,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -440,7 +658,7 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
{
|
||||
processorId: 'SemanticCompressionProcessor',
|
||||
options: {
|
||||
nodeThresholdTokens: 3886,
|
||||
nodeThresholdTokens: 336,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -463,7 +681,7 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
{
|
||||
processorId: 'HistorySquashingProcessor',
|
||||
options: {
|
||||
maxTokensPerNode: 1999,
|
||||
maxTokensPerNode: 237,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -474,7 +692,7 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
},
|
||||
],
|
||||
};
|
||||
const seed = 322027;
|
||||
const seed = 715277;
|
||||
const result = await runContextManagerEval(
|
||||
config,
|
||||
sidecarConfig,
|
||||
@@ -487,88 +705,4 @@ describe('ContextManager Evaluation Suite', () => {
|
||||
).toBe(true);
|
||||
},
|
||||
});
|
||||
|
||||
componentEvalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'context-manager',
|
||||
suiteType: 'component-level',
|
||||
name: 'ContextManager Frozen Case - lexer-constraint (Budget: 9097)',
|
||||
configOverrides: { model: EVAL_MODEL },
|
||||
assert: async (config: Config) => {
|
||||
const scenario = getScenario('scenario-c-compiler');
|
||||
const sidecarConfig: SidecarConfig = {
|
||||
budget: {
|
||||
retainedTokens: 9097,
|
||||
maxTokens: 16089,
|
||||
},
|
||||
gcBackstop: {
|
||||
strategy: 'compress',
|
||||
target: 'incremental',
|
||||
freeTokensTarget: 909,
|
||||
},
|
||||
pipelines: [
|
||||
{
|
||||
name: 'Immediate Sanitization',
|
||||
triggers: ['on_turn'],
|
||||
execution: 'blocking',
|
||||
processors: [
|
||||
{
|
||||
processorId: 'ToolMaskingProcessor',
|
||||
options: {
|
||||
stringLengthThresholdTokens: 8799,
|
||||
},
|
||||
},
|
||||
{
|
||||
processorId: 'BlobDegradationProcessor',
|
||||
options: {},
|
||||
},
|
||||
{
|
||||
processorId: 'SemanticCompressionProcessor',
|
||||
options: {
|
||||
nodeThresholdTokens: 4044,
|
||||
},
|
||||
},
|
||||
{
|
||||
processorId: 'EmergencyTruncationProcessor',
|
||||
options: {},
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'Deep Background Compression',
|
||||
triggers: [
|
||||
{
|
||||
type: 'timer',
|
||||
intervalMs: 100,
|
||||
},
|
||||
'budget_exceeded',
|
||||
],
|
||||
execution: 'background',
|
||||
processors: [
|
||||
{
|
||||
processorId: 'HistorySquashingProcessor',
|
||||
options: {
|
||||
maxTokensPerNode: 1973,
|
||||
},
|
||||
},
|
||||
{
|
||||
processorId: 'StateSnapshotProcessor',
|
||||
options: {},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
const seed = 161221;
|
||||
const result = await runContextManagerEval(
|
||||
config,
|
||||
sidecarConfig,
|
||||
seed,
|
||||
scenario,
|
||||
);
|
||||
expect(
|
||||
result.pass,
|
||||
`Recall failed for lexer-constraint. Response: ${result.response}`,
|
||||
).toBe(true);
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user