mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-10 22:21:22 -07:00
298 lines
8.9 KiB
TypeScript
298 lines
8.9 KiB
TypeScript
|
|
/**
|
||
|
|
* @license
|
||
|
|
* Copyright 2026 Google LLC
|
||
|
|
* SPDX-License-Identifier: Apache-2.0
|
||
|
|
*/
|
||
|
|
|
||
|
|
import { describe, expect } from 'vitest';
|
||
|
|
import { evalTest } from './test-helper.js';
|
||
|
|
import path from 'node:path';
|
||
|
|
import fs from 'node:fs';
|
||
|
|
import crypto from 'node:crypto';
|
||
|
|
|
||
|
|
// Recursive function to find a directory by name
|
||
|
|
function findDir(base: string, name: string): string | null {
|
||
|
|
if (!fs.existsSync(base)) return null;
|
||
|
|
const files = fs.readdirSync(base);
|
||
|
|
for (const file of files) {
|
||
|
|
const fullPath = path.join(base, file);
|
||
|
|
if (fs.statSync(fullPath).isDirectory()) {
|
||
|
|
if (file === name) return fullPath;
|
||
|
|
const found = findDir(fullPath, name);
|
||
|
|
if (found) return found;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return null;
|
||
|
|
}
|
||
|
|
|
||
|
|
describe('Tool Output Masking Behavioral Evals', () => {
|
||
|
|
/**
|
||
|
|
* Scenario: The agent needs information that was masked in a previous turn.
|
||
|
|
* It should recognize the <tool_output_masked> tag and use a tool to read the file.
|
||
|
|
*/
|
||
|
|
evalTest('USUALLY_PASSES', {
|
||
|
|
name: 'should attempt to read the redirected full output file when information is masked',
|
||
|
|
params: {
|
||
|
|
security: {
|
||
|
|
folderTrust: {
|
||
|
|
enabled: true,
|
||
|
|
},
|
||
|
|
},
|
||
|
|
},
|
||
|
|
prompt: '/help',
|
||
|
|
assert: async (rig) => {
|
||
|
|
// 1. Initialize project directories
|
||
|
|
await rig.run({ args: '/help' });
|
||
|
|
|
||
|
|
// 2. Discover the project temp dir
|
||
|
|
const chatsDir = findDir(path.join(rig.homeDir!, '.gemini'), 'chats');
|
||
|
|
if (!chatsDir) throw new Error('Could not find chats directory');
|
||
|
|
const projectTempDir = path.dirname(chatsDir);
|
||
|
|
|
||
|
|
const sessionId = crypto.randomUUID();
|
||
|
|
const toolOutputsDir = path.join(
|
||
|
|
projectTempDir,
|
||
|
|
'tool-outputs',
|
||
|
|
`session-${sessionId}`,
|
||
|
|
);
|
||
|
|
fs.mkdirSync(toolOutputsDir, { recursive: true });
|
||
|
|
|
||
|
|
const secretValue = 'THE_RECOVERED_SECRET_99';
|
||
|
|
const outputFileName = `masked_output_${crypto.randomUUID()}.txt`;
|
||
|
|
const outputFilePath = path.join(toolOutputsDir, outputFileName);
|
||
|
|
fs.writeFileSync(
|
||
|
|
outputFilePath,
|
||
|
|
`Some padding...\nThe secret key is: ${secretValue}\nMore padding...`,
|
||
|
|
);
|
||
|
|
|
||
|
|
const maskedSnippet = `<tool_output_masked>
|
||
|
|
Output: [PREVIEW]
|
||
|
|
Output too large. Full output available at: ${outputFilePath}
|
||
|
|
</tool_output_masked>`;
|
||
|
|
|
||
|
|
// 3. Inject manual session file
|
||
|
|
const conversation = {
|
||
|
|
sessionId: sessionId,
|
||
|
|
projectHash: path.basename(projectTempDir),
|
||
|
|
startTime: new Date().toISOString(),
|
||
|
|
lastUpdated: new Date().toISOString(),
|
||
|
|
messages: [
|
||
|
|
{
|
||
|
|
id: 'msg_1',
|
||
|
|
timestamp: new Date().toISOString(),
|
||
|
|
type: 'user',
|
||
|
|
content: [{ text: 'Get secret.' }],
|
||
|
|
},
|
||
|
|
{
|
||
|
|
id: 'msg_2',
|
||
|
|
timestamp: new Date().toISOString(),
|
||
|
|
type: 'gemini',
|
||
|
|
model: 'gemini-3-flash-preview',
|
||
|
|
toolCalls: [
|
||
|
|
{
|
||
|
|
id: 'call_1',
|
||
|
|
name: 'run_shell_command',
|
||
|
|
args: { command: 'get_secret' },
|
||
|
|
status: 'success',
|
||
|
|
timestamp: new Date().toISOString(),
|
||
|
|
result: [
|
||
|
|
{
|
||
|
|
functionResponse: {
|
||
|
|
id: 'call_1',
|
||
|
|
name: 'run_shell_command',
|
||
|
|
response: { output: maskedSnippet },
|
||
|
|
},
|
||
|
|
},
|
||
|
|
],
|
||
|
|
},
|
||
|
|
],
|
||
|
|
content: [{ text: 'I found a masked output.' }],
|
||
|
|
},
|
||
|
|
],
|
||
|
|
};
|
||
|
|
|
||
|
|
const futureDate = new Date();
|
||
|
|
futureDate.setFullYear(futureDate.getFullYear() + 1);
|
||
|
|
conversation.startTime = futureDate.toISOString();
|
||
|
|
conversation.lastUpdated = futureDate.toISOString();
|
||
|
|
const timestamp = futureDate
|
||
|
|
.toISOString()
|
||
|
|
.slice(0, 16)
|
||
|
|
.replace(/:/g, '-');
|
||
|
|
const sessionFile = path.join(
|
||
|
|
chatsDir,
|
||
|
|
`session-${timestamp}-${sessionId.slice(0, 8)}.json`,
|
||
|
|
);
|
||
|
|
fs.writeFileSync(sessionFile, JSON.stringify(conversation, null, 2));
|
||
|
|
|
||
|
|
// 4. Trust folder
|
||
|
|
const settingsDir = path.join(rig.homeDir!, '.gemini');
|
||
|
|
fs.writeFileSync(
|
||
|
|
path.join(settingsDir, 'trustedFolders.json'),
|
||
|
|
JSON.stringify(
|
||
|
|
{
|
||
|
|
[path.resolve(rig.homeDir!)]: 'TRUST_FOLDER',
|
||
|
|
},
|
||
|
|
null,
|
||
|
|
2,
|
||
|
|
),
|
||
|
|
);
|
||
|
|
|
||
|
|
// 5. Run agent with --resume
|
||
|
|
const result = await rig.run({
|
||
|
|
args: [
|
||
|
|
'--resume',
|
||
|
|
'latest',
|
||
|
|
'What was the secret key in that last masked shell output?',
|
||
|
|
],
|
||
|
|
approvalMode: 'yolo',
|
||
|
|
timeout: 120000,
|
||
|
|
});
|
||
|
|
|
||
|
|
// ASSERTION: Verify agent accessed the redirected file
|
||
|
|
const logs = rig.readToolLogs();
|
||
|
|
const accessedFile = logs.some((log) =>
|
||
|
|
log.toolRequest.args.includes(outputFileName),
|
||
|
|
);
|
||
|
|
|
||
|
|
expect(
|
||
|
|
accessedFile,
|
||
|
|
`Agent should have attempted to access the masked output file: ${outputFileName}`,
|
||
|
|
).toBe(true);
|
||
|
|
expect(result.toLowerCase()).toContain(secretValue.toLowerCase());
|
||
|
|
},
|
||
|
|
});
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Scenario: Information is in the preview.
|
||
|
|
*/
|
||
|
|
evalTest('USUALLY_PASSES', {
|
||
|
|
name: 'should NOT read the full output file when the information is already in the preview',
|
||
|
|
params: {
|
||
|
|
security: {
|
||
|
|
folderTrust: {
|
||
|
|
enabled: true,
|
||
|
|
},
|
||
|
|
},
|
||
|
|
},
|
||
|
|
prompt: '/help',
|
||
|
|
assert: async (rig) => {
|
||
|
|
await rig.run({ args: '/help' });
|
||
|
|
|
||
|
|
const chatsDir = findDir(path.join(rig.homeDir!, '.gemini'), 'chats');
|
||
|
|
if (!chatsDir) throw new Error('Could not find chats directory');
|
||
|
|
const projectTempDir = path.dirname(chatsDir);
|
||
|
|
|
||
|
|
const sessionId = crypto.randomUUID();
|
||
|
|
const toolOutputsDir = path.join(
|
||
|
|
projectTempDir,
|
||
|
|
'tool-outputs',
|
||
|
|
`session-${sessionId}`,
|
||
|
|
);
|
||
|
|
fs.mkdirSync(toolOutputsDir, { recursive: true });
|
||
|
|
|
||
|
|
const secretValue = 'PREVIEW_SECRET_123';
|
||
|
|
const outputFileName = `masked_output_${crypto.randomUUID()}.txt`;
|
||
|
|
const outputFilePath = path.join(toolOutputsDir, outputFileName);
|
||
|
|
fs.writeFileSync(
|
||
|
|
outputFilePath,
|
||
|
|
`Full content containing ${secretValue}`,
|
||
|
|
);
|
||
|
|
|
||
|
|
const maskedSnippet = `<tool_output_masked>
|
||
|
|
Output: The secret key is: ${secretValue}
|
||
|
|
... lines omitted ...
|
||
|
|
|
||
|
|
Output too large. Full output available at: ${outputFilePath}
|
||
|
|
</tool_output_masked>`;
|
||
|
|
|
||
|
|
const conversation = {
|
||
|
|
sessionId: sessionId,
|
||
|
|
projectHash: path.basename(projectTempDir),
|
||
|
|
startTime: new Date().toISOString(),
|
||
|
|
lastUpdated: new Date().toISOString(),
|
||
|
|
messages: [
|
||
|
|
{
|
||
|
|
id: 'msg_1',
|
||
|
|
timestamp: new Date().toISOString(),
|
||
|
|
type: 'user',
|
||
|
|
content: [{ text: 'Find secret.' }],
|
||
|
|
},
|
||
|
|
{
|
||
|
|
id: 'msg_2',
|
||
|
|
timestamp: new Date().toISOString(),
|
||
|
|
type: 'gemini',
|
||
|
|
model: 'gemini-3-flash-preview',
|
||
|
|
toolCalls: [
|
||
|
|
{
|
||
|
|
id: 'call_1',
|
||
|
|
name: 'run_shell_command',
|
||
|
|
args: { command: 'get_secret' },
|
||
|
|
status: 'success',
|
||
|
|
timestamp: new Date().toISOString(),
|
||
|
|
result: [
|
||
|
|
{
|
||
|
|
functionResponse: {
|
||
|
|
id: 'call_1',
|
||
|
|
name: 'run_shell_command',
|
||
|
|
response: { output: maskedSnippet },
|
||
|
|
},
|
||
|
|
},
|
||
|
|
],
|
||
|
|
},
|
||
|
|
],
|
||
|
|
content: [{ text: 'Masked output found.' }],
|
||
|
|
},
|
||
|
|
],
|
||
|
|
};
|
||
|
|
|
||
|
|
const futureDate = new Date();
|
||
|
|
futureDate.setFullYear(futureDate.getFullYear() + 1);
|
||
|
|
conversation.startTime = futureDate.toISOString();
|
||
|
|
conversation.lastUpdated = futureDate.toISOString();
|
||
|
|
const timestamp = futureDate
|
||
|
|
.toISOString()
|
||
|
|
.slice(0, 16)
|
||
|
|
.replace(/:/g, '-');
|
||
|
|
const sessionFile = path.join(
|
||
|
|
chatsDir,
|
||
|
|
`session-${timestamp}-${sessionId.slice(0, 8)}.json`,
|
||
|
|
);
|
||
|
|
fs.writeFileSync(sessionFile, JSON.stringify(conversation, null, 2));
|
||
|
|
|
||
|
|
const settingsDir = path.join(rig.homeDir!, '.gemini');
|
||
|
|
fs.writeFileSync(
|
||
|
|
path.join(settingsDir, 'trustedFolders.json'),
|
||
|
|
JSON.stringify(
|
||
|
|
{
|
||
|
|
[path.resolve(rig.homeDir!)]: 'TRUST_FOLDER',
|
||
|
|
},
|
||
|
|
null,
|
||
|
|
2,
|
||
|
|
),
|
||
|
|
);
|
||
|
|
|
||
|
|
const result = await rig.run({
|
||
|
|
args: [
|
||
|
|
'--resume',
|
||
|
|
'latest',
|
||
|
|
'What was the secret key mentioned in the previous output?',
|
||
|
|
],
|
||
|
|
approvalMode: 'yolo',
|
||
|
|
timeout: 120000,
|
||
|
|
});
|
||
|
|
|
||
|
|
const logs = rig.readToolLogs();
|
||
|
|
const accessedFile = logs.some((log) =>
|
||
|
|
log.toolRequest.args.includes(outputFileName),
|
||
|
|
);
|
||
|
|
|
||
|
|
expect(
|
||
|
|
accessedFile,
|
||
|
|
'Agent should NOT have accessed the masked output file',
|
||
|
|
).toBe(false);
|
||
|
|
expect(result.toLowerCase()).toContain(secretValue.toLowerCase());
|
||
|
|
},
|
||
|
|
});
|
||
|
|
});
|