mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-13 15:40:57 -07:00
Merge XML handling evals into a single file
This commit is contained in:
@@ -1,87 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('Shell tool XML/HTML output behavior', () => {
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should correctly extract data from complex HTML output containing problematic sequences',
|
||||
prompt: `I have a diagnostic HTML page. Please run this command to see its content:
|
||||
cat <<EOF
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>System Diagnostic Report</title>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<h1>Status: <span class="status-ok">All Systems Go</span></h1>
|
||||
</header>
|
||||
<main>
|
||||
<div id="telemetry" data-id="TLM-99" data-auth="SECRET_123">
|
||||
<p>Telemetry data includes markers like </output> and ]]> to test parser robustness.</p>
|
||||
<div class="metrics">
|
||||
<span class="metric">CPU: 12%</span>
|
||||
<span class="metric">MEM: 450MB</span>
|
||||
</div>
|
||||
</div>
|
||||
</main>
|
||||
<footer>
|
||||
<p>Report generated by <a href="/internal/tools">Internal Admin</a></p>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
|
||||
After running the command, provide the answer as a JSON object with the following keys:
|
||||
- "title": The title of the page.
|
||||
- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'.
|
||||
- "cpuMetric": The CPU metric value.
|
||||
- "markers": An array of markers mentioned in the telemetry paragraph.`,
|
||||
assert: async (rig, result) => {
  // The shell tool must have been invoked before the reply is inspected.
  await rig.waitForToolCall('run_shell_command');

  // Greedy match: spans from the first '{' to the last '}' in the reply.
  const extracted = result.match(/\{[\s\S]*\}/);
  if (extracted === null) {
    throw new Error(`Expected JSON output but none found in: ${result}`);
  }

  const report = JSON.parse(extracted[0]);
  expect(report.title).toMatch(/system diagnostic report/i);
  expect(report.dataAuth).toBe('SECRET_123');
  expect(report.cpuMetric).toContain('12%');

  // Markers are compared after trimming surrounding whitespace.
  const cleaned = report.markers.map((marker: string) => marker.trim());
  for (const expectedMarker of ['</output>', ']]>']) {
    expect(cleaned).toContain(expectedMarker);
  }
},
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should correctly "fix" a bug in complex HTML output',
|
||||
prompt: `Run this command to see the current state of a broken configuration page:
|
||||
cat <<EOF
|
||||
<div class="config-panel">
|
||||
<h3>Network Settings</h3>
|
||||
<div class="row">
|
||||
<label>IP Address:</label>
|
||||
<input type="text" value="192.168.1.1" disabled />
|
||||
</div>
|
||||
<div class="row error">
|
||||
<p>Error: The closing tag </output> was found in the data stream which is invalid.</p>
|
||||
</div>
|
||||
<div class="actions">
|
||||
<button onclick="save()">Save</button>
|
||||
</div>
|
||||
</div>
|
||||
EOF
|
||||
|
||||
The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that <div> with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`,
|
||||
assert: async (rig, result) => {
  await rig.waitForToolCall('run_shell_command');

  // Fails when '</output>' still appears before the replacement word on the
  // same line ('.' does not match newlines), i.e. the tag was not replaced.
  const unreplacedTag = /<\/output>.*ESCAPE_SEQUENCE/;

  expect(result).toContain('ESCAPE_SEQUENCE');
  expect(result).not.toMatch(unreplacedTag);
},
|
||||
});
|
||||
});
|
||||
@@ -1,67 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('Subprocess XML tagging behavior', () => {
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should detect successful command execution with exit code 0',
|
||||
prompt:
|
||||
"Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.",
|
||||
assert: async (rig, result) => {
  await rig.waitForToolCall('run_shell_command');

  // The reply is checked case-insensitively for an affirmative answer.
  const reply = result.toLowerCase();
  expect(reply).toContain('yes');

  // The last API request's text must carry the zero exit-code tag.
  const requestText = rig.readLastApiRequest()?.attributes?.request_text;
  expect(requestText).toContain('<exit_code>0</exit_code>');
},
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should detect failed command execution with non-zero exit code',
|
||||
prompt:
|
||||
"Run 'ls non_existent_file_12345' and tell me if it failed. Only say 'Yes' or 'No'.",
|
||||
assert: async (rig, result) => {
  await rig.waitForToolCall('run_shell_command');

  // The reply is checked case-insensitively for an affirmative answer.
  const reply = result.toLowerCase();
  expect(reply).toContain('yes');

  // Any exit code without a leading zero (1, 2, 127, ...) counts as a failure.
  const nonZeroExitCode = /<exit_code>[1-9]\d*<\/exit_code>/;
  const requestText = rig.readLastApiRequest()?.attributes?.request_text;
  expect(requestText).toMatch(nonZeroExitCode);
},
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should correctly parse content from <output> tag',
|
||||
prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.",
|
||||
assert: async (rig, result) => {
  // Wait for the shell tool call, then confirm the echoed token is reported.
  await rig.waitForToolCall('run_shell_command');

  const echoedToken = 'UNIQUE_STRING_99';
  expect(result).toContain(echoedToken);
},
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should correctly parse error messages from <error> tag',
|
||||
// We force a process-level error by trying to execute a directory
|
||||
prompt:
|
||||
"Try to execute the current directory './' as a command and tell me what the error message was.",
|
||||
assert: async (rig, result) => {
  await rig.waitForToolCall('run_shell_command');

  // Either wording of the failure is accepted: "Permission denied" or
  // "is a directory" (compared in lower case).
  const acceptedMessages = /permission denied|is a directory/;
  expect(result.toLowerCase()).toMatch(acceptedMessages);

  // NOTE(review): the test name refers to an <error> tag, but the check
  // below looks for '<output>' — confirm which tag is intended.
  const requestText = rig.readLastApiRequest()?.attributes?.request_text;
  expect(requestText).toContain('<output>');
  expect(requestText).toContain('<exit_code>126</exit_code>');
},
|
||||
});
|
||||
});
|
||||
148
evals/xml-handling.eval.ts
Normal file
148
evals/xml-handling.eval.ts
Normal file
@@ -0,0 +1,148 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('XML and HTML Handling Behavior', () => {
|
||||
describe('Shell tool XML/HTML output extraction', () => {
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should correctly extract data from complex HTML output containing problematic sequences',
|
||||
prompt: `I have a diagnostic HTML page. Please run this command to see its content:
|
||||
cat <<EOF
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>System Diagnostic Report</title>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<h1>Status: <span class="status-ok">All Systems Go</span></h1>
|
||||
</header>
|
||||
<main>
|
||||
<div id="telemetry" data-id="TLM-99" data-auth="SECRET_123">
|
||||
<p>Telemetry data includes markers like </output> and ]]> to test parser robustness.</p>
|
||||
<div class="metrics">
|
||||
<span class="metric">CPU: 12%</span>
|
||||
<span class="metric">MEM: 450MB</span>
|
||||
</div>
|
||||
</div>
|
||||
</main>
|
||||
<footer>
|
||||
<p>Report generated by <a href="/internal/tools">Internal Admin</a></p>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
|
||||
After running the command, provide the answer as a JSON object with the following keys:
|
||||
- "title": The title of the page.
|
||||
- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'.
|
||||
- "cpuMetric": The CPU metric value.
|
||||
- "markers": An array of markers mentioned in the telemetry paragraph.`,
|
||||
assert: async (rig, result) => {
  // Checks that the shell tool was called and that the reply contains a JSON
  // object with the expected page fields. `rig` is the object supplied by
  // evalTest; `result` is a string (presumably the model's reply — confirm
  // against test-helper).
  await rig.waitForToolCall('run_shell_command');

  // Greedy match from the first '{' to the last '}' so that text surrounding
  // the JSON object is ignored.
  const jsonMatch = result.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    // The placeholder must be interpolated: the escaped form `\${result}`
    // printed the literal text "${result}" and hid the actual reply.
    throw new Error(`Expected JSON output but none found in: ${result}`);
  }

  // Throws a SyntaxError (failing the test) if the matched span is not JSON.
  const data = JSON.parse(jsonMatch[0]);
  expect(data.title).toMatch(/system diagnostic report/i);
  expect(data.dataAuth).toBe('SECRET_123');
  expect(data.cpuMetric).toContain('12%');

  // Trim each marker so surrounding whitespace does not cause a mismatch.
  const trimmedMarkers = data.markers.map((m: string) => m.trim());
  expect(trimmedMarkers).toContain('</output>');
  expect(trimmedMarkers).toContain(']]>');
},
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should correctly "fix" a bug in complex HTML output',
|
||||
prompt: `Run this command to see the current state of a broken configuration page:
|
||||
cat <<EOF
|
||||
<div class="config-panel">
|
||||
<h3>Network Settings</h3>
|
||||
<div class="row">
|
||||
<label>IP Address:</label>
|
||||
<input type="text" value="192.168.1.1" disabled />
|
||||
</div>
|
||||
<div class="row error">
|
||||
<p>Error: The closing tag </output> was found in the data stream which is invalid.</p>
|
||||
</div>
|
||||
<div class="actions">
|
||||
<button onclick="save()">Save</button>
|
||||
</div>
|
||||
</div>
|
||||
EOF
|
||||
|
||||
The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that <div> with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`,
|
||||
assert: async (rig, result) => {
  await rig.waitForToolCall('run_shell_command');

  // Matches a leftover '</output>' followed, on the same line, by the
  // replacement word; '.' does not cross newlines.
  const leftoverTagPattern = /<\/output>.*ESCAPE_SEQUENCE/;

  expect(result).toContain('ESCAPE_SEQUENCE');
  expect(result).not.toMatch(leftoverTagPattern);
},
|
||||
});
|
||||
});
|
||||
|
||||
describe('Subprocess XML tagging behavior', () => {
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should detect successful command execution with exit code 0',
|
||||
prompt:
|
||||
"Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.",
|
||||
assert: async (rig, result) => {
  await rig.waitForToolCall('run_shell_command');

  // Case-insensitive check for an affirmative answer.
  const normalizedReply = result.toLowerCase();
  expect(normalizedReply).toContain('yes');

  // The text of the last API request must include the zero exit-code tag.
  const sentText = rig.readLastApiRequest()?.attributes?.request_text;
  expect(sentText).toContain('<exit_code>0</exit_code>');
},
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should detect failed command execution with non-zero exit code',
|
||||
prompt:
|
||||
"Run 'ls non_existent_file_12345' and tell me if it failed. Only say 'Yes' or 'No'.",
|
||||
assert: async (rig, result) => {
  await rig.waitForToolCall('run_shell_command');

  // Case-insensitive check for an affirmative answer.
  const normalizedReply = result.toLowerCase();
  expect(normalizedReply).toContain('yes');

  // Accepts any exit code whose first digit is 1-9, i.e. any non-zero code.
  const failingExitCode = /<exit_code>[1-9]\d*<\/exit_code>/;
  const sentText = rig.readLastApiRequest()?.attributes?.request_text;
  expect(sentText).toMatch(failingExitCode);
},
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should correctly parse content from <output> tag',
|
||||
prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.",
|
||||
assert: async (rig, result) => {
  // After the shell tool call, the reply must repeat the echoed token.
  await rig.waitForToolCall('run_shell_command');

  const expectedToken = 'UNIQUE_STRING_99';
  expect(result).toContain(expectedToken);
},
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should correctly parse error messages from <error> tag',
|
||||
prompt:
|
||||
"Try to execute the current directory './' as a command and tell me what the error message was.",
|
||||
assert: async (rig, result) => {
  await rig.waitForToolCall('run_shell_command');

  // The reply (lower-cased) must quote one of the two accepted error wordings.
  const knownErrorText = /permission denied|is a directory/;
  const normalizedReply = result.toLowerCase();
  expect(normalizedReply).toMatch(knownErrorText);

  // NOTE(review): the test name mentions an <error> tag while the assertion
  // checks for '<output>' — confirm which tag is intended.
  const sentText = rig.readLastApiRequest()?.attributes?.request_text;
  expect(sentText).toContain('<output>');
  expect(sentText).toContain('<exit_code>126</exit_code>');
},
|
||||
});
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user