Files
gemini-cli/evals/xml-handling.eval.ts
2026-02-18 22:51:45 +00:00

218 lines
7.7 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { evalTest } from './test-helper.js';
describe('XML and HTML Handling Behavior', () => {
describe('Shell tool XML/HTML output extraction', () => {
// Eval: the model is asked to run a heredoc `cat` whose HTML payload embeds the
// literal sequences `</output>` and `]]>`, then to report selected values as a
// JSON object. The assertion parses that JSON out of the reply and checks each
// requested key.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly extract data from complex HTML output containing problematic sequences',
  prompt: `I have a diagnostic HTML page. Please run this command to see its content:
cat <<EOF
<!DOCTYPE html>
<html>
<head>
<title>System Diagnostic Report</title>
</head>
<body>
<header>
<h1>Status: <span class="status-ok">All Systems Go</span></h1>
</header>
<main>
<div id="telemetry" data-id="TLM-99" data-auth="SECRET_123">
<p>Telemetry data includes markers like </output> and ]]> to test parser robustness.</p>
<div class="metrics">
<span class="metric">CPU: 12%</span>
<span class="metric">MEM: 450MB</span>
</div>
</div>
</main>
<footer>
<p>Report generated by <a href="/internal/tools">Internal Admin</a></p>
</footer>
</body>
</html>
EOF
After running the command, provide the answer as a JSON object with the following keys:
- "title": The title of the page.
- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'.
- "cpuMetric": The CPU metric value.
- "markers": An array of markers mentioned in the telemetry paragraph.`,
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    // Greedy match: spans from the first `{` to the last `}` in the reply.
    const jsonMatch = result.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      // Interpolate the actual reply so a failure shows what the model said
      // (an escaped `\${...}` would print the placeholder text verbatim).
      throw new Error(`Expected JSON output but none found in: ${result}`);
    }

    const data = JSON.parse(jsonMatch[0]);
    expect(data.title).toMatch(/system diagnostic report/i);
    expect(data.dataAuth).toBe('SECRET_123');
    expect(data.cpuMetric).toContain('12%');

    // Fail with a readable assertion (rather than a TypeError from `.map`)
    // when the reply does not carry `markers` as an array.
    expect(Array.isArray(data.markers)).toBe(true);
    // Surrounding whitespace in a marker entry is tolerated.
    const trimmedMarkers = data.markers.map((m: string) => m.trim());
    expect(trimmedMarkers).toContain('</output>');
    expect(trimmedMarkers).toContain(']]>');
  },
});
// Eval: the model is shown an HTML fragment whose error row mentions the tag
// `</output>` and is asked to return that row with the tag name replaced by
// the word 'ESCAPE_SEQUENCE'.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly "fix" a bug in complex HTML output',
  prompt: `Run this command to see the current state of a broken configuration page:
cat <<EOF
<div class="config-panel">
<h3>Network Settings</h3>
<div class="row">
<label>IP Address:</label>
<input type="text" value="192.168.1.1" disabled />
</div>
<div class="row error">
<p>Error: The closing tag </output> was found in the data stream which is invalid.</p>
</div>
<div class="actions">
<button onclick="save()">Save</button>
</div>
</div>
EOF
The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that <div> with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`,
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    // Matches only when `</output>` appears before ESCAPE_SEQUENCE on the
    // same line (`.` does not cross newlines); used as the "tag was replaced"
    // check below.
    const tagBeforeReplacement = /<\/output>.*ESCAPE_SEQUENCE/;

    expect(result).toContain('ESCAPE_SEQUENCE');
    expect(result).not.toMatch(tagBeforeReplacement);
  },
});
// Eval: the model creates dashboard.html via a heredoc, applies three edits,
// and saves the result as final_dashboard.html. The assertion reads that file
// from the rig's test directory and checks each edit, plus that the code
// sample containing `</output>` is still present.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly modify complex HTML content and write it to a new file',
  prompt: `I have a complex HTML file with nested components and potential parser-confusing sequences. Please run this command to create it:
cat <<EOF > dashboard.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Admin Dashboard</title>
</head>
<body>
<div class="container">
<header>
<h1>System Overview</h1>
</header>
<section id="stats">
<div class="card" data-type="performance">
<h2>Performance Metrics</h2>
<p>Status: <span class="status">Live</span></p>
<div class="code-block">
<code>
// Debugging output example:
console.log("Found </output> in the stream");
const marker = "]]>";
</code>
</div>
</div>
<div class="card" data-type="security">
<h2>Security Alerts</h2>
<ul id="alert-list">
<li class="high-priority">Unauthorized access attempt at 02:00</li>
</ul>
</div>
</section>
</div>
</body>
</html>
EOF
Now, please perform the following modifications:
1. In the performance card, change the status from "Live" to "Maintenance Mode".
2. Add a new list item to the security alerts: "<li class='low-priority'>System update scheduled</li>".
3. Wrap the entire contents of the <body> inside a new <main> tag.
4. Save the modified HTML to a file named 'final_dashboard.html'.`,
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command'); // Create
    await rig.waitForToolCall(); // Read/Write

    // Guard instead of a non-null assertion so a missing test directory
    // fails with an explicit message rather than inside path.join.
    const testDir = rig.testDir;
    if (!testDir) {
      throw new Error(
        'Test rig has no testDir; cannot locate final_dashboard.html',
      );
    }

    const finalContent = fs.readFileSync(
      path.join(testDir, 'final_dashboard.html'),
      'utf-8',
    );

    // Modification 1: status text changed.
    expect(finalContent).toContain(
      '<span class="status">Maintenance Mode</span>',
    );
    // Modification 2: new list item present, quoted exactly as requested.
    expect(finalContent).toContain(
      "<li class='low-priority'>System update scheduled</li>",
    );
    // Modification 3: a <main>...</main> pair sits between <body> and </body>.
    expect(finalContent).toMatch(
      /<body>[\s\S]*<main>[\s\S]*<\/main>[\s\S]*<\/body>/i,
    );
    // The code sample containing `</output>` must be carried over unchanged.
    expect(finalContent).toContain(
      'console.log("Found </output> in the stream");',
    );
  },
});
});
describe('Subprocess XML tagging behavior', () => {
// Eval: after running `echo Hello`, the reply must contain "yes"
// (case-insensitive) and the request_text of the request returned by
// rig.readLastApiRequest() must include `<exit_code>0</exit_code>`.
evalTest('ALWAYS_PASSES', {
  name: 'should detect successful command execution with exit code 0',
  prompt:
    "Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.",
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    const normalizedReply = result.toLowerCase();
    expect(normalizedReply).toContain('yes');

    const requestText = rig.readLastApiRequest()?.attributes?.request_text;
    expect(requestText).toContain('<exit_code>0</exit_code>');
  },
});
// Eval: after running `ls` on a file that should not exist, the reply must
// contain "yes" (case-insensitive) and the request_text of the request
// returned by rig.readLastApiRequest() must carry an <exit_code> element whose
// value is a positive integer without a leading zero.
evalTest('ALWAYS_PASSES', {
  name: 'should detect failed command execution with non-zero exit code',
  prompt:
    "Run 'ls non_existent_file_12345' and tell me if it failed. Only say 'Yes' or 'No'.",
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    const normalizedReply = result.toLowerCase();
    expect(normalizedReply).toContain('yes');

    const nonZeroExitCode = /<exit_code>[1-9]\d*<\/exit_code>/;
    const requestText = rig.readLastApiRequest()?.attributes?.request_text;
    expect(requestText).toMatch(nonZeroExitCode);
  },
});
// Eval: after running `echo UNIQUE_STRING_99`, the reply must repeat the
// echoed string verbatim.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly parse content from <output> tag',
  prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.",
  assert: async (rig, result) => {
    const echoedToken = 'UNIQUE_STRING_99';
    await rig.waitForToolCall('run_shell_command');
    expect(result).toContain(echoedToken);
  },
});
// Eval: the model tries to execute './' as a command. The reply must mention
// either "permission denied" or "is a directory" (case-insensitive), and the
// request_text of the request returned by rig.readLastApiRequest() must
// contain both an `<output>` tag and `<exit_code>126</exit_code>`.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly parse error messages from <error> tag',
  prompt:
    "Try to execute the current directory './' as a command and tell me what the error message was.",
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    const expectedShellError = /permission denied|is a directory/;
    expect(result.toLowerCase()).toMatch(expectedShellError);

    const lastRequest = rig.readLastApiRequest();
    for (const fragment of ['<output>', '<exit_code>126</exit_code>']) {
      expect(lastRequest?.attributes?.request_text).toContain(fragment);
    }
  },
});
});
});