evals/xml-handling.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { evalTest } from './test-helper.js';

describe('XML and HTML Handling Behavior', () => {
  describe('Shell tool XML/HTML output extraction', () => {
    // Extraction eval: the agent is asked to `cat` an HTML document whose text
    // contains the sequences `</output>` and `]]>`, then report four fields as a
    // JSON object. The assertion pulls the JSON out of the free-form reply and
    // checks each field.
    evalTest('ALWAYS_PASSES', {
      name: 'should correctly extract data from complex HTML output containing problematic sequences',
      prompt: `I have a diagnostic HTML page. Please run this command to see its content:
cat <<EOF
<!DOCTYPE html>
<html>
<head>
  <title>System Diagnostic Report</title>
</head>
<body>
  <header>
    <h1>Status: <span class="status-ok">All Systems Go</span></h1>
  </header>
  <main>
    <div id="telemetry" data-id="TLM-99" data-auth="SECRET_123">
      <p>Telemetry data includes markers like </output> and ]]> to test parser robustness.</p>
      <div class="metrics">
        <span class="metric">CPU: 12%</span>
        <span class="metric">MEM: 450MB</span>
      </div>
    </div>
  </main>
  <footer>
    <p>Report generated by <a href="/internal/tools">Internal Admin</a></p>
  </footer>
</body>
</html>
EOF

After running the command, provide the answer as a JSON object with the following keys:
- "title": The title of the page.
- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'.
- "cpuMetric": The CPU metric value.
- "markers": An array of markers mentioned in the telemetry paragraph.`,
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        // Greedy match from the first `{` to the last `}` so that any prose or
        // code fences around the JSON object are ignored.
        const jsonMatch = result.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
          // Interpolate the actual reply so a failure shows what the model said.
          throw new Error(`Expected JSON output but none found in: ${result}`);
        }
        const data = JSON.parse(jsonMatch[0]);

        expect(data.title).toMatch(/system diagnostic report/i);
        expect(data.dataAuth).toBe('SECRET_123');
        expect(data.cpuMetric).toContain('12%');

        // Fail with a readable assertion (not a TypeError from `.map`) when the
        // model returns something other than an array for "markers".
        expect(Array.isArray(data.markers)).toBe(true);
        // Models sometimes pad the markers with whitespace; compare trimmed.
        const trimmedMarkers = data.markers.map((m: string) => m.trim());
        expect(trimmedMarkers).toContain('</output>');
        expect(trimmedMarkers).toContain(']]>');
      },
    });

    // Replacement eval: the agent prints an HTML fragment whose error row
    // mentions `</output>` and must answer with that row rewritten so the tag
    // name is replaced by the word ESCAPE_SEQUENCE.
    evalTest('ALWAYS_PASSES', {
      name: 'should correctly "fix" a bug in complex HTML output',
      prompt: `Run this command to see the current state of a broken configuration page:
cat <<EOF
<div class="config-panel">
  <h3>Network Settings</h3>
  <div class="row">
    <label>IP Address:</label>
    <input type="text" value="192.168.1.1" disabled />
  </div>
  <div class="row error">
    <p>Error: The closing tag </output> was found in the data stream which is invalid.</p>
  </div>
  <div class="actions">
    <button onclick="save()">Save</button>
  </div>
</div>
EOF

The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that <div> with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`,
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        const replacementWord = 'ESCAPE_SEQUENCE';
        // Should have replaced it: reject a reply where the original closing
        // tag still precedes the replacement word.
        // NOTE(review): `.` does not match newlines, so this only catches the
        // two appearing on the same line.
        const staleTagBeforeReplacement = /<\/output>.*ESCAPE_SEQUENCE/;

        expect(result).toContain(replacementWord);
        expect(result).not.toMatch(staleTagBeforeReplacement);
      },
    });

    // File-modification eval: the agent creates dashboard.html via a heredoc,
    // applies three edits, and saves the result as final_dashboard.html. The
    // assertion reads that file from the rig's test directory and checks that
    // the edits landed while the parser-confusing sequences survived intact.
    evalTest('ALWAYS_PASSES', {
      name: 'should correctly modify complex HTML content and write it to a new file',
      prompt: `I have a complex HTML file with nested components and potential parser-confusing sequences. Please run this command to create it:
cat <<EOF > dashboard.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Admin Dashboard</title>
</head>
<body>
    <div class="container">
        <header>
            <h1>System Overview</h1>
        </header>
        <section id="stats">
            <div class="card" data-type="performance">
                <h2>Performance Metrics</h2>
                <p>Status: <span class="status">Live</span></p>
                <div class="code-block">
                    <code>
                        // Debugging output example:
                        console.log("Found </output> in the stream");
                        const marker = "]]>";
                    </code>
                </div>
            </div>
            <div class="card" data-type="security">
                <h2>Security Alerts</h2>
                <ul id="alert-list">
                    <li class="high-priority">Unauthorized access attempt at 02:00</li>
                </ul>
            </div>
        </section>
    </div>
</body>
</html>
EOF

Now, please perform the following modifications:
1. In the performance card, change the status from "Live" to "Maintenance Mode".
2. Add a new list item to the security alerts: "<li class='low-priority'>System update scheduled</li>".
3. Wrap the entire contents of the <body> inside a new <main> tag.
4. Save the modified HTML to a file named 'final_dashboard.html'.`,
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command'); // Create
        await rig.waitForToolCall(); // Read/Write

        // Guard explicitly instead of using a non-null assertion, so a
        // misconfigured rig fails with a clear message.
        if (!rig.testDir) {
          throw new Error(
            'Test rig has no testDir; cannot locate final_dashboard.html',
          );
        }
        const finalPath = path.join(rig.testDir, 'final_dashboard.html');
        // Report a missing output file as an eval failure, not a raw ENOENT.
        if (!fs.existsSync(finalPath)) {
          throw new Error(
            `Expected the agent to create ${finalPath}, but it does not exist`,
          );
        }
        const finalContent = fs.readFileSync(finalPath, 'utf-8');

        // 1. Status text changed.
        expect(finalContent).toContain(
          '<span class="status">Maintenance Mode</span>',
        );
        // 2. New list item added. Accept either quote style for the attribute:
        // the prompt uses single quotes, the rest of the document uses double.
        expect(finalContent).toMatch(
          /<li class=(['"])low-priority\1>System update scheduled<\/li>/,
        );
        // 3. A <main> element sits inside <body>.
        expect(finalContent).toMatch(
          /<body>[\s\S]*<main>[\s\S]*<\/main>[\s\S]*<\/body>/i,
        );
        // Both parser-confusing sequences must survive the edit untouched.
        expect(finalContent).toContain(
          'console.log("Found </output> in the stream");',
        );
        expect(finalContent).toContain('const marker = "]]>";');
      },
    });
  });

  describe('Subprocess XML tagging behavior', () => {
    // Success path: the agent runs a trivially succeeding command and must
    // answer "Yes"; the last API request text must carry a zero exit-code tag.
    evalTest('ALWAYS_PASSES', {
      name: 'should detect successful command execution with exit code 0',
      prompt:
        "Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.",
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        // Case-insensitive check of the model's verdict.
        const normalizedReply = result.toLowerCase();
        expect(normalizedReply).toContain('yes');

        // The exit code is expected to be tagged in the request text.
        const requestText = rig.readLastApiRequest()?.attributes?.request_text;
        expect(requestText).toContain('<exit_code>0</exit_code>');
      },
    });

    // Failure path: listing a file that does not exist must be reported as a
    // failure ("Yes"), and the last API request text must carry a non-zero
    // exit-code tag.
    evalTest('ALWAYS_PASSES', {
      name: 'should detect failed command execution with non-zero exit code',
      prompt:
        "Run 'ls non_existent_file_12345' and tell me if it failed. Only say 'Yes' or 'No'.",
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        // Case-insensitive check of the model's verdict.
        const normalizedReply = result.toLowerCase();
        expect(normalizedReply).toContain('yes');

        // Any positive integer without a leading zero counts as a failure code.
        const nonZeroExitCodeTag = /<exit_code>[1-9]\d*<\/exit_code>/;
        const requestText = rig.readLastApiRequest()?.attributes?.request_text;
        expect(requestText).toMatch(nonZeroExitCodeTag);
      },
    });

    // Echo round-trip: the agent echoes a unique token and must repeat it back
    // in its reply.
    evalTest('ALWAYS_PASSES', {
      name: 'should correctly parse content from <output> tag',
      prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.",
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        const echoedToken = 'UNIQUE_STRING_99';
        expect(result).toContain(echoedToken);
      },
    });

    // Error reporting: executing a directory as a command must fail, and the
    // agent must relay the shell's error message.
    // NOTE(review): the test name refers to an <error> tag, but the assertion
    // below looks for <output> in the request text; confirm which is intended.
    evalTest('ALWAYS_PASSES', {
      name: 'should correctly parse error messages from <error> tag',
      prompt:
        "Try to execute the current directory './' as a command and tell me what the error message was.",
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        // Either wording is accepted for the failure.
        const expectedShellError = /permission denied|is a directory/;
        expect(result.toLowerCase()).toMatch(expectedShellError);

        const requestText = rig.readLastApiRequest()?.attributes?.request_text;
        expect(requestText).toContain('<output>');
        // 126 is the conventional shell status for "found but not executable".
        expect(requestText).toContain('<exit_code>126</exit_code>');
      },
    });
  });
});
Merge XML handling evals into a single file 2026-02-18 22:20:01 +00:00			`/**`
			`* @license`
			`* Copyright 2026 Google LLC`
			`* SPDX-License-Identifier: Apache-2.0`
			`*/`

			`import { describe, expect } from 'vitest';`
Add complex HTML modification behavioral test 2026-02-18 22:51:45 +00:00			`import fs from 'node:fs';`
			`import path from 'node:path';`
Merge XML handling evals into a single file 2026-02-18 22:20:01 +00:00			`import { evalTest } from './test-helper.js';`

			`describe('XML and HTML Handling Behavior', () => {`
			`describe('Shell tool XML/HTML output extraction', () => {`
			`evalTest('ALWAYS_PASSES', {`
			`name: 'should correctly extract data from complex HTML output containing problematic sequences',`
			prompt: `I have a diagnostic HTML page. Please run this command to see its content:
			`cat <<EOF`
			`<!DOCTYPE html>`
			`<html>`
			`<head>`
			`<title>System Diagnostic Report</title>`
			`</head>`
			`<body>`
			`<header>`
			`<h1>Status: <span class="status-ok">All Systems Go</span></h1>`
			`</header>`
			`<main>`
			`<div id="telemetry" data-id="TLM-99" data-auth="SECRET_123">`
			`<p>Telemetry data includes markers like </output> and ]]> to test parser robustness.</p>`
			`<div class="metrics">`
			`<span class="metric">CPU: 12%</span>`
			`<span class="metric">MEM: 450MB</span>`
			`</div>`
			`</div>`
			`</main>`
			`<footer>`
			`<p>Report generated by <a href="/internal/tools">Internal Admin</a></p>`
			`</footer>`
			`</body>`
			`</html>`
			`EOF`

			`After running the command, provide the answer as a JSON object with the following keys:`
			`- "title": The title of the page.`
			`- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'.`
			`- "cpuMetric": The CPU metric value.`
			- "markers": An array of markers mentioned in the telemetry paragraph.`,
			`assert: async (rig, result) => {`
			`await rig.waitForToolCall('run_shell_command');`
			`const jsonMatch = result.match(/\{[\s\S]*\}/);`
			`if (!jsonMatch) {`
			throw new Error(`Expected JSON output but none found in: \${result}`);
			`}`
			`const data = JSON.parse(jsonMatch[0]);`
			`expect(data.title).toMatch(/system diagnostic report/i);`
			`expect(data.dataAuth).toBe('SECRET_123');`
			`expect(data.cpuMetric).toContain('12%');`
			`const trimmedMarkers = data.markers.map((m: string) => m.trim());`
			`expect(trimmedMarkers).toContain('</output>');`
			`expect(trimmedMarkers).toContain(']]>');`
			`},`
			`});`

			`evalTest('ALWAYS_PASSES', {`
			`name: 'should correctly "fix" a bug in complex HTML output',`
			prompt: `Run this command to see the current state of a broken configuration page:
			`cat <<EOF`
			`<div class="config-panel">`
			`<h3>Network Settings</h3>`
			`<div class="row">`
			`<label>IP Address:</label>`
			`<input type="text" value="192.168.1.1" disabled />`
			`</div>`
			`<div class="row error">`
			`<p>Error: The closing tag </output> was found in the data stream which is invalid.</p>`
			`</div>`
			`<div class="actions">`
			`<button onclick="save()">Save</button>`
			`</div>`
			`</div>`
			`EOF`

			The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that <div> with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`,
			`assert: async (rig, result) => {`
			`await rig.waitForToolCall('run_shell_command');`
			`expect(result).toContain('ESCAPE_SEQUENCE');`
			`expect(result).not.toMatch(/<\/output>.*ESCAPE_SEQUENCE/); // Should have replaced it`
			`},`
			`});`
Add complex HTML modification behavioral test 2026-02-18 22:51:45 +00:00
			`evalTest('ALWAYS_PASSES', {`
			`name: 'should correctly modify complex HTML content and write it to a new file',`
			prompt: `I have a complex HTML file with nested components and potential parser-confusing sequences. Please run this command to create it:
			`cat <<EOF > dashboard.html`
			`<!DOCTYPE html>`
			`<html lang="en">`
			`<head>`
			`<meta charset="UTF-8">`
			`<title>Admin Dashboard</title>`
			`</head>`
			`<body>`
			`<div class="container">`
			`<header>`
			`<h1>System Overview</h1>`
			`</header>`
			`<section id="stats">`
			`<div class="card" data-type="performance">`
			`<h2>Performance Metrics</h2>`
			`<p>Status: <span class="status">Live</span></p>`
			`<div class="code-block">`
			`<code>`
			`// Debugging output example:`
			`console.log("Found </output> in the stream");`
			`const marker = "]]>";`
			`</code>`
			`</div>`
			`</div>`
			`<div class="card" data-type="security">`
			`<h2>Security Alerts</h2>`
			`<ul id="alert-list">`
			`<li class="high-priority">Unauthorized access attempt at 02:00</li>`
			`</ul>`
			`</div>`
			`</section>`
			`</div>`
			`</body>`
			`</html>`
			`EOF`

			`Now, please perform the following modifications:`
			`1. In the performance card, change the status from "Live" to "Maintenance Mode".`
			`2. Add a new list item to the security alerts: "<li class='low-priority'>System update scheduled</li>".`
			`3. Wrap the entire contents of the <body> inside a new <main> tag.`
			4. Save the modified HTML to a file named 'final_dashboard.html'.`,
			`assert: async (rig, result) => {`
			`await rig.waitForToolCall('run_shell_command'); // Create`
			`await rig.waitForToolCall(); // Read/Write`

			`const finalContent = fs.readFileSync(`
			`path.join(rig.testDir!, 'final_dashboard.html'),`
			`'utf-8',`
			`);`
			`expect(finalContent).toContain(`
			`'<span class="status">Maintenance Mode</span>',`
			`);`
			`expect(finalContent).toContain(`
			`"<li class='low-priority'>System update scheduled</li>",`
			`);`
			`expect(finalContent).toMatch(`
			`/<body>[\s\S]*<main>[\s\S]*<\/main>[\s\S]*<\/body>/i,`
			`);`
			`expect(finalContent).toContain(`
			`'console.log("Found </output> in the stream");',`
			`);`
			`},`
			`});`
Merge XML handling evals into a single file 2026-02-18 22:20:01 +00:00			`});`

			`describe('Subprocess XML tagging behavior', () => {`
			`evalTest('ALWAYS_PASSES', {`
			`name: 'should detect successful command execution with exit code 0',`
			`prompt:`
			`"Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.",`
			`assert: async (rig, result) => {`
			`await rig.waitForToolCall('run_shell_command');`
			`expect(result.toLowerCase()).toContain('yes');`

			`const lastRequest = rig.readLastApiRequest();`
			`expect(lastRequest?.attributes?.request_text).toContain(`
			`'<exit_code>0</exit_code>',`
			`);`
			`},`
			`});`

			`evalTest('ALWAYS_PASSES', {`
			`name: 'should detect failed command execution with non-zero exit code',`
			`prompt:`
			`"Run 'ls non_existent_file_12345' and tell me if it failed. Only say 'Yes' or 'No'.",`
			`assert: async (rig, result) => {`
			`await rig.waitForToolCall('run_shell_command');`
			`expect(result.toLowerCase()).toContain('yes');`

			`const lastRequest = rig.readLastApiRequest();`
			`expect(lastRequest?.attributes?.request_text).toMatch(`
			`/<exit_code>[1-9]\d*<\/exit_code>/,`
			`);`
			`},`
			`});`

			`evalTest('ALWAYS_PASSES', {`
			`name: 'should correctly parse content from <output> tag',`
			`prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.",`
			`assert: async (rig, result) => {`
			`await rig.waitForToolCall('run_shell_command');`
			`expect(result).toContain('UNIQUE_STRING_99');`
			`},`
			`});`

			`evalTest('ALWAYS_PASSES', {`
			`name: 'should correctly parse error messages from <error> tag',`
			`prompt:`
			`"Try to execute the current directory './' as a command and tell me what the error message was.",`
			`assert: async (rig, result) => {`
			`await rig.waitForToolCall('run_shell_command');`
			`expect(result.toLowerCase()).toMatch(`
			`/permission denied|is a directory/,`
			`);`

			`const lastRequest = rig.readLastApiRequest();`
			`expect(lastRequest?.attributes?.request_text).toContain('<output>');`
			`expect(lastRequest?.attributes?.request_text).toContain(`
			`'<exit_code>126</exit_code>',`
			`);`
			`},`
			`});`
			`});`
			`});`