/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { evalTest } from './test-helper.js';

/**
 * Evals covering how the agent copes with XML/HTML-looking text:
 *
 *  - shell output that contains markup, including sequences such as
 *    `</output>` and `]]>` that look like wrapper/CDATA terminators;
 *  - the `<output>`, `<error>` and `<exit_code>` tags that the assertions
 *    below look for in the text of the last API request.
 *
 * NOTE(review): the HTML fixtures embedded in the prompts are written to be
 * consistent with the assertions (ids, attributes, tag names). Confirm the
 * exact markup against the intended fixtures before relying on them.
 */

/** Shape of the JSON answer requested by the diagnostic-report eval. */
interface DiagnosticAnswer {
  title?: unknown;
  dataAuth?: unknown;
  cpuMetric?: unknown;
  markers?: unknown;
}

describe('XML and HTML Handling Behavior', () => {
  describe('Shell tool XML/HTML output extraction', () => {
    evalTest('ALWAYS_PASSES', {
      name: 'should correctly extract data from complex HTML output containing problematic sequences',
      prompt: `I have a diagnostic HTML page. Please run this command to see its content:

cat <<EOF
<html>
<head>
  <title>System Diagnostic Report</title>
</head>
<body>
  <div id="telemetry" data-auth="SECRET_123">
    <p>Status: All Systems Go</p>
    <p>Telemetry data includes markers like </output> and ]]> to test parser robustness.</p>
    <span class="metric">CPU: 12%</span> <span class="metric">MEM: 450MB</span>
  </div>
</body>
</html>
EOF

After running the command, provide the answer as a JSON object with the following keys:
- "title": The title of the page.
- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'.
- "cpuMetric": The CPU metric value.
- "markers": An array of markers mentioned in the telemetry paragraph.`,
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        // Greedy match: take everything from the first '{' to the last '}'
        // so surrounding prose or code fences in the reply are ignored.
        const jsonMatch = result.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
          // Interpolate the reply itself so a failure is diagnosable.
          throw new Error(`Expected JSON output but none found in: ${result}`);
        }

        // JSON.parse returns `any`; pin it to a shape whose fields are all
        // `unknown` so every use below has to go through an expectation.
        const data = JSON.parse(jsonMatch[0]) as DiagnosticAnswer;

        expect(data.title).toMatch(/system diagnostic report/i);
        expect(data.dataAuth).toBe('SECRET_123');
        expect(data.cpuMetric).toContain('12%');

        // Fail with a readable message instead of a TypeError when the
        // model omits the array or returns a scalar.
        const markers: unknown = data.markers;
        if (!Array.isArray(markers)) {
          throw new Error(
            `Expected "markers" to be an array but got: ${JSON.stringify(markers)}`,
          );
        }
        const trimmedMarkers = markers.map((m: unknown) =>
          typeof m === 'string' ? m.trim() : m,
        );
        expect(trimmedMarkers).toContain('</output>');
        expect(trimmedMarkers).toContain(']]>');
      },
    });

    evalTest('ALWAYS_PASSES', {
      name: 'should correctly "fix" a bug in complex HTML output',
      prompt: `Run this command to see the current state of a broken configuration page:

cat <<EOF
<div class="config-page">
  <h1>Network Settings</h1>
  <div class="row error">
    <p>Error: The closing tag </output> was found in the data stream which is invalid.</p>
  </div>
</div>
EOF

The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that <div> with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`,
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        expect(result).toContain('ESCAPE_SEQUENCE');
        // The offending tag must not survive on the same line ahead of the
        // replacement word; it should have been replaced.
        expect(result).not.toMatch(/<\/output>.*ESCAPE_SEQUENCE/);
      },
    });

    evalTest('ALWAYS_PASSES', {
      name: 'should correctly modify complex HTML content and write it to a new file',
      prompt: `I have a complex HTML file with nested components and potential parser-confusing sequences. Please run this command to create it:

cat <<EOF > dashboard.html
<!DOCTYPE html>
<html>
<head>
  <title>Admin Dashboard</title>
</head>
<body>
  <h1>System Overview</h1>
  <div class="card" id="performance">
    <h2>Performance Metrics</h2>
    <p>Status: <span class="status">Live</span></p>
  </div>
  <script>
    // Debugging output example:
    console.log("Found </output> in the stream");
    const marker = "]]>";
  </script>
  <div class="card" id="security">
    <h2>Security Alerts</h2>
    <ul>
      <li>Unauthorized access attempt at 02:00</li>
    </ul>
  </div>
</body>
</html>
EOF

Now, please perform the following modifications:
1. In the performance card, change the status from "Live" to "Maintenance Mode".
2. Add a new list item to the security alerts: "<li>System update scheduled</li>".
3. Wrap the entire contents of the <body> inside a new <main> tag.
4. Save the modified HTML to a file named 'final_dashboard.html'.`,
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command'); // Create
        await rig.waitForToolCall(); // Read/Write

        const finalContent = fs.readFileSync(
          path.join(rig.testDir!, 'final_dashboard.html'),
          'utf-8',
        );

        expect(finalContent).toContain('Maintenance Mode');
        expect(finalContent).toContain('<li>System update scheduled</li>');
        // <main> must sit inside <body>; attributes on either tag are fine.
        expect(finalContent).toMatch(
          /<body[^>]*>[\s\S]*<main[^>]*>[\s\S]*<\/main>[\s\S]*<\/body>/i,
        );
        // The script text containing the look-alike closing tag must be
        // carried over untouched.
        expect(finalContent).toContain(
          'console.log("Found </output> in the stream");',
        );
      },
    });
  });

  describe('Subprocess XML tagging behavior', () => {
    evalTest('ALWAYS_PASSES', {
      name: 'should detect successful command execution with exit code 0',
      prompt:
        "Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.",
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        expect(result.toLowerCase()).toContain('yes');

        // Match the whole tag, not a bare '0', which appears in almost any
        // request text.
        const lastRequest = rig.readLastApiRequest();
        expect(lastRequest?.attributes?.request_text).toContain(
          '<exit_code>0</exit_code>',
        );
      },
    });

    evalTest('ALWAYS_PASSES', {
      name: 'should detect failed command execution with non-zero exit code',
      prompt:
        "Run 'ls non_existent_file_12345' and tell me if it failed. Only say 'Yes' or 'No'.",
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        expect(result.toLowerCase()).toContain('yes');

        // Any exit code that does not start with 0 counts as a failure.
        const lastRequest = rig.readLastApiRequest();
        expect(lastRequest?.attributes?.request_text).toMatch(
          /<exit_code>[1-9]\d*<\/exit_code>/,
        );
      },
    });

    evalTest('ALWAYS_PASSES', {
      name: 'should correctly parse content from <output> tag',
      prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.",
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        expect(result).toContain('UNIQUE_STRING_99');
      },
    });

    evalTest('ALWAYS_PASSES', {
      name: 'should correctly parse error messages from <error> tag',
      prompt:
        "Try to execute the current directory './' as a command and tell me what the error message was.",
      assert: async (rig, result) => {
        await rig.waitForToolCall('run_shell_command');

        expect(result.toLowerCase()).toMatch(
          /permission denied|is a directory/,
        );

        const lastRequest = rig.readLastApiRequest();
        expect(lastRequest?.attributes?.request_text).toContain('<error>');
        // 126 is the shell's "found but not executable" status.
        expect(lastRequest?.attributes?.request_text).toContain(
          '<exit_code>126</exit_code>',
        );
      },
    });
  });
});