2026-02-18 22:20:01 +00:00
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
import { describe , expect } from 'vitest' ;
2026-02-18 22:51:45 +00:00
import fs from 'node:fs' ;
import path from 'node:path' ;
2026-02-18 22:20:01 +00:00
import { evalTest } from './test-helper.js' ;
describe ( 'XML and HTML Handling Behavior' , ( ) = > {
describe ( 'Shell tool XML/HTML output extraction' , ( ) = > {
// Eval: the agent is asked to run a heredoc `cat` that prints an HTML page
// whose text embeds the sequences `</output>` and `]]>`, then to report four
// fields extracted from that page as a JSON object.
//
// The markup inside the prompt must spell the tags, the `12%` metric and both
// markers exactly as the assertions below expect them. The heredoc body is
// kept flush-left so the terminating `EOF` line starts at column 0.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly extract data from complex HTML output containing problematic sequences',
  prompt: `I have a diagnostic HTML page. Please run this command to see its content:
cat << EOF
<!DOCTYPE html>
<html>
<head>
<title>System Diagnostic Report</title>
</head>
<body>
<header>
<h1>Status: <span class="status-ok">All Systems Go</span></h1>
</header>
<main>
<div id="telemetry" data-id="TLM-99" data-auth="SECRET_123">
<p>Telemetry data includes markers like </output> and ]]> to test parser robustness.</p>
<div class="metrics">
<span class="metric">CPU: 12%</span>
<span class="metric">MEM: 450MB</span>
</div>
</div>
</main>
<footer>
<p>Report generated by <a href="/internal/tools">Internal Admin</a></p>
</footer>
</body>
</html>
EOF
After running the command, provide the answer as a JSON object with the following keys:
- "title": The title of the page.
- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'.
- "cpuMetric": The CPU metric value.
- "markers": An array of markers mentioned in the telemetry paragraph.`,
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    // Greedy match: spans from the first `{` to the last `}` in the reply.
    const jsonMatch = result.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      // `${result}` is interpolated (not escaped) so the failure message
      // shows the reply that contained no JSON.
      throw new Error(`Expected JSON output but none found in: ${result}`);
    }

    const data = JSON.parse(jsonMatch[0]);
    expect(data.title).toMatch(/system diagnostic report/i);
    expect(data.dataAuth).toBe('SECRET_123');
    expect(data.cpuMetric).toContain('12%');

    // Surrounding whitespace in a reported marker is tolerated; the marker
    // text itself must be reproduced verbatim.
    const trimmedMarkers = data.markers.map((m: string) => m.trim());
    expect(trimmedMarkers).toContain('</output>');
    expect(trimmedMarkers).toContain(']]>');
  },
});
// Eval: the agent is asked to run a heredoc `cat` that prints a configuration
// panel whose error row mentions a stray `</output>` closing tag, then to
// return that row with the tag name replaced by the word ESCAPE_SEQUENCE.
//
// The prompt must contain the literal `</output>` and the unpadded names
// 'row error' / 'ESCAPE_SEQUENCE' so that it agrees with the assertions. The
// heredoc body is kept flush-left so the terminating `EOF` line starts at
// column 0.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly "fix" a bug in complex HTML output',
  prompt: `Run this command to see the current state of a broken configuration page:
cat << EOF
<div class="config-panel">
<h3>Network Settings</h3>
<div class="row">
<label>IP Address:</label>
<input type="text" value="192.168.1.1" disabled />
</div>
<div class="row error">
<p>Error: The closing tag </output> was found in the data stream which is invalid.</p>
</div>
<div class="actions">
<button onclick="save()">Save</button>
</div>
</div>
EOF
The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that <div> with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`,
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    expect(result).toContain('ESCAPE_SEQUENCE');
    // Should have replaced it: fails when `</output>` still appears before
    // ESCAPE_SEQUENCE on the same line (`.` does not cross newlines).
    expect(result).not.toMatch(/<\/output>.*ESCAPE_SEQUENCE/);
  },
});
2026-02-18 22:51:45 +00:00
// Eval: the agent is asked to create `dashboard.html` from a heredoc, apply
// three edits (status text, an extra alert item, a <main> wrapper around the
// body contents) and save the result as `final_dashboard.html`. The saved file
// is then read back from the rig's test directory and checked.
//
// The markup inside the prompt must match the strings asserted below (for
// example `<span class="status">` and the `console.log(...)` line). The
// heredoc body is kept flush-left so the terminating `EOF` line starts at
// column 0.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly modify complex HTML content and write it to a new file',
  prompt: `I have a complex HTML file with nested components and potential parser-confusing sequences. Please run this command to create it:
cat << EOF > dashboard.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Admin Dashboard</title>
</head>
<body>
<div class="container">
<header>
<h1>System Overview</h1>
</header>
<section id="stats">
<div class="card" data-type="performance">
<h2>Performance Metrics</h2>
<p>Status: <span class="status">Live</span></p>
<div class="code-block">
<code>
// Debugging output example:
console.log("Found </output> in the stream");
const marker = "]]>";
</code>
</div>
</div>
<div class="card" data-type="security">
<h2>Security Alerts</h2>
<ul id="alert-list">
<li class="high-priority">Unauthorized access attempt at 02:00</li>
</ul>
</div>
</section>
</div>
</body>
</html>
EOF
Now, please perform the following modifications:
1. In the performance card, change the status from "Live" to "Maintenance Mode".
2. Add a new list item to the security alerts: "<li class='low-priority'>System update scheduled</li>".
3. Wrap the entire contents of the <body> inside a new <main> tag.
4. Save the modified HTML to a file named 'final_dashboard.html'.`,
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command'); // Create
    await rig.waitForToolCall(); // Read/Write

    // Fail with a clear message instead of asserting non-null on the
    // directory and letting path.join throw on a missing value.
    const testDir = rig.testDir;
    if (!testDir) {
      throw new Error(
        'rig.testDir is not set; cannot locate final_dashboard.html',
      );
    }

    const finalContent = fs.readFileSync(
      path.join(testDir, 'final_dashboard.html'),
      'utf-8',
    );

    // Modification 1: status text changed.
    expect(finalContent).toContain(
      '<span class="status">Maintenance Mode</span>',
    );
    // Modification 2: new alert item added verbatim.
    expect(finalContent).toContain(
      "<li class='low-priority'>System update scheduled</li>",
    );
    // Modification 3: a <main> element sits between <body> and </body>.
    expect(finalContent).toMatch(
      /<body>[\s\S]*<main>[\s\S]*<\/main>[\s\S]*<\/body>/i,
    );
    // The code sample containing `</output>` must be carried over unchanged.
    expect(finalContent).toContain(
      'console.log("Found </output> in the stream");',
    );
  },
});
2026-02-18 22:20:01 +00:00
} ) ;
describe ( 'Subprocess XML tagging behavior' , ( ) = > {
// Eval: the agent runs `echo Hello` and must answer "Yes". The text of the
// request returned by rig.readLastApiRequest() must also contain the
// `<exit_code>0</exit_code>` element.
evalTest('ALWAYS_PASSES', {
  name: 'should detect successful command execution with exit code 0',
  prompt:
    "Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.",
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    // Case-insensitive check of the agent's verdict.
    const verdict = result.toLowerCase();
    expect(verdict).toContain('yes');

    const requestText = rig.readLastApiRequest()?.attributes?.request_text;
    expect(requestText).toContain('<exit_code>0</exit_code>');
  },
});
// Eval: the agent lists a file that does not exist and must answer "Yes" to
// "did it fail". The text of the request returned by rig.readLastApiRequest()
// must also contain an `<exit_code>` element holding a positive integer with
// no leading zero.
evalTest('ALWAYS_PASSES', {
  name: 'should detect failed command execution with non-zero exit code',
  prompt:
    "Run 'ls non_existent_file_12345' and tell me if it failed. Only say 'Yes' or 'No'.",
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    // Case-insensitive check of the agent's verdict.
    const verdict = result.toLowerCase();
    expect(verdict).toContain('yes');

    const requestText = rig.readLastApiRequest()?.attributes?.request_text;
    expect(requestText).toMatch(/<exit_code>[1-9]\d*<\/exit_code>/);
  },
});
// Eval: the agent echoes a unique token and must repeat that token in its
// reply. Only the reply text is inspected here.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly parse content from <output> tag',
  prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.",
  assert: async (rig, result) => {
    // Wait for the shell tool call before checking the reply.
    await rig.waitForToolCall('run_shell_command');

    expect(result).toContain('UNIQUE_STRING_99');
  },
});
// Eval: the agent tries to execute the directory `./` as a command and must
// relay the resulting error ("permission denied" or "is a directory",
// case-insensitive). The text of the request returned by
// rig.readLastApiRequest() must contain `<output>` and
// `<exit_code>126</exit_code>`.
// NOTE(review): the test name mentions an `<error>` tag, but the assertion
// checks for `<output>` — confirm which tag is intended.
evalTest('ALWAYS_PASSES', {
  name: 'should correctly parse error messages from <error> tag',
  prompt:
    "Try to execute the current directory './' as a command and tell me what the error message was.",
  assert: async (rig, result) => {
    await rig.waitForToolCall('run_shell_command');

    const reply = result.toLowerCase();
    expect(reply).toMatch(/permission denied|is a directory/);

    const lastRequest = rig.readLastApiRequest();
    const firstRead = lastRequest?.attributes?.request_text;
    expect(firstRead).toContain('<output>');

    const secondRead = lastRequest?.attributes?.request_text;
    expect(secondRead).toContain('<exit_code>126</exit_code>');
  },
});
} ) ;
} ) ;