From e040f9caeff36c5c637a68bb6b879e030efbdf56 Mon Sep 17 00:00:00 2001 From: Aishanee Shah Date: Wed, 18 Feb 2026 22:51:45 +0000 Subject: [PATCH] Add complex HTML modification behavioral test --- evals/xml-handling.eval.ts | 69 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/evals/xml-handling.eval.ts b/evals/xml-handling.eval.ts index d49979a888..e72caabc8e 100644 --- a/evals/xml-handling.eval.ts +++ b/evals/xml-handling.eval.ts @@ -5,6 +5,8 @@ */ import { describe, expect } from 'vitest'; +import fs from 'node:fs'; +import path from 'node:path'; import { evalTest } from './test-helper.js'; describe('XML and HTML Handling Behavior', () => { @@ -85,6 +87,73 @@ The error message mentions a specific tag that shouldn't be there. Please provid expect(result).not.toMatch(/<\/output>.*ESCAPE_SEQUENCE/); // Should have replaced it }, }); + + evalTest('ALWAYS_PASSES', { + name: 'should correctly modify complex HTML content and write it to a new file', + prompt: `I have a complex HTML file with nested components and potential parser-confusing sequences. Please run this command to create it: +cat < dashboard.html + + + + + Admin Dashboard + + +
+
+

System Overview

+
+
+
+

Performance Metrics

+

Status: Live

+
+ + // Debugging output example: + console.log("Found in the stream"); + const marker = "]]>"; + +
+
+
+

Security Alerts

+
    +
  • Unauthorized access attempt at 02:00
  • +
+
+
+
+ + +EOF + +Now, please perform the following modifications: +1. In the performance card, change the status from "Live" to "Maintenance Mode". +2. Add a new list item to the security alerts: "
  • System update scheduled
  • ". +3. Wrap the entire contents of the inside a new
    tag. +4. Save the modified HTML to a file named 'final_dashboard.html'.`, + assert: async (rig, result) => { + await rig.waitForToolCall('run_shell_command'); // Create + await rig.waitForToolCall(); // Read/Write + + const finalContent = fs.readFileSync( + path.join(rig.testDir!, 'final_dashboard.html'), + 'utf-8', + ); + expect(finalContent).toContain( + 'Maintenance Mode', + ); + expect(finalContent).toContain( + "
  • System update scheduled
  • ", + ); + expect(finalContent).toMatch( + /[\s\S]*
    [\s\S]*<\/main>[\s\S]*<\/body>/i, + ); + expect(finalContent).toContain( + 'console.log("Found in the stream");', + ); + }, + }); }); describe('Subprocess XML tagging behavior', () => {