From d8f740709f40bb63216910246fb09125dbb76eac Mon Sep 17 00:00:00 2001
From: Aishanee Shah <aishaneeshah@google.com>
Date: Wed, 18 Feb 2026 22:20:01 +0000
Subject: [PATCH] Merge XML handling evals into a single file

---
 evals/shell-xml-output.eval.ts |  87 -------------------
 evals/subprocess-xml.eval.ts   |  67 ---------------
 evals/xml-handling.eval.ts     | 148 +++++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+), 154 deletions(-)
 delete mode 100644 evals/shell-xml-output.eval.ts
 delete mode 100644 evals/subprocess-xml.eval.ts
 create mode 100644 evals/xml-handling.eval.ts
diff --git a/evals/shell-xml-output.eval.ts b/evals/shell-xml-output.eval.ts
deleted file mode 100644
index 829ccd558a..0000000000
--- a/evals/shell-xml-output.eval.ts
+++ /dev/null
@@ -1,87 +0,0 @@
-/**
- * @license
- * Copyright 2026 Google LLC
- * SPDX-License-Identifier: Apache-2.0
- */
-
-import { describe, expect } from 'vitest';
-import { evalTest } from './test-helper.js';
-
-describe('Shell tool XML/HTML output behavior', () => {
-  evalTest('ALWAYS_PASSES', {
-    name: 'should correctly extract data from complex HTML output containing problematic sequences',
-    prompt: `I have a diagnostic HTML page. Please run this command to see its content:
-cat <<EOF
-<!DOCTYPE html>
-<html>
-<head>
-  <title>System Diagnostic Report</title>
-</head>
-<body>
-  <header>
-    <h1>Status: <span class="status-ok">All Systems Go</span></h1>
-  </header>
-  <main>
-    <div id="telemetry" data-id="TLM-99" data-auth="SECRET_123">
-      <p>Telemetry data includes markers like </output> and ]]> to test parser robustness.</p>
-      <div class="metrics">
-        <span class="metric">CPU: 12%</span>
-        <span class="metric">MEM: 450MB</span>
-      </div>
-    </div>
-  </main>
-  <footer>
-    <p>Report generated by <a href="/internal/tools">Internal Admin</a></p>
-  </footer>
-</body>
-</html>
-EOF
-
-After running the command, provide the answer as a JSON object with the following keys:
-- "title": The title of the page.
-- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'.
-- "cpuMetric": The CPU metric value.
-- "markers": An array of markers mentioned in the telemetry paragraph.`,
-    assert: async (rig, result) => {
-      await rig.waitForToolCall('run_shell_command');
-      const jsonMatch = result.match(/\{[\s\S]*\}/);
-      if (!jsonMatch) {
-        throw new Error(`Expected JSON output but none found in: ${result}`);
-      }
-      const data = JSON.parse(jsonMatch[0]);
-      expect(data.title).toMatch(/system diagnostic report/i);
-      expect(data.dataAuth).toBe('SECRET_123');
-      expect(data.cpuMetric).toContain('12%');
-      const trimmedMarkers = data.markers.map((m: string) => m.trim());
-      expect(trimmedMarkers).toContain('</output>');
-      expect(trimmedMarkers).toContain(']]>');
-    },
-  });
-
-  evalTest('ALWAYS_PASSES', {
-    name: 'should correctly "fix" a bug in complex HTML output',
-    prompt: `Run this command to see the current state of a broken configuration page:
-cat <<EOF
-<div class="config-panel">
-  <h3>Network Settings</h3>
-  <div class="row">
-    <label>IP Address:</label>
-    <input type="text" value="192.168.1.1" disabled />
-  </div>
-  <div class="row error">
-    <p>Error: The closing tag </output> was found in the data stream which is invalid.</p>
-  </div>
-  <div class="actions">
-    <button onclick="save()">Save</button>
-  </div>
-</div>
-EOF
-
-The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that <div> with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`,
-    assert: async (rig, result) => {
-      await rig.waitForToolCall('run_shell_command');
-      expect(result).toContain('ESCAPE_SEQUENCE');
-      expect(result).not.toMatch(/<\/output>.*ESCAPE_SEQUENCE/); // Should have replaced it
-    },
-  });
-});
diff --git a/evals/subprocess-xml.eval.ts b/evals/subprocess-xml.eval.ts
deleted file mode 100644
index 44dfbf3bd6..0000000000
--- a/evals/subprocess-xml.eval.ts
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * @license
- * Copyright 2026 Google LLC
- * SPDX-License-Identifier: Apache-2.0
- */
-
-import { describe, expect } from 'vitest';
-import { evalTest } from './test-helper.js';
-
-describe('Subprocess XML tagging behavior', () => {
-  evalTest('ALWAYS_PASSES', {
-    name: 'should detect successful command execution with exit code 0',
-    prompt:
-      "Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.",
-    assert: async (rig, result) => {
-      await rig.waitForToolCall('run_shell_command');
-      expect(result.toLowerCase()).toContain('yes');
-
-      const lastRequest = rig.readLastApiRequest();
-      expect(lastRequest?.attributes?.request_text).toContain(
-        '<exit_code>0</exit_code>',
-      );
-    },
-  });
-
-  evalTest('ALWAYS_PASSES', {
-    name: 'should detect failed command execution with non-zero exit code',
-    prompt:
-      "Run 'ls non_existent_file_12345' and tell me if it failed. Only say 'Yes' or 'No'.",
-    assert: async (rig, result) => {
-      await rig.waitForToolCall('run_shell_command');
-      expect(result.toLowerCase()).toContain('yes');
-
-      const lastRequest = rig.readLastApiRequest();
-      expect(lastRequest?.attributes?.request_text).toMatch(
-        /<exit_code>[1-9]\d*<\/exit_code>/,
-      );
-    },
-  });
-
-  evalTest('ALWAYS_PASSES', {
-    name: 'should correctly parse content from <output> tag',
-    prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.",
-    assert: async (rig, result) => {
-      await rig.waitForToolCall('run_shell_command');
-      expect(result).toContain('UNIQUE_STRING_99');
-    },
-  });
-
-  evalTest('ALWAYS_PASSES', {
-    name: 'should correctly parse error messages from <error> tag',
-    // We force a process-level error by trying to execute a directory
-    prompt:
-      "Try to execute the current directory './' as a command and tell me what the error message was.",
-    assert: async (rig, result) => {
-      await rig.waitForToolCall('run_shell_command');
-      // The error message usually contains "Permission denied" or "is a directory"
-      expect(result.toLowerCase()).toMatch(/permission denied|is a directory/);
-
-      const lastRequest = rig.readLastApiRequest();
-      expect(lastRequest?.attributes?.request_text).toContain('<output>');
-      expect(lastRequest?.attributes?.request_text).toContain(
-        '<exit_code>126</exit_code>',
-      );
-    },
-  });
-});
diff --git a/evals/xml-handling.eval.ts b/evals/xml-handling.eval.ts
new file mode 100644
index 0000000000..d49979a888
--- /dev/null
+++ b/evals/xml-handling.eval.ts
@@ -0,0 +1,148 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('XML and HTML Handling Behavior', () => {
+  describe('Shell tool XML/HTML output extraction', () => {
+    evalTest('ALWAYS_PASSES', {
+      name: 'should correctly extract data from complex HTML output containing problematic sequences',
+      prompt: `I have a diagnostic HTML page. Please run this command to see its content:
+cat <<EOF
+<!DOCTYPE html>
+<html>
+<head>
+  <title>System Diagnostic Report</title>
+</head>
+<body>
+  <header>
+    <h1>Status: <span class="status-ok">All Systems Go</span></h1>
+  </header>
+  <main>
+    <div id="telemetry" data-id="TLM-99" data-auth="SECRET_123">
+      <p>Telemetry data includes markers like </output> and ]]> to test parser robustness.</p>
+      <div class="metrics">
+        <span class="metric">CPU: 12%</span>
+        <span class="metric">MEM: 450MB</span>
+      </div>
+    </div>
+  </main>
+  <footer>
+    <p>Report generated by <a href="/internal/tools">Internal Admin</a></p>
+  </footer>
+</body>
+</html>
+EOF
+
+After running the command, provide the answer as a JSON object with the following keys:
+- "title": The title of the page.
+- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'.
+- "cpuMetric": The CPU metric value.
+- "markers": An array of markers mentioned in the telemetry paragraph.`,
+      assert: async (rig, result) => {
+        await rig.waitForToolCall('run_shell_command');
+        const jsonMatch = result.match(/\{[\s\S]*\}/); // greedy: first '{' through last '}'
+        if (!jsonMatch) {
+          throw new Error(`Expected JSON output but none found in: ${result}`); // interpolates the raw reply for debugging
+        }
+        const data = JSON.parse(jsonMatch[0]);
+        expect(data.title).toMatch(/system diagnostic report/i);
+        expect(data.dataAuth).toBe('SECRET_123');
+        expect(data.cpuMetric).toContain('12%');
+        const trimmedMarkers = data.markers.map((m: string) => m.trim()); // tolerate whitespace around each marker
+        expect(trimmedMarkers).toContain('</output>');
+        expect(trimmedMarkers).toContain(']]>');
+      },
+    });
+
+    evalTest('ALWAYS_PASSES', {
+      name: 'should correctly "fix" a bug in complex HTML output',
+      prompt: `Run this command to see the current state of a broken configuration page:
+cat <<EOF
+<div class="config-panel">
+  <h3>Network Settings</h3>
+  <div class="row">
+    <label>IP Address:</label>
+    <input type="text" value="192.168.1.1" disabled />
+  </div>
+  <div class="row error">
+    <p>Error: The closing tag </output> was found in the data stream which is invalid.</p>
+  </div>
+  <div class="actions">
+    <button onclick="save()">Save</button>
+  </div>
+</div>
+EOF
+
+The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that <div> with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`,
+      assert: async (rig, result) => {
+        await rig.waitForToolCall('run_shell_command');
+        expect(result).toContain('ESCAPE_SEQUENCE'); // the replacement word must appear in the answer
+        expect(result).not.toMatch(/<\/output>.*ESCAPE_SEQUENCE/); // Should have replaced it; '.' does not cross newlines, so this only rejects '</output>' preceding the word on the same line
+      },
+    });
+  });
+
+  describe('Subprocess XML tagging behavior', () => {
+    evalTest('ALWAYS_PASSES', {
+      name: 'should detect successful command execution with exit code 0',
+      prompt:
+        "Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.",
+      assert: async (rig, result) => {
+        await rig.waitForToolCall('run_shell_command');
+        expect(result.toLowerCase()).toContain('yes'); // case-insensitive check of the Yes/No verdict
+        // Beyond the verdict, the last API request text must carry the zero exit-code tag.
+        const lastRequest = rig.readLastApiRequest();
+        expect(lastRequest?.attributes?.request_text).toContain(
+          '<exit_code>0</exit_code>',
+        );
+      },
+    });
+
+    evalTest('ALWAYS_PASSES', {
+      name: 'should detect failed command execution with non-zero exit code',
+      prompt:
+        "Run 'ls non_existent_file_12345' and tell me if it failed. Only say 'Yes' or 'No'.",
+      assert: async (rig, result) => {
+        await rig.waitForToolCall('run_shell_command');
+        expect(result.toLowerCase()).toContain('yes'); // 'yes' here means "yes, it failed"
+        // The last API request text must carry an exit-code tag holding any non-zero value (no leading zero).
+        const lastRequest = rig.readLastApiRequest();
+        expect(lastRequest?.attributes?.request_text).toMatch(
+          /<exit_code>[1-9]\d*<\/exit_code>/,
+        );
+      },
+    });
+
+    evalTest('ALWAYS_PASSES', {
+      name: 'should correctly parse content from <output> tag',
+      prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.",
+      assert: async (rig, result) => {
+        await rig.waitForToolCall('run_shell_command');
+        expect(result).toContain('UNIQUE_STRING_99'); // the echoed marker must appear verbatim (case-sensitive) in the answer
+      },
+    });
+
+    evalTest('ALWAYS_PASSES', {
+      name: 'should correctly parse error messages from <error> tag',
+      prompt: // We force a process-level error by trying to execute a directory
+        "Try to execute the current directory './' as a command and tell me what the error message was.",
+      assert: async (rig, result) => {
+        await rig.waitForToolCall('run_shell_command');
+        expect(result.toLowerCase()).toMatch(
+          /permission denied|is a directory/, // the error message usually contains one of these
+        );
+        // NOTE(review): the name mentions an <error> tag, but only <output> and the exit code are asserted below — confirm intent.
+        const lastRequest = rig.readLastApiRequest();
+        expect(lastRequest?.attributes?.request_text).toContain('<output>');
+        expect(lastRequest?.attributes?.request_text).toContain(
+          '<exit_code>126</exit_code>', // presumably the shell's "found but not executable" status — TODO confirm across platforms
+        );
+      },
+    });
+  });
+});