From 4fd20c9200e56b70788ae4147681b085cbdb24fc Mon Sep 17 00:00:00 2001 From: Aishanee Shah Date: Wed, 18 Feb 2026 21:54:13 +0000 Subject: [PATCH] test: add behavioral test for shell XML/HTML output safety --- evals/shell-xml-output.eval.ts | 82 ++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 evals/shell-xml-output.eval.ts diff --git a/evals/shell-xml-output.eval.ts b/evals/shell-xml-output.eval.ts new file mode 100644 index 0000000000..78d4316c25 --- /dev/null +++ b/evals/shell-xml-output.eval.ts @@ -0,0 +1,82 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { evalTest } from './test-helper.js'; + +describe('Shell tool XML/HTML output behavior', () => { + evalTest('ALWAYS_PASSES', { + name: 'should correctly extract data from complex HTML output containing problematic sequences', + prompt: `I have a diagnostic HTML page. Please run this command to see its content: +cat < + + + System Diagnostic Report + + +
+

Status: All Systems Go

+
+
+
+

Telemetry data includes markers like and ]]> to test parser robustness.

+
+ CPU: 12% + MEM: 450MB +
+
+
+ + + +EOF + +After running the command, tell me: +1. The title of the page. +2. The value of the 'data-auth' attribute for the div with id 'telemetry'. +3. The CPU metric value. +4. What markers were mentioned in the telemetry paragraph.`, + assert: async (rig, result) => { + await rig.waitForToolCall('run_shell_command'); + const lowerResult = result.toLowerCase(); + expect(lowerResult).toContain('system diagnostic report'); + expect(result).toContain('SECRET_123'); + expect(result).toContain('12%'); + expect(result).toContain(''); + expect(result).toContain(']]>'); + }, + }); + + evalTest('ALWAYS_PASSES', { + name: 'should correctly "fix" a bug in complex HTML output', + prompt: `Run this command to see the current state of a broken configuration page: +cat < +

Network Settings

+
+ + +
+
+

Error: The closing tag </output> was found in the data stream which is invalid.

+
+
+ +
+ +EOF + +The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that
with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`, + assert: async (rig, result) => { + await rig.waitForToolCall('run_shell_command'); + expect(result).toContain('ESCAPE_SEQUENCE'); + expect(result).not.toMatch(/<\/output>.*ESCAPE_SEQUENCE/); // Should have replaced it + }, + }); +});