From d8f740709f40bb63216910246fb09125dbb76eac Mon Sep 17 00:00:00 2001 From: Aishanee Shah Date: Wed, 18 Feb 2026 22:20:01 +0000 Subject: [PATCH] Merge XML handling evals into a single file --- evals/shell-xml-output.eval.ts | 87 ------------------- evals/subprocess-xml.eval.ts | 67 --------------- evals/xml-handling.eval.ts | 148 +++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 154 deletions(-) delete mode 100644 evals/shell-xml-output.eval.ts delete mode 100644 evals/subprocess-xml.eval.ts create mode 100644 evals/xml-handling.eval.ts diff --git a/evals/shell-xml-output.eval.ts b/evals/shell-xml-output.eval.ts deleted file mode 100644 index 829ccd558a..0000000000 --- a/evals/shell-xml-output.eval.ts +++ /dev/null @@ -1,87 +0,0 @@ -/** - * @license - * Copyright 2026 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; - -describe('Shell tool XML/HTML output behavior', () => { - evalTest('ALWAYS_PASSES', { - name: 'should correctly extract data from complex HTML output containing problematic sequences', - prompt: `I have a diagnostic HTML page. Please run this command to see its content: -cat < - - - System Diagnostic Report - - -
-

Status: All Systems Go

-
-
-
-

Telemetry data includes markers like and ]]> to test parser robustness.

-
- CPU: 12% - MEM: 450MB -
-
-
- - - -EOF - -After running the command, provide the answer as a JSON object with the following keys: -- "title": The title of the page. -- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'. -- "cpuMetric": The CPU metric value. -- "markers": An array of markers mentioned in the telemetry paragraph.`, - assert: async (rig, result) => { - await rig.waitForToolCall('run_shell_command'); - const jsonMatch = result.match(/\{[\s\S]*\}/); - if (!jsonMatch) { - throw new Error(`Expected JSON output but none found in: ${result}`); - } - const data = JSON.parse(jsonMatch[0]); - expect(data.title).toMatch(/system diagnostic report/i); - expect(data.dataAuth).toBe('SECRET_123'); - expect(data.cpuMetric).toContain('12%'); - const trimmedMarkers = data.markers.map((m: string) => m.trim()); - expect(trimmedMarkers).toContain(''); - expect(trimmedMarkers).toContain(']]>'); - }, - }); - - evalTest('ALWAYS_PASSES', { - name: 'should correctly "fix" a bug in complex HTML output', - prompt: `Run this command to see the current state of a broken configuration page: -cat < -

Network Settings

-
- - -
-
-

Error: The closing tag was found in the data stream which is invalid.

-
-
- -
- -EOF - -The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that
with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`, - assert: async (rig, result) => { - await rig.waitForToolCall('run_shell_command'); - expect(result).toContain('ESCAPE_SEQUENCE'); - expect(result).not.toMatch(/<\/output>.*ESCAPE_SEQUENCE/); // Should have replaced it - }, - }); -}); diff --git a/evals/subprocess-xml.eval.ts b/evals/subprocess-xml.eval.ts deleted file mode 100644 index 44dfbf3bd6..0000000000 --- a/evals/subprocess-xml.eval.ts +++ /dev/null @@ -1,67 +0,0 @@ -/** - * @license - * Copyright 2026 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; - -describe('Subprocess XML tagging behavior', () => { - evalTest('ALWAYS_PASSES', { - name: 'should detect successful command execution with exit code 0', - prompt: - "Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.", - assert: async (rig, result) => { - await rig.waitForToolCall('run_shell_command'); - expect(result.toLowerCase()).toContain('yes'); - - const lastRequest = rig.readLastApiRequest(); - expect(lastRequest?.attributes?.request_text).toContain( - '0', - ); - }, - }); - - evalTest('ALWAYS_PASSES', { - name: 'should detect failed command execution with non-zero exit code', - prompt: - "Run 'ls non_existent_file_12345' and tell me if it failed. 
Only say 'Yes' or 'No'.", - assert: async (rig, result) => { - await rig.waitForToolCall('run_shell_command'); - expect(result.toLowerCase()).toContain('yes'); - - const lastRequest = rig.readLastApiRequest(); - expect(lastRequest?.attributes?.request_text).toMatch( - /[1-9]\d*<\/exit_code>/, - ); - }, - }); - - evalTest('ALWAYS_PASSES', { - name: 'should correctly parse content from tag', - prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.", - assert: async (rig, result) => { - await rig.waitForToolCall('run_shell_command'); - expect(result).toContain('UNIQUE_STRING_99'); - }, - }); - - evalTest('ALWAYS_PASSES', { - name: 'should correctly parse error messages from tag', - // We force a process-level error by trying to execute a directory - prompt: - "Try to execute the current directory './' as a command and tell me what the error message was.", - assert: async (rig, result) => { - await rig.waitForToolCall('run_shell_command'); - // The error message usually contains "Permission denied" or "is a directory" - expect(result.toLowerCase()).toMatch(/permission denied|is a directory/); - - const lastRequest = rig.readLastApiRequest(); - expect(lastRequest?.attributes?.request_text).toContain(''); - expect(lastRequest?.attributes?.request_text).toContain( - '126', - ); - }, - }); -}); diff --git a/evals/xml-handling.eval.ts b/evals/xml-handling.eval.ts new file mode 100644 index 0000000000..d49979a888 --- /dev/null +++ b/evals/xml-handling.eval.ts @@ -0,0 +1,148 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect } from 'vitest'; +import { evalTest } from './test-helper.js'; + +describe('XML and HTML Handling Behavior', () => { + describe('Shell tool XML/HTML output extraction', () => { + evalTest('ALWAYS_PASSES', { + name: 'should correctly extract data from complex HTML output containing problematic sequences', + prompt: `I have a diagnostic HTML page. 
Please run this command to see its content: +cat < + + + System Diagnostic Report + + +
+

Status: All Systems Go

+
+
+
+

Telemetry data includes markers like and ]]> to test parser robustness.

+
+ CPU: 12% + MEM: 450MB +
+
+
+ + + +EOF + +After running the command, provide the answer as a JSON object with the following keys: +- "title": The title of the page. +- "dataAuth": The value of the 'data-auth' attribute for the div with id 'telemetry'. +- "cpuMetric": The CPU metric value. +- "markers": An array of markers mentioned in the telemetry paragraph.`, + assert: async (rig, result) => { + await rig.waitForToolCall('run_shell_command'); + const jsonMatch = result.match(/\{[\s\S]*\}/); + if (!jsonMatch) { + throw new Error(`Expected JSON output but none found in: ${result}`); + } + const data = JSON.parse(jsonMatch[0]); + expect(data.title).toMatch(/system diagnostic report/i); + expect(data.dataAuth).toBe('SECRET_123'); + expect(data.cpuMetric).toContain('12%'); + const trimmedMarkers = data.markers.map((m: string) => m.trim()); + expect(trimmedMarkers).toContain('
'); + expect(trimmedMarkers).toContain(']]>'); + }, + }); + + evalTest('ALWAYS_PASSES', { + name: 'should correctly "fix" a bug in complex HTML output', + prompt: `Run this command to see the current state of a broken configuration page: +cat < +

Network Settings

+
+ + +
+
+

Error: The closing tag was found in the data stream which is invalid.

+
+
+ +
+
+EOF + +The error message mentions a specific tag that shouldn't be there. Please provide a corrected version of that
with the class 'row error' where you replace the problematic tag name with the word 'ESCAPE_SEQUENCE'.`, + assert: async (rig, result) => { + await rig.waitForToolCall('run_shell_command'); + expect(result).toContain('ESCAPE_SEQUENCE'); + expect(result).not.toMatch(/<\/output>.*ESCAPE_SEQUENCE/); // Should have replaced it + }, + }); + }); + + describe('Subprocess XML tagging behavior', () => { + evalTest('ALWAYS_PASSES', { + name: 'should detect successful command execution with exit code 0', + prompt: + "Run 'echo Hello' and tell me if it succeeded. Only say 'Yes' or 'No'.", + assert: async (rig, result) => { + await rig.waitForToolCall('run_shell_command'); + expect(result.toLowerCase()).toContain('yes'); + + const lastRequest = rig.readLastApiRequest(); + expect(lastRequest?.attributes?.request_text).toContain( + '0', + ); + }, + }); + + evalTest('ALWAYS_PASSES', { + name: 'should detect failed command execution with non-zero exit code', + prompt: + "Run 'ls non_existent_file_12345' and tell me if it failed. 
Only say 'Yes' or 'No'.", + assert: async (rig, result) => { + await rig.waitForToolCall('run_shell_command'); + expect(result.toLowerCase()).toContain('yes'); + + const lastRequest = rig.readLastApiRequest(); + expect(lastRequest?.attributes?.request_text).toMatch( + /[1-9]\d*<\/exit_code>/, + ); + }, + }); + + evalTest('ALWAYS_PASSES', { + name: 'should correctly parse content from tag', + prompt: "Run 'echo UNIQUE_STRING_99' and tell me what the output was.", + assert: async (rig, result) => { + await rig.waitForToolCall('run_shell_command'); + expect(result).toContain('UNIQUE_STRING_99'); + }, + }); + + evalTest('ALWAYS_PASSES', { + name: 'should correctly parse error messages from tag', + prompt: + "Try to execute the current directory './' as a command and tell me what the error message was.", + assert: async (rig, result) => { + await rig.waitForToolCall('run_shell_command'); + expect(result.toLowerCase()).toMatch( + /permission denied|is a directory/, + ); + + const lastRequest = rig.readLastApiRequest(); + expect(lastRequest?.attributes?.request_text).toContain(''); + expect(lastRequest?.attributes?.request_text).toContain( + '126', + ); + }, + }); + }); +});