Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)

2026-07-22 15:51:18 -07:00 · 2026-04-08 23:57:26 +00:00
parent bc3ed61adb
commit f1bb2af6de
32 changed files with 475 additions and 133 deletions
@@ -8,6 +8,8 @@ describe('interactive_commands', () => {
   * intervention.
   */
  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'should not use interactive commands',
    prompt: 'Execute tests.',
    files: {
@@ -49,6 +51,8 @@ describe('interactive_commands', () => {
   * Validates that the agent uses non-interactive flags when scaffolding a new project.
   */
  evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
    name: 'should use non-interactive flags when scaffolding a new app',
    prompt: 'Create a new react application named my-app using vite.',
    assert: async (rig, result) => {