diff --git a/.github/workflows/chained_e2e.yml b/.github/workflows/chained_e2e.yml
index fe87fb1d5d..94215e4795 100644
--- a/.github/workflows/chained_e2e.yml
+++ b/.github/workflows/chained_e2e.yml
@@ -335,6 +335,8 @@ jobs:
         env:
           GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
           GEMINI_MODEL: 'gemini-3-pro-preview'
+          # Only run the always-passing (ALWAYS_PASSES) behavioral tests.
+          EVAL_SUITE_TYPE: 'behavioral'
           # Disable Vitest internal retries to avoid double-retrying;
           # custom retry logic is handled in evals/test-helper.ts
           VITEST_RETRY: 0
diff --git a/.github/workflows/evals-nightly.yml b/.github/workflows/evals-nightly.yml
index 9acc1de050..50071a7acd 100644
--- a/.github/workflows/evals-nightly.yml
+++ b/.github/workflows/evals-nightly.yml
@@ -5,10 +5,18 @@ on:
     - cron: '0 1 * * *' # Runs at 1 AM every day
   workflow_dispatch:
     inputs:
-      run_all:
-        description: 'Run all evaluations (including usually passing)'
-        type: 'boolean'
-        default: true
+      suite_type:
+        description: 'Suite type to run'
+        type: 'choice'
+        options:
+          - 'behavioral'
+          - 'component-level'
+          - 'hero-scenario'
+        default: 'behavioral'
+      suite_name:
+        description: 'Specific suite name to run'
+        required: false
+        type: 'string'
       test_name_pattern:
         description: 'Test name pattern or file name'
         required: false
@@ -59,7 +67,11 @@ jobs:
         env:
           GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
           GEMINI_MODEL: '${{ matrix.model }}'
-          RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}"
+          RUN_EVALS: 'true'
+          # Falls back to 'behavioral' when no suite_type input is provided.
+          # Double-quoted because the expression contains single-quoted literals.
+          EVAL_SUITE_TYPE: "${{ github.event.inputs.suite_type || 'behavioral' }}"
+          EVAL_SUITE_NAME: '${{ github.event.inputs.suite_name }}'
           TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
           # Disable Vitest internal retries to avoid double-retrying;
           # custom retry logic is handled in evals/test-helper.ts
diff --git a/evals/answer-vs-act.eval.ts b/evals/answer-vs-act.eval.ts
index ff87d12564..1d19294363 100644
--- a/evals/answer-vs-act.eval.ts
+++ b/evals/answer-vs-act.eval.ts
@@ -19,6 +19,8 @@ describe('Answer vs. ask eval', () => {
    * automatically modify the file, but instead asks for permission.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should not edit files when asked to inspect for bugs',
     prompt: 'Inspect app.ts for bugs',
     files: FILES,
@@ -42,6 +44,8 @@ describe('Answer vs. ask eval', () => {
    * does modify the file.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should edit files when asked to fix bug',
     prompt: 'Fix the bug in app.ts - it should add numbers not subtract',
     files: FILES,
@@ -66,6 +70,8 @@ describe('Answer vs. ask eval', () => {
    * automatically modify the file, but instead asks for permission.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should not edit when asking "any bugs"',
     prompt: 'Any bugs in app.ts?',
     files: FILES,
@@ -89,6 +95,8 @@ describe('Answer vs. ask eval', () => {
    * automatically modify the file.
    */
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should not edit files when asked a general question',
     prompt: 'How does app.ts work?',
     files: FILES,
@@ -112,6 +120,8 @@ describe('Answer vs. ask eval', () => {
    * automatically modify the file.
    */
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should not edit files when asked about style',
     prompt: 'Is app.ts following good style?',
     files: FILES,
@@ -135,6 +145,8 @@ describe('Answer vs. ask eval', () => {
    * the agent does NOT automatically modify the file.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should not edit files when user notes an issue',
     prompt: 'The add function subtracts numbers.',
     files: FILES,
diff --git a/evals/app-test-helper.ts b/evals/app-test-helper.ts
index ce54adaaf4..e9d5d695b7 100644
--- a/evals/app-test-helper.ts
+++ b/evals/app-test-helper.ts
@@ -98,5 +98,5 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
     });
   };
 
-  runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
+  runEval(policy, evalCase, fn, (evalCase.timeout ?? 60000) + 10000);
 }
diff --git a/evals/ask_user.eval.ts b/evals/ask_user.eval.ts
index 6495cb3f22..b31ef54ae5 100644
--- a/evals/ask_user.eval.ts
+++ b/evals/ask_user.eval.ts
@@ -5,8 +5,8 @@
  */
 
 import { describe, expect } from 'vitest';
-import { appEvalTest, AppEvalCase } from './app-test-helper.js';
-import { EvalPolicy } from './test-helper.js';
+import { appEvalTest, type AppEvalCase } from './app-test-helper.js';
+import { type EvalPolicy } from './test-helper.js';
 
 function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
   return appEvalTest(policy, {
@@ -28,6 +28,8 @@ function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
 
 describe('ask_user', () => {
   askUserEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'Agent uses AskUser tool to present multiple choice options',
     prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`,
     setup: async (rig) => {
@@ -43,6 +45,8 @@ describe('ask_user', () => {
   });
 
   askUserEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'Agent uses AskUser tool to clarify ambiguous requirements',
     files: {
       'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }),
@@ -61,6 +65,8 @@ describe('ask_user', () => {
   });
 
   askUserEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'Agent uses AskUser tool before performing significant ambiguous rework',
     files: {
       'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";',
@@ -101,6 +107,8 @@ describe('ask_user', () => {
   // updates to clarify that shell command confirmation is handled by the UI.
   // See fix: https://github.com/google-gemini/gemini-cli/pull/20504
   askUserEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'Agent does NOT use AskUser to confirm shell commands',
     files: {
       'package.json': JSON.stringify({
diff --git a/evals/automated-tool-use.eval.ts b/evals/automated-tool-use.eval.ts
index 87f88a1ff3..27e43708dc 100644
--- a/evals/automated-tool-use.eval.ts
+++ b/evals/automated-tool-use.eval.ts
@@ -14,6 +14,8 @@ describe('Automated tool use', () => {
    * a repro by guiding the agent into using the existing deficient script.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should use automated tools (eslint --fix) to fix code style issues',
     files: {
       'package.json': JSON.stringify(
@@ -102,6 +104,8 @@ describe('Automated tool use', () => {
    * instead of trying to edit the files itself.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should use automated tools (prettier --write) to fix formatting issues',
     files: {
       'package.json': JSON.stringify(
diff --git a/evals/cli_help_delegation.eval.ts b/evals/cli_help_delegation.eval.ts
index 8be3bf1c51..e1714c0636 100644
--- a/evals/cli_help_delegation.eval.ts
+++ b/evals/cli_help_delegation.eval.ts
@@ -3,6 +3,8 @@ import { evalTest } from './test-helper.js';
 
 describe('CliHelpAgent Delegation', () => {
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should delegate to cli_help agent for subagent creation questions',
     params: {
       settings: {
diff --git a/evals/component-test-helper.ts b/evals/component-test-helper.ts
index 23f3cb8b6a..9be68e6936 100644
--- a/evals/component-test-helper.ts
+++ b/evals/component-test-helper.ts
@@ -132,5 +132,5 @@ export function componentEvalTest(
     });
   };
 
-  runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
+  runEval(policy, evalCase, fn, (evalCase.timeout ?? 60000) + 10000);
 }
diff --git a/evals/concurrency-safety.eval.ts b/evals/concurrency-safety.eval.ts
index f2f9e24be9..3aae68b5c4 100644
--- a/evals/concurrency-safety.eval.ts
+++ b/evals/concurrency-safety.eval.ts
@@ -20,6 +20,8 @@ You are the mutation agent. Do the mutation requested.
 
 describe('concurrency safety eval test cases', () => {
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'mutation agents are run in parallel when explicitly requested',
     params: {
       settings: {
diff --git a/evals/edit-locations-eval.eval.ts b/evals/edit-locations-eval.eval.ts
index 60e34e6df7..4acc4f2cf9 100644
--- a/evals/edit-locations-eval.eval.ts
+++ b/evals/edit-locations-eval.eval.ts
@@ -13,6 +13,8 @@ describe('Edits location eval', () => {
    * instead of creating a new one.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should update existing test file instead of creating a new one',
     files: {
       'package.json': JSON.stringify(
diff --git a/evals/frugalReads.eval.ts b/evals/frugalReads.eval.ts
index 47578039a6..4dd5f912b8 100644
--- a/evals/frugalReads.eval.ts
+++ b/evals/frugalReads.eval.ts
@@ -15,6 +15,8 @@ describe('Frugal reads eval', () => {
    * nearby ranges into a single contiguous read to save tool calls.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should use ranged read when nearby lines are targeted',
     files: {
       'package.json': JSON.stringify({
@@ -135,6 +137,8 @@ describe('Frugal reads eval', () => {
    * apart to avoid the need to read the whole file.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should use ranged read when targets are far apart',
     files: {
       'package.json': JSON.stringify({
@@ -204,6 +208,8 @@ describe('Frugal reads eval', () => {
    * (e.g.: 10), as it's more efficient than many small ranged reads.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should read the entire file when there are many matches',
     files: {
       'package.json': JSON.stringify({
diff --git a/evals/frugalSearch.eval.ts b/evals/frugalSearch.eval.ts
index 1c49fc2ed4..82438585e6 100644
--- a/evals/frugalSearch.eval.ts
+++ b/evals/frugalSearch.eval.ts
@@ -33,6 +33,8 @@ describe('Frugal Search', () => {
    * ranged reads.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should use grep or ranged read for large files',
     prompt: 'What year was legacy_processor.ts written?',
     files: {
diff --git a/evals/generalist_agent.eval.ts b/evals/generalist_agent.eval.ts
index 8161e33156..b8313079e9 100644
--- a/evals/generalist_agent.eval.ts
+++ b/evals/generalist_agent.eval.ts
@@ -11,6 +11,8 @@ import fs from 'node:fs/promises';
 
 describe('generalist_agent', () => {
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should be able to use generalist agent by explicitly asking the main agent to invoke it',
     params: {
       settings: {
diff --git a/evals/generalist_delegation.eval.ts b/evals/generalist_delegation.eval.ts
index 81252880eb..d731747826 100644
--- a/evals/generalist_delegation.eval.ts
+++ b/evals/generalist_delegation.eval.ts
@@ -11,6 +11,8 @@ describe('generalist_delegation', () => {
   // --- Positive Evals (Should Delegate) ---
 
   appEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should delegate batch error fixing to generalist agent',
     configOverrides: {
       agents: {
@@ -54,6 +56,8 @@ describe('generalist_delegation', () => {
   });
 
   appEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should autonomously delegate complex batch task to generalist agent',
     configOverrides: {
       agents: {
@@ -94,6 +98,8 @@ describe('generalist_delegation', () => {
   // --- Negative Evals (Should NOT Delegate - Assertive Handling) ---
 
   appEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should NOT delegate simple read and fix to generalist agent',
     configOverrides: {
       agents: {
@@ -128,6 +134,8 @@ describe('generalist_delegation', () => {
   });
 
   appEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should NOT delegate simple direct question to generalist agent',
     configOverrides: {
       agents: {
diff --git a/evals/gitRepo.eval.ts b/evals/gitRepo.eval.ts
index 6415b9c20d..b5dbd8a760 100644
--- a/evals/gitRepo.eval.ts
+++ b/evals/gitRepo.eval.ts
@@ -26,6 +26,8 @@ describe('git repo eval', () => {
    * be more consistent.
    */
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should not git add commit changes unprompted',
     prompt:
       'Finish this up for me by just making a targeted fix for the bug in index.ts. Do not build, install anything, or add tests',
@@ -55,6 +57,8 @@ describe('git repo eval', () => {
    * instructed to not do so by default.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should git commit changes when prompted',
     prompt:
       'Make a targeted fix for the bug in index.ts without building, installing anything, or adding tests. Then, commit your changes.',
diff --git a/evals/grep_search_functionality.eval.ts b/evals/grep_search_functionality.eval.ts
index f1224b8221..5c1da827e1 100644
--- a/evals/grep_search_functionality.eval.ts
+++ b/evals/grep_search_functionality.eval.ts
@@ -15,6 +15,8 @@ describe('grep_search_functionality', () => {
   const TEST_PREFIX = 'Grep Search Functionality: ';
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should find a simple string in a file',
     files: {
       'test.txt': `hello
@@ -33,6 +35,8 @@ describe('grep_search_functionality', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should perform a case-sensitive search',
     files: {
       'test.txt': `Hello
@@ -63,6 +67,8 @@ describe('grep_search_functionality', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should return only file names when names_only is used',
     files: {
       'file1.txt': 'match me',
@@ -93,6 +99,8 @@ describe('grep_search_functionality', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should search only within the specified include_pattern glob',
     files: {
       'file.js': 'my_function();',
@@ -123,6 +131,8 @@ describe('grep_search_functionality', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should search within a specific subdirectory',
     files: {
       'src/main.js': 'unique_string_1',
@@ -153,6 +163,8 @@ describe('grep_search_functionality', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should report no matches correctly',
     files: {
       'file.txt': 'nothing to see here',
diff --git a/evals/hierarchical_memory.eval.ts b/evals/hierarchical_memory.eval.ts
index dd4f8fbbd1..b7b58c79a1 100644
--- a/evals/hierarchical_memory.eval.ts
+++ b/evals/hierarchical_memory.eval.ts
@@ -12,6 +12,8 @@ describe('Hierarchical Memory', () => {
   const conflictResolutionTest =
     'Agent follows hierarchy for contradictory instructions';
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: conflictResolutionTest,
     params: {
       settings: {
@@ -48,6 +50,8 @@ What is my favorite fruit? Tell me just the name of the fruit.`,
 
   const provenanceAwarenessTest = 'Agent is aware of memory provenance';
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: provenanceAwarenessTest,
     params: {
       settings: {
@@ -87,6 +91,8 @@ Provide the answer as an XML block like this:
 
   const extensionVsGlobalTest = 'Extension memory wins over Global memory';
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: extensionVsGlobalTest,
     params: {
       settings: {
diff --git a/evals/interactive-hang.eval.ts b/evals/interactive-hang.eval.ts
index 0cf56acf98..72a5067fcc 100644
--- a/evals/interactive-hang.eval.ts
+++ b/evals/interactive-hang.eval.ts
@@ -8,6 +8,8 @@ describe('interactive_commands', () => {
    * intervention.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should not use interactive commands',
     prompt: 'Execute tests.',
     files: {
@@ -49,6 +51,8 @@ describe('interactive_commands', () => {
    * Validates that the agent uses non-interactive flags when scaffolding a new project.
    */
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should use non-interactive flags when scaffolding a new app',
     prompt: 'Create a new react application named my-app using vite.',
     assert: async (rig, result) => {
diff --git a/evals/model_steering.eval.ts b/evals/model_steering.eval.ts
index 2cb87edcc2..4033b3a88f 100644
--- a/evals/model_steering.eval.ts
+++ b/evals/model_steering.eval.ts
@@ -5,14 +5,14 @@
  */
 
 import { describe, expect } from 'vitest';
-import { act } from 'react';
 import path from 'node:path';
 import fs from 'node:fs';
 import { appEvalTest } from './app-test-helper.js';
-import { PolicyDecision } from '@google/gemini-cli-core';
 
 describe('Model Steering Behavioral Evals', () => {
   appEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'Corrective Hint: Model switches task based on hint during tool turn',
     configOverrides: {
       modelSteering: true,
@@ -52,6 +52,8 @@ describe('Model Steering Behavioral Evals', () => {
   });
 
   appEvalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
     configOverrides: {
       modelSteering: true,
diff --git a/evals/plan_mode.eval.ts b/evals/plan_mode.eval.ts
index 8b01f68155..5f041a4a3f 100644
--- a/evals/plan_mode.eval.ts
+++ b/evals/plan_mode.eval.ts
@@ -31,6 +31,8 @@ describe('plan_mode', () => {
       .filter(Boolean);
 
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should refuse file modification when in plan mode',
     approvalMode: ApprovalMode.PLAN,
     params: {
@@ -66,6 +68,8 @@ describe('plan_mode', () => {
   });
 
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should refuse saving new documentation to the repo when in plan mode',
     approvalMode: ApprovalMode.PLAN,
     params: {
@@ -103,6 +107,8 @@ describe('plan_mode', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should enter plan mode when asked to create a plan',
     approvalMode: ApprovalMode.DEFAULT,
     params: {
@@ -120,6 +126,8 @@ describe('plan_mode', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should exit plan mode when plan is complete and implementation is requested',
     approvalMode: ApprovalMode.PLAN,
     params: {
@@ -167,6 +175,8 @@ describe('plan_mode', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should allow file modification in plans directory when in plan mode',
     approvalMode: ApprovalMode.PLAN,
     params: {
@@ -198,6 +208,8 @@ describe('plan_mode', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should create a plan in plan mode and implement it for a refactoring task',
     params: {
       settings,
diff --git a/evals/redundant_casts.eval.ts b/evals/redundant_casts.eval.ts
index 83750e44d4..fc991b5ba7 100644
--- a/evals/redundant_casts.eval.ts
+++ b/evals/redundant_casts.eval.ts
@@ -11,6 +11,8 @@ import fs from 'node:fs/promises';
 
 describe('redundant_casts', () => {
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should not add redundant or unsafe casts when modifying typescript code',
     files: {
       'src/cast_example.ts': `
diff --git a/evals/sandbox_recovery.eval.ts b/evals/sandbox_recovery.eval.ts
index ad6b630236..073379e94f 100755
--- a/evals/sandbox_recovery.eval.ts
+++ b/evals/sandbox_recovery.eval.ts
@@ -3,6 +3,8 @@ import { evalTest } from './test-helper.js';
 
 describe('Sandbox recovery', () => {
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'attempts to use additional_permissions when operation not permitted',
     prompt:
       'Run ./script.sh. It will fail with "Operation not permitted". When it does, you must retry running it by passing the appropriate additional_permissions.',
diff --git a/evals/save_memory.eval.ts b/evals/save_memory.eval.ts
index 25e081a819..bbb13d1c44 100644
--- a/evals/save_memory.eval.ts
+++ b/evals/save_memory.eval.ts
@@ -15,6 +15,8 @@ describe('save_memory', () => {
   const TEST_PREFIX = 'Save memory test: ';
   const rememberingFavoriteColor = "Agent remembers user's favorite color";
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: rememberingFavoriteColor,
 
     prompt: `remember that my favorite color is  blue.
@@ -35,6 +37,8 @@ describe('save_memory', () => {
   });
   const rememberingCommandRestrictions = 'Agent remembers command restrictions';
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: rememberingCommandRestrictions,
 
     prompt: `I don't want you to ever run npm commands.`,
@@ -54,6 +58,8 @@ describe('save_memory', () => {
 
   const rememberingWorkflow = 'Agent remembers workflow preferences';
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: rememberingWorkflow,
 
     prompt: `I want you to always lint after building.`,
@@ -74,6 +80,8 @@ describe('save_memory', () => {
   const ignoringTemporaryInformation =
     'Agent ignores temporary conversation details';
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: ignoringTemporaryInformation,
 
     prompt: `I'm going to get a coffee.`,
@@ -97,6 +105,8 @@ describe('save_memory', () => {
 
   const rememberingPetName = "Agent remembers user's pet's name";
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: rememberingPetName,
 
     prompt: `Please remember that my dog's name is Buddy.`,
@@ -116,6 +126,8 @@ describe('save_memory', () => {
 
   const rememberingCommandAlias = 'Agent remembers custom command aliases';
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: rememberingCommandAlias,
 
     prompt: `When I say 'start server', you should run 'npm run dev'.`,
@@ -136,6 +148,8 @@ describe('save_memory', () => {
   const ignoringDbSchemaLocation =
     "Agent ignores workspace's database schema location";
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: ignoringDbSchemaLocation,
     prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
     assert: async (rig, result) => {
@@ -155,6 +169,8 @@ describe('save_memory', () => {
   const rememberingCodingStyle =
     "Agent remembers user's coding style preference";
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: rememberingCodingStyle,
 
     prompt: `I prefer to use tabs instead of spaces for indentation.`,
@@ -175,6 +191,8 @@ describe('save_memory', () => {
   const ignoringBuildArtifactLocation =
     'Agent ignores workspace build artifact location';
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: ignoringBuildArtifactLocation,
     prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
     assert: async (rig, result) => {
@@ -193,6 +211,8 @@ describe('save_memory', () => {
 
   const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: ignoringMainEntryPoint,
     prompt: `The main entry point for this workspace is \`src/index.js\`.`,
     assert: async (rig, result) => {
@@ -211,6 +231,8 @@ describe('save_memory', () => {
 
   const rememberingBirthday = "Agent remembers user's birthday";
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: rememberingBirthday,
 
     prompt: `My birthday is on June 15th.`,
@@ -231,6 +253,8 @@ describe('save_memory', () => {
   const proactiveMemoryFromLongSession =
     'Agent saves preference from earlier in conversation history';
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: proactiveMemoryFromLongSession,
     params: {
       settings: {
@@ -309,6 +333,8 @@ describe('save_memory', () => {
   const memoryManagerRoutingPreferences =
     'Agent routes global and project preferences to memory';
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: memoryManagerRoutingPreferences,
     params: {
       settings: {
diff --git a/evals/shell-efficiency.eval.ts b/evals/shell-efficiency.eval.ts
index dc555d5298..936af245fd 100644
--- a/evals/shell-efficiency.eval.ts
+++ b/evals/shell-efficiency.eval.ts
@@ -21,6 +21,8 @@ describe('Shell Efficiency', () => {
   };
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should use --silent/--quiet flags when installing packages',
     prompt: 'Install the "lodash" package using npm.',
     assert: async (rig) => {
@@ -50,6 +52,8 @@ describe('Shell Efficiency', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should use --no-pager with git commands',
     prompt: 'Show the git log.',
     assert: async (rig) => {
@@ -73,6 +77,8 @@ describe('Shell Efficiency', () => {
   });
 
   evalTest('ALWAYS_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should NOT use efficiency flags when enableShellOutputEfficiency is disabled',
     params: {
       settings: {
diff --git a/evals/subagents.eval.ts b/evals/subagents.eval.ts
index 7053290fba..853d08f211 100644
--- a/evals/subagents.eval.ts
+++ b/evals/subagents.eval.ts
@@ -45,6 +45,8 @@ describe('subagent eval test cases', () => {
    * This tests the system prompt's subagent specific clauses.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should delegate to user provided agent with relevant expertise',
     params: {
       settings: {
@@ -69,6 +71,8 @@ describe('subagent eval test cases', () => {
    * subagents are available. This helps catch orchestration overuse.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should avoid delegating trivial direct edit work',
     params: {
       settings: {
@@ -113,6 +117,8 @@ describe('subagent eval test cases', () => {
    * This is meant to codify the "overusing Generalist" failure mode.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should prefer relevant specialist over generalist',
     params: {
       settings: {
@@ -149,6 +155,8 @@ describe('subagent eval test cases', () => {
    * naturally spans docs and tests, so multiple specialists should be used.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should use multiple relevant specialists for multi-surface task',
     params: {
       settings: {
@@ -193,6 +201,8 @@ describe('subagent eval test cases', () => {
    * from a large pool of available subagents (10 total).
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should select the correct subagent from a pool of 10 different agents',
     prompt: 'Please add a new SQL table migration for a user profile.',
     files: {
@@ -243,6 +253,8 @@ describe('subagent eval test cases', () => {
    * This test includes stress tests the subagent delegation with ~80 tools.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
     prompt: 'Please add a new SQL table migration for a user profile.',
     setup: async (rig) => {
diff --git a/evals/test-helper.ts b/evals/test-helper.ts
index 7baab8326f..7369a6919c 100644
--- a/evals/test-helper.ts
+++ b/evals/test-helper.ts
@@ -48,12 +48,7 @@ export const EVAL_MODEL =
 export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
 
 export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
-  runEval(
-    policy,
-    evalCase.name,
-    () => internalEvalTest(evalCase),
-    evalCase.timeout,
-  );
+  runEval(policy, evalCase, () => internalEvalTest(evalCase));
 }
 
 export async function withEvalRetries(
@@ -344,14 +339,30 @@ export async function prepareWorkspace(
  */
 export function runEval(
   policy: EvalPolicy,
-  name: string,
+  evalCase: BaseEvalCase,
   fn: () => Promise<void>,
-  timeout?: number,
+  timeoutOverride?: number,
 ) {
-  if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
-    it.skip(name, fn);
+  const { name, timeout, suiteName, suiteType } = evalCase;
+  const wantedType = process.env['EVAL_SUITE_TYPE'];
+  const wantedName = process.env['EVAL_SUITE_NAME'];
+
+  // A suite filter applies only when the env var and the case value are set.
+  const typeFilteredOut =
+    Boolean(wantedType && suiteType) && suiteType !== wantedType;
+  const nameFilteredOut =
+    Boolean(wantedName && suiteName) && suiteName !== wantedName;
+  // USUALLY_PASSES cases are skipped unless RUN_EVALS is non-empty.
+  const gatedByPolicy =
+    policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS'];
+
+  // An explicit timeoutOverride takes precedence over the case's own timeout.
+  const meta = { suiteType, suiteName };
+  const testOptions = { timeout: timeoutOverride ?? timeout, meta };
+  if (gatedByPolicy || typeFilteredOut || nameFilteredOut) {
+    it.skip(name, testOptions, fn);
   } else {
-    it(name, fn, timeout);
+    it(name, testOptions, fn);
   }
 }
 
@@ -391,6 +402,8 @@ interface ForbiddenToolSettings {
 }
 
 export interface BaseEvalCase {
+  suiteName: string;
+  suiteType: 'behavioral' | 'component-level' | 'hero-scenario';
   name: string;
   timeout?: number;
   files?: Record<string, string>;
diff --git a/evals/tool_output_masking.eval.ts b/evals/tool_output_masking.eval.ts
index dff639e421..ccaa279877 100644
--- a/evals/tool_output_masking.eval.ts
+++ b/evals/tool_output_masking.eval.ts
@@ -31,6 +31,8 @@ describe('Tool Output Masking Behavioral Evals', () => {
    * It should recognize the <tool_output_masked> tag and use a tool to read the file.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should attempt to read the redirected full output file when information is masked',
     params: {
       security: {
@@ -167,6 +169,8 @@ Output too large. Full output available at: ${outputFilePath}
    * Scenario: Information is in the preview.
    */
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should NOT read the full output file when the information is already in the preview',
     params: {
       security: {
diff --git a/evals/tracker.eval.ts b/evals/tracker.eval.ts
index 7afb41dbec..95eb0b7351 100644
--- a/evals/tracker.eval.ts
+++ b/evals/tracker.eval.ts
@@ -25,6 +25,8 @@ const FILES = {
 
 describe('tracker_mode', () => {
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should manage tasks in the tracker when explicitly requested during a bug fix',
     params: {
       settings: { experimental: { taskTracker: true } },
@@ -78,6 +80,8 @@ describe('tracker_mode', () => {
   });
 
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should implicitly create tasks when asked to build a feature plan',
     params: {
       settings: { experimental: { taskTracker: true } },
diff --git a/evals/validation_fidelity.eval.ts b/evals/validation_fidelity.eval.ts
index 8cfb4f6626..2a69b88740 100644
--- a/evals/validation_fidelity.eval.ts
+++ b/evals/validation_fidelity.eval.ts
@@ -9,6 +9,8 @@ import { evalTest } from './test-helper.js';
 
 describe('validation_fidelity', () => {
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should perform exhaustive validation autonomously when guided by system instructions',
     files: {
       'src/types.ts': `
diff --git a/evals/validation_fidelity_pre_existing_errors.eval.ts b/evals/validation_fidelity_pre_existing_errors.eval.ts
index 4990b7bc91..0b100e5668 100644
--- a/evals/validation_fidelity_pre_existing_errors.eval.ts
+++ b/evals/validation_fidelity_pre_existing_errors.eval.ts
@@ -9,6 +9,8 @@ import { evalTest } from './test-helper.js';
 
 describe('validation_fidelity_pre_existing_errors', () => {
   evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
     name: 'should handle pre-existing project errors gracefully during validation',
     files: {
       'src/math.ts': `