Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)

This commit is contained in:
Christian Gunderman
2026-04-08 23:57:26 +00:00
committed by GitHub
parent bc3ed61adb
commit f1bb2af6de
32 changed files with 475 additions and 133 deletions
+12
View File
@@ -45,6 +45,8 @@ describe('subagent eval test cases', () => {
* This tests the system prompt's subagent specific clauses.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should delegate to user provided agent with relevant expertise',
params: {
settings: {
@@ -69,6 +71,8 @@ describe('subagent eval test cases', () => {
* subagents are available. This helps catch orchestration overuse.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should avoid delegating trivial direct edit work',
params: {
settings: {
@@ -113,6 +117,8 @@ describe('subagent eval test cases', () => {
* This is meant to codify the "overusing Generalist" failure mode.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should prefer relevant specialist over generalist',
params: {
settings: {
@@ -149,6 +155,8 @@ describe('subagent eval test cases', () => {
* naturally spans docs and tests, so multiple specialists should be used.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use multiple relevant specialists for multi-surface task',
params: {
settings: {
@@ -193,6 +201,8 @@ describe('subagent eval test cases', () => {
* from a large pool of available subagents (10 total).
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should select the correct subagent from a pool of 10 different agents',
prompt: 'Please add a new SQL table migration for a user profile.',
files: {
@@ -243,6 +253,8 @@ describe('subagent eval test cases', () => {
* This test includes stress tests the subagent delegation with ~80 tools.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
prompt: 'Please add a new SQL table migration for a user profile.',
setup: async (rig) => {