mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-22 19:14:33 -07:00
Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)
This commit is contained in:
committed by
GitHub
parent
bc3ed61adb
commit
f1bb2af6de
@@ -45,6 +45,8 @@ describe('subagent eval test cases', () => {
|
||||
* This tests the system prompt's subagent specific clauses.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should delegate to user provided agent with relevant expertise',
|
||||
params: {
|
||||
settings: {
|
||||
@@ -69,6 +71,8 @@ describe('subagent eval test cases', () => {
|
||||
* subagents are available. This helps catch orchestration overuse.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should avoid delegating trivial direct edit work',
|
||||
params: {
|
||||
settings: {
|
||||
@@ -113,6 +117,8 @@ describe('subagent eval test cases', () => {
|
||||
* This is meant to codify the "overusing Generalist" failure mode.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should prefer relevant specialist over generalist',
|
||||
params: {
|
||||
settings: {
|
||||
@@ -149,6 +155,8 @@ describe('subagent eval test cases', () => {
|
||||
* naturally spans docs and tests, so multiple specialists should be used.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use multiple relevant specialists for multi-surface task',
|
||||
params: {
|
||||
settings: {
|
||||
@@ -193,6 +201,8 @@ describe('subagent eval test cases', () => {
|
||||
* from a large pool of available subagents (10 total).
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should select the correct subagent from a pool of 10 different agents',
|
||||
prompt: 'Please add a new SQL table migration for a user profile.',
|
||||
files: {
|
||||
@@ -243,6 +253,8 @@ describe('subagent eval test cases', () => {
|
||||
* This test stress-tests the subagent delegation with ~80 tools.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
|
||||
prompt: 'Please add a new SQL table migration for a user profile.',
|
||||
setup: async (rig) => {
|
||||
|
||||
Reference in New Issue
Block a user