Files
gemini-cli/evals/skill_activation.eval.ts
Aishanee Shah 88985511c6 feat(prompts): redesign SI for modularity, skill activation, and essential workflows
- Redesigned CORE_SI_SKELETON for maximum reasoning fidelity and minimum token usage.
- Extracted Software Engineering and New Application workflows to dynamic skills.
- Added 'Essential Workflows' section to maintain visibility for core user journeys.
- Implemented precedence-based sorting for available skills (Workspace > User > Built-in).
- Added behavioral tests in evals/skill_activation.eval.ts to verify proactive skill activation.
- Fixed pre-existing build error in useGeminiStream.ts related to missing FinishReason values.
2026-02-25 16:23:54 +00:00

111 lines
3.4 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe } from 'vitest';
import { evalTest } from './test-helper.js';
/**
 * Behavioral evals checking that the model calls the `activate_skill` tool
 * with the expected skill name for several kinds of user prompts.
 */
describe('Skill Activation Behavioral Evals', () => {
  /**
   * Builds an argument matcher for `activate_skill` tool calls.
   *
   * The returned predicate receives the serialized tool-call arguments,
   * parses them as JSON and reports whether their `name` field equals
   * `skillName`. Malformed JSON and non-object payloads (for example `null`
   * or a bare string) are treated as a non-match instead of throwing.
   *
   * @param skillName - The skill the tool call is expected to activate.
   * @returns A predicate suitable as the third argument of
   *   `rig.expectToolCallSuccess`.
   */
  const activatesSkill =
    (skillName: string) =>
    (args: string): boolean => {
      try {
        // Keep the parsed value as `unknown` so it has to be narrowed before
        // any property access.
        const parsed: unknown = JSON.parse(args);
        return (
          typeof parsed === 'object' &&
          parsed !== null &&
          'name' in parsed &&
          parsed.name === skillName
        );
      } catch {
        // Unparseable arguments cannot match any skill.
        return false;
      }
    };

  /**
   * Tests that the model proactively activates the software-engineering skill
   * when faced with a typical engineering task like bug fixing.
   */
  evalTest('ALWAYS_PASSES', {
    name: 'should activate software-engineering skill for bug fixes',
    prompt:
      'There is a bug in the greeting logic in src/index.ts. Please fix it.',
    files: {
      'src/index.ts':
        'export const greet = (name: string) => `Hello, ${name}!`;',
      'src/index.test.ts': `
import { greet } from './index';
import { expect, test } from 'vitest';
test('greet', () => { expect(greet('World')).toBe('Hello, World!'); });
`,
    },
    assert: async (rig) => {
      await rig.expectToolCallSuccess(
        ['activate_skill'],
        undefined,
        activatesSkill('software-engineering'),
      );
    },
  });

  /**
   * Tests that the model proactively activates the new-application skill
   * when asked to scaffold a new project.
   */
  evalTest('ALWAYS_PASSES', {
    name: 'should activate new-application skill for prototyping',
    prompt: 'Build me a new Todo app using React and Vanilla CSS.',
    assert: async (rig) => {
      await rig.expectToolCallSuccess(
        ['activate_skill'],
        undefined,
        activatesSkill('new-application'),
      );
    },
  });

  /**
   * Tests that the model proactively activates the docs-writer skill
   * when asked to update documentation files.
   */
  evalTest('ALWAYS_PASSES', {
    name: 'should activate docs-writer skill for documentation tasks',
    prompt:
      'Update the documentation in docs/index.md to include the new features.',
    files: {
      'docs/index.md': `# Documentation
Existing content.`,
    },
    assert: async (rig) => {
      await rig.expectToolCallSuccess(
        ['activate_skill'],
        undefined,
        activatesSkill('docs-writer'),
      );
    },
  });

  /**
   * Tests that the model still activates the software-engineering skill when
   * the request is phrased indirectly (a failing CI run to investigate)
   * rather than as an explicit instruction to fix a named bug.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should activate software-engineering even when the prompt is slightly indirect',
    prompt:
      'The CI is failing on the main branch. Can you investigate and fix whatever is broken?',
    files: {
      'package.json': '{ "scripts": { "test": "vitest run" } }',
      'src/logic.ts':
        'export const compute = () => { throw new Error("Broken"); };',
      'src/logic.test.ts':
        'import { compute } from "./logic"; import { test } from "vitest"; test("compute", () => { compute(); });',
    },
    assert: async (rig) => {
      await rig.expectToolCallSuccess(
        ['activate_skill'],
        undefined,
        activatesSkill('software-engineering'),
      );
    },
  });
});