feat(cli, core): Support hybrid evals.

This commit is contained in:
Your Name
2026-03-19 16:44:32 +00:00
parent 98d1bec99f
commit f178f9c020
20 changed files with 980 additions and 95 deletions

View File

@@ -0,0 +1,78 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect } from 'vitest';
import { AppRig } from '../test-utils/AppRig.js';
import {
FakeContentGenerator,
FallbackContentGenerator,
userText,
mockGenerateContentStreamText,
extractUserPrompts,
extractFakeResponses,
type ScriptItem,
} from '@google/gemini-cli-core';
describe('Hybrid Handoff (Mock User to Synthetic Live Model)', () => {
it('successfully transitions from mock responses to live responses', async () => {
// 1. Define the conversational script for the priming phase
const primingScript: ScriptItem[] = [
userText('Start priming'),
mockGenerateContentStreamText('Hello! I am a fake response.'),
userText('Continue priming'),
mockGenerateContentStreamText(
'Pump primed successfully. Ready for handoff.',
),
];
// 2. Setup the primary fake generator that runs through the priming script
const fakeGenerator = new FakeContentGenerator(
extractFakeResponses(primingScript),
);
// 3. Setup a "live" fallback generator (it's synthetic so we don't need API keys)
const mockLiveFallback = new FakeContentGenerator([
mockGenerateContentStreamText('The answer is 4.'),
]);
// We need countTokens so AppRig doesn't hang checking size during truncation
mockLiveFallback.countTokens = async () => ({ totalTokens: 10 });
// 4. Compose them using FallbackContentGenerator
const composedGenerator = new FallbackContentGenerator(
fakeGenerator,
mockLiveFallback,
);
// 5. Mount the AppRig natively supporting custom content generators
const rig = new AppRig({
contentGenerator: composedGenerator,
configOverrides: {
fakeResponses: [], // ensure it avoids disk IO attempts internally
},
});
await rig.initialize();
await rig.render();
await rig.waitForIdle();
// 6. Drive the Mock User sequence using the extracted prompts from the script
await rig.driveMockUser(extractUserPrompts(primingScript), 10000);
// 7. Send the final prompt that should exhaust the primary generator and trigger the fallback
await rig.sendMessage('What is 2 + 2?');
// 8. Wait for the fallback response to render
await rig.waitForOutput('The answer is 4.', 10000);
const output = rig.getStaticOutput();
expect(output).toContain('The answer is 4.');
// Wait for everything to settle so React act() warnings don't fire during unmount
await rig.drainBreakpointsUntilIdle(undefined, 10000);
await rig.unmount();
});
});

View File

@@ -8,7 +8,7 @@ import { describe, it, afterEach } from 'vitest';
import { AppRig } from '../test-utils/AppRig.js';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { PolicyDecision } from '@google/gemini-cli-core';
import { FakeContentGenerator, PolicyDecision } from '@google/gemini-cli-core';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -24,8 +24,10 @@ describe('Model Steering Integration', () => {
__dirname,
'../test-utils/fixtures/steering.responses',
);
const contentGenerator =
await FakeContentGenerator.fromFile(fakeResponsesPath);
rig = new AppRig({
fakeResponsesPath,
contentGenerator,
configOverrides: { modelSteering: true },
});
await rig.initialize();

View File

@@ -8,7 +8,7 @@ import { describe, it, afterEach, expect } from 'vitest';
import { AppRig } from './AppRig.js';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { debugLogger } from '@google/gemini-cli-core';
import { FakeContentGenerator, debugLogger } from '@google/gemini-cli-core';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -25,8 +25,10 @@ describe('AppRig', () => {
'fixtures',
'steering.responses',
);
const contentGenerator =
await FakeContentGenerator.fromFile(fakeResponsesPath);
rig = new AppRig({
fakeResponsesPath,
contentGenerator,
configOverrides: { modelSteering: true },
});
await rig.initialize();
@@ -66,7 +68,9 @@ describe('AppRig', () => {
'fixtures',
'simple.responses',
);
rig = new AppRig({ fakeResponsesPath });
const contentGenerator =
await FakeContentGenerator.fromFile(fakeResponsesPath);
rig = new AppRig({ contentGenerator });
await rig.initialize();
await rig.render();
// Wait for initial render

View File

@@ -31,6 +31,8 @@ import {
debugLogger,
CoreToolCallStatus,
IntegrityDataStatus,
ConsecaSafetyChecker,
type ContentGenerator,
} from '@google/gemini-cli-core';
import {
type MockShellCommand,
@@ -48,37 +50,42 @@ import type {
TrackedCompletedToolCall,
TrackedToolCall,
} from '../ui/hooks/useToolScheduler.js';
import type { Content, GenerateContentParameters } from '@google/genai';
// Global state observer for React-based signals
const sessionStateMap = new Map<string, StreamingState>();
const activeRigs = new Map<string, AppRig>();
// Mock StreamingContext to report state changes back to the observer
vi.mock('../ui/contexts/StreamingContext.js', async (importOriginal) => {
// Mock useGeminiStream to report state changes back to the observer
vi.mock('../ui/hooks/useGeminiStream.js', async (importOriginal) => {
const original =
await importOriginal<typeof import('../ui/contexts/StreamingContext.js')>();
const { useConfig } = await import('../ui/contexts/ConfigContext.js');
await importOriginal<typeof import('../ui/hooks/useGeminiStream.js')>();
const React = await import('react');
const { useConfig } = await import('../ui/contexts/ConfigContext.js');
return {
...original,
useStreamingContext: () => {
const state = original.useStreamingContext();
// eslint-disable-next-line @typescript-eslint/no-explicit-any
useGeminiStream: (...args: any[]) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const result = (original.useGeminiStream as any)(...args);
const config = useConfig();
const sessionId = config.getSessionId();
React.useEffect(() => {
sessionStateMap.set(sessionId, state);
// If we see activity, we are no longer "awaiting" the start of a response
if (state !== StreamingState.Idle) {
const rig = activeRigs.get(sessionId);
if (rig) {
rig.awaitingResponse = false;
if (sessionId) {
sessionStateMap.set(sessionId, result.streamingState);
// If we see activity, we are no longer "awaiting" the start of a response
if (result.streamingState !== StreamingState.Idle) {
const rig = activeRigs.get(sessionId);
if (rig) {
rig.awaitingResponse = false;
}
}
}
}, [sessionId, state]);
}, [sessionId, result.streamingState]);
return state;
return result;
},
};
});
@@ -142,10 +149,10 @@ vi.mock('../ui/components/GeminiRespondingSpinner.js', async () => {
});
export interface AppRigOptions {
fakeResponsesPath?: string;
terminalWidth?: number;
terminalHeight?: number;
configOverrides?: Partial<ConfigParameters>;
contentGenerator?: ContentGenerator;
}
export interface PendingConfirmation {
@@ -160,6 +167,7 @@ export class AppRig {
private settings: LoadedSettings | undefined;
private testDir: string;
private sessionId: string;
private appRigId: string;
private pendingConfirmations = new Map<string, PendingConfirmation>();
private breakpointTools = new Set<string | undefined>();
@@ -169,12 +177,14 @@ export class AppRig {
* True if a message was just sent but React hasn't yet reported a non-idle state.
*/
awaitingResponse = false;
activeStreamCount = 0;
constructor(private options: AppRigOptions = {}) {
const uniqueId = randomUUID();
this.testDir = fs.mkdtempSync(
path.join(os.tmpdir(), `gemini-app-rig-${uniqueId.slice(0, 8)}-`),
);
this.appRigId = path.basename(this.testDir).toLowerCase();
this.sessionId = `test-session-${uniqueId}`;
activeRigs.set(this.sessionId, this);
}
@@ -197,7 +207,7 @@ export class AppRig {
cwd: this.testDir,
debugMode: false,
model: 'test-model',
fakeResponses: this.options.fakeResponsesPath,
contentGenerator: this.options.contentGenerator,
interactive: true,
approvalMode,
policyEngineConfig,
@@ -209,8 +219,44 @@ export class AppRig {
};
this.config = makeFakeConfig(configParams);
if (this.options.fakeResponsesPath) {
this.stubRefreshAuth();
// Track active streams directly from the client to prevent false idleness during synchronous mock yields
const client = this.config.getGeminiClient();
const originalStream = client.sendMessageStream.bind(client);
// eslint-disable-next-line @typescript-eslint/no-this-alias
const self = this;
client.sendMessageStream = async function* (
// eslint-disable-next-line @typescript-eslint/no-explicit-any
...args: any[]
// eslint-disable-next-line @typescript-eslint/no-explicit-any
): AsyncGenerator<any, any, any> {
self.awaitingResponse = false;
self.activeStreamCount++;
try {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
yield* (originalStream as any)(...args);
} finally {
self.activeStreamCount = Math.max(0, self.activeStreamCount - 1);
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} as any;
if (this.config.fakeResponses || this.options.contentGenerator) {
if (!this.options.contentGenerator && !this.config.fakeResponses) {
this.stubRefreshAuth();
}
if (!process.env['GEMINI_API_KEY']) {
vi.stubEnv('GEMINI_API_KEY', 'test-api-key');
}
MockShellExecutionService.setPassthrough(false);
} else {
if (!process.env['GEMINI_API_KEY']) {
throw new Error(
'GEMINI_API_KEY must be set in the environment for live model tests.',
);
}
// For live tests, we allow falling through to the real shell service if no mock matches
MockShellExecutionService.setPassthrough(true);
}
this.setupMessageBusListeners();
@@ -226,18 +272,6 @@ export class AppRig {
private setupEnvironment() {
// Stub environment variables to avoid interference from developer's machine
vi.stubEnv('GEMINI_CLI_HOME', this.testDir);
if (this.options.fakeResponsesPath) {
vi.stubEnv('GEMINI_API_KEY', 'test-api-key');
MockShellExecutionService.setPassthrough(false);
} else {
if (!process.env['GEMINI_API_KEY']) {
throw new Error(
'GEMINI_API_KEY must be set in the environment for live model tests.',
);
}
// For live tests, we allow falling through to the real shell service if no mock matches
MockShellExecutionService.setPassthrough(true);
}
vi.stubEnv('GEMINI_DEFAULT_AUTH_TYPE', AuthType.USE_GEMINI);
}
@@ -355,18 +389,23 @@ export class AppRig {
* Returns true if the agent is currently busy (responding or executing tools).
*/
isBusy(): boolean {
if (this.awaitingResponse) {
const reactState = sessionStateMap.get(this.sessionId);
if (reactState && reactState !== StreamingState.Idle) {
this.awaitingResponse = false;
}
if (this.awaitingResponse || this.activeStreamCount > 0) {
return true;
}
const reactState = sessionStateMap.get(this.sessionId);
// If we have a React-based state, use it as the definitive signal.
// 'responding' and 'waiting-for-confirmation' both count as busy for the overall task.
if (reactState !== undefined) {
return reactState !== StreamingState.Idle;
}
// Fallback to tool tracking if React hasn't reported yet
// Fallback to tool tracking
const isAnyToolActive = this.toolCalls.some((tc) => {
if (
tc.status === CoreToolCallStatus.Executing ||
@@ -657,6 +696,21 @@ export class AppRig {
}
}
/**
* Acts as an automated user ('Mock User') to prime the system with a specific
* history state before handing off control to a live trial or eval.
*
* @param prompts An array of user messages to send sequentially.
* @param timeout Optional timeout per interaction.
*/
async driveMockUser(prompts: string[], timeout = 60000) {
for (let i = 0; i < prompts.length; i++) {
const prompt = prompts[i];
await this.sendMessage(prompt);
await this.drainBreakpointsUntilIdle(undefined, timeout);
}
}
getConfig(): Config {
if (!this.config) throw new Error('AppRig not initialized');
return this.config;
@@ -754,6 +808,10 @@ export class AppRig {
// Forcefully clear IdeClient singleton promise
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(IdeClient as any).instancePromise = null;
// Reset Conseca singleton to avoid leaking config/state across tests
ConsecaSafetyChecker.resetInstance();
vi.clearAllMocks();
this.config = undefined;
@@ -770,4 +828,79 @@ export class AppRig {
}
}
}
getSentRequests() {
if (!this.config) throw new Error('AppRig not initialized');
return this.config.getContentGenerator().getSentRequests?.() || [];
}
/**
* Helper to get the curated history (contents) sent in the most recent model request.
* This method scrubs unstable data like temp paths and IDs for deterministic goldens.
*/
getLastSentRequestContents() {
const requests = this.getSentRequests();
if (requests.length === 0) return [];
const contents = requests[requests.length - 1].contents || [];
return this.scrubUnstableData(contents);
}
/**
* Gets the final curated history of the active chat session.
*/
getCuratedHistory() {
if (!this.config) throw new Error('AppRig not initialized');
const history = this.config.getGeminiClient().getChat().getHistory(true);
return this.scrubUnstableData(history);
}
private scrubUnstableData<
T extends
| Content[]
| GenerateContentParameters['contents']
| readonly Content[],
>(contents: T): T {
// Deeply scrub unstable data
const scrubbedString = JSON.stringify(contents)
.replace(new RegExp(this.testDir, 'g'), '<TEST_DIR>')
.replace(new RegExp(this.appRigId, 'g'), '<APP_RIG_ID>')
.replace(new RegExp(this.sessionId, 'g'), '<SESSION_ID>')
.replace(
/([a-zA-Z0-9_]+)_([0-9]{13})_([0-9]+)\.txt/g,
'$1_<TIMESTAMP>_<INDEX>.txt',
)
.replace(/Process Group PGID: \d+/g, 'Process Group PGID: <PGID>');
const scrubbed = JSON.parse(scrubbedString) as T;
if (Array.isArray(scrubbed) && scrubbed.length > 0) {
const firstItem = scrubbed[0] as Content;
if (firstItem.parts?.[0]?.text?.includes('<session_context>')) {
firstItem.parts[0].text = '<SESSION_CONTEXT>';
}
for (const content of scrubbed as Content[]) {
if (content.parts) {
for (const part of content.parts) {
if (part.functionCall) {
part.functionCall.id = '<CALL_ID>';
}
if (part.functionResponse) {
part.functionResponse.id = '<CALL_ID>';
if (
part.functionResponse.response !== null &&
typeof part.functionResponse.response === 'object' &&
'original_output_file' in part.functionResponse.response
) {
part.functionResponse.response['original_output_file'] =
'<TMP_FILE>';
}
}
}
}
}
}
return scrubbed;
}
}

View File

@@ -262,5 +262,47 @@ describe('useReverseSearchCompletion', () => {
expect(result.current.suggestions).toEqual([]);
expect(result.current.showSuggestions).toBe(false);
});
it('does not cause state updates or timeouts when reverseSearchActive is false', async () => {
const history = ['alpha', 'beta'];
const { result, rerender } = await renderHookWithProviders(
({ active }) => {
const textBuffer = useTextBufferForTest('');
return {
search: useReverseSearchCompletion(textBuffer, history, active),
buffer: textBuffer,
};
},
{ initialProps: { active: false } },
);
// Typing while inactive should NOT update debounced search state.
// We change text to 'a', advance time past the debounce period (100ms),
// and ensure suggestions remain empty because the debounced value never changed.
act(() => {
result.current.buffer.setText('a');
});
act(() => {
vi.advanceTimersByTime(200);
});
expect(result.current.search.suggestions).toEqual([]);
// Activating reverse search and setting text to 'alpha' forces useDebouncedValue
// to receive 'alpha', scheduling the timeout.
act(() => {
result.current.buffer.setText('alpha');
});
rerender({ active: true });
act(() => {
vi.advanceTimersByTime(200);
});
expect(result.current.search.suggestions.map((s) => s.value)).toEqual([
'alpha',
]);
});
});
});

View File

@@ -48,7 +48,10 @@ export function useReverseSearchCompletion(
setVisibleStartIndex,
} = useCompletion();
const debouncedQuery = useDebouncedValue(buffer.text, 100);
const debouncedQuery = useDebouncedValue(
reverseSearchActive ? buffer.text : '',
100,
);
// incremental search
const prevQueryRef = useRef<string>('');