diff --git a/packages/core/src/agents/executor.test.ts b/packages/core/src/agents/executor.test.ts index 0582558648..13e56c6a87 100644 --- a/packages/core/src/agents/executor.test.ts +++ b/packages/core/src/agents/executor.test.ts @@ -26,8 +26,16 @@ import { MockTool } from '../test-utils/mock-tool.js'; import { getDirectoryContextString } from '../utils/environmentContext.js'; import { z } from 'zod'; import { promptIdContext } from '../utils/promptIdContext.js'; -import { logAgentStart, logAgentFinish } from '../telemetry/loggers.js'; -import { AgentStartEvent, AgentFinishEvent } from '../telemetry/types.js'; +import { + logAgentStart, + logAgentFinish, + logRecoveryAttempt, +} from '../telemetry/loggers.js'; +import { + AgentStartEvent, + AgentFinishEvent, + RecoveryAttemptEvent, +} from '../telemetry/types.js'; import type { AgentDefinition, AgentInputs, @@ -61,6 +69,7 @@ vi.mock('../utils/environmentContext.js'); vi.mock('../telemetry/loggers.js', () => ({ logAgentStart: vi.fn(), logAgentFinish: vi.fn(), + logRecoveryAttempt: vi.fn(), })); vi.mock('../utils/promptIdContext.js', async (importOriginal) => { @@ -81,6 +90,7 @@ const mockedGetDirectoryContextString = vi.mocked(getDirectoryContextString); const mockedPromptIdContext = vi.mocked(promptIdContext); const mockedLogAgentStart = vi.mocked(logAgentStart); const mockedLogAgentFinish = vi.mocked(logAgentFinish); +const mockedLogRecoveryAttempt = vi.mocked(logRecoveryAttempt); // Constants for testing const TASK_COMPLETE_TOOL_NAME = 'complete_task'; @@ -1343,4 +1353,91 @@ describe('AgentExecutor', () => { ); }); }); + describe('Telemetry and Logging', () => { + const mockWorkResponse = (id: string) => { + mockModelResponse([{ name: LS_TOOL_NAME, args: { path: '.' }, id }]); + mockExecuteToolCall.mockResolvedValueOnce({ + status: 'success', + request: { + callId: id, + name: LS_TOOL_NAME, + args: { path: '.' }, + isClientInitiated: false, + prompt_id: 'test-prompt', + }, + tool: {} as AnyDeclarativeTool, + invocation: {} as AnyToolInvocation, + response: { + callId: id, + resultDisplay: 'ok', + responseParts: [ + { functionResponse: { name: LS_TOOL_NAME, response: {}, id } }, + ], + error: undefined, + errorType: undefined, + contentLength: undefined, + }, + }); + }; + + beforeEach(() => { + mockedLogRecoveryAttempt.mockClear(); + }); + + it('should log a RecoveryAttemptEvent when a recoverable error occurs and recovery fails', async () => { + const MAX = 1; + const definition = createTestDefinition([LS_TOOL_NAME], { + max_turns: MAX, + }); + const executor = await AgentExecutor.create(definition, mockConfig); + + // Turn 1 (hits max_turns) + mockWorkResponse('t1'); + + // Recovery Turn (fails by calling no tools) + mockModelResponse([], 'I give up again.'); + + await executor.run({ goal: 'Turns recovery fail' }, signal); + + expect(mockedLogRecoveryAttempt).toHaveBeenCalledTimes(1); + const recoveryEvent = mockedLogRecoveryAttempt.mock.calls[0][1]; + expect(recoveryEvent).toBeInstanceOf(RecoveryAttemptEvent); + expect(recoveryEvent.agent_name).toBe(definition.name); + expect(recoveryEvent.reason).toBe(AgentTerminateMode.MAX_TURNS); + expect(recoveryEvent.success).toBe(false); + expect(recoveryEvent.turn_count).toBe(1); + expect(recoveryEvent.duration_ms).toBeGreaterThanOrEqual(0); + }); + + it('should log a successful RecoveryAttemptEvent when recovery succeeds', async () => { + const MAX = 1; + const definition = createTestDefinition([LS_TOOL_NAME], { + max_turns: MAX, + }); + const executor = await AgentExecutor.create(definition, mockConfig); + + // Turn 1 (hits max_turns) + mockWorkResponse('t1'); + + // Recovery Turn (succeeds) + mockModelResponse( + [ + { + name: TASK_COMPLETE_TOOL_NAME, + args: { finalResult: 'Recovered!' }, + id: 't2', + }, + ], + 'Recovering from max turns', + ); + + await executor.run({ goal: 'Turns recovery success' }, signal); + + expect(mockedLogRecoveryAttempt).toHaveBeenCalledTimes(1); + const recoveryEvent = mockedLogRecoveryAttempt.mock.calls[0][1]; + expect(recoveryEvent).toBeInstanceOf(RecoveryAttemptEvent); + expect(recoveryEvent.success).toBe(true); + expect(recoveryEvent.reason).toBe(AgentTerminateMode.MAX_TURNS); + }); + }); }); diff --git a/packages/core/src/agents/executor.ts b/packages/core/src/agents/executor.ts index 84007d90e2..8928a75e69 100644 --- a/packages/core/src/agents/executor.ts +++ b/packages/core/src/agents/executor.ts @@ -30,8 +30,16 @@ import { WEB_SEARCH_TOOL_NAME, } from '../tools/tool-names.js'; import { promptIdContext } from '../utils/promptIdContext.js'; -import { logAgentStart, logAgentFinish } from '../telemetry/loggers.js'; -import { AgentStartEvent, AgentFinishEvent } from '../telemetry/types.js'; +import { + logAgentStart, + logAgentFinish, + logRecoveryAttempt, +} from '../telemetry/loggers.js'; +import { + AgentStartEvent, + AgentFinishEvent, + RecoveryAttemptEvent, +} from '../telemetry/types.js'; import type { AgentDefinition, AgentInputs, @@ -269,6 +277,9 @@ export class AgentExecutor { text: `Execution limit reached (${reason}). Attempting one final recovery turn with a grace period.`, }); + const recoveryStartTime = Date.now(); + let success = false; + const gracePeriodMs = GRACE_PERIOD_MS; const graceTimeoutController = new AbortController(); const graceTimeoutId = setTimeout( @@ -305,6 +316,7 @@ export class AgentExecutor { this.emitActivity('THOUGHT_CHUNK', { text: 'Graceful recovery succeeded.', }); + success = true; return turnResult.finalResult ?? 'Task completed during grace period.'; } @@ -323,6 +335,17 @@ export class AgentExecutor { return null; } finally { clearTimeout(graceTimeoutId); + logRecoveryAttempt( + this.runtimeContext, + new RecoveryAttemptEvent( + this.agentId, + this.definition.name, + reason, + Date.now() - recoveryStartTime, + success, + turnCounter, + ), + ); } } diff --git a/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts b/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts index 8fcc9e6fc3..bd3dfb4109 100644 --- a/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts +++ b/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts @@ -35,6 +35,7 @@ import type { SmartEditCorrectionEvent, AgentStartEvent, AgentFinishEvent, + RecoveryAttemptEvent, WebFetchFallbackAttemptEvent, ExtensionUpdateEvent, } from '../types.js'; @@ -90,6 +91,7 @@ export enum EventNames { SMART_EDIT_CORRECTION = 'smart_edit_correction', AGENT_START = 'agent_start', AGENT_FINISH = 'agent_finish', + RECOVERY_ATTEMPT = 'recovery_attempt', WEB_FETCH_FALLBACK_ATTEMPT = 'web_fetch_fallback_attempt', } @@ -1230,6 +1232,40 @@ export class ClearcutLogger { this.flushIfNeeded(); } + logRecoveryAttemptEvent(event: RecoveryAttemptEvent): void { + const data: EventValue[] = [ + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_ID, + value: event.agent_id, + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_NAME, + value: event.agent_name, + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_RECOVERY_REASON, + value: event.reason, + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_RECOVERY_DURATION_MS, + value: event.duration_ms.toString(), + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_RECOVERY_SUCCESS, + value: event.success.toString(), + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_TURN_COUNT, + value: event.turn_count.toString(), + }, + ]; + + this.enqueueLogEvent( + this.createLogEvent(EventNames.RECOVERY_ATTEMPT, data), + ); + this.flushIfNeeded(); + } + logWebFetchFallbackAttemptEvent(event: WebFetchFallbackAttemptEvent): void { const data: EventValue[] = [ { diff --git a/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts b/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts index 5a99db8001..e24292c51c 100644 --- a/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts +++ b/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts @@ -464,4 +464,13 @@ export enum EventMetadataKey { // Logs the reason for agent termination. GEMINI_CLI_AGENT_TERMINATE_REASON = 115, + + // Logs the reason for an agent recovery attempt. + GEMINI_CLI_AGENT_RECOVERY_REASON = 122, + + // Logs the duration of an agent recovery attempt in milliseconds. + GEMINI_CLI_AGENT_RECOVERY_DURATION_MS = 123, + + // Logs whether the agent recovery attempt was successful. + GEMINI_CLI_AGENT_RECOVERY_SUCCESS = 124, } diff --git a/packages/core/src/telemetry/loggers.ts b/packages/core/src/telemetry/loggers.ts index e0ec387cd8..6d2412d2ff 100644 --- a/packages/core/src/telemetry/loggers.ts +++ b/packages/core/src/telemetry/loggers.ts @@ -46,6 +46,7 @@ import type { SmartEditCorrectionEvent, AgentStartEvent, AgentFinishEvent, + RecoveryAttemptEvent, WebFetchFallbackAttemptEvent, ExtensionUpdateEvent, } from './types.js'; @@ -63,6 +64,7 @@ import { recordTokenUsageMetrics, recordApiResponseMetrics, recordAgentRunMetrics, + recordRecoveryAttemptMetrics, recordLinesChanged, } from './metrics.js'; import { isTelemetrySdkInitialized } from './sdk.js'; @@ -636,6 +638,23 @@ export function logAgentFinish(config: Config, event: AgentFinishEvent): void { recordAgentRunMetrics(config, event); } +export function logRecoveryAttempt( + config: Config, + event: RecoveryAttemptEvent, +): void { + ClearcutLogger.getInstance(config)?.logRecoveryAttemptEvent(event); + if (!isTelemetrySdkInitialized()) return; + + const logger = logs.getLogger(SERVICE_NAME); + const logRecord: LogRecord = { + body: event.toLogBody(), + attributes: event.toOpenTelemetryAttributes(config), + }; + logger.emit(logRecord); + + recordRecoveryAttemptMetrics(config, event); +} + export function logWebFetchFallbackAttempt( config: Config, event: WebFetchFallbackAttemptEvent, diff --git a/packages/core/src/telemetry/metrics.ts b/packages/core/src/telemetry/metrics.ts index e28413524a..f2078aa62d 100644 --- a/packages/core/src/telemetry/metrics.ts +++ b/packages/core/src/telemetry/metrics.ts @@ -12,6 +12,7 @@ import type { ModelRoutingEvent, ModelSlashCommandEvent, AgentFinishEvent, + RecoveryAttemptEvent, } from './types.js'; import { AuthType } from '../core/contentGenerator.js'; import { getCommonAttributes } from './telemetryAttributes.js'; @@ -38,6 +39,9 @@ const MODEL_SLASH_COMMAND_CALL_COUNT = const AGENT_RUN_COUNT = 'gemini_cli.agent.run.count'; const AGENT_DURATION_MS = 'gemini_cli.agent.duration'; const AGENT_TURNS = 'gemini_cli.agent.turns'; +const AGENT_RECOVERY_ATTEMPT_COUNT = 'gemini_cli.agent.recovery_attempt.count'; +const AGENT_RECOVERY_ATTEMPT_DURATION = + 'gemini_cli.agent.recovery_attempt.duration'; // OpenTelemetry GenAI Semantic Convention Metrics const GEN_AI_CLIENT_TOKEN_USAGE = 'gen_ai.client.token.usage'; @@ -175,6 +179,16 @@ const COUNTER_DEFINITIONS = { terminate_reason: string; }, }, + [AGENT_RECOVERY_ATTEMPT_COUNT]: { + description: 'Counts agent recovery attempts.', + valueType: ValueType.INT, + assign: (c: Counter) => (agentRecoveryAttemptCounter = c), + attributes: {} as { + agent_name: string; + reason: string; + success: boolean; + }, + }, [FLICKER_FRAME_COUNT]: { description: 'Counts UI frames that flicker (render taller than the terminal).', @@ -244,6 +258,15 @@ const HISTOGRAM_DEFINITIONS = { agent_name: string; }, }, + [AGENT_RECOVERY_ATTEMPT_DURATION]: { + description: 'Duration of agent recovery attempts in milliseconds.', + unit: 'ms', + valueType: ValueType.INT, + assign: (h: Histogram) => (agentRecoveryAttemptDurationHistogram = h), + attributes: {} as { + agent_name: string; + }, + }, [GEN_AI_CLIENT_TOKEN_USAGE]: { description: 'Number of input and output tokens used.', unit: 'token', @@ -478,6 +501,8 @@ let modelSlashCommandCallCounter: Counter | undefined; let agentRunCounter: Counter | undefined; let agentDurationHistogram: Histogram | undefined; let agentTurnsHistogram: Histogram | undefined; +let agentRecoveryAttemptCounter: Counter | undefined; +let agentRecoveryAttemptDurationHistogram: Histogram | undefined; let flickerFrameCounter: Counter | undefined; let exitFailCounter: Counter | undefined; let slowRenderHistogram: Histogram | undefined; @@ -774,6 +799,32 @@ export function recordAgentRunMetrics( }); } +export function recordRecoveryAttemptMetrics( + config: Config, + event: RecoveryAttemptEvent, +): void { + if ( + !agentRecoveryAttemptCounter || + !agentRecoveryAttemptDurationHistogram || + !isMetricsInitialized + ) + return; + + const commonAttributes = baseMetricDefinition.getCommonAttributes(config); + + agentRecoveryAttemptCounter.add(1, { + ...commonAttributes, + agent_name: event.agent_name, + reason: event.reason, + success: event.success, + }); + + agentRecoveryAttemptDurationHistogram.record(event.duration_ms, { + ...commonAttributes, + agent_name: event.agent_name, + }); +} + // OpenTelemetry GenAI Semantic Convention Recording Functions export function recordGenAiClientTokenUsage( diff --git a/packages/core/src/telemetry/types.ts b/packages/core/src/telemetry/types.ts index 6138705a50..3dc4865925 100644 --- a/packages/core/src/telemetry/types.ts +++ b/packages/core/src/telemetry/types.ts @@ -1459,6 +1459,7 @@ export type TelemetryEvent = | ModelSlashCommandEvent | AgentStartEvent | AgentFinishEvent + | RecoveryAttemptEvent | WebFetchFallbackAttemptEvent; export const EVENT_EXTENSION_DISABLE = 'gemini_cli.extension_disable'; @@ -1548,15 +1549,16 @@ export class SmartEditCorrectionEvent implements BaseTelemetryEvent { } } -export const EVENT_AGENT_START = 'gemini_cli.agent.start'; -export class AgentStartEvent implements BaseTelemetryEvent { - 'event.name': 'agent_start'; +abstract class BaseAgentEvent implements BaseTelemetryEvent { + abstract 'event.name': + | 'agent_start' + | 'agent_finish' + | 'agent_recovery_attempt'; 'event.timestamp': string; agent_id: string; agent_name: string; constructor(agent_id: string, agent_name: string) { - this['event.name'] = 'agent_start'; this['event.timestamp'] = new Date().toISOString(); this.agent_id = agent_id; this.agent_name = agent_name; @@ -1565,24 +1567,38 @@ export class AgentStartEvent implements BaseTelemetryEvent { toOpenTelemetryAttributes(config: Config): LogAttributes { return { ...getCommonAttributes(config), - 'event.name': EVENT_AGENT_START, 'event.timestamp': this['event.timestamp'], agent_id: this.agent_id, agent_name: this.agent_name, }; } + abstract toLogBody(): string; +} + +export const EVENT_AGENT_START = 'gemini_cli.agent.start'; +export class AgentStartEvent extends BaseAgentEvent { + 'event.name' = 'agent_start' as const; + + constructor(agent_id: string, agent_name: string) { + super(agent_id, agent_name); + } + + override toOpenTelemetryAttributes(config: Config): LogAttributes { + return { + ...super.toOpenTelemetryAttributes(config), + 'event.name': EVENT_AGENT_START, + }; + } + toLogBody(): string { return `Agent ${this.agent_name} started. ID: ${this.agent_id}`; } } export const EVENT_AGENT_FINISH = 'gemini_cli.agent.finish'; -export class AgentFinishEvent implements BaseTelemetryEvent { - 'event.name': 'agent_finish'; - 'event.timestamp': string; - agent_id: string; - agent_name: string; +export class AgentFinishEvent extends BaseAgentEvent { + 'event.name' = 'agent_finish' as const; duration_ms: number; turn_count: number; terminate_reason: AgentTerminateMode; @@ -1594,22 +1610,16 @@ export class AgentFinishEvent implements BaseTelemetryEvent { turn_count: number, terminate_reason: AgentTerminateMode, ) { - this['event.name'] = 'agent_finish'; - this['event.timestamp'] = new Date().toISOString(); - this.agent_id = agent_id; - this.agent_name = agent_name; + super(agent_id, agent_name); this.duration_ms = duration_ms; this.turn_count = turn_count; this.terminate_reason = terminate_reason; } - toOpenTelemetryAttributes(config: Config): LogAttributes { + override toOpenTelemetryAttributes(config: Config): LogAttributes { return { - ...getCommonAttributes(config), + ...super.toOpenTelemetryAttributes(config), 'event.name': EVENT_AGENT_FINISH, - 'event.timestamp': this['event.timestamp'], - agent_id: this.agent_id, - agent_name: this.agent_name, duration_ms: this.duration_ms, turn_count: this.turn_count, terminate_reason: this.terminate_reason, @@ -1621,6 +1631,45 @@ export class AgentFinishEvent implements BaseTelemetryEvent { } } +export const EVENT_AGENT_RECOVERY_ATTEMPT = 'gemini_cli.agent.recovery_attempt'; +export class RecoveryAttemptEvent extends BaseAgentEvent { + 'event.name' = 'agent_recovery_attempt' as const; + reason: AgentTerminateMode; + duration_ms: number; + success: boolean; + turn_count: number; + + constructor( + agent_id: string, + agent_name: string, + reason: AgentTerminateMode, + duration_ms: number, + success: boolean, + turn_count: number, + ) { + super(agent_id, agent_name); + this.reason = reason; + this.duration_ms = duration_ms; + this.success = success; + this.turn_count = turn_count; + } + + override toOpenTelemetryAttributes(config: Config): LogAttributes { + return { + ...super.toOpenTelemetryAttributes(config), + 'event.name': EVENT_AGENT_RECOVERY_ATTEMPT, + reason: this.reason, + duration_ms: this.duration_ms, + success: this.success, + turn_count: this.turn_count, + }; + } + + toLogBody(): string { + return `Agent ${this.agent_name} recovery attempt. Reason: ${this.reason}. Success: ${this.success}. Duration: ${this.duration_ms}ms.`; + } +} + export const EVENT_WEB_FETCH_FALLBACK_ATTEMPT = 'gemini_cli.web_fetch_fallback_attempt'; export class WebFetchFallbackAttemptEvent implements BaseTelemetryEvent {