Log recovery events (nudges) that happens inside the subagent (#12408)

This commit is contained in:
Silvio Junior
2025-11-03 17:53:43 -05:00
committed by GitHub
parent be1dc13bb1
commit 7a515339ea
7 changed files with 307 additions and 23 deletions

View File

@@ -26,8 +26,16 @@ import { MockTool } from '../test-utils/mock-tool.js';
import { getDirectoryContextString } from '../utils/environmentContext.js';
import { z } from 'zod';
import { promptIdContext } from '../utils/promptIdContext.js';
import { logAgentStart, logAgentFinish } from '../telemetry/loggers.js';
import { AgentStartEvent, AgentFinishEvent } from '../telemetry/types.js';
import {
logAgentStart,
logAgentFinish,
logRecoveryAttempt,
} from '../telemetry/loggers.js';
import {
AgentStartEvent,
AgentFinishEvent,
RecoveryAttemptEvent,
} from '../telemetry/types.js';
import type {
AgentDefinition,
AgentInputs,
@@ -61,6 +69,7 @@ vi.mock('../utils/environmentContext.js');
vi.mock('../telemetry/loggers.js', () => ({
logAgentStart: vi.fn(),
logAgentFinish: vi.fn(),
logRecoveryAttempt: vi.fn(),
}));
vi.mock('../utils/promptIdContext.js', async (importOriginal) => {
@@ -81,6 +90,7 @@ const mockedGetDirectoryContextString = vi.mocked(getDirectoryContextString);
const mockedPromptIdContext = vi.mocked(promptIdContext);
const mockedLogAgentStart = vi.mocked(logAgentStart);
const mockedLogAgentFinish = vi.mocked(logAgentFinish);
const mockedLogRecoveryAttempt = vi.mocked(logRecoveryAttempt);
// Constants for testing
const TASK_COMPLETE_TOOL_NAME = 'complete_task';
@@ -1343,4 +1353,91 @@ describe('AgentExecutor', () => {
);
});
});
describe('Telemetry and Logging', () => {
const mockWorkResponse = (id: string) => {
mockModelResponse([{ name: LS_TOOL_NAME, args: { path: '.' }, id }]);
mockExecuteToolCall.mockResolvedValueOnce({
status: 'success',
request: {
callId: id,
name: LS_TOOL_NAME,
args: { path: '.' },
isClientInitiated: false,
prompt_id: 'test-prompt',
},
tool: {} as AnyDeclarativeTool,
invocation: {} as AnyToolInvocation,
response: {
callId: id,
resultDisplay: 'ok',
responseParts: [
{ functionResponse: { name: LS_TOOL_NAME, response: {}, id } },
],
error: undefined,
errorType: undefined,
contentLength: undefined,
},
});
};
beforeEach(() => {
mockedLogRecoveryAttempt.mockClear();
});
it('should log a RecoveryAttemptEvent when a recoverable error occurs and recovery fails', async () => {
const MAX = 1;
const definition = createTestDefinition([LS_TOOL_NAME], {
max_turns: MAX,
});
const executor = await AgentExecutor.create(definition, mockConfig);
// Turn 1 (hits max_turns)
mockWorkResponse('t1');
// Recovery Turn (fails by calling no tools)
mockModelResponse([], 'I give up again.');
await executor.run({ goal: 'Turns recovery fail' }, signal);
expect(mockedLogRecoveryAttempt).toHaveBeenCalledTimes(1);
const recoveryEvent = mockedLogRecoveryAttempt.mock.calls[0][1];
expect(recoveryEvent).toBeInstanceOf(RecoveryAttemptEvent);
expect(recoveryEvent.agent_name).toBe(definition.name);
expect(recoveryEvent.reason).toBe(AgentTerminateMode.MAX_TURNS);
expect(recoveryEvent.success).toBe(false);
expect(recoveryEvent.turn_count).toBe(1);
expect(recoveryEvent.duration_ms).toBeGreaterThanOrEqual(0);
});
it('should log a successful RecoveryAttemptEvent when recovery succeeds', async () => {
const MAX = 1;
const definition = createTestDefinition([LS_TOOL_NAME], {
max_turns: MAX,
});
const executor = await AgentExecutor.create(definition, mockConfig);
// Turn 1 (hits max_turns)
mockWorkResponse('t1');
// Recovery Turn (succeeds)
mockModelResponse(
[
{
name: TASK_COMPLETE_TOOL_NAME,
args: { finalResult: 'Recovered!' },
id: 't2',
},
],
'Recovering from max turns',
);
await executor.run({ goal: 'Turns recovery success' }, signal);
expect(mockedLogRecoveryAttempt).toHaveBeenCalledTimes(1);
const recoveryEvent = mockedLogRecoveryAttempt.mock.calls[0][1];
expect(recoveryEvent).toBeInstanceOf(RecoveryAttemptEvent);
expect(recoveryEvent.success).toBe(true);
expect(recoveryEvent.reason).toBe(AgentTerminateMode.MAX_TURNS);
});
});
});

View File

@@ -30,8 +30,16 @@ import {
WEB_SEARCH_TOOL_NAME,
} from '../tools/tool-names.js';
import { promptIdContext } from '../utils/promptIdContext.js';
import { logAgentStart, logAgentFinish } from '../telemetry/loggers.js';
import { AgentStartEvent, AgentFinishEvent } from '../telemetry/types.js';
import {
logAgentStart,
logAgentFinish,
logRecoveryAttempt,
} from '../telemetry/loggers.js';
import {
AgentStartEvent,
AgentFinishEvent,
RecoveryAttemptEvent,
} from '../telemetry/types.js';
import type {
AgentDefinition,
AgentInputs,
@@ -269,6 +277,9 @@ export class AgentExecutor<TOutput extends z.ZodTypeAny> {
text: `Execution limit reached (${reason}). Attempting one final recovery turn with a grace period.`,
});
const recoveryStartTime = Date.now();
let success = false;
const gracePeriodMs = GRACE_PERIOD_MS;
const graceTimeoutController = new AbortController();
const graceTimeoutId = setTimeout(
@@ -305,6 +316,7 @@ export class AgentExecutor<TOutput extends z.ZodTypeAny> {
this.emitActivity('THOUGHT_CHUNK', {
text: 'Graceful recovery succeeded.',
});
success = true;
return turnResult.finalResult ?? 'Task completed during grace period.';
}
@@ -323,6 +335,17 @@ export class AgentExecutor<TOutput extends z.ZodTypeAny> {
return null;
} finally {
clearTimeout(graceTimeoutId);
logRecoveryAttempt(
this.runtimeContext,
new RecoveryAttemptEvent(
this.agentId,
this.definition.name,
reason,
Date.now() - recoveryStartTime,
success,
turnCounter,
),
);
}
}

View File

@@ -35,6 +35,7 @@ import type {
SmartEditCorrectionEvent,
AgentStartEvent,
AgentFinishEvent,
RecoveryAttemptEvent,
WebFetchFallbackAttemptEvent,
ExtensionUpdateEvent,
} from '../types.js';
@@ -90,6 +91,7 @@ export enum EventNames {
SMART_EDIT_CORRECTION = 'smart_edit_correction',
AGENT_START = 'agent_start',
AGENT_FINISH = 'agent_finish',
RECOVERY_ATTEMPT = 'recovery_attempt',
WEB_FETCH_FALLBACK_ATTEMPT = 'web_fetch_fallback_attempt',
}
@@ -1230,6 +1232,40 @@ export class ClearcutLogger {
this.flushIfNeeded();
}
logRecoveryAttemptEvent(event: RecoveryAttemptEvent): void {
const data: EventValue[] = [
{
gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_ID,
value: event.agent_id,
},
{
gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_NAME,
value: event.agent_name,
},
{
gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_RECOVERY_REASON,
value: event.reason,
},
{
gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_RECOVERY_DURATION_MS,
value: event.duration_ms.toString(),
},
{
gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_RECOVERY_SUCCESS,
value: event.success.toString(),
},
{
gemini_cli_key: EventMetadataKey.GEMINI_CLI_AGENT_TURN_COUNT,
value: event.turn_count.toString(),
},
];
this.enqueueLogEvent(
this.createLogEvent(EventNames.RECOVERY_ATTEMPT, data),
);
this.flushIfNeeded();
}
logWebFetchFallbackAttemptEvent(event: WebFetchFallbackAttemptEvent): void {
const data: EventValue[] = [
{

View File

@@ -464,4 +464,13 @@ export enum EventMetadataKey {
// Logs the reason for agent termination.
GEMINI_CLI_AGENT_TERMINATE_REASON = 115,
// Logs the reason for an agent recovery attempt.
GEMINI_CLI_AGENT_RECOVERY_REASON = 122,
// Logs the duration of an agent recovery attempt in milliseconds.
GEMINI_CLI_AGENT_RECOVERY_DURATION_MS = 123,
// Logs whether the agent recovery attempt was successful.
GEMINI_CLI_AGENT_RECOVERY_SUCCESS = 124,
}

View File

@@ -46,6 +46,7 @@ import type {
SmartEditCorrectionEvent,
AgentStartEvent,
AgentFinishEvent,
RecoveryAttemptEvent,
WebFetchFallbackAttemptEvent,
ExtensionUpdateEvent,
} from './types.js';
@@ -63,6 +64,7 @@ import {
recordTokenUsageMetrics,
recordApiResponseMetrics,
recordAgentRunMetrics,
recordRecoveryAttemptMetrics,
recordLinesChanged,
} from './metrics.js';
import { isTelemetrySdkInitialized } from './sdk.js';
@@ -636,6 +638,23 @@ export function logAgentFinish(config: Config, event: AgentFinishEvent): void {
recordAgentRunMetrics(config, event);
}
export function logRecoveryAttempt(
config: Config,
event: RecoveryAttemptEvent,
): void {
ClearcutLogger.getInstance(config)?.logRecoveryAttemptEvent(event);
if (!isTelemetrySdkInitialized()) return;
const logger = logs.getLogger(SERVICE_NAME);
const logRecord: LogRecord = {
body: event.toLogBody(),
attributes: event.toOpenTelemetryAttributes(config),
};
logger.emit(logRecord);
recordRecoveryAttemptMetrics(config, event);
}
export function logWebFetchFallbackAttempt(
config: Config,
event: WebFetchFallbackAttemptEvent,

View File

@@ -12,6 +12,7 @@ import type {
ModelRoutingEvent,
ModelSlashCommandEvent,
AgentFinishEvent,
RecoveryAttemptEvent,
} from './types.js';
import { AuthType } from '../core/contentGenerator.js';
import { getCommonAttributes } from './telemetryAttributes.js';
@@ -38,6 +39,9 @@ const MODEL_SLASH_COMMAND_CALL_COUNT =
const AGENT_RUN_COUNT = 'gemini_cli.agent.run.count';
const AGENT_DURATION_MS = 'gemini_cli.agent.duration';
const AGENT_TURNS = 'gemini_cli.agent.turns';
const AGENT_RECOVERY_ATTEMPT_COUNT = 'gemini_cli.agent.recovery_attempt.count';
const AGENT_RECOVERY_ATTEMPT_DURATION =
'gemini_cli.agent.recovery_attempt.duration';
// OpenTelemetry GenAI Semantic Convention Metrics
const GEN_AI_CLIENT_TOKEN_USAGE = 'gen_ai.client.token.usage';
@@ -175,6 +179,16 @@ const COUNTER_DEFINITIONS = {
terminate_reason: string;
},
},
[AGENT_RECOVERY_ATTEMPT_COUNT]: {
description: 'Counts agent recovery attempts.',
valueType: ValueType.INT,
assign: (c: Counter) => (agentRecoveryAttemptCounter = c),
attributes: {} as {
agent_name: string;
reason: string;
success: boolean;
},
},
[FLICKER_FRAME_COUNT]: {
description:
'Counts UI frames that flicker (render taller than the terminal).',
@@ -244,6 +258,15 @@ const HISTOGRAM_DEFINITIONS = {
agent_name: string;
},
},
[AGENT_RECOVERY_ATTEMPT_DURATION]: {
description: 'Duration of agent recovery attempts in milliseconds.',
unit: 'ms',
valueType: ValueType.INT,
assign: (h: Histogram) => (agentRecoveryAttemptDurationHistogram = h),
attributes: {} as {
agent_name: string;
},
},
[GEN_AI_CLIENT_TOKEN_USAGE]: {
description: 'Number of input and output tokens used.',
unit: 'token',
@@ -478,6 +501,8 @@ let modelSlashCommandCallCounter: Counter | undefined;
let agentRunCounter: Counter | undefined;
let agentDurationHistogram: Histogram | undefined;
let agentTurnsHistogram: Histogram | undefined;
let agentRecoveryAttemptCounter: Counter | undefined;
let agentRecoveryAttemptDurationHistogram: Histogram | undefined;
let flickerFrameCounter: Counter | undefined;
let exitFailCounter: Counter | undefined;
let slowRenderHistogram: Histogram | undefined;
@@ -774,6 +799,32 @@ export function recordAgentRunMetrics(
});
}
export function recordRecoveryAttemptMetrics(
config: Config,
event: RecoveryAttemptEvent,
): void {
if (
!agentRecoveryAttemptCounter ||
!agentRecoveryAttemptDurationHistogram ||
!isMetricsInitialized
)
return;
const commonAttributes = baseMetricDefinition.getCommonAttributes(config);
agentRecoveryAttemptCounter.add(1, {
...commonAttributes,
agent_name: event.agent_name,
reason: event.reason,
success: event.success,
});
agentRecoveryAttemptDurationHistogram.record(event.duration_ms, {
...commonAttributes,
agent_name: event.agent_name,
});
}
// OpenTelemetry GenAI Semantic Convention Recording Functions
export function recordGenAiClientTokenUsage(

View File

@@ -1459,6 +1459,7 @@ export type TelemetryEvent =
| ModelSlashCommandEvent
| AgentStartEvent
| AgentFinishEvent
| RecoveryAttemptEvent
| WebFetchFallbackAttemptEvent;
export const EVENT_EXTENSION_DISABLE = 'gemini_cli.extension_disable';
@@ -1548,15 +1549,16 @@ export class SmartEditCorrectionEvent implements BaseTelemetryEvent {
}
}
export const EVENT_AGENT_START = 'gemini_cli.agent.start';
export class AgentStartEvent implements BaseTelemetryEvent {
'event.name': 'agent_start';
abstract class BaseAgentEvent implements BaseTelemetryEvent {
abstract 'event.name':
| 'agent_start'
| 'agent_finish'
| 'agent_recovery_attempt';
'event.timestamp': string;
agent_id: string;
agent_name: string;
constructor(agent_id: string, agent_name: string) {
this['event.name'] = 'agent_start';
this['event.timestamp'] = new Date().toISOString();
this.agent_id = agent_id;
this.agent_name = agent_name;
@@ -1565,24 +1567,38 @@ export class AgentStartEvent implements BaseTelemetryEvent {
toOpenTelemetryAttributes(config: Config): LogAttributes {
return {
...getCommonAttributes(config),
'event.name': EVENT_AGENT_START,
'event.timestamp': this['event.timestamp'],
agent_id: this.agent_id,
agent_name: this.agent_name,
};
}
abstract toLogBody(): string;
}
export const EVENT_AGENT_START = 'gemini_cli.agent.start';
export class AgentStartEvent extends BaseAgentEvent {
'event.name' = 'agent_start' as const;
constructor(agent_id: string, agent_name: string) {
super(agent_id, agent_name);
}
override toOpenTelemetryAttributes(config: Config): LogAttributes {
return {
...super.toOpenTelemetryAttributes(config),
'event.name': EVENT_AGENT_START,
};
}
toLogBody(): string {
return `Agent ${this.agent_name} started. ID: ${this.agent_id}`;
}
}
export const EVENT_AGENT_FINISH = 'gemini_cli.agent.finish';
export class AgentFinishEvent implements BaseTelemetryEvent {
'event.name': 'agent_finish';
'event.timestamp': string;
agent_id: string;
agent_name: string;
export class AgentFinishEvent extends BaseAgentEvent {
'event.name' = 'agent_finish' as const;
duration_ms: number;
turn_count: number;
terminate_reason: AgentTerminateMode;
@@ -1594,22 +1610,16 @@ export class AgentFinishEvent implements BaseTelemetryEvent {
turn_count: number,
terminate_reason: AgentTerminateMode,
) {
this['event.name'] = 'agent_finish';
this['event.timestamp'] = new Date().toISOString();
this.agent_id = agent_id;
this.agent_name = agent_name;
super(agent_id, agent_name);
this.duration_ms = duration_ms;
this.turn_count = turn_count;
this.terminate_reason = terminate_reason;
}
toOpenTelemetryAttributes(config: Config): LogAttributes {
override toOpenTelemetryAttributes(config: Config): LogAttributes {
return {
...getCommonAttributes(config),
...super.toOpenTelemetryAttributes(config),
'event.name': EVENT_AGENT_FINISH,
'event.timestamp': this['event.timestamp'],
agent_id: this.agent_id,
agent_name: this.agent_name,
duration_ms: this.duration_ms,
turn_count: this.turn_count,
terminate_reason: this.terminate_reason,
@@ -1621,6 +1631,45 @@ export class AgentFinishEvent implements BaseTelemetryEvent {
}
}
export const EVENT_AGENT_RECOVERY_ATTEMPT = 'gemini_cli.agent.recovery_attempt';
export class RecoveryAttemptEvent extends BaseAgentEvent {
'event.name' = 'agent_recovery_attempt' as const;
reason: AgentTerminateMode;
duration_ms: number;
success: boolean;
turn_count: number;
constructor(
agent_id: string,
agent_name: string,
reason: AgentTerminateMode,
duration_ms: number,
success: boolean,
turn_count: number,
) {
super(agent_id, agent_name);
this.reason = reason;
this.duration_ms = duration_ms;
this.success = success;
this.turn_count = turn_count;
}
override toOpenTelemetryAttributes(config: Config): LogAttributes {
return {
...super.toOpenTelemetryAttributes(config),
'event.name': EVENT_AGENT_RECOVERY_ATTEMPT,
reason: this.reason,
duration_ms: this.duration_ms,
success: this.success,
turn_count: this.turn_count,
};
}
toLogBody(): string {
return `Agent ${this.agent_name} recovery attempt. Reason: ${this.reason}. Success: ${this.success}. Duration: ${this.duration_ms}ms.`;
}
}
export const EVENT_WEB_FETCH_FALLBACK_ATTEMPT =
'gemini_cli.web_fetch_fallback_attempt';
export class WebFetchFallbackAttemptEvent implements BaseTelemetryEvent {