feat(telemetry): implement retry attempt telemetry for network related retries (#22027)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Aishanee Shah
2026-03-11 14:55:48 -04:00
committed by GitHub
parent 36ce2ba96e
commit 067e09a40b
13 changed files with 326 additions and 27 deletions
@@ -27,6 +27,7 @@ import type {
InvalidChunkEvent,
ContentRetryEvent,
ContentRetryFailureEvent,
NetworkRetryAttemptEvent,
ExtensionInstallEvent,
ToolOutputTruncatedEvent,
ExtensionUninstallEvent,
@@ -94,6 +95,7 @@ export enum EventNames {
INVALID_CHUNK = 'invalid_chunk',
CONTENT_RETRY = 'content_retry',
CONTENT_RETRY_FAILURE = 'content_retry_failure',
RETRY_ATTEMPT = 'retry_attempt',
EXTENSION_ENABLE = 'extension_enable',
EXTENSION_DISABLE = 'extension_disable',
EXTENSION_INSTALL = 'extension_install',
@@ -1231,6 +1233,32 @@ export class ClearcutLogger {
this.flushIfNeeded();
}
/**
 * Queues a Clearcut `RETRY_ATTEMPT` log event describing one retry attempt.
 *
 * The payload contains, in order: the attempt number and the delay in
 * milliseconds (both stringified), the error type, and the model, all read
 * from `event`. `event.max_attempts` is not part of the payload.
 *
 * @param event The retry attempt to report.
 */
logNetworkRetryAttemptEvent(event: NetworkRetryAttemptEvent): void {
  // This event is generic for any retry attempt (Gemini, WebFetch, etc.)
  const entry = (
    gemini_cli_key: EventMetadataKey,
    value: string,
  ): EventValue => ({ gemini_cli_key, value });

  const data: EventValue[] = [
    entry(
      EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER,
      String(event.attempt),
    ),
    entry(
      EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_DELAY_MS,
      String(event.delay_ms),
    ),
    entry(
      EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE,
      event.error_type,
    ),
    entry(EventMetadataKey.GEMINI_CLI_API_REQUEST_MODEL, event.model),
  ];

  const logEvent = this.createLogEvent(EventNames.RETRY_ATTEMPT, data);
  this.enqueueLogEvent(logEvent);
  this.flushIfNeeded();
}
async logExtensionInstallEvent(event: ExtensionInstallEvent): Promise<void> {
const data: EventValue[] = [
{
@@ -674,4 +674,17 @@ export enum EventMetadataKey {
// Logs the error message for Conseca events.
CONSECA_ERROR = 166,
// ==========================================================================
// Network Retry Event Keys
// ==========================================================================
// Logs the attempt number for a network retry.
GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER = 180,
// Logs the delay in milliseconds for a network retry.
GEMINI_CLI_NETWORK_RETRY_DELAY_MS = 181,
// Logs the error type for a network retry.
GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE = 182,
}
+3
View File
@@ -46,6 +46,7 @@ export {
logExtensionUninstall,
logExtensionUpdateEvent,
logWebFetchFallbackAttempt,
logNetworkRetryAttempt,
logRewind,
} from './loggers.js';
export {
@@ -66,6 +67,7 @@ export {
ConversationFinishedEvent,
ToolOutputTruncatedEvent,
WebFetchFallbackAttemptEvent,
NetworkRetryAttemptEvent,
ToolCallDecision,
RewindEvent,
ConsecaPolicyGenerationEvent,
@@ -111,6 +113,7 @@ export {
recordApiErrorMetrics,
recordFileOperationMetric,
recordInvalidChunk,
recordRetryAttemptMetrics,
recordContentRetry,
recordContentRetryFailure,
recordModelRoutingMetrics,
@@ -45,6 +45,7 @@ import {
logAgentStart,
logAgentFinish,
logWebFetchFallbackAttempt,
logNetworkRetryAttempt,
logExtensionUpdateEvent,
logHookCall,
} from './loggers.js';
@@ -70,6 +71,7 @@ import {
EVENT_AGENT_FINISH,
EVENT_WEB_FETCH_FALLBACK_ATTEMPT,
EVENT_INVALID_CHUNK,
EVENT_NETWORK_RETRY_ATTEMPT,
ApiErrorEvent,
ApiRequestEvent,
ApiResponseEvent,
@@ -91,6 +93,7 @@ import {
AgentStartEvent,
AgentFinishEvent,
WebFetchFallbackAttemptEvent,
NetworkRetryAttemptEvent,
ExtensionUpdateEvent,
EVENT_EXTENSION_UPDATE,
HookCallEvent,
@@ -2429,6 +2432,56 @@ describe('loggers', () => {
});
});
// Verifies that logNetworkRetryAttempt reports to Clearcut, emits an OTEL log
// record, and records the retry metric.
describe('logNetworkRetryAttempt', () => {
  const mockConfig = makeFakeConfig();

  beforeEach(() => {
    // Spy on both downstream sinks so the calls can be asserted below.
    vi.spyOn(ClearcutLogger.prototype, 'logNetworkRetryAttemptEvent');
    vi.spyOn(metrics, 'recordRetryAttemptMetrics');
  });

  it('logs the network retry attempt event to Clearcut and OTEL', () => {
    const attempt = 2;
    const maxAttempts = 5;
    const errorType = 'Overloaded';
    const delayMs = 1000;
    const model = 'test-model';
    const event = new NetworkRetryAttemptEvent(
      attempt,
      maxAttempts,
      errorType,
      delayMs,
      model,
    );

    logNetworkRetryAttempt(mockConfig, event);

    // Clearcut receives the event object unchanged.
    const clearcutSpy = ClearcutLogger.prototype.logNetworkRetryAttemptEvent;
    expect(clearcutSpy).toHaveBeenCalledWith(event);

    // The OTEL log record carries the formatted body and all attributes.
    const expectedAttributes = {
      'session.id': 'test-session-id',
      'user.email': 'test-user@example.com',
      'installation.id': 'test-installation-id',
      'event.name': EVENT_NETWORK_RETRY_ATTEMPT,
      'event.timestamp': '2025-01-01T00:00:00.000Z',
      interactive: false,
      attempt,
      max_attempts: maxAttempts,
      error_type: errorType,
      delay_ms: delayMs,
      model,
    };
    expect(mockLogger.emit).toHaveBeenCalledWith({
      body: 'Network retry attempt 2/5 for test-model. Delay: 1000ms. Error type: Overloaded',
      attributes: expectedAttributes,
    });

    // Only model and attempt are forwarded to the metric.
    expect(metrics.recordRetryAttemptMetrics).toHaveBeenCalledWith(
      mockConfig,
      { model, attempt },
    );
  });
});
describe('Telemetry Buffering', () => {
it('should buffer events when SDK is not initialized', async () => {
vi.spyOn(sdk, 'isTelemetrySdkInitialized').mockReturnValue(false);
+21
View File
@@ -32,6 +32,7 @@ import {
type InvalidChunkEvent,
type ContentRetryEvent,
type ContentRetryFailureEvent,
type NetworkRetryAttemptEvent,
type RipgrepFallbackEvent,
type ToolOutputTruncatedEvent,
type ModelRoutingEvent,
@@ -62,6 +63,7 @@ import {
recordToolCallMetrics,
recordChatCompressionMetrics,
recordFileOperationMetric,
recordRetryAttemptMetrics,
recordContentRetry,
recordContentRetryFailure,
recordModelRoutingMetrics,
@@ -485,6 +487,25 @@ export function logInvalidChunk(
});
}
/**
 * Reports a network retry attempt to every telemetry sink.
 *
 * The event is passed to the Clearcut logger when `ClearcutLogger.getInstance`
 * returns one. A callback is then handed to `bufferTelemetryEvent`; when that
 * callback runs it emits a log record built from the event and records the
 * retry-attempt metric with the event's `model` and `attempt`.
 *
 * @param config Configuration used to resolve the Clearcut logger and the
 *   common telemetry attributes.
 * @param event The retry attempt being reported.
 */
export function logNetworkRetryAttempt(
  config: Config,
  event: NetworkRetryAttemptEvent,
): void {
  const clearcut = ClearcutLogger.getInstance(config);
  clearcut?.logNetworkRetryAttemptEvent(event);

  const emitAndRecord = (): void => {
    const otelLogger = logs.getLogger(SERVICE_NAME);
    const body = event.toLogBody();
    const attributes = event.toOpenTelemetryAttributes(config);
    const record: LogRecord = { body, attributes };
    otelLogger.emit(record);

    const { model, attempt } = event;
    recordRetryAttemptMetrics(config, { model, attempt });
  };
  bufferTelemetryEvent(emitAndRecord);
}
export function logContentRetry(
config: Config,
event: ContentRetryEvent,
+26
View File
@@ -40,6 +40,7 @@ const INVALID_CHUNK_COUNT = 'gemini_cli.chat.invalid_chunk.count';
const CONTENT_RETRY_COUNT = 'gemini_cli.chat.content_retry.count';
const CONTENT_RETRY_FAILURE_COUNT =
'gemini_cli.chat.content_retry_failure.count';
const NETWORK_RETRY_COUNT = 'gemini_cli.network_retry.count';
const MODEL_ROUTING_LATENCY = 'gemini_cli.model_routing.latency';
const MODEL_ROUTING_FAILURE_COUNT = 'gemini_cli.model_routing.failure.count';
const MODEL_SLASH_COMMAND_CALL_COUNT =
@@ -166,6 +167,16 @@ const COUNTER_DEFINITIONS = {
assign: (c: Counter) => (contentRetryFailureCounter = c),
attributes: {} as Record<string, never>,
},
[NETWORK_RETRY_COUNT]: {
description: 'Counts network retries.',
valueType: ValueType.INT,
assign: (c: Counter) => (networkRetryCounter = c),
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
attributes: {} as {
model: string;
attempt: number;
},
},
[MODEL_ROUTING_FAILURE_COUNT]: {
description: 'Counts model routing failures.',
valueType: ValueType.INT,
@@ -610,6 +621,7 @@ let chatCompressionCounter: Counter | undefined;
let invalidChunkCounter: Counter | undefined;
let contentRetryCounter: Counter | undefined;
let contentRetryFailureCounter: Counter | undefined;
let networkRetryCounter: Counter | undefined;
let modelRoutingLatencyHistogram: Histogram | undefined;
let modelRoutingFailureCounter: Counter | undefined;
let modelSlashCommandCallCounter: Counter | undefined;
@@ -848,6 +860,20 @@ export function recordInvalidChunk(config: Config): void {
invalidChunkCounter.add(1, baseMetricDefinition.getCommonAttributes(config));
}
/**
 * Increments the network retry counter by one.
 *
 * Does nothing when the counter has not been assigned or when
 * `isMetricsInitialized` is falsy. Otherwise the data point is tagged with the
 * common attributes for `config` followed by the supplied `attributes`, so a
 * caller-supplied key overrides a common attribute of the same name.
 *
 * @param config Configuration used to derive the common metric attributes.
 * @param attributes The model and attempt number to attach to the data point.
 */
export function recordRetryAttemptMetrics(
  config: Config,
  attributes: {
    model: string;
    attempt: number;
  },
): void {
  const counter = networkRetryCounter;
  if (!counter || !isMetricsInitialized) {
    return;
  }

  const commonAttributes = baseMetricDefinition.getCommonAttributes(config);
  counter.add(1, { ...commonAttributes, ...attributes });
}
/**
* Records a metric for when a retry is triggered due to a content error.
*/
+45
View File
@@ -1341,6 +1341,51 @@ export class ContentRetryEvent implements BaseTelemetryEvent {
export const EVENT_CONTENT_RETRY_FAILURE =
'gemini_cli.chat.content_retry_failure';
/** Event name used in the OpenTelemetry attributes of a network retry attempt. */
export const EVENT_NETWORK_RETRY_ATTEMPT = 'gemini_cli.network_retry_attempt';

/**
 * Telemetry event describing a single network retry attempt.
 *
 * The timestamp is captured as an ISO-8601 string when the event is
 * constructed.
 */
export class NetworkRetryAttemptEvent implements BaseTelemetryEvent {
  'event.name': 'network_retry_attempt';
  'event.timestamp': string;
  attempt: number;
  max_attempts: number;
  error_type: string;
  delay_ms: number;
  model: string;

  /**
   * @param attempt Number of the attempt being reported.
   * @param max_attempts Attempt limit shown alongside `attempt` in the log body.
   * @param error_type Label of the error associated with the retry.
   * @param delay_ms Delay in milliseconds associated with the retry.
   * @param model Model the retry is reported for.
   */
  constructor(
    attempt: number,
    max_attempts: number,
    error_type: string,
    delay_ms: number,
    model: string,
  ) {
    const createdAt = new Date();
    this['event.name'] = 'network_retry_attempt';
    this['event.timestamp'] = createdAt.toISOString();
    this.attempt = attempt;
    this.max_attempts = max_attempts;
    this.error_type = error_type;
    this.delay_ms = delay_ms;
    this.model = model;
  }

  /**
   * Builds the OpenTelemetry attributes for this event: the common attributes
   * for `config`, then the event name, timestamp and retry details.
   */
  toOpenTelemetryAttributes(config: Config): LogAttributes {
    const { attempt, max_attempts, error_type, delay_ms, model } = this;
    const attributes: LogAttributes = {
      ...getCommonAttributes(config),
      'event.name': EVENT_NETWORK_RETRY_ATTEMPT,
      'event.timestamp': this['event.timestamp'],
      attempt,
      max_attempts,
      error_type,
      delay_ms,
      model,
    };
    return attributes;
  }

  /** Returns the one-line human-readable summary used as the log body. */
  toLogBody(): string {
    const { attempt, max_attempts, model, delay_ms, error_type } = this;
    return `Network retry attempt ${attempt}/${max_attempts} for ${model}. Delay: ${delay_ms}ms. Error type: ${error_type}`;
  }
}
export class ContentRetryFailureEvent implements BaseTelemetryEvent {
'event.name': 'content_retry_failure';
'event.timestamp': string;