feat(telemetry): implement retry attempt telemetry for network related retries (#22027)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Aishanee Shah
2026-03-11 14:55:48 -04:00
committed by GitHub
parent 36ce2ba96e
commit 067e09a40b
13 changed files with 326 additions and 27 deletions
+29 -7
View File
@@ -18,9 +18,16 @@ import { handleFallback } from '../fallback/handler.js';
import { getResponseText } from '../utils/partUtils.js';
import { reportError } from '../utils/errorReporting.js';
import { getErrorMessage } from '../utils/errors.js';
import { logMalformedJsonResponse } from '../telemetry/loggers.js';
import { MalformedJsonResponseEvent, LlmRole } from '../telemetry/types.js';
import { retryWithBackoff } from '../utils/retry.js';
import {
logMalformedJsonResponse,
logNetworkRetryAttempt,
} from '../telemetry/loggers.js';
import {
MalformedJsonResponseEvent,
LlmRole,
NetworkRetryAttemptEvent,
} from '../telemetry/types.js';
import { retryWithBackoff, getRetryErrorType } from '../utils/retry.js';
import { coreEvents } from '../utils/events.js';
import { getDisplayString } from '../config/models.js';
import type { ModelConfigKey } from '../services/modelConfigService.js';
@@ -331,14 +338,29 @@ export class BaseLlmClient {
this.authType ?? this.config.getContentGeneratorConfig()?.authType,
retryFetchErrors: this.config.getRetryFetchErrors(),
onRetry: (attempt, error, delayMs) => {
const actualMaxAttempts =
availabilityMaxAttempts ?? maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
const modelName = getDisplayString(currentModel);
const errorType = getRetryErrorType(error);
coreEvents.emitRetryAttempt({
attempt,
maxAttempts:
availabilityMaxAttempts ?? maxAttempts ?? DEFAULT_MAX_ATTEMPTS,
maxAttempts: actualMaxAttempts,
delayMs,
error: error instanceof Error ? error.message : String(error),
model: getDisplayString(currentModel),
error: errorType,
model: modelName,
});
logNetworkRetryAttempt(
this.config,
new NetworkRetryAttemptEvent(
attempt,
actualMaxAttempts,
errorType,
delayMs,
modelName,
),
);
},
});
} catch (error) {
+7 -1
View File
@@ -85,14 +85,20 @@ vi.mock('../fallback/handler.js', () => ({
handleFallback: mockHandleFallback,
}));
const { mockLogContentRetry, mockLogContentRetryFailure } = vi.hoisted(() => ({
const {
mockLogContentRetry,
mockLogContentRetryFailure,
mockLogNetworkRetryAttempt,
} = vi.hoisted(() => ({
mockLogContentRetry: vi.fn(),
mockLogContentRetryFailure: vi.fn(),
mockLogNetworkRetryAttempt: vi.fn(),
}));
vi.mock('../telemetry/loggers.js', () => ({
logContentRetry: mockLogContentRetry,
logContentRetryFailure: mockLogContentRetryFailure,
logNetworkRetryAttempt: mockLogNetworkRetryAttempt,
}));
vi.mock('../telemetry/uiTelemetry.js', () => ({
+28 -7
View File
@@ -19,7 +19,11 @@ import {
type GenerateContentParameters,
} from '@google/genai';
import { toParts } from '../code_assist/converter.js';
import { retryWithBackoff, isRetryableError } from '../utils/retry.js';
import {
retryWithBackoff,
isRetryableError,
getRetryErrorType,
} from '../utils/retry.js';
import type { ValidationRequiredError } from '../utils/googleQuotaErrors.js';
import type { Config } from '../config/config.js';
import {
@@ -33,6 +37,7 @@ import type { CompletedToolCall } from './coreToolScheduler.js';
import {
logContentRetry,
logContentRetryFailure,
logNetworkRetryAttempt,
} from '../telemetry/loggers.js';
import {
ChatRecordingService,
@@ -41,6 +46,7 @@ import {
import {
ContentRetryEvent,
ContentRetryFailureEvent,
NetworkRetryAttemptEvent,
type LlmRole,
} from '../telemetry/types.js';
import { handleFallback } from '../fallback/handler.js';
@@ -412,7 +418,9 @@ export class GeminiChat {
}
const isContentError = error instanceof InvalidStreamError;
const errorType = isContentError ? error.type : 'NETWORK_ERROR';
const errorType = isContentError
? error.type
: getRetryErrorType(error);
if (
(isContentError && isGemini2Model(model)) ||
@@ -422,15 +430,28 @@ export class GeminiChat {
if (attempt < maxAttempts - 1) {
const delayMs = INVALID_CONTENT_RETRY_OPTIONS.initialDelayMs;
logContentRetry(
this.config,
new ContentRetryEvent(attempt, errorType, delayMs, model),
);
if (isContentError) {
logContentRetry(
this.config,
new ContentRetryEvent(attempt, errorType, delayMs, model),
);
} else {
logNetworkRetryAttempt(
this.config,
new NetworkRetryAttemptEvent(
attempt + 1,
maxAttempts,
errorType,
delayMs * (attempt + 1),
model,
),
);
}
coreEvents.emitRetryAttempt({
attempt: attempt + 1,
maxAttempts,
delayMs: delayMs * (attempt + 1),
error: error instanceof Error ? error.message : String(error),
error: errorType,
model,
});
await new Promise((res) =>
@@ -47,14 +47,20 @@ vi.mock('../utils/retry.js', async (importOriginal) => {
});
// Mock loggers
const { mockLogContentRetry, mockLogContentRetryFailure } = vi.hoisted(() => ({
const {
mockLogContentRetry,
mockLogContentRetryFailure,
mockLogNetworkRetryAttempt,
} = vi.hoisted(() => ({
mockLogContentRetry: vi.fn(),
mockLogContentRetryFailure: vi.fn(),
mockLogNetworkRetryAttempt: vi.fn(),
}));
vi.mock('../telemetry/loggers.js', () => ({
logContentRetry: mockLogContentRetry,
logContentRetryFailure: mockLogContentRetryFailure,
logNetworkRetryAttempt: mockLogNetworkRetryAttempt,
}));
describe('GeminiChat Network Retries', () => {
@@ -185,10 +191,10 @@ describe('GeminiChat Network Retries', () => {
expect(successChunk).toBeDefined();
// Verify retry logging
expect(mockLogContentRetry).toHaveBeenCalledWith(
expect(mockLogNetworkRetryAttempt).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({
error_type: 'NETWORK_ERROR',
error_type: 'SERVER_ERROR',
}),
);
});
@@ -474,11 +480,11 @@ describe('GeminiChat Network Retries', () => {
);
expect(successChunk).toBeDefined();
// Verify retry logging was called with NETWORK_ERROR type
expect(mockLogContentRetry).toHaveBeenCalledWith(
// Verify retry logging was called with network error type
expect(mockLogNetworkRetryAttempt).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({
error_type: 'NETWORK_ERROR',
error_type: 'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC',
}),
);
});
@@ -27,6 +27,7 @@ import type {
InvalidChunkEvent,
ContentRetryEvent,
ContentRetryFailureEvent,
NetworkRetryAttemptEvent,
ExtensionInstallEvent,
ToolOutputTruncatedEvent,
ExtensionUninstallEvent,
@@ -94,6 +95,7 @@ export enum EventNames {
INVALID_CHUNK = 'invalid_chunk',
CONTENT_RETRY = 'content_retry',
CONTENT_RETRY_FAILURE = 'content_retry_failure',
RETRY_ATTEMPT = 'retry_attempt',
EXTENSION_ENABLE = 'extension_enable',
EXTENSION_DISABLE = 'extension_disable',
EXTENSION_INSTALL = 'extension_install',
@@ -1231,6 +1233,32 @@ export class ClearcutLogger {
this.flushIfNeeded();
}
logNetworkRetryAttemptEvent(event: NetworkRetryAttemptEvent): void {
// This event is generic for any retry attempt (Gemini, WebFetch, etc.)
const data: EventValue[] = [
{
gemini_cli_key:
EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER,
value: String(event.attempt),
},
{
gemini_cli_key: EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_DELAY_MS,
value: String(event.delay_ms),
},
{
gemini_cli_key: EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE,
value: event.error_type,
},
{
gemini_cli_key: EventMetadataKey.GEMINI_CLI_API_REQUEST_MODEL,
value: event.model,
},
];
this.enqueueLogEvent(this.createLogEvent(EventNames.RETRY_ATTEMPT, data));
this.flushIfNeeded();
}
async logExtensionInstallEvent(event: ExtensionInstallEvent): Promise<void> {
const data: EventValue[] = [
{
@@ -674,4 +674,17 @@ export enum EventMetadataKey {
// Logs the error message for Conseca events.
CONSECA_ERROR = 166,
// ==========================================================================
// Network Retry Event Keys
// ==========================================================================
// Logs the attempt number for a network retry.
GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER = 180,
// Logs the delay in milliseconds for a network retry.
GEMINI_CLI_NETWORK_RETRY_DELAY_MS = 181,
// Logs the error type for a network retry.
GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE = 182,
}
+3
View File
@@ -46,6 +46,7 @@ export {
logExtensionUninstall,
logExtensionUpdateEvent,
logWebFetchFallbackAttempt,
logNetworkRetryAttempt,
logRewind,
} from './loggers.js';
export {
@@ -66,6 +67,7 @@ export {
ConversationFinishedEvent,
ToolOutputTruncatedEvent,
WebFetchFallbackAttemptEvent,
NetworkRetryAttemptEvent,
ToolCallDecision,
RewindEvent,
ConsecaPolicyGenerationEvent,
@@ -111,6 +113,7 @@ export {
recordApiErrorMetrics,
recordFileOperationMetric,
recordInvalidChunk,
recordRetryAttemptMetrics,
recordContentRetry,
recordContentRetryFailure,
recordModelRoutingMetrics,
@@ -45,6 +45,7 @@ import {
logAgentStart,
logAgentFinish,
logWebFetchFallbackAttempt,
logNetworkRetryAttempt,
logExtensionUpdateEvent,
logHookCall,
} from './loggers.js';
@@ -70,6 +71,7 @@ import {
EVENT_AGENT_FINISH,
EVENT_WEB_FETCH_FALLBACK_ATTEMPT,
EVENT_INVALID_CHUNK,
EVENT_NETWORK_RETRY_ATTEMPT,
ApiErrorEvent,
ApiRequestEvent,
ApiResponseEvent,
@@ -91,6 +93,7 @@ import {
AgentStartEvent,
AgentFinishEvent,
WebFetchFallbackAttemptEvent,
NetworkRetryAttemptEvent,
ExtensionUpdateEvent,
EVENT_EXTENSION_UPDATE,
HookCallEvent,
@@ -2429,6 +2432,56 @@ describe('loggers', () => {
});
});
describe('logNetworkRetryAttempt', () => {
const mockConfig = makeFakeConfig();
beforeEach(() => {
vi.spyOn(ClearcutLogger.prototype, 'logNetworkRetryAttemptEvent');
vi.spyOn(metrics, 'recordRetryAttemptMetrics');
});
it('logs the network retry attempt event to Clearcut and OTEL', () => {
const event = new NetworkRetryAttemptEvent(
2,
5,
'Overloaded',
1000,
'test-model',
);
logNetworkRetryAttempt(mockConfig, event);
expect(
ClearcutLogger.prototype.logNetworkRetryAttemptEvent,
).toHaveBeenCalledWith(event);
expect(mockLogger.emit).toHaveBeenCalledWith({
body: 'Network retry attempt 2/5 for test-model. Delay: 1000ms. Error type: Overloaded',
attributes: {
'session.id': 'test-session-id',
'user.email': 'test-user@example.com',
'installation.id': 'test-installation-id',
'event.name': EVENT_NETWORK_RETRY_ATTEMPT,
'event.timestamp': '2025-01-01T00:00:00.000Z',
interactive: false,
attempt: 2,
max_attempts: 5,
error_type: 'Overloaded',
delay_ms: 1000,
model: 'test-model',
},
});
expect(metrics.recordRetryAttemptMetrics).toHaveBeenCalledWith(
mockConfig,
{
model: 'test-model',
attempt: 2,
},
);
});
});
describe('Telemetry Buffering', () => {
it('should buffer events when SDK is not initialized', async () => {
vi.spyOn(sdk, 'isTelemetrySdkInitialized').mockReturnValue(false);
+21
View File
@@ -32,6 +32,7 @@ import {
type InvalidChunkEvent,
type ContentRetryEvent,
type ContentRetryFailureEvent,
type NetworkRetryAttemptEvent,
type RipgrepFallbackEvent,
type ToolOutputTruncatedEvent,
type ModelRoutingEvent,
@@ -62,6 +63,7 @@ import {
recordToolCallMetrics,
recordChatCompressionMetrics,
recordFileOperationMetric,
recordRetryAttemptMetrics,
recordContentRetry,
recordContentRetryFailure,
recordModelRoutingMetrics,
@@ -485,6 +487,25 @@ export function logInvalidChunk(
});
}
export function logNetworkRetryAttempt(
config: Config,
event: NetworkRetryAttemptEvent,
): void {
ClearcutLogger.getInstance(config)?.logNetworkRetryAttemptEvent(event);
bufferTelemetryEvent(() => {
const logger = logs.getLogger(SERVICE_NAME);
const logRecord: LogRecord = {
body: event.toLogBody(),
attributes: event.toOpenTelemetryAttributes(config),
};
logger.emit(logRecord);
recordRetryAttemptMetrics(config, {
model: event.model,
attempt: event.attempt,
});
});
}
export function logContentRetry(
config: Config,
event: ContentRetryEvent,
+26
View File
@@ -40,6 +40,7 @@ const INVALID_CHUNK_COUNT = 'gemini_cli.chat.invalid_chunk.count';
const CONTENT_RETRY_COUNT = 'gemini_cli.chat.content_retry.count';
const CONTENT_RETRY_FAILURE_COUNT =
'gemini_cli.chat.content_retry_failure.count';
const NETWORK_RETRY_COUNT = 'gemini_cli.network_retry.count';
const MODEL_ROUTING_LATENCY = 'gemini_cli.model_routing.latency';
const MODEL_ROUTING_FAILURE_COUNT = 'gemini_cli.model_routing.failure.count';
const MODEL_SLASH_COMMAND_CALL_COUNT =
@@ -166,6 +167,16 @@ const COUNTER_DEFINITIONS = {
assign: (c: Counter) => (contentRetryFailureCounter = c),
attributes: {} as Record<string, never>,
},
[NETWORK_RETRY_COUNT]: {
description: 'Counts network retries.',
valueType: ValueType.INT,
assign: (c: Counter) => (networkRetryCounter = c),
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
attributes: {} as {
model: string;
attempt: number;
},
},
[MODEL_ROUTING_FAILURE_COUNT]: {
description: 'Counts model routing failures.',
valueType: ValueType.INT,
@@ -610,6 +621,7 @@ let chatCompressionCounter: Counter | undefined;
let invalidChunkCounter: Counter | undefined;
let contentRetryCounter: Counter | undefined;
let contentRetryFailureCounter: Counter | undefined;
let networkRetryCounter: Counter | undefined;
let modelRoutingLatencyHistogram: Histogram | undefined;
let modelRoutingFailureCounter: Counter | undefined;
let modelSlashCommandCallCounter: Counter | undefined;
@@ -848,6 +860,20 @@ export function recordInvalidChunk(config: Config): void {
invalidChunkCounter.add(1, baseMetricDefinition.getCommonAttributes(config));
}
export function recordRetryAttemptMetrics(
config: Config,
attributes: {
model: string;
attempt: number;
},
): void {
if (!networkRetryCounter || !isMetricsInitialized) return;
networkRetryCounter.add(1, {
...baseMetricDefinition.getCommonAttributes(config),
...attributes,
});
}
/**
* Records a metric for when a retry is triggered due to a content error.
*/
+45
View File
@@ -1341,6 +1341,51 @@ export class ContentRetryEvent implements BaseTelemetryEvent {
export const EVENT_CONTENT_RETRY_FAILURE =
'gemini_cli.chat.content_retry_failure';
export const EVENT_NETWORK_RETRY_ATTEMPT = 'gemini_cli.network_retry_attempt';
export class NetworkRetryAttemptEvent implements BaseTelemetryEvent {
'event.name': 'network_retry_attempt';
'event.timestamp': string;
attempt: number;
max_attempts: number;
error_type: string;
delay_ms: number;
model: string;
constructor(
attempt: number,
max_attempts: number,
error_type: string,
delay_ms: number,
model: string,
) {
this['event.name'] = 'network_retry_attempt';
this['event.timestamp'] = new Date().toISOString();
this.attempt = attempt;
this.max_attempts = max_attempts;
this.error_type = error_type;
this.delay_ms = delay_ms;
this.model = model;
}
toOpenTelemetryAttributes(config: Config): LogAttributes {
return {
...getCommonAttributes(config),
'event.name': EVENT_NETWORK_RETRY_ATTEMPT,
'event.timestamp': this['event.timestamp'],
attempt: this.attempt,
max_attempts: this.max_attempts,
error_type: this.error_type,
delay_ms: this.delay_ms,
model: this.model,
};
}
toLogBody(): string {
return `Network retry attempt ${this.attempt}/${this.max_attempts} for ${this.model}. Delay: ${this.delay_ms}ms. Error type: ${this.error_type}`;
}
}
export class ContentRetryFailureEvent implements BaseTelemetryEvent {
'event.name': 'content_retry_failure';
'event.timestamp': string;
+21 -4
View File
@@ -27,12 +27,14 @@ import { convert } from 'html-to-text';
import {
logWebFetchFallbackAttempt,
WebFetchFallbackAttemptEvent,
logNetworkRetryAttempt,
NetworkRetryAttemptEvent,
} from '../telemetry/index.js';
import { LlmRole } from '../telemetry/llmRole.js';
import { WEB_FETCH_TOOL_NAME } from './tool-names.js';
import { debugLogger } from '../utils/debugLogger.js';
import { coreEvents } from '../utils/events.js';
import { retryWithBackoff } from '../utils/retry.js';
import { retryWithBackoff, getRetryErrorType } from '../utils/retry.js';
import { WEB_FETCH_DEFINITION } from './definitions/coreTools.js';
import { resolveToolDeclaration } from './definitions/resolver.js';
import { LRUCache } from 'mnemonist';
@@ -188,13 +190,28 @@ class WebFetchToolInvocation extends BaseToolInvocation<
}
private handleRetry(attempt: number, error: unknown, delayMs: number): void {
const maxAttempts = this.config.getMaxAttempts();
const modelName = 'Web Fetch';
const errorType = getRetryErrorType(error);
coreEvents.emitRetryAttempt({
attempt,
maxAttempts: this.config.getMaxAttempts(),
maxAttempts,
delayMs,
error: error instanceof Error ? error.message : String(error),
model: 'Web Fetch',
error: errorType,
model: modelName,
});
logNetworkRetryAttempt(
this.config,
new NetworkRetryAttemptEvent(
attempt,
maxAttempts,
errorType,
delayMs,
modelName,
),
);
}
private async executeFallback(signal: AbortSignal): Promise<ToolResult> {
+40 -2
View File
@@ -99,8 +99,46 @@ function getNetworkErrorCode(error: unknown): string | undefined {
return undefined;
}
const FETCH_FAILED_MESSAGE = 'fetch failed';
const INCOMPLETE_JSON_MESSAGE = 'incomplete json segment';
export const FETCH_FAILED_MESSAGE = 'fetch failed';
export const INCOMPLETE_JSON_MESSAGE = 'incomplete json segment';
/**
* Categorizes an error for retry telemetry purposes.
* Returns a safe string without PII.
*/
export function getRetryErrorType(error: unknown): string {
if (error === 'Invalid content') {
return 'INVALID_CONTENT';
}
const errorCode = getNetworkErrorCode(error);
if (errorCode && RETRYABLE_NETWORK_CODES.includes(errorCode)) {
return errorCode;
}
if (error instanceof Error) {
const lowerMessage = error.message.toLowerCase();
if (lowerMessage.includes(FETCH_FAILED_MESSAGE)) {
return 'FETCH_FAILED';
}
if (lowerMessage.includes(INCOMPLETE_JSON_MESSAGE)) {
return 'INCOMPLETE_JSON';
}
}
const status = getErrorStatus(error);
if (status !== undefined) {
if (status === 429) return 'QUOTA_EXCEEDED';
if (status >= 500 && status < 600) return 'SERVER_ERROR';
return `HTTP_${status}`;
}
if (error instanceof Error) {
return error.name;
}
return 'UNKNOWN';
}
/**
* Default predicate function to determine if a retry should be attempted.