diff --git a/packages/core/src/core/baseLlmClient.ts b/packages/core/src/core/baseLlmClient.ts index 8888a41bbf..29cb798ee2 100644 --- a/packages/core/src/core/baseLlmClient.ts +++ b/packages/core/src/core/baseLlmClient.ts @@ -18,9 +18,16 @@ import { handleFallback } from '../fallback/handler.js'; import { getResponseText } from '../utils/partUtils.js'; import { reportError } from '../utils/errorReporting.js'; import { getErrorMessage } from '../utils/errors.js'; -import { logMalformedJsonResponse } from '../telemetry/loggers.js'; -import { MalformedJsonResponseEvent, LlmRole } from '../telemetry/types.js'; -import { retryWithBackoff } from '../utils/retry.js'; +import { + logMalformedJsonResponse, + logNetworkRetryAttempt, +} from '../telemetry/loggers.js'; +import { + MalformedJsonResponseEvent, + LlmRole, + NetworkRetryAttemptEvent, +} from '../telemetry/types.js'; +import { retryWithBackoff, getRetryErrorType } from '../utils/retry.js'; import { coreEvents } from '../utils/events.js'; import { getDisplayString } from '../config/models.js'; import type { ModelConfigKey } from '../services/modelConfigService.js'; @@ -331,14 +338,29 @@ export class BaseLlmClient { this.authType ?? this.config.getContentGeneratorConfig()?.authType, retryFetchErrors: this.config.getRetryFetchErrors(), onRetry: (attempt, error, delayMs) => { + const actualMaxAttempts = + availabilityMaxAttempts ?? maxAttempts ?? DEFAULT_MAX_ATTEMPTS; + const modelName = getDisplayString(currentModel); + const errorType = getRetryErrorType(error); + coreEvents.emitRetryAttempt({ attempt, - maxAttempts: - availabilityMaxAttempts ?? maxAttempts ?? DEFAULT_MAX_ATTEMPTS, + maxAttempts: actualMaxAttempts, delayMs, - error: error instanceof Error ? error.message : String(error), - model: getDisplayString(currentModel), + error: errorType, + model: modelName, }); + + logNetworkRetryAttempt( + this.config, + new NetworkRetryAttemptEvent( + attempt, + actualMaxAttempts, + errorType, + delayMs, + modelName, + ), + ); }, }); } catch (error) { diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts index 9c527dbc52..ec375e88be 100644 --- a/packages/core/src/core/geminiChat.test.ts +++ b/packages/core/src/core/geminiChat.test.ts @@ -85,14 +85,20 @@ vi.mock('../fallback/handler.js', () => ({ handleFallback: mockHandleFallback, })); -const { mockLogContentRetry, mockLogContentRetryFailure } = vi.hoisted(() => ({ +const { + mockLogContentRetry, + mockLogContentRetryFailure, + mockLogNetworkRetryAttempt, +} = vi.hoisted(() => ({ mockLogContentRetry: vi.fn(), mockLogContentRetryFailure: vi.fn(), + mockLogNetworkRetryAttempt: vi.fn(), })); vi.mock('../telemetry/loggers.js', () => ({ logContentRetry: mockLogContentRetry, logContentRetryFailure: mockLogContentRetryFailure, + logNetworkRetryAttempt: mockLogNetworkRetryAttempt, })); vi.mock('../telemetry/uiTelemetry.js', () => ({ diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts index 44a28c83a5..b04318b488 100644 --- a/packages/core/src/core/geminiChat.ts +++ b/packages/core/src/core/geminiChat.ts @@ -19,7 +19,11 @@ import { type GenerateContentParameters, } from '@google/genai'; import { toParts } from '../code_assist/converter.js'; -import { retryWithBackoff, isRetryableError } from '../utils/retry.js'; +import { + retryWithBackoff, + isRetryableError, + getRetryErrorType, +} from '../utils/retry.js'; import type { ValidationRequiredError } from '../utils/googleQuotaErrors.js'; import type { Config } from '../config/config.js'; import { @@ -33,6 +37,7 @@ import type { CompletedToolCall } from './coreToolScheduler.js'; import { logContentRetry, logContentRetryFailure, + logNetworkRetryAttempt, } from '../telemetry/loggers.js'; import { ChatRecordingService, @@ -41,6 +46,7 @@ import { import { ContentRetryEvent, ContentRetryFailureEvent, + NetworkRetryAttemptEvent, type LlmRole, } from '../telemetry/types.js'; import { handleFallback } from '../fallback/handler.js'; @@ -412,7 +418,9 @@ export class GeminiChat { } const isContentError = error instanceof InvalidStreamError; - const errorType = isContentError ? error.type : 'NETWORK_ERROR'; + const errorType = isContentError + ? error.type + : getRetryErrorType(error); if ( (isContentError && isGemini2Model(model)) || @@ -422,15 +430,28 @@ export class GeminiChat { if (attempt < maxAttempts - 1) { const delayMs = INVALID_CONTENT_RETRY_OPTIONS.initialDelayMs; - logContentRetry( - this.config, - new ContentRetryEvent(attempt, errorType, delayMs, model), - ); + if (isContentError) { + logContentRetry( + this.config, + new ContentRetryEvent(attempt, errorType, delayMs, model), + ); + } else { + logNetworkRetryAttempt( + this.config, + new NetworkRetryAttemptEvent( + attempt + 1, + maxAttempts, + errorType, + delayMs * (attempt + 1), + model, + ), + ); + } coreEvents.emitRetryAttempt({ attempt: attempt + 1, maxAttempts, delayMs: delayMs * (attempt + 1), - error: error instanceof Error ? error.message : String(error), + error: errorType, model, }); await new Promise((res) => diff --git a/packages/core/src/core/geminiChat_network_retry.test.ts b/packages/core/src/core/geminiChat_network_retry.test.ts index 78b23d54f6..2f7cf69dd8 100644 --- a/packages/core/src/core/geminiChat_network_retry.test.ts +++ b/packages/core/src/core/geminiChat_network_retry.test.ts @@ -47,14 +47,20 @@ vi.mock('../utils/retry.js', async (importOriginal) => { }); // Mock loggers -const { mockLogContentRetry, mockLogContentRetryFailure } = vi.hoisted(() => ({ +const { + mockLogContentRetry, + mockLogContentRetryFailure, + mockLogNetworkRetryAttempt, +} = vi.hoisted(() => ({ mockLogContentRetry: vi.fn(), mockLogContentRetryFailure: vi.fn(), + mockLogNetworkRetryAttempt: vi.fn(), })); vi.mock('../telemetry/loggers.js', () => ({ logContentRetry: mockLogContentRetry, logContentRetryFailure: mockLogContentRetryFailure, + logNetworkRetryAttempt: mockLogNetworkRetryAttempt, })); describe('GeminiChat Network Retries', () => { @@ -185,10 +191,10 @@ describe('GeminiChat Network Retries', () => { expect(successChunk).toBeDefined(); // Verify retry logging - expect(mockLogContentRetry).toHaveBeenCalledWith( + expect(mockLogNetworkRetryAttempt).toHaveBeenCalledWith( expect.anything(), expect.objectContaining({ - error_type: 'NETWORK_ERROR', + error_type: 'SERVER_ERROR', }), ); }); @@ -474,11 +480,11 @@ describe('GeminiChat Network Retries', () => { ); expect(successChunk).toBeDefined(); - // Verify retry logging was called with NETWORK_ERROR type - expect(mockLogContentRetry).toHaveBeenCalledWith( + // Verify retry logging was called with network error type + expect(mockLogNetworkRetryAttempt).toHaveBeenCalledWith( expect.anything(), expect.objectContaining({ - error_type: 'NETWORK_ERROR', + error_type: 'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC', }), ); }); diff --git a/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts b/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts index 4684969c13..5e19d7f49b 100644 --- a/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts +++ b/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts @@ -27,6 +27,7 @@ import type { InvalidChunkEvent, ContentRetryEvent, ContentRetryFailureEvent, + NetworkRetryAttemptEvent, ExtensionInstallEvent, ToolOutputTruncatedEvent, ExtensionUninstallEvent, @@ -94,6 +95,7 @@ export enum EventNames { INVALID_CHUNK = 'invalid_chunk', CONTENT_RETRY = 'content_retry', CONTENT_RETRY_FAILURE = 'content_retry_failure', + RETRY_ATTEMPT = 'retry_attempt', EXTENSION_ENABLE = 'extension_enable', EXTENSION_DISABLE = 'extension_disable', EXTENSION_INSTALL = 'extension_install', @@ -1231,6 +1233,32 @@ export class ClearcutLogger { this.flushIfNeeded(); } + logNetworkRetryAttemptEvent(event: NetworkRetryAttemptEvent): void { + // This event is generic for any retry attempt (Gemini, WebFetch, etc.) + const data: EventValue[] = [ + { + gemini_cli_key: + EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER, + value: String(event.attempt), + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_DELAY_MS, + value: String(event.delay_ms), + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE, + value: event.error_type, + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_API_REQUEST_MODEL, + value: event.model, + }, + ]; + + this.enqueueLogEvent(this.createLogEvent(EventNames.RETRY_ATTEMPT, data)); + this.flushIfNeeded(); + } + async logExtensionInstallEvent(event: ExtensionInstallEvent): Promise { const data: EventValue[] = [ { diff --git a/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts b/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts index 473b8db524..20c983aa63 100644 --- a/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts +++ b/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts @@ -674,4 +674,17 @@ export enum EventMetadataKey { // Logs the error message for Conseca events. CONSECA_ERROR = 166, + + // ========================================================================== + // Network Retry Event Keys + // ========================================================================== + + // Logs the attempt number for a network retry. + GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER = 180, + + // Logs the delay in milliseconds for a network retry. + GEMINI_CLI_NETWORK_RETRY_DELAY_MS = 181, + + // Logs the error type for a network retry. + GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE = 182, } diff --git a/packages/core/src/telemetry/index.ts b/packages/core/src/telemetry/index.ts index adea893983..0d264695d8 100644 --- a/packages/core/src/telemetry/index.ts +++ b/packages/core/src/telemetry/index.ts @@ -46,6 +46,7 @@ export { logExtensionUninstall, logExtensionUpdateEvent, logWebFetchFallbackAttempt, + logNetworkRetryAttempt, logRewind, } from './loggers.js'; export { @@ -66,6 +67,7 @@ export { ConversationFinishedEvent, ToolOutputTruncatedEvent, WebFetchFallbackAttemptEvent, + NetworkRetryAttemptEvent, ToolCallDecision, RewindEvent, ConsecaPolicyGenerationEvent, @@ -111,6 +113,7 @@ export { recordApiErrorMetrics, recordFileOperationMetric, recordInvalidChunk, + recordRetryAttemptMetrics, recordContentRetry, recordContentRetryFailure, recordModelRoutingMetrics, diff --git a/packages/core/src/telemetry/loggers.test.ts b/packages/core/src/telemetry/loggers.test.ts index ff18d8fcf0..4373a6b96c 100644 --- a/packages/core/src/telemetry/loggers.test.ts +++ b/packages/core/src/telemetry/loggers.test.ts @@ -45,6 +45,7 @@ import { logAgentStart, logAgentFinish, logWebFetchFallbackAttempt, + logNetworkRetryAttempt, logExtensionUpdateEvent, logHookCall, } from './loggers.js'; @@ -70,6 +71,7 @@ import { EVENT_AGENT_FINISH, EVENT_WEB_FETCH_FALLBACK_ATTEMPT, EVENT_INVALID_CHUNK, + EVENT_NETWORK_RETRY_ATTEMPT, ApiErrorEvent, ApiRequestEvent, ApiResponseEvent, @@ -91,6 +93,7 @@ import { AgentStartEvent, AgentFinishEvent, WebFetchFallbackAttemptEvent, + NetworkRetryAttemptEvent, ExtensionUpdateEvent, EVENT_EXTENSION_UPDATE, HookCallEvent, @@ -2429,6 +2432,56 @@ describe('loggers', () => { }); }); + describe('logNetworkRetryAttempt', () => { + const mockConfig = makeFakeConfig(); + + beforeEach(() => { + vi.spyOn(ClearcutLogger.prototype, 'logNetworkRetryAttemptEvent'); + vi.spyOn(metrics, 'recordRetryAttemptMetrics'); + }); + + it('logs the network retry attempt event to Clearcut and OTEL', () => { + const event = new NetworkRetryAttemptEvent( + 2, + 5, + 'Overloaded', + 1000, + 'test-model', + ); + + logNetworkRetryAttempt(mockConfig, event); + + expect( + ClearcutLogger.prototype.logNetworkRetryAttemptEvent, + ).toHaveBeenCalledWith(event); + + expect(mockLogger.emit).toHaveBeenCalledWith({ + body: 'Network retry attempt 2/5 for test-model. Delay: 1000ms. Error type: Overloaded', + attributes: { + 'session.id': 'test-session-id', + 'user.email': 'test-user@example.com', + 'installation.id': 'test-installation-id', + 'event.name': EVENT_NETWORK_RETRY_ATTEMPT, + 'event.timestamp': '2025-01-01T00:00:00.000Z', + interactive: false, + attempt: 2, + max_attempts: 5, + error_type: 'Overloaded', + delay_ms: 1000, + model: 'test-model', + }, + }); + + expect(metrics.recordRetryAttemptMetrics).toHaveBeenCalledWith( + mockConfig, + { + model: 'test-model', + attempt: 2, + }, + ); + }); + }); + describe('Telemetry Buffering', () => { it('should buffer events when SDK is not initialized', async () => { vi.spyOn(sdk, 'isTelemetrySdkInitialized').mockReturnValue(false); diff --git a/packages/core/src/telemetry/loggers.ts b/packages/core/src/telemetry/loggers.ts index 393519d3ec..52e0fb35bb 100644 --- a/packages/core/src/telemetry/loggers.ts +++ b/packages/core/src/telemetry/loggers.ts @@ -32,6 +32,7 @@ import { type InvalidChunkEvent, type ContentRetryEvent, type ContentRetryFailureEvent, + type NetworkRetryAttemptEvent, type RipgrepFallbackEvent, type ToolOutputTruncatedEvent, type ModelRoutingEvent, @@ -62,6 +63,7 @@ import { recordToolCallMetrics, recordChatCompressionMetrics, recordFileOperationMetric, + recordRetryAttemptMetrics, recordContentRetry, recordContentRetryFailure, recordModelRoutingMetrics, @@ -485,6 +487,25 @@ export function logInvalidChunk( }); } +export function logNetworkRetryAttempt( + config: Config, + event: NetworkRetryAttemptEvent, +): void { + ClearcutLogger.getInstance(config)?.logNetworkRetryAttemptEvent(event); + bufferTelemetryEvent(() => { + const logger = logs.getLogger(SERVICE_NAME); + const logRecord: LogRecord = { + body: event.toLogBody(), + attributes: event.toOpenTelemetryAttributes(config), + }; + logger.emit(logRecord); + recordRetryAttemptMetrics(config, { + model: event.model, + attempt: event.attempt, + }); + }); +} + export function logContentRetry( config: Config, event: ContentRetryEvent, diff --git a/packages/core/src/telemetry/metrics.ts b/packages/core/src/telemetry/metrics.ts index 70b188f517..af7f54c535 100644 --- a/packages/core/src/telemetry/metrics.ts +++ b/packages/core/src/telemetry/metrics.ts @@ -40,6 +40,7 @@ const INVALID_CHUNK_COUNT = 'gemini_cli.chat.invalid_chunk.count'; const CONTENT_RETRY_COUNT = 'gemini_cli.chat.content_retry.count'; const CONTENT_RETRY_FAILURE_COUNT = 'gemini_cli.chat.content_retry_failure.count'; +const NETWORK_RETRY_COUNT = 'gemini_cli.network_retry.count'; const MODEL_ROUTING_LATENCY = 'gemini_cli.model_routing.latency'; const MODEL_ROUTING_FAILURE_COUNT = 'gemini_cli.model_routing.failure.count'; const MODEL_SLASH_COMMAND_CALL_COUNT = @@ -166,6 +167,16 @@ const COUNTER_DEFINITIONS = { assign: (c: Counter) => (contentRetryFailureCounter = c), attributes: {} as Record, }, + [NETWORK_RETRY_COUNT]: { + description: 'Counts network retries.', + valueType: ValueType.INT, + assign: (c: Counter) => (networkRetryCounter = c), + // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion + attributes: {} as { + model: string; + attempt: number; + }, + }, [MODEL_ROUTING_FAILURE_COUNT]: { description: 'Counts model routing failures.', valueType: ValueType.INT, @@ -610,6 +621,7 @@ let chatCompressionCounter: Counter | undefined; let invalidChunkCounter: Counter | undefined; let contentRetryCounter: Counter | undefined; let contentRetryFailureCounter: Counter | undefined; +let networkRetryCounter: Counter | undefined; let modelRoutingLatencyHistogram: Histogram | undefined; let modelRoutingFailureCounter: Counter | undefined; let modelSlashCommandCallCounter: Counter | undefined; @@ -848,6 +860,20 @@ export function recordInvalidChunk(config: Config): void { invalidChunkCounter.add(1, baseMetricDefinition.getCommonAttributes(config)); } +export function recordRetryAttemptMetrics( + config: Config, + attributes: { + model: string; + attempt: number; + }, +): void { + if (!networkRetryCounter || !isMetricsInitialized) return; + networkRetryCounter.add(1, { + ...baseMetricDefinition.getCommonAttributes(config), + ...attributes, + }); +} + /** * Records a metric for when a retry is triggered due to a content error. */ diff --git a/packages/core/src/telemetry/types.ts b/packages/core/src/telemetry/types.ts index 43317f8baa..6669628220 100644 --- a/packages/core/src/telemetry/types.ts +++ b/packages/core/src/telemetry/types.ts @@ -1341,6 +1341,51 @@ export class ContentRetryEvent implements BaseTelemetryEvent { export const EVENT_CONTENT_RETRY_FAILURE = 'gemini_cli.chat.content_retry_failure'; + +export const EVENT_NETWORK_RETRY_ATTEMPT = 'gemini_cli.network_retry_attempt'; +export class NetworkRetryAttemptEvent implements BaseTelemetryEvent { + 'event.name': 'network_retry_attempt'; + 'event.timestamp': string; + attempt: number; + max_attempts: number; + error_type: string; + delay_ms: number; + model: string; + + constructor( + attempt: number, + max_attempts: number, + error_type: string, + delay_ms: number, + model: string, + ) { + this['event.name'] = 'network_retry_attempt'; + this['event.timestamp'] = new Date().toISOString(); + this.attempt = attempt; + this.max_attempts = max_attempts; + this.error_type = error_type; + this.delay_ms = delay_ms; + this.model = model; + } + + toOpenTelemetryAttributes(config: Config): LogAttributes { + return { + ...getCommonAttributes(config), + 'event.name': EVENT_NETWORK_RETRY_ATTEMPT, + 'event.timestamp': this['event.timestamp'], + attempt: this.attempt, + max_attempts: this.max_attempts, + error_type: this.error_type, + delay_ms: this.delay_ms, + model: this.model, + }; + } + + toLogBody(): string { + return `Network retry attempt ${this.attempt}/${this.max_attempts} for ${this.model}. Delay: ${this.delay_ms}ms. Error type: ${this.error_type}`; + } +} + export class ContentRetryFailureEvent implements BaseTelemetryEvent { 'event.name': 'content_retry_failure'; 'event.timestamp': string; diff --git a/packages/core/src/tools/web-fetch.ts b/packages/core/src/tools/web-fetch.ts index a670ebec50..e4d9ebc36f 100644 --- a/packages/core/src/tools/web-fetch.ts +++ b/packages/core/src/tools/web-fetch.ts @@ -27,12 +27,14 @@ import { convert } from 'html-to-text'; import { logWebFetchFallbackAttempt, WebFetchFallbackAttemptEvent, + logNetworkRetryAttempt, + NetworkRetryAttemptEvent, } from '../telemetry/index.js'; import { LlmRole } from '../telemetry/llmRole.js'; import { WEB_FETCH_TOOL_NAME } from './tool-names.js'; import { debugLogger } from '../utils/debugLogger.js'; import { coreEvents } from '../utils/events.js'; -import { retryWithBackoff } from '../utils/retry.js'; +import { retryWithBackoff, getRetryErrorType } from '../utils/retry.js'; import { WEB_FETCH_DEFINITION } from './definitions/coreTools.js'; import { resolveToolDeclaration } from './definitions/resolver.js'; import { LRUCache } from 'mnemonist'; @@ -188,13 +190,28 @@ class WebFetchToolInvocation extends BaseToolInvocation< } private handleRetry(attempt: number, error: unknown, delayMs: number): void { + const maxAttempts = this.config.getMaxAttempts(); + const modelName = 'Web Fetch'; + const errorType = getRetryErrorType(error); + coreEvents.emitRetryAttempt({ attempt, - maxAttempts: this.config.getMaxAttempts(), + maxAttempts, delayMs, - error: error instanceof Error ? error.message : String(error), - model: 'Web Fetch', + error: errorType, + model: modelName, }); + + logNetworkRetryAttempt( + this.config, + new NetworkRetryAttemptEvent( + attempt, + maxAttempts, + errorType, + delayMs, + modelName, + ), + ); } private async executeFallback(signal: AbortSignal): Promise { diff --git a/packages/core/src/utils/retry.ts b/packages/core/src/utils/retry.ts index 8ce8e1934a..00d42f5320 100644 --- a/packages/core/src/utils/retry.ts +++ b/packages/core/src/utils/retry.ts @@ -99,8 +99,46 @@ function getNetworkErrorCode(error: unknown): string | undefined { return undefined; } -const FETCH_FAILED_MESSAGE = 'fetch failed'; -const INCOMPLETE_JSON_MESSAGE = 'incomplete json segment'; +export const FETCH_FAILED_MESSAGE = 'fetch failed'; +export const INCOMPLETE_JSON_MESSAGE = 'incomplete json segment'; + +/** + * Categorizes an error for retry telemetry purposes. + * Returns a safe string without PII. + */ +export function getRetryErrorType(error: unknown): string { + if (error === 'Invalid content') { + return 'INVALID_CONTENT'; + } + + const errorCode = getNetworkErrorCode(error); + if (errorCode && RETRYABLE_NETWORK_CODES.includes(errorCode)) { + return errorCode; + } + + if (error instanceof Error) { + const lowerMessage = error.message.toLowerCase(); + if (lowerMessage.includes(FETCH_FAILED_MESSAGE)) { + return 'FETCH_FAILED'; + } + if (lowerMessage.includes(INCOMPLETE_JSON_MESSAGE)) { + return 'INCOMPLETE_JSON'; + } + } + + const status = getErrorStatus(error); + if (status !== undefined) { + if (status === 429) return 'QUOTA_EXCEEDED'; + if (status >= 500 && status < 600) return 'SERVER_ERROR'; + return `HTTP_${status}`; + } + + if (error instanceof Error) { + return error.name; + } + + return 'UNKNOWN'; +} /** * Default predicate function to determine if a retry should be attempted.