feat(telemetry): implement retry attempt telemetry for network related retries (#22027)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-04-25 12:34:38 -07:00 · 2026-03-11 14:55:48 -04:00
parent 36ce2ba96e
commit 067e09a40b
13 changed files with 326 additions and 27 deletions
@@ -18,9 +18,16 @@ import { handleFallback } from '../fallback/handler.js';
 import { getResponseText } from '../utils/partUtils.js';
 import { reportError } from '../utils/errorReporting.js';
 import { getErrorMessage } from '../utils/errors.js';
-import { logMalformedJsonResponse } from '../telemetry/loggers.js';
-import { MalformedJsonResponseEvent, LlmRole } from '../telemetry/types.js';
-import { retryWithBackoff } from '../utils/retry.js';
+import {
+  logMalformedJsonResponse,
+  logNetworkRetryAttempt,
+} from '../telemetry/loggers.js';
+import {
+  MalformedJsonResponseEvent,
+  LlmRole,
+  NetworkRetryAttemptEvent,
+} from '../telemetry/types.js';
+import { retryWithBackoff, getRetryErrorType } from '../utils/retry.js';
 import { coreEvents } from '../utils/events.js';
 import { getDisplayString } from '../config/models.js';
 import type { ModelConfigKey } from '../services/modelConfigService.js';
@@ -331,14 +338,29 @@ export class BaseLlmClient {
          this.authType ?? this.config.getContentGeneratorConfig()?.authType,
        retryFetchErrors: this.config.getRetryFetchErrors(),
        onRetry: (attempt, error, delayMs) => {
+          const actualMaxAttempts =
+            availabilityMaxAttempts ?? maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
+          const modelName = getDisplayString(currentModel);
+          const errorType = getRetryErrorType(error);
+
          coreEvents.emitRetryAttempt({
            attempt,
-            maxAttempts:
-              availabilityMaxAttempts ?? maxAttempts ?? DEFAULT_MAX_ATTEMPTS,
+            maxAttempts: actualMaxAttempts,
            delayMs,
-            error: error instanceof Error ? error.message : String(error),
-            model: getDisplayString(currentModel),
+            error: errorType,
+            model: modelName,
          });
+
+          logNetworkRetryAttempt(
+            this.config,
+            new NetworkRetryAttemptEvent(
+              attempt,
+              actualMaxAttempts,
+              errorType,
+              delayMs,
+              modelName,
+            ),
+          );
        },
      });
    } catch (error) {
@@ -85,14 +85,20 @@ vi.mock('../fallback/handler.js', () => ({
  handleFallback: mockHandleFallback,
 }));

-const { mockLogContentRetry, mockLogContentRetryFailure } = vi.hoisted(() => ({
+const {
+  mockLogContentRetry,
+  mockLogContentRetryFailure,
+  mockLogNetworkRetryAttempt,
+} = vi.hoisted(() => ({
  mockLogContentRetry: vi.fn(),
  mockLogContentRetryFailure: vi.fn(),
+  mockLogNetworkRetryAttempt: vi.fn(),
 }));

 vi.mock('../telemetry/loggers.js', () => ({
  logContentRetry: mockLogContentRetry,
  logContentRetryFailure: mockLogContentRetryFailure,
+  logNetworkRetryAttempt: mockLogNetworkRetryAttempt,
 }));

 vi.mock('../telemetry/uiTelemetry.js', () => ({
@@ -19,7 +19,11 @@ import {
  type GenerateContentParameters,
 } from '@google/genai';
 import { toParts } from '../code_assist/converter.js';
-import { retryWithBackoff, isRetryableError } from '../utils/retry.js';
+import {
+  retryWithBackoff,
+  isRetryableError,
+  getRetryErrorType,
+} from '../utils/retry.js';
 import type { ValidationRequiredError } from '../utils/googleQuotaErrors.js';
 import type { Config } from '../config/config.js';
 import {
@@ -33,6 +37,7 @@ import type { CompletedToolCall } from './coreToolScheduler.js';
 import {
  logContentRetry,
  logContentRetryFailure,
+  logNetworkRetryAttempt,
 } from '../telemetry/loggers.js';
 import {
  ChatRecordingService,
@@ -41,6 +46,7 @@ import {
 import {
  ContentRetryEvent,
  ContentRetryFailureEvent,
+  NetworkRetryAttemptEvent,
  type LlmRole,
 } from '../telemetry/types.js';
 import { handleFallback } from '../fallback/handler.js';
@@ -412,7 +418,9 @@ export class GeminiChat {
            }

            const isContentError = error instanceof InvalidStreamError;
-            const errorType = isContentError ? error.type : 'NETWORK_ERROR';
+            const errorType = isContentError
+              ? error.type
+              : getRetryErrorType(error);

            if (
              (isContentError && isGemini2Model(model)) ||
@@ -422,15 +430,28 @@ export class GeminiChat {
              if (attempt < maxAttempts - 1) {
                const delayMs = INVALID_CONTENT_RETRY_OPTIONS.initialDelayMs;

-                logContentRetry(
-                  this.config,
-                  new ContentRetryEvent(attempt, errorType, delayMs, model),
-                );
+                if (isContentError) {
+                  logContentRetry(
+                    this.config,
+                    new ContentRetryEvent(attempt, errorType, delayMs, model),
+                  );
+                } else {
+                  logNetworkRetryAttempt(
+                    this.config,
+                    new NetworkRetryAttemptEvent(
+                      attempt + 1,
+                      maxAttempts,
+                      errorType,
+                      delayMs * (attempt + 1),
+                      model,
+                    ),
+                  );
+                }
                coreEvents.emitRetryAttempt({
                  attempt: attempt + 1,
                  maxAttempts,
                  delayMs: delayMs * (attempt + 1),
-                  error: error instanceof Error ? error.message : String(error),
+                  error: errorType,
                  model,
                });
                await new Promise((res) =>
@@ -47,14 +47,20 @@ vi.mock('../utils/retry.js', async (importOriginal) => {
 });

 // Mock loggers
-const { mockLogContentRetry, mockLogContentRetryFailure } = vi.hoisted(() => ({
+const {
+  mockLogContentRetry,
+  mockLogContentRetryFailure,
+  mockLogNetworkRetryAttempt,
+} = vi.hoisted(() => ({
  mockLogContentRetry: vi.fn(),
  mockLogContentRetryFailure: vi.fn(),
+  mockLogNetworkRetryAttempt: vi.fn(),
 }));

 vi.mock('../telemetry/loggers.js', () => ({
  logContentRetry: mockLogContentRetry,
  logContentRetryFailure: mockLogContentRetryFailure,
+  logNetworkRetryAttempt: mockLogNetworkRetryAttempt,
 }));

 describe('GeminiChat Network Retries', () => {
@@ -185,10 +191,10 @@ describe('GeminiChat Network Retries', () => {
    expect(successChunk).toBeDefined();

    // Verify retry logging
-    expect(mockLogContentRetry).toHaveBeenCalledWith(
+    expect(mockLogNetworkRetryAttempt).toHaveBeenCalledWith(
      expect.anything(),
      expect.objectContaining({
-        error_type: 'NETWORK_ERROR',
+        error_type: 'SERVER_ERROR',
      }),
    );
  });
@@ -474,11 +480,11 @@ describe('GeminiChat Network Retries', () => {
    );
    expect(successChunk).toBeDefined();

-    // Verify retry logging was called with NETWORK_ERROR type
-    expect(mockLogContentRetry).toHaveBeenCalledWith(
+    // Verify retry logging was called with network error type
+    expect(mockLogNetworkRetryAttempt).toHaveBeenCalledWith(
      expect.anything(),
      expect.objectContaining({
-        error_type: 'NETWORK_ERROR',
+        error_type: 'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC',
      }),
    );
  });
@@ -27,6 +27,7 @@ import type {
  InvalidChunkEvent,
  ContentRetryEvent,
  ContentRetryFailureEvent,
+  NetworkRetryAttemptEvent,
  ExtensionInstallEvent,
  ToolOutputTruncatedEvent,
  ExtensionUninstallEvent,
@@ -94,6 +95,7 @@ export enum EventNames {
  INVALID_CHUNK = 'invalid_chunk',
  CONTENT_RETRY = 'content_retry',
  CONTENT_RETRY_FAILURE = 'content_retry_failure',
+  RETRY_ATTEMPT = 'retry_attempt',
  EXTENSION_ENABLE = 'extension_enable',
  EXTENSION_DISABLE = 'extension_disable',
  EXTENSION_INSTALL = 'extension_install',
@@ -1231,6 +1233,32 @@ export class ClearcutLogger {
    this.flushIfNeeded();
  }

+  logNetworkRetryAttemptEvent(event: NetworkRetryAttemptEvent): void {
+    // This event is generic for any retry attempt (Gemini, WebFetch, etc.)
+    const data: EventValue[] = [
+      {
+        gemini_cli_key:
+          EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER,
+        value: String(event.attempt),
+      },
+      {
+        gemini_cli_key: EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_DELAY_MS,
+        value: String(event.delay_ms),
+      },
+      {
+        gemini_cli_key: EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE,
+        value: event.error_type,
+      },
+      {
+        gemini_cli_key: EventMetadataKey.GEMINI_CLI_API_REQUEST_MODEL,
+        value: event.model,
+      },
+    ];
+
+    this.enqueueLogEvent(this.createLogEvent(EventNames.RETRY_ATTEMPT, data));
+    this.flushIfNeeded();
+  }
+
  async logExtensionInstallEvent(event: ExtensionInstallEvent): Promise<void> {
    const data: EventValue[] = [
      {
@@ -674,4 +674,17 @@ export enum EventMetadataKey {

  // Logs the error message for Conseca events.
  CONSECA_ERROR = 166,
+
+  // ==========================================================================
+  // Network Retry Event Keys
+  // ==========================================================================
+
+  // Logs the attempt number for a network retry.
+  GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER = 180,
+
+  // Logs the delay in milliseconds for a network retry.
+  GEMINI_CLI_NETWORK_RETRY_DELAY_MS = 181,
+
+  // Logs the error type for a network retry.
+  GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE = 182,
 }
@@ -46,6 +46,7 @@ export {
  logExtensionUninstall,
  logExtensionUpdateEvent,
  logWebFetchFallbackAttempt,
+  logNetworkRetryAttempt,
  logRewind,
 } from './loggers.js';
 export {
@@ -66,6 +67,7 @@ export {
  ConversationFinishedEvent,
  ToolOutputTruncatedEvent,
  WebFetchFallbackAttemptEvent,
+  NetworkRetryAttemptEvent,
  ToolCallDecision,
  RewindEvent,
  ConsecaPolicyGenerationEvent,
@@ -111,6 +113,7 @@ export {
  recordApiErrorMetrics,
  recordFileOperationMetric,
  recordInvalidChunk,
+  recordRetryAttemptMetrics,
  recordContentRetry,
  recordContentRetryFailure,
  recordModelRoutingMetrics,
@@ -45,6 +45,7 @@ import {
  logAgentStart,
  logAgentFinish,
  logWebFetchFallbackAttempt,
+  logNetworkRetryAttempt,
  logExtensionUpdateEvent,
  logHookCall,
 } from './loggers.js';
@@ -70,6 +71,7 @@ import {
  EVENT_AGENT_FINISH,
  EVENT_WEB_FETCH_FALLBACK_ATTEMPT,
  EVENT_INVALID_CHUNK,
+  EVENT_NETWORK_RETRY_ATTEMPT,
  ApiErrorEvent,
  ApiRequestEvent,
  ApiResponseEvent,
@@ -91,6 +93,7 @@ import {
  AgentStartEvent,
  AgentFinishEvent,
  WebFetchFallbackAttemptEvent,
+  NetworkRetryAttemptEvent,
  ExtensionUpdateEvent,
  EVENT_EXTENSION_UPDATE,
  HookCallEvent,
@@ -2429,6 +2432,56 @@ describe('loggers', () => {
    });
  });

+  describe('logNetworkRetryAttempt', () => {
+    const mockConfig = makeFakeConfig();
+
+    beforeEach(() => {
+      vi.spyOn(ClearcutLogger.prototype, 'logNetworkRetryAttemptEvent');
+      vi.spyOn(metrics, 'recordRetryAttemptMetrics');
+    });
+
+    it('logs the network retry attempt event to Clearcut and OTEL', () => {
+      const event = new NetworkRetryAttemptEvent(
+        2,
+        5,
+        'Overloaded',
+        1000,
+        'test-model',
+      );
+
+      logNetworkRetryAttempt(mockConfig, event);
+
+      expect(
+        ClearcutLogger.prototype.logNetworkRetryAttemptEvent,
+      ).toHaveBeenCalledWith(event);
+
+      expect(mockLogger.emit).toHaveBeenCalledWith({
+        body: 'Network retry attempt 2/5 for test-model. Delay: 1000ms. Error type: Overloaded',
+        attributes: {
+          'session.id': 'test-session-id',
+          'user.email': 'test-user@example.com',
+          'installation.id': 'test-installation-id',
+          'event.name': EVENT_NETWORK_RETRY_ATTEMPT,
+          'event.timestamp': '2025-01-01T00:00:00.000Z',
+          interactive: false,
+          attempt: 2,
+          max_attempts: 5,
+          error_type: 'Overloaded',
+          delay_ms: 1000,
+          model: 'test-model',
+        },
+      });
+
+      expect(metrics.recordRetryAttemptMetrics).toHaveBeenCalledWith(
+        mockConfig,
+        {
+          model: 'test-model',
+          attempt: 2,
+        },
+      );
+    });
+  });
+
  describe('Telemetry Buffering', () => {
    it('should buffer events when SDK is not initialized', async () => {
      vi.spyOn(sdk, 'isTelemetrySdkInitialized').mockReturnValue(false);
@@ -32,6 +32,7 @@ import {
  type InvalidChunkEvent,
  type ContentRetryEvent,
  type ContentRetryFailureEvent,
+  type NetworkRetryAttemptEvent,
  type RipgrepFallbackEvent,
  type ToolOutputTruncatedEvent,
  type ModelRoutingEvent,
@@ -62,6 +63,7 @@ import {
  recordToolCallMetrics,
  recordChatCompressionMetrics,
  recordFileOperationMetric,
+  recordRetryAttemptMetrics,
  recordContentRetry,
  recordContentRetryFailure,
  recordModelRoutingMetrics,
@@ -485,6 +487,25 @@ export function logInvalidChunk(
  });
 }

+export function logNetworkRetryAttempt(
+  config: Config,
+  event: NetworkRetryAttemptEvent,
+): void {
+  ClearcutLogger.getInstance(config)?.logNetworkRetryAttemptEvent(event);
+  bufferTelemetryEvent(() => {
+    const logger = logs.getLogger(SERVICE_NAME);
+    const logRecord: LogRecord = {
+      body: event.toLogBody(),
+      attributes: event.toOpenTelemetryAttributes(config),
+    };
+    logger.emit(logRecord);
+    recordRetryAttemptMetrics(config, {
+      model: event.model,
+      attempt: event.attempt,
+    });
+  });
+}
+
 export function logContentRetry(
  config: Config,
  event: ContentRetryEvent,
@@ -40,6 +40,7 @@ const INVALID_CHUNK_COUNT = 'gemini_cli.chat.invalid_chunk.count';
 const CONTENT_RETRY_COUNT = 'gemini_cli.chat.content_retry.count';
 const CONTENT_RETRY_FAILURE_COUNT =
  'gemini_cli.chat.content_retry_failure.count';
+const NETWORK_RETRY_COUNT = 'gemini_cli.network_retry.count';
 const MODEL_ROUTING_LATENCY = 'gemini_cli.model_routing.latency';
 const MODEL_ROUTING_FAILURE_COUNT = 'gemini_cli.model_routing.failure.count';
 const MODEL_SLASH_COMMAND_CALL_COUNT =
@@ -166,6 +167,16 @@ const COUNTER_DEFINITIONS = {
    assign: (c: Counter) => (contentRetryFailureCounter = c),
    attributes: {} as Record<string, never>,
  },
+  [NETWORK_RETRY_COUNT]: {
+    description: 'Counts network retries.',
+    valueType: ValueType.INT,
+    assign: (c: Counter) => (networkRetryCounter = c),
+    // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
+    attributes: {} as {
+      model: string;
+      attempt: number;
+    },
+  },
  [MODEL_ROUTING_FAILURE_COUNT]: {
    description: 'Counts model routing failures.',
    valueType: ValueType.INT,
@@ -610,6 +621,7 @@ let chatCompressionCounter: Counter | undefined;
 let invalidChunkCounter: Counter | undefined;
 let contentRetryCounter: Counter | undefined;
 let contentRetryFailureCounter: Counter | undefined;
+let networkRetryCounter: Counter | undefined;
 let modelRoutingLatencyHistogram: Histogram | undefined;
 let modelRoutingFailureCounter: Counter | undefined;
 let modelSlashCommandCallCounter: Counter | undefined;
@@ -848,6 +860,20 @@ export function recordInvalidChunk(config: Config): void {
  invalidChunkCounter.add(1, baseMetricDefinition.getCommonAttributes(config));
 }

+export function recordRetryAttemptMetrics(
+  config: Config,
+  attributes: {
+    model: string;
+    attempt: number;
+  },
+): void {
+  if (!networkRetryCounter || !isMetricsInitialized) return;
+  networkRetryCounter.add(1, {
+    ...baseMetricDefinition.getCommonAttributes(config),
+    ...attributes,
+  });
+}
+
 /**
 * Records a metric for when a retry is triggered due to a content error.
 */
@@ -1341,6 +1341,51 @@ export class ContentRetryEvent implements BaseTelemetryEvent {

 export const EVENT_CONTENT_RETRY_FAILURE =
  'gemini_cli.chat.content_retry_failure';
+
+export const EVENT_NETWORK_RETRY_ATTEMPT = 'gemini_cli.network_retry_attempt';
+export class NetworkRetryAttemptEvent implements BaseTelemetryEvent {
+  'event.name': 'network_retry_attempt';
+  'event.timestamp': string;
+  attempt: number;
+  max_attempts: number;
+  error_type: string;
+  delay_ms: number;
+  model: string;
+
+  constructor(
+    attempt: number,
+    max_attempts: number,
+    error_type: string,
+    delay_ms: number,
+    model: string,
+  ) {
+    this['event.name'] = 'network_retry_attempt';
+    this['event.timestamp'] = new Date().toISOString();
+    this.attempt = attempt;
+    this.max_attempts = max_attempts;
+    this.error_type = error_type;
+    this.delay_ms = delay_ms;
+    this.model = model;
+  }
+
+  toOpenTelemetryAttributes(config: Config): LogAttributes {
+    return {
+      ...getCommonAttributes(config),
+      'event.name': EVENT_NETWORK_RETRY_ATTEMPT,
+      'event.timestamp': this['event.timestamp'],
+      attempt: this.attempt,
+      max_attempts: this.max_attempts,
+      error_type: this.error_type,
+      delay_ms: this.delay_ms,
+      model: this.model,
+    };
+  }
+
+  toLogBody(): string {
+    return `Network retry attempt ${this.attempt}/${this.max_attempts} for ${this.model}. Delay: ${this.delay_ms}ms. Error type: ${this.error_type}`;
+  }
+}
+
 export class ContentRetryFailureEvent implements BaseTelemetryEvent {
  'event.name': 'content_retry_failure';
  'event.timestamp': string;
@@ -27,12 +27,14 @@ import { convert } from 'html-to-text';
 import {
  logWebFetchFallbackAttempt,
  WebFetchFallbackAttemptEvent,
+  logNetworkRetryAttempt,
+  NetworkRetryAttemptEvent,
 } from '../telemetry/index.js';
 import { LlmRole } from '../telemetry/llmRole.js';
 import { WEB_FETCH_TOOL_NAME } from './tool-names.js';
 import { debugLogger } from '../utils/debugLogger.js';
 import { coreEvents } from '../utils/events.js';
-import { retryWithBackoff } from '../utils/retry.js';
+import { retryWithBackoff, getRetryErrorType } from '../utils/retry.js';
 import { WEB_FETCH_DEFINITION } from './definitions/coreTools.js';
 import { resolveToolDeclaration } from './definitions/resolver.js';
 import { LRUCache } from 'mnemonist';
@@ -188,13 +190,28 @@ class WebFetchToolInvocation extends BaseToolInvocation<
  }

  private handleRetry(attempt: number, error: unknown, delayMs: number): void {
+    const maxAttempts = this.config.getMaxAttempts();
+    const modelName = 'Web Fetch';
+    const errorType = getRetryErrorType(error);
+
    coreEvents.emitRetryAttempt({
      attempt,
-      maxAttempts: this.config.getMaxAttempts(),
+      maxAttempts,
      delayMs,
-      error: error instanceof Error ? error.message : String(error),
-      model: 'Web Fetch',
+      error: errorType,
+      model: modelName,
    });
+
+    logNetworkRetryAttempt(
+      this.config,
+      new NetworkRetryAttemptEvent(
+        attempt,
+        maxAttempts,
+        errorType,
+        delayMs,
+        modelName,
+      ),
+    );
  }

  private async executeFallback(signal: AbortSignal): Promise<ToolResult> {
@@ -99,8 +99,46 @@ function getNetworkErrorCode(error: unknown): string | undefined {
  return undefined;
 }

-const FETCH_FAILED_MESSAGE = 'fetch failed';
-const INCOMPLETE_JSON_MESSAGE = 'incomplete json segment';
+export const FETCH_FAILED_MESSAGE = 'fetch failed';
+export const INCOMPLETE_JSON_MESSAGE = 'incomplete json segment';
+
+/**
+ * Categorizes an error for retry telemetry purposes.
+ * Returns a safe string without PII.
+ */
+export function getRetryErrorType(error: unknown): string {
+  if (error === 'Invalid content') {
+    return 'INVALID_CONTENT';
+  }
+
+  const errorCode = getNetworkErrorCode(error);
+  if (errorCode && RETRYABLE_NETWORK_CODES.includes(errorCode)) {
+    return errorCode;
+  }
+
+  if (error instanceof Error) {
+    const lowerMessage = error.message.toLowerCase();
+    if (lowerMessage.includes(FETCH_FAILED_MESSAGE)) {
+      return 'FETCH_FAILED';
+    }
+    if (lowerMessage.includes(INCOMPLETE_JSON_MESSAGE)) {
+      return 'INCOMPLETE_JSON';
+    }
+  }
+
+  const status = getErrorStatus(error);
+  if (status !== undefined) {
+    if (status === 429) return 'QUOTA_EXCEEDED';
+    if (status >= 500 && status < 600) return 'SERVER_ERROR';
+    return `HTTP_${status}`;
+  }
+
+  if (error instanceof Error) {
+    return error.name;
+  }
+
+  return 'UNKNOWN';
+}

 /**
 * Default predicate function to determine if a retry should be attempted.