fix(core): silently retry API errors up to 3 times before halting session (#21989)

2026-06-14 05:17:18 -07:00 · 2026-03-11 16:40:06 -04:00
parent be16caece2
commit 775bcbf3a6
3 changed files with 50 additions and 28 deletions
@@ -1380,11 +1380,11 @@ describe('GeminiChat', () => {
        }
      }).rejects.toThrow(InvalidStreamError);

-      // Should be called 2 times (initial + 1 retry)
+      // Should be called 4 times (initial + 3 retries)
      expect(mockContentGenerator.generateContentStream).toHaveBeenCalledTimes(
-        2,
+        4,
      );
-      expect(mockLogContentRetry).toHaveBeenCalledTimes(1);
+      expect(mockLogContentRetry).toHaveBeenCalledTimes(3);
      expect(mockLogContentRetryFailure).toHaveBeenCalledTimes(1);

      // History should still contain the user message.
@@ -79,17 +79,17 @@ export type StreamEvent =
  | { type: StreamEventType.AGENT_EXECUTION_BLOCKED; reason: string };

 /**
- * Options for retrying due to invalid content from the model.
+ * Options for retrying mid-stream errors (e.g. invalid content or API disconnects).
 */
-interface ContentRetryOptions {
+interface MidStreamRetryOptions {
  /** Total number of attempts to make (1 initial + N retries). */
  maxAttempts: number;
  /** The base delay in milliseconds for linear backoff. */
  initialDelayMs: number;
 }

-const INVALID_CONTENT_RETRY_OPTIONS: ContentRetryOptions = {
-  maxAttempts: 2, // 1 initial call + 1 retry
+const MID_STREAM_RETRY_OPTIONS: MidStreamRetryOptions = {
+  maxAttempts: 4, // 1 initial call + 3 retries mid-stream
  initialDelayMs: 500,
 };

@@ -350,7 +350,7 @@ export class GeminiChat {
      this: GeminiChat,
    ): AsyncGenerator<StreamEvent, void, void> {
      try {
-        const maxAttempts = INVALID_CONTENT_RETRY_OPTIONS.maxAttempts;
+        const maxAttempts = this.config.getMaxAttempts();

        for (let attempt = 0; attempt < maxAttempts; attempt++) {
          let isConnectionPhase = true;
@@ -402,21 +402,19 @@ export class GeminiChat {
              return; // Stop the generator
            }

+            if (isConnectionPhase) {
+              // Connection phase errors have already been retried by retryWithBackoff.
+              // If they bubble up here, they are exhausted or fatal.
+              throw error;
+            }
+
            // Check if the error is retryable (e.g., transient SSL errors
-            // like ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC)
+            // like ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC or ApiError)
            const isRetryable = isRetryableError(
              error,
              this.config.getRetryFetchErrors(),
            );

-            // For connection phase errors, only retryable errors should continue
-            if (isConnectionPhase) {
-              if (!isRetryable || signal.aborted) {
-                throw error;
-              }
-              // Fall through to retry logic for retryable connection errors
-            }
-
            const isContentError = error instanceof InvalidStreamError;
            const errorType = isContentError
              ? error.type
@@ -426,9 +424,16 @@ export class GeminiChat {
              (isContentError && isGemini2Model(model)) ||
              (isRetryable && !signal.aborted)
            ) {
-              // Check if we have more attempts left.
-              if (attempt < maxAttempts - 1) {
-                const delayMs = INVALID_CONTENT_RETRY_OPTIONS.initialDelayMs;
+              // The issue requests exactly 3 retries (4 attempts) for API errors during stream iteration.
+              // Regardless of the global maxAttempts (e.g. 10), we only want to retry these mid-stream API errors
+              // up to 3 times before finally throwing the error to the user.
+              const maxMidStreamAttempts = MID_STREAM_RETRY_OPTIONS.maxAttempts;
+
+              if (
+                attempt < maxAttempts - 1 &&
+                attempt < maxMidStreamAttempts - 1
+              ) {
+                const delayMs = MID_STREAM_RETRY_OPTIONS.initialDelayMs;

                if (isContentError) {
                  logContentRetry(
@@ -449,7 +454,7 @@ export class GeminiChat {
                }
                coreEvents.emitRetryAttempt({
                  attempt: attempt + 1,
-                  maxAttempts,
+                  maxAttempts: Math.min(maxAttempts, maxMidStreamAttempts),
                  delayMs: delayMs * (attempt + 1),
                  error: errorType,
                  model,
@@ -292,6 +292,14 @@ describe('GeminiChat Network Retries', () => {
    (sslError as NodeJS.ErrnoException).code =
      'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC';

+    // Instead of outer loop, connection retries are handled by retryWithBackoff.
+    // Simulate retryWithBackoff attempting it twice: first throws, second succeeds.
+    mockRetryWithBackoff.mockImplementation(
+      async (apiCall) =>
+        // Execute the apiCall to trigger mockContentGenerator
+        await apiCall(),
+    );
+
    vi.mocked(mockContentGenerator.generateContentStream)
      // First call: throw SSL error immediately (connection phase)
      .mockRejectedValueOnce(sslError)
@@ -309,6 +317,15 @@ describe('GeminiChat Network Retries', () => {
        })(),
      );

+    // Because retryWithBackoff is mocked and we just want to test GeminiChat's integration,
+    // we need to actually execute the real retryWithBackoff logic for this test to see it work.
+    // So let's restore the real retryWithBackoff for this test.
+    const { retryWithBackoff } =
+      await vi.importActual<typeof import('../utils/retry.js')>(
+        '../utils/retry.js',
+      );
+    mockRetryWithBackoff.mockImplementation(retryWithBackoff);
+
    const stream = await chat.sendMessageStream(
      { model: 'test-model' },
      'test message',
@@ -322,10 +339,6 @@ describe('GeminiChat Network Retries', () => {
      events.push(event);
    }

-    // Should have retried and succeeded
-    const retryEvent = events.find((e) => e.type === StreamEventType.RETRY);
-    expect(retryEvent).toBeDefined();
-
    const successChunk = events.find(
      (e) =>
        e.type === StreamEventType.CHUNK &&
@@ -342,6 +355,12 @@ describe('GeminiChat Network Retries', () => {
    const connectionError = new Error('read ECONNRESET');
    (connectionError as NodeJS.ErrnoException).code = 'ECONNRESET';

+    const { retryWithBackoff } =
+      await vi.importActual<typeof import('../utils/retry.js')>(
+        '../utils/retry.js',
+      );
+    mockRetryWithBackoff.mockImplementation(retryWithBackoff);
+
    vi.mocked(mockContentGenerator.generateContentStream)
      .mockRejectedValueOnce(connectionError)
      .mockImplementationOnce(async () =>
@@ -372,9 +391,6 @@ describe('GeminiChat Network Retries', () => {
      events.push(event);
    }

-    const retryEvent = events.find((e) => e.type === StreamEventType.RETRY);
-    expect(retryEvent).toBeDefined();
-
    const successChunk = events.find(
      (e) =>
        e.type === StreamEventType.CHUNK &&
@@ -382,6 +398,7 @@ describe('GeminiChat Network Retries', () => {
          'Success after connection retry',
    );
    expect(successChunk).toBeDefined();
+    expect(mockContentGenerator.generateContentStream).toHaveBeenCalledTimes(2);
  });

  it('should NOT retry on non-retryable error during connection phase', async () => {