From 3d605e674d6ffb6b3b8828d43409bc989d60af4d Mon Sep 17 00:00:00 2001
From: Sehoon Shon <sshon@google.com>
Date: Tue, 10 Mar 2026 19:38:42 -0400
Subject: [PATCH] feat(core): add agent decision mechanism for unretryable
 errors

---
 packages/core/src/core/baseLlmClient.ts | 91 ++++++++++++++++++++++-
 packages/core/src/core/geminiChat.ts    | 98 ++++++++++++++++++++++++-
 packages/core/src/utils/retry.test.ts   | 29 ++++++++
 packages/core/src/utils/retry.ts        | 32 ++++++++
 4 files changed, 247 insertions(+), 3 deletions(-)
diff --git a/packages/core/src/core/baseLlmClient.ts b/packages/core/src/core/baseLlmClient.ts
index 0de4dd1e20..e6a081bb2d 100644
--- a/packages/core/src/core/baseLlmClient.ts
+++ b/packages/core/src/core/baseLlmClient.ts
@@ -20,12 +20,14 @@ import { reportError } from '../utils/errorReporting.js';
 import { getErrorMessage } from '../utils/errors.js';
 import { logMalformedJsonResponse } from '../telemetry/loggers.js';
 import { MalformedJsonResponseEvent, LlmRole } from '../telemetry/types.js';
-import { retryWithBackoff } from '../utils/retry.js';
+import { retryWithBackoff, type AgentDecision } from '../utils/retry.js';
 import type { ModelConfigKey } from '../services/modelConfigService.js';
 import {
   applyModelSelection,
   createAvailabilityContextProvider,
 } from '../availability/policyHelpers.js';
+import { PREVIEW_GEMINI_FLASH_MODEL } from '../config/models.js';
+import { debugLogger } from '../utils/debugLogger.js';
 
 const DEFAULT_MAX_ATTEMPTS = 5;
 
@@ -316,6 +318,92 @@ export class BaseLlmClient {
         );
       };
 
+      // retryWithBackoff resets its attempt counter on each 'retry' decision,
+      // so this cap is the only bound on the number of agent-approved retries.
+      const maxAgentApprovedRetries = 3;
+      let agentApprovedRetries = 0;
+
+      // Asks a utility model for a verdict; anything but 'retry' means 'stop'.
+      const onAgentDecisionCallback = async (
+        error: unknown,
+        attempt: number,
+      ): Promise<AgentDecision> => {
+        try {
+          const errorMsg =
+            error instanceof Error ? error.message : String(error);
+
+          // Never consult the agent on 429s or once the retry budget is spent.
+          if (
+            errorMsg.includes('429') ||
+            agentApprovedRetries >= maxAgentApprovedRetries
+          ) {
+            return 'stop';
+          }
+
+          const lastUserMessage = [...contents]
+            .reverse()
+            .find((c) => c.role === 'user');
+          const lastUserText =
+            lastUserMessage?.parts?.map((p) => p.text ?? '').join(' ') ?? 'N/A';
+
+          const decisionPrompt = `You are a meta-agent deciding whether to retry an AI request that failed.
+
+Error: ${errorMsg}
+Attempt: ${attempt}
+Target Model: ${currentModel}
+Last User Request: ${lastUserText}
+
+Based on the error, should we try to retry the exact same request? 
+Some errors are transient (e.g. network blips, internal server errors), others are terminal (e.g. safety blocks, invalid arguments).
+
+Respond with a JSON object: {"action": "retry" | "stop"}`;
+
+          const decisionResponse = await this.contentGenerator.generateContent(
+            {
+              model: PREVIEW_GEMINI_FLASH_MODEL,
+              contents: [{ role: 'user', parts: [{ text: decisionPrompt }] }],
+              config: { responseMimeType: 'application/json' },
+            },
+            'agent-decision',
+            LlmRole.UTILITY_ROUTER,
+          );
+
+          const decisionText =
+            decisionResponse.candidates?.[0]?.content?.parts?.[0]?.text;
+          // A missing or empty reply parses to null and is treated as 'stop'.
+          const parsed: unknown = JSON.parse(decisionText || 'null');
+          if (
+            typeof parsed === 'object' &&
+            parsed !== null &&
+            'action' in parsed &&
+            parsed.action === 'retry'
+          ) {
+            agentApprovedRetries++;
+            return 'retry';
+          }
+        } catch (agentError) {
+          debugLogger.warn('Agent decision failed:', agentError);
+        }
+
+        return 'stop';
+      };
+
+      // Short-circuits errors whose message is exactly 'Agent Decision' to
+      // 'stop'; every other error is delegated to onAgentDecisionCallback.
+      const onAgentDecisionWrapper = async (
+        error: unknown,
+        attempt: number,
+      ): Promise<AgentDecision> => {
+        const message =
+          typeof error === 'object' && error !== null && 'message' in error
+            ? (error as { message?: unknown }).message
+            : undefined;
+        if (message === 'Agent Decision') {
+          return 'stop';
+        }
+        return onAgentDecisionCallback(error, attempt);
+      };
+
       return await retryWithBackoff(apiCall, {
         shouldRetryOnContent,
         maxAttempts:
@@ -325,6 +413,7 @@ export class BaseLlmClient {
           ? (authType, error) =>
               handleFallback(this.config, currentModel, authType, error)
           : undefined,
+        onAgentDecision: onAgentDecisionWrapper,
         authType:
           this.authType ?? this.config.getContentGeneratorConfig()?.authType,
       });
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 44a28c83a5..6026c88948 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -19,13 +19,18 @@ import {
   type GenerateContentParameters,
 } from '@google/genai';
 import { toParts } from '../code_assist/converter.js';
-import { retryWithBackoff, isRetryableError } from '../utils/retry.js';
+import {
+  retryWithBackoff,
+  isRetryableError,
+  type AgentDecision,
+} from '../utils/retry.js';
 import type { ValidationRequiredError } from '../utils/googleQuotaErrors.js';
 import type { Config } from '../config/config.js';
 import {
   resolveModel,
   isGemini2Model,
   supportsModernFeatures,
+  PREVIEW_GEMINI_FLASH_MODEL,
 } from '../config/models.js';
 import { hasCycleInSchema } from '../tools/tools.js';
 import type { StructuredError } from './turn.js';
@@ -41,7 +46,7 @@ import {
 import {
   ContentRetryEvent,
   ContentRetryFailureEvent,
-  type LlmRole,
+  LlmRole,
 } from '../telemetry/types.js';
 import { handleFallback } from '../fallback/handler.js';
 import { isFunctionResponse } from '../utils/messageInspectors.js';
@@ -53,6 +58,7 @@ import {
   createAvailabilityContextProvider,
 } from '../availability/policyHelpers.js';
 import { coreEvents } from '../utils/events.js';
+import { debugLogger } from '../utils/debugLogger.js';
 
 export enum StreamEventType {
   /** A regular content chunk from the API. */
@@ -624,9 +630,97 @@ export class GeminiChat {
       );
     };
 
+    // retryWithBackoff resets its attempt counter on each 'retry' decision,
+    // so this cap is the only bound on the number of agent-approved retries.
+    const maxAgentApprovedRetries = 3;
+    let agentApprovedRetries = 0;
+
+    // Asks a utility model whether the failed request should be retried;
+    // resolves to 'stop' whenever no explicit 'retry' verdict is obtained.
+    const onAgentDecisionCallback = async (
+      error: unknown,
+      attempt: number,
+    ): Promise<AgentDecision> => {
+      try {
+        const errorMsg = error instanceof Error ? error.message : String(error);
+
+        // Never consult the agent on 429s or once the retry budget is spent.
+        if (
+          errorMsg.includes('429') ||
+          agentApprovedRetries >= maxAgentApprovedRetries
+        ) {
+          return 'stop';
+        }
+
+        const lastUserMessage = [...lastContentsToUse]
+          .reverse()
+          .find((c) => c.role === 'user');
+        const lastUserText =
+          lastUserMessage?.parts?.map((p) => p.text ?? '').join(' ') ?? 'N/A';
+
+        const decisionPrompt = `You are a meta-agent deciding whether to retry an AI request that failed.
+
+      Error: ${errorMsg}
+      Attempt: ${attempt}
+      Target Model: ${lastModelToUse}
+      Last User Request: ${lastUserText}
+
+      Based on the error, should we try to retry the exact same request? 
+      Some errors are transient (e.g. network blips, internal server errors), others are terminal (e.g. safety blocks, invalid arguments).
+
+      Respond with a JSON object: {"action": "retry" | "stop"}`;
+
+        const decisionResponse = await this.config
+          .getContentGenerator()
+          .generateContent(
+            {
+              model: PREVIEW_GEMINI_FLASH_MODEL,
+              contents: [{ role: 'user', parts: [{ text: decisionPrompt }] }],
+              config: { responseMimeType: 'application/json' },
+            },
+            'agent-decision',
+            LlmRole.UTILITY_ROUTER,
+          );
+
+        const decisionText =
+          decisionResponse.candidates?.[0]?.content?.parts?.[0]?.text;
+        // A missing or empty reply parses to null and is treated as 'stop'.
+        const parsed: unknown = JSON.parse(decisionText || 'null');
+        if (
+          typeof parsed === 'object' &&
+          parsed !== null &&
+          'action' in parsed &&
+          parsed.action === 'retry'
+        ) {
+          agentApprovedRetries++;
+          return 'retry';
+        }
+      } catch (agentError) {
+        debugLogger.warn('Agent decision failed:', agentError);
+      }
+
+      return 'stop';
+    };
+
+    // Short-circuits errors whose message is exactly 'Agent Decision' to
+    // 'stop'; every other error is delegated to onAgentDecisionCallback.
+    const onAgentDecisionWrapper = async (
+      error: unknown,
+      attempt: number,
+    ): Promise<AgentDecision> => {
+      const message =
+        typeof error === 'object' && error !== null && 'message' in error
+          ? (error as { message?: unknown }).message
+          : undefined;
+      if (message === 'Agent Decision') {
+        return 'stop';
+      }
+      return onAgentDecisionCallback(error, attempt);
+    };
     const streamResponse = await retryWithBackoff(apiCall, {
       onPersistent429: onPersistent429Callback,
       onValidationRequired: onValidationRequiredCallback,
+      onAgentDecision: onAgentDecisionWrapper,
       authType: this.config.getContentGeneratorConfig()?.authType,
       retryFetchErrors: this.config.getRetryFetchErrors(),
       signal: abortSignal,
diff --git a/packages/core/src/utils/retry.test.ts b/packages/core/src/utils/retry.test.ts
index f63a5ed723..3c91891eab 100644
--- a/packages/core/src/utils/retry.test.ts
+++ b/packages/core/src/utils/retry.test.ts
@@ -82,6 +82,35 @@ describe('retryWithBackoff', () => {
     expect(mockFn).toHaveBeenCalledTimes(3);
   });
 
+  it('should ask the agent when max attempts are reached, and retry if requested', async () => {
+    // The operation fails twice, then succeeds. Each failure is expected to
+    // reach the agent callback, which approves a retry every time.
+    const failure = () => new Error('Generic unretryable error');
+    const mockFn = vi
+      .fn()
+      .mockRejectedValueOnce(failure())
+      .mockRejectedValueOnce(failure())
+      .mockResolvedValue('success');
+    const mockAgentDecision = vi.fn().mockResolvedValue('retry');
+
+    const promise = retryWithBackoff(mockFn, {
+      maxAttempts: 1,
+      initialDelayMs: 10,
+      onAgentDecision: mockAgentDecision,
+    });
+
+    await vi.runAllTimersAsync();
+    const result = await promise;
+
+    expect(result).toBe('success');
+    expect(mockFn).toHaveBeenCalledTimes(3); // 1 original + 2 retries
+    expect(mockAgentDecision).toHaveBeenCalledTimes(2);
+    expect(mockAgentDecision).toHaveBeenCalledWith(
+      expect.any(Error),
+      1, // the attempt number reported when the call failed
+    );
+  });
+
   it('should throw an error if all attempts fail', async () => {
     const mockFn = createFailingFunction(3);
 
diff --git a/packages/core/src/utils/retry.ts b/packages/core/src/utils/retry.ts
index a16e823e74..dd0201e6e4 100644
--- a/packages/core/src/utils/retry.ts
+++ b/packages/core/src/utils/retry.ts
@@ -17,6 +17,12 @@ import { getErrorStatus, ModelNotFoundError } from './httpErrors.js';
 import type { RetryAvailabilityContext } from '../availability/modelPolicy.js';
 
 export type { RetryAvailabilityContext };
+
+export type AgentDecision =
+  | 'retry' // run the operation again
+  | 'stop' // rethrow the original error
+  | { action: 'retry' | 'stop' | 'modify_request'; data?: unknown };
+
 export const DEFAULT_MAX_ATTEMPTS = 10;
 
 export interface RetryOptions {
@@ -32,6 +38,7 @@ export interface RetryOptions {
   onValidationRequired?: (
     error: ValidationRequiredError,
   ) => Promise<'verify' | 'change_auth' | 'cancel'>;
+  onAgentDecision?: (error: unknown, attempt: number) => Promise<AgentDecision>;
   authType?: string;
   retryFetchErrors?: boolean;
   signal?: AbortSignal;
@@ -174,6 +181,7 @@ export async function retryWithBackoff<T>(
     maxDelayMs,
     onPersistent429,
     onValidationRequired,
+    onAgentDecision,
     authType,
     shouldRetryOnError,
     shouldRetryOnContent,
@@ -340,6 +348,30 @@ export async function retryWithBackoff<T>(
         // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
         !shouldRetryOnError(error as Error, retryFetchErrors)
       ) {
+        if (onAgentDecision) {
+          try {
+            const decision = await onAgentDecision(error, attempt);
+            const decisionType =
+              typeof decision === 'string' ? decision : decision.action;
+
+            debugLogger.log(
+              `Agent evaluated error at attempt ${attempt}: ${decisionType}`,
+            );
+
+            if (decisionType === 'retry' || decisionType === 'modify_request') {
+              attempt = 0;
+              currentDelay = initialDelayMs;
+              continue;
+            } else if (decisionType === 'stop') {
+              throw error;
+            }
+          } catch (agentError) {
+            if (agentError !== error) {
+              debugLogger.warn('Agent decision failed:', agentError);
+            }
+            throw error;
+          }
+        }
         throw error;
       }