feat(core): add agent decision mechanism for unretryable errors

2026-04-25 20:44:46 -07:00 · 2026-03-10 20:18:08 -04:00
parent 7c4570339e
commit bb67e91948
5 changed files with 270 additions and 14 deletions
@@ -20,12 +20,14 @@ import { reportError } from '../utils/errorReporting.js';
 import { getErrorMessage } from '../utils/errors.js';
 import { logMalformedJsonResponse } from '../telemetry/loggers.js';
 import { MalformedJsonResponseEvent, LlmRole } from '../telemetry/types.js';
-import { retryWithBackoff } from '../utils/retry.js';
+import { retryWithBackoff, type AgentDecision } from '../utils/retry.js';
 import type { ModelConfigKey } from '../services/modelConfigService.js';
 import {
  applyModelSelection,
  createAvailabilityContextProvider,
 } from '../availability/policyHelpers.js';
+import { PREVIEW_GEMINI_FLASH_MODEL } from '../config/models.js';
+import { debugLogger } from '../utils/debugLogger.js';

 const DEFAULT_MAX_ATTEMPTS = 5;

@@ -316,6 +318,92 @@ export class BaseLlmClient {
        );
      };

+      /**
+       * Asks a lightweight "meta-agent" model whether a failed request should
+       * be retried.
+       *
+       * Builds a prompt from the error message, the attempt number, the target
+       * model and the text of the most recent user message, sends it to
+       * PREVIEW_GEMINI_FLASH_MODEL asking for a JSON reply, and validates the
+       * reply's `action` field. Any failure while asking (including a reply
+       * that is not valid JSON) is logged and answered with 'stop'.
+       *
+       * @param error The error raised by the failed request.
+       * @param attempt The attempt number reported by the retry loop.
+       * @returns The validated decision, or 'stop' when none could be obtained.
+       */
+      const onAgentDecisionCallback = async (
+        error: unknown,
+        attempt: number,
+      ): Promise<AgentDecision> => {
+        try {
+          const errorMsg =
+            error instanceof Error ? error.message : String(error);
+
+          // Don't try to use the agent for 429s as it will likely also fail.
+          if (errorMsg.includes('429')) {
+            return 'stop';
+          }
+
+          const lastUserMessage = [...contents]
+            .reverse()
+            .find((c) => c.role === 'user');
+          // Non-text parts contribute an empty string to the joined text.
+          const lastUserText = lastUserMessage?.parts
+            ? lastUserMessage.parts.map((p) => p.text ?? '').join(' ')
+            : 'N/A';
+
+          const decisionPrompt = `You are a meta-agent deciding whether to retry an AI request that failed.

+Error: ${errorMsg}
+Attempt: ${attempt}
+Target Model: ${currentModel}
+Last User Request: ${lastUserText}

+Based on the error, should we try to retry the exact same request? 
+Some errors are transient (e.g. network blips, internal server errors), others are terminal (e.g. safety blocks, invalid arguments).

+Respond with a JSON object: {"action": "retry" | "stop"}`;
+
+          const decisionResponse = await this.contentGenerator.generateContent(
+            {
+              model: PREVIEW_GEMINI_FLASH_MODEL,
+              contents: [{ role: 'user', parts: [{ text: decisionPrompt }] }],
+              config: {
+                responseMimeType: 'application/json',
+              },
+            },
+            'agent-decision',
+            LlmRole.UTILITY_ROUTER,
+          );
+
+          // Only the first text part of the first candidate is inspected.
+          const decisionText =
+            decisionResponse.candidates?.[0]?.content?.parts?.[0]?.text;
+          if (decisionText) {
+            const parsed = JSON.parse(decisionText) as unknown;
+            if (
+              typeof parsed === 'object' &&
+              parsed !== null &&
+              'action' in parsed
+            ) {
+              const action = (parsed as { action: unknown }).action;
+              if (action === 'retry' || action === 'stop') {
+                return action;
+              }
+              if (action === 'modify_request') {
+                // The bare string 'modify_request' is not a member of
+                // AgentDecision; only the object form may carry that action,
+                // so wrap it instead of force-casting the string.
+                return { action };
+              }
+            }
+          }
+        } catch (agentError) {
+          debugLogger.warn('Agent decision failed:', agentError);
+        }
+
+        // No usable answer from the agent: do not retry.
+        return 'stop';
+      };
+
+      /**
+       * Guard in front of onAgentDecisionCallback: an error object whose
+       * `message` is exactly 'Agent Decision' is answered with 'stop' without
+       * consulting the model; every other error is delegated.
+       * NOTE(review): presumably 'Agent Decision' marks an error produced by
+       * the agent-decision path itself — confirm where it is thrown.
+       */
+      const onAgentDecisionWrapper = async (
+        error: unknown,
+        attempt: number,
+      ): Promise<AgentDecision> => {
+        const message =
+          typeof error === 'object' && error !== null && 'message' in error
+            ? (error as { message?: unknown }).message
+            : undefined;
+        // Strict equality with a string literal already implies a string.
+        if (message === 'Agent Decision') {
+          return 'stop';
+        }
+        return onAgentDecisionCallback(error, attempt);
+      };
+
      return await retryWithBackoff(apiCall, {
        shouldRetryOnContent,
        maxAttempts:
@@ -325,6 +413,7 @@ export class BaseLlmClient {
          ? (authType, error) =>
              handleFallback(this.config, currentModel, authType, error)
          : undefined,
+        onAgentDecision: onAgentDecisionWrapper,
        authType:
          this.authType ?? this.config.getContentGeneratorConfig()?.authType,
      });
@@ -19,13 +19,18 @@ import {
  type GenerateContentParameters,
 } from '@google/genai';
 import { toParts } from '../code_assist/converter.js';
-import { retryWithBackoff, isRetryableError } from '../utils/retry.js';
+import {
+  retryWithBackoff,
+  isRetryableError,
+  type AgentDecision,
+} from '../utils/retry.js';
 import type { ValidationRequiredError } from '../utils/googleQuotaErrors.js';
 import type { Config } from '../config/config.js';
 import {
  resolveModel,
  isGemini2Model,
  supportsModernFeatures,
+  PREVIEW_GEMINI_FLASH_MODEL,
 } from '../config/models.js';
 import { hasCycleInSchema } from '../tools/tools.js';
 import type { StructuredError } from './turn.js';
@@ -41,7 +46,7 @@ import {
 import {
  ContentRetryEvent,
  ContentRetryFailureEvent,
-  type LlmRole,
+  LlmRole,
 } from '../telemetry/types.js';
 import { handleFallback } from '../fallback/handler.js';
 import { isFunctionResponse } from '../utils/messageInspectors.js';
@@ -53,6 +58,7 @@ import {
  createAvailabilityContextProvider,
 } from '../availability/policyHelpers.js';
 import { coreEvents } from '../utils/events.js';
+import { debugLogger } from '../utils/debugLogger.js';

 export enum StreamEventType {
  /** A regular content chunk from the API. */
@@ -624,9 +630,97 @@ export class GeminiChat {
      );
    };

+    /**
+     * Asks a lightweight "meta-agent" model whether a failed request should
+     * be retried.
+     *
+     * Builds a prompt from the error message, the attempt number, the last
+     * model used and the text of the most recent user message, sends it to
+     * PREVIEW_GEMINI_FLASH_MODEL asking for a JSON reply, and validates the
+     * reply's `action` field. Any failure while asking (including a reply
+     * that is not valid JSON) is logged and answered with 'stop'.
+     *
+     * @param error The error raised by the failed request.
+     * @param attempt The attempt number reported by the retry loop.
+     * @returns The validated decision, or 'stop' when none could be obtained.
+     */
+    const onAgentDecisionCallback = async (
+      error: unknown,
+      attempt: number,
+    ): Promise<AgentDecision> => {
+      try {
+        const errorMsg = error instanceof Error ? error.message : String(error);
+
+        // Don't try to use the agent for 429s as it will likely also fail.
+        if (errorMsg.includes('429')) {
+          return 'stop';
+        }
+
+        const lastUserMessage = [...lastContentsToUse]
+          .reverse()
+          .find((c) => c.role === 'user');
+        // Non-text parts contribute an empty string to the joined text.
+        const lastUserText = lastUserMessage?.parts
+          ? lastUserMessage.parts.map((p) => p.text ?? '').join(' ')
+          : 'N/A';
+
+        const decisionPrompt = `You are a meta-agent deciding whether to retry an AI request that failed.
+
+      Error: ${errorMsg}
+      Attempt: ${attempt}
+      Target Model: ${lastModelToUse}
+      Last User Request: ${lastUserText}
+
+      Based on the error, should we try to retry the exact same request? 
+      Some errors are transient (e.g. network blips, internal server errors), others are terminal (e.g. safety blocks, invalid arguments).
+
+      Respond with a JSON object: {"action": "retry" | "stop"}`;
+
+        const decisionResponse = await this.config
+          .getContentGenerator()
+          .generateContent(
+            {
+              model: PREVIEW_GEMINI_FLASH_MODEL,
+              contents: [{ role: 'user', parts: [{ text: decisionPrompt }] }],
+              config: {
+                responseMimeType: 'application/json',
+              },
+            },
+            'agent-decision',
+            LlmRole.UTILITY_ROUTER,
+          );
+
+        // Only the first text part of the first candidate is inspected.
+        const decisionText =
+          decisionResponse.candidates?.[0]?.content?.parts?.[0]?.text;
+        if (decisionText) {
+          const parsed = JSON.parse(decisionText) as unknown;
+          if (
+            typeof parsed === 'object' &&
+            parsed !== null &&
+            'action' in parsed
+          ) {
+            const action = (parsed as { action: unknown }).action;
+            if (action === 'retry' || action === 'stop') {
+              return action;
+            }
+            if (action === 'modify_request') {
+              // The bare string 'modify_request' is not a member of
+              // AgentDecision; only the object form may carry that action,
+              // so wrap it instead of force-casting the string.
+              return { action };
+            }
+          }
+        }
+      } catch (agentError) {
+        debugLogger.warn('Agent decision failed:', agentError);
+      }
+
+      // No usable answer from the agent: do not retry.
+      return 'stop';
+    };
+
+    /**
+     * Guard in front of onAgentDecisionCallback: an error object whose
+     * `message` is exactly 'Agent Decision' is answered with 'stop' without
+     * consulting the model; every other error is delegated.
+     * NOTE(review): presumably 'Agent Decision' marks an error produced by
+     * the agent-decision path itself — confirm where it is thrown.
+     */
+    const onAgentDecisionWrapper = async (
+      error: unknown,
+      attempt: number,
+    ): Promise<AgentDecision> => {
+      const message =
+        typeof error === 'object' && error !== null && 'message' in error
+          ? (error as { message?: unknown }).message
+          : undefined;
+      // Strict equality with a string literal already implies a string.
+      if (message === 'Agent Decision') {
+        return 'stop';
+      }
+      return onAgentDecisionCallback(error, attempt);
+    };
    const streamResponse = await retryWithBackoff(apiCall, {
      onPersistent429: onPersistent429Callback,
      onValidationRequired: onValidationRequiredCallback,
+      onAgentDecision: onAgentDecisionWrapper,
      authType: this.config.getContentGeneratorConfig()?.authType,
      retryFetchErrors: this.config.getRetryFetchErrors(),
      signal: abortSignal,
@@ -5,7 +5,6 @@
 */

 import type { Config } from '../config/config.js';
-import { AuthType } from '../core/contentGenerator.js';
 import {
  openBrowserSecurely,
  shouldLaunchBrowser,
@@ -29,10 +28,6 @@ export async function handleFallback(
  authType?: string,
  error?: unknown,
 ): Promise<string | boolean | null> {
-  if (authType !== AuthType.LOGIN_WITH_GOOGLE) {
-    return null;
-  }
-
  const chain = resolvePolicyChain(config);
  const { failedPolicy, candidates } = buildFallbackPolicyContext(
    chain,
@@ -82,6 +82,35 @@ describe('retryWithBackoff', () => {
    expect(mockFn).toHaveBeenCalledTimes(3);
  });

+  it('should ask the agent when max attempts are reached, and retry if requested', async () => {
+    // The first two invocations fail with a generic error; the third succeeds.
+    let invocationCount = 0;
+    const flakyFn = vi.fn(async () => {
+      invocationCount += 1;
+      if (invocationCount > 2) {
+        return 'success';
+      }
+      throw new Error('Generic unretryable error');
+    });
+    // The agent always asks for another try.
+    const agentDecisionSpy = vi.fn().mockResolvedValue('retry');
+
+    const pending = retryWithBackoff(flakyFn, {
+      // The exact value is not important here: the error is either treated
+      // as unretryable and fails immediately, or exhausts this single attempt.
+      maxAttempts: 1,
+      initialDelayMs: 10,
+      onAgentDecision: agentDecisionSpy,
+    });
+
+    await vi.runAllTimersAsync();
+    const outcome = await pending;
+
+    expect(outcome).toBe('success');
+    expect(flakyFn).toHaveBeenCalledTimes(3); // 1 original + 2 retries
+    expect(agentDecisionSpy).toHaveBeenCalledTimes(2);
+    // Second argument is the attempt number at the time of the failure.
+    expect(agentDecisionSpy).toHaveBeenCalledWith(expect.any(Error), 1);
+  });
+
  it('should throw an error if all attempts fail', async () => {
    const mockFn = createFailingFunction(3);

@@ -17,6 +17,12 @@ import { getErrorStatus, ModelNotFoundError } from './httpErrors.js';
 import type { RetryAvailabilityContext } from '../availability/modelPolicy.js';

 export type { RetryAvailabilityContext };
+
+/**
+ * Answer returned by an `onAgentDecision` handler after a failed request.
+ *
+ * Either the bare string 'retry' / 'stop', or an object whose `action` may
+ * additionally be 'modify_request' and which can carry an optional `data`
+ * payload. Note that 'modify_request' is only valid in the object form.
+ */
+export type AgentDecision =
+  | 'retry'
+  | 'stop'
+  | { action: 'retry' | 'stop' | 'modify_request'; data?: unknown };
+
 export const DEFAULT_MAX_ATTEMPTS = 10;

 export interface RetryOptions {
@@ -32,6 +38,7 @@ export interface RetryOptions {
  onValidationRequired?: (
    error: ValidationRequiredError,
  ) => Promise<'verify' | 'change_auth' | 'cancel'>;
+  onAgentDecision?: (error: unknown, attempt: number) => Promise<AgentDecision>;
  authType?: string;
  retryFetchErrors?: boolean;
  signal?: AbortSignal;
@@ -174,6 +181,7 @@ export async function retryWithBackoff<T>(
    maxDelayMs,
    onPersistent429,
    onValidationRequired,
+    onAgentDecision,
    authType,
    shouldRetryOnError,
    shouldRetryOnContent,
@@ -190,6 +198,29 @@ export async function retryWithBackoff<T>(
  let attempt = 0;
  let currentDelay = initialDelayMs;

+  // Callers reset `attempt` and `currentDelay` and re-enter the loop whenever
+  // handleTerminalError resolves to true, so without a cap an agent that
+  // always answers 'retry' would keep this call running indefinitely.
+  const MAX_AGENT_RETRIES = 3;
+  let agentRetryCount = 0;
+
+  /**
+   * Last-chance handler for an error that is about to be thrown.
+   *
+   * When an `onAgentDecision` handler is configured and the agent-retry
+   * budget is not exhausted, the handler is asked what to do. A 'retry' or
+   * 'modify_request' answer resolves to true so the caller can restart the
+   * loop. Any other answer, or a failure of the handler itself, falls
+   * through to rethrowing the original error.
+   *
+   * @param errToThrow The error to rethrow when no retry is granted.
+   * @returns true when the agent asked for another attempt.
+   * @throws errToThrow in every other case.
+   */
+  const handleTerminalError = async (
+    errToThrow: unknown,
+  ): Promise<boolean> => {
+    if (onAgentDecision && agentRetryCount < MAX_AGENT_RETRIES) {
+      try {
+        const decision = await onAgentDecision(errToThrow, attempt);
+        const decisionType =
+          typeof decision === 'string' ? decision : decision.action;
+
+        debugLogger.log(
+          `Agent evaluated error at attempt ${attempt}: ${decisionType}`,
+        );
+
+        if (decisionType === 'retry' || decisionType === 'modify_request') {
+          agentRetryCount++;
+          return true;
+        }
+      } catch (agentError) {
+        // A handler that merely rethrows the original error is not worth a
+        // separate warning; the original error is thrown below anyway.
+        if (agentError !== errToThrow) {
+          debugLogger.warn('Agent decision failed:', agentError);
+        }
+      }
+    }
+    throw errToThrow;
+  };
+
  while (attempt < maxAttempts) {
    if (signal?.aborted) {
      throw createAbortError();
@@ -248,7 +279,11 @@ export async function retryWithBackoff<T>(
          }
        }
        // Terminal/not_found already recorded; nothing else to mark here.
-        throw classifiedError; // Throw if no fallback or fallback failed.
+        if (await handleTerminalError(classifiedError)) {
+          attempt = 0;
+          currentDelay = initialDelayMs;
+          continue;
+        }
      }

      // Handle ValidationRequiredError - user needs to verify before proceeding
@@ -268,7 +303,11 @@ export async function retryWithBackoff<T>(
            debugLogger.warn('Validation handler failed:', validationError);
          }
        }
-        throw classifiedError;
+        if (await handleTerminalError(classifiedError)) {
+          attempt = 0;
+          currentDelay = initialDelayMs;
+          continue;
+        }
      }

      const is500 =
@@ -296,9 +335,15 @@ export async function retryWithBackoff<T>(
              debugLogger.warn('Model fallback failed:', fallbackError);
            }
          }
-          throw classifiedError instanceof RetryableQuotaError
-            ? classifiedError
-            : error;
+          const errToThrow =
+            classifiedError instanceof RetryableQuotaError
+              ? classifiedError
+              : error;
+          if (await handleTerminalError(errToThrow)) {
+            attempt = 0;
+            currentDelay = initialDelayMs;
+            continue;
+          }
        }

        if (
@@ -340,7 +385,11 @@ export async function retryWithBackoff<T>(
        // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
        !shouldRetryOnError(error as Error, retryFetchErrors)
      ) {
-        throw error;
+        if (await handleTerminalError(error)) {
+          attempt = 0;
+          currentDelay = initialDelayMs;
+          continue;
+        }
      }

      const errorStatus = getErrorStatus(error);