feat(core): add agent decision mechanism for unretryable errors

2026-04-25 20:44:46 -07:00 · 2026-03-10 19:38:42 -04:00
parent e22d9917b7
commit 3d605e674d
4 changed files with 247 additions and 3 deletions
@@ -20,12 +20,14 @@ import { reportError } from '../utils/errorReporting.js';
 import { getErrorMessage } from '../utils/errors.js';
 import { logMalformedJsonResponse } from '../telemetry/loggers.js';
 import { MalformedJsonResponseEvent, LlmRole } from '../telemetry/types.js';
-import { retryWithBackoff } from '../utils/retry.js';
+import { retryWithBackoff, type AgentDecision } from '../utils/retry.js';
 import type { ModelConfigKey } from '../services/modelConfigService.js';
 import {
  applyModelSelection,
  createAvailabilityContextProvider,
 } from '../availability/policyHelpers.js';
+import { PREVIEW_GEMINI_FLASH_MODEL } from '../config/models.js';
+import { debugLogger } from '../utils/debugLogger.js';

 const DEFAULT_MAX_ATTEMPTS = 5;

@@ -316,6 +318,92 @@ export class BaseLlmClient {
        );
      };

+      /**
+       * Asks a secondary model whether a failed request should be retried.
+       *
+       * Builds a prompt from the error message, the attempt number, the target
+       * model and the text parts of the most recent 'user' entry in `contents`,
+       * sends it to PREVIEW_GEMINI_FLASH_MODEL requesting a JSON reply, and maps
+       * the reply's `action` field onto an AgentDecision.
+       *
+       * @param error The error raised by the failed request.
+       * @param attempt The attempt number supplied by the retry loop.
+       * @returns 'retry' or 'stop' when the model answers with one of those,
+       *   `{ action: 'modify_request' }` when it answers 'modify_request', and
+       *   'stop' in every other case (429 errors, empty or unrecognised
+       *   replies, or any failure while consulting the model).
+       */
+      const onAgentDecisionCallback = async (
+        error: unknown,
+        attempt: number,
+      ): Promise<AgentDecision> => {
+        try {
+          const errorMsg =
+            error instanceof Error ? error.message : String(error);
+
+          // Don't try to use the agent for 429s as it will likely also fail.
+          if (errorMsg.includes('429')) {
+            return 'stop';
+          }
+
+          const lastUserMessage = [...contents]
+            .reverse()
+            .find((c) => c.role === 'user');
+          const lastUserText = lastUserMessage?.parts
+            ? lastUserMessage.parts.map((p) => p.text ?? '').join(' ')
+            : 'N/A';
+
+          const decisionPrompt = `You are a meta-agent deciding whether to retry an AI request that failed.
+
+Error: ${errorMsg}
+Attempt: ${attempt}
+Target Model: ${currentModel}
+Last User Request: ${lastUserText}
+
+Based on the error, should we try to retry the exact same request? 
+Some errors are transient (e.g. network blips, internal server errors), others are terminal (e.g. safety blocks, invalid arguments).
+
+Respond with a JSON object: {"action": "retry" | "stop"}`;
+
+          const decisionResponse = await this.contentGenerator.generateContent(
+            {
+              model: PREVIEW_GEMINI_FLASH_MODEL,
+              contents: [{ role: 'user', parts: [{ text: decisionPrompt }] }],
+              config: {
+                responseMimeType: 'application/json',
+              },
+            },
+            'agent-decision',
+            LlmRole.UTILITY_ROUTER,
+          );
+
+          // Only the first text part of the first candidate is inspected.
+          const decisionText =
+            decisionResponse.candidates?.[0]?.content?.parts?.[0]?.text;
+          if (decisionText) {
+            const parsed: unknown = JSON.parse(decisionText);
+            if (
+              typeof parsed === 'object' &&
+              parsed !== null &&
+              'action' in parsed
+            ) {
+              const action = (parsed as { action: unknown }).action;
+              // Return literals so no type assertion is needed. The string
+              // form of AgentDecision only covers 'retry' | 'stop';
+              // 'modify_request' is valid solely in object form.
+              switch (action) {
+                case 'retry':
+                  return 'retry';
+                case 'stop':
+                  return 'stop';
+                case 'modify_request':
+                  return { action: 'modify_request' };
+                default:
+                  break;
+              }
+            }
+          }
+        } catch (agentError) {
+          // Best effort: a failing decision call must never mask the original
+          // error, so log it and fall through to the conservative default.
+          debugLogger.warn('Agent decision failed:', agentError);
+        }
+
+        return 'stop';
+      };
+
+      /**
+       * Front door for the agent decision callback.
+       *
+       * Answers 'stop' straight away when the error carries a `message` equal
+       * to 'Agent Decision'; every other error is forwarded to
+       * onAgentDecisionCallback.
+       * NOTE(review): presumably this keeps the decision flow from evaluating
+       * its own failures - confirm where 'Agent Decision' errors originate.
+       */
+      const onAgentDecisionWrapper = async (
+        error: unknown,
+        attempt: number,
+      ): Promise<AgentDecision> => {
+        const message =
+          typeof error === 'object' && error !== null && 'message' in error
+            ? (error as { message?: unknown }).message
+            : undefined;
+
+        // Strict equality against a string literal already implies the value
+        // is a string, so no separate typeof check is required.
+        if (message === 'Agent Decision') {
+          return 'stop';
+        }
+        return onAgentDecisionCallback(error, attempt);
+      };
+
      return await retryWithBackoff(apiCall, {
        shouldRetryOnContent,
        maxAttempts:
@@ -325,6 +413,7 @@ export class BaseLlmClient {
          ? (authType, error) =>
              handleFallback(this.config, currentModel, authType, error)
          : undefined,
+        onAgentDecision: onAgentDecisionWrapper,
        authType:
          this.authType ?? this.config.getContentGeneratorConfig()?.authType,
      });
@@ -19,13 +19,18 @@ import {
  type GenerateContentParameters,
 } from '@google/genai';
 import { toParts } from '../code_assist/converter.js';
-import { retryWithBackoff, isRetryableError } from '../utils/retry.js';
+import {
+  retryWithBackoff,
+  isRetryableError,
+  type AgentDecision,
+} from '../utils/retry.js';
 import type { ValidationRequiredError } from '../utils/googleQuotaErrors.js';
 import type { Config } from '../config/config.js';
 import {
  resolveModel,
  isGemini2Model,
  supportsModernFeatures,
+  PREVIEW_GEMINI_FLASH_MODEL,
 } from '../config/models.js';
 import { hasCycleInSchema } from '../tools/tools.js';
 import type { StructuredError } from './turn.js';
@@ -41,7 +46,7 @@ import {
 import {
  ContentRetryEvent,
  ContentRetryFailureEvent,
-  type LlmRole,
+  LlmRole,
 } from '../telemetry/types.js';
 import { handleFallback } from '../fallback/handler.js';
 import { isFunctionResponse } from '../utils/messageInspectors.js';
@@ -53,6 +58,7 @@ import {
  createAvailabilityContextProvider,
 } from '../availability/policyHelpers.js';
 import { coreEvents } from '../utils/events.js';
+import { debugLogger } from '../utils/debugLogger.js';

 export enum StreamEventType {
  /** A regular content chunk from the API. */
@@ -624,9 +630,97 @@ export class GeminiChat {
      );
    };

+    /**
+     * Asks a secondary model whether a failed request should be retried.
+     *
+     * Builds a prompt from the error message, the attempt number, the model
+     * last used and the text parts of the most recent 'user' entry in
+     * `lastContentsToUse`, sends it to PREVIEW_GEMINI_FLASH_MODEL requesting a
+     * JSON reply, and maps the reply's `action` field onto an AgentDecision.
+     *
+     * @param error The error raised by the failed request.
+     * @param attempt The attempt number supplied by the retry loop.
+     * @returns 'retry' or 'stop' when the model answers with one of those,
+     *   `{ action: 'modify_request' }` when it answers 'modify_request', and
+     *   'stop' in every other case (429 errors, empty or unrecognised replies,
+     *   or any failure while consulting the model).
+     */
+    const onAgentDecisionCallback = async (
+      error: unknown,
+      attempt: number,
+    ): Promise<AgentDecision> => {
+      try {
+        const errorMsg = error instanceof Error ? error.message : String(error);
+
+        // Don't try to use the agent for 429s as it will likely also fail.
+        if (errorMsg.includes('429')) {
+          return 'stop';
+        }
+
+        const lastUserMessage = [...lastContentsToUse]
+          .reverse()
+          .find((c) => c.role === 'user');
+        const lastUserText = lastUserMessage?.parts
+          ? lastUserMessage.parts.map((p) => p.text ?? '').join(' ')
+          : 'N/A';
+
+        const decisionPrompt = `You are a meta-agent deciding whether to retry an AI request that failed.
+
+      Error: ${errorMsg}
+      Attempt: ${attempt}
+      Target Model: ${lastModelToUse}
+      Last User Request: ${lastUserText}
+
+      Based on the error, should we try to retry the exact same request? 
+      Some errors are transient (e.g. network blips, internal server errors), others are terminal (e.g. safety blocks, invalid arguments).
+
+      Respond with a JSON object: {"action": "retry" | "stop"}`;
+
+        const decisionResponse = await this.config
+          .getContentGenerator()
+          .generateContent(
+            {
+              model: PREVIEW_GEMINI_FLASH_MODEL,
+              contents: [{ role: 'user', parts: [{ text: decisionPrompt }] }],
+              config: {
+                responseMimeType: 'application/json',
+              },
+            },
+            'agent-decision',
+            LlmRole.UTILITY_ROUTER,
+          );
+
+        // Only the first text part of the first candidate is inspected.
+        const decisionText =
+          decisionResponse.candidates?.[0]?.content?.parts?.[0]?.text;
+        if (decisionText) {
+          const parsed: unknown = JSON.parse(decisionText);
+          if (
+            typeof parsed === 'object' &&
+            parsed !== null &&
+            'action' in parsed
+          ) {
+            const action = (parsed as { action: unknown }).action;
+            // Return literals so no type assertion is needed. The string form
+            // of AgentDecision only covers 'retry' | 'stop'; 'modify_request'
+            // is valid solely in object form.
+            switch (action) {
+              case 'retry':
+                return 'retry';
+              case 'stop':
+                return 'stop';
+              case 'modify_request':
+                return { action: 'modify_request' };
+              default:
+                break;
+            }
+          }
+        }
+      } catch (agentError) {
+        // Best effort: a failing decision call must never mask the original
+        // error, so log it and fall through to the conservative default.
+        debugLogger.warn('Agent decision failed:', agentError);
+      }
+
+      return 'stop';
+    };
+
+    /**
+     * Front door for the agent decision callback.
+     *
+     * Answers 'stop' straight away when the error carries a `message` equal to
+     * 'Agent Decision'; every other error is forwarded to
+     * onAgentDecisionCallback.
+     * NOTE(review): presumably this keeps the decision flow from evaluating its
+     * own failures - confirm where 'Agent Decision' errors originate.
+     */
+    const onAgentDecisionWrapper = async (
+      error: unknown,
+      attempt: number,
+    ): Promise<AgentDecision> => {
+      const message =
+        typeof error === 'object' && error !== null && 'message' in error
+          ? (error as { message?: unknown }).message
+          : undefined;
+
+      // Strict equality against a string literal already implies the value is
+      // a string, so no separate typeof check is required.
+      if (message === 'Agent Decision') {
+        return 'stop';
+      }
+      return onAgentDecisionCallback(error, attempt);
+    };
    const streamResponse = await retryWithBackoff(apiCall, {
      onPersistent429: onPersistent429Callback,
      onValidationRequired: onValidationRequiredCallback,
+      onAgentDecision: onAgentDecisionWrapper,
      authType: this.config.getContentGeneratorConfig()?.authType,
      retryFetchErrors: this.config.getRetryFetchErrors(),
      signal: abortSignal,
@@ -82,6 +82,35 @@ describe('retryWithBackoff', () => {
    expect(mockFn).toHaveBeenCalledTimes(3);
  });

+  it('should ask the agent when max attempts are reached, and retry if requested', async () => {
+    // The operation fails on its first two invocations and succeeds on the third.
+    let invocationCount = 0;
+    const flakyFn = vi.fn(async () => {
+      invocationCount += 1;
+      if (invocationCount > 2) {
+        return 'success';
+      }
+      throw new Error('Generic unretryable error');
+    });
+    // The agent answers 'retry' every time it is consulted.
+    const agentDecision = vi.fn().mockResolvedValue('retry');
+
+    const pending = retryWithBackoff(flakyFn, {
+      // The exact value is irrelevant: an unretryable error fails immediately,
+      // and a retryable one would fail once max attempts are exhausted.
+      maxAttempts: 1,
+      initialDelayMs: 10,
+      onAgentDecision: agentDecision,
+    });
+
+    await vi.runAllTimersAsync();
+    const outcome = await pending;
+
+    expect(outcome).toBe('success');
+    // One original call plus two agent-approved retries.
+    expect(flakyFn).toHaveBeenCalledTimes(3);
+    expect(agentDecision).toHaveBeenCalledTimes(2);
+    // The second argument is the attempt number at the time of the failure.
+    expect(agentDecision).toHaveBeenCalledWith(expect.any(Error), 1);
+  });
+
  it('should throw an error if all attempts fail', async () => {
    const mockFn = createFailingFunction(3);

@@ -17,6 +17,12 @@ import { getErrorStatus, ModelNotFoundError } from './httpErrors.js';
 import type { RetryAvailabilityContext } from '../availability/modelPolicy.js';

 export type { RetryAvailabilityContext };
+
+/**
+ * Result returned by the `onAgentDecision` callback of the retry options.
+ *
+ * Either a bare 'retry' / 'stop' string, or an object whose `action` may also
+ * be 'modify_request' and which can carry an optional `data` payload. Note
+ * that 'modify_request' is only expressible in the object form.
+ * NOTE(review): how `data` is meant to be consumed is not visible here -
+ * confirm the intended contract before relying on it.
+ */
+export type AgentDecision =
+  | 'retry'
+  | 'stop'
+  | { action: 'retry' | 'stop' | 'modify_request'; data?: unknown };
+
 export const DEFAULT_MAX_ATTEMPTS = 10;

 export interface RetryOptions {
@@ -32,6 +38,7 @@ export interface RetryOptions {
  onValidationRequired?: (
    error: ValidationRequiredError,
  ) => Promise<'verify' | 'change_auth' | 'cancel'>;
+  onAgentDecision?: (error: unknown, attempt: number) => Promise<AgentDecision>;
  authType?: string;
  retryFetchErrors?: boolean;
  signal?: AbortSignal;
@@ -174,6 +181,7 @@ export async function retryWithBackoff<T>(
    maxDelayMs,
    onPersistent429,
    onValidationRequired,
+    onAgentDecision,
    authType,
    shouldRetryOnError,
    shouldRetryOnContent,
@@ -340,6 +348,30 @@ export async function retryWithBackoff<T>(
        // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
        !shouldRetryOnError(error as Error, retryFetchErrors)
      ) {
+        // Before giving up on this error, let the optional agent callback
+        // decide whether another round is worthwhile.
+        if (onAgentDecision) {
+          try {
+            const decision = await onAgentDecision(error, attempt);
+            // The callback may answer with a bare string or an { action } object.
+            const decisionType =
+              typeof decision === 'string' ? decision : decision.action;
+
+            debugLogger.log(
+              `Agent evaluated error at attempt ${attempt}: ${decisionType}`,
+            );
+
+            if (decisionType === 'retry' || decisionType === 'modify_request') {
+              // Start over: fresh attempt counter and the initial delay.
+              // 'modify_request' is currently handled exactly like 'retry'.
+              // NOTE(review): the counter is reset with no separate cap and no
+              // delay is awaited in this branch before `continue`; if the
+              // callback keeps answering 'retry' this looks unbounded - confirm
+              // against the enclosing loop.
+              attempt = 0;
+              currentDelay = initialDelayMs;
+              continue;
+            } else if (decisionType === 'stop') {
+              // Thrown inside the try on purpose: the catch below recognises
+              // the original error and rethrows it without logging a warning.
+              throw error;
+            }
+          } catch (agentError) {
+            // Only warn when the failure came from the decision step itself,
+            // not when we are simply propagating the original error.
+            if (agentError !== error) {
+              debugLogger.warn('Agent decision failed:', agentError);
+            }
+            // Whatever went wrong, surface the original request error.
+            throw error;
+          }
+        }
        throw error;
      }