fix(core): add retry logic for transient SSL/TLS errors (#17318) (#18310)

This commit is contained in:
Philippe
2026-02-05 16:47:35 +01:00
committed by GitHub
parent 2566057e44
commit e3b8490edf
4 changed files with 315 additions and 7 deletions

View File

@@ -394,16 +394,23 @@ export class GeminiChat {
return; // Stop the generator
}
if (isConnectionPhase) {
throw error;
}
lastError = error;
const isContentError = error instanceof InvalidStreamError;
// Check if the error is retryable (e.g., transient SSL errors
// like ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC)
const isRetryable = isRetryableError(
error,
this.config.getRetryFetchErrors(),
);
// For connection phase errors, only retryable errors should continue
if (isConnectionPhase) {
if (!isRetryable || signal.aborted) {
throw error;
}
// Fall through to retry logic for retryable connection errors
}
lastError = error;
const isContentError = error instanceof InvalidStreamError;
if (
(isContentError && isGemini2Model(model)) ||
(isRetryable && !signal.aborted)

View File

@@ -274,4 +274,204 @@ describe('GeminiChat Network Retries', () => {
expect(mockLogContentRetry).not.toHaveBeenCalled();
});
it('should retry on SSL error during connection phase (ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC)', async () => {
  // Error tagged with the SSL code. It is delivered as a rejection of the
  // generateContentStream call itself, i.e. before any chunk is yielded.
  const sslError = Object.assign(
    new Error('SSL routines:ssl3_read_bytes:sslv3 alert bad record mac'),
    { code: 'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC' },
  );

  // Stream handed back by the second attempt.
  async function* recoveredStream() {
    yield {
      candidates: [
        {
          content: { parts: [{ text: 'Success after SSL retry' }] },
          finishReason: 'STOP',
        },
      ],
    } as unknown as GenerateContentResponse;
  }

  vi.mocked(mockContentGenerator.generateContentStream)
    // Attempt 1 rejects with the SSL error; attempt 2 streams normally.
    .mockRejectedValueOnce(sslError)
    .mockImplementationOnce(async () => recoveredStream());

  const stream = await chat.sendMessageStream(
    { model: 'test-model' },
    'test message',
    'prompt-id-ssl-retry',
    new AbortController().signal,
  );

  // Drain the stream so every emitted event can be inspected.
  const received: StreamEvent[] = [];
  for await (const streamEvent of stream) {
    received.push(streamEvent);
  }

  // A RETRY event must have been surfaced to the consumer...
  const retryEvent = received.find(
    (evt) => evt.type === StreamEventType.RETRY,
  );
  expect(retryEvent).toBeDefined();

  // ...followed by the content of the successful second attempt.
  const successChunk = received.find(
    (evt) =>
      evt.type === StreamEventType.CHUNK &&
      evt.value.candidates?.[0]?.content?.parts?.[0]?.text ===
        'Success after SSL retry',
  );
  expect(successChunk).toBeDefined();

  // Exactly two API calls: the failed one plus the retry.
  expect(mockContentGenerator.generateContentStream).toHaveBeenCalledTimes(2);
});
it('should retry on ECONNRESET error during connection phase', async () => {
  // Error tagged with the ECONNRESET code. It is delivered as a rejection of
  // the generateContentStream call itself, before any chunk is yielded.
  const connectionError = Object.assign(new Error('read ECONNRESET'), {
    code: 'ECONNRESET',
  });

  // Stream handed back by the second attempt.
  async function* recoveredStream() {
    yield {
      candidates: [
        {
          content: {
            parts: [{ text: 'Success after connection retry' }],
          },
          finishReason: 'STOP',
        },
      ],
    } as unknown as GenerateContentResponse;
  }

  vi.mocked(mockContentGenerator.generateContentStream)
    // Attempt 1 rejects with ECONNRESET; attempt 2 streams normally.
    .mockRejectedValueOnce(connectionError)
    .mockImplementationOnce(async () => recoveredStream());

  const stream = await chat.sendMessageStream(
    { model: 'test-model' },
    'test message',
    'prompt-id-connection-retry',
    new AbortController().signal,
  );

  // Drain the stream so every emitted event can be inspected.
  const events: StreamEvent[] = [];
  for await (const event of stream) {
    events.push(event);
  }

  // A RETRY event must have been surfaced to the consumer...
  const retryEvent = events.find((e) => e.type === StreamEventType.RETRY);
  expect(retryEvent).toBeDefined();

  // ...followed by the content of the successful second attempt.
  const successChunk = events.find(
    (e) =>
      e.type === StreamEventType.CHUNK &&
      e.value.candidates?.[0]?.content?.parts?.[0]?.text ===
        'Success after connection retry',
  );
  expect(successChunk).toBeDefined();

  // Pin the attempt count (initial + one retry), matching the SSL
  // connection-phase test, so unexpected extra calls are caught.
  expect(mockContentGenerator.generateContentStream).toHaveBeenCalledTimes(2);
});
it('should NOT retry on non-retryable error during connection phase', async () => {
  // A plain Error with no network/SSL code attached.
  const nonRetryableError = new Error('Some non-retryable error');
  vi.mocked(mockContentGenerator.generateContentStream).mockRejectedValueOnce(
    nonRetryableError,
  );

  const stream = await chat.sendMessageStream(
    { model: 'test-model' },
    'test message',
    'prompt-id-no-connection-retry',
    new AbortController().signal,
  );

  // Iterating the stream is what triggers the request, so the rejection is
  // observed while consuming it.
  const drainStream = async () => {
    for await (const _event of stream) {
      // consume
    }
  };
  await expect(drainStream).rejects.toThrow(nonRetryableError);

  // A single call proves no retry was attempted.
  expect(mockContentGenerator.generateContentStream).toHaveBeenCalledTimes(1);
});
it('should retry on SSL error during stream iteration (mid-stream failure)', async () => {
  // Error shaped like the one reported in issue #17318: a system-type
  // request failure whose errno and code both carry the SSL error name.
  const sslError = Object.assign(
    new Error(
      'request to https://cloudcode-pa.googleapis.com/v1internal:streamGenerateContent failed',
    ),
    {
      type: 'system',
      errno: 'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC',
      code: 'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC',
    },
  );

  // First attempt: one partial chunk, then the SSL error while streaming.
  async function* interruptedStream() {
    yield {
      candidates: [{ content: { parts: [{ text: 'Partial response...' }] } }],
    } as unknown as GenerateContentResponse;
    throw sslError;
  }

  // Second attempt: a complete, successful response.
  async function* recoveredStream() {
    yield {
      candidates: [
        {
          content: { parts: [{ text: 'Complete response after retry' }] },
          finishReason: 'STOP',
        },
      ],
    } as unknown as GenerateContentResponse;
  }

  vi.mocked(mockContentGenerator.generateContentStream)
    .mockImplementationOnce(async () => interruptedStream())
    .mockImplementationOnce(async () => recoveredStream());

  const stream = await chat.sendMessageStream(
    { model: 'test-model' },
    'test message',
    'prompt-id-ssl-mid-stream',
    new AbortController().signal,
  );

  // Drain the stream so every emitted event can be inspected.
  const received: StreamEvent[] = [];
  for await (const streamEvent of stream) {
    received.push(streamEvent);
  }

  // Looks up the CHUNK event whose first text part equals `text`.
  const findChunkWithText = (text: string) =>
    received.find(
      (evt) =>
        evt.type === StreamEventType.CHUNK &&
        evt.value.candidates?.[0]?.content?.parts?.[0]?.text === text,
    );

  // Expected sequence: partial content, a retry notification, final content.
  const partialChunk = findChunkWithText('Partial response...');
  expect(partialChunk).toBeDefined();

  const retryEvent = received.find(
    (evt) => evt.type === StreamEventType.RETRY,
  );
  expect(retryEvent).toBeDefined();

  const successChunk = findChunkWithText('Complete response after retry');
  expect(successChunk).toBeDefined();

  // The retry must be logged with the NETWORK_ERROR type.
  expect(mockLogContentRetry).toHaveBeenCalledWith(
    expect.anything(),
    expect.objectContaining({
      error_type: 'NETWORK_ERROR',
    }),
  );
});
});

View File

@@ -409,6 +409,87 @@ describe('retryWithBackoff', () => {
await vi.runAllTimersAsync();
await expect(promise).resolves.toBe('success');
});
it('should retry on SSL error code (ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC)', async () => {
  // Error whose only distinguishing feature is its top-level `code`.
  const sslError = Object.assign(new Error('SSL error'), {
    code: 'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC',
  });

  // Fails once with the SSL error, then resolves on every later call.
  const operation = vi
    .fn()
    .mockRejectedValueOnce(sslError)
    .mockResolvedValue('success');

  const pending = retryWithBackoff(operation, {
    initialDelayMs: 1,
    maxDelayMs: 1,
  });

  // Let the backoff delay elapse (presumably under fake timers installed by
  // the suite setup) before awaiting the outcome.
  await vi.runAllTimersAsync();

  await expect(pending).resolves.toBe('success');
  // One failed attempt plus one successful retry.
  expect(operation).toHaveBeenCalledTimes(2);
});
it('should retry on SSL error code in deeply nested cause chain', async () => {
  // Three-level chain: outer -> middle -> deep. Only the innermost error
  // carries a code, so the code is found only by walking `cause` twice.
  const deepCause = Object.assign(new Error('OpenSSL error'), {
    code: 'ERR_SSL_BAD_RECORD_MAC',
  });
  const middleCause = Object.assign(new Error('TLS handshake failed'), {
    cause: deepCause,
  });
  const outerError = Object.assign(new Error('fetch failed'), {
    cause: middleCause,
  });

  // Fails once with the wrapped error, then resolves on every later call.
  const operation = vi
    .fn()
    .mockRejectedValueOnce(outerError)
    .mockResolvedValue('success');

  const pending = retryWithBackoff(operation, {
    initialDelayMs: 1,
    maxDelayMs: 1,
  });

  // Let the backoff delay elapse (presumably under fake timers installed by
  // the suite setup) before awaiting the outcome.
  await vi.runAllTimersAsync();

  await expect(pending).resolves.toBe('success');
  // One failed attempt plus one successful retry.
  expect(operation).toHaveBeenCalledTimes(2);
});
it('should retry on EPROTO error (generic protocol/SSL error)', async () => {
  // Error tagged with the generic EPROTO code.
  const protocolError = Object.assign(new Error('Protocol error'), {
    code: 'EPROTO',
  });

  // Fails once with EPROTO, then resolves on every later call.
  const operation = vi
    .fn()
    .mockRejectedValueOnce(protocolError)
    .mockResolvedValue('success');

  const pending = retryWithBackoff(operation, {
    initialDelayMs: 1,
    maxDelayMs: 1,
  });

  // Let the backoff delay elapse (presumably under fake timers installed by
  // the suite setup) before awaiting the outcome.
  await vi.runAllTimersAsync();

  await expect(pending).resolves.toBe('success');
  // One failed attempt plus one successful retry.
  expect(operation).toHaveBeenCalledTimes(2);
});
it('should retry on gaxios-style SSL error with code property', async () => {
  // Error shaped like the one reported in issue #17318: a system-type
  // request failure whose errno and code both carry the SSL error name.
  const gaxiosStyleError = Object.assign(
    new Error(
      'request to https://cloudcode-pa.googleapis.com/v1internal:streamGenerateContent failed',
    ),
    {
      type: 'system',
      errno: 'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC',
      code: 'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC',
    },
  );

  // Fails once with the SSL error, then resolves on every later call.
  const operation = vi
    .fn()
    .mockRejectedValueOnce(gaxiosStyleError)
    .mockResolvedValue('success');

  const pending = retryWithBackoff(operation, {
    initialDelayMs: 1,
    maxDelayMs: 1,
  });

  // Let the backoff delay elapse (presumably under fake timers installed by
  // the suite setup) before awaiting the outcome.
  await vi.runAllTimersAsync();

  await expect(pending).resolves.toBe('success');
  // One failed attempt plus one successful retry.
  expect(operation).toHaveBeenCalledTimes(2);
});
});
describe('Flash model fallback for OAuth users', () => {

View File

@@ -54,6 +54,12 @@ const RETRYABLE_NETWORK_CODES = [
'ENOTFOUND',
'EAI_AGAIN',
'ECONNREFUSED',
// SSL/TLS transient errors
'ERR_SSL_SSLV3_ALERT_BAD_RECORD_MAC',
'ERR_SSL_WRONG_VERSION_NUMBER',
'ERR_SSL_DECRYPTION_FAILED_OR_BAD_RECORD_MAC',
'ERR_SSL_BAD_RECORD_MAC',
'EPROTO', // Generic protocol error (often SSL-related)
];
function getNetworkErrorCode(error: unknown): string | undefined {
@@ -72,8 +78,22 @@ function getNetworkErrorCode(error: unknown): string | undefined {
return directCode;
}
if (typeof error === 'object' && error !== null && 'cause' in error) {
return getCode((error as { cause: unknown }).cause);
// Traverse the cause chain to find error codes (SSL errors are often nested)
let current: unknown = error;
const maxDepth = 5; // Prevent infinite loops in case of circular references
for (let depth = 0; depth < maxDepth; depth++) {
if (
typeof current !== 'object' ||
current === null ||
!('cause' in current)
) {
break;
}
current = (current as { cause: unknown }).cause;
const code = getCode(current);
if (code) {
return code;
}
}
return undefined;