From 0910b35812a6ba5578277015b642d29c2853c9d5 Mon Sep 17 00:00:00 2001 From: Silvio Junior Date: Tue, 26 Aug 2025 18:28:04 -0400 Subject: [PATCH] Hotfix/metrics stream error (#7156) Co-authored-by: Victor May --- packages/core/src/core/geminiChat.test.ts | 31 +++++++- packages/core/src/core/geminiChat.ts | 34 +++++++- .../clearcut-logger/clearcut-logger.ts | 69 ++++++++++++++++ .../clearcut-logger/event-metadata-key.ts | 78 +++++++++++++++++++ packages/core/src/telemetry/constants.ts | 9 +++ packages/core/src/telemetry/loggers.ts | 78 +++++++++++++++++++ packages/core/src/telemetry/metrics.ts | 50 ++++++++++++ packages/core/src/telemetry/types.ts | 58 +++++++++++++- 8 files changed, 400 insertions(+), 7 deletions(-) diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts index c4fb7f0f6f..1bafb38cdc 100644 --- a/packages/core/src/core/geminiChat.test.ts +++ b/packages/core/src/core/geminiChat.test.ts @@ -25,6 +25,19 @@ const mockModelsModule = { batchEmbedContents: vi.fn(), } as unknown as Models; +const { mockLogInvalidChunk, mockLogContentRetry, mockLogContentRetryFailure } = + vi.hoisted(() => ({ + mockLogInvalidChunk: vi.fn(), + mockLogContentRetry: vi.fn(), + mockLogContentRetryFailure: vi.fn(), + })); + +vi.mock('../telemetry/loggers.js', () => ({ + logInvalidChunk: mockLogInvalidChunk, + logContentRetry: mockLogContentRetry, + logContentRetryFailure: mockLogContentRetryFailure, +})); + describe('GeminiChat', () => { let chat: GeminiChat; let mockConfig: Config; @@ -483,7 +496,7 @@ describe('GeminiChat', () => { }); describe('sendMessageStream with retries', () => { - it('should retry on invalid content and succeed on the second attempt', async () => { + it('should retry on invalid content, succeed, and report metrics', async () => { // Use mockImplementationOnce to provide a fresh, promise-wrapped generator for each attempt. vi.mocked(mockModelsModule.generateContentStream) .mockImplementationOnce(async () => @@ -515,6 +528,9 @@ describe('GeminiChat', () => { } // Assertions + expect(mockLogInvalidChunk).toHaveBeenCalledTimes(1); + expect(mockLogContentRetry).toHaveBeenCalledTimes(1); + expect(mockLogContentRetryFailure).not.toHaveBeenCalled(); expect(mockModelsModule.generateContentStream).toHaveBeenCalledTimes(2); expect( chunks.some( @@ -537,7 +553,7 @@ describe('GeminiChat', () => { }); }); - it('should fail after all retries on persistent invalid content', async () => { + it('should fail after all retries on persistent invalid content and report metrics', async () => { vi.mocked(mockModelsModule.generateContentStream).mockImplementation( async () => (async function* () { @@ -571,6 +587,9 @@ describe('GeminiChat', () => { // Should be called 3 times (initial + 2 retries) expect(mockModelsModule.generateContentStream).toHaveBeenCalledTimes(3); + expect(mockLogInvalidChunk).toHaveBeenCalledTimes(3); + expect(mockLogContentRetry).toHaveBeenCalledTimes(2); + expect(mockLogContentRetryFailure).toHaveBeenCalledTimes(1); // History should be clean, as if the failed turn never happened. const history = chat.getHistory(); @@ -585,7 +604,7 @@ describe('GeminiChat', () => { ]; chat.setHistory(initialHistory); - // 2. Mock the API + // 2. Mock the API to fail once with an empty stream, then succeed. vi.mocked(mockModelsModule.generateContentStream) .mockImplementationOnce(async () => (async function* () { @@ -595,6 +614,7 @@ describe('GeminiChat', () => { })(), ) .mockImplementationOnce(async () => + // Second attempt succeeds (async function* () { yield { candidates: [{ content: { parts: [{ text: 'Second answer' }] } }], @@ -611,10 +631,13 @@ describe('GeminiChat', () => { // consume stream } - // 4. Assert the final history + // 4. Assert the final history and metrics const history = chat.getHistory(); expect(history.length).toBe(4); + // Assert that the correct metrics were reported for one empty-stream retry + expect(mockLogContentRetry).toHaveBeenCalledTimes(1); + // Explicitly verify the structure of each part to satisfy TypeScript const turn1 = history[0]; if (!turn1?.parts?.[0] || !('text' in turn1.parts[0])) { diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts index 93428684ba..a46522fa32 100644 --- a/packages/core/src/core/geminiChat.ts +++ b/packages/core/src/core/geminiChat.ts @@ -23,6 +23,16 @@ import { Config } from '../config/config.js'; import { DEFAULT_GEMINI_FLASH_MODEL } from '../config/models.js'; import { hasCycleInSchema } from '../tools/tools.js'; import { StructuredError } from './turn.js'; +import { + logContentRetry, + logContentRetryFailure, + logInvalidChunk, +} from '../telemetry/loggers.js'; +import { + ContentRetryEvent, + ContentRetryFailureEvent, + InvalidChunkEvent, +} from '../telemetry/types.js'; /** * Options for retrying due to invalid content from the model. @@ -38,7 +48,6 @@ const INVALID_CONTENT_RETRY_OPTIONS: ContentRetryOptions = { maxAttempts: 3, // 1 initial call + 2 retries initialDelayMs: 500, }; - /** * Returns true if the response is valid, false otherwise. */ @@ -349,7 +358,7 @@ export class GeminiChat { for ( let attempt = 0; - attempt <= INVALID_CONTENT_RETRY_OPTIONS.maxAttempts; + attempt < INVALID_CONTENT_RETRY_OPTIONS.maxAttempts; attempt++ ) { try { @@ -373,6 +382,14 @@ export class GeminiChat { if (isContentError) { // Check if we have more attempts left. if (attempt < INVALID_CONTENT_RETRY_OPTIONS.maxAttempts - 1) { + logContentRetry( + self.config, + new ContentRetryEvent( + attempt, + 'EmptyStreamError', + INVALID_CONTENT_RETRY_OPTIONS.initialDelayMs, + ), + ); await new Promise((res) => setTimeout( res, @@ -388,6 +405,15 @@ export class GeminiChat { } if (lastError) { + if (lastError instanceof EmptyStreamError) { + logContentRetryFailure( + self.config, + new ContentRetryFailureEvent( + INVALID_CONTENT_RETRY_OPTIONS.maxAttempts, + 'EmptyStreamError', + ), + ); + } // If the stream fails, remove the user message that was added. if (self.history[self.history.length - 1] === userContent) { self.history.pop(); @@ -545,6 +571,10 @@ export class GeminiChat { } } } else { + logInvalidChunk( + this.config, + new InvalidChunkEvent('Invalid chunk received from stream.'), + ); isStreamInvalid = true; } yield chunk; // Yield every chunk to the UI immediately. diff --git a/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts b/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts index 7369bc1b09..221b051438 100644 --- a/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts +++ b/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts @@ -19,6 +19,9 @@ import { IdeConnectionEvent, KittySequenceOverflowEvent, ChatCompressionEvent, + InvalidChunkEvent, + ContentRetryEvent, + ContentRetryFailureEvent, } from '../types.js'; import { EventMetadataKey } from './event-metadata-key.js'; import { Config } from '../../config/config.js'; @@ -48,6 +51,9 @@ export enum EventNames { IDE_CONNECTION = 'ide_connection', KITTY_SEQUENCE_OVERFLOW = 'kitty_sequence_overflow', CHAT_COMPRESSION = 'chat_compression', + INVALID_CHUNK = 'invalid_chunk', + CONTENT_RETRY = 'content_retry', + CONTENT_RETRY_FAILURE = 'content_retry_failure', } export interface LogResponse { @@ -675,6 +681,69 @@ export class ClearcutLogger { }); } + logInvalidChunkEvent(event: InvalidChunkEvent): void { + const data: EventValue[] = []; + + if (event.error_message) { + data.push({ + gemini_cli_key: EventMetadataKey.GEMINI_CLI_INVALID_CHUNK_ERROR_MESSAGE, + value: event.error_message, + }); + } + + this.enqueueLogEvent(this.createLogEvent(EventNames.INVALID_CHUNK, data)); + this.flushIfNeeded(); + } + + logContentRetryEvent(event: ContentRetryEvent): void { + const data: EventValue[] = [ + { + gemini_cli_key: + EventMetadataKey.GEMINI_CLI_CONTENT_RETRY_ATTEMPT_NUMBER, + value: String(event.attempt_number), + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_CONTENT_RETRY_ERROR_TYPE, + value: event.error_type, + }, + { + gemini_cli_key: EventMetadataKey.GEMINI_CLI_CONTENT_RETRY_DELAY_MS, + value: String(event.retry_delay_ms), + }, + ]; + + this.enqueueLogEvent(this.createLogEvent(EventNames.CONTENT_RETRY, data)); + this.flushIfNeeded(); + } + + logContentRetryFailureEvent(event: ContentRetryFailureEvent): void { + const data: EventValue[] = [ + { + gemini_cli_key: + EventMetadataKey.GEMINI_CLI_CONTENT_RETRY_FAILURE_TOTAL_ATTEMPTS, + value: String(event.total_attempts), + }, + { + gemini_cli_key: + EventMetadataKey.GEMINI_CLI_CONTENT_RETRY_FAILURE_FINAL_ERROR_TYPE, + value: event.final_error_type, + }, + ]; + + if (event.total_duration_ms) { + data.push({ + gemini_cli_key: + EventMetadataKey.GEMINI_CLI_CONTENT_RETRY_FAILURE_TOTAL_DURATION_MS, + value: String(event.total_duration_ms), + }); + } + + this.enqueueLogEvent( + this.createLogEvent(EventNames.CONTENT_RETRY_FAILURE, data), + ); + this.flushIfNeeded(); + } + /** * Adds default fields to data, and returns a new data array. This fields * should exist on all log events. diff --git a/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts b/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts index 34c8d9b024..b7090bef57 100644 --- a/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts +++ b/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts @@ -229,6 +229,16 @@ export enum EventMetadataKey { // Logs the length of the kitty sequence that overflowed. GEMINI_CLI_KITTY_SEQUENCE_LENGTH = 53, + // ========================================================================== + // Conversation Finished Event Keys + // =========================================================================== + + // Logs the approval mode of the session. + GEMINI_CLI_APPROVAL_MODE = 58, + + // Logs the number of turns + GEMINI_CLI_CONVERSATION_TURN_COUNT = 59, + // Logs the number of tokens before context window compression. GEMINI_CLI_COMPRESSION_TOKENS_BEFORE = 60, @@ -237,4 +247,72 @@ export enum EventMetadataKey { // Logs tool type whether it is mcp or native. GEMINI_CLI_TOOL_TYPE = 62, + // Logs name of MCP tools as comma separated string + GEMINI_CLI_START_SESSION_MCP_TOOLS = 65, + + // ========================================================================== + // Research Event Keys + // =========================================================================== + + // Logs the research opt-in status (true/false) + GEMINI_CLI_RESEARCH_OPT_IN_STATUS = 66, + + // Logs the contact email for research participation + GEMINI_CLI_RESEARCH_CONTACT_EMAIL = 67, + + // Logs the user ID for research events + GEMINI_CLI_RESEARCH_USER_ID = 68, + + // Logs the type of research feedback + GEMINI_CLI_RESEARCH_FEEDBACK_TYPE = 69, + + // Logs the content of research feedback + GEMINI_CLI_RESEARCH_FEEDBACK_CONTENT = 70, + + // Logs survey responses for research feedback (JSON stringified) + GEMINI_CLI_RESEARCH_SURVEY_RESPONSES = 71, + + // ========================================================================== + // File Operation Event Keys + // =========================================================================== + + // Logs the programming language of the project. + GEMINI_CLI_PROGRAMMING_LANGUAGE = 56, + + // Logs the operation type of the file operation. + GEMINI_CLI_FILE_OPERATION_TYPE = 57, + + // Logs the number of lines in the file operation. + GEMINI_CLI_FILE_OPERATION_LINES = 72, + + // Logs the mimetype of the file in the file operation. + GEMINI_CLI_FILE_OPERATION_MIMETYPE = 73, + + // Logs the extension of the file in the file operation. + GEMINI_CLI_FILE_OPERATION_EXTENSION = 74, + + // ========================================================================== + // Content Streaming Event Keys + // =========================================================================== + + // Logs the error message for an invalid chunk. + GEMINI_CLI_INVALID_CHUNK_ERROR_MESSAGE = 75, + + // Logs the attempt number for a content retry. + GEMINI_CLI_CONTENT_RETRY_ATTEMPT_NUMBER = 76, + + // Logs the error type for a content retry. + GEMINI_CLI_CONTENT_RETRY_ERROR_TYPE = 77, + + // Logs the delay in milliseconds for a content retry. + GEMINI_CLI_CONTENT_RETRY_DELAY_MS = 78, + + // Logs the total number of attempts for a content retry failure. + GEMINI_CLI_CONTENT_RETRY_FAILURE_TOTAL_ATTEMPTS = 79, + + // Logs the final error type for a content retry failure. + GEMINI_CLI_CONTENT_RETRY_FAILURE_FINAL_ERROR_TYPE = 80, + + // Logs the total duration in milliseconds for a content retry failure. + GEMINI_CLI_CONTENT_RETRY_FAILURE_TOTAL_DURATION_MS = 81, } diff --git a/packages/core/src/telemetry/constants.ts b/packages/core/src/telemetry/constants.ts index 0d978fb2f5..6420a3d125 100644 --- a/packages/core/src/telemetry/constants.ts +++ b/packages/core/src/telemetry/constants.ts @@ -17,6 +17,11 @@ export const EVENT_NEXT_SPEAKER_CHECK = 'gemini_cli.next_speaker_check'; export const EVENT_SLASH_COMMAND = 'gemini_cli.slash_command'; export const EVENT_IDE_CONNECTION = 'gemini_cli.ide_connection'; export const EVENT_CHAT_COMPRESSION = 'gemini_cli.chat_compression'; +export const EVENT_INVALID_CHUNK = 'gemini_cli.chat.invalid_chunk'; +export const EVENT_CONTENT_RETRY = 'gemini_cli.chat.content_retry'; +export const EVENT_CONTENT_RETRY_FAILURE = + 'gemini_cli.chat.content_retry_failure'; + export const METRIC_TOOL_CALL_COUNT = 'gemini_cli.tool.call.count'; export const METRIC_TOOL_CALL_LATENCY = 'gemini_cli.tool.call.latency'; export const METRIC_API_REQUEST_COUNT = 'gemini_cli.api.request.count'; @@ -24,3 +29,7 @@ export const METRIC_API_REQUEST_LATENCY = 'gemini_cli.api.request.latency'; export const METRIC_TOKEN_USAGE = 'gemini_cli.token.usage'; export const METRIC_SESSION_COUNT = 'gemini_cli.session.count'; export const METRIC_FILE_OPERATION_COUNT = 'gemini_cli.file.operation.count'; +export const METRIC_INVALID_CHUNK_COUNT = 'gemini_cli.chat.invalid_chunk.count'; +export const METRIC_CONTENT_RETRY_COUNT = 'gemini_cli.chat.content_retry.count'; +export const METRIC_CONTENT_RETRY_FAILURE_COUNT = + 'gemini_cli.chat.content_retry_failure.count'; diff --git a/packages/core/src/telemetry/loggers.ts b/packages/core/src/telemetry/loggers.ts index 9b9faf792f..f292403f16 100644 --- a/packages/core/src/telemetry/loggers.ts +++ b/packages/core/src/telemetry/loggers.ts @@ -20,6 +20,9 @@ import { SERVICE_NAME, EVENT_SLASH_COMMAND, EVENT_CHAT_COMPRESSION, + EVENT_INVALID_CHUNK, + EVENT_CONTENT_RETRY, + EVENT_CONTENT_RETRY_FAILURE, } from './constants.js'; import { ApiErrorEvent, @@ -35,6 +38,9 @@ import { SlashCommandEvent, KittySequenceOverflowEvent, ChatCompressionEvent, + InvalidChunkEvent, + ContentRetryEvent, + ContentRetryFailureEvent, } from './types.js'; import { recordApiErrorMetrics, @@ -42,6 +48,9 @@ import { recordApiResponseMetrics, recordToolCallMetrics, recordChatCompressionMetrics, + recordInvalidChunk, + recordContentRetry, + recordContentRetryFailure, } from './metrics.js'; import { isTelemetrySdkInitialized } from './sdk.js'; import { uiTelemetryService, UiEvent } from './uiTelemetry.js'; @@ -426,3 +435,72 @@ export function logKittySequenceOverflow( }; logger.emit(logRecord); } +export function logInvalidChunk( + config: Config, + event: InvalidChunkEvent, +): void { + ClearcutLogger.getInstance(config)?.logInvalidChunkEvent(event); + if (!isTelemetrySdkInitialized()) return; + + const attributes: LogAttributes = { + ...getCommonAttributes(config), + 'event.name': EVENT_INVALID_CHUNK, + 'event.timestamp': event['event.timestamp'], + }; + + if (event.error_message) { + attributes['error.message'] = event.error_message; + } + + const logger = logs.getLogger(SERVICE_NAME); + const logRecord: LogRecord = { + body: `Invalid chunk received from stream.`, + attributes, + }; + logger.emit(logRecord); + recordInvalidChunk(config); +} + +export function logContentRetry( + config: Config, + event: ContentRetryEvent, +): void { + ClearcutLogger.getInstance(config)?.logContentRetryEvent(event); + if (!isTelemetrySdkInitialized()) return; + + const attributes: LogAttributes = { + ...getCommonAttributes(config), + ...event, + 'event.name': EVENT_CONTENT_RETRY, + }; + + const logger = logs.getLogger(SERVICE_NAME); + const logRecord: LogRecord = { + body: `Content retry attempt ${event.attempt_number} due to ${event.error_type}.`, + attributes, + }; + logger.emit(logRecord); + recordContentRetry(config); +} + +export function logContentRetryFailure( + config: Config, + event: ContentRetryFailureEvent, +): void { + ClearcutLogger.getInstance(config)?.logContentRetryFailureEvent(event); + if (!isTelemetrySdkInitialized()) return; + + const attributes: LogAttributes = { + ...getCommonAttributes(config), + ...event, + 'event.name': EVENT_CONTENT_RETRY_FAILURE, + }; + + const logger = logs.getLogger(SERVICE_NAME); + const logRecord: LogRecord = { + body: `All content retries failed after ${event.total_attempts} attempts.`, + attributes, + }; + logger.emit(logRecord); + recordContentRetryFailure(config); +} diff --git a/packages/core/src/telemetry/metrics.ts b/packages/core/src/telemetry/metrics.ts index 4abcb4f0cd..a84abbff90 100644 --- a/packages/core/src/telemetry/metrics.ts +++ b/packages/core/src/telemetry/metrics.ts @@ -22,6 +22,9 @@ import { METRIC_SESSION_COUNT, METRIC_FILE_OPERATION_COUNT, EVENT_CHAT_COMPRESSION, + METRIC_INVALID_CHUNK_COUNT, + METRIC_CONTENT_RETRY_COUNT, + METRIC_CONTENT_RETRY_FAILURE_COUNT, } from './constants.js'; import { Config } from '../config/config.js'; import { DiffStat } from '../tools/tools.js'; @@ -40,6 +43,9 @@ let apiRequestLatencyHistogram: Histogram | undefined; let tokenUsageCounter: Counter | undefined; let fileOperationCounter: Counter | undefined; let chatCompressionCounter: Counter | undefined; +let invalidChunkCounter: Counter | undefined; +let contentRetryCounter: Counter | undefined; +let contentRetryFailureCounter: Counter | undefined; let isMetricsInitialized = false; function getCommonAttributes(config: Config): Attributes { @@ -94,6 +100,24 @@ export function initializeMetrics(config: Config): void { description: 'Counts chat compression events.', valueType: ValueType.INT, }); + + // New counters for content errors + invalidChunkCounter = meter.createCounter(METRIC_INVALID_CHUNK_COUNT, { + description: 'Counts invalid chunks received from a stream.', + valueType: ValueType.INT, + }); + contentRetryCounter = meter.createCounter(METRIC_CONTENT_RETRY_COUNT, { + description: 'Counts retries due to content errors (e.g., empty stream).', + valueType: ValueType.INT, + }); + contentRetryFailureCounter = meter.createCounter( + METRIC_CONTENT_RETRY_FAILURE_COUNT, + { + description: 'Counts occurrences of all content retries failing.', + valueType: ValueType.INT, + }, + ); + const sessionCounter = meter.createCounter(METRIC_SESSION_COUNT, { description: 'Count of CLI sessions started.', valueType: ValueType.INT, @@ -227,3 +251,29 @@ export function recordFileOperationMetric( } fileOperationCounter.add(1, attributes); } + +// --- New Metric Recording Functions --- + +/** + * Records a metric for when an invalid chunk is received from a stream. + */ +export function recordInvalidChunk(config: Config): void { + if (!invalidChunkCounter || !isMetricsInitialized) return; + invalidChunkCounter.add(1, getCommonAttributes(config)); +} + +/** + * Records a metric for when a retry is triggered due to a content error. + */ +export function recordContentRetry(config: Config): void { + if (!contentRetryCounter || !isMetricsInitialized) return; + contentRetryCounter.add(1, getCommonAttributes(config)); +} + +/** + * Records a metric for when all content error retries have failed for a request. + */ +export function recordContentRetryFailure(config: Config): void { + if (!contentRetryFailureCounter || !isMetricsInitialized) return; + contentRetryFailureCounter.add(1, getCommonAttributes(config)); +} diff --git a/packages/core/src/telemetry/types.ts b/packages/core/src/telemetry/types.ts index 0a49ca02f1..10ac5da1fb 100644 --- a/packages/core/src/telemetry/types.ts +++ b/packages/core/src/telemetry/types.ts @@ -385,6 +385,59 @@ export class KittySequenceOverflowEvent { } } +// Add these new event interfaces +export class InvalidChunkEvent implements BaseTelemetryEvent { + 'event.name': 'invalid_chunk'; + 'event.timestamp': string; + error_message?: string; // Optional: validation error details + + constructor(error_message?: string) { + this['event.name'] = 'invalid_chunk'; + this['event.timestamp'] = new Date().toISOString(); + this.error_message = error_message; + } +} + +export class ContentRetryEvent implements BaseTelemetryEvent { + 'event.name': 'content_retry'; + 'event.timestamp': string; + attempt_number: number; + error_type: string; // e.g., 'EmptyStreamError' + retry_delay_ms: number; + + constructor( + attempt_number: number, + error_type: string, + retry_delay_ms: number, + ) { + this['event.name'] = 'content_retry'; + this['event.timestamp'] = new Date().toISOString(); + this.attempt_number = attempt_number; + this.error_type = error_type; + this.retry_delay_ms = retry_delay_ms; + } +} + +export class ContentRetryFailureEvent implements BaseTelemetryEvent { + 'event.name': 'content_retry_failure'; + 'event.timestamp': string; + total_attempts: number; + final_error_type: string; + total_duration_ms?: number; // Optional: total time spent retrying + + constructor( + total_attempts: number, + final_error_type: string, + total_duration_ms?: number, + ) { + this['event.name'] = 'content_retry_failure'; + this['event.timestamp'] = new Date().toISOString(); + this.total_attempts = total_attempts; + this.final_error_type = final_error_type; + this.total_duration_ms = total_duration_ms; + } +} + export type TelemetryEvent = | StartSessionEvent | EndSessionEvent @@ -399,4 +452,7 @@ export type TelemetryEvent = | KittySequenceOverflowEvent | MalformedJsonResponseEvent | IdeConnectionEvent - | SlashCommandEvent; + | SlashCommandEvent + | InvalidChunkEvent + | ContentRetryEvent + | ContentRetryFailureEvent;