mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-03 16:34:31 -07:00
feat(telemetry): implement retry attempt telemetry for network related retries (#22027)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -27,6 +27,7 @@ import type {
|
||||
InvalidChunkEvent,
|
||||
ContentRetryEvent,
|
||||
ContentRetryFailureEvent,
|
||||
NetworkRetryAttemptEvent,
|
||||
ExtensionInstallEvent,
|
||||
ToolOutputTruncatedEvent,
|
||||
ExtensionUninstallEvent,
|
||||
@@ -94,6 +95,7 @@ export enum EventNames {
|
||||
INVALID_CHUNK = 'invalid_chunk',
|
||||
CONTENT_RETRY = 'content_retry',
|
||||
CONTENT_RETRY_FAILURE = 'content_retry_failure',
|
||||
RETRY_ATTEMPT = 'retry_attempt',
|
||||
EXTENSION_ENABLE = 'extension_enable',
|
||||
EXTENSION_DISABLE = 'extension_disable',
|
||||
EXTENSION_INSTALL = 'extension_install',
|
||||
@@ -1231,6 +1233,32 @@ export class ClearcutLogger {
|
||||
this.flushIfNeeded();
|
||||
}
|
||||
|
||||
logNetworkRetryAttemptEvent(event: NetworkRetryAttemptEvent): void {
|
||||
// This event is generic for any retry attempt (Gemini, WebFetch, etc.)
|
||||
const data: EventValue[] = [
|
||||
{
|
||||
gemini_cli_key:
|
||||
EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER,
|
||||
value: String(event.attempt),
|
||||
},
|
||||
{
|
||||
gemini_cli_key: EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_DELAY_MS,
|
||||
value: String(event.delay_ms),
|
||||
},
|
||||
{
|
||||
gemini_cli_key: EventMetadataKey.GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE,
|
||||
value: event.error_type,
|
||||
},
|
||||
{
|
||||
gemini_cli_key: EventMetadataKey.GEMINI_CLI_API_REQUEST_MODEL,
|
||||
value: event.model,
|
||||
},
|
||||
];
|
||||
|
||||
this.enqueueLogEvent(this.createLogEvent(EventNames.RETRY_ATTEMPT, data));
|
||||
this.flushIfNeeded();
|
||||
}
|
||||
|
||||
async logExtensionInstallEvent(event: ExtensionInstallEvent): Promise<void> {
|
||||
const data: EventValue[] = [
|
||||
{
|
||||
|
||||
@@ -674,4 +674,17 @@ export enum EventMetadataKey {
|
||||
|
||||
// Logs the error message for Conseca events.
|
||||
CONSECA_ERROR = 166,
|
||||
|
||||
// ==========================================================================
|
||||
// Network Retry Event Keys
|
||||
// ==========================================================================
|
||||
|
||||
// Logs the attempt number for a network retry.
|
||||
GEMINI_CLI_NETWORK_RETRY_ATTEMPT_NUMBER = 180,
|
||||
|
||||
// Logs the delay in milliseconds for a network retry.
|
||||
GEMINI_CLI_NETWORK_RETRY_DELAY_MS = 181,
|
||||
|
||||
// Logs the error type for a network retry.
|
||||
GEMINI_CLI_NETWORK_RETRY_ERROR_TYPE = 182,
|
||||
}
|
||||
|
||||
@@ -46,6 +46,7 @@ export {
|
||||
logExtensionUninstall,
|
||||
logExtensionUpdateEvent,
|
||||
logWebFetchFallbackAttempt,
|
||||
logNetworkRetryAttempt,
|
||||
logRewind,
|
||||
} from './loggers.js';
|
||||
export {
|
||||
@@ -66,6 +67,7 @@ export {
|
||||
ConversationFinishedEvent,
|
||||
ToolOutputTruncatedEvent,
|
||||
WebFetchFallbackAttemptEvent,
|
||||
NetworkRetryAttemptEvent,
|
||||
ToolCallDecision,
|
||||
RewindEvent,
|
||||
ConsecaPolicyGenerationEvent,
|
||||
@@ -111,6 +113,7 @@ export {
|
||||
recordApiErrorMetrics,
|
||||
recordFileOperationMetric,
|
||||
recordInvalidChunk,
|
||||
recordRetryAttemptMetrics,
|
||||
recordContentRetry,
|
||||
recordContentRetryFailure,
|
||||
recordModelRoutingMetrics,
|
||||
|
||||
@@ -45,6 +45,7 @@ import {
|
||||
logAgentStart,
|
||||
logAgentFinish,
|
||||
logWebFetchFallbackAttempt,
|
||||
logNetworkRetryAttempt,
|
||||
logExtensionUpdateEvent,
|
||||
logHookCall,
|
||||
} from './loggers.js';
|
||||
@@ -70,6 +71,7 @@ import {
|
||||
EVENT_AGENT_FINISH,
|
||||
EVENT_WEB_FETCH_FALLBACK_ATTEMPT,
|
||||
EVENT_INVALID_CHUNK,
|
||||
EVENT_NETWORK_RETRY_ATTEMPT,
|
||||
ApiErrorEvent,
|
||||
ApiRequestEvent,
|
||||
ApiResponseEvent,
|
||||
@@ -91,6 +93,7 @@ import {
|
||||
AgentStartEvent,
|
||||
AgentFinishEvent,
|
||||
WebFetchFallbackAttemptEvent,
|
||||
NetworkRetryAttemptEvent,
|
||||
ExtensionUpdateEvent,
|
||||
EVENT_EXTENSION_UPDATE,
|
||||
HookCallEvent,
|
||||
@@ -2429,6 +2432,56 @@ describe('loggers', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('logNetworkRetryAttempt', () => {
|
||||
const mockConfig = makeFakeConfig();
|
||||
|
||||
beforeEach(() => {
|
||||
vi.spyOn(ClearcutLogger.prototype, 'logNetworkRetryAttemptEvent');
|
||||
vi.spyOn(metrics, 'recordRetryAttemptMetrics');
|
||||
});
|
||||
|
||||
it('logs the network retry attempt event to Clearcut and OTEL', () => {
|
||||
const event = new NetworkRetryAttemptEvent(
|
||||
2,
|
||||
5,
|
||||
'Overloaded',
|
||||
1000,
|
||||
'test-model',
|
||||
);
|
||||
|
||||
logNetworkRetryAttempt(mockConfig, event);
|
||||
|
||||
expect(
|
||||
ClearcutLogger.prototype.logNetworkRetryAttemptEvent,
|
||||
).toHaveBeenCalledWith(event);
|
||||
|
||||
expect(mockLogger.emit).toHaveBeenCalledWith({
|
||||
body: 'Network retry attempt 2/5 for test-model. Delay: 1000ms. Error type: Overloaded',
|
||||
attributes: {
|
||||
'session.id': 'test-session-id',
|
||||
'user.email': 'test-user@example.com',
|
||||
'installation.id': 'test-installation-id',
|
||||
'event.name': EVENT_NETWORK_RETRY_ATTEMPT,
|
||||
'event.timestamp': '2025-01-01T00:00:00.000Z',
|
||||
interactive: false,
|
||||
attempt: 2,
|
||||
max_attempts: 5,
|
||||
error_type: 'Overloaded',
|
||||
delay_ms: 1000,
|
||||
model: 'test-model',
|
||||
},
|
||||
});
|
||||
|
||||
expect(metrics.recordRetryAttemptMetrics).toHaveBeenCalledWith(
|
||||
mockConfig,
|
||||
{
|
||||
model: 'test-model',
|
||||
attempt: 2,
|
||||
},
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('Telemetry Buffering', () => {
|
||||
it('should buffer events when SDK is not initialized', async () => {
|
||||
vi.spyOn(sdk, 'isTelemetrySdkInitialized').mockReturnValue(false);
|
||||
|
||||
@@ -32,6 +32,7 @@ import {
|
||||
type InvalidChunkEvent,
|
||||
type ContentRetryEvent,
|
||||
type ContentRetryFailureEvent,
|
||||
type NetworkRetryAttemptEvent,
|
||||
type RipgrepFallbackEvent,
|
||||
type ToolOutputTruncatedEvent,
|
||||
type ModelRoutingEvent,
|
||||
@@ -62,6 +63,7 @@ import {
|
||||
recordToolCallMetrics,
|
||||
recordChatCompressionMetrics,
|
||||
recordFileOperationMetric,
|
||||
recordRetryAttemptMetrics,
|
||||
recordContentRetry,
|
||||
recordContentRetryFailure,
|
||||
recordModelRoutingMetrics,
|
||||
@@ -485,6 +487,25 @@ export function logInvalidChunk(
|
||||
});
|
||||
}
|
||||
|
||||
export function logNetworkRetryAttempt(
|
||||
config: Config,
|
||||
event: NetworkRetryAttemptEvent,
|
||||
): void {
|
||||
ClearcutLogger.getInstance(config)?.logNetworkRetryAttemptEvent(event);
|
||||
bufferTelemetryEvent(() => {
|
||||
const logger = logs.getLogger(SERVICE_NAME);
|
||||
const logRecord: LogRecord = {
|
||||
body: event.toLogBody(),
|
||||
attributes: event.toOpenTelemetryAttributes(config),
|
||||
};
|
||||
logger.emit(logRecord);
|
||||
recordRetryAttemptMetrics(config, {
|
||||
model: event.model,
|
||||
attempt: event.attempt,
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
export function logContentRetry(
|
||||
config: Config,
|
||||
event: ContentRetryEvent,
|
||||
|
||||
@@ -40,6 +40,7 @@ const INVALID_CHUNK_COUNT = 'gemini_cli.chat.invalid_chunk.count';
|
||||
const CONTENT_RETRY_COUNT = 'gemini_cli.chat.content_retry.count';
|
||||
const CONTENT_RETRY_FAILURE_COUNT =
|
||||
'gemini_cli.chat.content_retry_failure.count';
|
||||
const NETWORK_RETRY_COUNT = 'gemini_cli.network_retry.count';
|
||||
const MODEL_ROUTING_LATENCY = 'gemini_cli.model_routing.latency';
|
||||
const MODEL_ROUTING_FAILURE_COUNT = 'gemini_cli.model_routing.failure.count';
|
||||
const MODEL_SLASH_COMMAND_CALL_COUNT =
|
||||
@@ -166,6 +167,16 @@ const COUNTER_DEFINITIONS = {
|
||||
assign: (c: Counter) => (contentRetryFailureCounter = c),
|
||||
attributes: {} as Record<string, never>,
|
||||
},
|
||||
[NETWORK_RETRY_COUNT]: {
|
||||
description: 'Counts network retries.',
|
||||
valueType: ValueType.INT,
|
||||
assign: (c: Counter) => (networkRetryCounter = c),
|
||||
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
|
||||
attributes: {} as {
|
||||
model: string;
|
||||
attempt: number;
|
||||
},
|
||||
},
|
||||
[MODEL_ROUTING_FAILURE_COUNT]: {
|
||||
description: 'Counts model routing failures.',
|
||||
valueType: ValueType.INT,
|
||||
@@ -610,6 +621,7 @@ let chatCompressionCounter: Counter | undefined;
|
||||
let invalidChunkCounter: Counter | undefined;
|
||||
let contentRetryCounter: Counter | undefined;
|
||||
let contentRetryFailureCounter: Counter | undefined;
|
||||
let networkRetryCounter: Counter | undefined;
|
||||
let modelRoutingLatencyHistogram: Histogram | undefined;
|
||||
let modelRoutingFailureCounter: Counter | undefined;
|
||||
let modelSlashCommandCallCounter: Counter | undefined;
|
||||
@@ -848,6 +860,20 @@ export function recordInvalidChunk(config: Config): void {
|
||||
invalidChunkCounter.add(1, baseMetricDefinition.getCommonAttributes(config));
|
||||
}
|
||||
|
||||
export function recordRetryAttemptMetrics(
|
||||
config: Config,
|
||||
attributes: {
|
||||
model: string;
|
||||
attempt: number;
|
||||
},
|
||||
): void {
|
||||
if (!networkRetryCounter || !isMetricsInitialized) return;
|
||||
networkRetryCounter.add(1, {
|
||||
...baseMetricDefinition.getCommonAttributes(config),
|
||||
...attributes,
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Records a metric for when a retry is triggered due to a content error.
|
||||
*/
|
||||
|
||||
@@ -1341,6 +1341,51 @@ export class ContentRetryEvent implements BaseTelemetryEvent {
|
||||
|
||||
export const EVENT_CONTENT_RETRY_FAILURE =
|
||||
'gemini_cli.chat.content_retry_failure';
|
||||
|
||||
export const EVENT_NETWORK_RETRY_ATTEMPT = 'gemini_cli.network_retry_attempt';
|
||||
export class NetworkRetryAttemptEvent implements BaseTelemetryEvent {
|
||||
'event.name': 'network_retry_attempt';
|
||||
'event.timestamp': string;
|
||||
attempt: number;
|
||||
max_attempts: number;
|
||||
error_type: string;
|
||||
delay_ms: number;
|
||||
model: string;
|
||||
|
||||
constructor(
|
||||
attempt: number,
|
||||
max_attempts: number,
|
||||
error_type: string,
|
||||
delay_ms: number,
|
||||
model: string,
|
||||
) {
|
||||
this['event.name'] = 'network_retry_attempt';
|
||||
this['event.timestamp'] = new Date().toISOString();
|
||||
this.attempt = attempt;
|
||||
this.max_attempts = max_attempts;
|
||||
this.error_type = error_type;
|
||||
this.delay_ms = delay_ms;
|
||||
this.model = model;
|
||||
}
|
||||
|
||||
toOpenTelemetryAttributes(config: Config): LogAttributes {
|
||||
return {
|
||||
...getCommonAttributes(config),
|
||||
'event.name': EVENT_NETWORK_RETRY_ATTEMPT,
|
||||
'event.timestamp': this['event.timestamp'],
|
||||
attempt: this.attempt,
|
||||
max_attempts: this.max_attempts,
|
||||
error_type: this.error_type,
|
||||
delay_ms: this.delay_ms,
|
||||
model: this.model,
|
||||
};
|
||||
}
|
||||
|
||||
toLogBody(): string {
|
||||
return `Network retry attempt ${this.attempt}/${this.max_attempts} for ${this.model}. Delay: ${this.delay_ms}ms. Error type: ${this.error_type}`;
|
||||
}
|
||||
}
|
||||
|
||||
export class ContentRetryFailureEvent implements BaseTelemetryEvent {
|
||||
'event.name': 'content_retry_failure';
|
||||
'event.timestamp': string;
|
||||
|
||||
Reference in New Issue
Block a user