Files
gemini-cli/packages/core/src/telemetry/metrics.ts
2025-11-24 21:13:24 +00:00

1215 lines
36 KiB
TypeScript

/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import type { Attributes, Meter, Counter, Histogram } from '@opentelemetry/api';
import { diag, metrics, ValueType } from '@opentelemetry/api';
import { SERVICE_NAME } from './constants.js';
import type { Config } from '../config/config.js';
import type {
ModelRoutingEvent,
ModelSlashCommandEvent,
AgentFinishEvent,
RecoveryAttemptEvent,
} from './types.js';
import { AuthType } from '../core/contentGenerator.js';
import { getCommonAttributes } from './telemetryAttributes.js';
import { sanitizeHookName } from './sanitize.js';
// Instrument names. Each constant below is used as a key in one of the
// definition tables in this file (COUNTER_DEFINITIONS, HISTOGRAM_DEFINITIONS,
// PERFORMANCE_COUNTER_DEFINITIONS, PERFORMANCE_HISTOGRAM_DEFINITIONS) and is
// passed as the instrument name to `createCounter` / `createHistogram` when
// the metrics are initialized.
const EVENT_CHAT_COMPRESSION = 'gemini_cli.chat_compression';
const TOOL_CALL_COUNT = 'gemini_cli.tool.call.count';
const TOOL_CALL_LATENCY = 'gemini_cli.tool.call.latency';
const API_REQUEST_COUNT = 'gemini_cli.api.request.count';
const API_REQUEST_LATENCY = 'gemini_cli.api.request.latency';
const TOKEN_USAGE = 'gemini_cli.token.usage';
const SESSION_COUNT = 'gemini_cli.session.count';
const FILE_OPERATION_COUNT = 'gemini_cli.file.operation.count';
const LINES_CHANGED = 'gemini_cli.lines.changed';
const INVALID_CHUNK_COUNT = 'gemini_cli.chat.invalid_chunk.count';
const CONTENT_RETRY_COUNT = 'gemini_cli.chat.content_retry.count';
const CONTENT_RETRY_FAILURE_COUNT =
'gemini_cli.chat.content_retry_failure.count';
const MODEL_ROUTING_LATENCY = 'gemini_cli.model_routing.latency';
const MODEL_ROUTING_FAILURE_COUNT = 'gemini_cli.model_routing.failure.count';
const MODEL_SLASH_COMMAND_CALL_COUNT =
'gemini_cli.slash_command.model.call_count';
const EVENT_HOOK_CALL_COUNT = 'gemini_cli.hook_call.count';
const EVENT_HOOK_CALL_LATENCY = 'gemini_cli.hook_call.latency';
// Agent Metrics
const AGENT_RUN_COUNT = 'gemini_cli.agent.run.count';
const AGENT_DURATION_MS = 'gemini_cli.agent.duration';
const AGENT_TURNS = 'gemini_cli.agent.turns';
const AGENT_RECOVERY_ATTEMPT_COUNT = 'gemini_cli.agent.recovery_attempt.count';
const AGENT_RECOVERY_ATTEMPT_DURATION =
'gemini_cli.agent.recovery_attempt.duration';
// OpenTelemetry GenAI Semantic Convention Metrics
// (these two names use the `gen_ai.` prefix rather than `gemini_cli.`)
const GEN_AI_CLIENT_TOKEN_USAGE = 'gen_ai.client.token.usage';
const GEN_AI_CLIENT_OPERATION_DURATION = 'gen_ai.client.operation.duration';
// Performance Monitoring Metrics
// (registered by initializePerformanceMonitoring, except the last three:
// FLICKER_FRAME_COUNT, SLOW_RENDER_LATENCY and EXIT_FAIL_COUNT are keys of the
// core COUNTER_DEFINITIONS / HISTOGRAM_DEFINITIONS tables)
const STARTUP_TIME = 'gemini_cli.startup.duration';
const MEMORY_USAGE = 'gemini_cli.memory.usage';
const CPU_USAGE = 'gemini_cli.cpu.usage';
const TOOL_QUEUE_DEPTH = 'gemini_cli.tool.queue.depth';
const TOOL_EXECUTION_BREAKDOWN = 'gemini_cli.tool.execution.breakdown';
const TOKEN_EFFICIENCY = 'gemini_cli.token.efficiency';
const API_REQUEST_BREAKDOWN = 'gemini_cli.api.request.breakdown';
const PERFORMANCE_SCORE = 'gemini_cli.performance.score';
const REGRESSION_DETECTION = 'gemini_cli.performance.regression';
const REGRESSION_PERCENTAGE_CHANGE =
'gemini_cli.performance.regression.percentage_change';
const BASELINE_COMPARISON = 'gemini_cli.performance.baseline.comparison';
const FLICKER_FRAME_COUNT = 'gemini_cli.ui.flicker.count';
const SLOW_RENDER_LATENCY = 'gemini_cli.ui.slow_render.latency';
const EXIT_FAIL_COUNT = 'gemini_cli.exit.fail.count';
// Shared helper bundle for the recording functions in this file: every
// recorder calls `baseMetricDefinition.getCommonAttributes(config)` to obtain
// the attributes that are attached to each data point.
const baseMetricDefinition = {
getCommonAttributes,
};
// Registry of the core counter instruments, keyed by instrument name.
//
// For every entry:
// - `description` and `valueType` are passed to `meter.createCounter` by
//   initializeMetrics.
// - `assign` stores the created counter in the matching module-level variable
//   so the record* functions can use it.
// - `attributes` is a type-only placeholder (an empty object cast to the
//   attribute shape); its type is what `MetricDefinitions[name]['attributes']`
//   resolves to. The runtime value is always `{}`.
const COUNTER_DEFINITIONS = {
[TOOL_CALL_COUNT]: {
description: 'Counts tool calls, tagged by function name and success.',
valueType: ValueType.INT,
assign: (c: Counter) => (toolCallCounter = c),
attributes: {} as {
function_name: string;
success: boolean;
decision?: 'accept' | 'reject' | 'modify' | 'auto_accept';
tool_type?: 'native' | 'mcp';
},
},
[API_REQUEST_COUNT]: {
description: 'Counts API requests, tagged by model and status.',
valueType: ValueType.INT,
assign: (c: Counter) => (apiRequestCounter = c),
attributes: {} as {
model: string;
status_code?: number | string;
error_type?: string;
},
},
[TOKEN_USAGE]: {
description: 'Counts the total number of tokens used.',
valueType: ValueType.INT,
assign: (c: Counter) => (tokenUsageCounter = c),
attributes: {} as {
model: string;
type: 'input' | 'output' | 'thought' | 'cache' | 'tool';
},
},
// Incremented once by initializeMetrics after the core instruments exist.
[SESSION_COUNT]: {
description: 'Count of CLI sessions started.',
valueType: ValueType.INT,
assign: (c: Counter) => (sessionCounter = c),
attributes: {} as Record<string, never>,
},
[FILE_OPERATION_COUNT]: {
description: 'Counts file operations (create, read, update).',
valueType: ValueType.INT,
assign: (c: Counter) => (fileOperationCounter = c),
attributes: {} as {
operation: FileOperation;
lines?: number;
mimetype?: string;
extension?: string;
programming_language?: string;
},
},
[LINES_CHANGED]: {
description: 'Number of lines changed (from file diffs).',
valueType: ValueType.INT,
assign: (c: Counter) => (linesChangedCounter = c),
attributes: {} as {
function_name?: string;
type: 'added' | 'removed';
},
},
[INVALID_CHUNK_COUNT]: {
description: 'Counts invalid chunks received from a stream.',
valueType: ValueType.INT,
assign: (c: Counter) => (invalidChunkCounter = c),
attributes: {} as Record<string, never>,
},
[CONTENT_RETRY_COUNT]: {
description: 'Counts retries due to content errors (e.g., empty stream).',
valueType: ValueType.INT,
assign: (c: Counter) => (contentRetryCounter = c),
attributes: {} as Record<string, never>,
},
[CONTENT_RETRY_FAILURE_COUNT]: {
description: 'Counts occurrences of all content retries failing.',
valueType: ValueType.INT,
assign: (c: Counter) => (contentRetryFailureCounter = c),
attributes: {} as Record<string, never>,
},
[MODEL_ROUTING_FAILURE_COUNT]: {
description: 'Counts model routing failures.',
valueType: ValueType.INT,
assign: (c: Counter) => (modelRoutingFailureCounter = c),
attributes: {} as {
'routing.decision_source': string;
'routing.error_message': string;
},
},
[MODEL_SLASH_COMMAND_CALL_COUNT]: {
description: 'Counts model slash command calls.',
valueType: ValueType.INT,
assign: (c: Counter) => (modelSlashCommandCallCounter = c),
attributes: {} as {
'slash_command.model.model_name': string;
},
},
[EVENT_CHAT_COMPRESSION]: {
description: 'Counts chat compression events.',
valueType: ValueType.INT,
assign: (c: Counter) => (chatCompressionCounter = c),
attributes: {} as {
tokens_before: number;
tokens_after: number;
},
},
[AGENT_RUN_COUNT]: {
description: 'Counts agent runs, tagged by name and termination reason.',
valueType: ValueType.INT,
assign: (c: Counter) => (agentRunCounter = c),
attributes: {} as {
agent_name: string;
terminate_reason: string;
},
},
[AGENT_RECOVERY_ATTEMPT_COUNT]: {
description: 'Counts agent recovery attempts.',
valueType: ValueType.INT,
assign: (c: Counter) => (agentRecoveryAttemptCounter = c),
attributes: {} as {
agent_name: string;
reason: string;
success: boolean;
},
},
[FLICKER_FRAME_COUNT]: {
description:
'Counts UI frames that flicker (render taller than the terminal).',
valueType: ValueType.INT,
assign: (c: Counter) => (flickerFrameCounter = c),
attributes: {} as Record<string, never>,
},
[EXIT_FAIL_COUNT]: {
description: 'Counts CLI exit failures.',
valueType: ValueType.INT,
assign: (c: Counter) => (exitFailCounter = c),
attributes: {} as Record<string, never>,
},
[EVENT_HOOK_CALL_COUNT]: {
description: 'Counts hook calls, tagged by hook event name and success.',
valueType: ValueType.INT,
assign: (c: Counter) => (hookCallCounter = c),
attributes: {} as {
hook_event_name: string;
hook_name: string;
success: boolean;
},
},
} as const;
// Registry of the core histogram instruments, keyed by instrument name.
//
// For every entry:
// - `description`, `unit` and `valueType` are passed to
//   `meter.createHistogram` by initializeMetrics.
// - `assign` stores the created histogram in the matching module-level
//   variable so the record* functions can use it.
// - `attributes` is a type-only placeholder (an empty object cast to the
//   attribute shape); its type is what `MetricDefinitions[name]['attributes']`
//   resolves to. The runtime value is always `{}`.
const HISTOGRAM_DEFINITIONS = {
[TOOL_CALL_LATENCY]: {
description: 'Latency of tool calls in milliseconds.',
unit: 'ms',
valueType: ValueType.INT,
assign: (h: Histogram) => (toolCallLatencyHistogram = h),
attributes: {} as {
function_name: string;
},
},
[API_REQUEST_LATENCY]: {
description: 'Latency of API requests in milliseconds.',
unit: 'ms',
valueType: ValueType.INT,
assign: (h: Histogram) => (apiRequestLatencyHistogram = h),
attributes: {} as {
model: string;
},
},
[MODEL_ROUTING_LATENCY]: {
description: 'Latency of model routing decisions in milliseconds.',
unit: 'ms',
valueType: ValueType.INT,
assign: (h: Histogram) => (modelRoutingLatencyHistogram = h),
attributes: {} as {
'routing.decision_model': string;
'routing.decision_source': string;
},
},
[AGENT_DURATION_MS]: {
description: 'Duration of agent runs in milliseconds.',
unit: 'ms',
valueType: ValueType.INT,
assign: (h: Histogram) => (agentDurationHistogram = h),
attributes: {} as {
agent_name: string;
},
},
// NOTE(review): the description says "Counts ..." but this instrument is a
// histogram with unit 'ms' that records render latency (see recordSlowRender);
// consider rewording the description.
[SLOW_RENDER_LATENCY]: {
description: 'Counts UI frames that take too long to render.',
unit: 'ms',
valueType: ValueType.INT,
assign: (h: Histogram) => (slowRenderHistogram = h),
attributes: {} as Record<string, never>,
},
[AGENT_TURNS]: {
description: 'Number of turns taken by agents.',
unit: 'turns',
valueType: ValueType.INT,
assign: (h: Histogram) => (agentTurnsHistogram = h),
attributes: {} as {
agent_name: string;
},
},
[AGENT_RECOVERY_ATTEMPT_DURATION]: {
description: 'Duration of agent recovery attempts in milliseconds.',
unit: 'ms',
valueType: ValueType.INT,
assign: (h: Histogram) => (agentRecoveryAttemptDurationHistogram = h),
attributes: {} as {
agent_name: string;
},
},
[GEN_AI_CLIENT_TOKEN_USAGE]: {
description: 'Number of input and output tokens used.',
unit: 'token',
valueType: ValueType.INT,
assign: (h: Histogram) => (genAiClientTokenUsageHistogram = h),
attributes: {} as {
'gen_ai.operation.name': string;
'gen_ai.provider.name': string;
'gen_ai.token.type': 'input' | 'output';
'gen_ai.request.model'?: string;
'gen_ai.response.model'?: string;
'server.address'?: string;
'server.port'?: number;
},
},
// Unlike the millisecond histograms above, this one is declared in seconds
// with a DOUBLE value type.
[GEN_AI_CLIENT_OPERATION_DURATION]: {
description: 'GenAI operation duration.',
unit: 's',
valueType: ValueType.DOUBLE,
assign: (h: Histogram) => (genAiClientOperationDurationHistogram = h),
attributes: {} as {
'gen_ai.operation.name': string;
'gen_ai.provider.name': string;
'gen_ai.request.model'?: string;
'gen_ai.response.model'?: string;
'server.address'?: string;
'server.port'?: number;
'error.type'?: string;
},
},
[EVENT_HOOK_CALL_LATENCY]: {
description: 'Latency of hook calls in milliseconds.',
unit: 'ms',
valueType: ValueType.INT,
assign: (c: Histogram) => (hookCallLatencyHistogram = c),
attributes: {} as {
hook_event_name: string;
hook_name: string;
success: boolean;
},
},
} as const;
// Registry of performance-monitoring counters, keyed by instrument name.
// These are created by initializePerformanceMonitoring (only when it finds
// performance monitoring enabled), not by the core loop in initializeMetrics.
// Entry fields have the same meaning as in COUNTER_DEFINITIONS: `description`
// and `valueType` go to `createCounter`, `assign` stores the instrument, and
// `attributes` is a type-only placeholder.
const PERFORMANCE_COUNTER_DEFINITIONS = {
[REGRESSION_DETECTION]: {
description: 'Performance regression detection events.',
valueType: ValueType.INT,
assign: (c: Counter) => (regressionDetectionCounter = c),
attributes: {} as {
metric: string;
severity: 'low' | 'medium' | 'high';
current_value: number;
baseline_value: number;
},
},
} as const;
// Registry of performance-monitoring histograms, keyed by instrument name.
// These are created by initializePerformanceMonitoring (only when it finds
// performance monitoring enabled). Entry fields: `description`, `unit` and
// `valueType` go to `createHistogram`, `assign` stores the instrument in its
// module-level variable, and `attributes` is a type-only placeholder.
//
// Several target variables are named `...Gauge` (memory, CPU, queue depth,
// performance score) but are created here as histograms.
const PERFORMANCE_HISTOGRAM_DEFINITIONS = {
[STARTUP_TIME]: {
description:
'CLI startup time in milliseconds, broken down by initialization phase.',
unit: 'ms',
valueType: ValueType.DOUBLE,
assign: (h: Histogram) => (startupTimeHistogram = h),
attributes: {} as {
phase: string;
// Nested here, but recordStartupPerformance spreads these keys flat into
// the recorded attributes.
details?: Record<string, string | number | boolean>;
},
},
[MEMORY_USAGE]: {
description: 'Memory usage in bytes.',
unit: 'bytes',
valueType: ValueType.INT,
assign: (h: Histogram) => (memoryUsageGauge = h),
attributes: {} as {
memory_type: MemoryMetricType;
component?: string;
},
},
[CPU_USAGE]: {
description: 'CPU usage percentage.',
unit: 'percent',
valueType: ValueType.DOUBLE,
assign: (h: Histogram) => (cpuUsageGauge = h),
attributes: {} as {
component?: string;
},
},
[TOOL_QUEUE_DEPTH]: {
description: 'Number of tools in execution queue.',
unit: 'count',
valueType: ValueType.INT,
assign: (h: Histogram) => (toolQueueDepthGauge = h),
attributes: {} as Record<string, never>,
},
[TOOL_EXECUTION_BREAKDOWN]: {
description: 'Tool execution time breakdown by phase in milliseconds.',
unit: 'ms',
valueType: ValueType.INT,
assign: (h: Histogram) => (toolExecutionBreakdownHistogram = h),
attributes: {} as {
function_name: string;
phase: ToolExecutionPhase;
},
},
[TOKEN_EFFICIENCY]: {
description:
'Token efficiency metrics (tokens per operation, cache hit rate, etc.).',
unit: 'ratio',
valueType: ValueType.DOUBLE,
assign: (h: Histogram) => (tokenEfficiencyHistogram = h),
attributes: {} as {
model: string;
metric: string;
context?: string;
},
},
[API_REQUEST_BREAKDOWN]: {
description: 'API request time breakdown by phase in milliseconds.',
unit: 'ms',
valueType: ValueType.INT,
assign: (h: Histogram) => (apiRequestBreakdownHistogram = h),
attributes: {} as {
model: string;
phase: ApiRequestPhase;
},
},
[PERFORMANCE_SCORE]: {
description: 'Composite performance score (0-100).',
unit: 'score',
valueType: ValueType.DOUBLE,
assign: (h: Histogram) => (performanceScoreGauge = h),
attributes: {} as {
category: string;
baseline?: number;
},
},
// Recorded by recordPerformanceRegression alongside the regression counter,
// using the same attribute shape.
[REGRESSION_PERCENTAGE_CHANGE]: {
description:
'Percentage change compared to baseline for detected regressions.',
unit: 'percent',
valueType: ValueType.DOUBLE,
assign: (h: Histogram) => (regressionPercentageChangeHistogram = h),
attributes: {} as {
metric: string;
severity: 'low' | 'medium' | 'high';
current_value: number;
baseline_value: number;
},
},
[BASELINE_COMPARISON]: {
description:
'Performance comparison to established baseline (percentage change).',
unit: 'percent',
valueType: ValueType.DOUBLE,
assign: (h: Histogram) => (baselineComparisonHistogram = h),
attributes: {} as {
metric: string;
category: string;
current_value: number;
baseline_value: number;
},
},
} as const;
/** Intersection of all four definition tables, so every instrument name is a key. */
type AllMetricDefs = typeof COUNTER_DEFINITIONS &
typeof HISTOGRAM_DEFINITIONS &
typeof PERFORMANCE_COUNTER_DEFINITIONS &
typeof PERFORMANCE_HISTOGRAM_DEFINITIONS;
/**
 * Maps each instrument name to the attribute shape declared for it in the
 * definition tables. Recording functions use
 * `MetricDefinitions[typeof NAME]['attributes']` to type their `attributes`
 * parameter.
 */
export type MetricDefinitions = {
[K in keyof AllMetricDefs]: {
attributes: AllMetricDefs[K]['attributes'];
};
};
/** Kind of file operation; the type of the `operation` attribute on the file operation counter. */
export enum FileOperation {
CREATE = 'create',
READ = 'read',
UPDATE = 'update',
}
/**
 * Categories of performance metrics.
 * NOTE(review): not referenced elsewhere in the visible part of this file —
 * presumably consumed by other modules; confirm before changing.
 */
export enum PerformanceMetricType {
STARTUP = 'startup',
MEMORY = 'memory',
CPU = 'cpu',
TOOL_EXECUTION = 'tool_execution',
API_REQUEST = 'api_request',
TOKEN_EFFICIENCY = 'token_efficiency',
}
/** Kind of memory measurement; the type of the `memory_type` attribute on the memory usage metric. */
export enum MemoryMetricType {
HEAP_USED = 'heap_used',
HEAP_TOTAL = 'heap_total',
EXTERNAL = 'external',
RSS = 'rss',
}
/** Phase of a tool execution; the type of the `phase` attribute on the tool execution breakdown metric. */
export enum ToolExecutionPhase {
VALIDATION = 'validation',
PREPARATION = 'preparation',
EXECUTION = 'execution',
RESULT_PROCESSING = 'result_processing',
}
/** Phase of an API request; the type of the `phase` attribute on the API request breakdown metric. */
export enum ApiRequestPhase {
REQUEST_PREPARATION = 'request_preparation',
NETWORK_LATENCY = 'network_latency',
RESPONSE_PROCESSING = 'response_processing',
TOKEN_PROCESSING = 'token_processing',
}
/** Values for the `gen_ai.operation.name` attribute produced by getConventionAttributes. */
export enum GenAiOperationName {
GENERATE_CONTENT = 'generate_content',
}
/** Values for the `gen_ai.provider.name` attribute; selected from the auth type by getGenAiProvider. */
export enum GenAiProviderName {
GCP_GEN_AI = 'gcp.gen_ai',
GCP_VERTEX_AI = 'gcp.vertex_ai',
}
/**
 * Token direction values; these match the literals accepted by the
 * `gen_ai.token.type` attribute of the GenAI token usage histogram.
 */
export enum GenAiTokenType {
INPUT = 'input',
OUTPUT = 'output',
}
// Module-level instrument state. Every instrument starts as `undefined` and is
// filled in by the `assign` callback of its entry in the definition tables
// when initializeMetrics / initializePerformanceMonitoring run. Recording
// functions return early while their instrument is still `undefined`.
// Cached meter, created lazily by getMeter().
let cliMeter: Meter | undefined;
let toolCallCounter: Counter | undefined;
let toolCallLatencyHistogram: Histogram | undefined;
let apiRequestCounter: Counter | undefined;
let apiRequestLatencyHistogram: Histogram | undefined;
let tokenUsageCounter: Counter | undefined;
let sessionCounter: Counter | undefined;
let fileOperationCounter: Counter | undefined;
let linesChangedCounter: Counter | undefined;
let chatCompressionCounter: Counter | undefined;
let invalidChunkCounter: Counter | undefined;
let contentRetryCounter: Counter | undefined;
let contentRetryFailureCounter: Counter | undefined;
let modelRoutingLatencyHistogram: Histogram | undefined;
let modelRoutingFailureCounter: Counter | undefined;
let modelSlashCommandCallCounter: Counter | undefined;
let agentRunCounter: Counter | undefined;
let agentDurationHistogram: Histogram | undefined;
let agentTurnsHistogram: Histogram | undefined;
let agentRecoveryAttemptCounter: Counter | undefined;
let agentRecoveryAttemptDurationHistogram: Histogram | undefined;
let flickerFrameCounter: Counter | undefined;
let exitFailCounter: Counter | undefined;
let slowRenderHistogram: Histogram | undefined;
let hookCallCounter: Counter | undefined;
let hookCallLatencyHistogram: Histogram | undefined;
// OpenTelemetry GenAI Semantic Convention Metrics
let genAiClientTokenUsageHistogram: Histogram | undefined;
let genAiClientOperationDurationHistogram: Histogram | undefined;
// Performance Monitoring Metrics
let startupTimeHistogram: Histogram | undefined;
let memoryUsageGauge: Histogram | undefined; // Using Histogram until ObservableGauge is available
let cpuUsageGauge: Histogram | undefined;
let toolQueueDepthGauge: Histogram | undefined;
let toolExecutionBreakdownHistogram: Histogram | undefined;
let tokenEfficiencyHistogram: Histogram | undefined;
let apiRequestBreakdownHistogram: Histogram | undefined;
let performanceScoreGauge: Histogram | undefined;
let regressionDetectionCounter: Counter | undefined;
let regressionPercentageChangeHistogram: Histogram | undefined;
let baselineComparisonHistogram: Histogram | undefined;
// Set to true at the end of initializeMetrics; gates the core record* functions.
let isMetricsInitialized = false;
// Set by initializePerformanceMonitoring from config.getTelemetryEnabled();
// gates the performance record* functions.
let isPerformanceMonitoringEnabled = false;
/**
 * Returns the meter shared by this module, creating it on first use.
 *
 * The meter is requested from the global `metrics` API under `SERVICE_NAME`
 * and cached in `cliMeter`; later calls return the cached instance.
 *
 * @returns The cached meter.
 */
export function getMeter(): Meter | undefined {
  if (cliMeter) {
    return cliMeter;
  }
  cliMeter = metrics.getMeter(SERVICE_NAME);
  return cliMeter;
}
/**
 * Creates every core counter and histogram and marks metrics as initialized.
 *
 * Does nothing when metrics were already initialized or when no meter is
 * available. Otherwise it registers each entry of `COUNTER_DEFINITIONS` and
 * `HISTOGRAM_DEFINITIONS` with the meter, records one session on the session
 * counter, runs `initializePerformanceMonitoring`, and finally sets the
 * initialized flag.
 *
 * @param config Passed to `getCommonAttributes` and to the performance
 *   monitoring setup.
 */
export function initializeMetrics(config: Config): void {
  if (isMetricsInitialized) {
    return;
  }
  const meter = getMeter();
  if (!meter) {
    return;
  }

  // Core instruments: each definition stores its instrument via `assign`.
  for (const [metricName, definition] of Object.entries(COUNTER_DEFINITIONS)) {
    const { description, valueType, assign } = definition;
    assign(meter.createCounter(metricName, { description, valueType }));
  }
  for (const [metricName, definition] of Object.entries(
    HISTOGRAM_DEFINITIONS,
  )) {
    const { description, unit, valueType, assign } = definition;
    assign(meter.createHistogram(metricName, { description, unit, valueType }));
  }

  // The session counter only exists once the loop above has run.
  sessionCounter?.add(1, baseMetricDefinition.getCommonAttributes(config));

  // Performance instruments are created only if that feature is enabled.
  initializePerformanceMonitoring(config);

  isMetricsInitialized = true;
}
/**
 * Counts one chat compression event.
 *
 * No-op until the chat compression counter exists and metrics have been
 * initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param attributes Token counts before and after compression; merged over
 *   the common attributes.
 */
export function recordChatCompressionMetrics(
  config: Config,
  attributes: MetricDefinitions[typeof EVENT_CHAT_COMPRESSION]['attributes'],
): void {
  if (!chatCompressionCounter || !isMetricsInitialized) return;
  chatCompressionCounter.add(1, {
    ...baseMetricDefinition.getCommonAttributes(config),
    ...attributes,
  });
}
/**
 * Records one tool call: increments the tool call counter and records the
 * call's latency.
 *
 * No-op until both instruments exist and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to both data points.
 * @param durationMs Tool call latency, recorded on the latency histogram.
 * @param attributes Tool call attributes. All of them are attached to the
 *   counter; only `function_name` is attached to the latency histogram.
 */
export function recordToolCallMetrics(
  config: Config,
  durationMs: number,
  attributes: MetricDefinitions[typeof TOOL_CALL_COUNT]['attributes'],
): void {
  if (!toolCallCounter || !toolCallLatencyHistogram || !isMetricsInitialized)
    return;
  // Computed once and shared by both data points, as recordAgentRunMetrics does.
  const commonAttributes = baseMetricDefinition.getCommonAttributes(config);
  const counterAttributes: Attributes = {
    ...commonAttributes,
    ...attributes,
  };
  const latencyAttributes: Attributes = {
    ...commonAttributes,
    function_name: attributes.function_name,
  };
  toolCallCounter.add(1, counterAttributes);
  toolCallLatencyHistogram.record(durationMs, latencyAttributes);
}
/**
 * Adds `tokenCount` to the custom token usage counter.
 *
 * No-op until the counter exists and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param tokenCount Amount added to the counter.
 * @param attributes Model and token type; merged over the common attributes.
 */
export function recordCustomTokenUsageMetrics(
  config: Config,
  tokenCount: number,
  attributes: MetricDefinitions[typeof TOKEN_USAGE]['attributes'],
): void {
  const counter = tokenUsageCounter;
  if (counter && isMetricsInitialized) {
    const metricAttributes: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    counter.add(tokenCount, metricAttributes);
  }
}
/**
 * Records one API response: increments the API request counter and records
 * the request latency.
 *
 * No-op until both instruments exist and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to both data points.
 * @param durationMs Request latency, recorded on the latency histogram.
 * @param attributes Request attributes. The counter gets `model` and
 *   `status_code` (defaulting to `'ok'` when absent); the histogram gets only
 *   `model`. `error_type` is not forwarded by this function.
 */
export function recordCustomApiResponseMetrics(
  config: Config,
  durationMs: number,
  attributes: MetricDefinitions[typeof API_REQUEST_COUNT]['attributes'],
): void {
  if (
    !apiRequestCounter ||
    !apiRequestLatencyHistogram ||
    !isMetricsInitialized
  )
    return;
  // Computed once and shared by both data points, as recordAgentRunMetrics does.
  const commonAttributes = baseMetricDefinition.getCommonAttributes(config);
  const counterAttributes: Attributes = {
    ...commonAttributes,
    model: attributes.model,
    status_code: attributes.status_code ?? 'ok',
  };
  const latencyAttributes: Attributes = {
    ...commonAttributes,
    model: attributes.model,
  };
  apiRequestCounter.add(1, counterAttributes);
  apiRequestLatencyHistogram.record(durationMs, latencyAttributes);
}
/**
 * Records one failed API request: increments the API request counter with
 * error details and records the request latency.
 *
 * No-op until both instruments exist and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to both data points.
 * @param durationMs Request latency, recorded on the latency histogram.
 * @param attributes Request attributes. The counter gets `model`,
 *   `status_code` (defaulting to `'error'`) and `error_type` (defaulting to
 *   `'unknown'`); the histogram gets only `model`.
 */
export function recordApiErrorMetrics(
  config: Config,
  durationMs: number,
  attributes: MetricDefinitions[typeof API_REQUEST_COUNT]['attributes'],
): void {
  if (
    !apiRequestCounter ||
    !apiRequestLatencyHistogram ||
    !isMetricsInitialized
  )
    return;
  // Computed once and shared by both data points, as recordAgentRunMetrics does.
  const commonAttributes = baseMetricDefinition.getCommonAttributes(config);
  const counterAttributes: Attributes = {
    ...commonAttributes,
    model: attributes.model,
    status_code: attributes.status_code ?? 'error',
    error_type: attributes.error_type ?? 'unknown',
  };
  const latencyAttributes: Attributes = {
    ...commonAttributes,
    model: attributes.model,
  };
  apiRequestCounter.add(1, counterAttributes);
  apiRequestLatencyHistogram.record(durationMs, latencyAttributes);
}
/**
 * Counts one file operation.
 *
 * No-op until the file operation counter exists and metrics have been
 * initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param attributes Operation kind plus optional file details; merged over
 *   the common attributes.
 */
export function recordFileOperationMetric(
  config: Config,
  attributes: MetricDefinitions[typeof FILE_OPERATION_COUNT]['attributes'],
): void {
  const counter = fileOperationCounter;
  if (counter && isMetricsInitialized) {
    const metricAttributes: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    counter.add(1, metricAttributes);
  }
}
/**
 * Adds a number of changed lines to the lines-changed counter.
 *
 * No-op until the counter exists and metrics have been initialized. Values
 * that are not finite or not strictly positive are ignored.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param lines Number of lines to add.
 * @param changeType Recorded as the `type` attribute.
 * @param attributes Optional extra attributes (e.g. `function_name`), applied
 *   after `type`.
 */
export function recordLinesChanged(
  config: Config,
  lines: number,
  changeType: 'added' | 'removed',
  attributes?: { function_name?: string },
): void {
  const counter = linesChangedCounter;
  if (!counter || !isMetricsInitialized) {
    return;
  }
  const isPositiveFinite = Number.isFinite(lines) && lines > 0;
  if (!isPositiveFinite) {
    return;
  }
  const metricAttributes: Attributes = {
    ...baseMetricDefinition.getCommonAttributes(config),
    type: changeType,
    // Spreading `undefined` adds nothing, so no fallback object is needed.
    ...attributes,
  };
  counter.add(lines, metricAttributes);
}
// --- New Metric Recording Functions ---
/**
 * Counts one flickering UI frame.
 *
 * No-op until the flicker counter exists and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 */
export function recordFlickerFrame(config: Config): void {
  const counter = flickerFrameCounter;
  if (counter && isMetricsInitialized) {
    counter.add(1, baseMetricDefinition.getCommonAttributes(config));
  }
}
/**
 * Counts one failed attempt to exit the CLI.
 *
 * No-op until the exit-failure counter exists and metrics have been
 * initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 */
export function recordExitFail(config: Config): void {
  const counter = exitFailCounter;
  if (counter && isMetricsInitialized) {
    counter.add(1, baseMetricDefinition.getCommonAttributes(config));
  }
}
/**
 * Records the latency of a slow UI render on the slow-render histogram.
 *
 * No-op until the histogram exists and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param renderLatency Value recorded on the histogram (declared unit: ms).
 */
export function recordSlowRender(config: Config, renderLatency: number): void {
  const histogram = slowRenderHistogram;
  if (histogram && isMetricsInitialized) {
    // A copy of the common attributes is passed, not the original object.
    const metricAttributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
    };
    histogram.record(renderLatency, metricAttributes);
  }
}
/**
 * Counts one invalid chunk received from a stream.
 *
 * No-op until the invalid-chunk counter exists and metrics have been
 * initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 */
export function recordInvalidChunk(config: Config): void {
  const counter = invalidChunkCounter;
  if (counter && isMetricsInitialized) {
    counter.add(1, baseMetricDefinition.getCommonAttributes(config));
  }
}
/**
 * Counts one retry triggered by a content error.
 *
 * No-op until the content-retry counter exists and metrics have been
 * initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 */
export function recordContentRetry(config: Config): void {
  const counter = contentRetryCounter;
  if (counter && isMetricsInitialized) {
    counter.add(1, baseMetricDefinition.getCommonAttributes(config));
  }
}
/**
 * Counts one request for which every content-error retry failed.
 *
 * No-op until the content-retry-failure counter exists and metrics have been
 * initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 */
export function recordContentRetryFailure(config: Config): void {
  const counter = contentRetryFailureCounter;
  if (counter && isMetricsInitialized) {
    counter.add(1, baseMetricDefinition.getCommonAttributes(config));
  }
}
/**
 * Counts one model slash command call.
 *
 * No-op until the counter exists and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param event Its `model_name` is recorded as the
 *   `slash_command.model.model_name` attribute.
 */
export function recordModelSlashCommand(
  config: Config,
  event: ModelSlashCommandEvent,
): void {
  const counter = modelSlashCommandCallCounter;
  if (counter && isMetricsInitialized) {
    const metricAttributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      'slash_command.model.model_name': event.model_name,
    };
    counter.add(1, metricAttributes);
  }
}
/**
 * Records a model routing decision: always records the routing latency and,
 * when the event is marked as failed, also increments the failure counter.
 *
 * No-op until both instruments exist and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to the data points.
 * @param event Routing event supplying the latency, decision model, decision
 *   source, failure flag and error message.
 */
export function recordModelRoutingMetrics(
  config: Config,
  event: ModelRoutingEvent,
): void {
  if (
    !modelRoutingLatencyHistogram ||
    !modelRoutingFailureCounter ||
    !isMetricsInitialized
  )
    return;
  // Computed once and shared by both data points, as recordAgentRunMetrics does.
  const commonAttributes = baseMetricDefinition.getCommonAttributes(config);
  modelRoutingLatencyHistogram.record(event.routing_latency_ms, {
    ...commonAttributes,
    'routing.decision_model': event.decision_model,
    'routing.decision_source': event.decision_source,
  });
  if (event.failed) {
    modelRoutingFailureCounter.add(1, {
      ...commonAttributes,
      'routing.decision_source': event.decision_source,
      'routing.error_message': event.error_message,
    });
  }
}
/**
 * Records a finished agent run on three instruments: the run counter, the
 * duration histogram and the turns histogram.
 *
 * No-op until all three instruments exist and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to every data point.
 * @param event Finished-run event supplying the agent name, termination
 *   reason, duration and turn count.
 */
export function recordAgentRunMetrics(
  config: Config,
  event: AgentFinishEvent,
): void {
  const runCounter = agentRunCounter;
  const durationHistogram = agentDurationHistogram;
  const turnsHistogram = agentTurnsHistogram;
  if (
    !runCounter ||
    !durationHistogram ||
    !turnsHistogram ||
    !isMetricsInitialized
  ) {
    return;
  }

  const shared = baseMetricDefinition.getCommonAttributes(config);

  // Only the counter carries the termination reason.
  runCounter.add(1, {
    ...shared,
    agent_name: event.agent_name,
    terminate_reason: event.terminate_reason,
  });
  durationHistogram.record(event.duration_ms, {
    ...shared,
    agent_name: event.agent_name,
  });
  turnsHistogram.record(event.turn_count, {
    ...shared,
    agent_name: event.agent_name,
  });
}
/**
 * Records an agent recovery attempt: increments the attempt counter and
 * records the attempt duration.
 *
 * No-op until both instruments exist and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to both data points.
 * @param event Recovery event supplying the agent name, reason, success flag
 *   and duration.
 */
export function recordRecoveryAttemptMetrics(
  config: Config,
  event: RecoveryAttemptEvent,
): void {
  const attemptCounter = agentRecoveryAttemptCounter;
  const durationHistogram = agentRecoveryAttemptDurationHistogram;
  if (!attemptCounter || !durationHistogram || !isMetricsInitialized) {
    return;
  }

  const shared = baseMetricDefinition.getCommonAttributes(config);

  // Reason and outcome are attached to the counter only.
  attemptCounter.add(1, {
    ...shared,
    agent_name: event.agent_name,
    reason: event.reason,
    success: event.success,
  });
  durationHistogram.record(event.duration_ms, {
    ...shared,
    agent_name: event.agent_name,
  });
}
// OpenTelemetry GenAI Semantic Convention Recording Functions
/**
 * Records a token count on the `gen_ai.client.token.usage` histogram.
 *
 * No-op until the histogram exists and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param tokenCount Value recorded on the histogram.
 * @param attributes GenAI convention attributes; merged over the common
 *   attributes.
 */
export function recordGenAiClientTokenUsage(
  config: Config,
  tokenCount: number,
  attributes: MetricDefinitions[typeof GEN_AI_CLIENT_TOKEN_USAGE]['attributes'],
): void {
  const histogram = genAiClientTokenUsageHistogram;
  if (histogram && isMetricsInitialized) {
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    histogram.record(tokenCount, merged);
  }
}
/**
 * Records an operation duration on the `gen_ai.client.operation.duration`
 * histogram.
 *
 * No-op until the histogram exists and metrics have been initialized.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param durationSeconds Value recorded on the histogram (declared unit: s).
 * @param attributes GenAI convention attributes; merged over the common
 *   attributes.
 */
export function recordGenAiClientOperationDuration(
  config: Config,
  durationSeconds: number,
  attributes: MetricDefinitions[typeof GEN_AI_CLIENT_OPERATION_DURATION]['attributes'],
): void {
  const histogram = genAiClientOperationDurationHistogram;
  if (histogram && isMetricsInitialized) {
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    histogram.record(durationSeconds, merged);
  }
}
/**
 * Builds the GenAI semantic-convention attributes for an event.
 *
 * The operation name comes from `getGenAiOperationName`, the provider from
 * `getGenAiProvider(event.auth_type)`, and `event.model` is used for both the
 * request and the response model.
 *
 * @param event Object carrying the model name and, optionally, the auth type.
 * @returns The four `gen_ai.*` attributes.
 */
export function getConventionAttributes(event: {
  model: string;
  auth_type?: string;
}): {
  'gen_ai.operation.name': GenAiOperationName;
  'gen_ai.provider.name': GenAiProviderName;
  'gen_ai.request.model': string;
  'gen_ai.response.model': string;
} {
  const { model, auth_type: authType } = event;
  return {
    'gen_ai.operation.name': getGenAiOperationName(),
    'gen_ai.provider.name': getGenAiProvider(authType),
    'gen_ai.request.model': model,
    'gen_ai.response.model': model,
  };
}
/**
 * Maps an authentication type to a GenAI provider name.
 *
 * `USE_VERTEX_AI`, `COMPUTE_ADC` and `LOGIN_WITH_GOOGLE` map to the Vertex AI
 * provider; every other value (including `USE_GEMINI` and `undefined`) maps
 * to the Gen AI provider.
 *
 * @param authType Authentication type, if known.
 * @returns The provider name to report.
 */
function getGenAiProvider(authType?: string): GenAiProviderName {
  if (
    authType === AuthType.USE_VERTEX_AI ||
    authType === AuthType.COMPUTE_ADC ||
    authType === AuthType.LOGIN_WITH_GOOGLE
  ) {
    return GenAiProviderName.GCP_VERTEX_AI;
  }
  return GenAiProviderName.GCP_GEN_AI;
}
/**
 * Returns the GenAI operation name to report; currently always
 * `GENERATE_CONTENT`.
 */
function getGenAiOperationName(): GenAiOperationName {
  const operation = GenAiOperationName.GENERATE_CONTENT;
  return operation;
}
// Performance Monitoring Functions
/**
 * Creates the performance-monitoring instruments when the feature is enabled.
 *
 * Performance monitoring currently follows `config.getTelemetryEnabled()`.
 * When no meter is available nothing happens (the enabled flag is left
 * untouched); when the feature is disabled the flag is updated but no
 * instruments are created.
 *
 * @param config Supplies the telemetry-enabled setting.
 */
export function initializePerformanceMonitoring(config: Config): void {
  const meter = getMeter();
  if (!meter) {
    return;
  }

  // For now, performance monitoring is enabled whenever telemetry is enabled.
  // TODO: Add specific performance monitoring settings to config
  isPerformanceMonitoringEnabled = config.getTelemetryEnabled();
  if (!isPerformanceMonitoringEnabled) {
    return;
  }

  for (const [metricName, definition] of Object.entries(
    PERFORMANCE_COUNTER_DEFINITIONS,
  )) {
    const { description, valueType, assign } = definition;
    assign(meter.createCounter(metricName, { description, valueType }));
  }
  for (const [metricName, definition] of Object.entries(
    PERFORMANCE_HISTOGRAM_DEFINITIONS,
  )) {
    const { description, unit, valueType, assign } = definition;
    assign(meter.createHistogram(metricName, { description, unit, valueType }));
  }
}
/**
 * Records a startup duration on the startup-time histogram.
 *
 * No-op until the histogram exists and performance monitoring is enabled.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param durationMs Value recorded on the histogram (declared unit: ms).
 * @param attributes `phase` plus optional `details`. The keys of `details`
 *   are flattened into the recorded attributes and are applied last, so a
 *   `details` key that collides with `phase` or a common attribute overrides
 *   it.
 */
export function recordStartupPerformance(
  config: Config,
  durationMs: number,
  attributes: MetricDefinitions[typeof STARTUP_TIME]['attributes'],
): void {
  const histogram = startupTimeHistogram;
  if (histogram && isPerformanceMonitoringEnabled) {
    const { phase, details } = attributes;
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      phase,
      ...details,
    };
    histogram.record(durationMs, merged);
  }
}
/**
 * Records a memory measurement on the memory usage instrument.
 *
 * No-op until the instrument exists and performance monitoring is enabled.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param bytes Value recorded (declared unit: bytes).
 * @param attributes Memory type and optional component; merged over the
 *   common attributes.
 */
export function recordMemoryUsage(
  config: Config,
  bytes: number,
  attributes: MetricDefinitions[typeof MEMORY_USAGE]['attributes'],
): void {
  const instrument = memoryUsageGauge;
  if (instrument && isPerformanceMonitoringEnabled) {
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    instrument.record(bytes, merged);
  }
}
/**
 * Records a CPU usage measurement on the CPU usage instrument.
 *
 * No-op until the instrument exists and performance monitoring is enabled.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param percentage Value recorded (declared unit: percent).
 * @param attributes Optional component; merged over the common attributes.
 */
export function recordCpuUsage(
  config: Config,
  percentage: number,
  attributes: MetricDefinitions[typeof CPU_USAGE]['attributes'],
): void {
  const instrument = cpuUsageGauge;
  if (instrument && isPerformanceMonitoringEnabled) {
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    instrument.record(percentage, merged);
  }
}
/**
 * Records the current tool queue depth on the queue depth instrument.
 *
 * No-op until the instrument exists and performance monitoring is enabled.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param queueDepth Value recorded.
 */
export function recordToolQueueDepth(config: Config, queueDepth: number): void {
  const instrument = toolQueueDepthGauge;
  if (instrument && isPerformanceMonitoringEnabled) {
    // A copy of the common attributes is passed, not the original object.
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
    };
    instrument.record(queueDepth, merged);
  }
}
/**
 * Records the duration of one tool execution phase.
 *
 * No-op until the histogram exists and performance monitoring is enabled.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param durationMs Value recorded (declared unit: ms).
 * @param attributes Function name and phase; merged over the common
 *   attributes.
 */
export function recordToolExecutionBreakdown(
  config: Config,
  durationMs: number,
  attributes: MetricDefinitions[typeof TOOL_EXECUTION_BREAKDOWN]['attributes'],
): void {
  const histogram = toolExecutionBreakdownHistogram;
  if (histogram && isPerformanceMonitoringEnabled) {
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    histogram.record(durationMs, merged);
  }
}
/**
 * Records a token efficiency value.
 *
 * No-op until the histogram exists and performance monitoring is enabled.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param value Value recorded (declared unit: ratio).
 * @param attributes Model, metric name and optional context; merged over the
 *   common attributes.
 */
export function recordTokenEfficiency(
  config: Config,
  value: number,
  attributes: MetricDefinitions[typeof TOKEN_EFFICIENCY]['attributes'],
): void {
  const histogram = tokenEfficiencyHistogram;
  if (histogram && isPerformanceMonitoringEnabled) {
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    histogram.record(value, merged);
  }
}
/**
 * Records the duration of one API request phase.
 *
 * No-op until the histogram exists and performance monitoring is enabled.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param durationMs Value recorded (declared unit: ms).
 * @param attributes Model and phase; merged over the common attributes.
 */
export function recordApiRequestBreakdown(
  config: Config,
  durationMs: number,
  attributes: MetricDefinitions[typeof API_REQUEST_BREAKDOWN]['attributes'],
): void {
  const histogram = apiRequestBreakdownHistogram;
  if (histogram && isPerformanceMonitoringEnabled) {
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    histogram.record(durationMs, merged);
  }
}
/**
 * Records a performance score on the performance score instrument.
 *
 * No-op until the instrument exists and performance monitoring is enabled.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param score Value recorded.
 * @param attributes Category and optional baseline; merged over the common
 *   attributes.
 */
export function recordPerformanceScore(
  config: Config,
  score: number,
  attributes: MetricDefinitions[typeof PERFORMANCE_SCORE]['attributes'],
): void {
  const instrument = performanceScoreGauge;
  if (instrument && isPerformanceMonitoringEnabled) {
    const merged: Attributes = {
      ...baseMetricDefinition.getCommonAttributes(config),
      ...attributes,
    };
    instrument.record(score, merged);
  }
}
/**
 * Counts one detected performance regression and, when possible, records how
 * far the current value deviates from the baseline.
 *
 * No-op until the regression counter exists and performance monitoring is
 * enabled. The percentage change, `(current - baseline) / baseline * 100`, is
 * recorded only when the baseline is non-zero and the percentage-change
 * histogram exists; both data points carry the same attributes.
 *
 * @param config Source of the common attributes attached to the data points.
 * @param attributes Metric name, severity, current value and baseline value;
 *   merged over the common attributes.
 */
export function recordPerformanceRegression(
  config: Config,
  attributes: MetricDefinitions[typeof REGRESSION_DETECTION]['attributes'],
): void {
  const counter = regressionDetectionCounter;
  if (!counter || !isPerformanceMonitoringEnabled) {
    return;
  }

  const merged: Attributes = {
    ...baseMetricDefinition.getCommonAttributes(config),
    ...attributes,
  };
  counter.add(1, merged);

  const histogram = regressionPercentageChangeHistogram;
  // A zero baseline would mean dividing by zero, so the histogram is skipped.
  if (attributes.baseline_value === 0 || !histogram) {
    return;
  }
  const delta = attributes.current_value - attributes.baseline_value;
  const percentageChange = (delta / attributes.baseline_value) * 100;
  histogram.record(percentageChange, merged);
}
/**
 * Records the percentage change of a value relative to its baseline.
 *
 * No-op until the baseline comparison histogram exists and performance
 * monitoring is enabled. A zero baseline is skipped with a diagnostic
 * warning; otherwise `(current - baseline) / baseline * 100` is recorded.
 *
 * @param config Source of the common attributes attached to the data point.
 * @param attributes Metric name, category, current value and baseline value;
 *   merged over the common attributes.
 */
export function recordBaselineComparison(
  config: Config,
  attributes: MetricDefinitions[typeof BASELINE_COMPARISON]['attributes'],
): void {
  const histogram = baselineComparisonHistogram;
  if (!histogram || !isPerformanceMonitoringEnabled) {
    return;
  }

  const { current_value: current, baseline_value: baseline } = attributes;
  if (baseline === 0) {
    diag.warn('Baseline value is zero, skipping comparison.');
    return;
  }

  const percentageChange = ((current - baseline) / baseline) * 100;
  const merged: Attributes = {
    ...baseMetricDefinition.getCommonAttributes(config),
    ...attributes,
  };
  histogram.record(percentageChange, merged);
}
/**
 * Reports whether performance monitoring is active.
 *
 * @returns `true` only when metrics have been initialized and performance
 *   monitoring is enabled.
 */
export function isPerformanceMonitoringActive(): boolean {
  if (!isMetricsInitialized) {
    return false;
  }
  return isPerformanceMonitoringEnabled;
}
/**
 * Records token usage on the custom token counter and, where applicable, on
 * the GenAI convention histogram.
 *
 * The custom counter is always fed (subject to its own guards). The
 * convention histogram is fed only for `input` and `output` tokens, and only
 * when `genAiAttributes` is provided.
 *
 * @param config Source of the common attributes attached to the data points.
 * @param tokenCount Number of tokens to record.
 * @param attributes Model, token type, and optional GenAI convention
 *   attributes.
 */
export function recordTokenUsageMetrics(
  config: Config,
  tokenCount: number,
  attributes: {
    model: string;
    type: 'input' | 'output' | 'thought' | 'cache' | 'tool';
    genAiAttributes?: {
      'gen_ai.operation.name': string;
      'gen_ai.provider.name': string;
      'gen_ai.request.model'?: string;
      'gen_ai.response.model'?: string;
      'server.address'?: string;
      'server.port'?: number;
    };
  },
): void {
  const { model, type, genAiAttributes } = attributes;

  recordCustomTokenUsageMetrics(config, tokenCount, { model, type });

  // The convention metric only distinguishes input and output tokens.
  if (type !== 'input' && type !== 'output') {
    return;
  }
  if (!genAiAttributes) {
    return;
  }
  recordGenAiClientTokenUsage(config, tokenCount, {
    ...genAiAttributes,
    'gen_ai.token.type': type,
  });
}
/**
 * Records an API response on the custom metrics and, when GenAI convention
 * attributes are supplied, on the convention duration histogram.
 *
 * The custom metrics receive the duration in milliseconds; the convention
 * histogram receives the same duration converted to seconds.
 *
 * @param config Source of the common attributes attached to the data points.
 * @param durationMs Request duration in milliseconds.
 * @param attributes Model, optional status code, and optional GenAI
 *   convention attributes.
 */
export function recordApiResponseMetrics(
  config: Config,
  durationMs: number,
  attributes: {
    model: string;
    status_code?: number | string;
    genAiAttributes?: {
      'gen_ai.operation.name': string;
      'gen_ai.provider.name': string;
      'gen_ai.request.model'?: string;
      'gen_ai.response.model'?: string;
      'server.address'?: string;
      'server.port'?: number;
      'error.type'?: string;
    };
  },
): void {
  const { model, status_code, genAiAttributes } = attributes;

  recordCustomApiResponseMetrics(config, durationMs, { model, status_code });

  if (!genAiAttributes) {
    return;
  }
  // The convention histogram is declared in seconds.
  recordGenAiClientOperationDuration(config, durationMs / 1000, {
    ...genAiAttributes,
  });
}
/**
 * Records one hook call: increments the hook call counter and records the
 * call latency, both with the same attributes.
 *
 * No-op until both instruments exist and metrics have been initialized. The
 * hook name is always passed through `sanitizeHookName` before being attached.
 *
 * @param config Source of the common attributes attached to both data points.
 * @param hookEventName Recorded as `hook_event_name`.
 * @param hookName Raw hook name; recorded as `hook_name` after sanitizing.
 * @param durationMs Value recorded on the latency histogram (declared unit: ms).
 * @param success Recorded as `success`.
 */
export function recordHookCallMetrics(
  config: Config,
  hookEventName: string,
  hookName: string,
  durationMs: number,
  success: boolean,
): void {
  const counter = hookCallCounter;
  const latencyHistogram = hookCallLatencyHistogram;
  if (!counter || !latencyHistogram || !isMetricsInitialized) {
    return;
  }

  // Metrics are aggregated and exposed, so the raw hook name is never used.
  const safeHookName = sanitizeHookName(hookName);
  const sharedAttributes: Attributes = {
    ...baseMetricDefinition.getCommonAttributes(config),
    hook_event_name: hookEventName,
    hook_name: safeHookName,
    success,
  };

  counter.add(1, sharedAttributes);
  latencyHistogram.record(durationMs, sharedAttributes);
}