[Part 3/6] feat(telemetry): enhance metrics with performance monitoring APIs (#8113)

2026-05-12 21:03:05 -07:00 · 2025-09-25 18:12:46 +02:00
parent 135d3401cd
commit f80eb71068
4 changed files with 1015 additions and 3 deletions
@@ -5,7 +5,7 @@
 */

 import type { Attributes, Meter, Counter, Histogram } from '@opentelemetry/api';
-import { metrics, ValueType } from '@opentelemetry/api';
+import { diag, metrics, ValueType } from '@opentelemetry/api';
 import {
  SERVICE_NAME,
  METRIC_TOOL_CALL_COUNT,
@@ -22,6 +22,18 @@ import {
  METRIC_MODEL_ROUTING_LATENCY,
  METRIC_MODEL_ROUTING_FAILURE_COUNT,
  METRIC_MODEL_SLASH_COMMAND_CALL_COUNT,
+  // Performance Monitoring Metrics
+  METRIC_STARTUP_TIME,
+  METRIC_MEMORY_USAGE,
+  METRIC_CPU_USAGE,
+  METRIC_TOOL_QUEUE_DEPTH,
+  METRIC_TOOL_EXECUTION_BREAKDOWN,
+  METRIC_TOKEN_EFFICIENCY,
+  METRIC_API_REQUEST_BREAKDOWN,
+  METRIC_PERFORMANCE_SCORE,
+  METRIC_REGRESSION_DETECTION,
+  METRIC_REGRESSION_PERCENTAGE_CHANGE,
+  METRIC_BASELINE_COMPARISON,
 } from './constants.js';
 import type { Config } from '../config/config.js';
 import type { ModelRoutingEvent, ModelSlashCommandEvent } from './types.js';
@@ -32,6 +44,36 @@ export enum FileOperation {
  UPDATE = 'update',
 }

+export enum PerformanceMetricType { // String labels naming the performance metric categories.
+  STARTUP = 'startup',
+  MEMORY = 'memory',
+  CPU = 'cpu',
+  TOOL_EXECUTION = 'tool_execution',
+  API_REQUEST = 'api_request',
+  TOKEN_EFFICIENCY = 'token_efficiency',
+} // NOTE(review): no consumer of this enum is visible in this part of the file — confirm where it is used.
+
+export enum MemoryMetricType { // Value of the `memory_type` attribute attached by recordMemoryUsage.
+  HEAP_USED = 'heap_used',
+  HEAP_TOTAL = 'heap_total',
+  EXTERNAL = 'external',
+  RSS = 'rss',
+} // NOTE(review): members presumably mirror Node's process.memoryUsage() fields — confirm with callers.
+
+export enum ToolExecutionPhase { // Value of the `phase` attribute attached by recordToolExecutionBreakdown.
+  VALIDATION = 'validation',
+  PREPARATION = 'preparation',
+  EXECUTION = 'execution',
+  RESULT_PROCESSING = 'result_processing',
+}
+
+export enum ApiRequestPhase { // Value of the `phase` attribute attached by recordApiRequestBreakdown.
+  REQUEST_PREPARATION = 'request_preparation',
+  NETWORK_LATENCY = 'network_latency',
+  RESPONSE_PROCESSING = 'response_processing',
+  TOKEN_PROCESSING = 'token_processing',
+}
+
 let cliMeter: Meter | undefined;
 let toolCallCounter: Counter | undefined;
 let toolCallLatencyHistogram: Histogram | undefined;
@@ -46,7 +88,21 @@ let contentRetryFailureCounter: Counter | undefined;
 let modelRoutingLatencyHistogram: Histogram | undefined;
 let modelRoutingFailureCounter: Counter | undefined;
 let modelSlashCommandCallCounter: Counter | undefined;
+
+// Performance monitoring instruments; each is assigned in initializePerformanceMonitoring.
+let startupTimeHistogram: Histogram | undefined; // used by recordStartupPerformance
+let memoryUsageGauge: Histogram | undefined; // Using Histogram until ObservableGauge is available
+let cpuUsageGauge: Histogram | undefined; // Histogram-typed like memoryUsageGauge; used by recordCpuUsage
+let toolQueueDepthGauge: Histogram | undefined; // Histogram-typed; used by recordToolQueueDepth
+let toolExecutionBreakdownHistogram: Histogram | undefined; // used by recordToolExecutionBreakdown
+let tokenEfficiencyHistogram: Histogram | undefined; // used by recordTokenEfficiency
+let apiRequestBreakdownHistogram: Histogram | undefined; // used by recordApiRequestBreakdown
+let performanceScoreGauge: Histogram | undefined; // Histogram-typed; used by recordPerformanceScore
+let regressionDetectionCounter: Counter | undefined; // incremented by recordPerformanceRegression
+let regressionPercentageChangeHistogram: Histogram | undefined; // used by recordPerformanceRegression
+let baselineComparisonHistogram: Histogram | undefined; // used by recordBaselineComparison
 let isMetricsInitialized = false;
+let isPerformanceMonitoringEnabled = false; // taken from config.getTelemetryEnabled() in initializePerformanceMonitoring

 function getCommonAttributes(config: Config): Attributes {
  return {
@@ -67,6 +123,7 @@ export function initializeMetrics(config: Config): void {
  const meter = getMeter();
  if (!meter) return;

+  // Initialize core metrics
  toolCallCounter = meter.createCounter(METRIC_TOOL_CALL_COUNT, {
    description: 'Counts tool calls, tagged by function name and success.',
    valueType: ValueType.INT,
@@ -145,6 +202,10 @@ export function initializeMetrics(config: Config): void {
    valueType: ValueType.INT,
  });
  sessionCounter.add(1, getCommonAttributes(config));
+
+  // Initialize performance monitoring metrics if enabled
+  initializePerformanceMonitoring(config);
+
  isMetricsInitialized = true;
 }

@@ -332,3 +393,292 @@ export function recordModelRoutingMetrics(
    });
  }
 }
+// Performance Monitoring Functions
+
+/**
+ * Creates the performance monitoring instruments on the meter from getMeter().
+ *
+ * Returns early, leaving the enabled flag untouched, when no meter exists.
+ * Otherwise the flag is set from `config.getTelemetryEnabled()` and the
+ * instruments are created only when it is truthy.
+ */
+export function initializePerformanceMonitoring(config: Config): void {
+  const perfMeter = getMeter();
+  if (!perfMeter) {
+    return;
+  }
+
+  // No dedicated setting exists yet, so follow the telemetry switch.
+  // TODO: Add specific performance monitoring settings to config
+  isPerformanceMonitoringEnabled = config.getTelemetryEnabled();
+  if (!isPerformanceMonitoringEnabled) {
+    return;
+  }
+
+  // Unit/value-type pairs shared by several instruments below.
+  const intMillis = { unit: 'ms', valueType: ValueType.INT };
+  const doublePercent = { unit: 'percent', valueType: ValueType.DOUBLE };
+
+  // Histograms stand in for gauges until ObservableGauge is available.
+  startupTimeHistogram = perfMeter.createHistogram(METRIC_STARTUP_TIME, {
+    description:
+      'CLI startup time in milliseconds, broken down by initialization phase.',
+    unit: 'ms',
+    valueType: ValueType.DOUBLE,
+  });
+  memoryUsageGauge = perfMeter.createHistogram(METRIC_MEMORY_USAGE, {
+    description: 'Memory usage in bytes.',
+    unit: 'bytes',
+    valueType: ValueType.INT,
+  });
+  cpuUsageGauge = perfMeter.createHistogram(METRIC_CPU_USAGE, {
+    description: 'CPU usage percentage.',
+    ...doublePercent,
+  });
+  toolQueueDepthGauge = perfMeter.createHistogram(METRIC_TOOL_QUEUE_DEPTH, {
+    description: 'Number of tools in execution queue.',
+    valueType: ValueType.INT,
+  });
+
+  // Per-phase breakdowns and token efficiency.
+  toolExecutionBreakdownHistogram = perfMeter.createHistogram(
+    METRIC_TOOL_EXECUTION_BREAKDOWN,
+    {
+      description: 'Tool execution time breakdown by phase in milliseconds.',
+      ...intMillis,
+    },
+  );
+  tokenEfficiencyHistogram = perfMeter.createHistogram(
+    METRIC_TOKEN_EFFICIENCY,
+    {
+      description:
+        'Token efficiency metrics (tokens per operation, cache hit rate, etc.).',
+      valueType: ValueType.DOUBLE,
+    },
+  );
+  apiRequestBreakdownHistogram = perfMeter.createHistogram(
+    METRIC_API_REQUEST_BREAKDOWN,
+    {
+      description: 'API request time breakdown by phase in milliseconds.',
+      ...intMillis,
+    },
+  );
+
+  // Composite score plus regression and baseline tracking.
+  performanceScoreGauge = perfMeter.createHistogram(METRIC_PERFORMANCE_SCORE, {
+    description: 'Composite performance score (0-100).',
+    unit: 'score',
+    valueType: ValueType.DOUBLE,
+  });
+  regressionDetectionCounter = perfMeter.createCounter(
+    METRIC_REGRESSION_DETECTION,
+    {
+      description: 'Performance regression detection events.',
+      valueType: ValueType.INT,
+    },
+  );
+  regressionPercentageChangeHistogram = perfMeter.createHistogram(
+    METRIC_REGRESSION_PERCENTAGE_CHANGE,
+    {
+      description:
+        'Percentage change compared to baseline for detected regressions.',
+      ...doublePercent,
+    },
+  );
+  baselineComparisonHistogram = perfMeter.createHistogram(
+    METRIC_BASELINE_COMPARISON,
+    {
+      description:
+        'Performance comparison to established baseline (percentage change).',
+      ...doublePercent,
+    },
+  );
+}
+
+/** Records one startup phase duration (ms); no-op until monitoring is set up. */
+export function recordStartupPerformance(
+  config: Config,
+  phase: string,
+  durationMs: number,
+  details?: Record<string, string | number | boolean>,
+): void {
+  if (startupTimeHistogram && isPerformanceMonitoringEnabled) {
+    // `details` is spread last, so its keys override `phase` and common keys.
+    startupTimeHistogram.record(durationMs, {
+      ...getCommonAttributes(config),
+      phase,
+      ...details,
+    });
+  }
+}
+
+/** Records `bytes` of memory for `memoryType`, optionally per `component`. */
+export function recordMemoryUsage(
+  config: Config,
+  memoryType: MemoryMetricType,
+  bytes: number,
+  component?: string,
+): void {
+  if (memoryUsageGauge && isPerformanceMonitoringEnabled) {
+    // `component` is kept as a key even when the caller leaves it undefined.
+    memoryUsageGauge.record(bytes, {
+      ...getCommonAttributes(config),
+      memory_type: memoryType,
+      component,
+    });
+  }
+}
+
+/**
+ * Records a CPU usage percentage, optionally attributed to `component`.
+ * Does nothing unless the instrument exists and monitoring is enabled.
+ */
+export function recordCpuUsage(
+  config: Config,
+  percentage: number,
+  component?: string,
+): void {
+  if (cpuUsageGauge && isPerformanceMonitoringEnabled) {
+    const cpuAttrs: Attributes = { ...getCommonAttributes(config), component };
+    cpuUsageGauge.record(percentage, cpuAttrs);
+  }
+}
+
+/**
+ * Records the tool queue depth; a no-op unless performance monitoring is set up.
+ */
+export function recordToolQueueDepth(config: Config, queueDepth: number): void {
+  if (toolQueueDepthGauge && isPerformanceMonitoringEnabled) {
+    const queueAttrs: Attributes = { ...getCommonAttributes(config) };
+    toolQueueDepthGauge.record(queueDepth, queueAttrs);
+  }
+}
+
+/** Records how long one `phase` of the tool `functionName` took, in ms. */
+export function recordToolExecutionBreakdown(
+  config: Config,
+  functionName: string,
+  phase: ToolExecutionPhase,
+  durationMs: number,
+): void {
+  // Skipped entirely unless the instrument exists and monitoring is enabled.
+  if (toolExecutionBreakdownHistogram && isPerformanceMonitoringEnabled) {
+    const phaseAttrs: Attributes = {
+      ...getCommonAttributes(config),
+      function_name: functionName,
+      phase,
+    };
+    toolExecutionBreakdownHistogram.record(durationMs, phaseAttrs);
+  }
+}
+
+/** Records a token efficiency `value` for `model`, labelled by `metric`. */
+export function recordTokenEfficiency(
+  config: Config,
+  model: string,
+  metric: string,
+  value: number,
+  context?: string,
+): void {
+  if (tokenEfficiencyHistogram && isPerformanceMonitoringEnabled) {
+    // `context` stays in the attribute set even when it is undefined.
+    tokenEfficiencyHistogram.record(value, {
+      ...getCommonAttributes(config),
+      model,
+      metric,
+      context,
+    });
+  }
+}
+
+/** Records how long one `phase` of an API request to `model` took, in ms. */
+export function recordApiRequestBreakdown(
+  config: Config,
+  model: string,
+  phase: ApiRequestPhase,
+  durationMs: number,
+): void {
+  if (apiRequestBreakdownHistogram && isPerformanceMonitoringEnabled) {
+    const phaseAttrs: Attributes = {
+      ...getCommonAttributes(config),
+      model,
+      phase,
+    };
+    apiRequestBreakdownHistogram.record(durationMs, phaseAttrs);
+  }
+}
+
+/** Records a composite `score` for `category`, with an optional `baseline`. */
+export function recordPerformanceScore(
+  config: Config,
+  score: number,
+  category: string,
+  baseline?: number,
+): void {
+  if (performanceScoreGauge && isPerformanceMonitoringEnabled) {
+    // `baseline` travels as an attribute; it does not alter the recorded value.
+    performanceScoreGauge.record(score, {
+      ...getCommonAttributes(config),
+      category,
+      baseline,
+    });
+  }
+}
+
+/** Counts a regression for `metric` and records its change vs the baseline. */
+export function recordPerformanceRegression(
+  config: Config,
+  metric: string,
+  currentValue: number,
+  baselineValue: number,
+  severity: 'low' | 'medium' | 'high',
+): void {
+  if (!regressionDetectionCounter || !isPerformanceMonitoringEnabled) return;
+  const attributes: Attributes = {
+    ...getCommonAttributes(config),
+    metric,
+    severity,
+    current_value: currentValue,
+    baseline_value: baselineValue,
+  };
+  regressionDetectionCounter.add(1, attributes);
+  if (baselineValue === 0) {
+    // Percentage change is undefined here; warn like recordBaselineComparison.
+    diag.warn('Baseline value is zero, skipping percentage change.');
+    return;
+  }
+  const change = ((currentValue - baselineValue) / baselineValue) * 100;
+  regressionPercentageChangeHistogram?.record(change, attributes);
+}
+
+/**
+ * Records the percentage change of `currentValue` relative to `baselineValue`.
+ * A zero baseline is logged via `diag.warn` and nothing is recorded.
+ */
+export function recordBaselineComparison(
+  config: Config,
+  metric: string,
+  currentValue: number,
+  baselineValue: number,
+  category: string,
+): void {
+  if (baselineComparisonHistogram && isPerformanceMonitoringEnabled) {
+    if (baselineValue === 0) {
+      diag.warn('Baseline value is zero, skipping comparison.');
+    } else {
+      const delta = currentValue - baselineValue;
+      baselineComparisonHistogram.record((delta / baselineValue) * 100, {
+        ...getCommonAttributes(config),
+        metric,
+        category,
+        current_value: currentValue,
+        baseline_value: baselineValue,
+      });
+    }
+  }
+}
+
+/** True only when monitoring is enabled and initializeMetrics has completed. */
+export function isPerformanceMonitoringActive(): boolean {
+  return isMetricsInitialized ? isPerformanceMonitoringEnabled : false;
+}