/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

/** Identifier under which a model's health record is stored. */
export type ModelId = string;

/** Failure reasons recorded by `markTerminal`; `resetTurn` does not clear them. */
export type TerminalUnavailabilityReason = 'quota' | 'capacity';

/** Failure reasons whose "consumed" flag is cleared again by `resetTurn`. */
export type TurnUnavailabilityReason = 'retry_once_per_turn';

/**
 * Every reason that can be reported for an unavailable model. `'unknown'` is
 * the fallback used by `selectFirstAvailable` when a snapshot is unavailable
 * but carries no reason.
 */
export type UnavailabilityReason =
  | TerminalUnavailabilityReason
  | TurnUnavailabilityReason
  | 'unknown';

/**
 * Internal per-model record. A model without a record is considered healthy.
 *
 * - `terminal`: unavailable regardless of turn boundaries.
 * - `sticky_retry`: available until its single attempt is consumed, then
 *   unavailable until `resetTurn` clears the `consumed` flag.
 */
type HealthState =
  | { status: 'terminal'; reason: TerminalUnavailabilityReason }
  | {
      status: 'sticky_retry';
      reason: TurnUnavailabilityReason;
      consumed: boolean;
    };

/** Point-in-time availability of a single model. */
export interface ModelAvailabilitySnapshot {
  available: boolean;
  /** Present only when `available` is false. */
  reason?: UnavailabilityReason;
}

/** Outcome of picking the first usable model from an ordered candidate list. */
export interface ModelSelectionResult {
  /** The chosen model, or `null` when every candidate was unavailable. */
  selected: ModelId | null;
  /** Set to `1` only when the selected model is in the sticky-retry state. */
  attempts?: number;
  /** Candidates passed over before the selection, in input order. */
  skipped: Array<{
    model: ModelId;
    reason: UnavailabilityReason;
  }>;
}

/**
 * In-memory tracker of model health.
 *
 * Models start out healthy. They can be marked terminally unavailable, or
 * marked "retry once per turn", which leaves them available until
 * `consumeStickyAttempt` is called and makes them available again after
 * `resetTurn`.
 */
export class ModelAvailabilityService {
  // Explicit type arguments keep every read of this map type-checked; a bare
  // `new Map()` would infer `Map<any, any>`.
  private readonly health = new Map<ModelId, HealthState>();

  /**
   * Records a terminal failure for `model`, replacing any previous state.
   *
   * @param model Model to mark.
   * @param reason Why the model is unavailable.
   */
  markTerminal(model: ModelId, reason: TerminalUnavailabilityReason): void {
    this.setState(model, { status: 'terminal', reason });
  }

  /**
   * Forgets any recorded failure for `model`, making it available again.
   *
   * @param model Model to mark.
   */
  markHealthy(model: ModelId): void {
    this.clearState(model);
  }

  /**
   * Puts `model` into the sticky-retry state.
   *
   * A terminal state is left untouched. If the model is already in the
   * sticky-retry state its `consumed` flag is preserved, so a model that fails
   * repeatedly within one turn does not get its attempt back.
   *
   * @param model Model to mark.
   */
  markRetryOncePerTurn(model: ModelId): void {
    const current = this.health.get(model);

    // Do not override a terminal failure with a transient one.
    if (current?.status === 'terminal') {
      return;
    }

    this.setState(model, {
      status: 'sticky_retry',
      reason: 'retry_once_per_turn',
      consumed: current?.status === 'sticky_retry' ? current.consumed : false,
    });
  }

  /**
   * Marks the sticky-retry attempt of `model` as used. Has no effect when the
   * model is not in the sticky-retry state.
   *
   * @param model Model whose attempt was used.
   */
  consumeStickyAttempt(model: ModelId): void {
    const state = this.health.get(model);
    if (state?.status === 'sticky_retry') {
      this.setState(model, { ...state, consumed: true });
    }
  }

  /**
   * Reports whether `model` can currently be used.
   *
   * @param model Model to inspect.
   * @returns `{ available: true }` for unknown models and for sticky models
   *   whose attempt has not been consumed; otherwise `available: false`
   *   together with the recorded reason.
   */
  snapshot(model: ModelId): ModelAvailabilitySnapshot {
    const state = this.health.get(model);

    if (state === undefined) {
      return { available: true };
    }

    const blocked = state.status === 'terminal' || state.consumed;
    return blocked
      ? { available: false, reason: state.reason }
      : { available: true };
  }

  /**
   * Picks the first available model from `models`, in order.
   *
   * @param models Candidates in priority order; the array is not modified.
   * @returns The selection, including every candidate skipped before it.
   *   `attempts` is present (and equal to `1`) only when the selected model is
   *   in the sticky-retry state; it is omitted otherwise.
   */
  selectFirstAvailable(models: readonly ModelId[]): ModelSelectionResult {
    const skipped: ModelSelectionResult['skipped'] = [];

    for (const model of models) {
      const { available, reason } = this.snapshot(model);

      if (!available) {
        skipped.push({ model, reason: reason ?? 'unknown' });
        continue;
      }

      // A sticky model is being attempted, so note that. The key is left out
      // entirely for healthy models rather than being set to `undefined`, to
      // match the shape of the no-candidate result below.
      if (this.health.get(model)?.status === 'sticky_retry') {
        return { selected: model, attempts: 1, skipped };
      }
      return { selected: model, skipped };
    }

    return { selected: null, skipped };
  }

  /**
   * Starts a new turn: every sticky-retry model gets its attempt back.
   * Terminal states are not affected.
   */
  resetTurn(): void {
    // Only existing keys are overwritten, so the map does not grow while it
    // is being iterated.
    for (const [model, state] of this.health.entries()) {
      if (state.status === 'sticky_retry') {
        this.setState(model, { ...state, consumed: false });
      }
    }
  }

  /** Stores `nextState` as the health record of `model`. */
  private setState(model: ModelId, nextState: HealthState): void {
    this.health.set(model, nextState);
  }

  /** Removes the health record of `model`, if any. */
  private clearState(model: ModelId): void {
    this.health.delete(model);
  }
}