feat(core): add modelAvailabilityService for managing and tracking model health (#13426)

This commit is contained in:
Adam Weidman
2025-11-21 11:54:16 -05:00
committed by GitHub
parent 030a5ace97
commit aeffa2a460
2 changed files with 296 additions and 0 deletions

View File

@@ -0,0 +1,165 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect, it, vi, beforeEach } from 'vitest';
import { ModelAvailabilityService } from './modelAvailabilityService.js';
// Unit tests for ModelAvailabilityService. Each case drives the public
// mark*/consume*/reset* methods and then checks the result of snapshot() or
// selectFirstAvailable().
describe('ModelAvailabilityService', () => {
let service: ModelAvailabilityService;
const model = 'test-model';
beforeEach(() => {
// A fresh instance per test keeps health state from leaking between cases.
service = new ModelAvailabilityService();
// No test in this suite installs fake timers, so this is only a defensive reset.
vi.useRealTimers();
});
// A model that was never marked has no recorded state and counts as available.
it('returns available snapshot when no state recorded', () => {
expect(service.snapshot(model)).toEqual({ available: true });
});
// Marking alone leaves the model available; it only becomes unavailable once
// the single attempt is consumed, and resetTurn() hands the attempt back.
it('tracks retry-once-per-turn failures', () => {
service.markRetryOncePerTurn(model);
expect(service.snapshot(model)).toEqual({ available: true });
service.consumeStickyAttempt(model);
expect(service.snapshot(model)).toEqual({
available: false,
reason: 'retry_once_per_turn',
});
service.resetTurn();
expect(service.snapshot(model)).toEqual({ available: true });
});
// Terminal marks make the model unavailable immediately, with the given reason.
it('tracks terminal failures', () => {
service.markTerminal(model, 'quota');
expect(service.snapshot(model)).toEqual({
available: false,
reason: 'quota',
});
});
// markRetryOncePerTurn() is a no-op on a model that is already terminal.
it('does not override terminal failure with sticky failure', () => {
service.markTerminal(model, 'quota');
service.markRetryOncePerTurn(model);
expect(service.snapshot(model)).toEqual({
available: false,
reason: 'quota',
});
});
// Walks one candidate list through three phases: sticky attempt still
// available, sticky attempt consumed, and after resetTurn().
it('selects models respecting terminal and sticky states', () => {
const stickyModel = 'stick-model';
const healthyModel = 'healthy-model';
service.markTerminal(model, 'capacity');
service.markRetryOncePerTurn(stickyModel);
// Phase 1: the terminal model is skipped and the unconsumed sticky model is
// selected, flagged with attempts: 1.
const first = service.selectFirstAvailable([
model,
stickyModel,
healthyModel,
]);
expect(first).toEqual({
selected: stickyModel,
attempts: 1,
skipped: [
{
model,
reason: 'capacity',
},
],
});
// Phase 2: once the sticky attempt is consumed, selection falls through to
// the healthy model. The service returns `attempts: undefined` for a model
// with no sticky state; toEqual ignores undefined-valued properties, so the
// key is not listed in the expected object.
service.consumeStickyAttempt(stickyModel);
const second = service.selectFirstAvailable([
model,
stickyModel,
healthyModel,
]);
expect(second).toEqual({
selected: healthyModel,
skipped: [
{
model,
reason: 'capacity',
},
{
model: stickyModel,
reason: 'retry_once_per_turn',
},
],
});
// Phase 3: resetTurn() restores the sticky attempt but leaves the terminal
// model skipped, so the outcome matches phase 1.
service.resetTurn();
const third = service.selectFirstAvailable([
model,
stickyModel,
healthyModel,
]);
expect(third).toEqual({
selected: stickyModel,
attempts: 1,
skipped: [
{
model,
reason: 'capacity',
},
],
});
});
// Re-marking within the same turn must not hand back an already used attempt.
it('preserves consumed state when marking retry-once-per-turn again', () => {
service.markRetryOncePerTurn(model);
service.consumeStickyAttempt(model);
// It is currently consumed
expect(service.snapshot(model).available).toBe(false);
// Marking it again should not reset the consumed flag
service.markRetryOncePerTurn(model);
expect(service.snapshot(model).available).toBe(false);
});
// markHealthy() deletes the recorded state entirely, so a later sticky mark
// starts from an unconsumed attempt.
it('clears consumed state when marked healthy', () => {
service.markRetryOncePerTurn(model);
service.consumeStickyAttempt(model);
expect(service.snapshot(model).available).toBe(false);
service.markHealthy(model);
expect(service.snapshot(model).available).toBe(true);
// If we mark it sticky again, it should be fresh (not consumed)
service.markRetryOncePerTurn(model);
expect(service.snapshot(model).available).toBe(true);
});
// resetTurn() applies to every sticky model, not only the first one found.
it('resetTurn resets consumed state for multiple sticky models', () => {
const model2 = 'model-2';
service.markRetryOncePerTurn(model);
service.markRetryOncePerTurn(model2);
service.consumeStickyAttempt(model);
service.consumeStickyAttempt(model2);
expect(service.snapshot(model).available).toBe(false);
expect(service.snapshot(model2).available).toBe(false);
service.resetTurn();
expect(service.snapshot(model).available).toBe(true);
expect(service.snapshot(model2).available).toBe(true);
});
// Terminal state survives a turn reset.
it('resetTurn does not affect terminal models', () => {
service.markTerminal(model, 'quota');
service.resetTurn();
expect(service.snapshot(model)).toEqual({
available: false,
reason: 'quota',
});
});
});

View File

@@ -0,0 +1,131 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/** Identifier used to key a model's health state. */
export type ModelId = string;

/**
 * Reasons recorded by `markTerminal()`. A terminal state is not cleared by
 * `resetTurn()` and is not replaced by `markRetryOncePerTurn()`.
 */
export type TerminalUnavailabilityReason = 'quota' | 'capacity';

/** Reason recorded by `markRetryOncePerTurn()`; it is scoped to one turn. */
export type TurnUnavailabilityReason = 'retry_once_per_turn';

/**
 * Every reason that can be reported for an unavailable model. `'unknown'` is
 * the fallback used by `selectFirstAvailable()` when a snapshot is unavailable
 * but carries no reason.
 */
export type UnavailabilityReason =
  | TerminalUnavailabilityReason
  | TurnUnavailabilityReason
  | 'unknown';

/**
 * Internal per-model record. A model with no record is healthy.
 *
 * - `terminal`: unavailable until the record is removed or overwritten.
 * - `sticky_retry`: available while `consumed` is false, unavailable once the
 *   single attempt has been consumed.
 */
type HealthState =
  | { status: 'terminal'; reason: TerminalUnavailabilityReason }
  | {
      status: 'sticky_retry';
      reason: TurnUnavailabilityReason;
      consumed: boolean;
    };

/** Point-in-time availability of a single model. */
export interface ModelAvailabilitySnapshot {
  available: boolean;
  /** Set only when `available` is false. */
  reason?: UnavailabilityReason;
}

/** Outcome of scanning a candidate list with `selectFirstAvailable()`. */
export interface ModelSelectionResult {
  /** First available candidate, or `null` when every candidate was skipped. */
  selected: ModelId | null;
  /** `1` when the selected model is in the sticky-retry state; otherwise not set. */
  attempts?: number;
  /** Unavailable candidates that preceded the selection, in input order. */
  skipped: Array<{
    model: ModelId;
    reason: UnavailabilityReason;
  }>;
}

/**
 * In-memory tracker of per-model health, keyed by model id.
 *
 * Models start out healthy. Callers record failures with `markTerminal()` or
 * `markRetryOncePerTurn()`, clear them with `markHealthy()`, and call
 * `resetTurn()` to give sticky-retry models their single attempt back.
 */
export class ModelAvailabilityService {
  private readonly health = new Map<ModelId, HealthState>();

  /**
   * Marks `model` as unavailable for `reason`, replacing any previous state.
   *
   * @param model Model to mark.
   * @param reason Why the model is unavailable.
   */
  markTerminal(model: ModelId, reason: TerminalUnavailabilityReason): void {
    this.setState(model, { status: 'terminal', reason });
  }

  /**
   * Removes any recorded state for `model`, making it available again.
   *
   * @param model Model to clear.
   */
  markHealthy(model: ModelId): void {
    this.clearState(model);
  }

  /**
   * Puts `model` into the sticky-retry state: it stays available until
   * `consumeStickyAttempt()` is called and becomes available again after
   * `resetTurn()`.
   *
   * Does nothing when the model is already terminal.
   *
   * @param model Model to mark.
   */
  markRetryOncePerTurn(model: ModelId): void {
    const current = this.health.get(model);

    // Do not override a terminal failure with a transient one.
    if (current?.status === 'terminal') {
      return;
    }

    // Keep an existing `consumed` flag: a model that fails repeatedly within
    // one turn must not get a fresh attempt each time it is re-marked.
    const consumed =
      current?.status === 'sticky_retry' ? current.consumed : false;

    this.setState(model, {
      status: 'sticky_retry',
      reason: 'retry_once_per_turn',
      consumed,
    });
  }

  /**
   * Records that the single sticky-retry attempt for `model` has been used.
   * Does nothing unless the model is currently in the sticky-retry state.
   *
   * @param model Model whose attempt was used.
   */
  consumeStickyAttempt(model: ModelId): void {
    const state = this.health.get(model);
    if (state?.status === 'sticky_retry') {
      this.setState(model, { ...state, consumed: true });
    }
  }

  /**
   * Reports whether `model` may currently be used.
   *
   * @param model Model to inspect.
   * @returns `{ available: true }` for an untracked model or an unconsumed
   *   sticky-retry model; otherwise `{ available: false, reason }`.
   */
  snapshot(model: ModelId): ModelAvailabilitySnapshot {
    const state = this.health.get(model);
    if (!state) {
      return { available: true };
    }

    const blocked =
      state.status === 'terminal' ||
      (state.status === 'sticky_retry' && state.consumed);

    return blocked
      ? { available: false, reason: state.reason }
      : { available: true };
  }

  /**
   * Returns the first available model from `models`, together with the
   * unavailable models that were passed over before it.
   *
   * The list is only read, so readonly arrays and `as const` tuples are
   * accepted.
   *
   * @param models Candidates in priority order.
   * @returns The selection result; `selected` is `null` when no candidate is
   *   available (including for an empty list).
   */
  selectFirstAvailable(models: readonly ModelId[]): ModelSelectionResult {
    const skipped: ModelSelectionResult['skipped'] = [];

    for (const model of models) {
      const snapshot = this.snapshot(model);
      if (!snapshot.available) {
        skipped.push({ model, reason: snapshot.reason ?? 'unknown' });
        continue;
      }

      // A sticky model is being attempted, so note that.
      const isSticky = this.health.get(model)?.status === 'sticky_retry';
      const attempts = isSticky ? 1 : undefined;
      return { selected: model, skipped, attempts };
    }

    return { selected: null, skipped };
  }

  /**
   * Starts a new turn: every sticky-retry model gets its attempt back.
   * Terminal models are left untouched.
   */
  resetTurn(): void {
    // Only existing keys are overwritten, so no entries are added while the
    // map is being iterated.
    for (const [model, state] of this.health.entries()) {
      if (state.status === 'sticky_retry') {
        this.setState(model, { ...state, consumed: false });
      }
    }
  }

  /** Stores `nextState` as the current state of `model`. */
  private setState(model: ModelId, nextState: HealthState): void {
    this.health.set(model, nextState);
  }

  /** Deletes any stored state for `model`. */
  private clearState(model: ModelId): void {
    this.health.delete(model);
  }
}