/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { Content , SchemaUnion , Type } from '@google/genai' ;
import { DEFAULT_GEMINI_FLASH_MODEL } from '../config/models.js' ;
import { GeminiClient } from '../core/client.js' ;
import { GeminiChat } from '../core/geminiChat.js' ;
import { isFunctionResponse } from './messageInspectors.js' ;
const CHECK_PROMPT = ` Analyze *only* the content and structure of your immediately preceding response (your last turn in the conversation history). Based *strictly* on that response, determine who should logically speak next: the 'user' or the 'model' (you).
**Decision Rules (apply in order):**
1. **Model Continues:** If your last response explicitly states an immediate next action *you* intend to take (e.g., "Next, I will...", "Now I'll process...", "Moving on to analyze...", indicates an intended tool call that didn't execute), OR if the response seems clearly incomplete (cut off mid-thought without a natural conclusion), then the **'model'** should speak next.
2. **Question to User:** If your last response ends with a direct question specifically addressed *to the user*, then the **'user'** should speak next.
2025-07-31 18:17:52 -07:00
3. **Waiting for User:** If your last response completed a thought, statement, or task *and* does not meet the criteria for Rule 1 (Model Continues) or Rule 2 (Question to User), it implies a pause expecting user input or reaction. In this case, the **'user'** should speak next. ` ;
2025-05-10 13:11:03 -07:00
/**
 * Structured-output schema for the next-speaker check: the model must return
 * a brief justification plus its choice of 'user' or 'model'.
 */
const RESPONSE_SCHEMA: SchemaUnion = {
  type: Type.OBJECT,
  properties: {
    reasoning: {
      type: Type.STRING,
      description:
        "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn.",
    },
    next_speaker: {
      type: Type.STRING,
      enum: ['user', 'model'],
      description:
        'Who should speak next based *only* on the preceding turn and the decision rules',
    },
  },
  required: ['reasoning', 'next_speaker'],
};
/** Parsed result of the next-speaker check returned by the model. */
export interface NextSpeakerResponse {
  /** Model-provided justification for the choice, per the decision rules. */
  reasoning: string;
  /** Which party should produce the next conversation turn. */
  next_speaker: 'user' | 'model';
}
export async function checkNextSpeaker (
2025-05-26 14:17:56 -07:00
chat : GeminiChat ,
2025-05-10 13:11:03 -07:00
geminiClient : GeminiClient ,
2025-05-27 23:40:25 -07:00
abortSignal : AbortSignal ,
2025-05-10 13:11:03 -07:00
) : Promise < NextSpeakerResponse | null > {
2025-05-11 12:59:44 -07:00
// We need to capture the curated history because there are many moments when the model will return invalid turns
// that when passed back up to the endpoint will break subsequent calls. An example of this is when the model decides
// to respond with an empty part collection if you were to send that message back to the server it will respond with
// a 400 indicating that model part collections MUST have content.
2025-05-26 14:39:29 -07:00
const curatedHistory = chat . getHistory ( /* curated */ true ) ;
2025-05-11 12:59:44 -07:00
2025-05-10 13:11:03 -07:00
// Ensure there's a model response to analyze
2025-05-26 14:39:29 -07:00
if ( curatedHistory . length === 0 ) {
// Cannot determine next speaker if history is empty.
return null ;
}
const comprehensiveHistory = chat . getHistory ( ) ;
// If comprehensiveHistory is empty, there is no last message to check.
// This case should ideally be caught by the curatedHistory.length check earlier,
// but as a safeguard:
if ( comprehensiveHistory . length === 0 ) {
return null ;
}
const lastComprehensiveMessage =
comprehensiveHistory [ comprehensiveHistory . length - 1 ] ;
// If the last message is a user message containing only function_responses,
// then the model should speak next.
if (
lastComprehensiveMessage &&
isFunctionResponse ( lastComprehensiveMessage )
) {
return {
reasoning :
'The last message was a function response, so the model should speak next.' ,
next_speaker : 'model' ,
} ;
}
if (
lastComprehensiveMessage &&
lastComprehensiveMessage . role === 'model' &&
lastComprehensiveMessage . parts &&
lastComprehensiveMessage . parts . length === 0
) {
lastComprehensiveMessage . parts . push ( { text : '' } ) ;
return {
reasoning :
'The last message was a filler model message with no content (nothing for user to act on), model should speak next.' ,
next_speaker : 'model' ,
} ;
}
2025-07-05 17:23:39 +02:00
// Things checked out. Let's proceed to potentially making an LLM request.
2025-05-26 14:39:29 -07:00
const lastMessage = curatedHistory [ curatedHistory . length - 1 ] ;
if ( ! lastMessage || lastMessage . role !== 'model' ) {
2025-05-10 13:11:03 -07:00
// Cannot determine next speaker if the last turn wasn't from the model
// or if history is empty.
return null ;
}
const contents : Content [ ] = [
2025-05-26 14:39:29 -07:00
. . . curatedHistory ,
2025-05-10 13:11:03 -07:00
{ role : 'user' , parts : [ { text : CHECK_PROMPT } ] } ,
] ;
try {
const parsedResponse = ( await geminiClient . generateJson (
contents ,
RESPONSE_SCHEMA ,
2025-05-27 23:40:25 -07:00
abortSignal ,
2025-08-07 09:18:53 -07:00
DEFAULT_GEMINI_FLASH_MODEL ,
2025-05-10 13:11:03 -07:00
) ) as unknown as NextSpeakerResponse ;
if (
parsedResponse &&
parsedResponse . next_speaker &&
[ 'user' , 'model' ] . includes ( parsedResponse . next_speaker )
) {
return parsedResponse ;
}
return null ;
} catch ( error ) {
console . warn (
2025-05-22 06:00:36 +00:00
'Failed to talk to Gemini endpoint when seeing if conversation should continue.' ,
2025-05-10 13:11:03 -07:00
error ,
) ;
return null ;
}
}