From 2468f56922edc251a508df2e675674e7fd5b7f9f Mon Sep 17 00:00:00 2001 From: Christian Gunderman Date: Wed, 28 Jan 2026 21:27:48 -0800 Subject: [PATCH] Update debug command. --- .gemini/commands/fix-behavioral-eval.toml | 4 +++- evals/frugalReads.eval.ts | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.gemini/commands/fix-behavioral-eval.toml b/.gemini/commands/fix-behavioral-eval.toml index 36e39706d0..4c091f1804 100644 --- a/.gemini/commands/fix-behavioral-eval.toml +++ b/.gemini/commands/fix-behavioral-eval.toml @@ -27,7 +27,9 @@ You are an expert at fixing behavioral evaluations. - Your primary mechanism for improving the agent's behavior is to make changes to tool instructions, prompt.ts, and/or modules that contribute to the prompt. - If prompt and description changes are unsuccessful, use logs and debugging to - confirm that everything is working as expected. + confirm that everything is working as expected. You can try some of the following. + - **Interactive Prompts**: Commands like `npx` may hang waiting for user confirmation to install a package. Prefer `npx --yes <package>`. + - **Missing package.json**: Some tools (like `eslint`) require a `package.json` to be present in the working directory or a parent. - If unable to fix the test, you can make recommendations for architecture changes that might help stablize the test. Be sure to THINK DEEPLY if offering architecture guidance. Some facts that might help with this are: diff --git a/evals/frugalReads.eval.ts b/evals/frugalReads.eval.ts index 81b643b481..0474030502 100644 --- a/evals/frugalReads.eval.ts +++ b/evals/frugalReads.eval.ts @@ -44,7 +44,7 @@ describe('Frugal reads eval', () => { }, prompt: 'Fix all linter errors in linter_mess.ts manually by editing the file. Run eslint directly (using "npx --yes eslint") to find them. Do not run the file.', - assert: async (rig, result) => { + assert: async (rig) => { const logs = rig.readToolLogs(); // Check if the agent read the whole file