diff --git a/evals/frugalReads.eval.ts b/evals/frugalReads.eval.ts index 55a73f85e2..47578039a6 100644 --- a/evals/frugalReads.eval.ts +++ b/evals/frugalReads.eval.ts @@ -78,22 +78,23 @@ describe('Frugal reads eval', () => { ).toBe(true); let totalLinesRead = 0; - const readRanges: { offset: number; limit: number }[] = []; + const readRanges: { start_line: number; end_line: number }[] = []; for (const call of targetFileReads) { const args = JSON.parse(call.toolRequest.args); expect( - args.limit, - 'Agent read the entire file (missing limit) instead of using ranged read', + args.end_line, + 'Agent read the entire file (missing end_line) instead of using ranged read', ).toBeDefined(); - const limit = args.limit; - const offset = args.offset ?? 0; - totalLinesRead += limit; - readRanges.push({ offset, limit }); + const end_line = args.end_line; + const start_line = args.start_line ?? 1; + const linesRead = end_line - start_line + 1; + totalLinesRead += linesRead; + readRanges.push({ start_line, end_line }); - expect(args.limit, 'Agent read too many lines at once').toBeLessThan( + expect(linesRead, 'Agent read too many lines at once').toBeLessThan( 1001, ); } @@ -108,7 +109,7 @@ describe('Frugal reads eval', () => { const errorLines = [500, 510, 520]; for (const line of errorLines) { const covered = readRanges.some( - (range) => line >= range.offset && line < range.offset + range.limit, + (range) => line >= range.start_line && line <= range.end_line, ); expect(covered, `Agent should have read around line ${line}`).toBe( true, @@ -191,8 +192,8 @@ describe('Frugal reads eval', () => { for (const call of targetFileReads) { const args = JSON.parse(call.toolRequest.args); expect( - args.limit, - 'Agent should have used ranged read (limit) to save tokens', + args.end_line, + 'Agent should have used ranged read (end_line) to save tokens', ).toBeDefined(); } }, @@ -253,7 +254,7 @@ describe('Frugal reads eval', () => { // and just read the whole file to be efficient with tool calls. const readEntireFile = targetFileReads.some((call) => { const args = JSON.parse(call.toolRequest.args); - return args.limit === undefined; + return args.end_line === undefined; }); expect( diff --git a/evals/frugalSearch.eval.ts b/evals/frugalSearch.eval.ts index 8805a6a8ed..1c49fc2ed4 100644 --- a/evals/frugalSearch.eval.ts +++ b/evals/frugalSearch.eval.ts @@ -68,7 +68,7 @@ describe('Frugal Search', () => { const args = getParams(call); return ( args.file_path === 'src/legacy_processor.ts' && - (args.limit === undefined || args.limit === null) + (args.end_line === undefined || args.end_line === null) ); }); @@ -87,7 +87,7 @@ describe('Frugal Search', () => { if ( call.toolRequest.name === 'read_file' && args.file_path === 'src/legacy_processor.ts' && - args.limit !== undefined + args.end_line !== undefined ) { return true; } diff --git a/evals/interactive-hang.eval.ts b/evals/interactive-hang.eval.ts index 43b49759bb..0cf56acf98 100644 --- a/evals/interactive-hang.eval.ts +++ b/evals/interactive-hang.eval.ts @@ -56,7 +56,7 @@ describe('interactive_commands', () => { const scaffoldCall = logs.find( (l) => l.toolRequest.name === 'run_shell_command' && - /npm (init|create)|npx create-|yarn create|pnpm create/.test( + /npm (init|create)|npx (.*)?create-|yarn create|pnpm create/.test( l.toolRequest.args, ), );