Use ranged reads and limited searches and fuzzy editing improvements (#19240)

2026-04-27 05:24:34 -07:00 · 2026-02-17 23:54:08 +00:00
parent 55c628e967
commit ce84b3cb5f
9 changed files with 1174 additions and 145 deletions
@@ -373,6 +373,182 @@ describe('EditTool', () => {
      expect(result.occurrences).toBe(1);
    });

+    it('should perform a fuzzy replacement when exact match fails but similarity is high', async () => {
+      const content =
+        'const myConfig = {\n  enableFeature: true,\n  retries: 3\n};';
+      // Typo: missing comma after true
+      const oldString =
+        'const myConfig = {\n  enableFeature: true\n  retries: 3\n};';
+      const newString =
+        'const myConfig = {\n  enableFeature: false,\n  retries: 5\n};';
+
+      const result = await calculateReplacement(mockConfig, {
+        params: {
+          file_path: 'config.ts',
+          instruction: 'update config',
+          old_string: oldString,
+          new_string: newString,
+        },
+        currentContent: content,
+        abortSignal,
+      });
+
+      expect(result.occurrences).toBe(1);
+      expect(result.newContent).toBe(newString);
+    });
+
+    it('should NOT perform a fuzzy replacement when similarity is below threshold', async () => {
+      const content =
+        'const myConfig = {\n  enableFeature: true,\n  retries: 3\n};';
+      // Completely different string
+      const oldString = 'function somethingElse() {\n  return false;\n}';
+      const newString =
+        'const myConfig = {\n  enableFeature: false,\n  retries: 5\n};';
+
+      const result = await calculateReplacement(mockConfig, {
+        params: {
+          file_path: 'config.ts',
+          instruction: 'update config',
+          old_string: oldString,
+          new_string: newString,
+        },
+        currentContent: content,
+        abortSignal,
+      });
+
+      expect(result.occurrences).toBe(0);
+      expect(result.newContent).toBe(content);
+    });
+
+    it('should NOT perform a fuzzy replacement when the complexity (length * size) is too high', async () => {
+      // 2000 chars
+      const longString = 'a'.repeat(2000);
+
+      // Create a file with enough lines to trigger the complexity limit
+      // Complexity = Lines * Length^2
+      // Threshold = 500,000,000
+      // 2000^2 = 4,000,000.
+      // Need > 125 lines. Let's use 200 lines.
+      const lines = Array(200).fill(longString);
+      const content = lines.join('\n');
+
+      // Mismatch at the end (making it a fuzzy match candidate)
+      const oldString = longString + 'c';
+      const newString = 'replacement';
+
+      const result = await calculateReplacement(mockConfig, {
+        params: {
+          file_path: 'test.ts',
+          instruction: 'update',
+          old_string: oldString,
+          new_string: newString,
+        },
+        currentContent: content,
+        abortSignal,
+      });
+
+      // Should return 0 occurrences because fuzzy match is skipped
+      expect(result.occurrences).toBe(0);
+      expect(result.newContent).toBe(content);
+    });
+
+    it('should perform multiple fuzzy replacements if multiple valid matches are found', async () => {
+      const content = `
+function doIt() {
+  console.log("hello");
+}
+
+function doIt() {
+  console.log("hello");
+}
+`;
+      // old_string uses single quotes, file uses double.
+      // This is a fuzzy match (quote difference).
+      const oldString = `
+function doIt() {
+  console.log('hello');
+}
+`.trim();
+
+      const newString = `
+function doIt() {
+  console.log("bye");
+}
+`.trim();
+
+      const result = await calculateReplacement(mockConfig, {
+        params: {
+          file_path: 'test.ts',
+          instruction: 'update',
+          old_string: oldString,
+          new_string: newString,
+        },
+        currentContent: content,
+        abortSignal,
+      });
+
+      expect(result.occurrences).toBe(2);
+      const expectedContent = `
+function doIt() {
+  console.log("bye");
+}
+
+function doIt() {
+  console.log("bye");
+}
+`;
+      expect(result.newContent).toBe(expectedContent);
+    });
+
+    it('should correctly rebase indentation in flexible replacement without double-indenting', async () => {
+      const content = '    if (a) {\n        foo();\n    }\n';
+      // old_string and new_string are unindented. They should be rebased to 4-space.
+      const oldString = 'if (a) {\n    foo();\n}';
+      const newString = 'if (a) {\n    bar();\n}';
+
+      const result = await calculateReplacement(mockConfig, {
+        params: {
+          file_path: 'test.ts',
+          old_string: oldString,
+          new_string: newString,
+        },
+        currentContent: content,
+        abortSignal,
+      });
+
+      expect(result.occurrences).toBe(1);
+      // foo() was at 8 spaces (4 base + 4 indent).
+      // newString has bar() at 4 spaces (0 base + 4 indent).
+      // Rebased to 4 base, it should be 4 + 4 = 8 spaces.
+      const expectedContent = '    if (a) {\n        bar();\n    }\n';
+      expect(result.newContent).toBe(expectedContent);
+    });
+
+    it('should correctly rebase indentation in fuzzy replacement without double-indenting', async () => {
+      const content =
+        '    const myConfig = {\n      enableFeature: true,\n      retries: 3\n    };';
+      // Typo: missing comma. old_string/new_string are unindented.
+      const fuzzyOld =
+        'const myConfig = {\n  enableFeature: true\n  retries: 3\n};';
+      const fuzzyNew =
+        'const myConfig = {\n  enableFeature: false,\n  retries: 5\n};';
+
+      const result = await calculateReplacement(mockConfig, {
+        params: {
+          file_path: 'test.ts',
+          old_string: fuzzyOld,
+          new_string: fuzzyNew,
+        },
+        currentContent: content,
+        abortSignal,
+      });
+
+      expect(result.strategy).toBe('fuzzy');
+      const expectedContent =
+        '    const myConfig = {\n      enableFeature: false,\n      retries: 5\n    };';
+      expect(result.newContent).toBe(expectedContent);
+    });
+
    it('should NOT insert extra newlines when replacing a block preceded by a blank line (regression)', async () => {
      const content = '\n  function oldFunc() {\n    // some code\n  }';
      const result = await calculateReplacement(mockConfig, {
@@ -49,8 +49,13 @@ import {
  EDIT_DISPLAY_NAME,
 } from './tool-names.js';
 import { debugLogger } from '../utils/debugLogger.js';
+import levenshtein from 'fast-levenshtein';
 import { EDIT_DEFINITION } from './definitions/coreTools.js';
 import { resolveToolDeclaration } from './definitions/resolver.js';
+
+const ENABLE_FUZZY_MATCH_RECOVERY = true;
+const FUZZY_MATCH_THRESHOLD = 0.1; // Allow up to 10% weighted difference
+const WHITESPACE_PENALTY_FACTOR = 0.1; // Whitespace differences cost 10% of a character difference
 interface ReplacementContext {
  params: EditToolParams;
  currentContent: string;
@@ -62,6 +67,8 @@ interface ReplacementResult {
  occurrences: number;
  finalOldString: string;
  finalNewString: string;
+  strategy?: 'exact' | 'flexible' | 'regex' | 'fuzzy';
+  matchRanges?: Array<{ start: number; end: number }>;
 }

 export function applyReplacement(
@@ -176,9 +183,7 @@ async function calculateFlexibleReplacement(
      const firstLineInMatch = window[0];
      const indentationMatch = firstLineInMatch.match(/^([ \t]*)/);
      const indentation = indentationMatch ? indentationMatch[1] : '';
-      const newBlockWithIndent = replaceLines.map(
-        (line: string) => `${indentation}${line}`,
-      );
+      const newBlockWithIndent = applyIndentation(replaceLines, indentation);
      sourceLines.splice(
        i,
        searchLinesStripped.length,
@@ -247,9 +252,7 @@ async function calculateRegexReplacement(

  const indentation = match[1] || '';
  const newLines = normalizedReplace.split('\n');
-  const newBlockWithIndent = newLines
-    .map((line) => `${indentation}${line}`)
-    .join('\n');
+  const newBlockWithIndent = applyIndentation(newLines, indentation).join('\n');

  // Use replace with the regex to substitute the matched content.
  // Since the regex doesn't have the 'g' flag, it will only replace the first occurrence.
@@ -305,6 +308,14 @@ export async function calculateReplacement(
    return regexResult;
  }

+  let fuzzyResult;
+  if (
+    ENABLE_FUZZY_MATCH_RECOVERY &&
+    (fuzzyResult = await calculateFuzzyReplacement(config, context))
+  ) {
+    return fuzzyResult;
+  }
+
  return {
    newContent: currentContent,
    occurrences: 0,
@@ -395,6 +406,8 @@ interface CalculatedEdit {
  error?: { display: string; raw: string; type: ToolErrorType };
  isNewFile: boolean;
  originalLineEnding: '\r\n' | '\n';
+  strategy?: 'exact' | 'flexible' | 'regex' | 'fuzzy';
+  matchRanges?: Array<{ start: number; end: number }>;
 }

 class EditToolInvocation
@@ -520,6 +533,8 @@ class EditToolInvocation
      isNewFile: false,
      error: undefined,
      originalLineEnding,
+      strategy: secondAttemptResult.strategy,
+      matchRanges: secondAttemptResult.matchRanges,
    };
  }

@@ -633,6 +648,8 @@ class EditToolInvocation
        isNewFile: false,
        error: undefined,
        originalLineEnding,
+        strategy: replacementResult.strategy,
+        matchRanges: replacementResult.matchRanges,
      };
    }

@@ -859,6 +876,10 @@ class EditToolInvocation
          ? `Created new file: ${this.params.file_path} with provided content.`
          : `Successfully modified file: ${this.params.file_path} (${editData.occurrences} replacements).`,
      ];
+      const fuzzyFeedback = getFuzzyMatchFeedback(editData);
+      if (fuzzyFeedback) {
+        llmSuccessMessageParts.push(fuzzyFeedback);
+      }
      if (this.params.modified_by_user) {
        llmSuccessMessageParts.push(
          `User modified the \`new_string\` content to be: ${this.params.new_string}.`,
@@ -1011,3 +1032,188 @@ export class EditTool
    };
  }
 }
+
+function stripWhitespace(str: string): string {
+  return str.replace(/\s/g, '');
+}
+
+/**
+ * Applies the target indentation to the lines, while preserving relative indentation.
+ * It identifies the common indentation of the provided lines and replaces it with the target indentation.
+ */
+function applyIndentation(
+  lines: string[],
+  targetIndentation: string,
+): string[] {
+  if (lines.length === 0) return [];
+
+  // Use the first line as the reference for indentation, even if it's empty/whitespace.
+  // This is because flexible/fuzzy matching identifies the indentation of the START of the match.
+  const referenceLine = lines[0];
+  const refIndentMatch = referenceLine.match(/^([ \t]*)/);
+  const refIndent = refIndentMatch ? refIndentMatch[1] : '';
+
+  return lines.map((line) => {
+    if (line.trim() === '') {
+      return '';
+    }
+    if (line.startsWith(refIndent)) {
+      return targetIndentation + line.slice(refIndent.length);
+    }
+    return targetIndentation + line.trimStart();
+  });
+}
+
+function getFuzzyMatchFeedback(editData: CalculatedEdit): string | null {
+  if (
+    editData.strategy === 'fuzzy' &&
+    editData.matchRanges &&
+    editData.matchRanges.length > 0
+  ) {
+    const ranges = editData.matchRanges
+      .map((r) => (r.start === r.end ? `${r.start}` : `${r.start}-${r.end}`))
+      .join(', ');
+    return `Applied fuzzy match at line${editData.matchRanges.length > 1 ? 's' : ''} ${ranges}.`;
+  }
+  return null;
+}
+
+async function calculateFuzzyReplacement(
+  config: Config,
+  context: ReplacementContext,
+): Promise<ReplacementResult | null> {
+  const { currentContent, params } = context;
+  const { old_string, new_string } = params;
+
+  // Pre-check: Don't fuzzy match very short strings to avoid false positives
+  if (old_string.length < 10) {
+    return null;
+  }
+
+  const normalizedCode = currentContent.replace(/\r\n/g, '\n');
+  const normalizedSearch = old_string.replace(/\r\n/g, '\n');
+  const normalizedReplace = new_string.replace(/\r\n/g, '\n');
+
+  const sourceLines = normalizedCode.match(/.*(?:\n|$)/g)?.slice(0, -1) ?? [];
+  const searchLines = normalizedSearch
+    .match(/.*(?:\n|$)/g)
+    ?.slice(0, -1)
+    .map((l) => l.trimEnd()); // Trim end of search lines to be more robust
+
+  // Limit the scope of the fuzzy match to reduce impact on responsivesness.
+  // Each comparison takes roughly O(L^2) time.
+  // We perform sourceLines.length comparisons (sliding window).
+  // Total complexity proxy: sourceLines.length * old_string.length^2
+  // Limit to 4e8 for < 1 second.
+  if (sourceLines.length * Math.pow(old_string.length, 2) > 400_000_000) {
+    return null;
+  }
+
+  if (!searchLines || searchLines.length === 0) {
+    return null;
+  }
+
+  const N = searchLines.length;
+  const candidates: Array<{ index: number; score: number }> = [];
+  const searchBlock = searchLines.join('\n');
+
+  // Sliding window
+  for (let i = 0; i <= sourceLines.length - N; i++) {
+    const windowLines = sourceLines.slice(i, i + N);
+    const windowText = windowLines.map((l) => l.trimEnd()).join('\n'); // Normalized join for comparison
+
+    // Length Heuristic Optimization
+    const lengthDiff = Math.abs(windowText.length - searchBlock.length);
+    if (
+      lengthDiff / searchBlock.length >
+      FUZZY_MATCH_THRESHOLD / WHITESPACE_PENALTY_FACTOR
+    ) {
+      continue;
+    }
+
+    // Tiered Scoring
+    const d_raw = levenshtein.get(windowText, searchBlock);
+    const d_norm = levenshtein.get(
+      stripWhitespace(windowText),
+      stripWhitespace(searchBlock),
+    );
+
+    const weightedDist = d_norm + (d_raw - d_norm) * WHITESPACE_PENALTY_FACTOR;
+    const score = weightedDist / searchBlock.length;
+
+    if (score <= FUZZY_MATCH_THRESHOLD) {
+      candidates.push({ index: i, score });
+    }
+  }
+
+  if (candidates.length === 0) {
+    return null;
+  }
+
+  // Select best non-overlapping matches
+  // Sort by score ascending. If scores equal, prefer earlier index (stable sort).
+  candidates.sort((a, b) => a.score - b.score || a.index - b.index);
+
+  const selectedMatches: Array<{ index: number; score: number }> = [];
+  for (const candidate of candidates) {
+    // Check for overlap with already selected matches
+    // Two windows overlap if their start indices are within N lines of each other
+    // (Assuming window size N. Actually overlap is |i - j| < N)
+    const overlaps = selectedMatches.some(
+      (m) => Math.abs(m.index - candidate.index) < N,
+    );
+    if (!overlaps) {
+      selectedMatches.push(candidate);
+    }
+  }
+
+  // If we found matches, apply them
+  if (selectedMatches.length > 0) {
+    const event = new EditStrategyEvent('fuzzy');
+    logEditStrategy(config, event);
+
+    // Calculate match ranges before sorting for replacement
+    // Indices in selectedMatches are 0-based line indices
+    const matchRanges = selectedMatches
+      .map((m) => ({ start: m.index + 1, end: m.index + N }))
+      .sort((a, b) => a.start - b.start);
+
+    // Sort matches by index descending to apply replacements from bottom to top
+    // so that indices remain valid
+    selectedMatches.sort((a, b) => b.index - a.index);
+
+    const newLines = normalizedReplace.split('\n');
+
+    for (const match of selectedMatches) {
+      // If we want to preserve the indentation of the first line of the match:
+      const firstLineMatch = sourceLines[match.index];
+      const indentationMatch = firstLineMatch.match(/^([ \t]*)/);
+      const indentation = indentationMatch ? indentationMatch[1] : '';
+
+      const indentedReplaceLines = applyIndentation(newLines, indentation);
+
+      let replacementText = indentedReplaceLines.join('\n');
+      // If the last line of the match had a newline, preserve it in the replacement
+      // to avoid merging with the next line or losing a blank line separator.
+      if (sourceLines[match.index + N - 1].endsWith('\n')) {
+        replacementText += '\n';
+      }
+
+      sourceLines.splice(match.index, N, replacementText);
+    }
+
+    let modifiedCode = sourceLines.join('');
+    modifiedCode = restoreTrailingNewline(currentContent, modifiedCode);
+
+    return {
+      newContent: modifiedCode,
+      occurrences: selectedMatches.length,
+      finalOldString: normalizedSearch,
+      finalNewString: normalizedReplace,
+      strategy: 'fuzzy',
+      matchRanges,
+    };
+  }
+
+  return null;
+}
@@ -235,8 +235,8 @@ describe('LSTool', () => {

      expect(entries[0]).toBe('[DIR] x-dir');
      expect(entries[1]).toBe('[DIR] y-dir');
-      expect(entries[2]).toBe('a-file.txt');
-      expect(entries[3]).toBe('b-file.txt');
+      expect(entries[2]).toBe('a-file.txt (8 bytes)');
+      expect(entries[3]).toBe('b-file.txt (8 bytes)');
    });

    it('should handle permission errors gracefully', async () => {
@@ -241,7 +241,12 @@ class LSToolInvocation extends BaseToolInvocation<LSToolParams, ToolResult> {

      // Create formatted content for LLM
      const directoryContent = entries
-        .map((entry) => `${entry.isDirectory ? '[DIR] ' : ''}${entry.name}`)
+        .map((entry) => {
+          if (entry.isDirectory) {
+            return `[DIR] ${entry.name}`;
+          }
+          return `${entry.name} (${entry.size} bytes)`;
+        })
        .join('\n');

      let resultMessage = `Directory listing for ${resolvedDirPath}:\n${directoryContent}`;