Files
gemini-cli/packages/core/src/utils/fileUtils.ts

480 lines
16 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import fs from 'node:fs';
import fsPromises from 'node:fs/promises';
import path from 'node:path';
import type { PartUnion } from '@google/genai';
// eslint-disable-next-line import/no-internal-modules
import mime from 'mime/lite';
import type { FileSystemService } from '../services/fileSystemService.js';
import { ToolErrorType } from '../tools/tool-error.js';
import { BINARY_EXTENSIONS } from './ignorePatterns.js';
// Constants for text file processing.
// Default number of lines returned from a text file when the caller passes no explicit limit.
const DEFAULT_MAX_LINES_TEXT_FILE = 2000;
// Lines longer than this many characters are cut and suffixed with '... [truncated]'.
const MAX_LINE_LENGTH_TEXT_FILE = 2000;
// Default text encoding, exported for other modules; nothing in the visible part of this file reads it.
export const DEFAULT_ENCODING: BufferEncoding = 'utf-8';
// --- Unicode BOM detection & decoding helpers --------------------------------
/** Encodings that detectBOM can identify from a leading byte order mark. */
type UnicodeEncoding = 'utf8' | 'utf16le' | 'utf16be' | 'utf32le' | 'utf32be';
/** Result of a successful BOM detection. */
interface BOMInfo {
// Encoding indicated by the BOM.
encoding: UnicodeEncoding;
// Number of leading bytes occupied by the BOM (2, 3 or 4); callers skip this many bytes before decoding.
bomLength: number;
}
/**
 * Known byte order marks, listed in the order they must be tried.
 * The 4-byte UTF-32 LE mark (FF FE 00 00) starts with the 2-byte UTF-16 LE mark
 * (FF FE), so the longer signatures come first.
 */
const BOM_SIGNATURES: ReadonlyArray<{
  encoding: BOMInfo['encoding'];
  mark: readonly number[];
}> = [
  { encoding: 'utf32le', mark: [0xff, 0xfe, 0x00, 0x00] },
  { encoding: 'utf32be', mark: [0x00, 0x00, 0xfe, 0xff] },
  { encoding: 'utf8', mark: [0xef, 0xbb, 0xbf] },
  { encoding: 'utf16le', mark: [0xff, 0xfe] },
  { encoding: 'utf16be', mark: [0xfe, 0xff] },
];

/**
 * Identify a Unicode BOM (Byte Order Mark) at the start of a buffer.
 * Only the first 4 bytes (at most) are inspected.
 * @param buf Bytes from the beginning of a file.
 * @returns The detected encoding and the BOM's byte length, or null when the
 * buffer does not start with a known BOM.
 */
export function detectBOM(buf: Buffer): BOMInfo | null {
  for (const { encoding, mark } of BOM_SIGNATURES) {
    const startsWithMark =
      buf.length >= mark.length &&
      mark.every((expected, position) => buf[position] === expected);
    if (startsWithMark) {
      return { encoding, bomLength: mark.length };
    }
  }
  return null;
}
/**
 * Convert a UTF-16 BE buffer to a JS string by swapping to LE then using Node's decoder.
 * (Node has 'utf16le' but not 'utf16be'.)
 * A dangling trailing byte (odd-length input) is ignored rather than raising an error.
 * The input buffer is not modified.
 * @param buf UTF-16 BE encoded bytes, without a BOM.
 * @returns The decoded string; '' when fewer than 2 bytes are supplied.
 */
function decodeUTF16BE(buf: Buffer): string {
  // swap16() throws ERR_INVALID_BUFFER_SIZE unless the length is a multiple of 2,
  // so only whole 16-bit units are decoded.
  const usableLength = buf.length - (buf.length % 2);
  if (usableLength === 0) return '';
  // swap16 mutates in place, so work on a copy of the usable bytes.
  const swapped = Buffer.from(buf.subarray(0, usableLength));
  swapped.swap16();
  return swapped.toString('utf16le');
}
/**
 * Turn UTF-32 encoded bytes into a JS string.
 * Each complete 4-byte unit is read as an unsigned 32-bit code point; values
 * above U+10FFFF or inside the surrogate range become U+FFFD. Any 1-3 leftover
 * bytes at the end are skipped.
 * @param buf UTF-32 encoded bytes, without a BOM.
 * @param littleEndian True for UTF-32 LE, false for UTF-32 BE.
 * @returns The decoded string; '' when fewer than 4 bytes are supplied.
 */
function decodeUTF32(buf: Buffer, littleEndian: boolean): string {
  const wholeUnits = Math.floor(buf.length / 4);
  const pieces: string[] = [];
  for (let unit = 0; unit < wholeUnits; unit++) {
    const byteOffset = unit * 4;
    const codePoint = littleEndian
      ? buf.readUInt32LE(byteOffset)
      : buf.readUInt32BE(byteOffset);
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    const isUnicodeScalar = codePoint <= 0x10ffff && !isSurrogate;
    pieces.push(isUnicodeScalar ? String.fromCodePoint(codePoint) : '\uFFFD');
  }
  return pieces.join('');
}
/**
 * Load a file and return its contents as a string, using the file's BOM (if
 * any) to choose between UTF-8, UTF-16 LE/BE and UTF-32 LE/BE. The BOM itself
 * is not included in the result. Files without a BOM are decoded as UTF-8.
 * @param filePath Path of the file to read.
 * @returns The decoded text ('' for an empty file).
 */
export async function readFileWithEncoding(filePath: string): Promise<string> {
  // One read; BOM sniffing and decoding both work off this buffer.
  const raw = await fs.promises.readFile(filePath);
  const bom = raw.length > 0 ? detectBOM(raw) : null;
  if (bom === null) {
    // Empty file or no BOM: plain UTF-8 (an empty buffer decodes to '').
    return raw.toString('utf8');
  }

  const { encoding, bomLength } = bom;
  const payload = raw.subarray(bomLength);
  if (encoding === 'utf16le') {
    return payload.toString('utf16le');
  }
  if (encoding === 'utf16be') {
    return decodeUTF16BE(payload);
  }
  if (encoding === 'utf32le' || encoding === 'utf32be') {
    return decodeUTF32(payload, encoding === 'utf32le');
  }
  // 'utf8', plus a defensive fallback for any value outside the known set.
  return payload.toString('utf8');
}
/**
 * Resolve the MIME type for a file path via the `mime` lookup.
 * @param filePath Path to the file.
 * @returns The MIME type string reported by the lookup, or undefined when the
 * lookup does not produce a string.
 */
export function getSpecificMimeType(filePath: string): string | undefined {
  const detected = mime.getType(filePath);
  if (typeof detected !== 'string') {
    return undefined;
  }
  return detected;
}
/**
 * Tell whether a path lies inside (or is equal to) a root directory.
 * Both arguments are passed through path.resolve before comparison.
 * @param pathToCheck The path to test.
 * @param rootDirectory The directory that should contain it.
 * @returns True when the resolved path equals the resolved root or sits below it.
 */
export function isWithinRoot(
  pathToCheck: string,
  rootDirectory: string,
): boolean {
  const candidate = path.resolve(pathToCheck);
  const root = path.resolve(rootDirectory);
  if (candidate === root) {
    return true;
  }
  // Compare against "root + separator" so that e.g. /a/bc is not treated as
  // being inside /a/b. A filesystem root already ends with the separator.
  const rootPrefix = root.endsWith(path.sep) ? root : root + path.sep;
  return candidate.startsWith(rootPrefix);
}
/**
 * Heuristically decide whether a file looks binary by sampling its first 4KB.
 * - Empty files and files starting with a Unicode BOM count as text.
 * - Without a BOM, any NUL byte in the sample means binary.
 * - Otherwise the file is binary when more than 30% of the sampled bytes are
 *   control characters (below 9, or between 14 and 31 inclusive).
 * Failures while opening or reading are logged with console.warn and reported
 * as "not binary".
 * @param filePath Path of the file to inspect.
 * @returns True when the sample looks binary, false otherwise.
 */
export async function isBinaryFile(filePath: string): Promise<boolean> {
  let handle: fs.promises.FileHandle | null = null;
  try {
    handle = await fs.promises.open(filePath, 'r');
    const { size } = await handle.stat();
    if (size === 0) {
      return false; // nothing to inspect; empty is not binary
    }

    // Read at most 4KB from the start of the file.
    const wanted = Math.min(4096, size);
    const sampleBuffer = Buffer.alloc(wanted);
    const { bytesRead } = await handle.read(sampleBuffer, 0, wanted, 0);
    if (bytesRead === 0) {
      return false;
    }

    // UTF-16/32 text is full of NUL bytes, so a BOM short-circuits to "text".
    const head = sampleBuffer.subarray(0, Math.min(4, bytesRead));
    if (detectBOM(head) !== null) {
      return false;
    }

    let controlBytes = 0;
    for (const byte of sampleBuffer.subarray(0, bytesRead)) {
      if (byte === 0) {
        return true; // NUL without a BOM: strong binary signal
      }
      const isControl = byte < 9 || (byte > 13 && byte < 32);
      if (isControl) {
        controlBytes += 1;
      }
    }
    // More than 30% control characters => binary.
    return controlBytes / bytesRead > 0.3;
  } catch (error) {
    console.warn(
      `Failed to check if file is binary: ${filePath}`,
      error instanceof Error ? error.message : String(error),
    );
    return false;
  } finally {
    if (handle) {
      try {
        await handle.close();
      } catch (closeError) {
        console.warn(
          `Failed to close file handle for: ${filePath}`,
          closeError instanceof Error ? closeError.message : String(closeError),
        );
      }
    }
  }
}
/**
 * Classify a file using its extension, its looked-up MIME type and, as a last
 * resort, a content sniff.
 * @param filePath Path to the file.
 * @returns Promise that resolves to 'text', 'image', 'pdf', 'audio', 'video', 'binary' or 'svg'.
 */
export async function detectFileType(
  filePath: string,
): Promise<'text' | 'image' | 'pdf' | 'audio' | 'video' | 'binary' | 'svg'> {
  const extension = path.extname(filePath).toLowerCase();
  switch (extension) {
    // TypeScript extensions can map to MPEG transport stream (a video format)
    // in the MIME lookup; treat them as source text instead.
    case '.ts':
    case '.mts':
    case '.cts':
      return 'text';
    case '.svg':
      return 'svg';
  }

  const mimeType = mime.getType(filePath); // null when the lookup finds nothing
  if (mimeType) {
    for (const family of ['image', 'audio', 'video'] as const) {
      if (mimeType.startsWith(`${family}/`)) {
        return family;
      }
    }
    if (mimeType === 'application/pdf') {
      return 'pdf';
    }
  }

  // Known binary extensions are rejected up front, before any content check.
  if (BINARY_EXTENSIONS.includes(extension)) {
    return 'binary';
  }

  // Inconclusive MIME type and not a known binary extension: sniff the content.
  const looksBinary = await isBinaryFile(filePath);
  return looksBinary ? 'binary' : 'text';
}
/**
 * Outcome of reading a single file with processSingleFileContent.
 * Error results set `error` (and usually `errorType`); the text-only fields are
 * populated only when the file was read as text.
 */
export interface ProcessedFileReadResult {
llmContent: PartUnion; // string for text/SVG content and for status or error messages; inlineData part for image/pdf/audio/video
returnDisplay: string; // Short human-readable summary; '' for a complete, untruncated text read
error?: string; // Optional error message for the LLM if file processing failed
errorType?: ToolErrorType; // Structured error type
isTruncated?: boolean; // For text files: true when only a line range is returned or long lines were shortened
originalLineCount?: number; // For text files: total number of lines in the file
linesShown?: [number, number]; // For text files [startLine, endLine] (1-based for display)
}
/**
 * Reads and processes a single file, handling text, SVG, images, PDFs, audio and video.
 *
 * Text files are decoded with BOM awareness, sliced to the requested line
 * window, and over-long lines are shortened. Images, PDFs, audio and video are
 * returned as base64 inline data. Binary files and SVGs over 1MB are skipped
 * with an explanatory message. Files over 20MB, missing files and directories
 * produce an error result. Any exception raised while reading is caught and
 * returned as a READ_CONTENT_FAILURE result rather than thrown.
 *
 * @param filePath Absolute path to the file.
 * @param rootDirectory Absolute path to the project root for relative path display.
 * @param fileSystemService File system service supplied by the caller; this
 * function body does not currently reference it.
 * @param offset Optional offset for text files (0-based line number). Negative
 * values are treated as 0.
 * @param limit Optional limit for text files (number of lines to read). Defaults
 * to DEFAULT_MAX_LINES_TEXT_FILE; negative values are treated as 0.
 * @returns ProcessedFileReadResult object.
 */
export async function processSingleFileContent(
  filePath: string,
  rootDirectory: string,
  fileSystemService: FileSystemService,
  offset?: number,
  limit?: number,
): Promise<ProcessedFileReadResult> {
  try {
    if (!fs.existsSync(filePath)) {
      // Sync check is acceptable before async read
      return {
        llmContent:
          'Could not read file because no file was found at the specified path.',
        returnDisplay: 'File not found.',
        error: `File not found: ${filePath}`,
        errorType: ToolErrorType.FILE_NOT_FOUND,
      };
    }
    const stats = await fs.promises.stat(filePath);
    if (stats.isDirectory()) {
      return {
        llmContent:
          'Could not read file because the provided path is a directory, not a file.',
        returnDisplay: 'Path is a directory.',
        error: `Path is a directory, not a file: ${filePath}`,
        errorType: ToolErrorType.TARGET_IS_DIRECTORY,
      };
    }

    const fileSizeInMB = stats.size / (1024 * 1024);
    if (fileSizeInMB > 20) {
      return {
        llmContent: 'File size exceeds the 20MB limit.',
        returnDisplay: 'File size exceeds the 20MB limit.',
        error: `File size exceeds the 20MB limit: ${filePath} (${fileSizeInMB.toFixed(2)}MB)`,
        errorType: ToolErrorType.FILE_TOO_LARGE,
      };
    }

    const fileType = await detectFileType(filePath);
    // Always display forward slashes, regardless of platform separator.
    const relativePathForDisplay = path
      .relative(rootDirectory, filePath)
      .replace(/\\/g, '/');

    switch (fileType) {
      case 'binary': {
        return {
          llmContent: `Cannot display content of binary file: ${relativePathForDisplay}`,
          returnDisplay: `Skipped binary file: ${relativePathForDisplay}`,
        };
      }
      case 'svg': {
        const SVG_MAX_SIZE_BYTES = 1 * 1024 * 1024;
        if (stats.size > SVG_MAX_SIZE_BYTES) {
          return {
            llmContent: `Cannot display content of SVG file larger than 1MB: ${relativePathForDisplay}`,
            returnDisplay: `Skipped large SVG file (>1MB): ${relativePathForDisplay}`,
          };
        }
        const content = await readFileWithEncoding(filePath);
        return {
          llmContent: content,
          returnDisplay: `Read SVG as text: ${relativePathForDisplay}`,
        };
      }
      case 'text': {
        // Use BOM-aware reader to avoid leaving a BOM character in content and to support UTF-16/32 transparently
        const content = await readFileWithEncoding(filePath);
        const lines = content.split('\n');
        const originalLineCount = lines.length;

        // Clamp to zero: Array.prototype.slice treats negative indices as
        // offsets from the END of the array, so a negative offset or limit
        // would otherwise return the wrong lines and a nonsensical range.
        // (`||` also maps a NaN offset to 0.)
        const startLine = Math.max(0, offset || 0);
        const effectiveLimit = Math.max(
          0,
          limit === undefined ? DEFAULT_MAX_LINES_TEXT_FILE : limit,
        );
        // Ensure endLine does not exceed originalLineCount
        const endLine = Math.min(startLine + effectiveLimit, originalLineCount);
        // Ensure selectedLines doesn't try to slice beyond array bounds if startLine is too high
        const actualStartLine = Math.min(startLine, originalLineCount);
        const selectedLines = lines.slice(actualStartLine, endLine);

        let linesWereTruncatedInLength = false;
        const formattedLines = selectedLines.map((line) => {
          if (line.length > MAX_LINE_LENGTH_TEXT_FILE) {
            linesWereTruncatedInLength = true;
            return (
              line.substring(0, MAX_LINE_LENGTH_TEXT_FILE) + '... [truncated]'
            );
          }
          return line;
        });

        const contentRangeTruncated =
          startLine > 0 || endLine < originalLineCount;
        const isTruncated = contentRangeTruncated || linesWereTruncatedInLength;
        const llmContent = formattedLines.join('\n');

        // By default, return nothing to streamline the common case of a successful read_file.
        let returnDisplay = '';
        if (contentRangeTruncated) {
          returnDisplay = `Read lines ${
            actualStartLine + 1
          }-${endLine} of ${originalLineCount} from ${relativePathForDisplay}`;
          if (linesWereTruncatedInLength) {
            returnDisplay += ' (some lines were shortened)';
          }
        } else if (linesWereTruncatedInLength) {
          returnDisplay = `Read all ${originalLineCount} lines from ${relativePathForDisplay} (some lines were shortened)`;
        }

        return {
          llmContent,
          returnDisplay,
          isTruncated,
          originalLineCount,
          linesShown: [actualStartLine + 1, endLine],
        };
      }
      case 'image':
      case 'pdf':
      case 'audio':
      case 'video': {
        // Media is passed through untouched as base64 inline data.
        const contentBuffer = await fs.promises.readFile(filePath);
        const base64Data = contentBuffer.toString('base64');
        return {
          llmContent: {
            inlineData: {
              data: base64Data,
              mimeType: mime.getType(filePath) || 'application/octet-stream',
            },
          },
          returnDisplay: `Read ${fileType} file: ${relativePathForDisplay}`,
        };
      }
      default: {
        // Should not happen with current detectFileType logic
        const exhaustiveCheck: never = fileType;
        return {
          llmContent: `Unhandled file type: ${exhaustiveCheck}`,
          returnDisplay: `Skipped unhandled file type: ${relativePathForDisplay}`,
          error: `Unhandled file type for ${filePath}`,
        };
      }
    }
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    const displayPath = path
      .relative(rootDirectory, filePath)
      .replace(/\\/g, '/');
    return {
      llmContent: `Error reading file ${displayPath}: ${errorMessage}`,
      returnDisplay: `Error reading file ${displayPath}: ${errorMessage}`,
      error: `Error reading file ${filePath}: ${errorMessage}`,
      errorType: ToolErrorType.READ_CONTENT_FAILURE,
    };
  }
}
/**
 * Check whether a path exists, using an F_OK access check.
 * @param filePath Path to test.
 * @returns True when the access check succeeds, false when it fails for any reason.
 */
export async function fileExists(filePath: string): Promise<boolean> {
  let reachable = true;
  try {
    await fsPromises.access(filePath, fs.constants.F_OK);
  } catch {
    reachable = false;
  }
  return reachable;
}