// gemini-cli/packages/core/src/utils/memoryDiscovery.ts

/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import * as fs from 'node:fs/promises';
import * as fsSync from 'node:fs';
import * as path from 'node:path';
import { bfsFileSearch } from './bfsFileSearch.js';
import { getAllGeminiMdFilenames } from '../tools/memoryTool.js';
import type { FileDiscoveryService } from '../services/fileDiscoveryService.js';
import { processImports } from './memoryImportProcessor.js';
import {
  DEFAULT_MEMORY_FILE_FILTERING_OPTIONS,
  type FileFilteringOptions,
} from '../config/constants.js';
import { GEMINI_DIR, homedir, normalizePath } from './paths.js';
import type { ExtensionLoader } from './extensionLoader.js';
import { debugLogger } from './debugLogger.js';
import type { Config } from '../config/config.js';
import type { HierarchicalMemory } from '../config/memory.js';
import { CoreEvent, coreEvents } from './events.js';
import { getErrorMessage } from './errors.js';

// Minimal logger for this module: each method forwards its arguments to the
// shared debugLogger, prefixed with a severity tag and the component name.
// TODO: Integrate with a more robust server-side logger if available/appropriate.
const logger = {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  debug(...parts: any[]) {
    return debugLogger.debug('[DEBUG] [MemoryDiscovery]', ...parts);
  },
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  warn(...parts: any[]) {
    return debugLogger.warn('[WARN] [MemoryDiscovery]', ...parts);
  },
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  error(...parts: any[]) {
    return debugLogger.error('[ERROR] [MemoryDiscovery]', ...parts);
  },
};

/**
 * A memory file path paired with the text that was loaded from it.
 */
export interface GeminiFileContent {
  /** Path of the file this entry describes. */
  filePath: string;
  /**
   * Text of the file after import processing, or `null` when the file could
   * not be read or processed.
   */
  content: string | null;
}

/**
 * Collapses paths that refer to the same physical file into a single entry.
 *
 * Two paths count as the same file when `fs.stat` reports the same device and
 * inode for both. Because `stat` follows symlinks, a link and its target share
 * an identity, and so do differently-cased spellings of one file on a
 * case-insensitive filesystem. The first path seen for an identity is kept.
 * Paths that cannot be stat'ed are always kept, since nothing can be proven
 * about them.
 *
 * @param filePaths Array of file paths to deduplicate
 * @returns `paths`: the surviving paths in their original relative order;
 *   `identityMap`: the `dev:ino` key for every path that could be stat'ed,
 *   including paths that were dropped as duplicates.
 */
export async function deduplicatePathsByFileIdentity(
  filePaths: string[],
): Promise<{
  paths: string[];
  identityMap: Map<string, string>;
}> {
  const identityMap = new Map<string, string>();
  const paths: string[] = [];
  if (filePaths.length === 0) {
    return { paths, identityMap };
  }

  type StatOutcome = {
    path: string;
    dev: bigint | number | null;
    ino: bigint | number | null;
  };

  // Stats one path; a failure is reported as a null identity, never thrown.
  const probe = async (filePath: string): Promise<StatOutcome> => {
    try {
      // stat() rather than lstat(): we want the identity of the symlink target.
      const stats = await fs.stat(filePath);
      return { path: filePath, dev: stats.dev, ino: stats.ino };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      logger.debug(
        `could not stat file for deduplication: ${filePath}. error: ${message}`,
      );
      return { path: filePath, dev: null, ino: null };
    }
  };

  // Exact string duplicates are dropped up front so each path is stat'ed once.
  const candidates = [...new Set(filePaths)];
  const CONCURRENT_LIMIT = 20;
  const outcomes: StatOutcome[] = [];

  for (let start = 0; start < candidates.length; start += CONCURRENT_LIMIT) {
    const settled = await Promise.allSettled(
      candidates
        .slice(start, start + CONCURRENT_LIMIT)
        .map((candidate) => probe(candidate)),
    );
    for (const entry of settled) {
      if (entry.status === 'rejected') {
        const message = getErrorMessage(entry.reason);
        debugLogger.debug(
          '[DEBUG] [MemoryDiscovery] unexpected error during deduplication stat:',
          message,
        );
        continue;
      }
      outcomes.push(entry.value);
    }
  }

  // identity key -> first path that claimed it
  const firstPathByIdentity = new Map<string, string>();
  for (const { path: candidate, dev, ino } of outcomes) {
    if (dev === null || ino === null) {
      // No identity available: keep the path rather than risk dropping a file.
      paths.push(candidate);
      continue;
    }

    const identityKey = `${dev.toString()}:${ino.toString()}`;
    identityMap.set(candidate, identityKey);

    const firstSeen = firstPathByIdentity.get(identityKey);
    if (firstSeen === undefined) {
      firstPathByIdentity.set(identityKey, candidate);
      paths.push(candidate);
      debugLogger.debug(
        '[DEBUG] [MemoryDiscovery] deduplication: keeping',
        candidate,
        `(dev: ${dev}, ino: ${ino})`,
      );
    } else {
      debugLogger.debug(
        '[DEBUG] [MemoryDiscovery] deduplication: skipping',
        candidate,
        `(same file as ${firstSeen})`,
      );
    }
  }

  return { paths, identityMap };
}

/**
 * Walks upward from `startDir` and returns the nearest directory that contains
 * a `.git` entry, or `null` when the filesystem root is reached without
 * finding one.
 *
 * `.git` is accepted both as a directory (an ordinary clone) and as a regular
 * file (the "gitfile" pointer that Git writes for linked worktrees and
 * submodules). `lstat` is used, so a `.git` symlink is not followed and does
 * not count as a marker.
 *
 * Errors other than ENOENT are logged as warnings (outside of test
 * environments) and the walk continues with the parent directory.
 *
 * @param startDir Directory to start the search from.
 * @returns The normalized project root path, or `null` if none was found.
 */
async function findProjectRoot(startDir: string): Promise<string | null> {
  let currentDir = normalizePath(startDir);
  while (true) {
    const gitPath = path.join(currentDir, '.git');
    try {
      const stats = await fs.lstat(gitPath);
      // A worktree/submodule checkout has a `.git` *file*, not a directory.
      if (stats.isDirectory() || stats.isFile()) {
        return currentDir;
      }
    } catch (error: unknown) {
      const hasCode =
        typeof error === 'object' && error !== null && 'code' in error;

      // ENOENT simply means "no .git here" and is the expected common case.
      const isENOENT =
        hasCode &&
        // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
        (error as { code: string }).code === 'ENOENT';

      // Test environments frequently mock fs; stay quiet there.
      // process.env['NODE_ENV'] === 'test' or VITEST are common test indicators
      const isTestEnv =
        process.env['NODE_ENV'] === 'test' || process.env['VITEST'];

      if (!isENOENT && !isTestEnv) {
        if (hasCode) {
          // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
          const fsError = error as { code: string; message: string };
          logger.warn(
            `Error checking for .git directory at ${gitPath}: ${fsError.message}`,
          );
        } else {
          logger.warn(
            `Non-standard error checking for .git directory at ${gitPath}: ${String(error)}`,
          );
        }
      }
    }

    const parentDir = normalizePath(path.dirname(currentDir));
    if (parentDir === currentDir) {
      // dirname() is a fixed point only at the filesystem root.
      return null;
    }
    currentDir = parentDir;
  }
}

/**
 * Runs per-directory memory discovery for the current working directory and
 * every additional include directory, and merges the results.
 *
 * Directories are de-duplicated by string and processed in batches so that
 * only a bounded number of discoveries run at once. A directory whose
 * discovery rejects is logged and skipped; it does not fail the whole call.
 *
 * @returns De-duplicated global and project memory file paths, in the order
 *   they were first discovered.
 */
async function getGeminiMdFilePathsInternal(
  currentWorkingDirectory: string,
  includeDirectoriesToReadGemini: readonly string[],
  userHomePath: string,
  fileService: FileDiscoveryService,
  folderTrust: boolean,
  fileFilteringOptions: FileFilteringOptions,
  maxDirs: number,
): Promise<{ global: string[]; project: string[] }> {
  // Bounded parallelism keeps the number of open handles low (EMFILE).
  const CONCURRENT_LIMIT = 10;

  const searchDirs = [
    ...new Set<string>([
      ...includeDirectoriesToReadGemini,
      currentWorkingDirectory,
    ]),
  ];
  const mergedGlobal = new Set<string>();
  const mergedProject = new Set<string>();

  let offset = 0;
  while (offset < searchDirs.length) {
    const settled = await Promise.allSettled(
      searchDirs
        .slice(offset, offset + CONCURRENT_LIMIT)
        .map((searchDir) =>
          getGeminiMdFilePathsInternalForEachDir(
            searchDir,
            userHomePath,
            fileService,
            folderTrust,
            fileFilteringOptions,
            maxDirs,
          ),
        ),
    );
    offset += CONCURRENT_LIMIT;

    for (const outcome of settled) {
      if (outcome.status === 'rejected') {
        const reason: unknown = outcome.reason;
        const message =
          reason instanceof Error ? reason.message : String(reason);
        logger.error(`Error discovering files in directory: ${message}`);
        continue;
      }
      for (const found of outcome.value.global) {
        mergedGlobal.add(found);
      }
      for (const found of outcome.value.project) {
        mergedProject.add(found);
      }
    }
  }

  return {
    global: [...mergedGlobal],
    project: [...mergedProject],
  };
}

/**
 * Discovers the memory files that apply to a single directory.
 *
 * For every filename variant returned by `getAllGeminiMdFilenames()`:
 *  - the global file under `<home>/<GEMINI_DIR>/` is always probed;
 *  - if `dir` is non-empty and the folder is trusted, the workspace is
 *    searched upward (ancestors of `dir`) and downward (BFS below `dir`).
 *
 * The upward scan starts at `dir` and checks each ancestor in turn. It ends
 * after checking the parent of the project root when a project root is found,
 * otherwise after checking the parent of the home directory. It never looks
 * inside the global Gemini directory and never checks the filesystem root
 * itself.
 *
 * Everything that depends only on the arguments (normalized paths, the project
 * root lookup, the ancestor chain, merged filter options) is resolved once up
 * front rather than once per filename variant.
 *
 * @param dir Directory to search from; an empty string disables the workspace search.
 * @param userHomePath Home directory used to locate the global memory file.
 * @param fileService Passed through to the downward BFS search.
 * @param folderTrust When false, only the global lookup is performed.
 * @param fileFilteringOptions Overrides applied on top of the memory defaults.
 * @param maxDirs Directory budget for the downward BFS search.
 * @returns Global and project paths. For each variant, project paths list
 *   upward hits (outermost ancestor first) followed by sorted downward hits.
 */
async function getGeminiMdFilePathsInternalForEachDir(
  dir: string,
  userHomePath: string,
  fileService: FileDiscoveryService,
  folderTrust: boolean,
  fileFilteringOptions: FileFilteringOptions,
  maxDirs: number,
): Promise<{ global: string[]; project: string[] }> {
  const globalPaths = new Set<string>();
  const projectPaths = new Set<string>();
  const geminiMdFilenames = getAllGeminiMdFilenames();

  const resolvedHome = normalizePath(userHomePath);
  const globalGeminiDir = normalizePath(path.join(resolvedHome, GEMINI_DIR));

  // Workspace search plan; stays undefined when the workspace must be skipped
  // (no directory supplied, or the folder is not trusted).
  let workspace:
    | { cwd: string; upwardDirs: string[]; filtering: FileFilteringOptions }
    | undefined;

  if (dir && folderTrust) {
    const cwd = normalizePath(dir);

    const projectRoot = await findProjectRoot(cwd);
    debugLogger.debug(
      '[DEBUG] [MemoryDiscovery] Determined project root:',
      projectRoot ?? 'None',
    );

    const ultimateStopDir = projectRoot
      ? normalizePath(path.dirname(projectRoot))
      : normalizePath(path.dirname(resolvedHome));

    // Ancestor chain to probe, ordered from `cwd` outward. The chain is the
    // same for every filename variant, so it is built exactly once.
    const upwardDirs: string[] = [];
    let currentDir = cwd;
    while (
      currentDir &&
      currentDir !== normalizePath(path.dirname(currentDir))
    ) {
      if (currentDir === globalGeminiDir) {
        break;
      }
      upwardDirs.push(currentDir);
      if (currentDir === ultimateStopDir) {
        break;
      }
      currentDir = normalizePath(path.dirname(currentDir));
    }

    workspace = {
      cwd,
      upwardDirs,
      filtering: {
        ...DEFAULT_MEMORY_FILE_FILTERING_OPTIONS,
        ...fileFilteringOptions,
      },
    };
  }

  for (const geminiMdFilename of geminiMdFilenames) {
    const globalMemoryPath = normalizePath(
      path.join(globalGeminiDir, geminiMdFilename),
    );

    // The global lookup always runs, trusted workspace or not.
    try {
      await fs.access(globalMemoryPath, fsSync.constants.R_OK);
      globalPaths.add(globalMemoryPath);
      debugLogger.debug(
        '[DEBUG] [MemoryDiscovery] Found readable global',
        geminiMdFilename + ':',
        globalMemoryPath,
      );
    } catch {
      // It's okay if it's not found.
    }

    if (!workspace) {
      continue;
    }

    debugLogger.debug(
      '[DEBUG] [MemoryDiscovery] Searching for',
      geminiMdFilename,
      'starting from CWD:',
      workspace.cwd,
    );

    // Upward scan: probe from `cwd` outward, unshifting so that the final
    // order is outermost ancestor first.
    const upwardPaths: string[] = [];
    for (const ancestorDir of workspace.upwardDirs) {
      const potentialPath = normalizePath(
        path.join(ancestorDir, geminiMdFilename),
      );
      try {
        await fs.access(potentialPath, fsSync.constants.R_OK);
        // The global file is reported separately; never list it as project memory here.
        if (potentialPath !== globalMemoryPath) {
          upwardPaths.unshift(potentialPath);
        }
      } catch {
        // Not found, continue.
      }
    }
    upwardPaths.forEach((p) => projectPaths.add(p));

    // Downward scan: BFS below `cwd`, sorted for a stable order.
    const downwardPaths = await bfsFileSearch(workspace.cwd, {
      fileName: geminiMdFilename,
      maxDirs,
      fileService,
      fileFilteringOptions: workspace.filtering,
    });
    downwardPaths.sort();
    for (const dPath of downwardPaths) {
      projectPaths.add(normalizePath(dPath));
    }
  }

  return {
    global: Array.from(globalPaths),
    project: Array.from(projectPaths),
  };
}

/**
 * Reads each memory file and expands its imports.
 *
 * Files are processed in batches to bound the number of concurrently open
 * handles. A file that cannot be read or processed is still reported, with
 * `content` set to `null`, so the output has one entry per input path in the
 * same order.
 *
 * @param filePaths Paths of the files to load.
 * @param importFormat Format forwarded to `processImports`.
 * @returns One `GeminiFileContent` per input path.
 */
export async function readGeminiMdFiles(
  filePaths: string[],
  importFormat: 'flat' | 'tree' = 'tree',
): Promise<GeminiFileContent[]> {
  // Higher limit than directory discovery: individual reads are typically fast.
  const CONCURRENT_LIMIT = 20;

  // Loads a single file; never rejects, failures become `content: null`.
  const loadOne = async (filePath: string): Promise<GeminiFileContent> => {
    try {
      const rawText = await fs.readFile(filePath, 'utf-8');

      // Imports are resolved relative to the directory of the file itself.
      const processed = await processImports(
        rawText,
        path.dirname(filePath),
        false,
        undefined,
        undefined,
        importFormat,
      );
      debugLogger.debug(
        '[DEBUG] [MemoryDiscovery] Successfully read and processed imports:',
        filePath,
        `(Length: ${processed.content.length})`,
      );

      return { filePath, content: processed.content };
    } catch (error: unknown) {
      const isTestEnv =
        process.env['NODE_ENV'] === 'test' || process.env['VITEST'];
      if (!isTestEnv) {
        const message = error instanceof Error ? error.message : String(error);
        logger.warn(
          `Warning: Could not read ${getAllGeminiMdFilenames()} file at ${filePath}. Error: ${message}`,
        );
      }
      debugLogger.debug('[DEBUG] [MemoryDiscovery] Failed to read:', filePath);
      return { filePath, content: null };
    }
  };

  const loaded: GeminiFileContent[] = [];
  for (let start = 0; start < filePaths.length; start += CONCURRENT_LIMIT) {
    const settled = await Promise.allSettled(
      filePaths
        .slice(start, start + CONCURRENT_LIMIT)
        .map((filePath) => loadOne(filePath)),
    );

    for (const outcome of settled) {
      if (outcome.status === 'rejected') {
        // Not expected, since loadOne handles its own errors; kept for
        // completeness.
        const reason: unknown = outcome.reason;
        const message =
          reason instanceof Error ? reason.message : String(reason);
        logger.error(`Unexpected error processing file: ${message}`);
        continue;
      }
      loaded.push(outcome.value);
    }
  }

  return loaded;
}

/**
 * Joins the loaded memory files into one string, wrapping each file's trimmed
 * content in "Context from" / "End of Context from" marker lines.
 *
 * Entries whose content is not a string, or is empty after trimming, are
 * omitted. Blocks are separated by a blank line.
 *
 * @param instructionContents Loaded files, in the order they should appear.
 * @param currentWorkingDirectoryForDisplay Base directory used to shorten
 *   absolute file paths in the markers; relative paths are shown unchanged.
 * @returns The concatenated text, or an empty string when nothing qualifies.
 */
export function concatenateInstructions(
  instructionContents: GeminiFileContent[],
  // CWD is needed to resolve relative paths for display markers
  currentWorkingDirectoryForDisplay: string,
): string {
  const blocks: string[] = [];

  for (const { filePath, content } of instructionContents) {
    if (typeof content !== 'string') {
      continue;
    }
    const body = content.trim();
    if (body.length === 0) {
      continue;
    }

    let displayPath = filePath;
    if (path.isAbsolute(filePath)) {
      displayPath = path.relative(currentWorkingDirectoryForDisplay, filePath);
    }

    blocks.push(
      `--- Context from: ${displayPath} ---\n${body}\n--- End of Context from: ${displayPath} ---`,
    );
  }

  return blocks.join('\n\n');
}

/**
 * Outcome of a just-in-time memory load for a sub-directory.
 */
export interface MemoryLoadResult {
  /** Newly loaded files that were read successfully, with their processed content. */
  files: Array<{ path: string; content: string }>;
  /**
   * `dev:ino` identity keys of the newly selected files, for those whose
   * identity could be determined.
   */
  fileIdentities?: string[];
}

/**
 * Lists the global memory files (one per filename variant, under
 * `<home>/<GEMINI_DIR>/`) that currently exist and are readable.
 *
 * @returns Normalized paths of the readable global files, in variant order.
 */
export async function getGlobalMemoryPaths(): Promise<string[]> {
  const home = homedir();

  // Probes one variant; resolves to its path when readable, otherwise null.
  const probeVariant = async (filename: string): Promise<string | null> => {
    const candidate = normalizePath(path.join(home, GEMINI_DIR, filename));
    try {
      await fs.access(candidate, fsSync.constants.R_OK);
      debugLogger.debug(
        '[DEBUG] [MemoryDiscovery] Found global memory file:',
        candidate,
      );
      return candidate;
    } catch {
      return null;
    }
  };

  const probed = await Promise.all(
    getAllGeminiMdFilenames().map((filename) => probeVariant(filename)),
  );

  const readable: string[] = [];
  for (const candidate of probed) {
    if (candidate !== null) {
      readable.push(candidate);
    }
  }
  return readable;
}

/**
 * Collects the context files contributed by all active extensions.
 *
 * @param extensionLoader Source of the installed extensions.
 * @returns Normalized, de-duplicated context file paths in sorted order.
 */
export function getExtensionMemoryPaths(
  extensionLoader: ExtensionLoader,
): string[] {
  const activeExtensions = extensionLoader
    .getExtensions()
    .filter((extension) => extension.isActive);

  const normalizedContextFiles = activeExtensions
    .flatMap((extension) => extension.contextFiles)
    .map((contextFile) => normalizePath(contextFile));

  const uniqueContextFiles = [...new Set(normalizedContextFiles)];
  uniqueContextFiles.sort();
  return uniqueContextFiles;
}

/**
 * Finds the memory files located directly in each trusted root.
 *
 * The upward search for a root both starts and stops at that root, so only
 * the root directory itself is inspected. Roots are processed in parallel.
 *
 * @param trustedRoots Directories to inspect.
 * @returns De-duplicated memory file paths in sorted order.
 */
export async function getEnvironmentMemoryPaths(
  trustedRoots: string[],
): Promise<string[]> {
  const searches = trustedRoots.map(async (trustedRoot) => {
    const resolvedRoot = normalizePath(trustedRoot);
    debugLogger.debug(
      '[DEBUG] [MemoryDiscovery] Loading environment memory for trusted root:',
      resolvedRoot,
      '(Stopping exactly here)',
    );
    return findUpwardGeminiFiles(resolvedRoot, resolvedRoot);
  });

  const foundPerRoot = await Promise.all(searches);

  const uniquePaths = new Set<string>();
  for (const rootPaths of foundPerRoot) {
    for (const found of rootPaths) {
      uniquePaths.add(found);
    }
  }

  return [...uniquePaths].sort();
}

/**
 * Builds the hierarchical memory object by concatenating, per category, the
 * loaded contents of the paths that belong to it.
 *
 * Paths with no entry in `contentsMap` are skipped.
 *
 * @param paths File paths grouped by category.
 * @param contentsMap Loaded contents keyed by file path.
 * @param workingDir Base directory used for the display paths in the markers.
 * @returns The concatenated text for the global, extension and project categories.
 */
export function categorizeAndConcatenate(
  paths: { global: string[]; extension: string[]; project: string[] },
  contentsMap: Map<string, GeminiFileContent>,
  workingDir: string,
): HierarchicalMemory {
  const renderCategory = (categoryPaths: string[]) => {
    const loadedEntries: GeminiFileContent[] = [];
    for (const categoryPath of categoryPaths) {
      const entry = contentsMap.get(categoryPath);
      if (entry) {
        loadedEntries.push(entry);
      }
    }
    return concatenateInstructions(loadedEntries, workingDir);
  };

  const globalMemory = renderCategory(paths.global);
  const extensionMemory = renderCategory(paths.extension);
  const projectMemory = renderCategory(paths.project);

  return {
    global: globalMemory,
    extension: extensionMemory,
    project: projectMemory,
  };
}

/**
 * Collects every readable memory file variant found while walking from
 * `startDir` up to and including `stopDir`.
 *
 * The walk ends early, without inspecting that directory, if it reaches the
 * global Gemini directory, and it also ends at the filesystem root. The
 * result is ordered from the outermost directory to `startDir`, with all
 * filename variants of one directory kept together.
 *
 * @param startDir Directory where the walk begins.
 * @param stopDir Last directory to inspect.
 * @returns Normalized paths of the files that were found.
 */
async function findUpwardGeminiFiles(
  startDir: string,
  stopDir: string,
): Promise<string[]> {
  let cursor = normalizePath(startDir);
  const lastDir = normalizePath(stopDir);
  const variants = getAllGeminiMdFilenames();
  const globalGeminiDir = normalizePath(path.join(homedir(), GEMINI_DIR));

  debugLogger.debug(
    '[DEBUG] [MemoryDiscovery] Starting upward search from',
    cursor,
    'stopping at',
    lastDir,
  );

  // One group of hits per visited directory, innermost directory first.
  const hitsInnermostFirst: string[][] = [];

  while (cursor !== globalGeminiDir) {
    const directory = cursor;

    // All variants of the current directory are probed in parallel.
    const probes = variants.map(async (variant) => {
      const candidate = normalizePath(path.join(directory, variant));
      try {
        await fs.access(candidate, fsSync.constants.R_OK);
        return candidate;
      } catch {
        return null;
      }
    });
    const hits = (await Promise.all(probes)).filter(
      (candidate): candidate is string => candidate !== null,
    );
    hitsInnermostFirst.push(hits);

    const parent = normalizePath(path.dirname(directory));
    if (directory === lastDir || directory === parent) {
      break;
    }
    cursor = parent;
  }

  // Flip to outermost-first while preserving the variant order inside a group.
  return hitsInnermostFirst.reverse().flat();
}

/**
 * Result of loading the server-side hierarchical memory.
 */
export interface LoadServerHierarchicalMemoryResponse {
  /** Concatenated memory text, split into global, extension and project parts. */
  memoryContent: HierarchicalMemory;
  /** Number of discovered files whose content was read successfully. */
  fileCount: number;
  /** Paths of the files that were selected for loading after de-duplication. */
  filePaths: string[];
}

/**
 * Loads hierarchical GEMINI.md files and concatenates their content.
 * This function is intended for use by the server.
 *
 * Steps: discover global/project paths and extension context files,
 * de-duplicate them (by string, then by file identity), read every remaining
 * file, and finally regroup the contents by category.
 *
 * When the working directory resolves to the home directory, the workspace
 * search is skipped by handing an empty directory to discovery.
 *
 * The working directory and the home directory are resolved with
 * `fs.realpath` without a surrounding try/catch, so a failure there rejects
 * the returned promise.
 *
 * @param currentWorkingDirectory Workspace directory to search from.
 * @param includeDirectoriesToReadGemini Additional directories to search.
 * @param fileService Passed through to file discovery.
 * @param extensionLoader Source of extension context files.
 * @param folderTrust Whether the workspace may be searched.
 * @param importFormat Import rendering format used when reading files.
 * @param fileFilteringOptions Optional filter overrides; memory defaults apply when omitted.
 * @param maxDirs Directory budget for the downward search.
 * @returns Concatenated memory, the number of readable files and the selected paths.
 */
export async function loadServerHierarchicalMemory(
  currentWorkingDirectory: string,
  includeDirectoriesToReadGemini: readonly string[],
  fileService: FileDiscoveryService,
  extensionLoader: ExtensionLoader,
  folderTrust: boolean,
  importFormat: 'flat' | 'tree' = 'tree',
  fileFilteringOptions?: FileFilteringOptions,
  maxDirs: number = 200,
): Promise<LoadServerHierarchicalMemoryResponse> {
  // Response used whenever there is nothing to load.
  const nothingToLoad = (): LoadServerHierarchicalMemoryResponse => ({
    memoryContent: { global: '', extension: '', project: '' },
    fileCount: 0,
    filePaths: [],
  });

  // Compare canonical paths so that symlinked locations are recognised too.
  const canonicalCwd = normalizePath(
    await fs.realpath(path.resolve(currentWorkingDirectory)),
  );
  const canonicalHome = normalizePath(
    await fs.realpath(path.resolve(homedir())),
  );

  // An empty directory tells discovery to skip the workspace search.
  const workspaceDir =
    canonicalCwd === canonicalHome ? '' : currentWorkingDirectory;

  debugLogger.debug(
    '[DEBUG] [MemoryDiscovery] Loading server hierarchical memory for CWD:',
    workspaceDir,
    `(importFormat: ${importFormat})`,
  );

  // For the server, homedir() refers to the server process's home.
  // This is consistent with how MemoryTool already finds the global path.
  const userHomePath = homedir();

  // 1. SCATTER: gather every candidate path.
  const [discovered, extensionPaths] = await Promise.all([
    getGeminiMdFilePathsInternal(
      workspaceDir,
      includeDirectoriesToReadGemini,
      userHomePath,
      fileService,
      folderTrust,
      fileFilteringOptions || DEFAULT_MEMORY_FILE_FILTERING_OPTIONS,
      maxDirs,
    ),
    Promise.resolve(getExtensionMemoryPaths(extensionLoader)),
  ]);

  const uniqueByString = [
    ...new Set([
      ...discovered.global,
      ...discovered.project,
      ...extensionPaths,
    ]),
  ];

  if (uniqueByString.length === 0) {
    debugLogger.debug(
      '[DEBUG] [MemoryDiscovery] No GEMINI.md files found in hierarchy of the workspace.',
    );
    return nothingToLoad();
  }

  // Second pass by file identity, for case-insensitive filesystems.
  const { paths: uniqueByIdentity } =
    await deduplicatePathsByFileIdentity(uniqueByString);

  if (uniqueByIdentity.length === 0) {
    debugLogger.debug(
      '[DEBUG] [MemoryDiscovery] No unique GEMINI.md files found after deduplication by file identity.',
    );
    return nothingToLoad();
  }

  // 2. GATHER: read all files in parallel.
  const loadedFiles = await readGeminiMdFiles(uniqueByIdentity, importFormat);
  const contentsByPath = new Map(
    loadedFiles.map((loaded) => [loaded.filePath, loaded]),
  );

  // 3. CATEGORIZE: back into global, extension and project. Paths dropped by
  // de-duplication have no entry in the map and are therefore skipped.
  const memoryContent = categorizeAndConcatenate(
    {
      global: discovered.global,
      extension: extensionPaths,
      project: discovered.project,
    },
    contentsByPath,
    workspaceDir,
  );

  let readableCount = 0;
  for (const loaded of loadedFiles) {
    if (loaded.content !== null) {
      readableCount += 1;
    }
  }

  return {
    memoryContent,
    fileCount: readableCount,
    filePaths: uniqueByIdentity,
  };
}

/**
 * Reloads the hierarchical memory and updates `config` so that it reflects
 * the new memory: user memory, file count and file paths are replaced, and a
 * `MemoryChanged` event is emitted.
 *
 * MCP instructions, when present, are appended to the project memory stored
 * on `config`.
 *
 * Returns the result of the call to `loadServerHierarchicalMemory`; that
 * result does not include the appended MCP instructions.
 */
export async function refreshServerHierarchicalMemory(config: Config) {
  const workingDir = config.getWorkingDir();
  const includeDirectories = config.shouldLoadMemoryFromIncludeDirectories()
    ? config.getWorkspaceContext().getDirectories()
    : [];

  const result = await loadServerHierarchicalMemory(
    workingDir,
    includeDirectories,
    config.getFileService(),
    config.getExtensionLoader(),
    config.isTrustedFolder(),
    config.getImportFormat(),
    config.getFileFilteringOptions(),
    config.getDiscoveryMaxDirs(),
  );

  const mcpInstructions =
    config.getMcpClientManager()?.getMcpInstructions() || '';

  // Empty sections are dropped so no stray separator is produced.
  const projectSections = [
    result.memoryContent.project,
    mcpInstructions.trimStart(),
  ].filter(Boolean);

  const finalMemory: HierarchicalMemory = {
    ...result.memoryContent,
    project: projectSections.join('\n\n'),
  };

  config.setUserMemory(finalMemory);
  config.setGeminiMdFileCount(result.fileCount);
  config.setGeminiMdFilePaths(result.filePaths);
  coreEvents.emit(CoreEvent.MemoryChanged, { fileCount: result.fileCount });

  return result;
}

/**
 * Loads, just in time, the memory files that apply to `targetPath` and have
 * not been loaded yet.
 *
 * The deepest trusted root containing `targetPath` is selected and the
 * directories from `targetPath` up to that root are searched. A candidate is
 * skipped when:
 *  - its exact path is already in `alreadyLoadedPaths`, or
 *  - its `dev:ino` identity matches an already loaded file (this covers
 *    differently-cased paths on case-insensitive filesystems and symlinks).
 *
 * Candidates whose identity cannot be determined and whose path is not
 * already loaded are included, to be safe.
 *
 * @param targetPath Path that triggered the load.
 * @param trustedRoots Roots inside which memory may be loaded.
 * @param alreadyLoadedPaths Paths that have been loaded previously.
 * @param alreadyLoadedIdentities Optional cache of `dev:ino` keys for the
 *   already loaded files; when omitted the keys are derived by stat'ing
 *   `alreadyLoadedPaths`.
 * @returns The newly loaded files that could be read, plus the identity keys
 *   of the newly selected files. Both are empty when `targetPath` is outside
 *   every trusted root or nothing new was found.
 */
export async function loadJitSubdirectoryMemory(
  targetPath: string,
  trustedRoots: string[],
  alreadyLoadedPaths: Set<string>,
  alreadyLoadedIdentities?: Set<string>,
): Promise<MemoryLoadResult> {
  const resolvedTarget = normalizePath(targetPath);
  let bestRoot: string | null = null;

  // Find the deepest trusted root that contains the target path.
  // NOTE(review): the containment test appends `path.sep`; this assumes
  // normalizePath returns paths that use the platform separator. Confirm for
  // Windows.
  for (const root of trustedRoots) {
    const resolvedRoot = normalizePath(root);
    const resolvedRootWithTrailing = resolvedRoot.endsWith(path.sep)
      ? resolvedRoot
      : resolvedRoot + path.sep;

    const containsTarget =
      resolvedTarget === resolvedRoot ||
      resolvedTarget.startsWith(resolvedRootWithTrailing);

    if (
      containsTarget &&
      (!bestRoot || resolvedRoot.length > bestRoot.length)
    ) {
      bestRoot = resolvedRoot;
    }
  }

  if (!bestRoot) {
    debugLogger.debug(
      '[DEBUG] [MemoryDiscovery] JIT memory skipped:',
      resolvedTarget,
      'is not in any trusted root.',
    );
    return { files: [], fileIdentities: [] };
  }

  debugLogger.debug(
    '[DEBUG] [MemoryDiscovery] Loading JIT memory for',
    resolvedTarget,
    `(Trusted root: ${bestRoot})`,
  );

  // Traverse from target up to the trusted root
  const potentialPaths = await findUpwardGeminiFiles(resolvedTarget, bestRoot);

  if (potentialPaths.length === 0) {
    return { files: [], fileIdentities: [] };
  }

  // Deduplicate within the current batch by file identity (handles
  // case-insensitive filesystems).
  const { paths: deduplicatedNewPaths, identityMap: newPathsIdentityMap } =
    await deduplicatePathsByFileIdentity(potentialPaths);

  // Use the cached identities when provided; otherwise derive them from the
  // already loaded paths. The cache avoids redundant fs.stat() calls.
  const cachedIdentities = alreadyLoadedIdentities ?? new Set<string>();
  if (!alreadyLoadedIdentities && alreadyLoadedPaths.size > 0) {
    const CONCURRENT_LIMIT = 20;
    const alreadyLoadedArray = Array.from(alreadyLoadedPaths);

    for (let i = 0; i < alreadyLoadedArray.length; i += CONCURRENT_LIMIT) {
      const batch = alreadyLoadedArray.slice(i, i + CONCURRENT_LIMIT);
      const batchPromises = batch.map(async (filePath) => {
        try {
          const stats = await fs.stat(filePath);
          const identityKey = `${stats.dev.toString()}:${stats.ino.toString()}`;
          cachedIdentities.add(identityKey);
        } catch {
          // ignore errors - if we can't stat it, we can't deduplicate by identity
        }
      });
      // Await each batch to properly limit concurrency and prevent EMFILE errors
      await Promise.allSettled(batchPromises);
    }
  }

  // Keep only the candidates that are genuinely new. Identities computed by
  // deduplicatePathsByFileIdentity are reused to avoid stat'ing again.
  const newPaths: string[] = [];
  const newFileIdentities: string[] = [];
  for (const filePath of deduplicatedNewPaths) {
    // Exact-path match first: this also covers files whose identity is
    // unknown (stat failed) or missing from a caller-supplied identity cache.
    if (alreadyLoadedPaths.has(filePath)) {
      debugLogger.debug(
        '[DEBUG] [MemoryDiscovery] jit memory: skipping',
        filePath,
        '(already loaded)',
      );
      continue;
    }

    const identityKey = newPathsIdentityMap.get(filePath);
    if (identityKey && cachedIdentities.has(identityKey)) {
      debugLogger.debug(
        '[DEBUG] [MemoryDiscovery] jit memory: skipping',
        filePath,
        '(already loaded with different case)',
      );
      continue;
    }

    // if we don't have an identity (stat failed), include it to be safe
    newPaths.push(filePath);
    if (identityKey) {
      newFileIdentities.push(identityKey);
    }
  }

  if (newPaths.length === 0) {
    return { files: [], fileIdentities: [] };
  }

  debugLogger.debug(
    '[DEBUG] [MemoryDiscovery] Found new JIT memory files:',
    JSON.stringify(newPaths),
  );

  const contents = await readGeminiMdFiles(newPaths, 'tree');

  // Files that could not be read (null content) are left out of `files`.
  const files: MemoryLoadResult['files'] = [];
  for (const item of contents) {
    if (item.content !== null) {
      files.push({ path: item.filePath, content: item.content });
    }
  }

  return {
    files,
    fileIdentities: newFileIdentities,
  };
}