autojanet/skills/understand/extract-import-map.mjs
Zoë cc74ad0bd0
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
fix: use library/ Harbor project, add skills, fix pipeline secrets
- .woodpecker.yaml: image paths -> library/autojanet-{agent,dispatcher}
- .woodpecker.yaml: secret names RS_HARBOR_USER / RS_HARBOR_PASS (global)
- container/Dockerfile: restore COPY skills/, skills/ populated from opencode config
- skills/: 84 opencode skills bundled into image
- k8s/manifests: update image refs to library/
2026-05-30 15:43:14 -07:00

1567 lines
60 KiB
JavaScript

#!/usr/bin/env node
/**
* extract-import-map.mjs
*
* Deterministic import resolution script for the project-scanner agent.
* Uses PluginRegistry (TreeSitterPlugin + non-code parsers) from
* @understand-anything/core to extract raw import paths via tree-sitter,
* then applies language-specific resolution rules to map them to
* project-internal file paths.
*
* Replaces the LLM-written prose import resolver in agents/project-scanner.md
* (the prose previously described patterns by language; runtime LLMs produced
* inconsistent, regex-only scripts with sparse coverage).
*
* Usage:
* node extract-import-map.mjs <input.json> <output.json>
*
* Input JSON:
* {
* projectRoot: <abs-path>,
* files: [{ path, language, fileCategory }, ...]
* }
*
* Output JSON:
* {
* scriptCompleted: true,
* stats: { filesScanned, filesWithImports, totalEdges },
* importMap: { <path>: [<resolvedPath>, ...], ... }
* }
*
* Logging: stderr only (stdout reserved for piped tools).
* Per-file resilience: failures emit `Warning: extract-import-map: ...` and
* set importMap[path] = [], they do not abort the script.
*/
import { createRequire } from 'node:module';
import { dirname, resolve, join, posix } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
import { existsSync, readFileSync, realpathSync, writeFileSync } from 'node:fs';
// ES modules have no built-in __dirname; derive this script's directory from
// its own module URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// skills/understand/ -> plugin root is two dirs up
const pluginRoot = resolve(__dirname, '../..');
// CommonJS-style `require` anchored at <pluginRoot>/package.json. It is used
// below only for `require.resolve()`, so package lookup starts from the plugin
// root rather than from this script's directory.
const require = createRequire(resolve(pluginRoot, 'package.json'));
// ---------------------------------------------------------------------------
// Resolve @understand-anything/core
//
// Node ESM dynamic import() requires a file:// URL on Windows; passing a raw
// absolute path like "C:\..." throws ERR_UNSUPPORTED_ESM_URL_SCHEME because the
// loader parses "C:" as a URL scheme. Wrap both resolutions in pathToFileURL().
// ---------------------------------------------------------------------------
// Module namespace of @understand-anything/core, filled in by one of the two
// dynamic imports below.
let core;
try {
// Preferred: resolve the package through the plugin-root-anchored require.
core = await import(pathToFileURL(require.resolve('@understand-anything/core')).href);
} catch {
// Fallback: direct path for installed plugin cache layouts
// NOTE: this branch runs on ANY failure of the first import (resolution or
// evaluation). If the fallback import fails too, the rejection is not caught
// here and propagates out of the module's top-level await.
core = await import(pathToFileURL(resolve(pluginRoot, 'packages/core/dist/index.js')).href);
}
// The four exports this script destructures from the core module.
const { TreeSitterPlugin, PluginRegistry, builtinLanguageConfigs, registerAllParsers } = core;
// ---------------------------------------------------------------------------
// Path helpers
// ---------------------------------------------------------------------------
/**
 * Convert a path to forward-slash form.
 *
 * The input is split on both `/` and `\`, empty pieces are discarded, and the
 * remainder is re-joined with `/`. As a consequence leading, trailing and
 * repeated separators all disappear. `.` and `..` segments are NOT resolved;
 * they are kept verbatim.
 *
 * @param {string} p - Path using either separator style.
 * @returns {string} The same path with `/` separators only.
 */
function toPosix(p) {
  const segments = [];
  for (const piece of p.split(/[\\/]/)) {
    if (piece !== '') segments.push(piece);
  }
  return segments.join('/');
}
/**
 * Resolve `rel` against the project-relative directory `dir`.
 *
 * Both inputs are split on `/`; empty and `.` segments are ignored and each
 * `..` removes the most recently kept segment. The result is a forward-slash
 * path with no leading slash.
 *
 * @param {string} dir - Project-relative directory ('' for the project root).
 * @param {string} rel - Relative path to apply, e.g. `./foo` or `../bar/baz`.
 * @returns {string} The resolved path, or '' when a `..` would climb above
 *   the project root. Note that '' is also the result when the path resolves
 *   to the root itself.
 */
function resolveRelative(dir, rel) {
  const dirSegments = dir ? dir.split('/') : [];
  const resolved = [];
  for (const segment of [...dirSegments, ...rel.split('/')]) {
    if (segment === '' || segment === '.') continue;
    if (segment !== '..') {
      resolved.push(segment);
      continue;
    }
    // `..` with nothing left to pop means we walked out of the project.
    if (resolved.length === 0) return '';
    resolved.pop();
  }
  return resolved.join('/');
}
/**
 * Directory part of a forward-slash path: everything before the last `/`.
 *
 * @param {string} p - Project-relative POSIX path.
 * @returns {string} The containing directory without a trailing slash, or ''
 *   when `p` contains no `/` (a top-level file).
 */
function dirOf(p) {
  const cut = p.lastIndexOf('/');
  if (cut < 0) return '';
  return p.substring(0, cut);
}
// ---------------------------------------------------------------------------
// Config loading
//
// Cached once at startup. Per-file resolvers consume these values; they MUST
// NOT re-read these files (a 1000-file project would otherwise re-parse the
// same config 1000 times).
// ---------------------------------------------------------------------------
/**
 * Rewrite JSONC text (the dialect tsconfig.json is written in) into plain
 * JSON by removing `//` line comments, block comments, and trailing commas
 * before `}` / `]`.
 *
 * The scan is string-aware: double-quoted string literals are copied through
 * untouched, so values such as `"@/*"`, `"lib/*" + "/index"`-style globs or
 * `"http://host"` are never mistaken for comment delimiters.
 *
 * @param {string} text - Raw JSONC text.
 * @returns {string} Text with comments and trailing commas removed. The
 *   result is not validated; malformed input is left for JSON.parse to reject.
 */
function stripTsConfigJsonc(text) {
  const chunks = [];
  // Index in `chunks` of the most recent comma that has only been followed by
  // whitespace/comments so far; -1 when there is no such comma.
  let pendingComma = -1;
  let i = 0;
  while (i < text.length) {
    const ch = text[i];
    if (ch === '"') {
      // Copy the whole string literal verbatim, honoring backslash escapes.
      let end = i + 1;
      while (end < text.length && text[end] !== '"') {
        end += text[end] === '\\' ? 2 : 1;
      }
      chunks.push(text.slice(i, end + 1));
      pendingComma = -1;
      i = end + 1;
      continue;
    }
    if (ch === '/' && text[i + 1] === '/') {
      // Line comment: skip up to (not including) the newline.
      const eol = text.indexOf('\n', i);
      i = eol === -1 ? text.length : eol;
      continue;
    }
    if (ch === '/' && text[i + 1] === '*') {
      // Block comment: replace with one space so neighbors never fuse.
      const close = text.indexOf('*/', i + 2);
      i = close === -1 ? text.length : close + 2;
      chunks.push(' ');
      continue;
    }
    if (ch === ',') {
      pendingComma = chunks.length;
    } else if (ch === '}' || ch === ']') {
      // A comma directly before a closer is a trailing comma: drop it.
      if (pendingComma !== -1) chunks[pendingComma] = '';
      pendingComma = -1;
    } else if (!/\s/.test(ch)) {
      pendingComma = -1;
    }
    chunks.push(ch);
    i += 1;
  }
  return chunks.join('');
}

/**
 * Parse the text of one tsconfig.json and extract the two settings the
 * resolver needs.
 *
 * Parse strategy:
 *   1. Drop a leading UTF-8 BOM (JSON.parse rejects it).
 *   2. Parse the text after string-aware removal of comments and trailing
 *      commas (see stripTsConfigJsonc).
 *   3. If that fails, retry the BOM-free text unmodified.
 *   4. If both fail, return `null`.
 *
 * Returning `null` instead of throwing lets the caller emit a warning that
 * names the exact tsconfig that failed.
 *
 * @param {string} raw - File content of a tsconfig.json.
 * @returns {{ baseUrl: string, paths: Map<string, string[]> } | null}
 *   `baseUrl` defaults to '.' when absent or not a string. `paths` keeps the
 *   alias keys verbatim (wildcards included); only array-valued entries are
 *   kept, and non-string targets inside them are discarded so the resolver
 *   can call string methods on every target safely.
 */
function parseTsConfigText(raw) {
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  let parsed;
  try {
    parsed = JSON.parse(stripTsConfigJsonc(text));
  } catch {
    try {
      parsed = JSON.parse(text);
    } catch {
      return null;
    }
  }
  const compilerOptions = parsed?.compilerOptions ?? {};
  const baseUrl = typeof compilerOptions.baseUrl === 'string'
    ? compilerOptions.baseUrl
    : '.';
  const paths = new Map();
  if (compilerOptions.paths && typeof compilerOptions.paths === 'object') {
    for (const [alias, targets] of Object.entries(compilerOptions.paths)) {
      if (!Array.isArray(targets)) continue;
      paths.set(alias, targets.filter((target) => typeof target === 'string'));
    }
  }
  return { baseUrl, paths };
}
/**
 * Read and parse every `tsconfig.json` that appears in the input file list.
 *
 * The result maps the project-relative POSIX directory holding each tsconfig
 * ('' for one at the project root) to the `{ baseUrl, paths }` object produced
 * by parseTsConfigText. Alias keys keep their `*` wildcards; each alias maps
 * to an array because tsconfig permits several targets per alias.
 *
 * Why all of them and not just the root one: workspace monorepos usually
 * carry a tsconfig per package with package-scoped aliases. Using only the
 * root config would miss aliases declared in sub-packages and would wrongly
 * apply root aliases to packages that redefine them, so the resolver looks up
 * the nearest enclosing config per importer.
 *
 * Failure handling (never throws for a bad config):
 *   - listed file missing on disk  -> skipped silently
 *   - read error                   -> `Warning:` on stderr, skipped
 *   - unparseable content          -> `Warning:` on stderr, skipped
 * A project with no tsconfig at all simply yields an empty map; relative
 * imports do not need one.
 *
 * @param {string} projectRoot - Absolute path of the project root.
 * @param {Array<{ path: string }>} files - Scanner file entries.
 * @returns {Map<string, { baseUrl: string, paths: Map<string, string[]> }>}
 */
function loadTsConfigs(projectRoot, files) {
  const configsByDir = new Map();
  for (const { path: rawPath } of files) {
    const relPath = toPosix(rawPath);
    const fileName = relPath.slice(relPath.lastIndexOf('/') + 1);
    if (fileName !== 'tsconfig.json') continue;

    const absPath = join(projectRoot, relPath);
    if (!existsSync(absPath)) continue;

    let text;
    try {
      text = readFileSync(absPath, 'utf-8');
    } catch (err) {
      process.stderr.write(
        `Warning: extract-import-map: tsconfig.json at ${absPath} failed to read (${err.message}) — path aliases from this config will not be applied — relative imports unaffected\n`,
      );
      continue;
    }

    const config = parseTsConfigText(text);
    if (config) {
      configsByDir.set(dirOf(relPath), config);
      continue;
    }
    process.stderr.write(
      `Warning: extract-import-map: tsconfig.json at ${absPath} failed to parse — path aliases from this config will not be applied — relative imports unaffected\n`,
    );
  }
  return configsByDir;
}
/**
 * Read every `go.mod` that appears in the input file list and record the
 * module path it declares.
 *
 * The result maps the project-relative POSIX directory holding each go.mod
 * ('' for one at the project root) to the module name taken from its first
 * `module <name>` line, e.g.
 *
 *     module github.com/foo/bar
 *     go 1.21
 *
 * yields `github.com/foo/bar`. The Go resolver strips that prefix from an
 * import such as `github.com/foo/bar/x` to find the in-project directory `x`.
 *
 * Why all of them: multi-module repositories have one go.mod per service, and
 * the resolver picks the nearest one above each importer.
 *
 * Details visible in the code:
 *   - Only files present in `files` are considered.
 *   - `//` comments are removed from each line before matching.
 *   - A line counts only if, once trimmed, it starts with `module ` (the
 *     keyword followed by a single space).
 *   - A go.mod that is missing on disk, unreadable, or has no such line is
 *     skipped without a warning.
 *
 * @param {string} projectRoot - Absolute path of the project root.
 * @param {Array<{ path: string }>} files - Scanner file entries.
 * @returns {Map<string, string>} directory -> module name.
 */
function loadGoModules(projectRoot, files) {
  const modulesByDir = new Map();
  for (const { path: rawPath } of files) {
    const relPath = toPosix(rawPath);
    if (relPath.slice(relPath.lastIndexOf('/') + 1) !== 'go.mod') continue;

    const absPath = join(projectRoot, relPath);
    if (!existsSync(absPath)) continue;

    let text;
    try {
      text = readFileSync(absPath, 'utf-8');
    } catch {
      continue;
    }

    // First comment-stripped, trimmed line that is a module directive.
    const directive = text
      .split(/\r?\n/)
      .map((line) => line.replace(/\/\/.*$/, '').trim())
      .find((line) => line.startsWith('module '));
    const moduleName = directive === undefined
      ? ''
      : directive.slice('module '.length).trim();

    if (moduleName) modulesByDir.set(dirOf(relPath), moduleName);
  }
  return modulesByDir;
}
/**
 * Find the nearest enclosing directory that has an entry in `configMap`.
 *
 * Starting at `startDir` and moving one level up at a time until the project
 * root (''), the first directory that is a key of `configMap` is returned.
 * Because the walk goes from deepest to shallowest, the deepest matching
 * ancestor always wins, and the outcome depends on depth only — not on the
 * map's insertion order.
 *
 * @param {string} startDir - Project-relative POSIX directory ('' = root).
 * @param {{ size: number, has(key: string): boolean }} configMap - Map keyed
 *   by project-relative directory.
 * @returns {string | undefined} The matching directory ('' is a valid match
 *   for a root-level config), or undefined when no ancestor is a key.
 */
function findNearestConfigDir(startDir, configMap) {
  if (configMap.size === 0) return undefined;
  const segments = startDir ? startDir.split('/').filter(Boolean) : [];
  // Test the current prefix, then shorten it by one segment; the empty
  // prefix (project root) is the last candidate.
  for (;;) {
    const candidate = segments.join('/');
    if (configMap.has(candidate)) return candidate;
    if (segments.length === 0) return undefined;
    segments.pop();
  }
}
/**
 * Assemble the lookup tables shared by every per-file resolver call. Build it
 * once per run and pass it everywhere.
 *
 * Returned fields:
 *   - projectRoot:  the value passed in.
 *   - fileSet:      Set of every input file's POSIX path.
 *   - tsConfigs:    Map<dir, { baseUrl, paths }> for each tsconfig.json.
 *   - goModules:    Map<dir, moduleName> for each go.mod.
 *   - goFilesByDir: Map<dir, string[]> of `.go` files per directory, each
 *                   list sorted, so a package import expands without
 *                   rescanning the file set.
 *   - javaIndex / kotlinIndex / csIndex: suffix indices over `.java`, `.kt`
 *                   and `.cs` files for dotted-name resolution.
 *   - phpAutoloads: result of loadPhpAutoloads for the same inputs.
 *   - _warnedNoRustCrateRoot / _warnedNoGoModule: Sets keyed by importer
 *                   path, mutated by resolvers to emit certain warnings only
 *                   once per file.
 *
 * @param {string} projectRoot - Absolute path of the project root.
 * @param {Array<{ path: string }>} files - Scanner file entries.
 * @returns {object} The resolution context described above.
 */
function buildResolutionContext(projectRoot, files) {
  const fileSet = new Set();
  for (const entry of files) fileSet.add(toPosix(entry.path));

  const tsConfigs = loadTsConfigs(projectRoot, files);
  const goModules = loadGoModules(projectRoot, files);

  // Group .go files under their parent directory.
  const goFilesByDir = new Map();
  for (const { path } of files) {
    if (!path.endsWith('.go')) continue;
    const goPath = toPosix(path);
    const parent = dirOf(goPath);
    const bucket = goFilesByDir.get(parent);
    if (bucket) {
      bucket.push(goPath);
    } else {
      goFilesByDir.set(parent, [goPath]);
    }
  }
  for (const bucket of goFilesByDir.values()) {
    bucket.sort((left, right) => left.localeCompare(right));
  }

  // One suffix index per dotted-name language, each built a single time.
  const withExtension = (ext) => (p) => p.endsWith(ext);
  const javaIndex = buildSuffixIndex(files, withExtension('.java'));
  const kotlinIndex = buildSuffixIndex(files, withExtension('.kt'));
  const csIndex = buildSuffixIndex(files, withExtension('.cs'));

  const phpAutoloads = loadPhpAutoloads(projectRoot, files);

  return {
    projectRoot,
    fileSet,
    tsConfigs,
    goModules,
    goFilesByDir,
    javaIndex,
    kotlinIndex,
    csIndex,
    phpAutoloads,
    // Per-importer dedupe sets for one-time warnings; resolvers add to them.
    _warnedNoRustCrateRoot: new Set(),
    _warnedNoGoModule: new Set(),
  };
}
// ---------------------------------------------------------------------------
// TypeScript / JavaScript resolver
//
// Handles:
// - Relative imports: `import x from './foo'` -> `<dir>/foo` + ext probes
// - tsconfig path aliases: `import x from '@/foo'` -> `<baseUrl>/<target>/foo`
//
// `imp.source` from tree-sitter is the literal string content of the import
// path (no quotes). We don't need to redo the regex work — we just classify
// the source string and dispatch.
// ---------------------------------------------------------------------------
// Extensions probed when the import has no extension. The order mirrors the
// historical project-scanner prose so behavior matches existing fixtures.
const TS_EXT_PROBES = [
  '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
  '/index.ts', '/index.tsx', '/index.js', '/index.jsx',
];
// TypeScript sources in ESM projects import the emitted file name
// (`./foo.js`) even though only the source (`./foo.ts`) exists on disk. Maps
// each output extension to the source extensions to try, in priority order.
const TS_JS_OUTPUT_EXT_SOURCES = new Map([
  ['.js', ['.ts', '.tsx']],
  ['.jsx', ['.tsx']],
  ['.mjs', ['.mts']],
  ['.cjs', ['.cts']],
]);
/**
 * Find the project file an import base path refers to.
 *
 * Probe order:
 *   1. `basePath` exactly as given (the import already named a real file).
 *   2. `basePath` + each entry of TS_EXT_PROBES (extension-less imports and
 *      directory imports resolving to an index file).
 *   3. Extension substitution: when `basePath` ends in `.js`, `.jsx`, `.mjs`
 *      or `.cjs`, the same path with the corresponding TypeScript source
 *      extension. This only runs after steps 1-2 found nothing, so imports
 *      that resolved before keep resolving to the same file.
 *
 * @param {string} basePath - Project-relative POSIX path without quotes.
 * @param {{ has(path: string): boolean }} fileSet - Known project files.
 * @returns {string | null} The first matching project-relative path, or null
 *   when `basePath` is empty or nothing matches.
 */
function probeWithExtensions(basePath, fileSet) {
  if (!basePath) return null;
  // Exact match (import already had an extension)
  if (fileSet.has(basePath)) return basePath;
  for (const ext of TS_EXT_PROBES) {
    const candidate = basePath + ext;
    if (fileSet.has(candidate)) return candidate;
  }
  // Only treat the text after the last dot as an extension when that dot is
  // in the final path segment.
  const dot = basePath.lastIndexOf('.');
  if (dot > basePath.lastIndexOf('/')) {
    const sourceExts = TS_JS_OUTPUT_EXT_SOURCES.get(basePath.slice(dot));
    if (sourceExts) {
      const stem = basePath.slice(0, dot);
      for (const ext of sourceExts) {
        const candidate = stem + ext;
        if (fileSet.has(candidate)) return candidate;
      }
    }
  }
  return null;
}
/**
 * Map one TypeScript / JavaScript import specifier to a project file.
 *
 * Two kinds of specifier can resolve:
 *   - Relative (`./x`, `../x`): resolved against the importer's directory and
 *     probed with the usual extension list. tsconfig plays no part.
 *   - tsconfig `paths` alias: the nearest tsconfig.json at or above the
 *     importer is consulted (so each monorepo package uses its own aliases).
 *     For every alias pattern that matches, each target is expanded, placed
 *     under that tsconfig's directory + `baseUrl`, normalized, and probed;
 *     the first hit wins. Candidates that normalize to a path starting with
 *     `..` are skipped.
 * Anything else is treated as an external package.
 *
 * @param {string} rawImport - Specifier text without quotes.
 * @param {{ path: string }} file - The importing file.
 * @param {object} ctx - Resolution context (`fileSet`, `tsConfigs`).
 * @returns {string | null} Project-relative path, or null when the specifier
 *   is empty/non-string, external, or does not match any known file.
 */
export function resolveTsJsImport(rawImport, file, ctx) {
  if (typeof rawImport !== 'string') return null;
  const specifier = rawImport.trim();
  if (specifier === '') return null;

  const fromDir = dirOf(toPosix(file.path));

  // Relative specifier: no alias handling involved.
  if (specifier.startsWith('./') || specifier.startsWith('../')) {
    return probeWithExtensions(resolveRelative(fromDir, specifier), ctx.fileSet);
  }

  // Everything below is alias resolution against the nearest tsconfig.
  const configDir = findNearestConfigDir(fromDir, ctx.tsConfigs);
  if (configDir === undefined) return null;

  const { baseUrl, paths } = ctx.tsConfigs.get(configDir);
  if (!paths || !(paths.size > 0)) return null;

  for (const [pattern, targets] of paths) {
    const captured = matchTsAlias(pattern, specifier);
    if (captured === null) continue;

    for (const target of targets) {
      const substituted = applyTsAlias(target, captured);

      // '.' and '' both mean "the tsconfig's own directory".
      let baseDir = '';
      if (baseUrl !== '.' && baseUrl !== '') baseDir = toPosix(baseUrl);

      let fromConfig = substituted;
      if (baseDir) fromConfig = posix.join(baseDir, substituted);

      let joined = fromConfig;
      if (configDir) joined = posix.join(configDir, fromConfig);

      // normalize() also removes a leftover leading "./" (root tsconfig with
      // a target like "./*"), which would otherwise never match fileSet keys.
      const candidate = posix.normalize(joined);

      // Targets are not allowed to point outside the project root.
      if (candidate.startsWith('..')) continue;

      const hit = probeWithExtensions(candidate, ctx.fileSet);
      if (hit) return hit;
    }
  }

  // Bare specifier with no usable alias: external package.
  return null;
}
/**
 * Test an import specifier against one tsconfig `paths` key.
 *
 * A key may contain a single `*` wildcard: `"@/*"` matches `"@/foo/bar"` and
 * captures `"foo/bar"`. The text before the `*` must be a prefix of the
 * specifier and the text after it a suffix, without the two overlapping. A
 * key with no `*` matches only the identical specifier.
 *
 * @param {string} alias - The `paths` key.
 * @param {string} src - The import specifier.
 * @returns {string | null} The text captured by the wildcard ('' for an
 *   exact-match key or an empty capture), or null when there is no match.
 */
function matchTsAlias(alias, src) {
  const star = alias.indexOf('*');
  if (star < 0) return alias === src ? '' : null;

  const head = alias.slice(0, star);
  const tail = alias.slice(star + 1);
  // The length check stops head and tail from claiming the same characters.
  const fits =
    src.length >= head.length + tail.length &&
    src.startsWith(head) &&
    src.endsWith(tail);
  return fits ? src.slice(head.length, src.length - tail.length) : null;
}
/**
 * Expand a tsconfig `paths` target with the text captured by matchTsAlias.
 *
 * Only the first `*` in the target is replaced. A target that contains no
 * `*` is returned unchanged.
 *
 * @param {string} target - One entry of a `paths` target array.
 * @param {string} wildcard - Text captured from the import specifier.
 * @returns {string} The target with its first `*` replaced by `wildcard`.
 */
function applyTsAlias(target, wildcard) {
  const star = target.indexOf('*');
  return star < 0
    ? target
    : `${target.slice(0, star)}${wildcard}${target.slice(star + 1)}`;
}
/**
 * Tree-sitter's TS/JS extractor only records ES module `import` declarations.
 * CommonJS `require('./foo')` is treated as a generic call expression and
 * never enters `analysis.imports`, which would silently drop edges in
 * Node-style codebases. Patch coverage with a focused regex pass on the file
 * content — we only want literal string arguments, so the regex is narrow.
 *
 * Limitations (intentional):
 * - Computed requires (`require(name)`) are external/dynamic — skipped.
 * - Template-literal requires are unresolved.
 * - String concatenation in the argument is unresolved.
 *
 * Pattern notes:
 * - `\brequire\(` — the `(` must follow `require` immediately; whitespace
 *   between them is not accepted. The word boundary also matches after a
 *   `.`, so `obj.require('x')` is captured as well.
 * - Group 1 is the opening quote (`'` or `"`); the back-reference requires
 *   the closing quote to be the same character.
 * - Group 2 is the specifier: one or more characters that are not a quote,
 *   a backtick, or a newline.
 * - After the closing quote only whitespace may precede `)`, so calls with
 *   extra arguments or concatenation do not match.
 * - The `g` flag makes the regex stateful (`lastIndex`); reset it before
 *   each scan of a new string.
 */
const REQUIRE_LITERAL_RE = /\brequire\(\s*(['"])([^'"`\n]+?)\1\s*\)/g;
/**
 * Delete JS/TS-style comments from source text before running text-pattern
 * matchers over it.
 *
 * Block comments are removed first, then `//` comments up to the end of
 * their line. Comments are removed outright (nothing is inserted in their
 * place), so offsets into the result do not correspond to the input.
 *
 * The removal is purely textual and does not recognize string literals: a
 * `//` or a block-comment opener that appears inside a string is treated as
 * a comment too.
 *
 * @param {string} content - Source text.
 * @returns {string} The text with comments deleted.
 */
function stripJsLikeComments(content) {
  const withoutBlockComments = content.replace(/\/\*[\s\S]*?\*\//g, '');
  const withoutLineComments = withoutBlockComments.replace(/\/\/[^\n]*/g, '');
  return withoutLineComments;
}
/**
 * Collect the string-literal arguments of `require(...)` calls in a file.
 *
 * Comments are stripped first so commented-out requires are not reported.
 * Specifiers are returned in source order; duplicates are kept.
 *
 * @param {string} content - File content.
 * @returns {string[]} The captured specifiers (group 2 of REQUIRE_LITERAL_RE).
 */
function extractRequireSources(content) {
  const code = stripJsLikeComments(content);
  // The regex is global and shared; start every scan from the beginning.
  REQUIRE_LITERAL_RE.lastIndex = 0;
  return Array.from(code.matchAll(REQUIRE_LITERAL_RE), (match) => match[2]);
}
/**
 * Kotlin import extraction by regex (this project has no tree-sitter
 * extractor for Kotlin).
 *
 * Recognized forms, one per line, optionally indented:
 *   import x.y.Z
 *   import x.y.Z as Alias
 *   import x.y.*
 * The captured group is the dotted name, including a trailing `.*` for star
 * imports; classifying the wildcard is left to the dotted-name resolver.
 *
 * The capture follows a strict qualified-name shape — an identifier, then
 * any number of `.identifier` segments, then an optional `.*` — so malformed
 * lines such as `import ...` or `import .foo` are rejected. Nothing other
 * than whitespace may follow the import on its line.
 */
const KOTLIN_IMPORT_RE =
  /^\s*import\s+(\w+(?:\.\w+)*(?:\.\*)?)(?:\s+as\s+\w+)?\s*$/gm;
/**
 * Return the dotted names of all Kotlin imports in `content`, in source
 * order.
 *
 * @param {string} content - Kotlin file content.
 * @returns {string[]} Captured import names (aliases are not included).
 */
function extractKotlinSources(content) {
  // The regex is global and shared; start every scan from the beginning.
  KOTLIN_IMPORT_RE.lastIndex = 0;
  return Array.from(content.matchAll(KOTLIN_IMPORT_RE), (match) => match[1]);
}
// ---------------------------------------------------------------------------
// Python resolver
//
// Tree-sitter's Python extractor emits one entry per import statement:
// - `import a.b.c` -> { source: 'a.b.c', specifiers: ['a.b.c'] }
// - `from a.b.c import x,y` -> { source: 'a.b.c', specifiers: ['x','y'] }
// - `from . import x` -> { source: '', specifiers: ['x'] }
// - `from .x import y` -> { source: '.x', specifiers: ['y'] }
// - `from ..pkg import y` -> { source: '..pkg', specifiers: ['y'] }
//
// We can't tell relative from absolute by the source string alone — the dots
// could be a leading-dot relative source OR a literal `.` package separator.
// Python's lexical convention disambiguates: leading dots ALWAYS mean
// relative. Tree-sitter preserves leading dots verbatim in the source field,
// so we can dispatch on the prefix.
//
// Resolution rules:
// 1. Relative (starts with `.`): walk up parent dirs by leading-dot count,
// then descend by the remaining dotted segments.
// 2. Absolute (no leading dot): walk up from the importer's directory,
// trying EACH ancestor as a candidate Python root. The first ancestor
// under which probing succeeds wins. This matches how multi-service
// Python repos work in practice — each service directory acts as its
// own root for unqualified `import sibling` style imports
// (e.g. microservices-demo's per-service grpc stubs).
//
// We don't gate this on setup.py / pyproject.toml detection. The
// probe itself IS the test of whether the ancestor is a candidate
// root: an absent module just continues the walk. The closest
// ancestor where the import resolves wins, which gives importer
// scope precedence (sibling files override remote candidates).
// ---------------------------------------------------------------------------
/**
 * Map one Python import statement to project files.
 *
 * Unlike most resolvers this returns an array: a package import can yield
 * the package's `__init__.py` plus one file per imported submodule.
 *
 * Dispatch is on the number of leading dots in `rawImport`:
 *
 *   - Absolute (no leading dot): every ancestor of the importer's directory,
 *     deepest first and ending at the project root, is tried as a candidate
 *     import root. The first ancestor under which the module resolves wins,
 *     so a file next to the importer shadows a same-named module elsewhere.
 *     An empty source resolves to nothing.
 *
 *   - Relative (one or more leading dots): one dot anchors at the importer's
 *     own directory and each additional dot moves one directory up. Going
 *     above the project root resolves to nothing.
 *       * With a dotted tail (`from .a.b import x`) the tail is appended to
 *         the anchor and probed as a module or package.
 *       * With no tail (`from . import x, y`) each specifier is probed
 *         directly as `<anchor>/<x>.py`, then `<anchor>/<x>/__init__.py`.
 *         The anchor is NOT required to contain `__init__.py`, so implicit
 *         namespace packages (PEP 420) still resolve. `*` and dotted
 *         specifiers are skipped.
 *
 * @param {string} rawImport - Module source as written, leading dots kept.
 * @param {string[]} specifiers - Imported names for `from ... import` forms.
 * @param {{ path: string }} file - The importing file.
 * @param {object} ctx - Resolution context (`fileSet`).
 * @returns {string[]} Resolved project-relative paths; empty for external
 *   or unresolved imports and for a non-string `rawImport`.
 */
export function resolvePythonImport(rawImport, specifiers, file, ctx) {
  if (typeof rawImport !== 'string') return [];

  const importerDir = dirOf(toPosix(file.path));
  const importerParts = importerDir ? importerDir.split('/').filter(Boolean) : [];

  // Split the source into its leading-dot run and the dotted module path.
  const leadingDots = /^\.*/.exec(rawImport)[0].length;
  const tailSegments = rawImport.slice(leadingDots).split('.').filter(Boolean);

  if (leadingDots === 0) {
    // Absolute import: nothing to probe for an empty source.
    if (tailSegments.length === 0) return [];
    // Try each ancestor as an import root, nearest to the importer first.
    for (let depth = importerParts.length; depth >= 0; depth -= 1) {
      const candidate = [...importerParts.slice(0, depth), ...tailSegments];
      const found = resolvePythonProbe(candidate, specifiers, ctx);
      if (found.length > 0) return found;
    }
    return [];
  }

  // Relative import: the dots fix the anchor exactly, so no ancestor walk.
  const levelsUp = leadingDots - 1;
  if (levelsUp > importerParts.length) return [];
  const anchorParts = importerParts.slice(0, importerParts.length - levelsUp);

  if (tailSegments.length > 0) {
    return resolvePythonProbe([...anchorParts, ...tailSegments], specifiers, ctx);
  }

  // `from . import x, y`: specifiers are siblings living at the anchor.
  if (!Array.isArray(specifiers) || specifiers.length === 0) return [];
  const anchor = anchorParts.join('/');
  const found = [];
  for (const name of specifiers) {
    if (!name || name === '*' || name.includes('.')) continue;
    const stem = anchor ? `${anchor}/${name}` : name;
    if (ctx.fileSet.has(`${stem}.py`)) {
      found.push(`${stem}.py`);
    } else if (ctx.fileSet.has(`${stem}/__init__.py`)) {
      found.push(`${stem}/__init__.py`);
    }
  }
  return found;
}
/**
 * Probe the file set for a Python module given as path segments
 * (e.g. ['src', 'utils']).
 *
 * Order of checks for segments joining to `a/b/c`:
 *   1. `a/b/c.py` — a plain module. If present it is the only result and
 *      specifiers are not examined.
 *   2. `a/b/c/__init__.py` — a package. The result starts with that file and
 *      then, for each specifier, adds `a/b/c/<spec>.py` or else
 *      `a/b/c/<spec>/__init__.py` when one of them exists. `*` and dotted
 *      specifiers are skipped.
 *   3. Neither exists — empty result (treated as external).
 * An empty segment list always yields an empty result.
 *
 * @param {string[]} moduleParts - Module path segments from the project root.
 * @param {string[] | undefined} specifiers - Imported names, if any.
 * @param {{ fileSet: { has(path: string): boolean } }} ctx - Resolution context.
 * @returns {string[]} Resolved project-relative paths (may contain
 *   duplicates if `specifiers` does).
 */
function resolvePythonProbe(moduleParts, specifiers, ctx) {
  if (moduleParts.length === 0) return [];

  const { fileSet } = ctx;
  const stem = moduleParts.join('/');

  const asModule = `${stem}.py`;
  if (fileSet.has(asModule)) return [asModule];

  const asPackage = `${stem}/__init__.py`;
  if (!fileSet.has(asPackage)) return [];

  const resolved = [asPackage];
  if (!Array.isArray(specifiers)) return resolved;

  for (const name of specifiers) {
    if (!name || name === '*' || name.includes('.')) continue;
    // A submodule file takes precedence over a subpackage of the same name.
    const hit = [`${stem}/${name}.py`, `${stem}/${name}/__init__.py`]
      .find((candidate) => fileSet.has(candidate));
    if (hit !== undefined) resolved.push(hit);
  }
  return resolved;
}
// ---------------------------------------------------------------------------
// Go resolver
//
// Tree-sitter's Go extractor emits the literal import path (without quotes).
// Resolution: walk up from the importer's directory to find the nearest
// enclosing `go.mod` (multi-module monorepos are the norm). Strip that
// module's prefix; the remainder maps to a directory RELATIVE TO THAT
// MODULE'S DIRECTORY in the project. Go imports are package-level (not
// file-level), so a single `import "github.com/foo/bar/util"` produces edges
// to every .go file inside that module's `util/`.
//
// Cross-module imports (`github.com/foo/bar/X` from a file under a module
// that declares `github.com/foo/baz`) are correctly classified as external —
// they refer to a different Go module, which from this module's perspective
// is a third-party dependency.
//
// Inputs:
// - rawImport: 'github.com/foo/bar/util' (no quotes)
// - file.path: importer's project-relative path
// - ctx.goModules: Map<dir, moduleName> of every go.mod discovered.
//
// Result: array of every `<moduleDir>/util/*.go` path in the project
// (deduped by caller).
// ---------------------------------------------------------------------------
/**
 * Map one Go import path to every `.go` file of the imported package.
 *
 * The nearest go.mod at or above the importer determines the module name.
 * An import equal to that name, or starting with `<name>/`, is internal: the
 * remainder is a directory relative to the go.mod's directory and all `.go`
 * files indexed for that directory are returned. Any other import is
 * treated as external, including packages of a different in-tree module.
 *
 * When the importer has no enclosing go.mod a warning is written to stderr,
 * at most once per importer file, and nothing resolves.
 *
 * @param {string} rawImport - Import path without quotes.
 * @param {{ path: string }} file - The importing file.
 * @param {object} ctx - Resolution context (`goModules`, `goFilesByDir`,
 *   `_warnedNoGoModule`).
 * @returns {string[]} A fresh array of project-relative `.go` paths; empty
 *   when the import is external, unresolved, empty, or not a string.
 */
export function resolveGoImport(rawImport, file, ctx) {
  if (typeof rawImport !== 'string') return [];
  const importPath = rawImport.trim();
  if (importPath === '') return [];

  const importerPath = toPosix(file.path);
  const moduleDir = findNearestConfigDir(dirOf(importerPath), ctx.goModules);

  if (moduleDir === undefined) {
    // One file may contain many imports; warn for it only the first time.
    const alreadyWarned = ctx._warnedNoGoModule.has(importerPath);
    if (!alreadyWarned) {
      ctx._warnedNoGoModule.add(importerPath);
      process.stderr.write(
        `Warning: extract-import-map: Go file ${importerPath} has no ancestor go.mod — import ${importPath} unresolvable — module-prefix imports skipped\n`,
      );
    }
    return [];
  }

  const moduleName = ctx.goModules.get(moduleDir);

  // The prefix must end at a `/` boundary so a module name that is merely a
  // textual prefix of another import path does not match.
  let remainder;
  if (importPath === moduleName) {
    remainder = '';
  } else if (importPath.startsWith(`${moduleName}/`)) {
    remainder = importPath.slice(moduleName.length + 1);
  } else {
    return [];
  }

  // Anchor the package directory at the module's own directory.
  const targetDir = [moduleDir, toPosix(remainder)].filter(Boolean).join('/');
  const members = ctx.goFilesByDir.get(targetDir);
  return members ? members.slice() : [];
}
// ---------------------------------------------------------------------------
// Dotted-package resolver (Java / Kotlin / C#)
//
// Shared logic: an import like `com.example.foo.Bar` maps to a file
// `**/com/example/foo/Bar.<ext>` in the project. Many JVM/CLR projects nest
// sources under `src/main/java/`, `src/main/kotlin/`, etc., so the resolver
// must search for any file whose suffix matches the dotted-path-as-file form.
//
// We pre-build an index: trailing-slash-suffix -> matching project paths.
// Indexing once is O(files * average_segments); per-import lookup is then
// effectively O(1) hash lookup + scan of the bucket.
// ---------------------------------------------------------------------------
/**
 * Index the files accepted by `extPredicate` under every directory-bounded
 * suffix of their path.
 *
 * For `src/main/java/com/x/Y.java` the index receives one entry per suffix
 * that starts at a segment boundary, from the full path down to the bare
 * file name:
 *   'src/main/java/com/x/Y.java', 'main/java/com/x/Y.java', ...,
 *   'com/x/Y.java', 'x/Y.java', 'Y.java'
 * each mapping to the full project path. A dotted import can then be looked
 * up by its path form with a single Map access instead of a scan over all
 * files. Every bucket is sorted so results are deterministic.
 *
 * @param {Array<{ path: string }>} files - Scanner file entries.
 * @param {(posixPath: string) => boolean} extPredicate - Selects the files
 *   to index; it receives the forward-slash path.
 * @returns {Map<string, string[]>} suffix -> sorted full project paths.
 */
function buildSuffixIndex(files, extPredicate) {
  const index = new Map();
  for (const { path } of files) {
    const normalized = toPosix(path);
    if (!extPredicate(normalized)) continue;

    const segments = normalized.split('/');
    segments.forEach((_, start) => {
      const key = segments.slice(start).join('/');
      const bucket = index.get(key);
      if (bucket === undefined) {
        index.set(key, [normalized]);
      } else {
        bucket.push(normalized);
      }
    });
  }
  index.forEach((bucket) => {
    bucket.sort((left, right) => left.localeCompare(right));
  });
  return index;
}
/**
 * Look up a dotted qualified name (`com.example.Foo`) in a suffix index.
 *
 * A single trailing `.*` wildcard is removed first: a wildcard import has no
 * single-file target, so what remains is probed as if it were a type name.
 * The dots are then turned into `/`, `ext` (`.java`, `.kt`, `.cs`) is
 * appended, and the result is used as the index key.
 *
 * @param {string} fqn - dotted import source
 * @param {string} ext - file extension to append, including the dot
 * @param {Map<string, string[]>} suffixIndex - index from buildSuffixIndex
 * @returns {string[]} a fresh array of matching paths; empty when the input
 *   is not a non-empty string or nothing matches. More than one entry means
 *   the same suffix exists under several source roots.
 */
function resolveDottedFqn(fqn, ext, suffixIndex) {
  if (typeof fqn !== 'string' || fqn === '') return [];
  const withoutWildcard = fqn.endsWith('.*') ? fqn.slice(0, -2) : fqn;
  if (withoutWildcard === '') return [];
  const suffixKey = `${withoutWildcard.split('.').join('/')}${ext}`;
  const bucket = suffixIndex.get(suffixKey);
  if (!bucket) return [];
  return Array.from(bucket);
}
// ---------------------------------------------------------------------------
// Java resolver
// ---------------------------------------------------------------------------
/**
 * Resolve a Java import by probing its dotted name against the `.java`
 * suffix index held in `ctx.javaIndex`. The importing file is not consulted.
 *
 * @returns {string[]} whatever resolveDottedFqn returns for the lookup
 */
export function resolveJavaImport(rawImport, _file, ctx) {
  const { javaIndex } = ctx;
  return resolveDottedFqn(rawImport, '.java', javaIndex);
}
// ---------------------------------------------------------------------------
// Kotlin resolver
//
// Kotlin has no tree-sitter extractor in this project, so its import sources
// are collected via a focused regex pass in extractExtraImportSources(); the
// resolver itself is identical-shape to Java.
// ---------------------------------------------------------------------------
/**
 * Resolve a Kotlin import by probing its dotted name against the `.kt`
 * suffix index held in `ctx.kotlinIndex`. The importing file is not
 * consulted.
 *
 * @returns {string[]} whatever resolveDottedFqn returns for the lookup
 */
export function resolveKotlinImport(rawImport, _file, ctx) {
  const { kotlinIndex } = ctx;
  return resolveDottedFqn(rawImport, '.kt', kotlinIndex);
}
// ---------------------------------------------------------------------------
// C# resolver
//
// C# `using Foo.Bar;` declarations are typically NAMESPACES, not files, and
// the C# convention is namespace = directory (loose). Tree-sitter's C#
// extractor captures these as imports with the dotted source. We probe the
// dotted path against the .cs index the same way Java/Kotlin do.
// ---------------------------------------------------------------------------
/**
 * Resolve a C# `using` source by probing its dotted name against the `.cs`
 * suffix index held in `ctx.csIndex`. The importing file is not consulted.
 *
 * @returns {string[]} whatever resolveDottedFqn returns for the lookup
 */
export function resolveCSharpImport(rawImport, _file, ctx) {
  const { csIndex } = ctx;
  return resolveDottedFqn(rawImport, '.cs', csIndex);
}
// ---------------------------------------------------------------------------
// Ruby resolver
//
// Two distinct Ruby import forms, with different resolution semantics:
// - `require_relative 'foo'` -> resolve against the importer's directory,
// append .rb
// - `require 'foo/bar'` -> load-path probe: lib/foo/bar.rb,
// app/foo/bar.rb, or foo/bar.rb (whichever
// exists)
//
// Tree-sitter's Ruby extractor uses a single `imports` field for both forms
// and drops the method name, so we cannot tell them apart from the
// extractor output alone. Instead we use a regex pass on the file content,
// which preserves the method name as the discriminator.
//
// The two forms are unambiguous in source — both start with the method name
// followed by a quoted argument — so a focused regex is reliable.
// ---------------------------------------------------------------------------
// Matches `require` / `require_relative` followed by an optional `(` and a
// quoted literal. Capture groups:
//   1 = method name (`require_relative` or `require`)
//   2 = the opening quote character; `\2` requires the same character to close
//   3 = the quoted argument (no quotes, backticks or newlines inside)
// The `g` flag makes the regex stateful across exec() calls, which is why
// parseRubyImports resets `lastIndex` before scanning.
const RUBY_REQUIRE_RE =
/\b(require_relative|require)\s*\(?\s*(['"])([^'"`\n]+?)\2/g;
/**
 * Remove Ruby comments before the require regex runs, so commented-out
 * `require` calls do not register as imports.
 *
 * Two comment forms are stripped:
 *   - block comments: a line starting with `=begin` through the next line
 *     starting with `=end` (Ruby only recognizes both markers at column 0).
 *     Ruby files are scanned by regex only, so nothing else filters these.
 *   - line comments: `#` to end of line.
 *
 * This is a heuristic: it does not honor string contents, so a `#` inside a
 * string literal (including `#{...}` interpolation) is treated as a comment
 * start. An unterminated `=begin` is left untouched.
 *
 * @param {string} content - raw Ruby source
 * @returns {string} source with comment text removed
 */
function stripRubyComments(content) {
  return content
    // Block comments first, so `#` characters inside them cannot leave a
    // dangling `=begin` / `=end` marker behind.
    .replace(/^=begin\b[\s\S]*?^=end\b[^\n]*/gm, '')
    .replace(/#[^\n]*/g, '');
}
/**
 * Scan a Ruby file for `require` / `require_relative` calls with a quoted
 * literal argument.
 *
 * Comments are stripped first; the method name captured by the regex is what
 * distinguishes the two forms.
 *
 * @param {string} content - raw Ruby source
 * @returns {Array<{kind: 'relative'|'absolute', source: string}>} one entry
 *   per call, in source order
 */
function parseRubyImports(content) {
  const uncommented = stripRubyComments(content);
  // The regex is shared and global; start every scan from the beginning.
  RUBY_REQUIRE_RE.lastIndex = 0;
  return Array.from(
    uncommented.matchAll(RUBY_REQUIRE_RE),
    ([, method, , argument]) => ({
      kind: method === 'require_relative' ? 'relative' : 'absolute',
      source: argument,
    }),
  );
}
/**
 * Resolve one parsed Ruby require to a project file.
 *
 * `.rb` is appended when the source does not already end with it. Then:
 *   - kind 'relative' (require_relative): resolve against the importer's
 *     directory and accept it only if that exact path is in the file set.
 *   - anything else (require): probe `lib/<src>`, `app/<src>`, `<src>` in
 *     that order and take the first one present in the file set.
 *
 * @param {{kind: string, source: string}} parsed - entry from parseRubyImports
 * @param {{path: string}} file - the importing file
 * @param {{fileSet: Set<string>}} ctx - resolution context
 * @returns {string[]} zero or one resolved path
 */
export function resolveRubyImport({ kind, source }, file, ctx) {
  if (!source) return [];
  const importerDir = dirOf(toPosix(file.path));
  const target = source.endsWith('.rb') ? source : `${source}.rb`;
  if (kind === 'relative') {
    const sibling = resolveRelative(importerDir, target);
    return ctx.fileSet.has(sibling) ? [sibling] : [];
  }
  // Approximate Ruby's load path with the conventional project roots.
  const loadPathProbes = [`lib/${target}`, `app/${target}`, target];
  const hit = loadPathProbes.find((probe) => ctx.fileSet.has(probe));
  return hit === undefined ? [] : [hit];
}
// ---------------------------------------------------------------------------
// PHP resolver
//
// PHP's `use Vendor\Pkg\Class;` is namespace-based. Composer's PSR-4
// autoload map (`composer.json` -> autoload.psr-4) declares which directory
// holds the files for each namespace prefix, e.g.:
// { "App\\": "src/" } means App\Foo\Bar lives at src/Foo/Bar.php
//
// Resolution:
// 1. Find the longest matching autoload prefix.
// 2. Strip that prefix from the FQN.
// 3. Translate backslashes to forward slashes.
// 4. Append `.php` and probe the file set.
//
// Imports whose namespace is not declared in any autoload entry are
// external — dropped.
// ---------------------------------------------------------------------------
/**
 * Parse the text of one composer.json and extract its `autoload.psr-4`
 * section.
 *
 * The returned directories are relative to the composer.json's own
 * directory — NOT projectRoot — matching how PSR-4 itself is specified.
 *
 * Returning `null` on a JSON syntax error (rather than throwing) lets the
 * caller emit a warning naming the exact composer.json that failed.
 *
 * @param {string} raw - composer.json file content
 * @returns {Map<string, string[]> | null} namespace prefix -> target dirs;
 *   an empty Map when there is no usable `psr-4` section; `null` when the
 *   text is not valid JSON
 */
function parseComposerAutoloadText(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return null;
  }
  const out = new Map();
  const psr4 = parsed?.autoload?.['psr-4'];
  if (!psr4 || typeof psr4 !== 'object') return out;
  for (const [prefix, target] of Object.entries(psr4)) {
    // A prefix may map to a single dir or to a list of dirs.
    const targets = Array.isArray(target) ? target : [target];
    // Normalize each dir to a form that can be joined and compared against
    // the project file set: POSIX separators, no leading `./`, no trailing
    // `/`. A target that names the composer directory itself (`.` or `./`)
    // becomes the empty string.
    const normalized = targets
      .filter((t) => typeof t === 'string')
      .map((t) => {
        const dir = toPosix(t)
          .replace(/^(?:\.\/)+/, '')
          .replace(/\/+$/, '');
        return dir === '.' ? '' : dir;
      });
    // Ensure non-empty prefixes end with a backslash so the
    // longest-prefix-match does not accidentally split mid-segment
    // ("App" vs "Application"). Preserve the empty prefix as-is — it's
    // Composer's fallback mapping (`"psr-4": {"": "src/"}`) and means
    // "any namespace resolves under this dir". Appending `\` would
    // convert it into a prefix that matches nothing.
    const normalizedPrefix =
      prefix === '' || prefix.endsWith('\\') ? prefix : `${prefix}\\`;
    out.set(normalizedPrefix, normalized);
  }
  return out;
}
/**
 * Collect the PSR-4 autoload map of every `composer.json` in the input file
 * list.
 *
 * Composer monorepos commonly stack a root composer.json over per-package
 * ones, so all of them are loaded rather than only the root.
 *
 * A composer.json that is missing on disk is skipped silently. One that
 * cannot be read or parsed produces a `Warning:` on stderr naming the file
 * and is skipped; the others still load.
 *
 * @param {string} projectRoot - absolute project root
 * @param {Array<{path: string}>} files - project file entries
 * @returns {Map<string, Map<string, string[]>>} project-relative POSIX
 *   directory of each composer.json (as returned by dirOf) -> its autoload map
 */
function loadPhpAutoloads(projectRoot, files) {
  const autoloadsByDir = new Map();
  for (const entry of files) {
    const composerPath = toPosix(entry.path);
    const fileName = composerPath.split('/').pop();
    if (fileName !== 'composer.json') continue;

    const absPath = join(projectRoot, composerPath);
    if (!existsSync(absPath)) continue;

    let text;
    try {
      text = readFileSync(absPath, 'utf-8');
    } catch (readError) {
      process.stderr.write(
        `Warning: extract-import-map: composer.json at ${absPath} failed ` +
          `to read (${readError.message}) — PSR-4 namespace mapping from this ` +
          `composer.json unavailable — PHP imports under this package ` +
          `will not resolve\n`,
      );
      continue;
    }

    const autoload = parseComposerAutoloadText(text);
    if (autoload !== null) {
      autoloadsByDir.set(dirOf(composerPath), autoload);
      continue;
    }
    process.stderr.write(
      `Warning: extract-import-map: composer.json at ${absPath} failed ` +
        `to parse — PSR-4 namespace mapping unavailable — PHP imports ` +
        `under this package will not resolve\n`,
    );
  }
  return autoloadsByDir;
}
/**
 * Resolve a PHP `use` name through the PSR-4 map of the importer's nearest
 * enclosing composer.json.
 *
 * Steps: drop one leading backslash, find the composer directory that
 * governs the importer, pick the longest autoload prefix the name starts
 * with, turn the remainder into a `/`-separated path plus `.php`, and probe
 * each target directory of that prefix in order.
 *
 * Candidates are anchored at the composer.json's directory, not the project
 * root, so a sub-package's `App\Foo\Bar` lands under
 * `<package-dir>/src/Foo/Bar.php`.
 *
 * @param {string} rawImport - name as written in the `use` statement
 * @param {{path: string}} file - the importing file
 * @param {{phpAutoloads: Map, fileSet: Set<string>}} ctx - resolution context
 * @returns {string[]} zero or one resolved path (first target dir that
 *   contains the file)
 */
export function resolvePhpImport(rawImport, file, ctx) {
  if (typeof rawImport !== 'string' || rawImport === '') return [];
  // PHP allows a fully-qualified `use \Foo\Bar;`
  const fqn = rawImport.replace(/^\\/, '');
  if (fqn === '') return [];

  const importerDir = dirOf(toPosix(file.path));
  const composerDir = findNearestConfigDir(importerDir, ctx.phpAutoloads);
  if (composerDir === undefined) return [];
  const autoload = ctx.phpAutoloads.get(composerDir);
  if (!autoload || autoload.size === 0) return [];

  // Longest matching prefix wins. The winner starts as `null` (not as a
  // zero-length prefix) so that the empty PSR-4 fallback prefix `""` can
  // still be selected when nothing more specific matches.
  let winner = null;
  for (const entry of autoload) {
    const [prefix] = entry;
    if (!fqn.startsWith(prefix)) continue;
    if (winner === null || prefix.length > winner[0].length) {
      winner = entry;
    }
  }
  if (winner === null || winner[1] === null) return [];
  const [matchedPrefix, targetDirs] = winner;

  // The prefix is covered by the target directory; only the remainder
  // contributes path segments.
  const relative = fqn.slice(matchedPrefix.length).split('\\').join('/');
  if (!relative) return [];

  for (const dir of targetDirs) {
    // PSR-4 target dirs are relative to the composer.json directory.
    const baseDir = [composerDir, dir].filter(Boolean).join('/');
    const candidate = baseDir ? `${baseDir}/${relative}.php` : `${relative}.php`;
    if (ctx.fileSet.has(candidate)) return [candidate];
  }
  return [];
}
// ---------------------------------------------------------------------------
// Rust resolver
//
// Rust's module system is path-based but the import syntax is `use` rather
// than path strings. Tree-sitter emits sources like `crate::a::b::Item`,
// `super::a::Item`, `self::a`, or bare `std::collections::HashMap`. We map
// only those rooted at `crate::` or `super::` — bare paths are external
// crates.
//
// Resolution heuristics:
// - `crate::a::b::*` -> probe `<crate-root>/a/b.rs`, then
// `<crate-root>/a/b/mod.rs`. The crate root is `<package-dir>/src/`
// (Cargo convention).
// - `super::a::b::*` -> walk up one directory from the importer, then
// descend; same .rs / mod.rs probes.
// - `self::a::*` -> like `super::a::*` but without the walk-up.
//
// Rust uses won't always land on a file (an import like `crate::Foo` could
// refer to a struct re-exported through `mod.rs`); we accept that limitation.
//
// We also extract `mod x;` declarations via regex — these declare submodules
// to load and translate directly to `<importer-dir>/x.rs` or
// `<importer-dir>/x/mod.rs`.
// ---------------------------------------------------------------------------
/**
 * Probe the two on-disk spellings of a Rust module: `<base>.rs` first, then
 * `<base>/mod.rs`.
 *
 * @param {string} base - module path without extension
 * @param {Set<string>} fileSet - project file paths
 * @returns {string | null} the first spelling present in the file set, or
 *   null when `base` is empty or neither exists
 */
function probeRustModule(base, fileSet) {
  if (!base) return null;
  const spellings = [`${base}.rs`, `${base}/mod.rs`];
  return spellings.find((candidate) => fileSet.has(candidate)) ?? null;
}
/**
 * Locate the crate source directory that governs a Rust importer.
 *
 * Starting at the importer's own directory and moving up to the project
 * root (inclusive), each ancestor is checked for `<ancestor>/src/lib.rs` or
 * `<ancestor>/src/main.rs`. The first ancestor that has one determines the
 * result. An importer that itself sits inside `src/` is handled by the same
 * walk: one level up is the package directory, whose `src/` probe succeeds.
 *
 * @param {string} importerDir - project-relative POSIX directory
 * @param {Set<string>} fileSet - project file paths
 * @returns {string | null} the `src` directory (project-relative), or null
 *   when no ancestor has a crate root file
 */
function findRustCrateSrc(importerDir, fileSet) {
  const segments = importerDir.split('/').filter(Boolean);
  let depth = segments.length;
  while (depth >= 0) {
    const srcDir = [...segments.slice(0, depth), 'src'].join('/');
    const hasCrateRoot = ['lib.rs', 'main.rs'].some((rootFile) =>
      fileSet.has(`${srcDir}/${rootFile}`),
    );
    if (hasCrateRoot) return srcDir;
    depth -= 1;
  }
  return null;
}
/**
 * Resolve a Rust `use` path (or a synthesized `self::<name>` from a
 * `mod <name>;` declaration) to a project file.
 *
 * Only paths rooted at `crate`, `super` or `self` are considered; anything
 * else is treated as an external crate and yields `[]`.
 *
 * Base directory selection:
 *   - `crate::`  -> the crate `src` directory found by findRustCrateSrc.
 *   - `self::` / `super::` -> anchored at the importer's *module directory*.
 *     For `mod.rs`, `lib.rs` and `main.rs` that is the file's own directory.
 *     For any other file `dir/name.rs` it is `dir/name/`, because such a
 *     file's child modules live in a directory named after it. Each `super`
 *     (including chained `super::super::`) moves one level up from there.
 *     The importer's plain directory is kept as a second candidate so crate
 *     roots with other names (e.g. `src/bin/tool.rs`) still resolve their
 *     sibling modules.
 *
 * Within a base directory, segment prefixes are probed longest-first so that
 * `crate::a::b::Item` lands on `a/b.rs` (with `Item` defined inside) rather
 * than failing because `a/b/Item.rs` does not exist.
 *
 * @param {string} rawImport - import source as extracted
 * @param {{path: string}} file - the importing file
 * @param {{fileSet: Set<string>, _warnedNoRustCrateRoot: Set<string>}} ctx
 * @returns {string[]} zero or one resolved path
 */
export function resolveRustImport(rawImport, file, ctx) {
  if (!rawImport || typeof rawImport !== 'string') return [];
  const src = rawImport.trim();
  if (!src) return [];
  const importerPath = toPosix(file.path);
  const importerDir = dirOf(importerPath);
  const segments = src.split('::').filter(Boolean);
  if (segments.length === 0) return [];
  const head = segments[0];
  // External crates: anything not rooted at crate/super/self.
  if (head !== 'crate' && head !== 'super' && head !== 'self') return [];

  let rest = segments.slice(1);
  let baseDirs;
  if (head === 'crate') {
    const crateSrc = findRustCrateSrc(importerDir, ctx.fileSet);
    if (!crateSrc) {
      // Warn once per importer file (a single .rs file can have many
      // `use crate::...` statements; suppress duplicate warnings).
      if (!ctx._warnedNoRustCrateRoot.has(importerPath)) {
        ctx._warnedNoRustCrateRoot.add(importerPath);
        process.stderr.write(
          `Warning: extract-import-map: Rust file ${importerPath} has ` +
            `'use crate::' but no crate root (src/lib.rs or src/main.rs) ` +
            `found — crate-relative imports unresolved\n`,
        );
      }
      return [];
    }
    baseDirs = [crateSrc];
  } else {
    // Count every leading `super`, so `super::super::x` walks up twice.
    let levelsUp = head === 'super' ? 1 : 0;
    while (rest[0] === 'super') {
      levelsUp += 1;
      rest = rest.slice(1);
    }
    const fileName = importerPath.slice(importerPath.lastIndexOf('/') + 1);
    const stem = fileName.endsWith('.rs') ? fileName.slice(0, -3) : fileName;
    // mod.rs / lib.rs / main.rs own their directory; any other file's
    // children live in a sibling directory named after the file.
    const ownsDirectory = stem === 'mod' || stem === 'lib' || stem === 'main';
    let moduleDir = importerDir;
    if (!ownsDirectory) {
      moduleDir = importerDir ? `${importerDir}/${stem}` : stem;
    }
    // Walk `levelsUp` directories up; null when that would leave the project.
    const ascend = (dir) => {
      const parts = dir.split('/').filter(Boolean);
      if (levelsUp > parts.length) return null;
      return parts.slice(0, parts.length - levelsUp).join('/');
    };
    const candidates = [ascend(moduleDir), ascend(importerDir)];
    baseDirs = [...new Set(candidates.filter((dir) => dir !== null))];
  }

  // Try each prefix length from longest -> shortest. The empty rest case
  // (e.g. bare `use crate;`) is unresolvable.
  for (const baseDir of baseDirs) {
    for (let i = rest.length; i > 0; i--) {
      const modulePath = rest.slice(0, i).join('/');
      const base = baseDir ? `${baseDir}/${modulePath}` : modulePath;
      const match = probeRustModule(base, ctx.fileSet);
      if (match) return [match];
    }
  }
  return [];
}
/**
 * Matches a whole-line Rust `mod <name>;` declaration, optionally preceded
 * by `pub` or `pub(...)`. Capture group 1 is the module name. Inline modules
 * (`mod x { ... }`) do not match because the `;` is required.
 */
const RUST_MOD_RE = /^\s*(?:pub(?:\s*\([^)]*\))?\s+)?mod\s+(\w+)\s*;\s*$/gm;

/**
 * Collect `mod x;` declarations from a Rust file. These are not part of the
 * tree-sitter `imports` field, yet each one names a child module file.
 *
 * Comments are removed first with the JS/TS stripper (Rust shares the `//`
 * and block comment syntax), so a commented-out `// mod fake;` does not
 * register a module.
 *
 * @param {string} content - raw Rust source
 * @returns {string[]} one `self::<name>` source per declaration, in source
 *   order, ready to be passed through the regular Rust resolver
 */
function extractRustModSources(content) {
  const uncommented = stripJsLikeComments(content);
  // The regex is shared and global; start every scan from the beginning.
  RUST_MOD_RE.lastIndex = 0;
  return Array.from(
    uncommented.matchAll(RUST_MOD_RE),
    (declaration) => `self::${declaration[1]}`,
  );
}
// ---------------------------------------------------------------------------
// C / C++ resolver
//
// Tree-sitter's cpp extractor exposes both quoted and angle-bracket includes
// as imports with `source` set to the bare filename (e.g. `foo.h`).
// Quoted includes resolve relative to the importer's directory; angle
// includes look in a system path. We can't tell quoted from angle from
// tree-sitter alone, but the resolution rules overlap enough that probing
// both yields the right answer most of the time:
// 1. <importer-dir>/<source>
// 2. include/<source>
// 3. src/<source>
// 4. <source> (project-root-relative)
//
// We probe in that order and take the first match. Multiple file extensions
// (.h, .hpp, .hxx, .cuh) are NOT auto-appended — #include carries the
// extension explicitly.
// ---------------------------------------------------------------------------
/**
 * Resolve a C/C++ `#include` source to a project file.
 *
 * The include text is used as-is (no extension is appended) and probed, in
 * order, at: the importer's directory, `include/`, `src/`, and the project
 * root. The first candidate present in the file set wins.
 *
 * @param {string} rawImport - include path as extracted
 * @param {{path: string}} file - the including file
 * @param {{fileSet: Set<string>}} ctx - resolution context
 * @returns {string[]} zero or one resolved path
 */
export function resolveCppImport(rawImport, file, ctx) {
  if (typeof rawImport !== 'string' || rawImport === '') return [];
  const include = toPosix(rawImport.trim());
  if (!include) return [];
  const importerDir = dirOf(toPosix(file.path));
  const probeOrder = [
    resolveRelative(importerDir, include),
    `include/${include}`,
    `src/${include}`,
    include,
  ];
  const hit = probeOrder.find((candidate) => candidate && ctx.fileSet.has(candidate));
  return hit === undefined ? [] : [hit];
}
// ---------------------------------------------------------------------------
// Dispatcher
// ---------------------------------------------------------------------------
/**
 * Language identifiers that belong to the TypeScript/JavaScript family.
 * resolveImport() routes files with one of these languages to
 * resolveTsJsImport(), and extractExtraImportSources() runs the supplemental
 * require() extraction for them. Every other language has its own explicit
 * branch in the dispatcher.
 */
const TS_JS_LANGS = new Set([
'typescript', 'javascript', 'tsx', 'jsx', 'vue',
]);
/**
 * Route one raw import to the resolver for the importing file's language.
 *
 * The TS/JS resolver returns a single path (or a falsy value), which is
 * wrapped into an array here; every other resolver already returns an array.
 * Python additionally receives the import's specifiers.
 *
 * Per-resolver contract: never throw, never read disk (read once in main()).
 *
 * @param {{source: string, specifiers?: Array}} imp - extracted import
 * @param {{path: string, language: string}} file - the importing file
 * @param {object} ctx - resolution context
 * @returns {string[]} resolved project-relative paths; empty means external,
 *   unresolved, or a language without a branch here
 */
function resolveImport(imp, file, ctx) {
  const { language } = file;
  const { source } = imp;
  if (TS_JS_LANGS.has(language)) {
    const single = resolveTsJsImport(source, file, ctx);
    return single ? [single] : [];
  }
  switch (language) {
    case 'python':
      return resolvePythonImport(source, imp.specifiers, file, ctx);
    case 'go':
      return resolveGoImport(source, file, ctx);
    case 'java':
      return resolveJavaImport(source, file, ctx);
    case 'kotlin':
      return resolveKotlinImport(source, file, ctx);
    case 'csharp':
      return resolveCSharpImport(source, file, ctx);
    case 'php':
      return resolvePhpImport(source, file, ctx);
    case 'rust':
      return resolveRustImport(source, file, ctx);
    case 'c':
    case 'cpp':
      return resolveCppImport(source, file, ctx);
    default:
      // Ruby never reaches this dispatcher: its tree-sitter extractor
      // flattens require vs require_relative into one field, so it goes
      // through a dedicated pathway that keeps the discriminator.
      return [];
  }
}
/**
 * Gather raw import sources that the tree-sitter pass does not report, by
 * delegating to a per-language extractor:
 *   - TS/JS family -> extractRequireSources (require() calls)
 *   - Kotlin       -> extractKotlinSources
 *   - Rust         -> extractRustModSources (`mod x;` declarations, which
 *                     name submodule files the Rust resolver can locate)
 *
 * @param {{language: string}} file - the file being analyzed
 * @param {string} content - the file's text
 * @returns {string[]} import-source strings to feed through resolveImport();
 *   empty for every other language
 */
function extractExtraImportSources(file, content) {
  const { language } = file;
  if (TS_JS_LANGS.has(language)) {
    return extractRequireSources(content);
  }
  switch (language) {
    case 'kotlin':
      return extractKotlinSources(content);
    case 'rust':
      return extractRustModSources(content);
    default:
      return [];
  }
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
/**
 * CLI entry point: read the input JSON named by argv[2], resolve imports for
 * every listed file, and write the result JSON to the path in argv[3].
 *
 * Exits with status 1 when either path argument is missing. Throws when the
 * input lacks `projectRoot` or a `files` array, or when the output file is
 * absent after writing. Per-file read/analysis failures are reported on
 * stderr and recorded as an empty import list instead of aborting.
 */
async function main() {
  const [inputPath, outputPath] = process.argv.slice(2);
  if (!inputPath || !outputPath) {
    process.stderr.write('Usage: node extract-import-map.mjs <input.json> <output.json>\n');
    process.exit(1);
  }

  const { projectRoot, files } = JSON.parse(readFileSync(inputPath, 'utf-8'));
  if (!projectRoot || !Array.isArray(files)) {
    throw new Error('Invalid input: must contain projectRoot and files array');
  }

  // Tree-sitter setup is allowed to fail. The likely causes are environmental
  // (grammar binaries not loadable: cache eviction, restricted sandboxes,
  // transient FS issues). In that case the script still completes with an
  // empty import list for every code file, so the rest of the scanner
  // pipeline keeps working — only the import edges are lost.
  let registry = null;
  let treeSitterReady = false;
  try {
    const grammarConfigs = builtinLanguageConfigs.filter((config) => config.treeSitter);
    const tsPlugin = new TreeSitterPlugin(grammarConfigs);
    await tsPlugin.init();
    registry = new PluginRegistry();
    registry.register(tsPlugin);
    registerAllParsers(registry);
    treeSitterReady = true;
  } catch (err) {
    process.stderr.write(
      `Warning: extract-import-map: tree-sitter init failed ` +
        `(${err.message}) — all importMap entries will be empty — ` +
        `structural graph will have no import edges\n`,
    );
  }

  // Resolution context (indexes and cached configs) is built once up front.
  const ctx = buildResolutionContext(projectRoot, files);
  const importMap = {};
  let filesWithImports = 0;
  let totalEdges = 0;

  for (const file of files) {
    const path = toPosix(file.path);

    // Empty entry for non-code files, and for every file when tree-sitter
    // init failed (the one-time warning was already emitted above).
    if (file.fileCategory !== 'code' || !treeSitterReady) {
      importMap[path] = [];
      continue;
    }

    const absolutePath = join(projectRoot, file.path);
    let content;
    try {
      content = readFileSync(absolutePath, 'utf-8');
    } catch (err) {
      process.stderr.write(
        `Warning: extract-import-map: import resolution failed for ${path} ` +
          `(read error: ${err.message}) — importMap[${path}]=[]\n`,
      );
      importMap[path] = [];
      continue;
    }

    let resolved;
    try {
      const resolvedSet = new Set();
      // Keep only non-empty results that name a file in the project.
      const keep = (outs) => {
        for (const out of outs) {
          if (out && ctx.fileSet.has(out)) resolvedSet.add(out);
        }
      };

      if (file.language === 'ruby') {
        // Ruby bypasses the registry: the resolver needs the regex-parsed
        // shape that still distinguishes require from require_relative.
        for (const imp of parseRubyImports(content)) {
          keep(resolveRubyImport(imp, file, ctx));
        }
      } else {
        const analysis = registry.analyzeFile(file.path, content);
        for (const imp of analysis?.imports ?? []) {
          keep(resolveImport(imp, file, ctx));
        }
        // Supplemental sources tree-sitter doesn't capture (e.g. CJS
        // require() calls, Kotlin imports); deduplicated by the same set.
        for (const extra of extractExtraImportSources(file, content)) {
          keep(resolveImport({ source: extra, specifiers: [] }, file, ctx));
        }
      }
      resolved = [...resolvedSet].sort((a, b) => a.localeCompare(b));
    } catch (err) {
      process.stderr.write(
        `Warning: extract-import-map: import resolution failed for ${path} ` +
          `(analyze error: ${err.message}) — importMap[${path}]=[]\n`,
      );
      importMap[path] = [];
      continue;
    }

    importMap[path] = resolved;
    if (resolved.length > 0) {
      filesWithImports += 1;
      totalEdges += resolved.length;
    }
  }

  const stats = { filesScanned: files.length, filesWithImports, totalEdges };
  const output = { scriptCompleted: true, stats, importMap };
  writeFileSync(outputPath, JSON.stringify(output, null, 2), 'utf-8');
  if (!existsSync(outputPath)) {
    throw new Error(`output file missing after write: ${outputPath}`);
  }
  process.stderr.write(
    `extract-import-map: filesScanned=${files.length} ` +
      `filesWithImports=${filesWithImports} totalEdges=${totalEdges}\n`,
  );
}
// ---------------------------------------------------------------------------
// Run only when executed directly as a CLI; importing the module (e.g. from
// tests) must not trigger main().
//
// Canonicalize both sides through realpathSync. Node ESM resolves
// import.meta.url through symlinks but pathToFileURL(process.argv[1]) preserves
// them, so a raw equality check silently no-ops when the script is invoked via
// a symlinked plugin install path (the default in Claude Code / Copilot CLI
// caches). See GitHub issue #162.
// ---------------------------------------------------------------------------
/**
 * Report whether this module is the script Node was asked to run.
 *
 * Both the module's own path and `process.argv[1]` are passed through
 * realpathSync before comparing, so an invocation through a symlinked
 * install path still counts as a direct run.
 *
 * @returns {boolean} false when there is no argv[1], when either path cannot
 *   be resolved, or when the two real paths differ
 */
function isCliEntry() {
  const invokedScript = process.argv[1];
  if (!invokedScript) return false;
  try {
    const thisModule = realpathSync(fileURLToPath(import.meta.url));
    return thisModule === realpathSync(invokedScript);
  } catch {
    return false;
  }
}
// Run main() only for a direct CLI invocation. Any failure is reported on
// stderr (message plus stack) and turned into exit status 1.
const invokedAsCli = isCliEntry();
if (invokedAsCli) {
  try {
    await main();
  } catch (failure) {
    process.stderr.write(`extract-import-map.mjs failed: ${failure.message}\n${failure.stack}\n`);
    process.exit(1);
  }
}