autojanet/skills/understand/scan-project.mjs
Zoë cc74ad0bd0
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
fix: use library/ Harbor project, add skills, fix pipeline secrets
- .woodpecker.yaml: image paths -> library/autojanet-{agent,dispatcher}
- .woodpecker.yaml: secret names RS_HARBOR_USER / RS_HARBOR_PASS (global)
- container/Dockerfile: restore COPY skills/, skills/ populated from opencode config
- skills/: 84 opencode skills bundled into image
- k8s/manifests: update image refs to library/
2026-05-30 15:43:14 -07:00

802 lines
27 KiB
JavaScript

#!/usr/bin/env node
/**
* scan-project.mjs
*
* Deterministic file enumeration + language/category detection for the
* project-scanner agent. Replaces the LLM-written prose scanner that used to
* (a) author a per-run Node.js script (`tmp/ua-project-scan.js`), (b) walk the
* file tree, and (c) classify each file via lookup tables in LLM context — a
* pure rule-lookup pass that was being billed at LLM rates and adding many
* minutes of per-run latency on mid-sized monorepos.
*
* What the LLM still owns (Step A of project-scanner.md Phase 1):
* - Reading README + top-level manifests to synthesize `name`,
* `rawDescription`, `readmeHead`, `frameworks`, and the high-level
* `languages` narrative.
*
* What this script owns:
* - File enumeration (git ls-files preferred, recursive walk fallback)
* - `.understandignore` filtering (delegated to core's createIgnoreFilter)
* - Per-file language detection (extension + filename table)
* - Per-file category assignment (priority-ordered rules from
* project-scanner.md Step 4)
* - Line counting
* - Complexity estimation (project-scanner.md Step 7 thresholds)
*
* Usage:
* node scan-project.mjs <projectRoot> <outputPath>
*
* Output JSON (subset of what project-scanner.md Phase 1 expects — the LLM
* agent merges this with Step A's narrative fields and Step C's importMap to
* produce the final scan-result.json):
* {
* "scriptCompleted": true,
* "files": [{ "path": "...", "language": "...", "sizeLines": N, "fileCategory": "..." }, ...],
* "totalFiles": N,
* "filteredByIgnore": M,
* "estimatedComplexity": "small" | "moderate" | "large" | "very-large",
* "stats": { "filesScanned": N, "byCategory": {...}, "byLanguage": {...} }
* }
*
* Logging: stderr only (stdout reserved for piped tooling).
* Per-file resilience: read/stat failures emit
* `Warning: scan-project: <path> — <reason> — file skipped from output`
* to stderr and the file is dropped; the rest of the scan completes.
*
* Determinism: files are sorted by `path.localeCompare` before emission, and
* the underlying enumeration is deterministic (git ls-files returns a stable
* order; the fallback walker sorts each directory's entries).
*/
import { createRequire } from 'node:module';
import { dirname, resolve, join, basename, extname, relative, sep } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
import {
existsSync,
readFileSync,
readdirSync,
realpathSync,
statSync,
writeFileSync,
} from 'node:fs';
import { spawnSync } from 'node:child_process';
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// skills/understand/ -> plugin root is two dirs up
const pluginRoot = resolve(__dirname, '../..');
// CommonJS-style `require`, anchored at the plugin root's package.json so
// require.resolve() below resolves packages relative to the plugin root
// rather than this script's directory. The same `require` is reused further
// down for a lazy `require('node:os')`.
const require = createRequire(resolve(pluginRoot, 'package.json'));
// ---------------------------------------------------------------------------
// Resolve @understand-anything/core
//
// Two-step resolution: try the workspace-linked package first, fall back to
// the installed plugin cache layout. pathToFileURL() is required on Windows
// because dynamic import() of raw "C:\..." paths throws
// ERR_UNSUPPORTED_ESM_URL_SCHEME (Node parses "C:" as a URL scheme).
//
// Note: the bare `catch` means ANY failure of the first attempt (resolution
// or module evaluation) triggers the fallback. If the fallback import also
// fails, nothing catches it here — the top-level await rejects and the
// module fails to load.
// ---------------------------------------------------------------------------
let core;
try {
core = await import(pathToFileURL(require.resolve('@understand-anything/core')).href);
} catch {
core = await import(pathToFileURL(resolve(pluginRoot, 'packages/core/dist/index.js')).href);
}
// createIgnoreFilter is the only core export this script uses.
const { createIgnoreFilter } = core;
// ---------------------------------------------------------------------------
// Language detection
//
// Mirrors the canonical extension list from
// understand-anything-plugin/packages/core/src/languages/configs/* and the
// project-scanner.md Step 3 table. Extensions are matched lowercase;
// filenames (Dockerfile, Makefile, etc.) are matched case-sensitively because
// the projects-in-the-wild use canonical capitalizations.
//
// Where the core configs and project-scanner.md diverge (rare), project-
// scanner.md wins because it is the user-facing contract.
// ---------------------------------------------------------------------------
/**
 * Extension -> language id. Keys are lowercase and include the leading dot;
 * `detectLanguage()` looks them up with the lowercased `path.extname()`
 * result, or with the leading dotfile segment (e.g. `.env` for `.env.local`).
 *
 * Includes the legacy Step-3 mapping (.cfg/.ini/.env -> `config`) — note
 * that `config` is a language id here, not a category. Category routing
 * for these extensions is handled separately in CATEGORY_BY_EXT.
 *
 * An extension missing from this table is not an error: `detectLanguage()`
 * falls back to the bare lowercased extension (without the dot).
 *
 * Frozen so the table cannot be mutated at runtime.
 */
const LANGUAGE_BY_EXT = Object.freeze({
// TypeScript / JavaScript
'.ts': 'typescript',
'.tsx': 'typescript',
'.js': 'javascript',
'.jsx': 'javascript',
'.mjs': 'javascript',
'.cjs': 'javascript',
// Python
'.py': 'python',
'.pyi': 'python',
// Go / Rust / Java / Kotlin / C# / Swift / Lua
'.go': 'go',
'.rs': 'rust',
'.java': 'java',
'.kt': 'kotlin',
'.kts': 'kotlin',
'.cs': 'csharp',
'.swift': 'swift',
'.lua': 'lua',
// Ruby / PHP
'.rb': 'ruby',
'.rake': 'ruby',
'.php': 'php',
// C / C++ (note: `.h` is attributed to C, not C++)
'.c': 'c',
'.h': 'c',
'.cpp': 'cpp',
'.cc': 'cpp',
'.cxx': 'cpp',
'.hpp': 'cpp',
'.hxx': 'cpp',
// Vue / Svelte (no tree-sitter extractor, but project-scanner contract
// lists them as code languages — downstream import map will return [])
'.vue': 'vue',
'.svelte': 'svelte',
// Shell / Batch / PowerShell
'.sh': 'shell',
'.bash': 'shell',
'.zsh': 'shell',
'.ps1': 'powershell',
'.psm1': 'powershell',
'.psd1': 'powershell',
'.bat': 'batch',
'.cmd': 'batch',
// Markup / docs (all stylesheet dialects share the `css` id; `.rst` is
// grouped under the `markdown` id)
'.html': 'html',
'.htm': 'html',
'.css': 'css',
'.scss': 'css',
'.sass': 'css',
'.less': 'css',
'.md': 'markdown',
'.mdx': 'markdown',
'.rst': 'markdown',
// Config / data
'.yaml': 'yaml',
'.yml': 'yaml',
'.json': 'json',
'.jsonc': 'jsonc',
'.toml': 'toml',
'.xml': 'xml',
'.xsl': 'xml',
'.xsd': 'xml',
'.plist': 'xml',
'.cfg': 'config',
'.ini': 'config',
'.env': 'config',
// Data / schema (`.tsv` shares the `csv` id)
'.sql': 'sql',
'.graphql': 'graphql',
'.gql': 'graphql',
'.proto': 'protobuf',
'.prisma': 'prisma',
'.csv': 'csv',
'.tsv': 'csv',
// Infra
'.tf': 'terraform',
'.tfvars': 'terraform',
// JVM build files (categorized via filename-or-extension)
'.gradle': 'gradle',
// .NET project files (mapped to extension-derived ids; downstream
// treats them as config — see CATEGORY_BY_EXT)
'.csproj': 'csproj',
'.sln': 'sln',
'.properties': 'properties',
// NOTE(review): presumably Go module files (go.mod / go.sum) — confirm
// against project-scanner.md.
'.mod': 'mod',
'.sum': 'sum',
});
/**
 * Filename -> language id for well-known files that carry no extension.
 * Compared case-sensitively against basename(path); a no-extension file that
 * is NOT an own key of this table resolves to `unknown`.
 *
 * Dockerfile.* variants (Dockerfile.dev, Dockerfile.prod) are handled by a
 * startsWith check in `detectLanguage()` so we don't have to enumerate every
 * possible suffix.
 */
const LANGUAGE_BY_FILENAME = Object.freeze({
  Dockerfile: 'dockerfile',
  Makefile: 'makefile',
  GNUmakefile: 'makefile',
  makefile: 'makefile',
  Jenkinsfile: 'jenkinsfile',
  Procfile: 'procfile',
  Vagrantfile: 'vagrantfile',
});

/**
 * Detect the language id of a file from its path alone (no file I/O).
 *
 * Resolution order:
 *   1. `Dockerfile` / `Dockerfile.*`            -> `dockerfile`
 *   2. leading dotfile segment (`.env.local`)   -> LANGUAGE_BY_EXT lookup
 *   3. lowercased extension                     -> LANGUAGE_BY_EXT lookup,
 *      else the bare extension without its dot
 *   4. exact basename                           -> LANGUAGE_BY_FILENAME lookup
 *   5. otherwise                                -> `unknown`
 *
 * Always returns a non-empty string; downstream consumers rely on this field
 * always being a string (see project-scanner.md Step 3 "Fallback" note).
 *
 * @param {string} filePath - Path of the file (relative or absolute).
 * @returns {string} Language id, never null/empty.
 */
export function detectLanguage(filePath) {
  const base = basename(filePath);
  const ext = extname(filePath).toLowerCase();

  // Dockerfile.dev, Dockerfile.prod, etc. — common variant form.
  if (base === 'Dockerfile' || base.startsWith('Dockerfile.')) return 'dockerfile';

  // Dotfile names like .env, .env.local — path.extname returns '' for
  // single-segment dotfiles (e.g. '.env') and the LAST segment for compound
  // dotfiles (e.g. '.local' for '.env.local'). Neither hits the intended
  // LANGUAGE_BY_EXT['.env'] mapping, so try the leading dotfile portion first.
  const dotKey = dotfileKey(base);
  if (dotKey && LANGUAGE_BY_EXT[dotKey]) return LANGUAGE_BY_EXT[dotKey];

  // path.extname('index.') is '.', a bare dot with no extension text. Treat
  // it as "no extension"; otherwise the fallback below would return ''.
  if (ext.length > 1) {
    const byExt = LANGUAGE_BY_EXT[ext];
    if (byExt) return byExt;
    // Unknown extension → drop the leading dot (already lowercased).
    return ext.slice(1);
  }

  // No-extension file — try the filename table. Own-property check only:
  // a plain `LANGUAGE_BY_FILENAME[base]` would pick up inherited members
  // (files literally named `constructor`, `toString`, ...) and return a
  // function instead of a language id.
  if (Object.prototype.hasOwnProperty.call(LANGUAGE_BY_FILENAME, base)) {
    return LANGUAGE_BY_FILENAME[base];
  }
  return 'unknown';
}

/**
 * Extract the canonical dotfile "extension" from a basename, or null.
 *
 * `.env` -> `.env`
 * `.env.local` -> `.env`
 * `.bashrc` -> `.bashrc`
 * `package.json` -> null (not a dotfile)
 *
 * The key is the leading dot plus the first run of ASCII letters/digits,
 * lowercased. Used by both detectLanguage and detectCategory so dotfile-style
 * configs (e.g., `.env`, `.env.local`, `.env.production`) get their leading
 * segment treated as the implicit extension instead of falling through
 * to `unknown` / `code`.
 *
 * @param {string} base - Basename of the file (no directory part).
 * @returns {string|null} Lowercased dotfile key, or null when `base` is not
 *   a dotfile or has no alphanumeric character right after the dot.
 */
function dotfileKey(base) {
  if (!base.startsWith('.')) return null;
  const m = base.match(/^(\.[a-z0-9]+)/i);
  return m ? m[1].toLowerCase() : null;
}
// ---------------------------------------------------------------------------
// Category detection
//
// Implements the priority-ordered rules from project-scanner.md Step 4.
// Order matters: more specific rules must run before more general ones
// (e.g. `docker-compose.yml` is infra, not config).
//
// Categories: code | config | docs | infra | data | script | markup
// ---------------------------------------------------------------------------
/**
 * Extension -> category. Keys are lowercase and include the leading dot.
 * Used only after the higher-priority filename/path-based checks
 * (LICENSE, infra) in `detectCategory()`. Plain extension lookup is
 * intentionally last-resort — many configs need their full path
 * inspected first.
 *
 * `detectCategory()` also consults this table with the leading dotfile
 * segment (e.g. `.env` for `.env.local`). Extensions missing from the table
 * fall through to the `code` category there.
 *
 * Frozen so the table cannot be mutated at runtime.
 */
const CATEGORY_BY_EXT = Object.freeze({
// docs (`.txt` / `.text` have no LANGUAGE_BY_EXT entry, so their language id
// is just the bare extension)
'.md': 'docs',
'.mdx': 'docs',
'.rst': 'docs',
'.txt': 'docs',
'.text': 'docs',
// config
'.yaml': 'config',
'.yml': 'config',
'.json': 'config',
'.jsonc': 'config',
'.toml': 'config',
'.xml': 'config',
'.xsl': 'config',
'.xsd': 'config',
'.plist': 'config',
'.cfg': 'config',
'.ini': 'config',
'.env': 'config',
'.properties': 'config',
'.csproj': 'config',
'.sln': 'config',
'.mod': 'config',
'.sum': 'config',
'.gradle': 'config',
// infra
'.tf': 'infra',
'.tfvars': 'infra',
// data
'.sql': 'data',
'.graphql': 'data',
'.gql': 'data',
'.proto': 'data',
'.prisma': 'data',
'.csv': 'data',
'.tsv': 'data',
// script
'.sh': 'script',
'.bash': 'script',
'.zsh': 'script',
'.ps1': 'script',
'.psm1': 'script',
'.psd1': 'script',
'.bat': 'script',
'.cmd': 'script',
// markup
'.html': 'markup',
'.htm': 'markup',
'.css': 'markup',
'.scss': 'markup',
'.sass': 'markup',
'.less': 'markup',
});
/**
 * Exact basenames (with or without an extension) that always map to `infra`
 * regardless of their extension. Compared case-sensitively against
 * basename(path) via `Set.has`.
 *
 * Prefix-style names (`Dockerfile.*`, `docker-compose.*`) and the
 * `compose.yml` / `compose.yaml` pair are NOT listed here; they are matched
 * by explicit checks in `detectCategory()`.
 */
const INFRA_FILENAMES = new Set([
'Dockerfile',
'.dockerignore',
'Makefile',
'GNUmakefile',
'makefile',
'Jenkinsfile',
'Procfile',
'Vagrantfile',
'.gitlab-ci.yml',
]);
/**
 * Classify a file into one of the project-scanner categories
 * (code | config | docs | infra | data | script | markup) from its path
 * alone. The first matching rule wins, mirroring project-scanner.md Step 4:
 *
 *   1. A file named exactly `LICENSE` -> `code`. The Step-2 exclusion table
 *      normally removes LICENSE; if a project re-includes it through an
 *      `.understandignore` negation it must not land in `docs`, and no
 *      dedicated bucket exists for it.
 *   2. Infra by filename: INFRA_FILENAMES, `Dockerfile` / `Dockerfile.*`,
 *      `docker-compose.*`, `compose.yml`, `compose.yaml`.
 *   3. Infra by path: paths starting with `.github/workflows/` or
 *      `.circleci/`, any `k8s/` or `kubernetes/` directory segment, and
 *      `*.k8s.yml` / `*.k8s.yaml` basenames.
 *   4. CATEGORY_BY_EXT lookup by lowercased extension, then by the leading
 *      dotfile segment (`.env.local` -> `.env`), which path.extname misses.
 *   5. Everything else -> `code`.
 *
 * @param {string} filePath - Project-relative path of the file.
 * @returns {string} The category name.
 */
export function detectCategory(filePath) {
  const base = basename(filePath);

  // Rule 1: LICENSE exception.
  if (base === 'LICENSE') return 'code';

  // Rule 2: infra recognised purely from the basename.
  const infraByName =
    INFRA_FILENAMES.has(base) ||
    base === 'Dockerfile' ||
    base.startsWith('Dockerfile.') ||
    base.startsWith('docker-compose.') ||
    base === 'compose.yml' ||
    base === 'compose.yaml';
  if (infraByName) return 'infra';

  // Rule 3: infra recognised from where the file lives. Normalise to
  // forward slashes first so the prefix/segment tests work on any platform.
  const posix = filePath.split(sep).join('/');
  const infraByPath =
    posix.startsWith('.github/workflows/') ||
    posix.startsWith('.circleci/') ||
    /(^|\/)(k8s|kubernetes)\//.test(posix) ||
    /\.k8s\.(ya?ml)$/i.test(base);
  if (infraByPath) return 'infra';

  // Rule 4 / 4.5: table lookup, real extension first, dotfile key second.
  const ext = extname(filePath).toLowerCase();
  for (const key of [ext, dotfileKey(base)]) {
    const category = key ? CATEGORY_BY_EXT[key] : undefined;
    if (category) return category;
  }

  // Rule 5: nothing matched.
  return 'code';
}
// ---------------------------------------------------------------------------
// Complexity estimation (project-scanner.md Step 7)
// ---------------------------------------------------------------------------
/**
 * Translate a file count into a complexity tier. Each tier is defined by
 * its inclusive upper bound:
 *   - small:      up to 30
 *   - moderate:   31-150
 *   - large:      151-500
 *   - very-large: anything above 500
 *
 * A count of 0 lands in `small` (the lowest tier), so the field is always
 * populated even for empty repos; "nothing to analyze" is signalled through
 * `totalFiles`, not through this value.
 *
 * @param {number} totalFiles - Number of files in the scan result.
 * @returns {'small'|'moderate'|'large'|'very-large'} The tier name.
 */
export function estimateComplexity(totalFiles) {
  // Ordered by ascending ceiling; the first tier that fits wins.
  const tiers = [
    [30, 'small'],
    [150, 'moderate'],
    [500, 'large'],
  ];
  for (const [ceiling, label] of tiers) {
    if (totalFiles <= ceiling) return label;
  }
  return 'very-large';
}
// ---------------------------------------------------------------------------
// File enumeration
// ---------------------------------------------------------------------------
/**
 * Convert a path that uses the platform separator into forward-slash form.
 * The project-scanner contract emits POSIX paths, so every path is
 * re-normalized here to keep the output stable across Windows/macOS/Linux.
 *
 * @param {string} p - Path using `path.sep` as its separator.
 * @returns {string} The same path with every separator replaced by `/`.
 */
function toPosix(p) {
  const segments = p.split(sep);
  return segments.join('/');
}
/**
 * Enumerate all files in `projectRoot` via `git ls-files`. Returns an
 * array of unique project-relative POSIX paths, or null when git produced
 * no usable listing: non-zero exit, git not installed / not a repository,
 * or empty output. On null the caller falls back to the recursive walker.
 *
 * Why git ls-files first: it respects the repo's `.gitignore` (via
 * `--exclude-standard`) and gives a fast listing of tracked (`-c`) plus
 * untracked-but-not-ignored (`-o`) files. The walker has no .gitignore
 * awareness, so the ignore filter has to do more work in the fallback path.
 *
 * @param {string} projectRoot - Directory to run git in.
 * @returns {string[] | null} Unique paths in git's output order, or null.
 */
function enumerateViaGit(projectRoot) {
  // -z = NUL-terminated output. Without it, `git ls-files` C-escapes non-ASCII
  // bytes in path names — paths containing emoji, accented characters, CJK
  // codepoints, etc. come back quoted with octal escapes (e.g.
  // `"30. \360\237\217\227 BD-CCER/file.md"` for a path containing 🏗️).
  // Those quoted-escaped strings then fail to round-trip back to real disk
  // paths in downstream consumers, so files in such directories are silently
  // dropped from the scan. The -z form emits raw bytes between NUL separators,
  // preserving every codepoint as-is.
  const result = spawnSync('git', ['ls-files', '-z', '-co', '--exclude-standard'], {
    cwd: projectRoot,
    encoding: 'utf-8',
    maxBuffer: 256 * 1024 * 1024, // 256MB — huge monorepos can produce >10MB of paths
  });
  // `status` is null when git could not be spawned, which also fails this test.
  if (result.status !== 0 || !result.stdout) return null;

  // Each NUL-separated chunk is one project-relative path. Git lists an
  // unmerged path once per index stage, so during a merge conflict the same
  // path can appear several times; a Set drops the repeats while keeping
  // first-seen order, so no file is scanned and counted twice.
  const unique = new Set();
  for (const chunk of result.stdout.split('\0')) {
    if (chunk) unique.add(toPosix(chunk));
  }
  return [...unique];
}
/**
 * Fallback enumeration for when `git ls-files` cannot be used: a depth-first
 * recursive walk of `projectRoot`.
 *
 * A small set of directory names (`node_modules`, VCS metadata,
 * `__pycache__`) is pruned during the walk itself, before the ignore filter
 * ever sees a path, so huge dependency trees are never materialized.
 *
 * Only regular files are collected. Symlinks are skipped on purpose —
 * following them is a classic recursion-bomb footgun. Each directory's
 * entries are sorted by name before being visited, so the result does not
 * depend on OS-specific readdir order. An unreadable directory produces a
 * warning on stderr and its subtree is skipped; the walk continues.
 *
 * @param {string} projectRoot - Directory to walk.
 * @returns {string[]} Project-relative POSIX paths.
 */
function enumerateViaWalk(projectRoot) {
  // Universally non-source directories, pruned at the walker level.
  const prunedDirNames = new Set([
    'node_modules',
    '.git',
    '.svn',
    '.hg',
    '__pycache__',
  ]);
  const collected = [];

  const visit = (dirPath) => {
    let children;
    try {
      children = readdirSync(dirPath, { withFileTypes: true });
    } catch (err) {
      const shownPath = toPosix(relative(projectRoot, dirPath)) || '.';
      process.stderr.write(
        `Warning: scan-project: ${shownPath} ` +
          `— directory read failed (${err.message}) — subtree skipped\n`,
      );
      return;
    }

    // Files and directories are sorted together, by name.
    children.sort((left, right) => left.name.localeCompare(right.name));

    for (const child of children) {
      const childPath = join(dirPath, child.name);
      if (child.isDirectory()) {
        if (!prunedDirNames.has(child.name)) visit(childPath);
        continue;
      }
      // Anything that is neither a directory nor a regular file
      // (symlinks, sockets, ...) is left out.
      if (!child.isFile()) continue;
      const relPath = toPosix(relative(projectRoot, childPath));
      if (relPath) collected.push(relPath);
    }
  };

  visit(projectRoot);
  return collected;
}
/**
 * Produce the list of candidate files under `projectRoot`.
 *
 * `git ls-files` is the preferred source. When it yields nothing usable
 * (signalled by `enumerateViaGit` returning null) a note is written to
 * stderr and the recursive walker is used instead.
 *
 * @param {string} projectRoot - Directory to enumerate.
 * @returns {string[]} Project-relative POSIX paths in unspecified order —
 *   the caller is responsible for sorting and filtering.
 */
function enumerateFiles(projectRoot) {
  const gitListing = enumerateViaGit(projectRoot);
  if (gitListing === null) {
    process.stderr.write(
      `scan-project: git ls-files unavailable — falling back to recursive walk\n`,
    );
    return enumerateViaWalk(projectRoot);
  }
  return gitListing;
}
// ---------------------------------------------------------------------------
// Filter accounting
//
// The project-scanner.md contract requires `filteredByIgnore` to count files
// dropped *specifically* by user `.understandignore` patterns (the delta
// beyond what the hardcoded defaults would have removed). We accomplish this
// by building TWO filters:
// - `defaultOnly`: defaults only, no user patterns
// - `combined`: defaults + user patterns (createIgnoreFilter)
// and counting paths that the combined filter excludes but the defaults-only
// filter would have kept.
//
// Negation (`!pattern`) is correctly handled by the combined filter — a file
// re-included via `!` won't be in the combined-excluded set, so it WON'T be
// counted in filteredByIgnore (it's "kept", not "additionally filtered").
// ---------------------------------------------------------------------------
/**
 * Create an ignore filter that carries only the built-in default patterns,
 * i.e. what createIgnoreFilter applies when no user `.understandignore`
 * content is present.
 *
 * Rather than re-implementing core's matcher wiring (and risking subtle
 * drift from it), core's own createIgnoreFilter is pointed at a uniquely
 * named path under the OS temp directory, which cannot contain user
 * patterns. The path is never created on disk.
 *
 * NOTE(review): this relies on createIgnoreFilter tolerating a nonexistent
 * project root — per the original author it only calls existsSync() before
 * reading; confirm against core.
 *
 * @returns {object} The filter returned by createIgnoreFilter.
 */
function buildDefaultsOnlyFilter() {
  const { tmpdir } = require('node:os');
  // pid + timestamp keeps the name unique, so no real directory (and no
  // stray .understandignore) can be sitting at this location.
  const placeholderName = `ua-scan-defaults-${process.pid}-${Date.now()}`;
  const placeholderRoot = join(tmpdir(), placeholderName);
  return createIgnoreFilter(placeholderRoot);
}
/**
 * Report whether the project ships any user-authored `.understandignore`.
 * Two locations are checked: the project root and the project's
 * `.understand-anything/` directory.
 *
 * When neither file exists the combined and defaults-only filters are
 * identical, so the caller can skip the dual-filter accounting entirely.
 *
 * @param {string} projectRoot - Project root directory.
 * @returns {boolean} True when at least one of the two files exists.
 */
function hasUserIgnoreFile(projectRoot) {
  const candidatePaths = [
    join(projectRoot, '.understandignore'),
    join(projectRoot, '.understand-anything', '.understandignore'),
  ];
  return candidatePaths.some((candidate) => existsSync(candidate));
}
// ---------------------------------------------------------------------------
// Line counting
// ---------------------------------------------------------------------------
/**
 * Count the newline bytes (0x0a) in a file. This is `wc -l` semantics: the
 * result is the number of `\n` characters, not the number of visible lines,
 * so a file without a trailing newline reports one fewer than it appears to
 * contain.
 *
 * If the file cannot be read, a `Warning:` line is written to stderr and
 * null is returned; the caller decides whether to drop the file.
 *
 * @param {string} absPath - Path used to read the file.
 * @param {string} posixPath - Path shown in the warning message.
 * @returns {number|null} Newline count, or null when the read failed.
 */
function countLines(absPath, posixPath) {
  let bytes;
  try {
    bytes = readFileSync(absPath);
  } catch (err) {
    process.stderr.write(
      `Warning: scan-project: ${posixPath} — line count failed ` +
        `(${err.message}) — file skipped from output\n`,
    );
    return null;
  }

  // Hop from one newline byte to the next on the raw buffer; no decoding and
  // no intermediate array of lines.
  let newlines = 0;
  let at = bytes.indexOf(0x0a);
  while (at !== -1) {
    newlines += 1;
    at = bytes.indexOf(0x0a, at + 1);
  }
  return newlines;
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
/**
 * CLI entry point: `node scan-project.mjs <projectRoot> <outputPath>`.
 *
 * Pipeline:
 *   1. Validate arguments (exits with code 1 on bad usage, a missing
 *      projectRoot, or a projectRoot that is not a directory).
 *   2. Enumerate candidate files (git ls-files, else recursive walk).
 *   3. Apply the ignore filter and count the drops attributable to the
 *      user's `.understandignore` (`filteredByIgnore`).
 *   4. For each kept file: stat, count lines, detect language and category.
 *      Files that fail stat or read are warned about on stderr and dropped.
 *   5. Sort by path, build per-category / per-language tallies and write
 *      the JSON result to `outputPath`.
 *
 * All logging goes to stderr.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the output file cannot be written or is missing
 *   after the write; the CLI wrapper reports it and exits non-zero.
 */
async function main() {
  const [, , projectRoot, outputPath] = process.argv;
  if (!projectRoot || !outputPath) {
    process.stderr.write(
      'Usage: node scan-project.mjs <projectRoot> <outputPath>\n',
    );
    process.exit(1);
  }
  if (!existsSync(projectRoot)) {
    process.stderr.write(
      `scan-project.mjs failed: projectRoot does not exist: ${projectRoot}\n`,
    );
    process.exit(1);
  }
  const projectRootStat = statSync(projectRoot);
  if (!projectRootStat.isDirectory()) {
    process.stderr.write(
      `scan-project.mjs failed: projectRoot is not a directory: ${projectRoot}\n`,
    );
    process.exit(1);
  }

  // 1. Enumerate. Either git ls-files or recursive walk.
  const candidates = enumerateFiles(projectRoot);

  // 2. Filter via createIgnoreFilter (defaults + user .understandignore).
  //    A defaults-only filter is built only when user ignore files exist;
  //    otherwise both filters are the same object and nothing is counted.
  const combined = createIgnoreFilter(projectRoot);
  const userIgnoresPresent = hasUserIgnoreFile(projectRoot);
  const defaultsOnly = userIgnoresPresent ? buildDefaultsOnlyFilter() : combined;
  let filteredByIgnore = 0;
  const kept = [];
  for (const rel of candidates) {
    if (!combined.isIgnored(rel)) {
      kept.push(rel);
      continue;
    }
    // Dropped by combined filter. If defaults-only would have ALSO dropped
    // it, this is a baseline default drop — not counted. If defaults-only
    // would have KEPT it, this drop is attributable to the user's
    // .understandignore content.
    if (userIgnoresPresent && !defaultsOnly.isIgnored(rel)) {
      filteredByIgnore++;
    }
  }

  // 3. Per-file: language + category + line count.
  //    Drop files that fail stat or line counting (per-file resilience).
  const fileEntries = [];
  for (const rel of kept) {
    const absPath = join(projectRoot, rel);
    // Stat first — git ls-files could include paths that vanished between
    // listing and processing; the walker shouldn't but defensive anyway.
    try {
      const st = statSync(absPath);
      if (!st.isFile()) {
        // Symlinks-to-dir, special files, etc. — skip silently.
        continue;
      }
    } catch (err) {
      process.stderr.write(
        `Warning: scan-project: ${rel} — stat failed (${err.message}) ` +
          `— file skipped from output\n`,
      );
      continue;
    }
    const sizeLines = countLines(absPath, rel);
    if (sizeLines === null) {
      // countLines already emitted the Warning: line.
      continue;
    }
    fileEntries.push({
      path: rel,
      language: detectLanguage(rel),
      sizeLines,
      fileCategory: detectCategory(rel),
    });
  }

  // 4. Determinism: sort by path.localeCompare.
  fileEntries.sort((a, b) => a.path.localeCompare(b.path));

  // 5. Stats. The tallies are keyed by arbitrary, file-derived ids (an
  //    unknown extension becomes the language id verbatim), so they must be
  //    prototype-less: on a plain `{}` an id such as `constructor` or
  //    `__proto__` would read an inherited member instead of `undefined`
  //    and corrupt the count. JSON.stringify serializes them the same way.
  const byCategory = Object.create(null);
  const byLanguage = Object.create(null);
  for (const { fileCategory, language } of fileEntries) {
    byCategory[fileCategory] = (byCategory[fileCategory] ?? 0) + 1;
    byLanguage[language] = (byLanguage[language] ?? 0) + 1;
  }

  const estimatedComplexity = estimateComplexity(fileEntries.length);
  const output = {
    scriptCompleted: true,
    files: fileEntries,
    totalFiles: fileEntries.length,
    filteredByIgnore,
    estimatedComplexity,
    stats: {
      filesScanned: fileEntries.length,
      byCategory,
      byLanguage,
    },
  };
  writeFileSync(outputPath, JSON.stringify(output, null, 2), 'utf-8');
  // Belt-and-braces check that the write actually produced a file.
  if (!existsSync(outputPath)) {
    throw new Error(`output file missing after write: ${outputPath}`);
  }
  process.stderr.write(
    `scan-project: filesScanned=${fileEntries.length} ` +
      `filteredByIgnore=${filteredByIgnore} ` +
      `complexity=${estimatedComplexity}\n`,
  );
}
// ---------------------------------------------------------------------------
// Run only when executed directly as a CLI; importing the module (e.g. from
// tests) must not trigger main().
//
// Canonicalize both sides through realpathSync. Node ESM resolves
// import.meta.url through symlinks but pathToFileURL(process.argv[1]) preserves
// them, so a raw equality check silently no-ops when the script is invoked via
// a symlinked plugin install path (the default in Claude Code / Copilot CLI
// caches). See GitHub issue #162.
// ---------------------------------------------------------------------------
/**
 * Decide whether this module is the script Node was asked to run, as
 * opposed to being imported by another module.
 *
 * Both the module's own path and `process.argv[1]` are resolved through
 * realpathSync before comparing, so a symlinked invocation path still
 * matches. Any failure to resolve either path is treated as "not the entry".
 *
 * @returns {boolean} True only when both real paths are identical.
 */
function isCliEntry() {
  const invokedScript = process.argv[1];
  if (!invokedScript) return false;
  try {
    const ownRealPath = realpathSync(fileURLToPath(import.meta.url));
    return ownRealPath === realpathSync(invokedScript);
  } catch {
    return false;
  }
}
// Run the scan only when this file is the CLI entry point. Any error thrown
// by main() is reported on stderr (message + stack) and turned into exit
// code 1, so callers can detect failure from the exit status.
if (isCliEntry()) {
try {
await main();
} catch (err) {
process.stderr.write(`scan-project.mjs failed: ${err.message}\n${err.stack}\n`);
process.exit(1);
}
}
// Default export of helpers for testability. The same three functions are
// also available as named exports.
export default {
detectLanguage,
detectCategory,
estimateComplexity,
};