Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
- .woodpecker.yaml: image paths -> library/autojanet-{agent,dispatcher}
- .woodpecker.yaml: secret names RS_HARBOR_USER / RS_HARBOR_PASS (global)
- container/Dockerfile: restore COPY skills/, skills/ populated from opencode config
- skills/: 84 opencode skills bundled into image
- k8s/manifests: update image refs to library/
802 lines
27 KiB
JavaScript
802 lines
27 KiB
JavaScript
#!/usr/bin/env node
/**
 * scan-project.mjs
 *
 * Deterministic file enumeration + language/category detection for the
 * project-scanner agent. Replaces the LLM-written prose scanner that used to
 * (a) author a per-run Node.js script (`tmp/ua-project-scan.js`), (b) walk the
 * file tree, and (c) classify each file via lookup tables in LLM context — a
 * pure rule-lookup pass that was being billed at LLM rates and adding many
 * minutes of per-run latency on mid-sized monorepos.
 *
 * What the LLM still owns (Step A of project-scanner.md Phase 1):
 *   - Reading README + top-level manifests to synthesize `name`,
 *     `rawDescription`, `readmeHead`, `frameworks`, and the high-level
 *     `languages` narrative.
 *
 * What this script owns:
 *   - File enumeration (git ls-files preferred, recursive walk fallback)
 *   - `.understandignore` filtering (delegated to core's createIgnoreFilter)
 *   - Per-file language detection (extension + filename table)
 *   - Per-file category assignment (priority-ordered rules from
 *     project-scanner.md Step 4)
 *   - Line counting
 *   - Complexity estimation (project-scanner.md Step 7 thresholds)
 *
 * Usage:
 *   node scan-project.mjs <projectRoot> <outputPath>
 *
 * Output JSON (subset of what project-scanner.md Phase 1 expects — the LLM
 * agent merges this with Step A's narrative fields and Step C's importMap to
 * produce the final scan-result.json):
 *   {
 *     "scriptCompleted": true,
 *     "files": [{ "path": "...", "language": "...", "sizeLines": N, "fileCategory": "..." }, ...],
 *     "totalFiles": N,
 *     "filteredByIgnore": M,
 *     "estimatedComplexity": "small" | "moderate" | "large" | "very-large",
 *     "stats": { "filesScanned": N, "byCategory": {...}, "byLanguage": {...} }
 *   }
 *
 * Logging: stderr only (stdout reserved for piped tooling).
 * Per-file resilience: read/stat failures emit
 *   `Warning: scan-project: <path> — <reason> — file skipped from output`
 * to stderr and the file is dropped; the rest of the scan completes.
 *
 * Determinism: files are sorted by `path.localeCompare` before emission, and
 * the underlying enumeration is deterministic (git ls-files returns a stable
 * order; the fallback walker sorts each directory's entries).
 */
|
|
|
|
import { spawnSync } from 'node:child_process';
import {
  existsSync,
  readFileSync,
  readdirSync,
  realpathSync,
  statSync,
  writeFileSync,
} from 'node:fs';
import { createRequire } from 'node:module';
import { tmpdir } from 'node:os';
import { dirname, resolve, join, basename, extname, relative, sep } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
|
|
|
|
// Directory containing this script (ES modules have no built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));
// skills/understand/ -> plugin root is two directories up.
const pluginRoot = resolve(__dirname, '../..');
// CommonJS-style resolver anchored at the plugin root's package.json; used
// below for require.resolve() of the core package.
const require = createRequire(resolve(pluginRoot, 'package.json'));
|
|
|
|
// ---------------------------------------------------------------------------
// Resolve @understand-anything/core
//
// Two-step resolution: try the workspace-linked package first, fall back to
// the installed plugin cache layout. pathToFileURL() is required on Windows
// because dynamic import() of raw "C:\..." paths throws
// ERR_UNSUPPORTED_ESM_URL_SCHEME (Node parses "C:" as a URL scheme).
//
// If BOTH attempts fail, the thrown error reports both failure reasons so the
// original package-resolution error is not silently lost.
// ---------------------------------------------------------------------------
let core;
try {
  core = await import(pathToFileURL(require.resolve('@understand-anything/core')).href);
} catch (primaryErr) {
  const fallbackPath = resolve(pluginRoot, 'packages/core/dist/index.js');
  try {
    core = await import(pathToFileURL(fallbackPath).href);
  } catch (fallbackErr) {
    const primaryReason = primaryErr instanceof Error ? primaryErr.message : String(primaryErr);
    const fallbackReason = fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr);
    throw new Error(
      `unable to load @understand-anything/core ` +
        `(package resolution failed: ${primaryReason}; ` +
        `fallback ${fallbackPath} failed: ${fallbackReason})`,
      { cause: fallbackErr },
    );
  }
}

const { createIgnoreFilter } = core;
|
|
|
|
// ---------------------------------------------------------------------------
// Language detection
//
// Mirrors the canonical extension list from
// understand-anything-plugin/packages/core/src/languages/configs/* and the
// project-scanner.md Step 3 table. Extensions are matched lowercase;
// filenames (Dockerfile, Makefile, etc.) are matched case-sensitively because
// projects in the wild use the canonical capitalizations.
//
// Where the core configs and project-scanner.md diverge (rare),
// project-scanner.md wins because it is the user-facing contract.
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Extension -> language id. Lowercase keys; lookup is `.ext.toLowerCase()`.
|
|
* Includes the legacy Step-3 mapping (.cfg/.ini/.env -> `config`) — note
|
|
* that `config` is a language id here, not a category. Category routing
|
|
* for these extensions is handled separately in CATEGORY_BY_EXT.
|
|
*/
|
|
const LANGUAGE_BY_EXT = Object.freeze({
|
|
// TypeScript / JavaScript
|
|
'.ts': 'typescript',
|
|
'.tsx': 'typescript',
|
|
'.js': 'javascript',
|
|
'.jsx': 'javascript',
|
|
'.mjs': 'javascript',
|
|
'.cjs': 'javascript',
|
|
// Python
|
|
'.py': 'python',
|
|
'.pyi': 'python',
|
|
// Go / Rust / Java / Kotlin / C# / Swift / Lua
|
|
'.go': 'go',
|
|
'.rs': 'rust',
|
|
'.java': 'java',
|
|
'.kt': 'kotlin',
|
|
'.kts': 'kotlin',
|
|
'.cs': 'csharp',
|
|
'.swift': 'swift',
|
|
'.lua': 'lua',
|
|
// Ruby / PHP
|
|
'.rb': 'ruby',
|
|
'.rake': 'ruby',
|
|
'.php': 'php',
|
|
// C / C++
|
|
'.c': 'c',
|
|
'.h': 'c',
|
|
'.cpp': 'cpp',
|
|
'.cc': 'cpp',
|
|
'.cxx': 'cpp',
|
|
'.hpp': 'cpp',
|
|
'.hxx': 'cpp',
|
|
// Vue / Svelte (no tree-sitter extractor, but project-scanner contract
|
|
// lists them as code languages — downstream import map will return [])
|
|
'.vue': 'vue',
|
|
'.svelte': 'svelte',
|
|
// Shell / Batch / PowerShell
|
|
'.sh': 'shell',
|
|
'.bash': 'shell',
|
|
'.zsh': 'shell',
|
|
'.ps1': 'powershell',
|
|
'.psm1': 'powershell',
|
|
'.psd1': 'powershell',
|
|
'.bat': 'batch',
|
|
'.cmd': 'batch',
|
|
// Markup / docs
|
|
'.html': 'html',
|
|
'.htm': 'html',
|
|
'.css': 'css',
|
|
'.scss': 'css',
|
|
'.sass': 'css',
|
|
'.less': 'css',
|
|
'.md': 'markdown',
|
|
'.mdx': 'markdown',
|
|
'.rst': 'markdown',
|
|
// Config / data
|
|
'.yaml': 'yaml',
|
|
'.yml': 'yaml',
|
|
'.json': 'json',
|
|
'.jsonc': 'jsonc',
|
|
'.toml': 'toml',
|
|
'.xml': 'xml',
|
|
'.xsl': 'xml',
|
|
'.xsd': 'xml',
|
|
'.plist': 'xml',
|
|
'.cfg': 'config',
|
|
'.ini': 'config',
|
|
'.env': 'config',
|
|
// Data / schema
|
|
'.sql': 'sql',
|
|
'.graphql': 'graphql',
|
|
'.gql': 'graphql',
|
|
'.proto': 'protobuf',
|
|
'.prisma': 'prisma',
|
|
'.csv': 'csv',
|
|
'.tsv': 'csv',
|
|
// Infra
|
|
'.tf': 'terraform',
|
|
'.tfvars': 'terraform',
|
|
// JVM build files (categorized via filename-or-extension)
|
|
'.gradle': 'gradle',
|
|
// .NET project files (mapped to extension-derived ids; downstream
|
|
// treats them as config — see CATEGORY_BY_EXT)
|
|
'.csproj': 'csproj',
|
|
'.sln': 'sln',
|
|
'.properties': 'properties',
|
|
'.mod': 'mod',
|
|
'.sum': 'sum',
|
|
});
|
|
|
|
/**
 * Filename (no extension) -> language id. Compared case-sensitively against
 * basename(path). Includes the most common no-extension conventions; anything
 * NOT in this table with no extension falls back to `unknown`.
 *
 * Dockerfile.* variants (Dockerfile.dev, Dockerfile.prod) are handled by a
 * startsWith check in `detectLanguage()` so we don't have to enumerate every
 * possible suffix.
 */
const LANGUAGE_BY_FILENAME = Object.freeze({
  Dockerfile: 'dockerfile',
  Makefile: 'makefile',
  GNUmakefile: 'makefile',
  makefile: 'makefile',
  Jenkinsfile: 'jenkinsfile',
  Procfile: 'procfile',
  Vagrantfile: 'vagrantfile',
});
|
|
|
|
/**
 * Detect the language id of a file from its path.
 *
 * Resolution order:
 *   1. `Dockerfile` and `Dockerfile.<suffix>` -> `dockerfile`.
 *   2. Leading dotfile segment (`.env`, `.env.local`, ...) looked up in
 *      LANGUAGE_BY_EXT.
 *   3. Lowercased extension looked up in LANGUAGE_BY_EXT; an unmapped
 *      extension is returned without its leading dot.
 *   4. Basename looked up (case-sensitively) in LANGUAGE_BY_FILENAME.
 *   5. `unknown`.
 *
 * Always returns a non-empty string. Downstream consumers rely on this field
 * always being a string (see project-scanner.md Step 3 "Fallback" note).
 *
 * @param {string} filePath - File path; only its basename and extension are inspected.
 * @returns {string} Language id.
 */
export function detectLanguage(filePath) {
  // Own-property lookup only. A plain `table[key]` also reads inherited
  // Object.prototype members, so a file literally named `constructor` or
  // `toString` would otherwise yield a function instead of a language id.
  const lookup = (table, key) =>
    (Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined);

  const base = basename(filePath);

  // Dockerfile.dev, Dockerfile.prod, etc. — common variant form.
  if (base === 'Dockerfile' || base.startsWith('Dockerfile.')) return 'dockerfile';

  // Dotfile names like .env, .env.local — path.extname returns '' for
  // single-segment dotfiles (e.g. '.env') and the SECOND segment for
  // compound dotfiles (e.g. '.local' for '.env.local'). Neither hits the
  // intended LANGUAGE_BY_EXT['.env'] mapping, so try the leading dotfile
  // portion first.
  const dotKey = dotfileKey(base);
  const byDotfile = dotKey ? lookup(LANGUAGE_BY_EXT, dotKey) : undefined;
  if (byDotfile) return byDotfile;

  // path.extname('name.') is '.', which carries no usable extension; require
  // at least one character after the dot so we never return ''.
  const ext = extname(filePath).toLowerCase();
  if (ext.length > 1) {
    // Unknown extension -> drop the leading dot (already lowercased).
    return lookup(LANGUAGE_BY_EXT, ext) ?? ext.slice(1);
  }

  // No usable extension — try the filename table, then give up.
  return lookup(LANGUAGE_BY_FILENAME, base) ?? 'unknown';
}
|
|
|
|
/**
 * Extract the canonical dotfile "extension" from a basename, or null.
 *
 *   `.env`          -> `.env`
 *   `.env.local`    -> `.env`
 *   `.bashrc`       -> `.bashrc`
 *   `package.json`  -> null (not a dotfile)
 *
 * The key is the leading dot plus the run of ASCII letters/digits that
 * follows it, lowercased. A basename that starts with a dot but has no
 * letter/digit immediately after it (e.g. `..foo`) yields null.
 *
 * Used by both detectLanguage and detectCategory so dotfile-style configs
 * (e.g., `.env`, `.env.local`, `.env.production`) get their leading
 * segment treated as the implicit extension instead of falling through
 * to `unknown` / `code`.
 *
 * @param {string} base - File basename (no directory part).
 * @returns {string|null} Lowercased dotfile key, or null.
 */
function dotfileKey(base) {
  if (!base.startsWith('.')) return null;
  const match = /^\.[a-z0-9]+/i.exec(base);
  return match === null ? null : match[0].toLowerCase();
}
|
|
|
|
// ---------------------------------------------------------------------------
// Category detection
//
// Implements the priority-ordered rules from project-scanner.md Step 4.
// Order matters: more specific rules must run before more general ones
// (e.g. `docker-compose.yml` is infra, not config).
//
// Categories: code | config | docs | infra | data | script | markup
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Extension -> category. Lowercase keys including the leading dot. Used only
 * after the higher-priority filename/path-based checks (LICENSE, infra) in
 * `detectCategory()`. Plain extension lookup is intentionally last-resort —
 * many configs need their full path inspected first.
 */
const CATEGORY_BY_EXT = Object.freeze({
  // docs
  '.md': 'docs',
  '.mdx': 'docs',
  '.rst': 'docs',
  '.txt': 'docs',
  '.text': 'docs',
  // config
  '.yaml': 'config',
  '.yml': 'config',
  '.json': 'config',
  '.jsonc': 'config',
  '.toml': 'config',
  '.xml': 'config',
  '.xsl': 'config',
  '.xsd': 'config',
  '.plist': 'config',
  '.cfg': 'config',
  '.ini': 'config',
  '.env': 'config',
  '.properties': 'config',
  '.csproj': 'config',
  '.sln': 'config',
  '.mod': 'config',
  '.sum': 'config',
  '.gradle': 'config',
  // infra
  '.tf': 'infra',
  '.tfvars': 'infra',
  // data
  '.sql': 'data',
  '.graphql': 'data',
  '.gql': 'data',
  '.proto': 'data',
  '.prisma': 'data',
  '.csv': 'data',
  '.tsv': 'data',
  // script
  '.sh': 'script',
  '.bash': 'script',
  '.zsh': 'script',
  '.ps1': 'script',
  '.psm1': 'script',
  '.psd1': 'script',
  '.bat': 'script',
  '.cmd': 'script',
  // markup
  '.html': 'markup',
  '.htm': 'markup',
  '.css': 'markup',
  '.scss': 'markup',
  '.sass': 'markup',
  '.less': 'markup',
});
|
|
|
|
/**
 * Filenames (no extension or full filename with extension) that always
 * map to `infra` regardless of their extension. Compared case-sensitively
 * against basename(path).
 */
const INFRA_FILENAMES = new Set([
  'Dockerfile',
  '.dockerignore',
  'Makefile',
  'GNUmakefile',
  'makefile',
  'Jenkinsfile',
  'Procfile',
  'Vagrantfile',
  '.gitlab-ci.yml',
]);
|
|
|
|
/**
 * Detect the project-scanner category for a file. Priority order matches
 * project-scanner.md Step 4 "Priority rule" — most specific wins.
 *
 *   1. LICENSE -> code (per the spec note "except LICENSE"). The Step-2
 *      exclusion table normally removes LICENSE, but if a project chooses to
 *      re-include it via `.understandignore` negation, it should NOT land in
 *      docs. We classify as `code` rather than inventing a new bucket.
 *   2. Filename-based infra (INFRA_FILENAMES, Dockerfile.*,
 *      docker-compose.*, compose.yml / compose.yaml).
 *   3. Path-based infra (.github/workflows/ and .circleci/ at the start of
 *      the path; a k8s/ or kubernetes/ segment anywhere; *.k8s.yml,
 *      *.k8s.yaml).
 *   4. Extension-based mapping (CATEGORY_BY_EXT).
 *   4.5 Leading dotfile segment (`.env`, `.env.local`, ...) looked up in
 *      CATEGORY_BY_EXT.
 *   5. Fallback: `code` (matches the spec — "All other extensions").
 *
 * @param {string} filePath - Project-relative path (native or POSIX separators).
 * @returns {string} One of the CATEGORY_BY_EXT values, `infra`, or `code`.
 */
export function detectCategory(filePath) {
  const base = basename(filePath);
  // Path rules below are written against forward slashes.
  const posix = filePath.split(sep).join('/');

  // Rule 1: LICENSE exception (project-scanner.md Step 4 table comment).
  if (base === 'LICENSE') return 'code';

  // Rule 2: infra by filename.
  const isInfraFilename =
    INFRA_FILENAMES.has(base)
    || base === 'Dockerfile'
    || base.startsWith('Dockerfile.')
    || base.startsWith('docker-compose.')
    || base === 'compose.yml'
    || base === 'compose.yaml';
  if (isInfraFilename) return 'infra';

  // Rule 3: infra by path.
  const isInfraPath =
    posix.startsWith('.github/workflows/')
    || posix.startsWith('.circleci/')
    // A `k8s/` or `kubernetes/` segment anywhere in the path.
    || /(^|\/)(k8s|kubernetes)\//.test(posix)
    // `*.k8s.yml` and `*.k8s.yaml` — Kubernetes-flavored YAML.
    || /\.k8s\.(ya?ml)$/i.test(base);
  if (isInfraPath) return 'infra';

  // Rule 4: extension-based lookup.
  const ext = extname(filePath).toLowerCase();
  const byExt = ext ? CATEGORY_BY_EXT[ext] : undefined;
  if (byExt) return byExt;

  // Rule 4.5: dotfile-style configs (.env, .env.local, .env.production).
  // path.extname returns '' or the trailing segment for these, so the
  // leading dotfile segment is tried as the key instead.
  const dotKey = dotfileKey(base);
  const byDot = dotKey ? CATEGORY_BY_EXT[dotKey] : undefined;
  if (byDot) return byDot;

  // Rule 5: anything not matched above is treated as `code`.
  return 'code';
}
|
|
|
|
// ---------------------------------------------------------------------------
// Complexity estimation (project-scanner.md Step 7)
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Map a total file count to a complexity tier. Each tier's upper bound is
 * inclusive:
 *   - small:      up to 30 (documented range 1-30)
 *   - moderate:   31-150
 *   - large:      151-500
 *   - very-large: >500
 *
 * Edge case: 0 files maps to `small` (the lowest tier) so the field is
 * always set even on empty repos. Downstream consumers treat 0 files as
 * a sentinel for "nothing to analyze" via `totalFiles`, not complexity.
 *
 * @param {number} totalFiles - Number of files in the scan output.
 * @returns {'small'|'moderate'|'large'|'very-large'} Complexity tier.
 */
export function estimateComplexity(totalFiles) {
  if (totalFiles <= 30) return 'small';
  if (totalFiles <= 150) return 'moderate';
  if (totalFiles <= 500) return 'large';
  return 'very-large';
}
|
|
|
|
// ---------------------------------------------------------------------------
// File enumeration
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Normalize a path to forward-slash POSIX form by replacing every occurrence
 * of the platform separator (`path.sep`) with `/`. The project-scanner
 * contract emits POSIX paths; we re-normalize so the output is stable across
 * Windows/macOS/Linux.
 *
 * @param {string} p - Path using the platform separator.
 * @returns {string} The same path with `/` separators.
 */
function toPosix(p) {
  return p.split(sep).join('/');
}
|
|
|
|
/**
 * Enumerate all files in `projectRoot` via `git ls-files`. Returns an
 * array of unique project-relative POSIX paths, or null if git could not
 * provide a listing (git is not installed, the directory is not a git
 * repository, git exited non-zero, or the listing was empty). Caller falls
 * back to the recursive walker on null.
 *
 * Why git ls-files first: `--exclude-standard` applies the repo's standard
 * ignore rules (e.g. `.gitignore`) and the listing is fast. The walker has
 * no .gitignore awareness, so the ignore filter has to do more work in the
 * fallback path.
 *
 * @param {string} projectRoot - Directory to run git in.
 * @returns {string[]|null} Paths in git's order (first occurrence kept), or null.
 */
function enumerateViaGit(projectRoot) {
  // -z = NUL-terminated output. Without it, `git ls-files` C-escapes non-ASCII
  // bytes in path names — paths containing emoji, accented characters, CJK
  // codepoints, etc. come back quoted with octal escapes (e.g.
  // `"30. \360\237\217\227 BD-CCER/file.md"` for a path containing 🏗️).
  // Those quoted-escaped strings then fail to round-trip back to real disk
  // paths in downstream consumers, so files in such directories are silently
  // dropped from the scan. The -z form emits raw bytes between NUL separators,
  // preserving every codepoint as-is.
  //
  // -c -o = cached (tracked) plus other (untracked) files.
  const result = spawnSync('git', ['ls-files', '-z', '-co', '--exclude-standard'], {
    cwd: projectRoot,
    encoding: 'utf-8',
    maxBuffer: 256 * 1024 * 1024, // 256MB — huge monorepos can produce >10MB of paths
  });
  // status is null when the process could not be spawned, so this also
  // covers "git not installed". An empty listing is treated as unavailable
  // too, which hands the decision to the walker.
  if (result.status !== 0 || !result.stdout) return null;

  // Each NUL-separated chunk is one project-relative path. toPosix() is a
  // safety net; git is expected to emit forward slashes already.
  const paths = result.stdout
    .split('\0')
    .filter(Boolean)
    .map(toPosix);

  // git lists an unmerged path once per index stage (see the ls-files
  // `--deduplicate` option), so collapse repeats to keep each file from being
  // counted more than once. Set preserves first-seen order.
  return [...new Set(paths)];
}
|
|
|
|
/**
 * Recursive directory walker — fallback when `git ls-files` is unavailable
 * (no git, not a repo, or git refused). Skips hard-coded "obviously bad"
 * directory names BEFORE the ignore filter runs so we don't waste cycles
 * descending into `node_modules/` etc. on huge trees.
 *
 * Collects project-relative POSIX paths depth-first, with each directory's
 * entries visited in `localeCompare` name order, so the listing does not
 * depend on OS-specific readdir order.
 *
 * A directory that cannot be read emits a `Warning:` line on stderr and its
 * subtree is skipped; the walk continues.
 *
 * @param {string} projectRoot - Directory to walk.
 * @returns {string[]} Project-relative POSIX file paths.
 */
function enumerateViaWalk(projectRoot) {
  // Hard skip — these directories are universally non-source and skipping
  // at the walker level avoids materializing thousands of node_modules
  // paths before the ignore filter drops them. The ignore filter still
  // runs on everything else.
  const HARD_SKIP_DIRS = new Set([
    'node_modules',
    '.git',
    '.svn',
    '.hg',
    '__pycache__',
  ]);

  const out = [];

  function walk(absDir) {
    let entries;
    try {
      entries = readdirSync(absDir, { withFileTypes: true });
    } catch (err) {
      process.stderr.write(
        `Warning: scan-project: ${toPosix(relative(projectRoot, absDir)) || '.'} ` +
          `— directory read failed (${err.message}) — subtree skipped\n`,
      );
      return;
    }

    // Sort by name with files and directories mixed together.
    entries.sort((a, b) => a.name.localeCompare(b.name));

    for (const ent of entries) {
      if (ent.isDirectory()) {
        if (!HARD_SKIP_DIRS.has(ent.name)) walk(join(absDir, ent.name));
        continue;
      }
      // Symlinks (and other non-regular entries) are intentionally ignored —
      // following symlinks is a classic recursion-bomb footgun.
      if (!ent.isFile()) continue;

      const rel = toPosix(relative(projectRoot, join(absDir, ent.name)));
      if (rel) out.push(rel);
    }
  }

  walk(projectRoot);
  return out;
}
|
|
|
|
/**
 * Enumerate all candidate files in `projectRoot`. Tries git ls-files first;
 * falls back to a recursive walk (after a notice on stderr) when
 * enumerateViaGit() returns null. Returns an array of project-relative POSIX
 * paths in unspecified order — caller is responsible for sorting + filtering.
 *
 * @param {string} projectRoot - Directory to enumerate.
 * @returns {string[]} Project-relative POSIX file paths.
 */
function enumerateFiles(projectRoot) {
  const fromGit = enumerateViaGit(projectRoot);
  if (fromGit !== null) return fromGit;

  process.stderr.write(
    `scan-project: git ls-files unavailable — falling back to recursive walk\n`,
  );
  return enumerateViaWalk(projectRoot);
}
|
|
|
|
// ---------------------------------------------------------------------------
// Filter accounting
//
// The project-scanner.md contract requires `filteredByIgnore` to count files
// dropped *specifically* by user `.understandignore` patterns (the delta
// beyond what the hardcoded defaults would have removed). We accomplish this
// by building TWO filters:
//   - `defaultsOnly`: defaults only, no user patterns
//   - `combined`:     defaults + user patterns (createIgnoreFilter)
// and counting paths that the combined filter excludes but the defaults-only
// filter would have kept.
//
// Negation (`!pattern`) is correctly handled by the combined filter — a file
// re-included via `!` won't be in the combined-excluded set, so it WON'T be
// counted in filteredByIgnore (it's "kept", not "additionally filtered").
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Build a defaults-only IgnoreFilter — same patterns as createIgnoreFilter
 * would apply, minus any user .understandignore content. We synthesize this
 * by pointing createIgnoreFilter at a uniquely named path under the OS temp
 * directory that is never created, so no .understandignore file can be found
 * there and the core function still drives the matcher. (Re-implementing the
 * ignore-package wiring here would risk subtle behavior drift from core's
 * matcher.)
 *
 * NOTE(review): this relies on createIgnoreFilter tolerating a project root
 * that does not exist on disk (i.e. it only probes with existsSync() before
 * reading) — confirm against core if that implementation changes.
 *
 * @returns {object} The filter returned by createIgnoreFilter for the fake root.
 */
function buildDefaultsOnlyFilter() {
  // pid + timestamp keep the name from colliding with a real directory.
  const fakeProjectRoot = join(
    tmpdir(),
    `ua-scan-defaults-${process.pid}-${Date.now()}`,
  );
  return createIgnoreFilter(fakeProjectRoot);
}
|
|
|
|
/**
 * Determine whether `projectRoot` has any user .understandignore file, at
 * either `<root>/.understandignore` or
 * `<root>/.understand-anything/.understandignore`.
 * When neither file exists, the combined and defaults-only filters are
 * identical, so the caller can skip the dual-filter accounting entirely.
 *
 * @param {string} projectRoot - Project root directory.
 * @returns {boolean} True if at least one of the two files exists.
 */
function hasUserIgnoreFile(projectRoot) {
  const candidates = [
    join(projectRoot, '.understandignore'),
    join(projectRoot, '.understand-anything', '.understandignore'),
  ];
  return candidates.some((candidate) => existsSync(candidate));
}
|
|
|
|
// ---------------------------------------------------------------------------
// Line counting
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Count newline-delimited lines in a file. Returns the number of `\n`
 * bytes; this matches `wc -l` semantics (which counts newlines, not
 * "lines of content"). Files without a trailing newline therefore report
 * one fewer than the visible line count — same behavior as wc.
 *
 * Per-file failure: emits a `Warning:` line on stderr and returns null.
 * Caller decides whether to drop the file or keep it with sizeLines=0.
 *
 * @param {string} absPath - Path passed to readFileSync.
 * @param {string} posixPath - Path shown in the warning message.
 * @returns {number|null} Newline count, or null if the file could not be read.
 */
function countLines(absPath, posixPath) {
  try {
    const buf = readFileSync(absPath);
    // Count the `\n` byte (0x0a) directly on the raw buffer — no decoding and
    // no intermediate array as split('\n') would allocate.
    let count = 0;
    for (let pos = buf.indexOf(0x0a); pos !== -1; pos = buf.indexOf(0x0a, pos + 1)) {
      count++;
    }
    return count;
  } catch (err) {
    process.stderr.write(
      `Warning: scan-project: ${posixPath} — line count failed ` +
        `(${err.message}) — file skipped from output\n`,
    );
    return null;
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * CLI entry point: scan `<projectRoot>` and write the result JSON to
 * `<outputPath>` (both read from process.argv).
 *
 * Steps: enumerate candidate files, drop ignored ones (counting the drops
 * attributable to user .understandignore patterns), classify and line-count
 * each remaining regular file, sort by path, aggregate stats, write the
 * JSON, and print a one-line summary to stderr.
 *
 * Exits the process with code 1 on bad arguments or an invalid projectRoot.
 * Per-file stat/read failures are logged as warnings and the file is
 * skipped; they do not abort the scan.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the output file is missing after the write (other fs
 *   errors from the write itself propagate unchanged).
 */
async function main() {
  const [, , projectRoot, outputPath] = process.argv;
  if (!projectRoot || !outputPath) {
    process.stderr.write(
      'Usage: node scan-project.mjs <projectRoot> <outputPath>\n',
    );
    process.exit(1);
  }

  if (!existsSync(projectRoot)) {
    process.stderr.write(
      `scan-project.mjs failed: projectRoot does not exist: ${projectRoot}\n`,
    );
    process.exit(1);
  }
  const projectRootStat = statSync(projectRoot);
  if (!projectRootStat.isDirectory()) {
    process.stderr.write(
      `scan-project.mjs failed: projectRoot is not a directory: ${projectRoot}\n`,
    );
    process.exit(1);
  }

  // 1. Enumerate. Either git ls-files or recursive walk.
  const candidates = enumerateFiles(projectRoot);

  // 2. Filter via createIgnoreFilter (defaults + user .understandignore).
  //    A defaults-only filter is built alongside to count user-driven drops;
  //    when no user ignore file exists the two filters are the same object.
  const combined = createIgnoreFilter(projectRoot);
  const userIgnoresPresent = hasUserIgnoreFile(projectRoot);
  const defaultsOnly = userIgnoresPresent ? buildDefaultsOnlyFilter() : combined;

  let filteredByIgnore = 0;
  const kept = [];
  for (const rel of candidates) {
    if (!combined.isIgnored(rel)) {
      kept.push(rel);
      continue;
    }
    // Dropped by combined filter. If defaults-only would have ALSO dropped
    // it, this is a baseline default drop — not counted. If defaults-only
    // would have KEPT it, this drop is attributable to the user's
    // .understandignore content.
    if (userIgnoresPresent && !defaultsOnly.isIgnored(rel)) {
      filteredByIgnore++;
    }
  }

  // 3. Per-file: language + category + line count.
  //    Drop files that fail stat or line counting (per-file resilience).
  const fileEntries = [];
  for (const rel of kept) {
    const absPath = join(projectRoot, rel);
    // Stat first — git ls-files could include paths that vanished between
    // listing and processing; the walker shouldn't but defensive anyway.
    try {
      if (!statSync(absPath).isFile()) {
        // Symlinks-to-dir, special files, etc. — skip silently.
        continue;
      }
    } catch (err) {
      process.stderr.write(
        `Warning: scan-project: ${rel} — stat failed (${err.message}) ` +
          `— file skipped from output\n`,
      );
      continue;
    }

    const sizeLines = countLines(absPath, rel);
    if (sizeLines === null) {
      // countLines already emitted the Warning: line.
      continue;
    }

    fileEntries.push({
      path: rel,
      language: detectLanguage(rel),
      sizeLines,
      fileCategory: detectCategory(rel),
    });
  }

  // 4. Determinism: sort by path.localeCompare.
  fileEntries.sort((a, b) => a.path.localeCompare(b.path));

  // 5. Stats. The counters are null-prototype objects because language ids
  //    are derived from arbitrary file extensions: on a plain object an id
  //    such as `constructor` would read the inherited Object.prototype member
  //    (corrupting the count) and `__proto__` would be silently discarded.
  const byCategory = Object.create(null);
  const byLanguage = Object.create(null);
  for (const { fileCategory, language } of fileEntries) {
    byCategory[fileCategory] = (byCategory[fileCategory] ?? 0) + 1;
    byLanguage[language] = (byLanguage[language] ?? 0) + 1;
  }

  const estimatedComplexity = estimateComplexity(fileEntries.length);

  const output = {
    scriptCompleted: true,
    files: fileEntries,
    totalFiles: fileEntries.length,
    filteredByIgnore,
    estimatedComplexity,
    stats: {
      filesScanned: fileEntries.length,
      byCategory,
      byLanguage,
    },
  };

  writeFileSync(outputPath, JSON.stringify(output, null, 2), 'utf-8');

  // Post-write sanity check so a silently missing output is reported.
  if (!existsSync(outputPath)) {
    throw new Error(`output file missing after write: ${outputPath}`);
  }

  process.stderr.write(
    `scan-project: filesScanned=${fileEntries.length} ` +
      `filteredByIgnore=${filteredByIgnore} ` +
      `complexity=${estimatedComplexity}\n`,
  );
}
|
|
|
|
// ---------------------------------------------------------------------------
// Run only when executed directly as a CLI; importing the module (e.g. from
// tests) must not trigger main().
//
// Canonicalize both sides through realpathSync. Node ESM resolves
// import.meta.url through symlinks but process.argv[1] preserves them, so a
// raw equality check silently no-ops when the script is invoked via a
// symlinked plugin install path (the default in Claude Code / Copilot CLI
// caches). See GitHub issue #162.
// ---------------------------------------------------------------------------

/**
 * Report whether this module is the script Node was launched with.
 *
 * @returns {boolean} True when the real path of this module equals the real
 *   path of process.argv[1]; false when argv[1] is empty/absent or either
 *   path cannot be resolved.
 */
function isCliEntry() {
  const entry = process.argv[1];
  if (!entry) return false;
  try {
    const modulePath = realpathSync(fileURLToPath(import.meta.url));
    return modulePath === realpathSync(entry);
  } catch {
    // realpathSync throws for paths that do not exist — treat as "not the
    // CLI entry" rather than failing module evaluation.
    return false;
  }
}
|
|
|
|
// CLI entry: run main() and turn any failure into a stderr report plus exit
// code 1. For Error instances the output is the message followed by the
// stack; anything else that was thrown is stringified.
if (isCliEntry()) {
  try {
    await main();
  } catch (err) {
    const isError = err instanceof Error;
    const message = isError ? err.message : String(err);
    const stack = isError ? err.stack : undefined;
    process.stderr.write(
      `scan-project.mjs failed: ${message}\n${stack === undefined ? '' : `${stack}\n`}`,
    );
    process.exit(1);
  }
}
|
|
|
|
// Default export of helpers for testability (the same three functions are
// also available as named exports).
export default {
  detectLanguage,
  detectCategory,
  estimateComplexity,
};
|