autojanet/skills/understand/extract-structure.mjs
Zoë cc74ad0bd0
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
fix: use library/ Harbor project, add skills, fix pipeline secrets
- .woodpecker.yaml: image paths -> library/autojanet-{agent,dispatcher}
- .woodpecker.yaml: secret names RS_HARBOR_USER / RS_HARBOR_PASS (global)
- container/Dockerfile: restore COPY skills/, skills/ populated from opencode config
- skills/: 84 opencode skills bundled into image
- k8s/manifests: update image refs to library/
2026-05-30 15:43:14 -07:00

334 lines
11 KiB
JavaScript

#!/usr/bin/env node
/**
* extract-structure.mjs
*
* Deterministic structural extraction script for the file-analyzer agent.
* Uses PluginRegistry (TreeSitterPlugin + non-code parsers) from @understand-anything/core
* to replace the LLM-generated throwaway regex scripts in Phase 1.
*
* Usage:
* node extract-structure.mjs <input.json> <output.json>
*
* Input JSON:
* { projectRoot, batchFiles: [{path, language, sizeLines, fileCategory}], batchImportData }
*
* Output JSON:
* { scriptCompleted, filesAnalyzed, filesSkipped, results: [...] }
*/
import { createRequire } from 'node:module';
import { dirname, resolve, join } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
import { existsSync, readFileSync, realpathSync, writeFileSync } from 'node:fs';
const __dirname = dirname(fileURLToPath(import.meta.url));
// skills/understand/ -> plugin root is two dirs up
const pluginRoot = resolve(__dirname, '../..');
const require = createRequire(resolve(pluginRoot, 'package.json'));
// ---------------------------------------------------------------------------
// Resolve @understand-anything/core
//
// Node ESM dynamic import() requires a file:// URL on Windows; passing a raw
// absolute path like "C:\..." throws ERR_UNSUPPORTED_ESM_URL_SCHEME because the
// loader parses "C:" as a URL scheme. Wrap both resolutions in pathToFileURL().
// ---------------------------------------------------------------------------
let core;
try {
core = await import(pathToFileURL(require.resolve('@understand-anything/core')).href);
} catch {
// Fallback: direct path for installed plugin cache layouts
core = await import(pathToFileURL(resolve(pluginRoot, 'packages/core/dist/index.js')).href);
}
const { TreeSitterPlugin, PluginRegistry, builtinLanguageConfigs, registerAllParsers } = core;
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
async function main() {
const [,, inputPath, outputPath] = process.argv;
if (!inputPath || !outputPath) {
process.stderr.write('Usage: node extract-structure.mjs <input.json> <output.json>\n');
process.exit(1);
}
// Read input
const inputRaw = readFileSync(inputPath, 'utf-8');
const input = JSON.parse(inputRaw);
const { projectRoot, batchFiles, batchImportData } = input;
if (!projectRoot || !Array.isArray(batchFiles)) {
throw new Error('Invalid input: must contain projectRoot and batchFiles array');
}
// Create tree-sitter plugin with all configs that have WASM grammars
const tsConfigs = builtinLanguageConfigs.filter(c => c.treeSitter);
const tsPlugin = new TreeSitterPlugin(tsConfigs);
await tsPlugin.init();
// Create registry and register tree-sitter + all non-code parsers
const registry = new PluginRegistry();
registry.register(tsPlugin);
registerAllParsers(registry);
const results = [];
const filesSkipped = [];
for (const file of batchFiles) {
const absolutePath = join(projectRoot, file.path);
// Read file content
let content;
try {
content = readFileSync(absolutePath, 'utf-8');
} catch {
filesSkipped.push(file.path);
continue;
}
// Line counts. POSIX text files end in a trailing newline, which makes
// `split('\n')` produce one extra empty element. Match `wc -l` semantics
// (used by the project scanner for `sizeLines`) so the two counts agree.
const lines = content.split('\n');
const totalLines = content.endsWith('\n') ? Math.max(0, lines.length - 1) : lines.length;
const nonEmptyLines = lines.filter(l => l.trim().length > 0).length;
// Structural analysis via registry
let analysis = null;
try {
analysis = registry.analyzeFile(file.path, content);
} catch {
// If analysis throws, treat as degraded — still include basic metrics
}
// Call graph extraction (code files only)
let callGraph = null;
if (file.fileCategory === 'code' || file.fileCategory === 'script') {
try {
const cg = registry.extractCallGraph(file.path, content);
if (cg && cg.length > 0) {
callGraph = cg.map(entry => ({
caller: entry.caller,
callee: entry.callee,
lineNumber: entry.lineNumber,
}));
}
} catch {
// Call graph extraction failed — non-fatal
}
}
// Build result object
const result = buildResult(file, totalLines, nonEmptyLines, analysis, callGraph, batchImportData);
results.push(result);
}
// Write output
const output = {
scriptCompleted: true,
filesAnalyzed: results.length,
filesSkipped,
results,
};
writeFileSync(outputPath, JSON.stringify(output, null, 2), 'utf-8');
if (!existsSync(outputPath)) {
throw new Error(`output file missing after write: ${outputPath}`);
}
}
// ---------------------------------------------------------------------------
// Result builder: maps StructuralAnalysis to the expected output schema.
// Exported for unit tests; pure function, no I/O.
// ---------------------------------------------------------------------------
export function buildResult(file, totalLines, nonEmptyLines, analysis, callGraph, batchImportData) {
const base = {
path: file.path,
language: file.language,
fileCategory: file.fileCategory,
totalLines,
nonEmptyLines,
};
if (!analysis) {
// No parser matched — return basic metrics only
base.metrics = {};
return base;
}
// Functions (code files)
if (analysis.functions && analysis.functions.length > 0) {
base.functions = analysis.functions.map(fn => ({
name: fn.name,
startLine: fn.lineRange[0],
endLine: fn.lineRange[1],
params: fn.params || [],
}));
}
// Classes (code files)
if (analysis.classes && analysis.classes.length > 0) {
base.classes = analysis.classes.map(cls => ({
name: cls.name,
startLine: cls.lineRange[0],
endLine: cls.lineRange[1],
methods: cls.methods || [],
properties: cls.properties || [],
}));
}
// Exports (code files)
if (analysis.exports && analysis.exports.length > 0) {
base.exports = analysis.exports.map(exp => ({
name: exp.name,
line: exp.lineNumber,
isDefault: exp.isDefault === true,
}));
}
// Non-code structural data: pass through directly
if (analysis.sections && analysis.sections.length > 0) {
base.sections = analysis.sections.map(s => ({
heading: s.name,
level: s.level,
line: s.lineRange[0],
}));
}
if (analysis.definitions && analysis.definitions.length > 0) {
base.definitions = analysis.definitions.map(d => ({
name: d.name,
kind: d.kind,
fields: d.fields || [],
startLine: d.lineRange[0],
endLine: d.lineRange[1],
}));
}
if (analysis.services && analysis.services.length > 0) {
base.services = analysis.services.map(s => ({
name: s.name,
image: s.image,
ports: s.ports || [],
...(s.lineRange ? { startLine: s.lineRange[0], endLine: s.lineRange[1] } : {}),
}));
}
if (analysis.endpoints && analysis.endpoints.length > 0) {
base.endpoints = analysis.endpoints.map(e => ({
method: e.method,
path: e.path,
startLine: e.lineRange[0],
endLine: e.lineRange[1],
}));
}
if (analysis.steps && analysis.steps.length > 0) {
base.steps = analysis.steps.map(s => ({
name: s.name,
startLine: s.lineRange[0],
endLine: s.lineRange[1],
}));
}
if (analysis.resources && analysis.resources.length > 0) {
base.resources = analysis.resources.map(r => ({
name: r.name,
kind: r.kind,
startLine: r.lineRange[0],
endLine: r.lineRange[1],
}));
}
// Call graph
if (callGraph && callGraph.length > 0) {
base.callGraph = callGraph;
}
// Metrics
const metrics = {};
// Import count from batchImportData (pre-resolved by project scanner).
// Empty arrays are truthy, so explicitly check length so we fall back to the
// parser's own import list when the scanner could not resolve any imports
// (e.g. Python absolute imports the scanner doesn't follow).
//
// The fallback counts only relative-style imports (those starting with `.`)
// so the metric stays *internal-import* in semantics rather than mixing in
// every external package import seen by the parser. Resolved external imports
// can never produce edges anyway, so counting them would be misleading.
const importPaths = batchImportData?.[file.path];
if (importPaths && importPaths.length > 0) {
metrics.importCount = importPaths.length;
} else if (analysis.imports) {
const internal = analysis.imports.filter(imp => {
const src = imp?.source ?? '';
return src.startsWith('.');
});
metrics.importCount = internal.length;
}
if (analysis.exports) {
metrics.exportCount = analysis.exports.length;
}
if (analysis.functions) {
metrics.functionCount = analysis.functions.length;
}
if (analysis.classes) {
metrics.classCount = analysis.classes.length;
}
if (analysis.sections) {
metrics.sectionCount = analysis.sections.length;
}
if (analysis.definitions) {
metrics.definitionCount = analysis.definitions.length;
}
if (analysis.services) {
metrics.serviceCount = analysis.services.length;
}
if (analysis.endpoints) {
metrics.endpointCount = analysis.endpoints.length;
}
if (analysis.steps) {
metrics.stepCount = analysis.steps.length;
}
if (analysis.resources) {
metrics.resourceCount = analysis.resources.length;
}
base.metrics = metrics;
return base;
}
// ---------------------------------------------------------------------------
// Run only when executed directly as a CLI; importing the module (e.g. from
// tests) must not trigger main().
//
// Canonicalize both sides through realpathSync. Node ESM resolves
// import.meta.url through symlinks but pathToFileURL(process.argv[1]) preserves
// them, so a raw equality check silently no-ops when the script is invoked via
// a symlinked plugin install path (the default in Claude Code / Copilot CLI
// caches). See GitHub issue #162.
// ---------------------------------------------------------------------------
function isCliEntry() {
if (!process.argv[1]) return false;
try {
const modulePath = realpathSync(fileURLToPath(import.meta.url));
const argvPath = realpathSync(process.argv[1]);
return modulePath === argvPath;
} catch {
return false;
}
}
if (isCliEntry()) {
try {
await main();
} catch (err) {
process.stderr.write(`extract-structure.mjs failed: ${err.message}\n${err.stack}\n`);
process.exit(1);
}
}