autojanet/skills/understand/merge-batch-graphs.py
Zoë cc74ad0bd0
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
fix: use library/ Harbor project, add skills, fix pipeline secrets
- .woodpecker.yaml: image paths -> library/autojanet-{agent,dispatcher}
- .woodpecker.yaml: secret names RS_HARBOR_USER / RS_HARBOR_PASS (global)
- container/Dockerfile: restore COPY skills/, skills/ populated from opencode config
- skills/: 84 opencode skills bundled into image
- k8s/manifests: update image refs to library/
2026-05-30 15:43:14 -07:00

1164 lines
49 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
merge-batch-graphs.py — Merge and normalize batch analysis results.
Combines batch-*.json files from the intermediate directory into a single
assembled graph with normalized IDs, complexity values, and cleaned edges.
Called at the end of Phase 2 of /understand. Phase 3 (ASSEMBLE REVIEW)
then reviews the output for semantic issues the script cannot catch.
Usage:
python merge-batch-graphs.py <project-root>
Input:
<project-root>/.understand-anything/intermediate/batch-*.json
Output:
<project-root>/.understand-anything/intermediate/assembled-graph.json
"""
import json
import math
import os
import re
import sys
from collections import Counter
from pathlib import Path
from typing import Any
# ── Configuration ─────────────────────────────────────────────────────────
# Prefixes recognized at the start of a node ID ("<prefix>:<rest>"). Used by
# the ID normalizer to detect doubled, project-qualified, or missing prefixes.
VALID_NODE_PREFIXES = {
    "file", "func", "function", "class", "module", "concept",
    "config", "document", "service", "table", "endpoint",
    "pipeline", "schema", "resource",
    "domain", "flow", "step",
    # Knowledge-base node types (schema.ts NodeType enum)
    "article", "entity", "topic", "claim", "source",
}
# node.type → canonical ID prefix
# ("func" is a legacy alias and canonicalizes to "function".)
TYPE_TO_PREFIX: dict[str, str] = {
    "file": "file",
    "function": "function",
    "func": "function",
    "class": "class",
    "module": "module",
    "concept": "concept",
    "config": "config",
    "document": "document",
    "service": "service",
    "table": "table",
    "endpoint": "endpoint",
    "pipeline": "pipeline",
    "schema": "schema",
    "resource": "resource",
    "domain": "domain",
    "flow": "flow",
    "step": "step",
    # Knowledge-base node types
    "article": "article",
    "entity": "entity",
    "topic": "topic",
    "claim": "claim",
    "source": "source",
}
# Known aliases for complexity values → canonical value. Keys are matched
# after the input has been stripped and lower-cased.
COMPLEXITY_MAP: dict[str, str] = {
    "low": "simple",
    "easy": "simple",
    "medium": "moderate",
    "intermediate": "moderate",
    "high": "complex",
    "hard": "complex",
    "difficult": "complex",
}
# The only complexity values written to the assembled graph.
VALID_COMPLEXITY = {"simple", "moderate", "complex"}
# ── tested_by linker configuration ────────────────────────────────────────
# JS/TS family: a `.test.ts` file may be testing a `.ts`, `.tsx`, `.js`, etc.
# We try each candidate extension in priority order.
_JS_TS_EXTS: tuple[str, ...] = (".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs", ".vue")
# Same extensions as a frozenset, for membership tests on a file's extension.
_JS_TS_TEST_EXTS: frozenset[str] = frozenset(_JS_TS_EXTS)
# Mirrored production roots — when a test sits under `tests/`, it might be
# mirroring `src/`, `app/`, `lib/`, or the project root.
# (The empty string stands for the project root.)
_MIRROR_PRODUCTION_ROOTS: tuple[str, ...] = ("src", "app", "lib", "")
# Per-extension test-name patterns: ext → (prefix_patterns, suffix_patterns).
# A basename qualifies as a test if its stem starts with any prefix or ends
# with any suffix listed for its extension. JS/TS family is handled separately
# because its `.test`/`.spec` infix sits on the *stem* of a double-extension
# basename (e.g. `foo.test.ts` has ext `.ts`, stem `foo.test`).
_TEST_NAME_PATTERNS: dict[str, tuple[tuple[str, ...], tuple[str, ...]]] = {
    ".go": ((), ("_test",)),
    ".py": (("test_",), ("_test",)),
    ".java": ((), ("Test", "Tests", "IT")),
    ".kt": ((), ("Test", "Tests")),
    ".cs": ((), ("Test", "Tests")),
    ".c": (("test_",), ("_test",)),
    ".cpp": (("test_",), ("_test",)),
    ".cc": (("test_",), ("_test",)),
}
# Mirrors packages/core/src/schema.ts so the dashboard validator has nothing
# left to auto-correct for the `direction` field on merged graphs.
_DIRECTION_ALIASES: dict[str, str] = {"both": "bidirectional", "mutual": "bidirectional"}
_VALID_DIRECTIONS: frozenset[str] = frozenset({"forward", "backward", "bidirectional"})


def normalize_direction(value: Any) -> str:
    """Return the canonical schema value for an edge `direction`.

    String input is lower-cased and known aliases are resolved. Anything
    that is not a string, or does not end up as one of the valid
    directions, falls back to "forward".
    """
    if not isinstance(value, str):
        return "forward"
    lowered = value.lower()
    resolved = _DIRECTION_ALIASES.get(lowered, lowered)
    return resolved if resolved in _VALID_DIRECTIONS else "forward"
def _num(v: Any) -> float:
"""Coerce a value to float for safe comparison (handles string weights)."""
try:
return float(v)
except (TypeError, ValueError):
return 0.0
# ── Batch loading ─────────────────────────────────────────────────────────
def load_batch(path: Path) -> dict[str, Any] | None:
"""Load a batch JSON file, tolerating malformed files."""
try:
data = json.loads(path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError) as e:
print(f" Warning: skipping {path.name}: {e}", file=sys.stderr)
return None
if not isinstance(data.get("nodes"), list):
print(f" Warning: skipping {path.name}: missing or invalid 'nodes' array", file=sys.stderr)
return None
if not isinstance(data.get("edges"), list):
print(f" Warning: skipping {path.name}: missing or invalid 'edges' array", file=sys.stderr)
return None
return data
# ── ID normalization ──────────────────────────────────────────────────────
def classify_id_fix(original: str, corrected: str) -> str:
    """Return a human-readable pattern label for an ID correction.

    The label groups identical kinds of fix so the report can count them.
    `original` is the ID as emitted in the batch; `corrected` is the
    normalized ID. Checks run in order and the first match wins.
    """
    # Double prefix: "file:file:..." → "file:..."
    for prefix in VALID_NODE_PREFIXES:
        if original.startswith(f"{prefix}:{prefix}:"):
            return f"{prefix}:{prefix}: → {prefix}: (double prefix)"
    # Project-name prefix: "my-project:file:..." → "file:..."
    parts = original.split(":")
    if len(parts) >= 3 and parts[0] not in VALID_NODE_PREFIXES and parts[1] in VALID_NODE_PREFIXES:
        return f"<project>:{parts[1]}: → {parts[1]}: (project-name prefix)"
    # Legacy func: → function:
    if original.startswith("func:") and corrected.startswith("function:"):
        return "func: → function: (prefix canonicalization)"
    # Bare path → prefixed
    if not any(original.startswith(f"{p}:") for p in VALID_NODE_PREFIXES):
        prefix = corrected.split(":")[0]
        return f"bare path → {prefix}: (missing prefix)"
    # Unclassified fix: show both IDs, separated so the label stays readable.
    return f"{original} → {corrected}"
def normalize_node_id(node_id: str, node: dict[str, Any]) -> str:
    """Return the canonical form of `node_id`.

    Applied in order: collapse one doubled prefix, drop a leading
    project-name segment, rewrite the legacy `func:` prefix to
    `function:`, and finally add a prefix when none is present (derived
    from `node["type"]`, defaulting to `file`).
    """
    result = node_id

    # "file:file:src/foo.ts" → "file:src/foo.ts"
    doubled = next(
        (p for p in VALID_NODE_PREFIXES if result.startswith(f"{p}:{p}:")),
        None,
    )
    if doubled is not None:
        result = result[len(doubled) + 1:]

    # "my-project:file:src/foo.ts" → "file:src/foo.ts"
    # Shape: <word>:<valid-prefix>:<path>, where <word> is not itself a
    # valid prefix.
    project_match = re.match(
        r"^[^:]+:(" + "|".join(re.escape(p) for p in VALID_NODE_PREFIXES) + r"):(.+)$",
        result,
    )
    if project_match and result.split(":")[0] not in VALID_NODE_PREFIXES:
        result = f"{project_match.group(1)}:{project_match.group(2)}"

    # Legacy prefix: "func:" → "function:"
    if result.startswith("func:"):
        result = "function:" + result[len("func:"):]

    # Already prefixed — nothing left to do.
    if any(result.startswith(f"{p}:") for p in VALID_NODE_PREFIXES):
        return result

    # Bare ID: build a prefix from the node's type.
    node_type = node.get("type", "file")
    prefix = TYPE_TO_PREFIX.get(node_type, "file")
    if node_type not in ("function", "class"):
        return f"{prefix}:{result}"

    name = node.get("name", result)
    file_path = node.get("filePath", "")
    if file_path:
        return f"{prefix}:{file_path}:{name}"
    # No filePath: `function:<name>` would collide with every same-named
    # function in the project. The placeholder segment keeps the collision
    # visible in the report rather than silently merging unrelated nodes.
    return f"{prefix}:__nofilepath__:{name}"
def normalize_complexity(value: Any) -> tuple[str, str]:
    """Normalize a complexity value. Returns (normalized, status).

    status is one of:
        "valid"   — already a valid value, no change needed
        "mapped"  — known alias, confidently mapped (goes to Fixed report)
        "unknown" — unrecognized value, defaulted to moderate (goes to
                    Could-not-fix report)

    Strings are stripped and lower-cased before lookup. Numbers are
    truncated to an int and bucketed: <= 3 simple, <= 6 moderate, else
    complex. Booleans, NaN and infinities are reported as "unknown".
    """
    if isinstance(value, str):
        lower = value.strip().lower()
        if lower in VALID_COMPLEXITY:
            return lower, "valid"
        if lower in COMPLEXITY_MAP:
            return COMPLEXITY_MAP[lower], "mapped"
        # Unknown string — default but flag it
        return "moderate", "unknown"
    # bool is a subclass of int; a JSON true/false is not a complexity score.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        try:
            n = int(value)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError, int(±inf) raises OverflowError;
            # json.loads accepts both NaN and Infinity by default.
            return "moderate", "unknown"
        if n <= 3:
            return "simple", "mapped"
        if n <= 6:
            return "moderate", "mapped"
        return "complex", "mapped"
    # None or other type — default but flag it
    return "moderate", "unknown"
# ── Deterministic tested_by linker ────────────────────────────────────────
#
# Two-pass linker. Both passes produce canonical `production → test` edges.
#
# Pass 1 — preserve LLM semantics, fix direction.
# The LLM sees the relationship only when analyzing a *test* file
# (production files don't import their tests), so its emitted direction
# is systematically wrong: source = the file it was analyzing = a test.
# We do NOT strip these edges — the *pairing* is real evidence (the LLM
# saw an import / using / same-package call). We just flip direction
# when source is test + target is production. Edges that are
# semantically broken (test↔test, production↔production, orphan endpoints)
# are dropped.
#
# Pass 2 — supplement with path-convention pairings.
# For test files the LLM didn't link to anything, fall back to filename
# conventions (sibling `_test.go`, JS/TS `__tests__/`, Maven `src/test/`,
# etc.) to find a production counterpart. Pairs already covered by
# Pass 1 are skipped.
#
# Why this beats strip-and-rederive: real projects often violate the
# linker's naming conventions (one Go `_test.go` covering several `.go`
# files in the same package, .NET `<svc>/tests/X.cs` against
# `<svc>/src/Y/X.cs`). Stripping LLM edges drops that real-world coverage
# signal entirely. Swapping preserves it.
def _path_segments(path: str) -> list[str]:
"""Split a relative POSIX-style path into segments (ignoring empties)."""
return [seg for seg in path.split("/") if seg]
def _basename(path: str) -> str:
return path.rsplit("/", 1)[-1] if "/" in path else path
def is_test_path(path: str) -> bool:
    """Tell whether `path` names a test file, judged by its basename alone.

    Directory names are ignored: a file under `tests/`, `__tests__/`,
    `test/`, or `spec/` whose basename does not match a test pattern for
    its extension is classified as non-test (so `__tests__/helpers.ts` is
    a helper, not a test).
    """
    stem, ext = os.path.splitext(_basename(path))

    # JS/TS family: the marker is the tail of the stem, because
    # `foo.test.ts` splits into stem "foo.test" and ext ".ts".
    if ext in _JS_TS_TEST_EXTS:
        return stem.endswith((".test", ".spec"))

    if ext not in _TEST_NAME_PATTERNS:
        return False
    prefixes, suffixes = _TEST_NAME_PATTERNS[ext]
    # str.startswith/endswith accept a tuple and return False for an empty one.
    return stem.startswith(prefixes) or stem.endswith(suffixes)
def _strip_test_infix(stem: str) -> str | None:
"""For a JS/TS-family stem like `foo.test` or `foo.spec`, strip the
trailing `.test` / `.spec`. Returns None if no infix is present."""
for infix in (".test", ".spec"):
if stem.endswith(infix):
return stem[: -len(infix)]
return None
def _join(dir_path: str, name: str) -> str:
"""Join a (possibly empty) directory path to a basename with a single
slash, dropping the slash entirely when there is no directory."""
return f"{dir_path}/{name}" if dir_path else name
def _add_unique(out: list[str], path: str) -> None:
"""Append `path` to `out` unless it is empty or already present."""
if path and path not in out:
out.append(path)
def _js_ts_sibling_candidates(dir_path: str, base_stem: str) -> list[str]:
    """List `<dir_path>/<base_stem><ext>` for every JS/TS-family extension.

    `dir_path` is the containing directory (no trailing slash, may be
    empty). `base_stem` is the file stem with the test infix already
    removed. Results follow the order of `_JS_TS_EXTS`.
    """
    return [_join(dir_path, base_stem + suffix) for suffix in _JS_TS_EXTS]
def _strip_test_affix(stem: str, *, allow_prefix: bool) -> str | None:
    """Strip a leading `test_` (when allowed) or a trailing `_test`; None if neither applies."""
    if allow_prefix and stem.startswith("test_"):
        return stem[len("test_"):]
    if stem.endswith("_test"):
        return stem[: -len("_test")]
    return None


def _first_matching_suffix(stem: str, suffixes: tuple[str, ...]) -> str | None:
    """Return the first entry of `suffixes` that `stem` ends with, else None."""
    return next((s for s in suffixes if stem.endswith(s)), None)


def _mirrored_production_dirs(dir_segs: list[str]) -> list[str]:
    """Re-root a directory under a top-level test dir onto each production root, in order."""
    tail_path = "/".join(dir_segs[1:])
    return [
        "/".join(p for p in (root, tail_path) if p)
        for root in _MIRROR_PRODUCTION_ROOTS
    ]


def _add_js_ts_dir_candidates(
    out: list[str], dir_path: str, base_stem: str, ext: str
) -> None:
    """Add `base_stem` in `dir_path` with the test's own extension first, then the rest of the family."""
    _add_unique(out, _join(dir_path, f"{base_stem}{ext}"))
    for candidate in _js_ts_sibling_candidates(dir_path, base_stem):
        _add_unique(out, candidate)


def _js_ts_production_candidates(
    stem: str, ext: str, dir_segs: list[str]
) -> list[str]:
    """Candidates for a JS/TS-family test: sibling, test-subdir walk-out, mirrored tree."""
    out: list[str] = []
    base_stem = _strip_test_infix(stem)
    if base_stem is None:
        return out
    # 1. Sibling de-infix.
    _add_js_ts_dir_candidates(out, "/".join(dir_segs), base_stem, ext)
    # 2. Walk out of a test-segregating subdir. Some projects use
    #    `<dir>/test/foo.spec.ts` or `<dir>/spec/foo.spec.ts` instead of
    #    `__tests__/`; all are treated the same.
    if dir_segs and dir_segs[-1] in ("__tests__", "test", "spec", "tests"):
        _add_js_ts_dir_candidates(out, "/".join(dir_segs[:-1]), base_stem, ext)
    # 3. Mirrored tree: tests/foo/X.test.ts → src/foo/X.ts (and variants
    #    for app/lib/<root>).
    if dir_segs and dir_segs[0] in ("tests", "test", "__tests__"):
        for new_dir in _mirrored_production_dirs(dir_segs):
            _add_js_ts_dir_candidates(out, new_dir, base_stem, ext)
    return out


def _sibling_only_candidates(
    stem: str, ext: str, dir_segs: list[str], *, allow_prefix: bool
) -> list[str]:
    """Candidate for languages that only pair a test with a same-directory, same-extension file."""
    base_stem = _strip_test_affix(stem, allow_prefix=allow_prefix)
    if base_stem is None:
        return []
    return [_join("/".join(dir_segs), f"{base_stem}{ext}")]


def _python_production_candidates(stem: str, dir_segs: list[str]) -> list[str]:
    """Candidates for a Python test: sibling, in-package tests/ walk-out, mirrored tree."""
    base_stem = _strip_test_affix(stem, allow_prefix=True)
    if base_stem is None:
        return []
    name = f"{base_stem}.py"
    out: list[str] = []
    # Sibling
    _add_unique(out, _join("/".join(dir_segs), name))
    # Walk out of an in-package tests/ or test/ directory:
    # `mypkg/tests/test_bar.py` → `mypkg/bar.py`.
    if dir_segs and dir_segs[-1] in ("tests", "test"):
        _add_unique(out, _join("/".join(dir_segs[:-1]), name))
    # Mirrored: tests/foo/test_bar.py → src/foo/bar.py (and variants)
    if dir_segs and dir_segs[0] in ("tests", "test"):
        for new_dir in _mirrored_production_dirs(dir_segs):
            _add_unique(out, _join(new_dir, name))
    return out


def _jvm_production_candidates(
    stem: str,
    ext: str,
    dir_segs: list[str],
    source_set: str,
    suffixes: tuple[str, ...],
) -> list[str]:
    """Candidates for a Java/Kotlin test: `src/test/<lang>` → `src/main/<lang>` swap, then sibling."""
    suffix = _first_matching_suffix(stem, suffixes)
    if suffix is None:
        return []
    name = f"{stem[: -len(suffix)]}{ext}"
    out: list[str] = []
    # Maven/Gradle layout: swap src/test/<lang>/... → src/main/<lang>/...
    if dir_segs[:3] == ["src", "test", source_set]:
        new_dir = "/".join(["src", "main", source_set, *dir_segs[3:]])
        _add_unique(out, f"{new_dir}/{name}")
    # Sibling fallback
    _add_unique(out, _join("/".join(dir_segs), name))
    return out


def _csharp_production_candidates(stem: str, dir_segs: list[str]) -> list[str]:
    """Candidates for a C# test: sibling, `tests/` walk-out and `src/` mirror, `.Tests` sibling project."""
    suffix = _first_matching_suffix(stem, ("Tests", "Test"))
    if suffix is None:
        return []
    name = f"{stem[: -len(suffix)]}.cs"
    out: list[str] = []
    # Sibling fallback (rare but cheap to try).
    _add_unique(out, _join("/".join(dir_segs), name))
    # Walk out of the innermost `tests/` or `test/` directory and search
    # the sibling `src/` subtree. Handles layouts like
    # `src/<svc>/tests/BarTests.cs` ↔ `src/<svc>/src/.../Bar.cs` and
    # `<proj>/tests/BarTests.cs` ↔ `<proj>/src/Bar.cs`.
    tests_idx = next(
        (
            i
            for i in range(len(dir_segs) - 1, -1, -1)
            if dir_segs[i].lower() in ("tests", "test")
        ),
        None,
    )
    if tests_idx is not None:
        parent_segs = dir_segs[:tests_idx]
        tail_segs = dir_segs[tests_idx + 1:]
        # `<parent>/<base_stem>.cs` (drop `tests/` entirely).
        _add_unique(out, _join("/".join(parent_segs), name))
        # `<parent>/src/<tail>/<base_stem>.cs` (mirror through src/).
        _add_unique(out, _join("/".join([*parent_segs, "src", *tail_segs]), name))
    # `.NET`-style sibling-project mirror: `My.App.Tests/...` ↔ `My.App/...`.
    # Strip the marker from the top directory and try the same tail there.
    if dir_segs:
        marker = _first_matching_suffix(dir_segs[0], (".Tests", ".Test"))
        if marker is not None:
            sibling = dir_segs[0][: -len(marker)]
            if sibling:
                _add_unique(out, _join("/".join([sibling, *dir_segs[1:]]), name))
    return out


def production_candidates(test_path: str) -> list[str]:
    """For a test file path, return ordered candidate production paths.

    The list is in priority order (sibling first, then walk-out of a test
    subdirectory, then mirrored-tree variants; Java/Kotlin try the
    `src/main` swap before the sibling). Duplicates are removed while
    preserving order. The caller should pick the first candidate that
    resolves to a known production node. Returns an empty list when the
    basename does not match a test pattern for its extension.
    """
    stem, ext = os.path.splitext(_basename(test_path))
    dir_segs = _path_segments(test_path)[:-1]

    if ext in _JS_TS_TEST_EXTS:
        return _js_ts_production_candidates(stem, ext, dir_segs)
    if ext == ".go":
        return _sibling_only_candidates(stem, ext, dir_segs, allow_prefix=False)
    if ext == ".py":
        return _python_production_candidates(stem, dir_segs)
    if ext == ".java":
        return _jvm_production_candidates(
            stem, ext, dir_segs, "java", ("Tests", "Test", "IT")
        )
    if ext == ".kt":
        return _jvm_production_candidates(
            stem, ext, dir_segs, "kotlin", ("Tests", "Test")
        )
    if ext == ".cs":
        return _csharp_production_candidates(stem, dir_segs)
    if ext in {".c", ".cpp", ".cc"}:
        return _sibling_only_candidates(stem, ext, dir_segs, allow_prefix=True)
    return []
def _file_node_path(node: dict[str, Any]) -> str | None:
"""Return the relative project path for a `file:`-prefixed node, else None."""
nid = node.get("id", "")
if not isinstance(nid, str) or not nid.startswith("file:"):
return None
fp = node.get("filePath")
if isinstance(fp, str) and fp:
return fp
return nid[len("file:"):]
def _swap_tested_by_in_place(
edge: dict[str, Any], original_src: str, original_tgt: str
) -> None:
"""Flip an inverted `tested_by` edge so source becomes production and
target becomes the test file. Mutates `edge` in place; appends a
`[direction corrected]` audit marker to `description`.
"""
edge["source"] = original_tgt
edge["target"] = original_src
edge["direction"] = "forward"
prev = edge.get("description")
edge["description"] = (
"Direction corrected (was test → production)"
if not prev
else f"{prev} [direction corrected]"
)
def _ensure_tested_tag(node: dict[str, Any]) -> bool:
"""Append "tested" to `node["tags"]`, coercing malformed `tags` to a
fresh list. Returns True if the tag was newly added.
`tags` from raw LLM batch JSON may be missing, None, a string, or
another non-list value — the TypeScript autoFixGraph normalizer that
handles this runs downstream of this script, so we defend here.
"""
tags = node.get("tags")
if not isinstance(tags, list):
tags = []
node["tags"] = tags
if "tested" in tags:
return False
tags.append("tested")
return True
def link_tests(
    nodes_by_id: dict[str, dict[str, Any]],
    edges: list[dict[str, Any]],
) -> tuple[int, int, int, int]:
    """Canonicalize `tested_by` edges and link unmatched test files.

    Two passes (see module-level "Deterministic tested_by linker" comment
    for the rationale):

    1. Walk every existing `tested_by` edge. Keep canonical
       (production → test) edges as-is. Flip inverted (test → production)
       edges so the swap preserves the LLM's pairing evidence with the
       right direction. Drop edges that don't classify cleanly as
       file ↔ file or where one endpoint is missing — they have no
       recoverable meaning.
    2. For every test file not yet paired by Pass 1, walk path-convention
       candidates and emit a fresh `production → test` edge for the first
       match.

    Tagging happens once per production node that ends up on the source
    side of any `tested_by` edge (canonical, swapped, or supplemented).

    Mutates `nodes_by_id` (adds "tested" tag) and `edges` (rewrites
    in place: drops semantically broken edges, swaps inverted ones, appends
    supplements). Edges of any other type are kept in their original
    relative order.

    Returns (added, dropped, tagged, swapped):
        added: path-convention supplemental edges appended in Pass 2
        dropped: pre-existing `tested_by` edges removed (unsalvageable,
            or the losing duplicate of an already-kept pair)
        tagged: production nodes newly tagged "tested"
        swapped: pre-existing `tested_by` edges flipped (test → production
            became production → test)
    """
    # ── Index file nodes by relative path; classify each as test/production.
    # `is_prod` here means "is a known file node AND is not a test by
    # path convention" — used both to validate edge endpoints and to drive
    # path-convention candidate matching.
    file_paths_to_nodes: dict[str, dict[str, Any]] = {}
    node_id_to_classification: dict[str, str] = {}  # id → "test" | "prod"
    test_nodes: list[tuple[str, dict[str, Any]]] = []
    for node in nodes_by_id.values():
        path = _file_node_path(node)
        if path is None:
            # Not a `file:` node — it can never be a tested_by endpoint.
            continue
        file_paths_to_nodes[path] = node
        if is_test_path(path):
            node_id_to_classification[node["id"]] = "test"
            test_nodes.append((path, node))
        else:
            node_id_to_classification[node["id"]] = "prod"
    # ── Pass 1: walk existing tested_by edges, canonicalize or drop.
    # `covered` tracks (production_id, test_id) pairs that have a kept edge
    # after this pass — used both to deduplicate within Pass 1 and to
    # suppress duplicate supplements in Pass 2.
    # `pair_to_idx` maps each kept pair to its slot in the compacted edges
    # list, so a duplicate that arrives later with a higher weight can
    # replace the earlier slot in place (mirrors Step 6's
    # `weight > existing.weight` rule — without this, a 0.3-weight edge
    # from batch 1 would silently outrank a 0.9-weight edge from batch 2
    # because Step 6 only ever sees one of them).
    # `swapped_pairs` records which surviving pairs came from a flipped
    # edge, so the `swapped` counter reflects the FINAL output and
    # doesn't double-count work done on edges that were later replaced.
    covered: set[tuple[str, str]] = set()
    pair_to_idx: dict[tuple[str, str], int] = {}
    swapped_pairs: set[tuple[str, str]] = set()
    dropped = 0
    # In-place compaction: kept edges are written back at `write_idx`,
    # which never runs ahead of the read position, so unread entries are
    # never overwritten. The stale tail is deleted after the loop.
    write_idx = 0
    for edge in edges:
        if edge.get("type") != "tested_by":
            # Other edge types pass through untouched.
            edges[write_idx] = edge
            write_idx += 1
            continue
        src = edge.get("source", "")
        tgt = edge.get("target", "")
        src_class = node_id_to_classification.get(src)
        tgt_class = node_id_to_classification.get(tgt)
        # Both endpoints must be known file nodes; one test, one production.
        # Anything else (orphan, test↔test, prod↔prod, non-file endpoint)
        # has no recoverable meaning — drop it.
        if (src_class, tgt_class) == ("prod", "test"):
            pair = (src, tgt)
            needs_swap = False
        elif (src_class, tgt_class) == ("test", "prod"):
            pair = (tgt, src)
            needs_swap = True
        else:
            dropped += 1
            continue
        if pair in covered:
            # Duplicate pair: keep the heavier-weight edge (mirrors the
            # weight-aware dedup in Step 6, which can't help here because
            # only one of the duplicates would reach it).
            existing_idx = pair_to_idx[pair]
            existing = edges[existing_idx]
            if _num(edge.get("weight", 0)) > _num(existing.get("weight", 0)):
                # Heavier — replace existing slot. Apply the swap (or not)
                # only on the survivor, so we never spend cycles canonicalizing
                # an edge we're about to drop.
                if needs_swap:
                    _swap_tested_by_in_place(edge, src, tgt)
                    swapped_pairs.add(pair)
                else:
                    # Replacement is canonical — if the previous winner came
                    # from a swap, the surviving slot is no longer a swap.
                    swapped_pairs.discard(pair)
                edges[existing_idx] = edge
            # else: existing is heavier or equal — keep it, drop the new edge.
            # Either way exactly one edge of the duplicate pair is discarded.
            dropped += 1
            continue
        if needs_swap:
            _swap_tested_by_in_place(edge, src, tgt)
            swapped_pairs.add(pair)
        covered.add(pair)
        pair_to_idx[pair] = write_idx
        edges[write_idx] = edge
        write_idx += 1
    # Truncate the leftover tail past the last kept edge.
    del edges[write_idx:]
    swapped = len(swapped_pairs)
    # ── Pass 2: path-convention supplement for tests not yet paired.
    paired_test_ids = {test_id for (_prod_id, test_id) in covered}
    added = 0
    for test_path, test_node in test_nodes:
        if test_node["id"] in paired_test_ids:
            continue
        # Candidates come in priority order; the first one that resolves
        # to a known production file wins.
        for cand_path in production_candidates(test_path):
            prod_node = file_paths_to_nodes.get(cand_path)
            if prod_node is None:
                continue
            if is_test_path(cand_path):
                # Don't link a test to another test even if naming aligns.
                continue
            pair = (prod_node["id"], test_node["id"])
            if pair in covered:
                continue
            edges.append({
                "source": prod_node["id"],
                "target": test_node["id"],
                "type": "tested_by",
                "direction": "forward",
                "weight": 0.5,
                "description": "Path-based pairing (deterministic)",
            })
            covered.add(pair)
            added += 1
            break
    # ── Tag every production node that ended up sourcing a tested_by edge
    # (covers Pass 1 canonical + swapped + Pass 2 supplements in one place).
    tagged = 0
    for prod_id, _test_id in covered:
        prod_node = nodes_by_id.get(prod_id)
        if prod_node is None:
            continue
        # Counts a node only the first time the tag is actually added.
        if _ensure_tested_tag(prod_node):
            tagged += 1
    return added, dropped, tagged, swapped
# ── Main merge + normalize ────────────────────────────────────────────────
def merge_and_normalize(batches: list[dict[str, Any]]) -> tuple[dict[str, Any], list[str]]:
    """Merge batch results and normalize. Returns (assembled_graph, report_lines).

    Steps, in order: combine all batches; normalize node IDs; normalize
    complexity; rewrite edge endpoints to the corrected IDs; deduplicate
    nodes (last one wins); run the tested_by linker; deduplicate edges
    (heavier weight wins) and drop those with a missing endpoint.

    `assembled_graph` is `{"nodes": [...], "edges": [...]}`. `report_lines`
    is a human-readable summary of input/output sizes, fixes applied, and
    issues that could not be fixed. Entries that are not JSON objects, and
    nodes whose `id` is missing or not a string, are skipped and listed in
    the "Could not fix" section instead of aborting the merge.
    """
    # ── Pattern counters for "Fixed" report ──────────────────────────
    id_fix_patterns: Counter[str] = Counter()
    complexity_fix_patterns: Counter[str] = Counter()
    # ── Detail lists for "Could not fix" report ──────────────────────
    unfixable: list[str] = []
    # ── Step 1: Combine all nodes and edges ──────────────────────────
    all_nodes: list[Any] = []
    raw_edges: list[Any] = []
    for batch in batches:
        all_nodes.extend(batch.get("nodes", []))
        raw_edges.extend(batch.get("edges", []))
    total_input_nodes = len(all_nodes)
    total_input_edges = len(raw_edges)
    # Every later step calls .get() on edges, so anything that is not a
    # JSON object is set aside here and reported in Step 6.
    all_edges: list[dict] = [e for e in raw_edges if isinstance(e, dict)]
    malformed_edges = total_input_edges - len(all_edges)
    # ── Step 2: Normalize node IDs and build ID mapping ──────────────
    id_mapping: dict[str, str] = {}  # original → corrected
    nodes_with_ids: list[dict] = []
    unknown_node_types: Counter[str] = Counter()
    for i, node in enumerate(all_nodes):
        if not isinstance(node, dict):
            unfixable.append(f"Node[{i}] is not a JSON object ({type(node).__name__}): skipped")
            continue
        original_id = node.get("id")
        if not original_id:
            unfixable.append(f"Node[{i}] has no 'id' field (name={node.get('name', '?')}, type={node.get('type', '?')})")
            continue
        if not isinstance(original_id, str):
            # The ID normalizer relies on string methods.
            unfixable.append(f"Node[{i}] has a non-string 'id' ({original_id!r}): skipped")
            continue
        # Flag unknown node types
        node_type = node.get("type", "")
        if node_type and node_type not in TYPE_TO_PREFIX:
            unknown_node_types[node_type] += 1
        nodes_with_ids.append(node)
        corrected_id = normalize_node_id(original_id, node)
        if corrected_id != original_id:
            pattern = classify_id_fix(original_id, corrected_id)
            id_fix_patterns[pattern] += 1
            id_mapping[original_id] = corrected_id
            node["id"] = corrected_id
    # ── Step 3: Normalize complexity ─────────────────────────────────
    complexity_unknown_patterns: Counter[str] = Counter()
    for node in nodes_with_ids:
        original = node.get("complexity")
        normalized, status = normalize_complexity(original)
        if status == "mapped":
            orig_repr = repr(original) if not isinstance(original, str) else f'"{original}"'
            complexity_fix_patterns[f"{orig_repr} → \"{normalized}\""] += 1
        elif status == "unknown":
            orig_repr = repr(original) if not isinstance(original, str) else f'"{original}"'
            complexity_unknown_patterns[f"complexity {orig_repr} → defaulted to \"moderate\""] += 1
        node["complexity"] = normalized
    # ── Step 4: Rewrite edge references ──────────────────────────────
    edges_rewritten = 0
    for edge in all_edges:
        src = edge.get("source", "")
        tgt = edge.get("target", "")
        new_src = id_mapping.get(src, src)
        new_tgt = id_mapping.get(tgt, tgt)
        if new_src != src or new_tgt != tgt:
            edges_rewritten += 1
            edge["source"] = new_src
            edge["target"] = new_tgt
    # ── Step 5: Deduplicate nodes by ID (keep last) ─────────────────
    duplicate_count = 0
    nodes_by_id: dict[str, dict] = {}
    for node in nodes_with_ids:
        nid = node.get("id", "")
        if nid in nodes_by_id:
            duplicate_count += 1
        nodes_by_id[nid] = node
    # ── Step 5b: Deterministic tested_by linker ──────────────────────
    # See module-level "Deterministic tested_by linker" section above.
    tested_by_added, tested_by_dropped, tested_by_tagged, tested_by_swapped = link_tests(
        nodes_by_id, all_edges
    )
    # ── Step 6: Deduplicate edges, drop dangling ─────────────────────
    if malformed_edges:
        unfixable.append(f"{malformed_edges} edge entries are not JSON objects: dropped")
    node_ids = set(nodes_by_id.keys())
    # Direction is part of the dedup key so a `forward` edge does not silently
    # overwrite a `bidirectional` one (or vice versa); they're different
    # semantic relationships that the dashboard renders distinctly.
    edges_by_key: dict[tuple[str, str, str, str], dict] = {}
    for edge in all_edges:
        src = edge.get("source", "")
        tgt = edge.get("target", "")
        etype = edge.get("type", "")
        direction = normalize_direction(edge.get("direction"))
        edge["direction"] = direction
        if src not in node_ids or tgt not in node_ids:
            missing = []
            if src not in node_ids:
                missing.append(f"source '{src}'")
            if tgt not in node_ids:
                missing.append(f"target '{tgt}'")
            unfixable.append(f"Edge {src} → {tgt} ({etype}): dropped, missing {', '.join(missing)}")
            continue
        key = (src, tgt, etype, direction)
        existing = edges_by_key.get(key)
        # First edge for a key is kept; a later one replaces it only when
        # strictly heavier.
        if existing is None or _num(edge.get("weight", 0)) > _num(existing.get("weight", 0)):
            edges_by_key[key] = edge
    # ── Build report ─────────────────────────────────────────────────
    report: list[str] = []
    report.append(f"Input: {total_input_nodes} nodes, {total_input_edges} edges")
    # Fixed section — grouped by pattern
    fixed_lines: list[str] = []
    if id_fix_patterns:
        for pattern, count in id_fix_patterns.most_common():
            fixed_lines.append(f" {count:>4} × {pattern}")
    if complexity_fix_patterns:
        for pattern, count in complexity_fix_patterns.most_common():
            fixed_lines.append(f" {count:>4} × complexity {pattern}")
    if edges_rewritten:
        fixed_lines.append(f" {edges_rewritten:>4} × edge references rewritten after ID normalization")
    if duplicate_count:
        fixed_lines.append(f" {duplicate_count:>4} × duplicate node IDs removed (kept last)")
    if tested_by_swapped:
        fixed_lines.append(f" {tested_by_swapped:>4} × tested_by edges flipped (test → production became production → test)")
    if tested_by_dropped:
        fixed_lines.append(f" {tested_by_dropped:>4} × tested_by edges dropped (orphan endpoint or test↔test / prod↔prod pair)")
    if fixed_lines:
        report.append("")
        total_fixes = (
            sum(id_fix_patterns.values())
            + sum(complexity_fix_patterns.values())
            + edges_rewritten
            + duplicate_count
            + tested_by_swapped
            + tested_by_dropped
        )
        report.append(f"Fixed ({total_fixes} corrections):")
        report.extend(fixed_lines)
    # Tested-by linker section — separate from Fixed since these are net-new
    # additions, not corrections.
    if tested_by_added or tested_by_tagged:
        report.append("")
        report.append("Tested-by linker:")
        report.append(f" {tested_by_added:>4} × tested_by edges produced (path-convention supplement, production → test)")
        report.append(f" {tested_by_tagged:>4} × production nodes tagged \"tested\"")
    # Could not fix section — unknown patterns (grouped) + individual details
    unfixable_total = (
        len(unfixable)
        + sum(complexity_unknown_patterns.values())
        + sum(unknown_node_types.values())
    )
    if unfixable_total:
        report.append("")
        report.append(f"Could not fix ({unfixable_total} issues — needs agent review):")
        # Unknown node types (grouped by count)
        for ntype, count in unknown_node_types.most_common():
            report.append(f" {count:>4} × unknown node type \"{ntype}\" (not in schema, kept as-is)")
        # Unknown complexity patterns (grouped by count)
        for pattern, count in complexity_unknown_patterns.most_common():
            report.append(f" {count:>4} × {pattern}")
        # Individual unfixable items
        for detail in unfixable:
            report.append(f" - {detail}")
    # Output stats
    report.append("")
    report.append(f"Output: {len(nodes_by_id)} nodes, {len(edges_by_key)} edges")
    assembled = {
        "nodes": list(nodes_by_id.values()),
        "edges": list(edges_by_key.values()),
    }
    return assembled, report
# ── Imports-edge recovery from importMap ──────────────────────────────────
def recover_imports_from_scan(
    assembled: dict[str, Any],
    scan_result_path: Path,
) -> tuple[int, list[str]]:
    """Re-emit any `imports` edges that exist in `scan-result.json#importMap`
    but never made it into a batch's output.

    The project-scanner's importMap is the deterministic source of truth for
    resolved internal imports; file-analyzer agents are expected to
    transcribe those into edges 1:1 but in practice drop ~25% of them on
    real projects (orchestrator-side batch construction loses entries,
    agent-side enumeration drops more).

    Args:
        assembled: The merged graph. Its ``"edges"`` list is extended in
            place with one edge per recovered import; ``"nodes"`` is only
            read.
        scan_result_path: Path to the scanner output containing ``importMap``
            (a mapping of source path -> list of imported paths).

    Returns:
        ``(recovered_count, report_lines)``. When the scan result is missing,
        unreadable, or has no usable ``importMap``, nothing is recovered and
        the single report line explains why; this function never raises for
        a bad scan file so the merge itself can still complete.
    """
    if not scan_result_path.is_file():
        return 0, [f" importMap recovery skipped — {scan_result_path.name} not found"]

    try:
        scan = json.loads(scan_result_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError *and* UnicodeDecodeError
        # (file is not valid UTF-8); both mean "unparseable", not "crash".
        return 0, [f" importMap recovery skipped — could not parse {scan_result_path.name}: {e}"]

    # A top-level JSON array/string/number has no `.get`; treat it the same
    # as an object without an importMap field.
    import_map = scan.get("importMap") if isinstance(scan, dict) else None
    if not isinstance(import_map, dict):
        return 0, [f" importMap recovery skipped — no importMap field in {scan_result_path.name}"]

    # Ids of the `file` nodes actually present in the assembled graph; an
    # edge is only recovered when both of its endpoints exist.
    file_node_ids: set[str] = {
        node.get("id", "")
        for node in assembled["nodes"]
        if node.get("type") == "file"
    }

    # (source, target) pairs of the imports edges already present, so
    # recovery never duplicates an edge a batch did emit.
    existing: set[tuple[str, str]] = {
        (edge.get("source", ""), edge.get("target", ""))
        for edge in assembled["edges"]
        if edge.get("type") == "imports"
    }

    recovered = 0
    skipped_no_src_node = 0
    skipped_no_tgt_node = 0

    for src_path, targets in import_map.items():
        if not isinstance(targets, list):
            continue
        src_id = f"file:{src_path}"
        if src_id not in file_node_ids:
            # Only count sources that actually had imports to lose.
            if targets:
                skipped_no_src_node += 1
            continue
        for tgt_path in targets:
            if not isinstance(tgt_path, str) or not tgt_path:
                continue
            tgt_id = f"file:{tgt_path}"
            if tgt_id not in file_node_ids:
                skipped_no_tgt_node += 1
                continue
            # Self-imports and already-present edges are silently ignored.
            if src_id == tgt_id or (src_id, tgt_id) in existing:
                continue
            assembled["edges"].append({
                "source": src_id,
                "target": tgt_id,
                "type": "imports",
                "direction": "forward",
                "weight": 0.7,
                "recoveredFromImportMap": True,
            })
            # Track the new edge so a repeated target in the same list
            # is not emitted twice.
            existing.add((src_id, tgt_id))
            recovered += 1

    lines: list[str] = [
        f" Recovered {recovered} `imports` edges from importMap "
        f"({len(import_map)} entries scanned)"
    ]
    if skipped_no_src_node:
        lines.append(
            f" Skipped {skipped_no_src_node} importMap source files "
            f"with no `file:` node in graph"
        )
    if skipped_no_tgt_node:
        lines.append(
            f" Skipped {skipped_no_tgt_node} importMap target paths "
            f"with no `file:` node in graph"
        )
    return recovered, lines
# ── Main ──────────────────────────────────────────────────────────────────
def main() -> None:
if len(sys.argv) < 2:
print("Usage: python merge-batch-graphs.py <project-root>", file=sys.stderr)
sys.exit(1)
project_root = Path(sys.argv[1]).resolve()
intermediate_dir = project_root / ".understand-anything" / "intermediate"
if not intermediate_dir.is_dir():
print(f"Error: {intermediate_dir} does not exist", file=sys.stderr)
sys.exit(1)
# Discover batch files, sorted by numeric index (not lexicographic)
batch_files = sorted(
intermediate_dir.glob("batch-*.json"),
key=lambda p: int(re.search(r"batch-(\d+)", p.stem).group(1))
if re.search(r"batch-(\d+)", p.stem)
else 0,
)
if not batch_files:
print("Error: no batch-*.json files found in intermediate/", file=sys.stderr)
sys.exit(1)
# Group by logical batch index so the report distinguishes single-batch
# files from multi-part file-analyzer outputs. Files that don't match the
# `batch-<N>.json` / `batch-<N>-part-<K>.json` pattern (e.g. fused
# `batch-fused-8-13.json`, range `batch-8-13.json`) would otherwise be
# silently dropped during load — flag them loudly instead so the user
# can fix the file-analyzer agent.
from collections import defaultdict as _dd
by_batch = _dd(list)
unrecognized_batch_files: list[str] = []
for f in batch_files:
m = re.match(r"batch-(\d+)(?:-part-(\d+))?\.json", f.name)
if m:
by_batch[int(m.group(1))].append((f.name, int(m.group(2)) if m.group(2) else None))
else:
unrecognized_batch_files.append(f.name)
if unrecognized_batch_files:
preview = ", ".join(unrecognized_batch_files[:5])
suffix = (
f" (+{len(unrecognized_batch_files) - 5} more)"
if len(unrecognized_batch_files) > 5
else ""
)
print(
f"Warning: merge-batch-graphs: {len(unrecognized_batch_files)} "
f"batch file(s) with unrecognized filenames will be DROPPED — "
f"files: {preview}{suffix} — fix the file-analyzer agent to use "
f"only batch-<N>.json or batch-<N>-part-<K>.json patterns",
file=sys.stderr,
)
logical_count = len(by_batch)
multi_part = sum(1 for entries in by_batch.values() if len(entries) > 1)
print(
f"Found {len(batch_files)} batch files "
f"({logical_count} logical batches, {multi_part} multi-part):",
file=sys.stderr,
)
# Missing-part detection: for any logical batch with parts (len > 1), the
# set of part numbers MUST be contiguous starting at 1. Gaps suggest a
# truncated write — emit a visible warning so the user can investigate.
# Collect into `missing_part_warnings` so they also surface in the final
# phase report; stderr alone gets buried under the per-batch load lines.
missing_part_warnings: list[str] = []
for idx, entries in by_batch.items():
part_nums = [p for (_n, p) in entries if p is not None]
if not part_nums:
continue
present = set(part_nums)
expected = set(range(1, max(part_nums) + 1))
missing = sorted(expected - present)
if missing:
msg = (
f"batch {idx} has parts {sorted(present)} but "
f"missing part {missing} — possible truncated write — "
f"affected nodes/edges may be lost"
)
print(f"Warning: merge: {msg}", file=sys.stderr)
missing_part_warnings.append(msg)
# Load batches — skip unrecognized filenames so they don't pollute the
# merged graph with content the agent labeled incorrectly.
unrecognized_set = set(unrecognized_batch_files)
batches: list[dict[str, Any]] = []
for f in batch_files:
if f.name in unrecognized_set:
continue
batch = load_batch(f)
if batch is not None:
batches.append(batch)
n = len(batch.get("nodes", []))
e = len(batch.get("edges", []))
print(f" {f.name}: {n} nodes, {e} edges", file=sys.stderr)
if not batches:
print("Error: no valid batch files loaded", file=sys.stderr)
sys.exit(1)
# Merge and normalize
assembled, report = merge_and_normalize(batches)
# Surface missing multi-part files to the phase report (parallel to
# unrecognized-filename handling below). Stderr lines emitted during
# batch discovery get buried under per-batch load output — re-emitting
# via the report list ensures the Phase 4 review and final summary see
# the data-loss signal.
if missing_part_warnings:
report.append("")
report.append(
f"Warning: {len(missing_part_warnings)} batch(es) with missing parts "
f"— some nodes/edges silently dropped:"
)
for w in missing_part_warnings:
report.append(f" - {w}")
# Surface unrecognized-filename drops to the phase report so the
# downstream review step sees them, not just stderr.
if unrecognized_batch_files:
preview = ", ".join(unrecognized_batch_files[:5])
suffix = (
f" (+{len(unrecognized_batch_files) - 5} more)"
if len(unrecognized_batch_files) > 5
else ""
)
report.append("")
report.append(
f"Warning: dropped {len(unrecognized_batch_files)} batch file(s) "
f"with unrecognized filenames — files: {preview}{suffix}"
f"fix the file-analyzer agent to use only batch-<N>.json or "
f"batch-<N>-part-<K>.json patterns (every node/edge in these "
f"files was excluded from the final graph)"
)
# Recover any imports edges file-analyzer batches dropped despite
# `batchImportData` containing them. The project-scanner's importMap
# is the deterministic source of truth.
scan_result_path = intermediate_dir / "scan-result.json"
recovered, recovery_report = recover_imports_from_scan(assembled, scan_result_path)
if recovery_report:
report.append("")
report.append("Imports edge recovery:")
report.extend(recovery_report)
# Print report
print("", file=sys.stderr)
for line in report:
print(line, file=sys.stderr)
# Write output
output_path = intermediate_dir / "assembled-graph.json"
output_path.write_text(json.dumps(assembled, indent=2, ensure_ascii=False), encoding="utf-8")
size_kb = output_path.stat().st_size / 1024
print(f"\nWritten to {output_path} ({size_kb:.0f} KB)", file=sys.stderr)
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()