#!/usr/bin/env python3
"""Merge edges from build/main.rrxiv.aux into build/main.cir.json.

The rrxiv-python parser only extracts claim-to-claim edges where the
\\dependson{}{} arguments are already in the canonical paper:label
form. This paper uses short-form labels (I.1, I.47, etc.) because
the proof DAG is dense and short labels keep the source readable.

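This post-processor reads the sidecar, keeps only claim-to-claim
edges (dropping any edge with a post:*, def:* or cn:* endpoint), and
prefixes them with the canonical paper id. It also rewrites the CIR's
paper and claim ids to the canonical UUID and overlays the structured
metadata from rrxiv-meta.json when that file exists.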

Usage:
  scripts/merge-sidecar-edges.py
"""

from __future__ import annotations

import json
import re
from pathlib import Path

# Canonical UUID of this paper; every id written to the CIR is prefixed with it.
PAPER_ID = "01923f8e-0009-7c4d-9e1f-3a2b1c0d4e5f"

# The repository root is the directory above the one holding this script.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent

# Build artefacts that are read (and, for the CIR, rewritten in place),
# plus the hand-maintained metadata file at the repository root.
CIR_PATH = ROOT.joinpath("build", "main.cir.json")
AUX_PATH = ROOT.joinpath("build", "main.rrxiv.aux")
META_PATH = ROOT.joinpath("rrxiv-meta.json")

# A claim label is an uppercase Roman numeral, a dot and an Arabic number,
# optionally followed by one more dotted number: I.1, II.12, ...
# Postulates (post:N), common notions (cn:N) and definitions (def:I.N)
# deliberately do not match.
CLAIM_LABEL_RE = re.compile(r"^[IVXLC]+\.\d+(\.\d+)?$")

# One sidecar edge record: "RRXIV:edge:<kind>:<source>|<target>".
EDGE_RE = re.compile(
    r"^RRXIV:edge:(depends_on|supports|contradicts|extends):([^|]+)\|(.+)$"
)


# Edge lists carried by every claim; the same kinds EDGE_RE accepts.
_EDGE_KINDS = ("depends_on", "supports", "contradicts", "extends")


def _canonical_ref(ref: str) -> str:
    """Return *ref* re-prefixed with PAPER_ID when it is a ``...:prop:<label>`` id.

    References without a ``:prop:`` marker are returned unchanged.
    """
    _, marker, short = ref.rpartition(":prop:")
    return f"{PAPER_ID}:prop:{short}" if marker else ref


def _overlay_meta(cir: dict, meta: dict) -> None:
    """Copy the authoritative paper-level fields from *meta* onto *cir* in place."""
    # The parser keeps the LaTeX author argument verbatim, annotation markup
    # included. That is wanted in the rendered PDF but not in the CIR, so the
    # structured author list from the metadata file wins whenever it is a
    # non-empty list.
    authors = meta.get("authors")
    if isinstance(authors, list) and authors:
        cir["authors"] = authors
    for key in ("license", "topics", "based_on"):
        if meta.get(key) is not None:
            cir[key] = meta[key]
    # `version` from meta is authoritative too (e.g. "v2" after a revision);
    # otherwise whatever the parser set is left alone.
    if meta.get("version"):
        cir["version"] = meta["version"]


def _canonicalise_claims(claims: list) -> dict[str, dict]:
    """Re-key every claim to PAPER_ID and index the claims by short label.

    Each claim gets ``paper_id`` set, its ``id`` and any ``:prop:`` edge
    targets re-prefixed with PAPER_ID, and all four edge lists present.

    Returns:
        Mapping from short label (the text after the last ``:prop:`` in the
        claim id) to the claim dict. Claims whose id has no ``:prop:``
        marker are not indexed.
    """
    by_short: dict[str, dict] = {}
    for claim in claims:
        claim["paper_id"] = PAPER_ID
        # `id` may be parser-shaped ("<slug>:prop:I.1") or already canonical;
        # either way it ends up canonical.
        _, marker, short = claim["id"].rpartition(":prop:")
        if marker:
            claim["id"] = f"{PAPER_ID}:prop:{short}"
            by_short[short] = claim
        for kind in _EDGE_KINDS:
            # A missing or null edge list is treated as empty so the merge
            # step can always append to it.
            claim[kind] = [_canonical_ref(t) for t in (claim.get(kind) or [])]
    return by_short


def _merge_aux_edges(aux_lines: list, claims_by_short: dict) -> tuple[int, int, int]:
    """Append the claim-to-claim edges found in *aux_lines* to their source claims.

    Returns:
        ``(merged, non_claim, orphaned)``: edges newly added, edges dropped
        because an endpoint is not a claim label, and edges dropped because
        the source claim is absent from *claims_by_short*.
    """
    merged = non_claim = orphaned = 0
    for line in aux_lines:
        m = EDGE_RE.match(line)
        if m is None:
            continue
        kind = m.group(1)
        src = m.group(2).strip()
        tgt = m.group(3).strip()
        # Only claim -> claim edges are kept.
        if not (CLAIM_LABEL_RE.match(src) and CLAIM_LABEL_RE.match(tgt)):
            non_claim += 1
            continue
        claim = claims_by_short.get(src)
        if claim is None:
            orphaned += 1
            continue
        full_target = f"{PAPER_ID}:prop:{tgt}"
        if full_target not in claim[kind]:
            claim[kind].append(full_target)
            merged += 1
    return merged, non_claim, orphaned


def main() -> int:
    """Canonicalise the CIR and merge the sidecar's claim-to-claim edges into it.

    Reads CIR_PATH, rewrites the paper and claim ids to PAPER_ID, overlays
    the fields from META_PATH when that file exists, appends the
    claim-to-claim edges recorded in AUX_PATH, and writes the result back to
    CIR_PATH.

    Returns:
        0 on success.

    Raises:
        SystemExit: if CIR_PATH or AUX_PATH is not an existing file.
    """
    for required in (CIR_PATH, AUX_PATH):
        if not required.is_file():
            raise SystemExit(f"missing {required}")

    # JSON is UTF-8; decode it explicitly instead of trusting the locale.
    cir = json.loads(CIR_PATH.read_text(encoding="utf-8"))

    # The parser prefixes ids with the rrxiv-meta slug, which is fine for
    # build artefacts, but the deployed instance keys everything off the
    # canonical UUID. Patch the top level here and each claim below so that
    # re-ingest finds them by paper_id.
    cir["id"] = PAPER_ID
    cir.setdefault("id_slug", "rrxiv:2605.00009")

    if META_PATH.is_file():
        _overlay_meta(cir, json.loads(META_PATH.read_text(encoding="utf-8")))

    claims_by_short = _canonicalise_claims(cir.get("claims") or [])

    # Only the ASCII "RRXIV:edge:" records matter, so a stray undecodable
    # byte elsewhere in the aux file must not abort the merge.
    aux_lines = AUX_PATH.read_text(encoding="utf-8", errors="replace").splitlines()
    merged, skipped, orphaned = _merge_aux_edges(aux_lines, claims_by_short)

    CIR_PATH.write_text(json.dumps(cir, indent=2) + "\n", encoding="utf-8")
    print(f"merged {merged} claim-to-claim edges; skipped {skipped} non-claim edges")
    if orphaned:
        # Report these separately: they are well-formed claim edges that
        # were lost, not edges that were filtered out on purpose.
        print(
            f"warning: dropped {orphaned} edges whose source claim "
            f"is not in {CIR_PATH.name}"
        )
    return 0


# Script entry point: run the merge and use its return value as the exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)