#!/usr/bin/env python3
"""KRB 4-page rendered WP ↔ Webflow component prop mapper.

Usage:
  WEBFLOW_TOKEN=... python3 krb_rendered_prop_mapping.py

Outputs CSV/JSON/Markdown files beside this script. The token is read from env and
is not written to any output file.
"""
from __future__ import annotations

import csv
import html
import json
import os
import re
import sys
import urllib.error
import urllib.request
from collections import defaultdict
from pathlib import Path
from typing import Any

from bs4 import BeautifulSoup

# Webflow site whose component catalogue is listed via the components endpoint.
SITE_ID = "6a1e37436b332da28ecc3001"
# Prefix prepended to every path passed to api_get().
BASE_API = "https://api.webflow.com/v2"
# All report files are written into the directory that contains this script.
OUTDIR = Path(__file__).resolve().parent

# Pages to compare. Each entry pairs a Webflow page with its WordPress source:
#   page        - label used in the reports
#   wf_path     - Webflow path, copied verbatim into the reports
#   wf_page_id  - Webflow page id used to request the page DOM
#   wp_url      - WordPress URL whose rendered HTML is fetched and searched
PAGES = [
    {
        "page": "Home",
        "wf_path": "/",
        "wf_page_id": "6a1e37436b332da28ecc2fd2",
        "wp_url": "https://www.krb.nsw.edu.au/",
    },
    {
        "page": "Admissions",
        "wf_path": "/admissions",
        "wf_page_id": "6a2b90585e1c44ad05688ea4",
        "wp_url": "https://www.krb.nsw.edu.au/admissions/",
    },
    {
        "page": "Boarding",
        "wf_path": "/boarding",
        "wf_page_id": "6a2b90b66077f506f42e3875",
        "wp_url": "https://www.krb.nsw.edu.au/boarding/",
    },
    {
        "page": "Our Campus",
        "wf_path": "/about-us/our-campus",
        "wf_page_id": "6a2b8d959cfde20f4b4e0535",
        "wp_url": "https://www.krb.nsw.edu.au/about-us/our-campus/",
    },
]

# Component instances whose name starts with any of these prefixes are skipped
# in main() (the tuple is passed straight to str.startswith).
IGNORE_COMPONENT_PREFIXES = (
    "Globals",
    "Section / Navbar",
    "Section / Footer",
    "Component / Breadcrumb",
    "Global /",
)


def norm_text(s: str | None) -> str:
    """Return *s* as a trimmed, single-spaced string.

    HTML entities are decoded, zero-width joiners (U+200D) are removed, and
    each run of whitespace collapses to one space. Falsy input gives "".
    """
    if s:
        unescaped = html.unescape(str(s))
        single_spaced = re.sub(r"\s+", " ", re.sub(r"\u200d", "", unescaped))
        return single_spaced.strip()
    return ""


def api_get(path: str, token: str) -> dict[str, Any]:
    """GET ``BASE_API + path`` with bearer auth and return the parsed JSON body.

    The request asks for ``application/json`` and uses a 45 second timeout.
    Errors raised by ``urllib`` or the JSON decoder propagate to the caller.
    """
    url = f"{BASE_API}{path}"
    headers = {"Authorization": f"Bearer {token}", "accept": "application/json"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=45) as response:
        payload = json.load(response)
    return payload


def fetch_all_components(token: str) -> dict[str, str]:
    """Return a component-id -> component-name map for the whole site.

    The components endpoint is read 100 entries at a time; the first page
    holding fewer than 100 entries ends the walk.
    """
    id_to_name = {}
    start = 0
    while True:
        data = api_get(f"/sites/{SITE_ID}/components?limit=100&offset={start}", token)
        page = data.get("components", [])
        for component in page:
            id_to_name[component.get("id")] = component.get("name")
        if len(page) < 100:
            return id_to_name
        start += 100


def fetch_page_dom(page_id: str, token: str) -> list[dict[str, Any]]:
    """Return every DOM node of a Webflow page, in the order the API yields them.

    Nodes are requested 100 at a time and the loop stops after the first page
    that comes back with fewer than 100 nodes.
    """
    collected = []
    start = 0
    page = None
    # `page is None` forces the first request; afterwards a full page means
    # there may be more to read.
    while page is None or len(page) >= 100:
        page = api_get(f"/pages/{page_id}/dom?limit=100&offset={start}", token).get("nodes", [])
        collected.extend(page)
        start += 100
    return collected


def prop_value(override: dict[str, Any]) -> str:
    """Return a printable value for one property override.

    Lookup order:
      1. ``override["text"]["text"]`` when it is non-empty (normalised).
      2. ``override["text"]["html"]`` when it is non-empty, reduced to its
         visible text and normalised.
      3. ``override["value"]``: dicts and lists are emitted as compact,
         key-sorted JSON; ``None`` counts as empty; anything else is
         stringified and normalised.

    Returns "" when none of the above produces a value.
    """
    text = override.get("text")
    if isinstance(text, dict):
        if text.get("text"):
            return norm_text(text.get("text"))
        if text.get("html"):
            soup = BeautifulSoup(text.get("html") or "", "html.parser")
            return norm_text(soup.get_text(" "))
    # Links/images/options often come through as value. Keep compact JSON for non-text values.
    if "value" in override:
        val = override.get("value")
        if val is None:
            # A JSON null means "no value"; str(None) would otherwise leak the
            # literal text "None" into the reports as if it were real content.
            return ""
        if isinstance(val, (dict, list)):
            return json.dumps(val, ensure_ascii=False, sort_keys=True)
        return norm_text(str(val))
    return ""


def infer_field_name(prop_label: str) -> str:
    """Guess which kind of rendered WordPress field a Webflow prop label maps to.

    The label is matched case-insensitively against keyword groups, first hit
    wins, in this order: heading/title, eyebrow/subtitle, body/content,
    button text, button link/url, image/media, tab title/content.

    When nothing matches, the text after the last "/" (stripped) is returned,
    or the whole label if that tail is empty.
    """
    label = prop_label.lower()
    # "subtitle" contains "title"; mask it so a subtitle-only label is not
    # claimed by the heading/title rule before the subtitle rule can see it.
    label_without_subtitle = label.replace("subtitle", "")
    if "heading" in label or "title" in label_without_subtitle:
        return "heading/title"
    if "eyebrow" in label or "subtitle" in label:
        return "eyebrow/subtitle"
    if "paragraph" in label or "content" in label or "rich" in label:
        return "body/content"
    if "button text" in label or label.endswith("text") or "link/button" in label:
        return "button text"
    if "button link" in label or "link" in label:
        return "button link/url"
    if "image" in label or "visual" in label:
        return "image/media"
    if "tab" in label:
        return "tab title/content"
    # Fallback: last path segment of the original (case-preserved) label.
    tail = prop_label.split("/")[-1].strip()
    return tail or prop_label


def _section_links(el) -> list[dict[str, str]]:
    """Return text/href pairs for the first eight href-bearing anchors that have text."""
    found = []
    for anchor in el.find_all("a", href=True)[:8]:
        label = norm_text(anchor.get_text(" "))
        if not label:
            continue
        found.append({"text": label, "href": anchor["href"]})
    return found


def _section_images(el) -> list[dict[str, str]]:
    """Return alt/src pairs for the first five images that carry a src or an alt."""
    found = []
    for img in el.find_all("img")[:5]:
        src = img.get("src") or img.get("data-src") or ""
        alt = norm_text(img.get("alt") or "")
        if not (src or alt):
            continue
        found.append({"alt": alt, "src": src})
    return found


def visible_sections(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Summarise the content-bearing elements under ``<main>``.

    Elements are gathered selector by selector (so output is grouped by
    selector, not document order). An element already recorded by an earlier
    selector is skipped, as is any element with fewer than 20 characters of
    normalised text.

    Each summary dict holds: ``tag``, ``classes`` (space-joined), ``heading``
    (first h1-h6 text, or ""), ``text`` (full normalised text), ``links`` and
    ``images``.
    """
    # Prefer obvious WordPress block/content wrappers. Fallback to semantic sections.
    selectors = (
        "main section",
        "main .content-block",
        "main [class*='block']",
        "main [class*='component']",
        "main article",
    )
    heading_tag = re.compile(r"^h[1-6]$")
    # Track elements by identity so the same node matched by two selectors is
    # recorded only once.
    recorded: set[int] = set()
    summaries = []
    for selector in selectors:
        for el in soup.select(selector):
            key = id(el)
            if key in recorded:
                continue
            body = norm_text(el.get_text(" "))
            if len(body) < 20:
                continue
            recorded.add(key)
            first_heading = el.find(heading_tag)
            summaries.append(
                {
                    "tag": el.name,
                    "classes": " ".join(el.get("class", [])),
                    "heading": norm_text(first_heading.get_text(" ") if first_heading else ""),
                    "text": body,
                    "links": _section_links(el),
                    "images": _section_images(el),
                }
            )
    return summaries


def _wp_match_result(section: dict[str, Any], basis: str) -> dict[str, Any]:
    """Build the positive match record for *section*, noting what text matched."""
    return {
        "matched": True,
        "wp_section_heading": section.get("heading", ""),
        "wp_section_classes": section.get("classes", ""),
        "wp_context_snippet": section["text"][:360],
        "wp_links": section.get("links", []),
        "wp_images": section.get("images", []),
        "match_basis": basis,
    }


def find_wp_match(value: str, sections: list[dict[str, Any]]) -> dict[str, Any]:
    """Locate a Webflow prop value inside the rendered WordPress sections.

    All comparisons are case-insensitive. Matching proceeds in three passes:

    1. The full normalised value as a substring of a section's text.
    2. For values longer than 80 characters, distinctive chunks (first 8 and
       14 words, then up to three sentence-like pieces longer than 20
       characters), tried section by section.
    3. The full value equal to the text of one of a section's links.

    The exact pass runs over every section before any chunk is tried, so a
    full match is never shadowed by a partial hit in an earlier section.

    Args:
        value: Prop value to look for; it is normalised before matching.
        sections: Section summaries carrying at least a ``"text"`` key and
            optionally ``"heading"``, ``"classes"``, ``"links"``, ``"images"``.

    Returns:
        A dict with ``"matched": True`` plus section details and
        ``"match_basis"``, or ``"matched": False`` plus a ``"reason"``.
    """
    value = norm_text(value)
    if not value or value in {"\u200d", "#", "{}", "null"} or len(value) < 3:
        return {"matched": False, "reason": "empty/placeholder"}

    # For rich content, use distinctive chunks. Exact full body often differs after truncation.
    chunks: list[str] = []
    if len(value) > 80:
        words = value.split()
        if len(words) >= 8:
            chunks.extend([" ".join(words[:8]), " ".join(words[:14])])
        # Split sentence-ish too.
        chunks.extend([x.strip() for x in re.split(r"[.!?]", value) if len(x.strip()) > 20][:3])

    needle = value.lower()
    # Lower-case each section once instead of once per candidate.
    haystacks = [(section, section["text"].lower()) for section in sections]

    # Pass 1: exact value anywhere on the page.
    for section, haystack in haystacks:
        if needle in haystack:
            return _wp_match_result(section, value[:160])

    # Pass 2: partial chunks, first section containing any chunk wins.
    chunk_needles = [(chunk, chunk.lower()) for chunk in chunks if chunk]
    for section, haystack in haystacks:
        for chunk, chunk_lower in chunk_needles:
            if chunk_lower in haystack:
                return _wp_match_result(section, chunk[:160])

    # Links/buttons may be represented as link text, search all link texts in section list.
    for section in sections:
        for link in section.get("links", []):
            if needle == link.get("text", "").lower():
                return _wp_match_result(section, f"link text: {value}")
    return {"matched": False, "reason": "not found in rendered WP sections"}


def main() -> int:
    """Build the Webflow-prop to rendered-WP mapping reports for every page in PAGES.

    For each page the rendered WordPress HTML and the Webflow page DOM are
    fetched. Every component instance that is not filtered out by
    IGNORE_COMPONENT_PREFIXES gets one instance row, and each of its non-empty
    property overrides gets one prop row with the result of searching the WP
    page for that value. The first instance seen of each component is also kept
    as its example.

    Three CSV files, one JSON file and one Markdown summary are written into
    OUTDIR. A CSV with no rows is skipped and not reported.

    Returns:
        2 when WEBFLOW_TOKEN is missing from the environment, otherwise 0.
        Network and parsing errors are not caught here.
    """
    token = os.environ.get("WEBFLOW_TOKEN")
    if not token:
        print("WEBFLOW_TOKEN env var required", file=sys.stderr)
        return 2

    component_names = fetch_all_components(token)

    row_instances: list[dict[str, Any]] = []
    row_props: list[dict[str, Any]] = []
    unique: dict[str, dict[str, Any]] = {}

    for page_info in PAGES:
        # Close the HTTP response deterministically instead of leaving it to GC.
        with urllib.request.urlopen(page_info["wp_url"], timeout=45) as resp:
            wp_html = resp.read().decode("utf-8", "ignore")
        soup = BeautifulSoup(wp_html, "html.parser")
        sections = visible_sections(soup)
        dom_nodes = fetch_page_dom(page_info["wf_page_id"], token)
        # Position among the instances that survive the prefix filter.
        visible_order = 0
        for node in dom_nodes:
            if node.get("type") != "component-instance":
                continue
            component_id = node.get("componentId")
            # Fall back to the id (or "") so a missing or null name cannot
            # break the startswith() filter below.
            component_name = str(component_names.get(component_id) or component_id or "")
            if component_name.startswith(IGNORE_COMPONENT_PREFIXES):
                continue
            visible_order += 1
            # Treat a null/absent override list as "no overrides".
            overrides = node.get("propertyOverrides") or []
            props = []
            wp_match_headings = []
            for override in overrides:
                label = override.get("label") or override.get("propertyId") or ""
                value = prop_value(override)
                if value in {"", "\u200d"}:
                    # Still record labels at instance level, but avoid noisy empty prop rows.
                    props.append(f"{label}: [empty]")
                    continue
                match = find_wp_match(value, sections)
                if match.get("matched") and match.get("wp_section_heading"):
                    wp_match_headings.append(match["wp_section_heading"])
                row_props.append(
                    {
                        "Page": page_info["page"],
                        "WP URL": page_info["wp_url"],
                        "Webflow path": page_info["wf_path"],
                        "WF order": visible_order,
                        "Webflow component": component_name,
                        "Webflow prop label": label,
                        "Webflow prop value": value,
                        "Inferred WP rendered field": infer_field_name(label),
                        "Found on WP rendered page": "yes" if match.get("matched") else "no",
                        "WP matched section heading": match.get("wp_section_heading", ""),
                        "WP section classes": match.get("wp_section_classes", ""),
                        "WP context snippet": match.get("wp_context_snippet", ""),
                        "Match basis": match.get("match_basis", match.get("reason", "")),
                    }
                )
                props.append(f"{label}: {value}")

            if overrides:
                # Drop empty inferences so unlabeled overrides do not leave
                # stray ", " separators in the joined list.
                inferred = {infer_field_name(o.get("label") or "") for o in overrides}
                likely_fields = ", ".join(sorted(name for name in inferred if name))
            else:
                likely_fields = "slot/default content or no overrides visible"

            instance = {
                "Page": page_info["page"],
                "WP URL": page_info["wp_url"],
                "Webflow path": page_info["wf_path"],
                "WF order": visible_order,
                "Webflow component": component_name,
                "Component ID": component_id,
                "Instance ID": node.get("id"),
                "Instance prop/content values": "; ".join(props),
                # dict.fromkeys de-duplicates while keeping first-seen order.
                "WP matched heading(s) from prop content": "; ".join(dict.fromkeys(wp_match_headings)),
                "Likely WP rendered fields to map": likely_fields,
            }
            row_instances.append(instance)
            if component_id not in unique:
                unique[component_id] = {
                    "First seen page": page_info["page"],
                    "First seen WF order": visible_order,
                    "Webflow component": component_name,
                    "Component ID": component_id,
                    "Example prop/content values": instance["Instance prop/content values"],
                    "Likely WP rendered fields to map": instance["Likely WP rendered fields to map"],
                    "Example WP URL": page_info["wp_url"],
                }

    paths = {
        "instances_csv": OUTDIR / "krb-4-page-wf-instances-with-rendered-wp-matches.csv",
        "props_csv": OUTDIR / "krb-4-page-prop-to-rendered-wp-field-map.csv",
        "unique_csv": OUTDIR / "krb-unique-webflow-components-first-pass.csv",
        "json": OUTDIR / "krb-4-page-rendered-prop-mapping.json",
        "md": OUTDIR / "krb-4-page-rendered-prop-mapping-summary.md",
    }

    def write_csv(path: Path, rows: list[dict[str, Any]]) -> bool:
        """Write *rows* with a header taken from the first row; return False if skipped."""
        if not rows:
            return False
        with path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
            writer.writeheader()
            writer.writerows(rows)
        return True

    unique_rows = list(unique.values())
    # Only paths that were actually written this run are reported at the end.
    written: list[Path] = []
    csv_jobs = (
        ("instances_csv", row_instances),
        ("props_csv", row_props),
        ("unique_csv", unique_rows),
    )
    for key, rows in csv_jobs:
        if write_csv(paths[key], rows):
            written.append(paths[key])

    paths["json"].write_text(
        json.dumps(
            {"instances": row_instances, "props": row_props, "unique_components": unique_rows},
            indent=2,
            ensure_ascii=False,
        ),
        encoding="utf-8",
    )
    written.append(paths["json"])

    lines = ["# KRB 4-page rendered prop mapping", "", "Scope: Home, Admissions, Boarding, Our Campus.", "", "## Unique Webflow components first seen"]
    for item in unique_rows:
        lines.append(f"- **{item['Webflow component']}** — first seen on {item['First seen page']}; fields: {item['Likely WP rendered fields to map'] or 'none visible'}")
    lines.append("")
    lines.append("## Page instances")
    for r in row_instances:
        lines.append(f"- {r['Page']} #{r['WF order']}: **{r['Webflow component']}** — {r['Instance prop/content values'][:500]}")
    paths["md"].write_text("\n".join(lines), encoding="utf-8")
    written.append(paths["md"])

    print("Wrote:")
    for p in written:
        print(p)
    print(f"Instances: {len(row_instances)} | prop rows: {len(row_props)} | unique components: {len(unique)}")
    return 0


# Script entry point: main()'s integer result becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
