#!/usr/bin/env python3
"""KRB live WordPress block scraper for Webflow migration data.

Fetches an explicit batch of KRB URLs and extracts only the block families we know
how to map into the KRB Webflow component importer. Unknown blocks are preserved
with source child numbers so the import/report can flag manual attention without
losing source content.

Default run emits About Us page-group data compatible with the existing
krb-about-us-full-page-group-import-v7.js SOURCE shape, plus additional
`attention` and `unmatchedSections` arrays.
"""
from __future__ import annotations

import argparse
import csv
import json
import re
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup, Tag

# Origin of the live KRB site; relative and root-relative URLs are resolved against it.
BASE = "https://www.krb.nsw.edu.au"
# Default output locations: main() derives the --out, --attention-csv and
# --raw-dir defaults from these two paths.
OUT_DIR = Path("/Users/iggy/.hermes/profiles/ignite_team/outbound")
RAW_DIR = OUT_DIR / "krb-live-scrape-html"

# Seed mapping for the current About Us group. Extend/override with --batch-json.
# Each entry holds the page title, the live source URL, the Webflow page id and
# the page path.
DEFAULT_PAGES = [
    {"title": "Our Campus", "url": f"{BASE}/about-us/our-campus/", "webflowPageId": "6a2b8d959cfde20f4b4e0535", "path": "/about-us/our-campus"},
    {"title": "Our People", "url": f"{BASE}/about-us/our-people/", "webflowPageId": "6a2b903289294179fd567656", "path": "/about-us/our-people"},
    {"title": "School Board", "url": f"{BASE}/about-us/our-people/school-board/", "webflowPageId": "6a2b90409ac2f30086e310ff", "path": "/about-us/our-people/school-board"},
    {"title": "Senior Executive", "url": f"{BASE}/about-us/our-people/senior-executive/", "webflowPageId": "6a2b9046d9aa9b69fe9dd02a", "path": "/about-us/our-people/senior-executive"},
    {"title": "Employment", "url": f"{BASE}/about-us/our-people/employment/", "webflowPageId": "6a2b903b944f448e28b408c3", "path": "/about-us/our-people/employment"},
    {"title": "Our Policies", "url": f"{BASE}/about-us/our-policies/", "webflowPageId": "6a2b904caada4c776865154b", "path": "/about-us/our-policies"},
]

# Fallback "nextPages" list, used by extract_page() when a batch entry does not
# provide its own "nextPages".
DEFAULT_NEXT_PAGES = [
    {"title": "Our History", "subtitle": "About Us", "pageId": "6a2b8d9ad3e8c89dceedfae4"},
    {"title": "Principal's Welcome", "subtitle": "About Us", "pageId": "6a2b90528a59c82b1ea41f40"},
    {"title": "Our Policies", "subtitle": "About Us", "pageId": "6a2b904caada4c776865154b"},
]

# Top-level section classes that extract_page() skips without reporting them
# as unmatched.
IGNORE_CLASSES = {"breadcrumbs", "separator"}


def clean_text(text: str) -> str:
    """Collapse whitespace runs (non-breaking spaces included) and trim the ends.

    A falsy ``text`` (``None`` or ``""``) yields an empty string.
    """
    normalized = (text or "").replace("\xa0", " ")
    collapsed = re.sub(r"\s+", " ", normalized)
    return collapsed.strip()


def slugish(value: str) -> str:
    """Turn *value* into a lowercase, hyphen-separated slug; fall back to ``"page"``."""
    hyphenated = re.sub(r"[^a-z0-9]+", "-", value.lower())
    slug = hyphenated.strip("-")
    if slug:
        return slug
    return "page"


def classes(el: Tag) -> List[str]:
    """Return the element's ``class`` attribute as a new list (empty when absent)."""
    raw = el.get("class")
    if not raw:
        return []
    return list(raw)


def class_string(el: Tag) -> str:
    """Return the element's class names as one space-separated string."""
    names = classes(el)
    return " ".join(names)


def has_class(el: Tag, name: str) -> bool:
    """Report whether *name* is one of the element's class names."""
    return any(candidate == name for candidate in classes(el))


def html_inner(el: Tag, strip_images: bool = False) -> Tuple[str, List[Dict[str, Any]]]:
    """Serialise the children of *el* to an HTML string, optionally dropping images.

    Works on a re-parsed copy of the element, so *el* itself is not modified.

    Returns:
        A ``(html, stripped)`` pair: the child nodes joined with single spaces,
        and the image specs collected from removed ``<img>`` tags (always empty
        when ``strip_images`` is false).
    """
    copy = BeautifulSoup(str(el), "html.parser")
    root = copy.find(el.name)
    if not root:
        return "", []

    removed: List[Dict[str, Any]] = []
    if strip_images:
        for image in root.find_all("img"):
            spec = image_spec_from_img(image, BASE)
            if spec:
                removed.append(spec)
            image.decompose()
        # With the images gone, drop <picture>/<source> wrappers that hold no text.
        for wrapper in root.find_all(["picture", "source"]):
            if clean_text(wrapper.get_text(" ", strip=True)) or wrapper.find("img"):
                continue
            wrapper.decompose()

    markup = " ".join(map(str, root.contents)).strip()
    return markup, removed


def choose_srcset(srcset: str) -> Optional[str]:
    """Pick the highest-resolution candidate URL from a ``srcset`` attribute value.

    Candidates are ranked by their width descriptor (``1024w``) and, for
    srcsets that use pixel-density descriptors instead, by density (``2x``).
    Candidates without a recognised descriptor rank lowest. On a tie the later
    candidate wins.

    Args:
        srcset: Raw ``srcset`` attribute value; ``None`` or ``""`` is accepted.

    Returns:
        The chosen URL, or ``None`` when the value holds no candidates.
    """
    best_rank: Tuple[int, float] = (-1, -1.0)
    best_url = ""
    for candidate in (srcset or "").split(","):
        bits = candidate.split()
        if not bits:
            continue
        width = 0
        density = 0.0
        if len(bits) > 1:
            width_match = re.match(r"(\d+)w", bits[1])
            if width_match:
                width = int(width_match.group(1))
            else:
                # Density descriptors ("2x", "1.5x") were previously ignored, so
                # the winner depended only on the order of the candidates.
                density_match = re.match(r"(\d+(?:\.\d+)?)x$", bits[1])
                if density_match:
                    density = float(density_match.group(1))
        rank = (width, density)
        if rank >= best_rank:
            best_rank, best_url = rank, bits[0]
    return best_url or None


def filename_from_url(url: str) -> str:
    """Return the final path segment of *url* (query string and fragment excluded)."""
    parsed = urlparse(url or "")
    _, _, tail = parsed.path.rpartition("/")
    return tail


def image_spec_from_url(url: str, alt: str = "", source: str = "") -> Optional[Dict[str, Any]]:
    """Build an image spec dict from a raw URL, or ``None`` for empty/``data:`` URLs.

    The URL is resolved against ``BASE``; ``alt`` is whitespace-normalised and
    ``source`` is stored as given.
    """
    usable = bool(url) and not url.startswith("data:")
    if not usable:
        return None
    absolute = urljoin(BASE, url)
    spec: Dict[str, Any] = {
        "sourceUrl": absolute,
        "sourceFile": filename_from_url(absolute),
        "alt": clean_text(alt),
        "source": source,
    }
    return spec


def image_spec_from_img(img: Tag, base: str = BASE) -> Optional[Dict[str, Any]]:
    """Build an image spec dict for an ``<img>`` tag, or ``None`` if it has no usable URL.

    Candidate URLs are tried in this order: the best ``srcset`` entry,
    ``data-src``, ``data-lazy-src``, ``src``, and finally the ``srcset`` of a
    ``<source>`` found under the image's parent. ``data:`` URIs are skipped at
    every step (as ``image_spec_from_url`` already does), so an inline
    placeholder never masks the lazy-loaded or ``<picture>`` source.

    Args:
        img: The ``<img>`` element to inspect.
        base: URL that relative sources are resolved against.

    Returns:
        A dict with ``sourceUrl``, ``sourceFile``, ``alt`` and ``source`` keys,
        or ``None`` when no usable URL is present.
    """

    def usable(value: Optional[str]) -> bool:
        # Empty values and inline data: placeholders cannot be imported.
        return bool(value) and not value.strip().lower().startswith("data:")

    direct = (
        choose_srcset(img.get("srcset", "")),
        img.get("data-src"),
        img.get("data-lazy-src"),
        img.get("src"),
    )
    src = next((value for value in direct if usable(value)), None)
    if not src:
        # Sometimes <source srcset> is nested under picture.
        parent = img.parent if isinstance(img.parent, Tag) else None
        if parent:
            source = parent.find("source", srcset=True)
            if source:
                fallback = choose_srcset(source.get("srcset", ""))
                if usable(fallback):
                    src = fallback
    if not src:
        return None
    full = urljoin(base, src)
    return {
        "sourceUrl": full,
        "sourceFile": filename_from_url(full),
        "alt": clean_text(img.get("alt", "")),
        "source": "img",
    }


def find_images(el: Tag) -> List[Dict[str, Any]]:
    """Collect unique image specs from *el*: ``<img>`` tags first, then CSS backgrounds.

    Background URLs are read from inline ``style`` attributes of *el* itself
    and of every descendant. ``find_all`` only visits descendants, so the
    element's own ``style`` is checked explicitly; otherwise a section whose
    background image is declared on its root would be reported as having none.

    Returns:
        Image spec dicts in discovery order, de-duplicated by ``sourceUrl``.
    """
    found: List[Dict[str, Any]] = []
    seen = set()

    def remember(spec: Optional[Dict[str, Any]]) -> None:
        # Keep only the first spec for each resolved URL.
        if spec and spec["sourceUrl"] not in seen:
            seen.add(spec["sourceUrl"])
            found.append(spec)

    for img in el.find_all("img"):
        remember(image_spec_from_img(img))

    # Inline/background image URLs, starting with the element's own style.
    styled_nodes = list(el.find_all(style=True))
    if el.get("style"):
        styled_nodes.insert(0, el)
    for styled in styled_nodes:
        # \s* tolerates "url( 'x' )"; the capture is stripped for "url(x )".
        for url in re.findall(r"url\(\s*['\"]?\s*([^)'\"]+)", styled.get("style", "")):
            remember(image_spec_from_url(url.strip(), source="background"))
    return found


def first_heading(el: Tag) -> str:
    """Return the cleaned text of the first ``h1``-``h4`` inside *el*, or ``""``."""
    heading = el.find(["h1", "h2", "h3", "h4"])
    if not heading:
        return ""
    return clean_text(heading.get_text(" ", strip=True))


def text_without_heading(el: Tag) -> str:
    """Return the cleaned text of *el* without its first heading.

    Script, style and noscript content is removed as well. The work happens on
    a re-parsed copy, so *el* is left untouched; if the copy has no matching
    root, the text of *el* itself is returned unchanged apart from cleaning.
    """
    copy = BeautifulSoup(str(el), "html.parser")
    root = copy.find(el.name)
    if root:
        heading = root.find(["h1", "h2", "h3", "h4"])
        if heading:
            heading.decompose()
        for junk in root.find_all(["script", "style", "noscript"]):
            junk.decompose()
        source = root
    else:
        source = el
    return clean_text(source.get_text(" ", strip=True))


def html_content_area(el: Tag, strip_images: bool = True) -> Tuple[str, List[Dict[str, Any]]]:
    """Produce the rich-text body HTML of a block from a cleaned copy of *el*.

    The copy has script/style/noscript elements and its first ``h1``-``h4``
    removed. With ``strip_images`` set, ``<img>`` tags are removed too, along
    with any ``<picture>``/``<source>`` left without an image.

    Returns:
        A ``(html, stripped)`` pair: the remaining child nodes joined with
        single spaces, and the image specs collected from removed ``<img>`` tags.
    """
    copy = BeautifulSoup(str(el), "html.parser")
    root = copy.find(el.name)
    if not root:
        return "", []

    for junk in root.find_all(["script", "style", "noscript"]):
        junk.decompose()

    # The first heading is dropped from the body; the Webflow heading prop carries it.
    heading = root.find(["h1", "h2", "h3", "h4"])
    if heading:
        heading.decompose()

    removed: List[Dict[str, Any]] = []
    if strip_images:
        for image in root.find_all("img"):
            spec = image_spec_from_img(image)
            if spec:
                removed.append(spec)
            image.decompose()
        for wrapper in root.find_all(["picture", "source"]):
            if wrapper.find("img"):
                continue
            wrapper.decompose()

    body = " ".join(map(str, root.contents)).strip()
    return body, removed


def page_path(url: str) -> str:
    """Return the URL's path without trailing slashes, or ``"/"`` when nothing is left."""
    trimmed = urlparse(url).path.rstrip("/")
    if trimmed:
        return trimmed
    return "/"


def resolve_page_id(url: str, path_to_page: Dict[str, Dict[str, Any]]) -> Optional[str]:
    """Look up the Webflow page id for *url* by its normalised path; ``None`` if unmapped."""
    absolute = urljoin(BASE, url)
    entry = path_to_page.get(page_path(absolute))
    if not entry:
        return None
    return entry.get("webflowPageId")


def extract_gateway(el: Tag, path_to_page: Dict[str, Dict[str, Any]], child_no: int) -> List[Dict[str, Any]]:
    """Collect the unique on-site links of a gateway block.

    Each link's title is its anchor text. When that text is empty or a generic
    "learn more"/"read more" label, the first heading of the nearest enclosing
    ``article``/``div``/``li`` is used instead, if there is one. Links whose
    resolved URL does not contain ``BASE`` and links without a title are
    skipped.

    Returns:
        One dict per unique ``(title, url)`` pair with ``title``, ``url``,
        ``pageId`` and ``sourceChildNumber`` keys.
    """
    generic_labels = {"learn more", "read more"}
    collected: List[Dict[str, Any]] = []
    recorded = set()
    for anchor in el.find_all("a", href=True):
        target = urljoin(BASE, anchor.get("href"))
        label = clean_text(anchor.get_text(" ", strip=True))
        if not label or label.lower() in generic_labels:
            # Try nearest card heading.
            card = anchor.find_parent(["article", "div", "li"])
            if card:
                label = first_heading(card) or label
        if not label:
            continue
        identity = (label, target)
        if identity in recorded or BASE not in target:
            continue
        collected.append({
            "title": label,
            "url": target,
            "pageId": resolve_page_id(target, path_to_page),
            "sourceChildNumber": child_no,
        })
        recorded.add(identity)
    return collected


def extract_downloads(el: Tag, child_no: int) -> Dict[str, Any]:
    """Extract the document links (PDF and Office files) of a downloads block.

    A link counts as a document when its resolved URL has a ``pdf``, ``doc(x)``,
    ``xls(x)`` or ``ppt(x)`` extension followed by the end of the URL, a query
    string or a fragment. Fragments are accepted so that links such as
    ``report.pdf#page=2`` are not dropped.

    Args:
        el: The downloads block element.
        child_no: Position of the block among the page's top-level sections.

    Returns:
        A dict with the block ``heading``, the list of ``files`` (``label``,
        ``url``, ``file`` and ``sourceChildNumber`` per file) and
        ``sourceChildNumber``.
    """
    heading = first_heading(el)
    files = []
    for a in el.find_all("a", href=True):
        href = urljoin(BASE, a.get("href"))
        if not re.search(r"\.(pdf|docx?|xlsx?|pptx?)(\?|#|$)", href, re.I):
            continue
        filename = filename_from_url(href)
        files.append({
            # Fall back to the file name when the anchor has no visible text.
            "label": clean_text(a.get_text(" ", strip=True)) or filename,
            "url": href,
            "file": filename,
            "sourceChildNumber": child_no,
        })
    return {"heading": heading, "files": files, "sourceChildNumber": child_no}


def extract_accordion(el: Tag, child_no: int) -> Dict[str, Any]:
    """Extract the heading and items of an accordion block.

    Items come from ``.accordion__item``, ``.accordion-item`` or ``<details>``
    elements when any exist. Otherwise every ``h3``/``h4`` starts an item whose
    body is the run of sibling elements up to the next ``h3``/``h4``.

    An item's heading is its first heading, else its ``<summary>`` text, else
    ``"Item N"``. The original expression ``A or B if C else D`` parsed as
    ``(A or B) if C else D``, so items with a heading but no ``<summary>`` were
    labelled "Item N".

    Args:
        el: The accordion block element.
        child_no: Position of the block among the page's top-level sections.

    Returns:
        A dict with the block ``heading``, its ``items`` (``heading``, ``html``,
        ``text`` and ``sourceChildNumber`` per item) and ``sourceChildNumber``.
    """
    heading = first_heading(el)
    items: List[Dict[str, Any]] = []
    # KRB accordions tend to include item wrappers; fall back to headings inside block.
    candidates = el.select(".accordion__item, .accordion-item, details")
    if candidates:
        for idx, item in enumerate(candidates, 1):
            summary = item.find("summary")
            summary_text = clean_text(summary.get_text(" ", strip=True)) if summary else ""
            items.append({
                "heading": first_heading(item) or summary_text or f"Item {idx}",
                "html": html_content_area(item)[0],
                "text": text_without_heading(item),
                "sourceChildNumber": child_no,
            })
    else:
        for h in el.find_all(["h3", "h4"]):
            parts = []
            for sib in h.find_next_siblings():
                if isinstance(sib, Tag) and sib.name in ["h3", "h4"]:
                    break
                if isinstance(sib, Tag):
                    parts.append(str(sib))
            body_html = " ".join(parts)
            items.append({
                "heading": clean_text(h.get_text(" ", strip=True)),
                "html": body_html,
                "text": clean_text(BeautifulSoup(body_html, "html.parser").get_text(" ", strip=True)),
                "sourceChildNumber": child_no,
            })
    return {"heading": heading, "items": items, "sourceChildNumber": child_no}


def extract_page(page: Dict[str, Any], html: str, path_to_page: Dict[str, Dict[str, Any]]) -> Tuple[str, Dict[str, Any]]:
    """Classify the top-level sections of one page into importer buckets.

    The direct children of ``<main>`` (or ``<body>``, or the document) are
    numbered from 1 and dispatched on their CSS classes. Sections with neither
    text nor images are skipped. Sections of an unknown class are kept in
    ``unmatchedSections`` and flagged in ``attention``.

    Args:
        page: Batch entry; ``url`` is required, ``title``, ``webflowPageId``,
            ``path`` and ``nextPages`` are optional.
        html: Raw HTML of the page.
        path_to_page: Map from normalised page path to batch entry, used to
            resolve gateway links to Webflow page ids.

    Returns:
        ``(page_title, result)``. The title is ``page["title"]`` when given,
        otherwise the text of ``<title>`` before the first ``-``, otherwise the
        last segment of the URL path.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.select_one("main") or soup.body or soup
    top_children = [c for c in main.find_all(recursive=False) if isinstance(c, Tag)]
    result: Dict[str, Any] = {
        "sourceUrl": page["url"],
        "textContent": [],
        "twoColumn": [],
        "gatewayLinks": [],
        "accordions": [],
        "downloads": [],
        "galleries": [],
        "unmatchedSections": [],
        "attention": [],
        "hero": {"heading": "", "images": []},
        "meta": {"id": page.get("webflowPageId"), "path": page.get("path") or page_path(page["url"])},
        "nextPages": page.get("nextPages") or DEFAULT_NEXT_PAGES,
    }

    page_title = page.get("title")
    if not page_title:
        # <title>.string is None for an empty title or one with nested markup,
        # which used to raise AttributeError; get_text() always returns a str.
        title_text = soup.title.get_text() if soup.title else ""
        page_title = title_text.split("-")[0].strip() or page_path(page["url"]).rsplit("/", 1)[-1]

    # Rich-text block classes and the attention note used when images are stripped.
    rich_issues = {
        "intro": "image stripped from rich intro",
        "mixed": "image stripped from rich content",
    }

    for child_no, child in enumerate(top_children, 1):
        cls = set(classes(child))
        text = clean_text(child.get_text(" ", strip=True))
        if not text and not find_images(child):
            continue
        if "breadcrumbs" in cls:
            continue
        if "page-header" in cls:
            result["hero"] = {
                "heading": first_heading(child) or page_title,
                "images": [img["sourceUrl"] for img in find_images(child)],
                "sourceChildNumber": child_no,
            }
            continue
        # "intro" takes precedence over "mixed" when a section carries both.
        rich_kind = next((kind for kind in ("intro", "mixed") if kind in cls), None)
        if rich_kind:
            heading = first_heading(child)
            html_body, stripped = html_content_area(child, strip_images=True)
            result["textContent"].append({
                "source": rich_kind,
                "heading": heading,
                "text": text_without_heading(child),
                "html": html_body,
                "sourceChildNumber": child_no,
                "imagesStripped": stripped,
            })
            if stripped:
                result["attention"].append({
                    "severity": "note",
                    "issue": rich_issues[rich_kind],
                    "location": f"source child {child_no}",
                    "sourceChildNumber": child_no,
                    "count": len(stripped),
                })
            continue
        if "text-image" in cls:
            heading = first_heading(child)
            html_body, stripped = html_content_area(child, strip_images=True)
            images = find_images(child)
            result["twoColumn"].append({
                "heading": heading,
                "text": text_without_heading(child),
                "html": html_body,
                "images": images,
                "sourceChildNumber": child_no,
                "imagesStripped": stripped,
            })
            if not images:
                result["attention"].append({
                    "severity": "warning",
                    "issue": "two-column block has no image",
                    "location": f"source child {child_no}",
                    "sourceChildNumber": child_no,
                })
            continue
        if "gateway-rows" in cls:
            result["gatewayLinks"].extend(extract_gateway(child, path_to_page, child_no))
            continue
        if "downloads" in cls:
            result["downloads"].append(extract_downloads(child, child_no))
            continue
        if "accordion" in cls:
            result["accordions"].append(extract_accordion(child, child_no))
            continue
        if "gallery" in cls or "slider" in cls:
            result["galleries"].append({
                "heading": first_heading(child),
                "images": find_images(child),
                "sourceChildNumber": child_no,
            })
            continue
        if cls & IGNORE_CLASSES:
            continue
        # Preserve unknowns with a small HTML snapshot; don't lose content.
        result["unmatchedSections"].append({
            "sourceChildNumber": child_no,
            "classes": classes(child),
            "text": text[:2000],
            "htmlSnippet": str(child)[:5000],
        })
        result["attention"].append({
            "severity": "manual",
            "issue": "unmatched source section",
            "location": f"source child {child_no}",
            "sourceChildNumber": child_no,
            "classes": classes(child),
        })

    return page_title, result


def load_batch(path: Optional[str]) -> List[Dict[str, Any]]:
    """Load the list of pages to scrape.

    The file is decoded as UTF-8 (a leading byte-order mark is tolerated)
    rather than with the platform default encoding, so non-ASCII titles load
    the same way on every machine.

    Args:
        path: Path to a batch JSON file, or a falsy value to use
            ``DEFAULT_PAGES``.

    Returns:
        The top-level JSON list, or the ``pages`` value of a top-level object.

    Raises:
        SystemExit: If the JSON is neither a list nor an object with ``pages``.
    """
    if not path:
        return DEFAULT_PAGES
    data = json.loads(Path(path).read_text(encoding="utf-8-sig"))
    if isinstance(data, dict) and "pages" in data:
        return data["pages"]
    if isinstance(data, list):
        return data
    raise SystemExit("Batch JSON must be a list or object with pages[]")


def write_attention_csv(data: Dict[str, Any], path: Path) -> None:
    """Flatten every page's ``attention`` entries into one CSV report.

    The file is written as UTF-8, like the JSON output, instead of the
    platform default encoding. The parent directory is created if missing.

    Args:
        data: Map from page name to extracted page dict; pages without an
            ``attention`` list contribute no rows.
        path: Destination CSV path.
    """
    fieldnames = ["page", "severity", "issue", "location", "sourceChildNumber", "classes"]
    rows = [
        {
            "page": page_name,
            "severity": item.get("severity", "note"),
            "issue": item.get("issue", ""),
            "location": item.get("location", ""),
            "sourceChildNumber": item.get("sourceChildNumber", ""),
            # A missing or None class list becomes an empty cell.
            "classes": " ".join(item.get("classes", []) or []),
        }
        for page_name, page in data.items()
        for item in page.get("attention", [])
    ]
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main(argv: Optional[List[str]] = None) -> int:
    """Scrape the batch of pages and write the raw HTML, JSON and attention CSV.

    For each page the HTML is fetched, saved under ``--raw-dir`` and passed to
    ``extract_page``. The combined result is written to ``--out`` and the
    attention entries to ``--attention-csv``. A failed request aborts the run
    via ``raise_for_status``.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--batch-json", help="Optional explicit batch JSON. List items: title,url,webflowPageId,path,nextPages")
    ap.add_argument("--out", default=str(OUT_DIR / "krb-scrape-normalized.json"))
    ap.add_argument("--attention-csv", default=str(OUT_DIR / "krb-scrape-attention.csv"))
    ap.add_argument("--raw-dir", default=str(RAW_DIR))
    ap.add_argument("--timeout", type=int, default=30)
    args = ap.parse_args(argv)

    pages = load_batch(args.batch_json)
    path_to_page = {page_path(p.get("url") or p.get("path") or ""): p for p in pages}
    raw_dir = Path(args.raw_dir)
    raw_dir.mkdir(parents=True, exist_ok=True)

    output: Dict[str, Any] = {}
    # Using the session as a context manager closes its pooled connections
    # even when a request raises part-way through the batch.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0 IGNITE KRB migration scraper"})
        for page in pages:
            url = page["url"]
            resp = session.get(url, timeout=args.timeout)
            resp.raise_for_status()
            title = page.get("title") or slugish(page_path(url))
            raw_path = raw_dir / f"{slugish(title)}.html"
            raw_path.write_text(resp.text, encoding="utf-8")
            page_name, extracted = extract_page(page, resp.text, path_to_page)
            output[title or page_name] = extracted
            print(f"scraped {title or page_name}: text={len(extracted['textContent'])} twoColumn={len(extracted['twoColumn'])} downloads={len(extracted['downloads'])} unknown={len(extracted['unmatchedSections'])}")

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8")
    # The CSV may live in a different directory from --out, so create its parent too.
    attention_path = Path(args.attention_csv)
    attention_path.parent.mkdir(parents=True, exist_ok=True)
    write_attention_csv(output, attention_path)
    print(f"wrote {out_path}")
    print(f"wrote {args.attention_csv}")
    return 0


# Run the scraper when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
