#!/usr/bin/env python3
from __future__ import annotations

import csv
import hashlib
import html
import os
import re
import sys
import time
import urllib.parse
from pathlib import Path
from xml.etree import ElementTree as ET

import requests

# Site being crawled and the sitemap used to enumerate its pages.
BASE = 'https://www.krb.nsw.edu.au'
SITEMAP = BASE + '/sitemap.xml'
# NOTE(review): the output root is a hard-coded absolute path under one user's
# home directory; confirm this is intended before running on another machine.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound/krb-image-grab')
# Every downloaded image is written here.
DOWNLOADS = OUT / 'static-assets'
# Copies of images flagged by is_page_candidate() are written here.
PAGE_IMAGES = OUT / 'page-images'
# CSV with one row per (page, image) occurrence.
MANIFEST = OUT / 'krb-static-image-downloads.csv'
# CSV with the subset of rows flagged as page-image candidates.
PAGE_MANIFEST = OUT / 'krb-page-image-candidates.csv'
# Sitemap URLs whose first path segment is in either set are excluded from the
# crawl (see page_urls_from_sitemap).
CMS_SEGMENTS = {'news', 'alumnae-stories', 'principals-blog', 'announcement'}
SKIP_SEGMENTS = {'krb-test-page', 'test-page', 'test-blocks-page'}
# A URL is treated as an image only if its path ends with one of these
# (compared case-insensitively in clean_url).
IMAGE_EXTS = ('.jpg', '.jpeg', '.png', '.webp', '.gif', '.svg')

# Keep this conservative. Any image on a static page can remain in static-assets;
# page-images is only a convenience copy for likely reusable page-level imagery.
PAGE_IMAGE_HINTS = re.compile(r'(hero|header|banner|cta|discover|tile|card|feature|masthead|intro)', re.I)
# Absolute http(s) URLs containing /app/uploads/ or /wp-content/uploads/, ending
# at whitespace, a quote, an angle bracket or a closing parenthesis.
UPLOADS_RE = re.compile(r'https?://[^\s"\'<>\)]+/(?:app|wp-content)/uploads/[^\s"\'<>\)]+', re.I)
# Separator between srcset candidates: a comma with optional surrounding whitespace.
SRCSET_SPLIT_RE = re.compile(r'\s*,\s*')
# Quoted values of URL-bearing attributes; group 2 is the value. re.S lets a
# value span several lines.
ATTR_RE = re.compile(r'''(?:src|href|data-src|data-bg|data-background|data-lazy-src|poster|srcset)\s*=\s*(["'])(.*?)\1''', re.I | re.S)
# CSS url(...) references, optionally quoted; group 2 is the URL.
URL_FUNC_RE = re.compile(r'''url\((['"]?)(.*?)\1\)''', re.I)

# One shared HTTP session so every request sends the same User-Agent.
session = requests.Session()
session.headers.update({'User-Agent': 'IGNITE-KRB-Webflow-image-migration/1.0'})


def clean_url(u: str, base_url: str) -> str | None:
    """Normalise a raw image reference into an absolute, query-free URL.

    Args:
        u: Raw reference as scraped from markup; may be relative and may
            contain HTML entities.
        base_url: URL of the page the reference was found on, used to resolve
            relative references.

    Returns:
        The absolute http(s) URL with params, query and fragment removed, or
        ``None`` when the reference is empty, a ``data:``/``blob:`` URI,
        malformed, not http(s), or does not end in one of ``IMAGE_EXTS``.
    """
    u = html.unescape(u.strip())
    if not u or u.startswith(('data:', 'blob:')):
        return None
    try:
        parsed = urllib.parse.urlparse(urllib.parse.urljoin(base_url, u))
    except ValueError:
        # urllib raises on malformed netlocs (e.g. unbalanced IPv6 brackets);
        # a single bad reference must not abort the whole crawl.
        return None
    if parsed.scheme not in ('http', 'https'):
        return None
    if not parsed.path.lower().endswith(IMAGE_EXTS):
        return None
    # Drop params/query/fragment so the same asset always maps to one identity.
    return urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))


def srcset_urls(value: str, base_url: str) -> list[str]:
    """Extract the cleaned image URLs from a ``srcset``-style attribute value.

    Args:
        value: Raw attribute value: comma-separated candidates, each a URL
            optionally followed by a width/density descriptor.
        base_url: Page URL used to resolve relative candidates.

    Returns:
        Cleaned URLs in order of appearance; candidates rejected by
        ``clean_url`` are omitted. Duplicates are not removed here.
    """
    found: list[str] = []
    for part in SRCSET_SPLIT_RE.split(html.unescape(value)):
        # split() with no argument separates on any whitespace, so a URL and
        # its descriptor divided by a tab or newline are still told apart.
        tokens = part.split()
        if not tokens:
            continue
        cu = clean_url(tokens[0], base_url)
        if cu:
            found.append(cu)
    return found


def page_urls_from_sitemap() -> list[str]:
    """Fetch the sitemap and return the URLs of pages to crawl.

    Every ``<loc>`` entry is read from ``SITEMAP``. URLs whose first path
    segment is in ``CMS_SEGMENTS`` or ``SKIP_SEGMENTS`` are excluded. Both
    lists are also written under ``OUT`` (``krb-static-pages.txt`` and
    ``krb-excluded-cms-pages.txt``); ``OUT`` must already exist.

    Returns:
        The non-excluded URLs, in sitemap order.

    Raises:
        requests.HTTPError: If the sitemap request returns an error status.
        xml.etree.ElementTree.ParseError: If the sitemap is not valid XML.
    """
    r = session.get(SITEMAP, timeout=30)
    r.raise_for_status()
    # Parse the raw bytes so the XML parser applies the document's own
    # encoding declaration rather than requests' guess at the charset.
    root = ET.fromstring(r.content)
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    urls = [
        e.text.strip()
        for e in root.findall('.//sm:loc', ns)
        if e.text and e.text.strip()
    ]
    static: list[str] = []
    excluded: list[str] = []
    for u in urls:
        path = urllib.parse.urlparse(u).path.strip('/')
        first = path.split('/')[0] if path else ''
        if first in CMS_SEGMENTS or first in SKIP_SEGMENTS:
            excluded.append(u)
        else:
            static.append(u)
    (OUT / 'krb-static-pages.txt').write_text('\n'.join(static) + '\n', encoding='utf-8')
    (OUT / 'krb-excluded-cms-pages.txt').write_text('\n'.join(excluded) + '\n', encoding='utf-8')
    return static


def extract_images(page_url: str, body: str) -> list[tuple[str, str]]:
    """Collect the distinct image URLs referenced in a page's HTML.

    Three passes are made over ``body``: absolute uploads URLs anywhere in the
    text, URL-bearing attributes (including ``srcset`` lists), and CSS
    ``url(...)`` references.

    Args:
        page_url: URL of the page, used to resolve relative references.
        body: Raw HTML of the page.

    Returns:
        ``(image_url, source)`` pairs in first-seen order, one per distinct
        URL. ``source`` is ``'uploads-url'``, the lower-cased attribute name,
        or ``'css-url'``, taken from the first pass that found the URL.
    """
    candidates: list[tuple[str, str]] = []

    for m in UPLOADS_RE.finditer(body):
        # Entity-encoded delimiters (&quot;, &#39;, &lt; ...) are not excluded
        # by the regex, so decode first and cut at the first real delimiter.
        raw = re.split(r'''[\s"'<>)]''', html.unescape(m.group(0)), maxsplit=1)[0]
        cu = clean_url(raw, page_url)
        if cu:
            candidates.append((cu, 'uploads-url'))

    for m in ATTR_RE.finditer(body):
        attr = m.group(0).split('=', 1)[0].strip().lower()
        val = m.group(2)
        # A value holding both a space and a comma is treated as a srcset
        # list even when the attribute is not literally "srcset".
        if attr == 'srcset' or (' ' in val and ',' in val):
            candidates.extend((cu, attr) for cu in srcset_urls(val, page_url))
        else:
            cu = clean_url(val, page_url)
            if cu:
                candidates.append((cu, attr))

    for m in URL_FUNC_RE.finditer(body):
        # Inside an HTML style attribute the CSS quotes are often written as
        # entities, e.g. url(&quot;...&quot;); decode and strip them.
        raw = html.unescape(m.group(2)).strip().strip('\'"')
        cu = clean_url(raw, page_url)
        if cu:
            candidates.append((cu, 'css-url'))

    # Keep first-seen order, one entry per URL with its first source.
    first_source: dict[str, str] = {}
    for u, source in candidates:
        first_source.setdefault(u, source)
    return list(first_source.items())


def safe_filename(url: str) -> str:
    """Return a filesystem-friendly file name derived from *url*.

    The last segment of the percent-decoded URL path is used. Each run of
    characters other than ASCII letters, digits, ``.``, ``_``, space and ``-``
    becomes a single ``-``, and leading/trailing spaces and dots are trimmed.
    If nothing is left, the MD5 hex digest of the URL is returned instead.
    """
    decoded_path = urllib.parse.unquote(urllib.parse.urlparse(url).path)
    basename = Path(decoded_path).name
    sanitised = re.sub(r'[^A-Za-z0-9._ -]+', '-', basename).strip(' .')
    if sanitised:
        return sanitised
    return hashlib.md5(url.encode()).hexdigest()


def unique_path(folder: Path, filename: str, url: str) -> Path:
    """Choose where inside *folder* the asset at *url* should be saved.

    Returns ``folder / filename`` when that path is free. If a file already
    exists there, the first eight hex characters of the URL's MD5 digest are
    appended to the stem (``stem__digest.ext``) so a different URL sharing the
    same file name does not overwrite it.
    """
    candidate = folder / filename
    if candidate.exists():
        digest = hashlib.md5(url.encode()).hexdigest()[:8]
        candidate = folder / f'{candidate.stem}__{digest}{candidate.suffix}'
    return candidate


def is_page_candidate(url: str, page_url: str, source: str, body: str) -> bool:
    """Decide whether an image looks like reusable page-level imagery.

    An image qualifies when ``PAGE_IMAGE_HINTS`` matches its sanitised file
    name, the ``source`` label it was found under, or the HTML within 500
    characters either side of the first occurrence of ``url`` in ``body``.

    Args:
        url: Cleaned absolute image URL.
        page_url: URL of the page the image was found on. Not used by the
            current heuristic; kept so the call signature stays stable.
        source: Label describing how the image was found (attribute name,
            ``'uploads-url'`` or ``'css-url'``).
        body: Raw HTML of the page.

    Returns:
        ``True`` if any hint matches, otherwise ``False``.
    """
    if PAGE_IMAGE_HINTS.search(safe_filename(url)) or PAGE_IMAGE_HINTS.search(source):
        return True
    # If the image URL appears near a page-component hint in the HTML, flag it.
    # Only the first literal occurrence of the cleaned URL is inspected.
    idx = body.find(url)
    if idx < 0:
        return False
    window = body[max(0, idx - 500): idx + 500]
    return bool(PAGE_IMAGE_HINTS.search(window))


def _write_manifest(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write *rows* to *path* as a UTF-8 CSV file with a header row."""
    # Explicit UTF-8 so a non-ASCII URL cannot fail under a narrow locale
    # encoding after the whole crawl has already finished.
    with path.open('w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Crawl the static pages, download their images and write the manifests.

    For every page returned by ``page_urls_from_sitemap`` the HTML is fetched,
    image URLs are extracted, and each URL not yet downloaded in this run is
    saved under ``DOWNLOADS``. Images flagged by ``is_page_candidate`` are
    also copied to ``PAGE_IMAGES``. One CSV row is recorded per (page, image)
    occurrence. Page and image fetch failures are reported and skipped rather
    than aborting the run.
    """
    for folder in (OUT, DOWNLOADS, PAGE_IMAGES):
        folder.mkdir(parents=True, exist_ok=True)

    pages = page_urls_from_sitemap()
    rows: list[dict[str, str]] = []
    page_rows: list[dict[str, str]] = []
    url_to_file: dict[str, Path] = {}
    # (md5, content_type) of each successful download, so repeat occurrences
    # of a URL carry the same metadata as the first one.
    url_meta: dict[str, tuple[str, str]] = {}

    for i, page in enumerate(pages, 1):
        print(f'[{i}/{len(pages)}] {page}', flush=True)
        try:
            resp = session.get(page, timeout=30)
            resp.raise_for_status()
        except Exception as e:  # best-effort crawl: report and move on
            print(f'  page error: {e}', file=sys.stderr)
            continue
        body = resp.text

        for img_url, source in extract_images(page, body):
            filename = safe_filename(img_url)
            size = md5 = content_type = ''
            p = url_to_file.get(img_url)
            if p is None:
                p = unique_path(DOWNLOADS, filename, img_url)
                try:
                    ir = session.get(img_url, timeout=45)
                    ir.raise_for_status()
                    content = ir.content
                    p.write_bytes(content)
                except Exception as e:  # best-effort: record the failure in the row
                    status = f'error: {e}'
                else:
                    md5 = hashlib.md5(content).hexdigest()
                    size = str(len(content))
                    content_type = ir.headers.get('content-type', '')
                    status = 'downloaded'
                    url_to_file[img_url] = p
                    url_meta[img_url] = (md5, content_type)
            else:
                status = 'duplicate-url'
                if p.exists():
                    size = str(p.stat().st_size)
                md5, content_type = url_meta.get(img_url, ('', ''))

            row = {
                'page_url': page,
                'image_url': img_url,
                'source': source,
                'filename': filename,
                'download_path': str(p),
                'status': status,
                'bytes': size,
                'md5': md5,
                'content_type': content_type,
            }
            rows.append(row)

            if is_page_candidate(img_url, page, source, body):
                target = PAGE_IMAGES / p.name
                if p.exists() and not target.exists():
                    try:
                        target.write_bytes(p.read_bytes())
                    except OSError as e:
                        # The copy is a convenience only; report and continue.
                        print(f'  page-image copy failed for {p.name}: {e}', file=sys.stderr)
                prow = dict(row)
                prow['page_images_path'] = str(target)
                page_rows.append(prow)
        time.sleep(0.1)

    fieldnames = ['page_url', 'image_url', 'source', 'filename', 'download_path',
                  'status', 'bytes', 'md5', 'content_type']
    _write_manifest(MANIFEST, fieldnames, rows)
    _write_manifest(PAGE_MANIFEST, fieldnames + ['page_images_path'], page_rows)

    unique_downloaded = len(url_to_file)
    print('\nDONE')
    print(f'static pages crawled: {len(pages)}')
    print(f'image occurrences: {len(rows)}')
    print(f'unique image urls downloaded: {unique_downloaded}')
    print(f'page image candidates: {len({r["image_url"] for r in page_rows})}')
    print(f'out: {OUT}')

if __name__ == '__main__':
    main()
