#!/usr/bin/env python3
from __future__ import annotations

import csv
import hashlib
import io
import os
import re
import shutil
import sys
import time
import urllib.parse
from pathlib import Path
from xml.etree import ElementTree as ET

import requests
from PIL import Image, ImageOps

# Site being crawled and the sitemap that seeds the list of pages.
BASE_URL = 'https://www.krb.nsw.edu.au'
SITEMAP = BASE_URL + '/sitemap.xml'
# Output layout. OUT is a hard-coded absolute path; everything else hangs off it.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound/krb-image-grab')
# IMPORT_DIR is deleted and rebuilt by main() on every run.
IMPORT_DIR = OUT / 'webflow-import-assets-2000px'
PAGE_DIR = IMPORT_DIR / 'Page images'
OTHER_DIR = IMPORT_DIR / 'Other static images'
# Cache of unmodified downloads; download_raw() reuses files already present here.
RAW_DIR = OUT / '_tmp-raw-downloads'
# CSV manifests and plain-text page listings written by the run.
PAGE_MANIFEST = OUT / 'krb-webflow-page-images-hero-only-2000px.csv'
OTHER_MANIFEST = OUT / 'krb-webflow-other-static-images-2000px.csv'
STATIC_PAGES_TXT = OUT / 'krb-static-pages.txt'
EXCLUDED_TXT = OUT / 'krb-excluded-cms-pages.txt'

# Sitemap URLs whose FIRST path segment is in either set are excluded from the crawl.
CMS_SEGMENTS = {'news', 'alumnae-stories', 'principals-blog', 'announcement'}
SKIP_SEGMENTS = {'krb-test-page', 'test-page', 'test-blocks-page'}
# Only URLs whose path ends with one of these extensions are treated as images.
IMAGE_EXTS = ('.jpg', '.jpeg', '.png', '.webp', '.gif', '.svg')
# Absolute http(s) URLs containing /app/uploads/ or /wp-content/uploads/ anywhere in the page text.
UPLOADS_RE = re.compile(r'https?://[^\s"\'<>\)]+/(?:app|wp-content)/uploads/[^\s"\'<>\)]+', re.I)
# Quoted values of URL-bearing HTML attributes; group 2 is the attribute value.
ATTR_RE = re.compile(r'''(?:src|href|data-src|data-bg|data-background|data-lazy-src|poster|srcset)\s*=\s*(["'])(.*?)\1''', re.I | re.S)
# CSS url(...) references, optionally quoted; group 2 is the URL.
URL_FUNC_RE = re.compile(r'''url\((['"]?)(.*?)\1\)''', re.I)
# Splits a srcset value into its comma-separated entries.
SRCSET_SPLIT_RE = re.compile(r'\s*,\s*')
# A "-<width>x<height>" suffix immediately before the file extension (e.g. "-1024x768.jpg").
WP_SIZE_RE = re.compile(r'-(\d{2,5})x(\d{2,5})(?=\.[A-Za-z0-9]+$)')
# Filenames matching this are never chosen as a hero image (UI chrome, icons, social badges).
SKIP_IMAGE_RE = re.compile(r'(logo|icon|favicon|sprite|placeholder|loading|spinner|arrow|chevron|social|facebook|instagram|linkedin|youtube)', re.I)
# Words that raise a candidate's hero score when found in its filename or nearby markup.
HERO_HINT_RE = re.compile(r'(hero|banner|header|masthead|main-visual|page-title|page-header)', re.I)
# Longest allowed side, in pixels, of an exported raster image.
MAX_SIDE = 2000

# One shared HTTP session so every request carries the same User-Agent.
session = requests.Session()
session.headers.update({'User-Agent': 'IGNITE-KRB-Webflow-image-migration/2.0'})


def page_urls_from_sitemap() -> list[str]:
    """Fetch the sitemap and return the URLs of the static pages to crawl.

    Every ``<loc>`` entry is classified by the first segment of its URL path:
    entries under ``CMS_SEGMENTS`` or ``SKIP_SEGMENTS`` are excluded, the rest
    are kept. Both lists are also written to ``STATIC_PAGES_TXT`` and
    ``EXCLUDED_TXT`` (one URL per line).

    Returns:
        The static page URLs, in sitemap order.

    Raises:
        requests.HTTPError: if the sitemap request returns an error status.
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
    """
    r = session.get(SITEMAP, timeout=30)
    r.raise_for_status()
    # Parse the raw bytes so the XML parser applies the document's own
    # encoding declaration. Round-tripping through r.text would re-encode
    # using whatever charset requests guessed from the HTTP headers, which
    # can corrupt non-ASCII URLs.
    root = ET.fromstring(r.content)
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    # Skip empty and whitespace-only <loc> elements.
    urls = [e.text.strip() for e in root.findall('.//sm:loc', ns) if e.text and e.text.strip()]
    static, excluded = [], []
    for u in urls:
        path = urllib.parse.urlparse(u).path.strip('/')
        # First path segment; '' for the site root.
        first = path.split('/', 1)[0]
        if first in CMS_SEGMENTS or first in SKIP_SEGMENTS:
            excluded.append(u)
        else:
            static.append(u)
    # Make sure the output root exists even when this is called on its own.
    OUT.mkdir(parents=True, exist_ok=True)
    STATIC_PAGES_TXT.write_text('\n'.join(static) + '\n', encoding='utf-8')
    EXCLUDED_TXT.write_text('\n'.join(excluded) + '\n', encoding='utf-8')
    return static


def clean_url(u: str, base_url: str) -> str | None:
    import html
    u = html.unescape(u.strip())
    if not u or u.startswith(('data:', 'blob:')):
        return None
    u = urllib.parse.urljoin(base_url, u)
    parsed = urllib.parse.urlparse(u)
    if not parsed.scheme.startswith('http'):
        return None
    if not parsed.path.lower().endswith(IMAGE_EXTS):
        return None
    return urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))


def srcset_urls(value: str, base_url: str) -> list[str]:
    """Return the cleaned image URLs listed in a ``srcset``-style attribute value.

    The value is split on commas; the first whitespace-separated token of each
    entry is taken as the URL (any width/density descriptor is ignored) and
    passed through ``clean_url``. Entries that do not clean to an image URL
    are dropped.
    """
    resolved = []
    for entry in SRCSET_SPLIT_RE.split(value):
        tokens = entry.split()
        if not tokens:
            continue
        cleaned = clean_url(tokens[0], base_url)
        if cleaned:
            resolved.append(cleaned)
    return resolved


def extract_images(page_url: str, body: str) -> list[dict]:
    """Collect the unique image URLs referenced in a page's HTML.

    Three passes are made over ``body``: URL-bearing HTML attributes, CSS
    ``url(...)`` references, and bare uploads-directory URLs. Each hit is
    recorded as ``{'url', 'source', 'index'}`` where ``index`` is the match's
    character offset in ``body``.

    Returns:
        Candidates ordered by offset, keeping only the first occurrence of
        each URL.
    """
    found = []

    def record(url, source, position):
        # clean_url returns None for anything that is not a usable image URL.
        if url:
            found.append({'url': url, 'source': source, 'index': position})

    for match in ATTR_RE.finditer(body):
        attr_name = match.group(0).split('=', 1)[0].strip().lower()
        value = match.group(2)
        # A value containing both a comma and a space is parsed as a srcset
        # list even when the attribute is not literally "srcset".
        looks_like_srcset = attr_name == 'srcset' or (',' in value and ' ' in value)
        if looks_like_srcset:
            resolved = srcset_urls(value, page_url)
        else:
            resolved = [clean_url(value, page_url)]
        for url in resolved:
            record(url, attr_name, match.start())

    for match in URL_FUNC_RE.finditer(body):
        record(clean_url(match.group(2), page_url), 'css-url', match.start())

    for match in UPLOADS_RE.finditer(body):
        record(clean_url(match.group(0), page_url), 'uploads-url', match.start())

    # The sort is stable, so hits at the same offset keep their pass order.
    found.sort(key=lambda item: item['index'])
    first_by_url = {}
    for item in found:
        first_by_url.setdefault(item['url'], item)
    return list(first_by_url.values())


def strip_wp_size(url: str) -> str:
    """Remove a ``-<width>x<height>`` suffix from the filename in ``url``.

    Returns ``url`` unchanged when no such suffix precedes the extension.
    Otherwise returns a rebuilt URL (scheme, host and re-quoted path only)
    with the suffix removed.
    """
    parts = urllib.parse.urlparse(url)
    decoded_path = urllib.parse.unquote(parts.path)
    trimmed_path, removed = WP_SIZE_RE.subn('', decoded_path)
    if not removed:
        return url
    requoted = urllib.parse.quote(trimmed_path, safe='/._-')
    return urllib.parse.urlunparse((parts.scheme, parts.netloc, requoted, '', '', ''))


def probe_ok(url: str) -> bool:
    """Return True when ``url`` answers with HTTP 200.

    A HEAD request (following redirects) is tried first. If it does not
    return 200, a streamed GET is tried and closed without reading the body.
    Any exception raised along the way counts as a failed probe.
    """
    try:
        head_resp = session.head(url, timeout=15, allow_redirects=True)
        if head_resp.status_code == 200:
            return True
        # HEAD did not return 200: fall back to GET.
        get_resp = session.get(url, timeout=20, stream=True)
        reachable = get_resp.status_code == 200
        get_resp.close()
        return reachable
    except Exception:
        return False


# Memo of input URL -> canonical URL. canonical_url() is called for the same
# URL many times per run; caching avoids repeating the network probe and keeps
# the answer identical on every call even if the server responds erratically.
_CANONICAL_CACHE: dict[str, str] = {}


def canonical_url(url: str) -> str:
    """Return the preferred URL for an image.

    If ``url`` carries a ``-<width>x<height>`` size suffix and the same URL
    without the suffix is reachable, the suffix-free URL is returned;
    otherwise ``url`` is returned unchanged. Results are memoised for the
    lifetime of the process.
    """
    cached = _CANONICAL_CACHE.get(url)
    if cached is not None:
        return cached
    stripped = strip_wp_size(url)
    # Only probe the network when stripping actually changed the URL.
    resolved = stripped if stripped != url and probe_ok(stripped) else url
    _CANONICAL_CACHE[url] = resolved
    return resolved


def safe_filename(url: str) -> str:
    """Derive a filesystem-friendly filename from the last path segment of ``url``.

    The segment is percent-decoded, each run of characters outside
    ``A-Z a-z 0-9 . _ space -`` is replaced by a single ``-``, and leading and
    trailing spaces and dots are removed. If nothing is left, the MD5 hex
    digest of ``url`` is returned instead.
    """
    url_path = urllib.parse.urlparse(url).path
    basename = Path(urllib.parse.unquote(url_path)).name
    sanitized = re.sub(r'[^A-Za-z0-9._ -]+', '-', basename).strip(' .')
    if sanitized:
        return sanitized
    return hashlib.md5(url.encode()).hexdigest()


def page_slug(page_url: str) -> str:
    """Turn a page URL into a lowercase slug used as a filename prefix.

    The site root becomes ``'home'``. Otherwise path segments are joined with
    ``__``, each run of characters outside ``a-z A-Z 0-9 _ -`` becomes a
    single ``-``, and leading and trailing ``-``/``_`` are removed.
    """
    trimmed = urllib.parse.urlparse(page_url).path.strip('/')
    if trimmed == '':
        return 'home'
    joined = trimmed.replace('/', '__')
    collapsed = re.sub(r'[^a-zA-Z0-9_-]+', '-', joined)
    return collapsed.strip('-_').lower()


def raw_path_for(url: str) -> Path:
    """Return the cache path under ``RAW_DIR`` for the raw download of ``url``.

    The filename is the first 10 hex characters of the URL's MD5 digest,
    ``__``, then the sanitised original filename, so different URLs that share
    a filename do not collide.
    """
    digest = hashlib.md5(url.encode()).hexdigest()
    return RAW_DIR / '{}__{}'.format(digest[:10], safe_filename(url))


def download_raw(url: str) -> Path:
    """Download ``url`` into the raw cache and return the cached file's path.

    A file already present at the cache path is reused without any network
    request.

    Raises:
        requests.RequestException: on connection failure or an HTTP error
            status.
        OSError: if the file cannot be written.
    """
    RAW_DIR.mkdir(parents=True, exist_ok=True)
    p = raw_path_for(url)
    if p.exists():
        return p
    r = session.get(url, timeout=60)
    r.raise_for_status()
    # Write to a sibling temp file and rename it into place. An interrupted
    # write then never leaves a truncated file at the cache path, which later
    # runs would otherwise treat as a valid cached download.
    tmp = p.with_name(p.name + '.part')
    tmp.write_bytes(r.content)
    os.replace(tmp, p)
    return p


def image_meta(path: Path) -> tuple[int, int, int]:
    """Return ``(width, height, size_in_bytes)`` for an image file.

    SVG files, and files that cannot be opened as images, report a width and
    height of 0. A file that cannot be opened and does not exist reports a
    size of 0 as well.
    """
    if path.suffix.lower() != '.svg':
        try:
            with Image.open(path) as img:
                return (img.width, img.height, path.stat().st_size)
        except Exception:
            fallback_size = path.stat().st_size if path.exists() else 0
            return (0, 0, fallback_size)
    # Vector artwork: no pixel dimensions are read.
    return (0, 0, path.stat().st_size)


def resize_to(src: Path, dest: Path) -> tuple[str, int, int, int, int]:
    """Write a web-ready copy of ``src`` to ``dest``, downscaling to fit ``MAX_SIDE``.

    SVG sources and GIF destinations are copied byte-for-byte. Other images
    are EXIF-rotated, shrunk so that neither side exceeds ``MAX_SIDE``, and
    re-encoded according to the extension of ``dest``.

    Returns:
        ``(status, original_width, original_height, final_width, final_height)``
        where ``status`` is ``'copied-svg'``, ``'copied-gif'``, ``'resized'``
        or ``'optimised-no-resize'``.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)

    if src.suffix.lower() == '.svg':
        shutil.copy2(src, dest)
        return ('copied-svg', 0, 0, 0, 0)

    with Image.open(src) as opened:
        img = ImageOps.exif_transpose(opened)
        original_w, original_h = img.width, img.height
        if max(original_w, original_h) > MAX_SIDE:
            img.thumbnail((MAX_SIDE, MAX_SIDE), Image.Resampling.LANCZOS)
        final_w, final_h = img.width, img.height

        target_ext = dest.suffix.lower()
        if target_ext == '.gif':
            # Preserve animated gifs rather than accidentally flattening them.
            shutil.copy2(src, dest)
            return ('copied-gif', original_w, original_h, original_w, original_h)

        save_kwargs = {}
        if target_ext in ('.jpg', '.jpeg'):
            if img.mode in ('RGBA', 'LA', 'P'):
                # JPEG has no alpha channel: composite onto a white canvas.
                canvas = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    img = img.convert('RGBA')
                alpha = img.split()[-1] if img.mode in ('RGBA', 'LA') else None
                canvas.paste(img, mask=alpha)
                img = canvas
            else:
                img = img.convert('RGB')
            save_kwargs = {'quality': 88, 'optimize': True, 'progressive': True}
        elif target_ext == '.png':
            save_kwargs = {'optimize': True}
        elif target_ext == '.webp':
            save_kwargs = {'quality': 88, 'method': 6}

        img.save(dest, **save_kwargs)
        if (final_w, final_h) != (original_w, original_h):
            status = 'resized'
        else:
            status = 'optimised-no-resize'
        return (status, original_w, original_h, final_w, final_h)


def unique_dest(folder: Path, filename: str) -> Path:
    """Return a path in ``folder`` for ``filename`` that does not exist yet.

    If ``folder / filename`` is free it is returned as-is. Otherwise
    ``__2``, ``__3``, ... is appended to the stem until an unused name is
    found.
    """
    candidate = folder / filename
    stem, suffix = candidate.stem, candidate.suffix
    counter = 1
    while candidate.exists():
        counter += 1
        candidate = folder / f'{stem}__{counter}{suffix}'
    return candidate


def score_hero(candidate: dict, body: str) -> int:
    """Score how likely ``candidate`` is to be the page's hero image.

    The score is the sum of independent adjustments based on the candidate's
    filename, its offset in the page, the kind of reference it came from,
    hero-like words in the surrounding markup, and its file type.

    Args:
        candidate: A dict with ``'url'``, ``'source'`` and ``'index'`` keys.
        body: The full page HTML that ``candidate['index']`` refers to.
    """
    url = candidate['url']
    source = candidate['source']
    position = candidate['index']
    filename = safe_filename(url)
    # Markup within 1200 characters either side of the reference.
    nearby_markup = body[max(0, position - 1200): position + 1200]
    suffix = Path(urllib.parse.urlparse(url).path).suffix.lower()

    # Earlier in the document scores higher.
    if position < 6000:
        position_bonus = 80
    elif position < 12000:
        position_bonus = 40
    else:
        position_bonus = 0

    adjustments = [
        # Logos, icons and similar are heavily penalised.
        -1000 if SKIP_IMAGE_RE.search(filename) else 0,
        position_bonus,
        # Background-style references.
        35 if source in {'css-url', 'data-bg', 'data-background', 'poster'} else 0,
        120 if (HERO_HINT_RE.search(nearby_markup) or HERO_HINT_RE.search(filename)) else 0,
        10 if 'srcset' in source else 0,
        # Prefer non-svg raster photos for hero.
        20 if suffix in {'.jpg', '.jpeg', '.webp', '.png'} else 0,
    ]
    return sum(adjustments)


def choose_hero(page_url: str, body: str, candidates: list[dict]) -> dict | None:
    """Pick the best hero image for a page from its extracted candidates.

    Candidates whose filename matches ``SKIP_IMAGE_RE`` are ignored. Each of
    the rest is downloaded and measured; raster images narrower than 600px or
    shorter than 250px are dropped, as are candidates whose download or
    measurement raises. Survivors are ranked by ``score_hero`` plus a bonus
    for pixel size.

    Returns:
        A copy of the winning candidate extended with ``canonical_url``,
        ``raw_path``, ``original_width``, ``original_height``, ``raw_bytes``
        and ``hero_score``, or ``None`` when nothing qualifies. Ties go to
        the candidate that appears earliest in the page.
    """
    shortlisted = []
    for cand in candidates:
        if SKIP_IMAGE_RE.search(safe_filename(cand['url'])):
            continue
        canonical = canonical_url(cand['url'])
        try:
            raw_file = download_raw(canonical)
            width, height, byte_count = image_meta(raw_file)
        except Exception:
            continue
        is_svg = raw_file.suffix.lower() == '.svg'
        if not is_svg and (width < 600 or height < 250):
            continue
        # Up to 150 extra points for a longest side of 3000px or more.
        size_bonus = min(max(width, height), 3000)//20
        shortlisted.append({
            **cand,
            'canonical_url': canonical,
            'raw_path': str(raw_file),
            'original_width': width,
            'original_height': height,
            'raw_bytes': byte_count,
            'hero_score': score_hero(cand, body) + size_bonus,
        })
    if not shortlisted:
        return None
    return max(shortlisted, key=lambda item: (item['hero_score'], -item['index']))


def main() -> None:
    """Crawl the static pages, export their images and write the CSV manifests.

    Steps:
      1. Delete and recreate ``IMPORT_DIR``.
      2. For each static page from the sitemap, pick one hero image and export
         it to ``PAGE_DIR``, prefixed with the page slug.
      3. Export every other image seen on any page, once per canonical URL, to
         ``OTHER_DIR``.
      4. Write ``PAGE_MANIFEST`` and ``OTHER_MANIFEST``.

    Pages that fail to download are reported on stderr and skipped. Images
    that fail to download or convert are kept in the manifests with an
    ``error: ...`` status.
    """
    if IMPORT_DIR.exists():
        shutil.rmtree(IMPORT_DIR)
    PAGE_DIR.mkdir(parents=True, exist_ok=True)
    OTHER_DIR.mkdir(parents=True, exist_ok=True)
    pages = page_urls_from_sitemap()

    page_rows = []
    other_occurrences: dict[str, dict] = {}
    hero_urls = set()

    for i, page in enumerate(pages, 1):
        print(f'[{i}/{len(pages)}] {page}', flush=True)
        try:
            resp = session.get(page, timeout=30)
            resp.raise_for_status()
        except Exception as e:
            print(' page error', e, file=sys.stderr)
            continue
        body = resp.text
        candidates = extract_images(page, body)
        hero = choose_hero(page, body, candidates)
        if hero:
            slug = page_slug(page)
            original_filename = safe_filename(hero['canonical_url'])
            dest = unique_dest(PAGE_DIR, f'{slug}__{original_filename}')
            raw = Path(hero['raw_path'])
            try:
                status, ow, oh, fw, fh = resize_to(raw, dest)
            except Exception as e:
                status, ow, oh, fw, fh = f'error: {e}', hero.get('original_width', 0), hero.get('original_height', 0), 0, 0
            hero_urls.add(hero['canonical_url'])
            page_rows.append({
                'page_url': page, 'page_prefix': slug, 'asset_folder': 'Page images',
                'filename': dest.name, 'original_filename': original_filename,
                'source_url': hero['url'], 'canonical_url': hero['canonical_url'],
                'download_path': str(dest), 'status': status,
                'original_width': ow, 'original_height': oh, 'final_width': fw, 'final_height': fh,
                'bytes': dest.stat().st_size if dest.exists() else '', 'hero_score': hero.get('hero_score', ''),
            })
        # Record EVERY candidate (heroes included), grouped by canonical URL.
        # Heroes are filtered out later, once the heroes of all pages are known.
        for c in candidates:
            can = canonical_url(c['url'])
            g = other_occurrences.setdefault(can, {'canonical_url': can, 'source_urls': set(), 'page_urls': set(), 'sources': set()})
            g['source_urls'].add(c['url'])
            g['page_urls'].add(page)
            g['sources'].add(c['source'])
        # Small pause between pages to go easy on the server.
        time.sleep(0.05)

    other_rows = []
    for can, g in sorted(other_occurrences.items(), key=lambda kv: safe_filename(kv[0]).lower()):
        if can in hero_urls:
            # The selected hero copy lives only in Page images. If the same image appears elsewhere, filename search still works from Page images.
            continue
        # Reserve the destination before any work that can fail, so an error
        # row names the path this asset would have used and never the
        # un-deduplicated filename, which may belong to a different asset.
        dest = unique_dest(OTHER_DIR, safe_filename(can))
        try:
            raw = download_raw(can)
            status, ow, oh, fw, fh = resize_to(raw, dest)
            size = dest.stat().st_size
        except Exception as e:
            status, ow, oh, fw, fh, size = f'error: {e}', 0, 0, 0, 0, ''
        other_rows.append({
            'asset_folder': 'Other static images', 'filename': dest.name,
            'canonical_url': can, 'download_path': str(dest), 'status': status,
            'original_width': ow, 'original_height': oh, 'final_width': fw, 'final_height': fh,
            'bytes': size, 'page_count': len(g['page_urls']), 'occurrence_count': len(g['source_urls']),
            'sample_page_urls': ' | '.join(sorted(g['page_urls'])[:5]),
            'source_urls_collapsed': ' | '.join(sorted(g['source_urls'])[:10]),
            'sources': ' | '.join(sorted(g['sources'])),
        })

    def write_csv(path: Path, rows: list[dict]):
        """Write ``rows`` to ``path`` as CSV, taking the columns from the first row."""
        fields = list(rows[0].keys()) if rows else []
        # Explicit UTF-8 so the manifest does not depend on the locale's
        # default encoding (URLs and error messages may be non-ASCII).
        with path.open('w', newline='', encoding='utf-8') as f:
            w = csv.DictWriter(f, fieldnames=fields)
            w.writeheader()
            w.writerows(rows)

    write_csv(PAGE_MANIFEST, page_rows)
    write_csv(OTHER_MANIFEST, other_rows)
    print('\nDONE')
    print(f'static pages: {len(pages)}')
    print(f'page hero images: {len(page_rows)}')
    print(f'other static images: {len(other_rows)}')
    print(f'output: {IMPORT_DIR}')

# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
