#!/usr/bin/env python3
from __future__ import annotations

import csv
import hashlib
import re
import shutil
import urllib.parse
from pathlib import Path

import requests

# Base working directory for this migration; every input and output path below lives under it.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound/krb-image-grab')
# Input manifests (CSV). main() reads the 'image_url', 'page_url' and 'source' columns from the
# first file and 'image_url' from the second; the second file is optional.
RAW_MANIFEST = OUT / 'krb-static-image-downloads.csv'
PAGE_RAW_MANIFEST = OUT / 'krb-page-image-candidates.csv'
# Output folders: every downloaded asset goes to STATIC_DIR, and assets flagged as page-image
# candidates are additionally copied into PAGE_DIR.
IMPORT_DIR = OUT / 'webflow-import-assets'
STATIC_DIR = IMPORT_DIR / 'static-images'
PAGE_DIR = IMPORT_DIR / 'Page images'
# Output manifests (CSV) written by main(): one row per canonical asset, and one row per
# page-image copy respectively.
IMPORT_MANIFEST = OUT / 'krb-webflow-asset-import-set.csv'
PAGE_IMPORT_MANIFEST = OUT / 'krb-webflow-page-images-import-set.csv'
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
IMAGE_EXTS = ('.jpg', '.jpeg', '.png', '.webp', '.gif', '.svg')
# Matches a '-<width>x<height>' suffix (2-5 digits each) sitting directly before the final
# file extension, e.g. the '-300x200' in 'photo-300x200.jpg'.
# Presumably this targets WordPress's resized-image naming — TODO confirm.
WP_SIZE_RE = re.compile(r'-(\d{2,5})x(\d{2,5})(?=\.[A-Za-z0-9]+$)')

# One shared HTTP session so every request carries the same identifying User-Agent.
session = requests.Session()
session.headers.update({'User-Agent': 'IGNITE-KRB-Webflow-image-migration/1.0'})


def strip_wp_size(url: str) -> str:
    """Return *url* with a trailing ``-<W>x<H>`` size suffix removed from its path.

    The path is percent-decoded before ``WP_SIZE_RE`` is applied. If nothing
    matches, the input string is returned untouched. Otherwise the URL is
    rebuilt from scheme, host and the re-quoted path only, so any params,
    query string or fragment on the input are dropped.
    """
    parts = urllib.parse.urlparse(url)
    decoded = urllib.parse.unquote(parts.path)
    trimmed = WP_SIZE_RE.sub('', decoded)
    if trimmed != decoded:
        encoded = urllib.parse.quote(trimmed, safe='/._-')
        return urllib.parse.urlunparse((parts.scheme, parts.netloc, encoded, '', '', ''))
    return url


def url_exists(url: str) -> bool:
    """Report whether *url* answers 200 with an image content type.

    A HEAD request (following redirects) is tried first. If that does not
    confirm an image, a streamed GET is issued so only headers are needed
    before the response is closed. Any exception is treated as "not
    available" and yields ``False``.
    """
    def _looks_like_image(resp) -> bool:
        # Same acceptance rule for both HEAD and GET responses.
        return resp.status_code == 200 and 'image' in resp.headers.get('content-type', '').lower()

    try:
        head_resp = session.head(url, timeout=20, allow_redirects=True)
        if _looks_like_image(head_resp):
            return True
        # Some servers don't handle HEAD correctly, so retry with GET.
        get_resp = session.get(url, timeout=25, stream=True)
        found = _looks_like_image(get_resp)
        get_resp.close()
        return found
    except Exception:
        return False


def canonical_url(url: str) -> str:
    """Return the size-stripped variant of *url* when it is reachable, else *url*.

    No network request is made when stripping the size suffix leaves the URL
    unchanged.
    """
    candidate = strip_wp_size(url)
    if candidate == url:
        return url
    return candidate if url_exists(candidate) else url


def safe_filename(url: str) -> str:
    """Derive a filesystem-friendly file name from the path of *url*.

    The last path segment is percent-decoded, every run of characters other
    than ASCII letters, digits, ``.``, ``_``, space and ``-`` is collapsed to
    a single ``-``, and leading/trailing spaces and dots are trimmed. When
    nothing usable remains, the MD5 hex digest of the URL is returned instead.
    """
    decoded_path = urllib.parse.unquote(urllib.parse.urlparse(url).path)
    basename = Path(decoded_path).name
    cleaned = re.sub(r'[^A-Za-z0-9._ -]+', '-', basename).strip(' .')
    if cleaned:
        return cleaned
    return hashlib.md5(url.encode()).hexdigest()


def unique_path(folder: Path, filename: str, url: str) -> Path:
    """Pick a path for *filename* inside *folder*, avoiding an existing file.

    If ``folder / filename`` does not exist it is returned as-is. Otherwise
    the first 8 hex characters of the URL's MD5 digest are appended to the
    stem (``<stem>__<digest><suffix>``). The suffixed path is returned without
    checking whether it already exists.
    """
    candidate = folder / filename
    if candidate.exists():
        digest = hashlib.md5(url.encode()).hexdigest()[:8]
        return folder / f'{candidate.stem}__{digest}{candidate.suffix}'
    return candidate


def read_rows(path: Path) -> list[dict]:
    """Load a CSV file and return its rows as a list of dicts keyed by header.

    The file is decoded as ``utf-8-sig``: plain UTF-8 is read unchanged, and a
    leading byte-order mark (as written by some spreadsheet tools) is stripped
    so it cannot end up glued to the first column name. This also makes the
    result independent of the machine's locale encoding.
    """
    with path.open(newline='', encoding='utf-8-sig') as f:
        return list(csv.DictReader(f))


def download(url: str, folder: Path) -> tuple[Path, str, str, str]:
    """Fetch *url* into *folder*, reusing the file if the chosen path exists.

    Returns ``(path, byte_count, md5_hex, content_type)``. ``byte_count`` is a
    string, and ``content_type`` is empty when the bytes came from disk rather
    than from an HTTP response. HTTP error statuses raise via
    ``raise_for_status``.

    NOTE(review): ``unique_path`` only returns an existing path for the
    hash-suffixed name, so a file saved under its plain name by an earlier
    run is not reused; the URL is fetched again under the suffixed name.
    """
    target = unique_path(folder, safe_filename(url), url)
    if not target.exists():
        response = session.get(url, timeout=60)
        response.raise_for_status()
        payload = response.content
        target.write_bytes(payload)
        content_type = response.headers.get('content-type', '')
    else:
        # Already on disk: no request is made, so no content type is known.
        payload = target.read_bytes()
        content_type = ''
    return target, str(len(payload)), hashlib.md5(payload).hexdigest(), content_type


def _write_manifest(path: Path, fields: list[str], rows: list[dict]) -> None:
    """Write *rows* to *path* as a UTF-8 CSV with *fields* as the header row."""
    with path.open('w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Build the import asset set from the raw image manifests.

    Steps:
      1. Read the raw manifest and, if present, the page-image candidates.
      2. Resolve each source URL to its canonical URL and group occurrences.
      3. Download every canonical URL into ``STATIC_DIR``; copy page-image
         candidates that downloaded successfully into ``PAGE_DIR``.
      4. Write both import manifests and print a short summary.

    A failed download is recorded in the asset manifest with an
    ``error: ...`` status and an empty ``download_path``; it is never added
    to the page-image manifest.
    """
    STATIC_DIR.mkdir(parents=True, exist_ok=True)
    PAGE_DIR.mkdir(parents=True, exist_ok=True)

    raw = read_rows(RAW_MANIFEST)
    page_raw = read_rows(PAGE_RAW_MANIFEST) if PAGE_RAW_MANIFEST.exists() else []
    page_source_urls = {r['image_url'] for r in page_raw}

    # Canonicalise and group occurrences by final URL.
    # canonical_url() may hit the network, so resolve each distinct source URL
    # only once no matter how many manifest rows mention it.
    resolved: dict[str, str] = {}
    grouped: dict[str, dict] = {}
    for row in raw:
        src = row['image_url']
        can = resolved.get(src)
        if can is None:
            can = resolved[src] = canonical_url(src)
        g = grouped.setdefault(can, {
            'canonical_url': can,
            'source_urls': set(),
            'page_urls': set(),
            'sources': set(),
            'is_page_image_candidate': False,
        })
        g['source_urls'].add(src)
        g['page_urls'].add(row['page_url'])
        g['sources'].add(row['source'])
        if src in page_source_urls:
            g['is_page_image_candidate'] = True

    rows = []
    page_rows = []
    for can, g in sorted(grouped.items(), key=lambda kv: safe_filename(kv[0]).lower()):
        try:
            p, size, md5, ctype = download(can, STATIC_DIR)
            status = 'downloaded'
        except Exception as e:
            # Keep going: one bad asset must not abort the whole run. Use None
            # rather than Path(''), which is the current directory and so
            # would pass an existence check further down.
            p, size, md5, ctype = None, '', '', ''
            status = f'error: {e}'
        out = {
            'asset_folder': 'static-images',
            'filename': safe_filename(can),
            'canonical_url': can,
            'download_path': str(p) if p is not None else '',
            'status': status,
            'bytes': size,
            'md5': md5,
            'content_type': ctype,
            'occurrence_count': len(g['source_urls']),
            'page_count': len(g['page_urls']),
            'sample_page_urls': ' | '.join(sorted(g['page_urls'])[:5]),
            'source_urls_collapsed': ' | '.join(sorted(g['source_urls'])[:10]),
            'is_page_image_candidate': 'yes' if g['is_page_image_candidate'] else 'no',
        }
        rows.append(out)
        # Only assets that are actually on disk can be copied to the page folder.
        if g['is_page_image_candidate'] and p is not None and p.is_file():
            target = PAGE_DIR / p.name
            if not target.exists():
                shutil.copy2(p, target)
            pr = dict(out)
            pr['asset_folder'] = 'Page images'
            pr['download_path'] = str(target)
            page_rows.append(pr)

    fields = [
        'asset_folder',
        'filename',
        'canonical_url',
        'download_path',
        'status',
        'bytes',
        'md5',
        'content_type',
        'occurrence_count',
        'page_count',
        'sample_page_urls',
        'source_urls_collapsed',
        'is_page_image_candidate',
    ]
    _write_manifest(IMPORT_MANIFEST, fields, rows)
    _write_manifest(PAGE_IMPORT_MANIFEST, fields, page_rows)

    print('DONE')
    # `resolved` has exactly one entry per distinct source URL.
    print(f'raw image URL count: {len(resolved)}')
    print(f'canonical import assets: {len(rows)}')
    print(f'page image candidates: {len(page_rows)}')
    print(f'import dir: {IMPORT_DIR}')
    print(f'manifest: {IMPORT_MANIFEST}')

# Run the migration only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
