#!/usr/bin/env python3
from __future__ import annotations
import json, re, csv, sys
from pathlib import Path
from urllib.parse import urljoin, urlparse, unquote
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup, Tag

# Directory that holds both the input readback and the generated reports.
# NOTE(review): hard-coded to one user's home directory — confirm before running elsewhere.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound')
# Origin of the live site; page paths and relative image URLs are resolved against it.
BASE = 'https://www.krb.nsw.edu.au'
# Input JSON loaded by main(), which reads its 'assets', 'pages', 'summary' and 'generatedAt' keys.
CURRENT = OUT / 'krb-all-static-hero-image-current-readonly-v1.json'
# Report files written by main(): the same audit as JSON, CSV and Markdown.
REPORT_JSON = OUT / 'krb-missing-hero-live-asset-audit-v1.json'
REPORT_CSV = OUT / 'krb-missing-hero-live-asset-audit-v1.csv'
REPORT_MD = OUT / 'krb-missing-hero-live-asset-audit-v1.md'


def clean_text(text: str) -> str:
    """Collapse each whitespace run in *text* to one space and trim both ends.

    Non-breaking spaces are turned into ordinary spaces first. A falsy
    *text* (``None`` or ``''``) yields ``''``.
    """
    if not text:
        return ''
    normalised = text.replace('\xa0', ' ')
    collapsed = re.sub(r'\s+', ' ', normalised)
    return collapsed.strip()

def choose_srcset(srcset: str) -> Optional[str]:
    """Return the URL of the largest candidate in an HTML ``srcset`` value.

    Each comma-separated candidate is ``URL [descriptor]``. Width (``800w``)
    and pixel-density (``2x``, ``1.5x``) descriptors are both read as a
    numeric score; a candidate without a recognised descriptor scores 0.
    The highest score wins and ties go to the candidate listed last.

    Returns ``None`` when *srcset* is empty/``None`` or has no candidates.
    The value is split on every comma, so URLs that themselves contain
    commas are not supported.
    """
    best_score = -1.0
    best_url = ''
    for candidate in (srcset or '').split(','):
        bits = candidate.split()
        if not bits:
            continue
        score = 0.0
        if len(bits) > 1:
            # Accept both "<int>w" and "<number>x"; anything else keeps score 0.
            m = re.match(r'(\d+(?:\.\d+)?)[wx]', bits[1])
            if m:
                score = float(m.group(1))
        # ">=" so that, among equal scores, the later candidate is kept.
        if score >= best_score:
            best_score, best_url = score, bits[0]
    return best_url or None

def filename_from_url(url: str) -> str:
    """Return the percent-decoded last path segment of *url*.

    Only the parsed path is used, so query string and fragment are ignored.
    A falsy *url* yields ``''``, as does a path ending in ``/``.
    """
    path = urlparse(url or '').path
    _, _, last_segment = path.rpartition('/')
    return unquote(last_segment)

def image_spec_from_url(url: str, source='background', alt='') -> Optional[Dict[str, Any]]:
    """Build an image spec dict for *url*, resolved against ``BASE``.

    Returns ``None`` for a falsy URL or an inline ``data:`` URI. Otherwise
    the dict carries the absolute ``sourceUrl``, its ``sourceFile`` name,
    the whitespace-normalised ``alt`` text and the given ``source`` label.
    """
    if not url:
        return None
    if str(url).startswith('data:'):
        return None
    absolute = urljoin(BASE, url)
    spec = {
        'sourceUrl': absolute,
        'sourceFile': filename_from_url(absolute),
        'alt': clean_text(alt),
        'source': source,
    }
    return spec

def image_spec_from_img(img: Tag) -> Optional[Dict[str, Any]]:
    """Build an image spec for an ``<img>`` tag, or ``None`` if it has no usable URL.

    The URL is taken from the first non-empty of: the best ``srcset``
    candidate, ``data-src``, ``data-lazy-src``, ``src``. If none is set, the
    first ``<source srcset=...>`` under the tag's parent is tried instead.
    """
    chosen = choose_srcset(img.get('srcset', ''))
    if not chosen:
        for attr in ('data-src', 'data-lazy-src', 'src'):
            chosen = img.get(attr)
            if chosen:
                break
    if not chosen and isinstance(img.parent, Tag):
        # The parent's <source> carries the URL when the <img> itself has none.
        fallback = img.parent.find('source', srcset=True)
        if fallback:
            chosen = choose_srcset(fallback.get('srcset', ''))
    if not chosen:
        return None
    return image_spec_from_url(chosen, source='img', alt=img.get('alt', ''))

def find_images(el: Tag) -> List[Dict[str, Any]]:
    """Collect unique image specs found under *el*.

    ``<img>`` descendants come first, followed by ``url(...)`` references in
    inline ``style`` attributes. Specs are de-duplicated on ``sourceUrl`` and
    returned in discovery order.
    """
    collected: List[Dict[str, Any]] = []
    seen_urls = set()

    def remember(spec):
        # Keep only the first spec seen for each absolute URL.
        if spec and spec['sourceUrl'] not in seen_urls:
            collected.append(spec)
            seen_urls.add(spec['sourceUrl'])

    for img in el.find_all('img'):
        remember(image_spec_from_img(img))
    for styled in el.find_all(style=True):
        css = styled.get('style', '')
        for url in re.findall(r"url\(['\"]?([^)'\"]+)", css):
            remember(image_spec_from_url(url))
    return collected

def first_heading(el: Tag) -> str:
    """Return the normalised text of the first h1–h4 under *el*, or ``''`` if there is none."""
    heading = el.find(['h1', 'h2', 'h3', 'h4'])
    if not heading:
        return ''
    return clean_text(heading.get_text(' ', strip=True))

def live_hero_for_path(path: str) -> Dict[str, Any]:
    """Fetch the live page for *path* and describe its hero section.

    The page URL is ``BASE`` plus *path* with a trailing slash. The hero
    element is the first match of, in order: an element with class
    ``page-header`` inside ``<main>`` (or ``<body>``), the first direct child
    of that root containing an ``<h1>``, or the nearest
    ``section``/``header``/``div`` ancestor of the document's first ``<h1>``.

    Returns a dict that always has ``liveUrl``, ``status`` and ``images``:

    * request failure: ``status='fetch-error'`` plus ``error``;
    * HTTP status >= 400: ``status='http-<code>'`` plus ``finalUrl``;
    * otherwise: ``status='ok'`` plus ``finalUrl``, ``heroHeading``,
      ``heroClasses`` and the hero's ``images`` (empty if no hero was found).
    """
    if path == '/':
        url = BASE + '/'
    else:
        url = urljoin(BASE, (path or '/').strip('/') + '/')

    # Use the session as a context manager so its pooled connections are
    # released on every exit path instead of being left open per call.
    with requests.Session() as session:
        session.headers.update({'User-Agent':'Mozilla/5.0 IGNITE KRB hero asset audit'})
        try:
            r = session.get(url, timeout=30, allow_redirects=True)
        except Exception as e:
            # Deliberately broad: a failing page is reported, it does not abort the caller.
            return {'liveUrl': url, 'status': 'fetch-error', 'error': str(e), 'hero': None, 'images': []}
    if r.status_code >= 400:
        return {'liveUrl': url, 'status': f'http-{r.status_code}', 'finalUrl': r.url, 'hero': None, 'images': []}

    soup = BeautifulSoup(r.text, 'html.parser')
    root = soup.select_one('main') or soup.body or soup
    hero = root.select_one('.page-header') or root.find(class_=lambda c: c and 'page-header' in str(c).split())
    # Fallback: first top-level section with an h1; then any first h1 ancestor.
    if not hero:
        hero = next(
            (child for child in root.find_all(recursive=False)
             if isinstance(child, Tag) and child.find('h1')),
            None,
        )
    if not hero:
        h1 = soup.find('h1')
        hero = h1.find_parent(['section','header','div']) if h1 else None

    return {
        'liveUrl': url,
        'finalUrl': r.url,
        'status': 'ok',
        'heroHeading': first_heading(hero) if hero else '',
        'heroClasses': hero.get('class',[]) if isinstance(hero, Tag) else [],
        'images': find_images(hero) if hero else [],
    }


def compact(s: str) -> str:
    """Lower-case *s* and drop every character outside ``a-z0-9``; falsy input gives ``''``."""
    lowered = s.lower() if s else ''
    return re.sub(r'[^a-z0-9]+','', lowered)

def strip_asset_id_prefix(name: str) -> str:
    """Remove a leading ``<24 hex chars>_`` prefix from *name*, if present (case-insensitive)."""
    text = name or ''
    prefix = re.match(r'^[a-f0-9]{24}_', text, flags=re.I)
    if prefix is None:
        return text
    return text[prefix.end():]

def without_thumb_size(name: str) -> str:
    """Strip size suffixes that sit directly before the file extension.

    Applied in order: ``-<W>x<H>``, then ``-150x150``, then ``-p-<N>``. Each
    is removed only when immediately followed by the final extension.
    """
    result = name or ''
    suffix_patterns = (
        r'-\d+x\d+(?=\.[a-z0-9]+$)',
        r'-150x150(?=\.[a-z0-9]+$)',
        r'-p-\d+(?=\.[a-z0-9]+$)',
    )
    for pattern in suffix_patterns:
        result = re.sub(pattern, '', result, flags=re.I)
    return result

def strip_ext(name: str) -> str:
    """Remove the final ``.ext`` (letters/digits only) from *name*; falsy input gives ``''``."""
    text = name or ''
    ext = re.search(r'\.[a-z0-9]+$', text, flags=re.I)
    if ext is None:
        return text
    # Cut out exactly the matched span so anything after it is preserved.
    return text[:ext.start()] + text[ext.end():]

def asset_key(name: str) -> str:
    """Reduce a file name to a comparison key.

    Thumbnail-size suffixes, the asset-id prefix and the extension are
    stripped, then a trailing ``-scaled``/``-scaled-<N>`` and any remaining
    ``scaled`` substring are removed, and the rest is compacted to
    lower-case alphanumerics.
    """
    base = without_thumb_size(name)
    base = strip_asset_id_prefix(base)
    base = strip_ext(base)
    base = re.sub(r'-scaled(?:-\d+)?$','', base, flags=re.I)
    base = base.replace('scaled','')
    return compact(base)

def filename_variants(name: str) -> List[str]:
    """Return plausible alternative spellings of file name *name*.

    The name is first stripped of thumbnail-size suffixes and the asset-id
    prefix. The variants are that cleaned name plus versions with a trailing
    ``-scaled``/``-scaled-<N>`` removed from, or ``-scaled`` added to, the
    stem. Results are unique, non-empty and in a stable order.

    Returns ``[]`` when *name* is empty/``None`` or cleans down to nothing;
    without that guard an empty name would yield the bogus variant
    ``'-scaled'``.
    """
    if not name:
        return []
    clean = strip_asset_id_prefix(without_thumb_size(name))
    if not clean:
        return []
    stem = strip_ext(clean)
    ext_match = re.search(r'\.[a-z0-9]+$', clean, re.I)
    ext = ext_match.group(0) if ext_match else ''
    unscaled = re.sub(r'-scaled(?:-\d+)?$', '', stem, flags=re.I)
    candidates = (
        clean,
        stem + ext,
        unscaled + ext,
        stem + '-scaled' + ext,
        unscaled + '-scaled' + ext,
    )
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return list(dict.fromkeys(c for c in candidates if c))

def asset_names(a: Dict[str, Any]) -> List[str]:
    """List every name variant under which asset *a* might be known.

    Reads ``displayName``, ``name``, ``fileName``, ``originalFileName`` and
    the file name of ``hostedUrl`` (falling back to ``url``), expands each
    non-empty value with ``filename_variants`` and returns the unique
    results in first-seen order.
    """
    hosted_file = filename_from_url(a.get('hostedUrl') or a.get('url') or '')
    raw_values = [
        a.get('displayName'),
        a.get('name'),
        a.get('fileName'),
        a.get('originalFileName'),
        hosted_file,
    ]
    variants = (
        variant
        for raw in raw_values if raw
        for variant in filename_variants(raw) if variant
    )
    return list(dict.fromkeys(variants))

def canonical_source_spec(spec: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Return a copy of *spec* that points at the full-size file rather than a thumbnail.

    A ``-<W>x<H>`` suffix before the extension of ``sourceUrl`` is removed;
    when that changes the URL, the original is kept under ``thumbnailUrl``.
    ``sourceFile`` is refreshed from the resulting URL. A falsy *spec* gives
    ``None``; the input dict is never modified.
    """
    if not spec:
        return None
    result = dict(spec)
    original_url = result.get('sourceUrl') or ''
    full_size_url = re.sub(r'-\d+x\d+(?=\.[a-z0-9]+(?:\?|$))', '', original_url, flags=re.I)
    if full_size_url == original_url:
        # Nothing stripped: just make sure sourceFile agrees with the URL.
        result['sourceFile'] = filename_from_url(original_url) or result.get('sourceFile')
        return result
    result['thumbnailUrl'] = original_url
    result['sourceUrl'] = full_size_url
    result['sourceFile'] = filename_from_url(full_size_url)
    return result


def match_asset(spec: Optional[Dict[str, Any]], assets: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Find the asset in *assets* whose file name corresponds to image *spec*.

    Two passes are made over *assets*, in order:

    1. ``filename-key``: any normalised key (``asset_key``) of the spec's
       file names equals a key of the asset's names.
    2. ``loose-name``: one compacted name contains the other, considered
       only for names of at least 10 characters to avoid false positives.

    Returns a dict with ``id``, ``displayName``, ``originalFileName``,
    ``hostedUrl`` and ``matchType`` for the first hit, or ``None`` when
    *spec* is falsy or nothing matches.
    """
    if not spec:
        return None
    spec = canonical_source_spec(spec)

    desired_names: List[str] = []
    for candidate in (
        spec.get('sourceFile'),
        filename_from_url(spec.get('sourceUrl', '')),
        filename_from_url(spec.get('thumbnailUrl', '')),
    ):
        # Skip absent values (e.g. no thumbnailUrl): expanding an empty name
        # would contribute an empty key that matches unrelated assets.
        if candidate:
            desired_names.extend(filename_variants(candidate))

    desired_keys = {key for key in (asset_key(n) for n in desired_names if n) if key}
    desired_compacts = [c for c in (compact(n) for n in desired_names if n) if len(c) >= 10]

    def describe(asset: Dict[str, Any], match_type: str) -> Dict[str, Any]:
        return {
            'id': asset.get('id') or asset.get('_id'),
            'displayName': asset.get('displayName'),
            'originalFileName': asset.get('originalFileName'),
            'hostedUrl': asset.get('hostedUrl'),
            'matchType': match_type,
        }

    # Remember each asset's names from pass 1 so pass 2 need not recompute them.
    names_by_asset = []
    for asset in assets:
        names = asset_names(asset)
        names_by_asset.append((asset, names))
        if desired_keys & {asset_key(n) for n in names}:
            return describe(asset, 'filename-key')

    # loose containment only when long enough to avoid false positives
    if desired_compacts:
        for asset, names in names_by_asset:
            compacts = [c for c in (compact(n) for n in names) if len(c) >= 10]
            if any(dn in c or c in dn for dn in desired_compacts for c in compacts):
                return describe(asset, 'loose-name')
    return None


def _md_cell(value: Any) -> str:
    """Render *value* as text that is safe inside one Markdown table cell."""
    # An unescaped pipe would start a new column; a newline would end the row.
    return str(value).replace('|', '\\|').replace('\n', ' ')


def _build_audit_row(page: Dict[str, Any], assets: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Audit one page: fetch its live hero, pick an image and look it up in *assets*."""
    live = live_hero_for_path(page['path'])
    hero_images = live.get('images') or []
    # Prefer an image that is not a 150x150 thumbnail; otherwise take the first one.
    chosen = next(
        (img for img in hero_images if '-150x150' not in img.get('sourceUrl','')),
        hero_images[0] if hero_images else None,
    )
    chosen = canonical_source_spec(chosen)
    match = match_asset(chosen, assets) if chosen else None
    # Key order defines the CSV column order.
    return {
        'title': page['title'],
        'path': page['path'],
        'pageId': page['pageId'],
        'webflowHeroHeading': page.get('heroHeading') or '',
        'liveStatus': live.get('status'),
        'liveUrl': live.get('liveUrl'),
        'liveFinalUrl': live.get('finalUrl',''),
        'liveHeroHeading': live.get('heroHeading',''),
        'liveHeroClasses': ' '.join(live.get('heroClasses') or []),
        'liveHeroImageCount': len(hero_images),
        'sourceFile': chosen.get('sourceFile') if chosen else '',
        'sourceUrl': chosen.get('sourceUrl') if chosen else '',
        'assetFound': bool(match),
        'assetId': match.get('id') if match else '',
        'assetDisplayName': match.get('displayName') if match else '',
        'assetOriginalFileName': match.get('originalFileName') if match else '',
        'assetMatchType': match.get('matchType') if match else '',
    }


def _table_section(title: str, items: List[Dict[str, Any]]) -> List[str]:
    """Return the Markdown lines of one report section listing *items*."""
    section = [f'## {title}', '']
    if not items:
        section.extend(['_None._', ''])
        return section
    section.append('| Page | Path | Live status | Live hero file | Asset status | Asset |')
    section.append('|---|---|---|---|---|---|')
    for r in items:
        status = 'FOUND' if r['assetFound'] else ('MISSING' if r['sourceUrl'] else 'NO LIVE HERO DETECTED')
        asset = r['assetDisplayName'] or r['assetOriginalFileName'] or r['assetId'] or '—'
        cells = (
            _md_cell(r['title']),
            f"`{_md_cell(r['path'])}`",
            f"`{_md_cell(r['liveStatus'])}`",
            f"`{_md_cell(r['sourceFile'] or '—')}`",
            status,
            f"`{_md_cell(asset)}`",
        )
        section.append('| ' + ' | '.join(cells) + ' |')
    section.append('')
    return section


def main() -> int:
    """Audit pages whose Webflow hero image is missing against the live site.

    Loads the readback at ``CURRENT``, and for every page flagged with both
    ``hasHero`` and ``heroImageMissing`` fetches the live page, picks its
    hero image and tries to match it to an asset in the readback. Writes the
    result to ``REPORT_JSON``, ``REPORT_CSV`` and ``REPORT_MD``, prints
    progress to stderr and the summary plus Markdown path to stdout.

    Returns 0. Raises ``KeyError`` if the readback lacks an expected key.
    """
    # Explicit UTF-8 everywhere: the reports contain non-ASCII text (e.g. the
    # em dash), which a non-UTF-8 platform default encoding could not write.
    current = json.loads(CURRENT.read_text(encoding='utf-8'))
    assets = current['assets']
    missing = [p for p in current['pages'] if p.get('hasHero') and p.get('heroImageMissing')]

    rows = []
    for i, page in enumerate(missing, 1):
        print(f'[{i}/{len(missing)}] {page["path"]}', file=sys.stderr)
        rows.append(_build_audit_row(page, assets))

    with_live_image = [r for r in rows if r['sourceUrl']]
    asset_missing = [r for r in with_live_image if not r['assetFound']]
    asset_found = [r for r in rows if r['assetFound']]
    no_live = [r for r in rows if not r['sourceUrl']]

    summary = {
        # NOTE(review): 'generatedAt' repeats the readback's timestamp, not the
        # time this audit ran — confirm that is intended.
        'generatedAt': current['generatedAt'],
        'currentReadGeneratedAt': current['generatedAt'],
        'pagesAuditedInWebflow': current['summary']['pagesAudited'],
        'pagesWithHeroComponent': current['summary']['pagesWithHeroComponent'],
        'pagesMissingWebflowHeroImage': len(rows),
        'liveHeroFound': len(with_live_image),
        'liveHeroAssetFoundInWebflow': len(asset_found),
        'liveHeroAssetMissingInWebflow': len(asset_missing),
        'noLiveHeroImageDetected': len(no_live),
        'assetLibraryCount': len(assets),
    }

    REPORT_JSON.write_text(json.dumps({'summary': summary, 'rows': rows}, indent=2), encoding='utf-8')
    with REPORT_CSV.open('w', newline='', encoding='utf-8') as f:
        fieldnames = list(rows[0].keys()) if rows else []
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    lines = [
        '# KRB missing Webflow hero images vs live-site assets — read-only audit',
        '',
        f"Generated from current Webflow readback: `{current['generatedAt']}`",
        '',
        '## Summary',
    ]
    lines.extend(f'- **{k}**: {v}' for k, v in summary.items())
    lines.append('')
    lines.append('Scope: static published Webflow pages only; collection templates, archived/draft pages, internal pages, `/search`, `/test`, and `/untitled` excluded. No Webflow writes were made.')
    lines.append('')
    lines.extend(_table_section('Live hero image found, but no matching Webflow asset', asset_missing))
    lines.extend(_table_section('Live hero image found and matching Webflow asset exists', asset_found))
    lines.extend(_table_section('Webflow hero image missing, but no live hero image detected', no_live))
    lines.append('## Output files')
    lines.append(f'- JSON: `{REPORT_JSON}`')
    lines.append(f'- CSV: `{REPORT_CSV}`')
    REPORT_MD.write_text('\n'.join(lines)+"\n", encoding='utf-8')

    print(json.dumps(summary, indent=2))
    print(REPORT_MD)
    return 0

# Run the audit only when executed as a script; main()'s return value becomes the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
