#!/usr/bin/env python3
import csv, json, re, os, zipfile
from pathlib import Path
from collections import defaultdict, Counter
from urllib.parse import urlparse

# Root folder holding the audit inputs this script reads and the plan files it writes.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound')
# Subfolder with the image-grab artefacts (asset readbacks, upload CSVs, download CSV).
IMG = OUT/'krb-image-grab'
# Site origin. NOTE(review): not referenced anywhere in the visible code.
SITE = 'https://www.krb.nsw.edu.au'
# Asset folder IDs compared against each asset's folderId/parent_folder and emitted as
# the proposed upload folder. Presumably the "page images" and "other static images"
# asset folders — TODO confirm against the CMS.
PAGE_IMAGES_FOLDER = '6a34e1160daf7376173c7071'
OTHER_STATIC_FOLDER = '6a34eb470be8fc7ba5cb19f6'

def norm_path_from_url(url):
    """Return the path component of *url* without trailing slashes.

    A falsy *url* yields ''. A URL whose path is empty or consists only of
    slashes yields '/'.
    """
    if not url:
        return ''
    trimmed = urlparse(url).path.rstrip('/')
    return trimmed if trimmed else '/'

def basename_from_url(url):
    """Return the last path segment of *url*, ignoring query string and fragment."""
    url_path = urlparse(url).path
    _head, tail = os.path.split(url_path)
    return tail

def strip_variant_stem(stem):
    """Strip WordPress-style derived-image suffixes from a filename stem.

    Removes trailing ``-<W>x<H>`` size suffixes, ``-scaled`` and
    ``-e<10+ digits>`` edit markers. Suffixes can be stacked in any order
    (e.g. ``photo-scaled-e1234567890``), so stripping repeats until the stem
    stops changing. This makes the result idempotent and independent of the
    order in which the suffixes were appended.

    Args:
        stem: filename without extension.

    Returns:
        The stem with every trailing variant suffix removed.
    """
    suffix_patterns = (r'-\d+x\d+$', r'-scaled$', r'-e\d{10,}$')
    while True:
        stripped = stem
        for pattern in suffix_patterns:
            stripped = re.sub(pattern, '', stripped)
        if stripped == stem:
            # Fixed point reached: no pattern matched in this pass.
            return stem
        stem = stripped

def file_key(name_or_url):
    """Build a canonical matching key from a filename or URL.

    The basename is taken (via URL parsing when the input contains '://'),
    anything up to and including the last '__' is dropped, the extension is
    removed, the stem is lower-cased and passed through strip_variant_stem,
    and finally every character outside [a-z0-9] is removed. Returns '' when
    there is no basename.
    """
    as_text = str(name_or_url)
    if '://' in as_text:
        base = basename_from_url(name_or_url)
    else:
        base = os.path.basename(as_text)
    if not base:
        return ''
    # split() returns the whole name when no '__' is present, so no guard is needed.
    base = base.split('__')[-1]
    stem, _ext = os.path.splitext(base)
    stem = strip_variant_stem(stem.lower())
    return re.sub(r'[^a-z0-9]+', '', stem)

def clean_text(v):
    """Flatten a prop value into a single whitespace-normalised string.

    Args:
        v: None, a scalar, or a dict. Dicts are unwrapped through their
            'innerText' key, then their 'value' key; any other dict has all
            of its values cleaned and joined with single spaces.

    Returns:
        The cleaned text with zero-width joiners removed, whitespace runs
        collapsed and ends trimmed; '' when there is no text at all.
    """
    if v is None:
        return ''
    if isinstance(v, dict):
        for key in ('innerText', 'value'):
            if key in v:
                return clean_text(v[key])
        # Drop empty parts before joining. Otherwise a dict of blank values
        # produces ' ', which is truthy and makes callers believe an empty
        # prop has content.
        parts = (clean_text(x) for x in v.values())
        return ' '.join(part for part in parts if part)
    return re.sub(r'\s+', ' ', str(v).replace('\u200d', '')).strip()

def static_val(prop):
    """Return the first usable value of a prop dict.

    The keys 'value', 'resolvedValue' and 'defaultValue' are inspected in that
    order. A dict entry tagged ``sourceType == 'static'`` is unwrapped to its
    inner 'value' and returned immediately, even if that inner value is empty.
    Any other entry is returned unless it is None or ''. Returns None when
    nothing qualifies.
    """
    for key in ('value', 'resolvedValue', 'defaultValue'):
        candidate = prop.get(key)
        is_static_wrapper = isinstance(candidate, dict) and candidate.get('sourceType') == 'static'
        if is_static_wrapper:
            return candidate.get('value')
        if candidate is None or candidate == '':
            continue
        return candidate
    return None

def prop_map(props):
    """Group prop dicts by their 'label', preserving input order within each group.

    Props with a missing or falsy label are collected under ''. The result is
    a defaultdict(list).
    """
    grouped = defaultdict(list)
    for prop in props:
        label = prop.get('label')
        bucket = grouped[label if label else '']
        bucket.append(prop)
    return grouped

def first_prop_text(props, labels):
    """Return the first non-empty cleaned text among props matching *labels*.

    Labels are tried in the order given, and props sharing a label in their
    original order. Returns '' when no matching prop carries any text.
    """
    by_label = prop_map(props)
    # Lazy generator: evaluation stops at the first non-empty text, just like
    # an early return from nested loops.
    texts = (
        clean_text(static_val(prop))
        for label in labels
        for prop in by_label.get(label, [])
    )
    return next((text for text in texts if text), '')

def link_page_id(v):
    """Extract the target page ID from a link prop value.

    A ``sourceType == 'static'`` wrapper is unwrapped once. If the resulting
    dict has a dict under 'to', its 'pageId' is returned (None when absent).
    If 'to' is a 24-character lowercase hex string, that string is returned.
    Every other input yields ''.
    """
    if isinstance(v, dict) and v.get('sourceType') == 'static':
        v = v.get('value')
    if not isinstance(v, dict):
        return ''
    target = v.get('to')
    if isinstance(target, dict):
        return target.get('pageId')
    if isinstance(target, str) and re.fullmatch(r'[a-f0-9]{24}', target):
        return target
    return ''

def image_prop_group(label):
    """Return the part of *label* before its first '/', or '' if it has no '/'."""
    head, slash, _rest = label.partition('/')
    if not slash:
        return ''
    return head

# Assets, current-ish readback plus upload CSVs.
assets = []
for fp in [IMG/'krb-webflow-page-images-assets-readback.json', IMG/'krb-webflow-other-static-images-assets-readback.json']:
    if fp.exists():
        # Context manager so the handle is closed promptly instead of leaking until GC.
        with open(fp, encoding='utf-8') as fh:
            assets += json.load(fh)
for fp in [IMG/'krb-webflow-page-images-uploaded-to-webflow.csv', IMG/'krb-webflow-other-static-images-uploaded-to-webflow.csv']:
    if fp.exists():
        with open(fp, newline='') as f:
            for r in csv.DictReader(f):
                if r.get('asset_id'):
                    # Normalise upload-CSV rows to the same shape as the readback records.
                    assets.append({
                        'id': r['asset_id'],
                        'displayName': r.get('filename', ''),
                        'originalFileName': r.get('filename', ''),
                        'folderId': r.get('parent_folder', ''),
                        'hostedUrl': '',
                        'contentType': r.get('content_type', ''),
                    })

# Indexes: canonical filename key -> candidate assets (in load order), and asset id -> asset.
# For a repeated id the later record wins in asset_by_id.
asset_by_key = defaultdict(list)
asset_by_id = {}
for a in assets:
    aid = a.get('id') or a.get('asset_id')
    if not aid:
        continue
    asset_by_id[aid] = a
    # The name fields often normalise to the same key; register the asset once per key.
    keys_for_asset = set()
    for field in ['displayName', 'originalFileName', 'hostedUrl']:
        # `or ''` guards against JSON nulls, which would otherwise be stringified
        # to 'None' and indexed under the bogus key 'none'.
        k = file_key(a.get(field) or '')
        if k and k not in keys_for_asset:
            keys_for_asset.add(k)
            asset_by_key[k].append(a)

def choose_asset_for_source(source_url, prefer_page_folder=False):
    """Pick an already-known asset whose filename key matches *source_url*.

    Args:
        source_url: source image URL or filename; matched through file_key.
        prefer_page_folder: when True, a candidate stored in
            PAGE_IMAGES_FOLDER wins over every other candidate.

    Returns:
        ``(asset, match_type)``. ``(None, 'missing')`` when no asset shares
        the key. ``match_type`` is 'filename-key-page-images' for a
        page-folder hit and 'filename-key' otherwise.
    """
    k = file_key(source_url)
    candidates = asset_by_key.get(k, [])
    if not candidates:
        return None, 'missing'
    # Prefer page images for hero/linked-card page images when available.
    if prefer_page_folder:
        for a in candidates:
            if a.get('folderId') == PAGE_IMAGES_FOLDER or a.get('parent_folder') == PAGE_IMAGES_FOLDER:
                return a, 'filename-key-page-images'
    # Prefer non-SVG images. 'image/svg+xml' also starts with 'image/', so it
    # has to be excluded explicitly for this preference to mean anything.
    for a in candidates:
        content_type = str(a.get('contentType', '')).lower()
        if content_type.startswith('image/') and not content_type.startswith('image/svg'):
            return a, 'filename-key'
    # Nothing preferable: fall back to the first candidate in load order.
    return candidates[0], 'filename-key'

# Scraped source images grouped by page path, in CSV row order, keeping only the
# first image per canonical key on each page. Also maps each kept image URL to
# the row's download_path.
source_images_by_page = defaultdict(list)
local_download_by_url = {}
static_csv = IMG/'krb-static-image-downloads.csv'
if static_csv.exists():
    seen_by_page = defaultdict(set)
    raster_url = re.compile(r'\.(jpe?g|png|webp)(\?|$)', re.I)
    with open(static_csv, newline='') as f:
        for r in csv.DictReader(f):
            page_path = norm_path_from_url(r.get('page_url', ''))
            url = r.get('image_url', '')
            fn = basename_from_url(url)
            # Keep only JPEG/PNG/WebP URLs.
            if not url or raster_url.search(url) is None:
                continue
            # Skip files whose name contains 'logo' or 'icon'.
            lowered_name = fn.lower()
            if 'logo' in lowered_name or 'icon' in lowered_name:
                continue
            k = file_key(url)
            if not k:
                continue
            page_seen = seen_by_page[page_path]
            if k in page_seen:
                continue
            page_seen.add(k)
            row = {**r, 'key': k, 'page_path': page_path, 'filename': fn}
            source_images_by_page[page_path].append(row)
            local_download_by_url[url] = r.get('download_path', '')

# Existing hero-source audit is more reliable for hero source detection.
# hero_by_path maps a page path to its row from the audit's 'rows' list.
hero_by_path = {}
hero_json = OUT/'krb-missing-hero-live-asset-audit-v2-current.json'
if hero_json.exists():
    # Context manager so the handle is closed promptly instead of leaking until GC.
    with open(hero_json, encoding='utf-8') as fh:
        h = json.load(fh)
    for r in h.get('rows', []):
        hero_by_path[r['path']] = r
# Previous 31 ready hero repair notes include extra source->asset mappings.
# Each matching bullet line is parsed into path -> {asset name, asset id, match type}.
hero_ready_by_path = {}
for fp in [OUT/'krb-31-existing-hero-image-repair-v2-notes.md']:
    if fp.exists():
        # read_text opens and closes the file itself; the notes contain non-ASCII
        # punctuation that the pattern below relies on, so decode explicitly.
        txt = fp.read_text(encoding='utf-8')
        for m in re.finditer(r'- (.*?) `(.*?)` — asset `(.*?)` \(`([a-f0-9]{24})`\), match `(.*?)`', txt):
            # The first group (page title) is captured but not needed here.
            _title, path, assetName, assetId, matchType = m.groups()
            hero_ready_by_path[path] = {
                'sourceFile': assetName,
                'assetId': assetId,
                'assetDisplayName': assetName,
                'assetMatchType': matchType,
            }

# Page ID -> path/title from audit pages.
# Context managers so the handles are closed promptly instead of leaking until GC.
with open(OUT/'krb-empty-image-component-props-current-v1.json', encoding='utf-8') as fh:
    audit = json.load(fh)
page_by_id = {p['pageId']: p for p in audit.get('pages', [])}
# Include finding pages too (only when the page is not already known).
for f in audit['findings']:
    page_by_id.setdefault(f['pageId'], {'pageId': f['pageId'], 'path': f['path'], 'title': f['pageTitle']})

# Component ID -> component record (including its props) from the prop-context dump.
with open(OUT/'krb-empty-image-component-props-current-v1-prop-context.json', encoding='utf-8') as fh:
    ctx = json.load(fh)
component_by_id = {}
for page in ctx['pages']:
    for c in page['components']:
        component_by_id[c['idString']] = c

# Per-page source-image cursor for sequential component-image assignments.
# image_cursor[path] is the index into source_images_by_page[path] where the next
# search starts; unseen pages default to 0.
image_cursor=defaultdict(int)
# Canonical keys of the source images next_page_image has already handed out, per page.
used_source_by_page=defaultdict(set)

def next_page_image(path, skip_hero=True):
    """Hand out the next not-yet-used source image for *path*.

    The search starts at the page's cursor. When *skip_hero* is set and the
    page has more than one image, index 0 is never handed out. On success the
    cursor moves past the returned image and its key is marked used. Returns
    None, leaving the cursor untouched, when no image is left.
    """
    page_images = source_images_by_page.get(path, [])
    first_allowed = 1 if (skip_hero and len(page_images) > 1) else 0
    begin = max(image_cursor[path], first_allowed)
    for position in range(begin, len(page_images)):
        candidate = page_images[position]
        if candidate['key'] in used_source_by_page[path]:
            continue
        image_cursor[path] = position + 1
        used_source_by_page[path].add(candidate['key'])
        return candidate
    return None

def hero_source_for_path(path):
    """Return the hero source image record for *path*, or None.

    A hero-audit row with a non-empty 'sourceUrl' takes precedence and is
    converted to a source-image-like dict. Otherwise the first scraped image
    of the page is used.
    """
    if path in hero_by_path:
        audited = hero_by_path[path]
        audited_url = audited.get('sourceUrl')
        if audited_url:
            return {
                'image_url': audited_url,
                'filename': audited.get('sourceFile') or basename_from_url(audited_url),
                'source': 'hero-audit',
                'key': file_key(audited_url),
            }
    scraped = source_images_by_page.get(path, [])
    return scraped[0] if scraped else None

def card_group_has_content(props, group):
    """Tell whether a repeated-card group carries any non-image content.

    Only props labelled '<group>/...' that do not end in '/Image' are
    considered. A label ending in '/link' (case-insensitive) counts when it
    resolves to a page ID; any other prop counts when it has non-empty text.
    Blank cards should not receive an image write.
    """
    prefix = group + '/'
    for prop in props:
        label = prop.get('label') or ''
        if not label.startswith(prefix) or label.endswith('/Image'):
            continue
        value = static_val(prop)
        if label.lower().endswith('/link'):
            if link_page_id(value):
                return True
        elif clean_text(value):
            return True
    return False

def target_page_from_group_link(props, group):
    """Return the known page record that '<group>/Link' points at, or None.

    Props labelled exactly '<group>/Link' are checked in order. The first one
    whose link resolves to a page ID present in page_by_id wins.
    """
    wanted_label = f'{group}/Link'
    for prop in props:
        if (prop.get('label') or '') != wanted_label:
            continue
        page_id = link_page_id(static_val(prop))
        if page_id and page_id in page_by_id:
            return page_by_id[page_id]
    return None

# Build one plan record per empty image prop reported in the audit findings.
# Every record copies the finding's own fields and adds a status:
# 'skip-empty-card', 'ready-existing-asset', 'needs-upload' or 'manual-review'.
records=[]
for f in audit['findings']:
    # Props come from the prop-context dump; a component missing there has no props.
    c=component_by_id.get(f['componentId'])
    props=c.get('props',[]) if c else []
    component=f['componentName']
    for ep in f['emptyImageProps']:
        label=ep['label']
        # Group is the label part before the first '/', e.g. 'Page 1' for 'Page 1/Image'; '' if none.
        group=image_prop_group(label)
        # NOTE(review): `title` is computed but never read anywhere in the visible code.
        title=first_prop_text(props, [f'{group}/Title', 'Heading', 'Title', 'Eyebrow/Heading', 'Content/Heading'])
        link_target=target_page_from_group_link(props, group) if group else None
        # Repeated 'Page N' / 'Tab N' card slots are only processed when the card has
        # some non-image content; blank slots are recorded as skipped.
        active=True
        if group and re.match(r'^(Page|Tab) \d+$', group):
            active=card_group_has_content(props, group)
        if not active:
            records.append({**f, 'propLabel':label, 'propId':ep['propId'], 'status':'skip-empty-card', 'reason':f'{group} has no title/link/text content present', 'sourceUrl':'', 'sourceFile':'', 'assetId':'', 'assetName':'', 'assetFolder':'', 'matchType':'', 'confidence':'high'})
            continue
        # Choose the source image. Branch order matters: hero component first, then a
        # card linked to a known page (uses that page's hero), then card components
        # without a resolvable link (left for manual review), else the next unused
        # image scraped from the same page.
        source=None; reason=''; prefer_page_folder=False; confidence='medium'
        if component == 'Section / Hero':
            source=hero_source_for_path(f['path']); reason='hero image from live hero audit/source page'; prefer_page_folder=True; confidence='high' if source else 'low'
        elif link_target:
            source=hero_source_for_path(link_target['path']); reason=f'linked card image from target page hero: {link_target.get("title")} {link_target.get("path")}'; prefer_page_folder=True; confidence='medium' if source else 'low'
        elif component in ['Section / Next Pages','Section / Gateway CTA']:
            source=None; reason='card has content but no resolvable linked target page in component props'; confidence='low'
        else:
            source=next_page_image(f['path'], skip_hero=True); reason='next unmatched live/source image on same page, ordered against component context'; confidence='medium' if source else 'low'
        if source:
            # Source records use 'image_url' (scrape CSV rows and hero_source_for_path output);
            # 'sourceUrl' is accepted as a fallback key.
            source_url=source.get('image_url') or source.get('sourceUrl') or ''
            source_file=source.get('filename') or basename_from_url(source_url)
            asset=None; match=''
            # Some hero repair mappings already name exact asset IDs.
            if component == 'Section / Hero' and f['path'] in hero_ready_by_path and hero_ready_by_path[f['path']].get('assetId') in asset_by_id:
                asset=asset_by_id[hero_ready_by_path[f['path']]['assetId']]; match='previous-hero-repair-map'
            # Otherwise match an existing asset by canonical filename key.
            if not asset:
                asset, match = choose_asset_for_source(source_url or source_file, prefer_page_folder=prefer_page_folder)
            if asset:
                status='ready-existing-asset'
                assetId=asset.get('id') or asset.get('asset_id')
                assetName=asset.get('displayName') or asset.get('originalFileName') or asset.get('filename') or source_file
                assetFolder=asset.get('folderId') or asset.get('parent_folder') or ''
            else:
                # No existing asset matches: the image has to be uploaded first.
                status='needs-upload'
                assetId=assetName=assetFolder=''
            # Proposed upload target: hero and linked-card images go to the page-images
            # folder with a '<path with / replaced by __>__<file>' display name ('home' for
            # the root path); everything else keeps its plain filename in the other folder.
            # The prefix is always built from the finding's own page path.
            folder = PAGE_IMAGES_FOLDER if component == 'Section / Hero' or prefer_page_folder else OTHER_STATIC_FOLDER
            prefix = f['path'].strip('/').replace('/','__') or 'home'
            proposed = (prefix+'__'+source_file) if (component == 'Section / Hero' or prefer_page_folder) else source_file
            records.append({**f, 'propLabel':label, 'propId':ep['propId'], 'status':status, 'reason':reason, 'sourceUrl':source_url, 'sourceFile':source_file, 'assetId':assetId, 'assetName':assetName, 'assetFolder':assetFolder, 'matchType':match, 'confidence':confidence, 'proposedUploadFolderId':folder, 'proposedUploadDisplayName':proposed, 'localDownloadPath':local_download_by_url.get(source_url,'')})
        else:
            # No source image could be determined; flag for a human.
            records.append({**f, 'propLabel':label, 'propId':ep['propId'], 'status':'manual-review', 'reason':reason or 'no source image found', 'sourceUrl':'', 'sourceFile':'', 'assetId':'', 'assetName':'', 'assetFolder':'', 'matchType':'', 'confidence':confidence})

# Write JSON/CSV/MD
# Block-local import (replaces the inline __import__ hack); aliased so it cannot
# clash with other module-level names.
import datetime as _dt

# Timezone-aware replacement for the deprecated datetime.utcnow(). tzinfo is
# dropped before formatting so the timestamp keeps its '<iso>Z' shape.
generated_at = _dt.datetime.now(_dt.timezone.utc).replace(tzinfo=None).isoformat() + 'Z'
plan = {
    'generatedAt': generated_at,
    'siteId': '6a1e37436b332da28ecc3001',
    'summary': dict(Counter(r['status'] for r in records)),
    'records': records,
}

# All outputs are written as UTF-8 explicitly: the Markdown (and possibly page
# titles in the CSV) contain non-ASCII characters such as '—' and '→', which a
# non-UTF-8 locale default could fail to encode.
json_path = OUT/'krb-empty-image-remediation-plan-v1.json'
json_path.write_text(json.dumps(plan, indent=2), encoding='utf-8')

csv_path = OUT/'krb-empty-image-remediation-plan-v1.csv'
fields = ['status','pageTitle','path','componentName','context','propLabel','propId','componentId','sourceFile','sourceUrl','assetId','assetName','assetFolder','matchType','confidence','reason','proposedUploadFolderId','proposedUploadDisplayName','localDownloadPath']
with open(csv_path, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=fields)
    w.writeheader()
    for r in records:
        # Not every record has every column (e.g. skipped cards lack upload fields).
        w.writerow({k: r.get(k, '') for k in fields})

# Markdown report: one section per status, in record order.
ready = [r for r in records if r['status'] == 'ready-existing-asset']
needs = [r for r in records if r['status'] == 'needs-upload']
skips = [r for r in records if r['status'] == 'skip-empty-card']
manual = [r for r in records if r['status'] == 'manual-review']
md = []
md.append('# KRB empty image prop remediation plan v1\n')
md.append(f'Generated: {plan["generatedAt"]}\n')
md.append('## Summary\n')
for k, v in plan['summary'].items():
    md.append(f'- {k}: {v}')
md.append('\n## Ready existing-asset writes\n')
for r in ready:
    md.append(f"- {r['pageTitle']} `{r['path']}` — {r['componentName']} / {r['propLabel']} → `{r['assetName']}` (`{r['assetId']}`), source `{r['sourceFile']}`, confidence {r['confidence']}")
md.append('\n## Needs upload before write\n')
for r in needs:
    md.append(f"- {r['pageTitle']} `{r['path']}` — {r['componentName']} / {r['propLabel']} needs `{r['sourceFile']}` from {r['sourceUrl']} → folder `{r.get('proposedUploadFolderId','')}`, proposed name `{r.get('proposedUploadDisplayName','')}`")
md.append('\n## Skipped empty repeated-card slots\n')
for r in skips:
    md.append(f"- {r['pageTitle']} `{r['path']}` — {r['componentName']} / {r['propLabel']}: {r['reason']}")
md.append('\n## Manual review\n')
for r in manual:
    md.append(f"- {r['pageTitle']} `{r['path']}` — {r['componentName']} / {r['propLabel']}: {r['reason']}")
md_path = OUT/'krb-empty-image-remediation-plan-v1.md'
md_path.write_text('\n'.join(md), encoding='utf-8')

print(json.dumps(plan['summary'], indent=2))
print(json_path)
print(csv_path)
print(md_path)
