from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import csv, json, re, zipfile

# Origin of the live site; relative URLs found in the scraped HTML are resolved against it.
BASE = 'https://www.krb.nsw.edu.au'

# Working directory that holds every input read and every artefact written by this script.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound')

# Inputs: per-page HTML files, the structured scrape JSON, the uploaded-asset CSV
# and the JavaScript file used as the template for the generated importer.
RAW = OUT.joinpath('krb-live-scrape-html-advancement-community-v1')
SCRAPE = OUT.joinpath('krb-advancement-community-live-scrape-v1.json')
CSV = OUT.joinpath('krb-image-grab/krb-webflow-other-static-images-uploaded-to-webflow.csv')
TEMPLATE = OUT.joinpath('krb-co-curricular-sport-image-repair-v3.js')

# Absolute page URL (with trailing slash) -> page id string, consulted by
# link_obj() to fill the 'pageId' field of a link.
# NOTE(review): the ids are presumably Webflow page ids - confirm.
PAGE_ID_BY_URL = {
    'https://www.krb.nsw.edu.au/community/parents-friends-association/': '6a2b909bfd42911cf787cc0f',
    'https://www.krb.nsw.edu.au/community/alumnae/': '6a2b909455245a9d7e518417',
    'https://www.krb.nsw.edu.au/community/weddings/': '6a2b90a19cfde20f4b4eba4b',
    'https://www.krb.nsw.edu.au/support/': '6a2b929655245a9d7e51fbd9',
    'https://www.krb.nsw.edu.au/news/': '6a1e37436b332da28ecc2fdd',
}

def clean(s):
    """Return *s* with non-breaking spaces converted and whitespace runs collapsed.

    Every run of whitespace becomes a single space and the result is trimmed.
    A falsy argument (None, '') yields ''.
    """
    text = s or ''
    text = text.replace('\xa0', ' ')
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
def compact(s):
    """Lower-case *s* and drop every character that is not a-z or 0-9.

    A falsy argument yields ''.
    """
    lowered = (s or '').lower()
    return re.sub(r'[^a-z0-9]+', '', lowered)
def filename(url):
    """Return the last '/'-separated segment of the path component of *url*.

    A falsy argument, or a path ending in '/', yields ''.
    """
    path = urlparse(url or '').path
    # Everything after the final '/', or the whole path when it has no '/'.
    return path.rpartition('/')[2]
def strip_ext(n):
    """Remove one trailing '.ext' suffix (letters/digits, any case) from *n*.

    Only the final extension is removed; a falsy argument yields ''.
    """
    name = n or ''
    return re.sub(r'\.[a-z0-9]+$', '', name, flags=re.I)
def strip_size(n):
    """Remove a '-WxH' or '-p-WxH' size suffix that sits directly before the extension.

    The suffix is only removed when an extension follows it; a name without an
    extension is returned unchanged. A falsy argument yields ''.
    """
    name = n or ''
    # The lookahead keeps the extension itself out of the match.
    size_suffix = r'-(?:p-)?\d+x\d+(?=\.[a-z0-9]+$)'
    return re.sub(size_suffix, '', name, flags=re.I)
def strip_scaled(n):
    """Remove a trailing '-scaled' or '-scaled-<digits>' marker from *n*.

    The marker must be at the very end, so callers pass a stem without its
    extension. Matching ignores case; a falsy argument yields ''.
    """
    stem = n or ''
    return re.sub(r'-scaled(?:-\d+)?$', '', stem, flags=re.I)
def asset_key_from_name(n):
    """Reduce a file name to the normalised key used to index and look up assets.

    Steps, in order: drop a leading 24-hex-digit + '_' prefix, drop a size
    suffix, drop the extension, drop a trailing '-scaled' marker, remove any
    remaining 'scaled' substring, then keep only lower-case letters and digits.
    A falsy argument yields ''.
    """
    # NOTE(review): the 24-hex prefix is presumably an id prepended to uploaded
    # file names - confirm against the CSV.
    without_prefix = re.sub(r'^[a-f0-9]{24}_', '', n or '', flags=re.I)
    unsized = strip_size(without_prefix)
    stem = strip_ext(unsized)
    stem = strip_scaled(stem)
    # Also removes 'scaled' anywhere else in the stem, not just as a suffix.
    stem = stem.replace('scaled', '')
    return compact(stem)
def variants(n):
    """List plausible alternative spellings of file name *n*, without duplicates.

    The 24-hex-digit prefix and any size suffix are removed first; the result
    contains that base name plus forms with the '-scaled' marker removed and
    added. Order is stable (first occurrence wins). A falsy argument yields [].
    """
    if not n:
        return []
    name = re.sub(r'^[a-f0-9]{24}_', '', n, flags=re.I)
    base = strip_size(name)
    stem = strip_ext(base)
    ext_match = re.search(r'\.[a-z0-9]+$', base, flags=re.I)
    ext = ext_match.group(0) if ext_match else ''
    unscaled = strip_scaled(stem)
    candidates = (
        base,
        stem + ext,
        unscaled + ext,
        stem + '-scaled' + ext,
        unscaled + '-scaled' + ext,
    )
    # dict.fromkeys keeps insertion order while discarding repeats.
    return list(dict.fromkeys(c for c in candidates if c))

# asset_rows: every CSV row in file order.
# by_key:     normalised asset key -> first CSV row that produced that key.
# fallbacks:  filled later by match_asset() for rows that were actually matched.
asset_rows = []
by_key = {}
fallbacks = {}
with CSV.open(newline='', encoding='utf-8') as f:
    for r in csv.DictReader(f):
        asset_rows.append(r)
        collapsed = (r.get('source_urls_collapsed') or '').split(' | ')
        # A row can be found via its own file name, its canonical URL, or any
        # of the ' | '-separated source URLs.
        names = [r.get('filename'), filename(r.get('canonical_url', '')), *map(filename, collapsed)]
        for n in names:
            for v in variants(n):
                k = asset_key_from_name(v)
                if k:
                    # Earlier rows win: an existing key is never overwritten.
                    by_key.setdefault(k, r)

def match_asset(url):
    """Resolve a scraped image URL to a row of the uploaded-asset index.

    Returns None for a falsy *url*. Otherwise returns a dict that always has
    'sourceUrl' (absolute) and 'sourceFile'; when a row in ``by_key`` matches,
    'displayName', 'assetUrl' and 'assetId' are added from that row.

    Side effect: a matched row that has an asset_id is recorded in the
    module-level ``fallbacks`` dict under the key derived from its file name.
    """
    if not url:
        return None
    full = urljoin(BASE, url)
    source_file = filename(full)
    spec = {'sourceUrl': full, 'sourceFile': source_file}

    # Try the raw file name first, then each variant; the first hit wins.
    candidate_keys = (asset_key_from_name(n) for n in [source_file, *variants(source_file)])
    row = next((by_key[k] for k in candidate_keys if k in by_key), None)
    if not row:
        return spec

    spec.update(
        displayName=row.get('filename'),
        assetUrl=row.get('canonical_url'),
        assetId=row.get('asset_id'),
    )
    fallback_key = asset_key_from_name(row.get('filename'))
    asset_id = row.get('asset_id')
    if fallback_key and asset_id:
        fallbacks[fallback_key] = {
            'assetId': asset_id,
            'displayName': row.get('filename'),
            'assetUrl': row.get('canonical_url'),
        }
    return spec

def urls_in(el):
    """Collect image URLs referenced inside *el*, as absolute URLs.

    Sources, per matched node and in this order: an <img>'s ``src``, a
    ``data-src`` attribute, and every ``url(...)`` in an inline ``style``.
    The result has no duplicates; URLs containing '-150x150' are moved to the
    end so that full-size images come first.
    """
    # NOTE(review): select() presumably searches descendants only, so a
    # style/data-src on `el` itself would not be picked up - confirm.
    found = []
    for node in el.select('[data-src], img[src], [style]'):
        src = node.get('src') if node.name == 'img' else None
        lazy_src = node.get('data-src')
        inline_style = node.get('style')
        if src:
            found.append(src)
        if lazy_src:
            found.append(lazy_src)
        if inline_style:
            found.extend(re.findall(r"url\(['\"]?([^)'\"]+)", inline_style))

    unique = list(dict.fromkeys(found))
    full_size = [u for u in unique if '-150x150' not in u]
    thumbnails = [u for u in unique if '-150x150' in u]
    return [urljoin(BASE, u) for u in full_size + thumbnails]

def link_obj(a):
    """Describe anchor element *a* as a link dict, or return None if *a* is falsy.

    The dict has 'text' (cleaned anchor text, 'Learn more' when empty), 'url'
    (absolute href), 'target' ('_self' when the attribute is absent) and
    'pageId' (looked up in PAGE_ID_BY_URL, else None).
    """
    if not a:
        return None
    href = urljoin(BASE, a.get('href', ''))
    # PAGE_ID_BY_URL keys end in '/', so try a slash-terminated form first
    # unless the href already ends in '/', has a query string, or is a mailto.
    leave_as_is = href.endswith('/') or '?' in href or href.startswith('mailto:')
    href_norm = href if leave_as_is else href + '/'
    label = clean(a.get_text(' ', strip=True)) or 'Learn more'
    target = a.get('target') or '_self'
    page_id = PAGE_ID_BY_URL.get(href_norm) or PAGE_ID_BY_URL.get(href)
    return {'text': label, 'url': href, 'target': target, 'pageId': page_id}

def html_inner(el):
    """Return the children of *el* rendered with str(), space-joined and trimmed."""
    rendered = map(str, el.contents)
    return ' '.join(rendered).strip()

# Build the per-page import spec.
#   scraped: page name -> structured scrape record (read from SCRAPE).
#   source:  page name -> spec with hero, ordered sections and manual notes.
#   manual:  flat list of every manual-follow-up message across all pages.
# Files are read explicitly as UTF-8 (matching the CSV reader) instead of
# relying on the platform's default encoding.
scraped = json.loads(SCRAPE.read_text(encoding='utf-8'))
source = {}
manual = []

for name, page in scraped.items():
    # The page's HTML file is named after a slug of the page name.
    html_file = RAW / (re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-') + '.html')
    soup = BeautifulSoup(html_file.read_text(encoding='utf-8', errors='ignore'), 'html.parser')

    # `or {}` also covers an explicit null in the scrape JSON.
    hero = page.get('hero') or {}
    page_source = {
        'sourceUrl': page['sourceUrl'],
        'meta': page['meta'],
        'hero': {
            'heading': hero.get('heading') or name,
            'image': None,
            'sourceChildNumber': hero.get('sourceChildNumber'),
        },
        'sections': [],
        'manual': [],
    }

    # Hero image: first image URL found inside the page header, if any.
    header = soup.select_one('.page-header')
    if header:
        us = urls_in(header)
        if us:
            page_source['hero']['image'] = match_asset(us[0])

    # Text content blocks come from the structured scrape, not from the HTML.
    for block in page.get('textContent') or []:
        page_source['sections'].append({
            'type': 'textContent',
            'component': 'Section / Text Content',
            'heading': block.get('heading') or name,
            'text': block.get('text') or '',
            'html': block.get('html') or block.get('text') or '',
            'image': None,
            'sourceChildNumber': block.get('sourceChildNumber'),
            'classes': block.get('classes') or [],
        })

    # Gateway cards: one Gateway CTA section per .article inside .gateway-rows.
    # NOTE(review): the literal sourceChildNumber values (5, 4, 5, 6) used for
    # HTML-derived sections are hard-coded; their meaning is not visible here.
    for block in soup.select('.gateway-rows'):
        child_no = 5
        for card in block.select('.article'):
            title_a = card.select_one('.article__title-link') or card.select_one('a[href]')
            link_a = card.select_one('.article__link[href]') or title_a
            desc = clean((card.select_one('.article__desc') or card).get_text(' ', strip=True))
            title = clean(title_a.get_text(' ', strip=True)) if title_a else ''
            # When the description was taken from the whole card it also holds
            # the title and the button label; trim both off.
            if title and desc.startswith(title):
                desc = desc[len(title):].strip()
            if desc.endswith('Learn more'):
                desc = desc[:-len('Learn more')].strip()
            img_url = (urls_in(card) or [None])[0]
            page_source['sections'].append({
                'type': 'gatewayCta',
                'component': 'Section / Gateway CTA',
                'sourceChildNumber': child_no,
                'title': title,
                'heading': title,
                'paragraph': desc,
                'buttonText': clean(link_a.get_text(' ', strip=True)) if link_a else 'Learn more',
                'link': link_obj(link_a),
                'image': match_asset(img_url) if img_url else None,
            })

    # Accordions become FAQ sections; an accordion with no items is skipped.
    for acc in soup.select('.accordion'):
        items = []
        for item in acc.select('.accordion__item'):
            title_el = (
                item.select_one('.accordion__item__title span')
                or item.select_one('.accordion__item__title')
                or item
            )
            title = clean(title_el.get_text(' ', strip=True))
            cont = item.select_one('.accordion__item__content') or item
            items.append({
                'title': title,
                'heading': title,
                'html': html_inner(cont),
                'text': clean(cont.get_text(' ', strip=True)),
                'sourceChildNumber': 4,
            })
        if items:
            heading_el = acc.select_one('.block-header__title') or acc.select_one('h2') or acc
            heading = clean(heading_el.get_text(' ', strip=True))
            page_source['sections'].append({
                'type': 'faqs',
                'component': 'Section / FAQs',
                'heading': heading or name,
                'paragraph': '',
                'items': items,
                'sourceChildNumber': 4,
            })

    # Downloads: one CTA section per file listed in the structured scrape.
    for d in page.get('downloads') or []:
        for file in d.get('files') or []:
            label = file.get('label') or 'Download'
            page_source['sections'].append({
                'type': 'cta',
                'component': 'Section / CTA',
                'heading': label,
                'paragraph': '',
                'html': '',
                'links': [{'text': label, 'url': file.get('url'), 'target': '_blank', 'pageId': None}],
                'image': None,
                'sourceChildNumber': d.get('sourceChildNumber'),
            })

    # Galleries: an image cluster built from the first four image URLs.
    for gal in soup.select('.gallery-slider'):
        imgs = [match_asset(u) for u in urls_in(gal)[:4]]
        if imgs:
            page_source['sections'].append({
                'type': 'imageCluster',
                'component': 'Section / Image Cluster',
                'heading': '',
                'paragraph': '',
                'images': imgs,
                'sourceChildNumber': 5,
            })

    # Call-to-action blocks: body text/HTML plus every link inside the block.
    for cta in soup.select('.call-to-action'):
        body = cta.select_one('.cta__body') or cta
        links = [link_obj(a) for a in cta.select('a[href]')]
        page_source['sections'].append({
            'type': 'cta',
            'component': 'Section / CTA',
            'heading': '',
            'paragraph': clean(body.get_text(' ', strip=True)),
            'html': html_inner(body),
            'links': [link for link in links if link],
            'image': None,
            'sourceChildNumber': 6,
        })

    # Featured stories are not imported; record a note for manual follow-up.
    if soup.select_one('.featured-stories'):
        msg = f"{name}: featured-stories block is dynamic/CMS-like; left for manual/CMS mapping rather than static import."
        page_source['manual'].append(msg)
        manual.append(msg)

    source[name] = page_source

# expected: page name -> {component name: number of sections using it}.
expected = {}
for name, page in source.items():
    tally = expected.setdefault(name, {})
    for sec in page['sections']:
        component = sec['component']
        tally[component] = tally.get(component, 0) + 1

# Persist the full spec and the expected component counts as JSON.
# ensure_ascii=False emits raw non-ASCII characters, so the files are written
# explicitly as UTF-8 rather than in the platform's default encoding.
full_json = OUT / 'krb-advancement-community-full-sections-v1.json'
full_json.write_text(json.dumps(source, ensure_ascii=False, indent=2), encoding='utf-8')
expected_json = OUT / 'krb-advancement-community-expected-v1.json'
expected_json.write_text(json.dumps(expected, ensure_ascii=False, indent=2), encoding='utf-8')

# Load the template importer script. Read explicitly as UTF-8 so the result
# does not depend on the platform's default encoding.
js = TEMPLATE.read_text(encoding='utf-8')
def replace_const(text, name, value):
    """Replace the single-line ``  const NAME = ...`` declaration in *text*.

    The first line starting with two spaces followed by ``const NAME = `` is
    rewritten as ``  const NAME = <compact JSON of value>;``. Everything from
    the declaration up to (not including) the next newline is discarded, so the
    declaration in the template must fit on one line.

    Args:
        text: JavaScript source to patch.
        name: Identifier of the constant to replace.
        value: JSON-serialisable value to embed.

    Returns:
        The patched source text.

    Raises:
        RuntimeError: if no such declaration exists (the message is *name*).
    """
    prefix = f'  const {name} = '
    start = text.find(prefix)
    if start < 0:
        raise RuntimeError(name)
    end = text.find('\n', start)
    if end < 0:
        # The declaration is the last line and has no trailing newline. Without
        # this guard, text[-1:] would re-append the final character.
        end = len(text)
    payload = json.dumps(value, ensure_ascii=False, separators=(',', ':'))
    return f'{text[:start]}{prefix}{payload};{text[end:]}'
# Inject the generated data into the template's constants.
js = replace_const(js, 'SOURCE', source)
js = replace_const(js, 'EXPECTED', expected)
js = replace_const(js, 'ASSET_FALLBACKS', fallbacks)

# Run-time options embedded in the importer; dryRun stays on by default.
config = {
    'dryRun': True,
    'runMode': 'all-pages',
    'insertMissingTopLevelSections': True,
    'includeHeroImages': True,
    'includeSectionImages': True,
    'stopOnError': False,
    'switchDelayMs': 1200,
    'resultPrefix': 'KRB_ADVANCEMENT_COMMUNITY_FULL_SECTION_IMPORT_V1_RESULT',
    'allPagesResultPrefix': 'KRB_ADVANCEMENT_COMMUNITY_FULL_SECTION_IMPORT_V1_ALL_RESULT',
}
js = replace_const(js, 'CONFIG', config)

# Re-label strings inherited from the template. Order matters: the
# 'KRB full-section v2' replacement must run before the bare 'full-section v2:'.
for old_label, new_label in (
    (
        'KRB Co-curricular & Sport full-section Design API Playground importer v2. No publish/delete.',
        'KRB Advancement & Community full-section Design API Playground importer v1. No publish/delete.',
    ),
    ('current-page-image-repair-v3', 'current-page-advancement-community-full-section-v1'),
    ('all-pages-image-repair-v3', 'all-pages-advancement-community-full-section-v1'),
    ('KRB full-section v2', 'KRB advancement/community full-section v1'),
    ('full-section v2:', 'advancement/community full-section v1:'),
):
    js = js.replace(old_label, new_label)

# The script embeds raw non-ASCII JSON (ensure_ascii=False), so write it
# explicitly as UTF-8 rather than in the platform's default encoding.
js_path = OUT / 'krb-advancement-community-full-section-design-api-import-v1.js'
js_path.write_text(js, encoding='utf-8')
# Write the Markdown run notes that accompany the importer. The text contains
# non-ASCII characters (em dashes, raw JSON), so it is written explicitly as
# UTF-8 rather than in the platform's default encoding.
notes = OUT / 'krb-advancement-community-full-section-design-api-import-v1-notes.md'
notes_head = (
    "# KRB Advancement & Community full-section importer v1\n\n"
    "Dry-run-first Code Lab / Design API Playground importer for:\n\n"
    "- Advancement & Community — `/community`\n"
    "- Alumnae — `/community/alumnae`\n"
    "- Parents & Friends Association — `/community/parents-friends-association`\n"
    "- Weddings — `/community/weddings`\n\n"
    "## Safety\n\n"
    "- `dryRun: true` by default.\n"
    "- No publish, delete, remove, redirect, site settings, CMS schema, global styles, or visibility cleanup.\n"
    "- Inserts missing top-level sections only where component counts are lower than expected.\n"
    "- Uses existing Webflow assets from `krb-webflow-other-static-images-uploaded-to-webflow.csv`; does not upload new assets.\n"
    "- Image prop handling is based on the v3 image-object fallback pattern from the previous Co-curricular repair.\n\n"
    "## Expected section counts\n\n"
    "```json\n"
    f"{json.dumps(expected, indent=2, ensure_ascii=False)}\n"
    "```\n\n"
    "## Manual / known gaps\n\n"
)
# One bullet per manual follow-up message, or a fixed line when there are none.
notes_gaps = '\n'.join(f'- {m}' for m in manual) if manual else '- None identified in source read.'
notes_tail = (
    "\n\n## Run marker\n\n"
    "Dry-run output marker: `KRB_ADVANCEMENT_COMMUNITY_FULL_SECTION_IMPORT_V1_ALL_RESULT`\n\n"
    "After dry-run review, change only `dryRun: true` to `dryRun: false` for live run.\n"
)
notes.write_text(notes_head + notes_gaps + notes_tail, encoding='utf-8')
# Bundle the generated artefacts plus two pre-existing files from OUT into one
# zip (each stored under its bare file name), then print a JSON run summary.
zip_path = OUT / 'krb-advancement-community-full-section-design-api-import-v1.zip'
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
    for member in (
        js_path,
        notes,
        full_json,
        expected_json,
        OUT / 'krb-advancement-community-batch-v1.json',
        OUT / 'krb-advancement-community-live-scrape-v1-attention.csv',
    ):
        z.write(member, member.name)

summary = {
    'source': str(full_json),
    'expected': expected,
    'manual': manual,
    'js': str(js_path),
    'zip': str(zip_path),
    'fallbacks': len(fallbacks),
}
print(json.dumps(summary, indent=2, ensure_ascii=False))