from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import csv, json, re, zipfile

# Site origin; relative asset and link URLs are resolved against it.
BASE = 'https://www.krb.nsw.edu.au'

# Working directory that holds every input and output artefact of this script.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound')
# Raw HTML snapshots; the page loop reads `<slug>.html` from here.
RAW = OUT / 'krb-live-scrape-html'
# Structured scrape (JSON) keyed by page name.
SCRAPE = OUT / 'krb-admissions-child-pages-live-scrape-v1.json'
# Asset inventories; missing files are skipped when the asset index is built.
CSVS = [
    OUT / 'krb-image-grab' / 'krb-webflow-other-static-images-uploaded-to-webflow.csv',
    OUT / 'krb-image-grab' / 'krb-webflow-page-images-uploaded-to-webflow.csv',
]
# Importer JS used as the template whose constants get replaced.
TEMPLATE = OUT / 'krb-advancement-community-full-section-design-api-import-v1.js'
# Pre-existing artefacts that are bundled into the output zip unchanged.
BATCH = OUT / 'krb-admissions-child-pages-batch-v1.json'
ATTENTION = OUT / 'krb-admissions-child-pages-live-scrape-v1-attention.csv'

# Canonical (trailing-slash) page URL -> page id, used to fill `pageId` on links.
# NOTE(review): presumably Webflow page ids - confirm against the target site.
PAGE_ID_BY_URL = {
    'https://www.krb.nsw.edu.au/admissions/admissions-enquiry/': '6a2b905fabf3beeca90a3e79',
    'https://www.krb.nsw.edu.au/admissions/book-a-call-with-admissions/': '6a2b90664c4cb4583d6a9e0a',
    'https://www.krb.nsw.edu.au/admissions/discovery-mornings/': '6a2b906c3fac7258e1e05b31',
    'https://www.krb.nsw.edu.au/admissions/enrolment/': '6a2b9073c109aa58bcedb001',
    'https://www.krb.nsw.edu.au/admissions/fees/': '6a2b90798235360ef3ac3110',
    'https://www.krb.nsw.edu.au/admissions/scholarships-bursaries/': '6a2b90804b3ae330a69bd58b',
    'https://www.krb.nsw.edu.au/admissions/virtual-tours/': '6a2b908641ccee79f62de38c',
    'https://www.krb.nsw.edu.au/admissions/': '6a2b909fc0baa42668259da8',
}

def clean(s):
    """Collapse whitespace runs (including non-breaking spaces) to one space and trim.

    Falsy input such as ``None`` yields ``''``.
    """
    text = (s or '').replace('\xa0', ' ')
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
def compact(s):
    """Lower-case *s* and drop every character outside ``a-z0-9``; falsy input gives ``''``."""
    lowered = (s or '').lower()
    return re.sub(r'[^a-z0-9]+', '', lowered)
def filename(url):
    """Return the last path segment of *url*; query string and fragment are not part of it.

    A path ending in ``/`` (or falsy input) yields ``''``.
    """
    path = urlparse(url or '').path
    return path.rpartition('/')[2]
def strip_ext(n):
    """Remove one trailing ``.ext`` (alphanumeric, case-insensitive) from *n*; falsy input gives ``''``."""
    name = n or ''
    return re.sub(r'\.[a-z0-9]+$', '', name, flags=re.I)
def strip_size(n):
    """Drop a ``-WxH`` or ``-p-WxH`` size suffix that sits directly before the file extension.

    Names without an extension are returned unchanged; falsy input gives ``''``.
    """
    name = n or ''
    return re.sub(r'-(?:p-)?\d+x\d+(?=\.[a-z0-9]+$)', '', name, flags=re.I)
def strip_scaled(n):
    """Remove a trailing ``-scaled`` or ``-scaled-<digits>`` marker (case-insensitive).

    The marker must end the string, so pass a stem without its extension.
    """
    stem = n or ''
    return re.sub(r'-scaled(?:-\d+)?$', '', stem, flags=re.I)
def asset_key_from_name(n):
    """Reduce an asset file name to the normalised key used by the asset index.

    Steps: drop a leading 24-hex-character ``<id>_`` prefix, drop a size
    suffix, drop the extension and a trailing ``-scaled`` marker, remove any
    remaining literal ``scaled``, then keep only lower-case ``a-z0-9``.
    NOTE(review): the 24-hex prefix is presumably a Webflow asset id - confirm.
    """
    unprefixed = re.sub(r'^[a-f0-9]{24}_', '', n or '', flags=re.I)
    stem = strip_ext(strip_size(unprefixed))
    stem = strip_scaled(stem)
    # The substring removal applies anywhere in the stem, not just at the end.
    return compact(stem.replace('scaled', ''))
def variants(n):
    """List alternative spellings of file name *n*, de-duplicated in first-seen order.

    The id prefix and size suffix are removed first; the candidates are then
    the name as-is, without a ``-scaled`` marker, and with one appended.
    Falsy input yields ``[]``.
    """
    if not n:
        return []
    base = strip_size(re.sub(r'^[a-f0-9]{24}_', '', n, flags=re.I))
    stem = strip_ext(base)
    ext_match = re.search(r'\.[a-z0-9]+$', base, flags=re.I)
    ext = ext_match.group(0) if ext_match else ''
    unscaled = strip_scaled(stem)
    candidates = (
        base,
        stem + ext,
        unscaled + ext,
        stem + '-scaled' + ext,
        unscaled + '-scaled' + ext,
    )
    # dict.fromkeys keeps the first occurrence of each non-empty candidate.
    return list(dict.fromkeys(c for c in candidates if c))

# Asset index built from the inventory CSVs.
#   asset_rows - every CSV row, in read order
#   by_key     - normalised file-name key -> first row that produced it
#   fallbacks  - filled later by match_asset() for rows that carry an asset id
asset_rows = []
by_key = {}
fallbacks = {}
for asset_csv in CSVS:
    if asset_csv.exists():
        with asset_csv.open(newline='', encoding='utf-8') as handle:
            for row in csv.DictReader(handle):
                asset_rows.append(row)
                # Every name the asset is known by: its own file name plus the
                # last path segment of each URL recorded for it.
                candidate_names = [row.get('filename'), filename(row.get('canonical_url', ''))]
                for column in ('source_urls_collapsed', 'sample_page_urls'):
                    candidate_names += [filename(u) for u in (row.get(column) or '').split(' | ')]
                for candidate in candidate_names:
                    for variant in variants(candidate):
                        key = asset_key_from_name(variant)
                        if key:
                            # First row wins; later rows never overwrite a key.
                            by_key.setdefault(key, row)

def match_asset(url):
    """Describe the image at *url*, enriched with asset details when the index knows it.

    Returns ``None`` for a falsy URL. Otherwise returns a dict with
    ``sourceUrl`` and ``sourceFile``; when the file name (or one of its
    variants) is in ``by_key`` the dict also carries ``displayName``,
    ``assetUrl`` and ``assetId``. As a side effect, a matched row that has an
    ``asset_id`` is recorded in the module-level ``fallbacks`` map.
    """
    if not url:
        return None
    full = urljoin(BASE, url)
    source_file = filename(full)
    spec = {'sourceUrl': full, 'sourceFile': source_file}
    candidate_keys = (asset_key_from_name(c) for c in [source_file, *variants(source_file)])
    row = next((by_key[key] for key in candidate_keys if key in by_key), None)
    if not row:
        return spec
    spec['displayName'] = row.get('filename')
    spec['assetUrl'] = row.get('canonical_url')
    spec['assetId'] = row.get('asset_id')
    fallback_key = asset_key_from_name(row.get('filename'))
    if fallback_key and row.get('asset_id'):
        fallbacks[fallback_key] = {
            'assetId': row.get('asset_id'),
            'displayName': row.get('filename'),
            'assetUrl': row.get('canonical_url'),
        }
    return spec

def urls_in(el):
    """Collect absolute image URLs referenced under *el*.

    Sources are ``<img src>``, any ``data-src`` attribute and ``url(...)``
    references inside inline ``style`` attributes. Duplicates are dropped.
    URLs containing ``-150x150`` are not discarded, only moved behind all
    other URLs. Returns ``[]`` when *el* is falsy.
    """
    if not el:
        return []
    found = []
    for node in el.select('[data-src], img[src], [style]'):
        src = node.get('src')
        if node.name == 'img' and src:
            found.append(src)
        lazy_src = node.get('data-src')
        if lazy_src:
            found.append(lazy_src)
        inline_style = node.get('style')
        if inline_style:
            found.extend(re.findall(r"url\(['\"]?([^)'\"]+)", inline_style))
    # Pass 1: unique URLs without the thumbnail marker, in document order.
    ordered = dict.fromkeys(u for u in found if '-150x150' not in u)
    # Pass 2: append whatever is left (the thumbnail URLs), still unique.
    for u in found:
        ordered.setdefault(u)
    return [urljoin(BASE, u) for u in ordered]

def link_obj(a):
    """Convert an anchor element into a link dict (``text``, ``url``, ``target``, ``pageId``).

    Returns ``None`` when *a* is falsy. ``pageId`` is looked up in
    ``PAGE_ID_BY_URL`` first by the trailing-slash form of the URL and then
    by the URL as written; it is ``None`` when neither is known.
    """
    if not a:
        return None
    href = urljoin(BASE, a.get('href', ''))
    # URLs that already end in "/", carry a query string or are mailto: links
    # are looked up unchanged; everything else gets a trailing slash.
    keep_as_is = href.endswith('/') or '?' in href or href.startswith('mailto:')
    href_norm = href if keep_as_is else href + '/'
    label = clean(a.get_text(' ', strip=True))
    return {
        'text': label or 'Learn more',
        'url': href,
        'target': a.get('target') or '_self',
        'pageId': PAGE_ID_BY_URL.get(href_norm) or PAGE_ID_BY_URL.get(href),
    }
def html_inner(el):
    """Serialise the children of *el* joined by single spaces and trimmed; ``''`` for falsy *el*."""
    if not el:
        return ''
    pieces = [str(child) for child in el.contents]
    return ' '.join(pieces).strip()
def slugish(s):
    """Lower-case *s*, turn each run of non ``a-z0-9`` characters into ``-`` and trim hyphens."""
    hyphenated = re.sub(r'[^a-z0-9]+', '-', s.lower())
    return hyphenated.strip('-')
def remove_selectors(el, selectors):
    """Return the inner HTML of *el* with every node matching *selectors* removed.

    The work is done on a re-parsed copy of ``str(el)``, so *el* itself is
    left untouched.
    """
    copy = BeautifulSoup(str(el), 'html.parser')
    # First tag of the copy; falls back to the whole parsed copy when there is none.
    top = copy.find()
    for selector in selectors:
        for match in copy.select(selector):
            match.decompose()
    return html_inner(top or copy)

def cta_section(cta, source_child):
    """Build a ``Section / CTA`` dict from a call-to-action element.

    Args:
        cta: The CTA element; ``.cta__body`` is used as the body when present,
            otherwise the element itself.
        source_child: Source child number to record on the section (may be ``None``).

    Returns:
        A section dict with the heading, plain-text paragraph, body HTML
        (headings removed), all links found in the CTA and the first image.
        When the body has no heading element, the first 80 characters of the
        body text are used as the heading.
    """
    body = cta.select_one('.cta__body') or cta
    # select_one() returns None when the CTA has no heading. The previous
    # `(... or '').get_text(...)` fallback raised AttributeError on str, so
    # heading-less CTAs crashed instead of using the body-text fallback below.
    heading_el = body.select_one('h1,h2,h3,h4,h5,h6')
    heading = clean(heading_el.get_text(' ', strip=True)) if heading_el else ''
    # Body without its headings; fall back to the full body if that leaves nothing.
    body_html = remove_selectors(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) or html_inner(body)
    links = [link_obj(a) for a in cta.select('a[href]')]
    img_url = (urls_in(cta) or [None])[0]
    return {
        'type': 'cta',
        'component': 'Section / CTA',
        'heading': heading or clean(body.get_text(' ', strip=True))[:80],
        'paragraph': clean(BeautifulSoup(body_html, 'html.parser').get_text(' ', strip=True)),
        'html': body_html,
        'links': [l for l in links if l],
        'image': match_asset(img_url) if img_url else None,
        'sourceChildNumber': source_child,
    }

def video_section(video, source_child):
    """Build a video section dict (rendered via ``Section / Two Column Text & Image``).

    The heading comes from the block title or first ``h2``/``h3``, falling
    back to the whole block's text and finally to ``'Video'``. ``videoEmbed``
    is the serialised ``<iframe>`` and ``videoUrl`` its ``data-src``/``src``;
    both are ``''`` when the block has no iframe.
    """
    title_el = video.select_one('.block-header__title,h2,h3') or video
    heading = clean(title_el.get_text(' ', strip=True))
    iframe = video.select_one('iframe')
    if iframe:
        src = iframe.get('data-src') or iframe.get('src') or ''
        embed = str(iframe)
    else:
        src = ''
        embed = src
    return {
        'type': 'video',
        'component': 'Section / Two Column Text & Image',
        'heading': heading or 'Video',
        'text': '',
        'html': '',
        'videoEmbed': embed,
        'videoUrl': src,
        'image': None,
        'sourceChildNumber': source_child,
    }

def gallery_cluster(gal, heading='', paragraph='', source_child=None, limit=4):
    """Build a ``Section / Image Cluster`` dict from the first *limit* images under *gal*.

    Returns ``None`` when no image URL is found.
    """
    images = [match_asset(u) for u in urls_in(gal)[:limit]]
    if images:
        return {
            'type': 'imageCluster',
            'component': 'Section / Image Cluster',
            'heading': heading,
            'paragraph': paragraph,
            'images': images,
            'sourceChildNumber': source_child,
        }
    return None

# Build the per-page section model (`source`) from the scrape JSON plus each page's saved raw HTML.
# `manual` accumulates every manual-verification note across all pages.
# Inputs are decoded as UTF-8 explicitly (as the asset CSVs are) instead of relying on the platform locale.
scraped=json.loads(SCRAPE.read_text(encoding='utf-8'))
source={}; manual=[]
for name,page in scraped.items():
    html_file=RAW/(slugish(name)+'.html')
    soup=BeautifulSoup(html_file.read_text(encoding='utf-8',errors='ignore'),'html.parser')
    page_source={'sourceUrl':page['sourceUrl'],'meta':page['meta'],'hero':{'heading':page.get('hero',{}).get('heading') or name,'image':None,'sourceChildNumber':page.get('hero',{}).get('sourceChildNumber')},'sections':[],'manual':[]}
    # Queue the source child numbers of the scraper's "attention" blocks per block kind, in scrape
    # order, so the HTML-derived sections below can claim them one at a time via next_child().
    att_queues={}
    for att in page.get('attention',[]):
        cls=' '.join(att.get('classes') or [])
        key='other'
        if 'call-to-action' in cls: key='cta'
        elif 'video' in cls: key='video'
        elif 'gallery-slider' in cls: key='gallery'
        elif 'table' in cls: key='table'
        elif 'quicklinks-files' in cls: key='quicklinks'
        elif 'blockquote' in cls: key='blockquote'
        att_queues.setdefault(key,[]).append(att.get('sourceChildNumber'))
    def next_child(key):
        """Pop the next queued source child number for *key*, or None when the queue is empty."""
        q=att_queues.setdefault(key,[])
        return q.pop(0) if q else None
    # Hero image: first image URL found in the page header.
    header=soup.select_one('.page-header')
    us=urls_in(header)
    if us: page_source['hero']['image']=match_asset(us[0])
    # Text blocks from the scrape; images the scraper stripped out are re-added as an image cluster.
    for block in page.get('textContent',[]):
        page_source['sections'].append({'type':'textContent','component':'Section / Text Content','heading':block.get('heading') or name,'text':block.get('text') or '', 'html':block.get('html') or block.get('text') or '', 'image':None,'sourceChildNumber':block.get('sourceChildNumber'), 'classes':block.get('classes') or []})
        stripped=[match_asset((img or {}).get('sourceUrl')) for img in (block.get('imagesStripped') or []) if (img or {}).get('sourceUrl')]
        if stripped:
            page_source['sections'].append({'type':'imageCluster','component':'Section / Image Cluster','heading':block.get('heading') or name,'paragraph':'Source image(s) stripped from rich content; verify placement visually.', 'images':stripped[:4], 'sourceChildNumber':block.get('sourceChildNumber')})
    # Two-column blocks: first image goes on the section, a multi-image block also yields a cluster.
    for block in page.get('twoColumn',[]):
        imgs=[match_asset((i or {}).get('sourceUrl')) for i in block.get('images',[]) if (i or {}).get('sourceUrl')]
        page_source['sections'].append({'type':'twoColumn','component':'Section / Two Column Text & Image','heading':block.get('heading') or '', 'text':block.get('text') or '', 'html':block.get('html') or block.get('text') or '', 'image':imgs[0] if imgs else None, 'sourceChildNumber':block.get('sourceChildNumber')})
        if len(imgs) > 1:
            page_source['sections'].append({'type':'imageCluster','component':'Section / Image Cluster','heading':block.get('heading') or '', 'paragraph':'Additional source gallery images from this section.', 'images':imgs[:4], 'sourceChildNumber':block.get('sourceChildNumber')})
    # Gateway rows: one Gateway CTA per article card.
    for block in soup.select('.gateway-rows'):
        for card in block.select('.article'):
            title_a=card.select_one('.article__title-link') or card.select_one('a[href]')
            link_a=card.select_one('.article__link[href]') or title_a
            desc=clean((card.select_one('.article__desc') or card).get_text(' ',strip=True))
            title=clean(title_a.get_text(' ',strip=True)) if title_a else ''
            # When the description fell back to the whole card text, trim the title and button label off it.
            if title and desc.startswith(title): desc=desc[len(title):].strip()
            if desc.endswith('Learn more'): desc=desc[:-len('Learn more')].strip()
            img_url=(urls_in(card) or [None])[0]
            # NOTE(review): sourceChildNumber is hard-coded to 14 for every gateway card on every page - confirm this matches the source layout.
            page_source['sections'].append({'type':'gatewayCta','component':'Section / Gateway CTA','sourceChildNumber':14,'title':title,'heading':title,'paragraph':desc,'buttonText':clean(link_a.get_text(' ',strip=True)) if link_a else 'Learn more','link':link_obj(link_a),'image':match_asset(img_url) if img_url else None})
    # Accordions map to Section / FAQs with Accordion Item children.
    for acc in page.get('accordions',[]):
        items=[]
        for idx,it in enumerate(acc.get('items',[]) or []):
            item_soup=BeautifulSoup(it.get('html') or '', 'html.parser')
            title_el=item_soup.select_one('.accordion__item__title span') or item_soup.select_one('.accordion__item__title')
            content_el=item_soup.select_one('.accordion__item__content') or item_soup
            title=clean(title_el.get_text(' ',strip=True)) if title_el else (it.get('heading') or f'Item {idx+1}')
            items.append({'title':title,'html':html_inner(content_el),'text':clean(content_el.get_text(' ',strip=True))})
        if items:
            page_source['sections'].append({'type':'faqs','component':'Section / FAQs','heading':acc.get('heading') or 'FAQs','paragraph':'','items':items,'sourceChildNumber':acc.get('sourceChildNumber')})
    # Downloads as individual CTAs
    for d in page.get('downloads',[]):
        for file in d.get('files',[]):
            page_source['sections'].append({'type':'cta','component':'Section / CTA','heading':file.get('label') or 'Download','paragraph':'','html':'','links':[{'text':file.get('label') or 'Download','url':file.get('url'),'target':'_blank','pageId':None}], 'image':None, 'sourceChildNumber':d.get('sourceChildNumber')})
    # Table blocks are preserved as Text Content sections with source HTML.
    for table_block in soup.select('.table'):
        heading_el=table_block.select_one('.block-header__title,h2,h3')
        heading=clean(heading_el.get_text(' ',strip=True)) if heading_el else ''
        body=table_block.select_one('.table__container') or table_block
        page_source['sections'].append({'type':'textContent','component':'Section / Text Content','heading':heading or 'Fees table','text':clean(body.get_text(' ',strip=True)), 'html':html_inner(body),'image':None,'sourceChildNumber':next_child('table')})
    # Quicklinks/file-list blocks become one CTA per file/link so PDF/download URLs are not flattened away.
    for qf in soup.select('.quicklinks-files'):
        source_child=next_child('quicklinks')
        links=[link_obj(a) for a in qf.select('a[href]')]
        for l in [x for x in links if x]:
            page_source['sections'].append({'type':'cta','component':'Section / CTA','heading':l.get('text') or 'Download','paragraph':'','html':'','links':[l], 'image':None, 'sourceChildNumber':source_child})
        if links:
            msg=f"{name}: quicklinks/files source child {source_child or '?'} mapped to {len(links)} CTA download/link section(s); verify download presentation."
            page_source['manual'].append(msg); manual.append(msg)
    # CTA blocks from unmatched/source HTML
    for cta in soup.select('.call-to-action'):
        page_source['sections'].append(cta_section(cta, next_child('cta')))
    # Videos mapped to Two Column Text & Image video embed props
    for video in soup.select('.video'):
        page_source['sections'].append(video_section(video, next_child('video')))
    # Tabbed content expanded into static Text Content/Image Cluster sections.
    for tabbed in soup.select('.tabbed-content'):
        # select_one() returns None when nothing matches; guard before get_text(). The previous
        # `(... or '').get_text(...)` raised AttributeError for a tab block without a header.
        heading_el=tabbed.select_one('.block-header__title') or tabbed.select_one('h2')
        heading=clean(heading_el.get_text(' ',strip=True)) if heading_el else ''
        tab_titles=[clean(t.get_text(' ',strip=True)) for t in tabbed.select('.tabbed-content__tab .tabbed-content__title')]
        for idx,item in enumerate(tabbed.select('.tabbed-content__content')):
            if idx < len(tab_titles):
                title=tab_titles[idx]
            else:
                # No tab label for this pane: use its own h3/h4, or '' when it has none
                # (same None guard as for the block heading above).
                title_el=item.select_one('h3,h4')
                title=clean(title_el.get_text(' ',strip=True)) if title_el else ''
            body=item.select_one('.tabbed-content__body') or item
            full_heading=(heading + ' — ' + title).strip(' —')
            page_source['sections'].append({'type':'textContent','component':'Section / Text Content','heading':full_heading or title or heading or 'Tabbed content','text':clean(body.get_text(' ',strip=True)),'html':html_inner(body),'image':None,'sourceChildNumber':None})
            cl=gallery_cluster(item, full_heading, '', None, 4)
            if cl: page_source['sections'].append(cl)
        msg=f"{name}: tabbed-content source block expanded into static text/image sections; verify whether a native tab interaction is required."
        if tabbed: page_source['manual'].append(msg); manual.append(msg)
    # Blockquote sections are preserved as text content with a manual visual-verification note.
    for bq in soup.select('.blockquote'):
        source_child=next_child('blockquote')
        quote_html=html_inner(bq)
        quote_text=clean(bq.get_text(' ',strip=True))
        page_source['sections'].append({'type':'textContent','component':'Section / Text Content','heading':'Quote','text':quote_text,'html':quote_html,'image':None,'sourceChildNumber':source_child})
        msg=f"{name}: blockquote source child {source_child or '?'} mapped to Text Content; verify visual quote treatment."
        page_source['manual'].append(msg); manual.append(msg)
    # Standalone galleries that were not already included by text-image/tabbed are inserted as image clusters.
    for gal in soup.select('.gallery-slider'):
        if gal.find_parent(class_='text-image__image') or gal.find_parent(class_='tabbed-content__content'):
            continue
        cl=gallery_cluster(gal, '', '', next_child('gallery'), 4)
        if cl: page_source['sections'].append(cl)
    # Keep source-attention records as manual review items.
    for att in page.get('attention',[]):
        cls=' '.join(att.get('classes') or [])
        # Some are mapped above; still useful as verification note.
        msg=f"{name}: source child {att.get('sourceChildNumber')} ({cls}) was not natively recognised by scraper; mapped where safe, verify visually."
        page_source['manual'].append(msg); manual.append(msg)
    # Order sections by source child number (unknown numbers last), then component, then heading/title.
    page_source['sections'].sort(key=lambda sec: (9999 if sec.get('sourceChildNumber') is None else sec.get('sourceChildNumber'), sec.get('component') or '', sec.get('heading') or sec.get('title') or ''))
    source[name]=page_source

# Per-page tally of how many sections use each component, e.g. {'Fees': {'Section / CTA': 3}}.
expected={}
for name,page in source.items():
    counts={}
    for sec in page['sections']:
        counts[sec['component']]=counts.get(sec['component'],0)+1
    expected[name]=counts

# Both dumps use ensure_ascii=False, so the text can contain non-ASCII characters.
# Write them as UTF-8 explicitly rather than with the platform's locale encoding.
full_json=OUT/'krb-admissions-child-pages-full-sections-v1.json'
full_json.write_text(json.dumps(source,ensure_ascii=False,indent=2),encoding='utf-8')
expected_json=OUT/'krb-admissions-child-pages-expected-v1.json'
expected_json.write_text(json.dumps(expected,ensure_ascii=False,indent=2),encoding='utf-8')

# Load the importer template as UTF-8 explicitly so decoding does not depend on the platform locale.
js=TEMPLATE.read_text(encoding='utf-8')
def replace_const(text,name,value):
    """Replace the single-line ``  const NAME = ...`` declaration in *text*.

    The declaration is located by its two-space-indented ``const NAME = ``
    prefix and is replaced up to the end of that line with the compact JSON
    encoding of *value* followed by ``;``.

    Args:
        text: JavaScript source containing the declaration.
        name: Constant name to look for.
        value: JSON-serialisable replacement value.

    Returns:
        The updated source text.

    Raises:
        RuntimeError: If no such declaration exists (the message is *name*).
    """
    prefix=f'  const {name} = '
    start=text.find(prefix)
    if start<0: raise RuntimeError(name)
    end=text.find('\n', start)
    if end<0:
        # Declaration is the last line and has no trailing newline. Without this,
        # text[-1:] would re-append the file's final character after the new value.
        end=len(text)
    literal=json.dumps(value,ensure_ascii=False,separators=(',',':'))
    return text[:start]+prefix+literal+';'+text[end:]
# Inject the generated data and run configuration into the template's constants.
js=replace_const(js,'SOURCE',source)
js=replace_const(js,'EXPECTED',expected)
js=replace_const(js,'ASSET_FALLBACKS',fallbacks)
config={
 'dryRun': True,
 'runMode':'all-pages',
 'insertMissingTopLevelSections': True,
 'includeHeroImages': True,
 'includeSectionImages': True,
 'stopOnError': False,
 'switchDelayMs': 1200,
 'resultPrefix':'KRB_ADMISSIONS_CHILD_PAGES_FULL_SECTION_IMPORT_V1_RESULT',
 'allPagesResultPrefix':'KRB_ADMISSIONS_CHILD_PAGES_FULL_SECTION_IMPORT_V1_ALL_RESULT'
}
js=replace_const(js,'CONFIG',config)
# Re-label the template for the Admissions child pages.
js=js.replace('KRB Advancement & Community full-section Design API Playground importer v1. No publish/delete.','KRB Admissions child pages full-section Design API Playground importer v1. No publish/delete.')
js=js.replace('current-page-advancement-community-full-section-v1','current-page-admissions-child-pages-full-section-v1')
js=js.replace('all-pages-advancement-community-full-section-v1','all-pages-admissions-child-pages-full-section-v1')
# This one replace also rewrites the "...v1:" and "KRB ...v1:" spellings, because both contain
# this exact substring and their surrounding text is left unchanged.
js=js.replace('advancement/community full-section v1', 'admissions child pages full-section v1')
# Add video population using existing Two Column Text & Image video props discovered in prior audit.
video_fn="""\n  async function populateVideo(el, sec, report, ctx) { await setProp(el, 'Content/Eyebrow', '', report, ctx); await setProp(el, 'Content/Heading', sec.heading || 'Video', report, ctx); await setProp(el, 'Content/Paragraph', sec.text || '', report, ctx); await setProp(el, 'Visual/Video Embed', sec.videoEmbed || sec.videoUrl || '', report, ctx); }"""
# Fail loudly when a patch anchor is missing (same policy as the insertion-loop patch below).
# A silent no-op would ship an importer that never populates video sections.
cluster_anchor="  async function populateImageCluster(el, sec, report, ctx)"
if cluster_anchor not in js:
    raise RuntimeError('expected populateImageCluster definition not found')
js=js.replace(cluster_anchor, video_fn+"\n"+cluster_anchor)
dispatch_old="else if (sec.type === 'twoColumn') await populateTwoCol(el, sec, report, ctx); else if (sec.type === 'imageCluster')"
dispatch_new="else if (sec.type === 'twoColumn') await populateTwoCol(el, sec, report, ctx); else if (sec.type === 'video') await populateVideo(el, sec, report, ctx); else if (sec.type === 'imageCluster')"
if dispatch_old not in js:
    raise RuntimeError('expected section dispatch chain not found')
js=js.replace(dispatch_old, dispatch_new)
# Insert components in source-section order rather than by grouped component type.
old="""for (const [componentName, expectedCount] of Object.entries(EXPECTED[pageName] || {})) { const have = report.countsBefore[componentName] || 0; if (have < expectedCount) { report.warnings.push(`${componentName} count ${have} < expected ${expectedCount}; ${expectedCount-have} will be inserted${CONFIG.dryRun ? ' (dry run only)' : ''}`); for (let i=have; i<expectedCount; i++) await appendComponent(target, componentName, report, `${pageName} insert ${componentName} #${i+1}`); } }"""
new="""const plannedIndexes = {}; for (const sec of source.sections || []) { const componentName = sec.component; plannedIndexes[componentName] = (plannedIndexes[componentName] || 0) + 1; const have = report.countsBefore[componentName] || 0; if (have < plannedIndexes[componentName]) { const ctxInsert = `${pageName} insert ${componentName} for source child ${sec.sourceChildNumber || '?'} (${sec.heading || sec.title || sec.type})`; report.warnings.push(`${componentName} count ${have} < needed ordinal ${plannedIndexes[componentName]}; inserting in source order${CONFIG.dryRun ? ' (dry run only)' : ''}`); await appendComponent(target, componentName, report, ctxInsert); } }"""
if old not in js:
    raise RuntimeError('expected insertion loop not found')
js=js.replace(old,new)

# The embedded JSON is written with ensure_ascii=False, so pin the output encoding to UTF-8.
js_path=OUT/'krb-admissions-child-pages-full-section-design-api-import-v1.js'
js_path.write_text(js,encoding='utf-8')
# Write the human-readable run notes, bundle all artefacts into a zip and print a JSON summary.
notes=OUT/'krb-admissions-child-pages-full-section-design-api-import-v1-notes.md'
# One bullet per distinct manual-verification message, sorted for stable output.
manual_notes='\n'.join(f'- {m}' for m in sorted(set(manual))) if manual else '- None identified in source read.'
# The notes contain non-ASCII characters (em dashes), so the encoding is pinned to UTF-8
# instead of being left to the platform locale.
notes.write_text(f"""# KRB Admissions child pages full-section importer v1

Dry-run-first Code Lab / Design API Playground importer for empty-main-slot Admissions child pages:

- Admissions Enquiry — `/admissions/admissions-enquiry`
- Book a Call with Admissions — `/admissions/book-a-call-with-admissions`
- Discovery Tours — `/admissions/discovery-mornings`
- Enrolment — `/admissions/enrolment`
- Fees — `/admissions/fees`
- Scholarships & Bursaries — `/admissions/scholarships-bursaries`
- Virtual Tours — `/admissions/virtual-tours`

Excluded: parent Admissions page, because the audit found it already has main-slot content.

## Safety

- `dryRun: true` by default.
- No publish, delete, remove, redirect, site settings, CMS schema, global styles, or visibility cleanup.
- Inserts missing top-level sections in source order.
- Uses existing Webflow assets from KRB uploaded static/page-image CSVs; does not upload new assets.
- Image prop handling uses the v3 Asset-object fallback pattern.

## Mapping notes

- Standard source text/mixed blocks map to `Section / Text Content`.
- Embedded Admissions/enrolment iframes are preserved in the Text Content rich HTML where present; verify the embed presentation after dry-run/live run.
- Source CTA blocks map to `Section / CTA`; if more than one link is present, the first is set and extra links are reported for visual/manual follow-up.
- Source Fees table maps to `Section / Text Content` with source table HTML preserved for visual verification.
- Source quicklinks/files map to individual `Section / CTA` download/link sections.
- Source video blocks map to `Section / Two Column Text & Image` using its `Visual/Video Embed` prop.
- Rich-text images stripped by the scraper are reintroduced as `Section / Image Cluster` sections and flagged for visual placement review.

## Expected section counts

```json
{json.dumps(expected, indent=2, ensure_ascii=False)}
```

## Manual / visual verification notes

""" + manual_notes + "\n\n## Run marker\n\nDry-run output marker: `KRB_ADMISSIONS_CHILD_PAGES_FULL_SECTION_IMPORT_V1_ALL_RESULT`\n\nAfter dry-run review, change only `dryRun: true` to `dryRun: false` for live run.\n", encoding='utf-8')
zip_path=OUT/'krb-admissions-child-pages-full-section-design-api-import-v1.zip'
# Archive members are stored flat, under their bare file names.
with zipfile.ZipFile(zip_path,'w',zipfile.ZIP_DEFLATED) as z:
    for p in [js_path, notes, full_json, expected_json, BATCH, ATTENTION]:
        z.write(p,p.name)
print(json.dumps({'source':str(full_json),'expected':expected,'manual_count':len(set(manual)),'js':str(js_path),'zip':str(zip_path),'fallbacks':len(fallbacks)},indent=2,ensure_ascii=False))
