#!/usr/bin/env python3
from __future__ import annotations
import json, os, re, subprocess, urllib.request, urllib.parse, csv, html
from pathlib import Path
from bs4 import BeautifulSoup

# Webflow site whose pages and components are listed by main().
SITE_ID = '6a1e37436b332da28ecc3001'
# Root of the Webflow Data API; request paths are appended to it.
BASE = 'https://api.webflow.com/v2'
# Directory that holds the asset read-back input and receives the reports.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound')
# WordPress page that is scraped for comparison.
WP_URL = 'https://www.krb.nsw.edu.au/our-approach/wellbeing-framework/'
# NOTE(review): not referenced in the visible part of this file; presumably
# the matching Webflow page path - confirm before relying on it.
WF_PATH = '/our-approach/wellbeing-framework'

def get_token():
    """Return the Webflow API token.

    The WEBFLOW_TOKEN environment variable wins when it is set and non-empty.
    Otherwise the `op` command-line tool is invoked to fetch an item as JSON,
    and the first field with a non-empty value whose id is 'credential' or
    whose label is one of the accepted names is returned.

    Raises:
        subprocess.CalledProcessError: the `op` command exited non-zero
            (check=True).
        RuntimeError: no suitable field with a value was found.
    """
    env_token = os.environ.get('WEBFLOW_TOKEN')
    if env_token:
        return env_token

    command = [
        'op', 'item', 'get', 'bqfhhqbsg5dmhoxof2l5c64xsa',
        '--vault', 'Vault for Iggy (IGNITE OpenClaw Bot)',
        '--format', 'json', '--reveal',
    ]
    completed = subprocess.run(command, text=True, capture_output=True, check=True)

    accepted_labels = ('credential', 'password', 'api key', 'token')
    for field in json.loads(completed.stdout).get('fields', []):
        is_candidate = field.get('id') == 'credential' or field.get('label') in accepted_labels
        if is_candidate and field.get('value'):
            return field['value']
    raise RuntimeError('No token field found')

def api_get(path, token):
    """GET BASE + path with a bearer token and return the decoded JSON body.

    Args:
        path: request path (and optional query string) appended to BASE.
        token: value sent in the 'Authorization: Bearer ...' header.
    """
    headers = {'Authorization': 'Bearer ' + token, 'accept': 'application/json'}
    request = urllib.request.Request(BASE + path, headers=headers)
    with urllib.request.urlopen(request, timeout=45) as response:
        payload = json.load(response)
    return payload

def fetch_all(path_key_path, token):
    """Collect every item of a paginated listing using limit/offset paging.

    Args:
        path_key_path: a (resource_path, key) pair; `key` names the array in
            each response that holds the items.
        token: API token forwarded to api_get.

    Returns:
        A list with the items of all pages, in the order received.
    """
    resource_path, key = path_key_path
    # Append to an existing query string if the path already carries one.
    separator = '&' if '?' in resource_path else '?'
    collected = []
    offset = 0
    while True:
        response = api_get(f'{resource_path}{separator}limit=100&offset={offset}', token)
        batch = response.get(key, [])
        collected.extend(batch)
        # A page shorter than the requested limit is treated as the last one.
        if len(batch) < 100:
            return collected
        offset += 100

def txt(s):
    """Return `s` as text with HTML entities decoded and whitespace collapsed.

    Any falsy input (None, '', 0, ...) yields ''. Runs of whitespace become a
    single space, and leading/trailing whitespace is removed.
    """
    raw = str(s) if s else ''
    decoded = html.unescape(raw)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()

def fetch_wp():
    """Download the page at WP_URL and summarise its content blocks.

    Returns:
        A dict with:
          'title'      - cleaned text of the first <h1>, else of <title>,
                         else ''.
          'sections'   - one dict per matched element (tag, classes, heading,
                         text truncated to 1000 chars, images, links).
          'all_images' - the image dicts of all sections, flattened in order.
    """
    req = urllib.request.Request(WP_URL, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response even when reading fails.
    with urllib.request.urlopen(req, timeout=45) as resp:
        raw = resp.read().decode('utf-8', 'ignore')
    soup = BeautifulSoup(raw, 'html.parser')

    title_el = soup.find('h1') or soup.find('title')
    title = txt(title_el.get_text(' ') if title_el else '')

    heading_re = re.compile('^h[1-6]$')  # built once, reused for every element
    selectors = [
        'main .content-block',
        'main [class*=block]',
        'main [class*=component]',
        'main section',
        'main article',
    ]
    sections = []
    # Elements are de-duplicated by object identity across selectors. The ids
    # stay valid because `soup` keeps every element alive for this call.
    # Nested matches are distinct elements, so they yield separate sections.
    seen = set()
    for sel in selectors:
        for el in soup.select(sel):
            if id(el) in seen:
                continue
            body = txt(el.get_text(' '))
            if len(body) < 20:
                # Too little text to be a content block; not marked as seen.
                continue
            seen.add(id(el))

            heading_el = el.find(heading_re)

            imgs = []
            for img in el.find_all('img'):
                # Fall back to the lazy-loading attributes when src is absent.
                src = img.get('src') or img.get('data-src') or img.get('data-lazy-src') or ''
                if src:
                    imgs.append({
                        'src': urllib.parse.urljoin(WP_URL, src),
                        'filename': Path(urllib.parse.urlparse(src).path).name,
                        'alt': txt(img.get('alt')),
                    })

            links = []
            for a in el.find_all('a', href=True):
                label = txt(a.get_text(' '))
                if label:
                    links.append({
                        'text': label,
                        'href': urllib.parse.urljoin(WP_URL, a['href']),
                        'classes': ' '.join(a.get('class', [])),
                    })

            sections.append({
                'tag': el.name,
                'classes': ' '.join(el.get('class', [])),
                'heading': txt(heading_el.get_text(' ') if heading_el else ''),
                'text': body[:1000],
                'images': imgs,
                'links': links,
            })

    # Linear flatten; sum(list_of_lists, []) re-copies the accumulator each step.
    all_images = [img for section in sections for img in section['images']]
    return {'title': title, 'sections': sections, 'all_images': all_images}

def _find_wellbeing_pages(pages):
    """Return the pages whose slug, title/name or path mentions 'wellbeing'."""
    candidates = []
    for p in pages:
        slug = p.get('slug') or ''
        name = p.get('title') or p.get('name') or ''
        path = p.get('path') or p.get('slug') or ''
        if 'wellbeing' in ' '.join([slug, name, path]).lower():
            candidates.append(p)
    return candidates


def _select_page(candidates):
    """Pick the 'wellbeing framework' page, else the first candidate, else None."""
    for p in candidates:
        title = (p.get('title') or p.get('name') or '').lower()
        if p.get('slug') == 'wellbeing-framework' or 'wellbeing framework' in title:
            return p
    return candidates[0] if candidates else None


def _component_id(node):
    """Return the node's component id from 'componentId' or 'component.id'."""
    cid = node.get('componentId')
    if cid:
        return cid
    component = node.get('component')
    return component.get('id') if isinstance(component, dict) else None


def _override_value(override):
    """Return the first populated value-like field of an override as raw text."""
    for key in ('text', 'value', 'url', 'assetId', 'altText'):
        if key in override:
            v = override[key]
            if isinstance(v, dict):
                return v.get('text') or v.get('html') or json.dumps(v, ensure_ascii=False)
            return str(v)
    return ''


def _collect_component_instances(dom, comp_names):
    """Summarise every DOM node that is (or references) a component instance."""
    instances = []
    for order, node in enumerate(dom, 1):
        cid = _component_id(node)
        if not (cid or node.get('type') == 'ComponentInstance'):
            continue
        overrides = node.get('propertyOverrides') or node.get('overrides') or []
        if isinstance(overrides, dict):
            overrides = list(overrides.values())
        if not isinstance(overrides, list):
            overrides = []
        props = []
        for o in overrides:
            if not isinstance(o, dict):
                continue
            label = (o.get('label') or o.get('name') or o.get('propertyName')
                     or o.get('propName') or o.get('propertyId') or '')
            props.append({'label': label, 'value': txt(_override_value(o))[:500]})
        instances.append({
            'order': order,
            'nodeId': node.get('id'),
            'type': node.get('type'),
            'componentId': cid,
            'componentName': comp_names.get(cid, ''),
            'props': props,
            'rawKeys': sorted(node.keys()),
        })
    return instances


def _joined(matches, key, limit):
    """Join `key` of the first `limit` matches with '; ', treating null as ''."""
    return '; '.join(str(m.get(key) or '') for m in matches[:limit])


def _match_assets(images, assets):
    """Build one row per unique WordPress image filename with its asset matches."""
    rows = []
    seen = set()
    for img in images:
        fn = img['filename']
        if not fn or fn in seen:
            continue
        seen.add(fn)
        needle = fn.lower()
        matches = [
            a for a in assets
            if needle in (a.get('displayName') or a.get('originalFileName') or '').lower()
        ]
        rows.append({
            'wp_filename': fn,
            'wp_src': img['src'],
            'wp_alt': img['alt'],
            'match_count': len(matches),
            'wf_display_names': _joined(matches, 'displayName', 5),
            'wf_asset_ids': _joined(matches, 'id', 5),
            'wf_hosted_urls': _joined(matches, 'hostedUrl', 2),
        })
    return rows


def main():
    """Probe the Webflow 'wellbeing framework' page against its WordPress source.

    Steps, in order: list the site's pages and pick the wellbeing page; fetch
    that page's DOM nodes and the site's components; summarise component
    instances; scrape the WordPress page; match its images against the asset
    read-back JSON under OUT; write one JSON report and two CSV files to OUT;
    print a short summary.

    Fixes relative to the previous version:
      * The component id is now read from 'componentId' even when the node has
        no dict-valued 'component' key. The old conditional expression bound
        as `(A or B) if cond else None` and discarded it.
      * The asset read-back file is opened as UTF-8 and closed deterministically.
      * Null displayName/id/hostedUrl values no longer break the '; '.join.
      * The JSON report is written as UTF-8, matching the CSV outputs.
    """
    token = get_token()
    pages = fetch_all((f'/sites/{SITE_ID}/pages', 'pages'), token)
    target = _find_wellbeing_pages(pages)
    page = _select_page(target)

    # Same limit/offset paging as every other listing, via the shared helper.
    dom = fetch_all((f"/pages/{page.get('id')}/dom", 'nodes'), token) if page else []

    comps = fetch_all((f'/sites/{SITE_ID}/components', 'components'), token)
    comp_names = {c.get('id'): c.get('name') for c in comps}
    component_instances = _collect_component_instances(dom, comp_names)

    wp = fetch_wp()

    # NOTE(review): assumes the read-back file is a JSON array of asset dicts
    # - confirm against the script that produces it.
    with open(OUT / 'krb-image-grab/krb-webflow-page-images-assets-readback.json',
              encoding='utf-8') as f:
        assets = json.load(f)
    asset_rows = _match_assets(wp['all_images'], assets)

    result = {
        'webflow_page_candidates': target,
        'selected_webflow_page': page,
        'webflow_component_instances': component_instances,
        'wp': wp,
        'asset_matches': asset_rows,
    }
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding.
    (OUT / 'krb-wellbeing-framework-probe.json').write_text(
        json.dumps(result, indent=2, ensure_ascii=False), encoding='utf-8')

    with open(OUT / 'krb-wellbeing-framework-asset-matches.csv', 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, fieldnames=['wp_filename', 'wp_src', 'wp_alt', 'match_count',
                                          'wf_display_names', 'wf_asset_ids', 'wf_hosted_urls'])
        w.writeheader()
        w.writerows(asset_rows)

    with open(OUT / 'krb-wellbeing-framework-default-structure-components.csv', 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, fieldnames=['order', 'nodeId', 'type', 'componentId', 'componentName',
                                          'prop_count', 'props_summary', 'rawKeys'])
        w.writeheader()
        for ci in component_instances:
            row = {k: ci.get(k) for k in ['order', 'nodeId', 'type', 'componentId', 'componentName']}
            row['prop_count'] = len(ci['props'])
            row['props_summary'] = '; '.join(f"{p['label']}: {p['value'][:80]}" for p in ci['props'])
            row['rawKeys'] = ', '.join(ci['rawKeys'])
            w.writerow(row)

    selected = page or {}
    print('selected page:', selected.get('id'), selected.get('title') or selected.get('name'), selected.get('slug'))
    print('wellbeing candidates:', len(target))
    print('dom nodes:', len(dom), 'component instances:', len(component_instances))
    print('wp sections:', len(wp['sections']), 'wp unique images:', len(asset_rows))
    print('asset matches:', sum(1 for r in asset_rows if r['match_count']), 'missing:', sum(1 for r in asset_rows if not r['match_count']))
    print('wrote:', OUT / 'krb-wellbeing-framework-probe.json')
    print('wrote:', OUT / 'krb-wellbeing-framework-asset-matches.csv')
    print('wrote:', OUT / 'krb-wellbeing-framework-default-structure-components.csv')
# Run the probe only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
