import json
import re
from pathlib import Path

import lxml.html
import requests

# Pages to scrape, keyed by a human-readable page name. The name is passed to
# parse_page() and used both as the key of the output JSON and as the default
# hero heading; the value is the absolute URL that is fetched.
TARGETS = {
    'Our Campus': 'https://www.krb.nsw.edu.au/about-us/our-campus/',
    'Our People': 'https://www.krb.nsw.edu.au/about-us/our-people/',
    'School Board': 'https://www.krb.nsw.edu.au/about-us/our-people/school-board/',
    'Senior Executive': 'https://www.krb.nsw.edu.au/about-us/our-people/senior-executive/',
    'Employment': 'https://www.krb.nsw.edu.au/about-us/our-people/employment/',
    'Our Policies': 'https://www.krb.nsw.edu.au/about-us/our-policies/',
}


def clean_text(s):
    """Return *s* with every whitespace run collapsed to one space and the ends trimmed.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not s:
        return ''
    collapsed = re.sub(r'\s+', ' ', s)
    return collapsed.strip()


def inner_html(node):
    """Return the contents of *node* (without its own tag) as one-line HTML.

    The result is the node's leading text followed by the HTML serialization
    of every child, with whitespace runs collapsed to a single space and the
    ends trimmed. Whitespace-only leading text is dropped.

    Args:
        node: An element exposing ``.text`` and iteration over its children
            (an ``lxml.html`` element in this script).

    Returns:
        The inner markup as a ``str``; empty when the node has no content.
    """
    # Imported locally because the file does not import ``html`` at the top.
    from html import escape

    parts = []
    if node.text and node.text.strip():
        # ``node.text`` is already entity-decoded, so it has to be re-escaped
        # before being mixed with serialized markup; otherwise text such as
        # "A & B" or "x < y" would yield malformed HTML. Quotes need no
        # escaping inside text content.
        parts.append(escape(node.text, quote=False))
    parts.extend(
        lxml.html.tostring(child, encoding='unicode', method='html')
        for child in node
    )
    markup = ''.join(parts).strip()
    return re.sub(r'\s+', ' ', markup)


def abs_url(url, base='https://www.krb.nsw.edu.au'):
    """Turn a protocol-relative or root-relative URL into an absolute one.

    Args:
        url: Raw URL as found in an attribute or CSS ``url(...)``; may be
            ``None``. Surrounding whitespace is ignored.
        base: URL whose scheme and host are used for resolution. Defaults to
            the site this script scrapes; any path component is ignored.

    Returns:
        ``None`` when *url* is missing or blank. ``//host/path`` gets the
        scheme of *base* (``https`` if *base* has none), ``/path`` gets the
        origin of *base*, and anything else (already-absolute URLs,
        document-relative paths, fragments, ``mailto:`` ...) is returned
        stripped but otherwise unchanged.
    """
    # Imported locally because the file does not import urllib at the top.
    from urllib.parse import urlsplit

    # Attribute values may carry stray whitespace that would otherwise defeat
    # the prefix checks below and leak into the result.
    url = (url or '').strip()
    if not url:
        return None
    origin = urlsplit(base)
    scheme = origin.scheme or 'https'
    if url.startswith('//'):
        return f'{scheme}:{url}'
    if url.startswith('/'):
        return f'{scheme}://{origin.netloc}{url}'
    return url


def _has_class(name):
    """Return an XPath predicate that is true when ``@class`` contains the token *name*."""
    return f'contains(concat(" ",normalize-space(@class)," ")," {name} ")'


def _joined_text(node, xpath):
    """Evaluate a text-node *xpath* on *node* and return the cleaned, space-joined result."""
    return clean_text(' '.join(node.xpath(xpath)))


def _text_and_html(content_nodes, fallback_text):
    """Return ``(text, html)`` for the first content node, or the cleaned fallback for both."""
    if content_nodes:
        return clean_text(content_nodes[0].text_content()), inner_html(content_nodes[0])
    text = clean_text(fallback_text)
    return text, text


def parse_page(name, url):
    """Fetch *url* and extract its structured content blocks.

    Only elements inside ``<main>`` are considered. Sections are recognised
    by their CSS class tokens (``page-header``, ``intro--page``,
    ``mixed--default``, ``text-image--default``, ``gateway-rows--default``,
    ``accordion--default``, ``downloads--default``, ``related``).

    Args:
        name: Display name of the page; stored as ``name`` and used as the
            hero heading.
        url: Absolute URL to download.

    Returns:
        A dict with the keys ``name``, ``url``, ``hero``, ``textContent``,
        ``twoColumn``, ``gatewayLinks``, ``accordions``, ``downloads`` and
        ``nextPages``. ``hero['images']`` is only present when a page header
        element was found.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an HTTP error status.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of scraping the markup of a 4xx/5xx error page.
    response.raise_for_status()
    doc = lxml.html.fromstring(response.text)
    page = {
        'name': name,
        'url': url,
        'hero': {'heading': name},
        'textContent': [],
        'twoColumn': [],
        'gatewayLinks': [],
        'accordions': [],
        'downloads': [],
        'nextPages': [],
    }

    # NOTE: "(path)[1]" selects the first match in document order, whereas
    # "path[1]" would select the first matching child of *every* parent.

    # Hero image candidates: URL-bearing attributes plus inline CSS url(...).
    hero = doc.xpath(f'(//main//*[{_has_class("page-header")}])[1]')
    if hero:
        imgs = []
        for el in hero[0].xpath('.//*[@style or @data-bg or @data-src or @src]'):
            for attr in ('data-bg', 'data-src', 'src'):
                if el.get(attr):
                    imgs.append(abs_url(el.get(attr)))
            m = re.search(r'url\(["\']?([^"\')]+)', el.get('style') or '')
            if m:
                imgs.append(abs_url(m.group(1)))
        # dict.fromkeys de-duplicates while keeping first-seen order.
        page['hero']['images'] = list(dict.fromkeys(x for x in imgs if x))

    # Intro section: first heading plus the intro content (or the remaining text).
    intro = doc.xpath(f'(//main//*[{_has_class("intro--page")}])[1]')
    if intro:
        section = intro[0]
        heading = _joined_text(section, '(.//*[self::h1 or self::h2 or self::h3])[1]//text()')
        content = section.xpath(f'(.//*[{_has_class("intro__content")}])[1]')
        text, markup = _text_and_html(content, section.text_content().replace(heading, ''))
        page['textContent'].append(
            {'source': 'intro', 'heading': heading or '', 'text': text, 'html': markup}
        )

    # Free-form "mixed" content blocks; blocks without any text are skipped.
    for mixed in doc.xpath(f'//main//*[{_has_class("mixed--default")}]'):
        content = mixed.xpath(f'(.//*[{_has_class("mixed__content")}])[1]')
        text, markup = _text_and_html(content, mixed.text_content())
        if text:
            page['textContent'].append(
                {'source': 'mixed', 'heading': '', 'text': text, 'html': markup}
            )

    # Two-column text + image blocks.
    for block in doc.xpath(f'//main//*[{_has_class("text-image--default")}]'):
        heading = _joined_text(block, f'.//*[{_has_class("text-image__heading")}]//text()')
        content = block.xpath(f'(.//*[{_has_class("text-image__content")}])[1]')
        text, markup = _text_and_html(content, block.text_content().replace(heading, ''))
        img_urls = [abs_url(img.get('data-src') or img.get('src')) for img in block.xpath('.//img')]
        page['twoColumn'].append(
            {'heading': heading, 'text': text, 'html': markup, 'images': [x for x in img_urls if x]}
        )

    # Gateway links. The XPath matches both the row items and the anchors
    # inside them, so entries are de-duplicated by (title, url), keeping the
    # first occurrence.
    gateway_xpath = (
        f'//main//*[{_has_class("gateway-rows--default")}]'
        f'//*[{_has_class("gateway-rows__item")} or self::a]'
    )
    gateway_links = {}
    for gate in doc.xpath(gateway_xpath):
        href = gate.get('href') or (gate.xpath('.//a/@href') or [None])[0]
        full_text = clean_text(gate.text_content())
        # Prefer the first heading; otherwise fall back to the text before
        # the "Learn more" call-to-action, capped at 80 characters.
        title = (
            _joined_text(gate, '(.//*[self::h2 or self::h3 or self::h4])[1]//text()')
            or full_text.split(' Learn more')[0][:80]
        )
        if href and title:
            entry = {'title': title, 'text': full_text, 'url': abs_url(href)}
            gateway_links.setdefault((entry['title'], entry['url']), entry)
    page['gatewayLinks'] = list(gateway_links.values())

    # Accordions: one output entry per item, tagged with the group heading.
    for acc in doc.xpath(f'//main//*[{_has_class("accordion--default")}]'):
        group_heading = _joined_text(acc, f'.//*[{_has_class("accordion__title")}]//text()')
        for item in acc.xpath(f'.//*[{_has_class("accordion__item")}]'):
            title = _joined_text(item, f'.//*[{_has_class("accordion__item__title")}]//text()')
            content = item.xpath(f'(.//*[{_has_class("accordion__item__content")}])[1]')
            text, markup = _text_and_html(content, '')
            page['accordions'].append(
                {'groupHeading': group_heading, 'title': title, 'text': text, 'html': markup}
            )

    # Download lists: every link in the block, with a trailing " PDF" removed from the label.
    for dl in doc.xpath(f'//main//*[{_has_class("downloads--default")}]'):
        heading = _joined_text(dl, f'.//*[{_has_class("downloads__title")}]//text()')
        files = [
            {
                'label': clean_text(a.text_content()).removesuffix(' PDF').strip(),
                'url': abs_url(a.get('href')),
            }
            for a in dl.xpath('.//a[@href]')
        ]
        page['downloads'].append({'heading': heading, 'files': files})

    # Related / next-page links; anchors without visible text are ignored.
    for a in doc.xpath(f'//main//*[{_has_class("related")}]//a[@href]'):
        title = clean_text(a.text_content())
        if title:
            page['nextPages'].append({'title': title, 'url': abs_url(a.get('href'))})
    return page

# Scrape every configured page, keyed by its display name.
out = {name: parse_page(name, url) for name, url in TARGETS.items()}

# The JSON is dumped with ensure_ascii=False, i.e. it contains raw non-ASCII
# characters, so the file encoding is pinned to UTF-8 instead of relying on
# the platform default (which can fail or mis-encode on non-UTF-8 locales).
Path('/Users/iggy/.hermes/profiles/ignite_team/outbound/krb-about-us-source-content.json').write_text(
    json.dumps(out, indent=2, ensure_ascii=False),
    encoding='utf-8',
)

# Print how many entries each section produced per page as a quick sanity check.
_summary_keys = ('textContent', 'twoColumn', 'gatewayLinks', 'accordions', 'downloads', 'nextPages')
print(json.dumps({k: {key: len(v[key]) for key in _summary_keys} for k, v in out.items()}, indent=2))
