import urllib.request, re, csv, os
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import Counter

# Pages to audit: human-readable label -> absolute URL.
# Insertion order is the order in which the pages are fetched and reported.
pages = {
    'Home': 'https://www.krb.nsw.edu.au/',
    'Admissions': 'https://www.krb.nsw.edu.au/admissions/',
    'Boarding': 'https://www.krb.nsw.edu.au/boarding/',
    'Our Campus': 'https://www.krb.nsw.edu.au/about-us/our-campus/',
}
# Heuristic for "looks like a button / call-to-action". It is matched against
# the element's classes, visible text, role and tag name joined into one string.
_BUTTON_SIGNAL_RE = re.compile(
    r'btn|button|cta|download|read more|apply|enquire|learn more|register|discover|book|view|visit|tour|contact',
    re.I,
)
# Class-name fragments that mark an ancestor as a content "block" container.
_BLOCK_CLASS_RE = re.compile(
    r'block|component|content-block|call-to-action|gateway|hero|intro|accordion|downloads|related|pages|cards',
    re.I,
)
# Ancestor tags eligible as the generic "parent" and as the "block" container.
_PARENT_TAGS = ('section', 'div', 'article', 'header', 'li')
_BLOCK_TAGS = ('section', 'div', 'article', 'header')


def _clean_text(node):
    """Return the visible text of *node* with all whitespace runs collapsed to single spaces."""
    return ' '.join(node.get_text(' ', strip=True).split())


def _nearest_classed_ancestor(el, tags, class_pattern=None):
    """Return the closest ancestor of *el* whose tag is in *tags* and that has a class.

    When *class_pattern* is given, the ancestor's space-joined class list must
    also match it. Returns ``None`` when no ancestor qualifies.
    """
    for ancestor in el.parents:
        if ancestor.name not in tags:
            continue
        classes = ancestor.get('class')
        if not classes:
            continue
        if class_pattern is None or class_pattern.search(' '.join(classes)):
            return ancestor
    return None


# One dict per button-like element found, in page order then document order.
buttonish = []
for page, url in pages.items():
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically instead of leaking the socket.
    # Fetch errors (URLError, timeouts) are left to propagate and abort the audit.
    with urllib.request.urlopen(request, timeout=30) as response:
        html = response.read().decode('utf-8', 'ignore')
    soup = BeautifulSoup(html, 'html.parser')

    for el in soup.find_all(['a', 'button']):
        text = _clean_text(el)
        cls = ' '.join(el.get('class', []) or [])
        href = el.get('href', '')
        role = el.get('role', '')

        # Keep every <button>; keep an <a> only if something about it reads as a CTA.
        signal = ' '.join([cls, text, role, el.name])
        if el.name != 'button' and not _BUTTON_SIGNAL_RE.search(signal):
            continue
        # Nothing to report for an element with neither a label nor a target.
        if not text and not href:
            continue

        parent = _nearest_classed_ancestor(el, _PARENT_TAGS)
        block = _nearest_classed_ancestor(el, _BLOCK_TAGS, _BLOCK_CLASS_RE)

        # Prefer the recognised block container for the heading lookup and fall
        # back to the nearest classed ancestor. Compare against None explicitly
        # rather than relying on the truthiness of a parsed element.
        ctx = block if block is not None else parent
        heading = ''
        if ctx is not None:
            h = ctx.find(['h1', 'h2', 'h3', 'h4'])
            if h is not None:
                heading = _clean_text(h)

        buttonish.append({
            'page': page,
            'url': url,
            'tag': el.name,
            'text': text,
            # Resolve relative links against the page they were found on.
            'href': urljoin(url, href) if href else '',
            'classes': cls,
            'role': role,
            'parent_classes': ' '.join(parent.get('class', [])) if parent is not None else '',
            'block_classes': ' '.join(block.get('class', [])) if block is not None else '',
            'section_heading': heading,
        })
# Column order used when no rows were collected; mirrors the keys of the row
# dicts appended to `buttonish`.
_AUDIT_COLUMNS = [
    'page',
    'url',
    'tag',
    'text',
    'href',
    'classes',
    'role',
    'parent_classes',
    'block_classes',
    'section_heading',
]

# Write the audit as a UTF-8 CSV, creating the output directory if needed.
base = '/Users/iggy/.hermes/profiles/ignite_team/outbound'
os.makedirs(base, exist_ok=True)
out = os.path.join(base, 'krb-4-page-wp-button-audit.csv')

# Take the header from the first row when there is one. With no rows at all,
# fall back to the fixed column list so a header-only file is still written
# instead of crashing with IndexError on buttonish[0].
fieldnames = list(buttonish[0]) if buttonish else _AUDIT_COLUMNS
with open(out, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    w.writerows(buttonish)
# Console summary: total count, then the 40 most common (classes, container)
# groups, each with up to six example rows.
print('buttonish count', len(buttonish))
print('\nGROUPS')

# A row's group is its own classes plus the classes of its block container,
# or of its nearest classed parent when no block container was found.
group_keys = [
    (row['classes'], row['block_classes'] or row['parent_classes'])
    for row in buttonish
]

# Bucket the rows once, preserving their original order inside each group.
rows_by_group = {}
for group, row in zip(group_keys, buttonish):
    rows_by_group.setdefault(group, []).append(row)

for group, count in Counter(group_keys).most_common(40):
    own_classes, container_classes = group
    print('\nCOUNT', count, '| CLASSES=', own_classes or '[none]', '| BLOCK=', container_classes or '[none]')
    for row in rows_by_group[group][:6]:
        print(' -', row['page'], '|', row['section_heading'][:60], '|', row['text'][:70], '|', row['href'])

print('\nWROTE', out)
