from pathlib import Path
from bs4 import BeautifulSoup
import re
# Directory holding the saved HTML snapshots to summarize.
raw_dir = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound/krb-live-scrape-html-advancement-community-v1')

# Patterns are compiled once here rather than on every element of every file.
WHITESPACE_RE = re.compile(r'\s+')
# CSS permits whitespace between "url(" and the (optionally quoted) target,
# so skip it before capturing; the capture is stripped again at the use site.
CSS_URL_RE = re.compile(r"url\(\s*['\"]?([^)'\"]+)")

# For each snapshot, print one line per direct child element of the page's
# content root: its position, CSS classes, a text preview and up to six
# image references found inside it.
for f in sorted(raw_dir.glob('*.html')):
    # Explicit encoding keeps decoding independent of the platform locale;
    # undecodable bytes are still dropped, as before.
    soup = BeautifulSoup(f.read_text(encoding='utf-8', errors='ignore'), 'html.parser')
    print('\n##', f.name)

    # Prefer <main>; fall back to <body>, then to the whole document.
    main = soup.select_one('main') or soup.body or soup
    children = (x for x in main.find_all(recursive=False) if getattr(x, 'name', None))

    for i, c in enumerate(children, 1):
        cls = ' '.join(c.get('class') or [])
        # Collapse whitespace runs and cap the preview at 160 characters.
        txt = WHITESPACE_RE.sub(' ', c.get_text(' ', strip=True))[:160]

        imgs = []
        for el in c.select('[data-src], img[src], [style]'):
            src = el.get('src')
            data_src = el.get('data-src')
            style = el.get('style')
            # Collection order per element: src, then data-src, then any
            # url(...) targets from the inline style.
            if el.name == 'img' and src:
                imgs.append(src)
            if data_src:
                imgs.append(data_src)
            if style:
                for target in CSS_URL_RE.findall(style):
                    target = target.strip()
                    # A url() holding only whitespace yields nothing useful.
                    if target:
                        imgs.append(target)

        print(i, cls, 'TXT:', txt, 'IMGS:', imgs[:6])