#!/usr/bin/env python3
"""Read-only Edmund Rice College WordPress media variant optimisation estimate.

Retrieves Synergy/cPanel credentials at runtime from 1Password, crawls uploads via
cPanel Fileman API, downloads a stratified sample over FTPS/TLS, optimises locally
with Pillow, and writes JSON results. Does not modify remote files.
"""
from __future__ import annotations

import collections
import ftplib
import io
import json
import os
import pathlib
import random
import re
import ssl
import subprocess
import sys
import time
from pathlib import Path
from typing import Any

import requests
from PIL import Image, ImageOps
from requests.auth import HTTPBasicAuth
from zeep import Client, Settings
from zeep.helpers import serialize_object
from zeep.transports import Transport

# Directory that receives every JSON artefact written by this script.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound/erc_media_audit')
# NOTE: runs at import time, so merely importing this module creates the directory.
OUT.mkdir(parents=True, exist_ok=True)
# Hosting record to look for in the Synergy listHosting results.
DOMAIN = 'edmundricecollege.nsw.edu.au'
# Path component used when building the remote uploads directory.
SITE_SLUG = 'edmundricecollege'
# 1Password vault name and item id passed to `op item get`.
VAULT = 'Vault for Iggy (IGNITE OpenClaw Bot)'
ITEM = 'emtrkfztclkv4vtumb6nsa7a4e'
# Name of the variable looked up in the .env files / process environment.
OP_ENV_KEY = 'OP_SERVICE_ACCOUNT_TOKEN'
# WSDL document the zeep SOAP client is built from.
WSDL = 'https://api.synergywholesale.com/?wsdl'
# Lower-cased file suffixes that are treated as images.
IMG_EXT = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
# Matches a "-<width>x<height>" size suffix (optionally followed by "@<n>x")
# sitting immediately before a trailing image extension, case-insensitively.
VAR_RE = re.compile(r'-\d+x\d+(?:@[0-9]+x)?(?=\.(?:jpe?g|png|webp|gif)$)', re.I)


def load_token() -> str | None:
    """Return the value configured for ``OP_ENV_KEY``, or ``None`` if unset.

    Two ``.env`` files are scanned in order (profile-specific first, then the
    shared one). The first assignment whose key equals ``OP_ENV_KEY`` wins,
    with surrounding whitespace and quote characters removed. The process
    environment is consulted only when neither file supplies the key.
    """
    candidates = (
        Path('/Users/iggy/.hermes/profiles/ignite_team/.env'),
        Path('/Users/iggy/.hermes/.env'),
    )
    for env_file in candidates:
        if not env_file.exists():
            continue
        for raw in env_file.read_text(errors='replace').splitlines():
            entry = raw.strip()
            # Skip blank lines and comments.
            if entry == '' or entry.startswith('#'):
                continue
            # Tolerate shell-style "export KEY=value" lines.
            if entry.startswith('export '):
                entry = entry[len('export '):].strip()
            key, sep, value = entry.partition('=')
            if sep and key.strip() == OP_ENV_KEY:
                return value.strip().strip('"').strip("'")
    return os.environ.get(OP_ENV_KEY)


def op_fields() -> dict[str, str]:
    """Fetch the configured 1Password item and return its fields by label.

    Runs ``op item get`` with a copy of the current environment, injecting the
    token from ``load_token()`` when one is available. Field values that are
    missing or empty are returned as ``''``.

    Raises:
        subprocess.CalledProcessError: if the ``op`` command exits non-zero.
    """
    child_env = dict(os.environ)
    token = load_token()
    if token:
        child_env[OP_ENV_KEY] = token
    command = ['op', 'item', 'get', ITEM, '--vault', VAULT, '--format', 'json', '--reveal']
    completed = subprocess.run(
        command,
        env=child_env,
        text=True,
        capture_output=True,
        check=True,
    )
    item = json.loads(completed.stdout)
    labelled: dict[str, str] = {}
    for field in item.get('fields', []):
        labelled[field.get('label')] = field.get('value') or ''
    return labelled


def get_record() -> dict[str, Any]:
    """Return the Synergy hosting record whose ``domain`` equals ``DOMAIN``.

    Pages through ``listHosting`` (100 records per page, at most 99 pages)
    using the ``username`` / ``credential`` fields from 1Password, stopping at
    the first short page.

    Raises:
        RuntimeError: if a page reports a non-OK status, or no record matches.
    """
    creds = op_fields()
    soap = Client(
        WSDL,
        settings=Settings(strict=False, xml_huge_tree=True),
        transport=Transport(session=requests.Session(), timeout=60),
    )
    for page in range(1, 100):
        query = {
            'resellerID': creds['username'],
            'apiKey': creds['credential'],
            'page': page,
            'limit': 100,
        }
        reply = serialize_object(soap.service.listHosting(query))
        if reply.get('status') != 'OK':
            raise RuntimeError(f"listHosting failed on page {page}: {reply.get('errorMessage')}")
        hostings = reply.get('hoidList') or []
        match = next((h for h in hostings if h.get('domain') == DOMAIN), None)
        if match is not None:
            return match
        # A short page means there is nothing further to fetch.
        if len(hostings) < 100:
            break
    raise RuntimeError(f'Hosting record not found: {DOMAIN}')


def cpanel_list(sess: requests.Session, server: str, user: str, directory: str) -> list[dict[str, Any]]:
    """List one directory through the cPanel API 2 ``Fileman::listfiles`` call.

    Args:
        sess: session used to issue the GET request.
        server: host name; the request goes to port 2083 over HTTPS.
        user: cPanel account name sent as ``cpanel_jsonapi_user``.
        directory: path to list (dot files included).

    Returns:
        The ``cpanelresult.data`` list, or ``[]`` when it is absent or empty.

    Raises:
        Whatever ``raise_for_status()`` / ``json()`` raise for a bad response.
    """
    response = sess.get(
        f'https://{server}:2083/json-api/cpanel',
        params={
            'cpanel_jsonapi_user': user,
            'cpanel_jsonapi_apiversion': '2',
            'cpanel_jsonapi_module': 'Fileman',
            'cpanel_jsonapi_func': 'listfiles',
            'showdotfiles': '1',
            'dir': directory,
        },
        timeout=45,
    )
    response.raise_for_status()
    payload = response.json()
    result = payload.get('cpanelresult') or {}
    # Missing directories return HTTP 200 with an error string; callers can treat as empty.
    return result.get('data') or []


def item_path(parent: str, item: dict[str, Any]) -> str:
    """Return the absolute path of a listing entry.

    Prefers the entry's own ``fullpath`` (then ``path``); otherwise joins
    ``parent`` (trailing slashes removed) with the entry's ``file`` or
    ``name`` value, which defaults to the empty string.
    """
    explicit = item.get('fullpath') or item.get('path')
    if not explicit:
        leaf = item.get('file') or item.get('name') or ''
        base = parent.rstrip('/')
        return f"{base}/{leaf}"
    return str(explicit)


def item_type(item: dict[str, Any]) -> str:
    """Classify a listing entry as ``'dir'`` or ``'file'``.

    An entry is a directory when its ``type`` (or ``filetype``) is ``dir`` /
    ``directory`` in any letter case, or when ``isdir`` is ``1``, ``'1'`` or
    ``True``. Everything else is reported as a file.
    """
    kind = str(item.get('type') or item.get('filetype') or '').lower()
    named_dir = kind in {'dir', 'directory'}
    is_dir = named_dir or item.get('isdir') in {1, '1', True}
    return 'dir' if is_dir else 'file'


def crawl_uploads(rec: dict[str, Any]) -> dict[str, Any]:
    """Crawl the remote uploads tree breadth-first and write the inventory.

    Directories are listed one at a time through ``cpanel_list``. A listing
    that raises is skipped (best effort) but is now also recorded in the
    summary, so a partial inventory can be recognised when the cached
    ``file_inventory.json`` is reused later.

    Args:
        rec: hosting record; ``server`` and ``username`` are required and
            ``password`` must be present and non-empty.

    Returns:
        ``{'summary': {...}, 'files': [...]}``; the same structure is written
        to ``OUT / 'file_inventory.json'``.

    Raises:
        RuntimeError: if the record carries no password.
    """
    server, user, password = rec['server'], rec['username'], rec.get('password')
    if not password:
        raise RuntimeError('Hosting password missing from Synergy record (not printed)')
    uploads = f"/home/{user}/automation/{SITE_SLUG}/constants/uploads"
    q = collections.deque([uploads])
    # Paths already queued; guards against listing the same directory twice
    # (and against looping forever should an entry resolve to its own parent).
    queued = {uploads}
    files: list[dict[str, Any]] = []
    failed_dirs: list[dict[str, str]] = []
    dirs_seen = 0
    start = time.time()
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as sess:
        sess.auth = HTTPBasicAuth(user, password)
        while q:
            directory = q.popleft()
            dirs_seen += 1
            try:
                entries = cpanel_list(sess, server, user, directory)
            except Exception as e:  # deliberate best effort: keep crawling
                reason = f"{type(e).__name__}: {str(e)[:120]}"
                print(f"WARN list failed {directory}: {reason}", flush=True)
                failed_dirs.append({'path': directory, 'error': reason})
                continue
            for item in entries:
                name = item.get('file') or item.get('name') or ''
                if name in {'.', '..'}:
                    continue
                p = item_path(directory, item)
                if item_type(item) == 'dir':
                    if p not in queued:
                        queued.add(p)
                        q.append(p)
                    continue
                try:
                    size = int(item.get('size') or 0)
                except (TypeError, ValueError):
                    # Unparseable size: count the file but contribute no bytes.
                    size = 0
                files.append({
                    'path': p,
                    'size': size,
                    'mtime': item.get('mtime') or item.get('modified') or item.get('ctime'),
                    'humansize': item.get('humansize'),
                })
            if dirs_seen % 100 == 0:
                print(f"crawled dirs={dirs_seen} files={len(files)} queue={len(q)} elapsed={time.time()-start:.1f}s", flush=True)
    summary = {
        'domain': DOMAIN,
        'uploads_path': uploads,
        'crawl_duration_sec': round(time.time() - start, 1),
        'directories_seen': dirs_seen,
        'file_count': len(files),
        'total_bytes': sum(f['size'] for f in files),
        # Non-empty means the inventory below is incomplete.
        'failed_directory_count': len(failed_dirs),
        'failed_directories': failed_dirs,
    }
    inv = {'summary': summary, 'files': files}
    (OUT / 'file_inventory.json').write_text(json.dumps(inv, indent=2))
    return inv


def optimise_bytes(data: bytes, ext: str) -> bytes | None:
    """Re-encode an image in memory and return the optimised bytes.

    Args:
        data: raw file content.
        ext: file suffix including the dot (any letter case).

    Returns:
        The re-encoded bytes (JPEG quality 85 progressive, PNG at maximum
        compression, WebP quality 82), or ``None`` when the file is skipped:
        an unsupported extension, or an animated PNG/WebP.

    Raises:
        Whatever Pillow raises when ``data`` cannot be decoded or saved.
    """
    ext = ext.lower()
    is_jpeg = ext in ('.jpg', '.jpeg')
    if not is_jpeg and ext not in ('.png', '.webp'):
        # Nothing will be re-encoded, so do not spend time decoding the image.
        return None
    with Image.open(io.BytesIO(data)) as src:
        # The animation check must use the opened image: exif_transpose()
        # returns a plain Image copy that no longer has ``is_animated``, so
        # testing afterwards never fires and the file would be flattened to
        # its first frame and reported as a (bogus) saving.
        if not is_jpeg and getattr(src, 'is_animated', False):
            return None
        try:
            im = ImageOps.exif_transpose(src)
        except Exception:
            # Best effort: unreadable EXIF must not block the size estimate.
            im = src
        out = io.BytesIO()
        if is_jpeg:
            if im.mode not in ('RGB', 'L'):
                im = im.convert('RGB')
            im.save(out, format='JPEG', quality=85, optimize=True, progressive=True)
        elif ext == '.png':
            im.save(out, format='PNG', optimize=True, compress_level=9)
        else:
            im.save(out, format='WEBP', quality=82, method=6)
    return out.getvalue()


def _strata_plan(ext: str) -> tuple[int, int]:
    """Return ``(bins, per_bin)`` sampling parameters for one extension."""
    # Keep the Edmund Rice sample smaller than the Clarendon run because the
    # media library is much larger and cPanel/FTPS throughput is limited.
    plans = {
        '.jpg': (10, 20),
        '.jpeg': (10, 12),
        '.png': (10, 12),
        '.webp': (2, 20),
    }
    return plans.get(ext, (1, 12))


def _select_sample(
    variants: list[dict[str, Any]],
    rng: random.Random,
) -> tuple[list[dict[str, Any]], dict[str, dict[str, Any]]]:
    """Pick the files to download and describe each extension/size stratum."""
    by_ext: dict[str, list[dict[str, Any]]] = collections.defaultdict(list)
    for f in variants:
        by_ext[pathlib.Path(f['path']).suffix.lower()].append(f)
    selected: list[dict[str, Any]] = []
    strata_info: dict[str, dict[str, Any]] = {}
    for ext, members in by_ext.items():
        ordered = sorted(members, key=lambda x: x['size'])
        n = len(ordered)
        bins, per_bin = _strata_plan(ext)
        for b in range(bins):
            # Equal-count slices of the size-sorted list form the bins.
            part = ordered[b * n // bins:(b + 1) * n // bins]
            if not part:
                continue
            k = min(per_bin, len(part))
            sample = rng.sample(part, k) if len(part) > k else part
            selected.extend(sample)
            strata_info[f'{ext}:bin{b+1}'] = {
                'count': len(part),
                'bytes': sum(x['size'] for x in part),
                'sample_paths': {x['path'] for x in sample},
            }
    # Always include the 40 largest variants. They are not added to any
    # stratum's sample_paths, so they feed the sample totals and the
    # largest-savings list but not the extrapolated estimate.
    seen = {x['path'] for x in selected}
    for x in sorted(variants, key=lambda x: x['size'], reverse=True)[:40]:
        if x['path'] not in seen:
            selected.append(x)
            seen.add(x['path'])
    return selected, strata_info


def _extrapolate(
    strata_info: dict[str, dict[str, Any]],
    results: list[dict[str, Any]],
) -> tuple[float, dict[str, dict[str, Any]]]:
    """Scale each stratum's sampled saving ratio to its population bytes."""
    res_by_path = {r['path']: r for r in results}
    est_total = 0.0
    est_by_stratum: dict[str, dict[str, Any]] = {}
    for key, info in strata_info.items():
        rs = [res_by_path[p] for p in info['sample_paths'] if p in res_by_path]
        if not rs:
            # No successful sample: the stratum contributes no estimated saving.
            continue
        orig = sum(r['orig_size'] for r in rs)
        sav = sum(r['saving'] for r in rs)
        ratio = sav / orig if orig else 0
        est = ratio * info['bytes']
        est_total += est
        est_by_stratum[key] = {
            'population_count': info['count'],
            'population_bytes': info['bytes'],
            'sample_count': len(rs),
            'sample_orig_bytes': orig,
            'sample_saving_ratio': ratio,
            'estimated_saving_bytes': est,
        }
    return est_total, est_by_stratum


def sample_and_estimate(rec: dict[str, Any], inv: dict[str, Any]) -> dict[str, Any]:
    """Download a stratified sample of image variants and estimate savings.

    Variants are the inventory files with an image suffix whose name matches
    ``VAR_RE``. A reproducible sample (fixed seed 43) is fetched over FTPS,
    re-encoded locally with ``optimise_bytes`` and the per-stratum saving
    ratio is scaled up to the stratum's total bytes. Remote files are only
    read (``RETR``).

    Args:
        rec: hosting record providing ``server``, ``username``, ``password``.
        inv: inventory as produced by ``crawl_uploads``.

    Returns:
        The summary dict, also written to
        ``OUT / 'optimisation_sample_estimate.json'``.

    Raises:
        RuntimeError: if the record carries no password.
    """
    password = rec.get('password')
    if not password:
        # The crawl is skipped when a cached inventory exists, so its own
        # password check cannot be relied on here.
        raise RuntimeError('Hosting password missing from Synergy record (not printed)')
    files = inv['files']
    uploads = inv['summary']['uploads_path']
    variants = []
    for f in files:
        p = pathlib.Path(f['path'])
        if p.suffix.lower() in IMG_EXT and VAR_RE.search(p.name):
            variants.append(f)
    # A private generator gives the same draws as random.seed(43) without
    # reseeding the module-level RNG for the rest of the process.
    selected, strata_info = _select_sample(variants, random.Random(43))

    # NOTE(security): certificate and host-name verification are disabled for
    # the FTPS connection, exactly as before; the channel is encrypted but the
    # server is not authenticated. Review whether verification can be enabled.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    ftp = ftplib.FTP_TLS(context=ctx, timeout=90)

    prefix = f"/home/{rec['username']}/"

    def fetch(path: str) -> bytes:
        # RETR paths are relative to the account home; strip only a leading prefix.
        rel = path[len(prefix):] if path.startswith(prefix) else path
        bio = io.BytesIO()
        ftp.retrbinary('RETR ' + rel, bio.write)
        return bio.getvalue()

    results = []
    errors = []
    try:
        ftp.connect(rec['server'], 21)
        ftp.auth()
        ftp.prot_p()
        ftp.login(rec['username'], password)
        start = time.time()
        for i, f0 in enumerate(selected, 1):
            ext = pathlib.Path(f0['path']).suffix.lower()
            try:
                data = fetch(f0['path'])
                opt = optimise_bytes(data, ext)
                if opt is not None:
                    results.append({
                        'path': f0['path'],
                        'ext': ext,
                        'orig_size': len(data),
                        'inventory_size': f0['size'],
                        'opt_size': len(opt),
                        'saving': max(0, len(data) - len(opt)),
                    })
            except Exception as e:  # one bad file must not abort the run
                errors.append({'path': f0['path'], 'error': repr(e)[:300]})
            if i % 100 == 0:
                print(f"sampled {i}/{len(selected)} ok={len(results)} errors={len(errors)} elapsed={time.time()-start:.1f}s", flush=True)
    finally:
        # Release the control connection even when connect/login failed.
        try:
            ftp.quit()
        except Exception:
            ftp.close()

    est_total, est_by_stratum = _extrapolate(strata_info, results)
    by_ext_res: dict[str, dict[str, int]] = collections.defaultdict(lambda: {'count': 0, 'orig': 0, 'opt': 0, 'saving': 0})
    for r in results:
        d = by_ext_res[r['ext']]
        d['count'] += 1
        d['orig'] += r['orig_size']
        d['opt'] += r['opt_size']
        d['saving'] += r['saving']

    variant_bytes = sum(f['size'] for f in variants)
    summary = {
        'method': 'Stratified sample of WordPress-generated image variants; JPEG q85 optimize/progressive, PNG lossless Pillow optimize, WebP q82. Estimates weighted by extension+size decile strata.',
        'domain': DOMAIN,
        'uploads_path': uploads,
        'variant_population_files': len(variants),
        'variant_population_bytes': variant_bytes,
        'sample_selected': len(selected),
        'sample_success': len(results),
        'sample_errors': len(errors),
        'sample_original_bytes': sum(r['orig_size'] for r in results),
        'sample_optimised_bytes': sum(r['opt_size'] for r in results),
        'sample_saving_bytes': sum(r['saving'] for r in results),
        'estimated_saving_bytes': round(est_total),
        'estimated_post_optimisation_variant_bytes': round(variant_bytes - est_total),
        'by_ext_sample': by_ext_res,
        'by_stratum_estimate': est_by_stratum,
        'errors': errors[:30],
        'duration_sec': round(time.time() - start, 1),
        'largest_sample_savings': sorted(results, key=lambda r: r['saving'], reverse=True)[:50],
    }
    (OUT / 'optimisation_sample_estimate.json').write_text(json.dumps(summary, indent=2))
    return summary


def summarise_inventory(inv: dict[str, Any]) -> dict[str, Any]:
    """Summarise the inventory by image class and write the result to disk.

    Files with a suffix in ``IMG_EXT`` are images; an image whose name matches
    ``VAR_RE`` is a variant, any other image is an original. The returned
    dict (counts and byte totals, plus per-extension breakdowns) is also
    written to ``OUT / 'inventory_summary.json'``.
    """
    files = inv['files']
    images: list[dict[str, Any]] = []
    variants: list[dict[str, Any]] = []
    originals: list[dict[str, Any]] = []
    for entry in files:
        remote = pathlib.Path(entry['path'])
        if remote.suffix.lower() not in IMG_EXT:
            continue
        images.append(entry)
        if VAR_RE.search(remote.name):
            variants.append(entry)
        else:
            originals.append(entry)

    def tally(group):
        # Per-extension file count and byte total, in first-seen order.
        totals: dict[str, dict[str, int]] = {}
        for entry in group:
            suffix = pathlib.Path(entry['path']).suffix.lower()
            bucket = totals.setdefault(suffix, {'count': 0, 'bytes': 0})
            bucket['count'] += 1
            bucket['bytes'] += entry['size']
        return totals

    def total_bytes(group):
        return sum(entry['size'] for entry in group)

    report = {
        'total_files': len(files),
        'total_bytes': total_bytes(files),
        'image_files': len(images),
        'image_bytes': total_bytes(images),
        'variant_files': len(variants),
        'variant_bytes': total_bytes(variants),
        'original_files': len(originals),
        'original_bytes': total_bytes(originals),
        'variants_by_ext': tally(variants),
        'originals_by_ext': tally(originals),
    }
    (OUT / 'inventory_summary.json').write_text(json.dumps(report, indent=2))
    return report


def main() -> None:
    """Run the audit end to end: look up hosting, inventory, summarise, estimate.

    A cached ``file_inventory.json`` in ``OUT`` is reused unless the
    environment variable ``RECrawl`` is set to ``1``. Only a fixed list of
    hosting fields (no password) is printed.
    """
    rec = get_record()
    printable_keys = [
        'domain', 'hoid', 'serviceStatus', 'plan', 'diskUsage', 'diskLimit',
        'server', 'serverIPAddress', 'username',
    ]
    safe_rec = {key: rec.get(key) for key in printable_keys}
    print(json.dumps({'hosting': safe_rec}, indent=2), flush=True)

    inv_path = OUT / 'file_inventory.json'
    force_recrawl = os.environ.get('RECrawl') == '1'
    if force_recrawl or not inv_path.exists():
        inv = crawl_uploads(rec)
        print('Crawl complete', json.dumps(inv['summary'], indent=2), flush=True)
    else:
        inv = json.loads(inv_path.read_text())
        print('Loaded existing inventory', inv_path, flush=True)

    inv_summary = summarise_inventory(inv)
    print('Inventory summary', json.dumps(inv_summary, indent=2), flush=True)

    estimate = sample_and_estimate(rec, inv)
    headline_keys = [
        'variant_population_files', 'variant_population_bytes', 'sample_selected',
        'sample_success', 'sample_errors', 'sample_original_bytes',
        'sample_saving_bytes', 'estimated_saving_bytes',
        'estimated_post_optimisation_variant_bytes', 'duration_sec',
    ]
    headline = {key: estimate[key] for key in headline_keys}
    print('Estimate summary', json.dumps(headline, indent=2), flush=True)
    print('Saved outputs to', OUT, flush=True)


# Run the audit only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
