#!/usr/bin/env python3
from __future__ import annotations

import csv
import hashlib
import json
import mimetypes
import os
import subprocess
import sys
import time
from pathlib import Path

import requests

# Webflow site that receives the uploads; interpolated into every API URL built below.
SITE_ID = '6a1e37436b332da28ecc3001'
API_BASE = 'https://api.webflow.com/v2'

# Working directory that holds the local images and every manifest this script reads or writes.
OUT = Path('/Users/iggy/.hermes/profiles/ignite_team/outbound/krb-image-grab')

# Inputs: the image directory to upload and the CSV manifest describing those images.
SOURCE_DIR = OUT.joinpath('webflow-import-assets-2000px', 'Other static images')
SOURCE_MANIFEST = OUT.joinpath('krb-webflow-other-static-images-2000px.csv')

# Outputs: per-file upload results, the folder record, the asset read-back and the reconciliation.
UPLOAD_MANIFEST = OUT.joinpath('krb-webflow-other-static-images-uploaded-to-webflow.csv')
FOLDER_JSON = OUT.joinpath('krb-webflow-other-static-images-folder.json')
READBACK_JSON = OUT.joinpath('krb-webflow-other-static-images-assets-readback.json')
RECONCILIATION_CSV = OUT.joinpath('krb-webflow-other-static-images-reconciliation.csv')

# Display name of the root-level Webflow asset folder that is looked up or created.
FOLDER_NAME = 'Other static images'

# 1Password item and vault handed to the `op` CLI when WEBFLOW_TOKEN is not set.
OP_ITEM_ID = 'bqfhhqbsg5dmhoxof2l5c64xsa'
OP_VAULT = 'Vault for Iggy (IGNITE OpenClaw Bot)'


def get_token() -> str:
    """Return the Webflow API token.

    A non-empty ``WEBFLOW_TOKEN`` environment variable wins. Otherwise the
    1Password CLI (``op item get``) is invoked for ``OP_ITEM_ID`` in
    ``OP_VAULT`` and the first field whose label or id is ``credential`` and
    whose value is non-empty is returned.

    Raises:
        subprocess.CalledProcessError: if the ``op`` command exits non-zero.
        RuntimeError: if the item contains no usable ``credential`` field.
    """
    env_token = os.environ.get('WEBFLOW_TOKEN')
    if env_token:
        return env_token

    op_command = ['op', 'item', 'get', OP_ITEM_ID, '--vault', OP_VAULT, '--format', 'json', '--reveal']
    completed = subprocess.run(op_command, text=True, capture_output=True, check=True)
    item_fields = json.loads(completed.stdout).get('fields', [])

    for field in item_fields:
        # The credential may be identified by either its label or its id.
        if 'credential' not in (field.get('label'), field.get('id')):
            continue
        if field.get('value'):
            return field['value']
    raise RuntimeError('Could not retrieve KRB Webflow credential from 1Password')


def api_headers(token: str) -> dict[str, str]:
    """Build the bearer-auth headers that ask the API for a JSON response."""
    bearer = f'Bearer {token}'
    return dict(Authorization=bearer, accept='application/json')


def json_headers(token: str) -> dict[str, str]:
    """Return the ``api_headers`` for *token* plus a JSON ``content-type`` entry."""
    return {**api_headers(token), 'content-type': 'application/json'}


def request_with_retry(method: str, url: str, **kwargs) -> requests.Response:
    """Send an HTTP request, retrying transient failures up to three attempts.

    An attempt is retried when ``requests`` raises a ``RequestException`` or
    when the response status is 429 or >= 500. Any other response is returned
    immediately. After a retryable response the wait honours a positive
    integer ``Retry-After`` header; otherwise (header missing, non-numeric
    such as an HTTP-date, or non-positive) it falls back to exponential
    backoff. No sleep happens after the final attempt.

    The same ``kwargs`` are passed to every attempt and nothing is rewound, so
    request bodies must be re-readable (for example ``bytes`` rather than an
    open file handle).

    Args:
        method: HTTP method name.
        url: Target URL.
        **kwargs: Forwarded to ``requests.request``. ``timeout`` defaults to
            ``(15, 45)`` when the caller does not supply one.

    Returns:
        The first non-retryable response, or the last retryable response when
        the final attempt still produced one.

    Raises:
        requests.RequestException: if the final attempt itself raised.
    """
    max_attempts = 3
    # Supplying the default this way lets callers override the timeout instead
    # of hitting a duplicate-keyword TypeError.
    kwargs.setdefault('timeout', (15, 45))

    last_exc: Exception | None = None
    response: requests.Response | None = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.request(method, url, **kwargs)
        except requests.RequestException as exc:
            last_exc = exc
            wait = min(2 ** attempt, 15)
            reason = f'exception={type(exc).__name__}'
        else:
            # A response arrived, so an exception from an earlier attempt is
            # no longer the outcome to report.
            last_exc = None
            if response.status_code != 429 and response.status_code < 500:
                return response
            try:
                hinted = int(response.headers.get('Retry-After', ''))
            except (TypeError, ValueError):
                # Header absent or not a plain integer (e.g. an HTTP-date).
                hinted = 0
            wait = hinted if hinted > 0 else min(2 ** attempt, 30)
            reason = f'status={response.status_code}'

        if attempt == max_attempts:
            # Nothing left to retry; do not sleep for no purpose.
            break
        print(f'  retry {attempt} after {wait}s for {method} {url} {reason}', flush=True)
        time.sleep(wait)

    if last_exc is not None:
        raise last_exc
    return response


def paged_get(token: str, url: str, key: str, params: dict | None = None) -> list[dict]:
    """Collect every item of a paginated list endpoint.

    Pages are requested with ``limit=100`` and an increasing ``offset`` on top
    of the caller's *params* (which are copied, never mutated). Items are read
    from ``data[key]`` of each JSON response. Paging stops once the offset plus
    the reported ``pagination.limit`` reaches ``pagination.total``; when the
    response carries no pagination info the limit defaults to 100 and the
    total to the number of items collected so far.

    Raises:
        RuntimeError: if any page responds with a status other than 200.
    """
    base_query = dict(params or {})
    collected: list[dict] = []
    offset = 0
    more_pages = True
    while more_pages:
        query = {**base_query, 'limit': 100, 'offset': offset}
        resp = request_with_retry('GET', url, headers=api_headers(token), params=query)
        if resp.status_code != 200:
            raise RuntimeError(f'GET failed: {url} {resp.status_code} {resp.text[:1000]}')
        payload = resp.json()
        collected.extend(payload.get(key, []))

        pagination = payload.get('pagination') or {}
        page_size = pagination.get('limit', 100)
        total = pagination.get('total', len(collected))
        # Advance first, then decide whether the next window still holds items.
        offset += page_size
        more_pages = offset < total
    return collected


def list_folders(token: str) -> list[dict]:
    """Return all asset folders of the site, read from the paginated ``asset_folders`` endpoint."""
    folders_endpoint = f'{API_BASE}/sites/{SITE_ID}/asset_folders'
    return paged_get(token, folders_endpoint, 'assetFolders')


def get_or_create_folder(token: str) -> dict:
    """Return the root-level asset folder named ``FOLDER_NAME``, creating it if absent.

    An existing folder matches when its ``displayName`` equals ``FOLDER_NAME``
    and its ``parentFolder`` is ``None`` or empty. Otherwise a folder is
    created with a POST and the decoded JSON response is returned.

    Raises:
        RuntimeError: if the create request does not answer 200, 201 or 202.
    """
    match = next(
        (
            folder
            for folder in list_folders(token)
            if folder.get('displayName') == FOLDER_NAME and folder.get('parentFolder') in (None, '')
        ),
        None,
    )
    if match is not None:
        return match

    new_folder = {'displayName': FOLDER_NAME, 'parentFolder': None}
    resp = request_with_retry(
        'POST',
        f'{API_BASE}/sites/{SITE_ID}/asset_folders',
        headers=json_headers(token),
        json=new_folder,
    )
    if resp.status_code in (200, 201, 202):
        return resp.json()
    raise RuntimeError(f'Create folder failed: {resp.status_code} {resp.text}')


def md5_file(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*, read in 1 MiB chunks."""
    chunk_size = 1024 * 1024
    digest = hashlib.md5()
    with path.open('rb') as stream:
        # read() returns b'' at end of file, which ends the loop.
        while block := stream.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()


def upload_asset(token: str, folder_id: str, path: Path) -> dict:
    """Register *path* as a Webflow asset and upload its bytes to the returned URL.

    First the asset metadata (file name, MD5 hash, parent folder) is POSTed to
    the site's ``assets`` endpoint. If the response contains an ``uploadUrl``,
    the file is then sent there as a multipart form: the ``uploadDetails``
    entries as form fields plus the file under the ``file`` key. When no
    ``uploadUrl`` is returned, no file transfer is attempted.

    Args:
        token: Webflow API token.
        folder_id: Id of the asset folder sent as ``parentFolder``.
        path: Local file to upload.

    Returns:
        The decoded create-asset response, extended with ``fileName``,
        ``fileHash`` and ``localBytes`` describing the local file.

    Raises:
        RuntimeError: if the create call or the file upload answers with an
            unexpected status code.
    """
    file_hash = md5_file(path)
    create_body = {'fileName': path.name, 'fileHash': file_hash, 'parentFolder': folder_id}
    r = request_with_retry(
        'POST',
        f'{API_BASE}/sites/{SITE_ID}/assets',
        headers=json_headers(token),
        json=create_body,
    )
    if r.status_code not in (200, 201, 202):
        raise RuntimeError(f'Create asset failed for {path.name}: {r.status_code} {r.text[:1000]}')
    data = r.json()
    upload_url = data.get('uploadUrl')
    upload_details = data.get('uploadDetails') or {}
    if upload_url:
        content_type = (
            data.get('contentType')
            or upload_details.get('content-type')
            or mimetypes.guess_type(path.name)[0]
            or 'application/octet-stream'
        )
        # Send bytes rather than an open file handle: the retry wrapper issues
        # the same arguments again on failure, and a handle already consumed
        # by the first attempt would make every retry upload an empty body.
        payload = path.read_bytes()
        sr = request_with_retry(
            'POST',
            upload_url,
            data=dict(upload_details),
            files={'file': (path.name, payload, content_type)},
        )
        if sr.status_code not in (200, 201, 202, 204):
            raise RuntimeError(f'S3 upload failed for {path.name}: {sr.status_code} {sr.text[:500]}')
    data['fileName'] = path.name
    data['fileHash'] = file_hash
    data['localBytes'] = path.stat().st_size
    return data


def list_assets_in_folder(token: str, folder_id: str) -> list[dict]:
    """Return the assets listed by the site's ``assets`` endpoint for *folder_id*.

    The folder id is sent as the ``folderId`` query parameter; no filtering is
    done on the client side.
    NOTE(review): assumes the API honours ``folderId`` as a filter - confirm.
    """
    assets_endpoint = f'{API_BASE}/sites/{SITE_ID}/assets'
    folder_filter = {'folderId': folder_id}
    return paged_get(token, assets_endpoint, 'assets', folder_filter)


def asset_url(asset: dict) -> str:
    """Pick a URL for *asset*, or return ``''`` when none is available.

    The asset's own ``hostedUrl`` is preferred, then its ``url``. Failing
    both, the first entry of ``variants`` is consulted (only when it is a
    dict) using the same key preference.
    """
    direct = asset.get('hostedUrl') or asset.get('url')
    if direct:
        return str(direct)

    variants = asset.get('variants') or []
    if not variants or not isinstance(variants[0], dict):
        return ''
    first_variant = variants[0]
    return first_variant.get('hostedUrl') or first_variant.get('url') or ''


def _asset_name(asset: dict) -> str | None:
    """Return the first non-empty of an asset's displayName, fileName or filename."""
    return asset.get('displayName') or asset.get('fileName') or asset.get('filename')


def main() -> None:
    """Upload every image in ``SOURCE_DIR`` to the Webflow asset folder and reconcile.

    Steps:
      1. Check that the source directory and manifest exist and that both
         hold exactly 540 entries (hidden dot-files such as ``.DS_Store`` are
         ignored when listing the directory).
      2. Find or create the target folder and store its record in ``FOLDER_JSON``.
      3. Upload each file unless an asset with the same name is already listed
         for the folder; a failed upload is recorded in its row, not raised.
      4. Write ``UPLOAD_MANIFEST``, re-read the folder's assets into
         ``READBACK_JSON`` and write ``RECONCILIATION_CSV``.
      5. Print a summary.

    Raises:
        SystemExit: if an input is missing or the 540-entry check fails.
    """
    if not SOURCE_DIR.exists():
        raise SystemExit(f'Missing source dir: {SOURCE_DIR}')
    if not SOURCE_MANIFEST.exists():
        raise SystemExit(f'Missing source manifest: {SOURCE_MANIFEST}')

    expected_count = 540
    # Read inside a context manager so the manifest handle is closed promptly.
    with SOURCE_MANIFEST.open(newline='', encoding='utf-8') as manifest_file:
        rows = list(csv.DictReader(manifest_file))
    # Dot-files (e.g. macOS .DS_Store) are not images: they must neither count
    # towards the expected total nor be uploaded.
    files = sorted(
        p for p in SOURCE_DIR.iterdir()
        if p.is_file() and not p.name.startswith('.')
    )
    if len(rows) != expected_count or len(files) != expected_count:
        raise SystemExit(f'Expected {expected_count} manifest rows/files, found rows={len(rows)} files={len(files)}')

    token = get_token()
    folder = get_or_create_folder(token)
    FOLDER_JSON.write_text(json.dumps(folder, indent=2))
    folder_id = folder['id']
    print(f'Using folder {FOLDER_NAME}: {folder_id}', flush=True)

    # On a re-run, skip files whose name is already listed for the target
    # folder. Only the name is compared; the content hash is not checked.
    existing_assets = list_assets_in_folder(token, folder_id)
    existing_by_name = {_asset_name(a): a for a in existing_assets}
    print(f'Existing assets in folder before upload: {len(existing_assets)}', flush=True)

    source_by_filename = {r['filename']: r for r in rows}
    upload_rows = []
    for i, path in enumerate(files, 1):
        src = source_by_filename.get(path.name, {})
        local_hash = md5_file(path)
        local_bytes = path.stat().st_size
        existing = existing_by_name.get(path.name)
        if existing:
            result = dict(existing)
            result['fileName'] = path.name
            result['fileHash'] = local_hash
            result['localBytes'] = local_bytes
            status = 'already-present-by-filename'
            print(f'[{i}/{len(files)}] skipping existing {path.name}', flush=True)
        else:
            print(f'[{i}/{len(files)}] uploading {path.name}', flush=True)
            try:
                result = upload_asset(token, folder_id, path)
                status = 'uploaded'
                if result.get('id'):
                    existing_by_name[path.name] = result
            except Exception as e:
                # Deliberate best effort: one failed file must not stop the
                # batch, so the error is recorded in the row and reported.
                result = {'fileName': path.name, 'fileHash': local_hash, 'localBytes': local_bytes, 'id': '', 'parentFolder': folder_id}
                status = f'error: {e}'
                print(f'  ERROR {e}', file=sys.stderr, flush=True)
        upload_rows.append({
            'filename': path.name,
            'asset_id': result.get('id', ''),
            'parent_folder': result.get('parentFolder', folder_id),
            'upload_url_present': 'yes' if result.get('uploadUrl') else 'no',
            'content_type': result.get('contentType', ''),
            'file_hash': result.get('fileHash', local_hash),
            'local_bytes': result.get('localBytes', local_bytes),
            'status': status,
            'canonical_url': src.get('canonical_url', ''),
            'page_count': src.get('page_count', ''),
            'occurrence_count': src.get('occurrence_count', ''),
            'sample_page_urls': src.get('sample_page_urls', ''),
            'source_urls_collapsed': src.get('source_urls_collapsed', ''),
            'sources': src.get('sources', ''),
        })

    fields = [
        'filename', 'asset_id', 'parent_folder', 'upload_url_present', 'content_type',
        'file_hash', 'local_bytes', 'status', 'canonical_url', 'page_count',
        'occurrence_count', 'sample_page_urls', 'source_urls_collapsed', 'sources',
    ]
    with UPLOAD_MANIFEST.open('w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, fieldnames=fields)
        w.writeheader()
        w.writerows(upload_rows)

    # Allow Webflow indexing a moment, then re-read.
    time.sleep(5)
    assets = list_assets_in_folder(token, folder_id)
    READBACK_JSON.write_text(json.dumps(assets, indent=2))

    readback_by_name = {_asset_name(a): a for a in assets}
    recon_fields = fields + ['webflow_asset_name', 'webflow_hosted_url', 'readback_found']
    with RECONCILIATION_CSV.open('w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, fieldnames=recon_fields)
        w.writeheader()
        for r in upload_rows:
            a = readback_by_name.get(r['filename']) or {}
            out = dict(r)
            out.update({
                'webflow_asset_name': _asset_name(a) or '',
                'webflow_hosted_url': asset_url(a),
                'readback_found': 'yes' if a else 'no',
            })
            w.writerow(out)

    uploaded_count = sum(1 for r in upload_rows if r['status'] == 'uploaded')
    present_count = sum(1 for r in upload_rows if r['status'].startswith('already-present'))
    error_count = sum(1 for r in upload_rows if r['status'].startswith('error:'))
    unique_ids = {r['asset_id'] for r in upload_rows if r['asset_id']}

    # NOTE(review): the process still exits with status 0 when error_count > 0;
    # confirm whether automation around this script needs a failing exit code.
    print('DONE')
    print(f'folder_id: {folder_id}')
    print(f'attempted rows: {len(upload_rows)}')
    print(f'uploaded: {uploaded_count}')
    print(f'already_present: {present_count}')
    print(f'errors: {error_count}')
    print(f'unique asset ids in manifest: {len(unique_ids)}')
    print(f'readback assets in folder: {len(assets)}')
    print(f'manifest: {UPLOAD_MANIFEST}')
    print(f'reconciliation: {RECONCILIATION_CSV}')
    print(f'readback: {READBACK_JSON}')


# Run the upload only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
