#!/usr/bin/env python3
import hashlib
import json
import mimetypes
import os
import re
import subprocess
import sys
import time
from pathlib import Path
from urllib.parse import urlparse, unquote

import requests

# Webflow site id; used to build the `/sites/{SITE_ID}/assets` endpoint.
SITE_ID = "6a1e37436b332da28ecc3001"
# Collection that is read for existing stories and receives the new items.
STORIES_COLLECTION_ID = "6a339195e8b9effdf252ce7d"
# Collection whose items provide the slug -> item id map for category references.
CATEGORIES_COLLECTION_ID = "6a33919170f87b875836bf99"
# 1Password item and vault passed to `op item get` to obtain the Webflow credential.
OP_ITEM_ID = "bqfhhqbsg5dmhoxof2l5c64xsa"
OP_VAULT = "Vault for Iggy (IGNITE OpenClaw Bot)"
# Input: JSON file of scraped rows loaded by main().
SCRAPE_PATH = Path("/Users/iggy/.hermes/profiles/ignite_team/outbound/krb-alumni-stories-scrape.json")
# Output: JSON report that main() rewrites after every processed row.
REPORT_PATH = Path("/Users/iggy/.hermes/profiles/ignite_team/outbound/krb-alumni-stories-import-report.json")
# Base URL that every Webflow request path is appended to.
WF_BASE = "https://api.webflow.com/v2"
# User-Agent header sent with Webflow API calls and image downloads.
UA = "Hermes KRB Alumni Import/1.0"


def read_env_var(path: Path, key: str):
    """Return the value assigned to ``key`` in a dotenv-style file.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored. A
    leading ``export `` is dropped before the line is split on its first
    ``=``. The first matching entry wins; its value is stripped of
    surrounding whitespace and of leading/trailing double and single quotes.

    Returns ``None`` when the file does not exist or no entry matches.
    """
    if not path.exists():
        return None

    export_prefix = "export "
    for raw_line in path.read_text(errors="replace").splitlines():
        entry = raw_line.strip()
        if entry == "" or entry.startswith("#"):
            continue
        if entry.startswith(export_prefix):
            entry = entry[len(export_prefix):].strip()
        name, separator, value = entry.partition("=")
        if not separator:
            # No assignment on this line.
            continue
        if name.strip() != key:
            continue
        return value.strip().strip('"').strip("'")
    return None


def get_webflow_token():
    """Fetch the Webflow credential from 1Password through the ``op`` CLI.

    The 1Password service-account token is taken from the
    ``OP_SERVICE_ACCOUNT_TOKEN`` environment variable, falling back to the
    profile ``.env`` file. ``op item get`` is then run for ``OP_ITEM_ID`` in
    ``OP_VAULT`` and the value of the field labelled ``credential`` is
    returned.

    Raises:
        SystemExit: if the service-account token is unavailable, the ``op``
            binary is missing, the ``op`` call fails, its output is not JSON,
            or the item has no non-empty ``credential`` field.
    """
    token = os.environ.get("OP_SERVICE_ACCOUNT_TOKEN") or read_env_var(
        Path("/Users/iggy/.hermes/profiles/ignite_team/.env"),
        "OP_SERVICE_ACCOUNT_TOKEN",
    )
    if not token:
        raise SystemExit("OP_SERVICE_ACCOUNT_TOKEN not available")

    env = os.environ.copy()
    env["OP_SERVICE_ACCOUNT_TOKEN"] = token
    try:
        cp = subprocess.run(
            ["op", "item", "get", OP_ITEM_ID, "--vault", OP_VAULT, "--format", "json", "--reveal"],
            env=env,
            text=True,
            capture_output=True,
            check=True,
        )
    except FileNotFoundError as exc:
        raise SystemExit("1Password CLI ('op') not found on PATH") from exc
    except subprocess.CalledProcessError as exc:
        # Only stderr is reported: stdout may hold the revealed secret.
        detail = (exc.stderr or "").strip()[:500]
        raise SystemExit(f"op item get failed with exit code {exc.returncode}: {detail}") from exc

    try:
        item = json.loads(cp.stdout)
    except json.JSONDecodeError as exc:
        # The raw output is deliberately not echoed; it may contain the secret.
        raise SystemExit("op item get returned output that is not valid JSON") from exc

    for field in item.get("fields", []):
        if field.get("label") == "credential" and field.get("value"):
            return field["value"]
    raise SystemExit("KRB Webflow credential field not found")


class Webflow:
    """Small Webflow API client on top of one shared ``requests`` session."""

    # How many times a request is re-sent after an HTTP 429 answer.
    MAX_RATE_LIMIT_RETRIES = 5
    # Wait (seconds) used when a 429 response has no usable Retry-After header.
    DEFAULT_RETRY_AFTER = 10.0
    # Upper bound (seconds) on a single wait, whatever the server asks for.
    MAX_RETRY_AFTER = 120.0

    def __init__(self, token):
        """Create the session and attach bearer auth, accept and User-Agent headers."""
        self.s = requests.Session()
        self.s.headers.update({"Authorization": f"Bearer {token}", "accept": "application/json", "User-Agent": UA})

    def _retry_delay(self, response):
        """Return the seconds to wait before re-sending a rate-limited request."""
        raw = response.headers.get("Retry-After")
        try:
            delay = float(raw)
        except (TypeError, ValueError):
            # Header absent or not a plain number of seconds.
            return self.DEFAULT_RETRY_AFTER
        if not delay >= 0:
            # Negative or NaN values are not usable as a sleep duration.
            return self.DEFAULT_RETRY_AFTER
        return min(delay, self.MAX_RETRY_AFTER)

    def request(self, method, path, **kwargs):
        """Send ``method`` to ``WF_BASE + path`` and return the decoded JSON body.

        Extra keyword arguments are forwarded to ``Session.request``. When a
        ``json`` payload is given, a ``content-type: application/json`` header
        is added unless the caller already supplied one. HTTP 429 responses
        are retried up to ``MAX_RATE_LIMIT_RETRIES`` times, sleeping between
        attempts.

        Returns:
            The parsed JSON response, or ``{}`` when the body is empty.

        Raises:
            RuntimeError: if the final response status is 400 or above.
        """
        url = WF_BASE + path
        if "json" in kwargs:
            # Copy so a dict owned by the caller is never modified.
            headers = dict(kwargs.pop("headers", None) or {})
            headers.setdefault("content-type", "application/json")
            kwargs["headers"] = headers

        attempt = 0
        while True:
            r = self.s.request(method, url, timeout=60, **kwargs)
            if r.status_code != 429 or attempt >= self.MAX_RATE_LIMIT_RETRIES:
                break
            attempt += 1
            time.sleep(self._retry_delay(r))

        if r.status_code >= 400:
            raise RuntimeError(f"Webflow {method} {path} failed {r.status_code}: {r.text[:1000]}")
        if not r.text.strip():
            return {}
        return r.json()

    def get_all_items(self, collection_id):
        """Return every item of ``collection_id``, fetching pages of up to 100.

        Paging stops once the number of collected items reaches the reported
        ``pagination.total`` or a page comes back empty.
        """
        out = []
        offset = 0
        while True:
            data = self.request("GET", f"/collections/{collection_id}/items?limit=100&offset={offset}")
            items = data.get("items", [])
            out.extend(items)
            pag = data.get("pagination") or {}
            # Without a reported total, treat what we have as complete.
            total = pag.get("total", len(out))
            if len(out) >= total or not items:
                return out
            offset += len(items)

    def create_item(self, collection_id, field_data):
        """POST a new non-archived, non-draft item with ``field_data``; return the response."""
        payload = {"isArchived": False, "isDraft": False, "fieldData": field_data}
        return self.request("POST", f"/collections/{collection_id}/items", json=payload)

    def create_asset_metadata(self, file_name, file_hash):
        """Register an asset for ``SITE_ID`` by name (cut to 99 chars) and hash; return the response."""
        return self.request("POST", f"/sites/{SITE_ID}/assets", json={"fileName": file_name[:99], "fileHash": file_hash})


def filename_from_url(url, max_len=99):
    """Derive a safe asset file name from the last path segment of ``url``.

    The segment is percent-decoded, every run of characters outside
    ``A-Za-z0-9._-`` becomes a single ``-``, and ``.jpg`` is appended when the
    name contains no dot. An empty segment yields ``image.jpg``.

    Names longer than ``max_len`` are shortened by trimming the stem so that a
    short extension survives; cutting the tail would drop the extension.

    Args:
        url: Source URL of the file.
        max_len: Maximum length of the returned name.

    Returns:
        A file name of at most ``max_len`` characters.
    """
    path = unquote(urlparse(url).path)
    name = path.rsplit("/", 1)[-1] or "image.jpg"
    name = re.sub(r"[^A-Za-z0-9._-]+", "-", name)
    if "." not in name:
        name += ".jpg"
    if len(name) <= max_len:
        return name

    stem, dot, ext = name.rpartition(".")
    suffix = dot + ext
    # Only preserve suffixes that look like a real extension and leave room
    # for at least one stem character; otherwise fall back to a plain cut.
    if not stem or len(suffix) > 10 or len(suffix) >= max_len:
        return name[:max_len]
    return stem[: max_len - len(suffix)] + suffix


def download_image(url):
    """Download ``url`` and return ``(body_bytes, content_type)``.

    The content type comes from the response's ``content-type`` header; if
    that is missing it is guessed from the URL, and failing that it defaults
    to ``application/octet-stream``. HTTP error statuses raise through
    ``raise_for_status``.
    """
    response = requests.get(url, headers={"User-Agent": UA}, timeout=60)
    response.raise_for_status()
    body = response.content
    content_type = response.headers.get("content-type")
    if not content_type:
        content_type = mimetypes.guess_type(url)[0]
    if not content_type:
        content_type = "application/octet-stream"
    return body, content_type


def upload_asset(wf: "Webflow", url, title, cache):
    """Upload the image at ``url`` as a Webflow asset and return a reference to it.

    Args:
        wf: Client used to register the asset metadata.
        url: Source image URL; a falsy value short-circuits to ``None``.
        title: Unused; kept so existing callers keep working.
        cache: Dict mapping source URL to an already-built result. It is
            consulted first and updated on success, so each URL is uploaded
            at most once per cache.

    Returns:
        ``None`` for a falsy ``url``; otherwise ``{"fileId": ...}``, plus a
        ``"url"`` key when the metadata response carried a hosted URL.

    Raises:
        RuntimeError: if the metadata response has no asset id, or the upload
            request answers with a status other than 200/201/204.
    """
    if not url:
        return None
    if url in cache:
        return cache[url]

    content, content_type = download_image(url)
    md5 = hashlib.md5(content).hexdigest()
    file_name = filename_from_url(url)
    meta = wf.create_asset_metadata(file_name, md5)

    # Some fields may sit under a nested "asset" object; tolerate it being
    # absent or null.
    nested = meta.get("asset") or {}
    asset_id = meta.get("id") or nested.get("id")
    # Check the id first: without it the upload could never be referenced.
    if not asset_id:
        raise RuntimeError(f"No asset id in response keys {list(meta.keys())}")
    asset_url = (
        meta.get("hostedUrl")
        or meta.get("url")
        or meta.get("assetUrl")
        or nested.get("hostedUrl")
        or nested.get("url")
    )

    upload_url = meta.get("uploadUrl")
    upload_details = meta.get("uploadDetails") or {}
    # NOTE(review): when either value is missing the bytes are never sent and
    # the asset reference is still returned - confirm this is intended.
    if upload_url and upload_details:
        files = {"file": (file_name, content, content_type)}
        s3 = requests.post(upload_url, data=upload_details, files=files, timeout=120)
        if s3.status_code not in (200, 201, 204):
            raise RuntimeError(f"S3 upload failed {s3.status_code}: {s3.text[:500]}")

    # Without a hosted URL, fall back to a fileId-only reference.
    result = {"fileId": asset_id}
    if asset_url:
        result["url"] = asset_url
    cache[url] = result
    return result


def clean_summary(s):
    """Collapse whitespace in ``s`` and cap the result for use as a summary.

    A falsy ``s`` is treated as an empty string. Every run of whitespace
    becomes one space and the ends are trimmed. Text longer than 500
    characters is cut to its first 497 characters followed by an ellipsis.
    """
    text = s if s else ""
    text = re.sub(r"\s+", " ", text).strip()
    if len(text) <= 500:
        return text
    return text[:497] + "…"


def main():
    """Import scraped alumni stories into the Webflow stories collection.

    Steps:
      1. Load the scraped rows from ``SCRAPE_PATH``.
      2. Fetch the category items (slug -> id) and the existing stories.
      3. Skip rows whose slug already exists; record them under
         ``skipped_existing`` in the report.
      4. For each remaining row, upload its main image (best effort: a
         failure is logged and the story is created without an image), then
         create the story item.
      5. Rewrite the JSON report at ``REPORT_PATH`` after every row and once
         more at the end with the final item count.

    Rows lacking a ``slug`` or ``title`` are recorded under ``errors`` rather
    than aborting the whole run.
    """
    rows = json.loads(SCRAPE_PATH.read_text(encoding="utf-8"))
    token = get_webflow_token()
    wf = Webflow(token)
    categories = wf.get_all_items(CATEGORIES_COLLECTION_ID)
    cat_by_slug = {it.get("fieldData", {}).get("slug"): it.get("id") for it in categories}
    existing = wf.get_all_items(STORIES_COLLECTION_ID)
    existing_slugs = {it.get("fieldData", {}).get("slug") for it in existing}

    report = {
        "source_total": len(rows),
        "existing_before": len(existing),
        "created": [],
        "skipped_existing": [],
        "errors": [],
        "image_upload_errors": [],
        "started_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }

    def save_report():
        # The report is written with ensure_ascii=False, so pin the file
        # encoding instead of relying on the locale default.
        REPORT_PATH.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")

    asset_cache = {}
    to_create = []
    for row in rows:
        row_slug = row.get("slug")
        if row_slug in existing_slugs:
            report["skipped_existing"].append({"slug": row_slug, "title": row.get("title")})
        else:
            to_create.append(row)
    print(f"Preparing to create {len(to_create)} alumni stories (existing skipped: {len(rows)-len(to_create)})", flush=True)

    for idx, r in enumerate(to_create, 1):
        slug = r.get("slug")
        title = r.get("title")
        if not slug or not title:
            # A malformed row must not abort the remaining imports.
            report["errors"].append({"slug": slug, "title": title, "error": "missing required slug or title"})
            print(f"[{idx}/{len(to_create)}] CREATE FAILED {slug}: missing required slug or title", flush=True)
            save_report()
            continue

        field_data = {
            "name": title,
            "slug": slug,
            "summary": clean_summary(r.get("summary")),
            "body": r.get("body") or "",
            "published-date": r.get("published_date"),
        }
        # Categories without a matching slug are silently dropped.
        refs = [cat_by_slug[c] for c in r.get("categories", []) if c in cat_by_slug]
        if refs:
            field_data["categories"] = refs
        if r.get("main_image"):
            try:
                image_obj = upload_asset(wf, r["main_image"], title, asset_cache)
                if image_obj:
                    field_data["main-image"] = image_obj
            except Exception as e:
                # Best effort: the story is still created, just without an image.
                report["image_upload_errors"].append({"slug": slug, "image": r.get("main_image"), "error": str(e)})
                print(f"[{idx}/{len(to_create)}] image failed for {slug}: {e}", flush=True)
        try:
            created = wf.create_item(STORIES_COLLECTION_ID, field_data)
            item_id = created.get("id") or created.get("item", {}).get("id")
            report["created"].append({"slug": slug, "title": title, "id": item_id, "with_image": "main-image" in field_data, "categories": r.get("categories", [])})
            print(f"[{idx}/{len(to_create)}] created {slug} id={item_id} image={'yes' if 'main-image' in field_data else 'no'}", flush=True)
        except Exception as e:
            report["errors"].append({"slug": slug, "title": title, "error": str(e)})
            print(f"[{idx}/{len(to_create)}] CREATE FAILED {slug}: {e}", flush=True)
        # Persist progress after every row so an interrupted run leaves a usable report.
        save_report()
        time.sleep(0.1)

    final = wf.get_all_items(STORIES_COLLECTION_ID)
    report["final_total"] = len(final)
    report["finished_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    save_report()
    print("DONE", json.dumps({k: (len(v) if isinstance(v, list) else v) for k, v in report.items() if k not in ("created",)}, indent=2), flush=True)


# Run the import only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
