#!/usr/bin/env python3
"""
Delete every recording in `recordings/{uid}/recordings/*` that does NOT
have a non-empty transcription on at least one of its entries. Optionally
also deletes the associated Firebase Storage audio blobs so we don't
leave orphans behind.

Default mode is **dry-run** — prints exactly what would happen. Pass
`--execute` to actually delete (with a "DELETE" confirmation prompt).

Always run `backup_recordings.py` first; this script removes data
permanently from Firestore (and optionally from Storage).

Usage
-----
    # Dry run (safe, default)
    python3 delete_no_transcription.py --email ericmigi@gmail.com

    # Real deletion + audio cleanup
    python3 delete_no_transcription.py --email ericmigi@gmail.com \\
        --execute --delete-audio
"""

from __future__ import annotations

import argparse
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import firebase_admin
from firebase_admin import auth as fb_auth, credentials, firestore
from google.cloud import storage as gcs


# Storage bucket used for audio-blob cleanup when --bucket is not supplied.
DEFAULT_BUCKET = "coreapp-ce061.firebasestorage.app"


def _positive_int(raw: str) -> int:
    """argparse ``type=`` callable accepting only integers >= 1."""
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the deletion script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        Namespace with ``email``, ``uid``, ``service_account``, ``bucket``,
        ``page_size``, ``execute``, ``delete_audio`` and ``workers``.

    Raises:
        SystemExit: On invalid arguments (argparse exits with status 2),
            including a non-positive ``--page-size`` or ``--workers``.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--email", default="ericmigi@gmail.com",
                   help="Email used to look up the user's uid (ignored when --uid is given).")
    p.add_argument("--uid", default=None,
                   help="Firebase uid to operate on; skips the email lookup.")
    p.add_argument("--service-account", default=None,
                   help="Path to the service-account JSON (default: newest matching "
                        "key file in ~/Downloads).")
    p.add_argument("--bucket", default=DEFAULT_BUCKET,
                   help="Storage bucket that holds the audio blobs.")
    # Both numeric options are rejected up front when < 1: a bad --workers
    # value would otherwise only fail after the Firestore docs are gone.
    p.add_argument("--page-size", type=_positive_int, default=500,
                   help="Firestore documents fetched per query page.")
    p.add_argument("--execute", action="store_true",
                   help="Actually delete (default: dry-run).")
    p.add_argument("--delete-audio", action="store_true",
                   help="Also delete the associated Storage blobs for deleted recordings.")
    p.add_argument("--workers", type=_positive_int, default=16,
                   help="Parallel workers for audio deletion.")
    return p.parse_args(argv)


def find_default_service_account() -> Path | None:
    """Return the most recently modified admin-SDK key file in ~/Downloads.

    Only files matching ``coreapp-ce061-firebase-adminsdk-*.json`` are
    considered; ``None`` is returned when there is no such file.
    """
    downloads = Path.home() / "Downloads"
    matches = downloads.glob("coreapp-ce061-firebase-adminsdk-*.json")
    return max(matches, key=lambda path: path.stat().st_mtime, default=None)


def has_transcription(data: dict) -> bool:
    """Report whether at least one entry carries a non-blank transcription.

    A missing or empty ``entries`` value counts as "no transcription".
    Values are stringified before the whitespace check.
    """
    entries = data.get("entries") or []
    # Lazy generator: entries after the first match are never inspected.
    texts = (entry.get("transcription") for entry in entries)
    return any(bool(text) and bool(str(text).strip()) for text in texts)


def _scan_recordings(coll, page_size: int) -> tuple[int, int, list, list[str]]:
    """Page through *coll* and split recordings into keep / delete sets.

    Returns:
        ``(seen, kept, delete_refs, delete_filenames)``: documents scanned,
        how many have a transcription, the references of the documents
        without one, and the ``fileName`` values found on those documents'
        entries (in scan order, duplicates included).
    """
    seen = 0
    kept = 0
    delete_refs: list = []
    delete_filenames: list[str] = []
    cursor = None
    while True:
        # NOTE(review): Firestore's order_by also filters on field existence,
        # so documents without a `timestamp` field are never returned here
        # and are left untouched — confirm every recording carries the field.
        query = coll.order_by("timestamp").limit(page_size)
        if cursor is not None:
            query = query.start_after(cursor)
        page = list(query.stream())
        if not page:
            break
        for snap in page:
            seen += 1
            data = snap.to_dict() or {}
            if has_transcription(data):
                kept += 1
                continue
            delete_refs.append(snap.reference)
            for entry in (data.get("entries") or []):
                filename = entry.get("fileName")
                if filename:
                    delete_filenames.append(filename)
        cursor = page[-1]
        print(f"  scanned {seen}  keep={kept}  delete={len(delete_refs)}", file=sys.stderr)
        if len(page) < page_size:
            # A short page means the collection is exhausted.
            break
    return seen, kept, delete_refs, delete_filenames


def _delete_firestore_docs(db, refs: list, batch_size: int = 450) -> int:
    """Delete *refs* in write batches of at most *batch_size*; return the count."""
    total = len(refs)
    deleted = 0
    for start in range(0, total, batch_size):
        chunk = refs[start:start + batch_size]
        batch = db.batch()
        for ref in chunk:
            batch.delete(ref)
        batch.commit()
        deleted += len(chunk)
        if deleted < total:
            print(f"  firestore deleted: {deleted}/{total}", file=sys.stderr)
    print(f"  firestore deleted: {deleted}/{total} (done)", file=sys.stderr)
    return deleted


def _delete_audio_blobs(bucket, uid: str, filenames: list[str], workers: int) -> dict[str, int]:
    """Delete ``recordings/{uid}/{fn}`` blobs concurrently; return outcome counts.

    The returned dict has the keys ``deleted``, ``missing`` and ``error``.
    Per-blob failures are logged to stderr and counted, never raised.
    """
    total = len(filenames)
    progress_every = max(1, total // 50)
    counts = {"deleted": 0, "missing": 0, "error": 0}

    def _delete_one(fn: str) -> tuple[str, str, str]:
        blob = bucket.blob(f"recordings/{uid}/{fn}")
        try:
            if not blob.exists():
                return fn, "missing", ""
            blob.delete()
            return fn, "deleted", ""
        except Exception as e:  # best-effort: one bad blob must not stop the run
            return fn, "error", f"{type(e).__name__}: {e}"

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(_delete_one, fn) for fn in filenames]
        for done, fut in enumerate(as_completed(futures), start=1):
            fn, outcome, detail = fut.result()
            counts[outcome] += 1
            if outcome == "error":
                print(f"    error: {detail}: {fn}", file=sys.stderr)
            if done % progress_every == 0 or done == total:
                print(f"  audio: done={done}/{total} "
                      f"deleted={counts['deleted']} missing={counts['missing']} error={counts['error']}",
                      file=sys.stderr)
    return counts


def main() -> int:
    """Scan a user's recordings and delete those without any transcription.

    Runs as a dry-run unless ``--execute`` is given; real deletion also
    requires typing ``DELETE`` at the prompt. Firestore documents are removed
    first, then (with ``--delete-audio``) the Storage blobs they referenced.

    Returns:
        Process exit status: ``0`` on success, dry-run, or nothing to delete;
        ``1`` when the user aborts or any Storage blob failed to delete;
        ``2`` on configuration errors (bad numeric option, no service account).
    """
    args = parse_args()
    # Guard here as well as in the parser: a worker count below 1 would make
    # ThreadPoolExecutor raise only after the Firestore docs are already gone.
    if args.page_size < 1 or args.workers < 1:
        print("ERROR: --page-size and --workers must be positive integers", file=sys.stderr)
        return 2

    sa = Path(args.service_account) if args.service_account else find_default_service_account()
    if sa is None or not sa.exists():
        print("ERROR: no --service-account, none in ~/Downloads", file=sys.stderr)
        return 2
    print(f"Using service account: {sa}", file=sys.stderr)

    cred = credentials.Certificate(str(sa))
    firebase_admin.initialize_app(cred)
    db = firestore.client()

    if args.uid:
        uid = args.uid
        # Do not echo --email here: it was not used and may name another user.
        print(f"User: uid={uid} (from --uid; --email not used)", file=sys.stderr)
    else:
        uid = fb_auth.get_user_by_email(args.email).uid
        print(f"User: {args.email}  uid={uid}", file=sys.stderr)
    print(f"Mode: {'EXECUTE (will delete)' if args.execute else 'DRY-RUN'}", file=sys.stderr)
    if args.execute and args.delete_audio:
        print("Audio: ALSO deleting Storage blobs for removed recordings", file=sys.stderr)

    coll = db.collection("recordings").document(uid).collection("recordings")
    print(f"\nScanning recordings/{uid}/recordings ...", file=sys.stderr)
    seen, kept, delete_refs, delete_filenames = _scan_recordings(coll, args.page_size)
    # Drop repeated file names (order preserved) so the same blob is not
    # submitted twice and reported as a spurious error by the losing worker.
    delete_filenames = list(dict.fromkeys(delete_filenames))

    print("\n--- Plan ---")
    print(f"  Total recordings  : {seen}")
    print(f"  Keep (has transcription): {kept}")
    print(f"  Delete (no transcription): {len(delete_refs)}")
    print(f"  Audio blobs to remove   : {len(delete_filenames)}")

    if not args.execute:
        print("\n[dry-run] No changes made. Re-run with --execute to delete.")
        return 0

    if not delete_refs:
        print("Nothing to delete.")
        return 0

    print("\n--- About to delete ---")
    print(f"  {len(delete_refs)} Firestore docs", end="")
    if args.delete_audio:
        print(f" + {len(delete_filenames)} Storage audio blobs")
    else:
        print(" (Storage audio left in place; pass --delete-audio to also remove)")
    try:
        confirm = input("Type 'DELETE' to proceed: ")
    except EOFError:
        # Closed / non-interactive stdin can never confirm; treat as refusal.
        confirm = ""
    if confirm != "DELETE":
        print("Aborted.", file=sys.stderr)
        return 1

    # ---- Firestore: batched deletes (≤450 ops per batch) ----
    print(f"\nDeleting {len(delete_refs)} Firestore docs...", file=sys.stderr)
    _delete_firestore_docs(db, delete_refs)

    # ---- Storage ----
    audio_errors = 0
    if args.delete_audio and delete_filenames:
        print(f"\nDeleting {len(delete_filenames)} Storage blobs ({args.workers} workers)...", file=sys.stderr)
        client = gcs.Client.from_service_account_json(str(sa))
        bucket = client.bucket(args.bucket)
        counts = _delete_audio_blobs(bucket, uid, delete_filenames, args.workers)
        print(f"  audio: deleted={counts['deleted']} missing={counts['missing']} error={counts['error']}",
              file=sys.stderr)
        audio_errors = counts["error"]

    if audio_errors:
        # Surface partial failure in the exit status so callers can react.
        print(f"WARNING: {audio_errors} Storage blobs could not be deleted (see errors above).",
              file=sys.stderr)
        return 1
    return 0


# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
