#!/usr/bin/env python3
"""
Delete duplicate `recordings` Firestore docs based on the canonical
unique-list files produced by `analyze_local_backup.py`:

    ~/Downloads/2026-04-29 Eric Firestore recording db/unique_with_transcription.json
    ~/Downloads/2026-04-29 Eric Firestore recording db/unique_with_filename_only.json

Each entry in those files looks like:

    {"doc_id": "AAA", "duplicate_doc_ids": ["BBB", "CCC"], ...}

The script keeps `doc_id` and deletes every id in `duplicate_doc_ids`.
Selection is deterministic — `analyze_local_backup.py` picks the
lexicographically-smallest doc id per fingerprint as the canonical
keeper, so re-running this is idempotent.

Path layout in Firestore:
    recordings/{uid}/recordings/{recordingId}

Default is dry-run (just prints what would be deleted). Pass
`--execute` to actually delete. There's a final confirmation prompt
unless you pass `--yes`.

We do NOT touch Firebase Storage audio blobs. Duplicates point at the
same `fileName`, so deleting the dup doc never orphans audio — the
canonical doc still references it.

Usage
-----
    python3 delete_duplicates_from_local.py \\
        --email ericmigi@gmail.com \\
        --service-account ~/Downloads/coreapp-ce061-firebase-adminsdk-fbsvc-0159a91677.json

    # Actually delete:
    python3 delete_duplicates_from_local.py --execute

    # Skip confirmation:
    python3 delete_duplicates_from_local.py --execute --yes
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

import firebase_admin
from firebase_admin import auth as fb_auth
from firebase_admin import credentials, firestore


# Directory holding the unique-list files this script reads by default.
# Built from the current user's home directory rather than a hard-coded
# /Users/<name> path, so the default resolves to the "~/Downloads/..."
# location on whichever account runs the script.
DEFAULT_BACKUP_DIR = Path.home() / "Downloads" / "2026-04-29 Eric Firestore recording db"
# File names (relative to the backup directory) that are read by default.
DEFAULT_FILES = (
    "unique_with_transcription.json",
    "unique_with_filename_only.json",
)


def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    The module docstring is used verbatim as the ``--help`` description.

    Returns:
        The parsed namespace with ``email``, ``uid``, ``service_account``,
        ``backup_dir``, ``files``, ``execute``, ``yes`` and ``batch_size``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flag, add_argument keyword options) — registered in this order.
    option_table = [
        ("--email", {"default": "ericmigi@gmail.com"}),
        ("--uid", {
            "default": None,
            "help": "Skip the email→UID lookup and use this UID directly.",
        }),
        ("--service-account", {
            "default": None,
            "help": "Path to a Firebase Admin SDK service-account JSON. "
                    "Defaults to the newest coreapp-ce061-firebase-adminsdk-*.json in ~/Downloads.",
        }),
        ("--backup-dir", {
            "type": Path,
            "default": DEFAULT_BACKUP_DIR,
            "help": "Directory containing unique_*.json files.",
        }),
        ("--files", {
            "nargs": "+",
            "default": [*DEFAULT_FILES],
            "help": "Which unique_*.json files (relative to --backup-dir) to read.",
        }),
        ("--execute", {
            "action": "store_true",
            "help": "Actually delete (default is dry-run).",
        }),
        ("--yes", {
            "action": "store_true",
            "help": "Skip the confirmation prompt when --execute.",
        }),
        ("--batch-size", {
            "type": int,
            "default": 400,
            "help": "Firestore batch size (max 500). Default 400.",
        }),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser.parse_args()


def newest_service_account() -> Path | None:
    downloads = Path.home() / "Downloads"
    if not downloads.is_dir():
        return None
    cands = list(downloads.glob("coreapp-ce061-firebase-adminsdk-*.json"))
    if not cands:
        return None
    return max(cands, key=lambda p: p.stat().st_mtime)


def load_dup_ids(backup_dir: Path, files: list[str]) -> tuple[list[str], dict[str, str]]:
    """Collect the duplicate doc ids listed in the unique-list files.

    Each file is expected to hold a JSON list of entries shaped like
    ``{"doc_id": keeper, "duplicate_doc_ids": [dup, ...]}``. Files that do
    not exist are skipped with a warning on stderr. A one-line summary is
    printed for every file that is read.

    Args:
        backup_dir: Directory containing the files.
        files: File names relative to ``backup_dir``.

    Returns:
        ``(ids_to_delete, kept_by_dup_id)``. ``ids_to_delete`` lists each
        deletable id exactly once, in first-seen order. ``kept_by_dup_id``
        maps each deletable id back to the canonical doc that survives
        (the last keeper that lists it) — useful for the dry-run preview.
        An id that is itself the ``doc_id`` of any entry is never returned
        for deletion.

    Raises:
        KeyError: If an entry has no ``doc_id``.
        json.JSONDecodeError: If a file is not valid JSON.
    """
    loaded: list[list[dict]] = []
    for fname in files:
        path = backup_dir / fname
        if not path.is_file():
            print(f"WARN: missing {path}", file=sys.stderr)
            continue
        # JSON is UTF-8; don't depend on the platform's locale encoding.
        data = json.loads(path.read_text(encoding="utf-8"))
        loaded.append(data)
        # Per-file count (raw number of ids listed, before de-duplication).
        n = sum(len(e.get("duplicate_doc_ids") or []) for e in data)
        print(f"  {fname}: {n} dup ids to delete (canonical keepers: {len(data)})")

    # Every id that some entry names as the doc to keep. Gathered across all
    # files first so a keeper in one file is protected from a duplicate
    # listing in another.
    keepers = {entry["doc_id"] for data in loaded for entry in data}

    to_delete: list[str] = []
    kept_by_dup: dict[str, str] = {}
    for data in loaded:
        for entry in data:
            keeper = entry["doc_id"]
            for dup in entry.get("duplicate_doc_ids") or []:
                if dup in keepers:
                    # Deleting this id would remove a doc that is supposed
                    # to survive; skip it and tell the operator.
                    print(
                        f"WARN: {dup} is listed as a duplicate of {keeper} "
                        "but is also a canonical keeper; not deleting it",
                        file=sys.stderr,
                    )
                    continue
                # Queue each id once so totals and batch counts are accurate.
                if dup not in kept_by_dup:
                    to_delete.append(dup)
                kept_by_dup[dup] = keeper
    return to_delete, kept_by_dup


def main() -> int:
    """Run the duplicate-deletion workflow and return a process exit code.

    Steps: locate the service-account JSON, load the ids to delete from the
    backup directory, print a preview, initialise firebase-admin and resolve
    the target UID, then — only when ``--execute`` is given and confirmed —
    delete the docs under ``recordings/{uid}/recordings`` in batched writes.

    Returns:
        0 on success (including dry-run and "nothing to do"), 1 if the
        confirmation prompt was not answered with ``y`` or any batch failed,
        2 if no service-account JSON could be found.
    """
    args = parse_args()

    # expanduser() so a "~/..." path still resolves when the shell did not
    # expand it (quoted, or attached with "=").
    if args.service_account:
        sa_path = Path(args.service_account).expanduser()
    else:
        sa_path = newest_service_account()
    if not sa_path or not sa_path.is_file():
        print("ERROR: service-account JSON not found. Pass --service-account.", file=sys.stderr)
        return 2

    backup_dir = args.backup_dir.expanduser()
    print(f"Loading dup list from {backup_dir}")
    to_delete, kept_by_dup = load_dup_ids(backup_dir, args.files)
    print(f"\nTotal duplicates to delete: {len(to_delete)}")

    if not to_delete:
        print("Nothing to do.")
        return 0

    print("\nFirst 10 (deletable → keeper):")
    for d in to_delete[:10]:
        print(f"  {d}  →  keeper {kept_by_dup[d]}")
    if len(to_delete) > 10:
        print(f"  ... and {len(to_delete) - 10} more")

    # Resolve UID
    print(f"\nInitialising firebase-admin with {sa_path.name}...")
    cred = credentials.Certificate(str(sa_path))
    firebase_admin.initialize_app(cred)
    if args.uid:
        uid = args.uid
    else:
        user = fb_auth.get_user_by_email(args.email)
        uid = user.uid
        print(f"Resolved {args.email} → {uid}")

    db = firestore.client()
    coll = db.collection("recordings").document(uid).collection("recordings")
    print(f"Target path: recordings/{uid}/recordings\n")

    if not args.execute:
        print("DRY-RUN — pass --execute to actually delete.")
        return 0

    if not args.yes:
        try:
            confirm = input(f"Delete {len(to_delete)} docs from Firestore? [y/N] ").strip().lower()
        except EOFError:
            # stdin is closed / non-interactive: no answer means no consent,
            # so abort cleanly instead of crashing with a traceback.
            print()
            confirm = ""
        if confirm != "y":
            print("Aborted.")
            return 1

    deleted = 0
    failed = 0
    # Clamp the requested size into the 1..500 range before chunking.
    batch_size = max(1, min(500, args.batch_size))
    # Firestore batched writes: chunk into batch_size
    for chunk_start in range(0, len(to_delete), batch_size):
        chunk = to_delete[chunk_start:chunk_start + batch_size]
        batch = db.batch()
        for doc_id in chunk:
            batch.delete(coll.document(doc_id))
        try:
            batch.commit()
            deleted += len(chunk)
            print(f"  Deleted {deleted}/{len(to_delete)}")
        except Exception as e:
            # Best-effort: record the whole chunk as failed, report it, and
            # keep going with the remaining chunks.
            failed += len(chunk)
            print(f"  Batch failed at chunk_start={chunk_start}: {e}", file=sys.stderr)

    print(f"\nDone. Deleted {deleted} docs, {failed} failed.")
    return 0 if failed == 0 else 1


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
