#!/usr/bin/env python3
"""Local-only analysis of the Firestore backup.

Categorises docs into:
  - "with-transcription"   = at least one entry has a non-empty
                             transcription (the ones we kept in cloud
                             after the delete_no_transcription pass)
  - "with-filename-only"   = no transcription, but at least one entry
                             has a fileName
  - "no-content"           = entries empty OR no fileName+no transcription

Reports duplicate groups separately for each, since the "no-content"
docs are noise we already deleted from cloud — what matters now is
duplicates among the 1396 with-transcription docs.

Writes one JSON file for each bucket that has content (the no-content
bucket is not written) to the backup directory:
  - unique_with_transcription.json
  - unique_with_filename_only.json
"""

import argparse
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path


def fingerprint(doc: dict) -> tuple:
    """Build the hashable key used to group duplicate docs.

    The key is ``(seconds, nanos, fileName, transcription, entry_count)``.
    The file name and transcription are taken from the first entry only;
    missing fields show up as ``None``.
    """
    stamp = doc.get("timestamp") or {}
    if not isinstance(stamp, dict):
        # Legacy datetime-string timestamp: sentinel seconds, with the raw
        # text standing in for the nanos slot.
        seconds, nanos = -1, str(stamp)
    else:
        seconds, nanos = stamp.get("epochSeconds"), stamp.get("nanosecondsOfSecond")

    entry_list = doc.get("entries") or []
    first = entry_list[0] if entry_list else {}
    return (
        seconds,
        nanos,
        first.get("fileName"),
        first.get("transcription"),
        len(entry_list),
    )


def has_content(doc: dict) -> bool:
    """Report whether the doc is a 'real' recording.

    True when at least one entry has a non-blank transcription or a truthy
    fileName; False for docs with no entries or only empty entries.
    """
    return any(
        (entry.get("transcription") or "").strip() or entry.get("fileName")
        for entry in doc.get("entries") or []
    )


def has_transcription(doc: dict) -> bool:
    """Report whether any entry has a transcription that is not blank."""
    texts = (entry.get("transcription") or "" for entry in doc.get("entries") or [])
    return any(text.strip() for text in texts)


def main() -> int:
    """Bucket the backup docs, report duplicates, and write the unique lists.

    Reads every ``*.json`` file under ``<backup_dir>/firestore``, places each
    doc in one of three buckets (with transcription / file name only / no
    content), prints duplicate statistics for every bucket, and writes
    ``unique_<bucket>.json`` into the backup directory for the two buckets
    that carry content. Files that cannot be read or parsed, or whose JSON
    root is not an object, are skipped with a warning on stderr.

    Returns:
        Process exit code: 0 on success, 2 when the ``firestore/``
        subdirectory does not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("backup_dir", type=Path,
                        help="Path to a directory containing a `firestore/` subdirectory of *.json doc dumps")
    args = parser.parse_args()
    backup = args.backup_dir
    firestore_dir = backup / "firestore"
    if not firestore_dir.is_dir():
        print(f"ERROR: {firestore_dir} not found", file=sys.stderr)
        return 2

    files = sorted(firestore_dir.glob("*.json"))
    print(f"Scanning {len(files)} backup files from {firestore_dir}\n")

    # Group docs into 3 buckets, then dedup each separately.
    buckets = {
        "with_transcription": defaultdict(list),
        "with_filename_only": defaultdict(list),
        "no_content": defaultdict(list),
    }
    skipped = 0
    for f in files:
        try:
            # Context manager closes the handle; JSON dumps are read as UTF-8
            # rather than whatever the locale default happens to be.
            with f.open(encoding="utf-8") as fh:
                doc = json.load(fh)
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"WARNING: skipping {f.name}: {exc}", file=sys.stderr)
            skipped += 1
            continue
        if not isinstance(doc, dict):
            # The classifiers below call doc.get(); a list/scalar root would crash them.
            print(f"WARNING: skipping {f.name}: top-level JSON value is not an object", file=sys.stderr)
            skipped += 1
            continue
        if has_transcription(doc):
            key = "with_transcription"
        elif has_content(doc):
            key = "with_filename_only"
        else:
            key = "no_content"
        buckets[key][fingerprint(doc)].append((f.stem, doc))
    if skipped:
        print(f"WARNING: skipped {skipped} unreadable or malformed file(s)", file=sys.stderr)

    for label, groups in buckets.items():
        sizes = [len(v) for v in groups.values()]
        dup_sizes = [s for s in sizes if s > 1]
        print(f"=== {label} ===")
        print(f"  total docs   : {sum(sizes)}")
        print(f"  unique       : {len(sizes)}")
        print(f"  dup groups   : {len(dup_sizes)}")
        print(f"  extra copies : {sum(dup_sizes) - len(dup_sizes)}")
        for size, n in sorted(Counter(sizes).items()):
            print(f"    {size:>3} copies × {n:>5} groups")
        print()

    # Write unique lists for the two non-trash buckets
    for label in ("with_transcription", "with_filename_only"):
        records = []
        for fp, items in buckets[label].items():
            # The lexicographically smallest doc id is the canonical copy.
            items_sorted = sorted(items, key=lambda x: x[0])
            canonical_id, _ = items_sorted[0]
            sec, _nanos, fn, tx, n = fp
            records.append({
                "doc_id": canonical_id,
                "timestamp_epochSeconds": sec,
                "fileName": fn,
                "transcription": tx,
                "n_entries": n,
                "duplicate_doc_ids": [d for d, _ in items_sorted[1:]],
            })
        # Non-integer timestamps (None) sort first, alongside the -1 legacy sentinel.
        records.sort(key=lambda r: r["timestamp_epochSeconds"] if isinstance(r["timestamp_epochSeconds"], int) else -1)
        out = backup / f"unique_{label}.json"
        out.write_text(json.dumps(records, indent=2), encoding="utf-8")
        print(f"Wrote {len(records):>4} → {out.name}")

    # Sample largest dup groups in with_transcription bucket
    print("\nLargest with-transcription dup groups:")
    dup_groups = [(fp, items) for fp, items in buckets["with_transcription"].items() if len(items) > 1]
    # Sort on group size only: comparing the fingerprint tuples themselves on
    # ties raises TypeError when they mix None, int and str fields.
    dup_groups.sort(key=lambda group: len(group[1]), reverse=True)
    for fp, items in dup_groups[:10]:
        sec, _nanos, fn, tx, _n = fp
        print(f"\n  {len(items)}× | ts={sec} | file={fn}")
        print(f"      tx={tx!r}")
        for doc_id, _ in items:
            print(f"      - {doc_id}")
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())