#!/usr/bin/env python3
"""Summarize review lengths from a .tab (TSV) file with a 'review' column.

Steps:
1) Print the number of rows.
2) Drop exact duplicate reviews and print the count again.
3) Write a JSON file describing the distribution of review length
   (in whitespace-separated words) for the remaining reviews.
"""

import argparse
import json
import sys
from collections import Counter
from pathlib import Path

import pandas as pd


def word_count(text: str) -> int:
    """Return the number of whitespace-separated words in *text*.

    Non-string values (e.g. NaN from pandas) and blank/empty strings
    count as 0 words rather than raising — change this if you would
    prefer to drop such rows instead.
    """
    if not isinstance(text, str):
        return 0
    s = text.strip()
    if not s:
        return 0
    return len(s.split())


def main() -> int:
    """Run the CLI. Returns a process exit code (0 on success, 1 on error)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_tab", help="Path to .tab/.tsv file with a 'review' column"
    )
    parser.add_argument(
        "--out",
        default="review_length_distribution.json",
        help="Output JSON path (default: review_length_distribution.json)",
    )
    args = parser.parse_args()

    in_path = Path(args.input_tab)
    if not in_path.exists():
        print(f"ERROR: file not found: {in_path}", file=sys.stderr)
        return 1

    # Read as TSV. dtype=str keeps everything textual; keep_default_na=False
    # preserves empty strings instead of converting them to NaN.
    df = pd.read_csv(in_path, sep="\t", dtype=str, keep_default_na=False)

    if "review" not in df.columns:
        print(
            f"ERROR: expected a column named 'review'. Found: {list(df.columns)}",
            file=sys.stderr,
        )
        return 1

    n_before = len(df)
    print(f"Rows before dedup: {n_before}")

    # Exact duplicates based on the full string in "review".
    # If you want to ignore leading/trailing spaces, do
    # df['review'] = df['review'].str.strip() first.
    df_dedup = df.drop_duplicates(subset=["review"], keep="first").reset_index(
        drop=True
    )
    n_after = len(df_dedup)
    print(f"Rows after dedup: {n_after}")

    # Compute word counts for the remaining reviews.
    lengths = df_dedup["review"].map(word_count)

    # Distribution (histogram): word_count -> number of reviews.
    dist = Counter(lengths.tolist())

    result = {
        "file": str(in_path),
        "rows_before_dedup": n_before,
        "rows_after_dedup": n_after,
        "distribution_word_length": {
            # JSON keys must be strings; keep as strings for portability.
            # Counter keys are already ints, so plain sorted() orders them
            # numerically.
            str(k): v
            for k, v in sorted(dist.items())
        },
        "summary": {
            # Guard every aggregate against an empty frame (0-row input).
            "min_words": int(lengths.min()) if len(lengths) else 0,
            "max_words": int(lengths.max()) if len(lengths) else 0,
            "mean_words": float(lengths.mean()) if len(lengths) else 0.0,
            "median_words": float(lengths.median()) if len(lengths) else 0.0,
        },
    }

    out_path = Path(args.out)
    out_path.write_text(
        json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"Wrote JSON: {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())