#!/usr/bin/env python3
"""Summarize review lengths from a .tab (TSV) file with a 'review' column.

Steps:
1) Print the number of rows.
2) Drop exact duplicate reviews and print the count again.
3) Write a JSON file describing the distribution of review length
   (in whitespace-separated words) for the remaining reviews.
"""

import argparse
import json
import sys
from collections import Counter
from pathlib import Path

import pandas as pd


def word_count(text: str) -> int:
    """Return the number of whitespace-separated words in *text*.

    Non-string values (e.g. NaN from pandas) and blank/empty strings
    count as 0 words rather than raising — change this if you would
    prefer to drop such rows instead.
    """
    if not isinstance(text, str):
        return 0
    s = text.strip()
    if not s:
        return 0
    return len(s.split())


def main() -> int:
    """Run the CLI. Returns a process exit code (0 on success, 1 on error)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_tab", help="Path to .tab/.tsv file with a 'review' column"
    )
    parser.add_argument(
        "--out",
        default="review_length_distribution.json",
        help="Output JSON path (default: review_length_distribution.json)",
    )
    args = parser.parse_args()

    in_path = Path(args.input_tab)
    if not in_path.exists():
        print(f"ERROR: file not found: {in_path}", file=sys.stderr)
        return 1

    # Read as TSV. dtype=str keeps everything textual; keep_default_na=False
    # preserves empty strings instead of converting them to NaN.
    df = pd.read_csv(in_path, sep="\t", dtype=str, keep_default_na=False)

    if "review" not in df.columns:
        print(
            f"ERROR: expected a column named 'review'. Found: {list(df.columns)}",
            file=sys.stderr,
        )
        return 1

    n_before = len(df)
    print(f"Rows before dedup: {n_before}")

    # Exact duplicates based on the full string in "review".
    # If you want to ignore leading/trailing spaces, do
    # df['review'] = df['review'].str.strip() first.
    df_dedup = df.drop_duplicates(subset=["review"], keep="first").reset_index(
        drop=True
    )
    n_after = len(df_dedup)
    print(f"Rows after dedup: {n_after}")

    # Compute word counts for the remaining reviews.
    lengths = df_dedup["review"].map(word_count)

    # Distribution (histogram): word_count -> number of reviews.
    dist = Counter(lengths.tolist())

    result = {
        "file": str(in_path),
        "rows_before_dedup": n_before,
        "rows_after_dedup": n_after,
        "distribution_word_length": {
            # JSON keys must be strings; keep as strings for portability.
            # Counter keys are already ints, so plain sorted() orders them
            # numerically.
            str(k): v
            for k, v in sorted(dist.items())
        },
        "summary": {
            # Guard every aggregate against an empty frame (0-row input).
            "min_words": int(lengths.min()) if len(lengths) else 0,
            "max_words": int(lengths.max()) if len(lengths) else 0,
            "mean_words": float(lengths.mean()) if len(lengths) else 0.0,
            "median_words": float(lengths.median()) if len(lengths) else 0.0,
        },
    }

    out_path = Path(args.out)
    out_path.write_text(
        json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"Wrote JSON: {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())