Add helper scripts for figures; general cleanup

This commit is contained in:
2026-02-20 01:56:28 +01:00
parent 28823dc0b5
commit 101bd81ca1
20 changed files with 1862 additions and 1164 deletions

101
figures/review_dist.py Normal file
View File

@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""
Read a .tab (TSV) file with a single column named 'review'.
1) Print number of rows
2) Drop exact duplicate reviews and print count again
3) Build JSON describing the distribution of review length (in words) for remaining reviews
"""
import argparse
import json
import sys
from collections import Counter
from pathlib import Path
import pandas as pd
def word_count(text: str) -> int:
    """Return the number of whitespace-separated words in *text*.

    Non-string input (e.g. NaN from pandas) and blank/empty strings
    both count as 0 words.
    """
    if not isinstance(text, str):
        return 0
    # str.split() with no argument already ignores leading/trailing
    # whitespace and collapses runs, so no explicit strip() is needed:
    # "".split() and "   ".split() are both [].
    return len(text.split())
def main() -> int:
    """Entry point: summarize review word-length distribution from a TSV.

    Reads a .tab/.tsv file with a 'review' column, prints the row count
    before and after exact-duplicate removal, and writes a JSON file with
    the word-length histogram plus min/max/mean/median summary stats.

    Returns:
        0 on success; 1 on a usage/data error (missing file, unparseable
        or empty TSV, or missing 'review' column).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_tab", help="Path to .tab/.tsv file with a 'review' column"
    )
    parser.add_argument(
        "--out",
        default="review_length_distribution.json",
        help="Output JSON path (default: review_length_distribution.json)",
    )
    args = parser.parse_args()

    in_path = Path(args.input_tab)
    if not in_path.exists():
        print(f"ERROR: file not found: {in_path}", file=sys.stderr)
        return 1

    # Read as TSV. keep_default_na=False keeps empty fields as "" rather
    # than NaN, so word_count sees real strings throughout.
    try:
        df = pd.read_csv(in_path, sep="\t", dtype=str, keep_default_na=False)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Previously an empty or malformed file crashed with a raw
        # traceback; report it like the other usage errors instead.
        print(f"ERROR: could not parse {in_path}: {exc}", file=sys.stderr)
        return 1

    if "review" not in df.columns:
        print(
            f"ERROR: expected a column named 'review'. Found: {list(df.columns)}",
            file=sys.stderr,
        )
        return 1

    n_before = len(df)
    print(f"Rows before dedup: {n_before}")

    # Exact duplicates based on the full string in "review".
    # To ignore leading/trailing spaces, strip the column first.
    df_dedup = df.drop_duplicates(subset=["review"], keep="first").reset_index(
        drop=True
    )
    n_after = len(df_dedup)
    print(f"Rows after dedup: {n_after}")

    # Word counts for the remaining reviews; histogram word_count -> #reviews.
    lengths = df_dedup["review"].map(word_count)
    dist = Counter(lengths.tolist())

    result = {
        "file": str(in_path),
        "rows_before_dedup": n_before,
        "rows_after_dedup": n_after,
        "distribution_word_length": {
            # JSON keys must be strings; the Counter keys are already ints,
            # so sort them directly (the old int(kv[0]) cast was redundant).
            str(k): v
            for k, v in sorted(dist.items())
        },
        "summary": {
            # Guard every aggregate: min/mean/etc. on an empty Series would
            # yield NaN, which int()/float() can't cleanly represent in JSON.
            "min_words": int(lengths.min()) if len(lengths) else 0,
            "max_words": int(lengths.max()) if len(lengths) else 0,
            "mean_words": float(lengths.mean()) if len(lengths) else 0.0,
            "median_words": float(lengths.median()) if len(lengths) else 0.0,
        },
    }

    out_path = Path(args.out)
    out_path.write_text(
        json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"Wrote JSON: {out_path}")
    return 0
# Script entry point: exit with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())