Add helper scripts for figures; general cleanup

This commit is contained in:
2026-02-20 01:56:28 +01:00
parent 28823dc0b5
commit 101bd81ca1
20 changed files with 1862 additions and 1164 deletions

101
figures/review_dist.py Normal file
View File

@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""
Read a .tab (TSV) file with a single column named 'review'.
1) Print number of rows
2) Drop exact duplicate reviews and print count again
3) Build JSON describing the distribution of review length (in words) for remaining reviews
"""
import argparse
import json
import sys
from collections import Counter
from pathlib import Path
import pandas as pd
def word_count(text: str) -> int:
    """Return the number of whitespace-separated words in *text*.

    Non-string input (e.g. NaN from pandas) and blank/empty strings
    both count as 0 words.
    """
    if not isinstance(text, str):
        return 0
    # str.split() with no argument already ignores leading/trailing
    # whitespace and collapses runs, so no explicit strip() is needed:
    # "".split() and "   ".split() are both [].
    return len(text.split())
def main() -> int:
    """Entry point: summarize review word-length distribution from a TSV.

    Reads a .tab/.tsv file with a 'review' column, prints the row count
    before and after exact-duplicate removal, and writes a JSON file with
    the word-length histogram plus min/max/mean/median summary stats.

    Returns:
        0 on success; 1 on a usage/data error (missing file, unparseable
        or empty TSV, or missing 'review' column).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_tab", help="Path to .tab/.tsv file with a 'review' column"
    )
    parser.add_argument(
        "--out",
        default="review_length_distribution.json",
        help="Output JSON path (default: review_length_distribution.json)",
    )
    args = parser.parse_args()

    in_path = Path(args.input_tab)
    if not in_path.exists():
        print(f"ERROR: file not found: {in_path}", file=sys.stderr)
        return 1

    # Read as TSV. keep_default_na=False keeps empty fields as "" rather
    # than NaN, so word_count sees real strings throughout.
    try:
        df = pd.read_csv(in_path, sep="\t", dtype=str, keep_default_na=False)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Previously an empty or malformed file crashed with a raw
        # traceback; report it like the other usage errors instead.
        print(f"ERROR: could not parse {in_path}: {exc}", file=sys.stderr)
        return 1

    if "review" not in df.columns:
        print(
            f"ERROR: expected a column named 'review'. Found: {list(df.columns)}",
            file=sys.stderr,
        )
        return 1

    n_before = len(df)
    print(f"Rows before dedup: {n_before}")

    # Exact duplicates based on the full string in "review".
    # To ignore leading/trailing spaces, strip the column first.
    df_dedup = df.drop_duplicates(subset=["review"], keep="first").reset_index(
        drop=True
    )
    n_after = len(df_dedup)
    print(f"Rows after dedup: {n_after}")

    # Word counts for the remaining reviews; histogram word_count -> #reviews.
    lengths = df_dedup["review"].map(word_count)
    dist = Counter(lengths.tolist())

    result = {
        "file": str(in_path),
        "rows_before_dedup": n_before,
        "rows_after_dedup": n_after,
        "distribution_word_length": {
            # JSON keys must be strings; the Counter keys are already ints,
            # so sort them directly (the old int(kv[0]) cast was redundant).
            str(k): v
            for k, v in sorted(dist.items())
        },
        "summary": {
            # Guard every aggregate: min/mean/etc. on an empty Series would
            # yield NaN, which int()/float() can't cleanly represent in JSON.
            "min_words": int(lengths.min()) if len(lengths) else 0,
            "max_words": int(lengths.max()) if len(lengths) else 0,
            "mean_words": float(lengths.mean()) if len(lengths) else 0.0,
            "median_words": float(lengths.median()) if len(lengths) else 0.0,
        },
    }

    out_path = Path(args.out)
    out_path.write_text(
        json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"Wrote JSON: {out_path}")
    return 0
# Script entry point: exit with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())