mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
Add helper stuff for figures, cleanup
This commit is contained in:
101
figures/review_dist.py
Normal file
101
figures/review_dist.py
Normal file
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Read a .tab (TSV) file with a single column named 'review'.
|
||||
1) Print number of rows
|
||||
2) Drop exact duplicate reviews and print count again
|
||||
3) Build JSON describing the distribution of review length (in words) for remaining reviews
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def word_count(text: str) -> int:
    """Return the number of whitespace-separated words in *text*.

    Non-string input (e.g. NaN coming out of pandas) counts as zero
    words, as do empty or all-whitespace strings.
    """
    if not isinstance(text, str):
        return 0
    # str.split() with no separator collapses runs of whitespace and
    # returns [] for empty/blank input, so no explicit strip is needed.
    return len(text.split())
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Dedupe the 'review' column of a TSV and write its word-length histogram.

    Reads a .tab/.tsv file, reports row counts before and after dropping
    exact-duplicate reviews, and writes a JSON file describing the
    distribution of review lengths (in words).

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.
            Exposed as a parameter so the CLI can be invoked
            programmatically (e.g. from tests) without patching sys.argv.

    Returns:
        Process exit code: 0 on success, 1 on a usage/input error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_tab", help="Path to .tab/.tsv file with a 'review' column"
    )
    parser.add_argument(
        "--out",
        default="review_length_distribution.json",
        help="Output JSON path (default: review_length_distribution.json)",
    )
    args = parser.parse_args(argv)

    in_path = Path(args.input_tab)
    if not in_path.exists():
        print(f"ERROR: file not found: {in_path}", file=sys.stderr)
        return 1

    # Read as TSV. keep_default_na=False keeps empty fields as "" rather than
    # NaN, so with dtype=str every cell word_count sees is a real string.
    df = pd.read_csv(in_path, sep="\t", dtype=str, keep_default_na=False)

    if "review" not in df.columns:
        print(
            f"ERROR: expected a column named 'review'. Found: {list(df.columns)}",
            file=sys.stderr,
        )
        return 1

    n_before = len(df)
    print(f"Rows before dedup: {n_before}")

    # Exact duplicates based on the full string in "review".
    # If you want to ignore leading/trailing spaces, do
    # df['review'] = df['review'].str.strip() first.
    df_dedup = df.drop_duplicates(subset=["review"], keep="first").reset_index(
        drop=True
    )

    n_after = len(df_dedup)
    print(f"Rows after dedup: {n_after}")

    # Word count for each remaining review.
    lengths = df_dedup["review"].map(word_count)

    # Histogram: word_count -> number of reviews with that count.
    dist = Counter(lengths.tolist())

    result = {
        "file": str(in_path),
        "rows_before_dedup": n_before,
        "rows_after_dedup": n_after,
        "distribution_word_length": {
            # JSON object keys must be strings; word_count returns int, so
            # the keys sort numerically as-is before being stringified.
            str(k): v
            for k, v in sorted(dist.items())
        },
        "summary": {
            # Guard every aggregate against an empty (all-duplicate or
            # zero-row) input so we emit 0 instead of raising/NaN.
            "min_words": int(lengths.min()) if len(lengths) else 0,
            "max_words": int(lengths.max()) if len(lengths) else 0,
            "mean_words": float(lengths.mean()) if len(lengths) else 0.0,
            "median_words": float(lengths.median()) if len(lengths) else 0.0,
        },
    }

    out_path = Path(args.out)
    out_path.write_text(
        json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"Wrote JSON: {out_path}")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
Reference in New Issue
Block a user