mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 08:22:43 +01:00
Add helper stuff for figures, cleanup
This commit is contained in:
97
figures/simplify_review_lengths.py
Normal file
97
figures/simplify_review_lengths.py
Normal file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Aggregate review length counts into buckets."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
import json
from pathlib import Path
from typing import Dict, Iterable, Optional, Tuple
|
||||
|
||||
# A bucket is an inclusive (start, end, label) range; a bound of None leaves
# that side open.
# Optional[...] is used rather than ``int | None`` because this alias is an
# ordinary runtime expression: the ``|`` form raises TypeError before
# Python 3.10, and ``from __future__ import annotations`` only defers
# annotations, not assignments.
Bucket = Tuple[Optional[int], Optional[int], str]
|
||||
|
||||
|
||||
# Default bucket layout: everything below 10 in one open-ended bucket, then
# widths of 10 up to 199, widths of 20 up to 299, widths of 100 up to 499,
# one 500-999 bucket and an open-ended bucket from 1000 upwards.
# The regular runs are generated so each label is derived from its bounds.
DEFAULT_BUCKETS: Tuple[Bucket, ...] = (
    (None, 9, "<10"),
    *((low, low + 9, f"{low}-{low + 9}") for low in range(10, 200, 10)),
    *((low, low + 19, f"{low}-{low + 19}") for low in range(200, 300, 20)),
    *((low, low + 99, f"{low}-{low + 99}") for low in range(300, 500, 100)),
    (500, 999, "500-999"),
    (1000, None, "1000+"),
)
|
||||
|
||||
|
||||
def load_counts(path: Path) -> Dict[int, int]:
    """Read the JSON object stored at *path* and coerce it to ``int -> int``.

    Args:
        path: UTF-8 encoded JSON file whose top-level value is an object.

    Returns:
        A dict in which every key and every value has been passed through
        ``int()``.

    Raises:
        ValueError: If a key or value cannot be converted by ``int()``
            (``json.JSONDecodeError`` for malformed JSON is a subclass).
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    counts: Dict[int, int] = {}
    for raw_length, raw_count in payload.items():
        # Convert the key before the value so a bad key is reported first.
        length = int(raw_length)
        counts[length] = int(raw_count)
    return counts
|
||||
|
||||
|
||||
def aggregate(counts: Dict[int, int], buckets: Iterable[Bucket]) -> Dict[str, int]:
    """Sum per-length counts into labelled buckets.

    Args:
        counts: Mapping of length to count.
        buckets: ``(start, end, label)`` triples with inclusive bounds. A
            bound of ``None`` leaves that side open, so ``(None, None, label)``
            matches every length. Any iterable is accepted, including a
            one-shot iterator or generator.

    Returns:
        A dict with one entry per bucket label (zero when nothing fell into
        it), holding the summed counts. When buckets overlap, a length is
        credited to the first bucket that matches.

    Raises:
        ValueError: If a length is not covered by any bucket.
    """
    # Materialize once: the buckets are scanned again for every length, so a
    # one-shot iterator would already be exhausted after building the labels.
    bucket_list = tuple(buckets)
    totals: Dict[str, int] = {label: 0 for _, _, label in bucket_list}

    for length, count in counts.items():
        for start, end, label in bucket_list:
            above_start = start is None or start <= length
            below_end = end is None or length <= end
            if above_start and below_end:
                totals[label] += count
                break
        else:
            # The inner loop finished without a break: nothing matched.
            raise ValueError(f"No bucket found for length {length}.")

    return totals
|
||||
|
||||
|
||||
def write_output(path: Path, data: Dict[str, int]) -> None:
    """Write *data* to *path* as indented UTF-8 JSON followed by a newline.

    Args:
        path: Destination file; created or overwritten.
        data: Mapping to serialize.

    Raises:
        TypeError: If *data* contains a value ``json`` cannot serialize. The
            destination file is left untouched in that case.
    """
    # Serialize before opening the file: opening in "w" mode truncates it
    # immediately, so a serialization error would otherwise leave an empty
    # or half-written file behind.
    text = json.dumps(data, indent=2, ensure_ascii=False)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(text)
        handle.write("\n")
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: load counts, bucket them, write the result.

    Args:
        argv: Argument list to parse instead of the process arguments. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, so existing
            ``main()`` callers are unaffected; passing a list lets the command
            be driven from other code.

    Returns:
        ``0`` once the output file has been written.
    """
    parser = argparse.ArgumentParser(description="Bucket review length counts.")
    parser.add_argument(
        "input",
        type=Path,
        help="Path to review_lengths.json (mapping of length -> count).",
    )
    parser.add_argument(
        "output",
        type=Path,
        help="Path to write bucketed counts JSON.",
    )
    args = parser.parse_args(argv)

    counts = load_counts(args.input)
    bucketed = aggregate(counts, DEFAULT_BUCKETS)
    write_output(args.output, bucketed)
    return 0
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
Reference in New Issue
Block a user