Add helper stuff for figures, cleanup

2026-03-22 08:22:43 +01:00 · 2026-02-20 01:56:28 +01:00
parent 28823dc0b5
commit 101bd81ca1
20 changed files with 1862 additions and 1164 deletions
--- a/figures/simplify_review_lengths.py
+++ b/figures/simplify_review_lengths.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+"""Aggregate review length counts into buckets."""
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Dict, Iterable, Tuple
+
+Bucket = Tuple[int | None, int | None, str]
+
+
+DEFAULT_BUCKETS: Tuple[Bucket, ...] = (
+    (None, 9, "<10"),
+    (10, 19, "10-19"),
+    (20, 29, "20-29"),
+    (30, 39, "30-39"),
+    (40, 49, "40-49"),
+    (50, 59, "50-59"),
+    (60, 69, "60-69"),
+    (70, 79, "70-79"),
+    (80, 89, "80-89"),
+    (90, 99, "90-99"),
+    (100, 109, "100-109"),
+    (110, 119, "110-119"),
+    (120, 129, "120-129"),
+    (130, 139, "130-139"),
+    (140, 149, "140-149"),
+    (150, 159, "150-159"),
+    (160, 169, "160-169"),
+    (170, 179, "170-179"),
+    (180, 189, "180-189"),
+    (190, 199, "190-199"),
+    (200, 219, "200-219"),
+    (220, 239, "220-239"),
+    (240, 259, "240-259"),
+    (260, 279, "260-279"),
+    (280, 299, "280-299"),
+    (300, 399, "300-399"),
+    (400, 499, "400-499"),
+    (500, 999, "500-999"),
+    (1000, None, "1000+"),
+)
+
+
+def load_counts(path: Path) -> Dict[int, int]:
+    with path.open("r", encoding="utf-8") as handle:
+        raw = json.load(handle)
+    return {int(k): int(v) for k, v in raw.items()}
+
+
+def aggregate(counts: Dict[int, int], buckets: Iterable[Bucket]) -> Dict[str, int]:
+    output: Dict[str, int] = {label: 0 for _, _, label in buckets}
+    for length, count in counts.items():
+        for start, end, label in buckets:
+            if start is None and end is not None and length <= end:
+                output[label] += count
+                break
+            if end is None and start is not None and length >= start:
+                output[label] += count
+                break
+            if start is not None and end is not None and start <= length <= end:
+                output[label] += count
+                break
+        else:
+            raise ValueError(f"No bucket found for length {length}.")
+    return output
+
+
+def write_output(path: Path, data: Dict[str, int]) -> None:
+    with path.open("w", encoding="utf-8") as handle:
+        json.dump(data, handle, indent=2, ensure_ascii=False)
+        handle.write("\n")
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Bucket review length counts.")
+    parser.add_argument(
+        "input",
+        type=Path,
+        help="Path to review_lengths.json (mapping of length -> count).",
+    )
+    parser.add_argument(
+        "output",
+        type=Path,
+        help="Path to write bucketed counts JSON.",
+    )
+    args = parser.parse_args()
+
+    counts = load_counts(args.input)
+    bucketed = aggregate(counts, DEFAULT_BUCKETS)
+    write_output(args.output, bucketed)
+    return 0
+
+
+# Run the CLI only when executed as a script; the value returned by main()
+# is handed to SystemExit and so becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())