mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 08:22:43 +01:00
98 lines
2.7 KiB
Python
98 lines
2.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Aggregate review length counts into buckets."""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Dict, Iterable, Tuple
|
|
|
|
Bucket = Tuple[int | None, int | None, str]
|
|
|
|
|
|
DEFAULT_BUCKETS: Tuple[Bucket, ...] = (
|
|
(None, 9, "<10"),
|
|
(10, 19, "10-19"),
|
|
(20, 29, "20-29"),
|
|
(30, 39, "30-39"),
|
|
(40, 49, "40-49"),
|
|
(50, 59, "50-59"),
|
|
(60, 69, "60-69"),
|
|
(70, 79, "70-79"),
|
|
(80, 89, "80-89"),
|
|
(90, 99, "90-99"),
|
|
(100, 109, "100-109"),
|
|
(110, 119, "110-119"),
|
|
(120, 129, "120-129"),
|
|
(130, 139, "130-139"),
|
|
(140, 149, "140-149"),
|
|
(150, 159, "150-159"),
|
|
(160, 169, "160-169"),
|
|
(170, 179, "170-179"),
|
|
(180, 189, "180-189"),
|
|
(190, 199, "190-199"),
|
|
(200, 219, "200-219"),
|
|
(220, 239, "220-239"),
|
|
(240, 259, "240-259"),
|
|
(260, 279, "260-279"),
|
|
(280, 299, "280-299"),
|
|
(300, 399, "300-399"),
|
|
(400, 499, "400-499"),
|
|
(500, 999, "500-999"),
|
|
(1000, None, "1000+"),
|
|
)
|
|
|
|
|
|
def load_counts(path: Path) -> Dict[int, int]:
    """Read a JSON object from *path* and return it with ``int`` keys and values.

    The file is decoded as UTF-8. Every key and every value of the decoded
    object is passed through ``int``, so a JSON key such as ``"42"`` becomes
    the integer ``42``.
    """
    with path.open("r", encoding="utf-8") as source:
        decoded = json.load(source)

    counts: Dict[int, int] = {}
    for raw_length, raw_count in decoded.items():
        # Convert the key first, then the value, one entry at a time.
        length = int(raw_length)
        counts[length] = int(raw_count)
    return counts
|
|
|
|
|
|
def aggregate(counts: Dict[int, int], buckets: Iterable[Bucket]) -> Dict[str, int]:
    """Sum per-length counts into labelled buckets.

    Args:
        counts: Mapping of length -> count.
        buckets: ``(start, end, label)`` triples with inclusive bounds.
            ``None`` as ``start`` or ``end`` leaves that side unbounded, so
            ``(None, None, label)`` matches every length. Buckets are tried
            in the given order and the first match wins. Any iterable is
            accepted, including a one-shot iterator or generator.

    Returns:
        Mapping of bucket label -> summed count. Every label is present
        (``0`` when nothing fell into it).

    Raises:
        ValueError: If a length is not covered by any bucket.
    """
    # Materialise once: the buckets are scanned to seed the labels and again
    # for every length, which would exhaust a generator after the first pass.
    bucket_table = tuple(buckets)

    output: Dict[str, int] = {label: 0 for _, _, label in bucket_table}
    for length, count in counts.items():
        for start, end, label in bucket_table:
            above_start = start is None or length >= start
            below_end = end is None or length <= end
            if above_start and below_end:
                output[label] += count
                break
        else:
            # No ``break`` happened, i.e. no bucket accepted this length.
            raise ValueError(f"No bucket found for length {length}.")
    return output
|
|
|
|
|
|
def write_output(path: Path, data: Dict[str, int]) -> None:
    """Serialise *data* as JSON to *path*, followed by a newline.

    The file is opened in write mode as UTF-8. The JSON is indented by two
    spaces and non-ASCII characters are written as-is rather than escaped.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with path.open(mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, **dump_options)
        # ``json.dump`` emits no trailing newline, so add one.
        sink.write("\n")
|
|
|
|
|
|
def main() -> int:
    """Run the command line tool: load counts, bucket them, write the result.

    Takes two positional arguments, ``input`` and ``output``, both parsed as
    ``Path``. The counts read from ``input`` are aggregated with
    ``DEFAULT_BUCKETS`` and written to ``output``.

    Returns:
        ``0`` once the output file has been written.
    """
    cli = argparse.ArgumentParser(description="Bucket review length counts.")
    positionals = (
        ("input", "Path to review_lengths.json (mapping of length -> count)."),
        ("output", "Path to write bucketed counts JSON."),
    )
    for arg_name, help_text in positionals:
        cli.add_argument(arg_name, type=Path, help=help_text)
    options = cli.parse_args()

    # Pipeline: read -> bucket -> write.
    write_output(
        options.output,
        aggregate(load_counts(options.input), DEFAULT_BUCKETS),
    )
    return 0
|
|
|
|
|
|
# Script entry point: run the CLI and exit with the value returned by main().
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|