From 99ba5031ca4ce706f69ba13583d045d80d756b1c Mon Sep 17 00:00:00 2001
From: Marvin Scham
Date: Fri, 20 Feb 2026 01:58:00 +0100
Subject: [PATCH] Updated BERTopic quality gates

---
 bertopic/nb_bertopic_lowprep.py | 120 ++++++++++++++++----------------
 1 file changed, 60 insertions(+), 60 deletions(-)

diff --git a/bertopic/nb_bertopic_lowprep.py b/bertopic/nb_bertopic_lowprep.py
index c080624..14c7690 100644
--- a/bertopic/nb_bertopic_lowprep.py
+++ b/bertopic/nb_bertopic_lowprep.py
@@ -7,7 +7,7 @@
 #       format_version: '1.3'
 #       jupytext_version: 1.18.0
 #   kernelspec:
-#     display_name: .venv
+#     display_name: .venv (3.12.3)
 #     language: python
 #     name: python3
 # ---
@@ -56,7 +56,7 @@ nltk.download("wordnet")
 # %%
 RECREATE_MODEL = True
 RECREATE_REDUCED_MODEL = True
-PROCESS_DATA = False
+PROCESS_DATA = True
 REDUCE_OUTLIERS = False
 
 # Data Sample Size, -1 for all data
@@ -107,6 +107,12 @@ else:
         .review.to_list()
     )
 
+# Remove duplicate reviews (note: set() does not preserve the original order)
+reviews = list(set(reviews))
+
+# Remove reviews that contain fewer than 9 words
+reviews = [review for review in reviews if len(review.split()) >= 9]
+
 print("Loaded {} reviews".format(len(reviews)))
 
 # %%
@@ -115,9 +121,6 @@ rep = {
     r"\n": " ",
     r'\\"': "",
     r'"': "",
-    "mongkey": "monkey",
-    "monky": "monkey",
-    "verry": "very",
     "bali": "",
     r"\s+": " ",
 }
@@ -302,75 +305,72 @@ if REDUCE_OUTLIERS:
 #
 # %%
-import random
-from pathlib import Path
+CLASSIFICATION = False
+if CLASSIFICATION:
+    import random
+    from pathlib import Path
 
-# --- config ---
-topics_to_keep = {2, 4, 5, 9, 22, 26}
-INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
-OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
-OUTPUT_DIR = Path("../raft/corpus")
-OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    # --- config ---
+    topics_to_keep = {2, 4, 5, 9, 22, 26}
+    INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
+    OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
+    OUTPUT_DIR = Path("../raft/corpus")
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 
-BATCH_SIZE = 60
-MIN_CHARS = 40
-SEED = 42
+    BATCH_SIZE = 60
+    MIN_CHARS = 40
+    SEED = 42
 
-# --- load data ---
-data = pd.read_csv(INPUT_PATH, sep="\t")
+    # Topic model document info
+    df = topic_model.get_document_info(reviews)  # assumes your model is already fitted
+    df["Original"] = reviews  # `reviews` is a plain list here, so no .values
 
-# If you already have `reviews` elsewhere, replace the next line with that variable
-reviews = data["review"].astype(str).fillna("")
+    # --- filter by topics and length ---
+    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
+    filtered["Original"] = filtered["Original"].str.strip()
+    filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
 
-# Topic model document info
-df = topic_model.get_document_info(reviews)  # assumes your model is already fitted
-df["Original"] = reviews.values
+    # Save an audit CSV
+    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
 
-# --- filter by topics and length ---
-filtered = df[df["Topic"].isin(topics_to_keep)].copy()
-filtered["Original"] = filtered["Original"].str.strip()
-filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
+    # --- deterministic shuffle + write batched corpus files ---
+    total_files = 0
+    total_reviews = 0
+    rng = random.Random(SEED)
 
-# Save an audit CSV
-filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
+    for topic_val, g in filtered.groupby("Topic", sort=True):
+        reviews_list = g["Original"].tolist()
 
-# --- deterministic shuffle + write batched corpus files ---
-total_files = 0
-total_reviews = 0
-rng = random.Random(SEED)
+        # deterministic shuffle within topic
+        rng.shuffle(reviews_list)
 
-for topic_val, g in filtered.groupby("Topic", sort=True):
-    reviews_list = g["Original"].tolist()
+        # chunk into batches of up to 60
+        for start in range(0, len(reviews_list), BATCH_SIZE):
+            chunk = reviews_list[start : start + BATCH_SIZE]
+            if not chunk:
+                continue
 
-    # deterministic shuffle within topic
-    rng.shuffle(reviews_list)
+            # simple header for traceability
+            header = (
+                f"[TOPIC] {topic_val}\n"
+                + f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
+            )
 
-    # chunk into batches of up to 60
-    for start in range(0, len(reviews_list), BATCH_SIZE):
-        chunk = reviews_list[start : start + BATCH_SIZE]
-        if not chunk:
-            continue
+            lines = [header, ""]
+            for i, txt in enumerate(chunk, 1):
+                lines.append(f"({i}) {txt}")
 
-        # simple header for traceability
-        header = (
-            f"[TOPIC] {topic_val}\n" + f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
-        )
+            part_idx = start // BATCH_SIZE + 1
+            fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
+            (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
 
-        lines = [header, ""]
-        for i, txt in enumerate(chunk, 1):
-            lines.append(f"({i}) {txt}")
+            total_files += 1
+            total_reviews += len(chunk)
 
-        part_idx = start // BATCH_SIZE + 1
-        fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
-        (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
+    print(
+        f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
+    )
+    print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")
 
-        total_files += 1
-        total_reviews += len(chunk)
-
-print(
-    f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
-)
-print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")
 
 # %%
 doc_topic_matrix = probs
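
Note on the dedup gate: list(set(reviews)) removes duplicates but does not
preserve input order, so the review list fed to BERTopic can vary across
interpreter runs even though SEED later fixes the batch shuffle. If stable
ordering matters, an order-preserving variant is a minimal alternative. The
sketch below assumes `reviews` is the list[str] loaded earlier in the
notebook and reuses the patch's 9-word threshold; the apply_quality_gates
helper name is illustrative only, not part of the notebook.

    # Sketch: order-preserving quality gates for the review list.
    def apply_quality_gates(reviews, min_words=9):
        # dict.fromkeys keeps the first occurrence of each review in input
        # order, unlike list(set(...)), whose order depends on hashing.
        deduped = list(dict.fromkeys(reviews))
        # Keep only reviews with at least `min_words` whitespace-separated tokens.
        return [r for r in deduped if len(r.split()) >= min_words]

    reviews = apply_quality_gates(reviews)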