RAFT updates, BERTopic config, cleanup

This commit is contained in:
2026-02-21 01:57:14 +01:00
parent 8cadcb1f69
commit 1a99b53d44
12 changed files with 10750 additions and 9778 deletions

View File

@@ -47,14 +47,15 @@ nltk.download("punkt")
nltk.download("wordnet")
# %% [markdown]
# ### Parameters and Tracking
# ### Hyperparameters and Settings
#
# %%
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = True
PROCESS_DATA = True
PROCESS_DATA = False
REDUCE_OUTLIERS = False
CALCULATE_TOKEN_DISTRIBUTIONS = False
# Data Sample Size, -1 for all data
DATA_SAMPLE_SIZE = -1
@@ -76,19 +77,7 @@ MIN_DIST = 0.01
TOP_N_WORDS = 10
MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip
tracking = {
"input": {
"min_document_frequency": MIN_DOCUMENT_FREQUENCY,
"max_ngram": MAX_NGRAM,
"min_topic_size": MIN_TOPIC_SIZE,
"min_samples": MIN_SAMPLES,
"n_neighbors": N_NEIGHBORS,
"n_components": N_COMPONENTS,
"min_dist": MIN_DIST,
"top_n_words": TOP_N_WORDS,
"max_topics": MAX_TOPICS,
},
}
TF_IDF_STOP_WORDS = ["bali", "place", "visit", "visited", "visiting"]
# %% [markdown]
# ### Data Loading & Preprocessing
@@ -116,21 +105,16 @@ rep = {
r"\n": " ",
r'\\"': "",
r'"': "",
"bali": "",
r"\s+": " ",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))
# def preprocess(text):
# text = text.strip()
# text = text.lower()
# text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
# return text
def preprocess(text):
    """Normalize a raw review string before topic modeling.

    Trims surrounding whitespace, lowercases, then applies the compiled
    replacement pattern (built from the module-level ``rep`` mapping) to
    strip quotes/newlines and collapse repeated whitespace.
    """
    cleaned = text.strip().lower()
    return pattern.sub(lambda match: rep[re.escape(match.group(0))], cleaned)
@@ -187,7 +171,7 @@ reduced_embeddings = umap_model.fit_transform(embeddings)
# %%
if RECREATE_MODEL:
stop_words = list(skltext.ENGLISH_STOP_WORDS.union(["bali"]))
stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(
@@ -306,72 +290,23 @@ if REDUCE_OUTLIERS:
#
# %%
CLASSIFICATION = False
CLASSIFICATION = True
if CLASSIFICATION:
import random
from pathlib import Path
# --- config ---
topics_to_keep = {2, 4, 5, 9, 22, 26}
INPUT_PATH = "../data/original/reviews.tab" # TSV with a 'review' column
OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
OUTPUT_DIR = Path("../raft/corpus")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
BATCH_SIZE = 60
MIN_CHARS = 40
SEED = 42
topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
INPUT_PATH = "../data/intermediate/preprocessed.tab" # TSV with a 'review' column
OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
# Topic model document info
df = topic_model.get_document_info(reviews) # assumes your model is already fitted
df["Original"] = reviews.values
df = topic_model.get_document_info(reviews)
df["Original"] = reviews
# --- filter by topics and length ---
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
filtered["Original"] = filtered["Original"].str.strip()
filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
# Save an audit CSV
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
# --- deterministic shuffle + write batched corpus files ---
total_files = 0
total_reviews = 0
rng = random.Random(SEED)
for topic_val, g in filtered.groupby("Topic", sort=True):
reviews_list = g["Original"].tolist()
# deterministic shuffle within topic
rng.shuffle(reviews_list)
# chunk into batches of up to 60
for start in range(0, len(reviews_list), BATCH_SIZE):
chunk = reviews_list[start : start + BATCH_SIZE]
if not chunk:
continue
# simple header for traceability
header = (
f"[TOPIC] {topic_val}\n"
+ f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
)
lines = [header, ""]
for i, txt in enumerate(chunk, 1):
lines.append(f"({i}) {txt}")
part_idx = start // BATCH_SIZE + 1
fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
(OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
total_files += 1
total_reviews += len(chunk)
print(
f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
)
print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
print(f"Filtered CSV file saved to {OUTPUT_CSV}")
# %%
doc_topic_matrix = probs
@@ -425,7 +360,7 @@ vis = topic_model.visualize_documents(
custom_labels=True,
hide_annotations=True,
)
vis.write_html("output/visualization.html")
# vis.write_html("output/visualization.html")
vis
# %%
@@ -531,7 +466,7 @@ if this_will_crash_your_pc_are_you_sure:
#
# %%
search_term = "spirituality"
search_term = "lempuyang"
similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for i in range(len(similar_topics)):
@@ -542,17 +477,20 @@ for i in range(len(similar_topics)):
# %%
# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(
reviews, calculate_tokens=True, use_embedding_model=True
)
if CALCULATE_TOKEN_DISTRIBUTIONS:
topic_distr, topic_token_distr = topic_model.approximate_distribution(
reviews, calculate_tokens=True, use_embedding_model=True
)
# %%
# Visualize the token-level distributions
DOC_INDEX = 6
df = topic_model.visualize_approximate_distribution(
reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
)
df
if CALCULATE_TOKEN_DISTRIBUTIONS:
DOC_INDEX = 1
df = topic_model.visualize_approximate_distribution(
reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
)
df
# %% [markdown]
# ### Topic Hierarchy