Restructure

2026-02-04 13:03:12 +01:00 · 2025-10-20 23:06:52 +02:00
parent 995857ae54
commit c17e5bcc22
54 changed files with 19217 additions and 324966 deletions
--- a/bertopic/bertopic_autotune.py
+++ b/bertopic/bertopic_autotune.py
@@ -0,0 +1,160 @@
+import json
+import traceback
+
+import numpy as np
+import pandas as pd
+from hdbscan import HDBSCAN
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics import pairwise_distances
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.model_selection import ParameterGrid
+from umap import UMAP
+
+from bertopic import BERTopic
+from bertopic.representation import KeyBERTInspired
+from bertopic.vectorizers import ClassTfidfTransformer
+
+param_grid = {
+    "nr_topics": [45, 50, 55],
+    "min_topic_size": [30, 40, 50],
+    "n_gram_max": [3],
+    "min_document_frequency": [1, 2],
+    "n_neighbors": [15],
+    "n_components": [2],
+    "min_dist": [0.1],
+    "top_n_words": [10],
+}
+
+
+def calculate_metrics(topic_model, embedder, top_n_words=5):
+    """Score a fitted topic model on coherence, diversity and topic separation.
+
+    Args:
+        topic_model: Fitted model exposing ``get_topic_info()`` and
+            ``get_topic(topic_id)`` (a BERTopic instance in this script).
+        embedder: Object whose ``encode(list_of_str)`` returns one vector per
+            string (a SentenceTransformer in this script).
+        top_n_words: Number of leading words per topic used for scoring.
+
+    Returns:
+        dict with ``coherence``, ``diversity``, ``inter_topic_distance`` and
+        ``combined_score`` (0.6 / 0.2 / 0.2 weighting), rounded to 4 decimals.
+
+    Raises:
+        ValueError: If the model has no non-outlier topic with words.
+    """
+    # Use the topic ids the model reports instead of range(n_rows - 1): that
+    # range only lines up when an outlier row (-1) is present.
+    topic_ids = [int(t) for t in topic_model.get_topic_info()["Topic"] if t != -1]
+
+    # Get topic words
+    topic_words = []
+    for topic_id in topic_ids:
+        representation = topic_model.get_topic(topic_id) or []
+        # Drop empty-string entries so they are not scored as words.
+        words = [word for word, _ in representation if word][:top_n_words]
+        if words:
+            topic_words.append(words)
+    if not topic_words:
+        raise ValueError("Topic model has no non-outlier topics to score")
+
+    # Encode each topic's words once; the vectors feed both the coherence and
+    # the inter-topic distance computation.
+    word_embeddings = [np.asarray(embedder.encode(words)) for words in topic_words]
+
+    # Coherence: mean cosine similarity over distinct word pairs. Averaging
+    # the full matrix with a zeroed diagonal would divide by n*n instead of
+    # n*(n-1) and deflate the score.
+    coherence_scores = []
+    for vectors in word_embeddings:
+        if len(vectors) < 2:
+            continue  # a single word has no pairs
+        sim_matrix = cosine_similarity(vectors)
+        pair_idx = np.triu_indices_from(sim_matrix, k=1)
+        coherence_scores.append(sim_matrix[pair_idx].mean())
+    overall_coherence = float(np.mean(coherence_scores)) if coherence_scores else 0.0
+
+    # Diversity: share of unique words among all topic words
+    all_topic_words = [word for words in topic_words for word in words]
+    diversity = len(set(all_topic_words)) / len(all_topic_words)
+
+    # Inter-topic distance: mean cosine distance between topic centroids
+    if len(word_embeddings) > 1:
+        topic_embeddings = np.vstack([v.mean(axis=0) for v in word_embeddings])
+        topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
+        avg_distance = float(
+            topic_distance[np.triu_indices_from(topic_distance, k=1)].mean()
+        )
+    else:
+        avg_distance = 0.0  # a single topic has nothing to be distant from
+
+    combined = 0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance
+
+    # round() rather than float(str(x)[:6]): slicing the repr turns a value
+    # such as 1.2345e-05 into 1.2345.
+    res = {
+        "coherence": round(overall_coherence, 4),
+        "diversity": round(float(diversity), 4),
+        "inter_topic_distance": round(avg_distance, 4),
+        "combined_score": round(float(combined), 4),
+    }
+    print(res)
+    return res
+
+
+def auto_tune_bertopic(texts, embedding_model, param_grid):
+    """Grid-search BERTopic hyperparameters and keep the best-scoring model.
+
+    Args:
+        texts: List of documents to fit on.
+        embedding_model: SentenceTransformer model name used for embeddings.
+        param_grid: Mapping of parameter name to candidate values; expanded
+            with ``ParameterGrid``.
+
+    Returns:
+        Tuple ``(best_model, best_params, best_score, history)``. ``history``
+        has one ``{"params", "metrics"}`` entry per successful run and is also
+        written to ``history.json`` after each run. ``best_model`` and
+        ``best_params`` are ``None`` (and ``best_score`` is -1) if every run
+        failed.
+    """
+    best_score = -1
+    best_params = None
+    best_model = None
+    history = []
+
+    print("Starting auto-tuning of BERTopic...")
+    print(f"Number of reviews: {len(texts)}")
+
+    print("Running embedding model...")
+    embedder = SentenceTransformer(embedding_model)
+    # Embed the documents that were passed in; this previously read the
+    # module-level `reviews` and ignored the `texts` argument.
+    embeddings = embedder.encode(texts, show_progress_bar=True)
+
+    # Convert param_grid to list for sampling
+    print("Generating parameter combinations...")
+    param_list = list(ParameterGrid(param_grid))
+
+    print(f"Total parameter combinations: {len(param_list)}")
+    for params in param_list:
+        try:
+            print(f"Testing params: {params}")
+            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
+            vectorizer_model = CountVectorizer(
+                stop_words="english",
+                min_df=params["min_document_frequency"],
+                ngram_range=(1, params["n_gram_max"]),
+            )
+
+            representation_model = KeyBERTInspired()
+
+            umap_model = UMAP(
+                n_neighbors=params["n_neighbors"],
+                n_components=params["n_components"],
+                min_dist=params["min_dist"],
+                metric="cosine",
+                low_memory=True,
+                random_state=42,
+            )
+            hdbscan_model = HDBSCAN(
+                min_cluster_size=params["min_topic_size"],
+                metric="euclidean",
+                cluster_selection_method="eom",
+                gen_min_span_tree=True,
+                prediction_data=True,
+            )
+
+            model = BERTopic(
+                # Hand over the already loaded encoder so the model is not
+                # loaded again from its name for every parameter combination.
+                embedding_model=embedder,
+                ctfidf_model=ctfidf_model,
+                vectorizer_model=vectorizer_model,
+                umap_model=umap_model,
+                hdbscan_model=hdbscan_model,
+                representation_model=representation_model,
+                verbose=True,
+                calculate_probabilities=True,
+                language="english",
+                top_n_words=params["top_n_words"],
+                nr_topics=params["nr_topics"],
+            )
+            model.fit_transform(texts, embeddings)
+
+            metrics = calculate_metrics(model, embedder)
+            history.append({"params": params, "metrics": metrics})
+
+            # Rewrite the full history after every run so partial results
+            # survive a crash or an interrupted search.
+            with open("history.json", "w", encoding="utf-8") as f:
+                json.dump(history, f, indent=2)
+
+            if metrics["combined_score"] > best_score:
+                best_score = metrics["combined_score"]
+                best_params = params
+                best_model = model
+
+        except Exception as e:
+            # Best effort: one failing combination must not abort the search.
+            print(f"Failed with params {params}: {str(e)}")
+            traceback.print_exc()
+            continue
+
+    return best_model, best_params, best_score, history
+
+
+SPECIAL_CHARS = ["\n", "\\n"]
+MIN_REVIEW_WORDS = 5
+
+# Load the raw reviews, blank out newline markers in string entries and keep
+# only entries with at least MIN_REVIEW_WORDS whitespace-separated words.
+raw_reviews = pd.read_csv("data.tab", sep="\t").review.to_list()
+
+reviews = []
+for review in raw_reviews:
+    if isinstance(review, str):
+        for schar in SPECIAL_CHARS:
+            review = review.replace(schar, " ")
+    # Non-string entries are measured through their str() form.
+    if len(str(review).split()) >= MIN_REVIEW_WORDS:
+        reviews.append(review)
+
+print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
--- a/bertopic/bertopic_autotune_sorter.py
+++ b/bertopic/bertopic_autotune_sorter.py
@@ -0,0 +1,25 @@
+import json
+
+import matplotlib.pyplot as plt
+
+# Load the tuning history written by the auto-tune script.
+with open("history.json", "r", encoding="utf-8") as f:
+    history = json.load(f)
+
+# Best combined score first.
+history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)
+
+with open("history_sorted.json", "w", encoding="utf-8") as f:
+    json.dump(history, f, indent=2)
+
+
+# Extract combined scores (this previously read the "coherence" metric while
+# the plot and the output file are labelled as combined scores).
+scores = [item["metrics"]["combined_score"] for item in history]
+
+# Plot histogram
+plt.hist(scores, bins=20, edgecolor="black")
+plt.title("Distribution of Combined Scores")
+plt.xlabel("Combined Score")
+plt.ylabel("Frequency")
+plt.grid(True)
+plt.tight_layout()
+plt.savefig("combined_score_distribution.png")
+plt.close()
--- a/bertopic/nb_bertopic.py
+++ b/bertopic/nb_bertopic.py
@@ -0,0 +1,569 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.18.0
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Topic Detection: Bali Tourist Reviews
+#
+
+# %% [markdown]
+# ## Preparation
+#
+# ### Dependency Loading
+#
+
+# %%
+from bertopic import BERTopic
+from bertopic.representation import KeyBERTInspired
+from bertopic.vectorizers import ClassTfidfTransformer
+from gensim.models.coherencemodel import CoherenceModel
+from hdbscan import HDBSCAN
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from umap import UMAP
+import gensim.corpora as corpora
+import json
+import nltk
+import numpy as np
+import pandas as pd
+import re
+import spacy
+import pickle
+
+nlp = spacy.load("en_core_web_sm")
+
+nltk.download("stopwords")
+nltk.download("punkt")
+nltk.download("wordnet")
+
+# %% [markdown]
+# ### Parameters and Tracking
+#
+
+# %%
+RECREATE_MODEL = True
+RECREATE_REDUCED_MODEL = True
+PROCESS_DATA = False
+REDUCE_OUTLIERS = True
+USE_CONDENSED_MODEL = False
+
+DATA_SAMPLE_SIZE = -1  # -1 for all data
+
+# Classical coherence score. Warning: needs swap to not kill your PC
+CALCULATE_COHERENCE = False
+
+# Vectorization
+MIN_DOCUMENT_FREQUENCY = 1
+MAX_NGRAM = 2
+
+# HDBSCAN Parameters
+MIN_TOPIC_SIZE = 200
+MIN_SAMPLES = 25
+
+# UMAP Parameters
+N_NEIGHBORS = 15
+N_COMPONENTS = 2
+MIN_DIST = 0.01
+
+# Topic Modeling
+TOP_N_WORDS = 10
+MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip
+
+# %% [markdown]
+# ### Data Loading & Preprocessing
+#
+
+# %%
+# Read the review column once and drop missing entries *before* sampling, so
+# that a requested sample of N rows really yields N reviews.
+review_column = pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna()
+if DATA_SAMPLE_SIZE != -1:
+    review_column = review_column.sample(n=DATA_SAMPLE_SIZE)
+reviews = review_column.to_list()
+
+print("Loaded {} reviews".format(len(reviews)))
+
+# %%
+# List of NE in Bali for NER enhancement
+with open("../data/supporting/bali_ner.json", "r", encoding="utf-8") as f:
+    bali_places = json.load(f)
+bali_places_set = set(bali_places)
+
+# Stop word definition. Both collections are sets because preprocess() tests
+# every token for membership in them; a list would be scanned per token.
+stop_words = set(stopwords.words("english"))
+extra_stopwords = {"bali", "idr", "usd"}
+with open("../data/supporting/stopwords-en.json", "r", encoding="utf-8") as f:
+    extra_stopwords.update(json.load(f))
+
+# Custom replacements. The raw-string keys are literal character sequences
+# (e.g. a backslash followed by "n"), not regex escapes.
+rep = {
+    r"\\n": " ",
+    r"\n": " ",
+    r'\\"': "",
+    r'"': "",
+    "mongkey": "monkey",
+    "monky": "monkey",
+    "verry": "very",
+}
+# Keys are stored regex-escaped because preprocess() looks a match up via
+# rep[re.escape(match_text)].
+rep = {re.escape(k): v for k, v in rep.items()}
+pattern = re.compile("|".join(rep.keys()))
+
+lemmatizer = WordNetLemmatizer()
+
+
+def preprocess(text):
+    """Turn one raw review into a list of lemmatized content tokens.
+
+    The text is lower-cased, run through the custom replacement table,
+    stripped of digits and non-word characters and parsed with spaCy. Tokens
+    are kept when they are nouns/proper nouns, GPE/LOC/FAC entities or known
+    Bali place names; stop words and tokens of at most two characters are
+    dropped and the rest is lemmatized.
+    """
+    # Step 1: custom replacements (typos, special cases)
+    lowered = text.lower()
+    replaced = pattern.sub(lambda match: rep[re.escape(match.group(0))], lowered)
+
+    # Step 2: digits and non-word characters become single spaces
+    without_digits = re.sub(r"\d+", " ", replaced)
+    cleaned = re.sub(r"\W+", " ", without_digits)
+
+    # Step 3: POS / entity / gazetteer filter
+    wanted_pos = {"NOUN", "PROPN"}
+    wanted_entities = {"GPE", "LOC", "FAC"}
+    candidates = []
+    for token in nlp(cleaned):
+        is_wanted = (
+            token.pos_ in wanted_pos
+            or token.ent_type_ in wanted_entities
+            or token.text in bali_places_set
+        )
+        if is_wanted:
+            candidates.append(token.text)
+
+    # Step 4: stop word removal and lemmatization
+    lemmas = []
+    for word in candidates:
+        if len(word) <= 2 or word in stop_words or word in extra_stopwords:
+            continue
+        lemmas.append(lemmatizer.lemmatize(word))
+
+    return lemmas
+
+
+# %%
+if PROCESS_DATA:
+    print("Processing reviews...")
+    reviews = [preprocess(review) for review in reviews]
+
+    with open("../data/intermediate/processed_texts.pkl", "wb") as f:
+        pickle.dump(reviews, f)
+else:
+    # NOTE: pickle can execute arbitrary code on load; only load cache files
+    # produced by this notebook.
+    with open("../data/intermediate/processed_texts.pkl", "rb") as f:
+        reviews = pickle.load(f)
+
+# preprocess() returns token lists, but the embedding model and BERTopic need
+# one string per document. Join in both branches: this used to happen only
+# when loading the cache, so PROCESS_DATA=True passed token lists downstream.
+reviews = [
+    " ".join(review) if isinstance(review, list) else review for review in reviews
+]
+
+print(reviews[:1])
+
+# %% [markdown]
+# ### Pre-calculate Embeddings
+#
+
+# %%
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+embeddings = embedding_model.encode(reviews, show_progress_bar=True)
+
+# %% [markdown]
+# ## Model Creation
+#
+
+# %% [markdown]
+# ### Dimensionality Reduction (UMAP)
+#
+
+# %%
+umap_model = UMAP(
+    n_neighbors=N_NEIGHBORS,
+    n_components=N_COMPONENTS,
+    min_dist=MIN_DIST,
+    metric="cosine",
+    low_memory=True,
+    random_state=42,
+)
+reduced_embeddings = umap_model.fit_transform(embeddings)
+
+# %% [markdown]
+# ### BERTopic Model Creation
+#
+
+# %%
+if RECREATE_MODEL:
+    import os
+
+    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
+    vectorizer_model = CountVectorizer(
+        min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM)
+    )
+
+    representation_model = KeyBERTInspired()
+    hdbscan_model = HDBSCAN(
+        min_cluster_size=MIN_TOPIC_SIZE,
+        min_samples=MIN_SAMPLES,
+        metric="euclidean",
+        cluster_selection_method="eom",
+        gen_min_span_tree=True,
+        prediction_data=True,
+    )
+
+    topic_model = BERTopic(
+        embedding_model=embedding_model,
+        ctfidf_model=ctfidf_model,
+        vectorizer_model=vectorizer_model,
+        umap_model=umap_model,
+        hdbscan_model=hdbscan_model,
+        representation_model=representation_model,
+        verbose=True,
+        calculate_probabilities=True,
+        language="english",
+        top_n_words=TOP_N_WORDS,
+        nr_topics=MAX_TOPICS,
+    )
+
+    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)
+
+    topic_labels = topic_model.generate_topic_labels(
+        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
+    )
+    topic_model.set_topic_labels(topic_labels)
+    # Create the target directory first so a long fit is not lost to a
+    # missing-folder error while saving.
+    os.makedirs("output", exist_ok=True)
+    BERTopic.save(topic_model, "output/model.bertopic")
+else:
+    print("Nevermind, loading existing model")
+    topic_model = BERTopic.load("output/model.bertopic")
+    # Later cells read `topics` and `probs`; take them from the loaded model
+    # so those cells do not fail with a NameError.
+    # NOTE(review): assumes the saved model kept its assignments and
+    # probabilities -- confirm for the serialization format in use.
+    topics, probs = topic_model.topics_, topic_model.probabilities_
+
+# %% [markdown]
+# ## Fine Tuning
+#
+# ### Topic Condensation
+#
+
+# %%
+if RECREATE_REDUCED_MODEL:
+    # Repeatedly merge the first pair of topics whose embeddings are more
+    # similar than the threshold. A merge changes the model's topic set, so
+    # the similarity matrix is rebuilt after every merge instead of scanning
+    # on with indices that no longer describe the model.
+    SIMILARITY_THRESHOLD = 0.9
+    iteration = 1
+    while True:
+        print(f"Iteration {iteration}")
+        iteration += 1
+        # Row 0 is dropped on the assumption that it is the outlier topic
+        # (-1), so that row i corresponds to topic i -- TODO confirm for
+        # models without outliers.
+        topic_vectors = np.array(topic_model.topic_embeddings_)[1:, :]
+        if topic_vectors.shape[0] < 2:
+            print("No more topics to merge.")
+            break
+
+        similarity_matrix = cosine_similarity(topic_vectors)
+        # argwhere() walks the upper triangle row by row, i.e. in the same
+        # order as a nested "for i / for j > i" scan.
+        candidates = np.argwhere(
+            np.triu(similarity_matrix, k=1) > SIMILARITY_THRESHOLD
+        )
+        if len(candidates) == 0:
+            print("No more topics to merge.")
+            break
+
+        t1, t2 = (int(idx) for idx in candidates[0])
+        sim = similarity_matrix[t1, t2]
+        try:
+            t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
+            t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
+            print(
+                f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
+            )
+            topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
+
+            topic_labels = topic_model.generate_topic_labels(
+                nr_words=3,
+                topic_prefix=True,
+                word_length=15,
+                separator=" - ",
+            )
+            topic_model.set_topic_labels(topic_labels)
+        except Exception as e:
+            # The same pair would be selected again on the next pass, so
+            # stop here instead of retrying forever.
+            print(f"Failed to merge {t1} and {t2}: {e}")
+            break
+
+    # BERTopic.save(topic_model, "bertopic/model_reduced.bertopic")
+elif USE_CONDENSED_MODEL:
+    print("Nevermind, loading existing reduced model")
+    topic_model = BERTopic.load("bertopic/model_reduced.bertopic")
+else:
+    print("Skipping topic reduction")
+
+# %% [markdown]
+# ### Outlier Reduction
+#
+
+# %%
+if REDUCE_OUTLIERS:
+    # Reassign outlier documents to the topic they most probably belong to.
+    new_topics = topic_model.reduce_outliers(
+        reviews,
+        topic_model.topics_,
+        probabilities=topic_model.probabilities_,
+        threshold=0.05,
+        strategy="probabilities",
+    )
+    # update_topics() falls back to default vectorizer / c-TF-IDF /
+    # representation models when none are passed, which would discard the
+    # configuration the model was built with. Hand the current ones back in.
+    topic_model.update_topics(
+        reviews,
+        topics=new_topics,
+        top_n_words=topic_model.top_n_words,
+        vectorizer_model=topic_model.vectorizer_model,
+        ctfidf_model=topic_model.ctfidf_model,
+        representation_model=topic_model.representation_model,
+    )
+    # Topic words were recomputed, so refresh the custom labels the same way
+    # the other cells do after changing topics.
+    topic_labels = topic_model.generate_topic_labels(
+        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
+    )
+    topic_model.set_topic_labels(topic_labels)
+
+# %% [markdown]
+# ## Results
+#
+# ### Classification
+#
+
+# %%
+from pathlib import Path
+import random
+
+# --- config ---
+topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
+INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
+OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
+OUTPUT_DIR = Path("../raft/corpus")
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+BATCH_SIZE = 60
+MIN_CHARS = 40
+SEED = 42
+
+# --- load data ---
+data = pd.read_csv(INPUT_PATH, sep="\t")
+
+# Reload the untouched review texts. Missing reviews are dropped exactly as
+# in the loading cell so rows stay aligned with the fitted documents.
+# (astype(str) before fillna turned NaN into the string "nan" and filled
+# nothing.)
+reviews = data["review"].dropna().astype(str)
+
+# Fail with a clear message when the texts do not line up with the model,
+# e.g. because it was fitted on a sample.
+if len(reviews) != len(topic_model.topics_):
+    raise ValueError(
+        f"{len(reviews)} reviews loaded but the model holds "
+        f"{len(topic_model.topics_)} topic assignments"
+    )
+
+# Topic model document info
+df = topic_model.get_document_info(reviews)  # assumes your model is already fitted
+df["Original"] = reviews.values
+
+# --- filter by topics and length ---
+filtered = df[df["Topic"].isin(topics_to_keep)].copy()
+filtered["Original"] = filtered["Original"].str.strip()
+filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
+
+# Save an audit CSV
+filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
+
+# --- deterministic shuffle + write batched corpus files ---
+total_files = 0
+total_reviews = 0
+rng = random.Random(SEED)
+
+for topic_val, g in filtered.groupby("Topic", sort=True):
+    reviews_list = g["Original"].tolist()
+
+    # deterministic shuffle within topic
+    rng.shuffle(reviews_list)
+
+    # chunk into batches of up to BATCH_SIZE; range() never yields an empty
+    # chunk, so no emptiness check is needed
+    for start in range(0, len(reviews_list), BATCH_SIZE):
+        chunk = reviews_list[start : start + BATCH_SIZE]
+
+        # simple header for traceability
+        header = (
+            f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
+        )
+
+        lines = [header, ""]
+        for i, txt in enumerate(chunk, 1):
+            lines.append(f"({i}) {txt}")
+
+        part_idx = start // BATCH_SIZE + 1
+        fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
+        (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
+
+        total_files += 1
+        total_reviews += len(chunk)
+
+# Plain messages: the builtin print() does not interpret "[green]" markup and
+# would output the tags literally.
+print(f"Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}")
+print(f"Filtered CSV saved to {OUTPUT_CSV}")
+
+# %%
+doc_topic_matrix = np.asarray(probs)
+
+# Column and index names come from the matrix itself, so they always match
+# its shape even after `topics` or `reviews` were changed by other cells.
+topicnames = ["Topic " + str(i) for i in range(doc_topic_matrix.shape[1])]
+docnames = ["Review " + str(i) for i in range(doc_topic_matrix.shape[0])]
+
+# Make the pandas dataframe
+df_document_topic = pd.DataFrame(
+    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
+)
+
+# Get dominant topic for each document
+dominant_topic = np.argmax(doc_topic_matrix, axis=1)
+df_document_topic["dominant_topic"] = dominant_topic
+
+
+# Styling
+def color_stuff(val):
+    """Return a CSS colour rule: green above 0.1, orange above 0.05, else grey."""
+    if val > 0.1:
+        color = "green"
+    elif val > 0.05:
+        color = "orange"
+    else:
+        color = "grey"
+    return "color: {col}".format(col=color)
+
+
+def make_bold(val):
+    """Return a CSS font-weight rule: bold for values above 0.1."""
+    weight = 700 if val > 0.1 else 400
+    return "font-weight: {weight}".format(weight=weight)
+
+
+# Apply Style to the probability columns only; "dominant_topic" holds topic
+# indices, not probabilities. Styler.map replaces the deprecated
+# Styler.applymap in recent pandas, so use whichever this version provides.
+df_document_topics = df_document_topic.head(15).style
+for style_fn in (color_stuff, make_bold):
+    elementwise = getattr(df_document_topics, "map", None) or df_document_topics.applymap
+    df_document_topics = elementwise(style_fn, subset=topicnames)
+df_document_topics
+
+# %% [markdown]
+# ### Document Visualization
+#
+
+# %%
+vis = topic_model.visualize_documents(
+    docs=reviews,
+    reduced_embeddings=reduced_embeddings,
+    custom_labels=True,
+    hide_annotations=True,
+)
+vis.write_html("output/visualization.html")
+vis
+
+# %% [markdown]
+# ### Similarity Matrix
+#
+
+# %%
+topic_model.visualize_heatmap()
+
+# %% [markdown]
+# ### Topic Info
+#
+
+# %%
+topic_model.get_topic_info()
+
+# %% [markdown]
+# ### Semantic Coherence
+#
+
+# %%
+# Collect the words of every non-outlier topic. Topic ids are read from the
+# model rather than assumed to be 0..n-1 next to one outlier row.
+topic_words = []
+for topic_id in topic_model.get_topic_info()["Topic"]:
+    if topic_id == -1:
+        continue
+    representation = topic_model.get_topic(int(topic_id)) or []
+    words = [word for word, _ in representation if word]
+    if words:
+        topic_words.append(words)
+
+# Compute mean pairwise cosine similarity for each topic
+coherence_scores = []
+for words in topic_words:
+    if len(words) < 2:
+        continue  # a single word has no pairs
+    coherence_embeddings = embedding_model.encode(words)
+    sim_matrix = cosine_similarity(coherence_embeddings)
+    # Average the distinct pairs only. Zeroing the diagonal and averaging the
+    # whole matrix still counts the n self-similarities as zeros.
+    mean_sim = sim_matrix[np.triu_indices_from(sim_matrix, k=1)].mean()
+    coherence_scores.append(mean_sim)
+
+overall_coherence = np.mean(coherence_scores) if coherence_scores else float("nan")
+
+print(len(reviews), "reviews processed")
+print(len(topic_words), "topics found")
+print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
+
+# %% [markdown]
+# ### Topic Coherence
+#
+
+# %%
+# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
+
+if CALCULATE_COHERENCE:
+    # Use the model's current assignments: the `topics` list returned by
+    # fit_transform() no longer matches the model once topics were merged or
+    # outliers reassigned.
+    current_topics = list(topic_model.topics_)
+
+    # Preprocess Documents: one concatenated document per topic
+    documents = pd.DataFrame(
+        {
+            "Document": list(reviews),
+            "ID": range(len(reviews)),
+            "Topic": current_topics,
+        }
+    )
+    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
+        {"Document": " ".join}
+    )
+    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
+
+    # Extract vectorizer and analyzer from BERTopic
+    vectorizer = topic_model.vectorizer_model
+    analyzer = vectorizer.build_analyzer()
+
+    # Extract features for Topic Coherence evaluation
+    tokens = [analyzer(doc) for doc in cleaned_docs]
+    dictionary = corpora.Dictionary(tokens)
+    corpus = [dictionary.doc2bow(token) for token in tokens]
+    # Words of every non-outlier topic that currently exists; empty-string
+    # entries are skipped because they cannot be dictionary tokens.
+    topic_words = [
+        [word for word, _ in topic_model.get_topic(topic) if word]
+        for topic in sorted(set(current_topics))
+        if topic != -1
+    ]
+
+    # %env TOKENIZERS_PARALLELISM=false
+
+    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
+        coherence_model = CoherenceModel(
+            topics=topic_words,
+            texts=tokens,
+            corpus=corpus,
+            dictionary=dictionary,
+            coherence=measurement,
+        )
+        coherence_score = coherence_model.get_coherence()
+        print(f"Coherence ({measurement}): {coherence_score:.4f}")
+else:
+    print("Skipping classical coherence calculation")
+
+# %% [markdown]
+# ### Term Search
+#
+
+# %%
+search_term = "uluwatu"
+
+# Print the ten topics most similar to the search term with their similarity.
+similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
+for topic_id, similarity in zip(similar_topics, similarities):
+    # Look the label up outside the f-string: reusing double quotes inside a
+    # double-quoted f-string is a SyntaxError before Python 3.12.
+    custom_name = topic_model.get_topic_info(topic_id)["CustomName"][0]
+    print(f"{str(similarity)[:5]} {custom_name}")
+
+# %% [markdown]
+# ### Topic Hierarchy
+#
+
+# %%
+topic_model.visualize_hierarchy(custom_labels=True)
+
+# %% [markdown]
+# ### Intertopic Distance Map
+#
+
+# %%
+topic_model.visualize_topics()
+
+# %% [markdown]
+# ### Topic Word Scores
+#
+
+# %%
+topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
--- a/bertopic/nb_bertopic_lowprep.py
+++ b/bertopic/nb_bertopic_lowprep.py
@@ -0,0 +1,585 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.18.0
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Topic Detection: Bali Tourist Reviews
+#
+
+# %% [markdown]
+# ## Preparation
+#
+# ### Dependency Loading
+#
+
+# %%
+from bertopic import BERTopic
+from bertopic.representation import KeyBERTInspired
+from bertopic.vectorizers import ClassTfidfTransformer
+from gensim.models.coherencemodel import CoherenceModel
+from hdbscan import HDBSCAN
+from nltk.corpus import stopwords
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from umap import UMAP
+import gensim.corpora as corpora
+import nltk
+import numpy as np
+import pandas as pd
+import re
+import spacy
+import pickle
+
+nlp = spacy.load("en_core_web_sm")
+
+nltk.download("stopwords")
+nltk.download("punkt")
+nltk.download("wordnet")
+
+# %% [markdown]
+# ### Parameters and Tracking
+#
+
+# %%
+RECREATE_MODEL = True
+RECREATE_REDUCED_MODEL = True
+PROCESS_DATA = False
+REDUCE_OUTLIERS = False
+
+# Data Sample Size, -1 for all data
+DATA_SAMPLE_SIZE = -1
+
+# Vectorization
+MIN_DOCUMENT_FREQUENCY = 1
+MAX_NGRAM = 3
+
+# HDBSCAN Parameters
+MIN_TOPIC_SIZE = 200
+MIN_SAMPLES = 25
+
+# UMAP Parameters
+N_NEIGHBORS = 15
+N_COMPONENTS = 2
+MIN_DIST = 0.01
+
+# Topic Modeling
+TOP_N_WORDS = 10
+MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip
+
+tracking = {
+    "input": {
+        "min_document_frequency": MIN_DOCUMENT_FREQUENCY,
+        "max_ngram": MAX_NGRAM,
+        "min_topic_size": MIN_TOPIC_SIZE,
+        "min_samples": MIN_SAMPLES,
+        "n_neighbors": N_NEIGHBORS,
+        "n_components": N_COMPONENTS,
+        "min_dist": MIN_DIST,
+        "top_n_words": TOP_N_WORDS,
+        "max_topics": MAX_TOPICS,
+    },
+}
+
+# %% [markdown]
+# ### Data Loading & Preprocessing
+#
+
+# %%
+if DATA_SAMPLE_SIZE == -1:
+    reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
+else:
+    reviews = (
+        pd.read_csv("../data/original/reviews.tab", sep="\t")
+        .sample(n=DATA_SAMPLE_SIZE)
+        .review.to_list()
+    )
+
+print("Loaded {} reviews".format(len(reviews)))
+
+# %%
+# Literal substring replacements. The raw-string keys are literal character
+# sequences (e.g. a backslash followed by "n"), not regex escapes.
+rep = {
+    r"\\n": " ",
+    r"\n": " ",
+    r'\\"': "",
+    r'"': "",
+    "mongkey": "monkey",
+    "monky": "monkey",
+    "verry": "very",
+}
+# Keys are stored regex-escaped because preprocess() looks a match up via
+# rep[re.escape(match_text)].
+rep = {re.escape(k): v for k, v in rep.items()}
+pattern = re.compile("|".join(rep.keys()))
+
+# These two are real regular expressions and therefore cannot live in the
+# escaped table above: there r"\s+" only matched the literal text "\s+", and
+# "bali" was also removed from inside longer words such as "balinese".
+_BALI_RE = re.compile(r"\bbali\b")
+_WHITESPACE_RE = re.compile(r"\s+")
+
+
+def preprocess(text):
+    """Lower-case a review, apply the replacement table and tidy whitespace.
+
+    The standalone word "bali" is removed, and every run of whitespace
+    (including real line breaks) collapses to a single space.
+    """
+    text = text.lower()
+    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
+    text = _BALI_RE.sub("", text)
+    text = _WHITESPACE_RE.sub(" ", text)
+    return text.strip()
+
+
+# %%
+print(
+    preprocess(
+        "Excellent. Definitely worth coming while in bali. Food and people were very nice.\n🌟 🤩 ⭐️ \nTrisna was our host"
+    )
+)
+
+# %%
+if PROCESS_DATA:
+    print("Processing reviews...")
+    reviews = [preprocess(review) for review in reviews]
+
+    with open("../data/intermediate/processed_texts_lowprep.pkl", "wb") as f:
+        pickle.dump(reviews, f)
+else:
+    with open("../data/intermediate/processed_texts_lowprep.pkl", "rb") as f:
+        reviews = pickle.load(f)
+
+print(reviews[:1])
+
+# %% [markdown]
+# ### Pre-calculate Embeddings
+#
+
+# %%
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+embeddings = embedding_model.encode(reviews, show_progress_bar=True)
+
+# %% [markdown]
+# ## Model Creation
+#
+
+# %% [markdown]
+# ### Dimensionality Reduction (UMAP)
+#
+
+# %%
+umap_model = UMAP(
+    n_neighbors=N_NEIGHBORS,
+    n_components=N_COMPONENTS,
+    min_dist=MIN_DIST,
+    metric="cosine",
+    low_memory=True,
+    random_state=42,
+)
+reduced_embeddings = umap_model.fit_transform(embeddings)
+
+# %% [markdown]
+# ### BERTopic Model Creation
+#
+
+# %%
+if RECREATE_MODEL:
+    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
+    vectorizer_model = CountVectorizer(
+        min_df=MIN_DOCUMENT_FREQUENCY,
+        ngram_range=(1, MAX_NGRAM),
+        stop_words=stopwords.words("english"),
+    )
+
+    representation_model = KeyBERTInspired()
+    hdbscan_model = HDBSCAN(
+        min_cluster_size=MIN_TOPIC_SIZE,
+        min_samples=MIN_SAMPLES,
+        metric="euclidean",
+        cluster_selection_method="eom",
+        gen_min_span_tree=True,
+        prediction_data=True,
+    )
+
+    topic_model = BERTopic(
+        embedding_model=embedding_model,
+        ctfidf_model=ctfidf_model,
+        vectorizer_model=vectorizer_model,
+        umap_model=umap_model,
+        hdbscan_model=hdbscan_model,
+        representation_model=representation_model,
+        verbose=True,
+        calculate_probabilities=True,
+        language="english",
+        top_n_words=TOP_N_WORDS,
+        nr_topics=MAX_TOPICS,
+    )
+
+    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)
+
+    topic_labels = topic_model.generate_topic_labels(
+        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
+    )
+    topic_model.set_topic_labels(topic_labels)
+    # BERTopic.save(topic_model, "bertopic/model.bertopic")
+else:
+    print("Nevermind, loading existing model")
+    # topic_model = BERTopic.load("bertopic/model.bertopic")
+
+# %% [markdown]
+# ## Fine Tuning
+#
+# ### Topic Condensation
+#
+
+# %%
+if RECREATE_REDUCED_MODEL:
+    # Repeatedly merge the first pair of topics whose embeddings are more
+    # similar than the threshold. The matrix is rebuilt after every merge, so
+    # no index can run past the shrunken matrix and no IndexError needs to be
+    # swallowed.
+    SIMILARITY_THRESHOLD = 0.9
+    iteration = 1
+    while True:
+        print(f"Iteration {iteration}")
+        iteration += 1
+        # Row 0 is dropped on the assumption that it is the outlier topic
+        # (-1), so that row i corresponds to topic i -- TODO confirm for
+        # models without outliers.
+        topic_vectors = np.array(topic_model.topic_embeddings_)[1:, :]
+        if topic_vectors.shape[0] < 2:
+            print("No more topics to merge.")
+            break
+
+        similarity_matrix = cosine_similarity(topic_vectors)
+        # argwhere() walks the upper triangle row by row, i.e. in the same
+        # order as a nested "for i / for j > i" scan.
+        candidates = np.argwhere(
+            np.triu(similarity_matrix, k=1) > SIMILARITY_THRESHOLD
+        )
+        if len(candidates) == 0:
+            print("No more topics to merge.")
+            break
+
+        t1, t2 = (int(idx) for idx in candidates[0])
+        sim = similarity_matrix[t1, t2]
+        try:
+            t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
+            t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
+            print(
+                f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
+            )
+            topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
+
+            topic_labels = topic_model.generate_topic_labels(
+                nr_words=3,
+                topic_prefix=True,
+                word_length=15,
+                separator=" - ",
+            )
+            topic_model.set_topic_labels(topic_labels)
+        except Exception as e:
+            # The same pair would be selected again on the next pass, so
+            # stop here instead of retrying forever.
+            print(f"Failed to merge {t1} and {t2}: {e}")
+            break
+else:
+    print("Skipping topic reduction")
+
+# %% [markdown]
+# ### Outlier Reduction
+#
+
+# %%
+if REDUCE_OUTLIERS:
+    # Reassign outlier documents to the topic they most probably belong to.
+    new_topics = topic_model.reduce_outliers(
+        reviews,
+        topic_model.topics_,
+        probabilities=topic_model.probabilities_,
+        threshold=0.05,
+        strategy="probabilities",
+    )
+    # update_topics() falls back to default vectorizer / c-TF-IDF /
+    # representation models when none are passed, which would discard the
+    # configuration the model was built with. Hand the current ones back in.
+    topic_model.update_topics(
+        reviews,
+        topics=new_topics,
+        top_n_words=topic_model.top_n_words,
+        vectorizer_model=topic_model.vectorizer_model,
+        ctfidf_model=topic_model.ctfidf_model,
+        representation_model=topic_model.representation_model,
+    )
+    # Topic words were recomputed, so refresh the custom labels the same way
+    # the other cells do after changing topics.
+    topic_labels = topic_model.generate_topic_labels(
+        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
+    )
+    topic_model.set_topic_labels(topic_labels)
+
+# %% [markdown]
+# ## Results
+#
+# ### Classification
+#
+
+# %%
+from pathlib import Path
+import random
+
+# --- config ---
+topics_to_keep = {2, 4, 5, 9, 22, 26}
+INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
+OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
+OUTPUT_DIR = Path("../raft/corpus")
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+BATCH_SIZE = 60
+MIN_CHARS = 40
+SEED = 42
+
+# --- load data ---
+data = pd.read_csv(INPUT_PATH, sep="\t")
+
+# Reload the untouched review texts. Fill missing values *before* casting:
+# astype(str) first turns NaN into the string "nan" and fillna then has
+# nothing left to fill. Rows are kept (not dropped) because the loading cell
+# of this notebook keeps them as well.
+reviews = data["review"].fillna("").astype(str)
+
+# Fail with a clear message when the texts do not line up with the model,
+# e.g. because it was fitted on a sample.
+if len(reviews) != len(topic_model.topics_):
+    raise ValueError(
+        f"{len(reviews)} reviews loaded but the model holds "
+        f"{len(topic_model.topics_)} topic assignments"
+    )
+
+# Topic model document info
+df = topic_model.get_document_info(reviews)  # assumes your model is already fitted
+df["Original"] = reviews.values
+
+# --- filter by topics and length ---
+filtered = df[df["Topic"].isin(topics_to_keep)].copy()
+filtered["Original"] = filtered["Original"].str.strip()
+filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
+
+# Save an audit CSV
+filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
+
+# --- deterministic shuffle + write batched corpus files ---
+total_files = 0
+total_reviews = 0
+rng = random.Random(SEED)
+
+for topic_val, g in filtered.groupby("Topic", sort=True):
+    reviews_list = g["Original"].tolist()
+
+    # deterministic shuffle within topic
+    rng.shuffle(reviews_list)
+
+    # chunk into batches of up to BATCH_SIZE; range() never yields an empty
+    # chunk, so no emptiness check is needed
+    for start in range(0, len(reviews_list), BATCH_SIZE):
+        chunk = reviews_list[start : start + BATCH_SIZE]
+
+        # simple header for traceability
+        header = (
+            f"[TOPIC] {topic_val}\n" + f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
+        )
+
+        lines = [header, ""]
+        for i, txt in enumerate(chunk, 1):
+            lines.append(f"({i}) {txt}")
+
+        part_idx = start // BATCH_SIZE + 1
+        fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
+        (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
+
+        total_files += 1
+        total_reviews += len(chunk)
+
+# Plain messages: the builtin print() does not interpret "[green]" markup and
+# would output the tags literally.
+print(f"Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}")
+print(f"Filtered CSV saved to {OUTPUT_CSV}")
+
+# %%
+doc_topic_matrix = probs
+
+# column names
+topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]
+
+# index names
+docnames = ["Review " + str(i) for i in range(len(reviews))]
+
+# Make the pandas dataframe
+df_document_topic = pd.DataFrame(
+    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
+)
+
+# Get dominant topic for each document
+dominant_topic = np.argmax(doc_topic_matrix, axis=1)
+df_document_topic["dominant_topic"] = dominant_topic
+
+
+# Styling
+def color_stuff(val):
+    if val > 0.1:
+        color = "green"
+    elif val > 0.05:
+        color = "orange"
+    else:
+        color = "grey"
+    return "color: {col}".format(col=color)
+
+
+def make_bold(val):
+    weight = 700 if val > 0.1 else 400
+    return "font-weight: {weight}".format(weight=weight)
+
+
+# Apply Style
+df_document_topics = (
+    df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
+)
+df_document_topics
+
+# %% [markdown]
+# ### Document Visualization
+#
+
+# %%
+# Plot the reviews using `reduced_embeddings` (defined outside this cell),
+# labelling topics with their custom labels and hiding annotations.
+vis = topic_model.visualize_documents(
+    docs=reviews,
+    reduced_embeddings=reduced_embeddings,
+    custom_labels=True,
+    hide_annotations=True,
+)
+# Save a standalone HTML copy. The path is relative to the working directory;
+# NOTE(review): assumes an `output/` directory already exists there.
+vis.write_html("output/visualization.html")
+# Bare expression: shown as the cell output when run interactively.
+vis
+
+# %% [markdown]
+# ### Similarity Matrix
+#
+
+# %%
+topic_model.visualize_heatmap()
+
+# %% [markdown]
+# ### Topic Info
+#
+
+# %%
+topic_model.get_topic_info()
+
+# %% [markdown]
+# ### Semantic Coherence
+#
+
+# %%
+# Collect the representative words of each topic.
+topic_words = []
+# Topic ids 0..N-2 are visited, where N is the number of rows in the topic-info
+# table. NOTE(review): the "- 1" presumably skips an outlier row (topic -1) —
+# confirm the fitted model actually has one, otherwise the last topic is missed.
+for topic_id in range(len(topic_model.get_topic_info()) - 1):
+    words = [word for word, _ in topic_model.get_topic(topic_id)]
+    topic_words.append(words)
+
+# Compute mean pairwise cosine similarity for each topic
+coherence_scores = []
+for words in topic_words:
+    coherence_embeddings = embedding_model.encode(words)
+    sim_matrix = cosine_similarity(coherence_embeddings)
+    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
+    # NOTE(review): the mean still divides by all n*n entries, so the zeroed
+    # diagonal scales the score by (n - 1) / n compared with a mean taken over
+    # off-diagonal pairs only.
+    mean_sim = np.mean(sim_matrix)
+    coherence_scores.append(mean_sim)
+
+# Average the per-topic scores into a single figure for the whole model.
+overall_coherence = np.mean(coherence_scores)
+
+print(len(reviews), "reviews processed")
+print(len(topic_model.get_topic_info()) - 1, "topics found")
+print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
+
+# %% [markdown]
+# ### Topic Coherence
+#
+
+# %%
+# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
+
+# This will most likely crash your PC
+# The whole computation is gated behind this flag, which is off by default.
+this_will_crash_your_pc_are_you_sure = False
+if this_will_crash_your_pc_are_you_sure:
+    # Preprocess Documents
+    documents = pd.DataFrame(
+        {"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
+    )
+    # Concatenate all documents of a topic into one space-separated string.
+    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
+        {"Document": " ".join}
+    )
+    # NOTE(review): `_preprocess_text` is a private BERTopic method (leading
+    # underscore) and may change between releases.
+    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
+
+    # Extract vectorizer and analyzer from BERTopic
+    vectorizer = topic_model.vectorizer_model
+    analyzer = vectorizer.build_analyzer()
+
+    # Extract features for Topic Coherence evaluation
+    # NOTE(review): `words` is not read again in this cell; the comprehension
+    # below uses its own loop variable of the same name.
+    words = vectorizer.get_feature_names_out()
+    tokens = [analyzer(doc) for doc in cleaned_docs]
+    # NOTE(review): `corpora` and `CoherenceModel` are not imported in this
+    # cell — presumably gensim's; confirm they are imported elsewhere.
+    dictionary = corpora.Dictionary(tokens)
+    corpus = [dictionary.doc2bow(token) for token in tokens]
+    # Word lists for topic ids 0..len(set(topics))-2; the "- 1" assumes the
+    # outlier topic (-1) occurs among the assignments.
+    topic_words = [
+        [words for words, _ in topic_model.get_topic(topic)]
+        for topic in range(len(set(topics)) - 1)
+    ]
+
+    # %env TOKENIZERS_PARALLELISM=false
+
+    # Print one coherence score per measure, all computed from the same
+    # tokens, corpus and dictionary.
+    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
+        coherence_model = CoherenceModel(
+            topics=topic_words,
+            texts=tokens,
+            corpus=corpus,
+            dictionary=dictionary,
+            coherence=measurement,
+        )
+        coherence_score = coherence_model.get_coherence()
+        print(f"Coherence ({measurement}): {coherence_score:.4f}")
+
+# %% [markdown]
+# ### Term Search
+#
+
+# %%
+search_term = "uluwatu"
+
+similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
+for i in range(len(similar_topics)):
+    # \n{topic_model.get_topic(similar_topics[i])}\n
+    print(
+        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}"
+    )
+
+# %% [markdown]
+# ### Topic Hierarchy
+#
+
+# %%
+topic_model.visualize_hierarchy(custom_labels=True)
+
+# %% [markdown]
+# ### Intertopic Distance Map
+#
+
+# %%
+topic_model.visualize_topics(use_ctfidf=True)
+
+# %% [markdown]
+# ### Topic Word Scores
+#
+
+# %%
+topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
+
+# %%
+# from matplotlib import pyplot as plt
+# from sklearn.manifold import TSNE
+
+
+# topics = topic_model.topics_
+
+# # Reduce dimensionality with TSNE
+# tsne = TSNE(n_components=2, random_state=42)
+# embeddings_2d = tsne.fit_transform(embeddings)
+
+# # Prepare colors (assign a color to each topic)
+# unique_topics = set(topics)
+# colors = plt.get_cmap("tab20", len(unique_topics))
+
+# # Plot
+# plt.figure(figsize=(12, 8))
+# for topic in unique_topics:
+#     # Select indices for the current topic
+#     indices = [i for i, t in enumerate(topics) if t == topic]
+
+#     # Get 2D points for these indices
+#     x = embeddings_2d[indices, 0]
+#     y = embeddings_2d[indices, 1]
+
+#     # Assign label (exclude outliers)
+#     label = f"Topic {topic}" if topic != -1 else "Outliers"
+
+#     # Plot with color
+#     plt.scatter(x, y, color=colors(topic + 1), label=label, alpha=0.5)
+
+# plt.title("Topic Clusters in 2D Embedding Space")
+# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
+# plt.tight_layout()
+
+# # Save the plot
+# plt.savefig("topic_clusters.png", dpi=300, bbox_inches="tight")
+# plt.show()
--- a/bertopic/output/heatmap.html
+++ b/bertopic/output/heatmap.html
--- a/bertopic/output/map.html
+++ b/bertopic/output/map.html
--- a/bertopic/output/visualization.html
+++ b/bertopic/output/visualization.html
--- a/bertopic/requirements.txt
+++ b/bertopic/requirements.txt
@@ -0,0 +1,132 @@
+annotated-types==0.7.0
+anyio==4.9.0
+asttokens==3.0.0
+attrs==25.3.0
+bertopic==0.17.0
+Brotli==1.1.0
+certifi==2025.4.26
+charset-normalizer==3.4.2
+click==8.2.1
+comm==0.2.2
+contourpy==1.3.2
+cssselect==1.3.0
+cycler==0.12.1
+debugpy==1.8.14
+decorator==5.2.1
+distro==1.9.0
+dotenv==0.9.9
+executing==2.2.0
+fastjsonschema==2.21.1
+filelock==3.18.0
+fonttools==4.58.0
+fsspec==2025.5.1
+gensim==4.3.3
+h11==0.16.0
+h2==4.2.0
+hdbscan==0.8.40
+hf-xet==1.1.2
+hpack==4.1.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.32.2
+hyperframe==6.1.0
+idna==3.10
+ipykernel==6.29.5
+ipython==9.3.0
+ipython_pygments_lexers==1.1.1
+jedi==0.19.2
+Jinja2==3.1.6
+jiter==0.10.0
+jmespath==1.0.1
+joblib==1.5.1
+jsonschema==4.24.0
+jsonschema-specifications==2025.4.1
+jupyter_client==8.6.3
+jupyter_core==5.8.1
+kaleido==0.2.1
+kiwisolver==1.4.8
+llvmlite==0.44.0
+lxml==5.4.0
+MarkupSafe==3.0.2
+matplotlib==3.10.3
+matplotlib-inline==0.1.7
+mpmath==1.3.0
+narwhals==1.41.0
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.2
+nltk==3.9.1
+numba==0.61.2
+numpy==1.26.4
+nvidia-cublas-cu12==12.6.4.1
+nvidia-cuda-cupti-cu12==12.6.80
+nvidia-cuda-nvrtc-cu12==12.6.77
+nvidia-cuda-runtime-cu12==12.6.77
+nvidia-cudnn-cu12==9.5.1.17
+nvidia-cufft-cu12==11.3.0.4
+nvidia-cufile-cu12==1.11.1.6
+nvidia-curand-cu12==10.3.7.77
+nvidia-cusolver-cu12==11.7.1.2
+nvidia-cusparse-cu12==12.5.4.2
+nvidia-cusparselt-cu12==0.6.3
+nvidia-nccl-cu12==2.26.2
+nvidia-nvjitlink-cu12==12.6.85
+nvidia-nvtx-cu12==12.6.77
+openai==1.82.0
+packaging==25.0
+pandas==2.2.3
+parsel==1.10.0
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.2.1
+platformdirs==4.3.8
+plotly==6.1.2
+prompt_toolkit==3.0.51
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pydantic==2.11.5
+pydantic_core==2.33.2
+Pygments==2.19.1
+pynndescent==0.5.13
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.0
+pytz==2025.2
+PyYAML==6.0.2
+pyzmq==26.4.0
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rpds-py==0.25.1
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.13.1
+seaborn==0.13.2
+sentence-transformers==4.1.0
+setuptools==80.9.0
+six==1.17.0
+smart-open==7.1.0
+sniffio==1.3.1
+stack-data==0.6.3
+sympy==1.14.0
+threadpoolctl==3.6.0
+tokenizers==0.21.1
+torch==2.7.0
+tornado==6.5.1
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.52.3
+triton==3.3.0
+typing-inspection==0.4.1
+typing_extensions==4.13.2
+tzdata==2025.2
+umap-learn==0.5.7
+urllib3==2.4.0
+w3lib==2.3.1
+wcwidth==0.2.13
+wrapt==1.17.2
+
+spacy
+nbconvert
+jupytext
--- a/bertopic/visualization.html
+++ b/bertopic/visualization.html