BERTopic cleanup

2026-02-08 22:43:53 +01:00
parent b2da597b18
commit c98a1d0c6e
8 changed files with 1400 additions and 61 deletions


@@ -23,7 +23,14 @@
 #
 # %%
+from bertopic import BERTopic
+import pickle
+import re
+import gensim.corpora as corpora
+import nltk
+import numpy as np
+import pandas as pd
+import spacy
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 from gensim.models.coherencemodel import CoherenceModel
@@ -33,13 +40,8 @@ from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from umap import UMAP
-import gensim.corpora as corpora
-import nltk
-import numpy as np
-import pandas as pd
-import re
-import spacy
-import pickle
-from bertopic import BERTopic
 
 nlp = spacy.load("en_core_web_sm")
@@ -300,8 +302,8 @@ if REDUCE_OUTLIERS:
 #
 # %%
+from pathlib import Path
 import random
-from pathlib import Path
 
 # --- config ---
 topics_to_keep = {2, 4, 5, 9, 22, 26}
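
Side note on the hunk above: `topics_to_keep` reads like a whitelist of topic IDs that later code filters against. A minimal, hypothetical sketch of that pattern; the `docs` and `topics` toy data below is invented for illustration and is not part of the commit:

import random

# Toy stand-ins for the real corpus and per-document topic assignments.
docs = ["doc a", "doc b", "doc c", "doc d"]
topics = [2, 7, 4, 2]
topics_to_keep = {2, 4, 5, 9, 22, 26}

# Keep only documents whose assigned topic is whitelisted, then sample one
# document per surviving topic for a quick manual sanity check.
kept = [(doc, t) for doc, t in zip(docs, topics) if t in topics_to_keep]
for t in sorted({t for _, t in kept}):
    members = [doc for doc, tt in kept if tt == t]
    print(f"topic {t}: {len(members)} docs, e.g. {random.choice(members)!r}")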
@@ -445,7 +447,11 @@ topic_model.get_topic_info()
 # %%
 topic_words = []
-for topic_id in range(len(topic_model.get_topic_info()) - 1):
+for topic_id in topic_model.get_topic_info()["Topic"]:
+    # Skip outlier topic
+    if topic_id < 0:
+        continue
     words = [word for word, _ in topic_model.get_topic(topic_id)]
     topic_words.append(words)
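
The loop rewrite above removes two assumptions baked into `range(len(topic_model.get_topic_info()) - 1)`: that topic IDs are contiguous and that the table's extra row is the last one. After outlier reduction or topic merging, IDs can have gaps, so iterating the actual `Topic` column and skipping `-1` is the robust form. A self-contained sketch of the failure mode; the mocked table is illustrative only:

import pandas as pd

# Hypothetical get_topic_info() output after merging: IDs -1, 0, 2, 5.
info = pd.DataFrame({"Topic": [-1, 0, 2, 5], "Count": [40, 30, 20, 10]})

# Old style assumes IDs 0..n-2: it requests topic 1 (absent) and misses topic 5.
old_ids = list(range(len(info) - 1))            # [0, 1, 2]
# New style iterates the real IDs, skipping the outlier topic -1.
new_ids = [t for t in info["Topic"] if t >= 0]  # [0, 2, 5]
print(old_ids, new_ids)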
@@ -454,8 +460,10 @@ coherence_scores = []
 for words in topic_words:
     coherence_embeddings = embedding_model.encode(words)
     sim_matrix = cosine_similarity(coherence_embeddings)
-    np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
-    mean_sim = np.mean(sim_matrix)
+    # Ignore self-similarity
+    np.fill_diagonal(sim_matrix, 0)
+    mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
     coherence_scores.append(mean_sim)
 
 overall_coherence = np.mean(coherence_scores)
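
The one-line change from `np.mean(sim_matrix)` to a mean over `np.triu_indices(..., k=1)` matters: averaging the full n x n matrix counts the n zeroed diagonal cells, deflating the score by a factor of (n - 1)/n, while the upper-triangle mean averages exactly the n(n - 1)/2 distinct word pairs. (With `k=1` excluding the diagonal, the retained `np.fill_diagonal` call is now redundant but harmless.) A quick numeric check:

import numpy as np

n = 10
sim = np.full((n, n), 0.8)   # pretend every word pair has similarity 0.8
np.fill_diagonal(sim, 0)

whole_matrix_mean = np.mean(sim)                       # 0.72 = 0.8 * (n-1)/n
pairwise_mean = np.mean(sim[np.triu_indices(n, k=1)])  # 0.8, the true value
assert np.isclose(whole_matrix_mean, 0.8 * (n - 1) / n)
assert np.isclose(pairwise_mean, 0.8)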
@@ -492,10 +500,14 @@ if this_will_crash_your_pc_are_you_sure:
     tokens = [analyzer(doc) for doc in cleaned_docs]
     dictionary = corpora.Dictionary(tokens)
     corpus = [dictionary.doc2bow(token) for token in tokens]
-    topic_words = [
-        [words for words, _ in topic_model.get_topic(topic)]
-        for topic in range(len(set(topics)) - 1)
-    ]
+    topic_words = []
+    for topic_id in topic_model.get_topic_info()["Topic"]:
+        # Skip outlier topic
+        if topic_id < 0:
+            continue
+        words = [word for word, _ in topic_model.get_topic(topic_id)]
+        topic_words.append(words)
 
 # %env TOKENIZERS_PARALLELISM=false
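
The `tokens` / `dictionary` / `corpus` / `topic_words` structures built in the hunk above are the standard inputs to gensim's `CoherenceModel`, which this file imports at the top. A hedged sketch of the call they presumably feed; the `c_v` measure is an assumption, since the diff cuts off before the call itself:

from gensim.models.coherencemodel import CoherenceModel

# Assumed continuation: tokens, dictionary, corpus, topic_words as built above.
coherence_model = CoherenceModel(
    topics=topic_words,     # top words per topic
    texts=tokens,           # tokenized documents
    corpus=corpus,          # bag-of-words corpus
    dictionary=dictionary,  # gensim Dictionary
    coherence="c_v",        # assumption: measure not shown in this hunk
)
print(coherence_model.get_coherence())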