BERTopic cleanup

2026-03-22 00:12:42 +01:00 · 2026-02-08 22:43:53 +01:00
parent b2da597b18
commit c98a1d0c6e
8 changed files with 1400 additions and 61 deletions
--- a/bertopic/nb_bertopic.py
+++ b/bertopic/nb_bertopic.py
@@ -23,7 +23,15 @@
 #

 # %%
-from bertopic import BERTopic
+import json
+import pickle
+import re
+
+import gensim.corpora as corpora
+import nltk
+import numpy as np
+import pandas as pd
+import spacy
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 from gensim.models.coherencemodel import CoherenceModel
@@ -34,14 +42,8 @@ from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from umap import UMAP
-import gensim.corpora as corpora
-import json
-import nltk
-import numpy as np
-import pandas as pd
-import re
-import spacy
-import pickle
+
+from bertopic import BERTopic

 nlp = spacy.load("en_core_web_sm")

@@ -323,8 +325,8 @@ if REDUCE_OUTLIERS:
 #

 # %%
-from pathlib import Path
 import random
+from pathlib import Path

 # --- config ---
 topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
@@ -468,7 +470,11 @@ topic_model.get_topic_info()

 # %%
 topic_words = []
-for topic_id in range(len(topic_model.get_topic_info()) - 1):
+for topic_id in topic_model.get_topic_info()["Topic"]:
+    # Skip the outlier topic (BERTopic labels outliers with topic id -1)
+    if topic_id < 0:
+        continue
+
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)

@@ -477,8 +483,10 @@ coherence_scores = []
 for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
-    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
-    mean_sim = np.mean(sim_matrix)
+
+    # Ignore self-similarity (the diagonal); the mean below uses only the strict upper triangle
+    np.fill_diagonal(sim_matrix, 0)
+    mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
    coherence_scores.append(mean_sim)

 overall_coherence = np.mean(coherence_scores)
@@ -518,8 +526,8 @@ if CALCULATE_COHERENCE:
        for topic in range(len(set(topics)) - 1)
    ]

-    # %env TOKENIZERS_PARALLELISM=false    
-    
+    # %env TOKENIZERS_PARALLELISM=false
+
    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
        coherence_model = CoherenceModel(
            topics=topic_words,