BERTopic cleanup

2026-06-21 23:12:33 +02:00 · 2026-02-08 22:43:53 +01:00
parent b2da597b18
commit c98a1d0c6e
8 changed files with 1400 additions and 61 deletions
@@ -0,0 +1,14 @@
+# Masterthesis, praktischer Anteil
+
+## Jupyter Notebooks "rehydrieren"
+
+Damit keine unnötigen Jupyter Outputs etc. im Versionsmanagement landen, gibt es das Skript `convert_jupytext.sh`, welches nur den notwendigen Quelltext in ein `.py` File schreibt. Mit demselben Skript kann dieser Schritt wieder umgekehrt werden, also ein Jupyter Notebook aus dem Python-File geschrieben werden.
+
+Das Skript sollte also immer vor dem Committen von Änderungen mit `py` als erstes Argument ausgeführt werden.
+
+Verwendung:
+
+```bash
+./convert_jupytext.sh py # Jupyter Notebook -> Python
+./convert_jupytext.sh nb # Python -> Jupyter Notebook
+```
@@ -3,6 +3,8 @@ import traceback

 import numpy as np
 import pandas as pd
+from bertopic.representation import KeyBERTInspired
+from bertopic.vectorizers import ClassTfidfTransformer
 from hdbscan import HDBSCAN
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
@@ -12,55 +14,50 @@ from sklearn.model_selection import ParameterGrid
 from umap import UMAP

 from bertopic import BERTopic
-from bertopic.representation import KeyBERTInspired
-from bertopic.vectorizers import ClassTfidfTransformer

 param_grid = {
-    "nr_topics": [45, 50, 55],
-    "min_topic_size": [30, 40, 50],
-    "n_gram_max": [3],
-    "min_document_frequency": [1, 2],
-    "n_neighbors": [15],
-    "n_components": [2],
-    "min_dist": [0.1],
-    "top_n_words": [10],
+    "n_gram_max": [2, 3],  # Vectorization
+    "min_document_frequency": [1],  # Vectorization
+    "min_samples": [10, 25],  # HDBSCAN
+    "min_topic_size": [10, 20, 30, 40, 50],  # HDBSCAN
+    "n_neighbors": [15],  # UMAP
+    "n_components": [2, 5],  # UMAP
+    "min_dist": [0.01, 0.1],  # UMAP
+    "nr_topics": ["auto"],  # Topic Modeling
+    "top_n_words": [10, 13, 15, 17, 20],  # Topic Modeling
 }


def calculate_metrics(topic_model, embedder, top_n_words=10):
    """Score a fitted topic model by embedding coherence and word diversity.

    Args:
        topic_model: Fitted model. ``get_topic_info()`` must return a table
            with a ``"Topic"`` column and ``get_topic(topic_id)`` must yield
            ``(word, score)`` pairs.
        embedder: Object whose ``encode(words)`` returns one vector per word,
            in input order.
        top_n_words: Number of leading words per topic that are evaluated.

    Returns:
        Dict with the keys ``"coherence"``, ``"diversity"`` and
        ``"combined_score"`` (0.7 * coherence + 0.3 * diversity), each a
        plain ``float`` rounded to four decimals.

    Raises:
        ZeroDivisionError: If the model yields no topic words at all.
    """
    # Collect the leading words of every real topic. Iterating over the
    # "Topic" column (instead of range(len(info) - 1)) keeps the last topic
    # when the model has no outlier row.
    topic_words = []
    for topic_id in topic_model.get_topic_info()["Topic"]:
        # Skip outlier topic
        if topic_id < 0:
            continue
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words[:top_n_words])

    all_topic_words = [word for words in topic_words for word in words]

    # Encode every distinct word only once; topics frequently share words.
    unique_words = sorted(set(all_topic_words))
    embedding_map = dict(zip(unique_words, embedder.encode(unique_words)))

    # Coherence: mean pairwise cosine similarity of the words of one topic,
    # averaged over all topics.
    coherence_scores = []
    for words in topic_words:
        # A topic with fewer than two words has no word pair; averaging the
        # empty upper triangle would produce NaN and poison the final score.
        if len(words) < 2:
            continue
        embeddings = np.array([embedding_map[word] for word in words])
        sim_matrix = cosine_similarity(embeddings)
        # k=1 selects each unordered pair once and leaves out the diagonal
        # (self-similarity), so the diagonal needs no separate zeroing.
        pair_idx = np.triu_indices(sim_matrix.shape[0], k=1)
        coherence_scores.append(np.mean(sim_matrix[pair_idx]))
    overall_coherence = float(np.mean(coherence_scores)) if coherence_scores else 0.0

    # Diversity: share of distinct words among all evaluated topic words.
    diversity = len(set(all_topic_words)) / len(all_topic_words)

    # Round numerically. Cutting the repr string to six characters breaks on
    # scientific notation ("1.2345678e-05"[:6] -> 1.2345).
    res = {
        "coherence": round(overall_coherence, 4),
        "diversity": round(diversity, 4),
        "combined_score": round(0.7 * overall_coherence + 0.3 * diversity, 4),
    }
    print(res)
    return res
@@ -85,6 +82,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):

    print(f"Total parameter combinations: {len(param_list)}")
    for params in param_list:
+        print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...")
        try:
            print(f"Testing params: {params}")
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
@@ -143,18 +141,27 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
            traceback.print_exc()
            continue

-    return best_model, best_params, best_score, history
+    with open("output/autotune.json", "w") as f:
+        json.dump(history, f, indent=2)
+
+    return best_model, best_params, best_score


 SPECIAL_CHARS = ["\n", "\\n"]
 MIN_REVIEW_WORDS = 5

-reviews = pd.read_csv("data.tab", sep="\t").review.to_list()
+print("Loading reviews...")
+reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()

+print("Running light preprocessing...")
 for schar in SPECIAL_CHARS:
    reviews = [
        review.replace(schar, " ") if isinstance(review, str) else review
        for review in reviews
    ]
+
+print("Filtering short reviews...")
 reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
+
+print("Starting auto-tuning...")
 print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
@@ -2,12 +2,12 @@ import json

 import matplotlib.pyplot as plt

-with open("history.json", "r") as f:
+with open("output/autotune.json", "r") as f:
    history = json.load(f)

-history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)
+history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False)

-with open("history_sorted.json", "w") as f:
+with open("output/autotune_sorted.json", "w") as f:
    json.dump(history, f, indent=2)


@@ -23,7 +23,15 @@
 #

 # %%
-from bertopic import BERTopic
+import json
+import pickle
+import re
+
+import gensim.corpora as corpora
+import nltk
+import numpy as np
+import pandas as pd
+import spacy
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 from gensim.models.coherencemodel import CoherenceModel
@@ -34,14 +42,8 @@ from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from umap import UMAP
-import gensim.corpora as corpora
-import json
-import nltk
-import numpy as np
-import pandas as pd
-import re
-import spacy
-import pickle
+
+from bertopic import BERTopic

 nlp = spacy.load("en_core_web_sm")

@@ -323,8 +325,8 @@ if REDUCE_OUTLIERS:
 #

 # %%
-from pathlib import Path
 import random
+from pathlib import Path

 # --- config ---
 topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
@@ -468,7 +470,11 @@ topic_model.get_topic_info()

 # %%
 topic_words = []
-for topic_id in range(len(topic_model.get_topic_info()) - 1):
+for topic_id in topic_model.get_topic_info()["Topic"]:
+    # Skip outlier topic
+    if topic_id < 0:
+        continue
+
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)

@@ -477,8 +483,10 @@ coherence_scores = []
 for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
-    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
-    mean_sim = np.mean(sim_matrix)
+
+    # Ignore self-similarity
+    np.fill_diagonal(sim_matrix, 0)
+    mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
    coherence_scores.append(mean_sim)

 overall_coherence = np.mean(coherence_scores)
@@ -23,7 +23,14 @@
 #

 # %%
-from bertopic import BERTopic
+import pickle
+import re
+
+import gensim.corpora as corpora
+import nltk
+import numpy as np
+import pandas as pd
+import spacy
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 from gensim.models.coherencemodel import CoherenceModel
@@ -33,13 +40,8 @@ from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from umap import UMAP
-import gensim.corpora as corpora
-import nltk
-import numpy as np
-import pandas as pd
-import re
-import spacy
-import pickle
+
+from bertopic import BERTopic

 nlp = spacy.load("en_core_web_sm")

@@ -300,8 +302,8 @@ if REDUCE_OUTLIERS:
 #

 # %%
-from pathlib import Path
 import random
+from pathlib import Path

 # --- config ---
 topics_to_keep = {2, 4, 5, 9, 22, 26}
@@ -445,7 +447,11 @@ topic_model.get_topic_info()

 # %%
 topic_words = []
-for topic_id in range(len(topic_model.get_topic_info()) - 1):
+for topic_id in topic_model.get_topic_info()["Topic"]:
+    # Skip outlier topic
+    if topic_id < 0:
+        continue
+
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)

@@ -454,8 +460,10 @@ coherence_scores = []
 for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
-    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
-    mean_sim = np.mean(sim_matrix)
+
+    # Ignore self-similarity
+    np.fill_diagonal(sim_matrix, 0)
+    mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
    coherence_scores.append(mean_sim)

 overall_coherence = np.mean(coherence_scores)
@@ -492,10 +500,14 @@ if this_will_crash_your_pc_are_you_sure:
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Start from an empty list. A variable of the same name is filled by an
    # identical loop earlier in this notebook, so appending without a reset
    # would list every topic twice (or raise NameError if this cell is run
    # on its own).
    topic_words = []
    for topic_id in topic_model.get_topic_info()["Topic"]:
        # Skip outlier topic
        if topic_id < 0:
            continue

        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words)

    # %env TOKENIZERS_PARALLELISM=false