BERTopic cleanup

2026-06-22 07:13:08 +02:00 · 2026-02-08 22:43:53 +01:00
parent b2da597b18
commit c98a1d0c6e
8 changed files with 1400 additions and 61 deletions
@@ -0,0 +1,14 @@
 # Masterthesis, praktischer Anteil
 ## Jupyter Notebooks "rehydrieren"
 Damit keine unnötigen Jupyter-Outputs etc. in der Versionsverwaltung landen, gibt es das Skript `convert_jupytext.sh`, welches nur den notwendigen Quelltext in ein `.py`-File schreibt. Mit demselben Skript kann dieser Schritt auch umgekehrt werden, also aus dem Python-File wieder ein Jupyter Notebook erzeugt werden.
 Das Skript sollte also immer vor dem Committen von Änderungen mit `py` als erstem Argument ausgeführt werden.
 Verwendung:
 ```bash
 ./convert_jupytext.sh py # Jupyter Notebook -> Python
 ./convert_jupytext.sh nb # Python -> Jupyter Notebook
 ```
@@ -3,6 +3,8 @@ import traceback
 import numpy as np
 import pandas as pd
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 from hdbscan import HDBSCAN
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
@@ -12,55 +14,50 @@ from sklearn.model_selection import ParameterGrid
 from umap import UMAP
 from bertopic import BERTopic
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 param_grid = {
-    "nr_topics": [45, 50, 55],
+    "n_gram_max": [2, 3],  # Vectorization
-    "min_topic_size": [30, 40, 50],
+    "min_document_frequency": [1],  # Vectorization
-    "n_gram_max": [3],
+    "min_samples": [10, 25],  # HDBSCAN
-    "min_document_frequency": [1, 2],
+    "min_topic_size": [10, 20, 30, 40, 50],  # HDBSCAN
-    "n_neighbors": [15],
+    "n_neighbors": [15],  # UMAP
-    "n_components": [2],
+    "n_components": [2, 5],  # UMAP
-    "min_dist": [0.1],
+    "min_dist": [0.01, 0.1],  # UMAP
-    "top_n_words": [10],
+    "nr_topics": ["auto"],  # Topic Modeling
    "top_n_words": [10, 13, 15, 17, 20],  # Topic Modeling
 }
-def calculate_metrics(topic_model, embedder, top_n_words=5):
+def calculate_metrics(topic_model, embedder, top_n_words=10):
    # Get topic words
    topic_words = []
    for topic_id in range(len(topic_model.get_topic_info()) - 1):
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words[:top_n_words])
    # Pre-compute embeddings for all unique words
    all_words = list(set(word for words in topic_words for word in words))
    word_embeddings = embedder.encode(all_words)
    embedding_map = {word: emb for word, emb in zip(all_words, word_embeddings)}
    # Coherence
    coherence_scores = []
    for words in topic_words:
-        embeddings = embedder.encode(words)
+        embeddings = np.array([embedding_map[word] for word in words])
        sim_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(sim_matrix, 0)
-        coherence_scores.append(np.mean(sim_matrix))
+        mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
        coherence_scores.append(mean_sim)
    overall_coherence = np.mean(coherence_scores)
    # Diversity
    all_topic_words = [word for topic in topic_words for word in topic]
    diversity = len(set(all_topic_words)) / len(all_topic_words)
    # Inter-topic distance
    topic_embeddings = [
        np.mean(embedder.encode(words), axis=0) for words in topic_words
    ]
    topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
    avg_distance = np.mean(topic_distance[np.triu_indices_from(topic_distance, k=1)])
    res = {
        "coherence": float(str(overall_coherence)[:6]),
        "diversity": float(str(diversity)[:6]),
-        "inter_topic_distance": float(str(avg_distance)[:6]),
+        "combined_score": float(str(0.7 * overall_coherence + 0.3 * diversity)[:6]),
        "combined_score": float(
            str(0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance)[:6]
        ),
    }
    print(res)
    return res
@@ -85,6 +82,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
    print(f"Total parameter combinations: {len(param_list)}")
    for params in param_list:
        print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...")
        try:
            print(f"Testing params: {params}")
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
@@ -143,18 +141,27 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
            traceback.print_exc()
            continue
-    return best_model, best_params, best_score, history
+    with open("output/autotune.json", "w") as f:
        json.dump(history, f, indent=2)
    return best_model, best_params, best_score
 SPECIAL_CHARS = ["\n", "\\n"]
 MIN_REVIEW_WORDS = 5
-reviews = pd.read_csv("data.tab", sep="\t").review.to_list()
+print("Loading reviews...")
 reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
 print("Running light preprocessing...")
 for schar in SPECIAL_CHARS:
    reviews = [
        review.replace(schar, " ") if isinstance(review, str) else review
        for review in reviews
    ]
 print("Filtering short reviews...")
 reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
 print("Staring auto-tuning...")
 print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
@@ -2,12 +2,12 @@ import json
 import matplotlib.pyplot as plt
-with open("history.json", "r") as f:
+with open("output/autotune.json", "r") as f:
    history = json.load(f)
-history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)
+history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False)
-with open("history_sorted.json", "w") as f:
+with open("output/autotune_sorted.json", "w") as f:
    json.dump(history, f, indent=2)
@@ -23,7 +23,15 @@
 #
 # %%
-from bertopic import BERTopic
+import json
 import pickle
 import re
 import gensim.corpora as corpora
 import nltk
 import numpy as np
 import pandas as pd
 import spacy
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 from gensim.models.coherencemodel import CoherenceModel
@@ -34,14 +42,8 @@ from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from umap import UMAP
-import gensim.corpora as corpora
+
-import json
+from bertopic import BERTopic
 import nltk
 import numpy as np
 import pandas as pd
 import re
 import spacy
 import pickle
 nlp = spacy.load("en_core_web_sm")
@@ -323,8 +325,8 @@ if REDUCE_OUTLIERS:
 #
 # %%
 from pathlib import Path
 import random
 from pathlib import Path
 # --- config ---
 topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
@@ -468,7 +470,11 @@ topic_model.get_topic_info()
 # %%
 topic_words = []
-for topic_id in range(len(topic_model.get_topic_info()) - 1):
+for topic_id in topic_model.get_topic_info()["Topic"]:
    # Skip outlier topic
    if topic_id < 0:
        continue
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)
@@ -477,8 +483,10 @@ coherence_scores = []
 for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
-    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
+
-    mean_sim = np.mean(sim_matrix)
+    # Ignore self-similarity
    np.fill_diagonal(sim_matrix, 0)
    mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
    coherence_scores.append(mean_sim)
 overall_coherence = np.mean(coherence_scores)
@@ -23,7 +23,14 @@
 #
 # %%
-from bertopic import BERTopic
+import pickle
 import re
 import gensim.corpora as corpora
 import nltk
 import numpy as np
 import pandas as pd
 import spacy
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 from gensim.models.coherencemodel import CoherenceModel
@@ -33,13 +40,8 @@ from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from umap import UMAP
-import gensim.corpora as corpora
+
-import nltk
+from bertopic import BERTopic
 import numpy as np
 import pandas as pd
 import re
 import spacy
 import pickle
 nlp = spacy.load("en_core_web_sm")
@@ -300,8 +302,8 @@ if REDUCE_OUTLIERS:
 #
 # %%
 from pathlib import Path
 import random
 from pathlib import Path
 # --- config ---
 topics_to_keep = {2, 4, 5, 9, 22, 26}
@@ -445,7 +447,11 @@ topic_model.get_topic_info()
 # %%
 topic_words = []
-for topic_id in range(len(topic_model.get_topic_info()) - 1):
+for topic_id in topic_model.get_topic_info()["Topic"]:
    # Skip outlier topic
    if topic_id < 0:
        continue
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)
@@ -454,8 +460,10 @@ coherence_scores = []
 for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
-    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
+
-    mean_sim = np.mean(sim_matrix)
+    # Ignore self-similarity
    np.fill_diagonal(sim_matrix, 0)
    mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
    coherence_scores.append(mean_sim)
 overall_coherence = np.mean(coherence_scores)
@@ -492,10 +500,14 @@ if this_will_crash_your_pc_are_you_sure:
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
-    topic_words = [
+
-        [words for words, _ in topic_model.get_topic(topic)]
+    for topic_id in topic_model.get_topic_info()["Topic"]:
-        for topic in range(len(set(topics)) - 1)
+        # Skip outlier topic
-    ]
+        if topic_id < 0:
            continue
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words)
    # %env TOKENIZERS_PARALLELISM=false