BERTopic cleanup

This commit is contained in:
2026-02-08 22:43:53 +01:00
parent b2da597b18
commit c98a1d0c6e
8 changed files with 1400 additions and 61 deletions

14
README.md Normal file
View File

@@ -0,0 +1,14 @@
# Masterthesis, praktischer Anteil
## Jupyter Notebooks "rehydrieren"
Damit keine unnötigen Jupyter-Outputs etc. im Versionsmanagement landen, gibt es das Skript `convert_jupytext.sh`, welches nur den notwendigen Quelltext in ein `.py`-File schreibt. Mit demselben Skript kann dieser Schritt wieder umgekehrt werden, also ein Jupyter Notebook aus dem Python-File erzeugt werden.
Das Skript sollte also immer vor dem Committen von Änderungen mit `py` als erstem Argument ausgeführt werden.
Verwendung:
```bash
./convert_jupytext.sh py # Jupyter Notebook -> Python
./convert_jupytext.sh nb # Python -> Jupyter Notebook
```

View File

@@ -3,6 +3,8 @@ import traceback
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
@@ -12,55 +14,50 @@ from sklearn.model_selection import ParameterGrid
from umap import UMAP from umap import UMAP
from bertopic import BERTopic from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
param_grid = { param_grid = {
"nr_topics": [45, 50, 55], "n_gram_max": [2, 3], # Vectorization
"min_topic_size": [30, 40, 50], "min_document_frequency": [1], # Vectorization
"n_gram_max": [3], "min_samples": [10, 25], # HDBSCAN
"min_document_frequency": [1, 2], "min_topic_size": [10, 20, 30, 40, 50], # HDBSCAN
"n_neighbors": [15], "n_neighbors": [15], # UMAP
"n_components": [2], "n_components": [2, 5], # UMAP
"min_dist": [0.1], "min_dist": [0.01, 0.1], # UMAP
"top_n_words": [10], "nr_topics": ["auto"], # Topic Modeling
"top_n_words": [10, 13, 15, 17, 20], # Topic Modeling
} }
def calculate_metrics(topic_model, embedder, top_n_words=10):
    """Score a fitted topic model by embedding coherence and word diversity.

    Args:
        topic_model: Object exposing ``get_topic_info()`` (indexable by
            ``"Topic"`` to yield the topic ids) and ``get_topic(topic_id)``
            (yielding ``(word, weight)`` pairs).
        embedder: Object whose ``encode(words)`` returns one vector per word,
            in the same order as ``words``.
        top_n_words: Number of leading words per topic that enter the metrics.

    Returns:
        A dict of plain floats rounded to four decimals:
        ``coherence`` (mean pairwise cosine similarity of a topic's words,
        averaged over topics), ``diversity`` (unique words / total words
        across all topics) and ``combined_score``
        (``0.7 * coherence + 0.3 * diversity``, computed before rounding).
        Metrics that cannot be computed (no topics, or no topic with at
        least two words) are reported as ``0.0`` instead of NaN or an error.
    """
    # Collect the top words per topic. Iterate over the actual topic ids and
    # skip negative ones (the outlier topic) instead of assuming that the ids
    # are exactly 0..n-1 and that an outlier row is always present.
    topic_words = []
    for topic_id in topic_model.get_topic_info()["Topic"]:
        if topic_id < 0:
            continue
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words[:top_n_words])

    # Coherence: a topic needs at least two words to have a word pair;
    # a single-word topic would contribute the mean of an empty slice (NaN).
    scorable_topics = [words for words in topic_words if len(words) > 1]
    coherence_scores = []
    if scorable_topics:
        # Encode every distinct word once and reuse the vectors per topic.
        vocabulary = sorted({word for words in scorable_topics for word in words})
        embedding_map = dict(zip(vocabulary, embedder.encode(vocabulary)))
        for words in scorable_topics:
            embeddings = np.array([embedding_map[word] for word in words])
            sim_matrix = cosine_similarity(embeddings)
            # The strict upper triangle holds each unordered word pair exactly
            # once and already excludes the self-similarity diagonal.
            pair_index = np.triu_indices(sim_matrix.shape[0], k=1)
            coherence_scores.append(np.mean(sim_matrix[pair_index]))
    overall_coherence = float(np.mean(coherence_scores)) if coherence_scores else 0.0

    # Diversity: share of distinct words among all topic words.
    all_topic_words = [word for words in topic_words for word in words]
    if all_topic_words:
        diversity = len(set(all_topic_words)) / len(all_topic_words)
    else:
        diversity = 0.0

    # Round numerically; slicing the string form mangles values that are
    # printed in scientific notation (e.g. "5.1234e-05"[:6] -> 5.1234).
    res = {
        "coherence": round(overall_coherence, 4),
        "diversity": round(diversity, 4),
        "combined_score": round(0.7 * overall_coherence + 0.3 * diversity, 4),
    }
    print(res)
    return res
@@ -85,6 +82,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
print(f"Total parameter combinations: {len(param_list)}") print(f"Total parameter combinations: {len(param_list)}")
for params in param_list: for params in param_list:
print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...")
try: try:
print(f"Testing params: {params}") print(f"Testing params: {params}")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
@@ -143,18 +141,27 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
traceback.print_exc() traceback.print_exc()
continue continue
return best_model, best_params, best_score, history with open("output/autotune.json", "w") as f:
json.dump(history, f, indent=2)
return best_model, best_params, best_score
SPECIAL_CHARS = ["\n", "\\n"] SPECIAL_CHARS = ["\n", "\\n"]
MIN_REVIEW_WORDS = 5 MIN_REVIEW_WORDS = 5
reviews = pd.read_csv("data.tab", sep="\t").review.to_list() print("Loading reviews...")
reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
print("Running light preprocessing...")
for schar in SPECIAL_CHARS: for schar in SPECIAL_CHARS:
reviews = [ reviews = [
review.replace(schar, " ") if isinstance(review, str) else review review.replace(schar, " ") if isinstance(review, str) else review
for review in reviews for review in reviews
] ]
print("Filtering short reviews...")
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS] reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
print("Staring auto-tuning...")
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid)) print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))

View File

@@ -2,12 +2,12 @@ import json
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
with open("history.json", "r") as f: with open("output/autotune.json", "r") as f:
history = json.load(f) history = json.load(f)
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True) history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False)
with open("history_sorted.json", "w") as f: with open("output/autotune_sorted.json", "w") as f:
json.dump(history, f, indent=2) json.dump(history, f, indent=2)

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -23,7 +23,15 @@
# #
# %% # %%
from bertopic import BERTopic import json
import pickle
import re
import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
import spacy
from bertopic.representation import KeyBERTInspired from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel from gensim.models.coherencemodel import CoherenceModel
@@ -34,14 +42,8 @@ from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP from umap import UMAP
import gensim.corpora as corpora
import json from bertopic import BERTopic
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import pickle
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
@@ -323,8 +325,8 @@ if REDUCE_OUTLIERS:
# #
# %% # %%
from pathlib import Path
import random import random
from pathlib import Path
# --- config --- # --- config ---
topics_to_keep = {2, 4, 6, 8, 10, 5, 7} topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
@@ -468,7 +470,11 @@ topic_model.get_topic_info()
# %% # %%
topic_words = [] topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1): for topic_id in topic_model.get_topic_info()["Topic"]:
# Skip outlier topic
if topic_id < 0:
continue
words = [word for word, _ in topic_model.get_topic(topic_id)] words = [word for word, _ in topic_model.get_topic(topic_id)]
topic_words.append(words) topic_words.append(words)
@@ -477,8 +483,10 @@ coherence_scores = []
for words in topic_words: for words in topic_words:
coherence_embeddings = embedding_model.encode(words) coherence_embeddings = embedding_model.encode(words)
sim_matrix = cosine_similarity(coherence_embeddings) sim_matrix = cosine_similarity(coherence_embeddings)
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
mean_sim = np.mean(sim_matrix) # Ignore self-similarity
np.fill_diagonal(sim_matrix, 0)
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
coherence_scores.append(mean_sim) coherence_scores.append(mean_sim)
overall_coherence = np.mean(coherence_scores) overall_coherence = np.mean(coherence_scores)
@@ -518,8 +526,8 @@ if CALCULATE_COHERENCE:
for topic in range(len(set(topics)) - 1) for topic in range(len(set(topics)) - 1)
] ]
# %env TOKENIZERS_PARALLELISM=false # %env TOKENIZERS_PARALLELISM=false
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]: for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
coherence_model = CoherenceModel( coherence_model = CoherenceModel(
topics=topic_words, topics=topic_words,

View File

@@ -23,7 +23,14 @@
# #
# %% # %%
from bertopic import BERTopic import pickle
import re
import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
import spacy
from bertopic.representation import KeyBERTInspired from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel from gensim.models.coherencemodel import CoherenceModel
@@ -33,13 +40,8 @@ from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP from umap import UMAP
import gensim.corpora as corpora
import nltk from bertopic import BERTopic
import numpy as np
import pandas as pd
import re
import spacy
import pickle
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
@@ -300,8 +302,8 @@ if REDUCE_OUTLIERS:
# #
# %% # %%
from pathlib import Path
import random import random
from pathlib import Path
# --- config --- # --- config ---
topics_to_keep = {2, 4, 5, 9, 22, 26} topics_to_keep = {2, 4, 5, 9, 22, 26}
@@ -445,7 +447,11 @@ topic_model.get_topic_info()
# %% # %%
topic_words = [] topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1): for topic_id in topic_model.get_topic_info()["Topic"]:
# Skip outlier topic
if topic_id < 0:
continue
words = [word for word, _ in topic_model.get_topic(topic_id)] words = [word for word, _ in topic_model.get_topic(topic_id)]
topic_words.append(words) topic_words.append(words)
@@ -454,8 +460,10 @@ coherence_scores = []
for words in topic_words: for words in topic_words:
coherence_embeddings = embedding_model.encode(words) coherence_embeddings = embedding_model.encode(words)
sim_matrix = cosine_similarity(coherence_embeddings) sim_matrix = cosine_similarity(coherence_embeddings)
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
mean_sim = np.mean(sim_matrix) # Ignore self-similarity
np.fill_diagonal(sim_matrix, 0)
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
coherence_scores.append(mean_sim) coherence_scores.append(mean_sim)
overall_coherence = np.mean(coherence_scores) overall_coherence = np.mean(coherence_scores)
@@ -492,10 +500,14 @@ if this_will_crash_your_pc_are_you_sure:
tokens = [analyzer(doc) for doc in cleaned_docs] tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens) dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens] corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [
[words for words, _ in topic_model.get_topic(topic)] for topic_id in topic_model.get_topic_info()["Topic"]:
for topic in range(len(set(topics)) - 1) # Skip outlier topic
] if topic_id < 0:
continue
words = [word for word, _ in topic_model.get_topic(topic_id)]
topic_words.append(words)
# %env TOKENIZERS_PARALLELISM=false # %env TOKENIZERS_PARALLELISM=false

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long