BERTopic cleanup

This commit is contained in:
2026-02-08 22:43:53 +01:00
parent b2da597b18
commit c98a1d0c6e
8 changed files with 1400 additions and 61 deletions

View File

@@ -3,6 +3,8 @@ import traceback
import numpy as np
import pandas as pd
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
@@ -12,55 +14,50 @@ from sklearn.model_selection import ParameterGrid
from umap import UMAP
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
# Hyper-parameter search space handed to auto_tune_bertopic(); every key maps
# to the list of candidate values to try. Each key appears exactly once: a
# repeated key in a dict literal silently discards the earlier value.
# Fully crossed, the lists below give 2*1*2*5*1*2*2*1*5 = 400 combinations.
param_grid = {
    "n_gram_max": [2, 3],  # Vectorization
    "min_document_frequency": [1],  # Vectorization
    "min_samples": [10, 25],  # HDBSCAN
    "min_topic_size": [10, 20, 30, 40, 50],  # HDBSCAN
    "n_neighbors": [15],  # UMAP
    "n_components": [2, 5],  # UMAP
    "min_dist": [0.01, 0.1],  # UMAP
    "nr_topics": ["auto"],  # Topic Modeling
    "top_n_words": [10, 13, 15, 17, 20],  # Topic Modeling
}
def calculate_metrics(topic_model, embedder, top_n_words=10):
    """Score a fitted topic model on coherence, diversity and topic separation.

    Args:
        topic_model: Fitted model exposing ``get_topic_info()`` and
            ``get_topic(topic_id)``; the latter is iterated as
            ``(word, weight)`` pairs.
        embedder: Object whose ``encode(list_of_str)`` returns one vector per
            input string, in input order.
        top_n_words: How many leading words of each topic are evaluated.

    Returns:
        dict with float values (rounded to 4 decimals) under the keys
        ``"coherence"`` (mean pairwise cosine similarity of a topic's words,
        averaged over topics), ``"diversity"`` (share of unique words among
        all topic words), ``"inter_topic_distance"`` (mean pairwise cosine
        distance between topic centroids) and ``"combined_score"``
        (``0.7 * coherence + 0.3 * diversity``).
    """
    # NOTE(review): topic ids 0 .. n-2 are used, i.e. one row of
    # get_topic_info() is assumed not to be a regular topic (presumably the
    # -1 outlier row) -- confirm this holds when the model has no outliers.
    n_topics = len(topic_model.get_topic_info()) - 1
    topic_words = [
        [word for word, _ in topic_model.get_topic(topic_id)][:top_n_words]
        for topic_id in range(n_topics)
    ]

    # Encode every distinct word once and reuse the vectors for all metrics,
    # instead of calling the embedder again per topic.
    all_words = list({word for words in topic_words for word in words})
    word_embeddings = embedder.encode(all_words)
    embedding_map = dict(zip(all_words, word_embeddings))
    topic_matrices = [
        np.array([embedding_map[word] for word in words]) for words in topic_words
    ]

    # Coherence: one score per topic, taken over the strict upper triangle so
    # that self-similarities and mirrored pairs do not enter the mean.
    coherence_scores = []
    for matrix in topic_matrices:
        if matrix.shape[0] < 2:
            # A pairwise similarity needs at least two words.
            continue
        sim_matrix = cosine_similarity(matrix)
        upper = np.triu_indices(sim_matrix.shape[0], k=1)
        coherence_scores.append(np.mean(sim_matrix[upper]))
    overall_coherence = np.mean(coherence_scores)

    # Diversity
    all_topic_words = [word for words in topic_words for word in words]
    diversity = len(set(all_topic_words)) / len(all_topic_words)

    # Inter-topic distance between the mean word vectors of the topics.
    topic_embeddings = [np.mean(matrix, axis=0) for matrix in topic_matrices]
    topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
    avg_distance = np.mean(topic_distance[np.triu_indices_from(topic_distance, k=1)])

    # round() replaces float(str(x)[:6]): slicing the string form corrupts
    # values printed in scientific notation (1.2345678e-05 became 1.2345).
    res = {
        "coherence": round(float(overall_coherence), 4),
        "diversity": round(float(diversity), 4),
        "inter_topic_distance": round(float(avg_distance), 4),
        "combined_score": round(float(0.7 * overall_coherence + 0.3 * diversity), 4),
    }
    print(res)
    return res
@@ -85,6 +82,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
print(f"Total parameter combinations: {len(param_list)}")
for params in param_list:
print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...")
try:
print(f"Testing params: {params}")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
@@ -143,18 +141,27 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
traceback.print_exc()
continue
return best_model, best_params, best_score, history
with open("output/autotune.json", "w") as f:
json.dump(history, f, indent=2)
return best_model, best_params, best_score
# Substrings replaced by a single space before modelling: a real newline and
# the two-character sequence backslash + "n".
SPECIAL_CHARS = ["\n", "\\n"]
# Reviews with fewer whitespace-separated words than this are dropped.
MIN_REVIEW_WORDS = 5

# The reviews are read once; an earlier read of "data.tab" was overwritten
# right away and only added a second file that had to exist.
print("Loading reviews...")
reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()

print("Running light preprocessing...")
for schar in SPECIAL_CHARS:
    # Non-string entries (e.g. missing values) are passed through untouched.
    reviews = [
        review.replace(schar, " ") if isinstance(review, str) else review
        for review in reviews
    ]

print("Filtering short reviews...")
# str() lets non-string entries be counted too; a float NaN becomes "nan",
# which is a single word and therefore filtered out here.
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]

print("Starting auto-tuning...")
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))