# File: masterthesis-playground/bertopic/bertopic_autotune.py
# Last modified: 2026-02-22 23:52:26 +01:00
# 169 lines, 5.7 KiB, Python
import json
import traceback
import numpy as np
import pandas as pd
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from umap import UMAP
from bertopic import BERTopic
# Hyperparameter search space swept by auto_tune_bertopic (one list of
# candidate values per parameter; the grid is the Cartesian product).
param_grid = dict(
    n_gram_max=[2, 3],                     # CountVectorizer: ngram_range upper bound
    min_document_frequency=[1, 2],         # CountVectorizer: min_df
    min_samples=[10, 25],                  # HDBSCAN
    min_topic_size=[100, 200],             # HDBSCAN: min_cluster_size
    n_neighbors=[15, 25],                  # UMAP
    n_components=[2, 5],                   # UMAP
    min_dist=[0.01, 0.1],                  # UMAP
    nr_topics=["auto"],                    # BERTopic: topic reduction
    top_n_words=[10, 13, 15, 17, 20],      # BERTopic
)
def calculate_metrics(topic_model, embedder, top_n_words=10):
    """Score a fitted topic model by embedding coherence and word diversity.

    Coherence is the mean pairwise cosine similarity between the embeddings
    of each topic's top words; diversity is the fraction of unique words
    across all topics. Both lie in roughly [0, 1] for typical embeddings.

    Args:
        topic_model: fitted model exposing ``get_topic_info()`` (sized table
            of topics) and ``get_topic(topic_id)`` (list of (word, score)).
        embedder: model exposing ``encode(list_of_words) -> 2-D array``.
        top_n_words: how many top words per topic to score.

    Returns:
        dict with keys "coherence", "diversity", "combined_score"
        (0.7 * coherence + 0.3 * diversity), each rounded to 4 decimals.
    """
    # Collect the top words per topic.
    # NOTE(review): the "- 1" assumes the topic-info table includes the -1
    # outlier row; confirm for models fitted without any outliers.
    topic_words = []
    for topic_id in range(len(topic_model.get_topic_info()) - 1):
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words[:top_n_words])

    if not topic_words:
        # No topics at all: avoid division by zero in the diversity ratio.
        res = {"coherence": 0.0, "diversity": 0.0, "combined_score": 0.0}
        print(res)
        return res

    # Embed every unique word exactly once.
    all_words = list(set(word for words in topic_words for word in words))
    word_embeddings = embedder.encode(all_words)
    embedding_map = {word: emb for word, emb in zip(all_words, word_embeddings)}

    # Coherence: mean pairwise cosine similarity within each topic.
    coherence_scores = []
    for words in topic_words:
        embeddings = np.array([embedding_map[word] for word in words], dtype=float)
        # Cosine similarity via normalized dot products (equivalent to
        # sklearn's cosine_similarity, but keeps this block numpy-only).
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0  # guard: zero vectors would divide by zero
        unit = embeddings / norms
        sim_matrix = unit @ unit.T
        # Average over the strict upper triangle: each unordered pair counted
        # once, diagonal self-similarities excluded automatically (the old
        # np.fill_diagonal call was redundant for this reason).
        pair_sims = sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)]
        # A single-word topic has no pairs; score it 0 instead of NaN.
        coherence_scores.append(float(np.mean(pair_sims)) if pair_sims.size else 0.0)
    overall_coherence = float(np.mean(coherence_scores))

    # Diversity: fraction of unique words across all topics' top words.
    all_topic_words = [word for topic in topic_words for word in topic]
    diversity = len(set(all_topic_words)) / len(all_topic_words)

    # round() replaces the old float(str(x)[:6]) truncation, which silently
    # corrupted values rendered in scientific notation (e.g. "1.2e-05"[:6]).
    res = {
        "coherence": round(overall_coherence, 4),
        "diversity": round(diversity, 4),
        "combined_score": round(0.7 * overall_coherence + 0.3 * diversity, 4),
    }
    print(res)
    return res
def auto_tune_bertopic(texts, embedding_model, param_grid):
    """Grid-search BERTopic hyperparameters and return the best model.

    Documents are embedded once up front; every grid combination then fits a
    fresh UMAP/HDBSCAN/BERTopic pipeline on the cached embeddings and is
    scored with calculate_metrics. After each combination the full history is
    checkpointed to "history.json"; the final history goes to
    "output/autotune.json".

    Args:
        texts: list of documents to model.
        param_grid: dict of parameter name -> list of candidate values
            (see the module-level ``param_grid`` for the expected keys).
        embedding_model: SentenceTransformer model name or path.

    Returns:
        (best_model, best_params, best_score): the fitted model with the
        highest combined coherence/diversity score, its parameters, and that
        score. best_model/best_params are None if every combination failed.
    """
    import os  # local import: only needed for the output directory below

    best_score = -1
    best_params = None
    best_model = None
    history = []

    print("Starting auto-tuning of BERTopic...")
    print(f"Number of reviews: {len(texts)}")

    # Embed once; fit_transform below reuses these embeddings so each grid
    # combination skips the expensive encoding step.
    print("Running embedding model...")
    embedder = SentenceTransformer(embedding_model)
    embeddings = embedder.encode(texts, show_progress_bar=True)

    print("Generating parameter combinations...")
    param_list = list(ParameterGrid(param_grid))
    print(f"Total parameter combinations: {len(param_list)}")

    # BUGFIX: enumerate instead of len(history) + 1 — the old counter stalled
    # whenever a combination failed (nothing was appended to history).
    for combo_no, params in enumerate(param_list, start=1):
        print(f"Testing param combination no. {combo_no}/{len(param_list)}...")
        try:
            print(f"Testing params: {params}")
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            vectorizer_model = CountVectorizer(
                stop_words="english",
                min_df=params["min_document_frequency"],
                ngram_range=(1, params["n_gram_max"]),
            )
            representation_model = KeyBERTInspired()
            umap_model = UMAP(
                n_neighbors=params["n_neighbors"],
                n_components=params["n_components"],
                min_dist=params["min_dist"],
                metric="cosine",
                low_memory=True,
                random_state=42,  # reproducible runs
            )
            hdbscan_model = HDBSCAN(
                min_cluster_size=params["min_topic_size"],
                # BUGFIX: min_samples was in the grid but never passed, so
                # half of the grid silently duplicated the other half.
                min_samples=params["min_samples"],
                metric="euclidean",
                cluster_selection_method="eom",
                gen_min_span_tree=True,
                prediction_data=True,
            )
            model = BERTopic(
                # Pass the already-loaded embedder instance rather than the
                # model-name string so BERTopic does not reload the
                # SentenceTransformer for every grid combination.
                embedding_model=embedder,
                ctfidf_model=ctfidf_model,
                vectorizer_model=vectorizer_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                representation_model=representation_model,
                verbose=True,
                calculate_probabilities=True,
                language="english",
                top_n_words=params["top_n_words"],
                nr_topics=params["nr_topics"],
            )
            topics, _ = model.fit_transform(texts, embeddings)
            # BUGFIX: score with the tuned top_n_words — previously the
            # default of 10 was always used, so this grid axis had no effect
            # on the reported metrics.
            metrics = calculate_metrics(
                model, embedder, top_n_words=params["top_n_words"]
            )
            history.append({"params": params, "metrics": metrics})
            # Checkpoint after every combination so a crash loses nothing.
            with open("history.json", "w") as f:
                json.dump(history, f, indent=2)
            if metrics["combined_score"] > best_score:
                best_score = metrics["combined_score"]
                best_params = params
                best_model = model
        except Exception as e:
            # Best-effort sweep: one bad combination must not kill the run.
            print(f"Failed with params {params}: {str(e)}")
            traceback.print_exc()
            continue

    # BUGFIX: create the output directory so the final dump cannot crash at
    # the very end of a long run and throw the results away.
    os.makedirs("output", exist_ok=True)
    with open("output/autotune.json", "w") as f:
        json.dump(history, f, indent=2)
    return best_model, best_params, best_score
# --- Script entry point: load reviews, lightly clean them, run the sweep. ---

SPECIAL_CHARS = ["\n", "\\n"]  # literal newlines and escaped "\n" sequences
MIN_REVIEW_WORDS = 5  # drop reviews with fewer words than this

print("Loading reviews...")
reviews = pd.read_csv(
    "../data/intermediate/preprocessed.tab", sep="\t"
).review.to_list()

print("Running light preprocessing...")
# Replace special characters with spaces. Non-string entries (e.g. NaN from
# empty cells) are passed through untouched and removed by the length filter
# below, since str(nan) is a single word.
for schar in SPECIAL_CHARS:
    reviews = [
        review.replace(schar, " ") if isinstance(review, str) else review
        for review in reviews
    ]

print("Filtering short reviews...")
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]

# BUGFIX: user-facing typo "Staring" -> "Starting".
print("Starting auto-tuning...")
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))