Mirror of https://github.com/marvinscham/masterthesis-playground.git
(synced 2026-03-22 00:12:42 +01:00). 169 lines, 5.7 KiB, Python.
import json
import os
import traceback

import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from umap import UMAP
|
|
|
|
# Hyperparameter search space, expanded into all combinations via
# sklearn's ParameterGrid in auto_tune_bertopic(). Trailing comments
# indicate which pipeline stage each parameter configures.
param_grid = {
    "n_gram_max": [2, 3],  # Vectorization
    "min_document_frequency": [1],  # Vectorization
    "min_samples": [10, 25],  # HDBSCAN
    "min_topic_size": [10, 20, 30, 40, 50],  # HDBSCAN
    "n_neighbors": [15],  # UMAP
    "n_components": [2, 5],  # UMAP
    "min_dist": [0.01, 0.1],  # UMAP
    "nr_topics": ["auto"],  # Topic Modeling
    "top_n_words": [10, 13, 15, 17, 20],  # Topic Modeling
}
|
|
|
|
|
|
def calculate_metrics(topic_model, embedder, top_n_words=10):
    """Score a fitted topic model by embedding-based coherence and diversity.

    Args:
        topic_model: Fitted BERTopic model (must provide ``get_topic_info``
            and ``get_topic``).
        embedder: SentenceTransformer used to embed the topic words.
        top_n_words: Number of top words per topic to evaluate.

    Returns:
        dict with keys ``"coherence"`` (mean pairwise cosine similarity of
        each topic's words, averaged over topics), ``"diversity"`` (fraction
        of unique words across all topics), and ``"combined_score"``
        (0.7 * coherence + 0.3 * diversity), each rounded to 4 decimals.
    """
    # Collect the top-n words of every topic. The "- 1" skips one row of
    # get_topic_info() — presumably the -1 outlier topic; topic ids are
    # assumed to be 0..n_topics-1 (TODO confirm against BERTopic docs).
    topic_words = []
    for topic_id in range(len(topic_model.get_topic_info()) - 1):
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words[:top_n_words])

    # Embed each unique word once instead of once per topic occurrence.
    all_words = list(set(word for words in topic_words for word in words))
    word_embeddings = embedder.encode(all_words)
    embedding_map = {word: emb for word, emb in zip(all_words, word_embeddings)}

    # Coherence: mean cosine similarity over all distinct word pairs of a
    # topic. triu_indices(k=1) already excludes the diagonal, so no explicit
    # zeroing of self-similarities is needed.
    coherence_scores = []
    for words in topic_words:
        if len(words) < 2:
            # A 0/1-word topic has no pairs; skip to avoid a NaN mean.
            continue
        embeddings = np.array([embedding_map[word] for word in words])
        sim_matrix = cosine_similarity(embeddings)
        mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
        coherence_scores.append(mean_sim)
    overall_coherence = float(np.mean(coherence_scores)) if coherence_scores else 0.0

    # Diversity: unique words / total words (guard against an empty model).
    all_topic_words = [word for topic in topic_words for word in topic]
    if all_topic_words:
        diversity = len(set(all_topic_words)) / len(all_topic_words)
    else:
        diversity = 0.0

    # round() replaces the old float(str(x)[:6]) truncation, which raised
    # ValueError for values repr'd in scientific notation
    # (e.g. "6.1e-05"[:6] == "6.1e-0").
    res = {
        "coherence": round(overall_coherence, 4),
        "diversity": round(diversity, 4),
        "combined_score": round(0.7 * overall_coherence + 0.3 * diversity, 4),
    }
    print(res)
    return res
|
|
|
|
|
|
def auto_tune_bertopic(texts, embedding_model, param_grid):
    """Grid-search BERTopic hyperparameters and return the best model found.

    Embeds ``texts`` once, then fits one BERTopic model per parameter
    combination, scoring each with ``calculate_metrics``. Progress is
    checkpointed to ``history.json`` after every run and the full history
    is written to ``output/autotune.json`` at the end.

    Args:
        texts: List of documents to model.
        embedding_model: SentenceTransformer model name; also handed to
            BERTopic itself.
        param_grid: Dict of parameter-name -> list-of-values, expanded with
            sklearn's ParameterGrid.

    Returns:
        (best_model, best_params, best_score): the fitted model with the
        highest combined score, its parameter dict, and that score.
        best_model/best_params are None if every combination failed.
    """
    best_score = -1
    best_params = None
    best_model = None
    history = []

    print("Starting auto-tuning of BERTopic...")
    print(f"Number of reviews: {len(texts)}")

    # Embed once up front; fit_transform below reuses these embeddings so
    # the expensive encoding step is not repeated per combination.
    print("Running embedding model...")
    embedder = SentenceTransformer(embedding_model)
    embeddings = embedder.encode(texts, show_progress_bar=True)

    print("Generating parameter combinations...")
    param_list = list(ParameterGrid(param_grid))
    print(f"Total parameter combinations: {len(param_list)}")

    for params in param_list:
        print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...")
        try:
            print(f"Testing params: {params}")
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            vectorizer_model = CountVectorizer(
                stop_words="english",
                min_df=params["min_document_frequency"],
                ngram_range=(1, params["n_gram_max"]),
            )

            representation_model = KeyBERTInspired()

            # random_state pins UMAP so runs are comparable across the grid.
            umap_model = UMAP(
                n_neighbors=params["n_neighbors"],
                n_components=params["n_components"],
                min_dist=params["min_dist"],
                metric="cosine",
                low_memory=True,
                random_state=42,
            )
            hdbscan_model = HDBSCAN(
                min_cluster_size=params["min_topic_size"],
                metric="euclidean",
                cluster_selection_method="eom",
                gen_min_span_tree=True,
                prediction_data=True,
            )

            model = BERTopic(
                embedding_model=embedding_model,
                ctfidf_model=ctfidf_model,
                vectorizer_model=vectorizer_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                representation_model=representation_model,
                verbose=True,
                calculate_probabilities=True,
                language="english",
                top_n_words=params["top_n_words"],
                nr_topics=params["nr_topics"],
            )
            # Topic assignments are not needed here — only the fitted model.
            model.fit_transform(texts, embeddings)

            metrics = calculate_metrics(model, embedder)
            history.append({"params": params, "metrics": metrics})

            # Checkpoint after every run so progress survives a crash.
            with open("history.json", "w") as f:
                json.dump(history, f, indent=2)

            if metrics["combined_score"] > best_score:
                best_score = metrics["combined_score"]
                best_params = params
                best_model = model

        except Exception as e:
            # Some combinations legitimately fail (e.g. clustering finds no
            # topics); log the failure and continue with the next one.
            print(f"Failed with params {params}: {str(e)}")
            traceback.print_exc()
            continue

    # Create the output directory if missing; previously the final dump
    # raised FileNotFoundError when "output/" did not exist.
    os.makedirs("output", exist_ok=True)
    with open("output/autotune.json", "w") as f:
        json.dump(history, f, indent=2)

    return best_model, best_params, best_score
|
|
|
|
|
|
SPECIAL_CHARS = ["\n", "\\n"]  # literal newlines and escaped "\n" sequences
MIN_REVIEW_WORDS = 5  # drop reviews shorter than this many words


def main():
    """Load reviews, lightly clean them, and run the BERTopic auto-tuner."""
    print("Loading reviews...")
    reviews = pd.read_csv(
        "../data/intermediate/preprocessed.tab", sep="\t"
    ).review.to_list()

    print("Running light preprocessing...")
    for schar in SPECIAL_CHARS:
        # Non-string entries (e.g. NaN) pass through untouched here and are
        # dropped by the length filter below ("nan" splits to one word).
        reviews = [
            review.replace(schar, " ") if isinstance(review, str) else review
            for review in reviews
        ]

    print("Filtering short reviews...")
    reviews = [
        review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS
    ]

    # Typo fixed: was "Staring auto-tuning...".
    print("Starting auto-tuning...")
    print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))


# Guard so importing this module no longer kicks off CSV loading and a
# long-running grid search as a side effect.
if __name__ == "__main__":
    main()
|