"""Grid-search BERTopic hyperparameters over customer reviews.

Each parameter combination fits a full BERTopic pipeline (UMAP + HDBSCAN +
c-TF-IDF) and is scored by a weighted blend of topic coherence (mean pairwise
cosine similarity of each topic's top words) and diversity (fraction of
unique words across topics). The search history is checkpointed to JSON
after every fit so a crash loses at most one combination.
"""

import json
import os
import traceback

import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from umap import UMAP

# Hyperparameter search space — one BERTopic fit per combination.
param_grid = {
    "n_gram_max": [2, 3],  # Vectorization
    "min_document_frequency": [1, 2],  # Vectorization
    "min_samples": [10, 25],  # HDBSCAN
    "min_topic_size": [100, 200],  # HDBSCAN
    "n_neighbors": [15, 25],  # UMAP
    "n_components": [2, 5],  # UMAP
    "min_dist": [0.01, 0.1],  # UMAP
    "nr_topics": ["auto"],  # Topic Modeling
    "top_n_words": [10, 13, 15, 17, 20],  # Topic Modeling
}


def calculate_metrics(topic_model, embedder, top_n_words=10):
    """Score a fitted BERTopic model.

    Args:
        topic_model: A fitted BERTopic instance.
        embedder: SentenceTransformer used to embed topic words.
        top_n_words: How many top words per topic enter the score.

    Returns:
        dict with "coherence", "diversity", and
        "combined_score" (= 0.7 * coherence + 0.3 * diversity), each
        rounded to 4 decimal places.
    """
    # Collect the top words of each non-outlier topic. get_topic_info()
    # includes the -1 outlier row, hence the "- 1"; topic ids are assumed
    # to be contiguous 0..K-1 (BERTopic's default numbering).
    topic_words = []
    for topic_id in range(len(topic_model.get_topic_info()) - 1):
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words[:top_n_words])

    # Embed each unique word exactly once, not once per topic occurrence.
    all_words = list(set(word for words in topic_words for word in words))
    word_embeddings = embedder.encode(all_words)
    embedding_map = {word: emb for word, emb in zip(all_words, word_embeddings)}

    # Coherence: mean pairwise cosine similarity within each topic,
    # averaged over topics. Only the strict upper triangle is used, so
    # self-similarities never enter the mean.
    coherence_scores = []
    for words in topic_words:
        embeddings = np.array([embedding_map[word] for word in words])
        sim_matrix = cosine_similarity(embeddings)
        mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
        coherence_scores.append(mean_sim)
    # Guard against a degenerate model that produced no topics.
    overall_coherence = float(np.mean(coherence_scores)) if coherence_scores else 0.0

    # Diversity: fraction of words that are unique across all topics.
    all_topic_words = [word for topic in topic_words for word in topic]
    diversity = (
        len(set(all_topic_words)) / len(all_topic_words) if all_topic_words else 0.0
    )

    # round() replaces the original float(str(x)[:6]) truncation hack, which
    # silently corrupted values rendered in scientific notation
    # (str(1.2345e-05)[:6] == "1.2345") and lost a digit on negatives.
    res = {
        "coherence": round(overall_coherence, 4),
        "diversity": round(diversity, 4),
        "combined_score": round(0.7 * overall_coherence + 0.3 * diversity, 4),
    }
    print(res)
    return res


def auto_tune_bertopic(texts, embedding_model, param_grid):
    """Exhaustively search `param_grid` and return the best BERTopic fit.

    Args:
        texts: List of documents to model.
        embedding_model: SentenceTransformer model name (str).
        param_grid: Dict of parameter-name -> list of candidate values.

    Returns:
        (best_model, best_params, best_score); best_model/best_params are
        None if every combination failed.

    Side effects:
        Writes "history.json" after every successful fit (crash checkpoint)
        and "output/autotune.json" at the end.
    """
    # -inf (not -1) so a legitimately negative combined score — coherence
    # can be negative — still registers as the best seen so far.
    best_score = float("-inf")
    best_params = None
    best_model = None
    history = []

    print("Starting auto-tuning of BERTopic...")
    print(f"Number of reviews: {len(texts)}")

    # Embed once up front; every grid point reuses the same embeddings.
    print("Running embedding model...")
    embedder = SentenceTransformer(embedding_model)
    embeddings = embedder.encode(texts, show_progress_bar=True)

    print("Generating parameter combinations...")
    param_list = list(ParameterGrid(param_grid))
    print(f"Total parameter combinations: {len(param_list)}")

    for params in param_list:
        print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...")
        try:
            print(f"Testing params: {params}")
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            vectorizer_model = CountVectorizer(
                stop_words="english",
                min_df=params["min_document_frequency"],
                ngram_range=(1, params["n_gram_max"]),
            )
            representation_model = KeyBERTInspired()
            umap_model = UMAP(
                n_neighbors=params["n_neighbors"],
                n_components=params["n_components"],
                min_dist=params["min_dist"],
                metric="cosine",
                low_memory=True,
                random_state=42,  # deterministic projections across runs
            )
            hdbscan_model = HDBSCAN(
                min_cluster_size=params["min_topic_size"],
                # BUGFIX: min_samples was in the grid but never passed, so
                # half the search space was silently redundant.
                min_samples=params["min_samples"],
                metric="euclidean",
                cluster_selection_method="eom",
                gen_min_span_tree=True,
                prediction_data=True,
            )
            model = BERTopic(
                embedding_model=embedding_model,
                ctfidf_model=ctfidf_model,
                vectorizer_model=vectorizer_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                representation_model=representation_model,
                verbose=True,
                calculate_probabilities=True,
                language="english",
                top_n_words=params["top_n_words"],
                nr_topics=params["nr_topics"],
            )
            model.fit_transform(texts, embeddings)

            metrics = calculate_metrics(model, embedder)
            history.append({"params": params, "metrics": metrics})
            # Checkpoint after every fit so a crash loses at most one run.
            with open("history.json", "w") as f:
                json.dump(history, f, indent=2)

            if metrics["combined_score"] > best_score:
                best_score = metrics["combined_score"]
                best_params = params
                best_model = model
        except Exception as e:
            # Best-effort search: log the failure and move to the next
            # combination rather than aborting the whole sweep.
            print(f"Failed with params {params}: {str(e)}")
            traceback.print_exc()
            continue

    # Ensure the output directory exists before the final dump (the
    # original open() raised FileNotFoundError when output/ was missing).
    os.makedirs("output", exist_ok=True)
    with open("output/autotune.json", "w") as f:
        json.dump(history, f, indent=2)
    return best_model, best_params, best_score


SPECIAL_CHARS = ["\n", "\\n"]
MIN_REVIEW_WORDS = 5


def _main():
    """Load reviews, lightly clean them, and run the auto-tuner."""
    print("Loading reviews...")
    reviews = pd.read_csv(
        "../data/intermediate/preprocessed.tab", sep="\t"
    ).review.to_list()

    # Replace literal newlines and escaped "\n" sequences with spaces;
    # non-string entries (e.g. NaN) are passed through untouched.
    print("Running light preprocessing...")
    for schar in SPECIAL_CHARS:
        reviews = [
            review.replace(schar, " ") if isinstance(review, str) else review
            for review in reviews
        ]

    print("Filtering short reviews...")
    reviews = [
        review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS
    ]

    print("Starting auto-tuning...")
    print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))


if __name__ == "__main__":
    _main()