BERTopic cleanup and structuring

This commit is contained in:
2026-02-20 18:01:46 +01:00
parent 99ba5031ca
commit ccf96b447c
7 changed files with 55743 additions and 61 deletions

View File

@@ -8,7 +8,6 @@ from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from umap import UMAP
@@ -74,7 +73,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
print("Running embedding model...")
embedder = SentenceTransformer(embedding_model)
-embeddings = embedder.encode(reviews, show_progress_bar=True)
+embeddings = embedder.encode(texts, show_progress_bar=True)
# Convert param_grid to list for sampling
print("Generating parameter combinations...")
@@ -151,7 +150,9 @@ SPECIAL_CHARS = ["\n", "\\n"]
MIN_REVIEW_WORDS = 5
print("Loading reviews...")
-reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
+reviews = pd.read_csv(
+"../data/intermediate/preprocessed.tab", sep="\t"
+).review.to_list()
print("Running light preprocessing...")
for schar in SPECIAL_CHARS: