mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-23 00:42:43 +01:00
BERTopic cleanup and structuring
This commit is contained in:
@@ -8,7 +8,6 @@ from bertopic.vectorizers import ClassTfidfTransformer
|
||||
from hdbscan import HDBSCAN
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics import pairwise_distances
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from sklearn.model_selection import ParameterGrid
|
||||
from umap import UMAP
|
||||
@@ -74,7 +73,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
|
||||
|
||||
print("Running embedding model...")
|
||||
embedder = SentenceTransformer(embedding_model)
|
||||
- embeddings = embedder.encode(reviews, show_progress_bar=True)
|
||||
+ embeddings = embedder.encode(texts, show_progress_bar=True)
|
||||
|
||||
# Convert param_grid to list for sampling
|
||||
print("Generating parameter combinations...")
|
||||
@@ -151,7 +150,9 @@ SPECIAL_CHARS = ["\n", "\\n"]
|
||||
MIN_REVIEW_WORDS = 5
|
||||
|
||||
print("Loading reviews...")
|
||||
- reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
|
||||
+ reviews = pd.read_csv(
|
||||
+ "../data/intermediate/preprocessed.tab", sep="\t"
|
||||
+ ).review.to_list()
|
||||
|
||||
print("Running light preprocessing...")
|
||||
for schar in SPECIAL_CHARS:
|
||||
|
||||
Reference in New Issue
Block a user