BERTopic cleanup and structuring

2026-02-20 18:01:46 +01:00
parent 99ba5031ca
commit ccf96b447c
7 changed files with 55743 additions and 61 deletions

View File

@@ -8,7 +8,6 @@ from bertopic.vectorizers import ClassTfidfTransformer
 from hdbscan import HDBSCAN
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.metrics import pairwise_distances
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.model_selection import ParameterGrid
 from umap import UMAP
@@ -74,7 +73,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
     print("Running embedding model...")
     embedder = SentenceTransformer(embedding_model)
-    embeddings = embedder.encode(reviews, show_progress_bar=True)
+    embeddings = embedder.encode(texts, show_progress_bar=True)

     # Convert param_grid to list for sampling
     print("Generating parameter combinations...")
@@ -151,7 +150,9 @@ SPECIAL_CHARS = ["\n", "\\n"]
 MIN_REVIEW_WORDS = 5

 print("Loading reviews...")
-reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
+reviews = pd.read_csv(
+    "../data/intermediate/preprocessed.tab", sep="\t"
+).review.to_list()

 print("Running light preprocessing...")
 for schar in SPECIAL_CHARS:

View File

@@ -30,21 +30,18 @@ import gensim.corpora as corpora
 import nltk
 import numpy as np
 import pandas as pd
-import spacy
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 from gensim.models.coherencemodel import CoherenceModel
 from hdbscan import HDBSCAN
-from nltk.corpus import stopwords
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction import text as skltext
 from sklearn.metrics.pairwise import cosine_similarity
 from umap import UMAP
 from bertopic import BERTopic

-nlp = spacy.load("en_core_web_sm")
-
 nltk.download("stopwords")
 nltk.download("punkt")
 nltk.download("wordnet")
@@ -98,21 +95,19 @@ tracking = {
 #

 # %%
+# Import data after general preprocessing
 if DATA_SAMPLE_SIZE == -1:
-    reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
+    reviews = pd.read_csv(
+        "../data/intermediate/preprocessed.tab", sep="\t"
+    ).review.to_list()
 else:
     reviews = (
-        pd.read_csv("../data/original/reviews.tab", sep="\t")
+        pd.read_csv("../data/intermediate/preprocessed.tab", sep="\t")
         .sample(n=DATA_SAMPLE_SIZE)
         .review.to_list()
     )

-# Remove all duplicate reviews
-reviews = list(set(reviews))
-
-# Remove reviews that contain less than x words
-reviews = [review for review in reviews if len(review.split()) >= 9]
-
 print("Loaded {} reviews".format(len(reviews)))

 # %%
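Both branches now read the intermediate file produced by the new preprocessing script added later in this commit, and the duplicate/short-review filtering was deleted here because it moved there. A hypothetical guard (not in the commit) that makes the new ordering dependency explicit:

import pandas as pd

# The notebook now assumes the preprocessing script has already written
# ../data/intermediate/preprocessed.tab with a single 'review' column.
df = pd.read_csv("../data/intermediate/preprocessed.tab", sep="\t")
assert "review" in df.columns, "run the preprocessing script first"
reviews = df.review.to_list()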
@@ -128,10 +123,14 @@ rep = dict((re.escape(k), v) for k, v in rep.items())
 pattern = re.compile("|".join(rep.keys()))


+# def preprocess(text):
+#     text = text.strip()
+#     text = text.lower()
+#     text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
+#     return text
+
+
 def preprocess(text):
-    text = text.strip()
-    text = text.lower()
-    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
     return text
@@ -188,11 +187,13 @@ reduced_embeddings = umap_model.fit_transform(embeddings)

 # %%
 if RECREATE_MODEL:
+    stop_words = list(skltext.ENGLISH_STOP_WORDS.union(["bali"]))
+
     ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
     vectorizer_model = CountVectorizer(
         min_df=MIN_DOCUMENT_FREQUENCY,
         ngram_range=(1, MAX_NGRAM),
-        stop_words=stopwords.words("english"),
+        stop_words=stop_words,
     )

    representation_model = KeyBERTInspired()
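Together with the import changes above, this swaps NLTK's stop-word list for scikit-learn's built-in ENGLISH_STOP_WORDS, extended with the corpus-specific term "bali" so it cannot dominate every topic. A minimal sketch of how the pieces fit, using only names that appear in the diff:

from sklearn.feature_extraction import text as skltext
from sklearn.feature_extraction.text import CountVectorizer

# ENGLISH_STOP_WORDS is a frozenset; union() returns a new frozenset,
# and CountVectorizer accepts any list of strings for stop_words.
stop_words = list(skltext.ENGLISH_STOP_WORDS.union(["bali"]))
vectorizer = CountVectorizer(stop_words=stop_words)
print("bali" in stop_words)             # True
print(len(skltext.ENGLISH_STOP_WORDS)) # 318 built-in English stop words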
@@ -427,6 +428,9 @@ vis = topic_model.visualize_documents(
 vis.write_html("output/visualization.html")
 vis

+# %%
+topic_model.visualize_document_datamap(reviews, reduced_embeddings=reduced_embeddings)
+
 # %% [markdown]
 # ### Similarity Matrix
 #
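visualize_document_datamap is what pulls in the new datamapplot requirement further down. Unlike the plotly-based visualize_documents, it renders a static map; if persisting it is ever needed, something like the following should work, assuming the method returns a matplotlib Figure (true in recent BERTopic releases, but worth verifying against the installed version):

# Assumption: visualize_document_datamap returns a matplotlib Figure.
fig = topic_model.visualize_document_datamap(
    reviews, reduced_embeddings=reduced_embeddings
)
fig.savefig("output/datamap.png", dpi=300, bbox_inches="tight")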
@@ -527,15 +531,29 @@ if this_will_crash_your_pc_are_you_sure:
 #

 # %%
-search_term = "uluwatu"
+search_term = "spirituality"
 similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)

 for i in range(len(similar_topics)):
+    # \n{topic_model.get_topic(similar_topics[i])}\n
     print(
-        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}"
+        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])['CustomName'][0]}"
     )

+# %%
+# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
+# Calculate the topic distributions on a token-level
+topic_distr, topic_token_distr = topic_model.approximate_distribution(
+    reviews, calculate_tokens=True, use_embedding_model=True
+)
+
+# %%
+# Visualize the token-level distributions
+DOC_INDEX = 6
+df = topic_model.visualize_approximate_distribution(
+    reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
+)
+df
+
 # %% [markdown]
 # ### Topic Hierarchy
 #
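approximate_distribution returns a document-by-topic matrix (the -1 outlier topic is excluded) plus per-token scores when calculate_tokens=True. A small sketch, not part of the commit, for reading the dominant topic of the document visualized above:

import numpy as np

# topic_distr has shape (n_docs, n_topics); column i corresponds to topic i.
top_topic = int(np.argmax(topic_distr[DOC_INDEX]))
print(top_topic, topic_model.get_topic(top_topic)[:5])  # top 5 (word, weight) pairs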
@@ -556,42 +574,3 @@ topic_model.visualize_topics(use_ctfidf=True)

 # %%
 topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
-
-# %%
-# from matplotlib import pyplot as plt
-# from sklearn.manifold import TSNE
-
-# topics = topic_model.topics_
-
-# # Reduce dimensionality with TSNE
-# tsne = TSNE(n_components=2, random_state=42)
-# embeddings_2d = tsne.fit_transform(embeddings)
-
-# # Prepare colors (assign a color to each topic)
-# unique_topics = set(topics)
-# colors = plt.get_cmap("tab20", len(unique_topics))
-
-# # Plot
-# plt.figure(figsize=(12, 8))
-# for topic in unique_topics:
-#     # Select indices for the current topic
-#     indices = [i for i, t in enumerate(topics) if t == topic]
-
-#     # Get 2D points for these indices
-#     x = embeddings_2d[indices, 0]
-#     y = embeddings_2d[indices, 1]
-
-#     # Assign label (exclude outliers)
-#     label = f"Topic {topic}" if topic != -1 else "Outliers"
-
-#     # Plot with color
-#     plt.scatter(x, y, color=colors(topic + 1), label=label, alpha=0.5)
-
-# plt.title("Topic Clusters in 2D Embedding Space")
-# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
-# plt.tight_layout()
-
-# # Save the plot
-# plt.savefig("topic_clusters.png", dpi=300, bbox_inches="tight")
-# plt.show()

View File

@@ -130,3 +130,4 @@ wrapt==1.17.2
 spacy
 nbconvert
 jupytext
+datamapplot

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,35 @@
+import re
+
+import pandas as pd
+
+reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
+
+reviews = list(set(reviews))  # Removes exact duplicates
+
+# Print reviews with less than 8 words
+for review in reviews:
+    if len(review.split()) < 8:
+        print("Short review ({} words):".format(len(review.split())))
+        print(review)
+        print("-" * 60)
+
+# Remove reviews that contain less than 8 words
+reviews = [review for review in reviews if len(review.split()) >= 8]
+
+html_tag_pattern = re.compile(r"</?[a-zA-Z][^>]*>")
+
+
+def preprocess(text):
+    if html_tag_pattern.search(text):
+        print("Possible HTML tag:")
+        print(text)
+        print("-" * 60)
+        text = re.sub(html_tag_pattern, "", text)
+    return text.strip()
+
+
+with open("../data/intermediate/preprocessed.tab", "w", encoding="utf-8") as f:
+    f.write("review\n")
+    for review in reviews:
+        f.write(preprocess(review) + "\n")
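One caveat about the writer above: reviews are written verbatim, so a stray tab or embedded newline inside a review would corrupt the one-column .tab file. If that ever bites, a hedged alternative (this swaps the hand-rolled writer for pandas' quoting, which the commit does not do) is:

import pandas as pd

cleaned = [preprocess(review) for review in reviews]
pd.DataFrame({"review": cleaned}).to_csv(
    "../data/intermediate/preprocessed.tab", sep="\t", index=False
)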

View File

@@ -0,0 +1 @@
+pandas