mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 08:22:43 +01:00
BERTopic cleanup and structuring
This commit is contained in:
@@ -8,7 +8,6 @@ from bertopic.vectorizers import ClassTfidfTransformer
|
||||
from hdbscan import HDBSCAN
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics import pairwise_distances
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from sklearn.model_selection import ParameterGrid
|
||||
from umap import UMAP
|
||||
@@ -74,7 +73,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
|
||||
|
||||
print("Running embedding model...")
|
||||
embedder = SentenceTransformer(embedding_model)
|
||||
embeddings = embedder.encode(reviews, show_progress_bar=True)
|
||||
embeddings = embedder.encode(texts, show_progress_bar=True)
|
||||
|
||||
# Convert param_grid to list for sampling
|
||||
print("Generating parameter combinations...")
|
||||
@@ -151,7 +150,9 @@ SPECIAL_CHARS = ["\n", "\\n"]
|
||||
MIN_REVIEW_WORDS = 5
|
||||
|
||||
print("Loading reviews...")
|
||||
reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
|
||||
reviews = pd.read_csv(
|
||||
"../data/intermediate/preprocessed.tab", sep="\t"
|
||||
).review.to_list()
|
||||
|
||||
print("Running light preprocessing...")
|
||||
for schar in SPECIAL_CHARS:
|
||||
|
||||
@@ -30,21 +30,18 @@ import gensim.corpora as corpora
|
||||
import nltk
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import spacy
|
||||
from bertopic.representation import KeyBERTInspired
|
||||
from bertopic.vectorizers import ClassTfidfTransformer
|
||||
from gensim.models.coherencemodel import CoherenceModel
|
||||
from hdbscan import HDBSCAN
|
||||
from nltk.corpus import stopwords
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_extraction import text as skltext
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from umap import UMAP
|
||||
|
||||
from bertopic import BERTopic
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
||||
nltk.download("stopwords")
|
||||
nltk.download("punkt")
|
||||
nltk.download("wordnet")
|
||||
@@ -98,21 +95,19 @@ tracking = {
|
||||
#
|
||||
|
||||
# %%
|
||||
# Import data after general preprocessing
|
||||
|
||||
if DATA_SAMPLE_SIZE == -1:
|
||||
reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
|
||||
reviews = pd.read_csv(
|
||||
"../data/intermediate/preprocessed.tab", sep="\t"
|
||||
).review.to_list()
|
||||
else:
|
||||
reviews = (
|
||||
pd.read_csv("../data/original/reviews.tab", sep="\t")
|
||||
pd.read_csv("../data/intermediate/preprocessed.tab", sep="\t")
|
||||
.sample(n=DATA_SAMPLE_SIZE)
|
||||
.review.to_list()
|
||||
)
|
||||
|
||||
# Remove all duplicate reviews
|
||||
reviews = list(set(reviews))
|
||||
|
||||
# Remove reviews that contain less than x words
|
||||
reviews = [review for review in reviews if len(review.split()) >= 9]
|
||||
|
||||
print("Loaded {} reviews".format(len(reviews)))
|
||||
|
||||
# %%
|
||||
@@ -128,10 +123,14 @@ rep = dict((re.escape(k), v) for k, v in rep.items())
|
||||
pattern = re.compile("|".join(rep.keys()))
|
||||
|
||||
|
||||
# def preprocess(text):
|
||||
# text = text.strip()
|
||||
# text = text.lower()
|
||||
# text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
|
||||
# return text
|
||||
|
||||
|
||||
def preprocess(text):
    """Normalise a review: trim it, lowercase it, then apply the replacements.

    Every match of the module-level ``pattern`` is swapped for the value that
    ``rep`` stores under the regex-escaped form of the matched text.
    """

    def _replacement(match):
        # ``rep`` is keyed by escaped strings, so escape the match before lookup.
        return rep[re.escape(match.group(0))]

    normalised = text.strip().lower()
    return pattern.sub(_replacement, normalised)
|
||||
|
||||
|
||||
@@ -188,11 +187,13 @@ reduced_embeddings = umap_model.fit_transform(embeddings)
|
||||
|
||||
# %%
|
||||
if RECREATE_MODEL:
|
||||
stop_words = list(skltext.ENGLISH_STOP_WORDS.union(["bali"]))
|
||||
|
||||
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
||||
vectorizer_model = CountVectorizer(
|
||||
min_df=MIN_DOCUMENT_FREQUENCY,
|
||||
ngram_range=(1, MAX_NGRAM),
|
||||
stop_words=stopwords.words("english"),
|
||||
stop_words=stop_words,
|
||||
)
|
||||
|
||||
representation_model = KeyBERTInspired()
|
||||
@@ -427,6 +428,9 @@ vis = topic_model.visualize_documents(
|
||||
vis.write_html("output/visualization.html")
|
||||
vis
|
||||
|
||||
# %%
|
||||
topic_model.visualize_document_datamap(reviews, reduced_embeddings=reduced_embeddings)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Similarity Matrix
|
||||
#
|
||||
@@ -527,15 +531,29 @@ if this_will_crash_your_pc_are_you_sure:
|
||||
#
|
||||
|
||||
# %%
|
||||
search_term = "uluwatu"
|
||||
search_term = "spirituality"
|
||||
|
||||
similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
|
||||
for i in range(len(similar_topics)):
|
||||
# \n{topic_model.get_topic(similar_topics[i])}\n
|
||||
print(
|
||||
f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}"
|
||||
f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])['CustomName'][0]}"
|
||||
)
|
||||
|
||||
# %%
|
||||
# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
|
||||
# Calculate the topic distributions on a token-level
|
||||
topic_distr, topic_token_distr = topic_model.approximate_distribution(
|
||||
reviews, calculate_tokens=True, use_embedding_model=True
|
||||
)
|
||||
|
||||
# %%
|
||||
# Visualize the token-level distributions
|
||||
DOC_INDEX = 6
|
||||
df = topic_model.visualize_approximate_distribution(
|
||||
reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
|
||||
)
|
||||
df
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Hierarchy
|
||||
#
|
||||
@@ -556,42 +574,3 @@ topic_model.visualize_topics(use_ctfidf=True)
|
||||
|
||||
# %%
|
||||
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
|
||||
|
||||
# %%
|
||||
# from matplotlib import pyplot as plt
|
||||
# from sklearn.manifold import TSNE
|
||||
|
||||
|
||||
# topics = topic_model.topics_
|
||||
|
||||
# # Reduce dimensionality with TSNE
|
||||
# tsne = TSNE(n_components=2, random_state=42)
|
||||
# embeddings_2d = tsne.fit_transform(embeddings)
|
||||
|
||||
# # Prepare colors (assign a color to each topic)
|
||||
# unique_topics = set(topics)
|
||||
# colors = plt.get_cmap("tab20", len(unique_topics))
|
||||
|
||||
# # Plot
|
||||
# plt.figure(figsize=(12, 8))
|
||||
# for topic in unique_topics:
|
||||
# # Select indices for the current topic
|
||||
# indices = [i for i, t in enumerate(topics) if t == topic]
|
||||
|
||||
# # Get 2D points for these indices
|
||||
# x = embeddings_2d[indices, 0]
|
||||
# y = embeddings_2d[indices, 1]
|
||||
|
||||
# # Assign label (exclude outliers)
|
||||
# label = f"Topic {topic}" if topic != -1 else "Outliers"
|
||||
|
||||
# # Plot with color
|
||||
# plt.scatter(x, y, color=colors(topic + 1), label=label, alpha=0.5)
|
||||
|
||||
# plt.title("Topic Clusters in 2D Embedding Space")
|
||||
# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
|
||||
# plt.tight_layout()
|
||||
|
||||
# # Save the plot
|
||||
# plt.savefig("topic_clusters.png", dpi=300, bbox_inches="tight")
|
||||
# plt.show()
|
||||
|
||||
@@ -130,3 +130,4 @@ wrapt==1.17.2
|
||||
spacy
|
||||
nbconvert
|
||||
jupytext
|
||||
datamapplot
|
||||
|
||||
55665
data/intermediate/preprocessed.tab
Normal file
55665
data/intermediate/preprocessed.tab
Normal file
File diff suppressed because one or more lines are too long
Binary file not shown.
35
preprocessing/preprocess.py
Normal file
35
preprocessing/preprocess.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Minimum number of whitespace-separated words a review needs to be kept.
MIN_REVIEW_WORDS = 8

# Load the raw reviews. Rows without a review value are dropped up front:
# pandas represents them as NaN floats, which have no ``split`` method.
reviews = (
    pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list()
)

# Remove exact duplicates. ``dict.fromkeys`` keeps the first occurrence of each
# review in file order, so the result is identical on every run (the iteration
# order of a set of strings changes between interpreter runs).
reviews = list(dict.fromkeys(reviews))

# Report every review that is too short and keep only the remaining ones.
kept_reviews = []
for review in reviews:
    word_count = len(review.split())
    if word_count < MIN_REVIEW_WORDS:
        print("Short review ({} words):".format(word_count))
        print(review)
        print("-" * 60)
    else:
        kept_reviews.append(review)
reviews = kept_reviews
|
||||
|
||||
# Matches anything shaped like an opening or closing HTML tag: "<", an
# optional "/", a letter, then everything up to the next ">".
html_tag_pattern = re.compile(r"</?[a-zA-Z][^>]*>")


def preprocess(text):
    """Strip HTML-like tags and surrounding whitespace from a review.

    Reviews in which at least one tag is found are printed in their original
    form so the removals can be checked by eye.

    Args:
        text: The raw review text.

    Returns:
        The text with every match of ``html_tag_pattern`` removed and
        leading/trailing whitespace stripped.
    """
    # ``subn`` removes the tags and reports how many there were in one pass,
    # instead of searching first and substituting afterwards.
    cleaned, tag_count = html_tag_pattern.subn("", text)
    if tag_count:
        print("Possible HTML tag:")
        print(text)
        print("-" * 60)

    return cleaned.strip()
|
||||
|
||||
|
||||
# Write the cleaned reviews as a one-column, tab-separated file with a
# "review" header. Going through pandas instead of writing raw lines means a
# review that contains a tab, a double quote or a line break is quoted on
# output and is read back as a single field by pd.read_csv(..., sep="\t").
pd.DataFrame({"review": [preprocess(review) for review in reviews]}).to_csv(
    "../data/intermediate/preprocessed.tab",
    sep="\t",
    index=False,
    encoding="utf-8",
)
|
||||
1
preprocessing/requirements.txt
Normal file
1
preprocessing/requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
pandas
|
||||
Reference in New Issue
Block a user