mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
BERTopic cleanup and structuring
This commit is contained in:
@@ -8,7 +8,6 @@ from bertopic.vectorizers import ClassTfidfTransformer
|
|||||||
from hdbscan import HDBSCAN
|
from hdbscan import HDBSCAN
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from sklearn.metrics import pairwise_distances
|
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from sklearn.model_selection import ParameterGrid
|
from sklearn.model_selection import ParameterGrid
|
||||||
from umap import UMAP
|
from umap import UMAP
|
||||||
@@ -74,7 +73,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
|
|||||||
|
|
||||||
print("Running embedding model...")
|
print("Running embedding model...")
|
||||||
embedder = SentenceTransformer(embedding_model)
|
embedder = SentenceTransformer(embedding_model)
|
||||||
embeddings = embedder.encode(reviews, show_progress_bar=True)
|
embeddings = embedder.encode(texts, show_progress_bar=True)
|
||||||
|
|
||||||
# Convert param_grid to list for sampling
|
# Convert param_grid to list for sampling
|
||||||
print("Generating parameter combinations...")
|
print("Generating parameter combinations...")
|
||||||
@@ -151,7 +150,9 @@ SPECIAL_CHARS = ["\n", "\\n"]
|
|||||||
MIN_REVIEW_WORDS = 5
|
MIN_REVIEW_WORDS = 5
|
||||||
|
|
||||||
print("Loading reviews...")
|
print("Loading reviews...")
|
||||||
reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
|
reviews = pd.read_csv(
|
||||||
|
"../data/intermediate/preprocessed.tab", sep="\t"
|
||||||
|
).review.to_list()
|
||||||
|
|
||||||
print("Running light preprocessing...")
|
print("Running light preprocessing...")
|
||||||
for schar in SPECIAL_CHARS:
|
for schar in SPECIAL_CHARS:
|
||||||
|
|||||||
@@ -30,21 +30,18 @@ import gensim.corpora as corpora
|
|||||||
import nltk
|
import nltk
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import spacy
|
|
||||||
from bertopic.representation import KeyBERTInspired
|
from bertopic.representation import KeyBERTInspired
|
||||||
from bertopic.vectorizers import ClassTfidfTransformer
|
from bertopic.vectorizers import ClassTfidfTransformer
|
||||||
from gensim.models.coherencemodel import CoherenceModel
|
from gensim.models.coherencemodel import CoherenceModel
|
||||||
from hdbscan import HDBSCAN
|
from hdbscan import HDBSCAN
|
||||||
from nltk.corpus import stopwords
|
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_extraction import text as skltext
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from umap import UMAP
|
from umap import UMAP
|
||||||
|
|
||||||
from bertopic import BERTopic
|
from bertopic import BERTopic
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
|
||||||
|
|
||||||
nltk.download("stopwords")
|
nltk.download("stopwords")
|
||||||
nltk.download("punkt")
|
nltk.download("punkt")
|
||||||
nltk.download("wordnet")
|
nltk.download("wordnet")
|
||||||
@@ -98,21 +95,19 @@ tracking = {
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
# Import data after general preprocessing
|
||||||
|
|
||||||
if DATA_SAMPLE_SIZE == -1:
|
if DATA_SAMPLE_SIZE == -1:
|
||||||
reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
|
reviews = pd.read_csv(
|
||||||
|
"../data/intermediate/preprocessed.tab", sep="\t"
|
||||||
|
).review.to_list()
|
||||||
else:
|
else:
|
||||||
reviews = (
|
reviews = (
|
||||||
pd.read_csv("../data/original/reviews.tab", sep="\t")
|
pd.read_csv("../data/intermediate/preprocessed.tab", sep="\t")
|
||||||
.sample(n=DATA_SAMPLE_SIZE)
|
.sample(n=DATA_SAMPLE_SIZE)
|
||||||
.review.to_list()
|
.review.to_list()
|
||||||
)
|
)
|
||||||
|
|
||||||
# Remove all duplicate reviews
|
|
||||||
reviews = list(set(reviews))
|
|
||||||
|
|
||||||
# Remove reviews that contain less than x words
|
|
||||||
reviews = [review for review in reviews if len(review.split()) >= 9]
|
|
||||||
|
|
||||||
print("Loaded {} reviews".format(len(reviews)))
|
print("Loaded {} reviews".format(len(reviews)))
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
@@ -128,10 +123,14 @@ rep = dict((re.escape(k), v) for k, v in rep.items())
|
|||||||
pattern = re.compile("|".join(rep.keys()))
|
pattern = re.compile("|".join(rep.keys()))
|
||||||
|
|
||||||
|
|
||||||
|
# def preprocess(text):
|
||||||
|
# text = text.strip()
|
||||||
|
# text = text.lower()
|
||||||
|
# text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
|
||||||
|
# return text
|
||||||
|
|
||||||
|
|
||||||
def preprocess(text):
|
def preprocess(text):
|
||||||
text = text.strip()
|
|
||||||
text = text.lower()
|
|
||||||
text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
@@ -188,11 +187,13 @@ reduced_embeddings = umap_model.fit_transform(embeddings)
|
|||||||
|
|
||||||
# %%
|
# %%
|
||||||
if RECREATE_MODEL:
|
if RECREATE_MODEL:
|
||||||
|
stop_words = list(skltext.ENGLISH_STOP_WORDS.union(["bali"]))
|
||||||
|
|
||||||
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
||||||
vectorizer_model = CountVectorizer(
|
vectorizer_model = CountVectorizer(
|
||||||
min_df=MIN_DOCUMENT_FREQUENCY,
|
min_df=MIN_DOCUMENT_FREQUENCY,
|
||||||
ngram_range=(1, MAX_NGRAM),
|
ngram_range=(1, MAX_NGRAM),
|
||||||
stop_words=stopwords.words("english"),
|
stop_words=stop_words,
|
||||||
)
|
)
|
||||||
|
|
||||||
representation_model = KeyBERTInspired()
|
representation_model = KeyBERTInspired()
|
||||||
@@ -427,6 +428,9 @@ vis = topic_model.visualize_documents(
|
|||||||
vis.write_html("output/visualization.html")
|
vis.write_html("output/visualization.html")
|
||||||
vis
|
vis
|
||||||
|
|
||||||
|
# %%
|
||||||
|
topic_model.visualize_document_datamap(reviews, reduced_embeddings=reduced_embeddings)
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ### Similarity Matrix
|
# ### Similarity Matrix
|
||||||
#
|
#
|
||||||
@@ -527,15 +531,29 @@ if this_will_crash_your_pc_are_you_sure:
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
search_term = "uluwatu"
|
search_term = "spirituality"
|
||||||
|
|
||||||
similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
|
similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
|
||||||
for i in range(len(similar_topics)):
|
for i in range(len(similar_topics)):
|
||||||
# \n{topic_model.get_topic(similar_topics[i])}\n
|
|
||||||
print(
|
print(
|
||||||
f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}"
|
f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])['CustomName'][0]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
|
||||||
|
# Calculate the topic distributions on a token-level
|
||||||
|
topic_distr, topic_token_distr = topic_model.approximate_distribution(
|
||||||
|
reviews, calculate_tokens=True, use_embedding_model=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Visualize the token-level distributions
|
||||||
|
DOC_INDEX = 6
|
||||||
|
df = topic_model.visualize_approximate_distribution(
|
||||||
|
reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
|
||||||
|
)
|
||||||
|
df
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ### Topic Hierarchy
|
# ### Topic Hierarchy
|
||||||
#
|
#
|
||||||
@@ -556,42 +574,3 @@ topic_model.visualize_topics(use_ctfidf=True)
|
|||||||
|
|
||||||
# %%
|
# %%
|
||||||
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
|
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
|
||||||
|
|
||||||
# %%
|
|
||||||
# from matplotlib import pyplot as plt
|
|
||||||
# from sklearn.manifold import TSNE
|
|
||||||
|
|
||||||
|
|
||||||
# topics = topic_model.topics_
|
|
||||||
|
|
||||||
# # Reduce dimensionality with TSNE
|
|
||||||
# tsne = TSNE(n_components=2, random_state=42)
|
|
||||||
# embeddings_2d = tsne.fit_transform(embeddings)
|
|
||||||
|
|
||||||
# # Prepare colors (assign a color to each topic)
|
|
||||||
# unique_topics = set(topics)
|
|
||||||
# colors = plt.get_cmap("tab20", len(unique_topics))
|
|
||||||
|
|
||||||
# # Plot
|
|
||||||
# plt.figure(figsize=(12, 8))
|
|
||||||
# for topic in unique_topics:
|
|
||||||
# # Select indices for the current topic
|
|
||||||
# indices = [i for i, t in enumerate(topics) if t == topic]
|
|
||||||
|
|
||||||
# # Get 2D points for these indices
|
|
||||||
# x = embeddings_2d[indices, 0]
|
|
||||||
# y = embeddings_2d[indices, 1]
|
|
||||||
|
|
||||||
# # Assign label (exclude outliers)
|
|
||||||
# label = f"Topic {topic}" if topic != -1 else "Outliers"
|
|
||||||
|
|
||||||
# # Plot with color
|
|
||||||
# plt.scatter(x, y, color=colors(topic + 1), label=label, alpha=0.5)
|
|
||||||
|
|
||||||
# plt.title("Topic Clusters in 2D Embedding Space")
|
|
||||||
# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
|
|
||||||
# plt.tight_layout()
|
|
||||||
|
|
||||||
# # Save the plot
|
|
||||||
# plt.savefig("topic_clusters.png", dpi=300, bbox_inches="tight")
|
|
||||||
# plt.show()
|
|
||||||
|
|||||||
@@ -130,3 +130,4 @@ wrapt==1.17.2
|
|||||||
spacy
|
spacy
|
||||||
nbconvert
|
nbconvert
|
||||||
jupytext
|
jupytext
|
||||||
|
datamapplot
|
||||||
|
|||||||
55665
data/intermediate/preprocessed.tab
Normal file
55665
data/intermediate/preprocessed.tab
Normal file
File diff suppressed because one or more lines are too long
Binary file not shown.
35
preprocessing/preprocess.py
Normal file
35
preprocessing/preprocess.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Reviews with fewer words than this are dropped before any further processing.
MIN_REVIEW_WORDS = 8

# dropna(): an empty review cell is read as NaN (a float), which would crash
# the str.split() call in the loop below.
reviews = (
    pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list()
)

# Remove exact duplicates. dict.fromkeys keeps first-seen order, whereas
# list(set(...)) returns a different order on every run (string hashing is
# randomised per process) and would reshuffle the generated file each time.
reviews = list(dict.fromkeys(reviews))

# Keep reviews with at least MIN_REVIEW_WORDS words; print every review that is
# dropped so the threshold can be checked by eye.
kept_reviews = []
for review in reviews:
    word_count = len(review.split())
    if word_count >= MIN_REVIEW_WORDS:
        kept_reviews.append(review)
        continue
    print(f"Short review ({word_count} words):")
    print(review)
    print("-" * 60)

reviews = kept_reviews
|
||||||
|
|
||||||
|
# Matches "<", an optional "/", one ASCII letter, then everything up to the
# next ">". Requiring a letter right after "<" leaves text such as "a < b" or
# "<3" untouched.
html_tag_pattern = re.compile(r"</?[a-zA-Z][^>]*>")


def preprocess(text):
    """Return ``text`` with HTML-like tags removed and outer whitespace stripped.

    When at least one tag is found, the unmodified text is printed first so the
    affected reviews can be inspected manually.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The text without any substring matching ``html_tag_pattern`` and
        without leading/trailing whitespace.
    """
    # subn() replaces and counts in one pass, so the text is not scanned twice
    # (once to detect tags and once more to remove them).
    cleaned, tag_count = html_tag_pattern.subn("", text)
    if tag_count:
        print("Possible HTML tag:")
        print(text)
        print("-" * 60)

    return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
# Write one cleaned review per row under a single "review" header.
# Writing through pandas instead of raw f.write() quotes any review that
# contains a tab, a line break or a quote character, so reading the file back
# with pd.read_csv(..., sep="\t") yields exactly one row per review.
pd.DataFrame({"review": [preprocess(review) for review in reviews]}).to_csv(
    "../data/intermediate/preprocessed.tab",
    sep="\t",
    index=False,
    encoding="utf-8",
)
|
||||||
1
preprocessing/requirements.txt
Normal file
1
preprocessing/requirements.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
pandas
|
||||||
Reference in New Issue
Block a user