Files
masterthesis-playground/bertopic/nb_bertopic_lowprep.py

598 lines
15 KiB
Python

# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv (3.12.3)
#     language: python
#     name: python3
# ---
# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#
# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#
# %%
import pickle
import re
import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
import spacy
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from bertopic import BERTopic
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Download the NLTK resources one after another, in the same order as before.
for _nltk_resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(_nltk_resource)
# %% [markdown]
# ### Parameters and Tracking
#
# %%
# --- Pipeline switches ------------------------------------------------------
RECREATE_MODEL = True  # fit a new BERTopic model instead of reusing one
RECREATE_REDUCED_MODEL = True  # run the similarity-based topic merging step
PROCESS_DATA = True  # preprocess the reviews instead of loading the cache
REDUCE_OUTLIERS = False  # run the outlier reduction step

# --- Data -------------------------------------------------------------------
# Number of rows to sample from the input; -1 means "use everything".
DATA_SAMPLE_SIZE = -1

# --- Vectorization (CountVectorizer) ----------------------------------------
MIN_DOCUMENT_FREQUENCY = 1
MAX_NGRAM = 3

# --- Clustering (HDBSCAN) ---------------------------------------------------
MIN_TOPIC_SIZE = 200
MIN_SAMPLES = 25

# --- Dimensionality reduction (UMAP) ----------------------------------------
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.01

# --- Topic modelling --------------------------------------------------------
TOP_N_WORDS = 10
# Forwarded to BERTopic as `nr_topics`; None skips the reduction, "auto" is
# the alternative value noted by the original author.
MAX_TOPICS = None

# Snapshot of the hyper-parameters of this run, grouped under "input".
tracking = {
    "input": dict(
        min_document_frequency=MIN_DOCUMENT_FREQUENCY,
        max_ngram=MAX_NGRAM,
        min_topic_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        n_neighbors=N_NEIGHBORS,
        n_components=N_COMPONENTS,
        min_dist=MIN_DIST,
        top_n_words=TOP_N_WORDS,
        max_topics=MAX_TOPICS,
    ),
}
# %% [markdown]
# ### Data Loading & Preprocessing
#
# %%
# Read the tab-separated review file once; only its `review` column is used.
reviews_frame = pd.read_csv("../data/original/reviews.tab", sep="\t")
if DATA_SAMPLE_SIZE != -1:
    # NOTE: the sample is unseeded, so each run draws a different subset.
    reviews_frame = reviews_frame.sample(n=DATA_SAMPLE_SIZE)

# Drop missing reviews: NaN cells are floats and would break `.split()` below.
reviews = reviews_frame.review.dropna().to_list()

# Remove duplicate reviews while keeping first-seen order. A plain `set`
# yields a hash-seed-dependent order, which changes the document order between
# runs and makes the downstream (seeded) UMAP/topic results non-reproducible.
reviews = list(dict.fromkeys(reviews))

# Keep only reviews with at least this many whitespace-separated words.
MIN_REVIEW_WORDS = 9
reviews = [review for review in reviews if len(review.split()) >= MIN_REVIEW_WORDS]
print(f"Loaded {len(reviews)} reviews")
# %%
# Literal character sequences to replace. Keys are matched verbatim (they are
# escaped before compilation), so the raw-string keys stand for backslash
# sequences that appear as *text* in a review, not for control characters.
rep = {
    r"\\n": " ",  # two backslashes followed by "n"
    r"\n": " ",  # one backslash followed by "n"
    r'\\"': "",  # two backslashes followed by a double quote
    r'"': "",  # bare double quote
}
# Longest literal first, so a longer sequence is never pre-empted by a shorter
# one that starts at the same position.
pattern = re.compile(
    "|".join(re.escape(key) for key in sorted(rep, key=len, reverse=True))
)
# The standalone word "bali" is removed; word boundaries keep words such as
# "balinese" intact instead of cutting them down to "nese".
_BALI_WORD = re.compile(r"\bbali\b")
# Whitespace is collapsed in a separate pass: as an escaped key of `rep` the
# old `\s+` entry only matched that literal text and never any whitespace.
_WHITESPACE_RUN = re.compile(r"\s+")


def preprocess(text):
    """Return a lower-cased, lightly cleaned copy of one review.

    Steps, in order: lower-case the text, replace the literal sequences listed
    in ``rep``, remove the standalone word "bali", collapse every run of
    whitespace (including real newlines) into a single space, and strip the
    ends.

    Args:
        text: The raw review text.

    Returns:
        The cleaned review text.
    """
    text = text.lower()
    text = pattern.sub(lambda match: rep[match.group(0)], text)
    text = _BALI_WORD.sub("", text)
    # Collapse last, so gaps left by the removals above are cleaned up too.
    return _WHITESPACE_RUN.sub(" ", text).strip()
# %%
# Smoke check: run the cleaner on one sample review and print the result.
sample_review = "Excellent. Definitely worth coming while in bali. Food and people were very nice.\n🌟 🤩 ⭐️ \nTrisna was our host"
print(preprocess(sample_review))
# %%
# Pickle file holding the preprocessed review list.
processed_reviews_file = "../data/intermediate/processed_texts_lowprep.pkl"

if not PROCESS_DATA:
    # Reuse the list written by an earlier run instead of preprocessing again.
    with open(processed_reviews_file, "rb") as cache_in:
        reviews = pickle.load(cache_in)
else:
    print("Processing reviews...")
    reviews = list(map(preprocess, reviews))
    # Persist the result so later runs can set PROCESS_DATA = False.
    with open(processed_reviews_file, "wb") as cache_out:
        pickle.dump(reviews, cache_out)

# Show the first entry as a quick sanity check.
print(reviews[:1])
# %% [markdown]
# ### Pre-calculate Embeddings
#
# %%
# Sentence-embedding checkpoint; the same model object is handed to BERTopic
# and reused for the embedding-based coherence score further down.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Encode all reviews once up front so later steps can reuse the vectors.
embeddings = embedding_model.encode(reviews, show_progress_bar=True)
# %% [markdown]
# ## Model Creation
#
# %% [markdown]
# ### Dimensionality Reduction (UMAP)
#
# %%
# UMAP settings, collected in one place before the model is built.
umap_settings = {
    "n_neighbors": N_NEIGHBORS,
    "n_components": N_COMPONENTS,
    "min_dist": MIN_DIST,
    "metric": "cosine",
    "low_memory": True,
    "random_state": 42,
}
umap_model = UMAP(**umap_settings)

# Low-dimensional projection of the review embeddings; it is passed to the
# document visualization later in the notebook.
reduced_embeddings = umap_model.fit_transform(embeddings)
# %% [markdown]
# ### BERTopic Model Creation
#
# %%
if not RECREATE_MODEL:
    print("Nevermind, loading existing model")
    # topic_model = BERTopic.load("bertopic/model.bertopic")
else:
    # Clustering of the reduced embeddings.
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )

    # Bag-of-n-grams vocabulary with English stop words removed.
    vectorizer_model = CountVectorizer(
        min_df=MIN_DOCUMENT_FREQUENCY,
        ngram_range=(1, MAX_NGRAM),
        stop_words=stopwords.words("english"),
    )

    # Term weighting and keyword representation for each topic.
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    representation_model = KeyBERTInspired()

    topic_model = BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,
    )

    # Fit on the reviews, reusing the embeddings computed earlier.
    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)

    # Give every topic a custom label built from its top three words.
    label_options = {
        "nr_words": 3,
        "topic_prefix": True,
        "word_length": 15,
        "separator": " - ",
    }
    topic_labels = topic_model.generate_topic_labels(**label_options)
    topic_model.set_topic_labels(topic_labels)
    # BERTopic.save(topic_model, "bertopic/model.bertopic")
# %% [markdown]
# ## Fine Tuning
#
# ### Topic Condensation
#
# %%
if RECREATE_REDUCED_MODEL:
    # Topic pairs are merged while their embedding cosine similarity exceeds
    # this value.
    SIMILARITY_THRESHOLD = 0.9
    # Pairs whose merge raised since the last successful merge. Remembering
    # them keeps the loop from retrying the same failing pair forever.
    failed_pairs = set()
    iteration = 1
    while True:
        print(f"Iteration {iteration}")
        iteration += 1

        # Skip the outlier topic's row only when topic -1 actually exists.
        # Assumes BERTopic stores that row first in `topic_embeddings_` -
        # TODO confirm for the installed version.
        outlier_offset = 1 if -1 in topic_model.topics_ else 0
        topic_vectors = np.array(topic_model.topic_embeddings_)[outlier_offset:, :]
        if topic_vectors.shape[0] < 2:
            # Fewer than two topics: there is no pair left to compare.
            print("No more topics to merge.")
            break
        similarity_matrix = cosine_similarity(topic_vectors)

        # Upper-triangle pairs (i < j) above the threshold, in row-major order.
        similar_pairs = np.argwhere(
            np.triu(similarity_matrix, k=1) > SIMILARITY_THRESHOLD
        ).tolist()
        merge_candidate = next(
            (pair for pair in map(tuple, similar_pairs) if pair not in failed_pairs),
            None,
        )
        if merge_candidate is None:
            print("No more topics to merge.")
            break

        t1, t2 = merge_candidate
        sim = similarity_matrix[t1, t2]
        try:
            t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
            t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
            print(
                f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
            )
            topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
        except Exception as e:
            # Best effort, as before: report the failure and try other pairs.
            print(f"Failed to merge {t1} and {t2}: {e}")
            failed_pairs.add(merge_candidate)
            continue

        # The model changed after a successful merge, so earlier failures no
        # longer refer to the same pairs; the next pass starts from scratch
        # with a freshly computed similarity matrix.
        failed_pairs.clear()
        topic_labels = topic_model.generate_topic_labels(
            nr_words=3,
            topic_prefix=True,
            word_length=15,
            separator=" - ",
        )
        topic_model.set_topic_labels(topic_labels)
else:
    print("Skipping topic reduction")
# %% [markdown]
# ### Outlier Reduction
#
# %%
if REDUCE_OUTLIERS:
    # Ask the model for a new topic assignment using its topic probabilities,
    # then rebuild the topic representations from that assignment.
    outlier_reduction_options = {
        "probabilities": topic_model.probabilities_,
        "threshold": 0.05,
        "strategy": "probabilities",
    }
    new_topics = topic_model.reduce_outliers(
        reviews, topic_model.topics_, **outlier_reduction_options
    )
    topic_model.update_topics(reviews, topics=new_topics)
# %% [markdown]
# ## Results
#
# ### Classification
#
# %%
# Optional export: write the documents of selected topics to batched text
# files plus one audit CSV. Disabled by default.
CLASSIFICATION = False
if CLASSIFICATION:
    import random
    from pathlib import Path

    # --- config ---
    topics_to_keep = {2, 4, 5, 9, 22, 26}
    INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
    OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
    OUTPUT_DIR = Path("../raft/corpus")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    BATCH_SIZE = 60
    MIN_CHARS = 40
    SEED = 42

    # Topic model document info (assumes the model is already fitted).
    df = topic_model.get_document_info(reviews)
    # `reviews` is a plain list, so it has no `.values`; assign it directly.
    # NOTE(review): at this point `reviews` holds the preprocessed texts, not
    # the raw file contents, despite the column name - confirm this is wanted.
    df["Original"] = list(reviews)

    # --- filter by topics and length ---
    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
    filtered["Original"] = filtered["Original"].str.strip()
    filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]

    # Save an audit CSV
    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)

    # --- deterministic shuffle + write batched corpus files ---
    total_files = 0
    total_reviews = 0
    rng = random.Random(SEED)
    for topic_val, g in filtered.groupby("Topic", sort=True):
        reviews_list = g["Original"].tolist()
        # deterministic shuffle within topic
        rng.shuffle(reviews_list)
        # Chunk into batches of up to BATCH_SIZE; stepping by BATCH_SIZE over
        # the list length never produces an empty chunk.
        for start in range(0, len(reviews_list), BATCH_SIZE):
            chunk = reviews_list[start : start + BATCH_SIZE]
            # simple header for traceability
            header = (
                f"[TOPIC] {topic_val}\n"
                + f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
            )
            lines = [header, ""]
            lines.extend(
                f"({position}) {txt}" for position, txt in enumerate(chunk, 1)
            )
            part_idx = start // BATCH_SIZE + 1
            fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
            (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
            total_files += 1
            total_reviews += len(chunk)

    # Plain messages: builtin print() does not interpret "[green]" markup.
    print(f"Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}")
    print(f"Filtered CSV saved to {OUTPUT_CSV}")
# %%
# Use the model's current probabilities rather than the `probs` returned by
# `fit_transform`: that value predates the topic merging above and would
# describe topics that no longer exist.
doc_topic_matrix = np.asarray(topic_model.probabilities_)
# Column and index names are derived from the matrix shape, so they always
# match it, whether or not an outlier topic is present.
topicnames = [f"Topic {i}" for i in range(doc_topic_matrix.shape[1])]
docnames = [f"Review {i}" for i in range(doc_topic_matrix.shape[0])]
# One row per review, one column per topic, probabilities rounded to 2 places.
df_document_topic = pd.DataFrame(
    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
)
# Column index of the highest probability for each document.
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
df_document_topic["dominant_topic"] = dominant_topic
# Styling
def color_stuff(val):
    """Return the CSS ``color`` declaration for one table cell.

    Values above 0.1 are green, values above 0.05 are orange, and everything
    else is grey.
    """
    # Thresholds are checked from highest to lowest; the first hit wins.
    for lower_bound, shade in ((0.1, "green"), (0.05, "orange")):
        if val > lower_bound:
            return f"color: {shade}"
    return "color: grey"
def make_bold(val):
    """Return the CSS ``font-weight`` declaration for one table cell.

    Values above 0.1 are rendered bold (700), all others at normal weight (400).
    """
    if val > 0.1:
        return "font-weight: 700"
    return "font-weight: 400"
# Apply Style
# Style only the probability columns: `dominant_topic` holds a topic index,
# so colouring it by probability thresholds would be meaningless.
probability_columns = [
    column for column in df_document_topic.columns if column != "dominant_topic"
]
df_document_topics = df_document_topic.head(15).style
# pandas 2.1 renamed `Styler.applymap` to `Styler.map`; use whichever exists.
if hasattr(df_document_topics, "map"):
    df_document_topics = df_document_topics.map(
        color_stuff, subset=probability_columns
    ).map(make_bold, subset=probability_columns)
else:
    df_document_topics = df_document_topics.applymap(
        color_stuff, subset=probability_columns
    ).applymap(make_bold, subset=probability_columns)
# Bare expression so the notebook renders the styled table.
df_document_topics
# %% [markdown]
# ### Document Visualization
#
# %%
from pathlib import Path

# Scatter plot of the documents on the precomputed 2-D projection.
vis = topic_model.visualize_documents(
    docs=reviews,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)
# Create the target folder first; writing into a missing directory fails.
visualization_file = "output/visualization.html"
Path(visualization_file).parent.mkdir(parents=True, exist_ok=True)
vis.write_html(visualization_file)
# Bare expression so the notebook renders the figure inline.
vis
# %% [markdown]
# ### Similarity Matrix
#
# %%
# Bare expression: as the cell's last statement, the figure returned by
# `visualize_heatmap` is what the notebook displays.
topic_model.visualize_heatmap()
# %% [markdown]
# ### Topic Info
#
# %%
# Bare expression: displays the topic info table as the cell output. The same
# table supplies the "Topic" and "CustomName" columns used elsewhere here.
topic_model.get_topic_info()
# %% [markdown]
# ### Semantic Coherence
#
# %%
# Collect the representative words of every real topic.
topic_words = []
for topic_id in topic_model.get_topic_info()["Topic"]:
    # Skip outlier topic
    if topic_id < 0:
        continue
    # Drop empty strings. NOTE(review): BERTopic representation models appear
    # to pad short topics with "" - confirm for the installed version.
    words = [word for word, _ in topic_model.get_topic(topic_id) if word]
    topic_words.append(words)

# Compute mean pairwise cosine similarity for each topic
coherence_scores = []
for words in topic_words:
    # A topic with fewer than two words has no word pair; its mean would be
    # NaN and would turn the overall average into NaN as well.
    if len(words) < 2:
        continue
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    # The strict upper triangle (k=1) already excludes self-similarity.
    pair_indices = np.triu_indices(sim_matrix.shape[0], k=1)
    coherence_scores.append(np.mean(sim_matrix[pair_indices]))

overall_coherence = np.mean(coherence_scores) if coherence_scores else float("nan")
print(len(reviews), "reviews processed")
# Count the topics actually collected, not "rows minus one", which is only
# right when an outlier row exists.
print(len(topic_words), "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
# %% [markdown]
# ### Topic Coherence
#
# %%
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
# This will most likely crash your PC
this_will_crash_your_pc_are_you_sure = False
if this_will_crash_your_pc_are_you_sure:
    # Preprocess Documents. The model's current assignment (`topics_`) is
    # used: the `topics` returned by `fit_transform` predates any topic
    # merging or outlier reduction done above.
    documents = pd.DataFrame(
        {
            "Document": reviews,
            "ID": range(len(reviews)),
            "Topic": topic_model.topics_,
        }
    )
    # One concatenated pseudo-document per topic.
    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )
    # NOTE(review): `_preprocess_text` is a private BERTopic method and may
    # change between versions.
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Build the topic word lists from scratch. Appending to the list filled
    # by the semantic-coherence cell would duplicate every topic.
    topic_words = [
        [word for word, _ in topic_model.get_topic(topic_id) if word]
        for topic_id in topic_model.get_topic_info()["Topic"]
        if topic_id >= 0  # skip outlier topic
    ]

    # %env TOKENIZERS_PARALLELISM=false
    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=tokens,
            corpus=corpus,
            dictionary=dictionary,
            coherence=measurement,
        )
        coherence_score = coherence_model.get_coherence()
        print(f"Coherence ({measurement}): {coherence_score:.4f}")
# %% [markdown]
# ### Term Search
#
# %%
# Query the model for the topics closest to a free-text search term.
search_term = "uluwatu"
similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)

# One line per hit: similarity cut to five characters, then the custom label.
for matched_topic, score in zip(similar_topics, similarities):
    custom_name = topic_model.get_topic_info(matched_topic)["CustomName"][0]
    print(f"{str(score)[:5]} {custom_name}")
# %% [markdown]
# ### Topic Hierarchy
#
# %%
# Bare expression: the figure returned by `visualize_hierarchy` is displayed
# as the cell output; `custom_labels=True` requests the custom topic labels.
topic_model.visualize_hierarchy(custom_labels=True)
# %% [markdown]
# ### Intertopic Distance Map
#
# %%
# Bare expression: the figure returned by `visualize_topics` is displayed as
# the cell output. NOTE(review): `use_ctfidf=True` presumably bases the map on
# c-TF-IDF vectors instead of topic embeddings - confirm in the BERTopic docs.
topic_model.visualize_topics(use_ctfidf=True)
# %% [markdown]
# ### Topic Word Scores
#
# %%
# Bare expression: the bar chart figure is displayed as the cell output.
# It is requested for 12 topics with 10 words each and custom labels.
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
# %%
# from matplotlib import pyplot as plt
# from sklearn.manifold import TSNE
# topics = topic_model.topics_
# # Reduce dimensionality with TSNE
# tsne = TSNE(n_components=2, random_state=42)
# embeddings_2d = tsne.fit_transform(embeddings)
# # Prepare colors (assign a color to each topic)
# unique_topics = set(topics)
# colors = plt.get_cmap("tab20", len(unique_topics))
# # Plot
# plt.figure(figsize=(12, 8))
# for topic in unique_topics:
# # Select indices for the current topic
# indices = [i for i, t in enumerate(topics) if t == topic]
# # Get 2D points for these indices
# x = embeddings_2d[indices, 0]
# y = embeddings_2d[indices, 1]
# # Assign label (exclude outliers)
# label = f"Topic {topic}" if topic != -1 else "Outliers"
# # Plot with color
# plt.scatter(x, y, color=colors(topic + 1), label=label, alpha=0.5)
# plt.title("Topic Clusters in 2D Embedding Space")
# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.tight_layout()
# # Save the plot
# plt.savefig("topic_clusters.png", dpi=300, bbox_inches="tight")
# plt.show()