Files
masterthesis-playground/top2vec/nb_top2vec.py
2025-10-20 23:06:52 +02:00

317 lines
7.3 KiB
Python

# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.18.0
# kernelspec:
# display_name: .venv
# language: python
# name: python3
# ---
# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#
# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#
# %%
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.models.coherencemodel import CoherenceModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from top2vec import Top2Vec
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import pickle
import re
import spacy
# %% [markdown]
# ### Parameters and Tracking
#
# %%
# Run-mode switches for the notebook.
# True: clean the raw reviews and cache them; False: load the cached pickle.
PROCESS_DATA = False
# True: rebuild tokens/dictionary/corpus for the gensim coherence scores and
# cache them; False: load the cached pickles.
RECALCULATE_COHERENCE_PARTS = False
# True: train a new Top2Vec model and pickle it; False: load the pickled model.
RECREATE_MODEL = True
# %% [markdown]
# ### Data Loading & Preprocessing
#
# %%
# Read the tab-separated dataset and keep only the non-missing review texts.
raw_frame = pd.read_csv("data.tab", sep="\t")
review_column = raw_frame.review.dropna()  # .sample(5_000, random_state=42)
reviews = review_column.to_list()
print("Loaded {} reviews".format(len(reviews)))
# %%
# Literal substrings (matched verbatim, NOT as regular expressions) mapped to
# the text that replaces them.
rep = {
    r"\\n": " ",  # two backslashes followed by "n"
    r"\n": " ",  # escaped newline sequence: backslash followed by "n"
    r'\\"': "",  # two backslashes followed by a double quote
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
    # NOTE(review): this also matches inside longer words
    # ("balinese" -> "nese"); confirm that this is intended.
    "bali": "",
}
# Keys are escaped because they are literals. Longest keys go first so that a
# key which is a prefix of another key can never win the alternation.
pattern = re.compile(
    "|".join(re.escape(key) for key in sorted(rep, key=len, reverse=True))
)
# Whitespace collapsing must stay a real regular expression; escaping it with
# the literal keys would make it match the text "\s+" instead of whitespace.
_WHITESPACE_RUN = re.compile(r"\s+")


def preprocess(text):
    """Return a lower-cased, cleaned copy of a single review.

    Steps, in order:
      1. lower-case the text,
      2. apply the literal replacements from ``rep`` in a single pass,
      3. collapse every run of whitespace (including real newlines and tabs)
         into one space and trim the ends.

    Whitespace is normalised last because step 2 can itself leave double or
    leading/trailing spaces behind (e.g. after removing "bali").

    Args:
        text: Raw review text.

    Returns:
        The cleaned review as a string.
    """
    text = text.lower()
    text = pattern.sub(lambda match: rep[match.group(0)], text)
    return _WHITESPACE_RUN.sub(" ", text).strip()
# %%
# Either reuse the cached cleaned reviews or clean the raw ones and cache them.
if not PROCESS_DATA:
    with open("processed_texts_top2vec.pkl", "rb") as f:
        cached_reviews = pickle.load(f)
    # Cached entries may be lists of tokens; join those back into one string.
    reviews = []
    for cached in cached_reviews:
        if isinstance(cached, list):
            cached = " ".join(cached)
        reviews.append(cached)
else:
    print("Processing reviews...")
    reviews = list(map(preprocess, reviews))
    with open("processed_texts_top2vec.pkl", "wb") as f:
        pickle.dump(reviews, f)
print("Processed {} reviews".format(len(reviews)))
print(reviews[:1])
# %% [markdown]
# ## Model Creation
#
# %%
# Train a fresh Top2Vec model (and pickle it) or load the pickled one.
model_path = "./top2vec/model.pkl"
if not RECREATE_MODEL:
    with open(model_path, "rb") as f:
        model = pickle.load(f)
else:
    # Settings handed to Top2Vec for its UMAP step.
    umap_args = dict(
        n_neighbors=15,
        n_components=2,
        min_dist=0.01,
        metric="cosine",
        random_state=42,
        low_memory=True,
    )
    # Settings handed to Top2Vec for its HDBSCAN step.
    hdbscan_args = dict(
        min_cluster_size=200,
        min_samples=25,
        metric="euclidean",
        cluster_selection_method="eom",
    )
    model = Top2Vec(
        reviews,
        min_count=1,
        workers=8,
        umap_args=umap_args,
        hdbscan_args=hdbscan_args,
    )
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
print(f"\nNumber of topics found: {model.get_num_topics()}")
# %% [markdown]
# ## Results
#
# %% [markdown]
# ### Coherence
#
# %%
# Embedding-based coherence: for every topic, embed its words and average the
# cosine similarity over all pairs of distinct words; then average over topics.
topic_words = model.get_topics()[0]
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
coherence_scores = []
for words in topic_words:
    if len(words) < 2:
        # A topic with fewer than two words has no word pairs to compare.
        continue
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    # Average over the strict upper triangle only. Zeroing the diagonal and
    # taking the mean of the full matrix would still count the n diagonal
    # entries in the denominator and scale the score by (n - 1) / n.
    pair_rows, pair_cols = np.triu_indices_from(sim_matrix, k=1)
    mean_sim = sim_matrix[pair_rows, pair_cols].mean()
    coherence_scores.append(mean_sim)
overall_coherence = np.mean(coherence_scores)
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
# %%
# %env TOKENIZERS_PARALLELISM=false
# Number of top words per topic handed to gensim's CoherenceModel (topn).
num_words = 10
# Always read the topic words from the model so that this cell does not depend
# on whatever an earlier (or re-run) cell left in `topic_words`.
topic_words = model.get_topics()[0]
if RECALCULATE_COHERENCE_PARTS:
    tqdm.pandas()
    docs = model.documents
    doc_topics, _, _, _ = model.get_documents_topics(doc_ids=list(range(len(docs))))
    df = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": doc_topics})
    # All documents of a topic joined into one string (not used further below).
    documents_per_topic = df.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    nlp.max_length = 10_000_000

    # Deliberately NOT named `preprocess`: that would rebind the review-cleaning
    # function defined earlier in this notebook with a tokenizer returning lists.
    def tokenize_document(doc):
        """Return the lower-cased alphabetic, non-stop-word tokens of `doc`."""
        return [
            token.text.lower()
            for token in nlp(doc)
            if token.is_alpha and not token.is_stop
        ]

    print(topic_words)
    print("Preprocessing topic documents...")
    tokens = df["Tokens"] = df["Document"].progress_apply(tokenize_document)
    print("Creating dictionary...")
    dictionary = corpora.Dictionary(tokens)
    print("Creating corpus...")
    corpus = [dictionary.doc2bow(token_list) for token_list in tokens]
    num_topics = len(model.topic_sizes)
    # Cache the expensive artifacts so later runs can skip the tokenization.
    with open("./top2vec/corpus.pkl", "wb") as f:
        pickle.dump(corpus, f)
    with open("./top2vec/dictionary.pkl", "wb") as f:
        pickle.dump(dictionary, f)
    with open("./top2vec/tokens.pkl", "wb") as f:
        pickle.dump(tokens, f)
else:
    # Reuse the artifacts cached by a previous run.
    with open("./top2vec/corpus.pkl", "rb") as f:
        corpus = pickle.load(f)
    with open("./top2vec/dictionary.pkl", "rb") as f:
        dictionary = pickle.load(f)
    with open("./top2vec/tokens.pkl", "rb") as f:
        tokens = pickle.load(f)
print("Starting coherence evaluation...")
# One gensim coherence score per measure, computed on the same topics/corpus.
for measure in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    cm = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=measure,
        topn=num_words,
    )
    score = cm.get_coherence()
    print(f"Coherence ({measure}): {score:.4f}")
# %% [markdown]
# ### Topic List
#
# %%
# Print every topic with its number and its words.
topics, probs, unq_num = model.get_topics()
# The loop variable is deliberately not called `topic_words`: that would
# overwrite the notebook-level list of all topics' words with a single topic.
for topic_num, words_of_topic in zip(unq_num, topics):
    print(f"Topic {topic_num}: {' | '.join(words_of_topic)}")
# %% [markdown]
# ### Search by term
#
# %%
# Show the topics most related to a keyword (at most 10).
search_term = "monkey"
print(f"\nSearching for topics related to '{search_term}':")
num_topics = min(model.get_num_topics(), 10)
# search_topics() already returns the number of each matched topic as its
# fourth value, so there is no need to re-derive it by comparing word sets
# against get_topics() (which also clobbered `unq_num` and `topic_words`).
topic_words, _, _, matched_topic_nums = model.search_topics(
    keywords=[search_term], num_topics=num_topics
)
for topic_num, words in zip(matched_topic_nums, topic_words):
    print(f"Topic {topic_num}: {' | '.join(words)}")
# %% [markdown]
# ### Search by topic ID
#
# %%
# Show the words of one topic and the 15 documents returned for it.
topic_id = 0
print(f"Topic {topic_id}:")
top_words_line = " | ".join(topics[topic_id])
print("Top words:", top_words_line)
search_result = model.search_documents_by_topic(topic_num=topic_id, num_docs=15)
docs, doc_scores, doc_ids = search_result
for rank, (doc, doc_score) in enumerate(zip(docs, doc_scores), start=1):
    print(f"Doc {rank} (Score: {doc_score:.2f}): {doc}")
# %%
import plotly.express as px
import pandas as pd
from umap import UMAP
# Get topic metadata
topic_vectors = model.topic_vectors
topic_words = model.get_topics()[0]
# get_topic_sizes() returns the sizes first and the topic numbers second;
# unpacking them the other way round would label each point with its size and
# scale each marker by its topic number.
topic_sizes, topic_nums = model.get_topic_sizes()
# Reduce vectors to 2D using UMAP
# NOTE(review): n_neighbors=15 may be larger than the number of topics when
# only a few topics were found - confirm UMAP copes with that for this model.
umap_model = UMAP(n_neighbors=15, n_components=2, metric="cosine", random_state=42)
topic_coords = umap_model.fit_transform(topic_vectors)
# Ensure all components are 1D lists
topic_nums = list(topic_nums)
topic_sizes = list(topic_sizes)
# Label each topic with its first five words
topic_labels = [" | ".join(words[:5]) for words in topic_words]
# Build DataFrame
df = pd.DataFrame(
    {
        "x": topic_coords[:, 0],
        "y": topic_coords[:, 1],
        "Topic Number": topic_nums,
        "Size": topic_sizes,
        "Top Words": topic_labels,
    }
)
# Plot using Plotly: marker size follows "Size", text label is the topic number
fig = px.scatter(
    df,
    x="x",
    y="y",
    size="Size",
    text="Topic Number",
    hover_data={"Top Words": True, "Size": True, "x": False, "y": False},
    title="Top2Vec Topic Visualization (2D)",
)
fig.update_traces(textposition="top center")
fig.show()