mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2025-12-07 02:30:50 +01:00
Restructure
316 top2vec/nb_top2vec.py Normal file
@@ -0,0 +1,316 @@
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
import pickle
import re

import numpy as np
import pandas as pd
import spacy
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from top2vec import Top2Vec
from tqdm.notebook import tqdm


# %% [markdown]
# ### Parameters and Tracking
#

# %%
PROCESS_DATA = False  # True: re-run text cleaning; False: load cached pickle
RECALCULATE_COHERENCE_PARTS = False  # True: rebuild tokens/dictionary/corpus for coherence
RECREATE_MODEL = True  # True: train Top2Vec from scratch; False: load pickled model

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
reviews = (
    pd.read_csv("data.tab", sep="\t").review.dropna().to_list()
)  # .sample(5_000, random_state=42)

print(f"Loaded {len(reviews)} reviews")

# %%
# Literal cleanup rules: escaped newlines and quotes left over from the raw
# export, frequent misspellings, and "bali" itself (too common to be
# informative for topic separation).
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
    "bali": "",
}
rep = {re.escape(k): v for k, v in rep.items()}
pattern = re.compile("|".join(rep.keys()))


def preprocess(text):
    text = text.strip()
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
    # Collapse whitespace runs separately: inside the re.escape()d table
    # above, a key like r"\s+" would only ever match the literal characters
    # "\s+", never actual whitespace.
    text = re.sub(r"\s+", " ", text)
    return text


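# %% [markdown]
# Quick sanity check of the cleaning rules on a made-up review fragment
# (the sample string is illustrative, not from the dataset):

# %%
print(preprocess('Verry nice!\\n We saw a mongkey near "the" temple'))
# expected: 'very nice! we saw a monkey near the temple'
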
# %%
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]

    with open("processed_texts_top2vec.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("processed_texts_top2vec.pkl", "rb") as f:
        reviews = pickle.load(f)
    # Older caches stored token lists; join them back into plain strings.
    reviews = [
        " ".join(review) if isinstance(review, list) else review
        for review in reviews
    ]

print(f"Processed {len(reviews)} reviews")
print(reviews[:1])

# %% [markdown]
# ## Model Creation
#

# %%
if RECREATE_MODEL:
    hdbscan_args = {
        "min_cluster_size": 200,  # smallest document cluster accepted as a topic
        "min_samples": 25,  # higher values label more documents as noise
        "metric": "euclidean",
        "cluster_selection_method": "eom",
    }
    umap_args = {
        "n_neighbors": 15,
        "n_components": 2,  # 2-D embedding (Top2Vec's default is 5)
        "min_dist": 0.01,
        "metric": "cosine",
        "random_state": 42,  # fixed seed for reproducible clustering
        "low_memory": True,
    }

    model = Top2Vec(
        reviews,
        workers=8,
        hdbscan_args=hdbscan_args,
        umap_args=umap_args,
        min_count=1,  # keep even words that occur only once
    )

    with open("./top2vec/model.pkl", "wb") as f:
        pickle.dump(model, f)
else:
    with open("./top2vec/model.pkl", "rb") as f:
        model = pickle.load(f)

print(f"\nNumber of topics found: {model.get_num_topics()}")

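# %% [markdown]
# If the clustering yields more topics than are practical to interpret,
# Top2Vec can merge them after training via hierarchical topic reduction
# (optional sketch; the target of 20 topics is an arbitrary example):

# %%
# model.hierarchical_topic_reduction(num_topics=20)
# print(f"Reduced topics: {model.get_num_topics(reduced=True)}")
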
# %% [markdown]
# ## Results
#

# %% [markdown]
# ### Coherence
#

# %%
topic_words = model.get_topics()[0]
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    # Mean pairwise similarity over distinct word pairs: zero the diagonal
    # and divide by n * (n - 1) rather than n**2, so the self-similarities
    # neither inflate nor dilute the score.
    np.fill_diagonal(sim_matrix, 0)
    mean_sim = sim_matrix.sum() / (len(words) * (len(words) - 1))
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

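# %% [markdown]
# For context, the same similarity measure over ten random topic-vocabulary
# words gives a rough lower baseline for the score above (sketch; the seed
# and sample size are arbitrary choices):

# %%
rng = np.random.default_rng(42)
all_words = np.unique(np.concatenate(topic_words))
random_words = rng.choice(all_words, size=10, replace=False)
baseline_emb = embedding_model.encode(list(random_words))
baseline_sim = cosine_similarity(baseline_emb)
np.fill_diagonal(baseline_sim, 0)
baseline = baseline_sim.sum() / (len(random_words) * (len(random_words) - 1))
print(f"Random-word baseline coherence: {baseline:.4f}")
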
# %%
# %env TOKENIZERS_PARALLELISM=false
num_words = 10

if RECALCULATE_COHERENCE_PARTS:
    tqdm.pandas()

    docs = model.documents
    doc_topics, _, _, _ = model.get_documents_topics(doc_ids=list(range(len(docs))))

    df = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": doc_topics})

    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    nlp.max_length = 10_000_000

    # Named tokenize (not preprocess) so it does not shadow the cleaning
    # function defined above.
    def tokenize(doc):
        return [
            token.text.lower()
            for token in nlp(doc)
            if token.is_alpha and not token.is_stop
        ]

    topic_words = model.get_topics()[0]
    print(topic_words)

    print("Preprocessing topic documents...")
    df["Tokens"] = df["Document"].progress_apply(tokenize)
    tokens = df["Tokens"]

    print("Creating dictionary...")
    dictionary = corpora.Dictionary(tokens)
    print("Creating corpus...")
    corpus = [dictionary.doc2bow(token_list) for token_list in tokens]

    with open("./top2vec/corpus.pkl", "wb") as f:
        pickle.dump(corpus, f)
    with open("./top2vec/dictionary.pkl", "wb") as f:
        pickle.dump(dictionary, f)
    with open("./top2vec/tokens.pkl", "wb") as f:
        pickle.dump(tokens, f)
else:
    with open("./top2vec/corpus.pkl", "rb") as f:
        corpus = pickle.load(f)
    with open("./top2vec/dictionary.pkl", "rb") as f:
        dictionary = pickle.load(f)
    with open("./top2vec/tokens.pkl", "rb") as f:
        tokens = pickle.load(f)

print("Starting coherence evaluation...")
# NOTE: assumes every scored topic word survives the token filter above;
# gensim raises if a topic word is missing from the dictionary.
for measure in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    cm = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=measure,
        topn=num_words,
    )
    score = cm.get_coherence()
    print(f"Coherence ({measure}): {score:.4f}")

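# %% [markdown]
# `CoherenceModel` can also report a score per topic, which helps to spot
# individual weak topics (sketch; reuses `cm`, i.e. the last instance from
# the loop above, so the values are c_npmi scores):

# %%
per_topic = cm.get_coherence_per_topic()
for i, score in sorted(enumerate(per_topic), key=lambda t: t[1])[:5]:
    print(f"Topic {i}: {score:.4f}")
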
# %% [markdown]
# ### Topic List
#

# %%
# get_topics returns (topic_words, word_scores, topic_nums)
topics, word_scores, topic_nums = model.get_topics()

for i, topic_words in enumerate(topics):
    print(f"Topic {topic_nums[i]}: {' | '.join(topic_words)}")

# %% [markdown]
# ### Search by term
#

# %%
search_term = "monkey"

print(f"\nSearching for topics related to '{search_term}':")
num_topics = min(model.get_num_topics(), 10)
# search_topics already returns the matching topic numbers as its fourth
# value, so they need not be re-derived by subset-matching each result
# against get_topics().
topic_words, _, topic_scores, found_topic_nums = model.search_topics(
    keywords=[search_term], num_topics=num_topics
)

for words, topic_num in zip(topic_words, found_topic_nums):
    print(f"Topic {topic_num}: {' | '.join(words)}")

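# %% [markdown]
# Top2Vec can also return the nearest neighbours of the query in its word
# vector space, which shows what the model treats as semantically close to
# the search term (sketch using `Top2Vec.similar_words`):

# %%
words, word_sims = model.similar_words(keywords=[search_term], num_words=10)
for word, sim in zip(words, word_sims):
    print(f"{word}: {sim:.3f}")
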
# %% [markdown]
# ### Search by topic ID
#

# %%
topic_id = 0

print(f"Topic {topic_id}:")
print("Top words:", " | ".join(topics[topic_id]))

docs, doc_scores, doc_ids = model.search_documents_by_topic(
    topic_num=topic_id, num_docs=15
)
for i, doc in enumerate(docs):
    print(f"Doc {i+1} (Score: {doc_scores[i]:.2f}): {doc}")

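# %% [markdown]
# Documents can likewise be retrieved by keyword instead of topic number
# (sketch using `Top2Vec.search_documents_by_keywords`; output truncated to
# 120 characters for readability):

# %%
kw_docs, kw_scores, kw_ids = model.search_documents_by_keywords(
    keywords=[search_term], num_docs=5
)
for i, doc in enumerate(kw_docs):
    print(f"Doc {kw_ids[i]} (Score: {kw_scores[i]:.2f}): {doc[:120]}")
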
# %% [markdown]
# ### Topic Map
#

# %%
import plotly.express as px
from umap import UMAP

# Get topic metadata
topic_vectors = model.topic_vectors
topic_words = model.get_topics()[0]
# get_topic_sizes returns sizes first, then topic numbers
topic_sizes, topic_nums = model.get_topic_sizes()

# Reduce topic vectors to 2D using UMAP
# (UMAP truncates n_neighbors itself if there are fewer than 15 topics)
umap_model = UMAP(n_neighbors=15, n_components=2, metric="cosine", random_state=42)
topic_coords = umap_model.fit_transform(topic_vectors)

# Ensure all components are 1D lists
topic_nums = list(topic_nums)
topic_sizes = list(topic_sizes)
topic_labels = [" | ".join(words[:5]) for words in topic_words]

# Build DataFrame
df = pd.DataFrame(
    {
        "x": topic_coords[:, 0],
        "y": topic_coords[:, 1],
        "Topic Number": topic_nums,
        "Size": topic_sizes,
        "Top Words": topic_labels,
    }
)

# Plot using Plotly
fig = px.scatter(
    df,
    x="x",
    y="y",
    size="Size",
    text="Topic Number",
    hover_data={"Top Words": True, "Size": True, "x": False, "y": False},
    title="Top2Vec Topic Visualization (2D)",
)
fig.update_traces(textposition="top center")
fig.show()