# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
import pickle
import re

import numpy as np
import pandas as pd
import spacy
from gensim import corpora
from gensim.models import CoherenceModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from top2vec import Top2Vec
from tqdm.notebook import tqdm

# %% [markdown]
# ### Parameters and Tracking
#

# %%
PROCESS_DATA = False  # True: re-run text cleanup and cache it; False: load the cached pickle
RECALCULATE_COHERENCE_PARTS = False  # True: rebuild tokens/dictionary/corpus for gensim coherence
RECREATE_MODEL = True  # True: train a new Top2Vec model; False: load the cached model

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
reviews = (
    pd.read_csv("data.tab", sep="\t").review.dropna().to_list()
)  # .sample(5_000, random_state=42)

print("Loaded {} reviews".format(len(reviews)))

# %%
# Literal string replacements: escaped newlines/quotes and common misspellings.
# All keys are matched literally (re.escape), so whitespace is collapsed in a
# separate step below rather than via an r"\s+" entry that would never match.
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
    "bali": "",
}
rep = {re.escape(k): v for k, v in rep.items()}
pattern = re.compile("|".join(rep.keys()))


def preprocess(text):
    text = text.strip().lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
    text = re.sub(r"\s+", " ", text).strip()  # collapse runs of whitespace
    return text

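# %% [markdown]
# Quick sanity check of the cleanup on a made-up review. The example string is
# purely illustrative (not from the dataset); it just exercises the misspelling
# fixes, the "bali" removal, and the whitespace collapsing.

# %%
# Hypothetical input string for demonstration only
print(preprocess('The mongkey forest in Bali was verry nice!\\nWould go again'))
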
# %%
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]

    with open("processed_texts_top2vec.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("processed_texts_top2vec.pkl", "rb") as f:
        reviews = pickle.load(f)
    reviews = [
        " ".join(review) if isinstance(review, list) else review
        for review in reviews
    ]

print("Processed {} reviews".format(len(reviews)))
print(reviews[:1])

# %% [markdown]
# ## Model Creation
#

# %%
if RECREATE_MODEL:
    hdbscan_args = {
        "min_cluster_size": 200,
        "min_samples": 25,
        "metric": "euclidean",
        "cluster_selection_method": "eom",
    }
    umap_args = {
        "n_neighbors": 15,
        "n_components": 2,
        "min_dist": 0.01,
        "metric": "cosine",
        "random_state": 42,
        "low_memory": True,
    }

    model = Top2Vec(
        reviews,
        workers=8,
        hdbscan_args=hdbscan_args,
        umap_args=umap_args,
        min_count=1,
    )

    with open("./top2vec/model.pkl", "wb") as f:
        pickle.dump(model, f)
else:
    with open("./top2vec/model.pkl", "rb") as f:
        model = pickle.load(f)

print(f"\nNumber of topics found: {model.get_num_topics()}")

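# %% [markdown]
# As a quick check of the clustering granularity, the document count per topic
# can be listed. This is a sketch; it assumes `get_topic_sizes` returns sizes
# first and topic numbers second, as in the Top2Vec documentation.

# %%
topic_sizes_check, topic_nums_check = model.get_topic_sizes()
for num, size in zip(topic_nums_check, topic_sizes_check):
    print(f"Topic {num}: {size} documents")
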
# %% [markdown]
# ## Results
#

# %% [markdown]
# ### Coherence
#

# %%
# Embedding-based coherence: average pairwise cosine similarity between the
# SBERT embeddings of each topic's top words (diagonal self-similarities are
# zeroed before averaging).
topic_words = model.get_topics()[0]
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    np.fill_diagonal(sim_matrix, 0)
    mean_sim = np.mean(sim_matrix)
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

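# %% [markdown]
# A per-topic breakdown of the embedding-based coherence helps spot weak
# clusters. Sketch reusing the scores computed above; topics are indexed in the
# order returned by `get_topics()`.

# %%
for num, (words, score) in enumerate(zip(topic_words, coherence_scores)):
    print(f"Topic {num}: {score:.4f} ({' | '.join(words[:5])})")
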
# %%
# %env TOKENIZERS_PARALLELISM=false
num_words = 10

if RECALCULATE_COHERENCE_PARTS:
    tqdm.pandas()

    docs = model.documents
    doc_topics, _, _, _ = model.get_documents_topics(doc_ids=list(range(len(docs))))

    df = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": doc_topics})

    documents_per_topic = df.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )

    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    nlp.max_length = 10_000_000

    def tokenize(doc):
        # Lowercased alphabetic, non-stopword tokens for the gensim coherence measures
        return [
            token.text.lower()
            for token in nlp(doc)
            if token.is_alpha and not token.is_stop
        ]

    topic_words = model.get_topics()[0]
    print(topic_words)

    print("Preprocessing topic documents...")
    tokens = df["Tokens"] = df["Document"].progress_apply(tokenize)

    print("Creating dictionary...")
    dictionary = corpora.Dictionary(tokens)
    print("Creating corpus...")
    corpus = [dictionary.doc2bow(token_list) for token_list in tokens]

    num_topics = len(model.topic_sizes)

    with open("./top2vec/corpus.pkl", "wb") as f:
        pickle.dump(corpus, f)
    with open("./top2vec/dictionary.pkl", "wb") as f:
        pickle.dump(dictionary, f)
    with open("./top2vec/tokens.pkl", "wb") as f:
        pickle.dump(tokens, f)
else:
    with open("./top2vec/corpus.pkl", "rb") as f:
        corpus = pickle.load(f)
    with open("./top2vec/dictionary.pkl", "rb") as f:
        dictionary = pickle.load(f)
    with open("./top2vec/tokens.pkl", "rb") as f:
        tokens = pickle.load(f)

print("Starting coherence evaluation...")
for measure in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    cm = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=measure,
        topn=num_words,
    )
    score = cm.get_coherence()
    print(f"Coherence ({measure}): {score:.4f}")

# %% [markdown]
# ### Topic List
#

# %%
topics, word_scores, topic_nums = model.get_topics()

for i, words in enumerate(topics):
    print(f"Topic {topic_nums[i]}: {' | '.join(words)}")

# %% [markdown]
# ### Search by term
#

# %%
search_term = "monkey"

print(f"\nSearching for topics related to '{search_term}':")
num_topics = min(model.get_num_topics(), 10)

# search_topics already returns the matching topic numbers, so there is no need
# to re-derive them by comparing word sets against get_topics()
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(
    keywords=[search_term], num_topics=num_topics
)

for words, num in zip(topic_words, topic_nums):
    print(f"Topic {num}: {' | '.join(words)}")

# %% [markdown]
# ### Search by topic ID
#

# %%
topic_id = 0

print(f"Topic {topic_id}:")
print("Top words:", " | ".join(topics[topic_id]))

docs, doc_scores, doc_ids = model.search_documents_by_topic(
    topic_num=topic_id, num_docs=15
)
for i, doc in enumerate(docs):
    print(f"Doc {i+1} (Score: {doc_scores[i]:.2f}): {doc}")

# %%
import plotly.express as px
from umap import UMAP

# Get topic metadata
topic_vectors = model.topic_vectors
topic_words = model.get_topics()[0]
topic_sizes, topic_nums = model.get_topic_sizes()  # sizes first, then topic numbers

# Reduce vectors to 2D using UMAP
umap_model = UMAP(n_neighbors=15, n_components=2, metric="cosine", random_state=42)
topic_coords = umap_model.fit_transform(topic_vectors)

# Ensure all components are 1D lists
topic_nums = list(topic_nums)
topic_sizes = list(topic_sizes)
topic_labels = [" | ".join(words[:5]) for words in topic_words]

# Build DataFrame
df = pd.DataFrame(
    {
        "x": topic_coords[:, 0],
        "y": topic_coords[:, 1],
        "Topic Number": topic_nums,
        "Size": topic_sizes,
        "Top Words": topic_labels,
    }
)

# Plot using Plotly
fig = px.scatter(
    df,
    x="x",
    y="y",
    size="Size",
    text="Topic Number",
    hover_data={"Top Words": True, "Size": True, "x": False, "y": False},
    title="Top2Vec Topic Visualization (2D)",
)
fig.update_traces(textposition="top center")
fig.show()