# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.models.phrases import Phraser, Phrases
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pprint import pprint
import altair as alt
import gensim.corpora as corpora
import json
import multiprocessing
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import re
import spacy
import umap

nlp = spacy.load("en_core_web_sm")

try:
    multiprocessing.set_start_method("spawn")
except RuntimeError:
    pass

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

print("OK")

# %% [markdown]
# ### Parameters and Tracking
#

# %%
RUN_BENCHMARK = False  # train models for several topic counts and report coherence
SAVE_MODEL = True  # persist the final model, dictionary and corpus to lda_output/
PROCESS_DATA = False  # re-run preprocessing instead of loading the cached pickle

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
reviews = (
    pd.read_csv("data.tab", sep="\t")
    .review.dropna()
    .to_list()  # .sample(10_000, random_state=42)
)
print(f"Loaded {len(reviews)} reviews.")

# %%
# List of named entities (places) in Bali, used to augment spaCy's NER during token filtering
with open("bali_ner.json", "r") as f:
    bali_places = json.load(f)
bali_places_set = set(bali_places)

# Stop word definition (sets for O(1) membership checks during preprocessing)
extra_stopwords = {"bali", "idr", "usd"}
stop_words = set(stopwords.words("english"))
with open("stopwords-en.json", "r") as f:
    extra_stopwords.update(json.load(f))

# Custom replacements for line breaks, stray quotes and frequent typos
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))

lemmatizer = WordNetLemmatizer()


def preprocess(text):
    # Step 1: Apply custom replacements (typos, special cases)
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    # Step 2: Clean text
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    doc = nlp(text)

    # Step 3: POS tagging and filtering
    filtered_tokens = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"}
        or token.ent_type_ in {"GPE", "LOC", "FAC"}
        or token.text in bali_places_set
    ]

    # Step 4: Lemmatization and stopword removal
    lemmatized_tokens = [
        lemmatizer.lemmatize(w)
        for w in filtered_tokens
        if w not in stop_words and w not in extra_stopwords and len(w) > 2
    ]

    return lemmatized_tokens


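# %% [markdown]
# A quick sanity check of the preprocessing pipeline on a hypothetical example sentence
# (not taken from the dataset); it should show the typo replacements, noun/place filtering
# and lemmatization in action.
#

# %%
# Hypothetical input for illustration only; the exact output depends on the spaCy model.
print(preprocess("We visited the monky forest near Ubud and saw verry old temples!"))
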
# %%
if PROCESS_DATA:
    print("Processing reviews...")
    processed_reviews = [preprocess(review) for review in reviews]

    with open("processed_texts.pkl", "wb") as f:
        pickle.dump(processed_reviews, f)
else:
    with open("processed_texts.pkl", "rb") as f:
        processed_reviews = pickle.load(f)

print(processed_reviews[:1])

# %% [markdown]
# ### n-gram Creation
#

# %%
bigram = Phrases(processed_reviews, min_count=5, threshold=10)
bigram_mod = Phraser(bigram)
texts = [bigram_mod[doc] for doc in processed_reviews]

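# %% [markdown]
# A small check of the phrase detection (sketch): gensim's `Phrases` joins detected bigrams
# with an underscore by default, so underscore tokens give a rough view of what was merged.
#

# %%
# Collect a few merged bigram tokens; relies only on the default "_" delimiter.
bigram_tokens = {token for doc in texts for token in doc if "_" in token}
print(sorted(bigram_tokens)[:20])
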
# %% [markdown]
# ## Model Creation
#

# %% [markdown]
# ### Word Mapping & Corpus
#

# %%
id2word = corpora.Dictionary(texts)
id2word.filter_extremes(no_below=5, no_above=0.5)
corpus = [id2word.doc2bow(text) for text in texts]

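# %% [markdown]
# A quick look at the filtered vocabulary and one bag-of-words vector (sketch; the exact
# ids and counts depend on the data).
#

# %%
print(f"Vocabulary size after filtering: {len(id2word)}")
# Map the first document's bag-of-words entries back to readable tokens.
print([(id2word[token_id], count) for token_id, count in corpus[0][:10]])
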
# %% [markdown]
# ### LDA Model Creation
#

# %%
if not RUN_BENCHMARK:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=3,
        random_state=42,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha="auto",
        per_word_topics=True,
    )

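# %% [markdown]
# With `alpha="auto"` the document-topic prior is learned from the data rather than fixed;
# a quick peek at the fitted values (sketch).
#

# %%
print("Learned document-topic prior (alpha):", lda_model.alpha)
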
# %%
if RUN_BENCHMARK:
    os.makedirs("lda_output", exist_ok=True)  # ensure the output directory exists for the HTML exports

    for num_topics in [3, 4, 5]:
        print(f"Training LDA model with {num_topics} topics...")
        lda_model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            random_state=42,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha="auto",
            per_word_topics=True,
        )

        for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
            coherence_model_lda = CoherenceModel(
                model=lda_model,
                texts=texts,
                dictionary=id2word,
                coherence=measurement,
            )
            coherence_lda = coherence_model_lda.get_coherence()
            print(f"Coherence ({measurement}): {coherence_lda:.4f}")

        vis = gensimvis.prepare(lda_model, corpus, id2word)
        pyLDAvis.save_html(vis, f"./lda_output/lda_vis_{num_topics}_topics.html")
        print(f"Visualization saved to lda_output/lda_vis_{num_topics}_topics.html")

# %% [markdown]
# ## Results
#
# ### Topics
#

# %%
pprint(lda_model.print_topics())

# %% [markdown]
# ### Topic Coherence
#

# %%
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=id2word,
        coherence=measurement,
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence ({measurement}): {coherence_lda:.4f}")

# %% [markdown]
# ### Perplexity
#

# %%
# gensim's log_perplexity() returns a per-word likelihood bound in log base 2,
# so perplexity is recovered as 2^(-bound).
log_perplexity = lda_model.log_perplexity(corpus)
perplexity = np.exp2(-log_perplexity)

print(f"Perplexity: {perplexity:.4f}")

# %% [markdown]
# ### Topic Visualization
#

# %%
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_vis)

# %%
VISUALIZATION_THRESHOLD = 0.35

doc_topic_lda = [
    lda_model.get_document_topics(doc, minimum_probability=0) for doc in corpus
]
doc_topic_lda = np.array([[prob for (_, prob) in doc] for doc in doc_topic_lda])

above_threshold_mask = np.any(doc_topic_lda >= VISUALIZATION_THRESHOLD, axis=1)

filtered_doc_topic = doc_topic_lda[above_threshold_mask]

# UMAP dimensionality reduction
umap_model = umap.UMAP(n_components=2, metric="hellinger")
lda_2d = umap_model.fit_transform(filtered_doc_topic)

# Assign colors by dominant topic
dominant_topics = np.argmax(filtered_doc_topic, axis=1)

alt_df = pd.DataFrame(
    {
        "x": lda_2d[:, 0],
        "y": lda_2d[:, 1],
        "topic": dominant_topics.astype(str),
        "text": [reviews[i] for i in np.where(above_threshold_mask)[0]],
        "prob": np.max(filtered_doc_topic, axis=1),
    }
)

alt.data_transformers.disable_max_rows()
chart = (
    alt.Chart(alt_df)
    .mark_circle(size=60)
    .encode(
        x="x:Q",
        y="y:Q",
        color="topic:N",
        tooltip=[
            alt.Tooltip("topic", title="Topic"),
            alt.Tooltip("prob:Q", title="Probability", format=".2f"),
            alt.Tooltip("text", title="Document Text"),
        ],
    )
    .properties(
        width=800,
        height=600,
        title=f"Interactive LDA Visualization (Threshold ≥ {VISUALIZATION_THRESHOLD})",
    )
    .interactive()
)

chart

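# %% [markdown]
# Optionally persist the interactive chart alongside the other outputs (sketch; the target
# directory and file name are assumptions, not part of the original pipeline).
#

# %%
os.makedirs("lda_output", exist_ok=True)
chart.save("lda_output/umap_topics.html")
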
# %% [markdown]
# ### Topic Assignment
#

# %%
EXPORT_THRESHOLD = 0.35

# Prepare data for JSON export
output_data = []
for doc_idx, doc_probs in enumerate(doc_topic_lda):
    # Get topics above threshold for this document
    significant_topics = [
        {"topic_id": int(topic_id), "probability": float(prob)}
        for topic_id, prob in enumerate(doc_probs)
        if prob >= EXPORT_THRESHOLD
    ]

    if significant_topics:  # Only include documents with significant topics
        output_data.append(
            {
                "document_id": int(doc_idx),
                "original_text": reviews[doc_idx],
                "topics": [
                    {
                        "topic_id": t["topic_id"],
                        "probability": round(t["probability"], 2),
                    }
                    for t in significant_topics
                ],
                "dominant_topic": int(np.argmax(doc_probs)),
                "dominant_probability": round(float(np.max(doc_probs)), 2),
            }
        )

# Export to JSON
os.makedirs("lda_output", exist_ok=True)
with open("lda_output/topic_to_reviews.json", "w") as f:
    json.dump(
        {
            "metadata": {
                "threshold_used": EXPORT_THRESHOLD,
                "num_topics": lda_model.num_topics,
                "total_documents": len(output_data),
            },
            "documents": output_data,
        },
        f,
        indent=2,
    )

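# %% [markdown]
# Quick verification of the export (sketch): reload the JSON that was just written and count
# documents per dominant topic.
#

# %%
with open("lda_output/topic_to_reviews.json", "r") as f:
    exported = json.load(f)
print(pd.Series([d["dominant_topic"] for d in exported["documents"]]).value_counts())
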
# %% [markdown]
# ## Save Model
#

# %%
if SAVE_MODEL:
    os.makedirs("lda_output", exist_ok=True)

    lda_model.save("lda_output/lda_model.gensim")
    id2word.save("lda_output/lda_dictionary.gensim")
    with open("lda_output/lda_corpus.pkl", "wb") as f:
        pickle.dump(corpus, f)

    with open("lda_output/topics.txt", "w") as f:
        for topic in lda_model.print_topics():
            f.write(f"{topic}\n")

print("Done!")
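# %% [markdown]
# For later sessions, the persisted artifacts can be reloaded as shown below (sketch; assumes
# `SAVE_MODEL` was enabled so the files written above actually exist).
#

# %%
if SAVE_MODEL:
    reloaded_model = LdaModel.load("lda_output/lda_model.gensim")
    reloaded_dict = corpora.Dictionary.load("lda_output/lda_dictionary.gensim")
    with open("lda_output/lda_corpus.pkl", "rb") as f:
        reloaded_corpus = pickle.load(f)
    print(reloaded_model.num_topics, len(reloaded_dict), len(reloaded_corpus))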