mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-05-08 05:25:47 +02:00
Restructure
This commit is contained in:
+386
@@ -0,0 +1,386 @@
|
||||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.18.0
|
||||
# kernelspec:
|
||||
# display_name: .venv
|
||||
# language: python
|
||||
# name: python3
|
||||
# ---
|
||||
|
||||
# %% [markdown]
|
||||
# # Topic Detection: Bali Tourist Reviews
|
||||
#
|
||||
|
||||
# %% [markdown]
|
||||
# ## Preparation
|
||||
#
|
||||
# ### Dependency Loading
|
||||
#
|
||||
|
||||
# %%
|
||||
from gensim.models import CoherenceModel
|
||||
from gensim.models import LdaModel
|
||||
from gensim.models.phrases import Phraser, Phrases
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from pprint import pprint
|
||||
import altair as alt
|
||||
import gensim.corpora as corpora
|
||||
import json
|
||||
import multiprocessing
|
||||
import nltk
|
||||
import numpy as np
|
||||
import os
|
||||
import pandas as pd
|
||||
import pickle
|
||||
import pyLDAvis
|
||||
import pyLDAvis.gensim_models as gensimvis
|
||||
import re
|
||||
import spacy
|
||||
import umap
|
||||
|
||||
# Small English spaCy pipeline; preprocess() uses it for POS and entity tags.
nlp = spacy.load("en_core_web_sm")

# The start method can only be set once per interpreter. Re-running this cell
# raises RuntimeError, which is deliberately ignored.
try:
    multiprocessing.set_start_method("spawn")
except RuntimeError:
    pass

# Fetch the NLTK resources requested by this notebook.
for nltk_resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(nltk_resource)

print("OK")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Parameters and Tracking
|
||||
#
|
||||
|
||||
# %%
|
||||
# When True, the benchmark cell trains one model per topic count (3, 4, 5)
# instead of the single 3-topic model.
RUN_BENCHMARK = False
|
||||
# When True, the final cell writes model, dictionary, corpus and topics to lda_output/.
SAVE_MODEL = True
|
||||
# When True, reviews are preprocessed again and cached in processed_texts.pkl;
# otherwise the cached pickle is loaded.
PROCESS_DATA = False
|
||||
|
||||
# %% [markdown]
|
||||
# ### Data Loading & Preprocessing
|
||||
#
|
||||
|
||||
# %%
|
||||
# Read the tab-separated dataset and keep every non-null entry of the
# "review" column as a plain Python list.
# (A subsample can be drawn before .to_list(), e.g. .sample(10_000, random_state=42).)
review_column = pd.read_csv("data.tab", sep="\t").review
reviews = review_column.dropna().to_list()
print(f"Loaded {len(reviews)} reviews.")
|
||||
|
||||
# %%
|
||||
# List of NE in Bali for NER enhancement: preprocess() keeps any token that is
# a member of this set, regardless of its POS or entity tag.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding.
with open("bali_ner.json", "r", encoding="utf-8") as f:
    bali_places = json.load(f)
bali_places_set = set(bali_places)

# Stop word definition: NLTK's English list plus domain terms and the entries
# of stopwords-en.json.
extra_stopwords = ["bali", "idr", "usd"]
stop_words = set(stopwords.words("english"))
with open("stopwords-en.json", "r", encoding="utf-8") as f:
    extra_stopwords.extend(json.load(f))

# Custom replacements, applied to the lower-cased text. The raw-string keys
# match literal backslash sequences (backslash + "n", backslash + quote) in the
# data, not actual control characters.
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
}
# Keys are regex-escaped so they can be joined into a single alternation;
# preprocess() re-escapes each match to look up its replacement.
rep = {re.escape(k): v for k, v in rep.items()}
pattern = re.compile("|".join(rep))

lemmatizer = WordNetLemmatizer()
|
||||
|
||||
|
||||
# Union of both stop word collections, built once when this cell runs so the
# per-token check in preprocess() is a single hash lookup instead of a linear
# scan of the extra_stopwords list. Re-run this cell after changing either
# collection.
_ALL_STOPWORDS = frozenset(stop_words).union(extra_stopwords)


def preprocess(text):
    """Turn one raw review into a list of lemmatized content tokens.

    The text is lower-cased, passed through the custom replacement table and
    stripped of digits and non-word characters before spaCy tags it. Tokens
    are kept when they are nouns or proper nouns, belong to a GPE/LOC/FAC
    entity, or appear in ``bali_places_set``. Stop words and tokens of two
    characters or fewer are then dropped, and the rest is lemmatized with the
    WordNet lemmatizer.

    Args:
        text: Raw review text.

    Returns:
        The lemmatized tokens, in document order.
    """
    # Step 1: Apply custom replacements (typos, special cases)
    text = text.lower()
    # The keys of `rep` are regex-escaped, so the match is re-escaped for lookup.
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    # Step 2: Clean text
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    doc = nlp(text)

    # Step 3: POS tagging and filtering
    filtered_tokens = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"}
        or token.ent_type_ in {"GPE", "LOC", "FAC"}
        or token.text in bali_places_set
    ]

    # Step 4: Lemmatization and stopword removal (the filter sees the surface
    # form, i.e. it runs before lemmatization)
    return [
        lemmatizer.lemmatize(w)
        for w in filtered_tokens
        if len(w) > 2 and w not in _ALL_STOPWORDS
    ]
|
||||
|
||||
|
||||
# %%
|
||||
# Either reuse the token lists cached by an earlier run, or rebuild them from
# the raw reviews and refresh the cache.
if not PROCESS_DATA:
    with open("processed_texts.pkl", "rb") as cache_file:
        processed_reviews = pickle.load(cache_file)
else:
    print("Processing sentences...")
    processed_reviews = list(map(preprocess, reviews))

    with open("processed_texts.pkl", "wb") as cache_file:
        pickle.dump(processed_reviews, cache_file)

# Show the first processed review as a quick sanity check.
print(processed_reviews[:1])
|
||||
|
||||
# %% [markdown]
|
||||
# ### n-gram Creation
|
||||
#
|
||||
|
||||
# %%
|
||||
# Train the bigram detector on the processed reviews (min_count=5,
# threshold=10) and wrap it in a Phraser for application.
bigram = Phrases(processed_reviews, min_count=5, threshold=10)
bigram_mod = Phraser(bigram)

# Apply the detector to every review; detected pairs become single tokens.
texts = [bigram_mod[review_tokens] for review_tokens in processed_reviews]
|
||||
|
||||
# %% [markdown]
|
||||
# ## Model Creation
|
||||
#
|
||||
|
||||
# %% [markdown]
|
||||
# ### Word Mapping & Corpus
|
||||
#
|
||||
|
||||
# %%
|
||||
# Token <-> integer id mapping, pruned with no_below=5 (minimum number of
# documents) and no_above=0.5 (maximum fraction of documents).
id2word = corpora.Dictionary(texts)
id2word.filter_extremes(no_below=5, no_above=0.5)

# Bag-of-words vector for every document.
corpus = list(map(id2word.doc2bow, texts))
|
||||
|
||||
# %% [markdown]
|
||||
# ### LDA Model Creation
|
||||
#
|
||||
|
||||
# %%
|
||||
# Hyperparameters shared by the single-model run and the benchmark, so both
# always train with identical settings; only the topic count differs.
LDA_PARAMS = {
    "corpus": corpus,
    "id2word": id2word,
    "random_state": 42,
    "update_every": 1,
    "chunksize": 100,
    "passes": 10,
    "alpha": "auto",
    "per_word_topics": True,
}

if not RUN_BENCHMARK:
    lda_model = LdaModel(num_topics=3, **LDA_PARAMS)

# %%
if RUN_BENCHMARK:
    # The visualizations are written to lda_output/, which is otherwise only
    # created by the final Save Model cell; make sure it exists first.
    os.makedirs("lda_output", exist_ok=True)

    # After the loop, lda_model refers to the model trained last (5 topics).
    for num_topics in [3, 4, 5]:
        print(f"Training LDA model with {num_topics} topics...")
        lda_model = LdaModel(num_topics=num_topics, **LDA_PARAMS)

        for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
            coherence_model_lda = CoherenceModel(
                model=lda_model,
                texts=texts,
                dictionary=id2word,
                coherence=measurement,
            )
            coherence_lda = coherence_model_lda.get_coherence()
            print(f"Coherence ({measurement}): {coherence_lda:.4f}")

        # One interactive pyLDAvis page per topic count.
        vis = gensimvis.prepare(lda_model, corpus, id2word)
        pyLDAvis.save_html(vis, f"./lda_output/lda_vis_{num_topics}_topics.html")
        print(f"Visualization saved to lda_vis_{num_topics}_topics.html")
|
||||
|
||||
# %% [markdown]
|
||||
# ## Results
|
||||
#
|
||||
# ### Topics
|
||||
#
|
||||
|
||||
# %%
|
||||
pprint(lda_model.print_topics())

# %% [markdown]
# ### Topic Coherence
#

# %%
# Report four coherence measures for the current model.
for measurement in ("c_v", "u_mass", "c_uci", "c_npmi"):
    coherence_model_lda = CoherenceModel(
        model=lda_model, texts=texts, dictionary=id2word, coherence=measurement
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence ({measurement}): {coherence_lda:.4f}")

# %% [markdown]
# ### Perplexity
#

# %%
# log_perplexity() yields a per-word likelihood bound; the perplexity reported
# here is 2 ** (-bound).
log_perplexity = lda_model.log_perplexity(corpus)
perplexity = np.exp2(-log_perplexity)

print(f"Perplexity: {perplexity:.4f}")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Visualization
|
||||
#
|
||||
|
||||
# %%
|
||||
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_vis)

# %%
VISUALIZATION_THRESHOLD = 0.35

# Dense document-topic matrix: one row per document, one column per topic id.
# gensim clamps minimum_probability to 1e-8, so a topic with a smaller
# probability is omitted from the returned list even when 0 is requested.
# Filling by topic id keeps every row at full width and every column aligned
# with its topic; omitted topics stay at 0.
doc_topic_lda = np.zeros((len(corpus), lda_model.num_topics))
for row, bow in enumerate(corpus):
    for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0):
        doc_topic_lda[row, topic_id] = prob

# Keep only documents with at least one topic at or above the threshold.
above_threshold_mask = np.any(doc_topic_lda >= VISUALIZATION_THRESHOLD, axis=1)
filtered_doc_topic = doc_topic_lda[above_threshold_mask]

# UMAP dimensionality reduction
umap_model = umap.UMAP(n_components=2, metric="hellinger")
lda_2d = umap_model.fit_transform(filtered_doc_topic)

# Assign colors by dominant topic
dominant_topics = np.argmax(filtered_doc_topic, axis=1)

# NOTE(review): assumes corpus rows line up with `reviews` by index — confirm
# this still holds when processed_texts.pkl was produced from a different load.
alt_df = pd.DataFrame(
    {
        "x": lda_2d[:, 0],
        "y": lda_2d[:, 1],
        "topic": dominant_topics.astype(str),
        "text": [reviews[i] for i in np.flatnonzero(above_threshold_mask)],
        "prob": np.max(filtered_doc_topic, axis=1),
    }
)

alt.data_transformers.disable_max_rows()
chart = (
    alt.Chart(alt_df)
    .mark_circle(size=60)
    .encode(
        x="x:Q",
        y="y:Q",
        color="topic:N",
        tooltip=[
            alt.Tooltip("topic", title="Topic"),
            alt.Tooltip("prob:Q", title="Probability", format=".2f"),
            alt.Tooltip("text", title="Document Text"),
        ],
    )
    .properties(
        width=800,
        height=600,
        title=f"Interactive LDA Visualization (Threshold ≥ {VISUALIZATION_THRESHOLD})",
    )
    .interactive()
)

chart
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic assignment
|
||||
#
|
||||
|
||||
# %%
|
||||
import json

EXPORT_THRESHOLD = 0.35

# Prepare data for JSON export: one entry per document that has at least one
# topic at or above the threshold.
output_data = []
for doc_idx, doc_probs in enumerate(doc_topic_lda):
    # Topics reaching the threshold, with probabilities rounded for export.
    significant_topics = [
        {"topic_id": int(topic_id), "probability": round(float(prob), 2)}
        for topic_id, prob in enumerate(doc_probs)
        if prob >= EXPORT_THRESHOLD
    ]

    if not significant_topics:
        continue  # Only include documents with significant topics

    output_data.append(
        {
            "document_id": int(doc_idx),
            "original_text": reviews[doc_idx],
            "topics": significant_topics,
            "dominant_topic": int(np.argmax(doc_probs)),
            "dominant_probability": round(float(np.max(doc_probs)), 2),
        }
    )

# Export to JSON. The target directory is otherwise only created by the later
# Save Model cell (and only when SAVE_MODEL is set), so ensure it exists here.
os.makedirs("lda_output", exist_ok=True)
with open("lda_output/topic_to_reviews.json", "w") as f:
    json.dump(
        {
            "metadata": {
                "threshold_used": EXPORT_THRESHOLD,
                "num_topics": lda_model.num_topics,
                "total_documents": len(output_data),
            },
            "documents": output_data,
        },
        f,
        indent=2,
    )
|
||||
|
||||
# %% [markdown]
|
||||
# ## Save Model
|
||||
#
|
||||
|
||||
# %%
|
||||
# Persist the training artifacts under lda_output/ when SAVE_MODEL is set:
# the LDA model, the dictionary, the pickled bag-of-words corpus, and a text
# file with one print_topics() entry per line.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether the topics.txt write and the final print belong to the `if` body —
# confirm against the original file.
if SAVE_MODEL:
|
||||
os.makedirs("lda_output", exist_ok=True)
|
||||
|
||||
lda_model.save("lda_output/lda_model.gensim")
|
||||
id2word.save("lda_output/lda_dictionary.gensim")
|
||||
with open("lda_output/lda_corpus.pkl", "wb") as f:
|
||||
pickle.dump(corpus, f)
|
||||
|
||||
with open("lda_output/topics.txt", "w") as f:
|
||||
for topic in lda_model.print_topics():
|
||||
f.write(f"{topic}\n")
|
||||
|
||||
print("Done!")
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,3 @@
|
||||
(0, '0.191*"temple" + 0.102*"view" + 0.079*"sunset" + 0.061*"cliff" + 0.041*"uluwatu" + 0.031*"dance" + 0.030*"kecak_dance" + 0.027*"tourist" + 0.015*"hour" + 0.013*"sun"')
|
||||
(1, '0.052*"sea" + 0.041*"ocean" + 0.038*"guide" + 0.036*"bit" + 0.033*"water" + 0.031*"location" + 0.027*"beach" + 0.025*"wave" + 0.021*"day" + 0.014*"rock"')
|
||||
(2, '0.174*"monkey" + 0.046*"time" + 0.030*"people" + 0.028*"lot" + 0.026*"visit" + 0.022*"glass" + 0.016*"sunglass" + 0.016*"photo" + 0.015*"trip" + 0.014*"day"')
|
||||
Binary file not shown.
Reference in New Issue
Block a user