masterthesis-playground/lda/nb_lda.py

# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.models.phrases import Phraser, Phrases
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pprint import pprint
import altair as alt
import gensim.corpora as corpora
import json
import multiprocessing
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import re
import spacy
import umap

nlp = spacy.load("en_core_web_sm")

try:
    multiprocessing.set_start_method("spawn")
except RuntimeError:
    pass

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

print("OK")

# %% [markdown]
# ### Parameters and Tracking
#

# %%
RUN_BENCHMARK = False
SAVE_MODEL = True
PROCESS_DATA = False

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
reviews = (
    pd.read_csv("data.tab", sep="\t")
    .review.dropna()
    .to_list()  # .sample(10_000, random_state=42)
)
print(f"Loaded {len(reviews)} reviews.")

# %%
# List of NE in Bali for NER enhancement
with open("bali_ner.json", "r") as f:
    bali_places = json.load(f)
bali_places_set = set(bali_places)

# Stop word definition
extra_stopwords = ["bali", "idr", "usd"]
stop_words = set(stopwords.words("english"))
with open("stopwords-en.json", "r") as f:
    extra_stopwords.extend(json.load(f))

# Custom replacements
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))

lemmatizer = WordNetLemmatizer()


def preprocess(text):
    # Step 1: Apply custom replacements (typos, special cases)
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    # Step 2: Clean text
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    doc = nlp(text)

    # Step 3: POS tagging and filtering
    filtered_tokens = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"}
        or token.ent_type_ in {"GPE", "LOC", "FAC"}
        or token.text in bali_places_set
    ]

    # Step 4: Lemmatization and stopword removal
    lemmatized_tokens = [
        lemmatizer.lemmatize(w)
        for w in filtered_tokens
        if w not in stop_words and w not in extra_stopwords and len(w) > 2
    ]

    return lemmatized_tokens


# %%
if PROCESS_DATA:
    print("Processing sentences...")
    processed_reviews = [preprocess(review) for review in reviews]

    with open("processed_texts.pkl", "wb") as f:
        pickle.dump(processed_reviews, f)
else:
    with open("processed_texts.pkl", "rb") as f:
        processed_reviews = pickle.load(f)

print(processed_reviews[:1])

# %% [markdown]
# ### n-gram Creation
#

# %%
bigram = Phrases(processed_reviews, min_count=5, threshold=10)
bigram_mod = Phraser(bigram)
texts = [bigram_mod[doc] for doc in processed_reviews]

# %% [markdown]
# ## Model Creation
#

# %% [markdown]
# ### Word Mapping & Corpus
#

# %%
id2word = corpora.Dictionary(texts)
id2word.filter_extremes(no_below=5, no_above=0.5)
corpus = [id2word.doc2bow(text) for text in texts]

# %% [markdown]
# ### LDA Model Creation
#

# %%
if not RUN_BENCHMARK:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=3,
        random_state=42,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha="auto",
        per_word_topics=True,
    )

# %%
if RUN_BENCHMARK:
    for num_topics in [3, 4, 5]:
        print(f"Training LDA model with {num_topics} topics...")
        lda_model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            random_state=42,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha="auto",
            per_word_topics=True,
        )

        for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
            coherence_model_lda = CoherenceModel(
                model=lda_model,
                texts=texts,
                dictionary=id2word,
                coherence=measurement,
            )
            coherence_lda = coherence_model_lda.get_coherence()
            print(f"Coherence ({measurement}): {coherence_lda:.4f}")

        vis = gensimvis.prepare(lda_model, corpus, id2word)
        pyLDAvis.save_html(vis, f"./lda_output/lda_vis_{num_topics}_topics.html")
        print(f"Visualization saved to lda_vis_{num_topics}_topics.html")

# %% [markdown]
# ## Results
#
# ### Topics
#

# %%
pprint(lda_model.print_topics())

# %% [markdown]
# ### Topic Coherence
#

# %%
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=id2word,
        coherence=measurement,
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence ({measurement}): {coherence_lda:.4f}")

# %% [markdown]
# ### Perplexity
#

# %%
log_perplexity = lda_model.log_perplexity(corpus)
perplexity = np.exp2(-log_perplexity)

print(f"Perplexity: {perplexity:.4f}")

# %% [markdown]
# ### Topic Visualization
#

# %%
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_vis)

# %%
VISUALIZATION_THRESHOLD = 0.35

doc_topic_lda = [
    lda_model.get_document_topics(doc, minimum_probability=0) for doc in corpus
]
doc_topic_lda = np.array([[prob for (_, prob) in doc] for doc in doc_topic_lda])

above_threshold_mask = np.any(doc_topic_lda >= VISUALIZATION_THRESHOLD, axis=1)

filtered_doc_topic = doc_topic_lda[above_threshold_mask]

# UMAP dimensionality reduction
umap_model = umap.UMAP(n_components=2, metric="hellinger")
lda_2d = umap_model.fit_transform(filtered_doc_topic)

# Assign colors by dominant topic
dominant_topics = np.argmax(filtered_doc_topic, axis=1)

alt_df = pd.DataFrame(
    {
        "x": lda_2d[:, 0],
        "y": lda_2d[:, 1],
        "topic": dominant_topics.astype(str),
        "text": [reviews[i] for i in np.where(above_threshold_mask)[0]],
        "prob": np.max(filtered_doc_topic, axis=1),
    }
)

alt.data_transformers.disable_max_rows()
chart = (
    alt.Chart(alt_df)
    .mark_circle(size=60)
    .encode(
        x="x:Q",
        y="y:Q",
        color="topic:N",
        tooltip=[
            alt.Tooltip("topic", title="Topic"),
            alt.Tooltip("prob:Q", title="Probability", format=".2f"),
            alt.Tooltip("text", title="Document Text"),
        ],
    )
    .properties(
        width=800,
        height=600,
        title=f"Interactive LDA Visualization (Threshold ≥ {VISUALIZATION_THRESHOLD})",
    )
    .interactive()
)

chart

# %% [markdown]
# ### Topic assignment
#

# %%
import json

EXPORT_THRESHOLD = 0.35

# Prepare data for JSON export
output_data = []
for doc_idx, doc_probs in enumerate(doc_topic_lda):
    # Get topics above threshold for this document
    significant_topics = [
        {"topic_id": int(topic_id), "probability": float(prob)}
        for topic_id, prob in enumerate(doc_probs)
        if prob >= EXPORT_THRESHOLD
    ]

    if significant_topics:  # Only include documents with significant topics
        output_data.append(
            {
                "document_id": int(doc_idx),
                "original_text": reviews[doc_idx],
                "topics": [
                    {
                        "topic_id": t["topic_id"],
                        "probability": round(t["probability"], 2),
                    }
                    for t in significant_topics
                ],
                "dominant_topic": int(np.argmax(doc_probs)),
                "dominant_probability": round(float(np.max(doc_probs)), 2),
            }
        )

# Export to JSON
with open("lda_output/topic_to_reviews.json", "w") as f:
    json.dump(
        {
            "metadata": {
                "threshold_used": EXPORT_THRESHOLD,
                "num_topics": lda_model.num_topics,
                "total_documents": len(output_data),
            },
            "documents": output_data,
        },
        f,
        indent=2,
    )

# %% [markdown]
# ## Save Model
#

# %%
if SAVE_MODEL:
    os.makedirs("lda_output", exist_ok=True)

    lda_model.save("lda_output/lda_model.gensim")
    id2word.save("lda_output/lda_dictionary.gensim")
    with open("lda_output/lda_corpus.pkl", "wb") as f:
        pickle.dump(corpus, f)

    with open("lda_output/topics.txt", "w") as f:
        for topic in lda_model.print_topics():
            f.write(f"{topic}\n")

    print("Done!")