Restructure

2026-06-22 23:23:07 +02:00 · 2025-10-20 23:06:52 +02:00
parent 995857ae54
commit c17e5bcc22
54 changed files with 19217 additions and 324966 deletions
@@ -0,0 +1,386 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.18.0
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Topic Detection: Bali Tourist Reviews
+#
+
+# %% [markdown]
+# ## Preparation
+#
+# ### Dependency Loading
+#
+
+# %%
+from gensim.models import CoherenceModel
+from gensim.models import LdaModel
+from gensim.models.phrases import Phraser, Phrases
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from pprint import pprint
+import altair as alt
+import gensim.corpora as corpora
+import json
+import multiprocessing
+import nltk
+import numpy as np
+import os
+import pandas as pd
+import pickle
+import pyLDAvis
+import pyLDAvis.gensim_models as gensimvis
+import re
+import spacy
+import umap
+
+nlp = spacy.load("en_core_web_sm")
+
+try:
+    multiprocessing.set_start_method("spawn")
+except RuntimeError:
+    pass
+
+nltk.download("stopwords")
+nltk.download("punkt")
+nltk.download("wordnet")
+
+print("OK")
+
+# %% [markdown]
+# ### Parameters and Tracking
+#
+
+# %%
+# Execution switches for the notebook.
+# RUN_BENCHMARK: when True, models are trained for several topic counts and
+# their coherence is printed; when False, a single 3-topic model is trained.
+RUN_BENCHMARK = False
+# SAVE_MODEL: when True, the final cell writes the model, dictionary, corpus
+# and topic list into lda_output/.
+SAVE_MODEL = True
+# PROCESS_DATA: when True, the reviews are preprocessed again and
+# processed_texts.pkl is rewritten; when False, that pickle is loaded instead.
+PROCESS_DATA = False
+
+# %% [markdown]
+# ### Data Loading & Preprocessing
+#
+
+# %%
+reviews = (
+    pd.read_csv("data.tab", sep="\t")
+    .review.dropna()
+    .to_list()  # .sample(10_000, random_state=42)
+)
+print(f"Loaded {len(reviews)} reviews.")
+
+# %%
+# List of NE in Bali for NER enhancement
+with open("bali_ner.json", "r") as f:
+    bali_places = json.load(f)
+bali_places_set = set(bali_places)
+
+# Stop word definition
+extra_stopwords = ["bali", "idr", "usd"]
+stop_words = set(stopwords.words("english"))
+with open("stopwords-en.json", "r") as f:
+    extra_stopwords.extend(json.load(f))
+
+# Custom replacements
+rep = {
+    r"\\n": " ",
+    r"\n": " ",
+    r'\\"': "",
+    r'"': "",
+    "mongkey": "monkey",
+    "monky": "monkey",
+    "verry": "very",
+}
+rep = dict((re.escape(k), v) for k, v in rep.items())
+pattern = re.compile("|".join(rep.keys()))
+
+lemmatizer = WordNetLemmatizer()
+
+
+def preprocess(text):
+    # Step 1: Apply custom replacements (typos, special cases)
+    text = text.lower()
+    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
+
+    # Step 2: Clean text
+    text = re.sub(r"\d+", " ", text)
+    text = re.sub(r"\W+", " ", text)
+
+    doc = nlp(text)
+
+    # Step 3: POS tagging and filtering
+    filtered_tokens = [
+        token.text
+        for token in doc
+        if token.pos_ in {"NOUN", "PROPN"}
+        or token.ent_type_ in {"GPE", "LOC", "FAC"}
+        or token.text in bali_places_set
+    ]
+
+    # Step 4: Lemmatization and stopword removal
+    lemmatized_tokens = [
+        lemmatizer.lemmatize(w)
+        for w in filtered_tokens
+        if w not in stop_words and w not in extra_stopwords and len(w) > 2
+    ]
+
+    return lemmatized_tokens
+
+
+# %%
+if PROCESS_DATA:
+    print("Processing sentences...")
+    processed_reviews = [preprocess(review) for review in reviews]
+
+    with open("processed_texts.pkl", "wb") as f:
+        pickle.dump(processed_reviews, f)
+else:
+    with open("processed_texts.pkl", "rb") as f:
+        processed_reviews = pickle.load(f)
+
+print(processed_reviews[:1])
+
+# %% [markdown]
+# ### n-gram Creation
+#
+
+# %%
+# Learn bigram collocations from the tokenized reviews; min_count and
+# threshold control how frequent / strongly associated a pair must be.
+bigram = Phrases(processed_reviews, min_count=5, threshold=10)
+# Wrap the trained phrase model in a Phraser for applying it to documents.
+bigram_mod = Phraser(bigram)
+# Apply the phrase model to every review; detected pairs become a single
+# token (the saved topics contain e.g. "kecak_dance").
+texts = [bigram_mod[doc] for doc in processed_reviews]
+
+# %% [markdown]
+# ## Model Creation
+#
+
+# %% [markdown]
+# ### Word Mapping & Corpus
+#
+
+# %%
+# Build the token <-> integer id mapping from the bigram-merged documents.
+id2word = corpora.Dictionary(texts)
+# Prune the vocabulary: drop tokens present in fewer than 5 documents or in
+# more than 50% of all documents.
+id2word.filter_extremes(no_below=5, no_above=0.5)
+# Bag-of-words corpus: one doc2bow() result per review, in review order.
+corpus = [id2word.doc2bow(text) for text in texts]
+
+# %% [markdown]
+# ### LDA Model Creation
+#
+
+# %%
+if not RUN_BENCHMARK:
+    lda_model = LdaModel(
+        corpus=corpus,
+        id2word=id2word,
+        num_topics=3,
+        random_state=42,
+        update_every=1,
+        chunksize=100,
+        passes=10,
+        alpha="auto",
+        per_word_topics=True,
+    )
+
+# %%
+if RUN_BENCHMARK:
+    for num_topics in [3, 4, 5]:
+        print(f"Training LDA model with {num_topics} topics...")
+        lda_model = LdaModel(
+            corpus=corpus,
+            id2word=id2word,
+            num_topics=num_topics,
+            random_state=42,
+            update_every=1,
+            chunksize=100,
+            passes=10,
+            alpha="auto",
+            per_word_topics=True,
+        )
+
+        for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
+            coherence_model_lda = CoherenceModel(
+                model=lda_model,
+                texts=texts,
+                dictionary=id2word,
+                coherence=measurement,
+            )
+            coherence_lda = coherence_model_lda.get_coherence()
+            print(f"Coherence ({measurement}): {coherence_lda:.4f}")
+
+        vis = gensimvis.prepare(lda_model, corpus, id2word)
+        pyLDAvis.save_html(vis, f"./lda_output/lda_vis_{num_topics}_topics.html")
+        print(f"Visualization saved to lda_vis_{num_topics}_topics.html")
+
+# %% [markdown]
+# ## Results
+#
+# ### Topics
+#
+
+# %%
+pprint(lda_model.print_topics())
+
+# %% [markdown]
+# ### Topic Coherence
+#
+
+# %%
+for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
+    coherence_model_lda = CoherenceModel(
+        model=lda_model,
+        texts=texts,
+        dictionary=id2word,
+        coherence=measurement,
+    )
+    coherence_lda = coherence_model_lda.get_coherence()
+    print(f"Coherence ({measurement}): {coherence_lda:.4f}")
+
+# %% [markdown]
+# ### Perplexity
+#
+
+# %%
+log_perplexity = lda_model.log_perplexity(corpus)
+perplexity = np.exp2(-log_perplexity)
+
+print(f"Perplexity: {perplexity:.4f}")
+
+# %% [markdown]
+# ### Topic Visualization
+#
+
+# %%
+# Switch pyLDAvis to inline notebook rendering.
+pyLDAvis.enable_notebook()
+# Prepare the visualization data from the model, the corpus and the dictionary.
+lda_vis = gensimvis.prepare(lda_model, corpus, id2word)
+# Kept as the cell's last expression so the notebook renders the result.
+pyLDAvis.display(lda_vis)
+
+# %%
+VISUALIZATION_THRESHOLD = 0.35
+
+doc_topic_lda = [
+    lda_model.get_document_topics(doc, minimum_probability=0) for doc in corpus
+]
+doc_topic_lda = np.array([[prob for (_, prob) in doc] for doc in doc_topic_lda])
+
+above_threshold_mask = np.any(doc_topic_lda >= VISUALIZATION_THRESHOLD, axis=1)
+
+filtered_doc_topic = doc_topic_lda[above_threshold_mask]
+
+# UMAP dimensionality reduction
+umap_model = umap.UMAP(n_components=2, metric="hellinger")
+lda_2d = umap_model.fit_transform(filtered_doc_topic)
+
+# Assign colors by dominant topic
+dominant_topics = np.argmax(filtered_doc_topic, axis=1)
+
+alt_df = pd.DataFrame(
+    {
+        "x": lda_2d[:, 0],
+        "y": lda_2d[:, 1],
+        "topic": dominant_topics.astype(str),
+        "text": [reviews[i] for i in np.where(above_threshold_mask)[0]],
+        "prob": np.max(filtered_doc_topic, axis=1),
+    }
+)
+
+alt.data_transformers.disable_max_rows()
+chart = (
+    alt.Chart(alt_df)
+    .mark_circle(size=60)
+    .encode(
+        x="x:Q",
+        y="y:Q",
+        color="topic:N",
+        tooltip=[
+            alt.Tooltip("topic", title="Topic"),
+            alt.Tooltip("prob:Q", title="Probability", format=".2f"),
+            alt.Tooltip("text", title="Document Text"),
+        ],
+    )
+    .properties(
+        width=800,
+        height=600,
+        title=f"Interactive LDA Visualization (Threshold ≥ {VISUALIZATION_THRESHOLD})",
+    )
+    .interactive()
+)
+
+chart
+
+# %% [markdown]
+# ### Topic assignment
+#
+
+# %%
+import json
+
+EXPORT_THRESHOLD = 0.35
+
+# Prepare data for JSON export
+output_data = []
+for doc_idx, doc_probs in enumerate(doc_topic_lda):
+    # Get topics above threshold for this document
+    significant_topics = [
+        {"topic_id": int(topic_id), "probability": float(prob)}
+        for topic_id, prob in enumerate(doc_probs)
+        if prob >= EXPORT_THRESHOLD
+    ]
+
+    if significant_topics:  # Only include documents with significant topics
+        output_data.append(
+            {
+                "document_id": int(doc_idx),
+                "original_text": reviews[doc_idx],
+                "topics": [
+                    {
+                        "topic_id": t["topic_id"],
+                        "probability": round(t["probability"], 2),
+                    }
+                    for t in significant_topics
+                ],
+                "dominant_topic": int(np.argmax(doc_probs)),
+                "dominant_probability": round(float(np.max(doc_probs)), 2),
+            }
+        )
+
+# Export to JSON
+with open("lda_output/topic_to_reviews.json", "w") as f:
+    json.dump(
+        {
+            "metadata": {
+                "threshold_used": EXPORT_THRESHOLD,
+                "num_topics": lda_model.num_topics,
+                "total_documents": len(output_data),
+            },
+            "documents": output_data,
+        },
+        f,
+        indent=2,
+    )
+
+# %% [markdown]
+# ## Save Model
+#
+
+# %%
+if SAVE_MODEL:
+    os.makedirs("lda_output", exist_ok=True)
+
+    lda_model.save("lda_output/lda_model.gensim")
+    id2word.save("lda_output/lda_dictionary.gensim")
+    with open("lda_output/lda_corpus.pkl", "wb") as f:
+        pickle.dump(corpus, f)
+
+    with open("lda_output/topics.txt", "w") as f:
+        for topic in lda_model.print_topics():
+            f.write(f"{topic}\n")
+
+    print("Done!")
@@ -0,0 +1,3 @@
+(0, '0.191*"temple" + 0.102*"view" + 0.079*"sunset" + 0.061*"cliff" + 0.041*"uluwatu" + 0.031*"dance" + 0.030*"kecak_dance" + 0.027*"tourist" + 0.015*"hour" + 0.013*"sun"')
+(1, '0.052*"sea" + 0.041*"ocean" + 0.038*"guide" + 0.036*"bit" + 0.033*"water" + 0.031*"location" + 0.027*"beach" + 0.025*"wave" + 0.021*"day" + 0.014*"rock"')
+(2, '0.174*"monkey" + 0.046*"time" + 0.030*"people" + 0.028*"lot" + 0.026*"visit" + 0.022*"glass" + 0.016*"sunglass" + 0.016*"photo" + 0.015*"trip" + 0.014*"day"')