# --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: percent # format_version: '1.3' # jupytext_version: 1.18.0 # kernelspec: # display_name: .venv # language: python # name: python3 # --- # %% [markdown] # # Topic Detection: Bali Tourist Reviews # # %% [markdown] # ## Preparation # # ### Dependency Loading # # %% from gensim.models import CoherenceModel from gensim.models import LdaModel from gensim.models.phrases import Phraser, Phrases from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer from pprint import pprint import altair as alt import gensim.corpora as corpora import json import multiprocessing import nltk import numpy as np import os import pandas as pd import pickle import pyLDAvis import pyLDAvis.gensim_models as gensimvis import re import spacy import umap nlp = spacy.load("en_core_web_sm") try: multiprocessing.set_start_method("spawn") except RuntimeError: pass nltk.download("stopwords") nltk.download("punkt") nltk.download("wordnet") print("OK") # %% [markdown] # ### Parameters and Tracking # # %% RUN_BENCHMARK = False SAVE_MODEL = True PROCESS_DATA = False # %% [markdown] # ### Data Loading & Preprocessing # # %% reviews = ( pd.read_csv("data.tab", sep="\t") .review.dropna() .to_list() # .sample(10_000, random_state=42) ) print(f"Loaded {len(reviews)} reviews.") # %% # List of NE in Bali for NER enhancement with open("bali_ner.json", "r") as f: bali_places = json.load(f) bali_places_set = set(bali_places) # Stop word definition extra_stopwords = ["bali", "idr", "usd"] stop_words = set(stopwords.words("english")) with open("stopwords-en.json", "r") as f: extra_stopwords.extend(json.load(f)) # Custom replacements rep = { r"\\n": " ", r"\n": " ", r'\\"': "", r'"': "", "mongkey": "monkey", "monky": "monkey", "verry": "very", } rep = dict((re.escape(k), v) for k, v in rep.items()) pattern = re.compile("|".join(rep.keys())) lemmatizer = WordNetLemmatizer() def preprocess(text): # Step 1: Apply custom replacements (typos, special cases) text = text.lower() text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text) # Step 2: Clean text text = re.sub(r"\d+", " ", text) text = re.sub(r"\W+", " ", text) doc = nlp(text) # Step 3: POS tagging and filtering filtered_tokens = [ token.text for token in doc if token.pos_ in {"NOUN", "PROPN"} or token.ent_type_ in {"GPE", "LOC", "FAC"} or token.text in bali_places_set ] # Step 4: Lemmatization and stopword removal lemmatized_tokens = [ lemmatizer.lemmatize(w) for w in filtered_tokens if w not in stop_words and w not in extra_stopwords and len(w) > 2 ] return lemmatized_tokens # %% if PROCESS_DATA: print("Processing sentences...") processed_reviews = [preprocess(review) for review in reviews] with open("processed_texts.pkl", "wb") as f: pickle.dump(processed_reviews, f) else: with open("processed_texts.pkl", "rb") as f: processed_reviews = pickle.load(f) print(processed_reviews[:1]) # %% [markdown] # ### n-gram Creation # # %% bigram = Phrases(processed_reviews, min_count=5, threshold=10) bigram_mod = Phraser(bigram) texts = [bigram_mod[doc] for doc in processed_reviews] # %% [markdown] # ## Model Creation # # %% [markdown] # ### Word Mapping & Corpus # # %% id2word = corpora.Dictionary(texts) id2word.filter_extremes(no_below=5, no_above=0.5) corpus = [id2word.doc2bow(text) for text in texts] # %% [markdown] # ### LDA Model Creation # # %% if not RUN_BENCHMARK: lda_model = LdaModel( corpus=corpus, id2word=id2word, num_topics=3, random_state=42, update_every=1, chunksize=100, passes=10, alpha="auto", per_word_topics=True, ) # %% if RUN_BENCHMARK: for num_topics in [3, 4, 5]: print(f"Training LDA model with {num_topics} topics...") lda_model = LdaModel( corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42, update_every=1, chunksize=100, passes=10, alpha="auto", per_word_topics=True, ) for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]: coherence_model_lda = CoherenceModel( model=lda_model, texts=texts, dictionary=id2word, coherence=measurement, ) coherence_lda = coherence_model_lda.get_coherence() print(f"Coherence ({measurement}): {coherence_lda:.4f}") vis = gensimvis.prepare(lda_model, corpus, id2word) pyLDAvis.save_html(vis, f"./lda_output/lda_vis_{num_topics}_topics.html") print(f"Visualization saved to lda_vis_{num_topics}_topics.html") # %% [markdown] # ## Results # # ### Topics # # %% pprint(lda_model.print_topics()) # %% [markdown] # ### Topic Coherence # # %% for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]: coherence_model_lda = CoherenceModel( model=lda_model, texts=texts, dictionary=id2word, coherence=measurement, ) coherence_lda = coherence_model_lda.get_coherence() print(f"Coherence ({measurement}): {coherence_lda:.4f}") # %% [markdown] # ### Perplexity # # %% log_perplexity = lda_model.log_perplexity(corpus) perplexity = np.exp2(-log_perplexity) print(f"Perplexity: {perplexity:.4f}") # %% [markdown] # ### Topic Visualization # # %% pyLDAvis.enable_notebook() lda_vis = gensimvis.prepare(lda_model, corpus, id2word) pyLDAvis.display(lda_vis) # %% VISUALIZATION_THRESHOLD = 0.35 doc_topic_lda = [ lda_model.get_document_topics(doc, minimum_probability=0) for doc in corpus ] doc_topic_lda = np.array([[prob for (_, prob) in doc] for doc in doc_topic_lda]) above_threshold_mask = np.any(doc_topic_lda >= VISUALIZATION_THRESHOLD, axis=1) filtered_doc_topic = doc_topic_lda[above_threshold_mask] # UMAP dimensionality reduction umap_model = umap.UMAP(n_components=2, metric="hellinger") lda_2d = umap_model.fit_transform(filtered_doc_topic) # Assign colors by dominant topic dominant_topics = np.argmax(filtered_doc_topic, axis=1) alt_df = pd.DataFrame( { "x": lda_2d[:, 0], "y": lda_2d[:, 1], "topic": dominant_topics.astype(str), "text": [reviews[i] for i in np.where(above_threshold_mask)[0]], "prob": np.max(filtered_doc_topic, axis=1), } ) alt.data_transformers.disable_max_rows() chart = ( alt.Chart(alt_df) .mark_circle(size=60) .encode( x="x:Q", y="y:Q", color="topic:N", tooltip=[ alt.Tooltip("topic", title="Topic"), alt.Tooltip("prob:Q", title="Probability", format=".2f"), alt.Tooltip("text", title="Document Text"), ], ) .properties( width=800, height=600, title=f"Interactive LDA Visualization (Threshold ≥ {VISUALIZATION_THRESHOLD})", ) .interactive() ) chart # %% [markdown] # ### Topic assignment # # %% import json EXPORT_THRESHOLD = 0.35 # Prepare data for JSON export output_data = [] for doc_idx, doc_probs in enumerate(doc_topic_lda): # Get topics above threshold for this document significant_topics = [ {"topic_id": int(topic_id), "probability": float(prob)} for topic_id, prob in enumerate(doc_probs) if prob >= EXPORT_THRESHOLD ] if significant_topics: # Only include documents with significant topics output_data.append( { "document_id": int(doc_idx), "original_text": reviews[doc_idx], "topics": [ { "topic_id": t["topic_id"], "probability": round(t["probability"], 2), } for t in significant_topics ], "dominant_topic": int(np.argmax(doc_probs)), "dominant_probability": round(float(np.max(doc_probs)), 2), } ) # Export to JSON with open("lda_output/topic_to_reviews.json", "w") as f: json.dump( { "metadata": { "threshold_used": EXPORT_THRESHOLD, "num_topics": lda_model.num_topics, "total_documents": len(output_data), }, "documents": output_data, }, f, indent=2, ) # %% [markdown] # ## Save Model # # %% if SAVE_MODEL: os.makedirs("lda_output", exist_ok=True) lda_model.save("lda_output/lda_model.gensim") id2word.save("lda_output/lda_dictionary.gensim") with open("lda_output/lda_corpus.pkl", "wb") as f: pickle.dump(corpus, f) with open("lda_output/topics.txt", "w") as f: for topic in lda_model.print_topics(): f.write(f"{topic}\n") print("Done!")