{ "cells": [ { "cell_type": "markdown", "id": "fb0c4489", "metadata": {}, "source": [ "# Topic Detection: Bali Tourist Reviews\n" ] }, { "cell_type": "markdown", "id": "0f2de0b8", "metadata": {}, "source": [ "## Preparation\n", "\n", "### Dependency Loading\n" ] }, { "cell_type": "code", "execution_count": 149, "id": "04155951", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OK\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /home/marvin/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt to /home/marvin/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /home/marvin/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] } ], "source": [ "from gensim.models import CoherenceModel\n", "from gensim.models import LdaModel\n", "from gensim.models.phrases import Phraser, Phrases\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from pprint import pprint\n", "import altair as alt\n", "import gensim.corpora as corpora\n", "import json\n", "import multiprocessing\n", "import nltk\n", "import numpy as np\n", "import os\n", "import pandas as pd\n", "import pickle\n", "import pyLDAvis\n", "import pyLDAvis.gensim_models as gensimvis\n", "import re\n", "import spacy\n", "import umap\n", "\n", "nlp = spacy.load(\"en_core_web_sm\")\n", "\n", "try:\n", " multiprocessing.set_start_method(\"spawn\")\n", "except RuntimeError:\n", " pass\n", "\n", "nltk.download(\"stopwords\")\n", "nltk.download(\"punkt\")\n", "nltk.download(\"wordnet\")\n", "\n", "print(\"OK\")" ] }, { "cell_type": "markdown", "id": "f5a5f594", "metadata": {}, "source": [ "### Parameters and Tracking\n" ] }, { "cell_type": "code", "execution_count": 110, "id": "0e093220", "metadata": {}, "outputs": [], "source": [ "RUN_BENCHMARK = False\n", "SAVE_MODEL = True\n", "PROCESS_DATA = False" ] }, { "cell_type": "markdown", "id": "aca75110", "metadata": {}, "source": [ "### Data Loading & Preprocessing\n" ] }, { "cell_type": "code", "execution_count": 130, "id": "68da67a9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 56446 reviews.\n" ] } ], "source": [ "reviews = (\n", " pd.read_csv(\"data.tab\", sep=\"\\t\")\n", " .review.dropna()\n", " .to_list() # .sample(10_000, random_state=42)\n", ")\n", "print(f\"Loaded {len(reviews)} reviews.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "fe09a86e", "metadata": {}, "outputs": [], "source": [ "# List of NE in Bali for NER enhancement\n", "with open(\"bali_ner.json\", \"r\") as f:\n", " bali_places = json.load(f)\n", "bali_places_set = set(bali_places)\n", "\n", "# Stop word definition\n", "extra_stopwords = [\"bali\", \"idr\", \"usd\"]\n", "stop_words = set(stopwords.words(\"english\"))\n", "with open(\"stopwords-en.json\", \"r\") as f:\n", " extra_stopwords.extend(json.load(f))\n", "\n", "# Custom replacements\n", "rep = {\n", " r\"\\\\n\": \" \",\n", " r\"\\n\": \" \",\n", " r'\\\\\"': \"\",\n", " r'\"': \"\",\n", " \"mongkey\": \"monkey\",\n", " \"monky\": \"monkey\",\n", " \"verry\": \"very\",\n", "}\n", "rep = dict((re.escape(k), v) for k, v in rep.items())\n", "pattern = re.compile(\"|\".join(rep.keys()))\n", "\n", "lemmatizer = WordNetLemmatizer()\n", "\n", "\n", "def preprocess(text):\n", " # Step 1: Apply custom replacements (typos, special cases)\n", " text 
{ "cell_type": "code", "execution_count": 114, "id": "90cc3e61", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['experience', 'gita', 'host', 'knowledge', 'enthusiasm', 'driver', 'road', 'experience', 'orangutan', 'bucket', 'list', 'item', 'ticket', 'penny', 'experience']]\n" ] } ], "source": [ "if PROCESS_DATA:\n", "    print(\"Processing reviews...\")\n", "    processed_reviews = [preprocess(review) for review in reviews]\n", "\n", "    with open(\"processed_texts.pkl\", \"wb\") as f:\n", "        pickle.dump(processed_reviews, f)\n", "else:\n", "    # Reuse the cached result of an earlier preprocessing run\n", "    with open(\"processed_texts.pkl\", \"rb\") as f:\n", "        processed_reviews = pickle.load(f)\n", "\n", "print(processed_reviews[:1])" ] },
{ "cell_type": "markdown", "id": "db9cef4d", "metadata": {}, "source": [ "### n-gram Creation\n" ] },
{ "cell_type": "code", "execution_count": 115, "id": "b218e6a3", "metadata": {}, "outputs": [], "source": [ "# Detect collocations seen at least 5 times and merge them into single tokens\n", "# (e.g. \"kecak\" + \"dance\" -> \"kecak_dance\")\n", "bigram = Phrases(processed_reviews, min_count=5, threshold=10)\n", "bigram_mod = Phraser(bigram)\n", "texts = [bigram_mod[doc] for doc in processed_reviews]" ] },
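{ "cell_type": "markdown", "id": "7e2f50aa", "metadata": {}, "source": [ "A minimal sketch for inspecting what the phraser does, assuming the objects above; the token list passed in is hand-written for illustration:\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "91c04d3e", "metadata": {}, "outputs": [], "source": [ "# Apply the trained phraser to an illustrative token list; collocations learned\n", "# from the corpus are merged with an underscore.\n", "print(bigram_mod[[\"kecak\", \"dance\", \"monkey\", \"forest\", \"sunset\"]])\n", "\n", "# Count merged bigram tokens (they contain an underscore) across the corpus.\n", "print(sum(\"_\" in token for doc in texts for token in doc))" ] },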
{ "cell_type": "markdown", "id": "baf1dab2", "metadata": {}, "source": [ "## Model Creation\n" ] },
{ "cell_type": "markdown", "id": "f0ef059c", "metadata": {}, "source": [ "### Word Mapping & Corpus\n" ] },
{ "cell_type": "code", "execution_count": 116, "id": "575113fe", "metadata": {}, "outputs": [], "source": [ "id2word = corpora.Dictionary(texts)\n", "# Drop tokens that occur in fewer than 5 reviews or in more than half of them\n", "id2word.filter_extremes(no_below=5, no_above=0.5)\n", "corpus = [id2word.doc2bow(text) for text in texts]" ] },
{ "cell_type": "markdown", "id": "d8ee7a92", "metadata": {}, "source": [ "### LDA Model Creation\n" ] },
{ "cell_type": "code", "execution_count": 117, "id": "9d99c286", "metadata": {}, "outputs": [], "source": [ "if not RUN_BENCHMARK:\n", "    lda_model = LdaModel(\n", "        corpus=corpus,\n", "        id2word=id2word,\n", "        num_topics=3,\n", "        random_state=42,\n", "        update_every=1,\n", "        chunksize=100,\n", "        passes=10,\n", "        alpha=\"auto\",\n", "        per_word_topics=True,\n", "    )" ] },
{ "cell_type": "code", "execution_count": 118, "id": "ba83c339", "metadata": {}, "outputs": [], "source": [ "if RUN_BENCHMARK:\n", "    # Ensure the output directory exists before saving the visualizations\n", "    os.makedirs(\"lda_output\", exist_ok=True)\n", "    for num_topics in [3, 4, 5]:\n", "        print(f\"Training LDA model with {num_topics} topics...\")\n", "        lda_model = LdaModel(\n", "            corpus=corpus,\n", "            id2word=id2word,\n", "            num_topics=num_topics,\n", "            random_state=42,\n", "            update_every=1,\n", "            chunksize=100,\n", "            passes=10,\n", "            alpha=\"auto\",\n", "            per_word_topics=True,\n", "        )\n", "\n", "        for measurement in [\"c_v\", \"u_mass\", \"c_uci\", \"c_npmi\"]:\n", "            coherence_model_lda = CoherenceModel(\n", "                model=lda_model,\n", "                texts=texts,\n", "                dictionary=id2word,\n", "                coherence=measurement,\n", "            )\n", "            coherence_lda = coherence_model_lda.get_coherence()\n", "            print(f\"Coherence ({measurement}): {coherence_lda:.4f}\")\n", "\n", "        vis = gensimvis.prepare(lda_model, corpus, id2word)\n", "        pyLDAvis.save_html(vis, f\"./lda_output/lda_vis_{num_topics}_topics.html\")\n", "        print(f\"Visualization saved to lda_vis_{num_topics}_topics.html\")" ] },
{ "cell_type": "markdown", "id": "1034fedf", "metadata": {}, "source": [ "## Results\n", "\n", "### Topics\n" ] },
{ "cell_type": "code", "execution_count": 119, "id": "4a3ddb5b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0,\n", "  '0.191*\"temple\" + 0.102*\"view\" + 0.079*\"sunset\" + 0.061*\"cliff\" + '\n", "  '0.041*\"uluwatu\" + 0.031*\"dance\" + 0.030*\"kecak_dance\" + 0.027*\"tourist\" + '\n", "  '0.015*\"hour\" + 0.013*\"sun\"'),\n", " (1,\n", "  '0.052*\"sea\" + 0.041*\"ocean\" + 0.038*\"guide\" + 0.036*\"bit\" + 0.033*\"water\" + '\n", "  '0.031*\"location\" + 0.027*\"beach\" + 0.025*\"wave\" + 0.021*\"day\" + '\n", "  '0.014*\"rock\"'),\n", " (2,\n", "  '0.174*\"monkey\" + 0.046*\"time\" + 0.030*\"people\" + 0.028*\"lot\" + '\n", "  '0.026*\"visit\" + 0.022*\"glass\" + 0.016*\"sunglass\" + 0.016*\"photo\" + '\n", "  '0.015*\"trip\" + 0.014*\"day\"')]\n" ] } ], "source": [ "pprint(lda_model.print_topics())" ] },
{ "cell_type": "markdown", "id": "ed096f39", "metadata": {}, "source": [ "### Topic Coherence\n" ] },
{ "cell_type": "code", "execution_count": 120, "id": "3c74d397", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Coherence (c_v): 0.5478\n", "Coherence (u_mass): -2.9861\n", "Coherence (c_uci): 0.0768\n", "Coherence (c_npmi): 0.0234\n" ] } ], "source": [ "for measurement in [\"c_v\", \"u_mass\", \"c_uci\", \"c_npmi\"]:\n", "    coherence_model_lda = CoherenceModel(\n", "        model=lda_model,\n", "        texts=texts,\n", "        dictionary=id2word,\n", "        coherence=measurement,\n", "    )\n", "    coherence_lda = coherence_model_lda.get_coherence()\n", "    print(f\"Coherence ({measurement}): {coherence_lda:.4f}\")" ] },
{ "cell_type": "markdown", "id": "a4eba9ff", "metadata": {}, "source": [ "### Perplexity\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "ec08fc9b", "metadata": {}, "outputs": [], "source": [ "# gensim's log_perplexity returns a per-word likelihood bound in base 2,\n", "# so perplexity = 2 ** (-bound)\n", "log_perplexity = lda_model.log_perplexity(corpus)\n", "perplexity = np.exp2(-log_perplexity)\n", "\n", "print(f\"Perplexity: {perplexity:.4f}\")" ] },
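{ "cell_type": "markdown", "id": "5b8de277", "metadata": {}, "source": [ "### Dominant Topic per Review\n", "\n", "A minimal sketch of applying the fitted model back to the corpus, assuming `lda_model` and `corpus` from above; `get_document_topics` is gensim's standard accessor, the rest of the cell is illustrative:\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "6fa1c930", "metadata": {}, "outputs": [], "source": [ "# For each review, keep the topic with the highest posterior probability.\n", "# minimum_probability=0.0 ensures every topic is returned, even for sparse docs.\n", "dominant_topics = [\n", "    max(\n", "        lda_model.get_document_topics(bow, minimum_probability=0.0),\n", "        key=lambda pair: pair[1],\n", "    )[0]\n", "    for bow in corpus\n", "]\n", "\n", "# Rough distribution of the reviews over the topics.\n", "print(pd.Series(dominant_topics).value_counts().sort_index())" ] },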
\n", "" ], "text/plain": [ "