{ "cells": [ { "cell_type": "markdown", "id": "fb0c4489", "metadata": {}, "source": [ "# Topic Detection: Bali Tourist Reviews\n" ] }, { "cell_type": "markdown", "id": "0f2de0b8", "metadata": {}, "source": [ "## Preparation\n", "\n", "### Dependency Loading\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "04155951", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/spacy/cli/_util.py:23: DeprecationWarning: Importing 'parser.split_arg_string' is deprecated, it will only be available in 'shell_completion' in Click 9.0.\n", " from click.parser import split_arg_string\n", "/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/weasel/util/config.py:8: DeprecationWarning: Importing 'parser.split_arg_string' is deprecated, it will only be available in 'shell_completion' in Click 9.0.\n", " from click.parser import split_arg_string\n", "/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "OK\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /home/marvin/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt to /home/marvin/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /home/marvin/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] } ], "source": [ "from gensim.models import CoherenceModel\n", "from gensim.models import LdaModel\n", "from gensim.models.phrases import Phraser, Phrases\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from pprint import pprint\n", "import altair as alt\n", "import gensim.corpora as corpora\n", "import json\n", "import multiprocessing\n", "import nltk\n", "import numpy as np\n", "import os\n", "import pandas as pd\n", "import pickle\n", "import pyLDAvis\n", "import pyLDAvis.gensim_models as gensimvis\n", "import re\n", "import spacy\n", "import umap\n", "\n", "nlp = spacy.load(\"en_core_web_sm\")\n", "\n", "try:\n", " multiprocessing.set_start_method(\"spawn\")\n", "except RuntimeError:\n", " pass\n", "\n", "nltk.download(\"stopwords\")\n", "nltk.download(\"punkt\")\n", "nltk.download(\"wordnet\")\n", "\n", "print(\"OK\")" ] }, { "cell_type": "markdown", "id": "f5a5f594", "metadata": {}, "source": [ "### Parameters and Tracking\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "0e093220", "metadata": {}, "outputs": [], "source": [ "RUN_BENCHMARK = False\n", "SAVE_MODEL = True\n", "PROCESS_DATA = False" ] }, { "cell_type": "markdown", "id": "aca75110", "metadata": {}, "source": [ "### Data Loading & Preprocessing\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "68da67a9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 56446 reviews.\n" ] } ], "source": [ "reviews = (\n", " pd.read_csv(\"data.tab\", sep=\"\\t\")\n", " .review.dropna()\n", " .to_list() # .sample(10_000, random_state=42)\n", ")\n", "print(f\"Loaded {len(reviews)} reviews.\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "fe09a86e", 
"metadata": {}, "outputs": [], "source": [ "# List of NE in Bali for NER enhancement\n", "with open(\"bali_ner.json\", \"r\") as f:\n", " bali_places = json.load(f)\n", "bali_places_set = set(bali_places)\n", "\n", "# Stop word definition\n", "extra_stopwords = [\"bali\", \"idr\", \"usd\"]\n", "stop_words = set(stopwords.words(\"english\"))\n", "with open(\"stopwords-en.json\", \"r\") as f:\n", " extra_stopwords.extend(json.load(f))\n", "\n", "# Custom replacements\n", "rep = {\n", " r\"\\\\n\": \" \",\n", " r\"\\n\": \" \",\n", " r'\\\\\"': \"\",\n", " r'\"': \"\",\n", " \"mongkey\": \"monkey\",\n", " \"monky\": \"monkey\",\n", " \"verry\": \"very\",\n", "}\n", "rep = dict((re.escape(k), v) for k, v in rep.items())\n", "pattern = re.compile(\"|\".join(rep.keys()))\n", "\n", "lemmatizer = WordNetLemmatizer()\n", "\n", "\n", "def preprocess(text):\n", " # Step 1: Apply custom replacements (typos, special cases)\n", " text = text.lower()\n", " text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)\n", "\n", " # Step 2: Clean text\n", " text = re.sub(r\"\\d+\", \" \", text)\n", " text = re.sub(r\"\\W+\", \" \", text)\n", "\n", " doc = nlp(text)\n", "\n", " # Step 3: POS tagging and filtering\n", " filtered_tokens = [\n", " token.text\n", " for token in doc\n", " if token.pos_ in {\"NOUN\", \"PROPN\"}\n", " or token.ent_type_ in {\"GPE\", \"LOC\", \"FAC\"}\n", " or token.text in bali_places_set\n", " ]\n", "\n", " # Step 4: Lemmatization and stopword removal\n", " lemmatized_tokens = [\n", " lemmatizer.lemmatize(w)\n", " for w in filtered_tokens\n", " if w not in stop_words and w not in extra_stopwords and len(w) > 2\n", " ]\n", "\n", " return lemmatized_tokens" ] }, { "cell_type": "code", "execution_count": 5, "id": "90cc3e61", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['experience', 'gita', 'host', 'knowledge', 'enthusiasm', 'driver', 'road', 'experience', 'orangutan', 'bucket', 'list', 'item', 'ticket', 'penny', 'experience']]\n" ] } ], "source": [ "if PROCESS_DATA:\n", " print(\"Processing sentences...\")\n", " processed_reviews = [preprocess(review) for review in reviews]\n", "\n", " with open(\"processed_texts.pkl\", \"wb\") as f:\n", " pickle.dump(processed_reviews, f)\n", "else:\n", " with open(\"processed_texts.pkl\", \"rb\") as f:\n", " processed_reviews = pickle.load(f)\n", "\n", "print(processed_reviews[:1])" ] }, { "cell_type": "markdown", "id": "db9cef4d", "metadata": {}, "source": [ "### n-gram Creation\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "b218e6a3", "metadata": {}, "outputs": [], "source": [ "bigram = Phrases(processed_reviews, min_count=5, threshold=10)\n", "bigram_mod = Phraser(bigram)\n", "texts = [bigram_mod[doc] for doc in processed_reviews]" ] }, { "cell_type": "markdown", "id": "baf1dab2", "metadata": {}, "source": [ "## Model Creation\n" ] }, { "cell_type": "markdown", "id": "f0ef059c", "metadata": {}, "source": [ "### Word Mapping & Corpus\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "575113fe", "metadata": {}, "outputs": [], "source": [ "id2word = corpora.Dictionary(texts)\n", "id2word.filter_extremes(no_below=5, no_above=0.5)\n", "corpus = [id2word.doc2bow(text) for text in texts]" ] }, { "cell_type": "markdown", "id": "d8ee7a92", "metadata": {}, "source": [ "### LDA Model Creation\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "9d99c286", "metadata": {}, "outputs": [], "source": [ "if not RUN_BENCHMARK:\n", " lda_model = LdaModel(\n", " corpus=corpus,\n", " 
{ "cell_type": "markdown", "id": "d8ee7a92", "metadata": {}, "source": [ "### LDA Model Creation\n" ] },
{ "cell_type": "code", "execution_count": 8, "id": "9d99c286", "metadata": {}, "outputs": [], "source": [ "if not RUN_BENCHMARK:\n", "    lda_model = LdaModel(\n", "        corpus=corpus,\n", "        id2word=id2word,\n", "        num_topics=3,\n", "        random_state=42,\n", "        update_every=1,\n", "        chunksize=100,\n", "        passes=10,\n", "        alpha=\"auto\",\n", "        per_word_topics=True,\n", "    )" ] },
{ "cell_type": "code", "execution_count": 9, "id": "ba83c339", "metadata": {}, "outputs": [], "source": [ "if RUN_BENCHMARK:\n", "    # Ensure the output directory exists before pyLDAvis.save_html() writes into it\n", "    os.makedirs(\"lda_output\", exist_ok=True)\n", "    for num_topics in [3, 4, 5]:\n", "        print(f\"Training LDA model with {num_topics} topics...\")\n", "        lda_model = LdaModel(\n", "            corpus=corpus,\n", "            id2word=id2word,\n", "            num_topics=num_topics,\n", "            random_state=42,\n", "            update_every=1,\n", "            chunksize=100,\n", "            passes=10,\n", "            alpha=\"auto\",\n", "            per_word_topics=True,\n", "        )\n", "\n", "        for measurement in [\"c_v\", \"u_mass\", \"c_uci\", \"c_npmi\"]:\n", "            coherence_model_lda = CoherenceModel(\n", "                model=lda_model,\n", "                texts=texts,\n", "                dictionary=id2word,\n", "                coherence=measurement,\n", "            )\n", "            coherence_lda = coherence_model_lda.get_coherence()\n", "            print(f\"Coherence ({measurement}): {coherence_lda:.4f}\")\n", "\n", "        vis = gensimvis.prepare(lda_model, corpus, id2word)\n", "        pyLDAvis.save_html(vis, f\"./lda_output/lda_vis_{num_topics}_topics.html\")\n", "        print(f\"Visualization saved to lda_vis_{num_topics}_topics.html\")" ] },
{ "cell_type": "markdown", "id": "1034fedf", "metadata": {}, "source": [ "## Results\n", "\n", "### Topics\n" ] },
{ "cell_type": "code", "execution_count": 10, "id": "4a3ddb5b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0,\n", "  '0.191*\"temple\" + 0.102*\"view\" + 0.079*\"sunset\" + 0.061*\"cliff\" + '\n", "  '0.041*\"uluwatu\" + 0.031*\"dance\" + 0.030*\"kecak_dance\" + 0.027*\"tourist\" + '\n", "  '0.015*\"hour\" + 0.013*\"sun\"'),\n", " (1,\n", "  '0.052*\"sea\" + 0.041*\"ocean\" + 0.038*\"guide\" + 0.036*\"bit\" + 0.033*\"water\" + '\n", "  '0.031*\"location\" + 0.027*\"beach\" + 0.025*\"wave\" + 0.021*\"day\" + '\n", "  '0.014*\"rock\"'),\n", " (2,\n", "  '0.174*\"monkey\" + 0.046*\"time\" + 0.030*\"people\" + 0.028*\"lot\" + '\n", "  '0.026*\"visit\" + 0.022*\"glass\" + 0.016*\"sunglass\" + 0.016*\"photo\" + '\n", "  '0.015*\"trip\" + 0.014*\"day\"')]\n" ] } ], "source": [ "pprint(lda_model.print_topics())" ] },
{ "cell_type": "markdown", "id": "ed096f39", "metadata": {}, "source": [ "### Topic Coherence\n" ] },
{ "cell_type": "code", "execution_count": 11, "id": "3c74d397", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Coherence (c_v): 0.5478\n", "Coherence (u_mass): -2.9861\n", "Coherence (c_uci): 0.0768\n", "Coherence (c_npmi): 0.0234\n" ] } ], "source": [ "for measurement in [\"c_v\", \"u_mass\", \"c_uci\", \"c_npmi\"]:\n", "    coherence_model_lda = CoherenceModel(\n", "        model=lda_model,\n", "        texts=texts,\n", "        dictionary=id2word,\n", "        coherence=measurement,\n", "    )\n", "    coherence_lda = coherence_model_lda.get_coherence()\n", "    print(f\"Coherence ({measurement}): {coherence_lda:.4f}\")" ] },
{ "cell_type": "markdown", "id": "a4eba9ff", "metadata": {}, "source": [ "### Perplexity\n" ] },
{ "cell_type": "code", "execution_count": 12, "id": "ec08fc9b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Perplexity: 115.7554\n" ] } ], "source": [ "# gensim's log_perplexity() returns a per-word likelihood bound in log base 2,\n", "# so perplexity = 2 ** (-bound)\n", "log_perplexity = lda_model.log_perplexity(corpus)\n", "perplexity = np.exp2(-log_perplexity)\n", "\n", "print(f\"Perplexity: {perplexity:.4f}\")" ] },
{ "cell_type": "markdown", "id": "43b559ba", "metadata": {}, "source": [ "### Topic Visualization\n" ] },
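{ "cell_type": "markdown", "id": "ad0c0003", "metadata": {}, "source": [ "Before the interactive view, the next cell gives a quick textual preview by counting documents per dominant topic. This is an illustrative sketch, not part of the original pipeline; it assumes only `lda_model` and `corpus` from above, and relies on `get_document_topics(..., minimum_probability=0)` returning a probability for every topic.\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "ad0c0004", "metadata": {}, "outputs": [], "source": [ "# Illustrative sketch: how many documents fall under each dominant topic?\n", "dominant = [\n", "    max(lda_model.get_document_topics(bow, minimum_probability=0), key=lambda t: t[1])[0]\n", "    for bow in corpus\n", "]\n", "print(pd.Series(dominant).value_counts().sort_index())" ] },
{ "cell_type": "code", "execution_count": 13, "id": "2c6e2693",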
"metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyLDAvis.enable_notebook()\n", "lda_vis = gensimvis.prepare(lda_model, corpus, id2word)\n", "pyLDAvis.display(lda_vis)" ] }, { "cell_type": "code", "execution_count": 14, "id": "95cfac6e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", " warnings.warn(\n", "/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/numba/np/ufunc/dufunc.py:288: RuntimeWarning: invalid value encountered in correct_alternative_hellinger\n", " return super().__call__(*args, **kws)\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "VISUALIZATION_THRESHOLD = 0.35\n", "\n", "doc_topic_lda = [\n", " lda_model.get_document_topics(doc, minimum_probability=0) for doc in corpus\n", "]\n", "doc_topic_lda = np.array([[prob for (_, prob) in doc] for doc in doc_topic_lda])\n", "\n", "above_threshold_mask = np.any(doc_topic_lda >= VISUALIZATION_THRESHOLD, axis=1)\n", "\n", "filtered_doc_topic = doc_topic_lda[above_threshold_mask]\n", "\n", "# UMAP dimensionality reduction\n", "umap_model = umap.UMAP(n_components=2, metric=\"hellinger\")\n", "lda_2d = umap_model.fit_transform(filtered_doc_topic)\n", "\n", "# Assign colors by dominant topic\n", "dominant_topics = np.argmax(filtered_doc_topic, axis=1)\n", "\n", "alt_df = pd.DataFrame(\n", " {\n", " \"x\": lda_2d[:, 0],\n", " \"y\": lda_2d[:, 1],\n", " \"topic\": dominant_topics.astype(str),\n", " \"text\": [reviews[i] for i in np.where(above_threshold_mask)[0]],\n", " \"prob\": np.max(filtered_doc_topic, axis=1),\n", " }\n", ")\n", "\n", "alt.data_transformers.disable_max_rows()\n", "chart = (\n", " alt.Chart(alt_df)\n", " .mark_circle(size=60)\n", " .encode(\n", " x=\"x:Q\",\n", " y=\"y:Q\",\n", " color=\"topic:N\",\n", " tooltip=[\n", " alt.Tooltip(\"topic\", title=\"Topic\"),\n", " alt.Tooltip(\"prob:Q\", title=\"Probability\", format=\".2f\"),\n", " alt.Tooltip(\"text\", title=\"Document Text\"),\n", " ],\n", " )\n", " .properties(\n", " width=800,\n", " height=600,\n", " title=f\"Interactive LDA Visualization (Threshold ≥ {VISUALIZATION_THRESHOLD})\",\n", " )\n", " .interactive()\n", ")\n", "\n", "chart" ] }, { "cell_type": "markdown", "id": "96d1e85d", "metadata": {}, "source": [ "### Topic assignment\n" ] }, { "cell_type": "code", "execution_count": 15, "id": "19b4ff3a", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "EXPORT_THRESHOLD = 0.35\n", "\n", "# Prepare data for JSON export\n", "output_data = []\n", "for doc_idx, doc_probs in enumerate(doc_topic_lda):\n", " # Get topics above threshold for this document\n", " significant_topics = [\n", " {\"topic_id\": int(topic_id), \"probability\": float(prob)}\n", " for topic_id, prob in enumerate(doc_probs)\n", " if prob >= EXPORT_THRESHOLD\n", " ]\n", "\n", " if significant_topics: # Only include documents with significant topics\n", " output_data.append(\n", " {\n", " \"document_id\": int(doc_idx),\n", " \"original_text\": reviews[doc_idx],\n", " \"topics\": [\n", " {\n", " \"topic_id\": t[\"topic_id\"],\n", " \"probability\": round(t[\"probability\"], 2),\n", " }\n", " for t in significant_topics\n", " ],\n", " \"dominant_topic\": int(np.argmax(doc_probs)),\n", " \"dominant_probability\": round(float(np.max(doc_probs)), 2),\n", " }\n", " )\n", "\n", "# Export to JSON\n", "with open(\"lda_output/topic_to_reviews.json\", \"w\") as f:\n", " json.dump(\n", " {\n", " \"metadata\": {\n", " \"threshold_used\": EXPORT_THRESHOLD,\n", " \"num_topics\": lda_model.num_topics,\n", " \"total_documents\": len(output_data),\n", " },\n", " \"documents\": output_data,\n", " },\n", " f,\n", " indent=2,\n", " )" ] }, { "cell_type": "markdown", "id": "c4631ced", "metadata": {}, "source": [ "## Save Model\n" ] }, { "cell_type": "code", "execution_count": 16, "id": "02dc81ba", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done!\n" ] } ], "source": [ "if SAVE_MODEL:\n", " os.makedirs(\"lda_output\", exist_ok=True)\n", "\n", " 
lda_model.save(\"lda_output/lda_model.gensim\")\n", " id2word.save(\"lda_output/lda_dictionary.gensim\")\n", " with open(\"lda_output/lda_corpus.pkl\", \"wb\") as f:\n", " pickle.dump(corpus, f)\n", "\n", " with open(\"lda_output/topics.txt\", \"w\") as f:\n", " for topic in lda_model.print_topics():\n", " f.write(f\"{topic}\\n\")\n", "\n", " print(\"Done!\")" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }