mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2025-12-06 02:00:50 +01:00
1590 lines
71 KiB
Plaintext
1590 lines
71 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "817d9e5f",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Topic Detection: Bali Tourist Reviews\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "de3ee3bf",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Preparation\n",
|
|
"\n",
|
|
"### Dependency Loading\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "4bbd7aa5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"2025-06-22 14:11:09.736672: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
|
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
|
|
"E0000 00:00:1750594269.748078 846609 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
|
"E0000 00:00:1750594269.751680 846609 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
|
"W0000 00:00:1750594269.761978 846609 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"W0000 00:00:1750594269.761987 846609 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"W0000 00:00:1750594269.761989 846609 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"W0000 00:00:1750594269.761990 846609 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"2025-06-22 14:11:09.765121: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
|
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
|
|
"/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/tensorflow_hub/__init__.py:61: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n",
|
|
" from pkg_resources import parse_version\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from gensim import corpora\n",
|
|
"from gensim.models import CoherenceModel\n",
|
|
"from gensim.models.coherencemodel import CoherenceModel\n",
|
|
"from sentence_transformers import SentenceTransformer\n",
|
|
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
|
"from top2vec import Top2Vec\n",
|
|
"from tqdm.notebook import tqdm\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import pickle\n",
|
|
"import re\n",
|
|
"import spacy"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "7cbb87a1",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Parameters and Tracking\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "cff3e424",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"PROCESS_DATA = False\n",
|
|
"RECALCULATE_COHERENCE_PARTS = False\n",
|
|
"RECREATE_MODEL = True"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8b3b077a",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Data Loading & Preprocessing\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "2af8f41f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Loaded 56446 reviews\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"reviews = (\n",
|
|
" pd.read_csv(\"data.tab\", sep=\"\\t\").review.dropna().to_list()\n",
|
|
") # .sample(5_000, random_state=42)\n",
|
|
"\n",
|
|
"print(\"Loaded {} reviews\".format(len(reviews)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "59bf1eda",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"rep = {\n",
|
|
" r\"\\\\n\": \" \",\n",
|
|
" r\"\\n\": \" \",\n",
|
|
" r'\\\\\"': \"\",\n",
|
|
" r'\"': \"\",\n",
|
|
" \"mongkey\": \"monkey\",\n",
|
|
" \"monky\": \"monkey\",\n",
|
|
" \"verry\": \"very\",\n",
|
|
" \"bali\": \"\",\n",
|
|
" r\"\\s+\": \" \",\n",
|
|
"}\n",
|
|
"rep = dict((re.escape(k), v) for k, v in rep.items())\n",
|
|
"pattern = re.compile(\"|\".join(rep.keys()))\n",
|
|
"\n",
|
|
"\n",
|
|
"def preprocess(text):\n",
|
|
" text = text.strip()\n",
|
|
" text = text.lower()\n",
|
|
" text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)\n",
|
|
" return text"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "cdf23e7a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processed 56446 reviews\n",
|
|
"['an out of this world experience! gita was an amazing host, filled with knowledge and enthusiasm, and billy was the best driver, eager to help and make your road travels a special experience! not only orangutans that you get to see and interact with, and a bucket-list item was definitely ticket off! must must must do, worth every penny and then some. such a special experience!']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"if PROCESS_DATA:\n",
|
|
" print(\"Processing reviews...\")\n",
|
|
" reviews = [preprocess(review) for review in reviews]\n",
|
|
"\n",
|
|
" with open(\"processed_texts_top2vec.pkl\", \"wb\") as f:\n",
|
|
" pickle.dump(reviews, f)\n",
|
|
"else:\n",
|
|
" with open(\"processed_texts_top2vec.pkl\", \"rb\") as f:\n",
|
|
" reviews = pickle.load(f)\n",
|
|
" reviews = [\n",
|
|
" \" \".join(review) if isinstance(review, list) else review\n",
|
|
" for review in reviews\n",
|
|
" ]\n",
|
|
"\n",
|
|
"print(\"Processed {} reviews\".format(len(reviews)))\n",
|
|
"print(reviews[:1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "125f4095",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Model Creation\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "36fede91",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"2025-06-22 14:11:18,712 - top2vec - INFO - Pre-processing documents for training\n",
|
|
"/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
|
|
" warnings.warn(\n",
|
|
"2025-06-22 14:11:23,399 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model\n",
|
|
"2025-06-22 14:11:27,184 - top2vec - INFO - Creating joint document/word embedding\n",
|
|
"2025-06-22 14:11:48,273 - top2vec - INFO - Creating lower dimension embedding of documents\n",
|
|
"/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
|
|
" warnings.warn(\n",
|
|
"/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
|
|
" warn(\n",
|
|
"2025-06-22 14:12:28,395 - top2vec - INFO - Finding dense areas of documents\n",
|
|
"/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
|
|
" warnings.warn(\n",
|
|
"/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
|
|
" warnings.warn(\n",
|
|
"2025-06-22 14:12:30,401 - top2vec - INFO - Finding topics\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Number of topics found: 27\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"if RECREATE_MODEL:\n",
|
|
" hdbscan_args = {\n",
|
|
" \"min_cluster_size\": 200,\n",
|
|
" \"min_samples\": 25,\n",
|
|
" \"metric\": \"euclidean\",\n",
|
|
" \"cluster_selection_method\": \"eom\",\n",
|
|
" }\n",
|
|
" umap_args = {\n",
|
|
" \"n_neighbors\": 15,\n",
|
|
" \"n_components\": 2,\n",
|
|
" \"min_dist\": 0.01,\n",
|
|
" \"metric\": \"cosine\",\n",
|
|
" \"random_state\": 42,\n",
|
|
" \"low_memory\": True,\n",
|
|
" }\n",
|
|
"\n",
|
|
" model = Top2Vec(\n",
|
|
" reviews,\n",
|
|
" workers=8,\n",
|
|
" hdbscan_args=hdbscan_args,\n",
|
|
" umap_args=umap_args,\n",
|
|
" min_count=1,\n",
|
|
" )\n",
|
|
"\n",
|
|
" with open(\"./top2vec/model.pkl\", \"wb\") as f:\n",
|
|
" pickle.dump(model, f)\n",
|
|
"else:\n",
|
|
" with open(\"./top2vec/model.pkl\", \"rb\") as f:\n",
|
|
" model = pickle.load(f)\n",
|
|
"\n",
|
|
"print(f\"\\nNumber of topics found: {model.get_num_topics()}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "56d57fa7",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Results\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ff1b256e",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Coherence\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "863f7c36",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"BERT-based Topic Coherence: 0.4038\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"topic_words = model.get_topics()[0]\n",
|
|
"embedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
|
|
"\n",
|
|
"coherence_scores = []\n",
|
|
"for words in topic_words:\n",
|
|
" coherence_embeddings = embedding_model.encode(words)\n",
|
|
" sim_matrix = cosine_similarity(coherence_embeddings)\n",
|
|
" np.fill_diagonal(sim_matrix, 0)\n",
|
|
" mean_sim = np.mean(sim_matrix)\n",
|
|
" coherence_scores.append(mean_sim)\n",
|
|
"\n",
|
|
"overall_coherence = np.mean(coherence_scores)\n",
|
|
"\n",
|
|
"print(f\"BERT-based Topic Coherence: {overall_coherence:.4f}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "633bbb44",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"env: TOKENIZERS_PARALLELISM=false\n",
|
|
"Starting coherence evaluation...\n",
|
|
"Coherence (c_v): 0.3583\n",
|
|
"Coherence (u_mass): -11.5933\n",
|
|
"Coherence (c_uci): -4.9920\n",
|
|
"Coherence (c_npmi): -0.1464\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%env TOKENIZERS_PARALLELISM=false\n",
|
|
"num_words = 10\n",
|
|
"\n",
|
|
"if RECALCULATE_COHERENCE_PARTS:\n",
|
|
" tqdm.pandas()\n",
|
|
"\n",
|
|
" docs = model.documents\n",
|
|
" doc_topics, _, _, _ = model.get_documents_topics(doc_ids=list(range(len(docs))))\n",
|
|
"\n",
|
|
" df = pd.DataFrame({\"Document\": docs, \"ID\": range(len(docs)), \"Topic\": doc_topics})\n",
|
|
"\n",
|
|
" documents_per_topic = df.groupby([\"Topic\"], as_index=False).agg(\n",
|
|
" {\"Document\": \" \".join}\n",
|
|
" )\n",
|
|
"\n",
|
|
" nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\", \"parser\"])\n",
|
|
" nlp.max_length = 10_000_000\n",
|
|
"\n",
|
|
" def preprocess(doc):\n",
|
|
" return [\n",
|
|
" token.text.lower()\n",
|
|
" for token in nlp(doc)\n",
|
|
" if token.is_alpha and not token.is_stop\n",
|
|
" ]\n",
|
|
" \n",
|
|
" topic_words = model.get_topics()[0]\n",
|
|
" print(topic_words)\n",
|
|
"\n",
|
|
" print(\"Preprocessing topic documents...\")\n",
|
|
" tokens = df[\"Tokens\"] = df[\"Document\"].progress_apply(preprocess)\n",
|
|
"\n",
|
|
" print(\"Creating dictionary...\")\n",
|
|
" dictionary = corpora.Dictionary(tokens)\n",
|
|
" print(\"Creating corpus...\")\n",
|
|
" corpus = [dictionary.doc2bow(token_list) for token_list in tokens]\n",
|
|
" \n",
|
|
" num_topics = len(model.topic_sizes)\n",
|
|
"\n",
|
|
" with open(\"./top2vec/corpus.pkl\", \"wb\") as f:\n",
|
|
" pickle.dump(corpus, f)\n",
|
|
" with open(\"./top2vec/dictionary.pkl\", \"wb\") as f:\n",
|
|
" pickle.dump(dictionary, f)\n",
|
|
" with open(\"./top2vec/tokens.pkl\", \"wb\") as f:\n",
|
|
" pickle.dump(tokens, f)\n",
|
|
"else:\n",
|
|
" with open(\"./top2vec/corpus.pkl\", \"rb\") as f:\n",
|
|
" corpus = pickle.load(f)\n",
|
|
" with open(\"./top2vec/dictionary.pkl\", \"rb\") as f:\n",
|
|
" dictionary = pickle.load(f)\n",
|
|
" with open(\"./top2vec/tokens.pkl\", \"rb\") as f:\n",
|
|
" tokens = pickle.load(f)\n",
|
|
"\n",
|
|
"print(\"Starting coherence evaluation...\")\n",
|
|
"for measure in [\"c_v\", \"u_mass\", \"c_uci\", \"c_npmi\"]:\n",
|
|
" cm = CoherenceModel(\n",
|
|
" topics=topic_words,\n",
|
|
" texts=tokens,\n",
|
|
" corpus=corpus,\n",
|
|
" dictionary=dictionary,\n",
|
|
" coherence=measure,\n",
|
|
" topn=num_words,\n",
|
|
" )\n",
|
|
" score = cm.get_coherence()\n",
|
|
" print(f\"Coherence ({measure}): {score:.4f}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d5e53ffc",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Topic List\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "47d87c37",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Topic 0: monkeyes | monkeys | monkey | monkeyforest | monkeying | primates | zoos | zoo | primate | monkeyed | apes | macaque | macaques | chimpanzees | zookeepers | zoological | wildlife | monkies | ape | chimpanzee | rainforest | monkeydance | monkes | forest | lemurs | rainforests | park | gibbon | monkeies | chimps | jungles | zookeeper | gorillas | attractions | forests | jungle | gazebo | visiting | orangutangs | borneo | zooperkids | parks | bananas | wilderness | rainforesty | monkees | gibbons | foraging | sightseeing | tourists\n",
|
|
"Topic 1: beach | beaches | beachs | beachside | beachfront | beachwalk | coastal | seaside | beached | beachcombing | seashore | shoreline | beache | beachwear | beachgoers | surf | beachline | surfs | coastline | waterfront | beachbag | beachclub | coastlines | beachclubs | beachbars | lagoons | dunes | ashore | coast | ocean | sand | lagoon | scenically | shore | cruises | resort | oceanfront | surfing | resorts | inland | coasts | onshore | reefs | boardwalk | scenic | sands | surfers | shores | seas | beachboys\n",
|
|
"Topic 2: temple | temples | shrines | shrine | pilgrimage | scenically | monuments | monument | attractions | sanctum | picturesque | scenic | monastery | tourism | pagoda | pagodas | tourist | visiting | bangkok | landmarks | tourists | templed | touristical | mosque | visit | sanctuaries | mecca | mandir | disneyland | tombs | tomb | scenery | touristified | memorial | terraces | waterfront | resort | mandira | coliseum | touristy | landmark | dome | courtyard | sightseeings | statues | touristic | lakeside | courtyards | amenities | grottoes\n",
|
|
"Topic 3: scenically | scenic | picturesque | attractions | scenery | tourism | tourists | visiting | tourist | sightseeings | sightseeing | resort | touristy | visit | waterfront | touristical | sightsee | touristed | beach | beachfront | resorts | touristic | nearby | beaches | visitors | touristified | touristique | visits | touristshops | breathtakingly | tours | oceanfront | destinations | breathtaking | excursion | vicinity | disneyland | hotels | landmarks | boattrip | patio | hotel | terrace | promenades | place | sunsets | landmark | promenade | trip | situated\n",
|
|
"Topic 4: zoo | zoos | zoological | zookeepers | zookeeper | wildlife | attractions | disneyland | zooperkids | zoovenir | foraging | elephants | elephant | animals | fauna | orangutangs | visiting | orangutang | accomodations | accomodation | tourism | habitat | picnicking | visits | habitats | excursions | tourists | touristy | excursion | tourist | accomadation | visitors | touristical | sightseeing | exhibits | visit | touristed | pets | picnics | animal | disneyworld | sightseeings | rainforest | orangutans | lion | orangutan | tours | giraffes | outings | park\n",
|
|
"Topic 5: temples | temple | shrines | sanctum | zoo | monuments | zoos | scenically | pagodas | scenic | pagoda | attractions | monument | monastery | shrine | tourism | tourist | monkeys | picturesque | grottoes | tourists | monkes | landmarks | pilgrimage | monkeies | monkies | templed | scenery | visiting | grotto | monkey | sightseeings | balcony | statues | apes | shangri | sightseeing | touristical | monkeyforest | monkeyes | touristy | visit | disneyland | gazebo | majestic | sacred | park | terraces | zookeepers | touristic\n",
|
|
"Topic 6: beaches | sanur | beach | san | beachs | sanura | sanctuaries | sanuar | seaside | beachside | coastal | beachwalk | beache | sanurs | seashore | inland | resort | beachfront | sanctoo | resorts | beachline | beachbars | coast | scenically | beachcombing | shoreline | beached | beachclub | beachwear | coastline | surf | touristy | waterfront | coasts | bukit | sands | ashore | shore | coastlines | lagoons | surfs | beachclubs | lagoon | beachgoers | sanctury | scenic | onshore | tourist | harbour | watersport\n",
|
|
"Topic 7: mountaineering | hike | hiking | hikes | climb | hiked | climbing | climbs | mountain | excursion | scenically | treking | climbed | expeditions | summit | scenic | wilderness | exploration | trails | ascent | tours | trekkingtour | mt | excursions | mountainous | hikers | mountains | journeys | summits | trip | mountainside | expedition | trips | explore | resort | embarking | sightseeing | summited | climbers | hiker | camping | explorations | climber | trail | elevations | camped | everest | terrain | sunbathed | cruising\n",
|
|
"Topic 8: uluwatu | temples | uluwati | uluwantu | temple | uluwata | uluwathu | ulawatu | shrines | uluawatu | uluvatu | uluatu | attractions | kilimanjaro | ulwathu | bangkok | uuwatu | ulluwatu | uluwatutemple | monuments | mecca | monument | ulwatu | ullawatu | dharmawangsa | tourism | shrine | ulundanu | ulan | ulun | klunkung | tourist | pilgrimage | disneyland | sanctum | landmarks | orangutang | scenically | ullun | klungkung | visiting | tourists | visit | angkor | purajati | orangutangs | scenic | kumingan | touristy | kutuh\n",
|
|
"Topic 9: gardens | attractions | scenically | scenic | resort | ponds | tourism | lagoons | picturesque | visiting | waterfront | poolside | lagoon | fountains | scenery | lake | habitats | tourist | amenities | waterpark | courtyards | landscapes | tourists | visit | beaches | pond | resorts | touristical | lakeside | courtyard | touristy | garden | pools | baths | aquatic | lakeshore | terraces | touristed | visits | hotels | habitat | waterfalls | watersport | backyard | excursion | freshwater | grottoes | villas | wading | touristic\n",
|
|
"Topic 10: temple | temples | shrines | attractions | bangkok | concerts | pilgrimage | promenades | scenic | promenade | festival | venue | shrine | disneyland | scenically | festivities | tourism | picturesque | tourist | choreography | tours | festivals | ceremonies | concert | coliseum | rituals | sanctum | venues | monuments | visiting | touristical | touristy | kudeta | tourists | reservations | sightseeing | breathtaking | sightseeings | gatherings | ritual | touristified | tour | breathtakingly | monastery | resort | visit | scenery | kuta | touristic | auditorium\n",
|
|
"Topic 11: temple | tanah | temples | tannah | pilgrimage | tanha | attractions | picturesque | tana | scenically | sundara | shrines | bangkok | tourism | pagoda | pagodas | visiting | tanhalot | scenic | mosque | sightseeings | sightseeing | tanaha | tanunj | tanoh | tanjung | tourist | tandon | tourists | monastery | tan | sightsee | tanh | mandira | mecca | mandala | goa | visita | visit | sunset | monuments | shrine | bengong | scenery | kandara | paradise | tanalot | disneyland | beachfront | visitng\n",
|
|
"Topic 12: beach | beaches | buffets | seafoods | seafood | buffet | beachs | cruises | restaurants | coastal | restaurant | beachside | beachgoers | bay | seashore | dining | diner | beachfront | shrimp | buffett | beachbars | shrimps | resort | seaside | shoreline | beache | beached | ashore | diners | coastline | beachcombing | marinade | coast | waterfront | resorts | bangkok | coastlines | picnicking | lagoons | boattrip | oceanfront | lunches | beachclubs | lagoon | bays | beachwear | tourism | barbecue | resturants | catering\n",
|
|
"Topic 13: beaches | beach | beachs | nusa | nusadua | coastal | resorts | resort | beachfront | coastline | beachside | coastlines | seaside | ashore | beache | visita | lagoons | scenically | lagoon | dusa | seashore | beachwalk | dunes | shoreline | nusas | scenic | inland | coast | surfs | surf | tourism | hotels | coasts | nusantara | oceanfront | noosa | maldives | beached | waterfront | nusapenida | tourist | sands | shore | merapi | reefs | bangkok | tourists | attractions | touristique | ocean\n",
|
|
"Topic 14: beaches | beach | beachs | seminyak | beachfront | seminyal | beachside | dunes | coastal | semiyak | coastline | kilimanjaro | coastlines | seminiyak | seashore | jakarta | beachgoers | seaside | bangkok | inland | desert | beached | beachwalk | beachwear | bahamas | resort | shoreline | scenically | lagoons | beachbars | ashore | beachcombing | lagoon | waterfront | beache | beachclub | resorts | arid | seminak | oceanfront | surf | beachline | beachclubs | beachbag | bengong | lakeside | surfs | coast | onshore | sands\n",
|
|
"Topic 15: beach | beaches | scenically | scenic | beachwalk | beachs | beachfront | beachside | scenery | shoreline | picturesque | oceanfront | coastline | seaside | beached | beachcombing | coastal | resort | waterfront | beachline | coastlines | ashore | surf | beachclub | beache | seashore | beachwear | hiking | surfs | beachclubs | reefs | cliffs | resorts | dunes | coast | beachgoers | ocean | beachbars | surfing | seafront | hike | beachbag | sightseeing | onshore | tourist | sightseeings | breathtaking | shore | tourism | boardwalk\n",
|
|
"Topic 16: beach | beaches | kuta | beachs | kudeta | kura | bangkok | beachside | beachfront | ku | kusuma | beachwalk | beache | coastal | kutat | kumbakarna | seaside | seashore | kupit | beached | inland | jakarta | kumala | waterfront | scenically | beachcombing | hawaii | kurusetra | kubu | kilimanjaro | shoreline | dunes | lagoons | kute | ashore | beachgoers | beachwear | beachbars | kutuh | kursi | kutta | lagoon | kumingan | maui | beachline | coastline | kudis | desert | resort | coastlines\n",
|
|
"Topic 17: temple | temples | shrines | shrine | pilgrimage | monastery | monuments | bangkok | monument | pagoda | scenically | visiting | sanctum | visitng | tourism | visit | tours | scenic | tourist | tombs | pagodas | mecca | grotto | mountaineering | statues | picturesque | landmarks | attractions | sightseeing | terraces | tomb | statue | scenery | tourists | templed | grottoes | mandir | sightseeings | mt | pyramid | mountainous | dharmawangsa | mosque | mandira | angkor | visits | disneyland | orangutang | archway | courtyard\n",
|
|
"Topic 18: hawkers | beaches | beach | beachs | coastal | beachcombing | beachwear | beachside | seaside | beachfront | harbour | hawker | beachgoers | beachbars | coastlines | coastline | ashore | beachbag | beachwalk | beached | seashore | onshore | lagoons | beache | shoreline | forshore | touristshops | wharf | tourists | coasts | waterfront | mangrove | sandcastles | bay | beachline | resort | patios | mangroves | beachclubs | tourist | coast | shore | restaurants | resorts | bukit | tourism | seascape | cruises | buffets | foreshore\n",
|
|
"Topic 19: scenically | scenic | volcano | scenery | resort | mt | mountain | volcanic | volcanoes | volcanos | mountaineering | attractions | sightseeings | sightseeing | visit | mountains | resorts | picturesque | visiting | tourism | sightsee | mountainous | erupted | hike | excursion | summit | volcanoe | mountainside | valcano | volcanics | tourist | summits | hiking | landmarks | eruption | mountaintop | tourists | hikes | lake | excursions | eruptions | landscape | trekkingtour | maui | lakeview | grotto | tours | touristy | caldera | touristed\n",
|
|
"Topic 20: scenically | scenic | scenery | beaches | resort | island | bahamas | beach | tourism | picturesque | oceanfront | paradise | surabaya | lembongan | waterfalls | islands | waterfall | bermuda | waterfront | attractions | visit | visiting | surf | resorts | beachfront | tourist | ashore | touristy | reefs | tourists | landscapes | surfs | boattrip | ocean | breathtakingly | surfing | volcanic | valcano | beachside | breathtaking | afar | coastlines | mauritius | seafront | crete | tsunami | ruins | coastline | sunrise | nearby\n",
|
|
"Topic 21: elephants | elephant | zoo | zoos | zoological | zookeeper | zookeepers | orangutangs | attractions | disneyland | fauna | orangutang | expeditions | zooperkids | exhibits | animale | rainforest | mammoth | wildlife | rhino | animals | orangutans | rhinos | mud | tours | adventures | giraffes | rainforesty | orangutan | adventure | zoovenir | mudbath | owl | exhibitions | apes | foraging | jungle | horseback | exhibit | rhinoceros | mudded | exhibition | expedition | giraffe | barn | monkeyforest | volunteered | excursions | jungles | whale\n",
|
|
"Topic 22: temple | temples | kuta | kudeta | bangkok | shrines | kura | kumingan | kumala | pagoda | pilgrimage | kusuma | mandira | ku | jakarta | mandir | kutuh | kurusetra | pagodas | kutta | kilimanjaro | scenically | kutat | shrine | klunkung | tourism | kursi | attractions | kuku | kumbakarna | monastery | kubu | visiting | sanctum | klungkung | mecca | tourist | traveloka | disneyland | kudis | visit | kuts | scenic | monuments | kute | kandara | tourists | kupit | monument | mosque\n",
|
|
"Topic 23: beaches | beach | beachs | jakarta | beachside | beachfront | coastal | kilimanjaro | resort | coastline | coastlines | beachwalk | seashore | ashore | beachgoers | seaside | surfs | surf | bahamas | tourism | lagoons | touristy | beachcombing | shoreline | tourist | bangkok | indonesia | maui | lagoon | scenically | beache | inland | waterfront | resorts | beached | surfing | dunes | beachbars | hawaii | sulawesi | beachclub | tourists | beachwear | beachline | onshore | surffing | reefs | islands | beachclubs | attractions\n",
|
|
"Topic 24: beaches | beach | beachfront | pandawa | shoreline | beachs | coastline | coastal | coastlines | beachside | pandawan | seashore | seaside | ashore | pandawas | lagoons | lagoon | oceanfront | seafront | beachwalk | hawaii | dunes | coast | inland | scenically | shore | bukit | surf | onshore | beached | waterfront | reefs | surfs | beachcombing | forshore | disneyland | pandama | lakeshore | beachline | sand | lakeside | pandava | desert | beachgoers | shangri | scenic | bangkok | padangtegal | kilimanjaro | reef\n",
|
|
"Topic 25: tirtagangga | tirtaganga | tiratagangga | tirta | gangga | ganga | visitng | titra | ponds | attractions | bengong | amplapura | ganesha | amlapura | tika | ibiza | visiti | sumatra | tourism | fountains | visita | wading | lagoons | lakeside | resort | villas | lanka | waterfalls | scenically | visit | jakarta | karagasem | surabaya | visiting | gardens | river | gannga | lagoon | changgu | borneo | ayam | sundara | baths | tourist | tegalalang | waterfall | tegallalang | tapas | kandara | pagodas\n",
|
|
"Topic 26: temples | temple | shrines | ulun | shrine | ulan | ulawatu | uluwata | uluwati | uluwatu | mandira | uluawatu | erawan | pagoda | ulundanu | mandir | danua | nepal | uluwantu | kandara | visiti | ganesha | pagodas | monument | ulwathu | dharmawangsa | mecca | uluvatu | uluatu | pilgrimage | visitng | nelayan | attractions | goa | scenically | lake | uluwathu | bangkok | ulluwatu | monuments | kilimanjaro | sanctum | monastery | mosque | diwali | paneida | ullun | krishna | kudeta | ullawatu\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"topics, probs, unq_num = model.get_topics()\n",
|
|
"\n",
|
|
"for i, topic_words in enumerate(topics):\n",
|
|
" print(f\"Topic {unq_num[i]}: {' | '.join(topic_words)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b3ea7ebe",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Search by term\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "d1c855fb",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Searching for topics related to 'monkey':\n",
|
|
"Topic 0: monkeyes | monkeys | monkey | monkeyforest | monkeying | primates | zoos | zoo | primate | monkeyed | apes | macaque | macaques | chimpanzees | zookeepers | zoological | wildlife | monkies | ape | chimpanzee | rainforest | monkeydance | monkes | forest | lemurs | rainforests | park | gibbon | monkeies | chimps | jungles | zookeeper | gorillas | attractions | forests | jungle | gazebo | visiting | orangutangs | borneo | zooperkids | parks | bananas | wilderness | rainforesty | monkees | gibbons | foraging | sightseeing | tourists\n",
|
|
"Topic 5: temples | temple | shrines | sanctum | zoo | monuments | zoos | scenically | pagodas | scenic | pagoda | attractions | monument | monastery | shrine | tourism | tourist | monkeys | picturesque | grottoes | tourists | monkes | landmarks | pilgrimage | monkeies | monkies | templed | scenery | visiting | grotto | monkey | sightseeings | balcony | statues | apes | shangri | sightseeing | touristical | monkeyforest | monkeyes | touristy | visit | disneyland | gazebo | majestic | sacred | park | terraces | zookeepers | touristic\n",
|
|
"Topic 4: zoo | zoos | zoological | zookeepers | zookeeper | wildlife | attractions | disneyland | zooperkids | zoovenir | foraging | elephants | elephant | animals | fauna | orangutangs | visiting | orangutang | accomodations | accomodation | tourism | habitat | picnicking | visits | habitats | excursions | tourists | touristy | excursion | tourist | accomadation | visitors | touristical | sightseeing | exhibits | visit | touristed | pets | picnics | animal | disneyworld | sightseeings | rainforest | orangutans | lion | orangutan | tours | giraffes | outings | park\n",
|
|
"Topic 21: elephants | elephant | zoo | zoos | zoological | zookeeper | zookeepers | orangutangs | attractions | disneyland | fauna | orangutang | expeditions | zooperkids | exhibits | animale | rainforest | mammoth | wildlife | rhino | animals | orangutans | rhinos | mud | tours | adventures | giraffes | rainforesty | orangutan | adventure | zoovenir | mudbath | owl | exhibitions | apes | foraging | jungle | horseback | exhibit | rhinoceros | mudded | exhibition | expedition | giraffe | barn | monkeyforest | volunteered | excursions | jungles | whale\n",
|
|
"Topic 24: beaches | beach | beachfront | pandawa | shoreline | beachs | coastline | coastal | coastlines | beachside | pandawan | seashore | seaside | ashore | pandawas | lagoons | lagoon | oceanfront | seafront | beachwalk | hawaii | dunes | coast | inland | scenically | shore | bukit | surf | onshore | beached | waterfront | reefs | surfs | beachcombing | forshore | disneyland | pandama | lakeshore | beachline | sand | lakeside | pandava | desert | beachgoers | shangri | scenic | bangkok | padangtegal | kilimanjaro | reef\n",
|
|
"Topic 8: uluwatu | temples | uluwati | uluwantu | temple | uluwata | uluwathu | ulawatu | shrines | uluawatu | uluvatu | uluatu | attractions | kilimanjaro | ulwathu | bangkok | uuwatu | ulluwatu | uluwatutemple | monuments | mecca | monument | ulwatu | ullawatu | dharmawangsa | tourism | shrine | ulundanu | ulan | ulun | klunkung | tourist | pilgrimage | disneyland | sanctum | landmarks | orangutang | scenically | ullun | klungkung | visiting | tourists | visit | angkor | purajati | orangutangs | scenic | kumingan | touristy | kutuh\n",
|
|
"Topic 20: scenically | scenic | scenery | beaches | resort | island | bahamas | beach | tourism | picturesque | oceanfront | paradise | surabaya | lembongan | waterfalls | islands | waterfall | bermuda | waterfront | attractions | visit | visiting | surf | resorts | beachfront | tourist | ashore | touristy | reefs | tourists | landscapes | surfs | boattrip | ocean | breathtakingly | surfing | volcanic | valcano | beachside | breathtaking | afar | coastlines | mauritius | seafront | crete | tsunami | ruins | coastline | sunrise | nearby\n",
|
|
"Topic 12: beach | beaches | buffets | seafoods | seafood | buffet | beachs | cruises | restaurants | coastal | restaurant | beachside | beachgoers | bay | seashore | dining | diner | beachfront | shrimp | buffett | beachbars | shrimps | resort | seaside | shoreline | beache | beached | ashore | diners | coastline | beachcombing | marinade | coast | waterfront | resorts | bangkok | coastlines | picnicking | lagoons | boattrip | oceanfront | lunches | beachclubs | lagoon | bays | beachwear | tourism | barbecue | resturants | catering\n",
|
|
"Topic 23: beaches | beach | beachs | jakarta | beachside | beachfront | coastal | kilimanjaro | resort | coastline | coastlines | beachwalk | seashore | ashore | beachgoers | seaside | surfs | surf | bahamas | tourism | lagoons | touristy | beachcombing | shoreline | tourist | bangkok | indonesia | maui | lagoon | scenically | beache | inland | waterfront | resorts | beached | surfing | dunes | beachbars | hawaii | sulawesi | beachclub | tourists | beachwear | beachline | onshore | surffing | reefs | islands | beachclubs | attractions\n",
|
|
"Topic 14: beaches | beach | beachs | seminyak | beachfront | seminyal | beachside | dunes | coastal | semiyak | coastline | kilimanjaro | coastlines | seminiyak | seashore | jakarta | beachgoers | seaside | bangkok | inland | desert | beached | beachwalk | beachwear | bahamas | resort | shoreline | scenically | lagoons | beachbars | ashore | beachcombing | lagoon | waterfront | beache | beachclub | resorts | arid | seminak | oceanfront | surf | beachline | beachclubs | beachbag | bengong | lakeside | surfs | coast | onshore | sands\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"search_term = \"monkey\"\n",
|
|
"\n",
|
|
"print(f\"\\nSearching for topics related to '{search_term}':\")\n",
|
|
"num_topics = min(model.get_num_topics(), 10)\n",
|
|
"topic_words, _, _, _ = model.search_topics(\n",
|
|
" keywords=[search_term], num_topics=num_topics\n",
|
|
")\n",
|
|
"\n",
|
|
"for words in topic_words:\n",
|
|
" topics, probs, unq_num = model.get_topics()\n",
|
|
" for i, topic_words in enumerate(topics):\n",
|
|
" if set(words).issubset(set(topic_words)):\n",
|
|
" unq_num = unq_num[i]\n",
|
|
" break\n",
|
|
"\n",
|
|
" print(f\"Topic {unq_num}: {' | '.join(words)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1c3b894e",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Search by topic ID\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "b7dbf512",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Topic 0:\n",
|
|
"Top words: monkeyes | monkeys | monkey | monkeyforest | monkeying | primates | zoos | zoo | primate | monkeyed | apes | macaque | macaques | chimpanzees | zookeepers | zoological | wildlife | monkies | ape | chimpanzee | rainforest | monkeydance | monkes | forest | lemurs | rainforests | park | gibbon | monkeies | chimps | jungles | zookeeper | gorillas | attractions | forests | jungle | gazebo | visiting | orangutangs | borneo | zooperkids | parks | bananas | wilderness | rainforesty | monkees | gibbons | foraging | sightseeing | tourists\n",
|
|
"Doc 1 (Score: 0.92): as our accommodation was right next to monkey forest it was worth a visit. the monkeys are extremely tame and used to visitors. if you buy some food to feed them they'll jump on your shoulder, which is great if you are wanting a picture with them. very beautiful surroundings in the forest too.\n",
|
|
"Doc 2 (Score: 0.91): the monkey forest was nice to visit. the monkeys are so sweet and you see them up close. you can also buy bananas to feed the monkeys - also a good thing to do if you want a picture with a monkey. but if you buy bananas be careful. if the monkeys see them they come to you. i would recommend only to show them to the small ones and feed them. one of the bigger monkeys saw my two last bananas and came to me and i gave one of them. but the monkey also wanted the other one so it bit me. nothing dangerous. one from the monkey forest staff told me all the monkeys are clean.\n",
|
|
"Doc 3 (Score: 0.91): this place is a must for everyone visiting . my husband and brother said this was the highlight of their trip. they bought lots of bananas and had lots of monkeys climbing on them. myself, my daughter and my sister in law only fed a monkey once to get a picture. we had lots of laughs watching the boys interact with the monkeys. please note that the monkeys aren't tame. there are signs around with advice about the monkeys definitely read and follow them. good place to have a walk around only need about a hour to visit.\n",
|
|
"Doc 4 (Score: 0.91): this place is amazing! the stroll through the forest is very nice and there are some amazing bridges and sculptures carved into the rocks, with a stream of water flowing. the monkeys are quite naughty, so its best to keep a safe distance. don't touch them, or get close. they steal hats, sunglasses, cameras, and will try to get into your bag if you have food. keep a safe distance and you will be fine. a great place!\n",
|
|
"Doc 5 (Score: 0.91): despite hearing somewhat negative opinions on the monkey sanctuary, i was excited to travel to this forest and see the local monkeys. i wasn't disappointed! there are monkeys running around everywhere, and in addition, the temples and statues are beautiful to look at. if you follow the rules of the sanctuary, the monkeys will probably not bother you. if you want to get up close and personal with them, there are bananas for sale that you can buy and share with the monkeys. keep a tight hold on your things, as when i was leaving, i saw a monkey grab another person's phone and run off with it. just be wary of the mischievous monkeys and you'll have a great time! probably my favorite experience in !\n",
|
|
"Doc 6 (Score: 0.91): beautiful location and lots of nice views in this forest. we had always planned to go to monkey forest as we had heard so much about it. it was nice to see how free they monkeys are and how many staff were there to ensure that the monkeys were treated well by visitors. make sure you keep any valuables hidden away and try not to go into your bag in front of the monkeys as they will think you are taking out something for them and when they have their eye on you, they will stalk you! :) this was such a fun day and at the end of the trip we bought a handful of bananas for a reasonable price from a seller within the forest. the monkeys are happy to jump all over you and rifle through your clothes, hair etc. i guess this is not for the faint hearted but i couldn't have visited ubud without going here! would 100% recommend to others :)\n",
|
|
"Doc 7 (Score: 0.91): the monkey forest was better than i expected, it's a really large area to walk around with some temples, bridges & carvings-lots of photo opportunities & nice scenery there are way more monkeys than i expected! i'm a wuss & did not buy bananas to let the monkeys climb on me but we did watch some others do this-there are plenty of staff supervising & they were telling tourists which monkeys were friendly & ok to approach & which ones to give space i had heard horror stories about aggressive monkeys, pick pocketing monkeys but didn't find any of this my partner sat down & some smaller ones climbed into his lap, the babies especially are cute to watch, fun activity\n",
|
|
"Doc 8 (Score: 0.91): monkey forest was a great place to visit, but it's important to follow certain rules. - it's best to visit early morning before the crowds and heat - do not carry any food and bananas (they say it everywhere but some people do anyway and that's where most of the horror stories come from) - do not have anything shiny on display - monkeys will take it for sure! we even saw a monkey try to take a girls earrings! - if the monkey climbs on you - stay calm and walk away - it really works - screaming and trying to shake it off - doesn't) the forest itself is more like a park - it's nice to see and you can get some great photos of the monkeys\n",
|
|
"Doc 9 (Score: 0.91): at the recommendation of friends we visited the sacred monkey forest sanctuary in ubud - the forest was beautiful and there were certainly plenty of monkeys. we'd been warned not to get too close to the monkeys as they could bite (my girlfriends daughter had been bitten on a previous visit) - my husband and our friend's son decided to buy some bananas so that they could have photo's taken with the monkeys sitting on them - rather them than me :). caution - hold on tightly to your backpacks/cameras/handbags as we'd also been told that the monkeys could grab them and scamper back into the forest. pleasant way to spend an hour or so.\n",
|
|
"Doc 10 (Score: 0.91): i love monkeys!!! this trip is well worth it to see these cheeky critters. i could literally watch them for hours. its very cheap to get in the monkey forest and if you want to feed them you can get a real up close n personal. pack all your sunglasses, water bottle anything they can steal away. its a beautiful area i love it. there are keepers and handlers around which can be needed as monkeys can be aggressive and bite which you would need a rabies shot for especially if you have something they want. (they are wild animals at the end of the day) not normally interested in you if you don't have food. funny lil buggers. also beautiful statues in the area. there is a temple here. i always recommend monkey forest to anyone who hasn't been its magical\n",
|
|
"Doc 11 (Score: 0.91): great experience as with the other times we visited...they say take all your jewelry of etc but really the only thing the monkeys are interested in is food, so if you don't buy the bananas & no food on you, they will not go near you! i recommend buying the bananas to get that amazing pics with a monkey on you! they have guides who you can pay a couple of dollars to ensure you get that perfect pic with the monkeys...beauitful small forest...only saw one aggressive monkey who was soon scared away by the guides...lots of baby monkeys with their mummas...my 4 year old son had his first experience of a monkey on him...the monkeys are very used to tourists unlike in other areas of . see you next year monkey forest!\n",
|
|
"Doc 12 (Score: 0.91): we visited the monkey forest as part of day long tour and i wasn't sure what to expect but it was truly magical to be able to see this monkeys in a (mostly) natural habitat- if they want to be near people they can be and if they don't they can run off into the forest which makes a nice change from seeing animals caged up. there are banana stands where you can buy different sized bunches of bananas to feed to the monkeys. although i was way too scared to do this my sister did and monkeys were climbing up on her to get the bananas which was equally exciting to watch! i would definitely recommend visiting this place.\n",
|
|
"Doc 13 (Score: 0.91): we visited the sacred monkey forest sanctuary in ubud with 2 adults, 1 teen and 3 kids under 8. it was a hot, sticky day and from the moment our driver pulled up we were delighted to see monkeys everywhere, some mischievously hopping around and others just relaxing and ignoring the dozens of tourists. we were warned to leave all belongings in the car and we were glad we did when we saw monkeys launching at people to steal their water bottles and snacks. there was such a wide variety of monkey ages and personalities, some adorable shy baby monkeys, some older, calmer monkeys, an angry one or two the a while bunch of cheeky ones. half the group had monkeys climb on us and the others enjoyed from a distance. the forest itself is gorgeous but it was difficult to take in the sites when there are so many people. the staff varied from friendly and helpful to rip-off merchants, taking our money then giving bananas straight to the monkeys and shrugging their shoulders. definitely a worthwhile experience.\n",
|
|
"Doc 14 (Score: 0.91): spent an enjoyable hour or so wandering around the monkey sanctuary near ubud. this area is set out like an open zoo with the animals wandering freely along the paths and surrounding forest. they are fed regularly, so there's none of the aggression you sometimes get with the fully wild monkeys who have become used to being fed by tourists. they seemed healthy and well, and the large numbers of babies with their mothers was a real treat. we were advised to look not touch the monkeys, but if you wanted a more personal experience you could buy bananas, and straight away monkeys were climbing on your shoulders, or grabbing them out of your hands! plenty of staff on hand if rescue needed :-) this complex includes an two attractive temples, and a well paved walk way through the trees.\n",
|
|
"Doc 15 (Score: 0.91): i was nervous about going here after reading some of the reviews about misbehaving monkeys, but i had a great time. it's probably a good idea to be mindful about having food or things they'd like to steal, but we didn't have any issues. none of the monkeys even tried to touch us. there was an employee who walked around the whole place with us to make sure there weren't any issues, and she only had to shoo one monkey once. our guide told us that a lot fewer people have been going, and we thought that was a real shame. (in fact, we saw much more aughty\\ monkeys at one of the temples than we did here.) the monkeys were a lot of fun, and it's a beautiful, neat, jungley forest. there was also a fun little spot where you could take your pictures wearing traditional hats/crowns and holding a bat. definitely recommended!\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"topic_id = 0\n",
|
|
"\n",
|
|
"print(f\"Topic {topic_id}:\")\n",
|
|
"print(\"Top words:\", \" | \".join(topics[topic_id]))\n",
|
|
"\n",
|
|
"docs, doc_scores, doc_ids = model.search_documents_by_topic(\n",
|
|
" topic_num=topic_id, num_docs=15\n",
|
|
")\n",
|
|
"for i, doc in enumerate(docs):\n",
|
|
" print(f\"Doc {i+1} (Score: {doc_scores[i]:.2f}): {doc}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "be02d93c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
|
|
" warnings.warn(\n",
|
|
"/media/marvin/Dev/Git/masterthesis-praktisch/.venv/lib/python3.12/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
|
|
" warn(\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"application/vnd.plotly.v1+json": {
|
|
"config": {
|
|
"plotlyServerURL": "https://plot.ly"
|
|
},
|
|
"data": [
|
|
{
|
|
"customdata": [
|
|
[
|
|
"monkeyes | monkeys | monkey | monkeyforest | monkeying",
|
|
0
|
|
],
|
|
[
|
|
"beach | beaches | beachs | beachside | beachfront",
|
|
1
|
|
],
|
|
[
|
|
"temple | temples | shrines | shrine | pilgrimage",
|
|
2
|
|
],
|
|
[
|
|
"scenically | scenic | picturesque | attractions | scenery",
|
|
3
|
|
],
|
|
[
|
|
"zoo | zoos | zoological | zookeepers | zookeeper",
|
|
4
|
|
],
|
|
[
|
|
"temples | temple | shrines | sanctum | zoo",
|
|
5
|
|
],
|
|
[
|
|
"beaches | sanur | beach | san | beachs",
|
|
6
|
|
],
|
|
[
|
|
"mountaineering | hike | hiking | hikes | climb",
|
|
7
|
|
],
|
|
[
|
|
"uluwatu | temples | uluwati | uluwantu | temple",
|
|
8
|
|
],
|
|
[
|
|
"gardens | attractions | scenically | scenic | resort",
|
|
9
|
|
],
|
|
[
|
|
"temple | temples | shrines | attractions | bangkok",
|
|
10
|
|
],
|
|
[
|
|
"temple | tanah | temples | tannah | pilgrimage",
|
|
11
|
|
],
|
|
[
|
|
"beach | beaches | buffets | seafoods | seafood",
|
|
12
|
|
],
|
|
[
|
|
"beaches | beach | beachs | nusa | nusadua",
|
|
13
|
|
],
|
|
[
|
|
"beaches | beach | beachs | seminyak | beachfront",
|
|
14
|
|
],
|
|
[
|
|
"beach | beaches | scenically | scenic | beachwalk",
|
|
15
|
|
],
|
|
[
|
|
"beach | beaches | kuta | beachs | kudeta",
|
|
16
|
|
],
|
|
[
|
|
"temple | temples | shrines | shrine | pilgrimage",
|
|
17
|
|
],
|
|
[
|
|
"hawkers | beaches | beach | beachs | coastal",
|
|
18
|
|
],
|
|
[
|
|
"scenically | scenic | volcano | scenery | resort",
|
|
19
|
|
],
|
|
[
|
|
"scenically | scenic | scenery | beaches | resort",
|
|
20
|
|
],
|
|
[
|
|
"elephants | elephant | zoo | zoos | zoological",
|
|
21
|
|
],
|
|
[
|
|
"temple | temples | kuta | kudeta | bangkok",
|
|
22
|
|
],
|
|
[
|
|
"beaches | beach | beachs | jakarta | beachside",
|
|
23
|
|
],
|
|
[
|
|
"beaches | beach | beachfront | pandawa | shoreline",
|
|
24
|
|
],
|
|
[
|
|
"tirtagangga | tirtaganga | tiratagangga | tirta | gangga",
|
|
25
|
|
],
|
|
[
|
|
"temples | temple | shrines | ulun | shrine",
|
|
26
|
|
]
|
|
],
|
|
"hovertemplate": "Size=%{customdata[1]}<br>Topic Number=%{text}<br>Top Words=%{customdata[0]}<extra></extra>",
|
|
"legendgroup": "",
|
|
"marker": {
|
|
"color": "#636efa",
|
|
"size": {
|
|
"bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBka",
|
|
"dtype": "i1"
|
|
},
|
|
"sizemode": "area",
|
|
"sizeref": 0.065,
|
|
"symbol": "circle"
|
|
},
|
|
"mode": "markers+text",
|
|
"name": "",
|
|
"orientation": "v",
|
|
"showlegend": false,
|
|
"text": {
|
|
"bdata": "AAAAAAAp0UAAAAAAADTAQAAAAAAAJLBAAAAAAABYrkAAAAAAANykQAAAAAAADqBAAAAAAADImEAAAAAAAOCXQAAAAAAA6JVAAAAAAAB0k0AAAAAAACSTQAAAAAAAWJJAAAAAAACYkUAAAAAAADyRQAAAAAAACJFAAAAAAAAwj0AAAAAAAJCNQAAAAAAAcIFAAAAAAACggEAAAAAAAGCAQAAAAAAAoHxAAAAAAABgfEAAAAAAAKB5QAAAAAAAQHlAAAAAAABAeUAAAAAAAOB1QAAAAAAAAHRA",
|
|
"dtype": "f8"
|
|
},
|
|
"textposition": "top center",
|
|
"type": "scatter",
|
|
"x": {
|
|
"bdata": "GzdhQA+gCkC55YtABfESQHwRSUCfwoZA+ws2P+/UE0AuKJtA3zY9QNTSkkBIU4NA13P5P9XDaj8GOr0/w7fGPzgfkj8HZqNAacXqPwTd8j+WCgxAH9RiQGJbnEADc7g/CWOuP8gYZUBB6qdA",
|
|
"dtype": "f4"
|
|
},
|
|
"xaxis": "x",
|
|
"y": {
|
|
"bdata": "x8P7QAfkLEFPBQJBTywPQVyNAkGK3/dA5DwkQWJdBUH6vgJB9mwUQTiJ7ECmfORALRQnQdluKUG/9CdB1YgWQcroLUEO0+pAsUovQcR4A0GV/RVBdu8FQXLI90AmRDVB6JEiQX6kFUFJggNB",
|
|
"dtype": "f4"
|
|
},
|
|
"yaxis": "y"
|
|
}
|
|
],
|
|
"layout": {
|
|
"legend": {
|
|
"itemsizing": "constant",
|
|
"tracegroupgap": 0
|
|
},
|
|
"template": {
|
|
"data": {
|
|
"bar": [
|
|
{
|
|
"error_x": {
|
|
"color": "#2a3f5f"
|
|
},
|
|
"error_y": {
|
|
"color": "#2a3f5f"
|
|
},
|
|
"marker": {
|
|
"line": {
|
|
"color": "#E5ECF6",
|
|
"width": 0.5
|
|
},
|
|
"pattern": {
|
|
"fillmode": "overlay",
|
|
"size": 10,
|
|
"solidity": 0.2
|
|
}
|
|
},
|
|
"type": "bar"
|
|
}
|
|
],
|
|
"barpolar": [
|
|
{
|
|
"marker": {
|
|
"line": {
|
|
"color": "#E5ECF6",
|
|
"width": 0.5
|
|
},
|
|
"pattern": {
|
|
"fillmode": "overlay",
|
|
"size": 10,
|
|
"solidity": 0.2
|
|
}
|
|
},
|
|
"type": "barpolar"
|
|
}
|
|
],
|
|
"carpet": [
|
|
{
|
|
"aaxis": {
|
|
"endlinecolor": "#2a3f5f",
|
|
"gridcolor": "white",
|
|
"linecolor": "white",
|
|
"minorgridcolor": "white",
|
|
"startlinecolor": "#2a3f5f"
|
|
},
|
|
"baxis": {
|
|
"endlinecolor": "#2a3f5f",
|
|
"gridcolor": "white",
|
|
"linecolor": "white",
|
|
"minorgridcolor": "white",
|
|
"startlinecolor": "#2a3f5f"
|
|
},
|
|
"type": "carpet"
|
|
}
|
|
],
|
|
"choropleth": [
|
|
{
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
},
|
|
"type": "choropleth"
|
|
}
|
|
],
|
|
"contour": [
|
|
{
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
},
|
|
"colorscale": [
|
|
[
|
|
0,
|
|
"#0d0887"
|
|
],
|
|
[
|
|
0.1111111111111111,
|
|
"#46039f"
|
|
],
|
|
[
|
|
0.2222222222222222,
|
|
"#7201a8"
|
|
],
|
|
[
|
|
0.3333333333333333,
|
|
"#9c179e"
|
|
],
|
|
[
|
|
0.4444444444444444,
|
|
"#bd3786"
|
|
],
|
|
[
|
|
0.5555555555555556,
|
|
"#d8576b"
|
|
],
|
|
[
|
|
0.6666666666666666,
|
|
"#ed7953"
|
|
],
|
|
[
|
|
0.7777777777777778,
|
|
"#fb9f3a"
|
|
],
|
|
[
|
|
0.8888888888888888,
|
|
"#fdca26"
|
|
],
|
|
[
|
|
1,
|
|
"#f0f921"
|
|
]
|
|
],
|
|
"type": "contour"
|
|
}
|
|
],
|
|
"contourcarpet": [
|
|
{
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
},
|
|
"type": "contourcarpet"
|
|
}
|
|
],
|
|
"heatmap": [
|
|
{
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
},
|
|
"colorscale": [
|
|
[
|
|
0,
|
|
"#0d0887"
|
|
],
|
|
[
|
|
0.1111111111111111,
|
|
"#46039f"
|
|
],
|
|
[
|
|
0.2222222222222222,
|
|
"#7201a8"
|
|
],
|
|
[
|
|
0.3333333333333333,
|
|
"#9c179e"
|
|
],
|
|
[
|
|
0.4444444444444444,
|
|
"#bd3786"
|
|
],
|
|
[
|
|
0.5555555555555556,
|
|
"#d8576b"
|
|
],
|
|
[
|
|
0.6666666666666666,
|
|
"#ed7953"
|
|
],
|
|
[
|
|
0.7777777777777778,
|
|
"#fb9f3a"
|
|
],
|
|
[
|
|
0.8888888888888888,
|
|
"#fdca26"
|
|
],
|
|
[
|
|
1,
|
|
"#f0f921"
|
|
]
|
|
],
|
|
"type": "heatmap"
|
|
}
|
|
],
|
|
"histogram": [
|
|
{
|
|
"marker": {
|
|
"pattern": {
|
|
"fillmode": "overlay",
|
|
"size": 10,
|
|
"solidity": 0.2
|
|
}
|
|
},
|
|
"type": "histogram"
|
|
}
|
|
],
|
|
"histogram2d": [
|
|
{
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
},
|
|
"colorscale": [
|
|
[
|
|
0,
|
|
"#0d0887"
|
|
],
|
|
[
|
|
0.1111111111111111,
|
|
"#46039f"
|
|
],
|
|
[
|
|
0.2222222222222222,
|
|
"#7201a8"
|
|
],
|
|
[
|
|
0.3333333333333333,
|
|
"#9c179e"
|
|
],
|
|
[
|
|
0.4444444444444444,
|
|
"#bd3786"
|
|
],
|
|
[
|
|
0.5555555555555556,
|
|
"#d8576b"
|
|
],
|
|
[
|
|
0.6666666666666666,
|
|
"#ed7953"
|
|
],
|
|
[
|
|
0.7777777777777778,
|
|
"#fb9f3a"
|
|
],
|
|
[
|
|
0.8888888888888888,
|
|
"#fdca26"
|
|
],
|
|
[
|
|
1,
|
|
"#f0f921"
|
|
]
|
|
],
|
|
"type": "histogram2d"
|
|
}
|
|
],
|
|
"histogram2dcontour": [
|
|
{
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
},
|
|
"colorscale": [
|
|
[
|
|
0,
|
|
"#0d0887"
|
|
],
|
|
[
|
|
0.1111111111111111,
|
|
"#46039f"
|
|
],
|
|
[
|
|
0.2222222222222222,
|
|
"#7201a8"
|
|
],
|
|
[
|
|
0.3333333333333333,
|
|
"#9c179e"
|
|
],
|
|
[
|
|
0.4444444444444444,
|
|
"#bd3786"
|
|
],
|
|
[
|
|
0.5555555555555556,
|
|
"#d8576b"
|
|
],
|
|
[
|
|
0.6666666666666666,
|
|
"#ed7953"
|
|
],
|
|
[
|
|
0.7777777777777778,
|
|
"#fb9f3a"
|
|
],
|
|
[
|
|
0.8888888888888888,
|
|
"#fdca26"
|
|
],
|
|
[
|
|
1,
|
|
"#f0f921"
|
|
]
|
|
],
|
|
"type": "histogram2dcontour"
|
|
}
|
|
],
|
|
"mesh3d": [
|
|
{
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
},
|
|
"type": "mesh3d"
|
|
}
|
|
],
|
|
"parcoords": [
|
|
{
|
|
"line": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "parcoords"
|
|
}
|
|
],
|
|
"pie": [
|
|
{
|
|
"automargin": true,
|
|
"type": "pie"
|
|
}
|
|
],
|
|
"scatter": [
|
|
{
|
|
"fillpattern": {
|
|
"fillmode": "overlay",
|
|
"size": 10,
|
|
"solidity": 0.2
|
|
},
|
|
"type": "scatter"
|
|
}
|
|
],
|
|
"scatter3d": [
|
|
{
|
|
"line": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"marker": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "scatter3d"
|
|
}
|
|
],
|
|
"scattercarpet": [
|
|
{
|
|
"marker": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "scattercarpet"
|
|
}
|
|
],
|
|
"scattergeo": [
|
|
{
|
|
"marker": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "scattergeo"
|
|
}
|
|
],
|
|
"scattergl": [
|
|
{
|
|
"marker": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "scattergl"
|
|
}
|
|
],
|
|
"scattermap": [
|
|
{
|
|
"marker": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "scattermap"
|
|
}
|
|
],
|
|
"scattermapbox": [
|
|
{
|
|
"marker": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "scattermapbox"
|
|
}
|
|
],
|
|
"scatterpolar": [
|
|
{
|
|
"marker": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "scatterpolar"
|
|
}
|
|
],
|
|
"scatterpolargl": [
|
|
{
|
|
"marker": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "scatterpolargl"
|
|
}
|
|
],
|
|
"scatterternary": [
|
|
{
|
|
"marker": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"type": "scatterternary"
|
|
}
|
|
],
|
|
"surface": [
|
|
{
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
},
|
|
"colorscale": [
|
|
[
|
|
0,
|
|
"#0d0887"
|
|
],
|
|
[
|
|
0.1111111111111111,
|
|
"#46039f"
|
|
],
|
|
[
|
|
0.2222222222222222,
|
|
"#7201a8"
|
|
],
|
|
[
|
|
0.3333333333333333,
|
|
"#9c179e"
|
|
],
|
|
[
|
|
0.4444444444444444,
|
|
"#bd3786"
|
|
],
|
|
[
|
|
0.5555555555555556,
|
|
"#d8576b"
|
|
],
|
|
[
|
|
0.6666666666666666,
|
|
"#ed7953"
|
|
],
|
|
[
|
|
0.7777777777777778,
|
|
"#fb9f3a"
|
|
],
|
|
[
|
|
0.8888888888888888,
|
|
"#fdca26"
|
|
],
|
|
[
|
|
1,
|
|
"#f0f921"
|
|
]
|
|
],
|
|
"type": "surface"
|
|
}
|
|
],
|
|
"table": [
|
|
{
|
|
"cells": {
|
|
"fill": {
|
|
"color": "#EBF0F8"
|
|
},
|
|
"line": {
|
|
"color": "white"
|
|
}
|
|
},
|
|
"header": {
|
|
"fill": {
|
|
"color": "#C8D4E3"
|
|
},
|
|
"line": {
|
|
"color": "white"
|
|
}
|
|
},
|
|
"type": "table"
|
|
}
|
|
]
|
|
},
|
|
"layout": {
|
|
"annotationdefaults": {
|
|
"arrowcolor": "#2a3f5f",
|
|
"arrowhead": 0,
|
|
"arrowwidth": 1
|
|
},
|
|
"autotypenumbers": "strict",
|
|
"coloraxis": {
|
|
"colorbar": {
|
|
"outlinewidth": 0,
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"colorscale": {
|
|
"diverging": [
|
|
[
|
|
0,
|
|
"#8e0152"
|
|
],
|
|
[
|
|
0.1,
|
|
"#c51b7d"
|
|
],
|
|
[
|
|
0.2,
|
|
"#de77ae"
|
|
],
|
|
[
|
|
0.3,
|
|
"#f1b6da"
|
|
],
|
|
[
|
|
0.4,
|
|
"#fde0ef"
|
|
],
|
|
[
|
|
0.5,
|
|
"#f7f7f7"
|
|
],
|
|
[
|
|
0.6,
|
|
"#e6f5d0"
|
|
],
|
|
[
|
|
0.7,
|
|
"#b8e186"
|
|
],
|
|
[
|
|
0.8,
|
|
"#7fbc41"
|
|
],
|
|
[
|
|
0.9,
|
|
"#4d9221"
|
|
],
|
|
[
|
|
1,
|
|
"#276419"
|
|
]
|
|
],
|
|
"sequential": [
|
|
[
|
|
0,
|
|
"#0d0887"
|
|
],
|
|
[
|
|
0.1111111111111111,
|
|
"#46039f"
|
|
],
|
|
[
|
|
0.2222222222222222,
|
|
"#7201a8"
|
|
],
|
|
[
|
|
0.3333333333333333,
|
|
"#9c179e"
|
|
],
|
|
[
|
|
0.4444444444444444,
|
|
"#bd3786"
|
|
],
|
|
[
|
|
0.5555555555555556,
|
|
"#d8576b"
|
|
],
|
|
[
|
|
0.6666666666666666,
|
|
"#ed7953"
|
|
],
|
|
[
|
|
0.7777777777777778,
|
|
"#fb9f3a"
|
|
],
|
|
[
|
|
0.8888888888888888,
|
|
"#fdca26"
|
|
],
|
|
[
|
|
1,
|
|
"#f0f921"
|
|
]
|
|
],
|
|
"sequentialminus": [
|
|
[
|
|
0,
|
|
"#0d0887"
|
|
],
|
|
[
|
|
0.1111111111111111,
|
|
"#46039f"
|
|
],
|
|
[
|
|
0.2222222222222222,
|
|
"#7201a8"
|
|
],
|
|
[
|
|
0.3333333333333333,
|
|
"#9c179e"
|
|
],
|
|
[
|
|
0.4444444444444444,
|
|
"#bd3786"
|
|
],
|
|
[
|
|
0.5555555555555556,
|
|
"#d8576b"
|
|
],
|
|
[
|
|
0.6666666666666666,
|
|
"#ed7953"
|
|
],
|
|
[
|
|
0.7777777777777778,
|
|
"#fb9f3a"
|
|
],
|
|
[
|
|
0.8888888888888888,
|
|
"#fdca26"
|
|
],
|
|
[
|
|
1,
|
|
"#f0f921"
|
|
]
|
|
]
|
|
},
|
|
"colorway": [
|
|
"#636efa",
|
|
"#EF553B",
|
|
"#00cc96",
|
|
"#ab63fa",
|
|
"#FFA15A",
|
|
"#19d3f3",
|
|
"#FF6692",
|
|
"#B6E880",
|
|
"#FF97FF",
|
|
"#FECB52"
|
|
],
|
|
"font": {
|
|
"color": "#2a3f5f"
|
|
},
|
|
"geo": {
|
|
"bgcolor": "white",
|
|
"lakecolor": "white",
|
|
"landcolor": "#E5ECF6",
|
|
"showlakes": true,
|
|
"showland": true,
|
|
"subunitcolor": "white"
|
|
},
|
|
"hoverlabel": {
|
|
"align": "left"
|
|
},
|
|
"hovermode": "closest",
|
|
"mapbox": {
|
|
"style": "light"
|
|
},
|
|
"paper_bgcolor": "white",
|
|
"plot_bgcolor": "#E5ECF6",
|
|
"polar": {
|
|
"angularaxis": {
|
|
"gridcolor": "white",
|
|
"linecolor": "white",
|
|
"ticks": ""
|
|
},
|
|
"bgcolor": "#E5ECF6",
|
|
"radialaxis": {
|
|
"gridcolor": "white",
|
|
"linecolor": "white",
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"scene": {
|
|
"xaxis": {
|
|
"backgroundcolor": "#E5ECF6",
|
|
"gridcolor": "white",
|
|
"gridwidth": 2,
|
|
"linecolor": "white",
|
|
"showbackground": true,
|
|
"ticks": "",
|
|
"zerolinecolor": "white"
|
|
},
|
|
"yaxis": {
|
|
"backgroundcolor": "#E5ECF6",
|
|
"gridcolor": "white",
|
|
"gridwidth": 2,
|
|
"linecolor": "white",
|
|
"showbackground": true,
|
|
"ticks": "",
|
|
"zerolinecolor": "white"
|
|
},
|
|
"zaxis": {
|
|
"backgroundcolor": "#E5ECF6",
|
|
"gridcolor": "white",
|
|
"gridwidth": 2,
|
|
"linecolor": "white",
|
|
"showbackground": true,
|
|
"ticks": "",
|
|
"zerolinecolor": "white"
|
|
}
|
|
},
|
|
"shapedefaults": {
|
|
"line": {
|
|
"color": "#2a3f5f"
|
|
}
|
|
},
|
|
"ternary": {
|
|
"aaxis": {
|
|
"gridcolor": "white",
|
|
"linecolor": "white",
|
|
"ticks": ""
|
|
},
|
|
"baxis": {
|
|
"gridcolor": "white",
|
|
"linecolor": "white",
|
|
"ticks": ""
|
|
},
|
|
"bgcolor": "#E5ECF6",
|
|
"caxis": {
|
|
"gridcolor": "white",
|
|
"linecolor": "white",
|
|
"ticks": ""
|
|
}
|
|
},
|
|
"title": {
|
|
"x": 0.05
|
|
},
|
|
"xaxis": {
|
|
"automargin": true,
|
|
"gridcolor": "white",
|
|
"linecolor": "white",
|
|
"ticks": "",
|
|
"title": {
|
|
"standoff": 15
|
|
},
|
|
"zerolinecolor": "white",
|
|
"zerolinewidth": 2
|
|
},
|
|
"yaxis": {
|
|
"automargin": true,
|
|
"gridcolor": "white",
|
|
"linecolor": "white",
|
|
"ticks": "",
|
|
"title": {
|
|
"standoff": 15
|
|
},
|
|
"zerolinecolor": "white",
|
|
"zerolinewidth": 2
|
|
}
|
|
}
|
|
},
|
|
"title": {
|
|
"text": "Top2Vec Topic Visualization (2D)"
|
|
},
|
|
"xaxis": {
|
|
"anchor": "y",
|
|
"domain": [
|
|
0,
|
|
1
|
|
],
|
|
"title": {
|
|
"text": "x"
|
|
}
|
|
},
|
|
"yaxis": {
|
|
"anchor": "x",
|
|
"domain": [
|
|
0,
|
|
1
|
|
],
|
|
"title": {
|
|
"text": "y"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"import plotly.express as px\n",
|
|
"import pandas as pd\n",
|
|
"from umap import UMAP\n",
|
|
"\n",
|
|
"# Get topic metadata\n",
|
|
"topic_vectors = model.topic_vectors\n",
|
|
"topic_words = model.get_topics()[0]\n",
|
|
"topic_nums, topic_sizes = model.get_topic_sizes()\n",
|
|
"\n",
|
|
"# Reduce vectors to 2D using UMAP\n",
|
|
"umap_model = UMAP(n_neighbors=15, n_components=2, metric=\"cosine\", random_state=42)\n",
|
|
"topic_coords = umap_model.fit_transform(topic_vectors)\n",
|
|
"\n",
|
|
"# Ensure all components are 1D lists\n",
|
|
"topic_nums = list(topic_nums)\n",
|
|
"topic_sizes = list(topic_sizes)\n",
|
|
"topic_labels = [\" | \".join(words[:5]) for words in topic_words]\n",
|
|
"\n",
|
|
"# Build DataFrame\n",
|
|
"df = pd.DataFrame(\n",
|
|
" {\n",
|
|
" \"x\": topic_coords[:, 0],\n",
|
|
" \"y\": topic_coords[:, 1],\n",
|
|
" \"Topic Number\": topic_nums,\n",
|
|
" \"Size\": topic_sizes,\n",
|
|
" \"Top Words\": topic_labels,\n",
|
|
" }\n",
|
|
")\n",
|
|
"\n",
|
|
"# Plot using Plotly\n",
|
|
"fig = px.scatter(\n",
|
|
" df,\n",
|
|
" x=\"x\",\n",
|
|
" y=\"y\",\n",
|
|
" size=\"Size\",\n",
|
|
" text=\"Topic Number\",\n",
|
|
" hover_data={\"Top Words\": True, \"Size\": True, \"x\": False, \"y\": False},\n",
|
|
" title=\"Top2Vec Topic Visualization (2D)\",\n",
|
|
")\n",
|
|
"fig.update_traces(textposition=\"top center\")\n",
|
|
"fig.show()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|