# Topic Detection: Bali Tourist Reviews


## Preparation

### Dependency Loading


In [2]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
import gensim.corpora as corpora
import json
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import pickle

nlp = spacy.load("en_core_web_sm")

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

  from .autonotebook import tqdm as notebook_tqdm
2025-06-21 16:17:57.575812: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750515477.586643  647691 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750515477.590106  647691 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750515477.599119  647691 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750515477.599128  647691 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750515477.599129  647691

True

### Parameters and Tracking


In [3]:
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = False
PROCESS_DATA = False
REDUCE_OUTLIERS = False
USE_CONDENSED_MODEL = False

# Vectorization
MIN_DOCUMENT_FREQUENCY = 1
MAX_NGRAM = 2

# HDBSCAN Parameters
MIN_TOPIC_SIZE = 200
MIN_SAMPLES = 25

# UMAP Parameters
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.01

# Topic Modeling
TOP_N_WORDS = 10
MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip

### Data Loading & Preprocessing


In [4]:
reviews = (
    pd.read_csv("data.tab", sep="\t").review.dropna().to_list()
)  # .sample(5_000, random_state=42)

print("Loaded {} reviews".format(len(reviews)))

Loaded 56446 reviews


In [5]:
# List of NE in Bali for NER enhancement
with open("bali_ner.json", "r") as f:
    bali_places = json.load(f)
bali_places_set = set(bali_places)

# Stop word definition
extra_stopwords = ["bali", "idr", "usd"]
stop_words = set(stopwords.words("english"))
with open("stopwords-en.json", "r") as f:
    extra_stopwords.extend(json.load(f))

# Custom replacements
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))

lemmatizer = WordNetLemmatizer()


def preprocess(text):
    # Step 1: Apply custom replacements (typos, special cases)
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    # Step 2: Clean text
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    doc = nlp(text)

    # Step 3: POS tagging and filtering
    filtered_tokens = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"}
        or token.ent_type_ in {"GPE", "LOC", "FAC"}
        or token.text in bali_places_set
    ]

    # Step 4: Lemmatization and stopword removal
    lemmatized_tokens = [
        lemmatizer.lemmatize(w)
        for w in filtered_tokens
        if w not in stop_words and w not in extra_stopwords and len(w) > 2
    ]

    return lemmatized_tokens

In [6]:
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]

    with open("processed_texts.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("processed_texts.pkl", "rb") as f:
        reviews = pickle.load(f)
        reviews = [
            " ".join(review) if isinstance(review, list) else review
            for review in reviews
        ]

print(reviews[:1])

['experience gita host knowledge enthusiasm driver road experience orangutan bucket list item ticket penny experience']


### Pre-calculate Embeddings


In [7]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews, show_progress_bar=True)

Batches: 100%|██████████| 1764/1764 [00:08<00:00, 217.66it/s]


## Model Creation


### Dimensionality Reduction (UMAP)


In [8]:
umap_model = UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=N_COMPONENTS,
    min_dist=MIN_DIST,
    metric="cosine",
    low_memory=True,
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

### BERTopic Model Creation


In [9]:
if RECREATE_MODEL:
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(
        min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM)
    )

    representation_model = KeyBERTInspired()
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,
    )

    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)

    topic_labels = topic_model.generate_topic_labels(
        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
    )
    topic_model.set_topic_labels(topic_labels)
    # BERTopic.save(topic_model, "bertopic/model.bertopic")
else:
    print("Nevermind, loading existing model")
    # topic_model = BERTopic.load("bertopic/model.bertopic")

2025-06-21 16:19:06,736 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-21 16:19:32,934 - BERTopic - Dimensionality - Completed ✓
2025-06-21 16:19:32,935 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-21 16:19:38,734 - BERTopic - Cluster - Completed ✓
2025-06-21 16:19:38,741 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-21 16:19:40,681 - BERTopic - Representation - Completed ✓


## Fine Tuning

### Topic Condensation


In [10]:
if RECREATE_REDUCED_MODEL:
    done = False
    iteration = 1
    while not done:
        print(f"Iteration {iteration}")
        iteration += 1
        similarity_matrix = cosine_similarity(
            np.array(topic_model.topic_embeddings_)[1:, :]
        )
        nothing_to_merge = True

        for i in range(similarity_matrix.shape[0]):
            for j in range(i + 1, similarity_matrix.shape[1]):
                sim = similarity_matrix[i, j]
                if sim > 0.9:
                    nothing_to_merge = False
                    t1, t2 = i, j
                    try:
                        t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
                        t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
                        print(
                            f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
                        )
                        topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])

                        topic_labels = topic_model.generate_topic_labels(
                            nr_words=3,
                            topic_prefix=True,
                            word_length=15,
                            separator=" - ",
                        )
                        topic_model.set_topic_labels(topic_labels)
                    except Exception as e:
                        print(f"Failed to merge {t1} and {t2}: {e}")
        if nothing_to_merge:
            print("No more topics to merge.")
            done = True

    # BERTopic.save(topic_model, "bertopic/model_reduced.bertopic")
elif USE_CONDENSED_MODEL:
    print("Nevermind, loading existing reduced model")
    topic_model = BERTopic.load("bertopic/model_reduced.bertopic")
else:
    print("Skipping topic reduction")

Skipping topic reduction


### Outlier Reduction


In [11]:
if REDUCE_OUTLIERS:
    new_topics = topic_model.reduce_outliers(
        reviews,
        topic_model.topics_,
        probabilities=topic_model.probabilities_,
        threshold=0.05,
        strategy="probabilities",
    )
    topic_model.update_topics(reviews, topics=new_topics)

## Results

### Document Visualization


In [12]:
vis = topic_model.visualize_documents(
    docs=reviews,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)
vis.write_html("bertopic/visualization.html")
vis

### Similarity Matrix


In [13]:
topic_model.visualize_heatmap()

### Topic Info


In [14]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,5367,-1_temple rock_temple_tourist_restaurant,-1 - temple rock - temple - tourist,"[temple rock, temple, tourist, restaurant, tem...",[sea nusa lembongan island boat trip nusa lemb...
1,0,18773,0_monkey bag_bag monkey_monkey banana_banana m...,0 - monkey bag - bag monkey - monkey banana,"[monkey bag, bag monkey, monkey banana, banana...",[morning monkey forest day ubud fun animal mon...
2,1,12021,1_restaurant beach_hotel beach_beach restauran...,1 - restaurant beac - hotel beach - beach rest...,"[restaurant beach, hotel beach, beach restaura...",[sanur beach beach strip wave sun plenty resta...
3,2,3044,2_zoo breakfast_breakfast zoo_breakfast orangu...,2 - zoo breakfast - breakfast zoo - breakfast ...,"[zoo breakfast, breakfast zoo, breakfast orang...",[time zoo day breakfast orangutan experience b...
4,3,2187,3_monkey temple_temple monkey_temple cliff_cli...,3 - monkey temple - temple monkey - temple cliff,"[monkey temple, temple monkey, temple cliff, c...",[temple cliff view ocean cliff walk view monke...
5,4,2011,4_hike_sunrise hike_trekking_sunrise trek,4 - hike - sunrise hike - trekking,"[hike, sunrise hike, trekking, sunrise trek, h...",[mountain batur sea level hiker crater experie...
6,5,1411,5_temple uluwatu_uluwatu temple_dance uluwatu_...,5 - temple uluwatu - uluwatu temple - dance ul...,"[temple uluwatu, uluwatu temple, dance uluwatu...",[excursion discova trip uluwatu temple site tr...
7,6,1144,6_temple tanah_tanah lot_lot temple_visit tanah,6 - temple tanah - tanah lot - lot temple,"[temple tanah, tanah lot, lot temple, visit ta...","[tanah lot temple view, road tanah lot traffic..."
8,7,1114,7_rubbish beach_beach rubbish_beach garbage_be...,7 - rubbish beach - beach rubbish - beach garbage,"[rubbish beach, beach rubbish, beach garbage, ...",[beach hawker kuta beach battle rubbish washin...
9,8,900,8_beach nusa_nusa beach_dua beach_resort nusa,8 - beach nusa - nusa beach - dua beach,"[beach nusa, nusa beach, dua beach, resort nus...","[beach hotel nusa dua beach, nusa dua beach ho..."


### Semantic Coherence


In [15]:
topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1):
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)

# Compute mean pairwise cosine similarity for each topic
coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
    mean_sim = np.mean(sim_matrix)
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(len(reviews), "reviews processed")
print(len(topic_model.get_topic_info()) - 1, "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

56446 reviews processed
26 topics found
BERT-based Topic Coherence: 0.5763


### Topic Coherence


In [17]:
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389

# Preprocess Documents
documents = pd.DataFrame(
    {"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
)
documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
    {"Document": " ".join}
)
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [
    [words for words, _ in topic_model.get_topic(topic)]
    for topic in range(len(set(topics)) - 1)
]

%env TOKENIZERS_PARALLELISM=false

for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=measurement,
    )
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence ({measurement}): {coherence_score:.4f}")

env: TOKENIZERS_PARALLELISM=false
Coherence (c_v): 0.8206
Coherence (u_mass): -0.3569
Coherence (c_uci): -0.2871
Coherence (c_npmi): 0.1441


### Term Search


In [19]:
search_term = "uluwatu"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for i in range(len(similar_topics)):
    # \n{topic_model.get_topic(similar_topics[i])}\n
    print(
        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}"
    )

0.582 5 - temple uluwatu - uluwatu temple - dance uluwatu
0.404 22 - bratan temple - temple lake - lake temple
0.243 17 - temple gate - gate temple - temple mount
0.240 20 - gangga palace - tirta gangga - gangga garden
0.206 -1 - temple rock - temple - tourist
0.197 2 - zoo breakfast - breakfast zoo - breakfast orang
0.195 8 - beach nusa - nusa beach - dua beach
0.193 18 - pandawa beach - beach pandawa - view beach
0.192 9 - kecak dance - temple kecak - dance temple
0.182 12 - photo tourist - tourist picture - photo people


### Topic Hierarchy


In [20]:
topic_model.visualize_hierarchy(custom_labels=True)

### Intertopic Distance Map


In [21]:
topic_model.visualize_topics()

### Topic Word Scores


In [22]:
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)

In [23]:
# from matplotlib import pyplot as plt
# from sklearn.manifold import TSNE


# topics = topic_model.topics_

# # Reduce dimensionality with TSNE
# tsne = TSNE(n_components=2, random_state=42)
# embeddings_2d = tsne.fit_transform(embeddings)

# # Prepare colors (assign a color to each topic)
# unique_topics = set(topics)
# colors = plt.get_cmap("tab20", len(unique_topics))

# # Plot
# plt.figure(figsize=(12, 8))
# for topic in unique_topics:
#     # Select indices for the current topic
#     indices = [i for i, t in enumerate(topics) if t == topic]

#     # Get 2D points for these indices
#     x = embeddings_2d[indices, 0]
#     y = embeddings_2d[indices, 1]

#     # Assign label (exclude outliers)
#     label = f"Topic {topic}" if topic != -1 else "Outliers"

#     # Plot with color
#     plt.scatter(x, y, color=colors(topic + 1), label=label, alpha=0.5)

# plt.title("Topic Clusters in 2D Embedding Space")
# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.tight_layout()

# # Save the plot
# plt.savefig("topic_clusters.png", dpi=300, bbox_inches="tight")
# plt.show()