# Topic Detection: Bali Tourist Reviews


## Preparation

### Dependency Loading


In [2]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
import gensim.corpora as corpora
import json
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import pickle

nlp = spacy.load("en_core_web_sm")

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to /home/marvin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marvin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/marvin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Parameters and Tracking


In [2]:
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = False
PROCESS_DATA = False
REDUCE_OUTLIERS = False
USE_CONDENSED_MODEL = False

# Vectorization
MIN_DOCUMENT_FREQUENCY = 5
MAX_NGRAM = 2

# HDBSCAN Parameters
MIN_TOPIC_SIZE = 55
MIN_SAMPLES = 25

# UMAP Parameters
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.1

# Topic Modeling
TOP_N_WORDS = 10
MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip

tracking = {
    "input": {
        "min_document_frequency": MIN_DOCUMENT_FREQUENCY,
        "max_ngram": MAX_NGRAM,
        "min_topic_size": MIN_TOPIC_SIZE,
        "min_samples": MIN_SAMPLES,
        "n_neighbors": N_NEIGHBORS,
        "n_components": N_COMPONENTS,
        "min_dist": MIN_DIST,
        "top_n_words": TOP_N_WORDS,
        "max_topics": MAX_TOPICS,
    },
}

### Data Loading & Preprocessing


In [3]:
reviews = pd.read_csv(
    "data.tab", sep="\t"
).review.to_list()  # .sample(5_000, random_state=42)

print("Loaded {} reviews".format(len(reviews)))

Loaded 56446 reviews


In [5]:
# Remove hard coded stop symbols
# for stop_symbol in STOP_SYMBOLS:
#     reviews = [
#         review.replace(stop_symbol, "") if isinstance(review, str) else review
#         for review in reviews
#     ]

# stop_words = set(stopwords.words("english"))
# lemmatizer = WordNetLemmatizer()

# extra_stopwords = STOP_WORDS.copy()
# with open("stopwords-en.json", "r") as f:
#     extra_stopwords.extend(json.load(f))


# def preprocess(text):
#     text = re.sub(r"\W+", " ", text.lower())
#     tokens = nltk.word_tokenize(text)
#     tokens = [
#         lemmatizer.lemmatize(w)
#         for w in tokens
#         if w not in stop_words and w not in extra_stopwords and len(w) > 2
#     ]
#     return " ".join(tokens)


# reviews = [preprocess(review) for review in reviews]
# reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]

# print(len(reviews), "reviews with >= {} words".format(MIN_REVIEW_WORDS))

In [None]:
# List of NE in Bali for NER enhancement
with open("bali_ner.json", "r") as f:
    bali_places = json.load(f)
bali_places_set = set(bali_places)

# Stop word definition
extra_stopwords = ["bali", "idr", "usd"]
stop_words = set(stopwords.words("english"))
with open("stopwords-en.json", "r") as f:
    extra_stopwords.extend(json.load(f))

# Custom replacements
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))

lemmatizer = WordNetLemmatizer()


def preprocess(text):
    # Step 1: Apply custom replacements (typos, special cases)
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    # Step 2: Clean text
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    doc = nlp(text)

    # Step 3: POS tagging and filtering
    filtered_tokens = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"}
        or token.ent_type_ in {"GPE", "LOC", "FAC"}
        or token.text in bali_places_set
    ]

    # Step 4: Lemmatization and stopword removal
    lemmatized_tokens = [
        lemmatizer.lemmatize(w)
        for w in filtered_tokens
        if w not in stop_words and w not in extra_stopwords and len(w) > 2
    ]

    return lemmatized_tokens

In [7]:
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]

    with open("processed_texts.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("processed_texts.pkl", "rb") as f:
        reviews = pickle.load(f)
        reviews = [
            " ".join(review) if isinstance(review, list) else review
            for review in reviews
        ]

print(reviews[:1])

['experience gita host knowledge enthusiasm driver road experience orangutan bucket list item ticket penny experience']


### Pre-calculate Embeddings


In [8]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews, show_progress_bar=True)

Batches: 100%|██████████| 1764/1764 [00:08<00:00, 203.84it/s]


## Model Creation


### Dimensionality Reduction (UMAP)


In [9]:
umap_model = UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=N_COMPONENTS,
    min_dist=MIN_DIST,
    metric="cosine",
    low_memory=True,
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

### BERTopic Model Creation


In [10]:
if RECREATE_MODEL:
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(
        min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM)
    )

    representation_model = KeyBERTInspired()
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,
    )

    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)

    topic_labels = topic_model.generate_topic_labels(
        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
    )
    topic_model.set_topic_labels(topic_labels)
    # BERTopic.save(topic_model, "bertopic/model.bertopic")
else:
    print("Nevermind, loading existing model")
    # topic_model = BERTopic.load("bertopic/model.bertopic")

2025-06-18 16:04:49,107 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-18 16:05:17,287 - BERTopic - Dimensionality - Completed ✓
2025-06-18 16:05:17,288 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-18 16:05:22,595 - BERTopic - Cluster - Completed ✓
2025-06-18 16:05:22,601 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-18 16:05:23,660 - BERTopic - Representation - Completed ✓


## Fine Tuning

### Topic Condensation


In [11]:
if RECREATE_REDUCED_MODEL:
    done = False
    iteration = 1
    while not done:
        print(f"Iteration {iteration}")
        iteration += 1
        similarity_matrix = cosine_similarity(
            np.array(topic_model.topic_embeddings_)[1:, :]
        )
        nothing_to_merge = True

        for i in range(similarity_matrix.shape[0]):
            for j in range(i + 1, similarity_matrix.shape[1]):
                sim = similarity_matrix[i, j]
                if sim > 0.9:
                    nothing_to_merge = False
                    t1, t2 = i, j
                    try:
                        t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
                        t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
                        print(
                            f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
                        )
                        topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])

                        topic_labels = topic_model.generate_topic_labels(
                            nr_words=3,
                            topic_prefix=True,
                            word_length=15,
                            separator=" - ",
                        )
                        topic_model.set_topic_labels(topic_labels)
                    except Exception as e:
                        print(f"Failed to merge {t1} and {t2}: {e}")
        if nothing_to_merge:
            print("No more topics to merge.")
            done = True

    # BERTopic.save(topic_model, "bertopic/model_reduced.bertopic")
elif USE_CONDENSED_MODEL:
    print("Nevermind, loading existing reduced model")
    topic_model = BERTopic.load("bertopic/model_reduced.bertopic")
else:
    print("Skipping topic reduction")

Skipping topic reduction


### Outlier Reduction


In [12]:
if REDUCE_OUTLIERS:
    new_topics = topic_model.reduce_outliers(
        reviews,
        topic_model.topics_,
        probabilities=topic_model.probabilities_,
        threshold=0.05,
        strategy="probabilities",
    )
    topic_model.update_topics(reviews, topics=new_topics)

## Results

### Document Visualization


In [13]:
vis = topic_model.visualize_documents(
    docs=reviews,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)
vis.write_html("bertopic/visualization.html")
vis

### Similarity Matrix


In [14]:
topic_model.visualize_heatmap()

### Topic Info


In [15]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,2223,-1_day temple_temple tourist_sunset temple_tem...,-1 - day temple - temple tourist - sunset temple,"[day temple, temple tourist, sunset temple, te...",[attraction view foot temple spring water expe...
1,0,17778,0_beach restaurant_beach club_beach beach_wave...,0 - beach restauran - beach club - beach beach,"[beach restaurant, beach club, beach beach, wa...",[sea sport sanur beach sand beach resort hotel...
2,1,17658,1_monkey tourist_park monkey_monkey visit_visi...,1 - monkey tourist - park monkey - monkey visit,"[monkey tourist, park monkey, monkey visit, vi...",[visit monkey forest ubud midday day monkey ph...
3,2,4613,2_temple uluwatu_dance temple_uluwatu temple_s...,2 - temple uluwatu - dance temple - uluwatu te...,"[temple uluwatu, dance temple, uluwatu temple,...",[hour kuta uluwatu holiday hour sunset driver ...
4,3,3034,3_zoo_zoo animal_kid zoo_elephant,3 - zoo - zoo animal - kid zoo,"[zoo, zoo animal, kid zoo, elephant, time anim...",[zoo elephant trek lion cub elephant ride phuk...
5,4,1968,4_hike_climb_hiking_trekking,4 - hike - climb - hiking,"[hike, climb, hiking, trekking, climbing, trai...",[ubud town hotel tour person bit price lot gui...
6,5,1627,5_temple shop_temple temple_temple market_tour...,5 - temple shop - temple temple - temple market,"[temple shop, temple temple, temple market, to...",[temple souvenir shop restaurant chain store s...
7,6,1528,6_tourist sunset_tourist picture_lot tourist_s...,6 - tourist sunset - tourist picture - lot tou...,"[tourist sunset, tourist picture, lot tourist,...",[opportunity photo season weather spot sunset ...
8,7,1228,7_water palace_water garden_water park_restaur...,7 - water palace - water garden - water park,"[water palace, water garden, water park, resta...",[water palace trip candidasa drive rice paddy ...
9,8,1086,8_temple tanah_lot temple_tanah lot_temple coast,8 - temple tanah - lot temple - tanah lot,"[temple tanah, lot temple, tanah lot, temple c...",[tanah lot temple rock shore tide base rock pr...


### Semantic Coherence


In [16]:
topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1):
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)

# Compute mean pairwise cosine similarity for each topic
coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
    mean_sim = np.mean(sim_matrix)
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(len(reviews), "reviews processed")
print(len(topic_model.get_topic_info()) - 1, "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence}")

tracking["output"] = {
    "num_reviews": len(reviews),
    "num_topics": len(topic_model.get_topic_info()) - 1,
    "outliers": int(topic_model.get_topic_info(-1)["Count"][0]),
    "coherence_score": str(overall_coherence)[:6],
}

with open("bertopic/tracking.json", "r") as f:
    tracking_data = json.load(f)

with open("bertopic/tracking.json", "w") as f:
    tracking_data.append(tracking)
    json.dump(tracking_data, f, indent=2)

56446 reviews processed
21 topics found
BERT-based Topic Coherence: 0.5482090711593628


### Topic Coherence


In [17]:
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389

# Preprocess Documents
documents = pd.DataFrame(
    {"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
)
documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
    {"Document": " ".join}
)
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [
    [words for words, _ in topic_model.get_topic(topic)]
    for topic in range(len(set(topics)) - 1)
]

%env TOKENIZERS_PARALLELISM=false

for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=measurement,
    )
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence ({measurement}): {coherence_score:.4f}")

env: TOKENIZERS_PARALLELISM=false
Coherence (c_v): 0.7351
Coherence (u_mass): -0.3053
Coherence (c_uci): -2.0835
Coherence (c_npmi): 0.0498


### Term Search


In [18]:
search_term = "uluwatu"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for i in range(len(similar_topics)):
    # \n{topic_model.get_topic(similar_topics[i])}\n
    print(
        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}"
    )

0.393 14 - bratan temple - ulun temple - lake temple
0.304 2 - temple uluwatu - dance temple - uluwatu temple
0.238 13 - temple gate - lempuyang templ - temple lempuyan
0.213 -1 - day temple - temple tourist - sunset temple
0.197 3 - zoo - zoo animal - kid zoo
0.193 12 - tourist beach - padang beach - beach tourist
0.191 20 - traffic temple - ticket booth - ticket tourist
0.188 7 - water palace - water garden - water park
0.185 17 - temple monkey - monkey temple - indiana jones
0.178 19 - monkey monkey - monkey - human monkey


### Topic Hierarchy


In [19]:
topic_model.visualize_hierarchy(custom_labels=True)

### Intertopic Distance Map


In [20]:
topic_model.visualize_topics()

### Topic Word Scores


In [21]:
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)

In [22]:
# from matplotlib import pyplot as plt
# from sklearn.manifold import TSNE


# topics = topic_model.topics_

# # Reduce dimensionality with TSNE
# tsne = TSNE(n_components=2, random_state=42)
# embeddings_2d = tsne.fit_transform(embeddings)

# # Prepare colors (assign a color to each topic)
# unique_topics = set(topics)
# colors = plt.get_cmap("tab20", len(unique_topics))

# # Plot
# plt.figure(figsize=(12, 8))
# for topic in unique_topics:
#     # Select indices for the current topic
#     indices = [i for i, t in enumerate(topics) if t == topic]

#     # Get 2D points for these indices
#     x = embeddings_2d[indices, 0]
#     y = embeddings_2d[indices, 1]

#     # Assign label (exclude outliers)
#     label = f"Topic {topic}" if topic != -1 else "Outliers"

#     # Plot with color
#     plt.scatter(x, y, color=colors(topic + 1), label=label, alpha=0.5)

# plt.title("Topic Clusters in 2D Embedding Space")
# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.tight_layout()

# # Save the plot
# plt.savefig("topic_clusters.png", dpi=300, bbox_inches="tight")
# plt.show()