masterthesis-playground/bertopic/nb_bertopic_lowprep.py

# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv (3.12.3)
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
import pickle
import re

import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text as skltext
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP

from bertopic import BERTopic

# Fetch the "stopwords", "punkt" and "wordnet" NLTK resources every time the
# notebook is run.
# NOTE(review): `nltk` is not referenced anywhere else in the visible part of
# this file (stop words come from scikit-learn) - confirm these downloads are
# still required.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# %% [markdown]
# ### Hyperparameters and Settings
#

# %%
# Workflow switches
# RECREATE_MODEL: fit a new BERTopic model (the alternative branch only prints
#   a message, loading is commented out).
# RECREATE_REDUCED_MODEL: run the similarity-based topic merging loop.
# PROCESS_DATA: True re-runs `preprocess` and rewrites the pickle cache,
#   False loads the cached texts from the pickle instead.
# REDUCE_OUTLIERS: run `reduce_outliers` / `update_topics` on the model.
# CALCULATE_TOKEN_DISTRIBUTIONS: run `approximate_distribution` and its plot.
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = True
PROCESS_DATA = False
REDUCE_OUTLIERS = False
CALCULATE_TOKEN_DISTRIBUTIONS = False

# Data Sample Size, -1 for all data
DATA_SAMPLE_SIZE = -1

# Vectorization (CountVectorizer `min_df` and upper bound of `ngram_range`)
MIN_DOCUMENT_FREQUENCY = 1
MAX_NGRAM = 3

# HDBSCAN Parameters (`min_cluster_size` and `min_samples`)
MIN_TOPIC_SIZE = 200
MIN_SAMPLES = 25

# UMAP Parameters (`n_neighbors`, `n_components`, `min_dist`)
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.01

# Topic Modeling (forwarded to BERTopic as `top_n_words` and `nr_topics`)
TOP_N_WORDS = 10
MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip

# Extra stop words that are added to scikit-learn's English stop word list
# before it is handed to the CountVectorizer.
TF_IDF_STOP_WORDS = ["bali", "place", "visit", "visited", "visiting"]

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
# Import data after general preprocessing: the table is read once and only
# down-sampled when a sample size other than -1 is configured.
reviews = pd.read_csv("../data/intermediate/preprocessed.tab", sep="\t")
if DATA_SAMPLE_SIZE != -1:
    reviews = reviews.sample(n=DATA_SAMPLE_SIZE)
reviews = reviews.review.to_list()

print("Loaded {} reviews".format(len(reviews)))

# %%
# Literal escape artefacts found in the exported review text, mapped to their
# replacement. The keys are plain substrings, NOT regular expressions: a
# backslash followed by "n" (optionally double-escaped) becomes a space and
# (escaped) double quotes are dropped. Longer keys are listed first because
# the alternation built below tries its alternatives in order.
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
}
pattern = re.compile("|".join(re.escape(literal) for literal in rep))

# Real whitespace (spaces, tabs, line breaks, ...) has to be handled by a
# genuine regular expression. Escaping "\s+" together with the literals above
# would only ever match the three characters backslash, "s" and "+".
whitespace_pattern = re.compile(r"\s+")


def preprocess(text):
    """Return a lightly normalised copy of a review text.

    The text is lower-cased, the literal escape artefacts listed in ``rep``
    are replaced, every run of whitespace (including real line breaks) is
    collapsed into a single space and surrounding whitespace is removed.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text as a new string.
    """
    text = text.lower()
    text = pattern.sub(lambda match: rep[match.group(0)], text)
    # Collapse last so that spaces introduced by the replacements above are
    # merged with their neighbours as well.
    return whitespace_pattern.sub(" ", text).strip()


# %%
# Quick visual check: run `preprocess` on one sample review that contains
# line breaks and emoji and print the result.
print(
    preprocess(
        "Excellent. Definitely worth coming while in bali. Food and people were very nice.\n🌟 🤩 ⭐️ \nTrisna was our host"
    )
)

# %%
# Either reuse the cached, already cleaned reviews or rebuild the cache from
# the reviews loaded above.
if not PROCESS_DATA:
    with open("../data/intermediate/processed_texts_lowprep.pkl", "rb") as cache_file:
        reviews = pickle.load(cache_file)
else:
    print("Processing reviews...")
    reviews = list(map(preprocess, reviews))

    with open("../data/intermediate/processed_texts_lowprep.pkl", "wb") as cache_file:
        pickle.dump(reviews, cache_file)

# Show the first entry as a sanity check
print(reviews[:1])

# %% [markdown]
# ### Pre-calculate Embeddings
#

# %%
# Encode all reviews once up front. The resulting vectors are reused below for
# the UMAP projection and are handed to BERTopic's `fit_transform`.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews, show_progress_bar=True)

# %% [markdown]
# ## Model Creation
#

# %% [markdown]
# ### Dimensionality Reduction (UMAP)
#

# %%
# Configure UMAP from the settings above with a fixed random seed, then
# project the review embeddings. `reduced_embeddings` is used by the document
# visualisations further down; the same `umap_model` instance is also passed
# to BERTopic.
# NOTE(review): BERTopic presumably fits the passed UMAP model again during
# `fit_transform` - confirm whether the extra fit here is intended.
umap_model = UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=N_COMPONENTS,
    min_dist=MIN_DIST,
    metric="cosine",
    low_memory=True,
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

# %% [markdown]
# ### BERTopic Model Creation
#

# %%
# Assemble the BERTopic pipeline from its individual components and fit it on
# the reviews when RECREATE_MODEL is set.
if RECREATE_MODEL:
    # scikit-learn's English stop words plus the corpus-specific extras
    stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS))

    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(
        min_df=MIN_DOCUMENT_FREQUENCY,
        ngram_range=(1, MAX_NGRAM),
        stop_words=stop_words,
    )

    representation_model = KeyBERTInspired()
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,
    )

    # Hand over the precomputed embeddings together with the documents
    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)

    # Register the generated labels as custom labels; later cells read them
    # through the "CustomName" column and `custom_labels=True`.
    topic_labels = topic_model.generate_topic_labels(
        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
    )
    topic_model.set_topic_labels(topic_labels)
    # BERTopic.save(topic_model, "bertopic/model.bertopic")
else:
    # NOTE(review): loading is commented out, so on this path `topic_model`,
    # `topics` and `probs` are never assigned by this cell although later
    # cells use them.
    print("Nevermind, loading existing model")
    # topic_model = BERTopic.load("bertopic/model.bertopic")

# %% [markdown]
# ## Fine Tuning
#
# ### Topic Condensation
#

# %%
# Greedily merge topics whose topic embeddings have a cosine similarity above
# 0.9 and re-scan until a complete pass produces no successful merge.
if RECREATE_REDUCED_MODEL:
    done = False
    iteration = 1
    while not done:
        print(f"Iteration {iteration}")
        iteration += 1
        # The first embedding row is skipped so that matrix index k lines up
        # with topic id k.
        # NOTE(review): this presumes row 0 belongs to the outlier topic -1 -
        # confirm for models that have no outlier topic.
        similarity_matrix = cosine_similarity(
            np.array(topic_model.topic_embeddings_)[1:, :]
        )
        # Only cleared after a merge actually went through. Clearing it before
        # the merge is attempted would make a persistently failing merge
        # restart the scan forever.
        nothing_to_merge = True

        for i in range(similarity_matrix.shape[0]):
            for j in range(i + 1, similarity_matrix.shape[1]):
                # The matrix is rebuilt (and shrinks) after every merge while
                # the loop bounds still describe the matrix from the start of
                # the pass, so late indices may no longer exist.
                try:
                    sim = similarity_matrix[i, j]
                except IndexError:
                    continue
                if not sim > 0.9:
                    continue

                t1, t2 = i, j
                try:
                    t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
                    t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
                    print(
                        f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
                    )
                    topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
                    nothing_to_merge = False

                    # Labels and similarities refer to the old topic ids, so
                    # both are regenerated before the scan continues.
                    topic_labels = topic_model.generate_topic_labels(
                        nr_words=3,
                        topic_prefix=True,
                        word_length=15,
                        separator=" - ",
                    )
                    topic_model.set_topic_labels(topic_labels)
                    similarity_matrix = cosine_similarity(
                        np.array(topic_model.topic_embeddings_)[1:, :]
                    )
                except Exception as e:
                    # Best effort: report the failure and keep scanning.
                    print(f"Failed to merge {t1} and {t2}: {e}")
        if nothing_to_merge:
            print("No more topics to merge.")
            done = True
else:
    print("Skipping topic reduction")

# %% [markdown]
# ### Outlier Reduction
#

# %%
# Optionally compute a new topic assignment with `reduce_outliers` (strategy
# "probabilities", threshold 0.05) from the model's current topics and
# probabilities, then update the model with that assignment.
if REDUCE_OUTLIERS:
    new_topics = topic_model.reduce_outliers(
        reviews,
        topic_model.topics_,
        probabilities=topic_model.probabilities_,
        threshold=0.05,
        strategy="probabilities",
    )
    topic_model.update_topics(reviews, topics=new_topics)

# %% [markdown]
# ## Results
#

# %%
# Read the document/topic probabilities from the model itself rather than from
# the value returned by the initial fit: topic merging changes the model's
# topics afterwards, which would leave the original return value outdated.
doc_topic_matrix = topic_model.probabilities_

# column names - one per matrix column, so the labels always match the matrix
# regardless of whether an outlier topic exists
topicnames = ["Topic " + str(i) for i in range(doc_topic_matrix.shape[1])]

# index names
docnames = ["Review " + str(i) for i in range(len(reviews))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(
    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
)

# Get dominant topic for each document (column with the highest probability)
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
df_document_topic["dominant_topic"] = dominant_topic


# Styling
def color_stuff(val):
    """Return the CSS colour declaration for a table cell value.

    Values above 0.1 are shown in green, values above 0.05 in orange and
    everything else in grey.
    """
    for lower_bound, color_name in ((0.1, "green"), (0.05, "orange")):
        if val > lower_bound:
            return "color: {col}".format(col=color_name)
    return "color: {col}".format(col="grey")


def make_bold(val):
    """Return the CSS font-weight declaration: bold (700) above 0.1, else 400."""
    if val > 0.1:
        font_weight = 700
    else:
        font_weight = 400
    return "font-weight: {weight}".format(weight=font_weight)


# Apply Style to the first 15 documents. `Styler.map` is the element-wise
# successor of `Styler.applymap`, which pandas deprecated with the rename
# (the kernel declared in the notebook header runs Python 3.12, for which only
# pandas releases that already provide `Styler.map` are published).
df_document_topics = df_document_topic.head(15).style.map(color_stuff).map(make_bold)
df_document_topics

# %% [markdown]
# ### Document Visualization
#

# %%
# Plot the documents using the precomputed reduced embeddings, labelled with
# the custom topic labels. The bare `vis` makes the figure the cell output.
vis = topic_model.visualize_documents(
    docs=reviews,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)
vis

# %%
# Second document view (datamap) built from the same reduced embeddings
topic_model.visualize_document_datamap(reviews, reduced_embeddings=reduced_embeddings)

# %% [markdown]
# ### Similarity Matrix
#

# %%
# Heatmap produced by the model's `visualize_heatmap`
topic_model.visualize_heatmap()

# %% [markdown]
# ### Topic Info
#

# %%
# Keep the topic overview table; the next cell filters and plots it
topic_info = topic_model.get_topic_info()
topic_info

# %%
import matplotlib.pyplot as plt

# Drop the outlier topic (-1). An explicit copy is taken so that adding the
# "ShortName" column below writes into an independent frame instead of into a
# filtered slice of the original table.
topic_info = topic_info[topic_info["Topic"] != -1].copy()

# Keep only the first three dash-separated parts of every label (labels
# without a dash stay unchanged because splitting yields a single part)
topic_info["ShortName"] = topic_info["CustomName"].apply(
    lambda label: "-".join(label.split("-")[:3])
)

# Sort ascending by count: a horizontal bar chart draws the first row at the
# bottom, so the largest topic ends up at the top of the plot
topic_info = topic_info.sort_values("Count", ascending=True)

plt.figure(figsize=(10, 6))

bars = plt.barh(topic_info["ShortName"], topic_info["Count"])

# Add count labels to each bar, vertically centred at the end of the bar
for count, bar in zip(topic_info["Count"], bars):
    plt.text(
        count,
        bar.get_y() + bar.get_height() / 2,
        f" {count}",
        va="center",
        ha="left",
        fontsize=10,
        color="black",
    )

plt.xscale("log")
plt.ylabel("Topic")
plt.xlabel("Anzahl der Dokumente")
plt.title("")
plt.tight_layout()

plt.show()

# %% [markdown]
# ### Semantic Coherence
#

# %%
# Top words of every topic, leaving out the outlier topic (negative id)
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_model.get_topic_info()["Topic"]
    if not topic_id < 0
]

# One score per topic: the mean cosine similarity over all distinct word pairs
coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)

    # Zero the self-similarities, then average the strict upper triangle
    np.fill_diagonal(sim_matrix, 0)
    upper_triangle = np.triu_indices(sim_matrix.shape[0], k=1)
    coherence_scores.append(np.mean(sim_matrix[upper_triangle]))

overall_coherence = np.mean(coherence_scores)

print(len(reviews), "reviews processed")
print(len(topic_model.get_topic_info()) - 1, "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

# %% [markdown]
# ### Topic Coherence
#

# %%
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389

# This will most likely crash your PC
this_will_crash_your_pc_are_you_sure = False
if this_will_crash_your_pc_are_you_sure:
    # Preprocess Documents: concatenate all reviews per topic. The topic
    # assignment is read from the model so that it reflects any topic merging
    # done after the initial fit.
    documents = pd.DataFrame(
        {
            "Document": reviews,
            "ID": range(len(reviews)),
            "Topic": topic_model.topics_,
        }
    )
    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Build the topic word lists from scratch (outlier topic excluded).
    # Appending to a list left over from an earlier cell would score every
    # topic twice and make this cell depend on that cell having been run.
    topic_words = [
        [word for word, _ in topic_model.get_topic(topic_id)]
        for topic_id in topic_model.get_topic_info()["Topic"]
        if topic_id >= 0
    ]

    # %env TOKENIZERS_PARALLELISM=false

    # Report one aggregated score per coherence measure
    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=tokens,
            corpus=corpus,
            dictionary=dictionary,
            coherence=measurement,
        )
        coherence_score = coherence_model.get_coherence()
        print(f"Coherence ({measurement}): {coherence_score:.4f}")

# %% [markdown]
# ### Term Search
#

# %%
# Look up the ten topics that best match a search term and print each
# similarity (cut to five characters) next to the topic's custom label.
search_term = "lempuyang"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for matched_topic, score in zip(similar_topics, similarities):
    custom_name = topic_model.get_topic_info(matched_topic)["CustomName"][0]
    print(f"{str(score)[:5]} {custom_name}")

# %%
# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
# Calculate the topic distributions on a token-level (only when enabled in
# the settings)

if CALCULATE_TOKEN_DISTRIBUTIONS:
    topic_distr, topic_token_distr = topic_model.approximate_distribution(
        reviews, calculate_tokens=True, use_embedding_model=True
    )

# %%
# Visualize the token-level distributions of a single document
if CALCULATE_TOKEN_DISTRIBUTIONS:
    DOC_INDEX = 1
    df = topic_model.visualize_approximate_distribution(
        reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
    )
    # NOTE(review): a bare expression nested inside an `if` is presumably not
    # rendered as cell output by Jupyter - confirm the table is shown.
    df

# %% [markdown]
# ### Topic Hierarchy
#

# %%
# Hierarchy plot using the custom topic labels
topic_model.visualize_hierarchy(custom_labels=True, color_threshold=0.98)

# %%
# Compute the topic hierarchy from the reviews and print it as a text tree
hierarchical_topics = topic_model.hierarchical_topics(reviews)
tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
print(tree)

# %% [markdown]
# ### Classification
#

# %%
# Export the reviews that were assigned to a hand-picked set of topics.
CLASSIFICATION = True
if CLASSIFICATION:
    # Topic ids to export.
    # NOTE(review): ids are hard-coded and presumably belong to one specific
    # model run - verify them after re-fitting or merging topics.
    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30}
    # NOTE(review): INPUT_PATH is not read anywhere in the visible part of
    # this file.
    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"

    # Topic model document info, with the review text attached as "Original"
    df = topic_model.get_document_info(reviews)
    df["Original"] = reviews

    # --- filter by topics (no length filter is applied here) ---
    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
    filtered["Original"] = filtered["Original"].str.strip()

    # Save an audit CSV containing only the text and its topic id
    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
    print(f"Filtered CSV file saved to {OUTPUT_CSV}")

# %% [markdown]
# ### Intertopic Distance Map
#

# %%
# Intertopic distance map, requested with c-TF-IDF and the custom labels
topic_model.visualize_topics(use_ctfidf=True, custom_labels=True)

# %% [markdown]
# ### Topic Word Scores
#

# %%
# Bar charts with 10 words each for 12 topics, using the custom labels
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)

# %%
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def create_wordcloud(model, topic):
    """Draw a word cloud for one topic of a fitted topic model.

    The (word, value) pairs returned by ``model.get_topic(topic)`` are used as
    the frequencies of the cloud, which is shown without axes.

    Args:
        model: Object providing ``get_topic(topic)``.
        topic: Id of the topic to draw.
    """
    word_scores = dict(model.get_topic(topic))
    cloud = WordCloud(background_color="white", max_words=1000)
    cloud.generate_from_frequencies(word_scores)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


# Show the word cloud for topic 1
create_wordcloud(topic_model, topic=1)