masterthesis-playground/bertopic/nb_bertopic_lowprep.py

# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import pickle

nlp = spacy.load("en_core_web_sm")

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# %% [markdown]
# ### Parameters and Tracking
#

# %%
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = True
PROCESS_DATA = False
REDUCE_OUTLIERS = False

# Data Sample Size, -1 for all data
DATA_SAMPLE_SIZE = -1

# Vectorization
MIN_DOCUMENT_FREQUENCY = 1
MAX_NGRAM = 3

# HDBSCAN Parameters
MIN_TOPIC_SIZE = 200
MIN_SAMPLES = 25

# UMAP Parameters
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.01

# Topic Modeling
TOP_N_WORDS = 10
MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip

tracking = {
    "input": {
        "min_document_frequency": MIN_DOCUMENT_FREQUENCY,
        "max_ngram": MAX_NGRAM,
        "min_topic_size": MIN_TOPIC_SIZE,
        "min_samples": MIN_SAMPLES,
        "n_neighbors": N_NEIGHBORS,
        "n_components": N_COMPONENTS,
        "min_dist": MIN_DIST,
        "top_n_words": TOP_N_WORDS,
        "max_topics": MAX_TOPICS,
    },
}

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
if DATA_SAMPLE_SIZE == -1:
    reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
else:
    reviews = (
        pd.read_csv("../data/original/reviews.tab", sep="\t")
        .sample(n=DATA_SAMPLE_SIZE)
        .review.to_list()
    )

print("Loaded {} reviews".format(len(reviews)))

# %%
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
    "bali": "",
    r"\s+": " ",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))


def preprocess(text):
    text = text.strip()
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
    return text


# %%
print(
    preprocess(
        "Excellent. Definitely worth coming while in bali. Food and people were very nice.\n🌟 🤩 ⭐️ \nTrisna was our host"
    )
)

# %%
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]

    with open("../data/intermediate/processed_texts_lowprep.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("../data/intermediate/processed_texts_lowprep.pkl", "rb") as f:
        reviews = pickle.load(f)

print(reviews[:1])

# %% [markdown]
# ### Pre-calculate Embeddings
#

# %%
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews, show_progress_bar=True)

# %% [markdown]
# ## Model Creation
#

# %% [markdown]
# ### Dimensionality Reduction (UMAP)
#

# %%
umap_model = UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=N_COMPONENTS,
    min_dist=MIN_DIST,
    metric="cosine",
    low_memory=True,
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

# %% [markdown]
# ### BERTopic Model Creation
#

# %%
if RECREATE_MODEL:
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(
        min_df=MIN_DOCUMENT_FREQUENCY,
        ngram_range=(1, MAX_NGRAM),
        stop_words=stopwords.words("english"),
    )

    representation_model = KeyBERTInspired()
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,
    )

    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)

    topic_labels = topic_model.generate_topic_labels(
        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
    )
    topic_model.set_topic_labels(topic_labels)
    # BERTopic.save(topic_model, "bertopic/model.bertopic")
else:
    print("Nevermind, loading existing model")
    # topic_model = BERTopic.load("bertopic/model.bertopic")

# %% [markdown]
# ## Fine Tuning
#
# ### Topic Condensation
#

# %%
if RECREATE_REDUCED_MODEL:
    done = False
    iteration = 1
    while not done:
        print(f"Iteration {iteration}")
        iteration += 1
        similarity_matrix = cosine_similarity(
            np.array(topic_model.topic_embeddings_)[1:, :]
        )
        nothing_to_merge = True

        for i in range(similarity_matrix.shape[0]):
            for j in range(i + 1, similarity_matrix.shape[1]):
                try:
                    sim = similarity_matrix[i, j]
                    if sim > 0.9:
                        nothing_to_merge = False
                        t1, t2 = i, j
                        try:
                            t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
                            t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
                            print(
                                f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
                            )
                            topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])

                            topic_labels = topic_model.generate_topic_labels(
                                nr_words=3,
                                topic_prefix=True,
                                word_length=15,
                                separator=" - ",
                            )
                            topic_model.set_topic_labels(topic_labels)
                            similarity_matrix = cosine_similarity(
                                np.array(topic_model.topic_embeddings_)[1:, :]
                            )
                        except Exception as e:
                            print(f"Failed to merge {t1} and {t2}: {e}")
                except IndexError:
                    pass
        if nothing_to_merge:
            print("No more topics to merge.")
            done = True
else:
    print("Skipping topic reduction")

# %% [markdown]
# ### Outlier Reduction
#

# %%
if REDUCE_OUTLIERS:
    new_topics = topic_model.reduce_outliers(
        reviews,
        topic_model.topics_,
        probabilities=topic_model.probabilities_,
        threshold=0.05,
        strategy="probabilities",
    )
    topic_model.update_topics(reviews, topics=new_topics)

# %% [markdown]
# ## Results
#
# ### Classification
#

# %%
from pathlib import Path
import random

# --- config ---
topics_to_keep = {2, 4, 5, 9, 22, 26}
INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
OUTPUT_DIR = Path("../raft/corpus")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

BATCH_SIZE = 60
MIN_CHARS = 40
SEED = 42

# --- load data ---
data = pd.read_csv(INPUT_PATH, sep="\t")

# If you already have `reviews` elsewhere, replace the next line with that variable
reviews = data["review"].astype(str).fillna("")

# Topic model document info
df = topic_model.get_document_info(reviews)  # assumes your model is already fitted
df["Original"] = reviews.values

# --- filter by topics and length ---
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
filtered["Original"] = filtered["Original"].str.strip()
filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]

# Save an audit CSV
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)

# --- deterministic shuffle + write batched corpus files ---
total_files = 0
total_reviews = 0
rng = random.Random(SEED)

for topic_val, g in filtered.groupby("Topic", sort=True):
    reviews_list = g["Original"].tolist()

    # deterministic shuffle within topic
    rng.shuffle(reviews_list)

    # chunk into batches of up to 60
    for start in range(0, len(reviews_list), BATCH_SIZE):
        chunk = reviews_list[start : start + BATCH_SIZE]
        if not chunk:
            continue

        # simple header for traceability
        header = (
            f"[TOPIC] {topic_val}\n" + f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
        )

        lines = [header, ""]
        for i, txt in enumerate(chunk, 1):
            lines.append(f"({i}) {txt}")

        part_idx = start // BATCH_SIZE + 1
        fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
        (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")

        total_files += 1
        total_reviews += len(chunk)

print(
    f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
)
print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")

# %%
doc_topic_matrix = probs

# column names
topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]

# index names
docnames = ["Review " + str(i) for i in range(len(reviews))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(
    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
)

# Get dominant topic for each document
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
df_document_topic["dominant_topic"] = dominant_topic


# Styling
def color_stuff(val):
    if val > 0.1:
        color = "green"
    elif val > 0.05:
        color = "orange"
    else:
        color = "grey"
    return "color: {col}".format(col=color)


def make_bold(val):
    weight = 700 if val > 0.1 else 400
    return "font-weight: {weight}".format(weight=weight)


# Apply Style
df_document_topics = (
    df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
)
df_document_topics

# %% [markdown]
# ### Document Visualization
#

# %%
vis = topic_model.visualize_documents(
    docs=reviews,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)
vis.write_html("output/visualization.html")
vis

# %% [markdown]
# ### Similarity Matrix
#

# %%
topic_model.visualize_heatmap()

# %% [markdown]
# ### Topic Info
#

# %%
topic_model.get_topic_info()

# %% [markdown]
# ### Semantic Coherence
#

# %%
topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1):
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)

# Compute mean pairwise cosine similarity for each topic
coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
    mean_sim = np.mean(sim_matrix)
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(len(reviews), "reviews processed")
print(len(topic_model.get_topic_info()) - 1, "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

# %% [markdown]
# ### Topic Coherence
#

# %%
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389

# This will most likely crash your PC
this_will_crash_your_pc_are_you_sure = False
if this_will_crash_your_pc_are_you_sure:
    # Preprocess Documents
    documents = pd.DataFrame(
        {"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
    )
    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names_out()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [
        [words for words, _ in topic_model.get_topic(topic)]
        for topic in range(len(set(topics)) - 1)
    ]

    # %env TOKENIZERS_PARALLELISM=false

    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=tokens,
            corpus=corpus,
            dictionary=dictionary,
            coherence=measurement,
        )
        coherence_score = coherence_model.get_coherence()
        print(f"Coherence ({measurement}): {coherence_score:.4f}")

# %% [markdown]
# ### Term Search
#

# %%
search_term = "uluwatu"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for i in range(len(similar_topics)):
    # \n{topic_model.get_topic(similar_topics[i])}\n
    print(
        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}"
    )

# %% [markdown]
# ### Topic Hierarchy
#

# %%
topic_model.visualize_hierarchy(custom_labels=True)

# %% [markdown]
# ### Intertopic Distance Map
#

# %%
topic_model.visualize_topics(use_ctfidf=True)

# %% [markdown]
# ### Topic Word Scores
#

# %%
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)

# %%
# from matplotlib import pyplot as plt
# from sklearn.manifold import TSNE


# topics = topic_model.topics_

# # Reduce dimensionality with TSNE
# tsne = TSNE(n_components=2, random_state=42)
# embeddings_2d = tsne.fit_transform(embeddings)

# # Prepare colors (assign a color to each topic)
# unique_topics = set(topics)
# colors = plt.get_cmap("tab20", len(unique_topics))

# # Plot
# plt.figure(figsize=(12, 8))
# for topic in unique_topics:
#     # Select indices for the current topic
#     indices = [i for i, t in enumerate(topics) if t == topic]

#     # Get 2D points for these indices
#     x = embeddings_2d[indices, 0]
#     y = embeddings_2d[indices, 1]

#     # Assign label (exclude outliers)
#     label = f"Topic {topic}" if topic != -1 else "Outliers"

#     # Plot with color
#     plt.scatter(x, y, color=colors(topic + 1), label=label, alpha=0.5)

# plt.title("Topic Clusters in 2D Embedding Space")
# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.tight_layout()

# # Save the plot
# plt.savefig("topic_clusters.png", dpi=300, bbox_inches="tight")
# plt.show()