masterthesis-playground/top2vec/nb_top2vec.py

# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.models.coherencemodel import CoherenceModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from top2vec import Top2Vec
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import pickle
import re
import spacy

# %% [markdown]
# ### Parameters and Tracking
#

# %%
# Run-mode switches for the expensive steps in this notebook. When a flag is
# False, the corresponding cell loads a previously pickled result instead of
# recomputing it.
PROCESS_DATA = False  # True: clean the raw reviews and re-pickle them
RECALCULATE_COHERENCE_PARTS = False  # True: rebuild tokens, dictionary and corpus
RECREATE_MODEL = True  # True: train a new Top2Vec model and pickle it

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
# Read the tab-separated export and keep every non-missing entry of its
# "review" column as a plain Python list.
reviews = pd.read_table("data.tab")["review"].dropna().to_list()
# Optional sub-sampling kept from the original for quick experiments:
# .sample(5_000, random_state=42)

print("Loaded {} reviews".format(len(reviews)))

# %%
# Regex pattern -> replacement, applied to the lower-cased review text.
# The keys are regular expressions and must NOT be passed through re.escape:
# escaping them would turn e.g. r"\n" into "backslash followed by n" and the
# real newline would never match.
rep = {
    r"\\n": " ",  # escaped newline left in the text: a backslash followed by "n"
    r"\n": " ",  # real newline character
    r'\\"': "",  # escaped double quote: a backslash followed by '"'
    r'"': "",  # plain double quote
    r"mongkey": "monkey",  # frequent misspellings
    r"monky": "monkey",
    r"verry": "very",
    # Drop "bali" only as a whole word so that e.g. "balinese" stays intact.
    r"\bbali\b": "",
}

# Replacement strings in the same order as the alternatives in `pattern`,
# so a match can be mapped back to its replacement via its group index.
_replacements = list(rep.values())

# One capturing group per key; exactly one group participates in each match.
pattern = re.compile("|".join(f"({key})" for key in rep))

# Whitespace is collapsed in a second pass so that gaps created by the
# removals above (e.g. dropping "bali") are normalised as well.
_whitespace = re.compile(r"\s+")


def preprocess(text):
    """Return a cleaned, lower-cased version of a single review.

    Steps: lower-case the text, apply every substitution in ``rep`` in one
    pass, collapse each run of whitespace into a single space and strip
    leading/trailing whitespace.

    Args:
        text: Raw review text.

    Returns:
        The normalised review as a string.
    """
    text = text.lower()
    # m.lastindex is the 1-based index of the alternative that matched.
    text = pattern.sub(lambda m: _replacements[m.lastindex - 1], text)
    return _whitespace.sub(" ", text).strip()


# %%
if not PROCESS_DATA:
    # Reuse the reviews pickled by an earlier preprocessing run.
    with open("processed_texts_top2vec.pkl", "rb") as cache_file:
        cached_reviews = pickle.load(cache_file)
    # Entries that were stored as lists are joined back into one string;
    # everything else is passed through unchanged.
    reviews = [
        entry if not isinstance(entry, list) else " ".join(entry)
        for entry in cached_reviews
    ]
else:
    print("Processing reviews...")
    reviews = list(map(preprocess, reviews))

    # Cache the cleaned texts so later runs can skip this step.
    with open("processed_texts_top2vec.pkl", "wb") as cache_file:
        pickle.dump(reviews, cache_file)

print("Processed {} reviews".format(len(reviews)))
print(reviews[:1])

# %% [markdown]
# ## Model Creation
#

# %%
if not RECREATE_MODEL:
    # Reuse the model pickled by an earlier run.
    with open("./top2vec/model.pkl", "rb") as model_file:
        model = pickle.load(model_file)
else:
    # Settings handed to Top2Vec as its `hdbscan_args` (clustering step).
    hdbscan_args = dict(
        min_cluster_size=200,
        min_samples=25,
        metric="euclidean",
        cluster_selection_method="eom",
    )
    # Settings handed to Top2Vec as its `umap_args` (dimension reduction step).
    umap_args = dict(
        n_neighbors=15,
        n_components=2,
        min_dist=0.01,
        metric="cosine",
        random_state=42,
        low_memory=True,
    )

    model = Top2Vec(
        reviews,
        min_count=1,
        workers=8,
        umap_args=umap_args,
        hdbscan_args=hdbscan_args,
    )

    # Persist the trained model so it can be reloaded without retraining.
    with open("./top2vec/model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

print(f"\nNumber of topics found: {model.get_num_topics()}")

# %% [markdown]
# ## Results
#

# %% [markdown]
# ### Coherence
#

# %%
# Embedding-based coherence: for every topic, embed its words and take the
# mean cosine similarity over all distinct word pairs; then average the
# per-topic scores.
topic_words = model.get_topics()[0]
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

coherence_scores = []
for words in topic_words:
    if len(words) < 2:
        # A topic with fewer than two words has no word pairs; score it 0.
        coherence_scores.append(0.0)
        continue

    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)

    # Use the strict upper triangle only: each unordered pair is counted once
    # and the diagonal (a word compared with itself) is left out entirely.
    # Zeroing the diagonal and averaging the full matrix would still count
    # those n zeros and understate the mean by a factor of (n - 1) / n.
    pair_rows, pair_cols = np.triu_indices(sim_matrix.shape[0], k=1)
    mean_sim = sim_matrix[pair_rows, pair_cols].mean()
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

# %%
# %env TOKENIZERS_PARALLELISM=false
# Number of top words per topic that enter the coherence computation.
num_words = 10

# Always take the topic words fresh from the model: other cells reuse the
# name `topic_words`, and the cached branch below does not set it itself.
topic_words = model.get_topics()[0]

if RECALCULATE_COHERENCE_PARTS:
    tqdm.pandas()

    docs = model.documents
    doc_topics, _, _, _ = model.get_documents_topics(doc_ids=list(range(len(docs))))

    df = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": doc_topics})

    # NOTE(review): `documents_per_topic` is not used by the rest of this
    # cell; kept in case other code relies on it.
    documents_per_topic = df.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )

    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    nlp.max_length = 10_000_000

    # Deliberately not called `preprocess`: that name belongs to the
    # module-level text cleaner and must not be rebound by this cell.
    def tokenize_document(doc):
        """Return the lower-cased alphabetic, non-stop-word tokens of *doc*."""
        return [
            token.text.lower()
            for token in nlp(doc)
            if token.is_alpha and not token.is_stop
        ]

    print(topic_words)

    print("Preprocessing topic documents...")
    tokens = df["Tokens"] = df["Document"].progress_apply(tokenize_document)

    print("Creating dictionary...")
    dictionary = corpora.Dictionary(tokens)
    print("Creating corpus...")
    corpus = [dictionary.doc2bow(token_list) for token_list in tokens]

    # NOTE(review): `num_topics` is not used by the rest of this cell.
    num_topics = len(model.topic_sizes)

    # Cache the expensive intermediate results for later runs.
    with open("./top2vec/corpus.pkl", "wb") as f:
        pickle.dump(corpus, f)
    with open("./top2vec/dictionary.pkl", "wb") as f:
        pickle.dump(dictionary, f)
    with open("./top2vec/tokens.pkl", "wb") as f:
        pickle.dump(tokens, f)
else:
    # Load the intermediate results cached by an earlier run.
    with open("./top2vec/corpus.pkl", "rb") as f:
        corpus = pickle.load(f)
    with open("./top2vec/dictionary.pkl", "rb") as f:
        dictionary = pickle.load(f)
    with open("./top2vec/tokens.pkl", "rb") as f:
        tokens = pickle.load(f)

print("Starting coherence evaluation...")
# Score the same topics with each of the four gensim coherence measures.
for measure in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    cm = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=measure,
        topn=num_words,
    )
    score = cm.get_coherence()
    print(f"Coherence ({measure}): {score:.4f}")

# %% [markdown]
# ### Topic List
#

# %%
# get_topics() yields, per topic, its words, the word scores and its number.
topics, probs, unq_num = model.get_topics()

# Print one line per topic. The loop variables are deliberately not named
# `topic_words`, so the notebook-level list of that name is not overwritten
# with the words of the last topic.
for topic_number, words_in_topic in zip(unq_num, topics):
    print(f"Topic {topic_number}: {' | '.join(words_in_topic)}")

# %% [markdown]
# ### Search by term
#

# %%
search_term = "monkey"

print(f"\nSearching for topics related to '{search_term}':")
# Ask for at most 10 topics, and never more than the model contains.
num_topics = min(model.get_num_topics(), 10)

# search_topics() returns (topic words, word scores, topic scores, topic
# numbers); the topic numbers are taken directly from the result instead of
# being reconstructed by matching word sets against model.get_topics().
topic_words, _, _, topic_nums = model.search_topics(
    keywords=[search_term], num_topics=num_topics
)

for topic_number, words in zip(topic_nums, topic_words):
    print(f"Topic {topic_number}: {' | '.join(words)}")

# %% [markdown]
# ### Search by topic ID
#

# %%
topic_id = 0

print(f"Topic {topic_id}:")
# `topics` is the per-topic word list fetched from the model in an earlier cell.
print("Top words:", " | ".join(topics[topic_id]))

# Fetch the 15 documents the model returns for this topic, with their scores.
docs, doc_scores, doc_ids = model.search_documents_by_topic(
    topic_num=topic_id, num_docs=15
)
for rank, (doc, score) in enumerate(zip(docs, doc_scores), start=1):
    print(f"Doc {rank} (Score: {score:.2f}): {doc}")

# %%
import plotly.express as px
import pandas as pd
from umap import UMAP

# Get topic metadata
topic_vectors = model.topic_vectors
topic_words = model.get_topics()[0]
# get_topic_sizes() returns the sizes first and the topic numbers second;
# unpacking them the other way round would size the bubbles by topic index
# and label them with document counts.
topic_sizes, topic_nums = model.get_topic_sizes()

# Reduce vectors to 2D using UMAP
umap_model = UMAP(n_neighbors=15, n_components=2, metric="cosine", random_state=42)
topic_coords = umap_model.fit_transform(topic_vectors)

# Ensure all components are 1D lists
topic_nums = list(topic_nums)
topic_sizes = list(topic_sizes)
# Hover label: the first five words of each topic.
topic_labels = [" | ".join(words[:5]) for words in topic_words]

# Build DataFrame
df = pd.DataFrame(
    {
        "x": topic_coords[:, 0],
        "y": topic_coords[:, 1],
        "Topic Number": topic_nums,
        "Size": topic_sizes,
        "Top Words": topic_labels,
    }
)

# Plot using Plotly: one bubble per topic, scaled by the "Size" column and
# labelled with its topic number.
fig = px.scatter(
    df,
    x="x",
    y="y",
    size="Size",
    text="Topic Number",
    hover_data={"Top Words": True, "Size": True, "x": False, "y": False},
    title="Top2Vec Topic Visualization (2D)",
)
fig.update_traces(textposition="top center")
fig.show()