Files
masterthesis-playground/bertopic/nb_bertopic.py
2025-10-20 23:06:52 +02:00

570 lines
14 KiB
Python

# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.18.0
# kernelspec:
# display_name: .venv
# language: python
# name: python3
# ---
# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#
# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#
# %%
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
import gensim.corpora as corpora
import json
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import pickle
nlp = spacy.load("en_core_web_sm")
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
# %% [markdown]
# ### Parameters and Tracking
#
# %%
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = True
PROCESS_DATA = False
REDUCE_OUTLIERS = True
USE_CONDENSED_MODEL = False
DATA_SAMPLE_SIZE = -1 # -1 for all data
# Classical coherence score. Warning: needs swap to not kill your PC
CALCULATE_COHERENCE = False
# Vectorization
MIN_DOCUMENT_FREQUENCY = 1
MAX_NGRAM = 2
# HDBSCAN Parameters
MIN_TOPIC_SIZE = 200
MIN_SAMPLES = 25
# UMAP Parameters
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.01
# Topic Modeling
TOP_N_WORDS = 10
MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip
# %% [markdown]
# ### Data Loading & Preprocessing
#
# %%
if DATA_SAMPLE_SIZE != -1:
reviews = (
pd.read_csv("../data/original/reviews.tab", sep="\t")
.sample(n=DATA_SAMPLE_SIZE)
.review.dropna()
.to_list()
)
else:
reviews = (
pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list()
)
print("Loaded {} reviews".format(len(reviews)))
# %%
# List of NE in Bali for NER enhancement
with open("../data/supporting/bali_ner.json", "r") as f:
bali_places = json.load(f)
bali_places_set = set(bali_places)
# Stop word definition
extra_stopwords = ["bali", "idr", "usd"]
stop_words = set(stopwords.words("english"))
with open("../data/supporting/stopwords-en.json", "r") as f:
extra_stopwords.extend(json.load(f))
# Custom replacements
rep = {
r"\\n": " ",
r"\n": " ",
r'\\"': "",
r'"': "",
"mongkey": "monkey",
"monky": "monkey",
"verry": "very",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))
lemmatizer = WordNetLemmatizer()
def preprocess(text):
# Step 1: Apply custom replacements (typos, special cases)
text = text.lower()
text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
# Step 2: Clean text
text = re.sub(r"\d+", " ", text)
text = re.sub(r"\W+", " ", text)
doc = nlp(text)
# Step 3: POS tagging and filtering
filtered_tokens = [
token.text
for token in doc
if token.pos_ in {"NOUN", "PROPN"}
or token.ent_type_ in {"GPE", "LOC", "FAC"}
or token.text in bali_places_set
]
# Step 4: Lemmatization and stopword removal
lemmatized_tokens = [
lemmatizer.lemmatize(w)
for w in filtered_tokens
if w not in stop_words and w not in extra_stopwords and len(w) > 2
]
return lemmatized_tokens
# %%
if PROCESS_DATA:
print("Processing reviews...")
reviews = [preprocess(review) for review in reviews]
with open("../data/intermediate/processed_texts.pkl", "wb") as f:
pickle.dump(reviews, f)
else:
with open("../data/intermediate/processed_texts.pkl", "rb") as f:
reviews = pickle.load(f)
reviews = [
" ".join(review) if isinstance(review, list) else review
for review in reviews
]
print(reviews[:1])
# %% [markdown]
# ### Pre-calculate Embeddings
#
# %%
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews, show_progress_bar=True)
# %% [markdown]
# ## Model Creation
#
# %% [markdown]
# ### Dimensionality Reduction (UMAP)
#
# %%
umap_model = UMAP(
n_neighbors=N_NEIGHBORS,
n_components=N_COMPONENTS,
min_dist=MIN_DIST,
metric="cosine",
low_memory=True,
random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)
# %% [markdown]
# ### BERTopic Model Creation
#
# %%
if RECREATE_MODEL:
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(
min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM)
)
representation_model = KeyBERTInspired()
hdbscan_model = HDBSCAN(
min_cluster_size=MIN_TOPIC_SIZE,
min_samples=MIN_SAMPLES,
metric="euclidean",
cluster_selection_method="eom",
gen_min_span_tree=True,
prediction_data=True,
)
topic_model = BERTopic(
embedding_model=embedding_model,
ctfidf_model=ctfidf_model,
vectorizer_model=vectorizer_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
representation_model=representation_model,
verbose=True,
calculate_probabilities=True,
language="english",
top_n_words=TOP_N_WORDS,
nr_topics=MAX_TOPICS,
)
topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)
topic_labels = topic_model.generate_topic_labels(
nr_words=3, topic_prefix=True, word_length=15, separator=" - "
)
topic_model.set_topic_labels(topic_labels)
BERTopic.save(topic_model, "output/model.bertopic")
else:
print("Nevermind, loading existing model")
topic_model = BERTopic.load("output/model.bertopic")
# %% [markdown]
# ## Fine Tuning
#
# ### Topic Condensation
#
# %%
if RECREATE_REDUCED_MODEL:
done = False
iteration = 1
while not done:
print(f"Iteration {iteration}")
iteration += 1
similarity_matrix = cosine_similarity(
np.array(topic_model.topic_embeddings_)[1:, :]
)
nothing_to_merge = True
for i in range(similarity_matrix.shape[0]):
for j in range(i + 1, similarity_matrix.shape[1]):
sim = similarity_matrix[i, j]
if sim > 0.9:
nothing_to_merge = False
t1, t2 = i, j
try:
t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
print(
f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
)
topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
topic_labels = topic_model.generate_topic_labels(
nr_words=3,
topic_prefix=True,
word_length=15,
separator=" - ",
)
topic_model.set_topic_labels(topic_labels)
except Exception as e:
print(f"Failed to merge {t1} and {t2}: {e}")
if nothing_to_merge:
print("No more topics to merge.")
done = True
# BERTopic.save(topic_model, "bertopic/model_reduced.bertopic")
elif USE_CONDENSED_MODEL:
print("Nevermind, loading existing reduced model")
topic_model = BERTopic.load("bertopic/model_reduced.bertopic")
else:
print("Skipping topic reduction")
# %% [markdown]
# ### Outlier Reduction
#
# %%
if REDUCE_OUTLIERS:
new_topics = topic_model.reduce_outliers(
reviews,
topic_model.topics_,
probabilities=topic_model.probabilities_,
threshold=0.05,
strategy="probabilities",
)
topic_model.update_topics(reviews, topics=new_topics)
# %% [markdown]
# ## Results
#
# ### Classification
#
# %%
from pathlib import Path
import random
# --- config ---
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
INPUT_PATH = "../data/original/reviews.tab" # TSV with a 'review' column
OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
OUTPUT_DIR = Path("../raft/corpus")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
BATCH_SIZE = 60
MIN_CHARS = 40
SEED = 42
# --- load data ---
data = pd.read_csv(INPUT_PATH, sep="\t")
# If you already have `reviews` elsewhere, replace the next line with that variable
reviews = data["review"].astype(str).fillna("")
# Topic model document info
df = topic_model.get_document_info(reviews) # assumes your model is already fitted
df["Original"] = reviews.values
# --- filter by topics and length ---
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
filtered["Original"] = filtered["Original"].str.strip()
filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
# Save an audit CSV
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
# --- deterministic shuffle + write batched corpus files ---
total_files = 0
total_reviews = 0
rng = random.Random(SEED)
for topic_val, g in filtered.groupby("Topic", sort=True):
reviews_list = g["Original"].tolist()
# deterministic shuffle within topic
rng.shuffle(reviews_list)
# chunk into batches of up to 60
for start in range(0, len(reviews_list), BATCH_SIZE):
chunk = reviews_list[start : start + BATCH_SIZE]
if not chunk:
continue
# simple header for traceability
header = (
f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
)
lines = [header, ""]
for i, txt in enumerate(chunk, 1):
lines.append(f"({i}) {txt}")
part_idx = start // BATCH_SIZE + 1
fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
(OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
total_files += 1
total_reviews += len(chunk)
print(
f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
)
print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")
# %%
doc_topic_matrix = probs
# column names
topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]
# index names
docnames = ["Review " + str(i) for i in range(len(reviews))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(
np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
)
# Get dominant topic for each document
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
df_document_topic["dominant_topic"] = dominant_topic
# Styling
def color_stuff(val):
if val > 0.1:
color = "green"
elif val > 0.05:
color = "orange"
else:
color = "grey"
return "color: {col}".format(col=color)
def make_bold(val):
weight = 700 if val > 0.1 else 400
return "font-weight: {weight}".format(weight=weight)
# Apply Style
df_document_topics = (
df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
)
df_document_topics
# %% [markdown]
# ### Document Visualization
#
# %%
vis = topic_model.visualize_documents(
docs=reviews,
reduced_embeddings=reduced_embeddings,
custom_labels=True,
hide_annotations=True,
)
vis.write_html("output/visualization.html")
vis
# %% [markdown]
# ### Similarity Matrix
#
# %%
topic_model.visualize_heatmap()
# %% [markdown]
# ### Topic Info
#
# %%
topic_model.get_topic_info()
# %% [markdown]
# ### Semantic Coherence
#
# %%
topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1):
words = [word for word, _ in topic_model.get_topic(topic_id)]
topic_words.append(words)
# Compute mean pairwise cosine similarity for each topic
coherence_scores = []
for words in topic_words:
coherence_embeddings = embedding_model.encode(words)
sim_matrix = cosine_similarity(coherence_embeddings)
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
mean_sim = np.mean(sim_matrix)
coherence_scores.append(mean_sim)
overall_coherence = np.mean(coherence_scores)
print(len(reviews), "reviews processed")
print(len(topic_model.get_topic_info()) - 1, "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
# %% [markdown]
# ### Topic Coherence
#
# %%
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
if CALCULATE_COHERENCE:
# Preprocess Documents
documents = pd.DataFrame(
{"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
)
documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
{"Document": " ".join}
)
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()
# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [
[words for words, _ in topic_model.get_topic(topic)]
for topic in range(len(set(topics)) - 1)
]
# %env TOKENIZERS_PARALLELISM=false
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
coherence_model = CoherenceModel(
topics=topic_words,
texts=tokens,
corpus=corpus,
dictionary=dictionary,
coherence=measurement,
)
coherence_score = coherence_model.get_coherence()
print(f"Coherence ({measurement}): {coherence_score:.4f}")
else:
print("Skipping classical coherence calculation")
# %% [markdown]
# ### Term Search
#
# %%
search_term = "uluwatu"
similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for i in range(len(similar_topics)):
# \n{topic_model.get_topic(similar_topics[i])}\n
print(
f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}"
)
# %% [markdown]
# ### Topic Hierarchy
#
# %%
topic_model.visualize_hierarchy(custom_labels=True)
# %% [markdown]
# ### Intertopic Distance Map
#
# %%
topic_model.visualize_topics()
# %% [markdown]
# ### Topic Word Scores
#
# %%
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)