# --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: percent # format_version: '1.3' # jupytext_version: 1.18.0 # kernelspec: # display_name: .venv # language: python # name: python3 # --- # %% [markdown] # # Topic Detection: Bali Tourist Reviews # # %% [markdown] # ## Preparation # # ### Dependency Loading # # %% from bertopic import BERTopic from bertopic.representation import KeyBERTInspired from bertopic.vectorizers import ClassTfidfTransformer from gensim.models.coherencemodel import CoherenceModel from hdbscan import HDBSCAN from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer from sentence_transformers import SentenceTransformer from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity from umap import UMAP import gensim.corpora as corpora import json import nltk import numpy as np import pandas as pd import re import spacy import pickle nlp = spacy.load("en_core_web_sm") nltk.download("stopwords") nltk.download("punkt") nltk.download("wordnet") # %% [markdown] # ### Parameters and Tracking # # %% RECREATE_MODEL = True RECREATE_REDUCED_MODEL = True PROCESS_DATA = False REDUCE_OUTLIERS = True USE_CONDENSED_MODEL = False DATA_SAMPLE_SIZE = -1 # -1 for all data # Classical coherence score. Warning: needs swap to not kill your PC CALCULATE_COHERENCE = False # Vectorization MIN_DOCUMENT_FREQUENCY = 1 MAX_NGRAM = 2 # HDBSCAN Parameters MIN_TOPIC_SIZE = 200 MIN_SAMPLES = 25 # UMAP Parameters N_NEIGHBORS = 15 N_COMPONENTS = 2 MIN_DIST = 0.01 # Topic Modeling TOP_N_WORDS = 10 MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip # %% [markdown] # ### Data Loading & Preprocessing # # %% if DATA_SAMPLE_SIZE != -1: reviews = ( pd.read_csv("../data/original/reviews.tab", sep="\t") .sample(n=DATA_SAMPLE_SIZE) .review.dropna() .to_list() ) else: reviews = ( pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list() ) print("Loaded {} reviews".format(len(reviews))) # %% # List of NE in Bali for NER enhancement with open("../data/supporting/bali_ner.json", "r") as f: bali_places = json.load(f) bali_places_set = set(bali_places) # Stop word definition extra_stopwords = ["bali", "idr", "usd"] stop_words = set(stopwords.words("english")) with open("../data/supporting/stopwords-en.json", "r") as f: extra_stopwords.extend(json.load(f)) # Custom replacements rep = { r"\\n": " ", r"\n": " ", r'\\"': "", r'"': "", "mongkey": "monkey", "monky": "monkey", "verry": "very", } rep = dict((re.escape(k), v) for k, v in rep.items()) pattern = re.compile("|".join(rep.keys())) lemmatizer = WordNetLemmatizer() def preprocess(text): # Step 1: Apply custom replacements (typos, special cases) text = text.lower() text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text) # Step 2: Clean text text = re.sub(r"\d+", " ", text) text = re.sub(r"\W+", " ", text) doc = nlp(text) # Step 3: POS tagging and filtering filtered_tokens = [ token.text for token in doc if token.pos_ in {"NOUN", "PROPN"} or token.ent_type_ in {"GPE", "LOC", "FAC"} or token.text in bali_places_set ] # Step 4: Lemmatization and stopword removal lemmatized_tokens = [ lemmatizer.lemmatize(w) for w in filtered_tokens if w not in stop_words and w not in extra_stopwords and len(w) > 2 ] return lemmatized_tokens # %% if PROCESS_DATA: print("Processing reviews...") reviews = [preprocess(review) for review in reviews] with open("../data/intermediate/processed_texts.pkl", "wb") as f: pickle.dump(reviews, f) else: with open("../data/intermediate/processed_texts.pkl", "rb") as f: reviews = pickle.load(f) reviews = [ " ".join(review) if isinstance(review, list) else review for review in reviews ] print(reviews[:1]) # %% [markdown] # ### Pre-calculate Embeddings # # %% embedding_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = embedding_model.encode(reviews, show_progress_bar=True) # %% [markdown] # ## Model Creation # # %% [markdown] # ### Dimensionality Reduction (UMAP) # # %% umap_model = UMAP( n_neighbors=N_NEIGHBORS, n_components=N_COMPONENTS, min_dist=MIN_DIST, metric="cosine", low_memory=True, random_state=42, ) reduced_embeddings = umap_model.fit_transform(embeddings) # %% [markdown] # ### BERTopic Model Creation # # %% if RECREATE_MODEL: ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) vectorizer_model = CountVectorizer( min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM) ) representation_model = KeyBERTInspired() hdbscan_model = HDBSCAN( min_cluster_size=MIN_TOPIC_SIZE, min_samples=MIN_SAMPLES, metric="euclidean", cluster_selection_method="eom", gen_min_span_tree=True, prediction_data=True, ) topic_model = BERTopic( embedding_model=embedding_model, ctfidf_model=ctfidf_model, vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model, representation_model=representation_model, verbose=True, calculate_probabilities=True, language="english", top_n_words=TOP_N_WORDS, nr_topics=MAX_TOPICS, ) topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings) topic_labels = topic_model.generate_topic_labels( nr_words=3, topic_prefix=True, word_length=15, separator=" - " ) topic_model.set_topic_labels(topic_labels) BERTopic.save(topic_model, "output/model.bertopic") else: print("Nevermind, loading existing model") topic_model = BERTopic.load("output/model.bertopic") # %% [markdown] # ## Fine Tuning # # ### Topic Condensation # # %% if RECREATE_REDUCED_MODEL: done = False iteration = 1 while not done: print(f"Iteration {iteration}") iteration += 1 similarity_matrix = cosine_similarity( np.array(topic_model.topic_embeddings_)[1:, :] ) nothing_to_merge = True for i in range(similarity_matrix.shape[0]): for j in range(i + 1, similarity_matrix.shape[1]): sim = similarity_matrix[i, j] if sim > 0.9: nothing_to_merge = False t1, t2 = i, j try: t1_name = topic_model.get_topic_info(t1)["CustomName"][0] t2_name = topic_model.get_topic_info(t2)["CustomName"][0] print( f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}" ) topic_model.merge_topics(reviews, topics_to_merge=[t1, t2]) topic_labels = topic_model.generate_topic_labels( nr_words=3, topic_prefix=True, word_length=15, separator=" - ", ) topic_model.set_topic_labels(topic_labels) except Exception as e: print(f"Failed to merge {t1} and {t2}: {e}") if nothing_to_merge: print("No more topics to merge.") done = True # BERTopic.save(topic_model, "bertopic/model_reduced.bertopic") elif USE_CONDENSED_MODEL: print("Nevermind, loading existing reduced model") topic_model = BERTopic.load("bertopic/model_reduced.bertopic") else: print("Skipping topic reduction") # %% [markdown] # ### Outlier Reduction # # %% if REDUCE_OUTLIERS: new_topics = topic_model.reduce_outliers( reviews, topic_model.topics_, probabilities=topic_model.probabilities_, threshold=0.05, strategy="probabilities", ) topic_model.update_topics(reviews, topics=new_topics) # %% [markdown] # ## Results # # ### Classification # # %% from pathlib import Path import random # --- config --- topics_to_keep = {2, 4, 6, 8, 10, 5, 7} INPUT_PATH = "../data/original/reviews.tab" # TSV with a 'review' column OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv" OUTPUT_DIR = Path("../raft/corpus") OUTPUT_DIR.mkdir(parents=True, exist_ok=True) BATCH_SIZE = 60 MIN_CHARS = 40 SEED = 42 # --- load data --- data = pd.read_csv(INPUT_PATH, sep="\t") # If you already have `reviews` elsewhere, replace the next line with that variable reviews = data["review"].astype(str).fillna("") # Topic model document info df = topic_model.get_document_info(reviews) # assumes your model is already fitted df["Original"] = reviews.values # --- filter by topics and length --- filtered = df[df["Topic"].isin(topics_to_keep)].copy() filtered["Original"] = filtered["Original"].str.strip() filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS] # Save an audit CSV filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False) # --- deterministic shuffle + write batched corpus files --- total_files = 0 total_reviews = 0 rng = random.Random(SEED) for topic_val, g in filtered.groupby("Topic", sort=True): reviews_list = g["Original"].tolist() # deterministic shuffle within topic rng.shuffle(reviews_list) # chunk into batches of up to 60 for start in range(0, len(reviews_list), BATCH_SIZE): chunk = reviews_list[start : start + BATCH_SIZE] if not chunk: continue # simple header for traceability header = ( f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n" ) lines = [header, ""] for i, txt in enumerate(chunk, 1): lines.append(f"({i}) {txt}") part_idx = start // BATCH_SIZE + 1 fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt" (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8") total_files += 1 total_reviews += len(chunk) print( f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]" ) print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]") # %% doc_topic_matrix = probs # column names topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)] # index names docnames = ["Review " + str(i) for i in range(len(reviews))] # Make the pandas dataframe df_document_topic = pd.DataFrame( np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames ) # Get dominant topic for each document dominant_topic = np.argmax(doc_topic_matrix, axis=1) df_document_topic["dominant_topic"] = dominant_topic # Styling def color_stuff(val): if val > 0.1: color = "green" elif val > 0.05: color = "orange" else: color = "grey" return "color: {col}".format(col=color) def make_bold(val): weight = 700 if val > 0.1 else 400 return "font-weight: {weight}".format(weight=weight) # Apply Style df_document_topics = ( df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold) ) df_document_topics # %% [markdown] # ### Document Visualization # # %% vis = topic_model.visualize_documents( docs=reviews, reduced_embeddings=reduced_embeddings, custom_labels=True, hide_annotations=True, ) vis.write_html("output/visualization.html") vis # %% [markdown] # ### Similarity Matrix # # %% topic_model.visualize_heatmap() # %% [markdown] # ### Topic Info # # %% topic_model.get_topic_info() # %% [markdown] # ### Semantic Coherence # # %% topic_words = [] for topic_id in range(len(topic_model.get_topic_info()) - 1): words = [word for word, _ in topic_model.get_topic(topic_id)] topic_words.append(words) # Compute mean pairwise cosine similarity for each topic coherence_scores = [] for words in topic_words: coherence_embeddings = embedding_model.encode(words) sim_matrix = cosine_similarity(coherence_embeddings) np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity mean_sim = np.mean(sim_matrix) coherence_scores.append(mean_sim) overall_coherence = np.mean(coherence_scores) print(len(reviews), "reviews processed") print(len(topic_model.get_topic_info()) - 1, "topics found") print(f"BERT-based Topic Coherence: {overall_coherence:.4f}") # %% [markdown] # ### Topic Coherence # # %% # https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389 if CALCULATE_COHERENCE: # Preprocess Documents documents = pd.DataFrame( {"Document": reviews, "ID": range(len(reviews)), "Topic": topics} ) documents_per_topic = documents.groupby(["Topic"], as_index=False).agg( {"Document": " ".join} ) cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values) # Extract vectorizer and analyzer from BERTopic vectorizer = topic_model.vectorizer_model analyzer = vectorizer.build_analyzer() # Extract features for Topic Coherence evaluation words = vectorizer.get_feature_names_out() tokens = [analyzer(doc) for doc in cleaned_docs] dictionary = corpora.Dictionary(tokens) corpus = [dictionary.doc2bow(token) for token in tokens] topic_words = [ [words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics)) - 1) ] # %env TOKENIZERS_PARALLELISM=false for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]: coherence_model = CoherenceModel( topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, coherence=measurement, ) coherence_score = coherence_model.get_coherence() print(f"Coherence ({measurement}): {coherence_score:.4f}") else: print("Skipping classical coherence calculation") # %% [markdown] # ### Term Search # # %% search_term = "uluwatu" similar_topics, similarities = topic_model.find_topics(search_term, top_n=10) for i in range(len(similar_topics)): # \n{topic_model.get_topic(similar_topics[i])}\n print( f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}" ) # %% [markdown] # ### Topic Hierarchy # # %% topic_model.visualize_hierarchy(custom_labels=True) # %% [markdown] # ### Intertopic Distance Map # # %% topic_model.visualize_topics() # %% [markdown] # ### Topic Word Scores # # %% topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)