# --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: percent # format_version: '1.3' # jupytext_version: 1.18.0 # kernelspec: # display_name: .venv # language: python # name: python3 # --- # %% [markdown] # # Topic Detection: Bali Tourist Reviews # # %% [markdown] # ## Preparation # # ### Dependency Loading # # %% import json import pickle import re import gensim.corpora as corpora import nltk import numpy as np import pandas as pd import spacy from bertopic.representation import KeyBERTInspired from bertopic.vectorizers import ClassTfidfTransformer from gensim.models.coherencemodel import CoherenceModel from hdbscan import HDBSCAN from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer from sentence_transformers import SentenceTransformer from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity from umap import UMAP from bertopic import BERTopic nlp = spacy.load("en_core_web_sm") nltk.download("stopwords") nltk.download("punkt") nltk.download("wordnet") # %% [markdown] # ### Parameters and Tracking # # %% RECREATE_MODEL = True RECREATE_REDUCED_MODEL = True PROCESS_DATA = False REDUCE_OUTLIERS = True USE_CONDENSED_MODEL = False DATA_SAMPLE_SIZE = -1 # -1 for all data # Classical coherence score. Warning: needs swap to not kill your PC CALCULATE_COHERENCE = False # Vectorization MIN_DOCUMENT_FREQUENCY = 1 MAX_NGRAM = 2 # HDBSCAN Parameters MIN_TOPIC_SIZE = 200 MIN_SAMPLES = 25 # UMAP Parameters N_NEIGHBORS = 15 N_COMPONENTS = 2 MIN_DIST = 0.01 # Topic Modeling TOP_N_WORDS = 10 MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip # %% [markdown] # ### Data Loading & Preprocessing # # %% if DATA_SAMPLE_SIZE != -1: reviews = ( pd.read_csv("../data/original/reviews.tab", sep="\t") .sample(n=DATA_SAMPLE_SIZE) .review.dropna() .to_list() ) else: reviews = ( pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list() ) print("Loaded {} reviews".format(len(reviews))) # %% # List of NE in Bali for NER enhancement with open("../data/supporting/bali_ner.json", "r") as f: bali_places = json.load(f) bali_places_set = set(bali_places) # Stop word definition extra_stopwords = ["bali", "idr", "usd"] stop_words = set(stopwords.words("english")) with open("../data/supporting/stopwords-en.json", "r") as f: extra_stopwords.extend(json.load(f)) # Custom replacements rep = { r"\\n": " ", r"\n": " ", r'\\"': "", r'"': "", "mongkey": "monkey", "monky": "monkey", "verry": "very", } rep = dict((re.escape(k), v) for k, v in rep.items()) pattern = re.compile("|".join(rep.keys())) lemmatizer = WordNetLemmatizer() def preprocess(text): # Step 1: Apply custom replacements (typos, special cases) text = text.lower() text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text) # Step 2: Clean text text = re.sub(r"\d+", " ", text) text = re.sub(r"\W+", " ", text) doc = nlp(text) # Step 3: POS tagging and filtering filtered_tokens = [ token.text for token in doc if token.pos_ in {"NOUN", "PROPN"} or token.ent_type_ in {"GPE", "LOC", "FAC"} or token.text in bali_places_set ] # Step 4: Lemmatization and stopword removal lemmatized_tokens = [ lemmatizer.lemmatize(w) for w in filtered_tokens if w not in stop_words and w not in extra_stopwords and len(w) > 2 ] return lemmatized_tokens # %% if PROCESS_DATA: print("Processing reviews...") reviews = [preprocess(review) for review in reviews] with open("../data/intermediate/processed_texts.pkl", "wb") as f: pickle.dump(reviews, f) else: with open("../data/intermediate/processed_texts.pkl", "rb") as f: reviews = pickle.load(f) reviews = [ " ".join(review) if isinstance(review, list) else review for review in reviews ] print(reviews[:1]) # %% [markdown] # ### Pre-calculate Embeddings # # %% embedding_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = embedding_model.encode(reviews, show_progress_bar=True) # %% [markdown] # ## Model Creation # # %% [markdown] # ### Dimensionality Reduction (UMAP) # # %% umap_model = UMAP( n_neighbors=N_NEIGHBORS, n_components=N_COMPONENTS, min_dist=MIN_DIST, metric="cosine", low_memory=True, random_state=42, ) reduced_embeddings = umap_model.fit_transform(embeddings) # %% [markdown] # ### BERTopic Model Creation # # %% if RECREATE_MODEL: ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) vectorizer_model = CountVectorizer( min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM) ) representation_model = KeyBERTInspired() hdbscan_model = HDBSCAN( min_cluster_size=MIN_TOPIC_SIZE, min_samples=MIN_SAMPLES, metric="euclidean", cluster_selection_method="eom", gen_min_span_tree=True, prediction_data=True, ) topic_model = BERTopic( embedding_model=embedding_model, ctfidf_model=ctfidf_model, vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model, representation_model=representation_model, verbose=True, calculate_probabilities=True, language="english", top_n_words=TOP_N_WORDS, nr_topics=MAX_TOPICS, ) topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings) topic_labels = topic_model.generate_topic_labels( nr_words=3, topic_prefix=True, word_length=15, separator=" - " ) topic_model.set_topic_labels(topic_labels) BERTopic.save(topic_model, "output/model.bertopic") else: print("Nevermind, loading existing model") topic_model = BERTopic.load("output/model.bertopic") # %% [markdown] # ## Fine Tuning # # ### Topic Condensation # # %% if RECREATE_REDUCED_MODEL: done = False iteration = 1 while not done: print(f"Iteration {iteration}") iteration += 1 similarity_matrix = cosine_similarity( np.array(topic_model.topic_embeddings_)[1:, :] ) nothing_to_merge = True for i in range(similarity_matrix.shape[0]): for j in range(i + 1, similarity_matrix.shape[1]): sim = similarity_matrix[i, j] if sim > 0.9: nothing_to_merge = False t1, t2 = i, j try: t1_name = topic_model.get_topic_info(t1)["CustomName"][0] t2_name = topic_model.get_topic_info(t2)["CustomName"][0] print( f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}" ) topic_model.merge_topics(reviews, topics_to_merge=[t1, t2]) topic_labels = topic_model.generate_topic_labels( nr_words=3, topic_prefix=True, word_length=15, separator=" - ", ) topic_model.set_topic_labels(topic_labels) except Exception as e: print(f"Failed to merge {t1} and {t2}: {e}") if nothing_to_merge: print("No more topics to merge.") done = True # BERTopic.save(topic_model, "bertopic/model_reduced.bertopic") elif USE_CONDENSED_MODEL: print("Nevermind, loading existing reduced model") topic_model = BERTopic.load("bertopic/model_reduced.bertopic") else: print("Skipping topic reduction") # %% [markdown] # ### Outlier Reduction # # %% if REDUCE_OUTLIERS: new_topics = topic_model.reduce_outliers( reviews, topic_model.topics_, probabilities=topic_model.probabilities_, threshold=0.05, strategy="probabilities", ) topic_model.update_topics(reviews, topics=new_topics) # %% [markdown] # ## Results # # ### Classification # # %% import random from pathlib import Path # --- config --- topics_to_keep = {2, 4, 6, 8, 10, 5, 7} INPUT_PATH = "../data/original/reviews.tab" # TSV with a 'review' column OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv" OUTPUT_DIR = Path("../raft/corpus") OUTPUT_DIR.mkdir(parents=True, exist_ok=True) BATCH_SIZE = 60 MIN_CHARS = 40 SEED = 42 # --- load data --- data = pd.read_csv(INPUT_PATH, sep="\t") # If you already have `reviews` elsewhere, replace the next line with that variable reviews = data["review"].astype(str).fillna("") # Topic model document info df = topic_model.get_document_info(reviews) # assumes your model is already fitted df["Original"] = reviews.values # --- filter by topics and length --- filtered = df[df["Topic"].isin(topics_to_keep)].copy() filtered["Original"] = filtered["Original"].str.strip() filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS] # Save an audit CSV filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False) # --- deterministic shuffle + write batched corpus files --- total_files = 0 total_reviews = 0 rng = random.Random(SEED) for topic_val, g in filtered.groupby("Topic", sort=True): reviews_list = g["Original"].tolist() # deterministic shuffle within topic rng.shuffle(reviews_list) # chunk into batches of up to 60 for start in range(0, len(reviews_list), BATCH_SIZE): chunk = reviews_list[start : start + BATCH_SIZE] if not chunk: continue # simple header for traceability header = ( f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n" ) lines = [header, ""] for i, txt in enumerate(chunk, 1): lines.append(f"({i}) {txt}") part_idx = start // BATCH_SIZE + 1 fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt" (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8") total_files += 1 total_reviews += len(chunk) print( f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]" ) print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]") # %% doc_topic_matrix = probs # column names topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)] # index names docnames = ["Review " + str(i) for i in range(len(reviews))] # Make the pandas dataframe df_document_topic = pd.DataFrame( np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames ) # Get dominant topic for each document dominant_topic = np.argmax(doc_topic_matrix, axis=1) df_document_topic["dominant_topic"] = dominant_topic # Styling def color_stuff(val): if val > 0.1: color = "green" elif val > 0.05: color = "orange" else: color = "grey" return "color: {col}".format(col=color) def make_bold(val): weight = 700 if val > 0.1 else 400 return "font-weight: {weight}".format(weight=weight) # Apply Style df_document_topics = ( df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold) ) df_document_topics # %% [markdown] # ### Document Visualization # # %% vis = topic_model.visualize_documents( docs=reviews, reduced_embeddings=reduced_embeddings, custom_labels=True, hide_annotations=True, ) vis.write_html("output/visualization.html") vis # %% [markdown] # ### Similarity Matrix # # %% topic_model.visualize_heatmap() # %% [markdown] # ### Topic Info # # %% topic_model.get_topic_info() # %% [markdown] # ### Semantic Coherence # # %% topic_words = [] for topic_id in topic_model.get_topic_info()["Topic"]: # Skip outlier topic if topic_id < 0: continue words = [word for word, _ in topic_model.get_topic(topic_id)] topic_words.append(words) # Compute mean pairwise cosine similarity for each topic coherence_scores = [] for words in topic_words: coherence_embeddings = embedding_model.encode(words) sim_matrix = cosine_similarity(coherence_embeddings) # Ignore self-similarity np.fill_diagonal(sim_matrix, 0) mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)]) coherence_scores.append(mean_sim) overall_coherence = np.mean(coherence_scores) print(len(reviews), "reviews processed") print(len(topic_model.get_topic_info()) - 1, "topics found") print(f"BERT-based Topic Coherence: {overall_coherence:.4f}") # %% [markdown] # ### Topic Coherence # # %% # https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389 if CALCULATE_COHERENCE: # Preprocess Documents documents = pd.DataFrame( {"Document": reviews, "ID": range(len(reviews)), "Topic": topics} ) documents_per_topic = documents.groupby(["Topic"], as_index=False).agg( {"Document": " ".join} ) cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values) # Extract vectorizer and analyzer from BERTopic vectorizer = topic_model.vectorizer_model analyzer = vectorizer.build_analyzer() # Extract features for Topic Coherence evaluation words = vectorizer.get_feature_names_out() tokens = [analyzer(doc) for doc in cleaned_docs] dictionary = corpora.Dictionary(tokens) corpus = [dictionary.doc2bow(token) for token in tokens] topic_words = [ [words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics)) - 1) ] # %env TOKENIZERS_PARALLELISM=false for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]: coherence_model = CoherenceModel( topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, coherence=measurement, ) coherence_score = coherence_model.get_coherence() print(f"Coherence ({measurement}): {coherence_score:.4f}") else: print("Skipping classical coherence calculation") # %% [markdown] # ### Term Search # # %% search_term = "uluwatu" similar_topics, similarities = topic_model.find_topics(search_term, top_n=10) for i in range(len(similar_topics)): # \n{topic_model.get_topic(similar_topics[i])}\n print( f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}" ) # %% [markdown] # ### Topic Hierarchy # # %% topic_model.visualize_hierarchy(custom_labels=True) # %% [markdown] # ### Intertopic Distance Map # # %% topic_model.visualize_topics() # %% [markdown] # ### Topic Word Scores # # %% topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)