# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv (3.12.3)
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
import pickle
import re

import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text as skltext
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP

from bertopic import BERTopic

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# %% [markdown]
# ### Hyperparameters and Settings
#

# %%
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = True
PROCESS_DATA = False
REDUCE_OUTLIERS = False
CALCULATE_TOKEN_DISTRIBUTIONS = False

# Seed shared by data sampling and UMAP so that runs are repeatable
RANDOM_STATE = 42

# Data Sample Size, -1 for all data
DATA_SAMPLE_SIZE = -1

# Vectorization
MIN_DOCUMENT_FREQUENCY = 1
MAX_NGRAM = 3

# HDBSCAN Parameters
MIN_TOPIC_SIZE = 200
MIN_SAMPLES = 25

# UMAP Parameters
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.01

# Topic Modeling
TOP_N_WORDS = 10
MAX_TOPICS = None  # or "auto"; passed to BERTopic as nr_topics, None to skip
TF_IDF_STOP_WORDS = ["bali", "place", "visit", "visited", "visiting"]

# Topic pairs whose embedding cosine similarity exceeds this value are merged
MERGE_SIMILARITY_THRESHOLD = 0.9

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
# Import data after general preprocessing
review_frame = pd.read_csv("../data/intermediate/preprocessed.tab", sep="\t")
if DATA_SAMPLE_SIZE != -1:
    # Seeded so the same subset is drawn on every run
    review_frame = review_frame.sample(n=DATA_SAMPLE_SIZE, random_state=RANDOM_STATE)
reviews = review_frame.review.to_list()

print(f"Loaded {len(reviews)} reviews")

# %%
# Literal character sequences (escaped newlines and quotes that appear as plain
# text in a review) mapped to their replacement. These are matched verbatim,
# not as regular expressions.
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
}
# Longest sequences first so that e.g. `\\n` wins over `\n` at the same position
pattern = re.compile(
    "|".join(re.escape(key) for key in sorted(rep, key=len, reverse=True))
)
# Real whitespace (including actual newlines) is collapsed by a genuine regex;
# escaping it together with the literals above would only match the text "\s+".
whitespace_pattern = re.compile(r"\s+")


def preprocess(text):
    """Return a lightly normalised copy of a review.

    The text is lower-cased, the literal sequences listed in ``rep`` are
    replaced, every run of whitespace (newlines included) is collapsed to a
    single space, and leading/trailing whitespace is removed.
    """
    text = text.lower()
    text = pattern.sub(lambda match: rep[match.group(0)], text)
    # Strip last: the replacements above can leave spaces at either end
    return whitespace_pattern.sub(" ", text).strip()


# %%
print(
    preprocess(
        "Excellent. Definitely worth coming while in bali. Food and people were very nice.\n🌟 🤩 ⭐️ \nTrisna was our host"
    )
)

# %%
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]
    with open("../data/intermediate/processed_texts_lowprep.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    # NOTE: pickle executes arbitrary code on load; only read files that this
    # notebook (or another trusted source) produced.
    with open("../data/intermediate/processed_texts_lowprep.pkl", "rb") as f:
        reviews = pickle.load(f)

print(reviews[:1])

# %% [markdown]
# ### Pre-calculate Embeddings
#

# %%
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews, show_progress_bar=True)

# %% [markdown]
# ## Model Creation
#

# %% [markdown]
# ### Dimensionality Reduction (UMAP)
#

# %%
umap_model = UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=N_COMPONENTS,
    min_dist=MIN_DIST,
    metric="cosine",
    low_memory=True,
    random_state=RANDOM_STATE,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

# %% [markdown]
# ### BERTopic Model Creation
#


# %%
def apply_custom_labels(model):
    """Regenerate custom labels from the model's current topics and set them.

    Returns the generated labels so callers can keep a reference to them.
    """
    labels = model.generate_topic_labels(
        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
    )
    model.set_topic_labels(labels)
    return labels


if RECREATE_MODEL:
    stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS))

    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(
        min_df=MIN_DOCUMENT_FREQUENCY,
        ngram_range=(1, MAX_NGRAM),
        stop_words=stop_words,
    )
    representation_model = KeyBERTInspired()
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,
    )

    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)

    topic_labels = apply_custom_labels(topic_model)
    # BERTopic.save(topic_model, "bertopic/model.bertopic")
else:
    # NOTE(review): loading is commented out, so `topic_model` stays undefined
    # on this path and every later cell fails with NameError. Re-enable the
    # save/load pair before setting RECREATE_MODEL = False.
    print("Nevermind, loading existing model")
    # topic_model = BERTopic.load("bertopic/model.bertopic")

# %% [markdown]
# ## Fine Tuning
#
# ### Topic Condensation
#


# %%
def topic_similarity_matrix(model):
    """Return pairwise cosine similarities between the model's topic embeddings.

    The first embedding row is dropped and row/column ``k`` is then treated as
    topic ``k``. This presumably relies on row 0 being the outlier topic -1;
    verify if the model can be fitted without outliers.
    """
    return cosine_similarity(np.array(model.topic_embeddings_)[1:, :])


if RECREATE_REDUCED_MODEL:
    done = False
    iteration = 1
    while not done:
        print(f"Iteration {iteration}")
        iteration += 1
        similarity_matrix = topic_similarity_matrix(topic_model)
        # Only a merge that actually happened justifies another pass; flagging
        # mere attempts would loop forever on a pair that always fails.
        merged_any = False
        n_rows, n_cols = similarity_matrix.shape
        for i in range(n_rows):
            for j in range(i + 1, n_cols):
                # The matrix is rebuilt (and shrinks) after every merge, so
                # indices taken from the original shape can fall outside it.
                if i >= similarity_matrix.shape[0] or j >= similarity_matrix.shape[1]:
                    continue
                sim = similarity_matrix[i, j]
                if sim > MERGE_SIMILARITY_THRESHOLD:
                    t1, t2 = i, j
                    try:
                        t1_name = topic_model.get_topic_info(t1)["CustomName"].iloc[0]
                        t2_name = topic_model.get_topic_info(t2)["CustomName"].iloc[0]
                        print(
                            f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
                        )
                        topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
                        merged_any = True
                        topic_labels = apply_custom_labels(topic_model)
                        similarity_matrix = topic_similarity_matrix(topic_model)
                    except Exception as e:
                        print(f"Failed to merge {t1} and {t2}: {e}")
        if not merged_any:
            print("No more topics to merge.")
            done = True
else:
    print("Skipping topic reduction")

# %% [markdown]
# ### Outlier Reduction
#

# %%
if REDUCE_OUTLIERS:
    new_topics = topic_model.reduce_outliers(
        reviews,
        topic_model.topics_,
        probabilities=topic_model.probabilities_,
        threshold=0.05,
        strategy="probabilities",
    )
    # update_topics falls back to default models for anything not passed in,
    # so hand the configured ones back to keep stop words, n-grams and the
    # KeyBERT-inspired representation.
    topic_model.update_topics(
        reviews,
        topics=new_topics,
        top_n_words=TOP_N_WORDS,
        vectorizer_model=topic_model.vectorizer_model,
        ctfidf_model=topic_model.ctfidf_model,
        representation_model=topic_model.representation_model,
    )
    # Keep the custom labels in sync with the updated topic representations
    topic_labels = apply_custom_labels(topic_model)

# %% [markdown]
# ## Results
#
# ### Classification
#

# %%
CLASSIFICATION = True
if CLASSIFICATION:
    # NOTE(review): these ids look tied to one specific fitted model; they
    # probably need re-checking whenever the model is recreated.
    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"

    # Topic model document info
    df = topic_model.get_document_info(reviews)
    df["Original"] = reviews

    # --- filter by topics ---
    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
    filtered["Original"] = filtered["Original"].str.strip()

    # Save an audit CSV
    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
    print(f"Filtered CSV file saved to {OUTPUT_CSV}")

# %%
# Read the probabilities from the model rather than the fit-time `probs`, so
# the table reflects any topic merging done above.
doc_topic_matrix = np.asarray(topic_model.probabilities_)

# column names, one per probability column (no assumption about outliers)
topicnames = [f"Topic {i}" for i in range(doc_topic_matrix.shape[1])]

# index names
docnames = [f"Review {i}" for i in range(len(reviews))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(
    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
)

# Get dominant topic for each document
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
df_document_topic["dominant_topic"] = dominant_topic


# Styling
def color_stuff(val):
    """Return the CSS colour for a topic probability."""
    if val > 0.1:
        color = "green"
    elif val > 0.05:
        color = "orange"
    else:
        color = "grey"
    return f"color: {color}"


def make_bold(val):
    """Return the CSS font weight for a topic probability."""
    weight = 700 if val > 0.1 else 400
    return f"font-weight: {weight}"


def style_cells(styler, func, subset=None):
    """Apply ``func`` cell-wise, using ``Styler.map`` when pandas provides it.

    Falls back to ``Styler.applymap`` on pandas versions without ``map``.
    """
    cellwise = getattr(styler, "map", None) or styler.applymap
    return cellwise(func, subset=subset)


# Apply Style (probability columns only; dominant_topic holds topic indices)
df_document_topics = df_document_topic.head(15).style
df_document_topics = style_cells(df_document_topics, color_stuff, subset=topicnames)
df_document_topics = style_cells(df_document_topics, make_bold, subset=topicnames)
df_document_topics

# %% [markdown]
# ### Document Visualization
#

# %%
vis = topic_model.visualize_documents(
    docs=reviews,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)
# vis.write_html("output/visualization.html")
vis

# %%
topic_model.visualize_document_datamap(reviews, reduced_embeddings=reduced_embeddings)

# %% [markdown]
# ### Similarity Matrix
#

# %%
topic_model.visualize_heatmap()

# %% [markdown]
# ### Topic Info
#

# %%
topic_model.get_topic_info()

# %% [markdown]
# ### Semantic Coherence
#

# %%
topic_info = topic_model.get_topic_info()
# Outlier topic (negative id) is excluded from all coherence figures
real_topic_ids = [topic_id for topic_id in topic_info["Topic"] if topic_id >= 0]

# Top words per topic; empty strings are dropped so that placeholder entries
# do not take part in the similarity average.
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id) if word]
    for topic_id in real_topic_ids
]

# Compute mean pairwise cosine similarity for each topic
coherence_scores = []
for words in topic_words:
    # A pairwise mean needs at least two words; skipping avoids a NaN score
    if len(words) < 2:
        continue
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    # Strict upper triangle: each pair once, self-similarity excluded
    pair_similarities = sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)]
    coherence_scores.append(np.mean(pair_similarities))

overall_coherence = np.mean(coherence_scores) if coherence_scores else float("nan")

print(len(reviews), "reviews processed")
print(len(real_topic_ids), "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

# %% [markdown]
# ### Topic Coherence
#

# %%
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
# This will most likely crash your PC
this_will_crash_your_pc_are_you_sure = False
if this_will_crash_your_pc_are_you_sure:
    # Preprocess Documents
    # Use the model's current assignments so the grouping matches the topics
    # whose words are evaluated below (they change when topics are merged).
    documents = pd.DataFrame(
        {
            "Document": reviews,
            "ID": range(len(reviews)),
            "Topic": topic_model.topics_,
        }
    )
    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Build the list from scratch: appending to the list filled by the
    # semantic-coherence cell would hand every topic to gensim twice.
    topic_words = [
        [word for word, _ in topic_model.get_topic(topic_id) if word]
        for topic_id in topic_model.get_topic_info()["Topic"]
        if topic_id >= 0  # Skip outlier topic
    ]

    # %env TOKENIZERS_PARALLELISM=false

    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=tokens,
            corpus=corpus,
            dictionary=dictionary,
            coherence=measurement,
        )
        coherence_score = coherence_model.get_coherence()
        print(f"Coherence ({measurement}): {coherence_score:.4f}")

# %% [markdown]
# ### Term Search
#

# %%
search_term = "lempuyang"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for similar_topic, similarity in zip(similar_topics, similarities):
    # .iloc[0] takes the first row by position, whatever the frame's index is
    custom_name = topic_model.get_topic_info(similar_topic)["CustomName"].iloc[0]
    print(f"{str(similarity)[:5]} {custom_name}")

# %%
# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
# Calculate the topic distributions on a token-level
if CALCULATE_TOKEN_DISTRIBUTIONS:
    topic_distr, topic_token_distr = topic_model.approximate_distribution(
        reviews, calculate_tokens=True, use_embedding_model=True
    )

# %%
# Visualize the token-level distributions
if CALCULATE_TOKEN_DISTRIBUTIONS:
    DOC_INDEX = 1
    df = topic_model.visualize_approximate_distribution(
        reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
    )
    df

# %% [markdown]
# ### Topic Hierarchy
#

# %%
topic_model.visualize_hierarchy(custom_labels=True)

# %% [markdown]
# ### Intertopic Distance Map
#

# %%
topic_model.visualize_topics(use_ctfidf=True)

# %% [markdown]
# ### Topic Word Scores
#

# %%
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)