# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
from gensim import corpora
from gensim.models import CoherenceModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from top2vec import Top2Vec
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import pickle
import re
import spacy

# %% [markdown]
# ### Parameters and Tracking
#

# %%
PROCESS_DATA = False
RECALCULATE_COHERENCE_PARTS = False
RECREATE_MODEL = True

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
reviews = (
    pd.read_csv("data.tab", sep="\t").review.dropna().to_list()
)  # .sample(5_000, random_state=42)
print("Loaded {} reviews".format(len(reviews)))

# %%
# Literal substitutions: strip escaped-newline and quote debris, fix common
# misspellings, and drop "bali", which appears in nearly every review and
# carries no topical signal.
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
    "bali": "",
}
rep = {re.escape(k): v for k, v in rep.items()}
pattern = re.compile("|".join(rep.keys()))


def preprocess(text):
    text = text.strip().lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
    # Collapse whitespace in a separate pass: re.escape() would neutralize
    # the \s+ pattern if it were kept in the literal table above.
    return re.sub(r"\s+", " ", text)


# %%
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]
    with open("processed_texts_top2vec.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("processed_texts_top2vec.pkl", "rb") as f:
        reviews = pickle.load(f)
    reviews = [
        " ".join(review) if isinstance(review, list) else review
        for review in reviews
    ]
print("Processed {} reviews".format(len(reviews)))
print(reviews[:1])

# %% [markdown]
# ## Model Creation
#

# %%
if RECREATE_MODEL:
    hdbscan_args = {
        "min_cluster_size": 200,
        "min_samples": 25,
        "metric": "euclidean",
        "cluster_selection_method": "eom",
    }
    umap_args = {
        "n_neighbors": 15,
        "n_components": 2,
        "min_dist": 0.01,
        "metric": "cosine",
        "random_state": 42,
        "low_memory": True,
    }
    model = Top2Vec(
        reviews,
        workers=8,
        hdbscan_args=hdbscan_args,
        umap_args=umap_args,
        min_count=1,
    )
    with open("./top2vec/model.pkl", "wb") as f:
        pickle.dump(model, f)
else:
    with open("./top2vec/model.pkl", "rb") as f:
        model = pickle.load(f)

print(f"\nNumber of topics found: {model.get_num_topics()}")

# %% [markdown]
# ## Results
#

# %% [markdown]
# ### Coherence
#

# %%
topic_words = model.get_topics()[0]
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    np.fill_diagonal(sim_matrix, 0)
    # Average over off-diagonal entries only; including the zeroed diagonal
    # would bias the mean downward.
    n = len(words)
    coherence_scores.append(sim_matrix.sum() / (n * (n - 1)))

overall_coherence = np.mean(coherence_scores)
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
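# %% [markdown]
# The embedding-based score above is a single corpus-wide average, which can
# hide individual weak clusters. The next cell is a minimal sketch (reusing
# `coherence_scores` and `topic_words` from the cell above; the column names
# are illustrative, not part of any library API) that ranks topics from least
# to most internally coherent:

# %%
per_topic = pd.DataFrame(
    {
        "topic": range(len(coherence_scores)),
        "top_words": [" | ".join(words[:5]) for words in topic_words],
        "bert_coherence": coherence_scores,
    }
).sort_values("bert_coherence")

# Topics whose top words are least similar to each other surface first;
# these are the prime candidates for merging or manual inspection.
print(per_topic.head(10).to_string(index=False))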
# %%
# %env TOKENIZERS_PARALLELISM=false

num_words = 10

if RECALCULATE_COHERENCE_PARTS:
    tqdm.pandas()
    docs = model.documents
    doc_topics, _, _, _ = model.get_documents_topics(doc_ids=list(range(len(docs))))
    df = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": doc_topics})
    documents_per_topic = df.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )

    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    nlp.max_length = 10_000_000

    # Named tokenize rather than preprocess so it does not shadow the
    # review-cleaning function defined earlier.
    def tokenize(doc):
        return [
            token.text.lower()
            for token in nlp(doc)
            if token.is_alpha and not token.is_stop
        ]

    topic_words = model.get_topics()[0]
    print(topic_words)

    print("Preprocessing topic documents...")
    df["Tokens"] = df["Document"].progress_apply(tokenize)
    tokens = df["Tokens"]

    print("Creating dictionary...")
    dictionary = corpora.Dictionary(tokens)
    print("Creating corpus...")
    corpus = [dictionary.doc2bow(token_list) for token_list in tokens]

    with open("./top2vec/corpus.pkl", "wb") as f:
        pickle.dump(corpus, f)
    with open("./top2vec/dictionary.pkl", "wb") as f:
        pickle.dump(dictionary, f)
    with open("./top2vec/tokens.pkl", "wb") as f:
        pickle.dump(tokens, f)
else:
    with open("./top2vec/corpus.pkl", "rb") as f:
        corpus = pickle.load(f)
    with open("./top2vec/dictionary.pkl", "rb") as f:
        dictionary = pickle.load(f)
    with open("./top2vec/tokens.pkl", "rb") as f:
        tokens = pickle.load(f)

print("Starting coherence evaluation...")
for measure in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    cm = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=measure,
        topn=num_words,
    )
    score = cm.get_coherence()
    print(f"Coherence ({measure}): {score:.4f}")

# %% [markdown]
# ### Topic List
#

# %%
topics, word_scores, topic_nums = model.get_topics()
for num, words in zip(topic_nums, topics):
    print(f"Topic {num}: {' | '.join(words)}")

# %% [markdown]
# ### Search by Term
#

# %%
search_term = "monkey"
print(f"\nSearching for topics related to '{search_term}':")
num_topics = min(model.get_num_topics(), 10)

# search_topics already returns the matching topic numbers and scores, so
# there is no need to re-derive them by scanning get_topics().
topic_words, _, topic_scores, found_nums = model.search_topics(
    keywords=[search_term], num_topics=num_topics
)
for words, score, num in zip(topic_words, topic_scores, found_nums):
    print(f"Topic {num} (score: {score:.2f}): {' | '.join(words)}")

# %% [markdown]
# ### Search by Topic ID
#

# %%
topic_id = 0
print(f"Topic {topic_id}:")
print("Top words:", " | ".join(topics[topic_id]))

docs, doc_scores, doc_ids = model.search_documents_by_topic(
    topic_num=topic_id, num_docs=15
)
for i, doc in enumerate(docs):
    print(f"Doc {i + 1} (Score: {doc_scores[i]:.2f}): {doc}")

# %% [markdown]
# ### Topic Visualization
#

# %%
import plotly.express as px
from umap import UMAP

# Get topic metadata. Note that get_topic_sizes() returns sizes first,
# then topic numbers.
topic_vectors = model.topic_vectors
topic_words = model.get_topics()[0]
topic_sizes, topic_nums = model.get_topic_sizes()

# Reduce topic vectors to 2D with UMAP
umap_model = UMAP(n_neighbors=15, n_components=2, metric="cosine", random_state=42)
topic_coords = umap_model.fit_transform(topic_vectors)

# Ensure all components are 1D lists
topic_nums = list(topic_nums)
topic_sizes = list(topic_sizes)
topic_labels = [" | ".join(words[:5]) for words in topic_words]

# Build the plotting DataFrame (df_plot, so the coherence df is not clobbered)
df_plot = pd.DataFrame(
    {
        "x": topic_coords[:, 0],
        "y": topic_coords[:, 1],
        "Topic Number": topic_nums,
        "Size": topic_sizes,
        "Top Words": topic_labels,
    }
)

# Plot with Plotly
fig = px.scatter(
    df_plot,
    x="x",
    y="y",
    size="Size",
    text="Topic Number",
    hover_data={"Top Words": True, "Size": True, "x": False, "y": False},
    title="Top2Vec Topic Visualization (2D)",
)
fig.update_traces(textposition="top center")
fig.show()
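# %% [markdown]
# If the map above shows several near-duplicate topics sitting close together,
# Top2Vec can merge them via hierarchical topic reduction. A minimal sketch:
# the target of 20 topics is an arbitrary illustration value, not a tuned
# choice, and it must stay below the current topic count.

# %%
n_reduced = min(20, model.get_num_topics() - 1)  # assumed target; adjust to taste
model.hierarchical_topic_reduction(num_topics=n_reduced)

# The reduced topics are stored alongside the originals; query methods such
# as get_topics() accept reduced=True to use them instead.
reduced_words, _, reduced_nums = model.get_topics(reduced=True)
for num, words in zip(reduced_nums, reduced_words):
    print(f"Reduced topic {num}: {' | '.join(words[:10])}")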