mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2025-12-07 02:30:50 +01:00
Restructure
316 top2vec/nb_top2vec.py Normal file
@@ -0,0 +1,316 @@
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
import pickle
import re

import numpy as np
import pandas as pd
import spacy
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from top2vec import Top2Vec
from tqdm.notebook import tqdm


# %% [markdown]
# ### Parameters and Tracking
#

# %%
PROCESS_DATA = False  # True: re-run text cleaning; False: load cached pickle
RECALCULATE_COHERENCE_PARTS = False  # True: rebuild tokens/dictionary/corpus for coherence
RECREATE_MODEL = True  # True: train Top2Vec from scratch; False: load pickled model

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
reviews = (
    pd.read_csv("data.tab", sep="\t").review.dropna().to_list()
)  # .sample(5_000, random_state=42)

print(f"Loaded {len(reviews)} reviews")

# %%
# Literal cleanup rules: escaped newlines and quotes left over from the raw
# export, frequent misspellings, and "bali" itself (too common to be
# informative for topic separation).
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
    "bali": "",
}
rep = {re.escape(k): v for k, v in rep.items()}
pattern = re.compile("|".join(rep.keys()))


def preprocess(text):
    text = text.strip()
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
    # Collapse whitespace runs separately: inside the re.escape()d table
    # above, a key like r"\s+" would only ever match the literal characters
    # "\s+", never actual whitespace.
    text = re.sub(r"\s+", " ", text)
    return text


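# %% [markdown]
# Quick sanity check of the cleaning rules on a made-up review fragment
# (the sample string is illustrative, not from the dataset):

# %%
print(preprocess('Verry nice!\\n We saw a mongkey near "the" temple'))
# expected: 'very nice! we saw a monkey near the temple'
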
# %%
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]

    with open("processed_texts_top2vec.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("processed_texts_top2vec.pkl", "rb") as f:
        reviews = pickle.load(f)
    # Older caches stored token lists; join them back into plain strings.
    reviews = [
        " ".join(review) if isinstance(review, list) else review
        for review in reviews
    ]

print(f"Processed {len(reviews)} reviews")
print(reviews[:1])

# %% [markdown]
# ## Model Creation
#

# %%
if RECREATE_MODEL:
    hdbscan_args = {
        "min_cluster_size": 200,  # smallest document cluster accepted as a topic
        "min_samples": 25,  # higher values label more documents as noise
        "metric": "euclidean",
        "cluster_selection_method": "eom",
    }
    umap_args = {
        "n_neighbors": 15,
        "n_components": 2,  # 2-D embedding (Top2Vec's default is 5)
        "min_dist": 0.01,
        "metric": "cosine",
        "random_state": 42,  # fixed seed for reproducible clustering
        "low_memory": True,
    }

    model = Top2Vec(
        reviews,
        workers=8,
        hdbscan_args=hdbscan_args,
        umap_args=umap_args,
        min_count=1,  # keep even words that occur only once
    )

    with open("./top2vec/model.pkl", "wb") as f:
        pickle.dump(model, f)
else:
    with open("./top2vec/model.pkl", "rb") as f:
        model = pickle.load(f)

print(f"\nNumber of topics found: {model.get_num_topics()}")

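# %% [markdown]
# If the clustering yields more topics than are practical to interpret,
# Top2Vec can merge them after training via hierarchical topic reduction
# (optional sketch; the target of 20 topics is an arbitrary example):

# %%
# model.hierarchical_topic_reduction(num_topics=20)
# print(f"Reduced topics: {model.get_num_topics(reduced=True)}")
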
# %% [markdown]
# ## Results
#

# %% [markdown]
# ### Coherence
#

# %%
topic_words = model.get_topics()[0]
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    # Mean pairwise similarity over distinct word pairs: zero the diagonal
    # and divide by n * (n - 1) rather than n**2, so the self-similarities
    # neither inflate nor dilute the score.
    np.fill_diagonal(sim_matrix, 0)
    mean_sim = sim_matrix.sum() / (len(words) * (len(words) - 1))
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

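# %% [markdown]
# For context, the same similarity measure over ten random topic-vocabulary
# words gives a rough lower baseline for the score above (sketch; the seed
# and sample size are arbitrary choices):

# %%
rng = np.random.default_rng(42)
all_words = np.unique(np.concatenate(topic_words))
random_words = rng.choice(all_words, size=10, replace=False)
baseline_emb = embedding_model.encode(list(random_words))
baseline_sim = cosine_similarity(baseline_emb)
np.fill_diagonal(baseline_sim, 0)
baseline = baseline_sim.sum() / (len(random_words) * (len(random_words) - 1))
print(f"Random-word baseline coherence: {baseline:.4f}")
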
# %%
# %env TOKENIZERS_PARALLELISM=false
num_words = 10

if RECALCULATE_COHERENCE_PARTS:
    tqdm.pandas()

    docs = model.documents
    doc_topics, _, _, _ = model.get_documents_topics(doc_ids=list(range(len(docs))))

    df = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": doc_topics})

    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    nlp.max_length = 10_000_000

    # Named tokenize (not preprocess) so it does not shadow the cleaning
    # function defined above.
    def tokenize(doc):
        return [
            token.text.lower()
            for token in nlp(doc)
            if token.is_alpha and not token.is_stop
        ]

    topic_words = model.get_topics()[0]
    print(topic_words)

    print("Preprocessing topic documents...")
    df["Tokens"] = df["Document"].progress_apply(tokenize)
    tokens = df["Tokens"]

    print("Creating dictionary...")
    dictionary = corpora.Dictionary(tokens)
    print("Creating corpus...")
    corpus = [dictionary.doc2bow(token_list) for token_list in tokens]

    with open("./top2vec/corpus.pkl", "wb") as f:
        pickle.dump(corpus, f)
    with open("./top2vec/dictionary.pkl", "wb") as f:
        pickle.dump(dictionary, f)
    with open("./top2vec/tokens.pkl", "wb") as f:
        pickle.dump(tokens, f)
else:
    with open("./top2vec/corpus.pkl", "rb") as f:
        corpus = pickle.load(f)
    with open("./top2vec/dictionary.pkl", "rb") as f:
        dictionary = pickle.load(f)
    with open("./top2vec/tokens.pkl", "rb") as f:
        tokens = pickle.load(f)

print("Starting coherence evaluation...")
# NOTE: assumes every scored topic word survives the token filter above;
# gensim raises if a topic word is missing from the dictionary.
for measure in ["c_v", "u_mass", "c_uci", "c_npmi"]:
    cm = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=measure,
        topn=num_words,
    )
    score = cm.get_coherence()
    print(f"Coherence ({measure}): {score:.4f}")

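# %% [markdown]
# `CoherenceModel` can also report a score per topic, which helps to spot
# individual weak topics (sketch; reuses `cm`, i.e. the last instance from
# the loop above, so the values are c_npmi scores):

# %%
per_topic = cm.get_coherence_per_topic()
for i, score in sorted(enumerate(per_topic), key=lambda t: t[1])[:5]:
    print(f"Topic {i}: {score:.4f}")
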
# %% [markdown]
# ### Topic List
#

# %%
# get_topics returns (topic_words, word_scores, topic_nums)
topics, word_scores, topic_nums = model.get_topics()

for i, topic_words in enumerate(topics):
    print(f"Topic {topic_nums[i]}: {' | '.join(topic_words)}")

# %% [markdown]
# ### Search by term
#

# %%
search_term = "monkey"

print(f"\nSearching for topics related to '{search_term}':")
num_topics = min(model.get_num_topics(), 10)
# search_topics already returns the matching topic numbers as its fourth
# value, so they need not be re-derived by subset-matching each result
# against get_topics().
topic_words, _, topic_scores, found_topic_nums = model.search_topics(
    keywords=[search_term], num_topics=num_topics
)

for words, topic_num in zip(topic_words, found_topic_nums):
    print(f"Topic {topic_num}: {' | '.join(words)}")

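# %% [markdown]
# Top2Vec can also return the nearest neighbours of the query in its word
# vector space, which shows what the model treats as semantically close to
# the search term (sketch using `Top2Vec.similar_words`):

# %%
words, word_sims = model.similar_words(keywords=[search_term], num_words=10)
for word, sim in zip(words, word_sims):
    print(f"{word}: {sim:.3f}")
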
# %% [markdown]
# ### Search by topic ID
#

# %%
topic_id = 0

print(f"Topic {topic_id}:")
print("Top words:", " | ".join(topics[topic_id]))

docs, doc_scores, doc_ids = model.search_documents_by_topic(
    topic_num=topic_id, num_docs=15
)
for i, doc in enumerate(docs):
    print(f"Doc {i+1} (Score: {doc_scores[i]:.2f}): {doc}")

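# %% [markdown]
# Documents can likewise be retrieved by keyword instead of topic number
# (sketch using `Top2Vec.search_documents_by_keywords`; output truncated to
# 120 characters for readability):

# %%
kw_docs, kw_scores, kw_ids = model.search_documents_by_keywords(
    keywords=[search_term], num_docs=5
)
for i, doc in enumerate(kw_docs):
    print(f"Doc {kw_ids[i]} (Score: {kw_scores[i]:.2f}): {doc[:120]}")
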
# %% [markdown]
# ### Topic Map
#

# %%
import plotly.express as px
from umap import UMAP

# Get topic metadata
topic_vectors = model.topic_vectors
topic_words = model.get_topics()[0]
# get_topic_sizes returns sizes first, then topic numbers
topic_sizes, topic_nums = model.get_topic_sizes()

# Reduce topic vectors to 2D using UMAP
# (UMAP truncates n_neighbors itself if there are fewer than 15 topics)
umap_model = UMAP(n_neighbors=15, n_components=2, metric="cosine", random_state=42)
topic_coords = umap_model.fit_transform(topic_vectors)

# Ensure all components are 1D lists
topic_nums = list(topic_nums)
topic_sizes = list(topic_sizes)
topic_labels = [" | ".join(words[:5]) for words in topic_words]

# Build DataFrame
df = pd.DataFrame(
    {
        "x": topic_coords[:, 0],
        "y": topic_coords[:, 1],
        "Topic Number": topic_nums,
        "Size": topic_sizes,
        "Top Words": topic_labels,
    }
)

# Plot using Plotly
fig = px.scatter(
    df,
    x="x",
    y="y",
    size="Size",
    text="Topic Number",
    hover_data={"Top Words": True, "Size": True, "x": False, "y": False},
    title="Top2Vec Topic Visualization (2D)",
)
fig.update_traces(textposition="top center")
fig.show()