Mirror of https://github.com/marvinscham/masterthesis-playground.git (synced 2025-12-06 18:20:53 +01:00)
Restructure
bertopic/bertopic_autotune.py (new file, 160 lines)
@@ -0,0 +1,160 @@
import json
import traceback

import numpy as np
import pandas as pd
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from umap import UMAP

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

param_grid = {
    "nr_topics": [45, 50, 55],
    "min_topic_size": [30, 40, 50],
    "n_gram_max": [3],
    "min_document_frequency": [1, 2],
    "n_neighbors": [15],
    "n_components": [2],
    "min_dist": [0.1],
    "top_n_words": [10],
}


def calculate_metrics(topic_model, embedder, top_n_words=5):
    # Get topic words
    topic_words = []
    for topic_id in range(len(topic_model.get_topic_info()) - 1):
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words[:top_n_words])

    # Coherence
    coherence_scores = []
    for words in topic_words:
        embeddings = embedder.encode(words)
        sim_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(sim_matrix, 0)
        coherence_scores.append(np.mean(sim_matrix))
    overall_coherence = np.mean(coherence_scores)

    # Diversity
    all_topic_words = [word for topic in topic_words for word in topic]
    diversity = len(set(all_topic_words)) / len(all_topic_words)

    # Inter-topic distance
    topic_embeddings = [
        np.mean(embedder.encode(words), axis=0) for words in topic_words
    ]
    topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
    avg_distance = np.mean(topic_distance[np.triu_indices_from(topic_distance, k=1)])

    res = {
        "coherence": float(str(overall_coherence)[:6]),
        "diversity": float(str(diversity)[:6]),
        "inter_topic_distance": float(str(avg_distance)[:6]),
        "combined_score": float(
            str(0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance)[:6]
        ),
    }
    print(res)
    return res
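
# The combined score above weights semantic coherence most heavily:
# combined = 0.6 * coherence + 0.2 * diversity + 0.2 * inter_topic_distance.
# With hypothetical values coherence=0.45, diversity=0.80, distance=0.60 this
# gives 0.6 * 0.45 + 0.2 * 0.80 + 0.2 * 0.60 = 0.55.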


def auto_tune_bertopic(texts, embedding_model, param_grid):
    best_score = -1
    best_params = None
    best_model = None
    history = []

    print("Starting auto-tuning of BERTopic...")
    print(f"Number of reviews: {len(texts)}")

    print("Running embedding model...")
    embedder = SentenceTransformer(embedding_model)
    embeddings = embedder.encode(texts, show_progress_bar=True)

    # Convert param_grid to list for sampling
    print("Generating parameter combinations...")
    param_list = list(ParameterGrid(param_grid))

    print(f"Total parameter combinations: {len(param_list)}")
    for params in param_list:
        try:
            print(f"Testing params: {params}")
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            vectorizer_model = CountVectorizer(
                stop_words="english",
                min_df=params["min_document_frequency"],
                ngram_range=(1, params["n_gram_max"]),
            )

            representation_model = KeyBERTInspired()

            umap_model = UMAP(
                n_neighbors=params["n_neighbors"],
                n_components=params["n_components"],
                min_dist=params["min_dist"],
                metric="cosine",
                low_memory=True,
                random_state=42,
            )
            hdbscan_model = HDBSCAN(
                min_cluster_size=params["min_topic_size"],
                metric="euclidean",
                cluster_selection_method="eom",
                gen_min_span_tree=True,
                prediction_data=True,
            )

            model = BERTopic(
                embedding_model=embedding_model,
                ctfidf_model=ctfidf_model,
                vectorizer_model=vectorizer_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                representation_model=representation_model,
                verbose=True,
                calculate_probabilities=True,
                language="english",
                top_n_words=params["top_n_words"],
                nr_topics=params["nr_topics"],
            )
            topics, _ = model.fit_transform(texts, embeddings)

            metrics = calculate_metrics(model, embedder)
            history.append({"params": params, "metrics": metrics})

            with open("history.json", "w") as f:
                json.dump(history, f, indent=2)

            if metrics["combined_score"] > best_score:
                best_score = metrics["combined_score"]
                best_params = params
                best_model = model

        except Exception as e:
            print(f"Failed with params {params}: {str(e)}")
            traceback.print_exc()
            continue

    return best_model, best_params, best_score, history


SPECIAL_CHARS = ["\n", "\\n"]
MIN_REVIEW_WORDS = 5

reviews = pd.read_csv("data.tab", sep="\t").review.to_list()

for schar in SPECIAL_CHARS:
    reviews = [
        review.replace(schar, " ") if isinstance(review, str) else review
        for review in reviews
    ]
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
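A note on scale: the grid above expands to 18 fits (3 values for nr_topics x 3 for min_topic_size x 2 for min_document_frequency, everything else fixed), and the sentence embeddings are computed only once while each fit re-runs UMAP and HDBSCAN. A quick way to confirm the count before starting a longer search, assuming the param_grid defined in this file:

from sklearn.model_selection import ParameterGrid

# 3 * 3 * 2 = 18 combinations with the grid defined above
print(len(list(ParameterGrid(param_grid))))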
bertopic/bertopic_autotune_sorter.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import json

import matplotlib.pyplot as plt

with open("history.json", "r") as f:
    history = json.load(f)

history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)

with open("history_sorted.json", "w") as f:
    json.dump(history, f, indent=2)


# Extract combined scores
scores = [item["metrics"]["combined_score"] for item in history]

# Plot histogram
plt.hist(scores, bins=20, edgecolor="black")
plt.title("Distribution of Combined Scores")
plt.xlabel("Combined Score")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.savefig("combined_score_distribution.png")
plt.close()
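Since history_sorted.json is ordered by combined_score in descending order, the best run is simply the first entry. A minimal sketch for pulling it out, assuming the {"params": ..., "metrics": ...} layout written by bertopic_autotune.py:

import json

with open("history_sorted.json", "r") as f:
    best = json.load(f)[0]

print("Best parameters:", best["params"])
print("Combined score:", best["metrics"]["combined_score"])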
bertopic/nb_bertopic.py (new file, 569 lines)
@@ -0,0 +1,569 @@
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
import gensim.corpora as corpora
import json
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import pickle

nlp = spacy.load("en_core_web_sm")

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# %% [markdown]
# ### Parameters and Tracking
#

# %%
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = True
PROCESS_DATA = False
REDUCE_OUTLIERS = True
USE_CONDENSED_MODEL = False

DATA_SAMPLE_SIZE = -1  # -1 for all data

# Classical coherence score. Warning: needs swap to not kill your PC
CALCULATE_COHERENCE = False

# Vectorization
MIN_DOCUMENT_FREQUENCY = 1
MAX_NGRAM = 2

# HDBSCAN Parameters
MIN_TOPIC_SIZE = 200
MIN_SAMPLES = 25

# UMAP Parameters
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.01

# Topic Modeling
TOP_N_WORDS = 10
MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
if DATA_SAMPLE_SIZE != -1:
    reviews = (
        pd.read_csv("../data/original/reviews.tab", sep="\t")
        .sample(n=DATA_SAMPLE_SIZE)
        .review.dropna()
        .to_list()
    )
else:
    reviews = (
        pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list()
    )

print("Loaded {} reviews".format(len(reviews)))

# %%
# List of NE in Bali for NER enhancement
with open("../data/supporting/bali_ner.json", "r") as f:
    bali_places = json.load(f)
bali_places_set = set(bali_places)

# Stop word definition
extra_stopwords = ["bali", "idr", "usd"]
stop_words = set(stopwords.words("english"))
with open("../data/supporting/stopwords-en.json", "r") as f:
    extra_stopwords.extend(json.load(f))

# Custom replacements
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))

lemmatizer = WordNetLemmatizer()


def preprocess(text):
    # Step 1: Apply custom replacements (typos, special cases)
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    # Step 2: Clean text
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    doc = nlp(text)

    # Step 3: POS tagging and filtering
    filtered_tokens = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"}
        or token.ent_type_ in {"GPE", "LOC", "FAC"}
        or token.text in bali_places_set
    ]

    # Step 4: Lemmatization and stopword removal
    lemmatized_tokens = [
        lemmatizer.lemmatize(w)
        for w in filtered_tokens
        if w not in stop_words and w not in extra_stopwords and len(w) > 2
    ]

    return lemmatized_tokens
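
# In short, preprocess() keeps only tokens tagged NOUN/PROPN, tokens carrying a
# GPE/LOC/FAC entity type, or tokens listed in bali_ner.json, then lemmatizes
# them and drops English stopwords, the extra stopwords defined above, and any
# token shorter than three characters; each review becomes a list of tokens.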

# %%
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]

    with open("../data/intermediate/processed_texts.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("../data/intermediate/processed_texts.pkl", "rb") as f:
        reviews = pickle.load(f)
    reviews = [
        " ".join(review) if isinstance(review, list) else review
        for review in reviews
    ]

print(reviews[:1])

# %% [markdown]
# ### Pre-calculate Embeddings
#

# %%
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews, show_progress_bar=True)

# %% [markdown]
# ## Model Creation
#

# %% [markdown]
# ### Dimensionality Reduction (UMAP)
#

# %%
umap_model = UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=N_COMPONENTS,
    min_dist=MIN_DIST,
    metric="cosine",
    low_memory=True,
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

# %% [markdown]
# ### BERTopic Model Creation
#

# %%
if RECREATE_MODEL:
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(
        min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM)
    )

    representation_model = KeyBERTInspired()
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,
    )

    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)

    topic_labels = topic_model.generate_topic_labels(
        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
    )
    topic_model.set_topic_labels(topic_labels)
    BERTopic.save(topic_model, "output/model.bertopic")
else:
    print("Nevermind, loading existing model")
    topic_model = BERTopic.load("output/model.bertopic")

# %% [markdown]
# ## Fine Tuning
#
# ### Topic Condensation
#

# %%
if RECREATE_REDUCED_MODEL:
    done = False
    iteration = 1
    while not done:
        print(f"Iteration {iteration}")
        iteration += 1
        similarity_matrix = cosine_similarity(
            np.array(topic_model.topic_embeddings_)[1:, :]
        )
        nothing_to_merge = True

        for i in range(similarity_matrix.shape[0]):
            for j in range(i + 1, similarity_matrix.shape[1]):
                sim = similarity_matrix[i, j]
                if sim > 0.9:
                    nothing_to_merge = False
                    t1, t2 = i, j
                    try:
                        t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
                        t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
                        print(
                            f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
                        )
                        topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])

                        topic_labels = topic_model.generate_topic_labels(
                            nr_words=3,
                            topic_prefix=True,
                            word_length=15,
                            separator=" - ",
                        )
                        topic_model.set_topic_labels(topic_labels)
                    except Exception as e:
                        print(f"Failed to merge {t1} and {t2}: {e}")
        if nothing_to_merge:
            print("No more topics to merge.")
            done = True

    # BERTopic.save(topic_model, "bertopic/model_reduced.bertopic")
elif USE_CONDENSED_MODEL:
    print("Nevermind, loading existing reduced model")
    topic_model = BERTopic.load("bertopic/model_reduced.bertopic")
else:
    print("Skipping topic reduction")

# %% [markdown]
# ### Outlier Reduction
#

# %%
if REDUCE_OUTLIERS:
    new_topics = topic_model.reduce_outliers(
        reviews,
        topic_model.topics_,
        probabilities=topic_model.probabilities_,
        threshold=0.05,
        strategy="probabilities",
    )
    topic_model.update_topics(reviews, topics=new_topics)

# %% [markdown]
# ## Results
#
# ### Classification
#

# %%
from pathlib import Path
import random

# --- config ---
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
OUTPUT_DIR = Path("../raft/corpus")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

BATCH_SIZE = 60
MIN_CHARS = 40
SEED = 42

# --- load data ---
data = pd.read_csv(INPUT_PATH, sep="\t")

# If you already have `reviews` elsewhere, replace the next line with that variable
reviews = data["review"].astype(str).fillna("")

# Topic model document info
df = topic_model.get_document_info(reviews)  # assumes your model is already fitted
df["Original"] = reviews.values

# --- filter by topics and length ---
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
filtered["Original"] = filtered["Original"].str.strip()
filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]

# Save an audit CSV
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)

# --- deterministic shuffle + write batched corpus files ---
total_files = 0
total_reviews = 0
rng = random.Random(SEED)

for topic_val, g in filtered.groupby("Topic", sort=True):
    reviews_list = g["Original"].tolist()

    # deterministic shuffle within topic
    rng.shuffle(reviews_list)

    # chunk into batches of up to 60
    for start in range(0, len(reviews_list), BATCH_SIZE):
        chunk = reviews_list[start : start + BATCH_SIZE]
        if not chunk:
            continue

        # simple header for traceability
        header = (
            f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
        )

        lines = [header, ""]
        for i, txt in enumerate(chunk, 1):
            lines.append(f"({i}) {txt}")

        part_idx = start // BATCH_SIZE + 1
        fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
        (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")

        total_files += 1
        total_reviews += len(chunk)

print(
    f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
)
print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")

# %%
doc_topic_matrix = probs

# column names
topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]

# index names
docnames = ["Review " + str(i) for i in range(len(reviews))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(
    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
)

# Get dominant topic for each document
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
df_document_topic["dominant_topic"] = dominant_topic


# Styling
def color_stuff(val):
    if val > 0.1:
        color = "green"
    elif val > 0.05:
        color = "orange"
    else:
        color = "grey"
    return "color: {col}".format(col=color)


def make_bold(val):
    weight = 700 if val > 0.1 else 400
    return "font-weight: {weight}".format(weight=weight)


# Apply Style
df_document_topics = (
    df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
)
df_document_topics

# %% [markdown]
# ### Document Visualization
#

# %%
vis = topic_model.visualize_documents(
    docs=reviews,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)
vis.write_html("output/visualization.html")
vis

# %% [markdown]
# ### Similarity Matrix
#

# %%
topic_model.visualize_heatmap()

# %% [markdown]
# ### Topic Info
#

# %%
topic_model.get_topic_info()

# %% [markdown]
# ### Semantic Coherence
#

# %%
topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1):
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)

# Compute mean pairwise cosine similarity for each topic
coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
    mean_sim = np.mean(sim_matrix)
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(len(reviews), "reviews processed")
print(len(topic_model.get_topic_info()) - 1, "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

# %% [markdown]
# ### Topic Coherence
#

# %%
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389

if CALCULATE_COHERENCE:
    # Preprocess Documents
    documents = pd.DataFrame(
        {"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
    )
    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names_out()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [
        [words for words, _ in topic_model.get_topic(topic)]
        for topic in range(len(set(topics)) - 1)
    ]

    # %env TOKENIZERS_PARALLELISM=false

    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=tokens,
            corpus=corpus,
            dictionary=dictionary,
            coherence=measurement,
        )
        coherence_score = coherence_model.get_coherence()
        print(f"Coherence ({measurement}): {coherence_score:.4f}")
else:
    print("Skipping classical coherence calculation")

# %% [markdown]
# ### Term Search
#

# %%
search_term = "uluwatu"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for i in range(len(similar_topics)):
    # \n{topic_model.get_topic(similar_topics[i])}\n
    print(
        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])['CustomName'][0]}"
    )

# %% [markdown]
# ### Topic Hierarchy
#

# %%
topic_model.visualize_hierarchy(custom_labels=True)

# %% [markdown]
# ### Intertopic Distance Map
#

# %%
topic_model.visualize_topics()

# %% [markdown]
# ### Topic Word Scores
#

# %%
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
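One natural follow-up not covered in this notebook: the fitted model can assign the learned topics to unseen reviews via BERTopic's transform(). A rough sketch, with made-up example documents:

new_reviews = [
    "The rice terraces were stunning but very crowded in the afternoon.",
    "Our driver was friendly and the temple visit was well organised.",
]

new_topics, new_probs = topic_model.transform(new_reviews)
for doc, topic in zip(new_reviews, new_topics):
    # topic -1 means the document was left as an outlier
    print(topic, [word for word, _ in topic_model.get_topic(topic)][:5], doc[:40])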
bertopic/nb_bertopic_lowprep.py (new file, 585 lines)
@@ -0,0 +1,585 @@
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import pickle

nlp = spacy.load("en_core_web_sm")

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# %% [markdown]
# ### Parameters and Tracking
#

# %%
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = True
PROCESS_DATA = False
REDUCE_OUTLIERS = False

# Data Sample Size, -1 for all data
DATA_SAMPLE_SIZE = -1

# Vectorization
MIN_DOCUMENT_FREQUENCY = 1
MAX_NGRAM = 3

# HDBSCAN Parameters
MIN_TOPIC_SIZE = 200
MIN_SAMPLES = 25

# UMAP Parameters
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.01

# Topic Modeling
TOP_N_WORDS = 10
MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip

tracking = {
    "input": {
        "min_document_frequency": MIN_DOCUMENT_FREQUENCY,
        "max_ngram": MAX_NGRAM,
        "min_topic_size": MIN_TOPIC_SIZE,
        "min_samples": MIN_SAMPLES,
        "n_neighbors": N_NEIGHBORS,
        "n_components": N_COMPONENTS,
        "min_dist": MIN_DIST,
        "top_n_words": TOP_N_WORDS,
        "max_topics": MAX_TOPICS,
    },
}

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
if DATA_SAMPLE_SIZE == -1:
    reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
else:
    reviews = (
        pd.read_csv("../data/original/reviews.tab", sep="\t")
        .sample(n=DATA_SAMPLE_SIZE)
        .review.to_list()
    )

print("Loaded {} reviews".format(len(reviews)))

# %%
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
    "bali": "",
    r"\s+": " ",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))


def preprocess(text):
    text = text.strip()
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
    return text


# %%
print(
    preprocess(
        "Excellent. Definitely worth coming while in bali. Food and people were very nice.\n🌟 🤩 ⭐️ \nTrisna was our host"
    )
)

# %%
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]

    with open("../data/intermediate/processed_texts_lowprep.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("../data/intermediate/processed_texts_lowprep.pkl", "rb") as f:
        reviews = pickle.load(f)

print(reviews[:1])

# %% [markdown]
# ### Pre-calculate Embeddings
#

# %%
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews, show_progress_bar=True)

# %% [markdown]
# ## Model Creation
#

# %% [markdown]
# ### Dimensionality Reduction (UMAP)
#

# %%
umap_model = UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=N_COMPONENTS,
    min_dist=MIN_DIST,
    metric="cosine",
    low_memory=True,
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

# %% [markdown]
# ### BERTopic Model Creation
#

# %%
if RECREATE_MODEL:
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(
        min_df=MIN_DOCUMENT_FREQUENCY,
        ngram_range=(1, MAX_NGRAM),
        stop_words=stopwords.words("english"),
    )

    representation_model = KeyBERTInspired()
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,
    )

    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)

    topic_labels = topic_model.generate_topic_labels(
        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
    )
    topic_model.set_topic_labels(topic_labels)
    # BERTopic.save(topic_model, "bertopic/model.bertopic")
else:
    print("Nevermind, loading existing model")
    # topic_model = BERTopic.load("bertopic/model.bertopic")

# %% [markdown]
# ## Fine Tuning
#
# ### Topic Condensation
#

# %%
if RECREATE_REDUCED_MODEL:
    done = False
    iteration = 1
    while not done:
        print(f"Iteration {iteration}")
        iteration += 1
        similarity_matrix = cosine_similarity(
            np.array(topic_model.topic_embeddings_)[1:, :]
        )
        nothing_to_merge = True

        for i in range(similarity_matrix.shape[0]):
            for j in range(i + 1, similarity_matrix.shape[1]):
                try:
                    sim = similarity_matrix[i, j]
                    if sim > 0.9:
                        nothing_to_merge = False
                        t1, t2 = i, j
                        try:
                            t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
                            t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
                            print(
                                f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
                            )
                            topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])

                            topic_labels = topic_model.generate_topic_labels(
                                nr_words=3,
                                topic_prefix=True,
                                word_length=15,
                                separator=" - ",
                            )
                            topic_model.set_topic_labels(topic_labels)
                            similarity_matrix = cosine_similarity(
                                np.array(topic_model.topic_embeddings_)[1:, :]
                            )
                        except Exception as e:
                            print(f"Failed to merge {t1} and {t2}: {e}")
                except IndexError:
                    pass
        if nothing_to_merge:
            print("No more topics to merge.")
            done = True
else:
    print("Skipping topic reduction")

# %% [markdown]
# ### Outlier Reduction
#

# %%
if REDUCE_OUTLIERS:
    new_topics = topic_model.reduce_outliers(
        reviews,
        topic_model.topics_,
        probabilities=topic_model.probabilities_,
        threshold=0.05,
        strategy="probabilities",
    )
    topic_model.update_topics(reviews, topics=new_topics)

# %% [markdown]
# ## Results
#
# ### Classification
#

# %%
from pathlib import Path
import random

# --- config ---
topics_to_keep = {2, 4, 5, 9, 22, 26}
INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
OUTPUT_DIR = Path("../raft/corpus")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

BATCH_SIZE = 60
MIN_CHARS = 40
SEED = 42

# --- load data ---
data = pd.read_csv(INPUT_PATH, sep="\t")

# If you already have `reviews` elsewhere, replace the next line with that variable
reviews = data["review"].astype(str).fillna("")

# Topic model document info
df = topic_model.get_document_info(reviews)  # assumes your model is already fitted
df["Original"] = reviews.values

# --- filter by topics and length ---
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
filtered["Original"] = filtered["Original"].str.strip()
filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]

# Save an audit CSV
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)

# --- deterministic shuffle + write batched corpus files ---
total_files = 0
total_reviews = 0
rng = random.Random(SEED)

for topic_val, g in filtered.groupby("Topic", sort=True):
    reviews_list = g["Original"].tolist()

    # deterministic shuffle within topic
    rng.shuffle(reviews_list)

    # chunk into batches of up to 60
    for start in range(0, len(reviews_list), BATCH_SIZE):
        chunk = reviews_list[start : start + BATCH_SIZE]
        if not chunk:
            continue

        # simple header for traceability
        header = (
            f"[TOPIC] {topic_val}\n" + f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
        )

        lines = [header, ""]
        for i, txt in enumerate(chunk, 1):
            lines.append(f"({i}) {txt}")

        part_idx = start // BATCH_SIZE + 1
        fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
        (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")

        total_files += 1
        total_reviews += len(chunk)

print(
    f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
)
print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")

# %%
doc_topic_matrix = probs

# column names
topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]

# index names
docnames = ["Review " + str(i) for i in range(len(reviews))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(
    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
)

# Get dominant topic for each document
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
df_document_topic["dominant_topic"] = dominant_topic


# Styling
def color_stuff(val):
    if val > 0.1:
        color = "green"
    elif val > 0.05:
        color = "orange"
    else:
        color = "grey"
    return "color: {col}".format(col=color)


def make_bold(val):
    weight = 700 if val > 0.1 else 400
    return "font-weight: {weight}".format(weight=weight)


# Apply Style
df_document_topics = (
    df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
)
df_document_topics

# %% [markdown]
# ### Document Visualization
#

# %%
vis = topic_model.visualize_documents(
    docs=reviews,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)
vis.write_html("output/visualization.html")
vis

# %% [markdown]
# ### Similarity Matrix
#

# %%
topic_model.visualize_heatmap()

# %% [markdown]
# ### Topic Info
#

# %%
topic_model.get_topic_info()

# %% [markdown]
# ### Semantic Coherence
#

# %%
topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1):
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)

# Compute mean pairwise cosine similarity for each topic
coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)
    np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity
    mean_sim = np.mean(sim_matrix)
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(len(reviews), "reviews processed")
print(len(topic_model.get_topic_info()) - 1, "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

# %% [markdown]
# ### Topic Coherence
#

# %%
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389

# This will most likely crash your PC
this_will_crash_your_pc_are_you_sure = False
if this_will_crash_your_pc_are_you_sure:
    # Preprocess Documents
    documents = pd.DataFrame(
        {"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
    )
    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names_out()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [
        [words for words, _ in topic_model.get_topic(topic)]
        for topic in range(len(set(topics)) - 1)
    ]

    # %env TOKENIZERS_PARALLELISM=false

    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=tokens,
            corpus=corpus,
            dictionary=dictionary,
            coherence=measurement,
        )
        coherence_score = coherence_model.get_coherence()
        print(f"Coherence ({measurement}): {coherence_score:.4f}")

# %% [markdown]
# ### Term Search
#

# %%
search_term = "uluwatu"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for i in range(len(similar_topics)):
    # \n{topic_model.get_topic(similar_topics[i])}\n
    print(
        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])['CustomName'][0]}"
    )

# %% [markdown]
# ### Topic Hierarchy
#

# %%
topic_model.visualize_hierarchy(custom_labels=True)

# %% [markdown]
# ### Intertopic Distance Map
#

# %%
topic_model.visualize_topics(use_ctfidf=True)

# %% [markdown]
# ### Topic Word Scores
#

# %%
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)

# %%
# from matplotlib import pyplot as plt
# from sklearn.manifold import TSNE


# topics = topic_model.topics_

# # Reduce dimensionality with TSNE
# tsne = TSNE(n_components=2, random_state=42)
# embeddings_2d = tsne.fit_transform(embeddings)

# # Prepare colors (assign a color to each topic)
# unique_topics = set(topics)
# colors = plt.get_cmap("tab20", len(unique_topics))

# # Plot
# plt.figure(figsize=(12, 8))
# for topic in unique_topics:
#     # Select indices for the current topic
#     indices = [i for i, t in enumerate(topics) if t == topic]

#     # Get 2D points for these indices
#     x = embeddings_2d[indices, 0]
#     y = embeddings_2d[indices, 1]

#     # Assign label (exclude outliers)
#     label = f"Topic {topic}" if topic != -1 else "Outliers"

#     # Plot with color
#     plt.scatter(x, y, color=colors(topic + 1), label=label, alpha=0.5)

# plt.title("Topic Clusters in 2D Embedding Space")
# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.tight_layout()

# # Save the plot
# plt.savefig("topic_clusters.png", dpi=300, bbox_inches="tight")
# plt.show()
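The commented-out cell above sketches a static t-SNE view of the embedding space. If it is ever revived, running it on a subsample keeps the runtime manageable; a minimal sketch along those lines (sample size, colormap, and output path are arbitrary choices):

import numpy as np
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE

# Subsample the precomputed sentence embeddings and colour points by assigned topic.
rng = np.random.default_rng(42)
idx = rng.choice(len(embeddings), size=min(5000, len(embeddings)), replace=False)

embeddings_2d = TSNE(n_components=2, random_state=42).fit_transform(embeddings[idx])
topic_subset = np.array(topic_model.topics_)[idx]

plt.figure(figsize=(10, 7))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=topic_subset, cmap="tab20", s=5, alpha=0.5)
plt.title("Topic clusters (t-SNE, subsample)")
plt.savefig("topic_clusters_subsample.png", dpi=200, bbox_inches="tight")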
bertopic/output/visualization.html (new file, 3885 lines; diff suppressed because one or more lines are too long)
bertopic/requirements.txt (new file, 132 lines)
@@ -0,0 +1,132 @@
annotated-types==0.7.0
anyio==4.9.0
asttokens==3.0.0
attrs==25.3.0
bertopic==0.17.0
Brotli==1.1.0
certifi==2025.4.26
charset-normalizer==3.4.2
click==8.2.1
comm==0.2.2
contourpy==1.3.2
cssselect==1.3.0
cycler==0.12.1
debugpy==1.8.14
decorator==5.2.1
distro==1.9.0
dotenv==0.9.9
executing==2.2.0
fastjsonschema==2.21.1
filelock==3.18.0
fonttools==4.58.0
fsspec==2025.5.1
gensim==4.3.3
h11==0.16.0
h2==4.2.0
hdbscan==0.8.40
hf-xet==1.1.2
hpack==4.1.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.32.2
hyperframe==6.1.0
idna==3.10
ipykernel==6.29.5
ipython==9.3.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
Jinja2==3.1.6
jiter==0.10.0
jmespath==1.0.1
joblib==1.5.1
jsonschema==4.24.0
jsonschema-specifications==2025.4.1
jupyter_client==8.6.3
jupyter_core==5.8.1
kaleido==0.2.1
kiwisolver==1.4.8
llvmlite==0.44.0
lxml==5.4.0
MarkupSafe==3.0.2
matplotlib==3.10.3
matplotlib-inline==0.1.7
mpmath==1.3.0
narwhals==1.41.0
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.4.2
nltk==3.9.1
numba==0.61.2
numpy==1.26.4
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.5.1.17
nvidia-cufft-cu12==11.3.0.4
nvidia-cufile-cu12==1.11.1.6
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-cusparselt-cu12==0.6.3
nvidia-nccl-cu12==2.26.2
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.6.77
openai==1.82.0
packaging==25.0
pandas==2.2.3
parsel==1.10.0
parso==0.8.4
pexpect==4.9.0
pillow==11.2.1
platformdirs==4.3.8
plotly==6.1.2
prompt_toolkit==3.0.51
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pydantic==2.11.5
pydantic_core==2.33.2
Pygments==2.19.1
pynndescent==0.5.13
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
pyzmq==26.4.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rpds-py==0.25.1
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.13.1
seaborn==0.13.2
sentence-transformers==4.1.0
setuptools==80.9.0
six==1.17.0
smart-open==7.1.0
sniffio==1.3.1
stack-data==0.6.3
sympy==1.14.0
threadpoolctl==3.6.0
tokenizers==0.21.1
torch==2.7.0
tornado==6.5.1
tqdm==4.67.1
traitlets==5.14.3
transformers==4.52.3
triton==3.3.0
typing-inspection==0.4.1
typing_extensions==4.13.2
tzdata==2025.2
umap-learn==0.5.7
urllib3==2.4.0
w3lib==2.3.1
wcwidth==0.2.13
wrapt==1.17.2

spacy
nbconvert
jupytext
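Beyond these pins, the notebooks also call spacy.load("en_core_web_sm"), so that spaCy model has to be present in the environment as well. A small sketch for checking that the core pinned libraries resolve as expected in the active environment:

from importlib import metadata

for pkg in ["bertopic", "umap-learn", "hdbscan", "sentence-transformers", "gensim"]:
    print(pkg, metadata.version(pkg))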