This commit is contained in:
2025-06-06 05:14:58 +02:00
commit c002e46acb
22 changed files with 265678 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
.env
.venv/
__pycache__/
**.bertopic
history*.json

121436
bertopic.ipynb Normal file

File diff suppressed because one or more lines are too long

3885
bertopic/heatmap.html Normal file

File diff suppressed because one or more lines are too long

3885
bertopic/map.html Normal file

File diff suppressed because one or more lines are too long

1027
bertopic/tracking.json Normal file

File diff suppressed because it is too large Load Diff

160
bertopic_autotune.py Normal file
View File

@@ -0,0 +1,160 @@
import json
import traceback
import numpy as np
import pandas as pd
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from umap import UMAP
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
# Hyperparameter search space. Each key maps to the list of candidate values
# for that setting; the tuner evaluates every combination of them.
param_grid = {
    # Target number of topics handed to BERTopic
    "nr_topics": [45, 50, 55],
    # Used as HDBSCAN's min_cluster_size
    "min_topic_size": [30, 40, 50],
    # Upper bound of the CountVectorizer n-gram range (1, n_gram_max)
    "n_gram_max": [3],
    # Used as CountVectorizer's min_df
    "min_document_frequency": [1, 2],
    # UMAP settings
    "n_neighbors": [15],
    "n_components": [2],
    "min_dist": [0.1],
    # Number of words kept per topic representation
    "top_n_words": [10],
}
def calculate_metrics(topic_model, embedder, top_n_words=5):
    """Score a fitted topic model on coherence, diversity and topic separation.

    Args:
        topic_model: Fitted model exposing ``get_topics()``. Assumed to return
            a mapping of topic id to a list of ``(word, score)`` pairs, with
            ``-1`` reserved for the outlier topic (BERTopic convention).
        embedder: Object whose ``encode(list_of_str)`` returns one embedding
            vector per input string.
        top_n_words: Number of leading words per topic that are evaluated.

    Returns:
        dict with the float entries ``coherence`` (mean pairwise cosine
        similarity between a topic's words, averaged over topics),
        ``diversity`` (share of unique words among all evaluated words),
        ``inter_topic_distance`` (mean pairwise cosine distance between topic
        centroids) and ``combined_score``
        (``0.6 * coherence + 0.2 * diversity + 0.2 * inter_topic_distance``),
        each rounded to 4 decimals.

    Raises:
        ValueError: If the model has no non-outlier topic with at least one
            word, so no metric can be computed.
    """
    # Collect the top words of every real topic. Iterating the topic mapping
    # (instead of range(len(info) - 1)) does not assume that an outlier topic
    # exists or that topic ids are contiguous.
    topic_words = []
    for topic_id, word_scores in topic_model.get_topics().items():
        if topic_id == -1:
            continue
        # Drop empty strings so they are neither embedded nor counted
        words = [word for word, _ in word_scores[:top_n_words] if word]
        if words:
            topic_words.append(words)
    if not topic_words:
        raise ValueError("topic model has no non-outlier topics to score")

    # Embed each topic's words once and reuse the vectors for both the
    # coherence score and the topic centroid.
    coherence_scores = []
    topic_embeddings = []
    for words in topic_words:
        word_embeddings = np.asarray(embedder.encode(words))
        topic_embeddings.append(word_embeddings.mean(axis=0))
        n_words = len(words)
        if n_words < 2:
            # A single word has no word pairs; score it as 0
            coherence_scores.append(0.0)
            continue
        sim_matrix = cosine_similarity(word_embeddings)
        # Average over the n * (n - 1) off-diagonal entries only, so that
        # self-similarity neither inflates nor dilutes the score.
        pair_sum = sim_matrix.sum() - np.trace(sim_matrix)
        coherence_scores.append(pair_sum / (n_words * (n_words - 1)))
    overall_coherence = float(np.mean(coherence_scores))

    # Diversity: fraction of unique words across all topics
    all_topic_words = [word for words in topic_words for word in words]
    diversity = len(set(all_topic_words)) / len(all_topic_words)

    # Inter-topic distance: mean cosine distance over distinct topic pairs
    if len(topic_embeddings) < 2:
        avg_distance = 0.0  # no topic pairs to compare
    else:
        topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
        upper = np.triu_indices_from(topic_distance, k=1)
        avg_distance = float(np.mean(topic_distance[upper]))

    combined = 0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance
    # round() instead of slicing str(value): slicing breaks on scientific
    # notation such as "1e-05" and on negative values.
    res = {
        "coherence": round(overall_coherence, 4),
        "diversity": round(float(diversity), 4),
        "inter_topic_distance": round(avg_distance, 4),
        "combined_score": round(float(combined), 4),
    }
    print(res)
    return res
def _build_topic_model(params, embedding_model):
    """Assemble an unfitted BERTopic pipeline for one hyperparameter set."""
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(
        stop_words="english",
        min_df=params["min_document_frequency"],
        ngram_range=(1, params["n_gram_max"]),
    )
    representation_model = KeyBERTInspired()
    umap_model = UMAP(
        n_neighbors=params["n_neighbors"],
        n_components=params["n_components"],
        min_dist=params["min_dist"],
        metric="cosine",
        low_memory=True,
        random_state=42,  # fixed seed so runs are comparable
    )
    hdbscan_model = HDBSCAN(
        min_cluster_size=params["min_topic_size"],
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )
    return BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=params["top_n_words"],
        nr_topics=params["nr_topics"],
    )


def auto_tune_bertopic(texts, embedding_model, param_grid, history_path="history.json"):
    """Grid-search BERTopic hyperparameters and keep the best-scoring model.

    The documents are embedded once; every parameter combination then fits a
    fresh model on those embeddings and is scored with ``calculate_metrics``.
    After each successful run the full history is rewritten to
    ``history_path`` so partial results survive an interrupted search.

    Args:
        texts: List of documents to model.
        embedding_model: Sentence-transformers model name, used both to embed
            ``texts`` and as BERTopic's ``embedding_model``.
        param_grid: Mapping of parameter name to candidate values, expanded
            with ``ParameterGrid``.
        history_path: JSON file the run history is written to.

    Returns:
        Tuple ``(best_model, best_params, best_score, history)``. If every
        combination fails, the first two are ``None`` and the score is ``-1``.
    """
    best_score = -1
    best_params = None
    best_model = None
    history = []
    print("Starting auto-tuning of BERTopic...")
    print(f"Number of reviews: {len(texts)}")
    print("Running embedding model...")
    embedder = SentenceTransformer(embedding_model)
    # Embed the documents passed in by the caller. The previous code encoded
    # the module-level `reviews` list, which only worked by coincidence when
    # that global happened to be the same object as `texts`.
    embeddings = embedder.encode(texts, show_progress_bar=True)
    print("Generating parameter combinations...")
    param_list = list(ParameterGrid(param_grid))
    print(f"Total parameter combinations: {len(param_list)}")
    for params in param_list:
        # One failing combination must not abort the whole search, so each
        # run is isolated and its traceback is printed.
        try:
            print(f"Testing params: {params}")
            model = _build_topic_model(params, embedding_model)
            model.fit_transform(texts, embeddings)
            metrics = calculate_metrics(model, embedder)
            history.append({"params": params, "metrics": metrics})
            with open(history_path, "w", encoding="utf-8") as f:
                json.dump(history, f, indent=2)
            if metrics["combined_score"] > best_score:
                best_score = metrics["combined_score"]
                best_params = params
                best_model = model
        except Exception as e:
            print(f"Failed with params {params}: {str(e)}")
            traceback.print_exc()
            continue
    return best_model, best_params, best_score, history
# Substrings replaced by a space: a real newline and the literal
# two-character sequence backslash + "n".
SPECIAL_CHARS = ["\n", "\\n"]
# Reviews with fewer whitespace-separated words than this are discarded
MIN_REVIEW_WORDS = 5

# Read the `review` column of the tab-separated data file
reviews = pd.read_csv("data.tab", sep="\t")["review"].tolist()

# Clean string entries; anything that is not a string is passed through as is
cleaned_reviews = []
for review in reviews:
    if isinstance(review, str):
        for schar in SPECIAL_CHARS:
            review = review.replace(schar, " ")
    cleaned_reviews.append(review)

# Keep only entries whose string form has enough words
reviews = [
    entry
    for entry in cleaned_reviews
    if len(str(entry).split()) >= MIN_REVIEW_WORDS
]

tuning_result = auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid)
print(tuning_result)

View File

@@ -0,0 +1,25 @@
import json
import matplotlib.pyplot as plt
# Load the tuning history: a list of records that each carry a "metrics" dict
with open("history.json", "r", encoding="utf-8") as f:
    history = json.load(f)

# Best run first, ranked by combined score
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)
with open("history_sorted.json", "w", encoding="utf-8") as f:
    json.dump(history, f, indent=2)

# Extract combined scores. This previously read the "coherence" metric, which
# contradicted the plot title, axis label and output file name.
scores = [item["metrics"]["combined_score"] for item in history]

# Plot histogram
plt.hist(scores, bins=20, edgecolor="black")
plt.title("Distribution of Combined Scores")
plt.xlabel("Combined Score")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.savefig("combined_score_distribution.png")
plt.close()

56449
data.tab Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,101 @@
import json
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
def load_labels(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and return its content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        labels = json.load(handle)
    return labels
def process_labels(data):
    """Count labelled categories and failed entries.

    Args:
        data: Iterable of entry dicts. An entry counts as labelled when its
            ``"deepseek"`` value is a dict containing ``"category"``.

    Returns:
        Tuple ``(category_counts, errors)``: a ``Counter`` of category names
        and the number of failed entries. An entry is a failure when it has a
        ``"deepseek"`` value without a category (for example
        ``{"error": "query failed"}``) or a top-level ``"error"`` key.
        Entries with neither key are ignored.
    """
    categories = []
    errors = 0
    for entry in data:
        if "deepseek" in entry:
            label = entry["deepseek"]
            # Failures may be nested under "deepseek" as {"error": ...};
            # indexing ["category"] unconditionally raised KeyError on them.
            if isinstance(label, dict) and "category" in label:
                categories.append(label["category"])
            else:
                errors += 1
        elif "error" in entry:
            errors += 1
    return Counter(categories), errors
def visualize_distribution(category_counts, errors, output_file=None):
    """Draw a bar chart of the category counts and save or show it.

    Args:
        category_counts: Mapping of category name to count (e.g. a Counter).
        errors: Number of entries that could not be labelled. It is included
            in the total shown in the title and, when non-zero, annotated on
            the plot.
        output_file: Path to write the image to. When falsy, the figure is
            shown interactively instead.
    """
    import os  # local import: this module does not import os at file level

    # Prepare data
    categories = list(category_counts.keys())
    counts = list(category_counts.values())
    total = sum(counts) + errors

    sns.set(style="whitegrid")
    fig = plt.figure(figsize=(10, 6))
    try:
        # seaborn 0.13 deprecates `palette` without `hue`; assigning the x
        # variable to hue and hiding the legend gives the same colouring.
        ax = sns.barplot(
            x=categories,
            y=counts,
            hue=categories,
            palette="viridis",
            legend=False,
        )
        plt.title(
            f"Review Category Distribution\n(Total: {total} reviews - {errors} errors)",
            pad=20,
        )
        plt.xlabel("Category")
        plt.ylabel("Count")
        plt.xticks(rotation=45, ha="right")

        # Value label slightly above each bar
        for i, count in enumerate(counts):
            ax.text(i, count + 0.5, str(count), ha="center")

        # errors > 0 implies total > 0, so the ratio is always defined here
        if errors > 0:
            plt.annotate(
                f"{errors} errors\n({errors/total:.1%})",
                xy=(0.95, 0.95),
                xycoords="axes fraction",
                ha="right",
                va="top",
                bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
            )

        plt.tight_layout()

        if output_file:
            # Create the target directory so savefig does not fail on a
            # fresh checkout.
            out_dir = os.path.dirname(output_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            plt.savefig(output_file, dpi=300)
            print(f"Visualization saved to {output_file}")
        else:
            plt.show()
    finally:
        # Release the figure so repeated calls do not accumulate open figures
        plt.close(fig)
def main():
    """Load the label file, print the category shares and render the chart."""
    input_file = "deepseek_labels.json"
    # Set to None to display instead of saving
    output_image = "./img/category_distribution.png"

    # Load and summarise
    data = load_labels(input_file)
    category_counts, errors = process_labels(data)

    # Text report, most frequent category first; shares are relative to all
    # loaded entries.
    print("Category Distribution:")
    for category, count in category_counts.most_common():
        print(f"- {category}: {count} ({count/len(data):.1%})")
    if errors > 0:
        print(f"- Errors: {errors} ({errors/len(data):.1%})")

    # Chart
    visualize_distribution(category_counts, errors, output_image)


if __name__ == "__main__":
    main()

143
deepseek_labeler.py Normal file
View File

@@ -0,0 +1,143 @@
import concurrent.futures
import json
import os
from pathlib import Path
from threading import Lock
from dotenv import load_dotenv
from openai import OpenAI
# Load environment variables (presumably from a local .env file) before the
# API key is read below.
load_dotenv()
# Lock that serialises writes to the shared output file across worker threads
file_lock = Lock()
# OpenAI-compatible client pointed at the DeepSeek endpoint; the key comes
# from the DEEPSEEK_API_KEY environment variable.
client = OpenAI(
api_key=os.getenv("DEEPSEEK_API_KEY"),
base_url="https://api.deepseek.com",
)
# System message sent with every request: lists the allowed categories and
# shows the JSON shape ("category", "reason") the reply must have.
system_prompt = """
The user will provide a tourist review. Please categorize them according to the following categories, provide a short reasoning for the decision (max 8 words) and output them in JSON format.
The categories are: adventurer, business, family, backpacker, luxury, or none if no category fits.
EXAMPLE INPUT:
Perfect for families! The hotel had a kids' club, a shallow pool, and spacious rooms. Nearby attractions were child-friendly, and the staff went out of their way to accommodate us. Will definitely return!
EXAMPLE JSON OUTPUT:
{
"category": "family",
"reason": "child-friendly amenities and staff"
}
"""
def query_deepseek(review):
    """Ask the DeepSeek chat model to categorise one review.

    Returns the message content of the first choice, or ``None`` when the
    request raises; the error is printed and not propagated.
    """
    try:
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": review},
        ]
        completion = client.chat.completions.create(
            model="deepseek-chat",
            messages=conversation,
            temperature=0.2,
            response_format={"type": "json_object"},
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Best effort: the caller treats None as a failed query
        print(f"Error querying DeepSeek API: {e}")
        return None
def read_reviews(file_path):
    """Return the stripped, non-blank lines of a UTF-8 text file, in order."""
    kept_lines = []
    with open(file_path, mode="r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                kept_lines.append(stripped)
    return kept_lines
def validate_response(response, max_reason_words=8):
    """Parse a model reply and check that it has the expected JSON shape.

    Args:
        response: Raw reply text expected to contain a JSON object.
        max_reason_words: Maximum number of whitespace-separated words
            allowed in the ``"reason"`` value.

    Returns:
        The parsed dict when it is a JSON object whose ``"category"`` and
        ``"reason"`` values are both strings and the reason is within the
        word limit; otherwise ``None``. Malformed input never raises.
    """
    try:
        data = json.loads(response)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-text input such as None
        return None
    # Valid JSON can also be a list, string or number; only objects qualify
    if not isinstance(data, dict):
        return None
    if "category" not in data or "reason" not in data:
        return None
    category = data["category"]
    reason = data["reason"]
    # Non-string values (e.g. null) would otherwise fail on .split() here, or
    # later when the category is used as a dictionary key.
    if not isinstance(category, str) or not isinstance(reason, str):
        return None
    if len(reason.split()) > max_reason_words:
        return None
    return data
def process_review(i, review, output_file):
    """Label one review and append the record to the JSON output file.

    Args:
        i: Identifier stored as the record's ``"id"``.
        review: Review text; surrounding double quotes are stripped before
            it is stored.
        output_file: Path of an existing JSON file holding a list of records.
    """
    print(f"Processing review {i}")
    raw_reply = query_deepseek(review)
    label = process_response(raw_reply, i, "deepseek")
    record = {
        "id": i,
        "review": review.strip('"'),
        "deepseek": label,
    }

    # Hold the lock for the whole read-modify-write cycle.
    # NOTE(review): the entire file is re-read and re-written for every
    # review, so total I/O grows quadratically with the number of reviews.
    with file_lock, open(output_file, "r+", encoding="utf-8") as out:
        try:
            records = json.load(out)
        except json.JSONDecodeError:
            # Empty or corrupt file: start over with an empty list
            records = []
        records.append(record)
        # Overwrite from the start and cut off any leftover tail
        out.seek(0)
        json.dump(records, out, indent=2)
        out.truncate()
def process_response(response, i, model_name):
    """Turn a raw model reply into either the validated dict or an error dict.

    Returns ``{"error": "query failed"}`` for a falsy reply,
    ``{"error": "format mismatch"}`` (after printing the offending reply) when
    validation rejects it, and the validated data otherwise.
    """
    if not response:
        return {"error": "query failed"}

    parsed = validate_response(response)
    if not parsed:
        print(f"Format mismatch for {model_name} response {i}: {response}")
        return {"error": "format mismatch"}
    return parsed
def main():
    """Label the reviews from the input file in parallel worker threads."""
    input_file = "data.tab"
    output_file = "labels.json"

    # Seed the output with an empty JSON list; an existing file is left as is
    if not Path(output_file).exists():
        with open(output_file, "w") as f:
            json.dump([], f)

    reviews = read_reviews(input_file)
    # Skip the header line, take at most the next 20,000 lines, number from 1
    numbered_reviews = list(enumerate(reviews[1:20001], start=1))

    # Adjust max_workers based on your API rate limits and system capabilities
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(process_review, idx, text, output_file)
            for idx, text in numbered_reviews
        ]
        # Drain results as they finish so worker exceptions are reported
        for finished in concurrent.futures.as_completed(futures):
            try:
                finished.result()
            except Exception as e:
                print(f"Error processing review: {e}")


if __name__ == "__main__":
    main()

78434
deepseek_labels.json Normal file

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 154 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

BIN
img/heatmap.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 152 KiB

BIN
img/heatmap_corr.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

BIN
img/heatmap_corr_fill.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 111 KiB

BIN
img/newplot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 236 KiB

BIN
img/opt.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 208 KiB

BIN
img/opt_corr.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 206 KiB

BIN
img/opt_corr_fill.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 208 KiB

BIN
img/topic_clusters.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 MiB

128
requirements.txt Normal file
View File

@@ -0,0 +1,128 @@
annotated-types==0.7.0
anyio==4.9.0
asttokens==3.0.0
attrs==25.3.0
bertopic==0.17.0
Brotli==1.1.0
certifi==2025.4.26
charset-normalizer==3.4.2
click==8.2.1
comm==0.2.2
contourpy==1.3.2
cssselect==1.3.0
cycler==0.12.1
debugpy==1.8.14
decorator==5.2.1
distro==1.9.0
dotenv==0.9.9
executing==2.2.0
fastjsonschema==2.21.1
filelock==3.18.0
fonttools==4.58.0
fsspec==2025.5.1
gensim==4.3.3
h11==0.16.0
h2==4.2.0
hdbscan==0.8.40
hf-xet==1.1.2
hpack==4.1.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.32.2
hyperframe==6.1.0
idna==3.10
ipykernel==6.29.5
ipython==9.3.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
Jinja2==3.1.6
jiter==0.10.0
jmespath==1.0.1
joblib==1.5.1
jsonschema==4.24.0
jsonschema-specifications==2025.4.1
jupyter_client==8.6.3
jupyter_core==5.8.1
kaleido==0.2.1
kiwisolver==1.4.8
llvmlite==0.44.0
lxml==5.4.0
MarkupSafe==3.0.2
matplotlib==3.10.3
matplotlib-inline==0.1.7
mpmath==1.3.0
narwhals==1.41.0
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.4.2
nltk==3.9.1
numba==0.61.2
numpy==1.26.4
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.5.1.17
nvidia-cufft-cu12==11.3.0.4
nvidia-cufile-cu12==1.11.1.6
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-cusparselt-cu12==0.6.3
nvidia-nccl-cu12==2.26.2
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.6.77
openai==1.82.0
packaging==25.0
pandas==2.2.3
parsel==1.10.0
parso==0.8.4
pexpect==4.9.0
pillow==11.2.1
platformdirs==4.3.8
plotly==6.1.2
prompt_toolkit==3.0.51
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pydantic==2.11.5
pydantic_core==2.33.2
Pygments==2.19.1
pynndescent==0.5.13
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
pyzmq==26.4.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rpds-py==0.25.1
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.13.1
seaborn==0.13.2
sentence-transformers==4.1.0
setuptools==80.9.0
six==1.17.0
smart-open==7.1.0
sniffio==1.3.1
stack-data==0.6.3
sympy==1.14.0
threadpoolctl==3.6.0
tokenizers==0.21.1
torch==2.7.0
tornado==6.5.1
tqdm==4.67.1
traitlets==5.14.3
transformers==4.52.3
triton==3.3.0
typing-inspection==0.4.1
typing_extensions==4.13.2
tzdata==2025.2
umap-learn==0.5.7
urllib3==2.4.0
w3lib==2.3.1
wcwidth==0.2.13
wrapt==1.17.2