Init
.gitignore  (new file, vendored, +5 lines)
@@ -0,0 +1,5 @@
.env
.venv/
__pycache__/
**.bertopic
history*.json
bertopic.ipynb  (new file, +121436 lines)
bertopic/heatmap.html  (new file, +3885 lines)
bertopic/map.html  (new file, +3885 lines)
bertopic/tracking.json  (new file, +1027 lines)

bertopic_autotune.py  (new file, +160 lines)
@@ -0,0 +1,160 @@
import json
import traceback

import numpy as np
import pandas as pd
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from umap import UMAP

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

param_grid = {
    "nr_topics": [45, 50, 55],
    "min_topic_size": [30, 40, 50],
    "n_gram_max": [3],
    "min_document_frequency": [1, 2],
    "n_neighbors": [15],
    "n_components": [2],
    "min_dist": [0.1],
    "top_n_words": [10],
}
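# The grid above is expanded by sklearn's ParameterGrid into the Cartesian product
# of its value lists: 3 (nr_topics) x 3 (min_topic_size) x 2 (min_document_frequency)
# = 18 candidate configurations; the single-valued keys do not multiply the count.
#   >>> len(list(ParameterGrid(param_grid)))
#   18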

def calculate_metrics(topic_model, embedder, top_n_words=5):
    # Get the top words per topic (the -1 outlier topic is skipped)
    topic_words = []
    for topic_id in range(len(topic_model.get_topic_info()) - 1):
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words[:top_n_words])

    # Coherence: mean pairwise cosine similarity of each topic's word embeddings
    coherence_scores = []
    for words in topic_words:
        embeddings = embedder.encode(words)
        sim_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(sim_matrix, 0)
        coherence_scores.append(np.mean(sim_matrix))
    overall_coherence = np.mean(coherence_scores)

    # Diversity: fraction of unique words across all topics
    all_topic_words = [word for topic in topic_words for word in topic]
    diversity = len(set(all_topic_words)) / len(all_topic_words)

    # Inter-topic distance: mean cosine distance between topic centroids
    topic_embeddings = [
        np.mean(embedder.encode(words), axis=0) for words in topic_words
    ]
    topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
    avg_distance = np.mean(topic_distance[np.triu_indices_from(topic_distance, k=1)])

    # Round to four decimals for compact JSON output
    res = {
        "coherence": round(float(overall_coherence), 4),
        "diversity": round(float(diversity), 4),
        "inter_topic_distance": round(float(avg_distance), 4),
        "combined_score": round(
            float(0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance), 4
        ),
    }
    print(res)
    return res
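# Worked example with hypothetical metric values: coherence = 0.45, diversity = 0.80,
# inter-topic distance = 0.35 would give
#   combined_score = 0.6 * 0.45 + 0.2 * 0.80 + 0.2 * 0.35 = 0.50
# i.e. coherence dominates the ranking while diversity and separation act as tie-breakers.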

def auto_tune_bertopic(texts, embedding_model, param_grid):
    best_score = -1
    best_params = None
    best_model = None
    history = []

    print("Starting auto-tuning of BERTopic...")
    print(f"Number of reviews: {len(texts)}")

    print("Running embedding model...")
    embedder = SentenceTransformer(embedding_model)
    # Embed the texts passed in (not the global `reviews`) once, and reuse the
    # embeddings for every parameter combination.
    embeddings = embedder.encode(texts, show_progress_bar=True)

    # Expand param_grid into a list of concrete parameter combinations
    print("Generating parameter combinations...")
    param_list = list(ParameterGrid(param_grid))

    print(f"Total parameter combinations: {len(param_list)}")
    for params in param_list:
        try:
            print(f"Testing params: {params}")
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            vectorizer_model = CountVectorizer(
                stop_words="english",
                min_df=params["min_document_frequency"],
                ngram_range=(1, params["n_gram_max"]),
            )

            representation_model = KeyBERTInspired()

            umap_model = UMAP(
                n_neighbors=params["n_neighbors"],
                n_components=params["n_components"],
                min_dist=params["min_dist"],
                metric="cosine",
                low_memory=True,
                random_state=42,
            )
            hdbscan_model = HDBSCAN(
                min_cluster_size=params["min_topic_size"],
                metric="euclidean",
                cluster_selection_method="eom",
                gen_min_span_tree=True,
                prediction_data=True,
            )

            model = BERTopic(
                embedding_model=embedding_model,
                ctfidf_model=ctfidf_model,
                vectorizer_model=vectorizer_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                representation_model=representation_model,
                verbose=True,
                calculate_probabilities=True,
                language="english",
                top_n_words=params["top_n_words"],
                nr_topics=params["nr_topics"],
            )
            topics, _ = model.fit_transform(texts, embeddings)

            metrics = calculate_metrics(model, embedder)
            history.append({"params": params, "metrics": metrics})

            with open("history.json", "w") as f:
                json.dump(history, f, indent=2)

            if metrics["combined_score"] > best_score:
                best_score = metrics["combined_score"]
                best_params = params
                best_model = model

        except Exception as e:
            print(f"Failed with params {params}: {str(e)}")
            traceback.print_exc()
            continue

    return best_model, best_params, best_score, history
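# Note: history.json is rewritten with the full history after every grid iteration,
# so the results of a partially completed run survive an interruption and can still
# be ranked afterwards with bertopic_autotune_sorter.py.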
SPECIAL_CHARS = ["\n", "\\n"]
MIN_REVIEW_WORDS = 5

reviews = pd.read_csv("data.tab", sep="\t").review.to_list()

# Replace literal and escaped newlines, then drop reviews shorter than MIN_REVIEW_WORDS
for schar in SPECIAL_CHARS:
    reviews = [
        review.replace(schar, " ") if isinstance(review, str) else review
        for review in reviews
    ]
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
bertopic_autotune_sorter.py  (new file, +25 lines)
@@ -0,0 +1,25 @@
import json

import matplotlib.pyplot as plt

with open("history.json", "r") as f:
    history = json.load(f)

history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)

with open("history_sorted.json", "w") as f:
    json.dump(history, f, indent=2)


# Extract combined scores
scores = [item["metrics"]["combined_score"] for item in history]

# Plot histogram
plt.hist(scores, bins=20, edgecolor="black")
plt.title("Distribution of Combined Scores")
plt.xlabel("Combined Score")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.savefig("combined_score_distribution.png")
plt.close()
deepseek_label_distribution.py  (new file, +101 lines)
@@ -0,0 +1,101 @@
import json
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns


def load_labels(file_path):
    """Load labels from JSON file"""
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def process_labels(data):
    """Extract valid categories and count their occurrences"""
    categories = []
    errors = 0

    for entry in data:
        result = entry.get("deepseek", {})
        if "category" in result:
            categories.append(result["category"])
        elif "error" in result or "error" in entry:
            # Failed queries and format mismatches are counted as errors
            errors += 1

    category_counts = Counter(categories)
    return category_counts, errors
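# Entries are expected to follow the shape written by deepseek_labeler.py, e.g.
#   {"id": 1, "review": "...", "deepseek": {"category": "family", "reason": "..."}}
# or, when the API call or validation failed,
#   {"id": 2, "review": "...", "deepseek": {"error": "query failed"}}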

def visualize_distribution(category_counts, errors, output_file=None):
    """Create visualization of category distribution"""
    # Prepare data
    categories = list(category_counts.keys())
    counts = list(category_counts.values())
    total_valid = sum(counts)
    total = total_valid + errors

    # Set style
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))

    # Create bar plot
    ax = sns.barplot(x=categories, y=counts, palette="viridis")

    # Customize plot
    plt.title(
        f"Review Category Distribution\n(Total: {total} reviews - {errors} errors)",
        pad=20,
    )
    plt.xlabel("Category")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")

    # Add value labels
    for i, count in enumerate(counts):
        ax.text(i, count + 0.5, str(count), ha="center")

    # Add error count annotation if there are errors
    if errors > 0:
        plt.annotate(
            f"{errors} errors\n({errors/total:.1%})",
            xy=(0.95, 0.95),
            xycoords="axes fraction",
            ha="right",
            va="top",
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
        )

    # Adjust layout
    plt.tight_layout()

    # Save or show
    if output_file:
        plt.savefig(output_file, dpi=300)
        print(f"Visualization saved to {output_file}")
    else:
        plt.show()


def main():
    input_file = "deepseek_labels.json"
    output_image = (
        "./img/category_distribution.png"  # Set to None to display instead of saving
    )

    # Load and process data
    data = load_labels(input_file)
    category_counts, errors = process_labels(data)

    # Print basic stats
    print("Category Distribution:")
    for category, count in category_counts.most_common():
        print(f"- {category}: {count} ({count/len(data):.1%})")
    if errors > 0:
        print(f"- Errors: {errors} ({errors/len(data):.1%})")

    # Visualize
    visualize_distribution(category_counts, errors, output_image)


if __name__ == "__main__":
    main()
deepseek_labeler.py  (new file, +143 lines)
@@ -0,0 +1,143 @@
import concurrent.futures
import json
import os
from pathlib import Path
from threading import Lock

from dotenv import load_dotenv
from openai import OpenAI

# Load the API key from .env and initialize a thread-safe lock for file writing
load_dotenv()
file_lock = Lock()

client = OpenAI(
    api_key=os.getenv("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com",
)

system_prompt = """
The user will provide a tourist review. Please categorize it according to the following categories, provide a short reason for the decision (max 8 words), and output the result in JSON format.
The categories are: adventurer, business, family, backpacker, luxury, or none if no category fits.

EXAMPLE INPUT:
Perfect for families! The hotel had a kids' club, a shallow pool, and spacious rooms. Nearby attractions were child-friendly, and the staff went out of their way to accommodate us. Will definitely return!

EXAMPLE JSON OUTPUT:
{
    "category": "family",
    "reason": "child-friendly amenities and staff"
}
"""


def query_deepseek(review):
    """Query DeepSeek API for categorization"""
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": review},
            ],
            temperature=0.2,
            response_format={"type": "json_object"},
        )
        content = response.choices[0].message.content
        return content
    except Exception as e:
        print(f"Error querying DeepSeek API: {e}")
        return None


def read_reviews(file_path):
    """Read reviews from tab-separated file, assuming one review per line"""
    with open(file_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def validate_response(response):
    """Validate that the response matches the expected JSON format"""
    try:
        data = json.loads(response)
        if not all(key in data for key in ["category", "reason"]):
            return None
        if len(data["reason"].split()) > 8:
            return None
        return data
    except json.JSONDecodeError:
        return None
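# For illustration (hypothetical responses):
#   validate_response('{"category": "luxury", "reason": "spa and butler service"}')
#       -> {"category": "luxury", "reason": "spa and butler service"}
#   validate_response('{"category": "luxury"}')  -> None  (missing "reason")
#   validate_response("not json")                -> None  (invalid JSON)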

def process_review(i, review, output_file):
    """Process a single review and save results"""
    print(f"Processing review {i}")

    deepseek_response = query_deepseek(review)
    deepseek_result = process_response(deepseek_response, i, "deepseek")

    result = {
        "id": i,
        "review": review.strip('"'),
        "deepseek": deepseek_result,
    }

    # Thread-safe file writing
    with file_lock:
        with open(output_file, "r+", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                data = []
            data.append(result)
            f.seek(0)
            json.dump(data, f, indent=2)
            f.truncate()
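# Design note: the output file holds a single JSON array, so each worker re-reads the
# array, appends its record, and rewrites the file while holding file_lock. This keeps
# the file valid JSON at all times, at the cost of re-serializing all prior results on
# every write.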

def process_response(response, i, model_name):
    """Helper function to validate and format responses"""
    if not response:
        return {"error": "query failed"}

    validated = validate_response(response)
    if validated:
        return validated
    else:
        print(f"Format mismatch for {model_name} response {i}: {response}")
        return {"error": "format mismatch"}


def main():
    input_file = "data.tab"
    output_file = "labels.json"

    # Initialize output file
    if not Path(output_file).exists():
        with open(output_file, "w") as f:
            json.dump([], f)

    reviews = read_reviews(input_file)

    # Skip header and limit to 20,000 reviews
    reviews_to_process = [
        (i, review) for i, review in enumerate(reviews[1:20001], start=1)
    ]

    # Use ThreadPoolExecutor for parallel processing
    # Adjust max_workers based on your API rate limits and system capabilities
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for i, review in reviews_to_process:
            futures.append(executor.submit(process_review, i, review, output_file))

        # Wait for all futures to complete
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Error processing review: {e}")


if __name__ == "__main__":
    main()
deepseek_labels.json  (new file, +78434 lines)
img/category_distribution.png  (new binary file, 154 KiB)
img/combined_score_distribution.png  (new binary file, 20 KiB)
img/heatmap.png  (new binary file, 152 KiB)
img/heatmap_corr.png  (new binary file, 121 KiB)
img/heatmap_corr_fill.png  (new binary file, 111 KiB)
img/newplot.png  (new binary file, 236 KiB)
img/opt.png  (new binary file, 208 KiB)
img/opt_corr.png  (new binary file, 206 KiB)
img/opt_corr_fill.png  (new binary file, 208 KiB)
img/topic_clusters.png  (new binary file, 2.6 MiB)
requirements.txt  (new file, +128 lines)
@@ -0,0 +1,128 @@
annotated-types==0.7.0
anyio==4.9.0
asttokens==3.0.0
attrs==25.3.0
bertopic==0.17.0
Brotli==1.1.0
certifi==2025.4.26
charset-normalizer==3.4.2
click==8.2.1
comm==0.2.2
contourpy==1.3.2
cssselect==1.3.0
cycler==0.12.1
debugpy==1.8.14
decorator==5.2.1
distro==1.9.0
dotenv==0.9.9
executing==2.2.0
fastjsonschema==2.21.1
filelock==3.18.0
fonttools==4.58.0
fsspec==2025.5.1
gensim==4.3.3
h11==0.16.0
h2==4.2.0
hdbscan==0.8.40
hf-xet==1.1.2
hpack==4.1.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.32.2
hyperframe==6.1.0
idna==3.10
ipykernel==6.29.5
ipython==9.3.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
Jinja2==3.1.6
jiter==0.10.0
jmespath==1.0.1
joblib==1.5.1
jsonschema==4.24.0
jsonschema-specifications==2025.4.1
jupyter_client==8.6.3
jupyter_core==5.8.1
kaleido==0.2.1
kiwisolver==1.4.8
llvmlite==0.44.0
lxml==5.4.0
MarkupSafe==3.0.2
matplotlib==3.10.3
matplotlib-inline==0.1.7
mpmath==1.3.0
narwhals==1.41.0
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.4.2
nltk==3.9.1
numba==0.61.2
numpy==1.26.4
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.5.1.17
nvidia-cufft-cu12==11.3.0.4
nvidia-cufile-cu12==1.11.1.6
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-cusparselt-cu12==0.6.3
nvidia-nccl-cu12==2.26.2
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.6.77
openai==1.82.0
packaging==25.0
pandas==2.2.3
parsel==1.10.0
parso==0.8.4
pexpect==4.9.0
pillow==11.2.1
platformdirs==4.3.8
plotly==6.1.2
prompt_toolkit==3.0.51
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pydantic==2.11.5
pydantic_core==2.33.2
Pygments==2.19.1
pynndescent==0.5.13
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
pyzmq==26.4.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rpds-py==0.25.1
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.13.1
seaborn==0.13.2
sentence-transformers==4.1.0
setuptools==80.9.0
six==1.17.0
smart-open==7.1.0
sniffio==1.3.1
stack-data==0.6.3
sympy==1.14.0
threadpoolctl==3.6.0
tokenizers==0.21.1
torch==2.7.0
tornado==6.5.1
tqdm==4.67.1
traitlets==5.14.3
transformers==4.52.3
triton==3.3.0
typing-inspection==0.4.1
typing_extensions==4.13.2
tzdata==2025.2
umap-learn==0.5.7
urllib3==2.4.0
w3lib==2.3.1
wcwidth==0.2.13
wrapt==1.17.2