Init
.gitignore  (new file, vendored, +5 lines)
@@ -0,0 +1,5 @@
.env
.venv/
__pycache__/
**.bertopic
history*.json
bertopic.ipynb  (new file, +121436 lines)
bertopic/heatmap.html  (new file, +3885 lines)
bertopic/map.html  (new file, +3885 lines)
bertopic/tracking.json  (new file, +1027 lines)

bertopic_autotune.py  (new file, +160 lines)
@@ -0,0 +1,160 @@
import json
import traceback

import numpy as np
import pandas as pd
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from umap import UMAP

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

param_grid = {
    "nr_topics": [45, 50, 55],
    "min_topic_size": [30, 40, 50],
    "n_gram_max": [3],
    "min_document_frequency": [1, 2],
    "n_neighbors": [15],
    "n_components": [2],
    "min_dist": [0.1],
    "top_n_words": [10],
}
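# The grid above is expanded by sklearn's ParameterGrid into the Cartesian product
# of its value lists: 3 (nr_topics) x 3 (min_topic_size) x 2 (min_document_frequency)
# = 18 candidate configurations; the single-valued keys do not multiply the count.
#   >>> len(list(ParameterGrid(param_grid)))
#   18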

def calculate_metrics(topic_model, embedder, top_n_words=5):
    # Get the top words per topic (the -1 outlier topic is skipped)
    topic_words = []
    for topic_id in range(len(topic_model.get_topic_info()) - 1):
        words = [word for word, _ in topic_model.get_topic(topic_id)]
        topic_words.append(words[:top_n_words])

    # Coherence: mean pairwise cosine similarity of each topic's word embeddings
    coherence_scores = []
    for words in topic_words:
        embeddings = embedder.encode(words)
        sim_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(sim_matrix, 0)
        coherence_scores.append(np.mean(sim_matrix))
    overall_coherence = np.mean(coherence_scores)

    # Diversity: fraction of unique words across all topics
    all_topic_words = [word for topic in topic_words for word in topic]
    diversity = len(set(all_topic_words)) / len(all_topic_words)

    # Inter-topic distance: mean cosine distance between topic centroids
    topic_embeddings = [
        np.mean(embedder.encode(words), axis=0) for words in topic_words
    ]
    topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
    avg_distance = np.mean(topic_distance[np.triu_indices_from(topic_distance, k=1)])

    # Round to four decimals for compact JSON output
    res = {
        "coherence": round(float(overall_coherence), 4),
        "diversity": round(float(diversity), 4),
        "inter_topic_distance": round(float(avg_distance), 4),
        "combined_score": round(
            float(0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance), 4
        ),
    }
    print(res)
    return res
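# Worked example with hypothetical metric values: coherence = 0.45, diversity = 0.80,
# inter-topic distance = 0.35 would give
#   combined_score = 0.6 * 0.45 + 0.2 * 0.80 + 0.2 * 0.35 = 0.50
# i.e. coherence dominates the ranking while diversity and separation act as tie-breakers.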

def auto_tune_bertopic(texts, embedding_model, param_grid):
    best_score = -1
    best_params = None
    best_model = None
    history = []

    print("Starting auto-tuning of BERTopic...")
    print(f"Number of reviews: {len(texts)}")

    print("Running embedding model...")
    embedder = SentenceTransformer(embedding_model)
    # Embed the texts passed in (not the global `reviews`) once, and reuse the
    # embeddings for every parameter combination.
    embeddings = embedder.encode(texts, show_progress_bar=True)

    # Expand param_grid into a list of concrete parameter combinations
    print("Generating parameter combinations...")
    param_list = list(ParameterGrid(param_grid))

    print(f"Total parameter combinations: {len(param_list)}")
    for params in param_list:
        try:
            print(f"Testing params: {params}")
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            vectorizer_model = CountVectorizer(
                stop_words="english",
                min_df=params["min_document_frequency"],
                ngram_range=(1, params["n_gram_max"]),
            )

            representation_model = KeyBERTInspired()

            umap_model = UMAP(
                n_neighbors=params["n_neighbors"],
                n_components=params["n_components"],
                min_dist=params["min_dist"],
                metric="cosine",
                low_memory=True,
                random_state=42,
            )
            hdbscan_model = HDBSCAN(
                min_cluster_size=params["min_topic_size"],
                metric="euclidean",
                cluster_selection_method="eom",
                gen_min_span_tree=True,
                prediction_data=True,
            )

            model = BERTopic(
                embedding_model=embedding_model,
                ctfidf_model=ctfidf_model,
                vectorizer_model=vectorizer_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                representation_model=representation_model,
                verbose=True,
                calculate_probabilities=True,
                language="english",
                top_n_words=params["top_n_words"],
                nr_topics=params["nr_topics"],
            )
            topics, _ = model.fit_transform(texts, embeddings)

            metrics = calculate_metrics(model, embedder)
            history.append({"params": params, "metrics": metrics})

            with open("history.json", "w") as f:
                json.dump(history, f, indent=2)

            if metrics["combined_score"] > best_score:
                best_score = metrics["combined_score"]
                best_params = params
                best_model = model

        except Exception as e:
            print(f"Failed with params {params}: {str(e)}")
            traceback.print_exc()
            continue

    return best_model, best_params, best_score, history
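# Note: history.json is rewritten with the full history after every grid iteration,
# so the results of a partially completed run survive an interruption and can still
# be ranked afterwards with bertopic_autotune_sorter.py.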
SPECIAL_CHARS = ["\n", "\\n"]
MIN_REVIEW_WORDS = 5

reviews = pd.read_csv("data.tab", sep="\t").review.to_list()

# Replace literal and escaped newlines, then drop reviews shorter than MIN_REVIEW_WORDS
for schar in SPECIAL_CHARS:
    reviews = [
        review.replace(schar, " ") if isinstance(review, str) else review
        for review in reviews
    ]
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
bertopic_autotune_sorter.py  (new file, +25 lines)
@@ -0,0 +1,25 @@
import json

import matplotlib.pyplot as plt

with open("history.json", "r") as f:
    history = json.load(f)

history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)

with open("history_sorted.json", "w") as f:
    json.dump(history, f, indent=2)


# Extract combined scores
scores = [item["metrics"]["combined_score"] for item in history]

# Plot histogram
plt.hist(scores, bins=20, edgecolor="black")
plt.title("Distribution of Combined Scores")
plt.xlabel("Combined Score")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.savefig("combined_score_distribution.png")
plt.close()
deepseek_label_distribution.py  (new file, +101 lines)
@@ -0,0 +1,101 @@
import json
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns


def load_labels(file_path):
    """Load labels from JSON file"""
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def process_labels(data):
    """Extract valid categories and count their occurrences"""
    categories = []
    errors = 0

    for entry in data:
        result = entry.get("deepseek", {})
        if "category" in result:
            categories.append(result["category"])
        elif "error" in result or "error" in entry:
            # Failed queries and format mismatches are counted as errors
            errors += 1

    category_counts = Counter(categories)
    return category_counts, errors
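# Entries are expected to follow the shape written by deepseek_labeler.py, e.g.
#   {"id": 1, "review": "...", "deepseek": {"category": "family", "reason": "..."}}
# or, when the API call or validation failed,
#   {"id": 2, "review": "...", "deepseek": {"error": "query failed"}}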

def visualize_distribution(category_counts, errors, output_file=None):
    """Create visualization of category distribution"""
    # Prepare data
    categories = list(category_counts.keys())
    counts = list(category_counts.values())
    total_valid = sum(counts)
    total = total_valid + errors

    # Set style
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))

    # Create bar plot
    ax = sns.barplot(x=categories, y=counts, palette="viridis")

    # Customize plot
    plt.title(
        f"Review Category Distribution\n(Total: {total} reviews - {errors} errors)",
        pad=20,
    )
    plt.xlabel("Category")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")

    # Add value labels
    for i, count in enumerate(counts):
        ax.text(i, count + 0.5, str(count), ha="center")

    # Add error count annotation if there are errors
    if errors > 0:
        plt.annotate(
            f"{errors} errors\n({errors/total:.1%})",
            xy=(0.95, 0.95),
            xycoords="axes fraction",
            ha="right",
            va="top",
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
        )

    # Adjust layout
    plt.tight_layout()

    # Save or show
    if output_file:
        plt.savefig(output_file, dpi=300)
        print(f"Visualization saved to {output_file}")
    else:
        plt.show()


def main():
    input_file = "deepseek_labels.json"
    output_image = (
        "./img/category_distribution.png"  # Set to None to display instead of saving
    )

    # Load and process data
    data = load_labels(input_file)
    category_counts, errors = process_labels(data)

    # Print basic stats
    print("Category Distribution:")
    for category, count in category_counts.most_common():
        print(f"- {category}: {count} ({count/len(data):.1%})")
    if errors > 0:
        print(f"- Errors: {errors} ({errors/len(data):.1%})")

    # Visualize
    visualize_distribution(category_counts, errors, output_image)


if __name__ == "__main__":
    main()
deepseek_labeler.py  (new file, +143 lines)
@@ -0,0 +1,143 @@
import concurrent.futures
import json
import os
from pathlib import Path
from threading import Lock

from dotenv import load_dotenv
from openai import OpenAI

# Load the API key from .env and initialize a thread-safe lock for file writing
load_dotenv()
file_lock = Lock()

client = OpenAI(
    api_key=os.getenv("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com",
)

system_prompt = """
The user will provide a tourist review. Please categorize it according to the following categories, provide a short reason for the decision (max 8 words), and output the result in JSON format.
The categories are: adventurer, business, family, backpacker, luxury, or none if no category fits.

EXAMPLE INPUT:
Perfect for families! The hotel had a kids' club, a shallow pool, and spacious rooms. Nearby attractions were child-friendly, and the staff went out of their way to accommodate us. Will definitely return!

EXAMPLE JSON OUTPUT:
{
    "category": "family",
    "reason": "child-friendly amenities and staff"
}
"""


def query_deepseek(review):
    """Query DeepSeek API for categorization"""
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": review},
            ],
            temperature=0.2,
            response_format={"type": "json_object"},
        )
        content = response.choices[0].message.content
        return content
    except Exception as e:
        print(f"Error querying DeepSeek API: {e}")
        return None


def read_reviews(file_path):
    """Read reviews from tab-separated file, assuming one review per line"""
    with open(file_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def validate_response(response):
    """Validate that the response matches the expected JSON format"""
    try:
        data = json.loads(response)
        if not all(key in data for key in ["category", "reason"]):
            return None
        if len(data["reason"].split()) > 8:
            return None
        return data
    except json.JSONDecodeError:
        return None
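# For illustration (hypothetical responses):
#   validate_response('{"category": "luxury", "reason": "spa and butler service"}')
#       -> {"category": "luxury", "reason": "spa and butler service"}
#   validate_response('{"category": "luxury"}')  -> None  (missing "reason")
#   validate_response("not json")                -> None  (invalid JSON)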

def process_review(i, review, output_file):
    """Process a single review and save results"""
    print(f"Processing review {i}")

    deepseek_response = query_deepseek(review)
    deepseek_result = process_response(deepseek_response, i, "deepseek")

    result = {
        "id": i,
        "review": review.strip('"'),
        "deepseek": deepseek_result,
    }

    # Thread-safe file writing
    with file_lock:
        with open(output_file, "r+", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                data = []
            data.append(result)
            f.seek(0)
            json.dump(data, f, indent=2)
            f.truncate()
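# Design note: the output file holds a single JSON array, so each worker re-reads the
# array, appends its record, and rewrites the file while holding file_lock. This keeps
# the file valid JSON at all times, at the cost of re-serializing all prior results on
# every write.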

def process_response(response, i, model_name):
    """Helper function to validate and format responses"""
    if not response:
        return {"error": "query failed"}

    validated = validate_response(response)
    if validated:
        return validated
    else:
        print(f"Format mismatch for {model_name} response {i}: {response}")
        return {"error": "format mismatch"}


def main():
    input_file = "data.tab"
    output_file = "labels.json"

    # Initialize output file
    if not Path(output_file).exists():
        with open(output_file, "w") as f:
            json.dump([], f)

    reviews = read_reviews(input_file)

    # Skip header and limit to 20,000 reviews
    reviews_to_process = [
        (i, review) for i, review in enumerate(reviews[1:20001], start=1)
    ]

    # Use ThreadPoolExecutor for parallel processing
    # Adjust max_workers based on your API rate limits and system capabilities
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for i, review in reviews_to_process:
            futures.append(executor.submit(process_review, i, review, output_file))

        # Wait for all futures to complete
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Error processing review: {e}")


if __name__ == "__main__":
    main()
deepseek_labels.json  (new file, +78434 lines)
img/category_distribution.png  (new binary file, 154 KiB)
img/combined_score_distribution.png  (new binary file, 20 KiB)
img/heatmap.png  (new binary file, 152 KiB)
img/heatmap_corr.png  (new binary file, 121 KiB)
img/heatmap_corr_fill.png  (new binary file, 111 KiB)
img/newplot.png  (new binary file, 236 KiB)
img/opt.png  (new binary file, 208 KiB)
img/opt_corr.png  (new binary file, 206 KiB)
img/opt_corr_fill.png  (new binary file, 208 KiB)
img/topic_clusters.png  (new binary file, 2.6 MiB)
requirements.txt  (new file, +128 lines)
@@ -0,0 +1,128 @@
annotated-types==0.7.0
anyio==4.9.0
asttokens==3.0.0
attrs==25.3.0
bertopic==0.17.0
Brotli==1.1.0
certifi==2025.4.26
charset-normalizer==3.4.2
click==8.2.1
comm==0.2.2
contourpy==1.3.2
cssselect==1.3.0
cycler==0.12.1
debugpy==1.8.14
decorator==5.2.1
distro==1.9.0
dotenv==0.9.9
executing==2.2.0
fastjsonschema==2.21.1
filelock==3.18.0
fonttools==4.58.0
fsspec==2025.5.1
gensim==4.3.3
h11==0.16.0
h2==4.2.0
hdbscan==0.8.40
hf-xet==1.1.2
hpack==4.1.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.32.2
hyperframe==6.1.0
idna==3.10
ipykernel==6.29.5
ipython==9.3.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
Jinja2==3.1.6
jiter==0.10.0
jmespath==1.0.1
joblib==1.5.1
jsonschema==4.24.0
jsonschema-specifications==2025.4.1
jupyter_client==8.6.3
jupyter_core==5.8.1
kaleido==0.2.1
kiwisolver==1.4.8
llvmlite==0.44.0
lxml==5.4.0
MarkupSafe==3.0.2
matplotlib==3.10.3
matplotlib-inline==0.1.7
mpmath==1.3.0
narwhals==1.41.0
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.4.2
nltk==3.9.1
numba==0.61.2
numpy==1.26.4
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.5.1.17
nvidia-cufft-cu12==11.3.0.4
nvidia-cufile-cu12==1.11.1.6
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-cusparselt-cu12==0.6.3
nvidia-nccl-cu12==2.26.2
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.6.77
openai==1.82.0
packaging==25.0
pandas==2.2.3
parsel==1.10.0
parso==0.8.4
pexpect==4.9.0
pillow==11.2.1
platformdirs==4.3.8
plotly==6.1.2
prompt_toolkit==3.0.51
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pydantic==2.11.5
pydantic_core==2.33.2
Pygments==2.19.1
pynndescent==0.5.13
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
pyzmq==26.4.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rpds-py==0.25.1
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.13.1
seaborn==0.13.2
sentence-transformers==4.1.0
setuptools==80.9.0
six==1.17.0
smart-open==7.1.0
sniffio==1.3.1
stack-data==0.6.3
sympy==1.14.0
threadpoolctl==3.6.0
tokenizers==0.21.1
torch==2.7.0
tornado==6.5.1
tqdm==4.67.1
traitlets==5.14.3
transformers==4.52.3
triton==3.3.0
typing-inspection==0.4.1
typing_extensions==4.13.2
tzdata==2025.2
umap-learn==0.5.7
urllib3==2.4.0
w3lib==2.3.1
wcwidth==0.2.13
wrapt==1.17.2