Init

2025-12-06 10:10:50 +01:00 · 2025-06-06 05:14:58 +02:00
commit c002e46acb
22 changed files with 265678 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,5 @@
 .env
 .venv/
 __pycache__/
 **.bertopic
 history*.json
--- a/bertopic.ipynb
+++ b/bertopic.ipynb
--- a/bertopic/heatmap.html
+++ b/bertopic/heatmap.html
--- a/bertopic/map.html
+++ b/bertopic/map.html
--- a/bertopic/tracking.json
+++ b/bertopic/tracking.json
--- a/bertopic_autotune.py
+++ b/bertopic_autotune.py
@@ -0,0 +1,160 @@
 import json
 import traceback
 import numpy as np
 import pandas as pd
 from hdbscan import HDBSCAN
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics import pairwise_distances
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.model_selection import ParameterGrid
 from umap import UMAP
 from bertopic import BERTopic
 from bertopic.representation import KeyBERTInspired
 from bertopic.vectorizers import ClassTfidfTransformer
 # Hyperparameter search space. Each key holds the list of candidate values;
 # auto_tune_bertopic expands the grid it receives with ParameterGrid.
 param_grid = {
    "nr_topics": [45, 50, 55],  # BERTopic(nr_topics=...)
    "min_topic_size": [30, 40, 50],  # HDBSCAN(min_cluster_size=...)
    "n_gram_max": [3],  # upper bound of CountVectorizer(ngram_range=(1, n))
    "min_document_frequency": [1, 2],  # CountVectorizer(min_df=...)
    "n_neighbors": [15],  # UMAP(n_neighbors=...)
    "n_components": [2],  # UMAP(n_components=...)
    "min_dist": [0.1],  # UMAP(min_dist=...)
    "top_n_words": [10],  # BERTopic(top_n_words=...)
 }
 def calculate_metrics(topic_model, embedder, top_n_words=5):
    """Score a fitted topic model on coherence, diversity and separation.

    Args:
        topic_model: Fitted model exposing ``get_topic_info()`` (a table with
            a ``Topic`` column) and ``get_topic(topic_id)`` (a sequence of
            ``(word, score)`` pairs); a BERTopic instance in this script.
        embedder: Object whose ``encode(list_of_str)`` returns one embedding
            row per input string.
        top_n_words: Number of leading words per topic used for scoring.

    Returns:
        dict with the keys ``coherence``, ``diversity``,
        ``inter_topic_distance`` and ``combined_score`` (weighted
        0.6 / 0.2 / 0.2), each a plain float rounded to four decimals.

    Raises:
        ValueError: If no non-outlier topic with at least one word exists.
    """
    # Read the actual topic ids instead of assuming 0..n-2: subtracting one
    # row for the outlier topic (-1) drops the last real topic whenever the
    # model produced no outliers.
    topic_ids = sorted(
        int(topic_id)
        for topic_id in topic_model.get_topic_info()["Topic"]
        if int(topic_id) != -1
    )
    topic_words = []
    for topic_id in topic_ids:
        word_scores = topic_model.get_topic(topic_id)
        # Skip ids for which the model returns nothing usable
        if not word_scores:
            continue
        words = [word for word, _ in word_scores][:top_n_words]
        if words:
            topic_words.append(words)
    if not topic_words:
        raise ValueError("topic model has no non-outlier topics to score")
    # Encode every topic's words once; both coherence and the inter-topic
    # distance reuse these embeddings.
    word_embeddings = [np.asarray(embedder.encode(words)) for words in topic_words]
    # Coherence
    coherence_scores = []
    for embeddings in word_embeddings:
        sim_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(sim_matrix, 0)
        # NOTE(review): the mean runs over the full matrix, zeroed diagonal
        # included, so it is (n-1)/n times the mean pairwise similarity.
        # Kept as is so scores stay comparable with earlier runs.
        coherence_scores.append(np.mean(sim_matrix))
    overall_coherence = float(np.mean(coherence_scores))
    # Diversity: share of unique words among all selected topic words
    all_topic_words = [word for topic in topic_words for word in topic]
    diversity = len(set(all_topic_words)) / len(all_topic_words)
    # Inter-topic distance: mean cosine distance between topic centroids
    topic_embeddings = [np.mean(embeddings, axis=0) for embeddings in word_embeddings]
    if len(topic_embeddings) > 1:
        topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
        upper = topic_distance[np.triu_indices_from(topic_distance, k=1)]
        avg_distance = float(np.mean(upper))
    else:
        # A single topic has no pairs; report 0.0 rather than NaN
        avg_distance = 0.0
    combined = 0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance
    # round() instead of slicing str(value): slicing breaks on scientific
    # notation (e.g. "1.2345e-05"[:6] parses as 1.2345) and on negatives.
    res = {
        "coherence": round(overall_coherence, 4),
        "diversity": round(float(diversity), 4),
        "inter_topic_distance": round(avg_distance, 4),
        "combined_score": round(float(combined), 4),
    }
    print(res)
    return res
 def auto_tune_bertopic(texts, embedding_model, param_grid):
    """Grid-search BERTopic settings and keep the best-scoring model.

    Every combination in ``param_grid`` is fitted on ``texts`` and scored
    with ``calculate_metrics``. After each successful run the accumulated
    history is rewritten to ``history.json``. A combination that raises is
    reported and skipped.

    Args:
        texts: Documents to model.
        embedding_model: Model name or path handed to ``SentenceTransformer``.
        param_grid: Mapping of parameter name to candidate values; must
            provide the keys read below (``nr_topics``, ``min_topic_size``,
            ``n_gram_max``, ``min_document_frequency``, ``n_neighbors``,
            ``n_components``, ``min_dist``, ``top_n_words``).

    Returns:
        Tuple ``(best_model, best_params, best_score, history)``. When no
        combination succeeds this is ``(None, None, -1, history)``.
    """
    best_score = -1
    best_params = None
    best_model = None
    history = []
    print("Starting auto-tuning of BERTopic...")
    print(f"Number of reviews: {len(texts)}")
    print("Running embedding model...")
    embedder = SentenceTransformer(embedding_model)
    # Embed the documents that were passed in. This previously read the
    # module-level ``reviews`` list, which breaks for any other caller.
    embeddings = embedder.encode(texts, show_progress_bar=True)
    # Convert param_grid to list for sampling
    print("Generating parameter combinations...")
    param_list = list(ParameterGrid(param_grid))
    print(f"Total parameter combinations: {len(param_list)}")
    for params in param_list:
        try:
            print(f"Testing params: {params}")
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            vectorizer_model = CountVectorizer(
                stop_words="english",
                min_df=params["min_document_frequency"],
                ngram_range=(1, params["n_gram_max"]),
            )
            representation_model = KeyBERTInspired()
            umap_model = UMAP(
                n_neighbors=params["n_neighbors"],
                n_components=params["n_components"],
                min_dist=params["min_dist"],
                metric="cosine",
                low_memory=True,
                random_state=42,
            )
            hdbscan_model = HDBSCAN(
                min_cluster_size=params["min_topic_size"],
                metric="euclidean",
                cluster_selection_method="eom",
                gen_min_span_tree=True,
                prediction_data=True,
            )
            model = BERTopic(
                # Reuse the loaded embedder instead of the model name so the
                # weights are not loaded again for every combination.
                embedding_model=embedder,
                ctfidf_model=ctfidf_model,
                vectorizer_model=vectorizer_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                representation_model=representation_model,
                verbose=True,
                calculate_probabilities=True,
                language="english",
                top_n_words=params["top_n_words"],
                nr_topics=params["nr_topics"],
            )
            # Precomputed embeddings are shared by all runs
            model.fit_transform(texts, embeddings)
            metrics = calculate_metrics(model, embedder)
            history.append({"params": params, "metrics": metrics})
            # Persist after every run so an interrupted search keeps results
            with open("history.json", "w", encoding="utf-8") as f:
                json.dump(history, f, indent=2)
            if metrics["combined_score"] > best_score:
                best_score = metrics["combined_score"]
                best_params = params
                best_model = model
        except Exception as e:
            # Best effort: one failing combination must not abort the search
            print(f"Failed with params {params}: {str(e)}")
            traceback.print_exc()
    return best_model, best_params, best_score, history
 SPECIAL_CHARS = ["\n", "\\n"]
 MIN_REVIEW_WORDS = 5
 def _clean_review(value):
    """Replace each SPECIAL_CHARS sequence in a string with a space.

    Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    for marker in SPECIAL_CHARS:
        value = value.replace(marker, " ")
    return value
 def _has_enough_words(value):
    """Tell whether the stringified value has at least MIN_REVIEW_WORDS words."""
    return len(str(value).split()) >= MIN_REVIEW_WORDS
 # Load the "review" column of the tab-separated dataset
 reviews = pd.read_csv("data.tab", sep="\t")["review"].to_list()
 reviews = [_clean_review(entry) for entry in reviews]
 # Drop reviews that are too short to be useful
 reviews = [entry for entry in reviews if _has_enough_words(entry)]
 print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
--- a/bertopic_autotune_sorter.py
+++ b/bertopic_autotune_sorter.py
@@ -0,0 +1,25 @@
 import json
 import matplotlib.pyplot as plt
 # Load the tuning history written by the auto-tuning script
 with open("history.json", "r") as f:
    history = json.load(f)
 # Rank runs from best to worst combined score and store the ranking
 history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)
 with open("history_sorted.json", "w") as f:
    json.dump(history, f, indent=2)
 # Extract combined scores. This previously read the "coherence" metric,
 # which contradicts the plot title, axis label and output file name.
 scores = [item["metrics"]["combined_score"] for item in history]
 # Plot histogram
 plt.hist(scores, bins=20, edgecolor="black")
 plt.title("Distribution of Combined Scores")
 plt.xlabel("Combined Score")
 plt.ylabel("Frequency")
 plt.grid(True)
 plt.tight_layout()
 plt.savefig("combined_score_distribution.png")
 plt.close()
--- a/data.tab
+++ b/data.tab
--- a/deepseek_label_distribution.py
+++ b/deepseek_label_distribution.py
@@ -0,0 +1,101 @@
 import json
 from collections import Counter
 import matplotlib.pyplot as plt
 import seaborn as sns
 def load_labels(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the parsed value."""
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
 def process_labels(data):
    """Count categories and failed labels in a list of labelled entries.

    An entry counts as a category when ``entry["deepseek"]`` is a dict with
    a ``"category"`` key. It counts as an error when it has a ``"deepseek"``
    value without a category (for example ``{"error": "format mismatch"}``)
    or a top-level ``"error"`` key. Entries with neither key are ignored.

    Args:
        data: Iterable of dict entries.

    Returns:
        Tuple ``(category_counts, errors)`` where ``category_counts`` is a
        ``Counter`` keyed by category and ``errors`` is an int.
    """
    categories = []
    errors = 0
    for entry in data:
        if "deepseek" in entry:
            label = entry["deepseek"]
            # Failures are stored nested under "deepseek", so the category
            # must be checked before it is read.
            if isinstance(label, dict) and "category" in label:
                categories.append(label["category"])
            else:
                errors += 1
        elif "error" in entry:
            errors += 1
    category_counts = Counter(categories)
    return category_counts, errors
 def visualize_distribution(category_counts, errors, output_file=None):
    """Draw a bar chart of the category counts.

    Args:
        category_counts: Mapping of category to count (e.g. a ``Counter``).
        errors: Number of entries that could not be labelled; shown in the
            title and, when positive, in an annotation box.
        output_file: Path to save the figure to. When falsy the figure is
            shown interactively instead.
    """
    from pathlib import Path
    # Prepare data
    categories = list(category_counts.keys())
    counts = list(category_counts.values())
    total = sum(counts) + errors
    # Set style
    sns.set(style="whitegrid")
    fig = plt.figure(figsize=(10, 6))
    # Create bar plot. The palette is bound to ``hue`` because seaborn 0.13
    # deprecates passing ``palette`` without ``hue``; the legend would only
    # repeat the x tick labels, so it is disabled.
    ax = sns.barplot(
        x=categories,
        y=counts,
        hue=categories,
        palette="viridis",
        legend=False,
    )
    # Customize plot
    plt.title(
        f"Review Category Distribution\n(Total: {total} reviews - {errors} errors)",
        pad=20,
    )
    plt.xlabel("Category")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    # Add value labels slightly above each bar
    for i, count in enumerate(counts):
        ax.text(i, count + 0.5, str(count), ha="center")
    # Add error count annotation if there are errors
    if errors > 0:
        plt.annotate(
            f"{errors} errors\n({errors/total:.1%})",
            xy=(0.95, 0.95),
            xycoords="axes fraction",
            ha="right",
            va="top",
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
        )
    # Adjust layout
    plt.tight_layout()
    # Save or show
    if output_file:
        # Make sure the target directory exists before saving
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(output_file, dpi=300)
        print(f"Visualization saved to {output_file}")
        # Release the figure; nothing else uses it once it is on disk
        plt.close(fig)
    else:
        plt.show()
 def main():
    """Load the DeepSeek labels, print the category shares and plot them."""
    input_file = "deepseek_labels.json"
    # Set to None to display instead of saving
    output_image = "./img/category_distribution.png"
    # Load and process data
    data = load_labels(input_file)
    category_counts, errors = process_labels(data)
    # Build the report lines first, then print them in order
    report = [
        f"- {category}: {count} ({count/len(data):.1%})"
        for category, count in category_counts.most_common()
    ]
    if errors > 0:
        report.append(f"- Errors: {errors} ({errors/len(data):.1%})")
    print("Category Distribution:")
    for line in report:
        print(line)
    # Visualize
    visualize_distribution(category_counts, errors, output_image)
 if __name__ == "__main__":
    main()
--- a/deepseek_labeler.py
+++ b/deepseek_labeler.py
@@ -0,0 +1,143 @@
 import concurrent.futures
 import json
 import os
 from pathlib import Path
 from threading import Lock
 from dotenv import load_dotenv
 from openai import OpenAI
 # Populate the environment first (presumably from a local .env file) so
 # that DEEPSEEK_API_KEY is available to os.getenv below.
 load_dotenv()
 # Lock that serializes the read-modify-write of the output file across the
 # worker threads (held in process_review).
 file_lock = Lock()
 # OpenAI SDK client pointed at the DeepSeek base URL
 client = OpenAI(
    api_key=os.getenv("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com",
 )
 # System message sent with every request: lists the allowed categories and
 # shows the JSON shape ("category", "reason") that validate_response checks.
 system_prompt = """
 The user will provide a tourist review. Please categorize them according to the following categories, provide a short reasoning for the decision (max 8 words) and output them in JSON format.
 The categories are: adventurer, business, family, backpacker, luxury, or none if no category fits.
 EXAMPLE INPUT: 
 Perfect for families! The hotel had a kids' club, a shallow pool, and spacious rooms. Nearby attractions were child-friendly, and the staff went out of their way to accommodate us. Will definitely return!
 EXAMPLE JSON OUTPUT:
 {
    "category": "family",
    "reason": "child-friendly amenities and staff"
 }
 """
 def query_deepseek(review):
    """Send one review to the DeepSeek chat model and return its raw answer.

    Returns the message content of the first choice, or None when anything
    in the request raises (the error is printed).
    """
    try:
        create_completion = client.chat.completions.create
        completion = create_completion(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": review},
            ],
            temperature=0.2,
            response_format={"type": "json_object"},
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"Error querying DeepSeek API: {e}")
    return None
 def read_reviews(file_path):
    """Return the stripped, non-blank lines of a UTF-8 text file, in order."""
    kept = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                kept.append(stripped)
    return kept
 def validate_response(response):
    """Parse a model answer and check it against the expected JSON shape.

    A valid answer is a JSON object that has both a ``"category"`` and a
    ``"reason"`` key, where the reason is a string of at most 8
    whitespace-separated words.

    Args:
        response: JSON text returned by the model.

    Returns:
        The parsed dict when valid, otherwise None. Malformed input never
        raises.
    """
    try:
        data = json.loads(response)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError covers a response
        # that is not text at all (e.g. None).
        return None
    # Valid JSON is not necessarily an object: numbers, strings and lists
    # would otherwise crash the key checks below.
    if not isinstance(data, dict):
        return None
    if not all(key in data for key in ("category", "reason")):
        return None
    reason = data["reason"]
    if not isinstance(reason, str) or len(reason.split()) > 8:
        return None
    return data
 def process_review(i, review, output_file):
    """Label one review with DeepSeek and append the record to the output file.

    The output file is opened in "r+" mode, so it must already exist. Its
    JSON content is loaded, extended by one record and written back while
    ``file_lock`` is held.
    """
    print(f"Processing review {i}")
    raw_answer = query_deepseek(review)
    label = process_response(raw_answer, i, "deepseek")
    record = {"id": i, "review": review.strip('"'), "deepseek": label}
    # Thread-safe read-modify-write of the whole file
    with file_lock, open(output_file, "r+", encoding="utf-8") as handle:
        try:
            records = json.load(handle)
        except json.JSONDecodeError:
            # Unparseable content is replaced by a fresh list
            records = []
        records.append(record)
        # Rewind, overwrite and cut off any leftover bytes of the old content
        handle.seek(0)
        json.dump(records, handle, indent=2)
        handle.truncate()
 def process_response(response, i, model_name):
    """Turn a raw model answer into a validated label or an error record.

    Returns ``{"error": "query failed"}`` for a falsy response,
    ``{"error": "format mismatch"}`` (after printing the offending answer)
    when validation fails, and the validated dict otherwise.
    """
    if not response:
        return {"error": "query failed"}
    checked = validate_response(response)
    if not checked:
        print(f"Format mismatch for {model_name} response {i}: {response}")
        return {"error": "format mismatch"}
    return checked
 def main():
    """Label the reviews from data.tab in parallel and collect them in labels.json."""
    input_file = "data.tab"
    output_file = "labels.json"
    # Initialize output file with an empty JSON list on first use
    if not Path(output_file).exists():
        with open(output_file, "w") as f:
            json.dump([], f)
    # Skip header and limit to 20,000 reviews; ids start at 1
    numbered_reviews = list(enumerate(read_reviews(input_file)[1:20001], start=1))
    # Use ThreadPoolExecutor for parallel processing
    # Adjust max_workers based on your API rate limits and system capabilities
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(process_review, i, review, output_file)
            for i, review in numbered_reviews
        ]
        # Wait for all futures to complete, reporting failures per review
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Error processing review: {e}")
 if __name__ == "__main__":
    main()
--- a/deepseek_labels.json
+++ b/deepseek_labels.json
--- a/img/category_distribution.png
+++ b/img/category_distribution.png
--- a/img/combined_score_distribution.png
+++ b/img/combined_score_distribution.png
--- a/img/heatmap.png
+++ b/img/heatmap.png
--- a/img/heatmap_corr.png
+++ b/img/heatmap_corr.png
--- a/img/heatmap_corr_fill.png
+++ b/img/heatmap_corr_fill.png
--- a/img/newplot.png
+++ b/img/newplot.png
--- a/img/opt.png
+++ b/img/opt.png
--- a/img/opt_corr.png
+++ b/img/opt_corr.png
--- a/img/opt_corr_fill.png
+++ b/img/opt_corr_fill.png
--- a/img/topic_clusters.png
+++ b/img/topic_clusters.png
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,128 @@
 annotated-types==0.7.0
 anyio==4.9.0
 asttokens==3.0.0
 attrs==25.3.0
 bertopic==0.17.0
 Brotli==1.1.0
 certifi==2025.4.26
 charset-normalizer==3.4.2
 click==8.2.1
 comm==0.2.2
 contourpy==1.3.2
 cssselect==1.3.0
 cycler==0.12.1
 debugpy==1.8.14
 decorator==5.2.1
 distro==1.9.0
 dotenv==0.9.9
 executing==2.2.0
 fastjsonschema==2.21.1
 filelock==3.18.0
 fonttools==4.58.0
 fsspec==2025.5.1
 gensim==4.3.3
 h11==0.16.0
 h2==4.2.0
 hdbscan==0.8.40
 hf-xet==1.1.2
 hpack==4.1.0
 httpcore==1.0.9
 httpx==0.28.1
 huggingface-hub==0.32.2
 hyperframe==6.1.0
 idna==3.10
 ipykernel==6.29.5
 ipython==9.3.0
 ipython_pygments_lexers==1.1.1
 jedi==0.19.2
 Jinja2==3.1.6
 jiter==0.10.0
 jmespath==1.0.1
 joblib==1.5.1
 jsonschema==4.24.0
 jsonschema-specifications==2025.4.1
 jupyter_client==8.6.3
 jupyter_core==5.8.1
 kaleido==0.2.1
 kiwisolver==1.4.8
 llvmlite==0.44.0
 lxml==5.4.0
 MarkupSafe==3.0.2
 matplotlib==3.10.3
 matplotlib-inline==0.1.7
 mpmath==1.3.0
 narwhals==1.41.0
 nbformat==5.10.4
 nest-asyncio==1.6.0
 networkx==3.4.2
 nltk==3.9.1
 numba==0.61.2
 numpy==1.26.4
 nvidia-cublas-cu12==12.6.4.1
 nvidia-cuda-cupti-cu12==12.6.80
 nvidia-cuda-nvrtc-cu12==12.6.77
 nvidia-cuda-runtime-cu12==12.6.77
 nvidia-cudnn-cu12==9.5.1.17
 nvidia-cufft-cu12==11.3.0.4
 nvidia-cufile-cu12==1.11.1.6
 nvidia-curand-cu12==10.3.7.77
 nvidia-cusolver-cu12==11.7.1.2
 nvidia-cusparse-cu12==12.5.4.2
 nvidia-cusparselt-cu12==0.6.3
 nvidia-nccl-cu12==2.26.2
 nvidia-nvjitlink-cu12==12.6.85
 nvidia-nvtx-cu12==12.6.77
 openai==1.82.0
 packaging==25.0
 pandas==2.2.3
 parsel==1.10.0
 parso==0.8.4
 pexpect==4.9.0
 pillow==11.2.1
 platformdirs==4.3.8
 plotly==6.1.2
 prompt_toolkit==3.0.51
 psutil==7.0.0
 ptyprocess==0.7.0
 pure_eval==0.2.3
 pydantic==2.11.5
 pydantic_core==2.33.2
 Pygments==2.19.1
 pynndescent==0.5.13
 pyparsing==3.2.3
 python-dateutil==2.9.0.post0
 python-dotenv==1.1.0
 pytz==2025.2
 PyYAML==6.0.2
 pyzmq==26.4.0
 referencing==0.36.2
 regex==2024.11.6
 requests==2.32.3
 rpds-py==0.25.1
 safetensors==0.5.3
 scikit-learn==1.6.1
 scipy==1.13.1
 seaborn==0.13.2
 sentence-transformers==4.1.0
 setuptools==80.9.0
 six==1.17.0
 smart-open==7.1.0
 sniffio==1.3.1
 stack-data==0.6.3
 sympy==1.14.0
 threadpoolctl==3.6.0
 tokenizers==0.21.1
 torch==2.7.0
 tornado==6.5.1
 tqdm==4.67.1
 traitlets==5.14.3
 transformers==4.52.3
 triton==3.3.0
 typing-inspection==0.4.1
 typing_extensions==4.13.2
 tzdata==2025.2
 umap-learn==0.5.7
 urllib3==2.4.0
 w3lib==2.3.1
 wcwidth==0.2.13
 wrapt==1.17.2