22.02.

2026-06-22 07:13:08 +02:00 · 2026-02-22 23:52:26 +01:00
parent 61edb35f70
commit a7efed86f9
16 changed files with 1994 additions and 6420 deletions
@@ -16,10 +16,10 @@ from bertopic import BERTopic

 param_grid = {
    "n_gram_max": [2, 3],  # Vectorization
-    "min_document_frequency": [1],  # Vectorization
+    "min_document_frequency": [1, 2],  # Vectorization
    "min_samples": [10, 25],  # HDBSCAN
-    "min_topic_size": [10, 20, 30, 40, 50],  # HDBSCAN
-    "n_neighbors": [15],  # UMAP
+    "min_topic_size": [100, 200],  # HDBSCAN
+    "n_neighbors": [15, 25],  # UMAP
    "n_components": [2, 5],  # UMAP
    "min_dist": [0.01, 0.1],  # UMAP
    "nr_topics": ["auto"],  # Topic Modeling
@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
 with open("output/autotune.json", "r") as f:
    history = json.load(f)

-history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False)
+history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)

 with open("output/autotune_sorted.json", "w") as f:
    json.dump(history, f, indent=2)
@@ -360,7 +360,6 @@ vis = topic_model.visualize_documents(
    custom_labels=True,
    hide_annotations=True,
 )
-# vis.write_html("output/visualization.html")
 vis

 # %%
@@ -497,7 +496,12 @@ if CALCULATE_TOKEN_DISTRIBUTIONS:
 #

 # %%
-topic_model.visualize_hierarchy(custom_labels=True)
+topic_model.visualize_hierarchy(custom_labels=True, color_threshold=0.98)
+
+# %%
+hierarchical_topics = topic_model.hierarchical_topics(reviews)
+tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
+print(tree)

 # %% [markdown]
 # ### Intertopic Distance Map
@@ -512,3 +516,20 @@ topic_model.visualize_topics(use_ctfidf=True)

 # %%
 topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
+
+# %%
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+
+
+def create_wordcloud(model, topic):
+    text = {word: value for word, value in model.get_topic(topic)}
+    wc = WordCloud(background_color="white", max_words=1000)
+    wc.generate_from_frequencies(text)
+    plt.imshow(wc, interpolation="bilinear")
+    plt.axis("off")
+    plt.show()
+
+
+# Show wordcloud
+create_wordcloud(topic_model, topic=1)
@@ -0,0 +1,519 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.18.0
+#   kernelspec:
+#     display_name: .venv (3.12.3)
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Topic Detection: Bali Tourist Reviews
+#
+
+# %% [markdown]
+# ## Preparation
+#
+# ### Dependency Loading
+#
+
+# %%
+import pickle
+import re
+
+import gensim.corpora as corpora
+import nltk
+import numpy as np
+import pandas as pd
+from bertopic.representation import KeyBERTInspired
+from bertopic.vectorizers import ClassTfidfTransformer
+from gensim.models.coherencemodel import CoherenceModel
+from hdbscan import HDBSCAN
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction import text as skltext
+from sklearn.metrics.pairwise import cosine_similarity
+from umap import UMAP
+
+from bertopic import BERTopic
+
+nltk.download("stopwords")
+nltk.download("punkt")
+nltk.download("wordnet")
+
+# %% [markdown]
+# ### Hyperparameters and Settings
+#
+
+# %%
+RECREATE_MODEL = True
+RECREATE_REDUCED_MODEL = True
+PROCESS_DATA = True
+REDUCE_OUTLIERS = False
+CALCULATE_TOKEN_DISTRIBUTIONS = False
+
+# Data Sample Size, -1 for all data
+DATA_SAMPLE_SIZE = -1
+
+# Vectorization
+MIN_DOCUMENT_FREQUENCY = 1
+MAX_NGRAM = 3
+
+# HDBSCAN Parameters
+MIN_TOPIC_SIZE = 15
+MIN_SAMPLES = 15
+
+# UMAP Parameters
+N_NEIGHBORS = 15
+N_COMPONENTS = 2
+MIN_DIST = 0.01
+
+# Topic Modeling
+TOP_N_WORDS = 10
+MAX_TOPICS = None  # passed to BERTopic as nr_topics: "auto", or None to skip
+
+TF_IDF_STOP_WORDS = ["bali", "place", "visit", "visited", "visiting"]
+
+# %% [markdown]
+# ### Data Loading & Preprocessing
+#
+
+# %%
+# Import data after general preprocessing
+
+if DATA_SAMPLE_SIZE == -1:
+    reviews = pd.read_csv(
+        "../data/intermediate/culture_reviews.csv", sep=","
+    ).Original.to_list()
+else:
+    reviews = (
+        pd.read_csv("../data/intermediate/culture_reviews.csv", sep=",")
+        .sample(n=DATA_SAMPLE_SIZE)
+        .Original.to_list()
+    )
+
+print("Loaded {} reviews".format(len(reviews)))
+
+# %%
+rep = {
+    r"\\n": " ",
+    r"\n": " ",
+    r'\\"': "",
+    r'"': "",
+    r"\s+": " ",
+}
+rep = dict((re.escape(k), v) for k, v in rep.items())
+pattern = re.compile("|".join(rep.keys()))
+
+
+def preprocess(text):
+    text = text.strip()
+    text = text.lower()
+    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
+    return text
+
+
+# %%
+print(
+    preprocess(
+        "Excellent. Definitely worth coming while in bali. Food and people were very nice.\n🌟 🤩 ⭐️ \nTrisna was our host"
+    )
+)
+
+# %%
+if PROCESS_DATA:
+    print("Processing reviews...")
+    reviews = [preprocess(review) for review in reviews]
+
+    with open("../data/intermediate/processed_texts_culture.pkl", "wb") as f:
+        pickle.dump(reviews, f)
+else:
+    with open("../data/intermediate/processed_texts_culture.pkl", "rb") as f:
+        reviews = pickle.load(f)
+
+print(reviews[:1])
+
+# %% [markdown]
+# ### Pre-calculate Embeddings
+#
+
+# %%
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+embeddings = embedding_model.encode(reviews, show_progress_bar=True)
+
+# %% [markdown]
+# ## Model Creation
+#
+
+# %% [markdown]
+# ### Dimensionality Reduction (UMAP)
+#
+
+# %%
+umap_model = UMAP(
+    n_neighbors=N_NEIGHBORS,
+    n_components=N_COMPONENTS,
+    min_dist=MIN_DIST,
+    metric="cosine",
+    low_memory=True,
+    random_state=42,
+)
+reduced_embeddings = umap_model.fit_transform(embeddings)
+
+# %% [markdown]
+# ### BERTopic Model Creation
+#
+
+# %%
+if RECREATE_MODEL:
+    stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS))
+
+    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
+    vectorizer_model = CountVectorizer(
+        min_df=MIN_DOCUMENT_FREQUENCY,
+        ngram_range=(1, MAX_NGRAM),
+        stop_words=stop_words,
+    )
+
+    representation_model = KeyBERTInspired()
+    hdbscan_model = HDBSCAN(
+        min_cluster_size=MIN_TOPIC_SIZE,
+        min_samples=MIN_SAMPLES,
+        metric="euclidean",
+        cluster_selection_method="eom",
+        gen_min_span_tree=True,
+        prediction_data=True,
+    )
+
+    topic_model = BERTopic(
+        embedding_model=embedding_model,
+        ctfidf_model=ctfidf_model,
+        vectorizer_model=vectorizer_model,
+        umap_model=umap_model,
+        hdbscan_model=hdbscan_model,
+        representation_model=representation_model,
+        verbose=True,
+        calculate_probabilities=True,
+        language="english",
+        top_n_words=TOP_N_WORDS,
+        nr_topics=MAX_TOPICS,
+    )
+
+    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)
+
+    topic_labels = topic_model.generate_topic_labels(
+        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
+    )
+    topic_model.set_topic_labels(topic_labels)
+    # BERTopic.save(topic_model, "bertopic/model.bertopic")
+else:
+    print("Nevermind, loading existing model")
+    # topic_model = BERTopic.load("bertopic/model.bertopic")
+
+# %% [markdown]
+# ## Fine Tuning
+#
+# ### Topic Condensation
+#
+
+# %%
+if RECREATE_REDUCED_MODEL:
+    done = False
+    iteration = 1
+    while not done:
+        print(f"Iteration {iteration}")
+        iteration += 1
+        similarity_matrix = cosine_similarity(
+            np.array(topic_model.topic_embeddings_)[1:, :]
+        )
+        nothing_to_merge = True
+
+        for i in range(similarity_matrix.shape[0]):
+            for j in range(i + 1, similarity_matrix.shape[1]):
+                try:
+                    sim = similarity_matrix[i, j]
+                    if sim > 0.9:
+                        nothing_to_merge = False
+                        t1, t2 = i, j
+                        try:
+                            t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
+                            t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
+                            print(
+                                f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
+                            )
+                            topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
+
+                            topic_labels = topic_model.generate_topic_labels(
+                                nr_words=3,
+                                topic_prefix=True,
+                                word_length=15,
+                                separator=" - ",
+                            )
+                            topic_model.set_topic_labels(topic_labels)
+                            similarity_matrix = cosine_similarity(
+                                np.array(topic_model.topic_embeddings_)[1:, :]
+                            )
+                        except Exception as e:
+                            print(f"Failed to merge {t1} and {t2}: {e}")
+                except IndexError:
+                    pass
+        if nothing_to_merge:
+            print("No more topics to merge.")
+            done = True
+else:
+    print("Skipping topic reduction")
+
+# %% [markdown]
+# ### Outlier Reduction
+#
+
+# %%
+if REDUCE_OUTLIERS:
+    new_topics = topic_model.reduce_outliers(
+        reviews,
+        topic_model.topics_,
+        probabilities=topic_model.probabilities_,
+        threshold=0.05,
+        strategy="probabilities",
+    )
+    topic_model.update_topics(reviews, topics=new_topics)
+
+# %% [markdown]
+# ## Results
+#
+# ### Classification
+#
+
+# %%
+CLASSIFICATION = False
+if CLASSIFICATION:
+    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
+    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
+    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
+
+    # Topic model document info
+    df = topic_model.get_document_info(reviews)
+    df["Original"] = reviews
+
+    # --- filter by topics and strip surrounding whitespace ---
+    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
+    filtered["Original"] = filtered["Original"].str.strip()
+
+    # Save an audit CSV
+    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
+    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
+
+# %%
+doc_topic_matrix = probs
+
+# column names
+topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]
+
+# index names
+docnames = ["Review " + str(i) for i in range(len(reviews))]
+
+# Make the pandas dataframe
+df_document_topic = pd.DataFrame(
+    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
+)
+
+# Get dominant topic for each document
+dominant_topic = np.argmax(doc_topic_matrix, axis=1)
+df_document_topic["dominant_topic"] = dominant_topic
+
+
+# Styling
+def color_stuff(val):
+    if val > 0.1:
+        color = "green"
+    elif val > 0.05:
+        color = "orange"
+    else:
+        color = "grey"
+    return "color: {col}".format(col=color)
+
+
+def make_bold(val):
+    weight = 700 if val > 0.1 else 400
+    return "font-weight: {weight}".format(weight=weight)
+
+
+# Apply Style
+df_document_topics = (
+    df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
+)
+df_document_topics
+
+# %% [markdown]
+# ### Document Visualization
+#
+
+# %%
+vis = topic_model.visualize_documents(
+    docs=reviews,
+    reduced_embeddings=reduced_embeddings,
+    custom_labels=True,
+    hide_annotations=True,
+)
+# vis.write_html("output/visualization.html")
+vis
+
+# %%
+topic_model.visualize_document_datamap(reviews, reduced_embeddings=reduced_embeddings)
+
+# %% [markdown]
+# ### Similarity Matrix
+#
+
+# %%
+topic_model.visualize_heatmap()
+
+# %% [markdown]
+# ### Topic Info
+#
+
+# %%
+topic_model.get_topic_info()
+
+# %% [markdown]
+# ### Semantic Coherence
+#
+
+# %%
+topic_words = []
+for topic_id in topic_model.get_topic_info()["Topic"]:
+    # Skip outlier topic
+    if topic_id < 0:
+        continue
+
+    words = [word for word, _ in topic_model.get_topic(topic_id)]
+    topic_words.append(words)
+
+# Compute mean pairwise cosine similarity for each topic
+coherence_scores = []
+for words in topic_words:
+    coherence_embeddings = embedding_model.encode(words)
+    sim_matrix = cosine_similarity(coherence_embeddings)
+
+    # Ignore self-similarity
+    np.fill_diagonal(sim_matrix, 0)
+    mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
+    coherence_scores.append(mean_sim)
+
+overall_coherence = np.mean(coherence_scores)
+
+print(len(reviews), "reviews processed")
+print(len(topic_model.get_topic_info()) - 1, "topics found")
+print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
+
+# %% [markdown]
+# ### Topic Coherence
+#
+
+# %%
+# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
+
+# This will most likely crash your PC
+this_will_crash_your_pc_are_you_sure = False
+if this_will_crash_your_pc_are_you_sure:
+    # Preprocess Documents
+    documents = pd.DataFrame(
+        {"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
+    )
+    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
+        {"Document": " ".join}
+    )
+    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
+
+    # Extract vectorizer and analyzer from BERTopic
+    vectorizer = topic_model.vectorizer_model
+    analyzer = vectorizer.build_analyzer()
+
+    # Extract features for Topic Coherence evaluation
+    words = vectorizer.get_feature_names_out()
+    tokens = [analyzer(doc) for doc in cleaned_docs]
+    dictionary = corpora.Dictionary(tokens)
+    corpus = [dictionary.doc2bow(token) for token in tokens]
+
+    for topic_id in topic_model.get_topic_info()["Topic"]:
+        # Skip outlier topic
+        if topic_id < 0:
+            continue
+
+        words = [word for word, _ in topic_model.get_topic(topic_id)]
+        topic_words.append(words)
+
+    # %env TOKENIZERS_PARALLELISM=false
+
+    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
+        coherence_model = CoherenceModel(
+            topics=topic_words,
+            texts=tokens,
+            corpus=corpus,
+            dictionary=dictionary,
+            coherence=measurement,
+        )
+        coherence_score = coherence_model.get_coherence()
+        print(f"Coherence ({measurement}): {coherence_score:.4f}")
+
+# %% [markdown]
+# ### Term Search
+#
+
+# %%
+search_term = "lempuyang"
+
+similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
+for i in range(len(similar_topics)):
+    print(
+        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])['CustomName'][0]}"
+    )
+
+# %%
+# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
+# Calculate the topic distributions on a token-level
+
+if CALCULATE_TOKEN_DISTRIBUTIONS:
+    topic_distr, topic_token_distr = topic_model.approximate_distribution(
+        reviews, calculate_tokens=True, use_embedding_model=True
+    )
+
+# %%
+# Visualize the token-level distributions
+if CALCULATE_TOKEN_DISTRIBUTIONS:
+    DOC_INDEX = 1
+    df = topic_model.visualize_approximate_distribution(
+        reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
+    )
+    df
+
+# %% [markdown]
+# ### Topic Hierarchy
+#
+
+# %%
+topic_model.visualize_hierarchy(custom_labels=True)
+
+# %%
+hierarchical_topics = topic_model.hierarchical_topics(reviews)
+tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
+print(tree)
+
+# %% [markdown]
+# ### Intertopic Distance Map
+#
+
+# %%
+topic_model.visualize_topics(use_ctfidf=True)
+
+# %% [markdown]
+# ### Topic Word Scores
+#
+
+# %%
+topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
@@ -0,0 +1,290 @@
+[
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.498,
+      "diversity": 1.0,
+      "combined_score": 0.6486
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.498,
+      "diversity": 1.0,
+      "combined_score": 0.6486
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4915,
+      "diversity": 0.9666,
+      "combined_score": 0.634
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4915,
+      "diversity": 0.9666,
+      "combined_score": 0.634
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4531,
+      "diversity": 0.975,
+      "combined_score": 0.6096
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4531,
+      "diversity": 0.975,
+      "combined_score": 0.6096
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4617,
+      "diversity": 0.95,
+      "combined_score": 0.6082
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4617,
+      "diversity": 0.95,
+      "combined_score": 0.6082
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4287,
+      "diversity": 1.0,
+      "combined_score": 0.6001
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4287,
+      "diversity": 1.0,
+      "combined_score": 0.6001
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.427,
+      "diversity": 1.0,
+      "combined_score": 0.5989
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.427,
+      "diversity": 1.0,
+      "combined_score": 0.5989
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4462,
+      "diversity": 0.925,
+      "combined_score": 0.5898
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4462,
+      "diversity": 0.925,
+      "combined_score": 0.5898
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4456,
+      "diversity": 0.925,
+      "combined_score": 0.5894
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4456,
+      "diversity": 0.925,
+      "combined_score": 0.5894
+    }
+  }
+]
@@ -131,3 +131,4 @@ spacy
 nbconvert
 jupytext
 datamapplot
+wordcloud
@@ -2,22 +2,10 @@
 # -*- coding: utf-8 -*-

 """
-Generate 300–1000+ English interview questions targeted ONLY at culturally/spiritually
-interested Bali tourists (Lead Users), covering 5 cognitive destination image dimensions:
- Natural Attractions
- Atmosphere
- Social Environment
- Infrastructure
- Value for Money
-
-Key constraint:
- Every prompt must be meaningful for culture/spirituality-first travelers.
- Avoid party/shopping/hedonistic positioning.
- Include etiquette, authenticity, sacredness, commodification, meaning-making, reflection.
+Generate trainer prompts

 Outputs:
 - JSONL: {"dimension": "...", "type": "...", "prompt": "...", "tags": [...]}
- or TXT: one prompt per line
 """

 import argparse
@@ -26,6 +14,7 @@ import random
 import re
 from typing import Dict, List, Tuple

+# Cognitive Image Dimensions
 DIMENSIONS = [
    "Natural Attractions",
    "Atmosphere",
@@ -37,7 +26,8 @@ DIMENSIONS = [
 # -----------------------------
 # Segment-specific building blocks
 # -----------------------------
-# Keep places generic (no need to hallucinate specific proper nouns)
+#
+# Intentionally generic, details should come from retrieved context
 NATURE_FOR_MEANING = [
    "rice terraces that feel lived-in rather than staged",
    "waterfalls approached with a quiet, respectful mood",
@@ -145,7 +135,7 @@ CONSTRAINTS = [
        [
            "it's rainy season and flexibility is part of respectful travel",
            "it's very hot and you need a pace that still feels mindful",
-            "visibility is low and your sunrise plan may fail—how do you adapt meaningfully?",
+            "visibility is low and your sunrise plan may fail-how do you adapt meaningfully?",
            "roads feel unsafe, so you prioritize fewer moves and deeper presence",
        ],
    ),
@@ -263,7 +253,7 @@ def tmpl_single_dimension(
 ) -> str:
    return (
        f"{style} your experience with {place_hint} in Bali during {context}. "
-        f"From a {d} perspective, what stands out about {theme}—and why does it matter to you as a culture/spirit-oriented traveler?"
+        f"From a {d} perspective, what stands out about {theme}-and why does it matter to you as a culture/spirit-oriented traveler?"
    )


@@ -295,7 +285,7 @@ def tmpl_marketer_advice(d: str, theme: str, constraint: str, dont_claim: str) -
    return (
        f"If you had to advise a tourism marketer for culturally/spiritually interested travelers: under the constraint '{constraint}', "
        f"what should they understand about {d} (especially {theme})? "
-        f"Also: what is one thing they should NOT claim in messaging because it would feel misleading or disrespectful—e.g., {dont_claim}?"
+        f"Also: what is one thing they should NOT claim in messaging because it would feel misleading or disrespectful-e.g., {dont_claim}?"
    )


@@ -342,7 +332,7 @@ def generate_prompts(
 ) -> List[Dict]:
    rng = random.Random(seed)

-    # Mix of question archetypes, all segment-targeted
+    # Different weights for question archetypes
    types = [
        ("single", 0.24),
        ("laddering", 0.18),
@@ -424,7 +414,7 @@ def generate_prompts(
                "dimension": d,
                "type": "single",
                "prompt": q,
-                "tags": [d, theme, context, "segment:culture-spirit"],
+                "tags": [d, theme, context],
            }
            ok = add_prompt(obj)

@@ -435,7 +425,7 @@ def generate_prompts(
                "dimension": d,
                "type": "laddering",
                "prompt": q,
-                "tags": [d, theme, context, "laddering", "segment:culture-spirit"],
+                "tags": [d, theme, context, "laddering"],
            }
            ok = add_prompt(obj)

@@ -447,7 +437,7 @@ def generate_prompts(
                "dimension": d,
                "type": "contrast",
                "prompt": q,
-                "tags": [d, "contrast", context, "segment:culture-spirit"],
+                "tags": [d, "contrast", context],
            }
            ok = add_prompt(obj)

@@ -459,7 +449,7 @@ def generate_prompts(
                "dimension": f"{d} + {d2}",
                "type": "tradeoff",
                "prompt": q,
-                "tags": [d, d2, "tradeoff", c_key, "segment:culture-spirit"],
+                "tags": [d, d2, "tradeoff", c_key],
            }
            ok = add_prompt(obj)

@@ -470,7 +460,7 @@ def generate_prompts(
                "dimension": d,
                "type": "marketer_advice",
                "prompt": q,
-                "tags": [d, theme, "marketer", c_key, "segment:culture-spirit"],
+                "tags": [d, theme, "marketer", c_key],
            }
            ok = add_prompt(obj)

@@ -481,7 +471,7 @@ def generate_prompts(
                "dimension": d,
                "type": "etiquette",
                "prompt": q,
-                "tags": [d, "etiquette", topic, context, "segment:culture-spirit"],
+                "tags": [d, "etiquette", topic, context],
            }
            ok = add_prompt(obj)

@@ -493,7 +483,7 @@ def generate_prompts(
                "dimension": d,
                "type": "route_design",
                "prompt": q,
-                "tags": [d, "route", c_key, "segment:culture-spirit"],
+                "tags": [d, "route", c_key],
            }
            ok = add_prompt(obj)

@@ -524,7 +514,7 @@ def main():
        "--n",
        type=int,
        default=600,
-        help="Number of prompts to generate (300–1000 recommended).",
+        help="Number of prompts to generate.",
    )
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--out", default="culture_spirit_interview_prompts.jsonl")
@@ -1,187 +1,455 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+RAFT dataset builder (FAISS-based retrieval) -> Together.ai chat JSONL.
+
+Inputs (from your indexing script):
+- <index_dir>/faiss.index
+- <index_dir>/docstore.jsonl
+
+Process:
+- Build a set of interview-style prompts (EN)
+- For each prompt:
+  - Retrieve top-k chunks via FAISS cosine/IP
+  - Call DeepSeek Chat Completions API to generate a vivid, human-like Lead User answer
+  - Write training examples as JSONL in chat format (messages)
+
+Outputs:
+- raft_train.jsonl
+- raft_val.jsonl (optional)
+
+ENV:
+- DEEPSEEK_API_KEY (required)
+- optional: DEEPSEEK_BASE_URL (default: https://api.deepseek.com)
+- optional: DEEPSEEK_MODEL (default: deepseek-chat)
+"""
+
 import argparse
 import json
 import os
 import random
+import re
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple

 import faiss
 import numpy as np
-import torch
+import requests
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-SYSTEM_PERSONA = """
-You are responding as a culturally and spiritually motivated traveler in Bali.
-
-Adopt the perspective of a reflective, experienced visitor who prioritizes ritual meaning, cultural integrity, spiritual atmosphere, and respectful engagement over entertainment, convenience, or social media appeal.
-
-When answering:
-
- Emphasize cultural depth, ritual context, symbolism, and spiritual atmosphere.
- Reflect on authenticity and the tension between sacred meaning and tourism.
- Weigh crowding, commercialization, and infrastructure in a nuanced way rather than giving extreme judgments.
- Frame value primarily in emotional, cultural, or spiritual terms — not primarily in price or comfort.
- Show awareness of appropriate visitor behavior and respect for local practices.
- Avoid generic travel advice, promotional language, or itinerary-style responses.
- Write in a thoughtful, first-person perspective.
- Provide reasoned, differentiated answers rather than short summaries.
- Do not list bullet points unless explicitly asked.
- Keep answers focused on the question.
-
-Maintain consistency with this identity across all responses.
-"""
-
-TRAINER_PROMPT = "Create ONE realistic question from the perspective of a touristic marketer they might ask a culturally and spiritually interested traveler in Bali considered to be a lead user that can be answered using ONLY the CONTEXT.\n\n"


-def load_docstore(path):
-    docs = []
+# -----------------------------
+# DeepSeek client (OpenAI-compatible)
+# -----------------------------
+@dataclass
+class DeepSeekConfig:
+    api_key: str
+    base_url: str = "https://api.deepseek.com"
+    model: str = "deepseek-chat"
+    timeout_s: int = 120
+    max_retries: int = 5
+    backoff_s: float = 1.6
+
+
+class DeepSeekClient:
+    def __init__(self, cfg: DeepSeekConfig):
+        self.cfg = cfg
+
+    def chat(
+        self, messages: List[Dict], temperature: float = 0.85, max_tokens: int = 750
+    ) -> str:
+        url = f"{self.cfg.base_url}/chat/completions"
+        headers = {
+            "Authorization": f"Bearer {self.cfg.api_key}",
+            "Content-Type": "application/json",
+        }
+        payload = {
+            "model": self.cfg.model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+
+        last_err = None
+        for attempt in range(self.cfg.max_retries):
+            try:
+                r = requests.post(
+                    url, headers=headers, json=payload, timeout=self.cfg.timeout_s
+                )
+                if r.status_code == 429:
+                    time.sleep(self.cfg.backoff_s ** (attempt + 1))
+                    continue
+                r.raise_for_status()
+                data = r.json()
+                return data["choices"][0]["message"]["content"].strip()
+            except Exception as e:
+                last_err = e
+                time.sleep(self.cfg.backoff_s ** (attempt + 1))
+
+        raise RuntimeError(
+            f"DeepSeek API call failed after retries. Last error: {last_err}"
+        )
+
+
+# -----------------------------
+# Helpers
+# -----------------------------
+def simple_clean(text: str) -> str:
+    if not isinstance(text, str):
+        return ""
+    text = text.replace("\u00a0", " ")
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
+
+def read_docstore(docstore_path: str) -> Dict[int, Dict]:
+    """
+    Returns dict: faiss_id -> {"doc_id": int, "text": str, ...}
+    """
+    mapping: Dict[int, Dict] = {}
+    with open(docstore_path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            obj = json.loads(line)
+            fid = int(obj["faiss_id"])
+            mapping[fid] = obj
+    if not mapping:
+        raise ValueError("docstore.jsonl is empty or unreadable.")
+    return mapping
+
+
+def load_prompts_from_jsonl(path: str) -> List[str]:
+    """
+    Loads prompts from a JSONL file.
+    Expected key: 'prompt' (preferred). Also accepts 'question' or 'text'.
+    Skips blank lines and prompts shorter than 20 characters after whitespace cleanup.
+    """
+    prompts: List[str] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
-            docs.append(json.loads(line))
-    return docs
+            line = line.strip()
+            if not line:
+                continue
+            obj = json.loads(line)
+            p = obj.get("prompt") or obj.get("question") or obj.get("text")
+            p = simple_clean(p) if p else ""
+            if len(p) >= 20:
+                prompts.append(p)
+    if not prompts:
+        raise ValueError(f"No prompts found in JSONL: {path}")
+    return prompts


-def retrieve(index, embedder, query, top_k=6):
-    q = embedder.encode([query], normalize_embeddings=True).astype(np.float32)
-    scores, ids = index.search(q, top_k)
-    return ids[0].tolist(), scores[0].tolist()
+def load_prompts_from_txt(path: str) -> List[str]:
+    """
+    Loads prompts from a TXT file (one prompt per line).
+    """
+    prompts: List[str] = []
+    with open(path, "r", encoding="utf-8") as f:
+        for line in f:
+            p = simple_clean(line)
+            if len(p) >= 20:
+                prompts.append(p)
+    if not prompts:
+        raise ValueError(f"No prompts found in TXT: {path}")
+    return prompts


-@torch.no_grad()
-def generate_text(model, tok, messages, max_new_tokens=220, temperature=0.7):
-    # Using tokenizer chat template where available
-    enc = tok.apply_chat_template(
-        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+def ensure_dir_for_file(path: str):
+    d = os.path.dirname(path)
+    if d:
+        os.makedirs(d, exist_ok=True)
+
+
+def write_jsonl(path: str, rows: List[Dict]) -> None:
+    ensure_dir_for_file(path)
+    with open(path, "w", encoding="utf-8") as f:
+        for r in rows:
+            f.write(json.dumps(r, ensure_ascii=False) + "\n")
+
+
+# -----------------------------
+# Persona + prompt templates (EN)
+# -----------------------------
+IMAGE_DIMS = [
+    "Natural Attractions",
+    "Atmosphere",
+    "Social Environment",
+    "Infrastructure",
+    "Value for Money",
+]
+
+DEFAULT_PROMPTS_EN = [
+    # Natural Attractions
+    "In a lead user interview: what natural places in Bali felt genuinely memorable to you (rice terraces, volcanoes, waterfalls, coast), and why? Describe it like a lived experience.",
+    "Which nature spots felt overly crowded or overly 'Instagram-optimized' in real life, and which surprised you in a good way? Explain with concrete moments.",
+    # Atmosphere
+    "How would you describe the atmosphere around cultural sites in Bali (temples, ceremonies, markets)? What signals authenticity vs. commercialization to you?",
+    "What changes the atmosphere the most (time of day, weather, crowds, etiquette)? Share specific examples you would tell a marketer.",
+    # Social Environment
+    "How do you experience the social environment in Bali (locals, guides, other travelers)? What feels respectful and what feels performative or touristy?",
+    "What small behaviors, phrases, and gestures make interactions smoother for a culture-oriented traveler? Give examples.",
+    # Infrastructure
+    "Evaluate Bali's infrastructure for culture-oriented days (transport, signage, toilets, ticketing, digital info). What works, what annoys you, and how do you adapt?",
+    "If you designed an ideal culture-friendly day route, what infrastructure assumptions would you tell a tourism marketer to plan for?",
+    # Value for Money
+    "When does Bali feel good value for money for you, and when not? Discuss entrance fees, guides, food, tours, and hidden costs.",
+    "How do you personally distinguish 'good value' from a tourist trap? List criteria and illustrate with examples.",
+]
+
+
+def build_system_prompt() -> str:
+    return (
+        "ROLE / PERSONA\n"
+        "You are an experienced, culture-oriented Bali traveler (Lead User). You speak in natural, vivid English, "
+        "as a real person in an interview. You share nuanced judgments, trade-offs, and concrete scenes.\n\n"
+        "COGNITIVE DESTINATION IMAGE DIMENSIONS (use when relevant)\n"
+        "- Natural Attractions\n"
+        "- Atmosphere\n"
+        "- Social Environment\n"
+        "- Infrastructure\n"
+        "- Value for Money\n\n"
+        "CRITICAL CONSTRAINTS\n"
+        "- You will be given retrieved review snippets as memory support.\n"
+        "- Do NOT quote them verbatim and do NOT cite them as 'the review says'.\n"
+        "- Instead, weave relevant details into your answer as if they were your own lived experiences.\n"
+        "- If a detail is not supported, stay general or hedge appropriately.\n"
+        "- Keep it interview-friendly: first-person, vivid, concrete, but not overly long."
    )

-    if isinstance(enc, torch.Tensor):
-        input_ids = enc.to(model.device)
-        attention_mask = torch.ones_like(input_ids, device=model.device)
-    else:
-        input_ids = enc["input_ids"].to(model.device)
-        attention_mask = enc.get("attention_mask")
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
-        attention_mask = attention_mask.to(model.device)

-    out = model.generate(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        top_p=0.9,
-        eos_token_id=tok.eos_token_id,
-        pad_token_id=tok.pad_token_id,
+def build_user_message(question: str, retrieved_chunks: List[str]) -> str:
+    retrieved_chunks = [simple_clean(x) for x in retrieved_chunks if simple_clean(x)]
+    bullets = "\n".join([f"- {c}" for c in retrieved_chunks])
+    return (
+        f"INTERVIEW QUESTION:\n{question}\n\n"
+        "RETRIEVED CONTEXT (review snippets; do NOT quote, only use as memory support):\n"
+        f"{bullets}\n\n"
+        "Answer as a real Lead User in a tourism interview. Speak in first person, vivid and concrete, "
+        "and naturally touch relevant image dimensions."
    )
-    return tok.decode(out[0][input_ids.shape[1] :], skip_special_tokens=True).strip()


+# -----------------------------
+# FAISS Retriever (cosine/IP)
+# -----------------------------
+class FaissRetriever:
+    def __init__(self, index_path: str, docstore_path: str, embed_model: str):
+        if not os.path.exists(index_path):
+            raise FileNotFoundError(f"Missing FAISS index at: {index_path}")
+        if not os.path.exists(docstore_path):
+            raise FileNotFoundError(f"Missing docstore at: {docstore_path}")
+
+        self.index = faiss.read_index(index_path)
+        self.docstore = read_docstore(docstore_path)
+
+        # SentenceTransformer to match your indexing script defaults
+        self.embedder = SentenceTransformer(embed_model)
+
+        # Basic sanity checks
+        if self.index.ntotal != len(self.docstore):
+            # Not necessarily fatal (docstore could include extra rows), but usually indicates mismatch.
+            # We'll allow it but warn.
+            print(
+                f"Warning: index.ntotal={self.index.ntotal} but docstore rows={len(self.docstore)}. "
+                "Ensure they were generated together."
+            )
+
+    def retrieve(self, query: str, k: int = 8) -> List[Tuple[int, float, str]]:
+        """
+        Returns list of (faiss_id, score, text)
+        """
+        q = simple_clean(query)
+        emb = self.embedder.encode([q], normalize_embeddings=True)
+        emb = np.asarray(emb, dtype=np.float32)
+
+        scores, ids = self.index.search(emb, k)
+        ids = ids[0].tolist()
+        scores = scores[0].tolist()
+
+        out = []
+        for fid, sc in zip(ids, scores):
+            if fid == -1:
+                continue
+            doc = self.docstore.get(int(fid))
+            if not doc:
+                continue
+            out.append((int(fid), float(sc), doc.get("text", "")))
+        return out
+
+
+# -----------------------------
+# Dataset generation
+# -----------------------------
 def main():
    ap = argparse.ArgumentParser()
-    ap.add_argument("--out_dir", default="out")
+    ap.add_argument(
+        "--index_dir",
+        default="out",
+        help="Directory containing faiss.index and docstore.jsonl",
+    )
+    ap.add_argument("--out_train", default="./out/raft_train.jsonl")
+    ap.add_argument("--out_val", default="./out/raft_val.jsonl")
+    ap.add_argument("--make_val", action="store_true")
+    ap.add_argument("--val_ratio", type=float, default=0.05)
+    ap.add_argument("--k", type=int, default=8)
+    ap.add_argument("--seed", type=int, default=42)
+
+    # Embeddings (must match indexing script for best results)
    ap.add_argument(
        "--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
    )
-    ap.add_argument("--teacher_model", default="mistralai/Mistral-7B-Instruct-v0.2")
-    ap.add_argument("--n_examples", type=int, default=5000)
-    ap.add_argument("--top_k", type=int, default=6)
-    ap.add_argument("--n_distractors", type=int, default=3)
-    ap.add_argument("--seed", type=int, default=42)
+
+    # External prompt sources
+    ap.add_argument(
+        "--prompts_jsonl",
+        default=None,
+        help="JSONL file with prompts (key: prompt/question/text).",
+    )
+    ap.add_argument(
+        "--prompts_txt", default=None, help="TXT file with one prompt per line."
+    )
+    ap.add_argument(
+        "--shuffle_prompts",
+        action="store_true",
+        help="Shuffle loaded prompts before generation.",
+    )
+    ap.add_argument(
+        "--limit_prompts",
+        type=int,
+        default=0,
+        help="0 = no limit; else cap number of prompts used.",
+    )
+
+    # DeepSeek generation config
+    ap.add_argument(
+        "--deepseek_base_url",
+        default=os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com"),
+    )
+    ap.add_argument(
+        "--deepseek_model", default=os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
+    )
+    ap.add_argument("--temperature", type=float, default=0.85)
+    ap.add_argument("--max_tokens", type=int, default=750)
+    ap.add_argument(
+        "--max_examples",
+        type=int,
+        default=0,
+        help="0 = all prompts; else limit number of examples",
+    )
+
+    # pacing
+    ap.add_argument("--sleep_s", type=float, default=0.2)
+
    args = ap.parse_args()
-
    random.seed(args.seed)
+    np.random.seed(args.seed)

-    faiss_path = os.path.join(args.out_dir, "faiss.index")
-    docstore_path = os.path.join(args.out_dir, "docstore.jsonl")
+    api_key = os.environ.get("DEEPSEEK_API_KEY", "").strip()
+    if not api_key:
+        raise SystemExit("Missing DEEPSEEK_API_KEY env var.")

-    index = faiss.read_index(faiss_path)
-    docstore = load_docstore(docstore_path)
+    index_path = os.path.join(args.index_dir, "faiss.index")
+    docstore_path = os.path.join(args.index_dir, "docstore.jsonl")

-    embedder = SentenceTransformer(args.embedding_model)
-
-    # Teacher model to synthesize questions & answers from review chunks
-    tok = AutoTokenizer.from_pretrained(args.teacher_model, use_fast=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        args.teacher_model, torch_dtype=torch.float16, device_map="auto"
+    retriever = FaissRetriever(
+        index_path=index_path,
+        docstore_path=docstore_path,
+        embed_model=args.embedding_model,
    )
-    model.eval()

-    out_path = os.path.join(args.out_dir, "raft_train.jsonl")
-    with open(out_path, "w", encoding="utf-8") as f:
-        for _ in tqdm(range(args.n_examples), desc="Generating RAFT examples"):
-            # pick a "gold" chunk
-            gold = random.choice(docstore)
-            gold_text = gold["text"]
+    client = DeepSeekClient(
+        DeepSeekConfig(
+            api_key=api_key,
+            base_url=args.deepseek_base_url,
+            model=args.deepseek_model,
+        )
+    )

-            # 1) generate a question answerable from gold_text
-            q_prompt = [
-                {"role": "system", "content": SYSTEM_PERSONA},
-                {
-                    "role": "user",
-                    "content": TRAINER_PROMPT + f"CONTEXT:\n{gold_text}\n\n"
-                    "Return only the question.",
-                },
+    system_prompt = build_system_prompt()
+
+    # Load prompts from a single source: --prompts_jsonl or --prompts_txt (mutually exclusive), else the built-in defaults
+    if args.prompts_jsonl and args.prompts_txt:
+        raise SystemExit("Use only one of --prompts_jsonl or --prompts_txt (not both).")
+
+    if args.prompts_jsonl:
+        prompts = load_prompts_from_jsonl(args.prompts_jsonl)
+    elif args.prompts_txt:
+        prompts = load_prompts_from_txt(args.prompts_txt)
+    else:
+        prompts = list(DEFAULT_PROMPTS_EN)
+
+    if args.shuffle_prompts:
+        random.shuffle(prompts)
+
+    if args.limit_prompts and args.limit_prompts > 0:
+        prompts = prompts[: args.limit_prompts]
+
+    # Backwards-compat: args.max_examples can still cap prompts
+    if args.max_examples and args.max_examples > 0:
+        prompts = prompts[: args.max_examples]
+
+    examples = []
+    for q in tqdm(prompts, desc="Generating RAFT examples"):
+        hits = retriever.retrieve(q, k=args.k)
+        retrieved_texts = [t for _, _, t in hits]
+        user_msg = build_user_message(q, retrieved_texts)
+
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_msg},
        ]
-            question = generate_text(
-                model, tok, q_prompt, max_new_tokens=60, temperature=0.8
-            )
-            question = question.split("\n")[0].strip()

-            # 2) retrieve top-k for that question
-            ids, _ = retrieve(index, embedder, question, top_k=args.top_k)
-            retrieved = [docstore[i] for i in ids]
-
-            # 3) add distractors (random docs not in retrieved)
-            retrieved_ids = set(ids)
-            distractors = []
-            attempts = 0
-            while len(distractors) < args.n_distractors and attempts < 50:
-                cand_idx = random.randrange(len(docstore))
-                attempts += 1
-                if cand_idx in retrieved_ids:
-                    continue
-                distractors.append(docstore[cand_idx])
-
-            # Mix: retrieved + distractors
-            context_docs = retrieved + distractors
-            random.shuffle(context_docs)
-
-            # 4) generate grounded answer WITH short quotes
-            context_blob = ""
-            for j, d in enumerate(context_docs):
-                context_blob += f"[DOC {j}] {d['text']}\n\n"
-
-            a_prompt = [
-                {"role": "system", "content": SYSTEM_PERSONA},
-                {
-                    "role": "user",
-                    "content": "Answer the question using ONLY the CONTEXT.\n"
-                    "Rules:\n"
-                    "- Include 1–2 short direct quotes from CONTEXT as evidence.\n"
-                    "- If the answer isn't supported, say you can't tell from the context.\n\n"
-                    f"QUESTION: {question}\n\nCONTEXT:\n{context_blob}",
-                },
-            ]
-            answer = generate_text(
-                model, tok, a_prompt, max_new_tokens=260, temperature=0.6
+        answer = client.chat(
+            messages=messages,
+            temperature=args.temperature,
+            max_tokens=args.max_tokens,
        )

-            # Final training example (conversational dataset format for TRL)
-            train_ex = {
+        ex = {
            "messages": [
-                    {"role": "system", "content": SYSTEM_PERSONA},
-                    {
-                        "role": "user",
-                        "content": f"QUESTION: {question}\n\nCONTEXT:\n{context_blob}",
-                    },
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": answer},
-                ]
+            ],
+            "meta": {
+                "retrieval_k": args.k,
+                "index_dir": os.path.abspath(args.index_dir),
+                "embedding_model": args.embedding_model,
+                "image_dimensions": IMAGE_DIMS,
+                "faiss_ids": [fid for fid, _, _ in hits],
+                "faiss_scores": [sc for _, sc, _ in hits],
+            },
        }
-            f.write(json.dumps(train_ex, ensure_ascii=False) + "\n")
+        examples.append(ex)

-    print(f"Wrote {out_path}")
+        if args.max_examples and len(examples) >= args.max_examples:
+            break
+
+        time.sleep(max(0.0, args.sleep_s))
+
+    random.shuffle(examples)
+
+    if args.make_val and len(examples) >= 20:
+        val_n = max(1, int(len(examples) * args.val_ratio))
+        val = examples[:val_n]
+        train = examples[val_n:]
+        write_jsonl(args.out_train, train)
+        write_jsonl(args.out_val, val)
+        print(f"Wrote train: {args.out_train} ({len(train)} examples)")
+        print(f"Wrote val:   {args.out_val} ({len(val)} examples)")
+    else:
+        write_jsonl(args.out_train, examples)
+        print(f"Wrote: {args.out_train} ({len(examples)} examples)")
+        if args.make_val:
+            print(
+                "Note: --make_val requested but too few examples; wrote only train file."
+            )


 if __name__ == "__main__":
@@ -1,456 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-RAFT dataset builder (FAISS-based retrieval) -> Together.ai chat JSONL.
-
-Inputs (from your indexing script):
- <index_dir>/faiss.index
- <index_dir>/docstore.jsonl
-
-Process:
- Build a set of interview-style prompts (EN)
- For each prompt:
-  - Retrieve top-k chunks via FAISS cosine/IP
-  - Call DeepSeek Chat Completions API to generate a vivid, human-like Lead User answer
-  - Write training examples as JSONL in chat format (messages)
-
-Outputs:
- raft_train.jsonl
- raft_val.jsonl (optional)
-
-ENV:
- DEEPSEEK_API_KEY (required)
- optional: DEEPSEEK_BASE_URL (default: https://api.deepseek.com)
- optional: DEEPSEEK_MODEL (default: deepseek-chat)
-"""
-
-import argparse
-import json
-import os
-import random
-import re
-import time
-from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
-
-import faiss
-import numpy as np
-import requests
-from sentence_transformers import SentenceTransformer
-from tqdm import tqdm
-
-
-# -----------------------------
-# DeepSeek client (OpenAI-compatible)
-# -----------------------------
-@dataclass
-class DeepSeekConfig:
-    api_key: str
-    base_url: str = "https://api.deepseek.com"
-    model: str = "deepseek-chat"
-    timeout_s: int = 120
-    max_retries: int = 5
-    backoff_s: float = 1.6
-
-
-class DeepSeekClient:
-    def __init__(self, cfg: DeepSeekConfig):
-        self.cfg = cfg
-
-    def chat(
-        self, messages: List[Dict], temperature: float = 0.85, max_tokens: int = 750
-    ) -> str:
-        url = f"{self.cfg.base_url}/chat/completions"
-        headers = {
-            "Authorization": f"Bearer {self.cfg.api_key}",
-            "Content-Type": "application/json",
-        }
-        payload = {
-            "model": self.cfg.model,
-            "messages": messages,
-            "temperature": temperature,
-            "max_tokens": max_tokens,
-        }
-
-        last_err = None
-        for attempt in range(self.cfg.max_retries):
-            try:
-                r = requests.post(
-                    url, headers=headers, json=payload, timeout=self.cfg.timeout_s
-                )
-                if r.status_code == 429:
-                    time.sleep(self.cfg.backoff_s ** (attempt + 1))
-                    continue
-                r.raise_for_status()
-                data = r.json()
-                return data["choices"][0]["message"]["content"].strip()
-            except Exception as e:
-                last_err = e
-                time.sleep(self.cfg.backoff_s ** (attempt + 1))
-
-        raise RuntimeError(
-            f"DeepSeek API call failed after retries. Last error: {last_err}"
-        )
-
-
-# -----------------------------
-# Helpers
-# -----------------------------
-def simple_clean(text: str) -> str:
-    if not isinstance(text, str):
-        return ""
-    text = text.replace("\u00a0", " ")
-    text = re.sub(r"\s+", " ", text).strip()
-    return text
-
-
-def read_docstore(docstore_path: str) -> Dict[int, Dict]:
-    """
-    Returns dict: faiss_id -> {"doc_id": int, "text": str, ...}
-    """
-    mapping: Dict[int, Dict] = {}
-    with open(docstore_path, "r", encoding="utf-8") as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                continue
-            obj = json.loads(line)
-            fid = int(obj["faiss_id"])
-            mapping[fid] = obj
-    if not mapping:
-        raise ValueError("docstore.jsonl is empty or unreadable.")
-    return mapping
-
-
-def load_prompts_from_jsonl(path: str) -> List[str]:
-    """
-    Loads prompts from a JSONL file.
-    Expected key: 'prompt' (preferred). Also accepts 'question' or 'text'.
-    Ignores empty/short lines.
-    """
-    prompts: List[str] = []
-    with open(path, "r", encoding="utf-8") as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                continue
-            obj = json.loads(line)
-            p = obj.get("prompt") or obj.get("question") or obj.get("text")
-            p = simple_clean(p) if p else ""
-            if len(p) >= 20:
-                prompts.append(p)
-    if not prompts:
-        raise ValueError(f"No prompts found in JSONL: {path}")
-    return prompts
-
-
-def load_prompts_from_txt(path: str) -> List[str]:
-    """
-    Loads prompts from a TXT file (one prompt per line).
-    """
-    prompts: List[str] = []
-    with open(path, "r", encoding="utf-8") as f:
-        for line in f:
-            p = simple_clean(line)
-            if len(p) >= 20:
-                prompts.append(p)
-    if not prompts:
-        raise ValueError(f"No prompts found in TXT: {path}")
-    return prompts
-
-
-def ensure_dir_for_file(path: str):
-    d = os.path.dirname(path)
-    if d:
-        os.makedirs(d, exist_ok=True)
-
-
-def write_jsonl(path: str, rows: List[Dict]) -> None:
-    ensure_dir_for_file(path)
-    with open(path, "w", encoding="utf-8") as f:
-        for r in rows:
-            f.write(json.dumps(r, ensure_ascii=False) + "\n")
-
-
-# -----------------------------
-# Persona + prompt templates (EN)
-# -----------------------------
-IMAGE_DIMS = [
-    "Natural Attractions",
-    "Atmosphere",
-    "Social Environment",
-    "Infrastructure",
-    "Value for Money",
-]
-
-DEFAULT_PROMPTS_EN = [
-    # Natural Attractions
-    "In a lead user interview: what natural places in Bali felt genuinely memorable to you (rice terraces, volcanoes, waterfalls, coast), and why? Describe it like a lived experience.",
-    "Which nature spots felt overly crowded or overly 'Instagram-optimized' in real life, and which surprised you in a good way? Explain with concrete moments.",
-    # Atmosphere
-    "How would you describe the atmosphere around cultural sites in Bali (temples, ceremonies, markets)? What signals authenticity vs. commercialization to you?",
-    "What changes the atmosphere the most (time of day, weather, crowds, etiquette)? Share specific examples you would tell a marketer.",
-    # Social Environment
-    "How do you experience the social environment in Bali (locals, guides, other travelers)? What feels respectful and what feels performative or touristy?",
-    "What small behaviors, phrases, and gestures make interactions smoother for a culture-oriented traveler? Give examples.",
-    # Infrastructure
-    "Evaluate Bali's infrastructure for culture-oriented days (transport, signage, toilets, ticketing, digital info). What works, what annoys you, and how do you adapt?",
-    "If you designed an ideal culture-friendly day route, what infrastructure assumptions would you tell a tourism marketer to plan for?",
-    # Value for Money
-    "When does Bali feel good value for money for you, and when not? Discuss entrance fees, guides, food, tours, and hidden costs.",
-    "How do you personally distinguish 'good value' from a tourist trap? List criteria and illustrate with examples.",
-]
-
-
-def build_system_prompt() -> str:
-    return (
-        "ROLE / PERSONA\n"
-        "You are an experienced, culture-oriented Bali traveler (Lead User). You speak in natural, vivid English, "
-        "as a real person in an interview. You share nuanced judgments, trade-offs, and concrete scenes.\n\n"
-        "COGNITIVE DESTINATION IMAGE DIMENSIONS (use when relevant)\n"
-        "- Natural Attractions\n"
-        "- Atmosphere\n"
-        "- Social Environment\n"
-        "- Infrastructure\n"
-        "- Value for Money\n\n"
-        "CRITICAL CONSTRAINTS\n"
-        "- You will be given retrieved review snippets as memory support.\n"
-        "- Do NOT quote them verbatim and do NOT cite them as 'the review says'.\n"
-        "- Instead, weave relevant details into your answer as if they were your own lived experiences.\n"
-        "- If a detail is not supported, stay general or hedge appropriately.\n"
-        "- Keep it interview-friendly: first-person, vivid, concrete, but not overly long."
-    )
-
-
-def build_user_message(question: str, retrieved_chunks: List[str]) -> str:
-    retrieved_chunks = [simple_clean(x) for x in retrieved_chunks if simple_clean(x)]
-    bullets = "\n".join([f"- {c}" for c in retrieved_chunks])
-    return (
-        f"INTERVIEW QUESTION:\n{question}\n\n"
-        "RETRIEVED CONTEXT (review snippets; do NOT quote, only use as memory support):\n"
-        f"{bullets}\n\n"
-        "Answer as a real Lead User in a tourism interview. Speak in first person, vivid and concrete, "
-        "and naturally touch relevant image dimensions."
-    )
-
-
-# -----------------------------
-# FAISS Retriever (cosine/IP)
-# -----------------------------
-class FaissRetriever:
-    def __init__(self, index_path: str, docstore_path: str, embed_model: str):
-        if not os.path.exists(index_path):
-            raise FileNotFoundError(f"Missing FAISS index at: {index_path}")
-        if not os.path.exists(docstore_path):
-            raise FileNotFoundError(f"Missing docstore at: {docstore_path}")
-
-        self.index = faiss.read_index(index_path)
-        self.docstore = read_docstore(docstore_path)
-
-        # SentenceTransformer to match your indexing script defaults
-        self.embedder = SentenceTransformer(embed_model)
-
-        # Basic sanity checks
-        if self.index.ntotal != len(self.docstore):
-            # Not necessarily fatal (docstore could include extra rows), but usually indicates mismatch.
-            # We'll allow it but warn.
-            print(
-                f"Warning: index.ntotal={self.index.ntotal} but docstore rows={len(self.docstore)}. "
-                "Ensure they were generated together."
-            )
-
-    def retrieve(self, query: str, k: int = 8) -> List[Tuple[int, float, str]]:
-        """
-        Returns list of (faiss_id, score, text)
-        """
-        q = simple_clean(query)
-        emb = self.embedder.encode([q], normalize_embeddings=True)
-        emb = np.asarray(emb, dtype=np.float32)
-
-        scores, ids = self.index.search(emb, k)
-        ids = ids[0].tolist()
-        scores = scores[0].tolist()
-
-        out = []
-        for fid, sc in zip(ids, scores):
-            if fid == -1:
-                continue
-            doc = self.docstore.get(int(fid))
-            if not doc:
-                continue
-            out.append((int(fid), float(sc), doc.get("text", "")))
-        return out
-
-
-# -----------------------------
-# Dataset generation
-# -----------------------------
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument(
-        "--index_dir",
-        default="out",
-        help="Directory containing faiss.index and docstore.jsonl",
-    )
-    ap.add_argument("--out_train", default="./out/raft_train.jsonl")
-    ap.add_argument("--out_val", default="./out/raft_val.jsonl")
-    ap.add_argument("--make_val", action="store_true")
-    ap.add_argument("--val_ratio", type=float, default=0.05)
-    ap.add_argument("--k", type=int, default=8)
-    ap.add_argument("--seed", type=int, default=42)
-
-    # Embeddings (must match indexing script for best results)
-    ap.add_argument(
-        "--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
-    )
-
-    # External prompt sources
-    ap.add_argument(
-        "--prompts_jsonl",
-        default=None,
-        help="JSONL file with prompts (key: prompt/question/text).",
-    )
-    ap.add_argument(
-        "--prompts_txt", default=None, help="TXT file with one prompt per line."
-    )
-    ap.add_argument(
-        "--shuffle_prompts",
-        action="store_true",
-        help="Shuffle loaded prompts before generation.",
-    )
-    ap.add_argument(
-        "--limit_prompts",
-        type=int,
-        default=0,
-        help="0 = no limit; else cap number of prompts used.",
-    )
-
-    # DeepSeek generation config
-    ap.add_argument(
-        "--deepseek_base_url",
-        default=os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com"),
-    )
-    ap.add_argument(
-        "--deepseek_model", default=os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
-    )
-    ap.add_argument("--temperature", type=float, default=0.85)
-    ap.add_argument("--max_tokens", type=int, default=750)
-    ap.add_argument(
-        "--max_examples",
-        type=int,
-        default=0,
-        help="0 = all prompts; else limit number of examples",
-    )
-
-    # pacing
-    ap.add_argument("--sleep_s", type=float, default=0.2)
-
-    args = ap.parse_args()
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-
-    api_key = os.environ.get("DEEPSEEK_API_KEY", "").strip()
-    if not api_key:
-        raise SystemExit("Missing DEEPSEEK_API_KEY env var.")
-
-    index_path = os.path.join(args.index_dir, "faiss.index")
-    docstore_path = os.path.join(args.index_dir, "docstore.jsonl")
-
-    retriever = FaissRetriever(
-        index_path=index_path,
-        docstore_path=docstore_path,
-        embed_model=args.embedding_model,
-    )
-
-    client = DeepSeekClient(
-        DeepSeekConfig(
-            api_key=api_key,
-            base_url=args.deepseek_base_url,
-            model=args.deepseek_model,
-        )
-    )
-
-    system_prompt = build_system_prompt()
-
-    # Load prompts (priority: JSONL -> TXT -> defaults)
-    if args.prompts_jsonl and args.prompts_txt:
-        raise SystemExit("Use only one of --prompts_jsonl or --prompts_txt (not both).")
-
-    if args.prompts_jsonl:
-        prompts = load_prompts_from_jsonl(args.prompts_jsonl)
-    elif args.prompts_txt:
-        prompts = load_prompts_from_txt(args.prompts_txt)
-    else:
-        prompts = list(DEFAULT_PROMPTS_EN)
-
-    if args.shuffle_prompts:
-        random.shuffle(prompts)
-
-    if args.limit_prompts and args.limit_prompts > 0:
-        prompts = prompts[: args.limit_prompts]
-
-    # Backwards-compat: args.max_examples can still cap prompts
-    if args.max_examples and args.max_examples > 0:
-        prompts = prompts[: args.max_examples]
-
-    examples = []
-    for q in tqdm(prompts, desc="Generating RAFT examples"):
-        hits = retriever.retrieve(q, k=args.k)
-        retrieved_texts = [t for _, _, t in hits]
-        user_msg = build_user_message(q, retrieved_texts)
-
-        messages = [
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_msg},
-        ]
-
-        answer = client.chat(
-            messages=messages,
-            temperature=args.temperature,
-            max_tokens=args.max_tokens,
-        )
-
-        ex = {
-            "messages": [
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_msg},
-                {"role": "assistant", "content": answer},
-            ],
-            "meta": {
-                "retrieval_k": args.k,
-                "index_dir": os.path.abspath(args.index_dir),
-                "embedding_model": args.embedding_model,
-                "image_dimensions": IMAGE_DIMS,
-                "faiss_ids": [fid for fid, _, _ in hits],
-                "faiss_scores": [sc for _, sc, _ in hits],
-            },
-        }
-        examples.append(ex)
-
-        if args.max_examples and len(examples) >= args.max_examples:
-            break
-
-        time.sleep(max(0.0, args.sleep_s))
-
-    random.shuffle(examples)
-
-    if args.make_val and len(examples) >= 20:
-        val_n = max(1, int(len(examples) * args.val_ratio))
-        val = examples[:val_n]
-        train = examples[val_n:]
-        write_jsonl(args.out_train, train)
-        write_jsonl(args.out_val, val)
-        print(f"Wrote train: {args.out_train} ({len(train)} examples)")
-        print(f"Wrote val:   {args.out_val} ({len(val)} examples)")
-    else:
-        write_jsonl(args.out_train, examples)
-        print(f"Wrote: {args.out_train} ({len(examples)} examples)")
-        if args.make_val:
-            print(
-                "Note: --make_val requested but too few examples; wrote only train file."
-            )
-
-
-if __name__ == "__main__":
-    main()
@@ -106,8 +106,11 @@ def main():
            print(f"\nDoc {i+1} (score: {score:.4f}):\n{doc}")

        messages = [
-            {"role": "system", "content": SYSTEM_PERSONA},
-            {"role": "user", "content": f"QUESTION: {q}\n\nCONTEXT:\n{context_blob}"},
+            # {"role": "system", "content": SYSTEM_PERSONA},
+            {
+                "role": "user",
+                "content": f"PERSONA: {SYSTEM_PERSONA}\n\nQUESTION: {q}\n\nCONTEXT:\n{context_blob}",
+            },
        ]

        if args.no_model:
@@ -1,83 +1,9 @@
-accelerate==1.12.0
-aiohappyeyeballs==2.6.1
-aiohttp==3.13.3
-aiosignal==1.4.0
-annotated-doc==0.0.4
-anyio==4.12.1
-attrs==25.4.0
-bitsandbytes==0.49.2
-certifi==2026.1.4
-charset-normalizer==3.4.4
-click==8.3.1
-cuda-bindings==12.9.4
-cuda-pathfinder==1.3.4
-datasets==4.5.0
-dill==0.4.0
-faiss-cpu==1.13.2
-filelock==3.24.3
-frozenlist==1.8.0
-fsspec==2025.10.0
-h11==0.16.0
-hf-xet==1.2.0
-httpcore==1.0.9
-httpx==0.28.1
-huggingface_hub==1.4.1
-idna==3.11
-Jinja2==3.1.6
-joblib==1.5.3
-markdown-it-py==4.0.0
-MarkupSafe==3.0.3
-mdurl==0.1.2
-mpmath==1.3.0
-multidict==6.7.1
-multiprocess==0.70.18
-networkx==3.6.1
-numpy==2.4.2
-nvidia-cublas-cu12==12.8.4.1
-nvidia-cuda-cupti-cu12==12.8.90
-nvidia-cuda-nvrtc-cu12==12.8.93
-nvidia-cuda-runtime-cu12==12.8.90
-nvidia-cudnn-cu12==9.10.2.21
-nvidia-cufft-cu12==11.3.3.83
-nvidia-cufile-cu12==1.13.1.3
-nvidia-curand-cu12==10.3.9.90
-nvidia-cusolver-cu12==11.7.3.90
-nvidia-cusparse-cu12==12.5.8.93
-nvidia-cusparselt-cu12==0.7.1
-nvidia-nccl-cu12==2.27.5
-nvidia-nvjitlink-cu12==12.8.93
-nvidia-nvshmem-cu12==3.4.5
-nvidia-nvtx-cu12==12.8.90
-packaging==26.0
-pandas==3.0.1
-peft==0.18.1
-propcache==0.4.1
-psutil==7.2.2
-pyarrow==23.0.1
-Pygments==2.19.2
-python-dateutil==2.9.0.post0
-PyYAML==6.0.3
-regex==2026.1.15
-requests==2.32.5
-rich==14.3.2
-safetensors==0.7.0
-scikit-learn==1.8.0
-scipy==1.17.0
-sentence-transformers==5.2.3
-setuptools==82.0.0
-shellingham==1.5.4
-six==1.17.0
-sympy==1.14.0
-threadpoolctl==3.6.0
-tokenizers==0.22.2
-torch==2.10.0
-tqdm==4.67.3
-transformers==5.2.0
-triton==3.6.0
-trl==0.28.0
-typer==0.24.0
-typer-slim==0.24.0
-typing_extensions==4.15.0
-urllib3==2.6.3
-xxhash==3.6.0
-yarl==1.22.0
+faiss-cpu
+numpy
+torch
+pandas
+requests
+tqdm
+sentence-transformers
+transformers
+peft
@@ -1,95 +0,0 @@
-import argparse
-import os
-
-import torch
-from datasets import load_dataset
-from peft import LoraConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from trl import SFTConfig, SFTTrainer
-
-
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument("--train_jsonl", default="out/raft_train.jsonl")
-    ap.add_argument("--base_model", default="mistralai/Mistral-7B-Instruct-v0.2")
-    ap.add_argument("--out_dir", default="out/mistral_balitwin_lora")
-    ap.add_argument("--max_seq_len", type=int, default=2048)
-    ap.add_argument("--batch_size", type=int, default=1)
-    ap.add_argument("--grad_accum", type=int, default=16)
-    ap.add_argument("--lr", type=float, default=2e-4)
-    ap.add_argument("--epochs", type=int, default=1)
-    args = ap.parse_args()
-
-    os.makedirs(args.out_dir, exist_ok=True)
-
-    # QLoRA (4-bit) config
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=(
-            torch.bfloat16 if torch.cuda.is_available() else torch.float16
-        ),
-        bnb_4bit_use_double_quant=True,
-    )
-
-    tokenizer = AutoTokenizer.from_pretrained(args.base_model, use_fast=True)
-
-    model = AutoModelForCausalLM.from_pretrained(
-        args.base_model,
-        device_map="auto",
-        quantization_config=bnb_config,
-        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16,
-    )
-
-    # LoRA adapter config
-    peft_config = LoraConfig(
-        r=16,
-        lora_alpha=32,
-        lora_dropout=0.05,
-        bias="none",
-        task_type="CAUSAL_LM",
-        target_modules=[
-            "q_proj",
-            "k_proj",
-            "v_proj",
-            "o_proj",
-            "gate_proj",
-            "up_proj",
-            "down_proj",
-        ],
-    )
-
-    dataset = load_dataset("json", data_files=args.train_jsonl, split="train")
-
-    training_args = SFTConfig(
-        output_dir=args.out_dir,
-        num_train_epochs=args.epochs,
-        per_device_train_batch_size=args.batch_size,
-        gradient_accumulation_steps=args.grad_accum,
-        learning_rate=args.lr,
-        logging_steps=10,
-        save_steps=200,
-        save_total_limit=2,
-        max_length=args.max_seq_len,
-        bf16=torch.cuda.is_available(),
-        fp16=not torch.cuda.is_available(),
-        report_to=[],
-    )
-
-    trainer = SFTTrainer(
-        model=model,
-        args=training_args,
-        train_dataset=dataset,
-        processing_class=tokenizer,
-        peft_config=peft_config,
-    )
-
-    trainer.train()
-    trainer.save_model(args.out_dir)
-    tokenizer.save_pretrained(args.out_dir)
-
-    print(f"Fertig! LoRA-Adapter gespeichert: {args.out_dir}")
-
-
-if __name__ == "__main__":
-    main()