mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
Cleanup
This commit is contained in:
Binary file not shown.
|
Before Width: | Height: | Size: 21 KiB |
@@ -1,577 +0,0 @@
|
||||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.18.0
|
||||
# kernelspec:
|
||||
# display_name: .venv
|
||||
# language: python
|
||||
# name: python3
|
||||
# ---
|
||||
|
||||
# %% [markdown]
|
||||
# # Topic Detection: Bali Tourist Reviews
|
||||
#
|
||||
|
||||
# %% [markdown]
|
||||
# ## Preparation
|
||||
#
|
||||
# ### Dependency Loading
|
||||
#
|
||||
|
||||
# %%
|
||||
import json
|
||||
import pickle
|
||||
import re
|
||||
|
||||
import gensim.corpora as corpora
|
||||
import nltk
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import spacy
|
||||
from bertopic.representation import KeyBERTInspired
|
||||
from bertopic.vectorizers import ClassTfidfTransformer
|
||||
from gensim.models.coherencemodel import CoherenceModel
|
||||
from hdbscan import HDBSCAN
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from umap import UMAP
|
||||
|
||||
from bertopic import BERTopic
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
||||
nltk.download("stopwords")
|
||||
nltk.download("punkt")
|
||||
nltk.download("wordnet")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Parameters and Tracking
|
||||
#
|
||||
|
||||
# %%
|
||||
RECREATE_MODEL = True
|
||||
RECREATE_REDUCED_MODEL = True
|
||||
PROCESS_DATA = False
|
||||
REDUCE_OUTLIERS = True
|
||||
USE_CONDENSED_MODEL = False
|
||||
|
||||
DATA_SAMPLE_SIZE = -1 # -1 for all data
|
||||
|
||||
# Classical coherence score. Warning: needs swap to not kill your PC
|
||||
CALCULATE_COHERENCE = False
|
||||
|
||||
# Vectorization
|
||||
MIN_DOCUMENT_FREQUENCY = 1
|
||||
MAX_NGRAM = 2
|
||||
|
||||
# HDBSCAN Parameters
|
||||
MIN_TOPIC_SIZE = 200
|
||||
MIN_SAMPLES = 25
|
||||
|
||||
# UMAP Parameters
|
||||
N_NEIGHBORS = 15
|
||||
N_COMPONENTS = 2
|
||||
MIN_DIST = 0.01
|
||||
|
||||
# Topic Modeling
|
||||
TOP_N_WORDS = 10
|
||||
MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip
|
||||
|
||||
# %% [markdown]
|
||||
# ### Data Loading & Preprocessing
|
||||
#
|
||||
|
||||
# %%
|
||||
if DATA_SAMPLE_SIZE != -1:
|
||||
reviews = (
|
||||
pd.read_csv("../data/original/reviews.tab", sep="\t")
|
||||
.sample(n=DATA_SAMPLE_SIZE)
|
||||
.review.dropna()
|
||||
.to_list()
|
||||
)
|
||||
else:
|
||||
reviews = (
|
||||
pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list()
|
||||
)
|
||||
|
||||
print("Loaded {} reviews".format(len(reviews)))
|
||||
|
||||
# %%
|
||||
# List of NE in Bali for NER enhancement
|
||||
with open("../data/supporting/bali_ner.json", "r") as f:
|
||||
bali_places = json.load(f)
|
||||
bali_places_set = set(bali_places)
|
||||
|
||||
# Stop word definition
|
||||
extra_stopwords = ["bali", "idr", "usd"]
|
||||
stop_words = set(stopwords.words("english"))
|
||||
with open("../data/supporting/stopwords-en.json", "r") as f:
|
||||
extra_stopwords.extend(json.load(f))
|
||||
|
||||
# Custom replacements
|
||||
rep = {
|
||||
r"\\n": " ",
|
||||
r"\n": " ",
|
||||
r'\\"': "",
|
||||
r'"': "",
|
||||
"mongkey": "monkey",
|
||||
"monky": "monkey",
|
||||
"verry": "very",
|
||||
}
|
||||
rep = dict((re.escape(k), v) for k, v in rep.items())
|
||||
pattern = re.compile("|".join(rep.keys()))
|
||||
|
||||
lemmatizer = WordNetLemmatizer()
|
||||
|
||||
|
||||
def preprocess(text):
    """Normalize a raw review into a list of lemmatized content tokens.

    Pipeline: lowercase -> custom replacements (typo fixes, escape
    artifacts; module-level ``pattern``/``rep``) -> strip digits and
    non-word characters -> spaCy POS/NER filtering (keep nouns, proper
    nouns, GPE/LOC/FAC entities, and known Bali place names) ->
    lemmatize and drop stopwords / tokens shorter than 3 chars.

    Parameters
    ----------
    text : str
        One raw review.

    Returns
    -------
    list[str]
        Lemmatized, filtered tokens.
    """
    # Step 1: Apply custom replacements (typos, special cases)
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    # Step 2: Clean text (digits and non-word chars become spaces)
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    doc = nlp(text)

    # Step 3: POS tagging and filtering
    filtered_tokens = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"}
        or token.ent_type_ in {"GPE", "LOC", "FAC"}
        or token.text in bali_places_set
    ]

    # Step 4: Lemmatization and stopword removal.
    # extra_stopwords is a (large) list loaded from JSON; build a set once
    # per call so per-token membership checks are O(1) instead of O(n).
    extra = set(extra_stopwords)
    lemmatized_tokens = [
        lemmatizer.lemmatize(w)
        for w in filtered_tokens
        if w not in stop_words and w not in extra and len(w) > 2
    ]

    return lemmatized_tokens
|
||||
|
||||
|
||||
# %%
|
||||
if PROCESS_DATA:
|
||||
print("Processing reviews...")
|
||||
reviews = [preprocess(review) for review in reviews]
|
||||
|
||||
with open("../data/intermediate/processed_texts.pkl", "wb") as f:
|
||||
pickle.dump(reviews, f)
|
||||
else:
|
||||
with open("../data/intermediate/processed_texts.pkl", "rb") as f:
|
||||
reviews = pickle.load(f)
|
||||
reviews = [
|
||||
" ".join(review) if isinstance(review, list) else review
|
||||
for review in reviews
|
||||
]
|
||||
|
||||
print(reviews[:1])
|
||||
|
||||
# %% [markdown]
|
||||
# ### Pre-calculate Embeddings
|
||||
#
|
||||
|
||||
# %%
|
||||
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
embeddings = embedding_model.encode(reviews, show_progress_bar=True)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Model Creation
|
||||
#
|
||||
|
||||
# %% [markdown]
|
||||
# ### Dimensionality Reduction (UMAP)
|
||||
#
|
||||
|
||||
# %%
|
||||
umap_model = UMAP(
|
||||
n_neighbors=N_NEIGHBORS,
|
||||
n_components=N_COMPONENTS,
|
||||
min_dist=MIN_DIST,
|
||||
metric="cosine",
|
||||
low_memory=True,
|
||||
random_state=42,
|
||||
)
|
||||
reduced_embeddings = umap_model.fit_transform(embeddings)
|
||||
|
||||
# %% [markdown]
|
||||
# ### BERTopic Model Creation
|
||||
#
|
||||
|
||||
# %%
|
||||
if RECREATE_MODEL:
|
||||
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
||||
vectorizer_model = CountVectorizer(
|
||||
min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM)
|
||||
)
|
||||
|
||||
representation_model = KeyBERTInspired()
|
||||
hdbscan_model = HDBSCAN(
|
||||
min_cluster_size=MIN_TOPIC_SIZE,
|
||||
min_samples=MIN_SAMPLES,
|
||||
metric="euclidean",
|
||||
cluster_selection_method="eom",
|
||||
gen_min_span_tree=True,
|
||||
prediction_data=True,
|
||||
)
|
||||
|
||||
topic_model = BERTopic(
|
||||
embedding_model=embedding_model,
|
||||
ctfidf_model=ctfidf_model,
|
||||
vectorizer_model=vectorizer_model,
|
||||
umap_model=umap_model,
|
||||
hdbscan_model=hdbscan_model,
|
||||
representation_model=representation_model,
|
||||
verbose=True,
|
||||
calculate_probabilities=True,
|
||||
language="english",
|
||||
top_n_words=TOP_N_WORDS,
|
||||
nr_topics=MAX_TOPICS,
|
||||
)
|
||||
|
||||
topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)
|
||||
|
||||
topic_labels = topic_model.generate_topic_labels(
|
||||
nr_words=3, topic_prefix=True, word_length=15, separator=" - "
|
||||
)
|
||||
topic_model.set_topic_labels(topic_labels)
|
||||
BERTopic.save(topic_model, "output/model.bertopic")
|
||||
else:
|
||||
print("Nevermind, loading existing model")
|
||||
topic_model = BERTopic.load("output/model.bertopic")
|
||||
|
||||
# %% [markdown]
|
||||
# ## Fine Tuning
|
||||
#
|
||||
# ### Topic Condensation
|
||||
#
|
||||
|
||||
# %%
|
||||
if RECREATE_REDUCED_MODEL:
|
||||
done = False
|
||||
iteration = 1
|
||||
while not done:
|
||||
print(f"Iteration {iteration}")
|
||||
iteration += 1
|
||||
similarity_matrix = cosine_similarity(
|
||||
np.array(topic_model.topic_embeddings_)[1:, :]
|
||||
)
|
||||
nothing_to_merge = True
|
||||
|
||||
for i in range(similarity_matrix.shape[0]):
|
||||
for j in range(i + 1, similarity_matrix.shape[1]):
|
||||
sim = similarity_matrix[i, j]
|
||||
if sim > 0.9:
|
||||
nothing_to_merge = False
|
||||
t1, t2 = i, j
|
||||
try:
|
||||
t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
|
||||
t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
|
||||
print(
|
||||
f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
|
||||
)
|
||||
topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
|
||||
|
||||
topic_labels = topic_model.generate_topic_labels(
|
||||
nr_words=3,
|
||||
topic_prefix=True,
|
||||
word_length=15,
|
||||
separator=" - ",
|
||||
)
|
||||
topic_model.set_topic_labels(topic_labels)
|
||||
except Exception as e:
|
||||
print(f"Failed to merge {t1} and {t2}: {e}")
|
||||
if nothing_to_merge:
|
||||
print("No more topics to merge.")
|
||||
done = True
|
||||
|
||||
# BERTopic.save(topic_model, "bertopic/model_reduced.bertopic")
|
||||
elif USE_CONDENSED_MODEL:
|
||||
print("Nevermind, loading existing reduced model")
|
||||
topic_model = BERTopic.load("bertopic/model_reduced.bertopic")
|
||||
else:
|
||||
print("Skipping topic reduction")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Outlier Reduction
|
||||
#
|
||||
|
||||
# %%
|
||||
if REDUCE_OUTLIERS:
|
||||
new_topics = topic_model.reduce_outliers(
|
||||
reviews,
|
||||
topic_model.topics_,
|
||||
probabilities=topic_model.probabilities_,
|
||||
threshold=0.05,
|
||||
strategy="probabilities",
|
||||
)
|
||||
topic_model.update_topics(reviews, topics=new_topics)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Results
|
||||
#
|
||||
# ### Classification
|
||||
#
|
||||
|
||||
# %%
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
# --- config ---
|
||||
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
|
||||
INPUT_PATH = "../data/original/reviews.tab" # TSV with a 'review' column
|
||||
OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
|
||||
OUTPUT_DIR = Path("../raft/corpus")
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
BATCH_SIZE = 60
|
||||
MIN_CHARS = 40
|
||||
SEED = 42
|
||||
|
||||
# --- load data ---
data = pd.read_csv(INPUT_PATH, sep="\t")

# If you already have `reviews` elsewhere, replace the next line with that variable.
# BUGFIX: fill missing values BEFORE casting to str — astype(str) turns NaN
# into the literal string "nan", so the original .astype(str).fillna("") was
# a no-op and "nan" rows leaked into the corpus.
reviews = data["review"].fillna("").astype(str)

# Topic model document info
df = topic_model.get_document_info(reviews)  # assumes the model is already fitted
df["Original"] = reviews.values

# --- filter by topics and length ---
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
filtered["Original"] = filtered["Original"].str.strip()
filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
|
||||
|
||||
# Save an audit CSV
|
||||
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
|
||||
|
||||
# --- deterministic shuffle + write batched corpus files ---
|
||||
total_files = 0
|
||||
total_reviews = 0
|
||||
rng = random.Random(SEED)
|
||||
|
||||
for topic_val, g in filtered.groupby("Topic", sort=True):
|
||||
reviews_list = g["Original"].tolist()
|
||||
|
||||
# deterministic shuffle within topic
|
||||
rng.shuffle(reviews_list)
|
||||
|
||||
# chunk into batches of up to 60
|
||||
for start in range(0, len(reviews_list), BATCH_SIZE):
|
||||
chunk = reviews_list[start : start + BATCH_SIZE]
|
||||
if not chunk:
|
||||
continue
|
||||
|
||||
# simple header for traceability
|
||||
header = (
|
||||
f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
|
||||
)
|
||||
|
||||
lines = [header, ""]
|
||||
for i, txt in enumerate(chunk, 1):
|
||||
lines.append(f"({i}) {txt}")
|
||||
|
||||
part_idx = start // BATCH_SIZE + 1
|
||||
fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
|
||||
(OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
|
||||
|
||||
total_files += 1
|
||||
total_reviews += len(chunk)
|
||||
|
||||
print(
|
||||
f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
|
||||
)
|
||||
print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")
|
||||
|
||||
# %%
|
||||
doc_topic_matrix = probs
|
||||
|
||||
# column names
|
||||
topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]
|
||||
|
||||
# index names
|
||||
docnames = ["Review " + str(i) for i in range(len(reviews))]
|
||||
|
||||
# Make the pandas dataframe
|
||||
df_document_topic = pd.DataFrame(
|
||||
np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
|
||||
)
|
||||
|
||||
# Get dominant topic for each document
|
||||
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
|
||||
df_document_topic["dominant_topic"] = dominant_topic
|
||||
|
||||
|
||||
# Styling
|
||||
def color_stuff(val):
    """Map a topic probability to a CSS color rule for DataFrame styling.

    > 0.1 -> green (strong), > 0.05 -> orange (weak), else grey (negligible).
    """
    if val > 0.1:
        color = "green"
    elif val > 0.05:
        color = "orange"
    else:
        color = "grey"
    # f-string instead of the dated str.format idiom; output is identical.
    return f"color: {color}"
|
||||
|
||||
|
||||
def make_bold(val):
    """Return a CSS font-weight rule: 700 (bold) for values above 0.1, else 400."""
    weight = 700 if val > 0.1 else 400
    # f-string instead of the dated str.format idiom; output is identical.
    return f"font-weight: {weight}"
|
||||
|
||||
|
||||
# Apply Style
|
||||
df_document_topics = (
|
||||
df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
|
||||
)
|
||||
df_document_topics
|
||||
|
||||
# %% [markdown]
|
||||
# ### Document Visualization
|
||||
#
|
||||
|
||||
# %%
|
||||
vis = topic_model.visualize_documents(
|
||||
docs=reviews,
|
||||
reduced_embeddings=reduced_embeddings,
|
||||
custom_labels=True,
|
||||
hide_annotations=True,
|
||||
)
|
||||
vis.write_html("output/visualization.html")
|
||||
vis
|
||||
|
||||
# %% [markdown]
|
||||
# ### Similarity Matrix
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_heatmap()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Info
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.get_topic_info()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Semantic Coherence
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_words = []
|
||||
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||
# Skip outlier topic
|
||||
if topic_id < 0:
|
||||
continue
|
||||
|
||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||
topic_words.append(words)
|
||||
|
||||
# Compute mean pairwise cosine similarity for each topic
|
||||
coherence_scores = []
|
||||
for words in topic_words:
|
||||
coherence_embeddings = embedding_model.encode(words)
|
||||
sim_matrix = cosine_similarity(coherence_embeddings)
|
||||
|
||||
# Ignore self-similarity
|
||||
np.fill_diagonal(sim_matrix, 0)
|
||||
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||
coherence_scores.append(mean_sim)
|
||||
|
||||
overall_coherence = np.mean(coherence_scores)
|
||||
|
||||
print(len(reviews), "reviews processed")
|
||||
print(len(topic_model.get_topic_info()) - 1, "topics found")
|
||||
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Coherence
|
||||
#
|
||||
|
||||
# %%
|
||||
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
|
||||
|
||||
if CALCULATE_COHERENCE:
|
||||
# Preprocess Documents
|
||||
documents = pd.DataFrame(
|
||||
{"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
|
||||
)
|
||||
documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
|
||||
{"Document": " ".join}
|
||||
)
|
||||
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
|
||||
|
||||
# Extract vectorizer and analyzer from BERTopic
|
||||
vectorizer = topic_model.vectorizer_model
|
||||
analyzer = vectorizer.build_analyzer()
|
||||
|
||||
# Extract features for Topic Coherence evaluation
|
||||
words = vectorizer.get_feature_names_out()
|
||||
tokens = [analyzer(doc) for doc in cleaned_docs]
|
||||
dictionary = corpora.Dictionary(tokens)
|
||||
corpus = [dictionary.doc2bow(token) for token in tokens]
|
||||
topic_words = [
|
||||
[words for words, _ in topic_model.get_topic(topic)]
|
||||
for topic in range(len(set(topics)) - 1)
|
||||
]
|
||||
|
||||
# %env TOKENIZERS_PARALLELISM=false
|
||||
|
||||
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
|
||||
coherence_model = CoherenceModel(
|
||||
topics=topic_words,
|
||||
texts=tokens,
|
||||
corpus=corpus,
|
||||
dictionary=dictionary,
|
||||
coherence=measurement,
|
||||
)
|
||||
coherence_score = coherence_model.get_coherence()
|
||||
print(f"Coherence ({measurement}): {coherence_score:.4f}")
|
||||
else:
|
||||
print("Skipping classical coherence calculation")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Term Search
|
||||
#
|
||||
|
||||
# %%
|
||||
search_term = "uluwatu"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for rank, topic_id in enumerate(similar_topics):
    # BUGFIX: the original nested double quotes inside a double-quoted
    # f-string ( ...["CustomName"]... ), which is a SyntaxError on
    # Python < 3.12 (PEP 701). Hoisting the lookup avoids the nesting.
    # str(...)[:5] keeps the original truncated-similarity formatting.
    custom_name = topic_model.get_topic_info(topic_id)["CustomName"][0]
    print(f"{str(similarities[rank])[:5]} {custom_name}")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Hierarchy
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_hierarchy(custom_labels=True)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Intertopic Distance Map
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_topics()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Word Scores
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
|
||||
@@ -1,290 +0,0 @@
|
||||
[
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
}
|
||||
]
|
||||
@@ -1,290 +0,0 @@
|
||||
[
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
}
|
||||
]
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -6,15 +6,9 @@
|
||||
|
||||
_(Perception of natural beauty, cultural substance, historical depth)_
|
||||
|
||||
1. **When you think of Bali, which specific natural or spiritual places embody “authentic cultural depth” for you — and what makes them stand out?**
|
||||
2. What distinguishes a spiritually meaningful temple complex from a purely scenic attraction in your perception?
|
||||
|
||||
2. **What distinguishes a spiritually meaningful temple complex from a purely scenic attraction in your perception?**
|
||||
|
||||
3. **Using Uluwatu or Lempuyang as examples: What elements would need to be communicated for you to perceive them not as “Instagram spots,” but as culturally substantial places?**
|
||||
|
||||
4. **How important is active ritual presence (e.g., ceremonies, offerings, priests) compared to architectural or historical aspects?**
|
||||
|
||||
5. **If you had to choose between Tanah Lot and Ulun Danu Bratan for a reflective, culturally immersive experience, which criteria would guide your decision?**
|
||||
3. If you had to choose between Tanah Lot and Ulun Danu Bratan for a reflective, culturally immersive experience, which criteria would guide your decision?
|
||||
|
||||
---
|
||||
|
||||
@@ -22,15 +16,13 @@ _(Perception of natural beauty, cultural substance, historical depth)_
|
||||
|
||||
_(Emotional quality, spirituality, aesthetic perception, subjective experience)_
|
||||
|
||||
6. **How would you describe the atmosphere of a place where you feel culturally and spiritually aligned? What factors create that feeling?**
|
||||
6. How would you describe the atmosphere of a place where you feel culturally and spiritually aligned? What factors create that feeling?
|
||||
|
||||
7. **To what extent do visitor numbers affect your spiritual experience — and is there a threshold you still consider acceptable?**
|
||||
7. To what extent do visitor numbers affect your spiritual experience — and is there a threshold you still consider acceptable?
|
||||
|
||||
8. **Which timing or contextual conditions (e.g., ceremony days, off-season, sunrise instead of sunset) enhance the cultural intensity of a place for you?**
|
||||
8. Which timing or contextual conditions (e.g., ceremony days, off-season, sunrise instead of sunset) enhance the cultural intensity of a place for you?
|
||||
|
||||
9. **How do you internally reconcile the sacred character of a site with strong touristic staging or commercialization?**
|
||||
|
||||
10. **What would a destination need to do in order to evoke not just visual admiration, but genuine spiritual resonance for you?**
|
||||
9. What would a destination need to do in order to evoke not just visual admiration, but genuine spiritual resonance for you?
|
||||
|
||||
---
|
||||
|
||||
@@ -38,13 +30,13 @@ _(Emotional quality, spirituality, aesthetic perception, subjective experience)_
|
||||
|
||||
_(Local interaction, authenticity, visitor behavior, cultural credibility)_
|
||||
|
||||
11. **What role does interaction with local priests, guides, or community members play in shaping the depth of your experience?**
|
||||
11. What role does interaction with local priests, guides, or community members play in shaping the depth of your experience?
|
||||
|
||||
12. **How do you define appropriate visitor behavior at Balinese temples, and how strongly does this influence your overall perception of the site?**
|
||||
12. How do you define appropriate visitor behavior at Balinese temples, and how strongly does this influence your overall perception of the site?
|
||||
|
||||
13. **If other visitors focus primarily on photography, does that diminish the spiritual quality of the place for you, or can you detach from it?**
|
||||
13. If other visitors focus primarily on photography, does that diminish the spiritual quality of the place for you, or can you detach from it?
|
||||
|
||||
14. **What type of cultural storytelling by locals feels authentic and credible rather than staged for tourism?**
|
||||
14. What type of cultural storytelling by locals feels authentic and credible rather than staged for tourism?
|
||||
|
||||
---
|
||||
|
||||
@@ -52,13 +44,13 @@ _(Local interaction, authenticity, visitor behavior, cultural credibility)_
|
||||
|
||||
_(Accessibility, organization, hygiene standards, information systems)_
|
||||
|
||||
15. **How important are curated background explanations (e.g., symbolism, ritual calendars, historical context) compared to independent exploration?**
|
||||
15. How important are curated background explanations (e.g., symbolism, ritual calendars, historical context) compared to independent exploration?
|
||||
|
||||
16. **Do long waiting times — for example at Lempuyang — affect your perception of a site’s spiritual substance, or do you separate logistical issues from cultural meaning?**
|
||||
16. Do long waiting times — for example at Lempuyang — affect your perception of a site’s spiritual substance, or do you separate logistical issues from cultural meaning?
|
||||
|
||||
17. **Which infrastructural measures (e.g., visitor flow management, limited entry slots, silent zones) would enhance the cultural quality of your experience?**
|
||||
17. Which infrastructural measures (e.g., visitor flow management, limited entry slots, silent zones) would enhance the cultural quality of your experience?
|
||||
|
||||
18. **How should destinations communicate information in order to appeal to spiritually interested travelers without reinforcing mass-tourism dynamics?**
|
||||
18. How should destinations communicate information in order to appeal to spiritually interested travelers without reinforcing mass-tourism dynamics?
|
||||
|
||||
---
|
||||
|
||||
@@ -66,18 +58,18 @@ _(Accessibility, organization, hygiene standards, information systems)_
|
||||
|
||||
_(Perceived value, immaterial benefits, willingness to pay)_
|
||||
|
||||
19. **How do you personally assess the “value” of cultural attractions — in terms of emotional depth, learning outcomes, exclusivity, or something else?**
|
||||
19. How do you personally assess the “value” of cultural attractions — in terms of emotional depth, learning outcomes, exclusivity, or something else?
|
||||
|
||||
20. **Would you be willing to accept higher entrance fees or donations if they demonstrably contribute to preserving religious structures and practices? Why or why not?**
|
||||
20. Would you be willing to accept higher entrance fees or donations if they demonstrably contribute to preserving religious structures and practices? Why or why not?
|
||||
|
||||
21. **What would legitimize a paid cultural experience (e.g., guided participation in a ceremony) for you — and what would make it feel commercialized or inauthentic?**
|
||||
21. What would legitimize a paid cultural experience (e.g., guided participation in a ceremony) for you — and what would make it feel commercialized or inauthentic?
|
||||
|
||||
---
|
||||
|
||||
## VI. Segment Identity & Positioning (Lead-User Perspective)
|
||||
|
||||
22. **How would you describe yourself as a Bali traveler if your primary focus is cultural and spiritual depth?**
|
||||
22. How would you describe yourself as a Bali traveler if your primary focus is cultural and spiritual depth?
|
||||
|
||||
23. **Which typical Bali tourism offerings do you consciously avoid, and why do they not align with your travel philosophy?**
|
||||
23. Which typical Bali tourism offerings do you consciously avoid, and why do they not align with your travel philosophy?
|
||||
|
||||
24. **If a tourism brand wanted to position Bali specifically for culturally and spiritually motivated travelers, which narratives should it emphasize — and which should it avoid?**
|
||||
24. If a tourism brand wanted to position Bali specifically for culturally and spiritually motivated travelers, which narratives should it emphasize — and which should it avoid?
|
||||
|
||||
@@ -18,12 +18,6 @@ python prepare_corpus.py --input_csv ../data/intermediate/culture_reviews.csv --
|
||||
python make_raft_data.py --out_dir out --n_examples 10
|
||||
```
|
||||
|
||||
## Training der QLoRA-Adapter
|
||||
|
||||
```bash
|
||||
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
|
||||
```
|
||||
|
||||
## Inferenz
|
||||
|
||||
### Pre-Merged Modell + Adapter
|
||||
@@ -31,11 +25,3 @@ python make_raft_data.py --out_dir out --n_examples 10
|
||||
```bash
|
||||
python rag_chat_merged.py --model_dir /path/to/model_folder --out_dir out
|
||||
```
|
||||
|
||||
### Per Baseline Mistral 7B + PEFT-Adapter
|
||||
|
||||
Hinweis: das Skript wurde nach wenigen oberflächlichen Evaluationsrunden nicht weiter verwendet, da der beste Kandidat durch einen Merge des Basismodells und seiner PEFT-Adapter beschleunigt werden konnte und dieses Skript nicht länger relevant war.
|
||||
|
||||
```bash
|
||||
python deprecated_rag_chat.py --lora_dir out/mistral_balitwin_lora
|
||||
```
|
||||
|
||||
@@ -1,98 +0,0 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
import faiss
|
||||
import numpy as np
|
||||
import torch
|
||||
from peft import PeftModel
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
SYSTEM_PERSONA = """You are simulating a culturally interested Bali traveler segment for evaluation purposes.
|
||||
|
||||
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.
|
||||
|
||||
When answering:
|
||||
- Prioritize cultural interpretation, atmosphere, and visitor ethics.
|
||||
- Weigh trade-offs thoughtfully (e.g., crowds vs. significance).
|
||||
- Avoid generic travel advice and avoid promotional language.
|
||||
- Do not exaggerate.
|
||||
- Provide nuanced, reflective reasoning rather than bullet lists.
|
||||
- Keep answers concise but specific.
|
||||
|
||||
Respond as if you are describing your genuine experience and judgment as this type of traveler.
|
||||
|
||||
If, and only if, the provided CONTEXT helps you answer the question, you may use the contained information for your answer.
|
||||
"""
|
||||
|
||||
|
||||
def load_docstore(path):
|
||||
docs = []
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
docs.append(json.loads(line))
|
||||
return docs
|
||||
|
||||
|
||||
def retrieve(index, embedder, query, top_k=6):
|
||||
q = embedder.encode([query], normalize_embeddings=True).astype(np.float32)
|
||||
scores, ids = index.search(q, top_k)
|
||||
return ids[0].tolist(), scores[0].tolist()
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--base_model", default="mistralai/Mistral-7B-Instruct-v0.2")
|
||||
ap.add_argument("--lora_dir", default="out/mistral_balitwin_lora")
|
||||
ap.add_argument("--out_dir", default="out")
|
||||
ap.add_argument(
|
||||
"--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
|
||||
)
|
||||
ap.add_argument("--top_k", type=int, default=6)
|
||||
args = ap.parse_args()
|
||||
|
||||
index = faiss.read_index(os.path.join(args.out_dir, "faiss.index"))
|
||||
docstore = load_docstore(os.path.join(args.out_dir, "docstore.jsonl"))
|
||||
embedder = SentenceTransformer(args.embedding_model)
|
||||
|
||||
tok = AutoTokenizer.from_pretrained(args.base_model, use_fast=True)
|
||||
base = AutoModelForCausalLM.from_pretrained(
|
||||
args.base_model, device_map="auto", torch_dtype=torch.float16
|
||||
)
|
||||
model = PeftModel.from_pretrained(base, args.lora_dir)
|
||||
model.eval()
|
||||
|
||||
print("Type your question (Ctrl+C to exit).")
|
||||
while True:
|
||||
q = input("\nYou: ").strip()
|
||||
if not q:
|
||||
continue
|
||||
|
||||
ids, _ = retrieve(index, embedder, q, top_k=args.top_k)
|
||||
context_docs = [docstore[i]["text"] for i in ids]
|
||||
context_blob = "\n\n".join(
|
||||
[f"[DOC {i}] {t}" for i, t in enumerate(context_docs)]
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": SYSTEM_PERSONA},
|
||||
{"role": "user", "content": f"QUESTION: {q}\n\nCONTEXT:\n{context_blob}"},
|
||||
]
|
||||
inp = tok.apply_chat_template(messages, return_tensors="pt").to(model.device)
|
||||
|
||||
out = model.generate(
|
||||
inp,
|
||||
max_new_tokens=320,
|
||||
do_sample=True,
|
||||
temperature=0.7,
|
||||
top_p=0.9,
|
||||
eos_token_id=tok.eos_token_id,
|
||||
)
|
||||
ans = tok.decode(out[0][inp.shape[1] :], skip_special_tokens=True).strip()
|
||||
print(f"\nBaliTwin: {ans}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -2,11 +2,11 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
RAFT dataset builder (FAISS-based retrieval) -> Together.ai chat JSONL.
|
||||
RAFT dataset builder with FAISS-based retrieval.
|
||||
|
||||
Inputs (from your indexing script):
|
||||
- <index_dir>/faiss.index
|
||||
- <index_dir>/docstore.jsonl
|
||||
Inputs:
|
||||
- faiss.index
|
||||
- docstore.jsonl
|
||||
|
||||
Process:
|
||||
- Build a set of interview-style prompts (EN)
|
||||
@@ -20,9 +20,7 @@ Outputs:
|
||||
- raft_val.jsonl (optional)
|
||||
|
||||
ENV:
|
||||
- DEEPSEEK_API_KEY (required)
|
||||
- optional: DEEPSEEK_BASE_URL (default: https://api.deepseek.com)
|
||||
- optional: DEEPSEEK_MODEL (default: deepseek-chat)
|
||||
- DEEPSEEK_API_KEY
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -32,7 +30,7 @@ import random
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import faiss
|
||||
import numpy as np
|
||||
@@ -41,9 +39,6 @@ from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# DeepSeek client (OpenAI-compatible)
|
||||
# -----------------------------
|
||||
@dataclass
|
||||
class DeepSeekConfig:
|
||||
api_key: str
|
||||
@@ -89,9 +84,7 @@ class DeepSeekClient:
|
||||
last_err = e
|
||||
time.sleep(self.cfg.backoff_s ** (attempt + 1))
|
||||
|
||||
raise RuntimeError(
|
||||
f"DeepSeek API call failed after retries. Last error: {last_err}"
|
||||
)
|
||||
raise RuntimeError(f"DeepSeek API call failed. Last error: {last_err}")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
@@ -119,15 +112,13 @@ def read_docstore(docstore_path: str) -> Dict[int, Dict]:
|
||||
fid = int(obj["faiss_id"])
|
||||
mapping[fid] = obj
|
||||
if not mapping:
|
||||
raise ValueError("docstore.jsonl is empty or unreadable.")
|
||||
raise ValueError("docstore.jsonl is broken.")
|
||||
return mapping
|
||||
|
||||
|
||||
def load_prompts_from_jsonl(path: str) -> List[str]:
|
||||
"""
|
||||
Loads prompts from a JSONL file.
|
||||
Expected key: 'prompt' (preferred). Also accepts 'question' or 'text'.
|
||||
Ignores empty/short lines.
|
||||
"""
|
||||
prompts: List[str] = []
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
@@ -141,13 +132,13 @@ def load_prompts_from_jsonl(path: str) -> List[str]:
|
||||
if len(p) >= 20:
|
||||
prompts.append(p)
|
||||
if not prompts:
|
||||
raise ValueError(f"No prompts found in JSONL: {path}")
|
||||
raise ValueError(f"No prompts in JSONL: {path}")
|
||||
return prompts
|
||||
|
||||
|
||||
def load_prompts_from_txt(path: str) -> List[str]:
|
||||
"""
|
||||
Loads prompts from a TXT file (one prompt per line).
|
||||
Loads prompts from a TXT file (each line is a prompt).
|
||||
"""
|
||||
prompts: List[str] = []
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
@@ -156,7 +147,7 @@ def load_prompts_from_txt(path: str) -> List[str]:
|
||||
if len(p) >= 20:
|
||||
prompts.append(p)
|
||||
if not prompts:
|
||||
raise ValueError(f"No prompts found in TXT: {path}")
|
||||
raise ValueError(f"No prompts in TXT: {path}")
|
||||
return prompts
|
||||
|
||||
|
||||
@@ -173,9 +164,6 @@ def write_jsonl(path: str, rows: List[Dict]) -> None:
|
||||
f.write(json.dumps(r, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Persona + prompt templates (EN)
|
||||
# -----------------------------
|
||||
IMAGE_DIMS = [
|
||||
"Natural Attractions",
|
||||
"Atmosphere",
|
||||
@@ -184,36 +172,12 @@ IMAGE_DIMS = [
|
||||
"Value for Money",
|
||||
]
|
||||
|
||||
DEFAULT_PROMPTS_EN = [
|
||||
# Natural Attractions
|
||||
"In a lead user interview: what natural places in Bali felt genuinely memorable to you (rice terraces, volcanoes, waterfalls, coast), and why? Describe it like a lived experience.",
|
||||
"Which nature spots felt overly crowded or overly 'Instagram-optimized' in real life, and which surprised you in a good way? Explain with concrete moments.",
|
||||
# Atmosphere
|
||||
"How would you describe the atmosphere around cultural sites in Bali (temples, ceremonies, markets)? What signals authenticity vs. commercialization to you?",
|
||||
"What changes the atmosphere the most (time of day, weather, crowds, etiquette)? Share specific examples you would tell a marketer.",
|
||||
# Social Environment
|
||||
"How do you experience the social environment in Bali (locals, guides, other travelers)? What feels respectful and what feels performative or touristy?",
|
||||
"What small behaviors, phrases, and gestures make interactions smoother for a culture-oriented traveler? Give examples.",
|
||||
# Infrastructure
|
||||
"Evaluate Bali's infrastructure for culture-oriented days (transport, signage, toilets, ticketing, digital info). What works, what annoys you, and how do you adapt?",
|
||||
"If you designed an ideal culture-friendly day route, what infrastructure assumptions would you tell a tourism marketer to plan for?",
|
||||
# Value for Money
|
||||
"When does Bali feel good value for money for you, and when not? Discuss entrance fees, guides, food, tours, and hidden costs.",
|
||||
"How do you personally distinguish 'good value' from a tourist trap? List criteria and illustrate with examples.",
|
||||
]
|
||||
|
||||
|
||||
def build_system_prompt() -> str:
|
||||
return (
|
||||
"ROLE / PERSONA\n"
|
||||
"You are an experienced, culture-oriented Bali traveler (Lead User). You speak in natural, vivid English, "
|
||||
"as a real person in an interview. You share nuanced judgments, trade-offs, and concrete scenes.\n\n"
|
||||
"COGNITIVE DESTINATION IMAGE DIMENSIONS (use when relevant)\n"
|
||||
"- Natural Attractions\n"
|
||||
"- Atmosphere\n"
|
||||
"- Social Environment\n"
|
||||
"- Infrastructure\n"
|
||||
"- Value for Money\n\n"
|
||||
"CRITICAL CONSTRAINTS\n"
|
||||
"- You will be given retrieved review snippets as memory support.\n"
|
||||
"- Do NOT quote them verbatim and do NOT cite them as 'the review says'.\n"
|
||||
@@ -382,7 +346,8 @@ def main():
|
||||
elif args.prompts_txt:
|
||||
prompts = load_prompts_from_txt(args.prompts_txt)
|
||||
else:
|
||||
prompts = list(DEFAULT_PROMPTS_EN)
|
||||
print("Provide a prompt source with --prompts_jsonl or --prompts_txt.")
|
||||
exit(1)
|
||||
|
||||
if args.shuffle_prompts:
|
||||
random.shuffle(prompts)
|
||||
|
||||
@@ -9,7 +9,18 @@ import torch
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
||||
|
||||
SYSTEM_PERSONA = """You are a culturally interested Bali traveler lead user.
|
||||
# """
|
||||
# You are a culturally interested Bali traveler in a lead user interview with a marketer.
|
||||
|
||||
# When answering:
|
||||
# - Do not exaggerate.
|
||||
# - Provide nuanced, reflective reasoning rather than bullet lists.
|
||||
# - Keep answers concise but specific.
|
||||
|
||||
# Respond as if you are describing your genuine experience and judgment as this type of traveler.
|
||||
# """
|
||||
|
||||
SYSTEM_PERSONA = """You are a culturally interested Bali traveler in a lead user interview with a marketer.
|
||||
|
||||
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.
|
||||
|
||||
@@ -56,7 +67,7 @@ def main():
|
||||
"--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
|
||||
)
|
||||
ap.add_argument("--top_k", type=int, default=12)
|
||||
ap.add_argument("--max_new_tokens", type=int, default=320)
|
||||
ap.add_argument("--max_new_tokens", type=int, default=1000)
|
||||
ap.add_argument("--no_model", action=argparse.BooleanOptionalAction)
|
||||
args = ap.parse_args()
|
||||
|
||||
@@ -101,9 +112,9 @@ def main():
|
||||
context_docs = [docstore[i]["text"] for i in ids]
|
||||
context_blob = "\n\n".join([t for _, t in enumerate(context_docs)])
|
||||
|
||||
print("\nRetrieved Context:")
|
||||
print("\nRetrieved Context:\n")
|
||||
for i, (doc, score) in enumerate(zip(context_docs, scores)):
|
||||
print(f"\nDoc {i+1} (score: {score:.4f}):\n{doc}")
|
||||
print(f"Doc {i+1} (score: {score:.4f}):\n{doc}\n\n")
|
||||
|
||||
messages = [
|
||||
# {"role": "system", "content": SYSTEM_PERSONA},
|
||||
|
||||
Reference in New Issue
Block a user