RAFT shenanigans

This commit is contained in:
2026-02-21 23:47:12 +01:00
parent 49c622db08
commit 61edb35f70
14 changed files with 2943 additions and 6 deletions

View File

@@ -21,7 +21,7 @@ python make_raft_data.py --out_dir out --n_examples 10
## Training der QLoRA-Adapter
```bash
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
```
## Inferenz

View File

@@ -0,0 +1,560 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate 300-1000+ English interview questions targeted ONLY at culturally/spiritually
interested Bali tourists (Lead Users), covering 5 cognitive destination image dimensions:
- Natural Attractions
- Atmosphere
- Social Environment
- Infrastructure
- Value for Money
Key constraint:
- Every prompt must be meaningful for culture/spirituality-first travelers.
- Avoid party/shopping/hedonistic positioning.
- Include etiquette, authenticity, sacredness, commodification, meaning-making, reflection.
Outputs:
- JSONL: {"dimension": "...", "type": "...", "prompt": "...", "tags": [...]}
- or TXT: one prompt per line
"""
import argparse
import json
import random
import re
from typing import Dict, List, Tuple
# The five cognitive destination-image dimensions every generated prompt targets.
DIMENSIONS = [
    "Natural Attractions",
    "Atmosphere",
    "Social Environment",
    "Infrastructure",
    "Value for Money",
]
# -----------------------------
# Segment-specific building blocks
# -----------------------------
# Keep places generic (no need to hallucinate specific proper nouns)
# Nature place hints used when the dimension is "Natural Attractions".
NATURE_FOR_MEANING = [
    "rice terraces that feel lived-in rather than staged",
    "waterfalls approached with a quiet, respectful mood",
    "volcano viewpoints that invite reflection at dawn",
    "jungle walks where you notice offerings and small shrines",
    "lake areas that feel calm and contemplative",
    "coastal paths that feel like a moving meditation",
    "hot springs experienced as restoration rather than spectacle",
]
# Cultural/spiritual place hints used for all other dimensions.
CULTURE_SPIRIT_SPACES = [
    "temple courtyards and entry paths",
    "a village ceremony you observe respectfully",
    "a traditional market where everyday ritual shows up in small ways",
    "a dance performance where you try to read symbolism",
    "a craft workshop focused on meaning and lineage, not souvenirs",
    "a community space where offerings are prepared",
    "a quiet heritage walk where stories feel layered",
]
# Topics drawn by the "etiquette" question archetype.
RITUAL_ETIQUETTE_TOPICS = [
    "dress codes and modesty",
    "offerings and what not to touch",
    "photography boundaries",
    "when to speak vs stay quiet",
    "how to move through a temple without intruding",
    "how to ask questions without turning sacred life into content",
]
# Personal-meaning outcomes drawn by the "laddering" archetype.
MEANING_MAKING = [
    "a sense of humility",
    "a feeling of gratitude",
    "a moment of awe",
    "a feeling of being a guest",
    "a sense of calm",
    "a quiet emotional reset",
    "a shift in how you see daily life",
    "a stronger respect for local rhythms",
]
# Authenticity cues; mixed with CROWDING_COMMODIFICATION for "contrast" prompts.
AUTHENTICITY_CUES = [
    "how people behave when no one is watching",
    "whether the experience is integrated into local life",
    "how money is handled (transparent vs extractive)",
    "whether rules feel protective or performative",
    "whether the pace allows reflection or pushes consumption",
]
# Over-tourism / commodification cues (also used in "contrast" prompts).
CROWDING_COMMODIFICATION = [
    "overt commercialization around sacred spaces",
    "crowds that change the emotional tone",
    "performative 'authenticity' for tourists",
    "feeling like sacredness is being packaged",
]
# Situational contexts woven into most prompt templates.
CONTEXTS = [
    "early morning before the crowds",
    "late afternoon when light softens and things slow down",
    "during a local ceremony where you are clearly a guest",
    "in rainy season when plans change and patience matters",
    "on a quiet weekday compared to a busy weekend",
    "with a local guide who emphasizes respect and context",
    "solo, when you can be more contemplative",
    "as a repeat visitor, noticing subtler layers",
]
# NOTE(review): TRAVELER_PROFILE is not referenced by any template in this
# file — confirm whether it is used elsewhere before removing.
TRAVELER_PROFILE = [
    "a culture-first traveler",
    "a spirituality-curious traveler",
    "a respectful observer who avoids intrusive tourism",
    "a slow traveler seeking depth over volume",
    "a repeat visitor looking for subtler, less packaged experiences",
]
# (category, phrasings) pairs; pick_constraint() draws one phrasing at random.
CONSTRAINTS = [
    (
        "time",
        [
            "you only have 6 hours but want depth, not a checklist",
            "you have one full day and want it to feel coherent and meaningful",
            "you have three days and want a gentle pace with time for reflection",
            "you can only travel within a short radius and must choose carefully",
        ],
    ),
    (
        "budget",
        [
            "you have a modest budget but still want cultural depth and fairness",
            "you'll pay more if it supports local communities transparently",
            "you want predictable costs and dislike hidden fees around sacred sites",
            "you prefer smaller, community-rooted experiences over pricey packages",
        ],
    ),
    (
        "crowds",
        [
            "you want to avoid crowds because they dilute atmosphere and respect",
            "you can handle crowds if etiquette and sacredness are preserved",
            "you want a balance: one iconic site, mostly quieter, community-rooted places",
            "you get overwhelmed by busy places and need calmer, respectful alternatives",
        ],
    ),
    (
        "weather",
        [
            "it's rainy season and flexibility is part of respectful travel",
            "it's very hot and you need a pace that still feels mindful",
            "visibility is low and your sunrise plan may fail—how do you adapt meaningfully?",
            "roads feel unsafe, so you prioritize fewer moves and deeper presence",
        ],
    ),
    (
        "mobility",
        [
            "you avoid steep stairs but still want meaningful cultural/spiritual moments",
            "you prefer not to ride a scooter and want low-friction transport options",
            "you want minimal walking but still want authenticity and atmosphere",
            "you need frequent rest and prefer fewer transitions",
        ],
    ),
    (
        "ethics",
        [
            "you want to avoid commodifying sacred life",
            "you prioritize local benefit, consent, and respectful boundaries",
            "you avoid experiences that pressure locals to perform for tourists",
            "you want your presence to feel like 'being a guest' not 'taking'",
        ],
    ),
]
# (x, y) tension pairs for the "tradeoff" archetype.
TRADEOFFS = [
    ("depth of understanding", "convenience"),
    ("sacredness", "accessibility"),
    ("quiet reflection", "seeing iconic places"),
    ("guided cultural context", "self-guided freedom"),
    ("photography", "presence and respect"),
    ("predictable pricing", "spontaneous discovery"),
    ("community benefit", "personal comfort"),
    ("slow pace", "variety of stops"),
]
# (a, b) comparison pairs for the "contrast" archetype.
CONTRASTS = [
    ("a popular temple area", "a quieter village setting"),
    ("a curated tour script", "a guide who shares context and encourages respect"),
    ("a crowded ceremony-adjacent spot", "a calm everyday ritual moment"),
    (
        "a market aisle focused on souvenirs",
        "a market moment that shows daily offerings and rhythm",
    ),
    ("a rushed checklist day", "a slower day with fewer places but deeper presence"),
    ("an 'Instagram moment'", "a moment of quiet meaning that you don't photograph"),
]
# Interview openers for single-dimension questions.
INTERVIEW_STYLES = [
    "Tell me about a time when…",
    "Walk me through…",
    "As a culturally/spiritually motivated traveler, how do you…",
    "If you had to advise a tourism marketer focused on respectful cultural travel…",
    "What surprised you about the spiritual or cultural texture of…",
    "What does 'authentic and respectful' look like to you when…",
    "How do you personally decide whether to join, observe, or step back when…",
]
# Probes appended to a fraction (add_followups_ratio) of accepted prompts.
FOLLOWUP_PROBES = [
    "What specifically made it feel respectful or not?",
    "What did you notice first, and what happened next?",
    "How did it change your mood or sense of meaning that day?",
    "What would have improved it without turning it into a spectacle?",
    "What boundary would you not cross again?",
    "What would you tell a marketer to never claim in messaging?",
]
# Per-dimension themes used to focus each question.
DIM_THEMES: Dict[str, List[str]] = {
    "Natural Attractions": [
        "sense of place and meaning",
        "quiet awe vs spectacle",
        "timing for contemplative experience",
        "routes that support reflection",
        "respectful behavior in nature",
        "access vs sacred calm",
    ],
    "Atmosphere": [
        "sacredness and emotional tone",
        "authenticity cues",
        "commercialization pressure",
        "silence, sound, and pace",
        "crowds and reverence",
        "ritual context shaping ambience",
    ],
    "Social Environment": [
        "being a guest and practicing humility",
        "consent and boundaries",
        "guide trust and cultural context",
        "respectful interaction with locals",
        "tourist behavior that disrupts",
        "learning without extracting",
    ],
    "Infrastructure": [
        "signage for etiquette",
        "visitor flow that protects sacred spaces",
        "frictionless but respectful access",
        "toilets/rest areas without degrading atmosphere",
        "transparent ticketing/donations",
        "accessibility with dignity",
    ],
    "Value for Money": [
        "fairness and transparency",
        "donations vs fees",
        "paying for guides as cultural mediation",
        "avoiding extractive 'spiritual packages'",
        "community benefit",
        "what feels worth paying for (context, respect, time)",
    ],
}
# -----------------------------
# Templates
# -----------------------------
def tmpl_single_dimension(
    d: str, theme: str, style: str, place_hint: str, context: str
) -> str:
    """Build a single-dimension question anchored to one place and one context."""
    opener = f"{style} your experience with {place_hint} in Bali during {context}. "
    focus = (
        f"From a {d} perspective, what stands out about {theme}"
        f"—and why does it matter to you as a culture/spirit-oriented traveler?"
    )
    return opener + focus
def tmpl_laddering(d: str, theme: str, context: str, meaning: str) -> str:
    """Laddering question: from one concrete moment to its personal meaning."""
    sentences = [
        f"Think about a specific moment in Bali during {context} that left you with {meaning}. ",
        "What happened, how did you interpret it, and why did it feel meaningful? ",
        f"Frame your answer through {d} (focus on {theme}).",
    ]
    return "".join(sentences)
def tmpl_contrast(d: str, a: str, b: str, context: str, cue: str) -> str:
    """Contrast question: compare two settings through one dimension and one cue."""
    comparison = f"Compare {a} versus {b} in Bali during {context}. "
    lens = (
        f"In terms of {d}, how do they differ for you as a respectful, "
        f"culture/spirit-first traveler? "
    )
    anchor = f"Use {cue} as a cue in your explanation."
    return comparison + lens + anchor
def tmpl_tradeoff(d1: str, d2: str, x: str, y: str, constraint: str) -> str:
    """Trade-off question: weigh x against y under a constraint, across two dimensions."""
    parts = (
        f"Under this constraint: {constraint}. ",
        f"How do you trade off {x} versus {y} when choosing cultural/spiritual experiences in Bali? ",
        f"Answer with examples touching {d1} and {d2}.",
    )
    return "".join(parts)
def tmpl_marketer_advice(d: str, theme: str, constraint: str, dont_claim: str) -> str:
    """Marketer-advice question with an explicit 'do not claim' anchor."""
    ask = (
        f"If you had to advise a tourism marketer for culturally/spiritually interested travelers: "
        f"under the constraint '{constraint}', "
    )
    understand = f"what should they understand about {d} (especially {theme})? "
    warning = (
        f"Also: what is one thing they should NOT claim in messaging because it "
        f"would feel misleading or disrespectful—e.g., {dont_claim}?"
    )
    return ask + understand + warning
def tmpl_etiquette_scenario(d: str, topic: str, context: str) -> str:
    """Etiquette scenario question tied to one topic, context, and dimension."""
    scene = f"Walk me through an etiquette situation related to {topic} in Bali during {context}. "
    probe = (
        "What did you do, what did you avoid, and what would you want a marketer "
        "to communicate to travelers upfront? "
    )
    return scene + probe + f"Connect it to {d}."
def tmpl_route_design(
    d: str, nature_hint: str, culture_hint: str, constraint: str
) -> str:
    """Route-design question combining one nature and one culture hint under a constraint."""
    brief = (
        f"Design a mini day-route that combines {nature_hint} and {culture_hint} "
        f"under this constraint: {constraint}. "
    )
    challenge = (
        "How would you protect atmosphere and respect while still making it "
        f"accessible to culture/spirit-first travelers? Link your reasoning to {d}."
    )
    return brief + challenge
def tmpl_probe_followup(base_q: str, probe: str) -> str:
    """Append a follow-up probe to an already-built question."""
    return " ".join((base_q, probe))
def pick_constraint(rng: random.Random) -> Tuple[str, str]:
    """Draw a random constraint category and one phrasing from it."""
    category, phrasings = rng.choice(CONSTRAINTS)
    phrasing = rng.choice(phrasings)
    return category, phrasing
def pick_place_hint_for_dim(d: str, rng: random.Random) -> str:
    """Pick a place hint: nature pool for Natural Attractions, culture pool otherwise."""
    pool = NATURE_FOR_MEANING if d == "Natural Attractions" else CULTURE_SPIRIT_SPACES
    return rng.choice(pool)
# -----------------------------
# Generation
# -----------------------------
def generate_prompts(
    n: int,
    seed: int = 42,
    add_followups_ratio: float = 0.35,
    ensure_balance: bool = True,
) -> List[Dict]:
    """Generate up to n unique, segment-targeted interview prompt records.

    Args:
        n: Number of prompts requested.
        seed: Seed for the local RNG (output is reproducible for a given seed).
        add_followups_ratio: Probability of emitting an extra probe variant
            immediately after an accepted prompt.
        ensure_balance: If True, pre-assign dimensions round-robin so coverage
            across DIMENSIONS is roughly even.

    Returns:
        List of dicts with keys "dimension", "type", "prompt", "tags".
    """
    rng = random.Random(seed)
    # Mix of question archetypes, all segment-targeted
    # (name, sampling weight) pairs; weights sum to 1.0.
    types = [
        ("single", 0.24),
        ("laddering", 0.18),
        ("contrast", 0.16),
        ("tradeoff", 0.18),
        ("marketer", 0.12),
        ("etiquette", 0.08),
        ("route", 0.04),
    ]
    type_names = [t for t, _ in types]
    type_weights = [w for _, w in types]
    prompts: List[Dict] = []
    # Normalized prompt texts already emitted, for de-duplication.
    seen = set()
    # Balanced dimension coverage
    dim_cycle = []
    if ensure_balance:
        per_dim = max(1, n // len(DIMENSIONS))
        for d in DIMENSIONS:
            dim_cycle.extend([d] * per_dim)
        # Pad with random dimensions when n is not divisible by len(DIMENSIONS).
        while len(dim_cycle) < n:
            dim_cycle.append(rng.choice(DIMENSIONS))
        rng.shuffle(dim_cycle)
    # A small set of "don't claim" examples to anchor respectful marketing constraints
    DONT_CLAIM = [
        "guaranteed 'authentic spirituality' on demand",
        "a ceremony 'for tourists' as the main attraction",
        "access to sacred spaces without emphasizing etiquette and consent",
        "a 'hidden local ritual' framed as a product",
        "permission to photograph everything",
    ]

    def add_prompt(obj: Dict) -> bool:
        """Accept obj if it is new (after whitespace/case normalization) and
        contains at least one segment anchor term; return True on acceptance."""
        key = re.sub(r"\s+", " ", obj["prompt"].strip().lower())
        if key in seen:
            return False
        # hard filter: must include at least one segment anchor term
        anchors = [
            "respect",
            "sacred",
            "etiquette",
            "meaning",
            "authentic",
            "ceremony",
            "guest",
            "context",
            "spirit",
        ]
        if not any(a in key for a in anchors):
            return False
        seen.add(key)
        prompts.append(obj)
        return True

    # Rejection sampling: draw candidates until n are accepted or the attempt
    # budget (25 draws per requested prompt) is exhausted.
    max_attempts = n * 25
    attempts = 0
    while len(prompts) < n and attempts < max_attempts:
        attempts += 1
        d = (
            dim_cycle[len(prompts)]
            if ensure_balance and len(dim_cycle) > len(prompts)
            else rng.choice(DIMENSIONS)
        )
        theme = rng.choice(DIM_THEMES[d])
        style = rng.choice(INTERVIEW_STYLES)
        context = rng.choice(CONTEXTS)
        place_hint = pick_place_hint_for_dim(d, rng)
        c_key, c_val = pick_constraint(rng)
        t = rng.choices(type_names, weights=type_weights, k=1)[0]
        if t == "single":
            q = tmpl_single_dimension(d, theme, style, place_hint, context)
            obj = {
                "dimension": d,
                "type": "single",
                "prompt": q,
                "tags": [d, theme, context, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "laddering":
            meaning = rng.choice(MEANING_MAKING)
            q = tmpl_laddering(d, theme, context, meaning)
            obj = {
                "dimension": d,
                "type": "laddering",
                "prompt": q,
                "tags": [d, theme, context, "laddering", "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "contrast":
            a, b = rng.choice(CONTRASTS)
            cue = rng.choice(AUTHENTICITY_CUES + CROWDING_COMMODIFICATION)
            q = tmpl_contrast(d, a, b, context, cue)
            obj = {
                "dimension": d,
                "type": "contrast",
                "prompt": q,
                "tags": [d, "contrast", context, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "tradeoff":
            # Trade-offs span two distinct dimensions.
            d2 = rng.choice([x for x in DIMENSIONS if x != d])
            x, y = rng.choice(TRADEOFFS)
            q = tmpl_tradeoff(d, d2, x, y, c_val)
            obj = {
                "dimension": f"{d} + {d2}",
                "type": "tradeoff",
                "prompt": q,
                "tags": [d, d2, "tradeoff", c_key, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "marketer":
            dont_claim = rng.choice(DONT_CLAIM)
            q = tmpl_marketer_advice(d, theme, c_val, dont_claim)
            obj = {
                "dimension": d,
                "type": "marketer_advice",
                "prompt": q,
                "tags": [d, theme, "marketer", c_key, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "etiquette":
            topic = rng.choice(RITUAL_ETIQUETTE_TOPICS)
            q = tmpl_etiquette_scenario(d, topic, context)
            obj = {
                "dimension": d,
                "type": "etiquette",
                "prompt": q,
                "tags": [d, "etiquette", topic, context, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "route":
            nature_hint = rng.choice(NATURE_FOR_MEANING)
            culture_hint = rng.choice(CULTURE_SPIRIT_SPACES)
            q = tmpl_route_design(d, nature_hint, culture_hint, c_val)
            obj = {
                "dimension": d,
                "type": "route_design",
                "prompt": q,
                "tags": [d, "route", c_key, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        else:
            # Defensive default; unreachable while `types` covers all names.
            ok = False
        # follow-up probe variant
        if ok and rng.random() < add_followups_ratio and len(prompts) < n:
            probe = rng.choice(FOLLOWUP_PROBES)
            q2 = tmpl_probe_followup(prompts[-1]["prompt"], probe)
            obj2 = {
                "dimension": prompts[-1]["dimension"],
                "type": prompts[-1]["type"] + "+probe",
                "prompt": q2,
                "tags": prompts[-1]["tags"] + ["probe"],
            }
            add_prompt(obj2)
    if len(prompts) < n:
        print(f"Warning: only generated {len(prompts)} unique prompts (requested {n}).")
    return prompts[:n]
def main():
    """CLI entry point: generate prompts and save them as JSONL or TXT.

    JSONL output writes one record per line (dimension/type/prompt/tags);
    TXT output writes the bare prompt text, one per line.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--n",
        type=int,
        default=600,
        # Fixed garbled help text: "3001000" was a mangled "300-1000" range.
        help="Number of prompts to generate (300-1000 recommended).",
    )
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--out", default="culture_spirit_interview_prompts.jsonl")
    ap.add_argument("--format", choices=["jsonl", "txt"], default="jsonl")
    ap.add_argument(
        "--no_balance",
        action="store_true",
        help="Disable balanced coverage across dimensions.",
    )
    ap.add_argument("--followups_ratio", type=float, default=0.35)
    args = ap.parse_args()
    prompts = generate_prompts(
        n=args.n,
        seed=args.seed,
        add_followups_ratio=args.followups_ratio,
        ensure_balance=not args.no_balance,
    )
    if args.format == "jsonl":
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        with open(args.out, "w", encoding="utf-8") as f:
            for p in prompts:
                f.write(json.dumps(p, ensure_ascii=False) + "\n")
    else:
        with open(args.out, "w", encoding="utf-8") as f:
            for p in prompts:
                f.write(p["prompt"].strip() + "\n")
    print(f"Saved {len(prompts)} prompts to: {args.out} ({args.format})")


if __name__ == "__main__":
    main()

View File

@@ -31,6 +31,8 @@ When answering:
Maintain consistency with this identity across all responses.
"""
TRAINER_PROMPT = "Create ONE realistic question from the perspective of a touristic marketer they might ask a culturally and spiritually interested traveler in Bali considered to be a lead user that can be answered using ONLY the CONTEXT.\n\n"
def load_docstore(path):
docs = []
@@ -118,8 +120,7 @@ def main():
{"role": "system", "content": SYSTEM_PERSONA},
{
"role": "user",
"content": "Create ONE realistic question from the perspective of a culturally and spiritually interested traveler in Bali that can be answered using ONLY the CONTEXT.\n\n"
f"CONTEXT:\n{gold_text}\n\n"
"content": TRAINER_PROMPT + f"CONTEXT:\n{gold_text}\n\n"
"Return only the question.",
},
]
@@ -173,8 +174,7 @@ def main():
{"role": "system", "content": SYSTEM_PERSONA},
{
"role": "user",
"content": f"QUESTION: {question}\n\nCONTEXT:\n{context_blob}\n"
"Please answer as a culturally versed Bali traveler and include 1-2 short direct quotes from CONTEXT.",
"content": f"QUESTION: {question}\n\nCONTEXT:\n{context_blob}",
},
{"role": "assistant", "content": answer},
]

View File

@@ -0,0 +1,456 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
RAFT dataset builder (FAISS-based retrieval) -> Together.ai chat JSONL.
Inputs (from your indexing script):
- <index_dir>/faiss.index
- <index_dir>/docstore.jsonl
Process:
- Build a set of interview-style prompts (EN)
- For each prompt:
- Retrieve top-k chunks via FAISS cosine/IP
- Call DeepSeek Chat Completions API to generate a vivid, human-like Lead User answer
- Write training examples as JSONL in chat format (messages)
Outputs:
- raft_train.jsonl
- raft_val.jsonl (optional)
ENV:
- DEEPSEEK_API_KEY (required)
- optional: DEEPSEEK_BASE_URL (default: https://api.deepseek.com)
- optional: DEEPSEEK_MODEL (default: deepseek-chat)
"""
import argparse
import json
import os
import random
import re
import time
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import faiss
import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# -----------------------------
# DeepSeek client (OpenAI-compatible)
# -----------------------------
@dataclass
class DeepSeekConfig:
    """Connection and retry settings for the DeepSeek chat completions API."""

    api_key: str  # required; the script reads it from DEEPSEEK_API_KEY
    base_url: str = "https://api.deepseek.com"  # OpenAI-compatible endpoint root
    model: str = "deepseek-chat"
    timeout_s: int = 120  # per-request timeout in seconds
    max_retries: int = 5  # attempts before chat() raises RuntimeError
    backoff_s: float = 1.6  # exponential backoff base: sleep backoff_s ** attempt
class DeepSeekClient:
    """Minimal client for DeepSeek's OpenAI-compatible chat completions API."""

    def __init__(self, cfg: DeepSeekConfig):
        self.cfg = cfg

    def chat(
        self, messages: List[Dict], temperature: float = 0.85, max_tokens: int = 750
    ) -> str:
        """POST one chat completion request and return the reply text.

        Retries up to cfg.max_retries times with exponential backoff
        (cfg.backoff_s ** (attempt + 1)) on HTTP 429 and on transport/HTTP errors.

        Args:
            messages: Chat messages in OpenAI format ({"role", "content"} dicts).
            temperature: Sampling temperature forwarded to the API.
            max_tokens: Completion token cap forwarded to the API.

        Raises:
            RuntimeError: if every attempt fails; includes the last error seen.
        """
        url = f"{self.cfg.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.cfg.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.cfg.model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        last_err = None
        for attempt in range(self.cfg.max_retries):
            try:
                r = requests.post(
                    url, headers=headers, json=payload, timeout=self.cfg.timeout_s
                )
                if r.status_code == 429:
                    # Record the throttle so the final error message is
                    # informative even when every attempt was rate-limited
                    # (previously last_err stayed None in that case).
                    last_err = RuntimeError("HTTP 429: rate limited")
                    time.sleep(self.cfg.backoff_s ** (attempt + 1))
                    continue
                r.raise_for_status()
                data = r.json()
                return data["choices"][0]["message"]["content"].strip()
            except Exception as e:
                last_err = e
                time.sleep(self.cfg.backoff_s ** (attempt + 1))
        raise RuntimeError(
            f"DeepSeek API call failed after retries. Last error: {last_err}"
        )
# -----------------------------
# Helpers
# -----------------------------
def simple_clean(text: str) -> str:
    """Normalize text: NBSP -> space, collapse whitespace runs, strip ends.

    Non-string input yields the empty string.
    """
    if not isinstance(text, str):
        return ""
    without_nbsp = text.replace("\u00a0", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()
def read_docstore(docstore_path: str) -> Dict[int, Dict]:
    """
    Returns dict: faiss_id -> {"doc_id": int, "text": str, ...}
    """
    mapping: Dict[int, Dict] = {}
    with open(docstore_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            record = json.loads(raw)
            mapping[int(record["faiss_id"])] = record
    if not mapping:
        raise ValueError("docstore.jsonl is empty or unreadable.")
    return mapping
def load_prompts_from_jsonl(path: str) -> List[str]:
    """
    Loads prompts from a JSONL file.
    Expected key: 'prompt' (preferred). Also accepts 'question' or 'text'.
    Ignores empty/short lines.
    """
    collected: List[str] = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            record = json.loads(raw)
            candidate = record.get("prompt") or record.get("question") or record.get("text")
            cleaned = simple_clean(candidate) if candidate else ""
            if len(cleaned) >= 20:
                collected.append(cleaned)
    if not collected:
        raise ValueError(f"No prompts found in JSONL: {path}")
    return collected
def load_prompts_from_txt(path: str) -> List[str]:
    """
    Loads prompts from a TXT file (one prompt per line).
    """
    collected: List[str] = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            cleaned = simple_clean(raw)
            if len(cleaned) >= 20:
                collected.append(cleaned)
    if not collected:
        raise ValueError(f"No prompts found in TXT: {path}")
    return collected
def ensure_dir_for_file(path: str):
    """Create the parent directory of *path* if it has one."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def write_jsonl(path: str, rows: List[Dict]) -> None:
    """Write rows as JSON Lines (UTF-8, non-ASCII preserved), creating parent dirs."""
    ensure_dir_for_file(path)
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
# -----------------------------
# Persona + prompt templates (EN)
# -----------------------------
# The five cognitive destination-image dimensions; recorded in each example's
# "meta" block. NOTE(review): duplicated as DIMENSIONS in the prompt-generator
# script — keep the two lists in sync.
IMAGE_DIMS = [
    "Natural Attractions",
    "Atmosphere",
    "Social Environment",
    "Infrastructure",
    "Value for Money",
]
# Fallback interview prompts (two per dimension), used when neither
# --prompts_jsonl nor --prompts_txt is supplied.
DEFAULT_PROMPTS_EN = [
    # Natural Attractions
    "In a lead user interview: what natural places in Bali felt genuinely memorable to you (rice terraces, volcanoes, waterfalls, coast), and why? Describe it like a lived experience.",
    "Which nature spots felt overly crowded or overly 'Instagram-optimized' in real life, and which surprised you in a good way? Explain with concrete moments.",
    # Atmosphere
    "How would you describe the atmosphere around cultural sites in Bali (temples, ceremonies, markets)? What signals authenticity vs. commercialization to you?",
    "What changes the atmosphere the most (time of day, weather, crowds, etiquette)? Share specific examples you would tell a marketer.",
    # Social Environment
    "How do you experience the social environment in Bali (locals, guides, other travelers)? What feels respectful and what feels performative or touristy?",
    "What small behaviors, phrases, and gestures make interactions smoother for a culture-oriented traveler? Give examples.",
    # Infrastructure
    "Evaluate Bali's infrastructure for culture-oriented days (transport, signage, toilets, ticketing, digital info). What works, what annoys you, and how do you adapt?",
    "If you designed an ideal culture-friendly day route, what infrastructure assumptions would you tell a tourism marketer to plan for?",
    # Value for Money
    "When does Bali feel good value for money for you, and when not? Discuss entrance fees, guides, food, tours, and hidden costs.",
    "How do you personally distinguish 'good value' from a tourist trap? List criteria and illustrate with examples.",
]
def build_system_prompt() -> str:
    """Return the fixed English system prompt defining the Lead-User persona."""
    persona_block = (
        "ROLE / PERSONA\n"
        "You are an experienced, culture-oriented Bali traveler (Lead User). You speak in natural, vivid English, "
        "as a real person in an interview. You share nuanced judgments, trade-offs, and concrete scenes.\n\n"
    )
    dimensions_block = (
        "COGNITIVE DESTINATION IMAGE DIMENSIONS (use when relevant)\n"
        "- Natural Attractions\n"
        "- Atmosphere\n"
        "- Social Environment\n"
        "- Infrastructure\n"
        "- Value for Money\n\n"
    )
    constraints_block = (
        "CRITICAL CONSTRAINTS\n"
        "- You will be given retrieved review snippets as memory support.\n"
        "- Do NOT quote them verbatim and do NOT cite them as 'the review says'.\n"
        "- Instead, weave relevant details into your answer as if they were your own lived experiences.\n"
        "- If a detail is not supported, stay general or hedge appropriately.\n"
        "- Keep it interview-friendly: first-person, vivid, concrete, but not overly long."
    )
    return persona_block + dimensions_block + constraints_block
def build_user_message(question: str, retrieved_chunks: List[str]) -> str:
    """Format the user turn: the question plus retrieved snippets as bullet lines."""
    cleaned = [simple_clean(chunk) for chunk in retrieved_chunks if simple_clean(chunk)]
    bullets = "\n".join(f"- {chunk}" for chunk in cleaned)
    return (
        f"INTERVIEW QUESTION:\n{question}\n\n"
        "RETRIEVED CONTEXT (review snippets; do NOT quote, only use as memory support):\n"
        f"{bullets}\n\n"
        "Answer as a real Lead User in a tourism interview. Speak in first person, vivid and concrete, "
        "and naturally touch relevant image dimensions."
    )
# -----------------------------
# FAISS Retriever (cosine/IP)
# -----------------------------
class FaissRetriever:
    """Dense retriever over a prebuilt FAISS index plus its JSONL docstore."""

    def __init__(self, index_path: str, docstore_path: str, embed_model: str):
        """Load the FAISS index, the docstore mapping, and the sentence embedder.

        Raises:
            FileNotFoundError: if either artifact is missing on disk.
        """
        if not os.path.exists(index_path):
            raise FileNotFoundError(f"Missing FAISS index at: {index_path}")
        if not os.path.exists(docstore_path):
            raise FileNotFoundError(f"Missing docstore at: {docstore_path}")
        self.index = faiss.read_index(index_path)
        self.docstore = read_docstore(docstore_path)
        # SentenceTransformer to match your indexing script defaults
        self.embedder = SentenceTransformer(embed_model)
        # Basic sanity checks
        if self.index.ntotal != len(self.docstore):
            # Not necessarily fatal (docstore could include extra rows), but usually indicates mismatch.
            # We'll allow it but warn.
            print(
                f"Warning: index.ntotal={self.index.ntotal} but docstore rows={len(self.docstore)}. "
                "Ensure they were generated together."
            )

    def retrieve(self, query: str, k: int = 8) -> List[Tuple[int, float, str]]:
        """
        Returns list of (faiss_id, score, text)
        """
        # Query embedding is L2-normalized, so inner-product search behaves
        # like cosine similarity (assumes the index was built the same way).
        q = simple_clean(query)
        emb = self.embedder.encode([q], normalize_embeddings=True)
        emb = np.asarray(emb, dtype=np.float32)
        scores, ids = self.index.search(emb, k)
        ids = ids[0].tolist()
        scores = scores[0].tolist()
        out = []
        for fid, sc in zip(ids, scores):
            # FAISS pads results with -1 when fewer than k neighbors exist.
            if fid == -1:
                continue
            doc = self.docstore.get(int(fid))
            if not doc:
                continue
            out.append((int(fid), float(sc), doc.get("text", "")))
        return out
# -----------------------------
# Dataset generation
# -----------------------------
def main():
    """CLI entry: for each prompt, retrieve top-k chunks via FAISS, ask DeepSeek
    for a persona answer, and write chat-format training examples as JSONL."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--index_dir",
        default="out",
        help="Directory containing faiss.index and docstore.jsonl",
    )
    ap.add_argument("--out_train", default="./out/raft_train.jsonl")
    ap.add_argument("--out_val", default="./out/raft_val.jsonl")
    ap.add_argument("--make_val", action="store_true")
    ap.add_argument("--val_ratio", type=float, default=0.05)
    ap.add_argument("--k", type=int, default=8)
    ap.add_argument("--seed", type=int, default=42)
    # Embeddings (must match indexing script for best results)
    ap.add_argument(
        "--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
    )
    # External prompt sources
    ap.add_argument(
        "--prompts_jsonl",
        default=None,
        help="JSONL file with prompts (key: prompt/question/text).",
    )
    ap.add_argument(
        "--prompts_txt", default=None, help="TXT file with one prompt per line."
    )
    ap.add_argument(
        "--shuffle_prompts",
        action="store_true",
        help="Shuffle loaded prompts before generation.",
    )
    ap.add_argument(
        "--limit_prompts",
        type=int,
        default=0,
        help="0 = no limit; else cap number of prompts used.",
    )
    # DeepSeek generation config
    ap.add_argument(
        "--deepseek_base_url",
        default=os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com"),
    )
    ap.add_argument(
        "--deepseek_model", default=os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
    )
    ap.add_argument("--temperature", type=float, default=0.85)
    ap.add_argument("--max_tokens", type=int, default=750)
    ap.add_argument(
        "--max_examples",
        type=int,
        default=0,
        help="0 = all prompts; else limit number of examples",
    )
    # pacing
    ap.add_argument("--sleep_s", type=float, default=0.2)
    args = ap.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    api_key = os.environ.get("DEEPSEEK_API_KEY", "").strip()
    if not api_key:
        raise SystemExit("Missing DEEPSEEK_API_KEY env var.")
    index_path = os.path.join(args.index_dir, "faiss.index")
    docstore_path = os.path.join(args.index_dir, "docstore.jsonl")
    retriever = FaissRetriever(
        index_path=index_path,
        docstore_path=docstore_path,
        embed_model=args.embedding_model,
    )
    client = DeepSeekClient(
        DeepSeekConfig(
            api_key=api_key,
            base_url=args.deepseek_base_url,
            model=args.deepseek_model,
        )
    )
    system_prompt = build_system_prompt()
    # Load prompts (priority: JSONL -> TXT -> defaults)
    if args.prompts_jsonl and args.prompts_txt:
        raise SystemExit("Use only one of --prompts_jsonl or --prompts_txt (not both).")
    if args.prompts_jsonl:
        prompts = load_prompts_from_jsonl(args.prompts_jsonl)
    elif args.prompts_txt:
        prompts = load_prompts_from_txt(args.prompts_txt)
    else:
        prompts = list(DEFAULT_PROMPTS_EN)
    if args.shuffle_prompts:
        random.shuffle(prompts)
    if args.limit_prompts and args.limit_prompts > 0:
        prompts = prompts[: args.limit_prompts]
    # Backwards-compat: args.max_examples can still cap prompts
    if args.max_examples and args.max_examples > 0:
        prompts = prompts[: args.max_examples]
    examples = []
    for q in tqdm(prompts, desc="Generating RAFT examples"):
        # Retrieve supporting chunks, then ask DeepSeek to answer in persona.
        hits = retriever.retrieve(q, k=args.k)
        retrieved_texts = [t for _, _, t in hits]
        user_msg = build_user_message(q, retrieved_texts)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_msg},
        ]
        answer = client.chat(
            messages=messages,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
        # Chat-format training example plus retrieval provenance in "meta".
        ex = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": answer},
            ],
            "meta": {
                "retrieval_k": args.k,
                "index_dir": os.path.abspath(args.index_dir),
                "embedding_model": args.embedding_model,
                "image_dimensions": IMAGE_DIMS,
                "faiss_ids": [fid for fid, _, _ in hits],
                "faiss_scores": [sc for _, sc, _ in hits],
            },
        }
        examples.append(ex)
        if args.max_examples and len(examples) >= args.max_examples:
            break
        # Polite pacing between API calls.
        time.sleep(max(0.0, args.sleep_s))
    random.shuffle(examples)
    # Optional train/val split; only applied when there is enough data to split.
    if args.make_val and len(examples) >= 20:
        val_n = max(1, int(len(examples) * args.val_ratio))
        val = examples[:val_n]
        train = examples[val_n:]
        write_jsonl(args.out_train, train)
        write_jsonl(args.out_val, val)
        print(f"Wrote train: {args.out_train} ({len(train)} examples)")
        print(f"Wrote val: {args.out_val} ({len(val)} examples)")
    else:
        write_jsonl(args.out_train, examples)
        print(f"Wrote: {args.out_train} ({len(examples)} examples)")
        if args.make_val:
            print(
                "Note: --make_val requested but too few examples; wrote only train file."
            )


if __name__ == "__main__":
    main()

60
raft/remove_meta.py Normal file
View File

@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""
Script to create a copy of raft_val.jsonl without the meta column
"""
import json
import sys
from pathlib import Path
def remove_meta_column(input_file, output_file):
    """
    Read a JSONL file and write a new one without the 'meta' key and 'system' role messages.
    Args:
        input_file: Path to input JSONL file
        output_file: Path to output JSONL file
    """
    input_path = Path(input_file)
    output_path = Path(output_file)
    if not input_path.exists():
        print(f"Error: Input file '{input_file}' not found")
        sys.exit(1)
    count = 0
    # Explicit UTF-8 so behavior does not depend on the platform locale
    # (the data is written as UTF-8 by the generator scripts).
    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            if line.strip():  # Skip empty lines
                obj = json.loads(line)
                obj.pop("meta", None)  # Remove meta key if it exists
                # Remove system role messages
                if "messages" in obj:
                    obj["messages"] = [
                        msg for msg in obj["messages"] if msg.get("role") != "system"
                    ]
                outfile.write(json.dumps(obj) + "\n")
                count += 1
    print(f"✓ Processed {count} records")
    print(f"✓ Output saved to: {output_path}")
if __name__ == "__main__":
    # Default input/output pairs (train + val).
    pairs = [
        (
            "../data/intermediate/raft_train.jsonl",
            "../data/intermediate/raft_train_no_meta.jsonl",
        ),
        (
            "../data/intermediate/raft_val.jsonl",
            "../data/intermediate/raft_val_no_meta.jsonl",
        ),
    ]
    # Command-line override: process a single explicit input/output pair.
    # (Bug fix: the parsed argv values were previously assigned but never used.)
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
        if len(sys.argv) > 2:
            output_file = sys.argv[2]
        else:
            output_file = input_file.replace(".jsonl", "_no_meta.jsonl")
        pairs = [(input_file, output_file)]
    for src, dst in pairs:
        remove_meta_column(src, dst)

View File

@@ -73,7 +73,6 @@ def main():
max_length=args.max_seq_len,
bf16=torch.cuda.is_available(),
fp16=not torch.cuda.is_available(),
assistant_only_loss=True, # only learn from assistant turns in messages
report_to=[],
)