RAFT shenanigans

This commit is contained in:
2026-02-21 23:47:12 +01:00
parent 49c622db08
commit 61edb35f70
14 changed files with 2943 additions and 6 deletions

View File

@@ -21,7 +21,7 @@ python make_raft_data.py --out_dir out --n_examples 10
## Training der QLoRA-Adapter
```bash
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
```
## Inferenz

View File

@@ -0,0 +1,560 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate 300-1000+ English interview questions targeted ONLY at culturally/spiritually
interested Bali tourists (Lead Users), covering 5 cognitive destination image dimensions:
- Natural Attractions
- Atmosphere
- Social Environment
- Infrastructure
- Value for Money
Key constraint:
- Every prompt must be meaningful for culture/spirituality-first travelers.
- Avoid party/shopping/hedonistic positioning.
- Include etiquette, authenticity, sacredness, commodification, meaning-making, reflection.
Outputs:
- JSONL: {"dimension": "...", "type": "...", "prompt": "...", "tags": [...]}
- or TXT: one prompt per line
"""
import argparse
import json
import random
import re
from typing import Dict, List, Tuple
# The five cognitive destination-image dimensions every generated prompt targets.
DIMENSIONS = [
    "Natural Attractions",
    "Atmosphere",
    "Social Environment",
    "Infrastructure",
    "Value for Money",
]
# -----------------------------
# Segment-specific building blocks
# -----------------------------
# Keep places generic (no need to hallucinate specific proper nouns)
# Nature place hints used when the dimension is "Natural Attractions".
NATURE_FOR_MEANING = [
    "rice terraces that feel lived-in rather than staged",
    "waterfalls approached with a quiet, respectful mood",
    "volcano viewpoints that invite reflection at dawn",
    "jungle walks where you notice offerings and small shrines",
    "lake areas that feel calm and contemplative",
    "coastal paths that feel like a moving meditation",
    "hot springs experienced as restoration rather than spectacle",
]
# Cultural/spiritual place hints used for all other dimensions.
CULTURE_SPIRIT_SPACES = [
    "temple courtyards and entry paths",
    "a village ceremony you observe respectfully",
    "a traditional market where everyday ritual shows up in small ways",
    "a dance performance where you try to read symbolism",
    "a craft workshop focused on meaning and lineage, not souvenirs",
    "a community space where offerings are prepared",
    "a quiet heritage walk where stories feel layered",
]
# Topics drawn by the "etiquette" question archetype.
RITUAL_ETIQUETTE_TOPICS = [
    "dress codes and modesty",
    "offerings and what not to touch",
    "photography boundaries",
    "when to speak vs stay quiet",
    "how to move through a temple without intruding",
    "how to ask questions without turning sacred life into content",
]
# Personal-meaning outcomes drawn by the "laddering" archetype.
MEANING_MAKING = [
    "a sense of humility",
    "a feeling of gratitude",
    "a moment of awe",
    "a feeling of being a guest",
    "a sense of calm",
    "a quiet emotional reset",
    "a shift in how you see daily life",
    "a stronger respect for local rhythms",
]
# Authenticity cues; mixed with CROWDING_COMMODIFICATION for "contrast" prompts.
AUTHENTICITY_CUES = [
    "how people behave when no one is watching",
    "whether the experience is integrated into local life",
    "how money is handled (transparent vs extractive)",
    "whether rules feel protective or performative",
    "whether the pace allows reflection or pushes consumption",
]
# Over-tourism / commodification cues (also used in "contrast" prompts).
CROWDING_COMMODIFICATION = [
    "overt commercialization around sacred spaces",
    "crowds that change the emotional tone",
    "performative 'authenticity' for tourists",
    "feeling like sacredness is being packaged",
]
# Situational contexts woven into most prompt templates.
CONTEXTS = [
    "early morning before the crowds",
    "late afternoon when light softens and things slow down",
    "during a local ceremony where you are clearly a guest",
    "in rainy season when plans change and patience matters",
    "on a quiet weekday compared to a busy weekend",
    "with a local guide who emphasizes respect and context",
    "solo, when you can be more contemplative",
    "as a repeat visitor, noticing subtler layers",
]
# NOTE(review): TRAVELER_PROFILE is not referenced by any template in this
# file — confirm whether it is used elsewhere before removing.
TRAVELER_PROFILE = [
    "a culture-first traveler",
    "a spirituality-curious traveler",
    "a respectful observer who avoids intrusive tourism",
    "a slow traveler seeking depth over volume",
    "a repeat visitor looking for subtler, less packaged experiences",
]
# (category, phrasings) pairs; pick_constraint() draws one phrasing at random.
CONSTRAINTS = [
    (
        "time",
        [
            "you only have 6 hours but want depth, not a checklist",
            "you have one full day and want it to feel coherent and meaningful",
            "you have three days and want a gentle pace with time for reflection",
            "you can only travel within a short radius and must choose carefully",
        ],
    ),
    (
        "budget",
        [
            "you have a modest budget but still want cultural depth and fairness",
            "you'll pay more if it supports local communities transparently",
            "you want predictable costs and dislike hidden fees around sacred sites",
            "you prefer smaller, community-rooted experiences over pricey packages",
        ],
    ),
    (
        "crowds",
        [
            "you want to avoid crowds because they dilute atmosphere and respect",
            "you can handle crowds if etiquette and sacredness are preserved",
            "you want a balance: one iconic site, mostly quieter, community-rooted places",
            "you get overwhelmed by busy places and need calmer, respectful alternatives",
        ],
    ),
    (
        "weather",
        [
            "it's rainy season and flexibility is part of respectful travel",
            "it's very hot and you need a pace that still feels mindful",
            "visibility is low and your sunrise plan may fail—how do you adapt meaningfully?",
            "roads feel unsafe, so you prioritize fewer moves and deeper presence",
        ],
    ),
    (
        "mobility",
        [
            "you avoid steep stairs but still want meaningful cultural/spiritual moments",
            "you prefer not to ride a scooter and want low-friction transport options",
            "you want minimal walking but still want authenticity and atmosphere",
            "you need frequent rest and prefer fewer transitions",
        ],
    ),
    (
        "ethics",
        [
            "you want to avoid commodifying sacred life",
            "you prioritize local benefit, consent, and respectful boundaries",
            "you avoid experiences that pressure locals to perform for tourists",
            "you want your presence to feel like 'being a guest' not 'taking'",
        ],
    ),
]
# (x, y) tension pairs for the "tradeoff" archetype.
TRADEOFFS = [
    ("depth of understanding", "convenience"),
    ("sacredness", "accessibility"),
    ("quiet reflection", "seeing iconic places"),
    ("guided cultural context", "self-guided freedom"),
    ("photography", "presence and respect"),
    ("predictable pricing", "spontaneous discovery"),
    ("community benefit", "personal comfort"),
    ("slow pace", "variety of stops"),
]
# (a, b) comparison pairs for the "contrast" archetype.
CONTRASTS = [
    ("a popular temple area", "a quieter village setting"),
    ("a curated tour script", "a guide who shares context and encourages respect"),
    ("a crowded ceremony-adjacent spot", "a calm everyday ritual moment"),
    (
        "a market aisle focused on souvenirs",
        "a market moment that shows daily offerings and rhythm",
    ),
    ("a rushed checklist day", "a slower day with fewer places but deeper presence"),
    ("an 'Instagram moment'", "a moment of quiet meaning that you don't photograph"),
]
# Interview openers for single-dimension questions.
INTERVIEW_STYLES = [
    "Tell me about a time when…",
    "Walk me through…",
    "As a culturally/spiritually motivated traveler, how do you…",
    "If you had to advise a tourism marketer focused on respectful cultural travel…",
    "What surprised you about the spiritual or cultural texture of…",
    "What does 'authentic and respectful' look like to you when…",
    "How do you personally decide whether to join, observe, or step back when…",
]
# Probes appended to a fraction (add_followups_ratio) of accepted prompts.
FOLLOWUP_PROBES = [
    "What specifically made it feel respectful or not?",
    "What did you notice first, and what happened next?",
    "How did it change your mood or sense of meaning that day?",
    "What would have improved it without turning it into a spectacle?",
    "What boundary would you not cross again?",
    "What would you tell a marketer to never claim in messaging?",
]
# Per-dimension themes used to focus each question.
DIM_THEMES: Dict[str, List[str]] = {
    "Natural Attractions": [
        "sense of place and meaning",
        "quiet awe vs spectacle",
        "timing for contemplative experience",
        "routes that support reflection",
        "respectful behavior in nature",
        "access vs sacred calm",
    ],
    "Atmosphere": [
        "sacredness and emotional tone",
        "authenticity cues",
        "commercialization pressure",
        "silence, sound, and pace",
        "crowds and reverence",
        "ritual context shaping ambience",
    ],
    "Social Environment": [
        "being a guest and practicing humility",
        "consent and boundaries",
        "guide trust and cultural context",
        "respectful interaction with locals",
        "tourist behavior that disrupts",
        "learning without extracting",
    ],
    "Infrastructure": [
        "signage for etiquette",
        "visitor flow that protects sacred spaces",
        "frictionless but respectful access",
        "toilets/rest areas without degrading atmosphere",
        "transparent ticketing/donations",
        "accessibility with dignity",
    ],
    "Value for Money": [
        "fairness and transparency",
        "donations vs fees",
        "paying for guides as cultural mediation",
        "avoiding extractive 'spiritual packages'",
        "community benefit",
        "what feels worth paying for (context, respect, time)",
    ],
}
# -----------------------------
# Templates
# -----------------------------
def tmpl_single_dimension(
    d: str, theme: str, style: str, place_hint: str, context: str
) -> str:
    """Build a single-dimension question anchored to one place and one context."""
    opener = f"{style} your experience with {place_hint} in Bali during {context}. "
    focus = (
        f"From a {d} perspective, what stands out about {theme}"
        f"—and why does it matter to you as a culture/spirit-oriented traveler?"
    )
    return opener + focus
def tmpl_laddering(d: str, theme: str, context: str, meaning: str) -> str:
    """Laddering question: from one concrete moment to its personal meaning."""
    sentences = [
        f"Think about a specific moment in Bali during {context} that left you with {meaning}. ",
        "What happened, how did you interpret it, and why did it feel meaningful? ",
        f"Frame your answer through {d} (focus on {theme}).",
    ]
    return "".join(sentences)
def tmpl_contrast(d: str, a: str, b: str, context: str, cue: str) -> str:
    """Contrast question: compare two settings through one dimension and one cue."""
    comparison = f"Compare {a} versus {b} in Bali during {context}. "
    lens = (
        f"In terms of {d}, how do they differ for you as a respectful, "
        f"culture/spirit-first traveler? "
    )
    anchor = f"Use {cue} as a cue in your explanation."
    return comparison + lens + anchor
def tmpl_tradeoff(d1: str, d2: str, x: str, y: str, constraint: str) -> str:
    """Trade-off question: weigh x against y under a constraint, across two dimensions."""
    parts = (
        f"Under this constraint: {constraint}. ",
        f"How do you trade off {x} versus {y} when choosing cultural/spiritual experiences in Bali? ",
        f"Answer with examples touching {d1} and {d2}.",
    )
    return "".join(parts)
def tmpl_marketer_advice(d: str, theme: str, constraint: str, dont_claim: str) -> str:
    """Marketer-advice question with an explicit 'do not claim' anchor."""
    ask = (
        f"If you had to advise a tourism marketer for culturally/spiritually interested travelers: "
        f"under the constraint '{constraint}', "
    )
    understand = f"what should they understand about {d} (especially {theme})? "
    warning = (
        f"Also: what is one thing they should NOT claim in messaging because it "
        f"would feel misleading or disrespectful—e.g., {dont_claim}?"
    )
    return ask + understand + warning
def tmpl_etiquette_scenario(d: str, topic: str, context: str) -> str:
    """Etiquette scenario question tied to one topic, context, and dimension."""
    scene = f"Walk me through an etiquette situation related to {topic} in Bali during {context}. "
    probe = (
        "What did you do, what did you avoid, and what would you want a marketer "
        "to communicate to travelers upfront? "
    )
    return scene + probe + f"Connect it to {d}."
def tmpl_route_design(
    d: str, nature_hint: str, culture_hint: str, constraint: str
) -> str:
    """Route-design question combining one nature and one culture hint under a constraint."""
    brief = (
        f"Design a mini day-route that combines {nature_hint} and {culture_hint} "
        f"under this constraint: {constraint}. "
    )
    challenge = (
        "How would you protect atmosphere and respect while still making it "
        f"accessible to culture/spirit-first travelers? Link your reasoning to {d}."
    )
    return brief + challenge
def tmpl_probe_followup(base_q: str, probe: str) -> str:
    """Append a follow-up probe to an already-built question."""
    return " ".join((base_q, probe))
def pick_constraint(rng: random.Random) -> Tuple[str, str]:
    """Draw a random constraint category and one phrasing from it."""
    category, phrasings = rng.choice(CONSTRAINTS)
    phrasing = rng.choice(phrasings)
    return category, phrasing
def pick_place_hint_for_dim(d: str, rng: random.Random) -> str:
    """Pick a place hint: nature pool for Natural Attractions, culture pool otherwise."""
    pool = NATURE_FOR_MEANING if d == "Natural Attractions" else CULTURE_SPIRIT_SPACES
    return rng.choice(pool)
# -----------------------------
# Generation
# -----------------------------
def generate_prompts(
    n: int,
    seed: int = 42,
    add_followups_ratio: float = 0.35,
    ensure_balance: bool = True,
) -> List[Dict]:
    """Generate up to n unique, segment-targeted interview prompt records.

    Args:
        n: Number of prompts requested.
        seed: Seed for the local RNG (output is reproducible for a given seed).
        add_followups_ratio: Probability of emitting an extra probe variant
            immediately after an accepted prompt.
        ensure_balance: If True, pre-assign dimensions round-robin so coverage
            across DIMENSIONS is roughly even.

    Returns:
        List of dicts with keys "dimension", "type", "prompt", "tags".
    """
    rng = random.Random(seed)
    # Mix of question archetypes, all segment-targeted
    # (name, sampling weight) pairs; weights sum to 1.0.
    types = [
        ("single", 0.24),
        ("laddering", 0.18),
        ("contrast", 0.16),
        ("tradeoff", 0.18),
        ("marketer", 0.12),
        ("etiquette", 0.08),
        ("route", 0.04),
    ]
    type_names = [t for t, _ in types]
    type_weights = [w for _, w in types]
    prompts: List[Dict] = []
    # Normalized prompt texts already emitted, for de-duplication.
    seen = set()
    # Balanced dimension coverage
    dim_cycle = []
    if ensure_balance:
        per_dim = max(1, n // len(DIMENSIONS))
        for d in DIMENSIONS:
            dim_cycle.extend([d] * per_dim)
        # Pad with random dimensions when n is not divisible by len(DIMENSIONS).
        while len(dim_cycle) < n:
            dim_cycle.append(rng.choice(DIMENSIONS))
        rng.shuffle(dim_cycle)
    # A small set of "don't claim" examples to anchor respectful marketing constraints
    DONT_CLAIM = [
        "guaranteed 'authentic spirituality' on demand",
        "a ceremony 'for tourists' as the main attraction",
        "access to sacred spaces without emphasizing etiquette and consent",
        "a 'hidden local ritual' framed as a product",
        "permission to photograph everything",
    ]

    def add_prompt(obj: Dict) -> bool:
        """Accept obj if it is new (after whitespace/case normalization) and
        contains at least one segment anchor term; return True on acceptance."""
        key = re.sub(r"\s+", " ", obj["prompt"].strip().lower())
        if key in seen:
            return False
        # hard filter: must include at least one segment anchor term
        anchors = [
            "respect",
            "sacred",
            "etiquette",
            "meaning",
            "authentic",
            "ceremony",
            "guest",
            "context",
            "spirit",
        ]
        if not any(a in key for a in anchors):
            return False
        seen.add(key)
        prompts.append(obj)
        return True

    # Rejection sampling: draw candidates until n are accepted or the attempt
    # budget (25 draws per requested prompt) is exhausted.
    max_attempts = n * 25
    attempts = 0
    while len(prompts) < n and attempts < max_attempts:
        attempts += 1
        d = (
            dim_cycle[len(prompts)]
            if ensure_balance and len(dim_cycle) > len(prompts)
            else rng.choice(DIMENSIONS)
        )
        theme = rng.choice(DIM_THEMES[d])
        style = rng.choice(INTERVIEW_STYLES)
        context = rng.choice(CONTEXTS)
        place_hint = pick_place_hint_for_dim(d, rng)
        c_key, c_val = pick_constraint(rng)
        t = rng.choices(type_names, weights=type_weights, k=1)[0]
        if t == "single":
            q = tmpl_single_dimension(d, theme, style, place_hint, context)
            obj = {
                "dimension": d,
                "type": "single",
                "prompt": q,
                "tags": [d, theme, context, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "laddering":
            meaning = rng.choice(MEANING_MAKING)
            q = tmpl_laddering(d, theme, context, meaning)
            obj = {
                "dimension": d,
                "type": "laddering",
                "prompt": q,
                "tags": [d, theme, context, "laddering", "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "contrast":
            a, b = rng.choice(CONTRASTS)
            cue = rng.choice(AUTHENTICITY_CUES + CROWDING_COMMODIFICATION)
            q = tmpl_contrast(d, a, b, context, cue)
            obj = {
                "dimension": d,
                "type": "contrast",
                "prompt": q,
                "tags": [d, "contrast", context, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "tradeoff":
            # Trade-offs span two distinct dimensions.
            d2 = rng.choice([x for x in DIMENSIONS if x != d])
            x, y = rng.choice(TRADEOFFS)
            q = tmpl_tradeoff(d, d2, x, y, c_val)
            obj = {
                "dimension": f"{d} + {d2}",
                "type": "tradeoff",
                "prompt": q,
                "tags": [d, d2, "tradeoff", c_key, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "marketer":
            dont_claim = rng.choice(DONT_CLAIM)
            q = tmpl_marketer_advice(d, theme, c_val, dont_claim)
            obj = {
                "dimension": d,
                "type": "marketer_advice",
                "prompt": q,
                "tags": [d, theme, "marketer", c_key, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "etiquette":
            topic = rng.choice(RITUAL_ETIQUETTE_TOPICS)
            q = tmpl_etiquette_scenario(d, topic, context)
            obj = {
                "dimension": d,
                "type": "etiquette",
                "prompt": q,
                "tags": [d, "etiquette", topic, context, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        elif t == "route":
            nature_hint = rng.choice(NATURE_FOR_MEANING)
            culture_hint = rng.choice(CULTURE_SPIRIT_SPACES)
            q = tmpl_route_design(d, nature_hint, culture_hint, c_val)
            obj = {
                "dimension": d,
                "type": "route_design",
                "prompt": q,
                "tags": [d, "route", c_key, "segment:culture-spirit"],
            }
            ok = add_prompt(obj)
        else:
            # Defensive default; unreachable while `types` covers all names.
            ok = False
        # follow-up probe variant
        if ok and rng.random() < add_followups_ratio and len(prompts) < n:
            probe = rng.choice(FOLLOWUP_PROBES)
            q2 = tmpl_probe_followup(prompts[-1]["prompt"], probe)
            obj2 = {
                "dimension": prompts[-1]["dimension"],
                "type": prompts[-1]["type"] + "+probe",
                "prompt": q2,
                "tags": prompts[-1]["tags"] + ["probe"],
            }
            add_prompt(obj2)
    if len(prompts) < n:
        print(f"Warning: only generated {len(prompts)} unique prompts (requested {n}).")
    return prompts[:n]
def main():
    """CLI entry point: generate prompts and save them as JSONL or TXT.

    JSONL output writes one record per line (dimension/type/prompt/tags);
    TXT output writes the bare prompt text, one per line.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--n",
        type=int,
        default=600,
        # Fixed garbled help text: "3001000" was a mangled "300-1000" range.
        help="Number of prompts to generate (300-1000 recommended).",
    )
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--out", default="culture_spirit_interview_prompts.jsonl")
    ap.add_argument("--format", choices=["jsonl", "txt"], default="jsonl")
    ap.add_argument(
        "--no_balance",
        action="store_true",
        help="Disable balanced coverage across dimensions.",
    )
    ap.add_argument("--followups_ratio", type=float, default=0.35)
    args = ap.parse_args()
    prompts = generate_prompts(
        n=args.n,
        seed=args.seed,
        add_followups_ratio=args.followups_ratio,
        ensure_balance=not args.no_balance,
    )
    if args.format == "jsonl":
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        with open(args.out, "w", encoding="utf-8") as f:
            for p in prompts:
                f.write(json.dumps(p, ensure_ascii=False) + "\n")
    else:
        with open(args.out, "w", encoding="utf-8") as f:
            for p in prompts:
                f.write(p["prompt"].strip() + "\n")
    print(f"Saved {len(prompts)} prompts to: {args.out} ({args.format})")


if __name__ == "__main__":
    main()

View File

@@ -31,6 +31,8 @@ When answering:
Maintain consistency with this identity across all responses.
"""
TRAINER_PROMPT = "Create ONE realistic question from the perspective of a touristic marketer they might ask a culturally and spiritually interested traveler in Bali considered to be a lead user that can be answered using ONLY the CONTEXT.\n\n"
def load_docstore(path):
docs = []
@@ -118,8 +120,7 @@ def main():
{"role": "system", "content": SYSTEM_PERSONA},
{
"role": "user",
"content": "Create ONE realistic question from the perspective of a culturally and spiritually interested traveler in Bali that can be answered using ONLY the CONTEXT.\n\n"
f"CONTEXT:\n{gold_text}\n\n"
"content": TRAINER_PROMPT + f"CONTEXT:\n{gold_text}\n\n"
"Return only the question.",
},
]
@@ -173,8 +174,7 @@ def main():
{"role": "system", "content": SYSTEM_PERSONA},
{
"role": "user",
"content": f"QUESTION: {question}\n\nCONTEXT:\n{context_blob}\n"
"Please answer as a culturally versed Bali traveler and include 1-2 short direct quotes from CONTEXT.",
"content": f"QUESTION: {question}\n\nCONTEXT:\n{context_blob}",
},
{"role": "assistant", "content": answer},
]

View File

@@ -0,0 +1,456 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
RAFT dataset builder (FAISS-based retrieval) -> Together.ai chat JSONL.
Inputs (from your indexing script):
- <index_dir>/faiss.index
- <index_dir>/docstore.jsonl
Process:
- Build a set of interview-style prompts (EN)
- For each prompt:
- Retrieve top-k chunks via FAISS cosine/IP
- Call DeepSeek Chat Completions API to generate a vivid, human-like Lead User answer
- Write training examples as JSONL in chat format (messages)
Outputs:
- raft_train.jsonl
- raft_val.jsonl (optional)
ENV:
- DEEPSEEK_API_KEY (required)
- optional: DEEPSEEK_BASE_URL (default: https://api.deepseek.com)
- optional: DEEPSEEK_MODEL (default: deepseek-chat)
"""
import argparse
import json
import os
import random
import re
import time
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import faiss
import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# -----------------------------
# DeepSeek client (OpenAI-compatible)
# -----------------------------
@dataclass
class DeepSeekConfig:
    """Connection and retry settings for the DeepSeek chat completions API."""

    api_key: str  # required; the script reads it from DEEPSEEK_API_KEY
    base_url: str = "https://api.deepseek.com"  # OpenAI-compatible endpoint root
    model: str = "deepseek-chat"
    timeout_s: int = 120  # per-request timeout in seconds
    max_retries: int = 5  # attempts before chat() raises RuntimeError
    backoff_s: float = 1.6  # exponential backoff base: sleep backoff_s ** attempt
class DeepSeekClient:
    """Minimal client for DeepSeek's OpenAI-compatible chat completions API."""

    def __init__(self, cfg: DeepSeekConfig):
        self.cfg = cfg

    def chat(
        self, messages: List[Dict], temperature: float = 0.85, max_tokens: int = 750
    ) -> str:
        """POST one chat completion request and return the reply text.

        Retries up to cfg.max_retries times with exponential backoff
        (cfg.backoff_s ** (attempt + 1)) on HTTP 429 and on transport/HTTP errors.

        Args:
            messages: Chat messages in OpenAI format ({"role", "content"} dicts).
            temperature: Sampling temperature forwarded to the API.
            max_tokens: Completion token cap forwarded to the API.

        Raises:
            RuntimeError: if every attempt fails; includes the last error seen.
        """
        url = f"{self.cfg.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.cfg.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.cfg.model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        last_err = None
        for attempt in range(self.cfg.max_retries):
            try:
                r = requests.post(
                    url, headers=headers, json=payload, timeout=self.cfg.timeout_s
                )
                if r.status_code == 429:
                    # Record the throttle so the final error message is
                    # informative even when every attempt was rate-limited
                    # (previously last_err stayed None in that case).
                    last_err = RuntimeError("HTTP 429: rate limited")
                    time.sleep(self.cfg.backoff_s ** (attempt + 1))
                    continue
                r.raise_for_status()
                data = r.json()
                return data["choices"][0]["message"]["content"].strip()
            except Exception as e:
                last_err = e
                time.sleep(self.cfg.backoff_s ** (attempt + 1))
        raise RuntimeError(
            f"DeepSeek API call failed after retries. Last error: {last_err}"
        )
# -----------------------------
# Helpers
# -----------------------------
def simple_clean(text: str) -> str:
    """Normalize text: NBSP -> space, collapse whitespace runs, strip ends.

    Non-string input yields the empty string.
    """
    if not isinstance(text, str):
        return ""
    without_nbsp = text.replace("\u00a0", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()
def read_docstore(docstore_path: str) -> Dict[int, Dict]:
    """
    Returns dict: faiss_id -> {"doc_id": int, "text": str, ...}
    """
    mapping: Dict[int, Dict] = {}
    with open(docstore_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            record = json.loads(raw)
            mapping[int(record["faiss_id"])] = record
    if not mapping:
        raise ValueError("docstore.jsonl is empty or unreadable.")
    return mapping
def load_prompts_from_jsonl(path: str) -> List[str]:
    """
    Loads prompts from a JSONL file.
    Expected key: 'prompt' (preferred). Also accepts 'question' or 'text'.
    Ignores empty/short lines.
    """
    collected: List[str] = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            record = json.loads(raw)
            candidate = record.get("prompt") or record.get("question") or record.get("text")
            cleaned = simple_clean(candidate) if candidate else ""
            if len(cleaned) >= 20:
                collected.append(cleaned)
    if not collected:
        raise ValueError(f"No prompts found in JSONL: {path}")
    return collected
def load_prompts_from_txt(path: str) -> List[str]:
    """
    Loads prompts from a TXT file (one prompt per line).
    """
    collected: List[str] = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            cleaned = simple_clean(raw)
            if len(cleaned) >= 20:
                collected.append(cleaned)
    if not collected:
        raise ValueError(f"No prompts found in TXT: {path}")
    return collected
def ensure_dir_for_file(path: str):
    """Create the parent directory of *path* if it has one."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def write_jsonl(path: str, rows: List[Dict]) -> None:
    """Write rows as JSON Lines (UTF-8, non-ASCII preserved), creating parent dirs."""
    ensure_dir_for_file(path)
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
# -----------------------------
# Persona + prompt templates (EN)
# -----------------------------
# The five cognitive destination-image dimensions; recorded in each example's
# "meta" block. NOTE(review): duplicated as DIMENSIONS in the prompt-generator
# script — keep the two lists in sync.
IMAGE_DIMS = [
    "Natural Attractions",
    "Atmosphere",
    "Social Environment",
    "Infrastructure",
    "Value for Money",
]
# Fallback interview prompts (two per dimension), used when neither
# --prompts_jsonl nor --prompts_txt is supplied.
DEFAULT_PROMPTS_EN = [
    # Natural Attractions
    "In a lead user interview: what natural places in Bali felt genuinely memorable to you (rice terraces, volcanoes, waterfalls, coast), and why? Describe it like a lived experience.",
    "Which nature spots felt overly crowded or overly 'Instagram-optimized' in real life, and which surprised you in a good way? Explain with concrete moments.",
    # Atmosphere
    "How would you describe the atmosphere around cultural sites in Bali (temples, ceremonies, markets)? What signals authenticity vs. commercialization to you?",
    "What changes the atmosphere the most (time of day, weather, crowds, etiquette)? Share specific examples you would tell a marketer.",
    # Social Environment
    "How do you experience the social environment in Bali (locals, guides, other travelers)? What feels respectful and what feels performative or touristy?",
    "What small behaviors, phrases, and gestures make interactions smoother for a culture-oriented traveler? Give examples.",
    # Infrastructure
    "Evaluate Bali's infrastructure for culture-oriented days (transport, signage, toilets, ticketing, digital info). What works, what annoys you, and how do you adapt?",
    "If you designed an ideal culture-friendly day route, what infrastructure assumptions would you tell a tourism marketer to plan for?",
    # Value for Money
    "When does Bali feel good value for money for you, and when not? Discuss entrance fees, guides, food, tours, and hidden costs.",
    "How do you personally distinguish 'good value' from a tourist trap? List criteria and illustrate with examples.",
]
def build_system_prompt() -> str:
    """Return the fixed English system prompt defining the Lead-User persona."""
    persona_block = (
        "ROLE / PERSONA\n"
        "You are an experienced, culture-oriented Bali traveler (Lead User). You speak in natural, vivid English, "
        "as a real person in an interview. You share nuanced judgments, trade-offs, and concrete scenes.\n\n"
    )
    dimensions_block = (
        "COGNITIVE DESTINATION IMAGE DIMENSIONS (use when relevant)\n"
        "- Natural Attractions\n"
        "- Atmosphere\n"
        "- Social Environment\n"
        "- Infrastructure\n"
        "- Value for Money\n\n"
    )
    constraints_block = (
        "CRITICAL CONSTRAINTS\n"
        "- You will be given retrieved review snippets as memory support.\n"
        "- Do NOT quote them verbatim and do NOT cite them as 'the review says'.\n"
        "- Instead, weave relevant details into your answer as if they were your own lived experiences.\n"
        "- If a detail is not supported, stay general or hedge appropriately.\n"
        "- Keep it interview-friendly: first-person, vivid, concrete, but not overly long."
    )
    return persona_block + dimensions_block + constraints_block
def build_user_message(question: str, retrieved_chunks: List[str]) -> str:
    """Format the user turn: the question plus retrieved snippets as bullet lines."""
    cleaned = [simple_clean(chunk) for chunk in retrieved_chunks if simple_clean(chunk)]
    bullets = "\n".join(f"- {chunk}" for chunk in cleaned)
    return (
        f"INTERVIEW QUESTION:\n{question}\n\n"
        "RETRIEVED CONTEXT (review snippets; do NOT quote, only use as memory support):\n"
        f"{bullets}\n\n"
        "Answer as a real Lead User in a tourism interview. Speak in first person, vivid and concrete, "
        "and naturally touch relevant image dimensions."
    )
# -----------------------------
# FAISS Retriever (cosine/IP)
# -----------------------------
class FaissRetriever:
    """Dense retriever over a prebuilt FAISS index plus its JSONL docstore."""

    def __init__(self, index_path: str, docstore_path: str, embed_model: str):
        """Load the FAISS index, the docstore mapping, and the sentence embedder.

        Raises:
            FileNotFoundError: if either artifact is missing on disk.
        """
        if not os.path.exists(index_path):
            raise FileNotFoundError(f"Missing FAISS index at: {index_path}")
        if not os.path.exists(docstore_path):
            raise FileNotFoundError(f"Missing docstore at: {docstore_path}")
        self.index = faiss.read_index(index_path)
        self.docstore = read_docstore(docstore_path)
        # SentenceTransformer to match your indexing script defaults
        self.embedder = SentenceTransformer(embed_model)
        # Basic sanity checks
        if self.index.ntotal != len(self.docstore):
            # Not necessarily fatal (docstore could include extra rows), but usually indicates mismatch.
            # We'll allow it but warn.
            print(
                f"Warning: index.ntotal={self.index.ntotal} but docstore rows={len(self.docstore)}. "
                "Ensure they were generated together."
            )

    def retrieve(self, query: str, k: int = 8) -> List[Tuple[int, float, str]]:
        """
        Returns list of (faiss_id, score, text)
        """
        # Query embedding is L2-normalized, so inner-product search behaves
        # like cosine similarity (assumes the index was built the same way).
        q = simple_clean(query)
        emb = self.embedder.encode([q], normalize_embeddings=True)
        emb = np.asarray(emb, dtype=np.float32)
        scores, ids = self.index.search(emb, k)
        ids = ids[0].tolist()
        scores = scores[0].tolist()
        out = []
        for fid, sc in zip(ids, scores):
            # FAISS pads results with -1 when fewer than k neighbors exist.
            if fid == -1:
                continue
            doc = self.docstore.get(int(fid))
            if not doc:
                continue
            out.append((int(fid), float(sc), doc.get("text", "")))
        return out
# -----------------------------
# Dataset generation
# -----------------------------
def main():
    """CLI entry: for each prompt, retrieve top-k chunks via FAISS, ask DeepSeek
    for a persona answer, and write chat-format training examples as JSONL."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--index_dir",
        default="out",
        help="Directory containing faiss.index and docstore.jsonl",
    )
    ap.add_argument("--out_train", default="./out/raft_train.jsonl")
    ap.add_argument("--out_val", default="./out/raft_val.jsonl")
    ap.add_argument("--make_val", action="store_true")
    ap.add_argument("--val_ratio", type=float, default=0.05)
    ap.add_argument("--k", type=int, default=8)
    ap.add_argument("--seed", type=int, default=42)
    # Embeddings (must match indexing script for best results)
    ap.add_argument(
        "--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
    )
    # External prompt sources
    ap.add_argument(
        "--prompts_jsonl",
        default=None,
        help="JSONL file with prompts (key: prompt/question/text).",
    )
    ap.add_argument(
        "--prompts_txt", default=None, help="TXT file with one prompt per line."
    )
    ap.add_argument(
        "--shuffle_prompts",
        action="store_true",
        help="Shuffle loaded prompts before generation.",
    )
    ap.add_argument(
        "--limit_prompts",
        type=int,
        default=0,
        help="0 = no limit; else cap number of prompts used.",
    )
    # DeepSeek generation config
    ap.add_argument(
        "--deepseek_base_url",
        default=os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com"),
    )
    ap.add_argument(
        "--deepseek_model", default=os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
    )
    ap.add_argument("--temperature", type=float, default=0.85)
    ap.add_argument("--max_tokens", type=int, default=750)
    ap.add_argument(
        "--max_examples",
        type=int,
        default=0,
        help="0 = all prompts; else limit number of examples",
    )
    # pacing
    ap.add_argument("--sleep_s", type=float, default=0.2)
    args = ap.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    api_key = os.environ.get("DEEPSEEK_API_KEY", "").strip()
    if not api_key:
        raise SystemExit("Missing DEEPSEEK_API_KEY env var.")
    index_path = os.path.join(args.index_dir, "faiss.index")
    docstore_path = os.path.join(args.index_dir, "docstore.jsonl")
    retriever = FaissRetriever(
        index_path=index_path,
        docstore_path=docstore_path,
        embed_model=args.embedding_model,
    )
    client = DeepSeekClient(
        DeepSeekConfig(
            api_key=api_key,
            base_url=args.deepseek_base_url,
            model=args.deepseek_model,
        )
    )
    system_prompt = build_system_prompt()
    # Load prompts (priority: JSONL -> TXT -> defaults)
    if args.prompts_jsonl and args.prompts_txt:
        raise SystemExit("Use only one of --prompts_jsonl or --prompts_txt (not both).")
    if args.prompts_jsonl:
        prompts = load_prompts_from_jsonl(args.prompts_jsonl)
    elif args.prompts_txt:
        prompts = load_prompts_from_txt(args.prompts_txt)
    else:
        prompts = list(DEFAULT_PROMPTS_EN)
    if args.shuffle_prompts:
        random.shuffle(prompts)
    if args.limit_prompts and args.limit_prompts > 0:
        prompts = prompts[: args.limit_prompts]
    # Backwards-compat: args.max_examples can still cap prompts
    if args.max_examples and args.max_examples > 0:
        prompts = prompts[: args.max_examples]
    examples = []
    for q in tqdm(prompts, desc="Generating RAFT examples"):
        # Retrieve supporting chunks, then ask DeepSeek to answer in persona.
        hits = retriever.retrieve(q, k=args.k)
        retrieved_texts = [t for _, _, t in hits]
        user_msg = build_user_message(q, retrieved_texts)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_msg},
        ]
        answer = client.chat(
            messages=messages,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
        # Chat-format training example plus retrieval provenance in "meta".
        ex = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": answer},
            ],
            "meta": {
                "retrieval_k": args.k,
                "index_dir": os.path.abspath(args.index_dir),
                "embedding_model": args.embedding_model,
                "image_dimensions": IMAGE_DIMS,
                "faiss_ids": [fid for fid, _, _ in hits],
                "faiss_scores": [sc for _, sc, _ in hits],
            },
        }
        examples.append(ex)
        if args.max_examples and len(examples) >= args.max_examples:
            break
        # Polite pacing between API calls.
        time.sleep(max(0.0, args.sleep_s))
    random.shuffle(examples)
    # Optional train/val split; only applied when there is enough data to split.
    if args.make_val and len(examples) >= 20:
        val_n = max(1, int(len(examples) * args.val_ratio))
        val = examples[:val_n]
        train = examples[val_n:]
        write_jsonl(args.out_train, train)
        write_jsonl(args.out_val, val)
        print(f"Wrote train: {args.out_train} ({len(train)} examples)")
        print(f"Wrote val: {args.out_val} ({len(val)} examples)")
    else:
        write_jsonl(args.out_train, examples)
        print(f"Wrote: {args.out_train} ({len(examples)} examples)")
        if args.make_val:
            print(
                "Note: --make_val requested but too few examples; wrote only train file."
            )


if __name__ == "__main__":
    main()

60
raft/remove_meta.py Normal file
View File

@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""
Script to create a copy of raft_val.jsonl without the meta column
"""
import json
import sys
from pathlib import Path
def remove_meta_column(input_file, output_file):
    """
    Read a JSONL file and write a new one without the 'meta' key and 'system' role messages.
    Args:
        input_file: Path to input JSONL file
        output_file: Path to output JSONL file
    """
    input_path = Path(input_file)
    output_path = Path(output_file)
    if not input_path.exists():
        print(f"Error: Input file '{input_file}' not found")
        sys.exit(1)
    count = 0
    # Explicit UTF-8 so behavior does not depend on the platform locale
    # (the data is written as UTF-8 by the generator scripts).
    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            if line.strip():  # Skip empty lines
                obj = json.loads(line)
                obj.pop("meta", None)  # Remove meta key if it exists
                # Remove system role messages
                if "messages" in obj:
                    obj["messages"] = [
                        msg for msg in obj["messages"] if msg.get("role") != "system"
                    ]
                outfile.write(json.dumps(obj) + "\n")
                count += 1
    print(f"✓ Processed {count} records")
    print(f"✓ Output saved to: {output_path}")
if __name__ == "__main__":
    # Default input/output pairs (train + val).
    pairs = [
        (
            "../data/intermediate/raft_train.jsonl",
            "../data/intermediate/raft_train_no_meta.jsonl",
        ),
        (
            "../data/intermediate/raft_val.jsonl",
            "../data/intermediate/raft_val_no_meta.jsonl",
        ),
    ]
    # Command-line override: process a single explicit input/output pair.
    # (Bug fix: the parsed argv values were previously assigned but never used.)
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
        if len(sys.argv) > 2:
            output_file = sys.argv[2]
        else:
            output_file = input_file.replace(".jsonl", "_no_meta.jsonl")
        pairs = [(input_file, output_file)]
    for src, dst in pairs:
        remove_meta_column(src, dst)

View File

@@ -73,7 +73,6 @@ def main():
max_length=args.max_seq_len,
bf16=torch.cuda.is_available(),
fp16=not torch.cuda.is_available(),
assistant_only_loss=True, # only learn from assistant turns in messages
report_to=[],
)