mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 08:22:43 +01:00
RAFT shenanigans
This commit is contained in:
@@ -21,7 +21,7 @@ python make_raft_data.py --out_dir out --n_examples 10
|
||||
## Training der QLoRA-Adapter
|
||||
|
||||
```bash
|
||||
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
|
||||
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
|
||||
```
|
||||
|
||||
## Inferenz
|
||||
|
||||
560
raft/generate_culture_spirit_interview_prompts.py
Normal file
560
raft/generate_culture_spirit_interview_prompts.py
Normal file
@@ -0,0 +1,560 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Generate 300–1000+ English interview questions targeted ONLY at culturally/spiritually
|
||||
interested Bali tourists (Lead Users), covering 5 cognitive destination image dimensions:
|
||||
- Natural Attractions
|
||||
- Atmosphere
|
||||
- Social Environment
|
||||
- Infrastructure
|
||||
- Value for Money
|
||||
|
||||
Key constraint:
|
||||
- Every prompt must be meaningful for culture/spirituality-first travelers.
|
||||
- Avoid party/shopping/hedonistic positioning.
|
||||
- Include etiquette, authenticity, sacredness, commodification, meaning-making, reflection.
|
||||
|
||||
Outputs:
|
||||
- JSONL: {"dimension": "...", "type": "...", "prompt": "...", "tags": [...]}
|
||||
- or TXT: one prompt per line
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
DIMENSIONS = [
|
||||
"Natural Attractions",
|
||||
"Atmosphere",
|
||||
"Social Environment",
|
||||
"Infrastructure",
|
||||
"Value for Money",
|
||||
]
|
||||
|
||||
# -----------------------------
|
||||
# Segment-specific building blocks
|
||||
# -----------------------------
|
||||
# Keep places generic (no need to hallucinate specific proper nouns)
|
||||
NATURE_FOR_MEANING = [
|
||||
"rice terraces that feel lived-in rather than staged",
|
||||
"waterfalls approached with a quiet, respectful mood",
|
||||
"volcano viewpoints that invite reflection at dawn",
|
||||
"jungle walks where you notice offerings and small shrines",
|
||||
"lake areas that feel calm and contemplative",
|
||||
"coastal paths that feel like a moving meditation",
|
||||
"hot springs experienced as restoration rather than spectacle",
|
||||
]
|
||||
|
||||
CULTURE_SPIRIT_SPACES = [
|
||||
"temple courtyards and entry paths",
|
||||
"a village ceremony you observe respectfully",
|
||||
"a traditional market where everyday ritual shows up in small ways",
|
||||
"a dance performance where you try to read symbolism",
|
||||
"a craft workshop focused on meaning and lineage, not souvenirs",
|
||||
"a community space where offerings are prepared",
|
||||
"a quiet heritage walk where stories feel layered",
|
||||
]
|
||||
|
||||
RITUAL_ETIQUETTE_TOPICS = [
|
||||
"dress codes and modesty",
|
||||
"offerings and what not to touch",
|
||||
"photography boundaries",
|
||||
"when to speak vs stay quiet",
|
||||
"how to move through a temple without intruding",
|
||||
"how to ask questions without turning sacred life into content",
|
||||
]
|
||||
|
||||
MEANING_MAKING = [
|
||||
"a sense of humility",
|
||||
"a feeling of gratitude",
|
||||
"a moment of awe",
|
||||
"a feeling of being a guest",
|
||||
"a sense of calm",
|
||||
"a quiet emotional reset",
|
||||
"a shift in how you see daily life",
|
||||
"a stronger respect for local rhythms",
|
||||
]
|
||||
|
||||
AUTHENTICITY_CUES = [
|
||||
"how people behave when no one is watching",
|
||||
"whether the experience is integrated into local life",
|
||||
"how money is handled (transparent vs extractive)",
|
||||
"whether rules feel protective or performative",
|
||||
"whether the pace allows reflection or pushes consumption",
|
||||
]
|
||||
|
||||
CROWDING_COMMODIFICATION = [
|
||||
"overt commercialization around sacred spaces",
|
||||
"crowds that change the emotional tone",
|
||||
"performative 'authenticity' for tourists",
|
||||
"feeling like sacredness is being packaged",
|
||||
]
|
||||
|
||||
CONTEXTS = [
|
||||
"early morning before the crowds",
|
||||
"late afternoon when light softens and things slow down",
|
||||
"during a local ceremony where you are clearly a guest",
|
||||
"in rainy season when plans change and patience matters",
|
||||
"on a quiet weekday compared to a busy weekend",
|
||||
"with a local guide who emphasizes respect and context",
|
||||
"solo, when you can be more contemplative",
|
||||
"as a repeat visitor, noticing subtler layers",
|
||||
]
|
||||
|
||||
TRAVELER_PROFILE = [
|
||||
"a culture-first traveler",
|
||||
"a spirituality-curious traveler",
|
||||
"a respectful observer who avoids intrusive tourism",
|
||||
"a slow traveler seeking depth over volume",
|
||||
"a repeat visitor looking for subtler, less packaged experiences",
|
||||
]
|
||||
|
||||
CONSTRAINTS = [
|
||||
(
|
||||
"time",
|
||||
[
|
||||
"you only have 6 hours but want depth, not a checklist",
|
||||
"you have one full day and want it to feel coherent and meaningful",
|
||||
"you have three days and want a gentle pace with time for reflection",
|
||||
"you can only travel within a short radius and must choose carefully",
|
||||
],
|
||||
),
|
||||
(
|
||||
"budget",
|
||||
[
|
||||
"you have a modest budget but still want cultural depth and fairness",
|
||||
"you'll pay more if it supports local communities transparently",
|
||||
"you want predictable costs and dislike hidden fees around sacred sites",
|
||||
"you prefer smaller, community-rooted experiences over pricey packages",
|
||||
],
|
||||
),
|
||||
(
|
||||
"crowds",
|
||||
[
|
||||
"you want to avoid crowds because they dilute atmosphere and respect",
|
||||
"you can handle crowds if etiquette and sacredness are preserved",
|
||||
"you want a balance: one iconic site, mostly quieter, community-rooted places",
|
||||
"you get overwhelmed by busy places and need calmer, respectful alternatives",
|
||||
],
|
||||
),
|
||||
(
|
||||
"weather",
|
||||
[
|
||||
"it's rainy season and flexibility is part of respectful travel",
|
||||
"it's very hot and you need a pace that still feels mindful",
|
||||
"visibility is low and your sunrise plan may fail—how do you adapt meaningfully?",
|
||||
"roads feel unsafe, so you prioritize fewer moves and deeper presence",
|
||||
],
|
||||
),
|
||||
(
|
||||
"mobility",
|
||||
[
|
||||
"you avoid steep stairs but still want meaningful cultural/spiritual moments",
|
||||
"you prefer not to ride a scooter and want low-friction transport options",
|
||||
"you want minimal walking but still want authenticity and atmosphere",
|
||||
"you need frequent rest and prefer fewer transitions",
|
||||
],
|
||||
),
|
||||
(
|
||||
"ethics",
|
||||
[
|
||||
"you want to avoid commodifying sacred life",
|
||||
"you prioritize local benefit, consent, and respectful boundaries",
|
||||
"you avoid experiences that pressure locals to perform for tourists",
|
||||
"you want your presence to feel like 'being a guest' not 'taking'",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
TRADEOFFS = [
|
||||
("depth of understanding", "convenience"),
|
||||
("sacredness", "accessibility"),
|
||||
("quiet reflection", "seeing iconic places"),
|
||||
("guided cultural context", "self-guided freedom"),
|
||||
("photography", "presence and respect"),
|
||||
("predictable pricing", "spontaneous discovery"),
|
||||
("community benefit", "personal comfort"),
|
||||
("slow pace", "variety of stops"),
|
||||
]
|
||||
|
||||
CONTRASTS = [
|
||||
("a popular temple area", "a quieter village setting"),
|
||||
("a curated tour script", "a guide who shares context and encourages respect"),
|
||||
("a crowded ceremony-adjacent spot", "a calm everyday ritual moment"),
|
||||
(
|
||||
"a market aisle focused on souvenirs",
|
||||
"a market moment that shows daily offerings and rhythm",
|
||||
),
|
||||
("a rushed checklist day", "a slower day with fewer places but deeper presence"),
|
||||
("an 'Instagram moment'", "a moment of quiet meaning that you don't photograph"),
|
||||
]
|
||||
|
||||
INTERVIEW_STYLES = [
|
||||
"Tell me about a time when…",
|
||||
"Walk me through…",
|
||||
"As a culturally/spiritually motivated traveler, how do you…",
|
||||
"If you had to advise a tourism marketer focused on respectful cultural travel…",
|
||||
"What surprised you about the spiritual or cultural texture of…",
|
||||
"What does 'authentic and respectful' look like to you when…",
|
||||
"How do you personally decide whether to join, observe, or step back when…",
|
||||
]
|
||||
|
||||
FOLLOWUP_PROBES = [
|
||||
"What specifically made it feel respectful or not?",
|
||||
"What did you notice first, and what happened next?",
|
||||
"How did it change your mood or sense of meaning that day?",
|
||||
"What would have improved it without turning it into a spectacle?",
|
||||
"What boundary would you not cross again?",
|
||||
"What would you tell a marketer to never claim in messaging?",
|
||||
]
|
||||
|
||||
DIM_THEMES: Dict[str, List[str]] = {
|
||||
"Natural Attractions": [
|
||||
"sense of place and meaning",
|
||||
"quiet awe vs spectacle",
|
||||
"timing for contemplative experience",
|
||||
"routes that support reflection",
|
||||
"respectful behavior in nature",
|
||||
"access vs sacred calm",
|
||||
],
|
||||
"Atmosphere": [
|
||||
"sacredness and emotional tone",
|
||||
"authenticity cues",
|
||||
"commercialization pressure",
|
||||
"silence, sound, and pace",
|
||||
"crowds and reverence",
|
||||
"ritual context shaping ambience",
|
||||
],
|
||||
"Social Environment": [
|
||||
"being a guest and practicing humility",
|
||||
"consent and boundaries",
|
||||
"guide trust and cultural context",
|
||||
"respectful interaction with locals",
|
||||
"tourist behavior that disrupts",
|
||||
"learning without extracting",
|
||||
],
|
||||
"Infrastructure": [
|
||||
"signage for etiquette",
|
||||
"visitor flow that protects sacred spaces",
|
||||
"frictionless but respectful access",
|
||||
"toilets/rest areas without degrading atmosphere",
|
||||
"transparent ticketing/donations",
|
||||
"accessibility with dignity",
|
||||
],
|
||||
"Value for Money": [
|
||||
"fairness and transparency",
|
||||
"donations vs fees",
|
||||
"paying for guides as cultural mediation",
|
||||
"avoiding extractive 'spiritual packages'",
|
||||
"community benefit",
|
||||
"what feels worth paying for (context, respect, time)",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Templates
|
||||
# -----------------------------
|
||||
def tmpl_single_dimension(
    d: str, theme: str, style: str, place_hint: str, context: str
) -> str:
    """Build a single-dimension interview question anchored to one place and context."""
    opener = f"{style} your experience with {place_hint} in Bali during {context}. "
    focus = (
        f"From a {d} perspective, what stands out about {theme}—and why does it "
        "matter to you as a culture/spirit-oriented traveler?"
    )
    return opener + focus
|
||||
|
||||
|
||||
def tmpl_laddering(d: str, theme: str, context: str, meaning: str) -> str:
    """Build a laddering-style question probing the 'why' behind a felt meaning."""
    parts = [
        f"Think about a specific moment in Bali during {context} that left you with {meaning}. ",
        "What happened, how did you interpret it, and why did it feel meaningful? ",
        f"Frame your answer through {d} (focus on {theme}).",
    ]
    return "".join(parts)
|
||||
|
||||
|
||||
def tmpl_contrast(d: str, a: str, b: str, context: str, cue: str) -> str:
    """Build a contrast question comparing two settings through one dimension."""
    setup = f"Compare {a} versus {b} in Bali during {context}. "
    ask = (
        f"In terms of {d}, how do they differ for you as a respectful, "
        "culture/spirit-first traveler? "
    )
    hint = f"Use {cue} as a cue in your explanation."
    return setup + ask + hint
|
||||
|
||||
|
||||
def tmpl_tradeoff(d1: str, d2: str, x: str, y: str, constraint: str) -> str:
    """Build a trade-off question bridging two dimensions under a constraint."""
    pieces = (
        f"Under this constraint: {constraint}. ",
        f"How do you trade off {x} versus {y} when choosing cultural/spiritual experiences in Bali? ",
        f"Answer with examples touching {d1} and {d2}.",
    )
    return "".join(pieces)
|
||||
|
||||
|
||||
def tmpl_marketer_advice(d: str, theme: str, constraint: str, dont_claim: str) -> str:
    """Build a marketer-advice question plus a 'do not claim' guardrail example."""
    advice = (
        "If you had to advise a tourism marketer for culturally/spiritually "
        f"interested travelers: under the constraint '{constraint}', "
        f"what should they understand about {d} (especially {theme})? "
    )
    guardrail = (
        "Also: what is one thing they should NOT claim in messaging because it "
        f"would feel misleading or disrespectful—e.g., {dont_claim}?"
    )
    return advice + guardrail
|
||||
|
||||
|
||||
def tmpl_etiquette_scenario(d: str, topic: str, context: str) -> str:
    """Build an etiquette scenario walk-through question."""
    return "".join(
        (
            f"Walk me through an etiquette situation related to {topic} in Bali during {context}. ",
            "What did you do, what did you avoid, and what would you want a marketer to communicate to travelers upfront? ",
            f"Connect it to {d}.",
        )
    )
|
||||
|
||||
|
||||
def tmpl_route_design(
    d: str, nature_hint: str, culture_hint: str, constraint: str
) -> str:
    """Build a day-route design question mixing one nature and one culture stop."""
    design = f"Design a mini day-route that combines {nature_hint} and {culture_hint} under this constraint: {constraint}. "
    ask = (
        "How would you protect atmosphere and respect while still making it "
        f"accessible to culture/spirit-first travelers? Link your reasoning to {d}."
    )
    return design + ask
|
||||
|
||||
|
||||
def tmpl_probe_followup(base_q: str, probe: str) -> str:
    """Append a follow-up probe to an existing question, separated by one space."""
    return " ".join((base_q, probe))
|
||||
|
||||
|
||||
def pick_constraint(rng: random.Random) -> Tuple[str, str]:
    """Pick a random constraint category and one concrete wording from it."""
    category, options = rng.choice(CONSTRAINTS)
    return category, rng.choice(options)
|
||||
|
||||
|
||||
def pick_place_hint_for_dim(d: str, rng: random.Random) -> str:
    """Pick a place hint suited to the dimension: nature spots for
    'Natural Attractions', cultural/spiritual spaces for everything else."""
    pool = NATURE_FOR_MEANING if d == "Natural Attractions" else CULTURE_SPIRIT_SPACES
    return rng.choice(pool)
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Generation
|
||||
# -----------------------------
|
||||
def generate_prompts(
    n: int,
    seed: int = 42,
    add_followups_ratio: float = 0.35,
    ensure_balance: bool = True,
) -> List[Dict]:
    """Generate up to ``n`` unique, segment-anchored interview prompts.

    Args:
        n: Target number of prompts.
        seed: Seed for the local RNG, making output reproducible.
        add_followups_ratio: Probability of emitting a probe variant right
            after a successfully added base prompt.
        ensure_balance: If True, spread prompts roughly evenly across DIMENSIONS.

    Returns:
        List of dicts with keys "dimension", "type", "prompt", "tags".
    """
    rng = random.Random(seed)

    # Mix of question archetypes, all segment-targeted: (type name, weight).
    archetypes = [
        ("single", 0.24),
        ("laddering", 0.18),
        ("contrast", 0.16),
        ("tradeoff", 0.18),
        ("marketer", 0.12),
        ("etiquette", 0.08),
        ("route", 0.04),
    ]
    type_names = [t for t, _ in archetypes]
    type_weights = [w for _, w in archetypes]

    prompts: List[Dict] = []
    seen = set()

    # Pre-plan a shuffled cycle of dimensions for balanced coverage.
    dim_cycle: List[str] = []
    if ensure_balance:
        per_dim = max(1, n // len(DIMENSIONS))
        for d in DIMENSIONS:
            dim_cycle.extend([d] * per_dim)
        while len(dim_cycle) < n:
            dim_cycle.append(rng.choice(DIMENSIONS))
        rng.shuffle(dim_cycle)

    # A small set of "don't claim" examples to anchor respectful marketing constraints.
    dont_claim_pool = [
        "guaranteed 'authentic spirituality' on demand",
        "a ceremony 'for tourists' as the main attraction",
        "access to sacred spaces without emphasizing etiquette and consent",
        "a 'hidden local ritual' framed as a product",
        "permission to photograph everything",
    ]

    # Segment anchor terms. Hoisted: the original rebuilt this list inside
    # add_prompt for every single candidate.
    anchors = (
        "respect",
        "sacred",
        "etiquette",
        "meaning",
        "authentic",
        "ceremony",
        "guest",
        "context",
        "spirit",
    )

    def add_prompt(obj: Dict) -> bool:
        """Append obj if its prompt is unique and contains a segment anchor term."""
        key = re.sub(r"\s+", " ", obj["prompt"].strip().lower())
        if key in seen:
            return False
        # Hard filter: must include at least one segment anchor term.
        if not any(a in key for a in anchors):
            return False
        seen.add(key)
        prompts.append(obj)
        return True

    def build_candidate(t, d, theme, style, context, place_hint, c_key, c_val):
        """Construct the prompt dict for archetype ``t``; factored out of the
        original's seven near-identical elif branches. Returns None for an
        unknown archetype."""
        if t == "single":
            return {
                "dimension": d,
                "type": "single",
                "prompt": tmpl_single_dimension(d, theme, style, place_hint, context),
                "tags": [d, theme, context, "segment:culture-spirit"],
            }
        if t == "laddering":
            meaning = rng.choice(MEANING_MAKING)
            return {
                "dimension": d,
                "type": "laddering",
                "prompt": tmpl_laddering(d, theme, context, meaning),
                "tags": [d, theme, context, "laddering", "segment:culture-spirit"],
            }
        if t == "contrast":
            a, b = rng.choice(CONTRASTS)
            cue = rng.choice(AUTHENTICITY_CUES + CROWDING_COMMODIFICATION)
            return {
                "dimension": d,
                "type": "contrast",
                "prompt": tmpl_contrast(d, a, b, context, cue),
                "tags": [d, "contrast", context, "segment:culture-spirit"],
            }
        if t == "tradeoff":
            d2 = rng.choice([x for x in DIMENSIONS if x != d])
            x, y = rng.choice(TRADEOFFS)
            return {
                "dimension": f"{d} + {d2}",
                "type": "tradeoff",
                "prompt": tmpl_tradeoff(d, d2, x, y, c_val),
                "tags": [d, d2, "tradeoff", c_key, "segment:culture-spirit"],
            }
        if t == "marketer":
            dont_claim = rng.choice(dont_claim_pool)
            return {
                "dimension": d,
                "type": "marketer_advice",
                "prompt": tmpl_marketer_advice(d, theme, c_val, dont_claim),
                "tags": [d, theme, "marketer", c_key, "segment:culture-spirit"],
            }
        if t == "etiquette":
            topic = rng.choice(RITUAL_ETIQUETTE_TOPICS)
            return {
                "dimension": d,
                "type": "etiquette",
                "prompt": tmpl_etiquette_scenario(d, topic, context),
                "tags": [d, "etiquette", topic, context, "segment:culture-spirit"],
            }
        if t == "route":
            nature_hint = rng.choice(NATURE_FOR_MEANING)
            culture_hint = rng.choice(CULTURE_SPIRIT_SPACES)
            return {
                "dimension": d,
                "type": "route_design",
                "prompt": tmpl_route_design(d, nature_hint, culture_hint, c_val),
                "tags": [d, "route", c_key, "segment:culture-spirit"],
            }
        return None

    # Rejected candidates (duplicates / missing anchors) consume attempts,
    # so cap total work to avoid an infinite loop when n is unreachable.
    max_attempts = n * 25
    attempts = 0

    while len(prompts) < n and attempts < max_attempts:
        attempts += 1

        d = (
            dim_cycle[len(prompts)]
            if ensure_balance and len(dim_cycle) > len(prompts)
            else rng.choice(DIMENSIONS)
        )
        theme = rng.choice(DIM_THEMES[d])
        style = rng.choice(INTERVIEW_STYLES)
        context = rng.choice(CONTEXTS)
        place_hint = pick_place_hint_for_dim(d, rng)
        c_key, c_val = pick_constraint(rng)

        t = rng.choices(type_names, weights=type_weights, k=1)[0]

        candidate = build_candidate(
            t, d, theme, style, context, place_hint, c_key, c_val
        )
        ok = add_prompt(candidate) if candidate is not None else False

        # Follow-up probe variant derived from the prompt just added.
        if ok and rng.random() < add_followups_ratio and len(prompts) < n:
            base = prompts[-1]
            probe = rng.choice(FOLLOWUP_PROBES)
            add_prompt(
                {
                    "dimension": base["dimension"],
                    "type": base["type"] + "+probe",
                    "prompt": tmpl_probe_followup(base["prompt"], probe),
                    "tags": base["tags"] + ["probe"],
                }
            )

    if len(prompts) < n:
        print(f"Warning: only generated {len(prompts)} unique prompts (requested {n}).")

    return prompts[:n]
|
||||
|
||||
|
||||
def main():
    """CLI entry point: generate prompts and write them as JSONL or TXT."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--n",
        type=int,
        default=600,
        help="Number of prompts to generate (300–1000 recommended).",
    )
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--out", default="culture_spirit_interview_prompts.jsonl")
    parser.add_argument("--format", choices=["jsonl", "txt"], default="jsonl")
    parser.add_argument(
        "--no_balance",
        action="store_true",
        help="Disable balanced coverage across dimensions.",
    )
    parser.add_argument("--followups_ratio", type=float, default=0.35)
    args = parser.parse_args()

    prompts = generate_prompts(
        n=args.n,
        seed=args.seed,
        add_followups_ratio=args.followups_ratio,
        ensure_balance=not args.no_balance,
    )

    # Single writer; the only difference between formats is the line payload.
    with open(args.out, "w", encoding="utf-8") as f:
        if args.format == "jsonl":
            f.writelines(json.dumps(p, ensure_ascii=False) + "\n" for p in prompts)
        else:
            f.writelines(p["prompt"].strip() + "\n" for p in prompts)

    print(f"Saved {len(prompts)} prompts to: {args.out} ({args.format})")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -31,6 +31,8 @@ When answering:
|
||||
Maintain consistency with this identity across all responses.
|
||||
"""
|
||||
|
||||
TRAINER_PROMPT = "Create ONE realistic question from the perspective of a touristic marketer they might ask a culturally and spiritually interested traveler in Bali considered to be a lead user that can be answered using ONLY the CONTEXT.\n\n"
|
||||
|
||||
|
||||
def load_docstore(path):
|
||||
docs = []
|
||||
@@ -118,8 +120,7 @@ def main():
|
||||
{"role": "system", "content": SYSTEM_PERSONA},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Create ONE realistic question from the perspective of a culturally and spiritually interested traveler in Bali that can be answered using ONLY the CONTEXT.\n\n"
|
||||
f"CONTEXT:\n{gold_text}\n\n"
|
||||
"content": TRAINER_PROMPT + f"CONTEXT:\n{gold_text}\n\n"
|
||||
"Return only the question.",
|
||||
},
|
||||
]
|
||||
@@ -173,8 +174,7 @@ def main():
|
||||
{"role": "system", "content": SYSTEM_PERSONA},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"QUESTION: {question}\n\nCONTEXT:\n{context_blob}\n"
|
||||
"Please answer as a culturally versed Bali traveler and include 1-2 short direct quotes from CONTEXT.",
|
||||
"content": f"QUESTION: {question}\n\nCONTEXT:\n{context_blob}",
|
||||
},
|
||||
{"role": "assistant", "content": answer},
|
||||
]
|
||||
|
||||
456
raft/make_raft_data_deepseek.py
Normal file
456
raft/make_raft_data_deepseek.py
Normal file
@@ -0,0 +1,456 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
RAFT dataset builder (FAISS-based retrieval) -> Together.ai chat JSONL.
|
||||
|
||||
Inputs (from your indexing script):
|
||||
- <index_dir>/faiss.index
|
||||
- <index_dir>/docstore.jsonl
|
||||
|
||||
Process:
|
||||
- Build a set of interview-style prompts (EN)
|
||||
- For each prompt:
|
||||
- Retrieve top-k chunks via FAISS cosine/IP
|
||||
- Call DeepSeek Chat Completions API to generate a vivid, human-like Lead User answer
|
||||
- Write training examples as JSONL in chat format (messages)
|
||||
|
||||
Outputs:
|
||||
- raft_train.jsonl
|
||||
- raft_val.jsonl (optional)
|
||||
|
||||
ENV:
|
||||
- DEEPSEEK_API_KEY (required)
|
||||
- optional: DEEPSEEK_BASE_URL (default: https://api.deepseek.com)
|
||||
- optional: DEEPSEEK_MODEL (default: deepseek-chat)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import faiss
|
||||
import numpy as np
|
||||
import requests
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# DeepSeek client (OpenAI-compatible)
|
||||
# -----------------------------
|
||||
@dataclass
class DeepSeekConfig:
    """Connection and retry settings for the DeepSeek chat-completions API."""

    # Bearer token placed in the Authorization header (required).
    api_key: str
    # API root; the client appends /chat/completions.
    base_url: str = "https://api.deepseek.com"
    # Chat model identifier sent in the request payload.
    model: str = "deepseek-chat"
    # Per-request timeout in seconds.
    timeout_s: int = 120
    # Maximum number of attempts before giving up.
    max_retries: int = 5
    # Base of the exponential backoff between attempts (seconds).
    backoff_s: float = 1.6
|
||||
|
||||
|
||||
class DeepSeekClient:
    """Thin wrapper around DeepSeek's OpenAI-compatible chat-completions endpoint."""

    def __init__(self, cfg: DeepSeekConfig):
        self.cfg = cfg

    def chat(
        self, messages: List[Dict], temperature: float = 0.85, max_tokens: int = 750
    ) -> str:
        """POST `messages` to /chat/completions and return the reply text.

        Retries rate limits (HTTP 429) and transient errors with exponential
        backoff (cfg.backoff_s ** attempt). Raises RuntimeError after
        cfg.max_retries failed attempts.
        """
        url = f"{self.cfg.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.cfg.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.cfg.model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }

        last_err = None
        for attempt in range(self.cfg.max_retries):
            try:
                r = requests.post(
                    url, headers=headers, json=payload, timeout=self.cfg.timeout_s
                )
                if r.status_code == 429:
                    # Fix: remember the rate-limit; previously last_err stayed
                    # None and the final error message said "Last error: None".
                    last_err = RuntimeError("HTTP 429: rate limited")
                else:
                    r.raise_for_status()
                    data = r.json()
                    return data["choices"][0]["message"]["content"].strip()
            except Exception as e:
                last_err = e
            # Fix: skip the pointless backoff sleep after the final attempt —
            # we are about to raise, not retry.
            if attempt < self.cfg.max_retries - 1:
                time.sleep(self.cfg.backoff_s ** (attempt + 1))

        raise RuntimeError(
            f"DeepSeek API call failed after retries. Last error: {last_err}"
        )
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Helpers
|
||||
# -----------------------------
|
||||
def simple_clean(text: str) -> str:
    """Normalize a string: NBSP -> space, collapse whitespace runs, strip ends.

    Any non-string input yields the empty string.
    """
    if not isinstance(text, str):
        return ""
    normalized = text.replace("\u00a0", " ")
    return re.sub(r"\s+", " ", normalized).strip()
|
||||
|
||||
|
||||
def read_docstore(docstore_path: str) -> Dict[int, Dict]:
    """Load docstore.jsonl into a mapping faiss_id -> row dict.

    Each non-empty line must be a JSON object carrying a "faiss_id" key.
    Raises ValueError when the file yields no rows at all.
    """
    mapping: Dict[int, Dict] = {}
    with open(docstore_path, "r", encoding="utf-8") as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                continue  # tolerate blank lines
            row = json.loads(raw)
            mapping[int(row["faiss_id"])] = row
    if not mapping:
        raise ValueError("docstore.jsonl is empty or unreadable.")
    return mapping
|
||||
|
||||
|
||||
def load_prompts_from_jsonl(path: str) -> List[str]:
    """Load prompts from a JSONL file.

    Prefers key 'prompt', falling back to 'question' then 'text'.
    Lines whose cleaned prompt is shorter than 20 characters are skipped.
    Raises ValueError when nothing usable is found.
    """
    collected: List[str] = []
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                continue
            obj = json.loads(raw)
            candidate = obj.get("prompt") or obj.get("question") or obj.get("text")
            # Inlined normalization (same contract as simple_clean):
            # non-string/falsy values become "".
            cleaned = ""
            if candidate and isinstance(candidate, str):
                cleaned = re.sub(r"\s+", " ", candidate.replace("\u00a0", " ")).strip()
            if len(cleaned) >= 20:
                collected.append(cleaned)
    if not collected:
        raise ValueError(f"No prompts found in JSONL: {path}")
    return collected
|
||||
|
||||
|
||||
def load_prompts_from_txt(path: str) -> List[str]:
    """Load prompts from a TXT file, one prompt per line.

    Whitespace is normalized; cleaned lines shorter than 20 characters are
    skipped. Raises ValueError when nothing usable is found.
    """
    collected: List[str] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Inlined normalization (same contract as simple_clean).
            cleaned = re.sub(r"\s+", " ", line.replace("\u00a0", " ")).strip()
            if len(cleaned) >= 20:
                collected.append(cleaned)
    if not collected:
        raise ValueError(f"No prompts found in TXT: {path}")
    return collected
|
||||
|
||||
|
||||
def ensure_dir_for_file(path: str):
    """Create the parent directory of `path` if it has one (idempotent)."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
|
||||
|
||||
|
||||
def write_jsonl(path: str, rows: List[Dict]) -> None:
    """Write `rows` to `path` as JSON Lines (UTF-8, one object per line)."""
    # Inlined parent-directory creation (ensure_dir_for_file) so the open()
    # below never fails on a fresh output path.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Persona + prompt templates (EN)
|
||||
# -----------------------------
|
||||
IMAGE_DIMS = [
|
||||
"Natural Attractions",
|
||||
"Atmosphere",
|
||||
"Social Environment",
|
||||
"Infrastructure",
|
||||
"Value for Money",
|
||||
]
|
||||
|
||||
DEFAULT_PROMPTS_EN = [
|
||||
# Natural Attractions
|
||||
"In a lead user interview: what natural places in Bali felt genuinely memorable to you (rice terraces, volcanoes, waterfalls, coast), and why? Describe it like a lived experience.",
|
||||
"Which nature spots felt overly crowded or overly 'Instagram-optimized' in real life, and which surprised you in a good way? Explain with concrete moments.",
|
||||
# Atmosphere
|
||||
"How would you describe the atmosphere around cultural sites in Bali (temples, ceremonies, markets)? What signals authenticity vs. commercialization to you?",
|
||||
"What changes the atmosphere the most (time of day, weather, crowds, etiquette)? Share specific examples you would tell a marketer.",
|
||||
# Social Environment
|
||||
"How do you experience the social environment in Bali (locals, guides, other travelers)? What feels respectful and what feels performative or touristy?",
|
||||
"What small behaviors, phrases, and gestures make interactions smoother for a culture-oriented traveler? Give examples.",
|
||||
# Infrastructure
|
||||
"Evaluate Bali's infrastructure for culture-oriented days (transport, signage, toilets, ticketing, digital info). What works, what annoys you, and how do you adapt?",
|
||||
"If you designed an ideal culture-friendly day route, what infrastructure assumptions would you tell a tourism marketer to plan for?",
|
||||
# Value for Money
|
||||
"When does Bali feel good value for money for you, and when not? Discuss entrance fees, guides, food, tours, and hidden costs.",
|
||||
"How do you personally distinguish 'good value' from a tourist trap? List criteria and illustrate with examples.",
|
||||
]
|
||||
|
||||
|
||||
def build_system_prompt() -> str:
    """Return the static Lead-User persona system prompt."""
    sections = [
        "ROLE / PERSONA\n",
        "You are an experienced, culture-oriented Bali traveler (Lead User). You speak in natural, vivid English, ",
        "as a real person in an interview. You share nuanced judgments, trade-offs, and concrete scenes.\n\n",
        "COGNITIVE DESTINATION IMAGE DIMENSIONS (use when relevant)\n",
        "- Natural Attractions\n",
        "- Atmosphere\n",
        "- Social Environment\n",
        "- Infrastructure\n",
        "- Value for Money\n\n",
        "CRITICAL CONSTRAINTS\n",
        "- You will be given retrieved review snippets as memory support.\n",
        "- Do NOT quote them verbatim and do NOT cite them as 'the review says'.\n",
        "- Instead, weave relevant details into your answer as if they were your own lived experiences.\n",
        "- If a detail is not supported, stay general or hedge appropriately.\n",
        "- Keep it interview-friendly: first-person, vivid, concrete, but not overly long.",
    ]
    return "".join(sections)
|
||||
|
||||
|
||||
def build_user_message(question: str, retrieved_chunks: List[str]) -> str:
    """Format the interview question plus retrieved snippets as one user message.

    Snippets are whitespace-normalized; empty/non-string ones are dropped.
    """
    cleaned: List[str] = []
    for chunk in retrieved_chunks:
        # Inlined normalization (same contract as simple_clean).
        if isinstance(chunk, str):
            c = re.sub(r"\s+", " ", chunk.replace("\u00a0", " ")).strip()
        else:
            c = ""
        if c:
            cleaned.append(c)
    bullets = "\n".join(f"- {c}" for c in cleaned)
    return (
        f"INTERVIEW QUESTION:\n{question}\n\n"
        "RETRIEVED CONTEXT (review snippets; do NOT quote, only use as memory support):\n"
        f"{bullets}\n\n"
        "Answer as a real Lead User in a tourism interview. Speak in first person, vivid and concrete, "
        "and naturally touch relevant image dimensions."
    )
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# FAISS Retriever (cosine/IP)
|
||||
# -----------------------------
|
||||
class FaissRetriever:
    """Cosine/inner-product retriever over a prebuilt FAISS index + docstore."""

    def __init__(self, index_path: str, docstore_path: str, embed_model: str):
        """Load the FAISS index, docstore mapping, and sentence encoder.

        Raises FileNotFoundError when either artifact is missing.
        """
        if not os.path.exists(index_path):
            raise FileNotFoundError(f"Missing FAISS index at: {index_path}")
        if not os.path.exists(docstore_path):
            raise FileNotFoundError(f"Missing docstore at: {docstore_path}")

        self.index = faiss.read_index(index_path)
        self.docstore = read_docstore(docstore_path)

        # SentenceTransformer to match your indexing script defaults.
        self.embedder = SentenceTransformer(embed_model)

        # Sanity check: a size mismatch usually means index and docstore come
        # from different runs. Not necessarily fatal (docstore could include
        # extra rows), so warn and continue.
        if self.index.ntotal != len(self.docstore):
            print(
                f"Warning: index.ntotal={self.index.ntotal} but docstore rows={len(self.docstore)}. "
                "Ensure they were generated together."
            )

    def retrieve(self, query: str, k: int = 8) -> List[Tuple[int, float, str]]:
        """Return the top-k matches as (faiss_id, score, text) tuples."""
        cleaned = simple_clean(query)
        vec = np.asarray(
            self.embedder.encode([cleaned], normalize_embeddings=True),
            dtype=np.float32,
        )

        scores, ids = self.index.search(vec, k)

        hits: List[Tuple[int, float, str]] = []
        for fid, score in zip(ids[0].tolist(), scores[0].tolist()):
            if fid == -1:
                continue  # FAISS pads short result sets with -1 ids
            row = self.docstore.get(int(fid))
            if row:
                hits.append((int(fid), float(score), row.get("text", "")))
        return hits
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Dataset generation
|
||||
# -----------------------------
|
||||
def main():
    """CLI entry point: build a RAFT-style SFT dataset.

    For each interview prompt: retrieve top-k review snippets from the FAISS
    index, ask DeepSeek to answer in persona, and store the resulting
    (system, user, assistant) conversation plus retrieval metadata as JSONL.
    Requires the DEEPSEEK_API_KEY environment variable.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--index_dir",
        default="out",
        help="Directory containing faiss.index and docstore.jsonl",
    )
    ap.add_argument("--out_train", default="./out/raft_train.jsonl")
    ap.add_argument("--out_val", default="./out/raft_val.jsonl")
    ap.add_argument("--make_val", action="store_true")
    ap.add_argument("--val_ratio", type=float, default=0.05)
    ap.add_argument("--k", type=int, default=8)
    ap.add_argument("--seed", type=int, default=42)

    # Embeddings (must match indexing script for best results)
    ap.add_argument(
        "--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
    )

    # External prompt sources
    ap.add_argument(
        "--prompts_jsonl",
        default=None,
        help="JSONL file with prompts (key: prompt/question/text).",
    )
    ap.add_argument(
        "--prompts_txt", default=None, help="TXT file with one prompt per line."
    )
    ap.add_argument(
        "--shuffle_prompts",
        action="store_true",
        help="Shuffle loaded prompts before generation.",
    )
    ap.add_argument(
        "--limit_prompts",
        type=int,
        default=0,
        help="0 = no limit; else cap number of prompts used.",
    )

    # DeepSeek generation config
    ap.add_argument(
        "--deepseek_base_url",
        default=os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com"),
    )
    ap.add_argument(
        "--deepseek_model", default=os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
    )
    ap.add_argument("--temperature", type=float, default=0.85)
    ap.add_argument("--max_tokens", type=int, default=750)
    ap.add_argument(
        "--max_examples",
        type=int,
        default=0,
        help="0 = all prompts; else limit number of examples",
    )

    # pacing
    ap.add_argument("--sleep_s", type=float, default=0.2)

    args = ap.parse_args()
    # Seed both RNGs so prompt shuffling and the train/val split are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)

    api_key = os.environ.get("DEEPSEEK_API_KEY", "").strip()
    if not api_key:
        raise SystemExit("Missing DEEPSEEK_API_KEY env var.")

    index_path = os.path.join(args.index_dir, "faiss.index")
    docstore_path = os.path.join(args.index_dir, "docstore.jsonl")

    retriever = FaissRetriever(
        index_path=index_path,
        docstore_path=docstore_path,
        embed_model=args.embedding_model,
    )

    client = DeepSeekClient(
        DeepSeekConfig(
            api_key=api_key,
            base_url=args.deepseek_base_url,
            model=args.deepseek_model,
        )
    )

    system_prompt = build_system_prompt()

    # Load prompts (priority: JSONL -> TXT -> defaults)
    if args.prompts_jsonl and args.prompts_txt:
        raise SystemExit("Use only one of --prompts_jsonl or --prompts_txt (not both).")

    if args.prompts_jsonl:
        prompts = load_prompts_from_jsonl(args.prompts_jsonl)
    elif args.prompts_txt:
        prompts = load_prompts_from_txt(args.prompts_txt)
    else:
        prompts = list(DEFAULT_PROMPTS_EN)

    if args.shuffle_prompts:
        random.shuffle(prompts)

    if args.limit_prompts and args.limit_prompts > 0:
        prompts = prompts[: args.limit_prompts]

    # Backwards-compat: args.max_examples can still cap prompts
    # NOTE(review): max_examples caps both here and inside the loop below;
    # the in-loop break is redundant after this pre-cap but kept for safety.
    if args.max_examples and args.max_examples > 0:
        prompts = prompts[: args.max_examples]

    examples = []
    for q in tqdm(prompts, desc="Generating RAFT examples"):
        # Retrieve supporting review snippets and build the RAFT user turn.
        hits = retriever.retrieve(q, k=args.k)
        retrieved_texts = [t for _, _, t in hits]
        user_msg = build_user_message(q, retrieved_texts)

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_msg},
        ]

        answer = client.chat(
            messages=messages,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )

        # Store the full conversation plus retrieval provenance for auditing.
        ex = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": answer},
            ],
            "meta": {
                "retrieval_k": args.k,
                "index_dir": os.path.abspath(args.index_dir),
                "embedding_model": args.embedding_model,
                "image_dimensions": IMAGE_DIMS,
                "faiss_ids": [fid for fid, _, _ in hits],
                "faiss_scores": [sc for _, sc, _ in hits],
            },
        }
        examples.append(ex)

        if args.max_examples and len(examples) >= args.max_examples:
            break

        # Throttle API calls between prompts.
        time.sleep(max(0.0, args.sleep_s))

    # Shuffle before splitting so the val set is not biased by prompt order.
    random.shuffle(examples)

    if args.make_val and len(examples) >= 20:
        # Split off at least one validation example.
        val_n = max(1, int(len(examples) * args.val_ratio))
        val = examples[:val_n]
        train = examples[val_n:]
        write_jsonl(args.out_train, train)
        write_jsonl(args.out_val, val)
        print(f"Wrote train: {args.out_train} ({len(train)} examples)")
        print(f"Wrote val:   {args.out_val} ({len(val)} examples)")
    else:
        write_jsonl(args.out_train, examples)
        print(f"Wrote: {args.out_train} ({len(examples)} examples)")
        if args.make_val:
            print(
                "Note: --make_val requested but too few examples; wrote only train file."
            )


if __name__ == "__main__":
    main()
|
||||
60
raft/remove_meta.py
Normal file
60
raft/remove_meta.py
Normal file
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to create a copy of raft_val.jsonl without the meta column
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def remove_meta_column(input_file, output_file):
    """
    Read a JSONL file and write a new one without the 'meta' key and 'system' role messages.

    Args:
        input_file: Path to input JSONL file
        output_file: Path to output JSONL file
    """
    src = Path(input_file)
    dst = Path(output_file)

    if not src.exists():
        print(f"Error: Input file '{input_file}' not found")
        sys.exit(1)

    processed = 0
    with open(src, "r") as fin, open(dst, "w") as fout:
        for raw_line in fin:
            if not raw_line.strip():
                continue  # skip empty lines

            record = json.loads(raw_line)
            # Drop training-irrelevant metadata if present.
            record.pop("meta", None)

            # Strip system-role turns from the conversation.
            if "messages" in record:
                record["messages"] = [
                    turn for turn in record["messages"] if turn.get("role") != "system"
                ]

            fout.write(json.dumps(record) + "\n")
            processed += 1

    print(f"✓ Processed {processed} records")
    print(f"✓ Output saved to: {dst}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Default paths
    input_train_file = "../data/intermediate/raft_train.jsonl"
    output_train_file = "../data/intermediate/raft_train_no_meta.jsonl"
    input_val_file = "../data/intermediate/raft_val.jsonl"
    output_val_file = "../data/intermediate/raft_val_no_meta.jsonl"

    # Allow command-line overrides.
    # BUG FIX: the previous version assigned sys.argv values to variables
    # that were never used, so CLI arguments were silently ignored and the
    # default train/val pair was always processed.
    if len(sys.argv) > 1:
        # Single-pair mode: process only the file given on the command line.
        input_file = sys.argv[1]
        if len(sys.argv) > 2:
            output_file = sys.argv[2]
        else:
            # Derive the output name next to the input: foo.jsonl -> foo_no_meta.jsonl
            in_path = Path(input_file)
            output_file = str(in_path.with_name(in_path.stem + "_no_meta.jsonl"))
        remove_meta_column(input_file, output_file)
    else:
        remove_meta_column(input_train_file, output_train_file)
        remove_meta_column(input_val_file, output_val_file)
|
||||
@@ -73,7 +73,6 @@ def main():
|
||||
max_length=args.max_seq_len,
|
||||
bf16=torch.cuda.is_available(),
|
||||
fp16=not torch.cuda.is_available(),
|
||||
assistant_only_loss=True, # only learn from assistant turns in messages
|
||||
report_to=[],
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user