Cleanup

2026-03-22 00:12:42 +01:00 · 2026-02-24 23:39:14 +01:00
parent a2967767d3
commit 0d2807f59f
15 changed files with 648 additions and 10319 deletions
--- a/raft/make_raft_data.py
+++ b/raft/make_raft_data.py
@@ -2,11 +2,11 @@
 # -*- coding: utf-8 -*-

 """
-RAFT dataset builder (FAISS-based retrieval) -> Together.ai chat JSONL.
+RAFT dataset builder with FAISS-based retrieval.

-Inputs (from your indexing script):
- <index_dir>/faiss.index
- <index_dir>/docstore.jsonl
+Inputs:
+- faiss.index
+- docstore.jsonl

 Process:
 - Build a set of interview-style prompts (EN)
@@ -20,9 +20,7 @@ Outputs:
 - raft_val.jsonl (optional)

 ENV:
- DEEPSEEK_API_KEY (required)
- optional: DEEPSEEK_BASE_URL (default: https://api.deepseek.com)
- optional: DEEPSEEK_MODEL (default: deepseek-chat)
+- DEEPSEEK_API_KEY
 """

 import argparse
@@ -32,7 +30,7 @@ import random
 import re
 import time
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Tuple

 import faiss
 import numpy as np
@@ -41,9 +39,6 @@ from sentence_transformers import SentenceTransformer
 from tqdm import tqdm


-# -----------------------------
-# DeepSeek client (OpenAI-compatible)
-# -----------------------------
 @dataclass
 class DeepSeekConfig:
    api_key: str
@@ -89,9 +84,7 @@ class DeepSeekClient:
                last_err = e
                time.sleep(self.cfg.backoff_s ** (attempt + 1))

-        raise RuntimeError(
-            f"DeepSeek API call failed after retries. Last error: {last_err}"
-        )
+        raise RuntimeError(f"DeepSeek API call failed. Last error: {last_err}")


 # -----------------------------
@@ -119,15 +112,13 @@ def read_docstore(docstore_path: str) -> Dict[int, Dict]:
            fid = int(obj["faiss_id"])
            mapping[fid] = obj
    if not mapping:
-        raise ValueError("docstore.jsonl is empty or unreadable.")
+        raise ValueError("docstore.jsonl is broken.")
    return mapping


 def load_prompts_from_jsonl(path: str) -> List[str]:
    """
    Loads prompts from a JSONL file.
-    Expected key: 'prompt' (preferred). Also accepts 'question' or 'text'.
-    Ignores empty/short lines.
    """
    prompts: List[str] = []
    with open(path, "r", encoding="utf-8") as f:
@@ -141,13 +132,13 @@ def load_prompts_from_jsonl(path: str) -> List[str]:
            if len(p) >= 20:
                prompts.append(p)
    if not prompts:
-        raise ValueError(f"No prompts found in JSONL: {path}")
+        raise ValueError(f"No prompts in JSONL: {path}")
    return prompts


 def load_prompts_from_txt(path: str) -> List[str]:
    """
-    Loads prompts from a TXT file (one prompt per line).
+    Loads prompts from a TXT file (each line is a prompt).
    """
    prompts: List[str] = []
    with open(path, "r", encoding="utf-8") as f:
@@ -156,7 +147,7 @@ def load_prompts_from_txt(path: str) -> List[str]:
            if len(p) >= 20:
                prompts.append(p)
    if not prompts:
-        raise ValueError(f"No prompts found in TXT: {path}")
+        raise ValueError(f"No prompts in TXT: {path}")
    return prompts


@@ -173,9 +164,6 @@ def write_jsonl(path: str, rows: List[Dict]) -> None:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


-# -----------------------------
-# Persona + prompt templates (EN)
-# -----------------------------
 IMAGE_DIMS = [
    "Natural Attractions",
    "Atmosphere",
@@ -184,36 +172,12 @@ IMAGE_DIMS = [
    "Value for Money",
 ]

-DEFAULT_PROMPTS_EN = [
-    # Natural Attractions
-    "In a lead user interview: what natural places in Bali felt genuinely memorable to you (rice terraces, volcanoes, waterfalls, coast), and why? Describe it like a lived experience.",
-    "Which nature spots felt overly crowded or overly 'Instagram-optimized' in real life, and which surprised you in a good way? Explain with concrete moments.",
-    # Atmosphere
-    "How would you describe the atmosphere around cultural sites in Bali (temples, ceremonies, markets)? What signals authenticity vs. commercialization to you?",
-    "What changes the atmosphere the most (time of day, weather, crowds, etiquette)? Share specific examples you would tell a marketer.",
-    # Social Environment
-    "How do you experience the social environment in Bali (locals, guides, other travelers)? What feels respectful and what feels performative or touristy?",
-    "What small behaviors, phrases, and gestures make interactions smoother for a culture-oriented traveler? Give examples.",
-    # Infrastructure
-    "Evaluate Bali's infrastructure for culture-oriented days (transport, signage, toilets, ticketing, digital info). What works, what annoys you, and how do you adapt?",
-    "If you designed an ideal culture-friendly day route, what infrastructure assumptions would you tell a tourism marketer to plan for?",
-    # Value for Money
-    "When does Bali feel good value for money for you, and when not? Discuss entrance fees, guides, food, tours, and hidden costs.",
-    "How do you personally distinguish 'good value' from a tourist trap? List criteria and illustrate with examples.",
-]
-

 def build_system_prompt() -> str:
    return (
        "ROLE / PERSONA\n"
        "You are an experienced, culture-oriented Bali traveler (Lead User). You speak in natural, vivid English, "
        "as a real person in an interview. You share nuanced judgments, trade-offs, and concrete scenes.\n\n"
-        "COGNITIVE DESTINATION IMAGE DIMENSIONS (use when relevant)\n"
-        "- Natural Attractions\n"
-        "- Atmosphere\n"
-        "- Social Environment\n"
-        "- Infrastructure\n"
-        "- Value for Money\n\n"
        "CRITICAL CONSTRAINTS\n"
        "- You will be given retrieved review snippets as memory support.\n"
        "- Do NOT quote them verbatim and do NOT cite them as 'the review says'.\n"
@@ -382,7 +346,8 @@ def main():
    elif args.prompts_txt:
        prompts = load_prompts_from_txt(args.prompts_txt)
    else:
-        prompts = list(DEFAULT_PROMPTS_EN)
+        print("Provide a prompt source with --prompts_jsonl or --prompts_txt.")
+        exit(1)

    if args.shuffle_prompts:
        random.shuffle(prompts)