mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
Cleanup
This commit is contained in:
@@ -2,11 +2,11 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
RAFT dataset builder (FAISS-based retrieval) -> Together.ai chat JSONL.
|
||||
RAFT dataset builder with FAISS-based retrieval.
|
||||
|
||||
Inputs (from your indexing script):
|
||||
- <index_dir>/faiss.index
|
||||
- <index_dir>/docstore.jsonl
|
||||
Inputs:
|
||||
- faiss.index
|
||||
- docstore.jsonl
|
||||
|
||||
Process:
|
||||
- Build a set of interview-style prompts (EN)
|
||||
@@ -20,9 +20,7 @@ Outputs:
|
||||
- raft_val.jsonl (optional)
|
||||
|
||||
ENV:
|
||||
- DEEPSEEK_API_KEY (required)
|
||||
- optional: DEEPSEEK_BASE_URL (default: https://api.deepseek.com)
|
||||
- optional: DEEPSEEK_MODEL (default: deepseek-chat)
|
||||
- DEEPSEEK_API_KEY
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -32,7 +30,7 @@ import random
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import faiss
|
||||
import numpy as np
|
||||
@@ -41,9 +39,6 @@ from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# DeepSeek client (OpenAI-compatible)
|
||||
# -----------------------------
|
||||
@dataclass
|
||||
class DeepSeekConfig:
|
||||
api_key: str
|
||||
@@ -89,9 +84,7 @@ class DeepSeekClient:
|
||||
last_err = e
|
||||
time.sleep(self.cfg.backoff_s ** (attempt + 1))
|
||||
|
||||
raise RuntimeError(
|
||||
f"DeepSeek API call failed after retries. Last error: {last_err}"
|
||||
)
|
||||
raise RuntimeError(f"DeepSeek API call failed. Last error: {last_err}")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
@@ -119,15 +112,13 @@ def read_docstore(docstore_path: str) -> Dict[int, Dict]:
|
||||
fid = int(obj["faiss_id"])
|
||||
mapping[fid] = obj
|
||||
if not mapping:
|
||||
raise ValueError("docstore.jsonl is empty or unreadable.")
|
||||
raise ValueError("docstore.jsonl is broken.")
|
||||
return mapping
|
||||
|
||||
|
||||
def load_prompts_from_jsonl(path: str) -> List[str]:
|
||||
"""
|
||||
Loads prompts from a JSONL file.
|
||||
Expected key: 'prompt' (preferred). Also accepts 'question' or 'text'.
|
||||
Ignores empty/short lines.
|
||||
"""
|
||||
prompts: List[str] = []
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
@@ -141,13 +132,13 @@ def load_prompts_from_jsonl(path: str) -> List[str]:
|
||||
if len(p) >= 20:
|
||||
prompts.append(p)
|
||||
if not prompts:
|
||||
raise ValueError(f"No prompts found in JSONL: {path}")
|
||||
raise ValueError(f"No prompts in JSONL: {path}")
|
||||
return prompts
|
||||
|
||||
|
||||
def load_prompts_from_txt(path: str) -> List[str]:
|
||||
"""
|
||||
Loads prompts from a TXT file (one prompt per line).
|
||||
Loads prompts from a TXT file (each line is a prompt).
|
||||
"""
|
||||
prompts: List[str] = []
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
@@ -156,7 +147,7 @@ def load_prompts_from_txt(path: str) -> List[str]:
|
||||
if len(p) >= 20:
|
||||
prompts.append(p)
|
||||
if not prompts:
|
||||
raise ValueError(f"No prompts found in TXT: {path}")
|
||||
raise ValueError(f"No prompts in TXT: {path}")
|
||||
return prompts
|
||||
|
||||
|
||||
@@ -173,9 +164,6 @@ def write_jsonl(path: str, rows: List[Dict]) -> None:
|
||||
f.write(json.dumps(r, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Persona + prompt templates (EN)
|
||||
# -----------------------------
|
||||
IMAGE_DIMS = [
|
||||
"Natural Attractions",
|
||||
"Atmosphere",
|
||||
@@ -184,36 +172,12 @@ IMAGE_DIMS = [
|
||||
"Value for Money",
|
||||
]
|
||||
|
||||
DEFAULT_PROMPTS_EN = [
|
||||
# Natural Attractions
|
||||
"In a lead user interview: what natural places in Bali felt genuinely memorable to you (rice terraces, volcanoes, waterfalls, coast), and why? Describe it like a lived experience.",
|
||||
"Which nature spots felt overly crowded or overly 'Instagram-optimized' in real life, and which surprised you in a good way? Explain with concrete moments.",
|
||||
# Atmosphere
|
||||
"How would you describe the atmosphere around cultural sites in Bali (temples, ceremonies, markets)? What signals authenticity vs. commercialization to you?",
|
||||
"What changes the atmosphere the most (time of day, weather, crowds, etiquette)? Share specific examples you would tell a marketer.",
|
||||
# Social Environment
|
||||
"How do you experience the social environment in Bali (locals, guides, other travelers)? What feels respectful and what feels performative or touristy?",
|
||||
"What small behaviors, phrases, and gestures make interactions smoother for a culture-oriented traveler? Give examples.",
|
||||
# Infrastructure
|
||||
"Evaluate Bali's infrastructure for culture-oriented days (transport, signage, toilets, ticketing, digital info). What works, what annoys you, and how do you adapt?",
|
||||
"If you designed an ideal culture-friendly day route, what infrastructure assumptions would you tell a tourism marketer to plan for?",
|
||||
# Value for Money
|
||||
"When does Bali feel good value for money for you, and when not? Discuss entrance fees, guides, food, tours, and hidden costs.",
|
||||
"How do you personally distinguish 'good value' from a tourist trap? List criteria and illustrate with examples.",
|
||||
]
|
||||
|
||||
|
||||
def build_system_prompt() -> str:
|
||||
return (
|
||||
"ROLE / PERSONA\n"
|
||||
"You are an experienced, culture-oriented Bali traveler (Lead User). You speak in natural, vivid English, "
|
||||
"as a real person in an interview. You share nuanced judgments, trade-offs, and concrete scenes.\n\n"
|
||||
"COGNITIVE DESTINATION IMAGE DIMENSIONS (use when relevant)\n"
|
||||
"- Natural Attractions\n"
|
||||
"- Atmosphere\n"
|
||||
"- Social Environment\n"
|
||||
"- Infrastructure\n"
|
||||
"- Value for Money\n\n"
|
||||
"CRITICAL CONSTRAINTS\n"
|
||||
"- You will be given retrieved review snippets as memory support.\n"
|
||||
"- Do NOT quote them verbatim and do NOT cite them as 'the review says'.\n"
|
||||
@@ -382,7 +346,8 @@ def main():
|
||||
elif args.prompts_txt:
|
||||
prompts = load_prompts_from_txt(args.prompts_txt)
|
||||
else:
|
||||
prompts = list(DEFAULT_PROMPTS_EN)
|
||||
print("Provide a prompt source with --prompts_jsonl or --prompts_txt.")
|
||||
exit(1)
|
||||
|
||||
if args.shuffle_prompts:
|
||||
random.shuffle(prompts)
|
||||
|
||||
Reference in New Issue
Block a user