mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
Add helper stuff for figures, cleanup
This commit is contained in:
35
raft/README.md
Normal file
35
raft/README.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Retrieval-Augmented Finetuning (RAFT)
|
||||
|
||||
**Ablauf**:
|
||||
|
||||
## Vorbereiten des Retrieval-Corpus
|
||||
|
||||
```bash
|
||||
python prepare_corpus.py --input_tab ../data/intermediate/selected_topics_documents.csv --out_dir out
|
||||
```
|
||||
|
||||
## Erstellen des RAFT-Datensatzes
|
||||
|
||||
```bash
|
||||
python make_raft_data.py --out_dir out --n_examples 100
|
||||
```
|
||||
|
||||
## Training der QLoRA-Adapter
|
||||
|
||||
```bash
|
||||
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
|
||||
```
|
||||
|
||||
## Inferenz
|
||||
|
||||
### Per Baseline Mistral 7B + PEFT-Adapter
|
||||
|
||||
```bash
|
||||
python rag_chat.py --lora_dir out/mistral_balitwin_lora
|
||||
```
|
||||
|
||||
### Pre-Merged Modell + Adapter
|
||||
|
||||
```bash
|
||||
python rag_chat_merged.py --model_dir /path/to/model_folder --out_dir out
|
||||
```
|
||||
@@ -10,9 +10,6 @@ from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
## Usage: python make_raft_data.py --out_dir out --n_examples 5000
|
||||
|
||||
|
||||
SYSTEM_PERSONA = """You are 'BaliTwin', a culturally versed Bali traveler.
|
||||
You give your opinions and guidance with local etiquette and context.
|
||||
You avoid stereotypes. You explain local etiquette, customs, and context.
|
||||
|
||||
@@ -9,8 +9,6 @@ import pandas as pd
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
|
||||
## Usage: python prepare_corpus.py --input_tab your_reviews.tab --out_dir out
|
||||
|
||||
|
||||
def simple_clean(text: str) -> str:
|
||||
if not isinstance(text, str):
|
||||
|
||||
@@ -9,8 +9,6 @@ from peft import PeftModel
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
## Usage: python rag_chat.py --lora_dir out/mistral_balitwin_lora
|
||||
|
||||
SYSTEM_PERSONA = """You are 'BaliTwin', a culturally versed Bali traveler.
|
||||
You give your opinions and guidance with local etiquette and context.
|
||||
Use the provided CONTEXT; include 1-2 short quotes as evidence.
|
||||
|
||||
@@ -8,12 +8,21 @@ import torch
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
## Usage: python rag_chat_merged.py --model_dir /path/to/model_folder --out_dir out
|
||||
SYSTEM_PERSONA = """You are simulating a culturally interested Bali traveler segment for evaluation purposes.
|
||||
|
||||
SYSTEM_PERSONA = """You are 'BaliTwin', a culturally versed Bali traveler.
|
||||
You give your opinions and guidance with local etiquette and context.
|
||||
Use the provided CONTEXT; include 1-2 short quotes as evidence.
|
||||
If the context does not support the claim, say so.
|
||||
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.
|
||||
|
||||
When answering:
|
||||
- Prioritize cultural interpretation, atmosphere, and visitor ethics.
|
||||
- Weigh trade-offs thoughtfully (e.g., crowds vs. significance).
|
||||
- Avoid generic travel advice and avoid promotional language.
|
||||
- Do not exaggerate.
|
||||
- Provide nuanced, reflective reasoning rather than bullet lists.
|
||||
- Keep answers concise but specific.
|
||||
|
||||
Respond as if you are describing your genuine experience and judgment as this type of traveler.
|
||||
|
||||
If, and only if, the provided CONTEXT helps you answer the question, you may use the contained information for your answer.
|
||||
"""
|
||||
|
||||
|
||||
|
||||
@@ -8,8 +8,6 @@ from peft import LoraConfig
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
|
||||
## Usage: python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
|
||||
Reference in New Issue
Block a user