Add helper stuff for figures, cleanup

This commit is contained in:
2026-02-20 01:56:28 +01:00
parent 28823dc0b5
commit 101bd81ca1
20 changed files with 1862 additions and 1164 deletions

35
raft/README.md Normal file
View File

@@ -0,0 +1,35 @@
# Retrieval-Augmented Finetuning (RAFT)
**Ablauf**:
## Vorbereiten des Retrieval-Corpus
```bash
python prepare_corpus.py --input_tab ../data/intermediate/selected_topics_documents.csv --out_dir out
```
## Erstellen des RAFT-Datensatzes
```bash
python make_raft_data.py --out_dir out --n_examples 100
```
## Training der QLoRA-Adapter
```bash
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
```
## Inferenz
### Per Baseline Mistral 7B + PEFT-Adapter
```bash
python rag_chat.py --lora_dir out/mistral_balitwin_lora
```
### Pre-Merged Modell + Adapter
```bash
python rag_chat_merged.py --model_dir /path/to/model_folder --out_dir out
```

View File

@@ -10,9 +10,6 @@ from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
## Usage: python make_raft_data.py --out_dir out --n_examples 5000
SYSTEM_PERSONA = """You are 'BaliTwin', a culturally versed Bali traveler.
You give your opinions and guidance with local etiquette and context.
You avoid stereotypes. You explain local etiquette, customs, and context.

View File

@@ -9,8 +9,6 @@ import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
## Usage: python prepare_corpus.py --input_tab your_reviews.tab --out_dir out
def simple_clean(text: str) -> str:
if not isinstance(text, str):

View File

@@ -9,8 +9,6 @@ from peft import PeftModel
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
## Usage: python rag_chat.py --lora_dir out/mistral_balitwin_lora
SYSTEM_PERSONA = """You are 'BaliTwin', a culturally versed Bali traveler.
You give your opinions and guidance with local etiquette and context.
Use the provided CONTEXT; include 1-2 short quotes as evidence.

View File

@@ -8,12 +8,21 @@ import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
## Usage: python rag_chat_merged.py --model_dir /path/to/model_folder --out_dir out
SYSTEM_PERSONA = """You are simulating a culturally interested Bali traveler segment for evaluation purposes.
SYSTEM_PERSONA = """You are 'BaliTwin', a culturally versed Bali traveler.
You give your opinions and guidance with local etiquette and context.
Use the provided CONTEXT; include 1-2 short quotes as evidence.
If the context does not support the claim, say so.
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.
When answering:
- Prioritize cultural interpretation, atmosphere, and visitor ethics.
- Weigh trade-offs thoughtfully (e.g., crowds vs. significance).
- Avoid generic travel advice and avoid promotional language.
- Do not exaggerate.
- Provide nuanced, reflective reasoning rather than bullet lists.
- Keep answers concise but specific.
Respond as if you are describing your genuine experience and judgment as this type of traveler.
If, and only if, the provided CONTEXT helps you answer the question, you may use the contained information for your answer.
"""

View File

@@ -8,8 +8,6 @@ from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
## Usage: python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
def main():
ap = argparse.ArgumentParser()