RAFT test setup

Marvin Scham
2025-10-13 17:34:49 +02:00
parent bfff8f7d96
commit 0d1dc45ec0
10 changed files with 78717 additions and 72462 deletions


@@ -0,0 +1,417 @@
# Re-create the Jupyter Notebook for RAFT QLoRA fine-tuning and write it to ./raft_finetune_qlora.ipynb
import nbformat as nbf
from pathlib import Path
nb = nbf.v4.new_notebook()
nb.metadata.update(
{
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3",
},
"language_info": {"name": "python", "version": "3.x"},
}
)
cells = []
cells.append(
nbf.v4.new_markdown_cell(
"""
# RAFT Supervised Fine-Tuning (QLoRA) — Local Training
This notebook fine-tunes an open-source base model on a RAFT-style dataset (`input` → `output`) using **QLoRA** with **PEFT** and **Transformers**. It is designed to run locally (single or multi-GPU) and to export both **LoRA adapters** and (optionally) a **merged** model for inference.
> **Assumptions**
> - Your dataset lives at `./outputs/raft_dataset.jsonl` (from the previous notebook). Adjust the path if needed.
> - You have a CUDA-capable GPU and can install `bitsandbytes`. (CPU training is possible but slow.)
> - You have enough VRAM for the chosen base model when loaded in 4-bit NF4.
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 0) Install dependencies"))
cells.append(
nbf.v4.new_code_cell(
"""
# If needed, uncomment the following installs:
# %pip install --quiet transformers==4.44.2 datasets==2.20.0 peft==0.12.0 accelerate==0.34.2 bitsandbytes==0.43.3 evaluate==0.4.2 sentencepiece==0.2.0
# Optional extras:
# %pip install --quiet trl==0.9.6 sacrebleu==2.4.3 rouge-score==0.1.2
"""
)
)
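# Optional illustrative cell: a quick environment check for the GPU/VRAM assumptions
# stated in the intro. Safe to skip; nothing later depends on it.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Quick environment sanity check: report GPU availability and VRAM.
import torch
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"GPU {i}: {props.name}, {props.total_memory / 2**30:.1f} GiB")
else:
    print("No CUDA device detected; 4-bit QLoRA training will be slow or unavailable.")
"""
    )
)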
cells.append(nbf.v4.new_markdown_cell("## 1) Configuration"))
cells.append(
nbf.v4.new_code_cell(
"""
from pathlib import Path
# Paths
DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different
RUN_NAME = "raft_qlora_run"
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"
# Prefer an instruction-tuned base for better stability on SFT.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
# Tokenization/prompt formatting
SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request."
USE_CHAT_TEMPLATE = True # if the tokenizer has a chat template, we'll leverage it
# QLoRA/PEFT params
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = None # None = let PEFT auto-detect common modules (works for most models)
# 4-bit quantization (QLoRA)
LOAD_IN_4BIT = True
BNB_4BIT_COMPUTE_DTYPE = "bfloat16" # "float16" or "bfloat16"
BNB_4BIT_QUANT_TYPE = "nf4" # "nf4" or "fp4"
BNB_4BIT_USE_DOUBLE_QUANT = True
# Training
TRAIN_VAL_SPLIT = 0.98
MAX_SEQ_LEN = 2048
PER_DEVICE_TRAIN_BATCH = 1
PER_DEVICE_EVAL_BATCH = 1
GRADIENT_ACCUM_STEPS = 16
LEARNING_RATE = 2e-4
NUM_TRAIN_EPOCHS = 2
WEIGHT_DECAY = 0.0
WARMUP_RATIO = 0.03
LR_SCHEDULER_TYPE = "cosine"
LOGGING_STEPS = 10
EVAL_STEPS = 200
SAVE_STEPS = 200
BF16 = True
FP16 = False
SEED = 7
"""
)
)
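# Optional illustrative cell: a rough back-of-the-envelope figure for the quantized
# weight footprint. APPROX_PARAMS_B is an assumed parameter count for a ~7B base model.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Rough 4-bit weight footprint: NF4 stores ~0.5 bytes per parameter.
# This covers weights only, not activations, LoRA/optimizer state, or CUDA overhead.
APPROX_PARAMS_B = 7.2  # assumed parameter count in billions for a Mistral-7B-class model
approx_weight_gib = APPROX_PARAMS_B * 1e9 * 0.5 / 2**30
print(f"~{approx_weight_gib:.1f} GiB for 4-bit weights (plus activations and overhead)")
"""
    )
)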
cells.append(nbf.v4.new_markdown_cell("## 2) Load dataset (JSONL)"))
cells.append(
nbf.v4.new_code_cell(
"""
import json, random
from datasets import Dataset
def read_jsonl(p: Path):
rows = []
with p.open("r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
if "input" in obj and "output" in obj:
rows.append(obj)
except Exception:
pass
return rows
rows = read_jsonl(DATA_JSONL)
print(f"Loaded {len(rows)} rows from {DATA_JSONL}")
random.Random(SEED).shuffle(rows)
split = int(len(rows) * TRAIN_VAL_SPLIT)
train_rows = rows[:split]
val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows)//50):]
train_ds = Dataset.from_list(train_rows)
eval_ds = Dataset.from_list(val_rows) if val_rows else None
train_ds, eval_ds
"""
)
)
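# Optional illustrative cell: show one raw record so the expected `input`/`output`
# schema is visible before any prompt formatting is applied.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Peek at one raw RAFT record (truncated) to verify the schema.
if train_rows:
    example = train_rows[0]
    print("input :", str(example["input"])[:300])
    print("output:", str(example["output"])[:300])
"""
    )
)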
cells.append(nbf.v4.new_markdown_cell("## 3) Prompt formatting"))
cells.append(
nbf.v4.new_code_cell(
"""
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
def format_example(ex):
user = ex["input"]
assistant = ex["output"]
    if USE_CHAT_TEMPLATE and getattr(tokenizer, "chat_template", None):
messages = [
{"role": "system", "content": SYSTEM_PREFIX},
{"role": "user", "content": user},
{"role": "assistant", "content": assistant},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
else:
text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{user}\\n[/USER]\\n[ASSISTANT]\\n{assistant}</s>"
return {"text": text}
train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names)
eval_ds_fmt = eval_ds.map(format_example, remove_columns=eval_ds.column_names) if eval_ds else None
print(train_ds_fmt[0]["text"][:400])
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 4) Tokenize"))
cells.append(
nbf.v4.new_code_cell(
"""
def tokenize(batch):
return tokenizer(
batch["text"],
truncation=True,
max_length=MAX_SEQ_LEN,
padding="max_length",
return_tensors=None,
)
train_tok = train_ds_fmt.map(tokenize, batched=True, remove_columns=train_ds_fmt.column_names)
eval_tok = eval_ds_fmt.map(tokenize, batched=True, remove_columns=eval_ds_fmt.column_names) if eval_ds_fmt else None
# For causal LM, labels mirror input_ids; the data collator (mlm=False) masks padding to -100 when batching.
train_tok = train_tok.add_column("labels", train_tok["input_ids"])
if eval_tok:
    eval_tok = eval_tok.add_column("labels", eval_tok["input_ids"])
train_tok, (eval_tok[0]['input_ids'][:10] if eval_tok else [])
"""
)
)
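# Optional illustrative cell: token-length statistics on a sample of the formatted
# texts, to check how often MAX_SEQ_LEN truncates examples before a full training run.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Tokenize a sample without truncation and report length statistics.
sample_texts = train_ds_fmt["text"][:500]
lengths = [len(tokenizer(t, truncation=False)["input_ids"]) for t in sample_texts]
if lengths:
    over = sum(1 for n in lengths if n > MAX_SEQ_LEN)
    print(f"sampled={len(lengths)}  max={max(lengths)}  mean={sum(lengths)/len(lengths):.0f}  truncated={over}")
"""
    )
)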
cells.append(
nbf.v4.new_markdown_cell(
"## 5) Load base model with 4-bit quantization and prepare QLoRA"
)
)
cells.append(
nbf.v4.new_code_cell(
"""
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
bnb_config = None
if LOAD_IN_4BIT:
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=BNB_4BIT_USE_DOUBLE_QUANT,
bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
bnb_4bit_compute_dtype=getattr(torch, BNB_4BIT_COMPUTE_DTYPE)
)
model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
quantization_config=bnb_config,
torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
device_map="auto",
)
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
r=LORA_R,
lora_alpha=LORA_ALPHA,
lora_dropout=LORA_DROPOUT,
bias="none",
task_type="CAUSAL_LM",
target_modules=TARGET_MODULES,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"""
)
)
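# Optional illustrative cell: list the module names that actually received LoRA adapters.
# Helpful if you later want to set TARGET_MODULES explicitly instead of auto-detection.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Distinct module names carrying LoRA weights (resolved from TARGET_MODULES=None).
lora_targets = sorted({
    name.split(".")[-1]
    for name, module in model.named_modules()
    if hasattr(module, "lora_A")
})
print("LoRA-adapted modules:", lora_targets)
"""
    )
)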
cells.append(nbf.v4.new_markdown_cell("## 6) Train"))
cells.append(
nbf.v4.new_code_cell(
"""
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import math
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
args = TrainingArguments(
output_dir=str(OUTPUT_DIR),
run_name=RUN_NAME,
num_train_epochs=NUM_TRAIN_EPOCHS,
per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,
learning_rate=LEARNING_RATE,
lr_scheduler_type=LR_SCHEDULER_TYPE,
warmup_ratio=WARMUP_RATIO,
weight_decay=WEIGHT_DECAY,
logging_steps=LOGGING_STEPS,
    evaluation_strategy="steps" if eval_tok else "no",
eval_steps=EVAL_STEPS,
save_steps=SAVE_STEPS,
save_total_limit=2,
bf16=BF16,
fp16=FP16,
gradient_checkpointing=True,
report_to=["none"],
seed=SEED,
)
trainer = Trainer(
model=model,
tokenizer=tokenizer,
args=args,
train_dataset=train_tok,
eval_dataset=eval_tok,
data_collator=data_collator,
)
train_result = trainer.train()
metrics = trainer.evaluate() if eval_tok else {}
perplexity = math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None
metrics, perplexity
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 7) Save LoRA adapters"))
cells.append(
nbf.v4.new_code_cell(
"""
adapter_dir = OUTPUT_DIR / "lora_adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))
print(f"Saved LoRA adapter to: {adapter_dir}")
"""
)
)
cells.append(
nbf.v4.new_markdown_cell(
"## 8) (Optional) Merge adapters into base model and save full weights"
)
)
cells.append(
nbf.v4.new_code_cell(
"""
DO_MERGE = False # set True to produce a standalone merged model
if DO_MERGE:
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
device_map="auto",
)
merged = PeftModel.from_pretrained(base_model, str(adapter_dir)).merge_and_unload()
merged_dir = OUTPUT_DIR / "merged_model"
merged.save_pretrained(str(merged_dir))
tokenizer.save_pretrained(str(merged_dir))
print(f"Merged full model saved to: {merged_dir}")
else:
print("Skipping merge (set DO_MERGE=True to enable).")
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 9) Quick inference with the trained adapter"))
cells.append(
nbf.v4.new_code_cell(
"""
from peft import PeftModel
import torch
# Note: this loads a second 4-bit copy of the base model. If VRAM is tight, free the
# training objects first (e.g. `del trainer, model` then `torch.cuda.empty_cache()`).
test_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
quantization_config=bnb_config,
torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
device_map="auto",
)
test_model = PeftModel.from_pretrained(test_model, str(adapter_dir))
test_model.eval()
def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9):
    if USE_CHAT_TEMPLATE and getattr(tokenizer, "chat_template", None):
messages = [
{"role": "system", "content": SYSTEM_PREFIX},
{"role": "user", "content": prompt},
]
        model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True, return_dict=True).to(test_model.device)
else:
text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{prompt}\\n[/USER]\\n[ASSISTANT]\\n"
model_inputs = tokenizer([text], return_tensors="pt").to(test_model.device)
with torch.no_grad():
out = test_model.generate(
**model_inputs,
            do_sample=temperature > 0,  # fall back to greedy decoding when temperature == 0
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
)
return tokenizer.decode(out[0], skip_special_tokens=True)
sample_prompt = (train_rows[0]["input"] if len(train_rows)>0 else "What are the visitor crowd levels like?")
print(generate_answer(sample_prompt)[:800])
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 10) Light evaluation on the validation set"))
cells.append(
nbf.v4.new_code_cell(
"""
import evaluate
if eval_ds:
rouge = evaluate.load("rouge")
preds, refs = [], []
for ex in val_rows[:50]:
preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.0))
refs.append(ex["output"])
results = rouge.compute(predictions=preds, references=refs)
print(results)
else:
print("No eval split available; skipped.")
"""
)
)
cells.append(
nbf.v4.new_markdown_cell(
"""
## 11) (Optional) Use with other runtimes
- **Python Inference (PEFT)**: Load base model + adapter as shown in Section 9.
- **Merged model**: Set `DO_MERGE=True` to create a standalone model directory; you can then convert to other runtimes (e.g., llama.cpp GGUF) using their conversion tools.
- **Ollama**: If your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them. Need a concrete path? Tell me your base and target runtime and I'll add exact steps.
"""
)
)
nb["cells"] = cells
out_path = Path("./raft_finetune_qlora.ipynb")
with open(out_path, "w", encoding="utf-8") as f:
nbf.write(nb, f)
str(out_path)