mirror of https://github.com/marvinscham/masterthesis-playground.git (synced 2025-12-06 18:20:53 +01:00)
RAFT test setup
raft/create_raft_tuning_notebook.py (new file, 417 lines)
@@ -0,0 +1,417 @@
# Re-create the Jupyter notebook for RAFT QLoRA fine-tuning and save it as ./raft_finetune_qlora.ipynb

import nbformat as nbf
from pathlib import Path

nb = nbf.v4.new_notebook()
nb.metadata.update(
    {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        },
        "language_info": {"name": "python", "version": "3.x"},
    }
)

cells = []

cells.append(
    nbf.v4.new_markdown_cell(
        """
# RAFT Supervised Fine-Tuning (QLoRA) — Local Training

This notebook fine-tunes an open-source base model on a RAFT-style dataset (`input` → `output`) using **QLoRA** with **PEFT** and **Transformers**. It is designed to run locally (single or multi-GPU) and to export both **LoRA adapters** and (optionally) a **merged** model for inference.

> **Assumptions**
> - Your dataset lives at `./outputs/raft_dataset.jsonl` (from the previous notebook). Adjust the path if needed.
> - You have a CUDA-capable GPU and can install `bitsandbytes`. (CPU training is possible but slow.)
> - You have enough VRAM for the chosen base model when loaded in 4-bit NF4.
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 0) Install dependencies"))
cells.append(
    nbf.v4.new_code_cell(
        """
# If needed, uncomment the following installs:
# %pip install --quiet transformers==4.44.2 datasets==2.20.0 peft==0.12.0 accelerate==0.34.2 bitsandbytes==0.43.3 evaluate==0.4.2 sentencepiece==0.2.0
# Optional extras:
# %pip install --quiet trl==0.9.6 sacrebleu==2.4.3 rouge-score==0.1.2
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 1) Configuration"))
cells.append(
    nbf.v4.new_code_cell(
        """
from pathlib import Path

# Paths
DATA_JSONL = Path("./outputs/raft_dataset.jsonl")  # change if different
RUN_NAME = "raft_qlora_run"
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"
# Prefer an instruction-tuned base for better stability on SFT.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

# Tokenization/prompt formatting
SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request."
USE_CHAT_TEMPLATE = True  # if the tokenizer has a chat template, we'll leverage it

# QLoRA/PEFT params
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = None  # None = let PEFT auto-detect common modules (works for most models)
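# Example (assumption, not used by default): if PEFT's auto-detection does not
# recognise your base model, name the projection modules explicitly. These are the
# usual attention/MLP projections for Llama/Mistral-style architectures:
EXAMPLE_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
# To use them, set TARGET_MODULES = EXAMPLE_TARGET_MODULES.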

# 4-bit quantization (QLoRA)
LOAD_IN_4BIT = True
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"  # "float16" or "bfloat16"
BNB_4BIT_QUANT_TYPE = "nf4"  # "nf4" or "fp4"
BNB_4BIT_USE_DOUBLE_QUANT = True

# Training
TRAIN_VAL_SPLIT = 0.98
MAX_SEQ_LEN = 2048
PER_DEVICE_TRAIN_BATCH = 1
PER_DEVICE_EVAL_BATCH = 1
GRADIENT_ACCUM_STEPS = 16
LEARNING_RATE = 2e-4
NUM_TRAIN_EPOCHS = 2
WEIGHT_DECAY = 0.0
WARMUP_RATIO = 0.03
LR_SCHEDULER_TYPE = "cosine"
LOGGING_STEPS = 10
EVAL_STEPS = 200
SAVE_STEPS = 200
BF16 = True
FP16 = False

SEED = 7
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 2) Load dataset (JSONL)"))
cells.append(
    nbf.v4.new_code_cell(
        """
import json, random
from datasets import Dataset

def read_jsonl(p: Path):
    rows = []
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                if "input" in obj and "output" in obj:
                    rows.append(obj)
            except Exception:
                pass
    return rows

rows = read_jsonl(DATA_JSONL)
print(f"Loaded {len(rows)} rows from {DATA_JSONL}")

random.Random(SEED).shuffle(rows)
split = int(len(rows) * TRAIN_VAL_SPLIT)
train_rows = rows[:split]
val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows) // 50):]

train_ds = Dataset.from_list(train_rows)
eval_ds = Dataset.from_list(val_rows) if val_rows else None
train_ds, eval_ds
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 3) Prompt formatting"))
cells.append(
    nbf.v4.new_code_cell(
        """
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_example(ex):
    user = ex["input"]
    assistant = ex["output"]

    if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": SYSTEM_PREFIX},
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant},
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    else:
        text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{user}\\n[/USER]\\n[ASSISTANT]\\n{assistant}</s>"
    return {"text": text}

train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names)
eval_ds_fmt = eval_ds.map(format_example, remove_columns=eval_ds.column_names) if eval_ds else None

print(train_ds_fmt[0]["text"][:400])
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 4) Tokenize"))
cells.append(
    nbf.v4.new_code_cell(
        """
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        return_tensors=None,
    )

train_tok = train_ds_fmt.map(tokenize, batched=True, remove_columns=train_ds_fmt.column_names)
eval_tok = eval_ds_fmt.map(tokenize, batched=True, remove_columns=eval_ds_fmt.column_names) if eval_ds_fmt else None

# Labels mirror input_ids: for causal-LM SFT the whole sequence (prompt + answer)
# contributes to the loss. The mlm=False collator used for training re-derives labels
# at batch time and masks padding tokens to -100.
train_tok = train_tok.add_column("labels", train_tok["input_ids"])
if eval_tok:
    eval_tok = eval_tok.add_column("labels", eval_tok["input_ids"])

train_tok, (eval_tok[0]['input_ids'][:10] if eval_tok else [])
"""
    )
)

cells.append(
    nbf.v4.new_markdown_cell(
        "## 5) Load base model with 4-bit quantization and prepare QLoRA"
    )
)
cells.append(
    nbf.v4.new_code_cell(
        """
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = None
if LOAD_IN_4BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=BNB_4BIT_USE_DOUBLE_QUANT,
        bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
        bnb_4bit_compute_dtype=getattr(torch, BNB_4BIT_COMPUTE_DTYPE),
    )

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
    device_map="auto",
)

model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
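
# Optional sanity check (illustrative addition): report GPU memory in use after the
# 4-bit load, to confirm the model fits before training starts.
if torch.cuda.is_available():
    print(f"CUDA memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")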
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 6) Train"))
cells.append(
    nbf.v4.new_code_cell(
        """
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import math

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
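
# Effective batch size sketch (illustrative addition): per-device batch * gradient
# accumulation * number of devices. With the defaults above this is 16 on one GPU.
effective_batch = PER_DEVICE_TRAIN_BATCH * GRADIENT_ACCUM_STEPS * max(1, torch.cuda.device_count())
print(f"Effective train batch size: {effective_batch}")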

args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    run_name=RUN_NAME,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    logging_steps=LOGGING_STEPS,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    bf16=BF16,
    fp16=FP16,
    gradient_checkpointing=True,
    report_to=["none"],
    seed=SEED,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
)

train_result = trainer.train()
metrics = trainer.evaluate() if eval_tok else {}
perplexity = math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None
metrics, perplexity
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 7) Save LoRA adapters"))
cells.append(
    nbf.v4.new_code_cell(
        """
adapter_dir = OUTPUT_DIR / "lora_adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)

model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))

print(f"Saved LoRA adapter to: {adapter_dir}")
"""
    )
)

cells.append(
    nbf.v4.new_markdown_cell(
        "## 8) (Optional) Merge adapters into base model and save full weights"
    )
)
cells.append(
    nbf.v4.new_code_cell(
        """
DO_MERGE = False  # set True to produce a standalone merged model

if DO_MERGE:
    from peft import PeftModel
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
        device_map="auto",
    )
    merged = PeftModel.from_pretrained(base_model, str(adapter_dir)).merge_and_unload()
    merged_dir = OUTPUT_DIR / "merged_model"
    merged.save_pretrained(str(merged_dir))
    tokenizer.save_pretrained(str(merged_dir))
    print(f"Merged full model saved to: {merged_dir}")
else:
    print("Skipping merge (set DO_MERGE=True to enable).")
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 9) Quick inference with the trained adapter"))
cells.append(
    nbf.v4.new_code_cell(
        """
from peft import PeftModel
import torch

test_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
    device_map="auto",
)
test_model = PeftModel.from_pretrained(test_model, str(adapter_dir))
test_model.eval()

def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9):
    if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": SYSTEM_PREFIX},
            {"role": "user", "content": prompt},
        ]
        # return_dict=True yields a mapping (input_ids + attention_mask) that can be unpacked into generate()
        model_inputs = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
        ).to(test_model.device)
    else:
        text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{prompt}\\n[/USER]\\n[ASSISTANT]\\n"
        model_inputs = tokenizer([text], return_tensors="pt").to(test_model.device)

    # Greedy decoding when temperature is 0 avoids the sampler's strictly-positive temperature check.
    do_sample = temperature is not None and temperature > 0
    with torch.no_grad():
        out = test_model.generate(
            **model_inputs,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            temperature=temperature if do_sample else None,
            top_p=top_p if do_sample else None,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

sample_prompt = (train_rows[0]["input"] if len(train_rows) > 0 else "What are the visitor crowd levels like?")
print(generate_answer(sample_prompt)[:800])
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 10) Light evaluation on the validation set"))
cells.append(
    nbf.v4.new_code_cell(
        """
import evaluate

if eval_ds:
    rouge = evaluate.load("rouge")
    preds, refs = [], []
    for ex in val_rows[:50]:
        preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.0))
        refs.append(ex["output"])
    results = rouge.compute(predictions=preds, references=refs)
    print(results)
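
    # Persist the sampled predictions for manual inspection (illustrative addition;
    # the output file name is an assumption).
    with (OUTPUT_DIR / "eval_samples.jsonl").open("w", encoding="utf-8") as f:
        for ex, pred in zip(val_rows[:50], preds):
            f.write(json.dumps({"input": ex["input"], "reference": ex["output"], "prediction": pred}, ensure_ascii=False) + "\\n")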
else:
    print("No eval split available; skipped.")
"""
    )
)

cells.append(
    nbf.v4.new_markdown_cell(
        """
## 11) (Optional) Use with other runtimes

- **Python Inference (PEFT)**: Load base model + adapter as shown in Section 9.
- **Merged model**: Set `DO_MERGE=True` to create a standalone model directory; you can then convert to other runtimes (e.g., llama.cpp GGUF) using their conversion tools.
- **Ollama**: If your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them.
"""
    )
)

nb["cells"] = cells

out_path = Path("./raft_finetune_qlora.ipynb")
with open(out_path, "w", encoding="utf-8") as f:
    nbf.write(nb, f)

str(out_path)