# masterthesis-playground/raft/create_raft_tuning_notebook.py
# Re-create the Jupyter notebook for RAFT QLoRA fine-tuning and save it next to this script.
import nbformat as nbf
from pathlib import Path
nb = nbf.v4.new_notebook()
nb.metadata.update(
    {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        },
        "language_info": {"name": "python", "version": "3.x"},
    }
)
cells = []
cells.append(
    nbf.v4.new_markdown_cell(
        """
# RAFT Supervised Fine-Tuning (QLoRA) — Local Training
This notebook fine-tunes an open-source base model on a RAFT-style dataset (`input` → `output`) using **QLoRA** with **PEFT** and **Transformers**. It is designed to run locally (single or multi-GPU) and to export both **LoRA adapters** and (optionally) a **merged** model for inference.
> **Assumptions**
> - Your dataset lives at `./outputs/raft_dataset.jsonl` (from the previous notebook). Adjust the path if needed.
> - You have a CUDA-capable GPU and can install `bitsandbytes`. (CPU training is possible but slow.)
> - You have enough VRAM for the chosen base model when loaded in 4-bit NF4.
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 0) Install dependencies"))
cells.append(
    nbf.v4.new_code_cell(
        """
# If needed, uncomment the following installs:
# %pip install --quiet transformers==4.44.2 datasets==2.20.0 peft==0.12.0 accelerate==0.34.2 bitsandbytes==0.43.3 evaluate==0.4.2 sentencepiece==0.2.0
# Optional extras:
# %pip install --quiet trl==0.9.6 sacrebleu==2.4.3 rouge-score==0.1.2
"""
)
)
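# Optional: append a quick GPU sanity-check cell; the QLoRA path below assumes
# a CUDA device for bitsandbytes (hedged sketch, harmless to skip).
cells.append(
    nbf.v4.new_code_cell(
        """
# Optional sanity check: 4-bit QLoRA needs a CUDA-capable GPU.
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"Device: {props.name}, VRAM: {props.total_memory / 1e9:.1f} GB")
"""
    )
)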
cells.append(nbf.v4.new_markdown_cell("## 1) Configuration"))
cells.append(
    nbf.v4.new_code_cell(
        """
from pathlib import Path
# Paths
DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different
RUN_NAME = "raft_qlora_run"
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"
# Prefer an instruction-tuned base for better stability on SFT.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
# Tokenization/prompt formatting
SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request."
USE_CHAT_TEMPLATE = True # if the tokenizer has a chat template, we'll leverage it
# QLoRA/PEFT params
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = None # None = let PEFT auto-detect common modules (works for most models)
# 4-bit quantization (QLoRA)
LOAD_IN_4BIT = True
BNB_4BIT_COMPUTE_DTYPE = "bfloat16" # "float16" or "bfloat16"
BNB_4BIT_QUANT_TYPE = "nf4" # "nf4" or "fp4"
BNB_4BIT_USE_DOUBLE_QUANT = True
# Training
TRAIN_VAL_SPLIT = 0.98
MAX_SEQ_LEN = 2048
PER_DEVICE_TRAIN_BATCH = 1
PER_DEVICE_EVAL_BATCH = 1
GRADIENT_ACCUM_STEPS = 16
LEARNING_RATE = 2e-4
NUM_TRAIN_EPOCHS = 2
WEIGHT_DECAY = 0.0
WARMUP_RATIO = 0.03
LR_SCHEDULER_TYPE = "cosine"
LOGGING_STEPS = 10
EVAL_STEPS = 200
SAVE_STEPS = 200
BF16 = True
FP16 = False
SEED = 7
"""
)
)
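# Optional: a small cell reporting the effective batch size implied by the
# config above (sketch assumes a single process; multiply by world size for DDP).
cells.append(
    nbf.v4.new_code_cell(
        """
# Examples seen per optimizer step = micro-batch x gradient accumulation.
effective_batch = PER_DEVICE_TRAIN_BATCH * GRADIENT_ACCUM_STEPS
print(f"Effective batch size (single process): {effective_batch}")
"""
    )
)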
cells.append(nbf.v4.new_markdown_cell("## 2) Load dataset (JSONL)"))
cells.append(
    nbf.v4.new_code_cell(
        """
import json, random
from datasets import Dataset

def read_jsonl(p: Path):
    rows = []
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                if "input" in obj and "output" in obj:
                    rows.append(obj)
            except json.JSONDecodeError:
                pass  # skip malformed lines
    return rows

rows = read_jsonl(DATA_JSONL)
print(f"Loaded {len(rows)} rows from {DATA_JSONL}")
random.Random(SEED).shuffle(rows)
split = int(len(rows) * TRAIN_VAL_SPLIT)
train_rows = rows[:split]
# Fallback: reuse the tail of the data (overlaps train) so eval is never empty.
val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows) // 50):]
train_ds = Dataset.from_list(train_rows)
eval_ds = Dataset.from_list(val_rows) if val_rows else None
train_ds, eval_ds
"""
    )
)
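# Optional: peek at one raw row so the expected {"input": ..., "output": ...}
# schema is visible before formatting (sketch; assumes a non-empty dataset).
cells.append(
    nbf.v4.new_code_cell(
        """
# Inspect one raw example to confirm the input/output schema.
if train_rows:
    print("input :", train_rows[0]["input"][:200])
    print("output:", train_rows[0]["output"][:200])
"""
    )
)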
cells.append(nbf.v4.new_markdown_cell("## 3) Prompt formatting"))
cells.append(
    nbf.v4.new_code_cell(
        """
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_example(ex):
    user = ex["input"]
    assistant = ex["output"]
    if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
        # Note: some chat templates reject a "system" role; fold SYSTEM_PREFIX
        # into the user turn if this raises.
        messages = [
            {"role": "system", "content": SYSTEM_PREFIX},
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant},
        ]
        # add_generation_prompt=False: the assistant turn is already present for SFT.
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    else:
        text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{user}\\n[/USER]\\n[ASSISTANT]\\n{assistant}</s>"
    return {"text": text}

train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names)
eval_ds_fmt = eval_ds.map(format_example, remove_columns=eval_ds.column_names) if eval_ds else None
print(train_ds_fmt[0]["text"][:400])
"""
    )
)
cells.append(nbf.v4.new_markdown_cell("## 4) Tokenize"))
cells.append(
    nbf.v4.new_code_cell(
        """
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        return_tensors=None,
    )

train_tok = train_ds_fmt.map(tokenize, batched=True, remove_columns=train_ds_fmt.column_names)
eval_tok = eval_ds_fmt.map(tokenize, batched=True, remove_columns=eval_ds_fmt.column_names) if eval_ds_fmt else None
# Labels are built by DataCollatorForLanguageModeling in the training cell
# (input_ids with padding masked to -100), so no explicit "labels" column is
# added here.
train_tok, (eval_tok[0]["input_ids"][:10] if eval_tok else [])
"""
    )
)
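# Optional: report the padding share of a few tokenized rows, a quick hint that
# MAX_SEQ_LEN could be lowered (sketch; eos counts as padding when they share an id).
cells.append(
    nbf.v4.new_code_cell(
        """
# Rough padding fraction of the first few examples (pad may equal eos here).
pad_id = tokenizer.pad_token_id
for ids in train_tok["input_ids"][:3]:
    pad_frac = sum(1 for t in ids if t == pad_id) / len(ids)
    print(f"len={len(ids)}  pad_fraction={pad_frac:.2%}")
"""
    )
)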
cells.append(
    nbf.v4.new_markdown_cell(
        "## 5) Load base model with 4-bit quantization and prepare QLoRA"
    )
)
cells.append(
    nbf.v4.new_code_cell(
        """
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = None
if LOAD_IN_4BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=BNB_4BIT_USE_DOUBLE_QUANT,
        bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
        bnb_4bit_compute_dtype=getattr(torch, BNB_4BIT_COMPUTE_DTYPE),
    )

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"""
    )
)
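# Optional: a note on setting LoRA targets explicitly when auto-detection fails
# (module names below are the usual Llama/Mistral-style projections; verify
# against your base model).
cells.append(
    nbf.v4.new_markdown_cell(
        """
> **Note:** If PEFT cannot infer LoRA targets for your base model, set them explicitly in the configuration cell, e.g. `TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]` for Llama/Mistral-style architectures.
"""
    )
)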
cells.append(nbf.v4.new_markdown_cell("## 6) Train"))
cells.append(
    nbf.v4.new_code_cell(
        """
import math
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Causal LM collation: labels are input_ids with padding masked to -100.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    run_name=RUN_NAME,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    logging_steps=LOGGING_STEPS,
    evaluation_strategy="steps" if eval_tok else "no",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    bf16=BF16,
    fp16=FP16,
    gradient_checkpointing=True,
    report_to=["none"],
    seed=SEED,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
)
train_result = trainer.train()
metrics = trainer.evaluate() if eval_tok else {}
perplexity = math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None
metrics, perplexity
"""
    )
)
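# Optional: a reminder cell on resuming interrupted runs from the checkpoints
# that save_steps writes into OUTPUT_DIR.
cells.append(
    nbf.v4.new_code_cell(
        """
# To resume an interrupted run from the latest checkpoint in OUTPUT_DIR:
# trainer.train(resume_from_checkpoint=True)
"""
    )
)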
cells.append(nbf.v4.new_markdown_cell("## 7) Save LoRA adapters"))
cells.append(
    nbf.v4.new_code_cell(
        """
adapter_dir = OUTPUT_DIR / "lora_adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))
print(f"Saved LoRA adapter to: {adapter_dir}")
"""
)
)
cells.append(
    nbf.v4.new_markdown_cell(
        "## 8) (Optional) Merge adapters into base model and save full weights"
    )
)
cells.append(
    nbf.v4.new_code_cell(
        """
DO_MERGE = False  # set True to produce a standalone merged model
if DO_MERGE:
    from peft import PeftModel
    # Reload the base model in full precision (not 4-bit) before merging.
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
        device_map="auto",
    )
    merged = PeftModel.from_pretrained(base_model, str(adapter_dir)).merge_and_unload()
    merged_dir = OUTPUT_DIR / "merged_model"
    merged.save_pretrained(str(merged_dir))
    tokenizer.save_pretrained(str(merged_dir))
    print(f"Merged full model saved to: {merged_dir}")
else:
    print("Skipping merge (set DO_MERGE=True to enable).")
"""
    )
)
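# Optional: a note that the merged directory is a plain HF checkpoint, usable
# without PEFT at inference time.
cells.append(
    nbf.v4.new_markdown_cell(
        """
> A merged directory loads like any regular Hugging Face checkpoint, no PEFT required: `AutoModelForCausalLM.from_pretrained("<merged_dir>")`.
"""
    )
)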
cells.append(nbf.v4.new_markdown_cell("## 9) Quick inference with the trained adapter"))
cells.append(
    nbf.v4.new_code_cell(
        """
from peft import PeftModel
import torch

test_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
    device_map="auto",
)
test_model = PeftModel.from_pretrained(test_model, str(adapter_dir))
test_model.eval()

def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9):
    if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": SYSTEM_PREFIX},
            {"role": "user", "content": prompt},
        ]
        # return_dict=True yields input_ids + attention_mask for generate(**...).
        model_inputs = tokenizer.apply_chat_template(
            messages, return_tensors="pt", add_generation_prompt=True, return_dict=True
        ).to(test_model.device)
    else:
        text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{prompt}\\n[/USER]\\n[ASSISTANT]\\n"
        model_inputs = tokenizer([text], return_tensors="pt").to(test_model.device)
    do_sample = temperature > 0  # temperature 0 means greedy decoding
    with torch.no_grad():
        out = test_model.generate(
            **model_inputs,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            temperature=temperature if do_sample else None,
            top_p=top_p if do_sample else None,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

sample_prompt = train_rows[0]["input"] if len(train_rows) > 0 else "What are the visitor crowd levels like?"
print(generate_answer(sample_prompt)[:800])
"""
    )
)
cells.append(nbf.v4.new_markdown_cell("## 10) Light evaluation on the validation set"))
cells.append(
    nbf.v4.new_code_cell(
        """
import evaluate

if eval_ds:
    rouge = evaluate.load("rouge")
    preds, refs = [], []
    for ex in val_rows[:50]:
        # temperature=0.0 -> greedy decoding for reproducible scores
        preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.0))
        refs.append(ex["output"])
    results = rouge.compute(predictions=preds, references=refs)
    print(results)
else:
    print("No eval split available; skipped.")
"""
    )
)
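# Optional: a normalized exact-match rate next to ROUGE; reuses the preds/refs
# lists from the cell above (sketch; whitespace/case-normalized comparison only).
cells.append(
    nbf.v4.new_code_cell(
        """
# Simple exact-match rate on the same predictions (case/whitespace normalized).
if eval_ds and preds:
    em = sum(p.strip().lower() == r.strip().lower() for p, r in zip(preds, refs)) / len(preds)
    print(f"Exact match: {em:.2%}")
"""
    )
)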
cells.append(
    nbf.v4.new_markdown_cell(
        """
## 11) (Optional) Use with other runtimes
- **Python inference (PEFT)**: load the base model plus adapter as shown in Section 9.
- **Merged model**: set `DO_MERGE=True` to create a standalone model directory; you can then convert it for other runtimes (e.g., llama.cpp GGUF) using their conversion tools.
- **Ollama**: if your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them; the exact steps depend on the base model and target runtime.
"""
    )
)
nb["cells"] = cells
out_path = Path("./raft_finetune_qlora.ipynb")
with open(out_path, "w", encoding="utf-8") as f:
    nbf.write(nb, f)
print(f"Notebook written to {out_path}")
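
# Usage (assumes Jupyter is installed):
#   python create_raft_tuning_notebook.py
#   jupyter lab raft_finetune_qlora.ipynb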