# masterthesis-playground/raft/create_raft_tuning_notebook.py
# Re-create the Jupyter notebook for RAFT QLoRA fine-tuning and save it next to this script.
import nbformat as nbf
from pathlib import Path
nb = nbf.v4.new_notebook()
nb.metadata.update(
    {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        },
        "language_info": {"name": "python", "version": "3.x"},
    }
)
cells = []
cells.append(
    nbf.v4.new_markdown_cell(
        """
# RAFT Supervised Fine-Tuning (QLoRA) — Local Training
This notebook fine-tunes an open-source base model on a RAFT-style dataset (`input` → `output`) using **QLoRA** with **PEFT** and **Transformers**. It is designed to run locally (single or multi-GPU) and to export both **LoRA adapters** and (optionally) a **merged** model for inference.
> **Assumptions**
> - Your dataset lives at `./outputs/raft_dataset.jsonl` (from the previous notebook). Adjust the path if needed.
> - You have a CUDA-capable GPU and can install `bitsandbytes`. (CPU training is possible but slow.)
> - You have enough VRAM for the chosen base model when loaded in 4-bit NF4.
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 0) Install dependencies"))
cells.append(
    nbf.v4.new_code_cell(
        """
# If needed, uncomment the following installs:
# %pip install --quiet transformers==4.44.2 datasets==2.20.0 peft==0.12.0 accelerate==0.34.2 bitsandbytes==0.43.3 evaluate==0.4.2 sentencepiece==0.2.0
# Optional extras:
# %pip install --quiet trl==0.9.6 sacrebleu==2.4.3 rouge-score==0.1.2
"""
)
)
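# Optional: append a quick GPU sanity-check cell; the QLoRA path below assumes
# a CUDA device for bitsandbytes (hedged sketch, harmless to skip).
cells.append(
    nbf.v4.new_code_cell(
        """
# Optional sanity check: 4-bit QLoRA needs a CUDA-capable GPU.
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"Device: {props.name}, VRAM: {props.total_memory / 1e9:.1f} GB")
"""
    )
)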
cells.append(nbf.v4.new_markdown_cell("## 1) Configuration"))
cells.append(
    nbf.v4.new_code_cell(
        """
from pathlib import Path
# Paths
DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different
RUN_NAME = "raft_qlora_run"
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"
# Prefer an instruction-tuned base for better stability on SFT.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
# Tokenization/prompt formatting
SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request."
USE_CHAT_TEMPLATE = True # if the tokenizer has a chat template, we'll leverage it
# QLoRA/PEFT params
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = None # None = let PEFT auto-detect common modules (works for most models)
# 4-bit quantization (QLoRA)
LOAD_IN_4BIT = True
BNB_4BIT_COMPUTE_DTYPE = "bfloat16" # "float16" or "bfloat16"
BNB_4BIT_QUANT_TYPE = "nf4" # "nf4" or "fp4"
BNB_4BIT_USE_DOUBLE_QUANT = True
# Training
TRAIN_VAL_SPLIT = 0.98
MAX_SEQ_LEN = 2048
PER_DEVICE_TRAIN_BATCH = 1
PER_DEVICE_EVAL_BATCH = 1
GRADIENT_ACCUM_STEPS = 16
LEARNING_RATE = 2e-4
NUM_TRAIN_EPOCHS = 2
WEIGHT_DECAY = 0.0
WARMUP_RATIO = 0.03
LR_SCHEDULER_TYPE = "cosine"
LOGGING_STEPS = 10
EVAL_STEPS = 200
SAVE_STEPS = 200
BF16 = True
FP16 = False
SEED = 7
"""
)
)
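# Optional: a small cell reporting the effective batch size implied by the
# config above (sketch assumes a single process; multiply by world size for DDP).
cells.append(
    nbf.v4.new_code_cell(
        """
# Examples seen per optimizer step = micro-batch x gradient accumulation.
effective_batch = PER_DEVICE_TRAIN_BATCH * GRADIENT_ACCUM_STEPS
print(f"Effective batch size (single process): {effective_batch}")
"""
    )
)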
cells.append(nbf.v4.new_markdown_cell("## 2) Load dataset (JSONL)"))
cells.append(
    nbf.v4.new_code_cell(
        """
import json, random
from datasets import Dataset

def read_jsonl(p: Path):
    rows = []
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                if "input" in obj and "output" in obj:
                    rows.append(obj)
            except json.JSONDecodeError:
                pass  # skip malformed lines
    return rows

rows = read_jsonl(DATA_JSONL)
print(f"Loaded {len(rows)} rows from {DATA_JSONL}")
random.Random(SEED).shuffle(rows)
split = int(len(rows) * TRAIN_VAL_SPLIT)
train_rows = rows[:split]
# Fallback: reuse the tail of the data (overlaps train) so eval is never empty.
val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows) // 50):]
train_ds = Dataset.from_list(train_rows)
eval_ds = Dataset.from_list(val_rows) if val_rows else None
train_ds, eval_ds
"""
    )
)
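# Optional: peek at one raw row so the expected {"input": ..., "output": ...}
# schema is visible before formatting (sketch; assumes a non-empty dataset).
cells.append(
    nbf.v4.new_code_cell(
        """
# Inspect one raw example to confirm the input/output schema.
if train_rows:
    print("input :", train_rows[0]["input"][:200])
    print("output:", train_rows[0]["output"][:200])
"""
    )
)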
cells.append(nbf.v4.new_markdown_cell("## 3) Prompt formatting"))
cells.append(
    nbf.v4.new_code_cell(
        """
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_example(ex):
    user = ex["input"]
    assistant = ex["output"]
    if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
        # Note: some chat templates reject a "system" role; fold SYSTEM_PREFIX
        # into the user turn if this raises.
        messages = [
            {"role": "system", "content": SYSTEM_PREFIX},
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant},
        ]
        # add_generation_prompt=False: the assistant turn is already present for SFT.
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    else:
        text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{user}\\n[/USER]\\n[ASSISTANT]\\n{assistant}</s>"
    return {"text": text}

train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names)
eval_ds_fmt = eval_ds.map(format_example, remove_columns=eval_ds.column_names) if eval_ds else None
print(train_ds_fmt[0]["text"][:400])
"""
    )
)
cells.append(nbf.v4.new_markdown_cell("## 4) Tokenize"))
cells.append(
    nbf.v4.new_code_cell(
        """
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        return_tensors=None,
    )

train_tok = train_ds_fmt.map(tokenize, batched=True, remove_columns=train_ds_fmt.column_names)
eval_tok = eval_ds_fmt.map(tokenize, batched=True, remove_columns=eval_ds_fmt.column_names) if eval_ds_fmt else None
# Labels are built by DataCollatorForLanguageModeling in the training cell
# (input_ids with padding masked to -100), so no explicit "labels" column is
# added here.
train_tok, (eval_tok[0]["input_ids"][:10] if eval_tok else [])
"""
    )
)
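# Optional: report the padding share of a few tokenized rows, a quick hint that
# MAX_SEQ_LEN could be lowered (sketch; eos counts as padding when they share an id).
cells.append(
    nbf.v4.new_code_cell(
        """
# Rough padding fraction of the first few examples (pad may equal eos here).
pad_id = tokenizer.pad_token_id
for ids in train_tok["input_ids"][:3]:
    pad_frac = sum(1 for t in ids if t == pad_id) / len(ids)
    print(f"len={len(ids)}  pad_fraction={pad_frac:.2%}")
"""
    )
)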
cells.append(
    nbf.v4.new_markdown_cell(
        "## 5) Load base model with 4-bit quantization and prepare QLoRA"
    )
)
cells.append(
    nbf.v4.new_code_cell(
        """
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = None
if LOAD_IN_4BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=BNB_4BIT_USE_DOUBLE_QUANT,
        bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
        bnb_4bit_compute_dtype=getattr(torch, BNB_4BIT_COMPUTE_DTYPE),
    )

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"""
    )
)
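# Optional: a note on setting LoRA targets explicitly when auto-detection fails
# (module names below are the usual Llama/Mistral-style projections; verify
# against your base model).
cells.append(
    nbf.v4.new_markdown_cell(
        """
> **Note:** If PEFT cannot infer LoRA targets for your base model, set them explicitly in the configuration cell, e.g. `TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]` for Llama/Mistral-style architectures.
"""
    )
)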
cells.append(nbf.v4.new_markdown_cell("## 6) Train"))
cells.append(
    nbf.v4.new_code_cell(
        """
import math
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Causal LM collation: labels are input_ids with padding masked to -100.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    run_name=RUN_NAME,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    logging_steps=LOGGING_STEPS,
    evaluation_strategy="steps" if eval_tok else "no",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    bf16=BF16,
    fp16=FP16,
    gradient_checkpointing=True,
    report_to=["none"],
    seed=SEED,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
)
train_result = trainer.train()
metrics = trainer.evaluate() if eval_tok else {}
perplexity = math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None
metrics, perplexity
"""
    )
)
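# Optional: a reminder cell on resuming interrupted runs from the checkpoints
# that save_steps writes into OUTPUT_DIR.
cells.append(
    nbf.v4.new_code_cell(
        """
# To resume an interrupted run from the latest checkpoint in OUTPUT_DIR:
# trainer.train(resume_from_checkpoint=True)
"""
    )
)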
cells.append(nbf.v4.new_markdown_cell("## 7) Save LoRA adapters"))
cells.append(
    nbf.v4.new_code_cell(
        """
adapter_dir = OUTPUT_DIR / "lora_adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))
print(f"Saved LoRA adapter to: {adapter_dir}")
"""
)
)
cells.append(
    nbf.v4.new_markdown_cell(
        "## 8) (Optional) Merge adapters into base model and save full weights"
    )
)
cells.append(
    nbf.v4.new_code_cell(
        """
DO_MERGE = False  # set True to produce a standalone merged model
if DO_MERGE:
    from peft import PeftModel
    # Reload the base model in full precision (not 4-bit) before merging.
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
        device_map="auto",
    )
    merged = PeftModel.from_pretrained(base_model, str(adapter_dir)).merge_and_unload()
    merged_dir = OUTPUT_DIR / "merged_model"
    merged.save_pretrained(str(merged_dir))
    tokenizer.save_pretrained(str(merged_dir))
    print(f"Merged full model saved to: {merged_dir}")
else:
    print("Skipping merge (set DO_MERGE=True to enable).")
"""
    )
)
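# Optional: a note that the merged directory is a plain HF checkpoint, usable
# without PEFT at inference time.
cells.append(
    nbf.v4.new_markdown_cell(
        """
> A merged directory loads like any regular Hugging Face checkpoint, no PEFT required: `AutoModelForCausalLM.from_pretrained("<merged_dir>")`.
"""
    )
)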
cells.append(nbf.v4.new_markdown_cell("## 9) Quick inference with the trained adapter"))
cells.append(
    nbf.v4.new_code_cell(
        """
from peft import PeftModel
import torch

test_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
    device_map="auto",
)
test_model = PeftModel.from_pretrained(test_model, str(adapter_dir))
test_model.eval()

def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9):
    if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": SYSTEM_PREFIX},
            {"role": "user", "content": prompt},
        ]
        # return_dict=True yields input_ids + attention_mask for generate(**...).
        model_inputs = tokenizer.apply_chat_template(
            messages, return_tensors="pt", add_generation_prompt=True, return_dict=True
        ).to(test_model.device)
    else:
        text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{prompt}\\n[/USER]\\n[ASSISTANT]\\n"
        model_inputs = tokenizer([text], return_tensors="pt").to(test_model.device)
    do_sample = temperature > 0  # temperature 0 means greedy decoding
    with torch.no_grad():
        out = test_model.generate(
            **model_inputs,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            temperature=temperature if do_sample else None,
            top_p=top_p if do_sample else None,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

sample_prompt = train_rows[0]["input"] if len(train_rows) > 0 else "What are the visitor crowd levels like?"
print(generate_answer(sample_prompt)[:800])
"""
    )
)
cells.append(nbf.v4.new_markdown_cell("## 10) Light evaluation on the validation set"))
cells.append(
    nbf.v4.new_code_cell(
        """
import evaluate

if eval_ds:
    rouge = evaluate.load("rouge")
    preds, refs = [], []
    for ex in val_rows[:50]:
        # temperature=0.0 -> greedy decoding for reproducible scores
        preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.0))
        refs.append(ex["output"])
    results = rouge.compute(predictions=preds, references=refs)
    print(results)
else:
    print("No eval split available; skipped.")
"""
    )
)
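# Optional: a normalized exact-match rate next to ROUGE; reuses the preds/refs
# lists from the cell above (sketch; whitespace/case-normalized comparison only).
cells.append(
    nbf.v4.new_code_cell(
        """
# Simple exact-match rate on the same predictions (case/whitespace normalized).
if eval_ds and preds:
    em = sum(p.strip().lower() == r.strip().lower() for p, r in zip(preds, refs)) / len(preds)
    print(f"Exact match: {em:.2%}")
"""
    )
)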
cells.append(
    nbf.v4.new_markdown_cell(
        """
## 11) (Optional) Use with other runtimes
- **Python inference (PEFT)**: load the base model plus adapter as shown in Section 9.
- **Merged model**: set `DO_MERGE=True` to create a standalone model directory; you can then convert it for other runtimes (e.g., llama.cpp GGUF) using their conversion tools.
- **Ollama**: if your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them; the exact steps depend on the base model and target runtime.
"""
    )
)
nb["cells"] = cells
out_path = Path("./raft_finetune_qlora.ipynb")
with open(out_path, "w", encoding="utf-8") as f:
    nbf.write(nb, f)
print(f"Notebook written to {out_path}")
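
# Usage (assumes Jupyter is installed):
#   python create_raft_tuning_notebook.py
#   jupyter lab raft_finetune_qlora.ipynb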