# Re-create the Jupyter Notebook for RAFT QLoRA fine-tuning and save it to ./raft_finetune_qlora.ipynb
import nbformat as nbf
from pathlib import Path

nb = nbf.v4.new_notebook()
nb.metadata.update(
    {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        },
        "language_info": {"name": "python", "version": "3.x"},
    }
)

cells = []

cells.append(
    nbf.v4.new_markdown_cell(
        """
# RAFT Supervised Fine-Tuning (QLoRA) — Local Training

This notebook fine-tunes an open-source base model on a RAFT-style dataset (`input` → `output`) using **QLoRA** with **PEFT** and **Transformers**.
It is designed to run locally (single or multi-GPU) and to export both **LoRA adapters** and (optionally) a **merged** model for inference.

> **Assumptions**
> - Your dataset lives at `./outputs/raft_dataset.jsonl` (from the previous notebook). Adjust the path if needed.
> - You have a CUDA-capable GPU and can install `bitsandbytes`. (CPU training is possible but slow.)
> - You have enough VRAM for the chosen base model when loaded in 4-bit NF4.
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 0) Install dependencies"))
cells.append(
    nbf.v4.new_code_cell(
        """
# If needed, uncomment the following installs:
# %pip install --quiet transformers==4.44.2 datasets==2.20.0 peft==0.12.0 accelerate==0.34.2 bitsandbytes==0.43.3 evaluate==0.4.2 sentencepiece==0.2.0
# Optional extras:
# %pip install --quiet trl==0.9.6 sacrebleu==2.4.3 rouge-score==0.1.2
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 1) Configuration"))
cells.append(
    nbf.v4.new_code_cell(
        """
from pathlib import Path

# Paths
DATA_JSONL = Path("./outputs/raft_dataset.jsonl")  # change if different
RUN_NAME = "raft_qlora_run"
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"
# Prefer an instruction-tuned base for better stability on SFT.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

# Tokenization/prompt formatting
SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request."
USE_CHAT_TEMPLATE = True  # if the tokenizer has a chat template, we'll leverage it

# QLoRA/PEFT params
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = None  # None = let PEFT auto-detect common modules (works for most models)

# 4-bit quantization (QLoRA)
LOAD_IN_4BIT = True
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"  # "float16" or "bfloat16"
BNB_4BIT_QUANT_TYPE = "nf4"  # "nf4" or "fp4"
BNB_4BIT_USE_DOUBLE_QUANT = True

# Training
TRAIN_VAL_SPLIT = 0.98
MAX_SEQ_LEN = 2048
PER_DEVICE_TRAIN_BATCH = 1
PER_DEVICE_EVAL_BATCH = 1
GRADIENT_ACCUM_STEPS = 16
LEARNING_RATE = 2e-4
NUM_TRAIN_EPOCHS = 2
WEIGHT_DECAY = 0.0
WARMUP_RATIO = 0.03
LR_SCHEDULER_TYPE = "cosine"
LOGGING_STEPS = 10
EVAL_STEPS = 200
SAVE_STEPS = 200
BF16 = True
FP16 = False
SEED = 7
"""
    )
)
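# Added as a hedged sketch (not part of the original notebook): an optional environment
# check for the assumptions stated in the intro cell (CUDA GPU present, bitsandbytes
# importable). It is purely informational and safe to skip or delete.
cells.append(
    nbf.v4.new_code_cell(
        """
# Optional: verify the environment assumptions before training.
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
try:
    import bitsandbytes
    print("bitsandbytes import OK")
except Exception as e:
    print("bitsandbytes not importable:", e)
"""
    )
)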
cells.append(nbf.v4.new_markdown_cell("## 2) Load dataset (JSONL)"))
cells.append(
    nbf.v4.new_code_cell(
        """
import json, random
from datasets import Dataset

def read_jsonl(p: Path):
    rows = []
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                if "input" in obj and "output" in obj:
                    rows.append(obj)
            except Exception:
                pass
    return rows

rows = read_jsonl(DATA_JSONL)
print(f"Loaded {len(rows)} rows from {DATA_JSONL}")

random.Random(SEED).shuffle(rows)
split = int(len(rows) * TRAIN_VAL_SPLIT)
train_rows = rows[:split]
val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows)//50):]

train_ds = Dataset.from_list(train_rows)
eval_ds = Dataset.from_list(val_rows) if val_rows else None
train_ds, eval_ds
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 3) Prompt formatting"))
cells.append(
    nbf.v4.new_code_cell(
        """
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_example(ex):
    user = ex["input"]
    assistant = ex["output"]
    if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": SYSTEM_PREFIX},
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant},
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    else:
        text = f"[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{user}\\n[/USER]\\n[ASSISTANT]\\n{assistant}"
    return {"text": text}

train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names)
eval_ds_fmt = eval_ds.map(format_example, remove_columns=eval_ds.column_names) if eval_ds else None
print(train_ds_fmt[0]["text"][:400])
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 4) Tokenize"))
cells.append(
    nbf.v4.new_code_cell(
        """
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        return_tensors=None,
    )

train_tok = train_ds_fmt.map(tokenize, batched=True, remove_columns=train_ds_fmt.column_names)
eval_tok = eval_ds_fmt.map(tokenize, batched=True, remove_columns=eval_ds_fmt.column_names) if eval_ds_fmt else None

# Labels are copies of input_ids for causal LM training. Note that
# DataCollatorForLanguageModeling(mlm=False) rebuilds labels from input_ids at collate
# time (pad tokens -> -100), so this copy mainly makes the column explicit.
train_tok = train_tok.add_column("labels", train_tok["input_ids"])
if eval_tok:
    eval_tok = eval_tok.add_column("labels", eval_tok["input_ids"])

train_tok, (eval_tok[0]['input_ids'][:10] if eval_tok else [])
"""
    )
)
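# Added as a hedged sketch (not part of the original notebook): an optional diagnostic cell
# that decodes one tokenized training example so you can eyeball the chat formatting and
# see whether MAX_SEQ_LEN is truncating your data.
cells.append(
    nbf.v4.new_code_cell(
        """
# Optional sanity check: inspect one tokenized example.
# If many examples approach MAX_SEQ_LEN, consider raising it or shortening prompts.
sample_ids = train_tok[0]["input_ids"]
non_pad = sum(1 for t in sample_ids if t != tokenizer.pad_token_id)
print(f"Non-pad tokens in first example: {non_pad} / MAX_SEQ_LEN={MAX_SEQ_LEN}")
print(tokenizer.decode(sample_ids, skip_special_tokens=True)[:300])
"""
    )
)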
cells.append(
    nbf.v4.new_markdown_cell(
        "## 5) Load base model with 4-bit quantization and prepare QLoRA"
    )
)
cells.append(
    nbf.v4.new_code_cell(
        """
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = None
if LOAD_IN_4BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=BNB_4BIT_USE_DOUBLE_QUANT,
        bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
        bnb_4bit_compute_dtype=getattr(torch, BNB_4BIT_COMPUTE_DTYPE),
    )

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 6) Train"))
cells.append(
    nbf.v4.new_code_cell(
        """
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import math

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    run_name=RUN_NAME,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    logging_steps=LOGGING_STEPS,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    bf16=BF16,
    fp16=FP16,
    gradient_checkpointing=True,
    report_to=["none"],
    seed=SEED,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
)

train_result = trainer.train()
metrics = trainer.evaluate() if eval_tok else {}
perplexity = math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None
metrics, perplexity
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 7) Save LoRA adapters"))
cells.append(
    nbf.v4.new_code_cell(
        """
adapter_dir = OUTPUT_DIR / "lora_adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))
print(f"Saved LoRA adapter to: {adapter_dir}")
"""
    )
)

cells.append(
    nbf.v4.new_markdown_cell(
        "## 8) (Optional) Merge adapters into base model and save full weights"
    )
)
cells.append(
    nbf.v4.new_code_cell(
        """
DO_MERGE = False  # set True to produce a standalone merged model

if DO_MERGE:
    from peft import PeftModel

    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
        device_map="auto",
    )
    merged = PeftModel.from_pretrained(base_model, str(adapter_dir)).merge_and_unload()
    merged_dir = OUTPUT_DIR / "merged_model"
    merged.save_pretrained(str(merged_dir))
    tokenizer.save_pretrained(str(merged_dir))
    print(f"Merged full model saved to: {merged_dir}")
else:
    print("Skipping merge (set DO_MERGE=True to enable).")
"""
    )
)
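# Added as a hedged sketch (not part of the original notebook): Section 9 reloads the base
# model for inference, which can exceed VRAM while the training model is still resident.
# This optional cleanup cell frees the trained model first; run it only if you hit CUDA OOM
# below and no longer need `trainer`/`model` in memory.
cells.append(
    nbf.v4.new_code_cell(
        """
# Optional: free the training model before reloading the base model in Section 9.
import gc
import torch

del trainer
del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
"""
    )
)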
cells.append(nbf.v4.new_markdown_cell("## 9) Quick inference with the trained adapter"))
cells.append(
    nbf.v4.new_code_cell(
        """
from peft import PeftModel
import torch

test_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
    device_map="auto",
)
test_model = PeftModel.from_pretrained(test_model, str(adapter_dir))
test_model.eval()

def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9):
    if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": SYSTEM_PREFIX},
            {"role": "user", "content": prompt},
        ]
        # return_dict=True gives input_ids + attention_mask so we can unpack into generate().
        model_inputs = tokenizer.apply_chat_template(
            messages, return_tensors="pt", add_generation_prompt=True, return_dict=True
        ).to(test_model.device)
    else:
        text = f"[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{prompt}\\n[/USER]\\n[ASSISTANT]\\n"
        model_inputs = tokenizer([text], return_tensors="pt").to(test_model.device)

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    if temperature and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        gen_kwargs.update(do_sample=False)  # temperature=0 -> greedy decoding

    with torch.no_grad():
        out = test_model.generate(**model_inputs, **gen_kwargs)
    return tokenizer.decode(out[0], skip_special_tokens=True)

sample_prompt = (train_rows[0]["input"] if len(train_rows) > 0 else "What are the visitor crowd levels like?")
print(generate_answer(sample_prompt)[:800])
"""
    )
)

cells.append(nbf.v4.new_markdown_cell("## 10) Light evaluation on the validation set"))
cells.append(
    nbf.v4.new_code_cell(
        """
import evaluate

if eval_ds:
    rouge = evaluate.load("rouge")
    preds, refs = [], []
    for ex in val_rows[:50]:
        # temperature=0.0 -> greedy decoding in generate_answer above
        preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.0))
        refs.append(ex["output"])
    results = rouge.compute(predictions=preds, references=refs)
    print(results)
else:
    print("No eval split available; skipped.")
"""
    )
)

cells.append(
    nbf.v4.new_markdown_cell(
        """
## 11) (Optional) Use with other runtimes

- **Python inference (PEFT)**: Load the base model + adapter as shown in Section 9.
- **Merged model**: Set `DO_MERGE=True` to create a standalone model directory; you can then convert it for other runtimes (e.g., llama.cpp GGUF) using their conversion tools.
- **Ollama**: If your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them.

The exact steps depend on your chosen base model and target runtime.
"""
    )
)

nb["cells"] = cells

out_path = Path("./raft_finetune_qlora.ipynb")
with open(out_path, "w", encoding="utf-8") as f:
    nbf.write(nb, f)

str(out_path)
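# Added as a hedged sketch: optionally validate the generated notebook against the v4
# schema after writing it (nbformat.validate raises if the structure is malformed).
nbf.validate(nb)
print(f"Wrote and validated {out_path} ({len(nb.cells)} cells)")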