RAFT test setup

Marvin Scham
2025-10-13 17:34:49 +02:00
parent bfff8f7d96
commit 0d1dc45ec0
10 changed files with 78717 additions and 72462 deletions


@@ -0,0 +1,417 @@
# Re-create the Jupyter Notebook for RAFT QLoRA fine-tuning and write it to ./raft_finetune_qlora.ipynb
import nbformat as nbf
from pathlib import Path
nb = nbf.v4.new_notebook()
nb.metadata.update(
{
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3",
},
"language_info": {"name": "python", "version": "3.x"},
}
)
cells = []
cells.append(
nbf.v4.new_markdown_cell(
"""
# RAFT Supervised Fine-Tuning (QLoRA) — Local Training
This notebook fine-tunes an open-source base model on a RAFT-style dataset (`input` → `output`) using **QLoRA** with **PEFT** and **Transformers**. It is designed to run locally (single or multi-GPU) and to export both **LoRA adapters** and (optionally) a **merged** model for inference.
> **Assumptions**
> - Your dataset lives at `./outputs/raft_dataset.jsonl` (from the previous notebook). Adjust the path if needed.
> - You have a CUDA-capable GPU and can install `bitsandbytes`. (CPU training is possible but slow.)
> - You have enough VRAM for the chosen base model when loaded in 4-bit NF4.
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 0) Install dependencies"))
cells.append(
nbf.v4.new_code_cell(
"""
# If needed, uncomment the following installs:
# %pip install --quiet transformers==4.44.2 datasets==2.20.0 peft==0.12.0 accelerate==0.34.2 bitsandbytes==0.43.3 evaluate==0.4.2 sentencepiece==0.2.0
# Optional extras:
# %pip install --quiet trl==0.9.6 sacrebleu==2.4.3 rouge-score==0.1.2
"""
)
)
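# Optional illustrative cell: a quick environment check for the GPU/VRAM assumptions
# stated in the intro. Safe to skip; nothing later depends on it.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Quick environment sanity check: report GPU availability and VRAM.
import torch
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"GPU {i}: {props.name}, {props.total_memory / 2**30:.1f} GiB")
else:
    print("No CUDA device detected; 4-bit QLoRA training will be slow or unavailable.")
"""
    )
)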
cells.append(nbf.v4.new_markdown_cell("## 1) Configuration"))
cells.append(
nbf.v4.new_code_cell(
"""
from pathlib import Path
# Paths
DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different
RUN_NAME = "raft_qlora_run"
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"
# Prefer an instruction-tuned base for better stability on SFT.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
# Tokenization/prompt formatting
SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request."
USE_CHAT_TEMPLATE = True # if the tokenizer has a chat template, we'll leverage it
# QLoRA/PEFT params
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = None # None = let PEFT auto-detect common modules (works for most models)
# 4-bit quantization (QLoRA)
LOAD_IN_4BIT = True
BNB_4BIT_COMPUTE_DTYPE = "bfloat16" # "float16" or "bfloat16"
BNB_4BIT_QUANT_TYPE = "nf4" # "nf4" or "fp4"
BNB_4BIT_USE_DOUBLE_QUANT = True
# Training
TRAIN_VAL_SPLIT = 0.98
MAX_SEQ_LEN = 2048
PER_DEVICE_TRAIN_BATCH = 1
PER_DEVICE_EVAL_BATCH = 1
GRADIENT_ACCUM_STEPS = 16
LEARNING_RATE = 2e-4
NUM_TRAIN_EPOCHS = 2
WEIGHT_DECAY = 0.0
WARMUP_RATIO = 0.03
LR_SCHEDULER_TYPE = "cosine"
LOGGING_STEPS = 10
EVAL_STEPS = 200
SAVE_STEPS = 200
BF16 = True
FP16 = False
SEED = 7
"""
)
)
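# Optional illustrative cell: a rough back-of-the-envelope figure for the quantized
# weight footprint. APPROX_PARAMS_B is an assumed parameter count for a ~7B base model.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Rough 4-bit weight footprint: NF4 stores ~0.5 bytes per parameter.
# This covers weights only, not activations, LoRA/optimizer state, or CUDA overhead.
APPROX_PARAMS_B = 7.2  # assumed parameter count in billions for a Mistral-7B-class model
approx_weight_gib = APPROX_PARAMS_B * 1e9 * 0.5 / 2**30
print(f"~{approx_weight_gib:.1f} GiB for 4-bit weights (plus activations and overhead)")
"""
    )
)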
cells.append(nbf.v4.new_markdown_cell("## 2) Load dataset (JSONL)"))
cells.append(
nbf.v4.new_code_cell(
"""
import json, random
from datasets import Dataset
def read_jsonl(p: Path):
rows = []
with p.open("r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
if "input" in obj and "output" in obj:
rows.append(obj)
except Exception:
pass
return rows
rows = read_jsonl(DATA_JSONL)
print(f"Loaded {len(rows)} rows from {DATA_JSONL}")
random.Random(SEED).shuffle(rows)
split = int(len(rows) * TRAIN_VAL_SPLIT)
train_rows = rows[:split]
val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows)//50):]
train_ds = Dataset.from_list(train_rows)
eval_ds = Dataset.from_list(val_rows) if val_rows else None
train_ds, eval_ds
"""
)
)
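# Optional illustrative cell: show one raw record so the expected `input`/`output`
# schema is visible before any prompt formatting is applied.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Peek at one raw RAFT record (truncated) to verify the schema.
if train_rows:
    example = train_rows[0]
    print("input :", str(example["input"])[:300])
    print("output:", str(example["output"])[:300])
"""
    )
)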
cells.append(nbf.v4.new_markdown_cell("## 3) Prompt formatting"))
cells.append(
nbf.v4.new_code_cell(
"""
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
def format_example(ex):
user = ex["input"]
assistant = ex["output"]
    if USE_CHAT_TEMPLATE and getattr(tokenizer, "chat_template", None):
messages = [
{"role": "system", "content": SYSTEM_PREFIX},
{"role": "user", "content": user},
{"role": "assistant", "content": assistant},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
else:
text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{user}\\n[/USER]\\n[ASSISTANT]\\n{assistant}</s>"
return {"text": text}
train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names)
eval_ds_fmt = eval_ds.map(format_example, remove_columns=eval_ds.column_names) if eval_ds else None
print(train_ds_fmt[0]["text"][:400])
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 4) Tokenize"))
cells.append(
nbf.v4.new_code_cell(
"""
def tokenize(batch):
return tokenizer(
batch["text"],
truncation=True,
max_length=MAX_SEQ_LEN,
padding="max_length",
return_tensors=None,
)
train_tok = train_ds_fmt.map(tokenize, batched=True, remove_columns=train_ds_fmt.column_names)
eval_tok = eval_ds_fmt.map(tokenize, batched=True, remove_columns=eval_ds_fmt.column_names) if eval_ds_fmt else None
# For causal LM, labels mirror input_ids; the data collator (mlm=False) masks padding to -100 when batching.
train_tok = train_tok.add_column("labels", train_tok["input_ids"])
if eval_tok:
    eval_tok = eval_tok.add_column("labels", eval_tok["input_ids"])
train_tok, (eval_tok[0]['input_ids'][:10] if eval_tok else [])
"""
)
)
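# Optional illustrative cell: token-length statistics on a sample of the formatted
# texts, to check how often MAX_SEQ_LEN truncates examples before a full training run.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Tokenize a sample without truncation and report length statistics.
sample_texts = train_ds_fmt["text"][:500]
lengths = [len(tokenizer(t, truncation=False)["input_ids"]) for t in sample_texts]
if lengths:
    over = sum(1 for n in lengths if n > MAX_SEQ_LEN)
    print(f"sampled={len(lengths)}  max={max(lengths)}  mean={sum(lengths)/len(lengths):.0f}  truncated={over}")
"""
    )
)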
cells.append(
nbf.v4.new_markdown_cell(
"## 5) Load base model with 4-bit quantization and prepare QLoRA"
)
)
cells.append(
nbf.v4.new_code_cell(
"""
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
bnb_config = None
if LOAD_IN_4BIT:
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=BNB_4BIT_USE_DOUBLE_QUANT,
bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
bnb_4bit_compute_dtype=getattr(torch, BNB_4BIT_COMPUTE_DTYPE)
)
model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
quantization_config=bnb_config,
torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
device_map="auto",
)
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
r=LORA_R,
lora_alpha=LORA_ALPHA,
lora_dropout=LORA_DROPOUT,
bias="none",
task_type="CAUSAL_LM",
target_modules=TARGET_MODULES,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"""
)
)
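# Optional illustrative cell: list the module names that actually received LoRA adapters.
# Helpful if you later want to set TARGET_MODULES explicitly instead of auto-detection.
cells.append(
    nbf.v4.new_code_cell(
        """
# (Optional) Distinct module names carrying LoRA weights (resolved from TARGET_MODULES=None).
lora_targets = sorted({
    name.split(".")[-1]
    for name, module in model.named_modules()
    if hasattr(module, "lora_A")
})
print("LoRA-adapted modules:", lora_targets)
"""
    )
)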
cells.append(nbf.v4.new_markdown_cell("## 6) Train"))
cells.append(
nbf.v4.new_code_cell(
"""
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import math
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
args = TrainingArguments(
output_dir=str(OUTPUT_DIR),
run_name=RUN_NAME,
num_train_epochs=NUM_TRAIN_EPOCHS,
per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,
learning_rate=LEARNING_RATE,
lr_scheduler_type=LR_SCHEDULER_TYPE,
warmup_ratio=WARMUP_RATIO,
weight_decay=WEIGHT_DECAY,
logging_steps=LOGGING_STEPS,
    evaluation_strategy="steps" if eval_tok else "no",
eval_steps=EVAL_STEPS,
save_steps=SAVE_STEPS,
save_total_limit=2,
bf16=BF16,
fp16=FP16,
gradient_checkpointing=True,
report_to=["none"],
seed=SEED,
)
trainer = Trainer(
model=model,
tokenizer=tokenizer,
args=args,
train_dataset=train_tok,
eval_dataset=eval_tok,
data_collator=data_collator,
)
train_result = trainer.train()
metrics = trainer.evaluate() if eval_tok else {}
perplexity = math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None
metrics, perplexity
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 7) Save LoRA adapters"))
cells.append(
nbf.v4.new_code_cell(
"""
adapter_dir = OUTPUT_DIR / "lora_adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))
print(f"Saved LoRA adapter to: {adapter_dir}")
"""
)
)
cells.append(
nbf.v4.new_markdown_cell(
"## 8) (Optional) Merge adapters into base model and save full weights"
)
)
cells.append(
nbf.v4.new_code_cell(
"""
DO_MERGE = False # set True to produce a standalone merged model
if DO_MERGE:
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
device_map="auto",
)
merged = PeftModel.from_pretrained(base_model, str(adapter_dir)).merge_and_unload()
merged_dir = OUTPUT_DIR / "merged_model"
merged.save_pretrained(str(merged_dir))
tokenizer.save_pretrained(str(merged_dir))
print(f"Merged full model saved to: {merged_dir}")
else:
print("Skipping merge (set DO_MERGE=True to enable).")
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 9) Quick inference with the trained adapter"))
cells.append(
nbf.v4.new_code_cell(
"""
from peft import PeftModel
import torch
# Note: this loads a second 4-bit copy of the base model. If VRAM is tight, free the
# training objects first (e.g. `del trainer, model` then `torch.cuda.empty_cache()`).
test_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
quantization_config=bnb_config,
torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
device_map="auto",
)
test_model = PeftModel.from_pretrained(test_model, str(adapter_dir))
test_model.eval()
def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9):
    if USE_CHAT_TEMPLATE and getattr(tokenizer, "chat_template", None):
messages = [
{"role": "system", "content": SYSTEM_PREFIX},
{"role": "user", "content": prompt},
]
        model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True, return_dict=True).to(test_model.device)
else:
text = f"<s>[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{prompt}\\n[/USER]\\n[ASSISTANT]\\n"
model_inputs = tokenizer([text], return_tensors="pt").to(test_model.device)
with torch.no_grad():
out = test_model.generate(
**model_inputs,
            do_sample=temperature > 0,  # fall back to greedy decoding when temperature == 0
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
)
return tokenizer.decode(out[0], skip_special_tokens=True)
sample_prompt = (train_rows[0]["input"] if len(train_rows)>0 else "What are the visitor crowd levels like?")
print(generate_answer(sample_prompt)[:800])
"""
)
)
cells.append(nbf.v4.new_markdown_cell("## 10) Light evaluation on the validation set"))
cells.append(
nbf.v4.new_code_cell(
"""
import evaluate
if eval_ds:
rouge = evaluate.load("rouge")
preds, refs = [], []
for ex in val_rows[:50]:
preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.0))
refs.append(ex["output"])
results = rouge.compute(predictions=preds, references=refs)
print(results)
else:
print("No eval split available; skipped.")
"""
)
)
cells.append(
nbf.v4.new_markdown_cell(
"""
## 11) (Optional) Use with other runtimes
- **Python Inference (PEFT)**: Load base model + adapter as shown in Section 9.
- **Merged model**: Set `DO_MERGE=True` to create a standalone model directory; you can then convert to other runtimes (e.g., llama.cpp GGUF) using their conversion tools.
- **Ollama**: If your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them. Need a concrete path? Tell me your base and target runtime and I'll add exact steps.
"""
)
)
nb["cells"] = cells
out_path = Path("./raft_finetune_qlora.ipynb")
with open(out_path, "w", encoding="utf-8") as f:
nbf.write(nb, f)
str(out_path)