diff --git a/hooks/pre-commit b/hooks/pre-commit
index ead2310..0028c28 100644
--- a/hooks/pre-commit
+++ b/hooks/pre-commit
@@ -1,25 +1,6 @@
#!/bin/bash
set -e
-# Find all staged .ipynb files
-NOTEBOOKS=$(git diff --cached --name-only --diff-filter=ACM | grep '\.ipynb$' || true)
-
-if [ -z "$NOTEBOOKS" ]; then
- echo "No Jupyter notebooks staged. Skipping Jupytext conversion."
- exit 0
-fi
-
-echo "Converting staged Jupyter notebooks to .py (percent format)..."
-
-# Loop through each notebook and convert
-for nb in $NOTEBOOKS; do
- if [ -f "$nb" ]; then
- echo " - Converting $nb"
- jupytext --to py:percent "$nb"
- pyfile="${nb%.ipynb}.py"
- # Stage the generated .py file
- git add "$pyfile"
- fi
-done
+./convert_jupytext.sh py
echo "✅ Jupytext conversion complete."
diff --git a/raft/.gitignore b/raft/.gitignore
index 1efe02e..7ba4ce4 100644
--- a/raft/.gitignore
+++ b/raft/.gitignore
@@ -1,3 +1,5 @@
-offload
+offload/
finetuned/**
-!*.md
+!finetuned/
+!finetuned/**/
+!finetuned/**/train.md
diff --git a/raft/corpus_old/topic=22__part=001__n=60.txt b/raft/corpus/topic=22__part=001__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=22__part=001__n=60.txt
rename to raft/corpus/topic=22__part=001__n=60.txt
diff --git a/raft/corpus_old/topic=22__part=002__n=60.txt b/raft/corpus/topic=22__part=002__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=22__part=002__n=60.txt
rename to raft/corpus/topic=22__part=002__n=60.txt
diff --git a/raft/corpus_old/topic=22__part=003__n=60.txt b/raft/corpus/topic=22__part=003__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=22__part=003__n=60.txt
rename to raft/corpus/topic=22__part=003__n=60.txt
diff --git a/raft/corpus_old/topic=22__part=004__n=60.txt b/raft/corpus/topic=22__part=004__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=22__part=004__n=60.txt
rename to raft/corpus/topic=22__part=004__n=60.txt
diff --git a/raft/corpus_old/topic=22__part=005__n=60.txt b/raft/corpus/topic=22__part=005__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=22__part=005__n=60.txt
rename to raft/corpus/topic=22__part=005__n=60.txt
diff --git a/raft/corpus_old/topic=22__part=006__n=3.txt b/raft/corpus/topic=22__part=006__n=3.txt
similarity index 100%
rename from raft/corpus_old/topic=22__part=006__n=3.txt
rename to raft/corpus/topic=22__part=006__n=3.txt
diff --git a/raft/corpus_old/topic=26__part=001__n=60.txt b/raft/corpus/topic=26__part=001__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=26__part=001__n=60.txt
rename to raft/corpus/topic=26__part=001__n=60.txt
diff --git a/raft/corpus_old/topic=26__part=002__n=60.txt b/raft/corpus/topic=26__part=002__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=26__part=002__n=60.txt
rename to raft/corpus/topic=26__part=002__n=60.txt
diff --git a/raft/corpus_old/topic=26__part=003__n=60.txt b/raft/corpus/topic=26__part=003__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=26__part=003__n=60.txt
rename to raft/corpus/topic=26__part=003__n=60.txt
diff --git a/raft/corpus_old/topic=26__part=004__n=60.txt b/raft/corpus/topic=26__part=004__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=26__part=004__n=60.txt
rename to raft/corpus/topic=26__part=004__n=60.txt
diff --git a/raft/corpus_old/topic=26__part=005__n=20.txt b/raft/corpus/topic=26__part=005__n=20.txt
similarity index 100%
rename from raft/corpus_old/topic=26__part=005__n=20.txt
rename to raft/corpus/topic=26__part=005__n=20.txt
diff --git a/raft/corpus_old/topic=2__part=001__n=60.txt b/raft/corpus/topic=2__part=001__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=001__n=60.txt
rename to raft/corpus/topic=2__part=001__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=002__n=60.txt b/raft/corpus/topic=2__part=002__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=002__n=60.txt
rename to raft/corpus/topic=2__part=002__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=003__n=60.txt b/raft/corpus/topic=2__part=003__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=003__n=60.txt
rename to raft/corpus/topic=2__part=003__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=004__n=60.txt b/raft/corpus/topic=2__part=004__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=004__n=60.txt
rename to raft/corpus/topic=2__part=004__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=005__n=60.txt b/raft/corpus/topic=2__part=005__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=005__n=60.txt
rename to raft/corpus/topic=2__part=005__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=006__n=60.txt b/raft/corpus/topic=2__part=006__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=006__n=60.txt
rename to raft/corpus/topic=2__part=006__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=007__n=60.txt b/raft/corpus/topic=2__part=007__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=007__n=60.txt
rename to raft/corpus/topic=2__part=007__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=008__n=60.txt b/raft/corpus/topic=2__part=008__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=008__n=60.txt
rename to raft/corpus/topic=2__part=008__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=009__n=60.txt b/raft/corpus/topic=2__part=009__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=009__n=60.txt
rename to raft/corpus/topic=2__part=009__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=010__n=60.txt b/raft/corpus/topic=2__part=010__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=010__n=60.txt
rename to raft/corpus/topic=2__part=010__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=011__n=60.txt b/raft/corpus/topic=2__part=011__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=011__n=60.txt
rename to raft/corpus/topic=2__part=011__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=012__n=60.txt b/raft/corpus/topic=2__part=012__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=012__n=60.txt
rename to raft/corpus/topic=2__part=012__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=013__n=60.txt b/raft/corpus/topic=2__part=013__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=013__n=60.txt
rename to raft/corpus/topic=2__part=013__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=014__n=60.txt b/raft/corpus/topic=2__part=014__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=014__n=60.txt
rename to raft/corpus/topic=2__part=014__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=015__n=60.txt b/raft/corpus/topic=2__part=015__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=015__n=60.txt
rename to raft/corpus/topic=2__part=015__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=016__n=60.txt b/raft/corpus/topic=2__part=016__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=016__n=60.txt
rename to raft/corpus/topic=2__part=016__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=017__n=60.txt b/raft/corpus/topic=2__part=017__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=017__n=60.txt
rename to raft/corpus/topic=2__part=017__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=018__n=60.txt b/raft/corpus/topic=2__part=018__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=018__n=60.txt
rename to raft/corpus/topic=2__part=018__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=019__n=60.txt b/raft/corpus/topic=2__part=019__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=019__n=60.txt
rename to raft/corpus/topic=2__part=019__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=020__n=60.txt b/raft/corpus/topic=2__part=020__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=020__n=60.txt
rename to raft/corpus/topic=2__part=020__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=021__n=60.txt b/raft/corpus/topic=2__part=021__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=021__n=60.txt
rename to raft/corpus/topic=2__part=021__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=022__n=60.txt b/raft/corpus/topic=2__part=022__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=022__n=60.txt
rename to raft/corpus/topic=2__part=022__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=023__n=60.txt b/raft/corpus/topic=2__part=023__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=023__n=60.txt
rename to raft/corpus/topic=2__part=023__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=024__n=60.txt b/raft/corpus/topic=2__part=024__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=024__n=60.txt
rename to raft/corpus/topic=2__part=024__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=025__n=60.txt b/raft/corpus/topic=2__part=025__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=025__n=60.txt
rename to raft/corpus/topic=2__part=025__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=026__n=60.txt b/raft/corpus/topic=2__part=026__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=026__n=60.txt
rename to raft/corpus/topic=2__part=026__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=027__n=60.txt b/raft/corpus/topic=2__part=027__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=027__n=60.txt
rename to raft/corpus/topic=2__part=027__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=028__n=60.txt b/raft/corpus/topic=2__part=028__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=028__n=60.txt
rename to raft/corpus/topic=2__part=028__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=029__n=60.txt b/raft/corpus/topic=2__part=029__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=029__n=60.txt
rename to raft/corpus/topic=2__part=029__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=030__n=60.txt b/raft/corpus/topic=2__part=030__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=030__n=60.txt
rename to raft/corpus/topic=2__part=030__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=031__n=60.txt b/raft/corpus/topic=2__part=031__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=031__n=60.txt
rename to raft/corpus/topic=2__part=031__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=032__n=60.txt b/raft/corpus/topic=2__part=032__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=032__n=60.txt
rename to raft/corpus/topic=2__part=032__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=033__n=60.txt b/raft/corpus/topic=2__part=033__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=033__n=60.txt
rename to raft/corpus/topic=2__part=033__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=034__n=60.txt b/raft/corpus/topic=2__part=034__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=034__n=60.txt
rename to raft/corpus/topic=2__part=034__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=035__n=60.txt b/raft/corpus/topic=2__part=035__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=035__n=60.txt
rename to raft/corpus/topic=2__part=035__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=036__n=60.txt b/raft/corpus/topic=2__part=036__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=036__n=60.txt
rename to raft/corpus/topic=2__part=036__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=037__n=60.txt b/raft/corpus/topic=2__part=037__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=037__n=60.txt
rename to raft/corpus/topic=2__part=037__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=038__n=60.txt b/raft/corpus/topic=2__part=038__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=038__n=60.txt
rename to raft/corpus/topic=2__part=038__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=039__n=60.txt b/raft/corpus/topic=2__part=039__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=039__n=60.txt
rename to raft/corpus/topic=2__part=039__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=040__n=60.txt b/raft/corpus/topic=2__part=040__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=040__n=60.txt
rename to raft/corpus/topic=2__part=040__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=041__n=60.txt b/raft/corpus/topic=2__part=041__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=041__n=60.txt
rename to raft/corpus/topic=2__part=041__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=042__n=60.txt b/raft/corpus/topic=2__part=042__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=042__n=60.txt
rename to raft/corpus/topic=2__part=042__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=043__n=60.txt b/raft/corpus/topic=2__part=043__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=043__n=60.txt
rename to raft/corpus/topic=2__part=043__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=044__n=60.txt b/raft/corpus/topic=2__part=044__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=044__n=60.txt
rename to raft/corpus/topic=2__part=044__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=045__n=60.txt b/raft/corpus/topic=2__part=045__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=045__n=60.txt
rename to raft/corpus/topic=2__part=045__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=046__n=60.txt b/raft/corpus/topic=2__part=046__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=046__n=60.txt
rename to raft/corpus/topic=2__part=046__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=047__n=60.txt b/raft/corpus/topic=2__part=047__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=047__n=60.txt
rename to raft/corpus/topic=2__part=047__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=048__n=60.txt b/raft/corpus/topic=2__part=048__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=048__n=60.txt
rename to raft/corpus/topic=2__part=048__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=049__n=60.txt b/raft/corpus/topic=2__part=049__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=049__n=60.txt
rename to raft/corpus/topic=2__part=049__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=050__n=60.txt b/raft/corpus/topic=2__part=050__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=050__n=60.txt
rename to raft/corpus/topic=2__part=050__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=051__n=60.txt b/raft/corpus/topic=2__part=051__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=051__n=60.txt
rename to raft/corpus/topic=2__part=051__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=052__n=60.txt b/raft/corpus/topic=2__part=052__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=052__n=60.txt
rename to raft/corpus/topic=2__part=052__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=053__n=60.txt b/raft/corpus/topic=2__part=053__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=053__n=60.txt
rename to raft/corpus/topic=2__part=053__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=054__n=60.txt b/raft/corpus/topic=2__part=054__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=054__n=60.txt
rename to raft/corpus/topic=2__part=054__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=055__n=60.txt b/raft/corpus/topic=2__part=055__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=055__n=60.txt
rename to raft/corpus/topic=2__part=055__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=056__n=60.txt b/raft/corpus/topic=2__part=056__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=056__n=60.txt
rename to raft/corpus/topic=2__part=056__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=057__n=60.txt b/raft/corpus/topic=2__part=057__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=057__n=60.txt
rename to raft/corpus/topic=2__part=057__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=058__n=60.txt b/raft/corpus/topic=2__part=058__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=058__n=60.txt
rename to raft/corpus/topic=2__part=058__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=059__n=60.txt b/raft/corpus/topic=2__part=059__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=059__n=60.txt
rename to raft/corpus/topic=2__part=059__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=060__n=60.txt b/raft/corpus/topic=2__part=060__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=060__n=60.txt
rename to raft/corpus/topic=2__part=060__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=061__n=60.txt b/raft/corpus/topic=2__part=061__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=061__n=60.txt
rename to raft/corpus/topic=2__part=061__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=062__n=60.txt b/raft/corpus/topic=2__part=062__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=062__n=60.txt
rename to raft/corpus/topic=2__part=062__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=063__n=60.txt b/raft/corpus/topic=2__part=063__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=063__n=60.txt
rename to raft/corpus/topic=2__part=063__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=064__n=60.txt b/raft/corpus/topic=2__part=064__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=064__n=60.txt
rename to raft/corpus/topic=2__part=064__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=065__n=60.txt b/raft/corpus/topic=2__part=065__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=065__n=60.txt
rename to raft/corpus/topic=2__part=065__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=066__n=60.txt b/raft/corpus/topic=2__part=066__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=066__n=60.txt
rename to raft/corpus/topic=2__part=066__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=067__n=60.txt b/raft/corpus/topic=2__part=067__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=067__n=60.txt
rename to raft/corpus/topic=2__part=067__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=068__n=60.txt b/raft/corpus/topic=2__part=068__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=068__n=60.txt
rename to raft/corpus/topic=2__part=068__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=069__n=60.txt b/raft/corpus/topic=2__part=069__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=069__n=60.txt
rename to raft/corpus/topic=2__part=069__n=60.txt
diff --git a/raft/corpus_old/topic=2__part=070__n=26.txt b/raft/corpus/topic=2__part=070__n=26.txt
similarity index 100%
rename from raft/corpus_old/topic=2__part=070__n=26.txt
rename to raft/corpus/topic=2__part=070__n=26.txt
diff --git a/raft/corpus_old/topic=4__part=001__n=60.txt b/raft/corpus/topic=4__part=001__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=001__n=60.txt
rename to raft/corpus/topic=4__part=001__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=002__n=60.txt b/raft/corpus/topic=4__part=002__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=002__n=60.txt
rename to raft/corpus/topic=4__part=002__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=003__n=60.txt b/raft/corpus/topic=4__part=003__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=003__n=60.txt
rename to raft/corpus/topic=4__part=003__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=004__n=60.txt b/raft/corpus/topic=4__part=004__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=004__n=60.txt
rename to raft/corpus/topic=4__part=004__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=005__n=60.txt b/raft/corpus/topic=4__part=005__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=005__n=60.txt
rename to raft/corpus/topic=4__part=005__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=006__n=60.txt b/raft/corpus/topic=4__part=006__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=006__n=60.txt
rename to raft/corpus/topic=4__part=006__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=007__n=60.txt b/raft/corpus/topic=4__part=007__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=007__n=60.txt
rename to raft/corpus/topic=4__part=007__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=008__n=60.txt b/raft/corpus/topic=4__part=008__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=008__n=60.txt
rename to raft/corpus/topic=4__part=008__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=009__n=60.txt b/raft/corpus/topic=4__part=009__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=009__n=60.txt
rename to raft/corpus/topic=4__part=009__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=010__n=60.txt b/raft/corpus/topic=4__part=010__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=010__n=60.txt
rename to raft/corpus/topic=4__part=010__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=011__n=60.txt b/raft/corpus/topic=4__part=011__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=011__n=60.txt
rename to raft/corpus/topic=4__part=011__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=012__n=60.txt b/raft/corpus/topic=4__part=012__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=012__n=60.txt
rename to raft/corpus/topic=4__part=012__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=013__n=60.txt b/raft/corpus/topic=4__part=013__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=013__n=60.txt
rename to raft/corpus/topic=4__part=013__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=014__n=60.txt b/raft/corpus/topic=4__part=014__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=014__n=60.txt
rename to raft/corpus/topic=4__part=014__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=015__n=60.txt b/raft/corpus/topic=4__part=015__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=015__n=60.txt
rename to raft/corpus/topic=4__part=015__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=016__n=60.txt b/raft/corpus/topic=4__part=016__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=016__n=60.txt
rename to raft/corpus/topic=4__part=016__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=017__n=60.txt b/raft/corpus/topic=4__part=017__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=017__n=60.txt
rename to raft/corpus/topic=4__part=017__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=018__n=60.txt b/raft/corpus/topic=4__part=018__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=018__n=60.txt
rename to raft/corpus/topic=4__part=018__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=019__n=60.txt b/raft/corpus/topic=4__part=019__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=019__n=60.txt
rename to raft/corpus/topic=4__part=019__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=020__n=60.txt b/raft/corpus/topic=4__part=020__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=020__n=60.txt
rename to raft/corpus/topic=4__part=020__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=021__n=60.txt b/raft/corpus/topic=4__part=021__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=021__n=60.txt
rename to raft/corpus/topic=4__part=021__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=022__n=60.txt b/raft/corpus/topic=4__part=022__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=022__n=60.txt
rename to raft/corpus/topic=4__part=022__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=023__n=60.txt b/raft/corpus/topic=4__part=023__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=023__n=60.txt
rename to raft/corpus/topic=4__part=023__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=024__n=60.txt b/raft/corpus/topic=4__part=024__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=024__n=60.txt
rename to raft/corpus/topic=4__part=024__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=025__n=60.txt b/raft/corpus/topic=4__part=025__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=025__n=60.txt
rename to raft/corpus/topic=4__part=025__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=026__n=60.txt b/raft/corpus/topic=4__part=026__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=026__n=60.txt
rename to raft/corpus/topic=4__part=026__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=027__n=60.txt b/raft/corpus/topic=4__part=027__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=027__n=60.txt
rename to raft/corpus/topic=4__part=027__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=028__n=60.txt b/raft/corpus/topic=4__part=028__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=028__n=60.txt
rename to raft/corpus/topic=4__part=028__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=029__n=60.txt b/raft/corpus/topic=4__part=029__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=029__n=60.txt
rename to raft/corpus/topic=4__part=029__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=030__n=60.txt b/raft/corpus/topic=4__part=030__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=030__n=60.txt
rename to raft/corpus/topic=4__part=030__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=031__n=60.txt b/raft/corpus/topic=4__part=031__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=031__n=60.txt
rename to raft/corpus/topic=4__part=031__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=032__n=60.txt b/raft/corpus/topic=4__part=032__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=032__n=60.txt
rename to raft/corpus/topic=4__part=032__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=033__n=60.txt b/raft/corpus/topic=4__part=033__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=033__n=60.txt
rename to raft/corpus/topic=4__part=033__n=60.txt
diff --git a/raft/corpus_old/topic=4__part=034__n=41.txt b/raft/corpus/topic=4__part=034__n=41.txt
similarity index 100%
rename from raft/corpus_old/topic=4__part=034__n=41.txt
rename to raft/corpus/topic=4__part=034__n=41.txt
diff --git a/raft/corpus_old/topic=5__part=001__n=60.txt b/raft/corpus/topic=5__part=001__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=001__n=60.txt
rename to raft/corpus/topic=5__part=001__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=002__n=60.txt b/raft/corpus/topic=5__part=002__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=002__n=60.txt
rename to raft/corpus/topic=5__part=002__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=003__n=60.txt b/raft/corpus/topic=5__part=003__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=003__n=60.txt
rename to raft/corpus/topic=5__part=003__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=004__n=60.txt b/raft/corpus/topic=5__part=004__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=004__n=60.txt
rename to raft/corpus/topic=5__part=004__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=005__n=60.txt b/raft/corpus/topic=5__part=005__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=005__n=60.txt
rename to raft/corpus/topic=5__part=005__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=006__n=60.txt b/raft/corpus/topic=5__part=006__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=006__n=60.txt
rename to raft/corpus/topic=5__part=006__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=007__n=60.txt b/raft/corpus/topic=5__part=007__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=007__n=60.txt
rename to raft/corpus/topic=5__part=007__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=008__n=60.txt b/raft/corpus/topic=5__part=008__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=008__n=60.txt
rename to raft/corpus/topic=5__part=008__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=009__n=60.txt b/raft/corpus/topic=5__part=009__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=009__n=60.txt
rename to raft/corpus/topic=5__part=009__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=010__n=60.txt b/raft/corpus/topic=5__part=010__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=010__n=60.txt
rename to raft/corpus/topic=5__part=010__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=011__n=60.txt b/raft/corpus/topic=5__part=011__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=011__n=60.txt
rename to raft/corpus/topic=5__part=011__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=012__n=60.txt b/raft/corpus/topic=5__part=012__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=012__n=60.txt
rename to raft/corpus/topic=5__part=012__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=013__n=60.txt b/raft/corpus/topic=5__part=013__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=013__n=60.txt
rename to raft/corpus/topic=5__part=013__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=014__n=60.txt b/raft/corpus/topic=5__part=014__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=014__n=60.txt
rename to raft/corpus/topic=5__part=014__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=015__n=60.txt b/raft/corpus/topic=5__part=015__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=015__n=60.txt
rename to raft/corpus/topic=5__part=015__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=016__n=60.txt b/raft/corpus/topic=5__part=016__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=016__n=60.txt
rename to raft/corpus/topic=5__part=016__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=017__n=60.txt b/raft/corpus/topic=5__part=017__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=017__n=60.txt
rename to raft/corpus/topic=5__part=017__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=018__n=60.txt b/raft/corpus/topic=5__part=018__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=018__n=60.txt
rename to raft/corpus/topic=5__part=018__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=019__n=60.txt b/raft/corpus/topic=5__part=019__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=019__n=60.txt
rename to raft/corpus/topic=5__part=019__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=020__n=60.txt b/raft/corpus/topic=5__part=020__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=020__n=60.txt
rename to raft/corpus/topic=5__part=020__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=021__n=60.txt b/raft/corpus/topic=5__part=021__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=021__n=60.txt
rename to raft/corpus/topic=5__part=021__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=022__n=60.txt b/raft/corpus/topic=5__part=022__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=022__n=60.txt
rename to raft/corpus/topic=5__part=022__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=023__n=60.txt b/raft/corpus/topic=5__part=023__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=023__n=60.txt
rename to raft/corpus/topic=5__part=023__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=024__n=60.txt b/raft/corpus/topic=5__part=024__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=024__n=60.txt
rename to raft/corpus/topic=5__part=024__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=025__n=60.txt b/raft/corpus/topic=5__part=025__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=025__n=60.txt
rename to raft/corpus/topic=5__part=025__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=026__n=60.txt b/raft/corpus/topic=5__part=026__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=026__n=60.txt
rename to raft/corpus/topic=5__part=026__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=027__n=60.txt b/raft/corpus/topic=5__part=027__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=027__n=60.txt
rename to raft/corpus/topic=5__part=027__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=028__n=60.txt b/raft/corpus/topic=5__part=028__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=028__n=60.txt
rename to raft/corpus/topic=5__part=028__n=60.txt
diff --git a/raft/corpus_old/topic=5__part=029__n=39.txt b/raft/corpus/topic=5__part=029__n=39.txt
similarity index 100%
rename from raft/corpus_old/topic=5__part=029__n=39.txt
rename to raft/corpus/topic=5__part=029__n=39.txt
diff --git a/raft/corpus_old/topic=9__part=001__n=60.txt b/raft/corpus/topic=9__part=001__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=001__n=60.txt
rename to raft/corpus/topic=9__part=001__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=002__n=60.txt b/raft/corpus/topic=9__part=002__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=002__n=60.txt
rename to raft/corpus/topic=9__part=002__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=003__n=60.txt b/raft/corpus/topic=9__part=003__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=003__n=60.txt
rename to raft/corpus/topic=9__part=003__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=004__n=60.txt b/raft/corpus/topic=9__part=004__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=004__n=60.txt
rename to raft/corpus/topic=9__part=004__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=005__n=60.txt b/raft/corpus/topic=9__part=005__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=005__n=60.txt
rename to raft/corpus/topic=9__part=005__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=006__n=60.txt b/raft/corpus/topic=9__part=006__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=006__n=60.txt
rename to raft/corpus/topic=9__part=006__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=007__n=60.txt b/raft/corpus/topic=9__part=007__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=007__n=60.txt
rename to raft/corpus/topic=9__part=007__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=008__n=60.txt b/raft/corpus/topic=9__part=008__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=008__n=60.txt
rename to raft/corpus/topic=9__part=008__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=009__n=60.txt b/raft/corpus/topic=9__part=009__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=009__n=60.txt
rename to raft/corpus/topic=9__part=009__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=010__n=60.txt b/raft/corpus/topic=9__part=010__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=010__n=60.txt
rename to raft/corpus/topic=9__part=010__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=011__n=60.txt b/raft/corpus/topic=9__part=011__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=011__n=60.txt
rename to raft/corpus/topic=9__part=011__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=012__n=60.txt b/raft/corpus/topic=9__part=012__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=012__n=60.txt
rename to raft/corpus/topic=9__part=012__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=013__n=60.txt b/raft/corpus/topic=9__part=013__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=013__n=60.txt
rename to raft/corpus/topic=9__part=013__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=014__n=60.txt b/raft/corpus/topic=9__part=014__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=014__n=60.txt
rename to raft/corpus/topic=9__part=014__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=015__n=60.txt b/raft/corpus/topic=9__part=015__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=015__n=60.txt
rename to raft/corpus/topic=9__part=015__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=016__n=60.txt b/raft/corpus/topic=9__part=016__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=016__n=60.txt
rename to raft/corpus/topic=9__part=016__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=017__n=60.txt b/raft/corpus/topic=9__part=017__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=017__n=60.txt
rename to raft/corpus/topic=9__part=017__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=018__n=60.txt b/raft/corpus/topic=9__part=018__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=018__n=60.txt
rename to raft/corpus/topic=9__part=018__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=019__n=60.txt b/raft/corpus/topic=9__part=019__n=60.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=019__n=60.txt
rename to raft/corpus/topic=9__part=019__n=60.txt
diff --git a/raft/corpus_old/topic=9__part=020__n=44.txt b/raft/corpus/topic=9__part=020__n=44.txt
similarity index 100%
rename from raft/corpus_old/topic=9__part=020__n=44.txt
rename to raft/corpus/topic=9__part=020__n=44.txt
diff --git a/raft/create_raft_dataset_notebook.py b/raft/create_raft_dataset_notebook.py
deleted file mode 100644
index 7d29d20..0000000
--- a/raft/create_raft_dataset_notebook.py
+++ /dev/null
@@ -1,619 +0,0 @@
-# This script programmatically creates a Jupyter Notebook tailored for
-# "Retrieval-Augmented Fine-Tuning (RAFT) dataset generation using local Ollama".
-# It saves the notebook to /mnt/data/ so you can download and run it.
-
-import json
-from datetime import datetime
-import nbformat as nbf
-
-nb = nbf.v4.new_notebook()
-nb["metadata"].update(
- {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3",
- },
- "language_info": {"name": "python", "version": "3.x"},
- }
-)
-
-cells = []
-
-# Title / Overview
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-# Retrieval-Augmented **Fine-Tuning** (RAFT) Dataset Generation with **Local Ollama**
-
-This notebook builds a **supervised fine-tuning dataset** (JSONL) for *retrieval-augmented* tasks, by:
-1. **Ingesting** your local corpus (Markdown, text, HTML; PDFs optional with extra deps).
-2. **Chunking** and **embedding** documents using Ollama's local **embedding model** (e.g., `nomic-embed-text`, `mxbai-embed-large`).
-3. Building a **lightweight vector index** (FAISS).
-4. **Sampling contexts** and using a local **generation model** via Ollama (e.g., `llama3.1`, `qwen2`, `phi3`) to synthesize **grounded Q&A** or instruction–response pairs.
-5. Emitting a **RAFT-style JSONL** for supervised training (e.g., `input`, `output`, `meta` with source citations).
-
-> **Requirements**
->
-> - Local [Ollama](https://ollama.com/) running at `http://localhost:11434`
-> - At least one **embedding** model pulled (e.g., `ollama pull nomic-embed-text`)
-> - At least one **generation** model pulled (e.g., `ollama pull llama3.1`)
->
-> You can adapt the prompts and schema for your specific downstream trainer (Llama.cpp, vLLM, Axolotl, mlx, etc.).
-"""
- )
-)
-
-# Setup
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 0) Setup
-
-Install Python dependencies. If you're offline, pre-install or remove what you don't need.
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-# If needed, uncomment:
-# %pip install --quiet requests faiss-cpu rich markdownify python-frontmatter pypdf regex
-# Optional extras:
-# %pip install --quiet tiktoken beautifulsoup4 lxml
-"""
- )
-)
-
-# Config
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 1) Configuration
-
-Set paths, models, and chunking/index params.
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-from dataclasses import dataclass, asdict
-from pathlib import Path
-from typing import List, Dict, Any, Optional, Tuple
-import os, re, json, uuid, math, glob, random, time
-import hashlib
-import requests
-from rich import print
-import regex
-import numpy as np
-
-# ---- Core config ----
-DATA_DIR = Path("./corpus") # Put your source docs here
-OUTPUT_DIR = Path("./outputs") # Where artifacts are saved
-OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-
-# Ollama endpoints & models
-OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text")
-GEN_MODEL = os.environ.get("GEN_MODEL", "llama3.1")
-
-# Chunking
-CHUNK_SIZE = 1200 # characters
-CHUNK_OVERLAP = 200 # characters
-MIN_CHARS = 200 # minimum viable chunk length
-
-# Index
-USE_FAISS = True
-TOP_K = 4
-
-# RAFT generation
-SEED = 7
-SAMPLES_PER_DOC = 4
-MAX_TOKENS_GEN = 512 # Generation max tokens (approx; Ollama supports 'num_predict')
-TEMPERATURE = 0.6
-
-random.seed(SEED)
-np.random.seed(SEED)
-
-print({
- "DATA_DIR": str(DATA_DIR.resolve()),
- "OUTPUT_DIR": str(OUTPUT_DIR.resolve()),
- "OLLAMA_URL": OLLAMA_URL,
- "EMBED_MODEL": EMBED_MODEL,
- "GEN_MODEL": GEN_MODEL,
- "CHUNK_SIZE": CHUNK_SIZE,
- "CHUNK_OVERLAP": CHUNK_OVERLAP,
- "TOP_K": TOP_K,
- "SAMPLES_PER_DOC": SAMPLES_PER_DOC
-})
-"""
- )
-)
-
-# Helpers: loaders
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 2) Load & Normalize Documents
-
-Basic loaders for `.md`, `.txt`, `.html`. PDF support is optional (requires `pypdf`). You can extend as needed.
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-from bs4 import BeautifulSoup # if you didn't install bs4, comment HTML support below
-try:
- import frontmatter
-except Exception:
- frontmatter = None
-
-def read_text_file(p: Path) -> str:
- return p.read_text(encoding="utf-8", errors="ignore")
-
-def read_markdown(p: Path) -> str:
- text = p.read_text(encoding="utf-8", errors="ignore")
- # Optional: strip YAML frontmatter
- if frontmatter:
- try:
- fm = frontmatter.loads(text)
- return fm.content
- except Exception:
- return text
- return text
-
-def read_html(p: Path) -> str:
- html = p.read_text(encoding="utf-8", errors="ignore")
- soup = BeautifulSoup(html, "lxml")
- # Remove script/style
- for tag in soup(["script", "style", "noscript"]):
- tag.decompose()
- text = soup.get_text(" ", strip=True)
- return text
-
-def read_pdf(p: Path) -> str:
- try:
- from pypdf import PdfReader
- except Exception as e:
- print("[yellow]Install pypdf to enable PDF parsing: %pip install pypdf[/yellow]")
- raise e
- reader = PdfReader(str(p))
- parts = []
- for page in reader.pages:
- try:
- parts.append(page.extract_text() or "")
- except Exception:
- parts.append("")
- return "\\n".join(parts)
-
-SUPPORTED_EXTS = {".txt": read_text_file, ".md": read_markdown, ".markdown": read_markdown,
- ".html": read_html, ".htm": read_html, ".pdf": read_pdf}
-
-def load_corpus(data_dir: Path) -> Dict[str, str]:
- docs = {}
- for p in data_dir.rglob("*"):
- if not p.is_file():
- continue
- fn = p.suffix.lower()
- if fn in SUPPORTED_EXTS:
- try:
- docs[str(p)] = SUPPORTED_EXTS[fn](p)
- except Exception as e:
- print(f"[red]Failed to read {p}: {e}[/red]")
- print(f"[green]Loaded {len(docs)} documents[/green]")
- return docs
-
-docs = load_corpus(DATA_DIR)
-len(docs)
-"""
- )
-)
-
-# Chunking
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 3) Chunking
-
-Simple character-based chunker with overlap. Swap in a token-based chunker if you prefer.
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-@dataclass
-class Chunk:
- id: str
- doc_path: str
- start: int
- end: int
- text: str
- sha1: str
-
-def chunk_text(text: str, doc_path: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[Chunk]:
- chunks: List[Chunk] = []
- i = 0
- n = len(text)
- while i < n:
- j = min(i + chunk_size, n)
- piece = text[i:j].strip()
- if len(piece) >= MIN_CHARS:
- sha1 = hashlib.sha1(piece.encode("utf-8")).hexdigest()
- chunks.append(Chunk(
- id=str(uuid.uuid4()),
- doc_path=doc_path,
- start=i, end=j,
- text=piece,
- sha1=sha1
- ))
- if j == n:
- break
- i = j - overlap
- if i < 0:
- i = 0
- if i >= n:
- break
- return chunks
-
-all_chunks: List[Chunk] = []
-for path, text in docs.items():
- all_chunks.extend(chunk_text(text, path))
-
-print(f"[green]Total chunks: {len(all_chunks)}[/green]")
-"""
- )
-)
-
-# Embeddings via Ollama
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 4) Embeddings via Ollama
-
-Uses Ollama's `POST /api/embeddings` endpoint with your selected embedding model.
-Make sure you've pulled it locally: `ollama pull nomic-embed-text` (or your chosen model).
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-EMBED_ENDPOINT = f"{OLLAMA_URL}/api/embeddings"
-
-def embed_texts(texts: List[str], model: str = EMBED_MODEL, batch_size: int = 32) -> np.ndarray:
- vectors = []
- for i in range(0, len(texts), batch_size):
- batch = texts[i:i+batch_size]
- # Ollama supports a single prompt or list? We'll call one by one to be safe with large content.
- for t in batch:
- r = requests.post(EMBED_ENDPOINT, json={"model": model, "prompt": t})
- r.raise_for_status()
- data = r.json()
- vec = np.array(data["embedding"], dtype=np.float32)
- vectors.append(vec)
- return np.vstack(vectors) if vectors else np.zeros((0, 768), dtype=np.float32)
-
-chunk_texts = [c.text for c in all_chunks]
-emb_matrix = embed_texts(chunk_texts, model=EMBED_MODEL, batch_size=8)
-emb_matrix.shape
-"""
- )
-)
-
-# Build FAISS
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 5) Build Vector Index (FAISS)
-
-We normalize vectors and use inner product (equivalent to cosine on normalized vectors).
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-def normalize_rows(x: np.ndarray) -> np.ndarray:
- norms = np.linalg.norm(x, axis=1, keepdims=True) + 1e-12
- return x / norms
-
-if USE_FAISS:
- import faiss
- xb = normalize_rows(emb_matrix).astype(np.float32)
- d = xb.shape[1]
- index = faiss.IndexFlatIP(d)
- index.add(xb)
- print("[green]FAISS index built:[/green]", index.ntotal, "vectors")
-else:
- index = None
- xb = normalize_rows(emb_matrix).astype(np.float32)
-"""
- )
-)
-
-# Retrieval helper
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 6) Retrieval Helper
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-def search(query: str, top_k: int = TOP_K) -> List[Tuple[int, float]]:
- # Embed the query
- qv = embed_texts([query], model=EMBED_MODEL, batch_size=1)
- qv = normalize_rows(qv).astype(np.float32)
- if USE_FAISS and index is not None:
- D, I = index.search(qv, top_k)
- hits = list(zip(I[0].tolist(), D[0].tolist()))
- else:
- sims = (xb @ qv.T).ravel()
- I = np.argsort(-sims)[:top_k]
- hits = [(int(i), float(sims[i])) for i in I]
- return hits
-
-# quick smoke test (no error means it's wired up)
-# print(search("What does this corpus talk about?", 3))
-"""
- )
-)
-
-# Generation via Ollama
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 7) Synthesize Grounded Q&A / Instructions with Ollama
-
-We sample chunks, retrieve neighbors for richer context, and prompt a local LLM to create **high-quality** pairs.
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-GEN_ENDPOINT = f"{OLLAMA_URL}/api/generate"
-
-SYSTEM_PROMPT = (
- "You are a careful dataset writer. Given only the provided CONTEXT, craft high-quality, factual "
- "question–answer pairs for supervised fine-tuning. Answers must be grounded strictly in the context. "
- "If the context lacks the answer, say 'INSUFFICIENT_CONTEXT'. Focus on clarity, specificity, and avoid hallucinations."
-)
-
-USER_PROMPT_TEMPLATE = (
- "CONTEXT:\\n\\n{context}\\n\\n"
- "Task: Produce {n} diverse Q&A pairs about the content above. "
- "Use JSON lines (one JSON object per line) with keys: 'input' (question/instruction), 'output' (concise grounded answer), "
- "'meta' (object with 'source_path', 'chunk_ids', and optional 'citations': list of quotes). "
- "Do NOT include markdown; output JSON objects only."
-)
-
-def ollama_generate(prompt: str, model: str = GEN_MODEL, temperature: float = TEMPERATURE, num_predict: int = MAX_TOKENS_GEN) -> str:
- payload = {
- "model": model,
- "prompt": prompt,
- "system": SYSTEM_PROMPT,
- "options": {
- "temperature": temperature,
- "num_predict": num_predict
- },
- "stream": False
- }
- r = requests.post(GEN_ENDPOINT, json=payload)
- r.raise_for_status()
- data = r.json()
- return data.get("response", "")
-
-def build_context(primary_idx: int, k: int = TOP_K) -> Tuple[str, List[str]]:
- primary_chunk = all_chunks[primary_idx]
- query = primary_chunk.text[:400] # use the start of the chunk as a pseudo-query
- hits = search(query, k)
- pieces, ids = [], []
- for i, score in hits:
- ch = all_chunks[i]
- ids.append(ch.id)
- pieces.append(f"[{Path(ch.doc_path).name}::{ch.start}-{ch.end}]\\n{ch.text}")
- return "\\n\\n---\\n\\n".join(pieces), ids
-
-def parse_llm_jsonl(text: str) -> List[Dict[str, Any]]:
- rows = []
- for line in text.splitlines():
- line = line.strip()
- if not line:
- continue
- # be forgiving for trailing commas etc.
- try:
- obj = json.loads(line)
- if isinstance(obj, dict):
- rows.append(obj)
- except Exception:
- # try to salvage with regex for JSON-ish
- try:
- fixed = regex.sub(r",\\s*}", "}", line)
- fixed = regex.sub(r",\\s*]", "]", fixed)
- obj = json.loads(fixed)
- if isinstance(obj, dict):
- rows.append(obj)
- except Exception:
- pass
- return rows
-"""
- )
-)
-
-# Sampling and synthesis loop
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 8) Generate the RAFT Dataset
-
-This step iterates over documents, samples chunks, retrieves neighbors, and asks the model to produce JSONL rows.
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-def synthesize_dataset(samples_per_doc: int = SAMPLES_PER_DOC, out_path: Path = OUTPUT_DIR / "raft_dataset.jsonl") -> Path:
- rng = random.Random(SEED)
- doc_to_chunk_idx = {}
- for i, ch in enumerate(all_chunks):
- doc_to_chunk_idx.setdefault(ch.doc_path, []).append(i)
-
- total_target = 0
- with out_path.open("w", encoding="utf-8") as f:
- for doc_path, idxs in doc_to_chunk_idx.items():
- if not idxs:
- continue
- chosen = rng.sample(idxs, min(samples_per_doc, len(idxs)))
- for pi in chosen:
- ctx, ids = build_context(pi, k=TOP_K)
- user = USER_PROMPT_TEMPLATE.format(context=ctx, n=3)
- raw = ollama_generate(user, model=GEN_MODEL, temperature=TEMPERATURE, num_predict=MAX_TOKENS_GEN)
- rows = parse_llm_jsonl(raw)
- for r in rows:
- # enforce schema & enrich meta
- inp = r.get("input") or r.get("question") or r.get("query")
- out = r.get("output") or r.get("answer") or r.get("response")
- meta = r.get("meta") or {}
- if not isinstance(meta, dict):
- meta = {}
- meta.update({
- "source_path": str(doc_path),
- "chunk_ids": ids,
- "generated_at": datetime.utcnow().isoformat() + "Z",
- "model": GEN_MODEL,
- "embed_model": EMBED_MODEL
- })
- if inp and out:
- obj = {"input": inp, "output": out, "meta": meta}
- f.write(json.dumps(obj, ensure_ascii=False) + "\\n")
- total_target += 1
- print(f"[green]Wrote {total_target} rows -> {out_path}[/green]")
- return out_path
-
-OUT_JSONL = synthesize_dataset(samples_per_doc=SAMPLES_PER_DOC)
-OUT_JSONL
-"""
- )
-)
-
-# Sanity check / preview
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 9) Preview Samples
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-from itertools import islice
-
-def head_jsonl(p: Path, n: int = 5):
- with p.open("r", encoding="utf-8") as f:
- for line in islice(f, n):
- print(line.rstrip())
-
-head_jsonl(OUT_JSONL, 5)
-"""
- )
-)
-
-# Optional: small eval
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 10) Optional: Spot-Check Generation Quality
-
-Run a tiny evaluation by asking the model with and without retrieval and compare answers.
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-EVAL_QUESTIONS = []
-
-# Collect inputs from the dataset (first N)
-with (OUTPUT_DIR / "raft_dataset.jsonl").open("r", encoding="utf-8") as f:
- for i, line in enumerate(f):
- try:
- obj = json.loads(line)
- EVAL_QUESTIONS.append(obj["input"])
- except Exception:
- pass
- if len(EVAL_QUESTIONS) >= 5:
- break
-
-def rag_answer(q: str, k: int = TOP_K) -> str:
- hits = search(q, k)
- ctx = "\\n\\n".join([all_chunks[i].text for i,_ in hits])
- user = f"Answer the question using ONLY this context. If missing, say INSUFFICIENT_CONTEXT.\\n\\nCONTEXT:\\n{ctx}\\n\\nQUESTION: {q}"
- return ollama_generate(user, model=GEN_MODEL, temperature=0.2, num_predict=256)
-
-for q in EVAL_QUESTIONS:
- print("\\n[bold]Q:[/bold]", q)
- ans = rag_answer(q)
- print("[bold]A:[/bold]", ans.strip()[:500], "...")
-"""
- )
-)
-
-# Save artifacts list
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 11) Artifacts
-
-- `outputs/raft_dataset.jsonl` — your RAFT dataset (input/output/meta per line)
-- `corpus/` — your source documents (you provide)
-- You can also persist `emb_matrix.npy` and a FAISS index for reuse.
-"""
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-# Optionally persist embeddings and index for later reuse
-np.save(OUTPUT_DIR / "emb_matrix.npy", emb_matrix)
-
-if USE_FAISS:
- import faiss
- faiss.write_index(index, str(OUTPUT_DIR / "faiss.index"))
- print("[green]Saved FAISS index and embeddings.[/green]")
-else:
- print("[yellow]FAISS disabled; only saved embeddings.[/yellow]")
-"""
- )
-)
-
-# Troubleshooting
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 12) Troubleshooting
-
-- **Connection error to Ollama**: ensure `ollama serve` is running and models are pulled (`ollama pull nomic-embed-text`, `ollama pull llama3.1`).
-- **Empty dataset**: your corpus may be too small or the parser skipped files. Check `corpus/` content and chunk parameters.
-- **Hallucinations**: tighten the system prompt, lower temperature, or increase `TOP_K` and chunk size.
-- **JSON parsing issues**: the notebook tries to be forgiving; you can harden `parse_llm_jsonl` per your needs.
-- **PDFs**: `pip install pypdf` and try again.
-"""
- )
-)
-
-# Save the notebook
-nb["cells"] = cells
-
-out_path = "raft_ollama_dataset.ipynb"
-with open(out_path, "w", encoding="utf-8") as f:
- nbf.write(nb, f)
-
-out_path
diff --git a/raft/create_raft_tuning_notebook.py b/raft/create_raft_tuning_notebook.py
deleted file mode 100644
index 03cf697..0000000
--- a/raft/create_raft_tuning_notebook.py
+++ /dev/null
@@ -1,417 +0,0 @@
-# Re-create the Jupyter Notebook for RAFT QLoRA fine-tuning and save to /mnt/data
-
-import nbformat as nbf
-from pathlib import Path
-
-nb = nbf.v4.new_notebook()
-nb.metadata.update(
- {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3",
- },
- "language_info": {"name": "python", "version": "3.x"},
- }
-)
-
-cells = []
-
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-# RAFT Supervised Fine-Tuning (QLoRA) — Local Training
-
-This notebook fine-tunes an open-source base model on a RAFT-style dataset (`input` → `output`) using **QLoRA** with **PEFT** and **Transformers**. It is designed to run locally (single or multi-GPU) and to export both **LoRA adapters** and (optionally) a **merged** model for inference.
-
-> **Assumptions**
-> - Your dataset lives at `./outputs/raft_dataset.jsonl` (from the previous notebook). Adjust the path if needed.
-> - You have a CUDA-capable GPU and can install `bitsandbytes`. (CPU training is possible but slow.)
-> - You have enough VRAM for the chosen base model when loaded in 4-bit NF4.
-"""
- )
-)
-
-cells.append(nbf.v4.new_markdown_cell("## 0) Install dependencies"))
-cells.append(
- nbf.v4.new_code_cell(
- """
-# If needed, uncomment the following installs:
-# %pip install --quiet transformers==4.44.2 datasets==2.20.0 peft==0.12.0 accelerate==0.34.2 bitsandbytes==0.43.3 evaluate==0.4.2 sentencepiece==0.2.0
-# Optional extras:
-# %pip install --quiet trl==0.9.6 sacrebleu==2.4.3 rouge-score==0.1.2
-"""
- )
-)
-
-cells.append(nbf.v4.new_markdown_cell("## 1) Configuration"))
-cells.append(
- nbf.v4.new_code_cell(
- """
-from pathlib import Path
-
-# Paths
-DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different
-RUN_NAME = "raft_qlora_run"
-OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
-OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-
-# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"
-# Prefer an instruction-tuned base for better stability on SFT.
-BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
-
-# Tokenization/prompt formatting
-SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request."
-USE_CHAT_TEMPLATE = True # if the tokenizer has a chat template, we'll leverage it
-
-# QLoRA/PEFT params
-LORA_R = 16
-LORA_ALPHA = 32
-LORA_DROPOUT = 0.05
-TARGET_MODULES = None # None = let PEFT auto-detect common modules (works for most models)
-
-# 4-bit quantization (QLoRA)
-LOAD_IN_4BIT = True
-BNB_4BIT_COMPUTE_DTYPE = "bfloat16" # "float16" or "bfloat16"
-BNB_4BIT_QUANT_TYPE = "nf4" # "nf4" or "fp4"
-BNB_4BIT_USE_DOUBLE_QUANT = True
-
-# Training
-TRAIN_VAL_SPLIT = 0.98
-MAX_SEQ_LEN = 2048
-PER_DEVICE_TRAIN_BATCH = 1
-PER_DEVICE_EVAL_BATCH = 1
-GRADIENT_ACCUM_STEPS = 16
-LEARNING_RATE = 2e-4
-NUM_TRAIN_EPOCHS = 2
-WEIGHT_DECAY = 0.0
-WARMUP_RATIO = 0.03
-LR_SCHEDULER_TYPE = "cosine"
-LOGGING_STEPS = 10
-EVAL_STEPS = 200
-SAVE_STEPS = 200
-BF16 = True
-FP16 = False
-
-SEED = 7
-"""
- )
-)
-
-cells.append(nbf.v4.new_markdown_cell("## 2) Load dataset (JSONL)"))
-cells.append(
- nbf.v4.new_code_cell(
- """
-import json, random
-from datasets import Dataset
-
-def read_jsonl(p: Path):
- rows = []
- with p.open("r", encoding="utf-8") as f:
- for line in f:
- line = line.strip()
- if not line:
- continue
- try:
- obj = json.loads(line)
- if "input" in obj and "output" in obj:
- rows.append(obj)
- except Exception:
- pass
- return rows
-
-rows = read_jsonl(DATA_JSONL)
-print(f"Loaded {len(rows)} rows from {DATA_JSONL}")
-
-random.Random(SEED).shuffle(rows)
-split = int(len(rows) * TRAIN_VAL_SPLIT)
-train_rows = rows[:split]
-val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows)//50):]
-
-train_ds = Dataset.from_list(train_rows)
-eval_ds = Dataset.from_list(val_rows) if val_rows else None
-train_ds, eval_ds
-"""
- )
-)
-
-cells.append(nbf.v4.new_markdown_cell("## 3) Prompt formatting"))
-cells.append(
- nbf.v4.new_code_cell(
- """
-from transformers import AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
-if tokenizer.pad_token is None:
- tokenizer.pad_token = tokenizer.eos_token
-
-def format_example(ex):
- user = ex["input"]
- assistant = ex["output"]
-
- if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
- messages = [
- {"role": "system", "content": SYSTEM_PREFIX},
- {"role": "user", "content": user},
- {"role": "assistant", "content": assistant},
- ]
- text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
- else:
- text = f"[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{user}\\n[/USER]\\n[ASSISTANT]\\n{assistant}"
- return {"text": text}
-
-train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names)
-eval_ds_fmt = eval_ds.map(format_example, remove_columns=eval_ds.column_names) if eval_ds else None
-
-print(train_ds_fmt[0]["text"][:400])
-"""
- )
-)
-
-cells.append(nbf.v4.new_markdown_cell("## 4) Tokenize"))
-cells.append(
- nbf.v4.new_code_cell(
- """
-def tokenize(batch):
- return tokenizer(
- batch["text"],
- truncation=True,
- max_length=MAX_SEQ_LEN,
- padding="max_length",
- return_tensors=None,
- )
-
-train_tok = train_ds_fmt.map(tokenize, batched=True, remove_columns=train_ds_fmt.column_names)
-eval_tok = eval_ds_fmt.map(tokenize, batched=True, remove_columns=eval_ds_fmt.column_names) if eval_ds_fmt else None
-
-train_tok = train_tok.rename_column("input_ids", "input_ids")
-train_tok = train_tok.add_column("labels", train_tok["input_ids"])
-if eval_tok:
- eval_tok = eval_tok.add_column("labels", eval_tok["input_ids"])
-
-train_tok, (eval_tok[0]['input_ids'][:10] if eval_tok else [])
-"""
- )
-)
-
-cells.append(
- nbf.v4.new_markdown_cell(
- "## 5) Load base model with 4-bit quantization and prepare QLoRA"
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-import torch
-from transformers import AutoModelForCausalLM, BitsAndBytesConfig
-from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-
-bnb_config = None
-if LOAD_IN_4BIT:
- bnb_config = BitsAndBytesConfig(
- load_in_4bit=True,
- bnb_4bit_use_double_quant=BNB_4BIT_USE_DOUBLE_QUANT,
- bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
- bnb_4bit_compute_dtype=getattr(torch, BNB_4BIT_COMPUTE_DTYPE)
- )
-
-model = AutoModelForCausalLM.from_pretrained(
- BASE_MODEL,
- quantization_config=bnb_config,
- torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
- device_map="auto",
-)
-
-model = prepare_model_for_kbit_training(model)
-
-peft_config = LoraConfig(
- r=LORA_R,
- lora_alpha=LORA_ALPHA,
- lora_dropout=LORA_DROPOUT,
- bias="none",
- task_type="CAUSAL_LM",
- target_modules=TARGET_MODULES,
-)
-
-model = get_peft_model(model, peft_config)
-model.print_trainable_parameters()
-"""
- )
-)
-
-cells.append(nbf.v4.new_markdown_cell("## 6) Train"))
-cells.append(
- nbf.v4.new_code_cell(
- """
-from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
-import math
-
-data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-
-args = TrainingArguments(
- output_dir=str(OUTPUT_DIR),
- run_name=RUN_NAME,
- num_train_epochs=NUM_TRAIN_EPOCHS,
- per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
- per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
- gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,
- learning_rate=LEARNING_RATE,
- lr_scheduler_type=LR_SCHEDULER_TYPE,
- warmup_ratio=WARMUP_RATIO,
- weight_decay=WEIGHT_DECAY,
- logging_steps=LOGGING_STEPS,
- evaluation_strategy="steps",
- eval_steps=EVAL_STEPS,
- save_steps=SAVE_STEPS,
- save_total_limit=2,
- bf16=BF16,
- fp16=FP16,
- gradient_checkpointing=True,
- report_to=["none"],
- seed=SEED,
-)
-
-trainer = Trainer(
- model=model,
- tokenizer=tokenizer,
- args=args,
- train_dataset=train_tok,
- eval_dataset=eval_tok,
- data_collator=data_collator,
-)
-
-train_result = trainer.train()
-metrics = trainer.evaluate() if eval_tok else {}
-perplexity = math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None
-metrics, perplexity
-"""
- )
-)
-
-cells.append(nbf.v4.new_markdown_cell("## 7) Save LoRA adapters"))
-cells.append(
- nbf.v4.new_code_cell(
- """
-adapter_dir = OUTPUT_DIR / "lora_adapter"
-adapter_dir.mkdir(parents=True, exist_ok=True)
-
-model.save_pretrained(str(adapter_dir))
-tokenizer.save_pretrained(str(adapter_dir))
-
-print(f"Saved LoRA adapter to: {adapter_dir}")
-"""
- )
-)
-
-cells.append(
- nbf.v4.new_markdown_cell(
- "## 8) (Optional) Merge adapters into base model and save full weights"
- )
-)
-cells.append(
- nbf.v4.new_code_cell(
- """
-DO_MERGE = False # set True to produce a standalone merged model
-
-if DO_MERGE:
- from peft import PeftModel
- base_model = AutoModelForCausalLM.from_pretrained(
- BASE_MODEL,
- torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
- device_map="auto",
- )
- merged = PeftModel.from_pretrained(base_model, str(adapter_dir)).merge_and_unload()
- merged_dir = OUTPUT_DIR / "merged_model"
- merged.save_pretrained(str(merged_dir))
- tokenizer.save_pretrained(str(merged_dir))
- print(f"Merged full model saved to: {merged_dir}")
-else:
- print("Skipping merge (set DO_MERGE=True to enable).")
-"""
- )
-)
-
-cells.append(nbf.v4.new_markdown_cell("## 9) Quick inference with the trained adapter"))
-cells.append(
- nbf.v4.new_code_cell(
- """
-from peft import PeftModel
-import torch
-
-test_model = AutoModelForCausalLM.from_pretrained(
- BASE_MODEL,
- quantization_config=bnb_config,
- torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None),
- device_map="auto",
-)
-test_model = PeftModel.from_pretrained(test_model, str(adapter_dir))
-test_model.eval()
-
-def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9):
- if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"):
- messages = [
- {"role": "system", "content": SYSTEM_PREFIX},
- {"role": "user", "content": prompt},
- ]
- model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(test_model.device)
- else:
- text = f"[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{prompt}\\n[/USER]\\n[ASSISTANT]\\n"
- model_inputs = tokenizer([text], return_tensors="pt").to(test_model.device)
-
- with torch.no_grad():
- out = test_model.generate(
- **model_inputs,
- do_sample=True,
- max_new_tokens=max_new_tokens,
- temperature=temperature,
- top_p=top_p,
- eos_token_id=tokenizer.eos_token_id,
- pad_token_id=tokenizer.pad_token_id,
- )
- return tokenizer.decode(out[0], skip_special_tokens=True)
-
-sample_prompt = (train_rows[0]["input"] if len(train_rows)>0 else "What are the visitor crowd levels like?")
-print(generate_answer(sample_prompt)[:800])
-"""
- )
-)
-
-cells.append(nbf.v4.new_markdown_cell("## 10) Light evaluation on the validation set"))
-cells.append(
- nbf.v4.new_code_cell(
- """
-import evaluate
-
-if eval_ds:
- rouge = evaluate.load("rouge")
- preds, refs = [], []
- for ex in val_rows[:50]:
- preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.0))
- refs.append(ex["output"])
- results = rouge.compute(predictions=preds, references=refs)
- print(results)
-else:
- print("No eval split available; skipped.")
-"""
- )
-)
-
-cells.append(
- nbf.v4.new_markdown_cell(
- """
-## 11) (Optional) Use with other runtimes
-
-- **Python Inference (PEFT)**: Load base model + adapter as shown in Section 9.
-- **Merged model**: Set `DO_MERGE=True` to create a standalone model directory; you can then convert to other runtimes (e.g., llama.cpp GGUF) using their conversion tools.
-- **Ollama**: If your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them. Need a concrete path? Tell me your base and target runtime and I’ll add exact steps.
-"""
- )
-)
-
-nb["cells"] = cells
-
-out_path = Path("./raft_finetune_qlora.ipynb")
-with open(out_path, "w", encoding="utf-8") as f:
- nbf.write(nb, f)
-
-str(out_path)
diff --git a/raft/finetuned/raft_qlora_tourist/train.md b/raft/finetuned/raft_qlora_tourist/train.md
new file mode 100644
index 0000000..0e3ed7e
--- /dev/null
+++ b/raft/finetuned/raft_qlora_tourist/train.md
@@ -0,0 +1,31 @@
+| step   | train_loss | eval_loss |
+| ------ | ---------- | --------- |
+| 50 | 4.377000 | 3.628506 |
+| 100 | 2.636800 | 2.558457 |
+| 150 | 2.428800 | 2.427239 |
+| 200 | 2.334800 | 2.193493 |
+| 250 | 2.188500 | 2.186310 |
+| 300 | 2.112400 | 2.173394 |
+| 350 | 2.122900 | 2.163947 |
+| 400 | 2.155400 | 2.162106 |
+| 450 | 2.072100 | 2.154830 |
+| 500 | 1.979900 | 2.165512 |
+| 550 | 1.935800 | 2.176313 |
+| 600 | 1.942800 | 2.170668 |
+| 650 | 1.968000 | 2.162810 |
+| 700 | 1.974100 | 2.167501 |
+| 750 | 1.801900 | 2.235841 |
+| 800 | 1.768000 | 2.233753 |
+| 850 | 1.779100 | 2.218278 |
+| 900 | 1.828900 | 2.220891 |
+| 950 | 1.854900 | 2.208387 |
+| 1000 | 1.653600 | 2.302763 |
+| 1050 | 1.663500 | 2.307982 |
+| 1100 | 1.673400 | 2.301423 |
+| 1150 | 1.608400 | 2.320958 |
+| 1200 | 1.683500 | 2.303580 |
+| 1250 | 1.532100 | 2.434277 |
+| 1300 | 1.558900 | 2.418276 |
+| 1350 | 1.508900 | 2.422347 |
+| 1400 | 1.535100 | 2.416650 |
+| 1450 | 1.529900 | 2.415497 |
diff --git a/raft/finetuned/raft_qlora_tourist_0.2/train.md b/raft/finetuned/raft_qlora_tourist_0.2/train.md
new file mode 100644
index 0000000..ae814db
--- /dev/null
+++ b/raft/finetuned/raft_qlora_tourist_0.2/train.md
@@ -0,0 +1,16 @@
+| step  | train_loss | eval_loss |
+| ----- | ---------- | --------- |
+| 50 | 2.419000 | 1.970156 |
+| 100 | 1.952300 | 1.843970 |
+| 150 | 1.870500 | 1.846282 |
+| 200 | 1.901400 | 1.800553 |
+| 250 | 1.739600 | 1.820830 |
+| 300 | 1.561900 | 1.817271 |
+| 350 | 1.548000 | 1.805666 |
+| 400 | 1.572800 | 1.808402 |
+| 450 | 1.508000 | 1.794848 |
+| 500 | 1.350500 | 1.905955 |
+| 550 | 1.169200 | 1.949706 |
+| 600 | 1.166600 | 1.940000 |
+| 650 | 1.160400 | 1.940272 |
+| 700 | 1.182600 | 1.951116 |