From 8cad184cb5911183a56720320340fc08540e4757 Mon Sep 17 00:00:00 2001 From: Marvin Scham Date: Mon, 20 Oct 2025 23:40:58 +0200 Subject: [PATCH] Add training results, update meta files --- hooks/pre-commit | 21 +- raft/.gitignore | 6 +- .../topic=22__part=001__n=60.txt | 0 .../topic=22__part=002__n=60.txt | 0 .../topic=22__part=003__n=60.txt | 0 .../topic=22__part=004__n=60.txt | 0 .../topic=22__part=005__n=60.txt | 0 .../topic=22__part=006__n=3.txt | 0 .../topic=26__part=001__n=60.txt | 0 .../topic=26__part=002__n=60.txt | 0 .../topic=26__part=003__n=60.txt | 0 .../topic=26__part=004__n=60.txt | 0 .../topic=26__part=005__n=20.txt | 0 .../topic=2__part=001__n=60.txt | 0 .../topic=2__part=002__n=60.txt | 0 .../topic=2__part=003__n=60.txt | 0 .../topic=2__part=004__n=60.txt | 0 .../topic=2__part=005__n=60.txt | 0 .../topic=2__part=006__n=60.txt | 0 .../topic=2__part=007__n=60.txt | 0 .../topic=2__part=008__n=60.txt | 0 .../topic=2__part=009__n=60.txt | 0 .../topic=2__part=010__n=60.txt | 0 .../topic=2__part=011__n=60.txt | 0 .../topic=2__part=012__n=60.txt | 0 .../topic=2__part=013__n=60.txt | 0 .../topic=2__part=014__n=60.txt | 0 .../topic=2__part=015__n=60.txt | 0 .../topic=2__part=016__n=60.txt | 0 .../topic=2__part=017__n=60.txt | 0 .../topic=2__part=018__n=60.txt | 0 .../topic=2__part=019__n=60.txt | 0 .../topic=2__part=020__n=60.txt | 0 .../topic=2__part=021__n=60.txt | 0 .../topic=2__part=022__n=60.txt | 0 .../topic=2__part=023__n=60.txt | 0 .../topic=2__part=024__n=60.txt | 0 .../topic=2__part=025__n=60.txt | 0 .../topic=2__part=026__n=60.txt | 0 .../topic=2__part=027__n=60.txt | 0 .../topic=2__part=028__n=60.txt | 0 .../topic=2__part=029__n=60.txt | 0 .../topic=2__part=030__n=60.txt | 0 .../topic=2__part=031__n=60.txt | 0 .../topic=2__part=032__n=60.txt | 0 .../topic=2__part=033__n=60.txt | 0 .../topic=2__part=034__n=60.txt | 0 .../topic=2__part=035__n=60.txt | 0 .../topic=2__part=036__n=60.txt | 0 .../topic=2__part=037__n=60.txt | 0 
.../topic=2__part=038__n=60.txt | 0 .../topic=2__part=039__n=60.txt | 0 .../topic=2__part=040__n=60.txt | 0 .../topic=2__part=041__n=60.txt | 0 .../topic=2__part=042__n=60.txt | 0 .../topic=2__part=043__n=60.txt | 0 .../topic=2__part=044__n=60.txt | 0 .../topic=2__part=045__n=60.txt | 0 .../topic=2__part=046__n=60.txt | 0 .../topic=2__part=047__n=60.txt | 0 .../topic=2__part=048__n=60.txt | 0 .../topic=2__part=049__n=60.txt | 0 .../topic=2__part=050__n=60.txt | 0 .../topic=2__part=051__n=60.txt | 0 .../topic=2__part=052__n=60.txt | 0 .../topic=2__part=053__n=60.txt | 0 .../topic=2__part=054__n=60.txt | 0 .../topic=2__part=055__n=60.txt | 0 .../topic=2__part=056__n=60.txt | 0 .../topic=2__part=057__n=60.txt | 0 .../topic=2__part=058__n=60.txt | 0 .../topic=2__part=059__n=60.txt | 0 .../topic=2__part=060__n=60.txt | 0 .../topic=2__part=061__n=60.txt | 0 .../topic=2__part=062__n=60.txt | 0 .../topic=2__part=063__n=60.txt | 0 .../topic=2__part=064__n=60.txt | 0 .../topic=2__part=065__n=60.txt | 0 .../topic=2__part=066__n=60.txt | 0 .../topic=2__part=067__n=60.txt | 0 .../topic=2__part=068__n=60.txt | 0 .../topic=2__part=069__n=60.txt | 0 .../topic=2__part=070__n=26.txt | 0 .../topic=4__part=001__n=60.txt | 0 .../topic=4__part=002__n=60.txt | 0 .../topic=4__part=003__n=60.txt | 0 .../topic=4__part=004__n=60.txt | 0 .../topic=4__part=005__n=60.txt | 0 .../topic=4__part=006__n=60.txt | 0 .../topic=4__part=007__n=60.txt | 0 .../topic=4__part=008__n=60.txt | 0 .../topic=4__part=009__n=60.txt | 0 .../topic=4__part=010__n=60.txt | 0 .../topic=4__part=011__n=60.txt | 0 .../topic=4__part=012__n=60.txt | 0 .../topic=4__part=013__n=60.txt | 0 .../topic=4__part=014__n=60.txt | 0 .../topic=4__part=015__n=60.txt | 0 .../topic=4__part=016__n=60.txt | 0 .../topic=4__part=017__n=60.txt | 0 .../topic=4__part=018__n=60.txt | 0 .../topic=4__part=019__n=60.txt | 0 .../topic=4__part=020__n=60.txt | 0 .../topic=4__part=021__n=60.txt | 0 .../topic=4__part=022__n=60.txt | 0 
.../topic=4__part=023__n=60.txt | 0 .../topic=4__part=024__n=60.txt | 0 .../topic=4__part=025__n=60.txt | 0 .../topic=4__part=026__n=60.txt | 0 .../topic=4__part=027__n=60.txt | 0 .../topic=4__part=028__n=60.txt | 0 .../topic=4__part=029__n=60.txt | 0 .../topic=4__part=030__n=60.txt | 0 .../topic=4__part=031__n=60.txt | 0 .../topic=4__part=032__n=60.txt | 0 .../topic=4__part=033__n=60.txt | 0 .../topic=4__part=034__n=41.txt | 0 .../topic=5__part=001__n=60.txt | 0 .../topic=5__part=002__n=60.txt | 0 .../topic=5__part=003__n=60.txt | 0 .../topic=5__part=004__n=60.txt | 0 .../topic=5__part=005__n=60.txt | 0 .../topic=5__part=006__n=60.txt | 0 .../topic=5__part=007__n=60.txt | 0 .../topic=5__part=008__n=60.txt | 0 .../topic=5__part=009__n=60.txt | 0 .../topic=5__part=010__n=60.txt | 0 .../topic=5__part=011__n=60.txt | 0 .../topic=5__part=012__n=60.txt | 0 .../topic=5__part=013__n=60.txt | 0 .../topic=5__part=014__n=60.txt | 0 .../topic=5__part=015__n=60.txt | 0 .../topic=5__part=016__n=60.txt | 0 .../topic=5__part=017__n=60.txt | 0 .../topic=5__part=018__n=60.txt | 0 .../topic=5__part=019__n=60.txt | 0 .../topic=5__part=020__n=60.txt | 0 .../topic=5__part=021__n=60.txt | 0 .../topic=5__part=022__n=60.txt | 0 .../topic=5__part=023__n=60.txt | 0 .../topic=5__part=024__n=60.txt | 0 .../topic=5__part=025__n=60.txt | 0 .../topic=5__part=026__n=60.txt | 0 .../topic=5__part=027__n=60.txt | 0 .../topic=5__part=028__n=60.txt | 0 .../topic=5__part=029__n=39.txt | 0 .../topic=9__part=001__n=60.txt | 0 .../topic=9__part=002__n=60.txt | 0 .../topic=9__part=003__n=60.txt | 0 .../topic=9__part=004__n=60.txt | 0 .../topic=9__part=005__n=60.txt | 0 .../topic=9__part=006__n=60.txt | 0 .../topic=9__part=007__n=60.txt | 0 .../topic=9__part=008__n=60.txt | 0 .../topic=9__part=009__n=60.txt | 0 .../topic=9__part=010__n=60.txt | 0 .../topic=9__part=011__n=60.txt | 0 .../topic=9__part=012__n=60.txt | 0 .../topic=9__part=013__n=60.txt | 0 .../topic=9__part=014__n=60.txt | 0 
.../topic=9__part=015__n=60.txt | 0 .../topic=9__part=016__n=60.txt | 0 .../topic=9__part=017__n=60.txt | 0 .../topic=9__part=018__n=60.txt | 0 .../topic=9__part=019__n=60.txt | 0 .../topic=9__part=020__n=44.txt | 0 raft/create_raft_dataset_notebook.py | 619 ------------------ raft/create_raft_tuning_notebook.py | 417 ------------ raft/finetuned/raft_qlora_tourist/train.md | 31 + .../finetuned/raft_qlora_tourist_0.2/train.md | 16 + 170 files changed, 52 insertions(+), 1058 deletions(-) rename raft/{corpus_old => corpus}/topic=22__part=001__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=22__part=002__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=22__part=003__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=22__part=004__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=22__part=005__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=22__part=006__n=3.txt (100%) rename raft/{corpus_old => corpus}/topic=26__part=001__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=26__part=002__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=26__part=003__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=26__part=004__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=26__part=005__n=20.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=001__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=002__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=003__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=004__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=005__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=006__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=007__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=008__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=009__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=010__n=60.txt (100%) rename raft/{corpus_old => 
corpus}/topic=2__part=011__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=012__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=013__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=014__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=015__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=016__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=017__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=018__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=019__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=020__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=021__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=022__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=023__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=024__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=025__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=026__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=027__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=028__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=029__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=030__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=031__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=032__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=033__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=034__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=035__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=036__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=037__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=038__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=039__n=60.txt 
(100%) rename raft/{corpus_old => corpus}/topic=2__part=040__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=041__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=042__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=043__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=044__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=045__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=046__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=047__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=048__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=049__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=050__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=051__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=052__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=053__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=054__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=055__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=056__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=057__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=058__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=059__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=060__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=061__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=062__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=063__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=064__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=065__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=066__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=067__n=60.txt (100%) rename raft/{corpus_old => 
corpus}/topic=2__part=068__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=069__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=2__part=070__n=26.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=001__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=002__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=003__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=004__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=005__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=006__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=007__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=008__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=009__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=010__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=011__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=012__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=013__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=014__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=015__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=016__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=017__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=018__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=019__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=020__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=021__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=022__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=023__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=024__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=025__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=026__n=60.txt 
(100%) rename raft/{corpus_old => corpus}/topic=4__part=027__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=028__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=029__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=030__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=031__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=032__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=033__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=4__part=034__n=41.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=001__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=002__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=003__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=004__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=005__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=006__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=007__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=008__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=009__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=010__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=011__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=012__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=013__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=014__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=015__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=016__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=017__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=018__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=019__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=020__n=60.txt (100%) rename raft/{corpus_old => 
corpus}/topic=5__part=021__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=022__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=023__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=024__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=025__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=026__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=027__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=028__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=5__part=029__n=39.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=001__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=002__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=003__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=004__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=005__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=006__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=007__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=008__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=009__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=010__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=011__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=012__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=013__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=014__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=015__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=016__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=017__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=018__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=019__n=60.txt (100%) rename raft/{corpus_old => corpus}/topic=9__part=020__n=44.txt 
(100%) delete mode 100644 raft/create_raft_dataset_notebook.py delete mode 100644 raft/create_raft_tuning_notebook.py create mode 100644 raft/finetuned/raft_qlora_tourist/train.md create mode 100644 raft/finetuned/raft_qlora_tourist_0.2/train.md diff --git a/hooks/pre-commit b/hooks/pre-commit index ead2310..0028c28 100644 --- a/hooks/pre-commit +++ b/hooks/pre-commit @@ -1,25 +1,6 @@ #!/bin/bash set -e -# Find all staged .ipynb files -NOTEBOOKS=$(git diff --cached --name-only --diff-filter=ACM | grep '\.ipynb$' || true) - -if [ -z "$NOTEBOOKS" ]; then - echo "No Jupyter notebooks staged. Skipping Jupytext conversion." - exit 0 -fi - -echo "Converting staged Jupyter notebooks to .py (percent format)..." - -# Loop through each notebook and convert -for nb in $NOTEBOOKS; do - if [ -f "$nb" ]; then - echo " - Converting $nb" - jupytext --to py:percent "$nb" - pyfile="${nb%.ipynb}.py" - # Stage the generated .py file - git add "$pyfile" - fi -done +./convert_jupytext.sh py echo "✅ Jupytext conversion complete." 
diff --git a/raft/.gitignore b/raft/.gitignore index 1efe02e..7ba4ce4 100644 --- a/raft/.gitignore +++ b/raft/.gitignore @@ -1,3 +1,5 @@ -offload +offload/ finetuned/** -!*.md +!finetuned/ +!finetuned/**/ +!finetuned/**/train.md diff --git a/raft/corpus_old/topic=22__part=001__n=60.txt b/raft/corpus/topic=22__part=001__n=60.txt similarity index 100% rename from raft/corpus_old/topic=22__part=001__n=60.txt rename to raft/corpus/topic=22__part=001__n=60.txt diff --git a/raft/corpus_old/topic=22__part=002__n=60.txt b/raft/corpus/topic=22__part=002__n=60.txt similarity index 100% rename from raft/corpus_old/topic=22__part=002__n=60.txt rename to raft/corpus/topic=22__part=002__n=60.txt diff --git a/raft/corpus_old/topic=22__part=003__n=60.txt b/raft/corpus/topic=22__part=003__n=60.txt similarity index 100% rename from raft/corpus_old/topic=22__part=003__n=60.txt rename to raft/corpus/topic=22__part=003__n=60.txt diff --git a/raft/corpus_old/topic=22__part=004__n=60.txt b/raft/corpus/topic=22__part=004__n=60.txt similarity index 100% rename from raft/corpus_old/topic=22__part=004__n=60.txt rename to raft/corpus/topic=22__part=004__n=60.txt diff --git a/raft/corpus_old/topic=22__part=005__n=60.txt b/raft/corpus/topic=22__part=005__n=60.txt similarity index 100% rename from raft/corpus_old/topic=22__part=005__n=60.txt rename to raft/corpus/topic=22__part=005__n=60.txt diff --git a/raft/corpus_old/topic=22__part=006__n=3.txt b/raft/corpus/topic=22__part=006__n=3.txt similarity index 100% rename from raft/corpus_old/topic=22__part=006__n=3.txt rename to raft/corpus/topic=22__part=006__n=3.txt diff --git a/raft/corpus_old/topic=26__part=001__n=60.txt b/raft/corpus/topic=26__part=001__n=60.txt similarity index 100% rename from raft/corpus_old/topic=26__part=001__n=60.txt rename to raft/corpus/topic=26__part=001__n=60.txt diff --git a/raft/corpus_old/topic=26__part=002__n=60.txt b/raft/corpus/topic=26__part=002__n=60.txt similarity index 100% rename from 
raft/corpus_old/topic=26__part=002__n=60.txt rename to raft/corpus/topic=26__part=002__n=60.txt diff --git a/raft/corpus_old/topic=26__part=003__n=60.txt b/raft/corpus/topic=26__part=003__n=60.txt similarity index 100% rename from raft/corpus_old/topic=26__part=003__n=60.txt rename to raft/corpus/topic=26__part=003__n=60.txt diff --git a/raft/corpus_old/topic=26__part=004__n=60.txt b/raft/corpus/topic=26__part=004__n=60.txt similarity index 100% rename from raft/corpus_old/topic=26__part=004__n=60.txt rename to raft/corpus/topic=26__part=004__n=60.txt diff --git a/raft/corpus_old/topic=26__part=005__n=20.txt b/raft/corpus/topic=26__part=005__n=20.txt similarity index 100% rename from raft/corpus_old/topic=26__part=005__n=20.txt rename to raft/corpus/topic=26__part=005__n=20.txt diff --git a/raft/corpus_old/topic=2__part=001__n=60.txt b/raft/corpus/topic=2__part=001__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=001__n=60.txt rename to raft/corpus/topic=2__part=001__n=60.txt diff --git a/raft/corpus_old/topic=2__part=002__n=60.txt b/raft/corpus/topic=2__part=002__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=002__n=60.txt rename to raft/corpus/topic=2__part=002__n=60.txt diff --git a/raft/corpus_old/topic=2__part=003__n=60.txt b/raft/corpus/topic=2__part=003__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=003__n=60.txt rename to raft/corpus/topic=2__part=003__n=60.txt diff --git a/raft/corpus_old/topic=2__part=004__n=60.txt b/raft/corpus/topic=2__part=004__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=004__n=60.txt rename to raft/corpus/topic=2__part=004__n=60.txt diff --git a/raft/corpus_old/topic=2__part=005__n=60.txt b/raft/corpus/topic=2__part=005__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=005__n=60.txt rename to raft/corpus/topic=2__part=005__n=60.txt diff --git a/raft/corpus_old/topic=2__part=006__n=60.txt 
b/raft/corpus/topic=2__part=006__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=006__n=60.txt rename to raft/corpus/topic=2__part=006__n=60.txt diff --git a/raft/corpus_old/topic=2__part=007__n=60.txt b/raft/corpus/topic=2__part=007__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=007__n=60.txt rename to raft/corpus/topic=2__part=007__n=60.txt diff --git a/raft/corpus_old/topic=2__part=008__n=60.txt b/raft/corpus/topic=2__part=008__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=008__n=60.txt rename to raft/corpus/topic=2__part=008__n=60.txt diff --git a/raft/corpus_old/topic=2__part=009__n=60.txt b/raft/corpus/topic=2__part=009__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=009__n=60.txt rename to raft/corpus/topic=2__part=009__n=60.txt diff --git a/raft/corpus_old/topic=2__part=010__n=60.txt b/raft/corpus/topic=2__part=010__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=010__n=60.txt rename to raft/corpus/topic=2__part=010__n=60.txt diff --git a/raft/corpus_old/topic=2__part=011__n=60.txt b/raft/corpus/topic=2__part=011__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=011__n=60.txt rename to raft/corpus/topic=2__part=011__n=60.txt diff --git a/raft/corpus_old/topic=2__part=012__n=60.txt b/raft/corpus/topic=2__part=012__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=012__n=60.txt rename to raft/corpus/topic=2__part=012__n=60.txt diff --git a/raft/corpus_old/topic=2__part=013__n=60.txt b/raft/corpus/topic=2__part=013__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=013__n=60.txt rename to raft/corpus/topic=2__part=013__n=60.txt diff --git a/raft/corpus_old/topic=2__part=014__n=60.txt b/raft/corpus/topic=2__part=014__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=014__n=60.txt rename to raft/corpus/topic=2__part=014__n=60.txt diff --git 
a/raft/corpus_old/topic=2__part=015__n=60.txt b/raft/corpus/topic=2__part=015__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=015__n=60.txt rename to raft/corpus/topic=2__part=015__n=60.txt diff --git a/raft/corpus_old/topic=2__part=016__n=60.txt b/raft/corpus/topic=2__part=016__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=016__n=60.txt rename to raft/corpus/topic=2__part=016__n=60.txt diff --git a/raft/corpus_old/topic=2__part=017__n=60.txt b/raft/corpus/topic=2__part=017__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=017__n=60.txt rename to raft/corpus/topic=2__part=017__n=60.txt diff --git a/raft/corpus_old/topic=2__part=018__n=60.txt b/raft/corpus/topic=2__part=018__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=018__n=60.txt rename to raft/corpus/topic=2__part=018__n=60.txt diff --git a/raft/corpus_old/topic=2__part=019__n=60.txt b/raft/corpus/topic=2__part=019__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=019__n=60.txt rename to raft/corpus/topic=2__part=019__n=60.txt diff --git a/raft/corpus_old/topic=2__part=020__n=60.txt b/raft/corpus/topic=2__part=020__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=020__n=60.txt rename to raft/corpus/topic=2__part=020__n=60.txt diff --git a/raft/corpus_old/topic=2__part=021__n=60.txt b/raft/corpus/topic=2__part=021__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=021__n=60.txt rename to raft/corpus/topic=2__part=021__n=60.txt diff --git a/raft/corpus_old/topic=2__part=022__n=60.txt b/raft/corpus/topic=2__part=022__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=022__n=60.txt rename to raft/corpus/topic=2__part=022__n=60.txt diff --git a/raft/corpus_old/topic=2__part=023__n=60.txt b/raft/corpus/topic=2__part=023__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=023__n=60.txt rename to 
raft/corpus/topic=2__part=023__n=60.txt diff --git a/raft/corpus_old/topic=2__part=024__n=60.txt b/raft/corpus/topic=2__part=024__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=024__n=60.txt rename to raft/corpus/topic=2__part=024__n=60.txt diff --git a/raft/corpus_old/topic=2__part=025__n=60.txt b/raft/corpus/topic=2__part=025__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=025__n=60.txt rename to raft/corpus/topic=2__part=025__n=60.txt diff --git a/raft/corpus_old/topic=2__part=026__n=60.txt b/raft/corpus/topic=2__part=026__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=026__n=60.txt rename to raft/corpus/topic=2__part=026__n=60.txt diff --git a/raft/corpus_old/topic=2__part=027__n=60.txt b/raft/corpus/topic=2__part=027__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=027__n=60.txt rename to raft/corpus/topic=2__part=027__n=60.txt diff --git a/raft/corpus_old/topic=2__part=028__n=60.txt b/raft/corpus/topic=2__part=028__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=028__n=60.txt rename to raft/corpus/topic=2__part=028__n=60.txt diff --git a/raft/corpus_old/topic=2__part=029__n=60.txt b/raft/corpus/topic=2__part=029__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=029__n=60.txt rename to raft/corpus/topic=2__part=029__n=60.txt diff --git a/raft/corpus_old/topic=2__part=030__n=60.txt b/raft/corpus/topic=2__part=030__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=030__n=60.txt rename to raft/corpus/topic=2__part=030__n=60.txt diff --git a/raft/corpus_old/topic=2__part=031__n=60.txt b/raft/corpus/topic=2__part=031__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=031__n=60.txt rename to raft/corpus/topic=2__part=031__n=60.txt diff --git a/raft/corpus_old/topic=2__part=032__n=60.txt b/raft/corpus/topic=2__part=032__n=60.txt similarity index 100% rename from 
raft/corpus_old/topic=2__part=032__n=60.txt rename to raft/corpus/topic=2__part=032__n=60.txt diff --git a/raft/corpus_old/topic=2__part=033__n=60.txt b/raft/corpus/topic=2__part=033__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=033__n=60.txt rename to raft/corpus/topic=2__part=033__n=60.txt diff --git a/raft/corpus_old/topic=2__part=034__n=60.txt b/raft/corpus/topic=2__part=034__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=034__n=60.txt rename to raft/corpus/topic=2__part=034__n=60.txt diff --git a/raft/corpus_old/topic=2__part=035__n=60.txt b/raft/corpus/topic=2__part=035__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=035__n=60.txt rename to raft/corpus/topic=2__part=035__n=60.txt diff --git a/raft/corpus_old/topic=2__part=036__n=60.txt b/raft/corpus/topic=2__part=036__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=036__n=60.txt rename to raft/corpus/topic=2__part=036__n=60.txt diff --git a/raft/corpus_old/topic=2__part=037__n=60.txt b/raft/corpus/topic=2__part=037__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=037__n=60.txt rename to raft/corpus/topic=2__part=037__n=60.txt diff --git a/raft/corpus_old/topic=2__part=038__n=60.txt b/raft/corpus/topic=2__part=038__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=038__n=60.txt rename to raft/corpus/topic=2__part=038__n=60.txt diff --git a/raft/corpus_old/topic=2__part=039__n=60.txt b/raft/corpus/topic=2__part=039__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=039__n=60.txt rename to raft/corpus/topic=2__part=039__n=60.txt diff --git a/raft/corpus_old/topic=2__part=040__n=60.txt b/raft/corpus/topic=2__part=040__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=040__n=60.txt rename to raft/corpus/topic=2__part=040__n=60.txt diff --git a/raft/corpus_old/topic=2__part=041__n=60.txt 
b/raft/corpus/topic=2__part=041__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=041__n=60.txt rename to raft/corpus/topic=2__part=041__n=60.txt diff --git a/raft/corpus_old/topic=2__part=042__n=60.txt b/raft/corpus/topic=2__part=042__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=042__n=60.txt rename to raft/corpus/topic=2__part=042__n=60.txt diff --git a/raft/corpus_old/topic=2__part=043__n=60.txt b/raft/corpus/topic=2__part=043__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=043__n=60.txt rename to raft/corpus/topic=2__part=043__n=60.txt diff --git a/raft/corpus_old/topic=2__part=044__n=60.txt b/raft/corpus/topic=2__part=044__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=044__n=60.txt rename to raft/corpus/topic=2__part=044__n=60.txt diff --git a/raft/corpus_old/topic=2__part=045__n=60.txt b/raft/corpus/topic=2__part=045__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=045__n=60.txt rename to raft/corpus/topic=2__part=045__n=60.txt diff --git a/raft/corpus_old/topic=2__part=046__n=60.txt b/raft/corpus/topic=2__part=046__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=046__n=60.txt rename to raft/corpus/topic=2__part=046__n=60.txt diff --git a/raft/corpus_old/topic=2__part=047__n=60.txt b/raft/corpus/topic=2__part=047__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=047__n=60.txt rename to raft/corpus/topic=2__part=047__n=60.txt diff --git a/raft/corpus_old/topic=2__part=048__n=60.txt b/raft/corpus/topic=2__part=048__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=048__n=60.txt rename to raft/corpus/topic=2__part=048__n=60.txt diff --git a/raft/corpus_old/topic=2__part=049__n=60.txt b/raft/corpus/topic=2__part=049__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=049__n=60.txt rename to raft/corpus/topic=2__part=049__n=60.txt diff --git 
a/raft/corpus_old/topic=2__part=050__n=60.txt b/raft/corpus/topic=2__part=050__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=050__n=60.txt rename to raft/corpus/topic=2__part=050__n=60.txt diff --git a/raft/corpus_old/topic=2__part=051__n=60.txt b/raft/corpus/topic=2__part=051__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=051__n=60.txt rename to raft/corpus/topic=2__part=051__n=60.txt diff --git a/raft/corpus_old/topic=2__part=052__n=60.txt b/raft/corpus/topic=2__part=052__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=052__n=60.txt rename to raft/corpus/topic=2__part=052__n=60.txt diff --git a/raft/corpus_old/topic=2__part=053__n=60.txt b/raft/corpus/topic=2__part=053__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=053__n=60.txt rename to raft/corpus/topic=2__part=053__n=60.txt diff --git a/raft/corpus_old/topic=2__part=054__n=60.txt b/raft/corpus/topic=2__part=054__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=054__n=60.txt rename to raft/corpus/topic=2__part=054__n=60.txt diff --git a/raft/corpus_old/topic=2__part=055__n=60.txt b/raft/corpus/topic=2__part=055__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=055__n=60.txt rename to raft/corpus/topic=2__part=055__n=60.txt diff --git a/raft/corpus_old/topic=2__part=056__n=60.txt b/raft/corpus/topic=2__part=056__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=056__n=60.txt rename to raft/corpus/topic=2__part=056__n=60.txt diff --git a/raft/corpus_old/topic=2__part=057__n=60.txt b/raft/corpus/topic=2__part=057__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=057__n=60.txt rename to raft/corpus/topic=2__part=057__n=60.txt diff --git a/raft/corpus_old/topic=2__part=058__n=60.txt b/raft/corpus/topic=2__part=058__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=058__n=60.txt rename to 
raft/corpus/topic=2__part=058__n=60.txt diff --git a/raft/corpus_old/topic=2__part=059__n=60.txt b/raft/corpus/topic=2__part=059__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=059__n=60.txt rename to raft/corpus/topic=2__part=059__n=60.txt diff --git a/raft/corpus_old/topic=2__part=060__n=60.txt b/raft/corpus/topic=2__part=060__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=060__n=60.txt rename to raft/corpus/topic=2__part=060__n=60.txt diff --git a/raft/corpus_old/topic=2__part=061__n=60.txt b/raft/corpus/topic=2__part=061__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=061__n=60.txt rename to raft/corpus/topic=2__part=061__n=60.txt diff --git a/raft/corpus_old/topic=2__part=062__n=60.txt b/raft/corpus/topic=2__part=062__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=062__n=60.txt rename to raft/corpus/topic=2__part=062__n=60.txt diff --git a/raft/corpus_old/topic=2__part=063__n=60.txt b/raft/corpus/topic=2__part=063__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=063__n=60.txt rename to raft/corpus/topic=2__part=063__n=60.txt diff --git a/raft/corpus_old/topic=2__part=064__n=60.txt b/raft/corpus/topic=2__part=064__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=064__n=60.txt rename to raft/corpus/topic=2__part=064__n=60.txt diff --git a/raft/corpus_old/topic=2__part=065__n=60.txt b/raft/corpus/topic=2__part=065__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=065__n=60.txt rename to raft/corpus/topic=2__part=065__n=60.txt diff --git a/raft/corpus_old/topic=2__part=066__n=60.txt b/raft/corpus/topic=2__part=066__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=066__n=60.txt rename to raft/corpus/topic=2__part=066__n=60.txt diff --git a/raft/corpus_old/topic=2__part=067__n=60.txt b/raft/corpus/topic=2__part=067__n=60.txt similarity index 100% rename from 
raft/corpus_old/topic=2__part=067__n=60.txt rename to raft/corpus/topic=2__part=067__n=60.txt diff --git a/raft/corpus_old/topic=2__part=068__n=60.txt b/raft/corpus/topic=2__part=068__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=068__n=60.txt rename to raft/corpus/topic=2__part=068__n=60.txt diff --git a/raft/corpus_old/topic=2__part=069__n=60.txt b/raft/corpus/topic=2__part=069__n=60.txt similarity index 100% rename from raft/corpus_old/topic=2__part=069__n=60.txt rename to raft/corpus/topic=2__part=069__n=60.txt diff --git a/raft/corpus_old/topic=2__part=070__n=26.txt b/raft/corpus/topic=2__part=070__n=26.txt similarity index 100% rename from raft/corpus_old/topic=2__part=070__n=26.txt rename to raft/corpus/topic=2__part=070__n=26.txt diff --git a/raft/corpus_old/topic=4__part=001__n=60.txt b/raft/corpus/topic=4__part=001__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=001__n=60.txt rename to raft/corpus/topic=4__part=001__n=60.txt diff --git a/raft/corpus_old/topic=4__part=002__n=60.txt b/raft/corpus/topic=4__part=002__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=002__n=60.txt rename to raft/corpus/topic=4__part=002__n=60.txt diff --git a/raft/corpus_old/topic=4__part=003__n=60.txt b/raft/corpus/topic=4__part=003__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=003__n=60.txt rename to raft/corpus/topic=4__part=003__n=60.txt diff --git a/raft/corpus_old/topic=4__part=004__n=60.txt b/raft/corpus/topic=4__part=004__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=004__n=60.txt rename to raft/corpus/topic=4__part=004__n=60.txt diff --git a/raft/corpus_old/topic=4__part=005__n=60.txt b/raft/corpus/topic=4__part=005__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=005__n=60.txt rename to raft/corpus/topic=4__part=005__n=60.txt diff --git a/raft/corpus_old/topic=4__part=006__n=60.txt 
b/raft/corpus/topic=4__part=006__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=006__n=60.txt rename to raft/corpus/topic=4__part=006__n=60.txt diff --git a/raft/corpus_old/topic=4__part=007__n=60.txt b/raft/corpus/topic=4__part=007__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=007__n=60.txt rename to raft/corpus/topic=4__part=007__n=60.txt diff --git a/raft/corpus_old/topic=4__part=008__n=60.txt b/raft/corpus/topic=4__part=008__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=008__n=60.txt rename to raft/corpus/topic=4__part=008__n=60.txt diff --git a/raft/corpus_old/topic=4__part=009__n=60.txt b/raft/corpus/topic=4__part=009__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=009__n=60.txt rename to raft/corpus/topic=4__part=009__n=60.txt diff --git a/raft/corpus_old/topic=4__part=010__n=60.txt b/raft/corpus/topic=4__part=010__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=010__n=60.txt rename to raft/corpus/topic=4__part=010__n=60.txt diff --git a/raft/corpus_old/topic=4__part=011__n=60.txt b/raft/corpus/topic=4__part=011__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=011__n=60.txt rename to raft/corpus/topic=4__part=011__n=60.txt diff --git a/raft/corpus_old/topic=4__part=012__n=60.txt b/raft/corpus/topic=4__part=012__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=012__n=60.txt rename to raft/corpus/topic=4__part=012__n=60.txt diff --git a/raft/corpus_old/topic=4__part=013__n=60.txt b/raft/corpus/topic=4__part=013__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=013__n=60.txt rename to raft/corpus/topic=4__part=013__n=60.txt diff --git a/raft/corpus_old/topic=4__part=014__n=60.txt b/raft/corpus/topic=4__part=014__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=014__n=60.txt rename to raft/corpus/topic=4__part=014__n=60.txt diff --git 
a/raft/corpus_old/topic=4__part=015__n=60.txt b/raft/corpus/topic=4__part=015__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=015__n=60.txt rename to raft/corpus/topic=4__part=015__n=60.txt diff --git a/raft/corpus_old/topic=4__part=016__n=60.txt b/raft/corpus/topic=4__part=016__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=016__n=60.txt rename to raft/corpus/topic=4__part=016__n=60.txt diff --git a/raft/corpus_old/topic=4__part=017__n=60.txt b/raft/corpus/topic=4__part=017__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=017__n=60.txt rename to raft/corpus/topic=4__part=017__n=60.txt diff --git a/raft/corpus_old/topic=4__part=018__n=60.txt b/raft/corpus/topic=4__part=018__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=018__n=60.txt rename to raft/corpus/topic=4__part=018__n=60.txt diff --git a/raft/corpus_old/topic=4__part=019__n=60.txt b/raft/corpus/topic=4__part=019__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=019__n=60.txt rename to raft/corpus/topic=4__part=019__n=60.txt diff --git a/raft/corpus_old/topic=4__part=020__n=60.txt b/raft/corpus/topic=4__part=020__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=020__n=60.txt rename to raft/corpus/topic=4__part=020__n=60.txt diff --git a/raft/corpus_old/topic=4__part=021__n=60.txt b/raft/corpus/topic=4__part=021__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=021__n=60.txt rename to raft/corpus/topic=4__part=021__n=60.txt diff --git a/raft/corpus_old/topic=4__part=022__n=60.txt b/raft/corpus/topic=4__part=022__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=022__n=60.txt rename to raft/corpus/topic=4__part=022__n=60.txt diff --git a/raft/corpus_old/topic=4__part=023__n=60.txt b/raft/corpus/topic=4__part=023__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=023__n=60.txt rename to 
raft/corpus/topic=4__part=023__n=60.txt diff --git a/raft/corpus_old/topic=4__part=024__n=60.txt b/raft/corpus/topic=4__part=024__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=024__n=60.txt rename to raft/corpus/topic=4__part=024__n=60.txt diff --git a/raft/corpus_old/topic=4__part=025__n=60.txt b/raft/corpus/topic=4__part=025__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=025__n=60.txt rename to raft/corpus/topic=4__part=025__n=60.txt diff --git a/raft/corpus_old/topic=4__part=026__n=60.txt b/raft/corpus/topic=4__part=026__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=026__n=60.txt rename to raft/corpus/topic=4__part=026__n=60.txt diff --git a/raft/corpus_old/topic=4__part=027__n=60.txt b/raft/corpus/topic=4__part=027__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=027__n=60.txt rename to raft/corpus/topic=4__part=027__n=60.txt diff --git a/raft/corpus_old/topic=4__part=028__n=60.txt b/raft/corpus/topic=4__part=028__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=028__n=60.txt rename to raft/corpus/topic=4__part=028__n=60.txt diff --git a/raft/corpus_old/topic=4__part=029__n=60.txt b/raft/corpus/topic=4__part=029__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=029__n=60.txt rename to raft/corpus/topic=4__part=029__n=60.txt diff --git a/raft/corpus_old/topic=4__part=030__n=60.txt b/raft/corpus/topic=4__part=030__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=030__n=60.txt rename to raft/corpus/topic=4__part=030__n=60.txt diff --git a/raft/corpus_old/topic=4__part=031__n=60.txt b/raft/corpus/topic=4__part=031__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=031__n=60.txt rename to raft/corpus/topic=4__part=031__n=60.txt diff --git a/raft/corpus_old/topic=4__part=032__n=60.txt b/raft/corpus/topic=4__part=032__n=60.txt similarity index 100% rename from 
raft/corpus_old/topic=4__part=032__n=60.txt rename to raft/corpus/topic=4__part=032__n=60.txt diff --git a/raft/corpus_old/topic=4__part=033__n=60.txt b/raft/corpus/topic=4__part=033__n=60.txt similarity index 100% rename from raft/corpus_old/topic=4__part=033__n=60.txt rename to raft/corpus/topic=4__part=033__n=60.txt diff --git a/raft/corpus_old/topic=4__part=034__n=41.txt b/raft/corpus/topic=4__part=034__n=41.txt similarity index 100% rename from raft/corpus_old/topic=4__part=034__n=41.txt rename to raft/corpus/topic=4__part=034__n=41.txt diff --git a/raft/corpus_old/topic=5__part=001__n=60.txt b/raft/corpus/topic=5__part=001__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=001__n=60.txt rename to raft/corpus/topic=5__part=001__n=60.txt diff --git a/raft/corpus_old/topic=5__part=002__n=60.txt b/raft/corpus/topic=5__part=002__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=002__n=60.txt rename to raft/corpus/topic=5__part=002__n=60.txt diff --git a/raft/corpus_old/topic=5__part=003__n=60.txt b/raft/corpus/topic=5__part=003__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=003__n=60.txt rename to raft/corpus/topic=5__part=003__n=60.txt diff --git a/raft/corpus_old/topic=5__part=004__n=60.txt b/raft/corpus/topic=5__part=004__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=004__n=60.txt rename to raft/corpus/topic=5__part=004__n=60.txt diff --git a/raft/corpus_old/topic=5__part=005__n=60.txt b/raft/corpus/topic=5__part=005__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=005__n=60.txt rename to raft/corpus/topic=5__part=005__n=60.txt diff --git a/raft/corpus_old/topic=5__part=006__n=60.txt b/raft/corpus/topic=5__part=006__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=006__n=60.txt rename to raft/corpus/topic=5__part=006__n=60.txt diff --git a/raft/corpus_old/topic=5__part=007__n=60.txt 
b/raft/corpus/topic=5__part=007__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=007__n=60.txt rename to raft/corpus/topic=5__part=007__n=60.txt diff --git a/raft/corpus_old/topic=5__part=008__n=60.txt b/raft/corpus/topic=5__part=008__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=008__n=60.txt rename to raft/corpus/topic=5__part=008__n=60.txt diff --git a/raft/corpus_old/topic=5__part=009__n=60.txt b/raft/corpus/topic=5__part=009__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=009__n=60.txt rename to raft/corpus/topic=5__part=009__n=60.txt diff --git a/raft/corpus_old/topic=5__part=010__n=60.txt b/raft/corpus/topic=5__part=010__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=010__n=60.txt rename to raft/corpus/topic=5__part=010__n=60.txt diff --git a/raft/corpus_old/topic=5__part=011__n=60.txt b/raft/corpus/topic=5__part=011__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=011__n=60.txt rename to raft/corpus/topic=5__part=011__n=60.txt diff --git a/raft/corpus_old/topic=5__part=012__n=60.txt b/raft/corpus/topic=5__part=012__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=012__n=60.txt rename to raft/corpus/topic=5__part=012__n=60.txt diff --git a/raft/corpus_old/topic=5__part=013__n=60.txt b/raft/corpus/topic=5__part=013__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=013__n=60.txt rename to raft/corpus/topic=5__part=013__n=60.txt diff --git a/raft/corpus_old/topic=5__part=014__n=60.txt b/raft/corpus/topic=5__part=014__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=014__n=60.txt rename to raft/corpus/topic=5__part=014__n=60.txt diff --git a/raft/corpus_old/topic=5__part=015__n=60.txt b/raft/corpus/topic=5__part=015__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=015__n=60.txt rename to raft/corpus/topic=5__part=015__n=60.txt diff --git 
a/raft/corpus_old/topic=5__part=016__n=60.txt b/raft/corpus/topic=5__part=016__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=016__n=60.txt rename to raft/corpus/topic=5__part=016__n=60.txt diff --git a/raft/corpus_old/topic=5__part=017__n=60.txt b/raft/corpus/topic=5__part=017__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=017__n=60.txt rename to raft/corpus/topic=5__part=017__n=60.txt diff --git a/raft/corpus_old/topic=5__part=018__n=60.txt b/raft/corpus/topic=5__part=018__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=018__n=60.txt rename to raft/corpus/topic=5__part=018__n=60.txt diff --git a/raft/corpus_old/topic=5__part=019__n=60.txt b/raft/corpus/topic=5__part=019__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=019__n=60.txt rename to raft/corpus/topic=5__part=019__n=60.txt diff --git a/raft/corpus_old/topic=5__part=020__n=60.txt b/raft/corpus/topic=5__part=020__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=020__n=60.txt rename to raft/corpus/topic=5__part=020__n=60.txt diff --git a/raft/corpus_old/topic=5__part=021__n=60.txt b/raft/corpus/topic=5__part=021__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=021__n=60.txt rename to raft/corpus/topic=5__part=021__n=60.txt diff --git a/raft/corpus_old/topic=5__part=022__n=60.txt b/raft/corpus/topic=5__part=022__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=022__n=60.txt rename to raft/corpus/topic=5__part=022__n=60.txt diff --git a/raft/corpus_old/topic=5__part=023__n=60.txt b/raft/corpus/topic=5__part=023__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=023__n=60.txt rename to raft/corpus/topic=5__part=023__n=60.txt diff --git a/raft/corpus_old/topic=5__part=024__n=60.txt b/raft/corpus/topic=5__part=024__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=024__n=60.txt rename to 
raft/corpus/topic=5__part=024__n=60.txt diff --git a/raft/corpus_old/topic=5__part=025__n=60.txt b/raft/corpus/topic=5__part=025__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=025__n=60.txt rename to raft/corpus/topic=5__part=025__n=60.txt diff --git a/raft/corpus_old/topic=5__part=026__n=60.txt b/raft/corpus/topic=5__part=026__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=026__n=60.txt rename to raft/corpus/topic=5__part=026__n=60.txt diff --git a/raft/corpus_old/topic=5__part=027__n=60.txt b/raft/corpus/topic=5__part=027__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=027__n=60.txt rename to raft/corpus/topic=5__part=027__n=60.txt diff --git a/raft/corpus_old/topic=5__part=028__n=60.txt b/raft/corpus/topic=5__part=028__n=60.txt similarity index 100% rename from raft/corpus_old/topic=5__part=028__n=60.txt rename to raft/corpus/topic=5__part=028__n=60.txt diff --git a/raft/corpus_old/topic=5__part=029__n=39.txt b/raft/corpus/topic=5__part=029__n=39.txt similarity index 100% rename from raft/corpus_old/topic=5__part=029__n=39.txt rename to raft/corpus/topic=5__part=029__n=39.txt diff --git a/raft/corpus_old/topic=9__part=001__n=60.txt b/raft/corpus/topic=9__part=001__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=001__n=60.txt rename to raft/corpus/topic=9__part=001__n=60.txt diff --git a/raft/corpus_old/topic=9__part=002__n=60.txt b/raft/corpus/topic=9__part=002__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=002__n=60.txt rename to raft/corpus/topic=9__part=002__n=60.txt diff --git a/raft/corpus_old/topic=9__part=003__n=60.txt b/raft/corpus/topic=9__part=003__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=003__n=60.txt rename to raft/corpus/topic=9__part=003__n=60.txt diff --git a/raft/corpus_old/topic=9__part=004__n=60.txt b/raft/corpus/topic=9__part=004__n=60.txt similarity index 100% rename from 
raft/corpus_old/topic=9__part=004__n=60.txt rename to raft/corpus/topic=9__part=004__n=60.txt diff --git a/raft/corpus_old/topic=9__part=005__n=60.txt b/raft/corpus/topic=9__part=005__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=005__n=60.txt rename to raft/corpus/topic=9__part=005__n=60.txt diff --git a/raft/corpus_old/topic=9__part=006__n=60.txt b/raft/corpus/topic=9__part=006__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=006__n=60.txt rename to raft/corpus/topic=9__part=006__n=60.txt diff --git a/raft/corpus_old/topic=9__part=007__n=60.txt b/raft/corpus/topic=9__part=007__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=007__n=60.txt rename to raft/corpus/topic=9__part=007__n=60.txt diff --git a/raft/corpus_old/topic=9__part=008__n=60.txt b/raft/corpus/topic=9__part=008__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=008__n=60.txt rename to raft/corpus/topic=9__part=008__n=60.txt diff --git a/raft/corpus_old/topic=9__part=009__n=60.txt b/raft/corpus/topic=9__part=009__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=009__n=60.txt rename to raft/corpus/topic=9__part=009__n=60.txt diff --git a/raft/corpus_old/topic=9__part=010__n=60.txt b/raft/corpus/topic=9__part=010__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=010__n=60.txt rename to raft/corpus/topic=9__part=010__n=60.txt diff --git a/raft/corpus_old/topic=9__part=011__n=60.txt b/raft/corpus/topic=9__part=011__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=011__n=60.txt rename to raft/corpus/topic=9__part=011__n=60.txt diff --git a/raft/corpus_old/topic=9__part=012__n=60.txt b/raft/corpus/topic=9__part=012__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=012__n=60.txt rename to raft/corpus/topic=9__part=012__n=60.txt diff --git a/raft/corpus_old/topic=9__part=013__n=60.txt 
b/raft/corpus/topic=9__part=013__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=013__n=60.txt rename to raft/corpus/topic=9__part=013__n=60.txt diff --git a/raft/corpus_old/topic=9__part=014__n=60.txt b/raft/corpus/topic=9__part=014__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=014__n=60.txt rename to raft/corpus/topic=9__part=014__n=60.txt diff --git a/raft/corpus_old/topic=9__part=015__n=60.txt b/raft/corpus/topic=9__part=015__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=015__n=60.txt rename to raft/corpus/topic=9__part=015__n=60.txt diff --git a/raft/corpus_old/topic=9__part=016__n=60.txt b/raft/corpus/topic=9__part=016__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=016__n=60.txt rename to raft/corpus/topic=9__part=016__n=60.txt diff --git a/raft/corpus_old/topic=9__part=017__n=60.txt b/raft/corpus/topic=9__part=017__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=017__n=60.txt rename to raft/corpus/topic=9__part=017__n=60.txt diff --git a/raft/corpus_old/topic=9__part=018__n=60.txt b/raft/corpus/topic=9__part=018__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=018__n=60.txt rename to raft/corpus/topic=9__part=018__n=60.txt diff --git a/raft/corpus_old/topic=9__part=019__n=60.txt b/raft/corpus/topic=9__part=019__n=60.txt similarity index 100% rename from raft/corpus_old/topic=9__part=019__n=60.txt rename to raft/corpus/topic=9__part=019__n=60.txt diff --git a/raft/corpus_old/topic=9__part=020__n=44.txt b/raft/corpus/topic=9__part=020__n=44.txt similarity index 100% rename from raft/corpus_old/topic=9__part=020__n=44.txt rename to raft/corpus/topic=9__part=020__n=44.txt diff --git a/raft/create_raft_dataset_notebook.py b/raft/create_raft_dataset_notebook.py deleted file mode 100644 index 7d29d20..0000000 --- a/raft/create_raft_dataset_notebook.py +++ /dev/null @@ -1,619 +0,0 @@ -# This script 
programmatically creates a Jupyter Notebook tailored for -# "Retrieval-Augmented Fine-Tuning (RAFT) dataset generation using local Ollama". -# It saves the notebook to /mnt/data/ so you can download and run it. - -import json -from datetime import datetime -import nbformat as nbf - -nb = nbf.v4.new_notebook() -nb["metadata"].update( - { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3", - }, - "language_info": {"name": "python", "version": "3.x"}, - } -) - -cells = [] - -# Title / Overview -cells.append( - nbf.v4.new_markdown_cell( - """ -# Retrieval-Augmented **Fine-Tuning** (RAFT) Dataset Generation with **Local Ollama** - -This notebook builds a **supervised fine-tuning dataset** (JSONL) for *retrieval-augmented* tasks, by: -1. **Ingesting** your local corpus (Markdown, text, HTML; PDFs optional with extra deps). -2. **Chunking** and **embedding** documents using Ollama's local **embedding model** (e.g., `nomic-embed-text`, `mxbai-embed-large`). -3. Building a **lightweight vector index** (FAISS). -4. **Sampling contexts** and using a local **generation model** via Ollama (e.g., `llama3.1`, `qwen2`, `phi3`) to synthesize **grounded Q&A** or instruction–response pairs. -5. Emitting a **RAFT-style JSONL** for supervised training (e.g., `input`, `output`, `meta` with source citations). - -> **Requirements** -> -> - Local [Ollama](https://ollama.com/) running at `http://localhost:11434` -> - At least one **embedding** model pulled (e.g., `ollama pull nomic-embed-text`) -> - At least one **generation** model pulled (e.g., `ollama pull llama3.1`) -> -> You can adapt the prompts and schema for your specific downstream trainer (Llama.cpp, vLLM, Axolotl, mlx, etc.). -""" - ) -) - -# Setup -cells.append( - nbf.v4.new_markdown_cell( - """ -## 0) Setup - -Install Python dependencies. If you're offline, pre-install or remove what you don't need. 
-""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -# If needed, uncomment: -# %pip install --quiet requests faiss-cpu rich markdownify python-frontmatter pypdf regex -# Optional extras: -# %pip install --quiet tiktoken beautifulsoup4 lxml -""" - ) -) - -# Config -cells.append( - nbf.v4.new_markdown_cell( - """ -## 1) Configuration - -Set paths, models, and chunking/index params. -""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -from dataclasses import dataclass, asdict -from pathlib import Path -from typing import List, Dict, Any, Optional, Tuple -import os, re, json, uuid, math, glob, random, time -import hashlib -import requests -from rich import print -import regex -import numpy as np - -# ---- Core config ---- -DATA_DIR = Path("./corpus") # Put your source docs here -OUTPUT_DIR = Path("./outputs") # Where artifacts are saved -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - -# Ollama endpoints & models -OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") -EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text") -GEN_MODEL = os.environ.get("GEN_MODEL", "llama3.1") - -# Chunking -CHUNK_SIZE = 1200 # characters -CHUNK_OVERLAP = 200 # characters -MIN_CHARS = 200 # minimum viable chunk length - -# Index -USE_FAISS = True -TOP_K = 4 - -# RAFT generation -SEED = 7 -SAMPLES_PER_DOC = 4 -MAX_TOKENS_GEN = 512 # Generation max tokens (approx; Ollama supports 'num_predict') -TEMPERATURE = 0.6 - -random.seed(SEED) -np.random.seed(SEED) - -print({ - "DATA_DIR": str(DATA_DIR.resolve()), - "OUTPUT_DIR": str(OUTPUT_DIR.resolve()), - "OLLAMA_URL": OLLAMA_URL, - "EMBED_MODEL": EMBED_MODEL, - "GEN_MODEL": GEN_MODEL, - "CHUNK_SIZE": CHUNK_SIZE, - "CHUNK_OVERLAP": CHUNK_OVERLAP, - "TOP_K": TOP_K, - "SAMPLES_PER_DOC": SAMPLES_PER_DOC -}) -""" - ) -) - -# Helpers: loaders -cells.append( - nbf.v4.new_markdown_cell( - """ -## 2) Load & Normalize Documents - -Basic loaders for `.md`, `.txt`, `.html`. PDF support is optional (requires `pypdf`). 
You can extend as needed. -""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -from bs4 import BeautifulSoup # if you didn't install bs4, comment HTML support below -try: - import frontmatter -except Exception: - frontmatter = None - -def read_text_file(p: Path) -> str: - return p.read_text(encoding="utf-8", errors="ignore") - -def read_markdown(p: Path) -> str: - text = p.read_text(encoding="utf-8", errors="ignore") - # Optional: strip YAML frontmatter - if frontmatter: - try: - fm = frontmatter.loads(text) - return fm.content - except Exception: - return text - return text - -def read_html(p: Path) -> str: - html = p.read_text(encoding="utf-8", errors="ignore") - soup = BeautifulSoup(html, "lxml") - # Remove script/style - for tag in soup(["script", "style", "noscript"]): - tag.decompose() - text = soup.get_text(" ", strip=True) - return text - -def read_pdf(p: Path) -> str: - try: - from pypdf import PdfReader - except Exception as e: - print("[yellow]Install pypdf to enable PDF parsing: %pip install pypdf[/yellow]") - raise e - reader = PdfReader(str(p)) - parts = [] - for page in reader.pages: - try: - parts.append(page.extract_text() or "") - except Exception: - parts.append("") - return "\\n".join(parts) - -SUPPORTED_EXTS = {".txt": read_text_file, ".md": read_markdown, ".markdown": read_markdown, - ".html": read_html, ".htm": read_html, ".pdf": read_pdf} - -def load_corpus(data_dir: Path) -> Dict[str, str]: - docs = {} - for p in data_dir.rglob("*"): - if not p.is_file(): - continue - fn = p.suffix.lower() - if fn in SUPPORTED_EXTS: - try: - docs[str(p)] = SUPPORTED_EXTS[fn](p) - except Exception as e: - print(f"[red]Failed to read {p}: {e}[/red]") - print(f"[green]Loaded {len(docs)} documents[/green]") - return docs - -docs = load_corpus(DATA_DIR) -len(docs) -""" - ) -) - -# Chunking -cells.append( - nbf.v4.new_markdown_cell( - """ -## 3) Chunking - -Simple character-based chunker with overlap. Swap in a token-based chunker if you prefer. 
-""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -@dataclass -class Chunk: - id: str - doc_path: str - start: int - end: int - text: str - sha1: str - -def chunk_text(text: str, doc_path: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[Chunk]: - chunks: List[Chunk] = [] - i = 0 - n = len(text) - while i < n: - j = min(i + chunk_size, n) - piece = text[i:j].strip() - if len(piece) >= MIN_CHARS: - sha1 = hashlib.sha1(piece.encode("utf-8")).hexdigest() - chunks.append(Chunk( - id=str(uuid.uuid4()), - doc_path=doc_path, - start=i, end=j, - text=piece, - sha1=sha1 - )) - if j == n: - break - i = j - overlap - if i < 0: - i = 0 - if i >= n: - break - return chunks - -all_chunks: List[Chunk] = [] -for path, text in docs.items(): - all_chunks.extend(chunk_text(text, path)) - -print(f"[green]Total chunks: {len(all_chunks)}[/green]") -""" - ) -) - -# Embeddings via Ollama -cells.append( - nbf.v4.new_markdown_cell( - """ -## 4) Embeddings via Ollama - -Uses Ollama's `POST /api/embeddings` endpoint with your selected embedding model. -Make sure you've pulled it locally: `ollama pull nomic-embed-text` (or your chosen model). -""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -EMBED_ENDPOINT = f"{OLLAMA_URL}/api/embeddings" - -def embed_texts(texts: List[str], model: str = EMBED_MODEL, batch_size: int = 32) -> np.ndarray: - vectors = [] - for i in range(0, len(texts), batch_size): - batch = texts[i:i+batch_size] - # Ollama supports a single prompt or list? We'll call one by one to be safe with large content. 
- for t in batch: - r = requests.post(EMBED_ENDPOINT, json={"model": model, "prompt": t}) - r.raise_for_status() - data = r.json() - vec = np.array(data["embedding"], dtype=np.float32) - vectors.append(vec) - return np.vstack(vectors) if vectors else np.zeros((0, 768), dtype=np.float32) - -chunk_texts = [c.text for c in all_chunks] -emb_matrix = embed_texts(chunk_texts, model=EMBED_MODEL, batch_size=8) -emb_matrix.shape -""" - ) -) - -# Build FAISS -cells.append( - nbf.v4.new_markdown_cell( - """ -## 5) Build Vector Index (FAISS) - -We normalize vectors and use inner product (equivalent to cosine on normalized vectors). -""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -def normalize_rows(x: np.ndarray) -> np.ndarray: - norms = np.linalg.norm(x, axis=1, keepdims=True) + 1e-12 - return x / norms - -if USE_FAISS: - import faiss - xb = normalize_rows(emb_matrix).astype(np.float32) - d = xb.shape[1] - index = faiss.IndexFlatIP(d) - index.add(xb) - print("[green]FAISS index built:[/green]", index.ntotal, "vectors") -else: - index = None - xb = normalize_rows(emb_matrix).astype(np.float32) -""" - ) -) - -# Retrieval helper -cells.append( - nbf.v4.new_markdown_cell( - """ -## 6) Retrieval Helper -""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -def search(query: str, top_k: int = TOP_K) -> List[Tuple[int, float]]: - # Embed the query - qv = embed_texts([query], model=EMBED_MODEL, batch_size=1) - qv = normalize_rows(qv).astype(np.float32) - if USE_FAISS and index is not None: - D, I = index.search(qv, top_k) - hits = list(zip(I[0].tolist(), D[0].tolist())) - else: - sims = (xb @ qv.T).ravel() - I = np.argsort(-sims)[:top_k] - hits = [(int(i), float(sims[i])) for i in I] - return hits - -# quick smoke test (no error means it's wired up) -# print(search("What does this corpus talk about?", 3)) -""" - ) -) - -# Generation via Ollama -cells.append( - nbf.v4.new_markdown_cell( - """ -## 7) Synthesize Grounded Q&A / Instructions with Ollama - -We sample chunks, 
retrieve neighbors for richer context, and prompt a local LLM to create **high-quality** pairs. -""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -GEN_ENDPOINT = f"{OLLAMA_URL}/api/generate" - -SYSTEM_PROMPT = ( - "You are a careful dataset writer. Given only the provided CONTEXT, craft high-quality, factual " - "question–answer pairs for supervised fine-tuning. Answers must be grounded strictly in the context. " - "If the context lacks the answer, say 'INSUFFICIENT_CONTEXT'. Focus on clarity, specificity, and avoid hallucinations." -) - -USER_PROMPT_TEMPLATE = ( - "CONTEXT:\\n\\n{context}\\n\\n" - "Task: Produce {n} diverse Q&A pairs about the content above. " - "Use JSON lines (one JSON object per line) with keys: 'input' (question/instruction), 'output' (concise grounded answer), " - "'meta' (object with 'source_path', 'chunk_ids', and optional 'citations': list of quotes). " - "Do NOT include markdown; output JSON objects only." -) - -def ollama_generate(prompt: str, model: str = GEN_MODEL, temperature: float = TEMPERATURE, num_predict: int = MAX_TOKENS_GEN) -> str: - payload = { - "model": model, - "prompt": prompt, - "system": SYSTEM_PROMPT, - "options": { - "temperature": temperature, - "num_predict": num_predict - }, - "stream": False - } - r = requests.post(GEN_ENDPOINT, json=payload) - r.raise_for_status() - data = r.json() - return data.get("response", "") - -def build_context(primary_idx: int, k: int = TOP_K) -> Tuple[str, List[str]]: - primary_chunk = all_chunks[primary_idx] - query = primary_chunk.text[:400] # use the start of the chunk as a pseudo-query - hits = search(query, k) - pieces, ids = [], [] - for i, score in hits: - ch = all_chunks[i] - ids.append(ch.id) - pieces.append(f"[{Path(ch.doc_path).name}::{ch.start}-{ch.end}]\\n{ch.text}") - return "\\n\\n---\\n\\n".join(pieces), ids - -def parse_llm_jsonl(text: str) -> List[Dict[str, Any]]: - rows = [] - for line in text.splitlines(): - line = line.strip() - if not line: - continue - # be 
forgiving for trailing commas etc. - try: - obj = json.loads(line) - if isinstance(obj, dict): - rows.append(obj) - except Exception: - # try to salvage with regex for JSON-ish - try: - fixed = regex.sub(r",\\s*}", "}", line) - fixed = regex.sub(r",\\s*]", "]", fixed) - obj = json.loads(fixed) - if isinstance(obj, dict): - rows.append(obj) - except Exception: - pass - return rows -""" - ) -) - -# Sampling and synthesis loop -cells.append( - nbf.v4.new_markdown_cell( - """ -## 8) Generate the RAFT Dataset - -This step iterates over documents, samples chunks, retrieves neighbors, and asks the model to produce JSONL rows. -""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -def synthesize_dataset(samples_per_doc: int = SAMPLES_PER_DOC, out_path: Path = OUTPUT_DIR / "raft_dataset.jsonl") -> Path: - rng = random.Random(SEED) - doc_to_chunk_idx = {} - for i, ch in enumerate(all_chunks): - doc_to_chunk_idx.setdefault(ch.doc_path, []).append(i) - - total_target = 0 - with out_path.open("w", encoding="utf-8") as f: - for doc_path, idxs in doc_to_chunk_idx.items(): - if not idxs: - continue - chosen = rng.sample(idxs, min(samples_per_doc, len(idxs))) - for pi in chosen: - ctx, ids = build_context(pi, k=TOP_K) - user = USER_PROMPT_TEMPLATE.format(context=ctx, n=3) - raw = ollama_generate(user, model=GEN_MODEL, temperature=TEMPERATURE, num_predict=MAX_TOKENS_GEN) - rows = parse_llm_jsonl(raw) - for r in rows: - # enforce schema & enrich meta - inp = r.get("input") or r.get("question") or r.get("query") - out = r.get("output") or r.get("answer") or r.get("response") - meta = r.get("meta") or {} - if not isinstance(meta, dict): - meta = {} - meta.update({ - "source_path": str(doc_path), - "chunk_ids": ids, - "generated_at": datetime.utcnow().isoformat() + "Z", - "model": GEN_MODEL, - "embed_model": EMBED_MODEL - }) - if inp and out: - obj = {"input": inp, "output": out, "meta": meta} - f.write(json.dumps(obj, ensure_ascii=False) + "\\n") - total_target += 1 - 
print(f"[green]Wrote {total_target} rows -> {out_path}[/green]") - return out_path - -OUT_JSONL = synthesize_dataset(samples_per_doc=SAMPLES_PER_DOC) -OUT_JSONL -""" - ) -) - -# Sanity check / preview -cells.append( - nbf.v4.new_markdown_cell( - """ -## 9) Preview Samples -""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -from itertools import islice - -def head_jsonl(p: Path, n: int = 5): - with p.open("r", encoding="utf-8") as f: - for line in islice(f, n): - print(line.rstrip()) - -head_jsonl(OUT_JSONL, 5) -""" - ) -) - -# Optional: small eval -cells.append( - nbf.v4.new_markdown_cell( - """ -## 10) Optional: Spot-Check Generation Quality - -Run a tiny evaluation by asking the model with and without retrieval and compare answers. -""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -EVAL_QUESTIONS = [] - -# Collect inputs from the dataset (first N) -with (OUTPUT_DIR / "raft_dataset.jsonl").open("r", encoding="utf-8") as f: - for i, line in enumerate(f): - try: - obj = json.loads(line) - EVAL_QUESTIONS.append(obj["input"]) - except Exception: - pass - if len(EVAL_QUESTIONS) >= 5: - break - -def rag_answer(q: str, k: int = TOP_K) -> str: - hits = search(q, k) - ctx = "\\n\\n".join([all_chunks[i].text for i,_ in hits]) - user = f"Answer the question using ONLY this context. If missing, say INSUFFICIENT_CONTEXT.\\n\\nCONTEXT:\\n{ctx}\\n\\nQUESTION: {q}" - return ollama_generate(user, model=GEN_MODEL, temperature=0.2, num_predict=256) - -for q in EVAL_QUESTIONS: - print("\\n[bold]Q:[/bold]", q) - ans = rag_answer(q) - print("[bold]A:[/bold]", ans.strip()[:500], "...") -""" - ) -) - -# Save artifacts list -cells.append( - nbf.v4.new_markdown_cell( - """ -## 11) Artifacts - -- `outputs/raft_dataset.jsonl` — your RAFT dataset (input/output/meta per line) -- `corpus/` — your source documents (you provide) -- You can also persist `emb_matrix.npy` and a FAISS index for reuse. 
-""" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -# Optionally persist embeddings and index for later reuse -np.save(OUTPUT_DIR / "emb_matrix.npy", emb_matrix) - -if USE_FAISS: - import faiss - faiss.write_index(index, str(OUTPUT_DIR / "faiss.index")) - print("[green]Saved FAISS index and embeddings.[/green]") -else: - print("[yellow]FAISS disabled; only saved embeddings.[/yellow]") -""" - ) -) - -# Troubleshooting -cells.append( - nbf.v4.new_markdown_cell( - """ -## 12) Troubleshooting - -- **Connection error to Ollama**: ensure `ollama serve` is running and models are pulled (`ollama pull nomic-embed-text`, `ollama pull llama3.1`). -- **Empty dataset**: your corpus may be too small or the parser skipped files. Check `corpus/` content and chunk parameters. -- **Hallucinations**: tighten the system prompt, lower temperature, or increase `TOP_K` and chunk size. -- **JSON parsing issues**: the notebook tries to be forgiving; you can harden `parse_llm_jsonl` per your needs. -- **PDFs**: `pip install pypdf` and try again. 
-""" - ) -) - -# Save the notebook -nb["cells"] = cells - -out_path = "raft_ollama_dataset.ipynb" -with open(out_path, "w", encoding="utf-8") as f: - nbf.write(nb, f) - -out_path diff --git a/raft/create_raft_tuning_notebook.py b/raft/create_raft_tuning_notebook.py deleted file mode 100644 index 03cf697..0000000 --- a/raft/create_raft_tuning_notebook.py +++ /dev/null @@ -1,417 +0,0 @@ -# Re-create the Jupyter Notebook for RAFT QLoRA fine-tuning and save to /mnt/data - -import nbformat as nbf -from pathlib import Path - -nb = nbf.v4.new_notebook() -nb.metadata.update( - { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3", - }, - "language_info": {"name": "python", "version": "3.x"}, - } -) - -cells = [] - -cells.append( - nbf.v4.new_markdown_cell( - """ -# RAFT Supervised Fine-Tuning (QLoRA) — Local Training - -This notebook fine-tunes an open-source base model on a RAFT-style dataset (`input` → `output`) using **QLoRA** with **PEFT** and **Transformers**. It is designed to run locally (single or multi-GPU) and to export both **LoRA adapters** and (optionally) a **merged** model for inference. - -> **Assumptions** -> - Your dataset lives at `./outputs/raft_dataset.jsonl` (from the previous notebook). Adjust the path if needed. -> - You have a CUDA-capable GPU and can install `bitsandbytes`. (CPU training is possible but slow.) -> - You have enough VRAM for the chosen base model when loaded in 4-bit NF4. 
-""" - ) -) - -cells.append(nbf.v4.new_markdown_cell("## 0) Install dependencies")) -cells.append( - nbf.v4.new_code_cell( - """ -# If needed, uncomment the following installs: -# %pip install --quiet transformers==4.44.2 datasets==2.20.0 peft==0.12.0 accelerate==0.34.2 bitsandbytes==0.43.3 evaluate==0.4.2 sentencepiece==0.2.0 -# Optional extras: -# %pip install --quiet trl==0.9.6 sacrebleu==2.4.3 rouge-score==0.1.2 -""" - ) -) - -cells.append(nbf.v4.new_markdown_cell("## 1) Configuration")) -cells.append( - nbf.v4.new_code_cell( - """ -from pathlib import Path - -# Paths -DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different -RUN_NAME = "raft_qlora_run" -OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}") -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - -# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3" -# Prefer an instruction-tuned base for better stability on SFT. -BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3" - -# Tokenization/prompt formatting -SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request." 
-USE_CHAT_TEMPLATE = True # if the tokenizer has a chat template, we'll leverage it - -# QLoRA/PEFT params -LORA_R = 16 -LORA_ALPHA = 32 -LORA_DROPOUT = 0.05 -TARGET_MODULES = None # None = let PEFT auto-detect common modules (works for most models) - -# 4-bit quantization (QLoRA) -LOAD_IN_4BIT = True -BNB_4BIT_COMPUTE_DTYPE = "bfloat16" # "float16" or "bfloat16" -BNB_4BIT_QUANT_TYPE = "nf4" # "nf4" or "fp4" -BNB_4BIT_USE_DOUBLE_QUANT = True - -# Training -TRAIN_VAL_SPLIT = 0.98 -MAX_SEQ_LEN = 2048 -PER_DEVICE_TRAIN_BATCH = 1 -PER_DEVICE_EVAL_BATCH = 1 -GRADIENT_ACCUM_STEPS = 16 -LEARNING_RATE = 2e-4 -NUM_TRAIN_EPOCHS = 2 -WEIGHT_DECAY = 0.0 -WARMUP_RATIO = 0.03 -LR_SCHEDULER_TYPE = "cosine" -LOGGING_STEPS = 10 -EVAL_STEPS = 200 -SAVE_STEPS = 200 -BF16 = True -FP16 = False - -SEED = 7 -""" - ) -) - -cells.append(nbf.v4.new_markdown_cell("## 2) Load dataset (JSONL)")) -cells.append( - nbf.v4.new_code_cell( - """ -import json, random -from datasets import Dataset - -def read_jsonl(p: Path): - rows = [] - with p.open("r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - obj = json.loads(line) - if "input" in obj and "output" in obj: - rows.append(obj) - except Exception: - pass - return rows - -rows = read_jsonl(DATA_JSONL) -print(f"Loaded {len(rows)} rows from {DATA_JSONL}") - -random.Random(SEED).shuffle(rows) -split = int(len(rows) * TRAIN_VAL_SPLIT) -train_rows = rows[:split] -val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows)//50):] - -train_ds = Dataset.from_list(train_rows) -eval_ds = Dataset.from_list(val_rows) if val_rows else None -train_ds, eval_ds -""" - ) -) - -cells.append(nbf.v4.new_markdown_cell("## 3) Prompt formatting")) -cells.append( - nbf.v4.new_code_cell( - """ -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True) -if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - -def format_example(ex): - 
user = ex["input"] - assistant = ex["output"] - - if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"): - messages = [ - {"role": "system", "content": SYSTEM_PREFIX}, - {"role": "user", "content": user}, - {"role": "assistant", "content": assistant}, - ] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) - else: - text = f"[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{user}\\n[/USER]\\n[ASSISTANT]\\n{assistant}" - return {"text": text} - -train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names) -eval_ds_fmt = eval_ds.map(format_example, remove_columns=eval_ds.column_names) if eval_ds else None - -print(train_ds_fmt[0]["text"][:400]) -""" - ) -) - -cells.append(nbf.v4.new_markdown_cell("## 4) Tokenize")) -cells.append( - nbf.v4.new_code_cell( - """ -def tokenize(batch): - return tokenizer( - batch["text"], - truncation=True, - max_length=MAX_SEQ_LEN, - padding="max_length", - return_tensors=None, - ) - -train_tok = train_ds_fmt.map(tokenize, batched=True, remove_columns=train_ds_fmt.column_names) -eval_tok = eval_ds_fmt.map(tokenize, batched=True, remove_columns=eval_ds_fmt.column_names) if eval_ds_fmt else None - -train_tok = train_tok.rename_column("input_ids", "input_ids") -train_tok = train_tok.add_column("labels", train_tok["input_ids"]) -if eval_tok: - eval_tok = eval_tok.add_column("labels", eval_tok["input_ids"]) - -train_tok, (eval_tok[0]['input_ids'][:10] if eval_tok else []) -""" - ) -) - -cells.append( - nbf.v4.new_markdown_cell( - "## 5) Load base model with 4-bit quantization and prepare QLoRA" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -import torch -from transformers import AutoModelForCausalLM, BitsAndBytesConfig -from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training - -bnb_config = None -if LOAD_IN_4BIT: - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=BNB_4BIT_USE_DOUBLE_QUANT, - 
bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE, - bnb_4bit_compute_dtype=getattr(torch, BNB_4BIT_COMPUTE_DTYPE) - ) - -model = AutoModelForCausalLM.from_pretrained( - BASE_MODEL, - quantization_config=bnb_config, - torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None), - device_map="auto", -) - -model = prepare_model_for_kbit_training(model) - -peft_config = LoraConfig( - r=LORA_R, - lora_alpha=LORA_ALPHA, - lora_dropout=LORA_DROPOUT, - bias="none", - task_type="CAUSAL_LM", - target_modules=TARGET_MODULES, -) - -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -""" - ) -) - -cells.append(nbf.v4.new_markdown_cell("## 6) Train")) -cells.append( - nbf.v4.new_code_cell( - """ -from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling -import math - -data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) - -args = TrainingArguments( - output_dir=str(OUTPUT_DIR), - run_name=RUN_NAME, - num_train_epochs=NUM_TRAIN_EPOCHS, - per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH, - per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH, - gradient_accumulation_steps=GRADIENT_ACCUM_STEPS, - learning_rate=LEARNING_RATE, - lr_scheduler_type=LR_SCHEDULER_TYPE, - warmup_ratio=WARMUP_RATIO, - weight_decay=WEIGHT_DECAY, - logging_steps=LOGGING_STEPS, - evaluation_strategy="steps", - eval_steps=EVAL_STEPS, - save_steps=SAVE_STEPS, - save_total_limit=2, - bf16=BF16, - fp16=FP16, - gradient_checkpointing=True, - report_to=["none"], - seed=SEED, -) - -trainer = Trainer( - model=model, - tokenizer=tokenizer, - args=args, - train_dataset=train_tok, - eval_dataset=eval_tok, - data_collator=data_collator, -) - -train_result = trainer.train() -metrics = trainer.evaluate() if eval_tok else {} -perplexity = math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None -metrics, perplexity -""" - ) -) - -cells.append(nbf.v4.new_markdown_cell("## 7) Save LoRA adapters")) -cells.append( - 
nbf.v4.new_code_cell( - """ -adapter_dir = OUTPUT_DIR / "lora_adapter" -adapter_dir.mkdir(parents=True, exist_ok=True) - -model.save_pretrained(str(adapter_dir)) -tokenizer.save_pretrained(str(adapter_dir)) - -print(f"Saved LoRA adapter to: {adapter_dir}") -""" - ) -) - -cells.append( - nbf.v4.new_markdown_cell( - "## 8) (Optional) Merge adapters into base model and save full weights" - ) -) -cells.append( - nbf.v4.new_code_cell( - """ -DO_MERGE = False # set True to produce a standalone merged model - -if DO_MERGE: - from peft import PeftModel - base_model = AutoModelForCausalLM.from_pretrained( - BASE_MODEL, - torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None), - device_map="auto", - ) - merged = PeftModel.from_pretrained(base_model, str(adapter_dir)).merge_and_unload() - merged_dir = OUTPUT_DIR / "merged_model" - merged.save_pretrained(str(merged_dir)) - tokenizer.save_pretrained(str(merged_dir)) - print(f"Merged full model saved to: {merged_dir}") -else: - print("Skipping merge (set DO_MERGE=True to enable).") -""" - ) -) - -cells.append(nbf.v4.new_markdown_cell("## 9) Quick inference with the trained adapter")) -cells.append( - nbf.v4.new_code_cell( - """ -from peft import PeftModel -import torch - -test_model = AutoModelForCausalLM.from_pretrained( - BASE_MODEL, - quantization_config=bnb_config, - torch_dtype=torch.bfloat16 if BF16 else (torch.float16 if FP16 else None), - device_map="auto", -) -test_model = PeftModel.from_pretrained(test_model, str(adapter_dir)) -test_model.eval() - -def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9): - if USE_CHAT_TEMPLATE and hasattr(tokenizer, "apply_chat_template"): - messages = [ - {"role": "system", "content": SYSTEM_PREFIX}, - {"role": "user", "content": prompt}, - ] - model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(test_model.device) - else: - text = 
f"[SYSTEM]\\n{SYSTEM_PREFIX}\\n[/SYSTEM]\\n[USER]\\n{prompt}\\n[/USER]\\n[ASSISTANT]\\n" - model_inputs = tokenizer([text], return_tensors="pt").to(test_model.device) - - with torch.no_grad(): - out = test_model.generate( - **model_inputs, - do_sample=True, - max_new_tokens=max_new_tokens, - temperature=temperature, - top_p=top_p, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id, - ) - return tokenizer.decode(out[0], skip_special_tokens=True) - -sample_prompt = (train_rows[0]["input"] if len(train_rows)>0 else "What are the visitor crowd levels like?") -print(generate_answer(sample_prompt)[:800]) -""" - ) -) - -cells.append(nbf.v4.new_markdown_cell("## 10) Light evaluation on the validation set")) -cells.append( - nbf.v4.new_code_cell( - """ -import evaluate - -if eval_ds: - rouge = evaluate.load("rouge") - preds, refs = [], [] - for ex in val_rows[:50]: - preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.0)) - refs.append(ex["output"]) - results = rouge.compute(predictions=preds, references=refs) - print(results) -else: - print("No eval split available; skipped.") -""" - ) -) - -cells.append( - nbf.v4.new_markdown_cell( - """ -## 11) (Optional) Use with other runtimes - -- **Python Inference (PEFT)**: Load base model + adapter as shown in Section 9. -- **Merged model**: Set `DO_MERGE=True` to create a standalone model directory; you can then convert to other runtimes (e.g., llama.cpp GGUF) using their conversion tools. -- **Ollama**: If your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them. Need a concrete path? Tell me your base and target runtime and I’ll add exact steps. 
-""" - ) -) - -nb["cells"] = cells - -out_path = Path("./raft_finetune_qlora.ipynb") -with open(out_path, "w", encoding="utf-8") as f: - nbf.write(nb, f) - -str(out_path) diff --git a/raft/finetuned/raft_qlora_tourist/train.md b/raft/finetuned/raft_qlora_tourist/train.md new file mode 100644 index 0000000..0e3ed7e --- /dev/null +++ b/raft/finetuned/raft_qlora_tourist/train.md @@ -0,0 +1,31 @@ +| step | train_loss | eval_loss | +| ------ | ---------- | --------- | +| 50 | 4.377000 | 3.628506 | +| 100 | 2.636800 | 2.558457 | +| 150 | 2.428800 | 2.427239 | +| 200 | 2.334800 | 2.193493 | +| 250 | 2.188500 | 2.186310 | +| 300 | 2.112400 | 2.173394 | +| 350 | 2.122900 | 2.163947 | +| 400 | 2.155400 | 2.162106 | +| 450 | 2.072100 | 2.154830 | +| 500 | 1.979900 | 2.165512 | +| 550 | 1.935800 | 2.176313 | +| 600 | 1.942800 | 2.170668 | +| 650 | 1.968000 | 2.162810 | +| 700 | 1.974100 | 2.167501 | +| 750 | 1.801900 | 2.235841 | +| 800 | 1.768000 | 2.233753 | +| 850 | 1.779100 | 2.218278 | +| 900 | 1.828900 | 2.220891 | +| 950 | 1.854900 | 2.208387 | +| 1000 | 1.653600 | 2.302763 | +| 1050 | 1.663500 | 2.307982 | +| 1100 | 1.673400 | 2.301423 | +| 1150 | 1.608400 | 2.320958 | +| 1200 | 1.683500 | 2.303580 | +| 1250 | 1.532100 | 2.434277 | +| 1300 | 1.558900 | 2.418276 | +| 1350 | 1.508900 | 2.422347 | +| 1400 | 1.535100 | 2.416650 | +| 1450 | 1.529900 | 2.415497 | diff --git a/raft/finetuned/raft_qlora_tourist_0.2/train.md b/raft/finetuned/raft_qlora_tourist_0.2/train.md new file mode 100644 index 0000000..ae814db --- /dev/null +++ b/raft/finetuned/raft_qlora_tourist_0.2/train.md @@ -0,0 +1,16 @@ +| step | train_loss | eval_loss | +| ----- | ---------- | --------- | +| 50 | 2.419000 | 1.970156 | +| 100 | 1.952300 | 1.843970 | +| 150 | 1.870500 | 1.846282 | +| 200 | 1.901400 | 1.800553 | +| 250 | 1.739600 | 1.820830 | +| 300 | 1.561900 | 1.817271 | +| 350 | 1.548000 | 1.805666 | +| 400 | 1.572800 | 1.808402 | +| 450 | 1.508000 | 1.794848 | +| 500 | 1.350500 | 1.905955 | +|
550 | 1.169200 | 1.949706 | +| 600 | 1.166600 | 1.940000 | +| 650 | 1.160400 | 1.940272 | +| 700 | 1.182600 | 1.951116 |