Compare commits

3 Commits

Author SHA1 Message Date
c98a1d0c6e BERTopic cleanup 2026-02-08 22:43:53 +01:00
b2da597b18 Add survey data 2026-02-08 20:27:42 +01:00
e3c9b7286f RAFT attempts and fixes 2026-02-08 16:41:24 +01:00
12 changed files with 3131 additions and 63 deletions

14
README.md Normal file
View File

@@ -0,0 +1,14 @@
# Masterthesis, praktischer Anteil
## Jupyter Notebooks "rehydrieren"
Damit keine unnötigen Jupyter Outputs etc. im Versionsmanagement landen, gibt es das Skript `convert_jupytext.sh`, welches nur den notwendigen Quelltext in ein `.py` File schreibt. Mit demselben Skript kann dieser Schritt wieder umgekehrt werden, also ein Jupyter Notebook aus dem Python-File geschrieben werden.
Das Skript sollte also immer vor dem Committen von Änderungen mit `py` als erstes Argument ausgeführt werden.
Verwendung:
```bash
./convert_jupytext.sh py # Jupyter Notebook -> Python
./convert_jupytext.sh nb # Python -> Jupyter Notebook
```

View File

@@ -3,6 +3,8 @@ import traceback
import numpy as np
import pandas as pd
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
@@ -12,55 +14,50 @@ from sklearn.model_selection import ParameterGrid
from umap import UMAP
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
param_grid = {
"nr_topics": [45, 50, 55],
"min_topic_size": [30, 40, 50],
"n_gram_max": [3],
"min_document_frequency": [1, 2],
"n_neighbors": [15],
"n_components": [2],
"min_dist": [0.1],
"top_n_words": [10],
"n_gram_max": [2, 3], # Vectorization
"min_document_frequency": [1], # Vectorization
"min_samples": [10, 25], # HDBSCAN
"min_topic_size": [10, 20, 30, 40, 50], # HDBSCAN
"n_neighbors": [15], # UMAP
"n_components": [2, 5], # UMAP
"min_dist": [0.01, 0.1], # UMAP
"nr_topics": ["auto"], # Topic Modeling
"top_n_words": [10, 13, 15, 17, 20], # Topic Modeling
}
def calculate_metrics(topic_model, embedder, top_n_words=5):
def calculate_metrics(topic_model, embedder, top_n_words=10):
# Get topic words
topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1):
words = [word for word, _ in topic_model.get_topic(topic_id)]
topic_words.append(words[:top_n_words])
# Pre-compute embeddings for all unique words
all_words = list(set(word for words in topic_words for word in words))
word_embeddings = embedder.encode(all_words)
embedding_map = {word: emb for word, emb in zip(all_words, word_embeddings)}
# Coherence
coherence_scores = []
for words in topic_words:
embeddings = embedder.encode(words)
embeddings = np.array([embedding_map[word] for word in words])
sim_matrix = cosine_similarity(embeddings)
np.fill_diagonal(sim_matrix, 0)
coherence_scores.append(np.mean(sim_matrix))
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
coherence_scores.append(mean_sim)
overall_coherence = np.mean(coherence_scores)
# Diversity
all_topic_words = [word for topic in topic_words for word in topic]
diversity = len(set(all_topic_words)) / len(all_topic_words)
# Inter-topic distance
topic_embeddings = [
np.mean(embedder.encode(words), axis=0) for words in topic_words
]
topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
avg_distance = np.mean(topic_distance[np.triu_indices_from(topic_distance, k=1)])
res = {
"coherence": float(str(overall_coherence)[:6]),
"diversity": float(str(diversity)[:6]),
"inter_topic_distance": float(str(avg_distance)[:6]),
"combined_score": float(
str(0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance)[:6]
),
"combined_score": float(str(0.7 * overall_coherence + 0.3 * diversity)[:6]),
}
print(res)
return res
@@ -85,6 +82,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
print(f"Total parameter combinations: {len(param_list)}")
for params in param_list:
print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...")
try:
print(f"Testing params: {params}")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
@@ -143,18 +141,27 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
traceback.print_exc()
continue
return best_model, best_params, best_score, history
with open("output/autotune.json", "w") as f:
json.dump(history, f, indent=2)
return best_model, best_params, best_score
SPECIAL_CHARS = ["\n", "\\n"]
MIN_REVIEW_WORDS = 5
reviews = pd.read_csv("data.tab", sep="\t").review.to_list()
print("Loading reviews...")
reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
print("Running light preprocessing...")
for schar in SPECIAL_CHARS:
reviews = [
review.replace(schar, " ") if isinstance(review, str) else review
for review in reviews
]
print("Filtering short reviews...")
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
print("Staring auto-tuning...")
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))

View File

@@ -2,12 +2,12 @@ import json
import matplotlib.pyplot as plt
with open("history.json", "r") as f:
with open("output/autotune.json", "r") as f:
history = json.load(f)
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False)
with open("history_sorted.json", "w") as f:
with open("output/autotune_sorted.json", "w") as f:
json.dump(history, f, indent=2)

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -23,7 +23,15 @@
#
# %%
from bertopic import BERTopic
import json
import pickle
import re
import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
import spacy
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
@@ -34,14 +42,8 @@ from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
import gensim.corpora as corpora
import json
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import pickle
from bertopic import BERTopic
nlp = spacy.load("en_core_web_sm")
@@ -323,8 +325,8 @@ if REDUCE_OUTLIERS:
#
# %%
from pathlib import Path
import random
from pathlib import Path
# --- config ---
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
@@ -468,7 +470,11 @@ topic_model.get_topic_info()
# %%
topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1):
for topic_id in topic_model.get_topic_info()["Topic"]:
# Skip outlier topic
if topic_id < 0:
continue
words = [word for word, _ in topic_model.get_topic(topic_id)]
topic_words.append(words)
@@ -477,8 +483,10 @@ coherence_scores = []
for words in topic_words:
coherence_embeddings = embedding_model.encode(words)
sim_matrix = cosine_similarity(coherence_embeddings)
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
mean_sim = np.mean(sim_matrix)
# Ignore self-similarity
np.fill_diagonal(sim_matrix, 0)
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
coherence_scores.append(mean_sim)
overall_coherence = np.mean(coherence_scores)
@@ -518,8 +526,8 @@ if CALCULATE_COHERENCE:
for topic in range(len(set(topics)) - 1)
]
# %env TOKENIZERS_PARALLELISM=false
# %env TOKENIZERS_PARALLELISM=false
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
coherence_model = CoherenceModel(
topics=topic_words,

View File

@@ -23,7 +23,14 @@
#
# %%
from bertopic import BERTopic
import pickle
import re
import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
import spacy
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
@@ -33,13 +40,8 @@ from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import pickle
from bertopic import BERTopic
nlp = spacy.load("en_core_web_sm")
@@ -300,8 +302,8 @@ if REDUCE_OUTLIERS:
#
# %%
from pathlib import Path
import random
from pathlib import Path
# --- config ---
topics_to_keep = {2, 4, 5, 9, 22, 26}
@@ -445,7 +447,11 @@ topic_model.get_topic_info()
# %%
topic_words = []
for topic_id in range(len(topic_model.get_topic_info()) - 1):
for topic_id in topic_model.get_topic_info()["Topic"]:
# Skip outlier topic
if topic_id < 0:
continue
words = [word for word, _ in topic_model.get_topic(topic_id)]
topic_words.append(words)
@@ -454,8 +460,10 @@ coherence_scores = []
for words in topic_words:
coherence_embeddings = embedding_model.encode(words)
sim_matrix = cosine_similarity(coherence_embeddings)
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
mean_sim = np.mean(sim_matrix)
# Ignore self-similarity
np.fill_diagonal(sim_matrix, 0)
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
coherence_scores.append(mean_sim)
overall_coherence = np.mean(coherence_scores)
@@ -492,10 +500,14 @@ if this_will_crash_your_pc_are_you_sure:
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [
[words for words, _ in topic_model.get_topic(topic)]
for topic in range(len(set(topics)) - 1)
]
for topic_id in topic_model.get_topic_info()["Topic"]:
# Skip outlier topic
if topic_id < 0:
continue
words = [word for word, _ in topic_model.get_topic(topic_id)]
topic_words.append(words)
# %env TOKENIZERS_PARALLELISM=false

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -29,8 +29,9 @@ from peft import PeftModel
from transformers import AutoModelForCausalLM
# Paths
DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different
RUN_NAME = "raft_qlora_tourist_0.2"
# DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different
DATA_JSONL = Path("../raft/bali_culture_raft_dataset.jsonl")
RUN_NAME = "raft_qlora_tourist"
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
ADAPTER_DIR = OUTPUT_DIR / "lora_adapter"

View File

@@ -0,0 +1,677 @@
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.18.0
# kernelspec:
# display_name: .venv
# language: python
# name: python3
# ---
# %% [markdown]
# # QLoRA/RAFT Fine-Tuning
#
# %% [markdown]
# ## Configuration
#
# %%
from termcolor import colored
from pathlib import Path
from transformers import BitsAndBytesConfig
from torch import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM
# Paths
DATA_JSONL = Path("../raft/remap_bali_raft_dataset.jsonl") # change if different
RUN_NAME = "raft_qlora_tourist"
# All run artifacts (checkpoints, adapter, merged model) land under ./finetuned/<run>.
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): this points at a specific training checkpoint, not a freshly
# exported adapter dir — confirm checkpoint-1550 exists for this run.
ADAPTER_DIR = OUTPUT_DIR / "checkpoint-1550"
# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"
# Prefer an instruction-tuned base for better stability on SFT.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
# Tokenization/prompt formatting
SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request."
USE_CHAT_TEMPLATE = True # if the tokenizer has a chat template, we'll leverage it
# BitsAndBytes config
# 4-bit NF4 quantization with double quantization, bfloat16 compute (QLoRA setup).
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# %% [markdown]
# ## 2) Load dataset (JSONL)
#
# %%
import json
import random
from datasets import Dataset
def read_jsonl(p: Path):
    """Load RAFT training rows from a JSONL file.

    Each line must be a JSON object carrying both an "input" and an
    "output" key. Blank lines, malformed JSON, and rows missing either
    key are skipped. Loading stays best-effort, but only JSON parse
    errors are swallowed now — unexpected exceptions surface instead of
    being silenced by a blanket ``except Exception``.

    Args:
        p: Path to the JSONL dataset file.

    Returns:
        List of row dicts in file order.
    """
    rows = []
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # Tolerate malformed lines; let real errors propagate.
                continue
            # Guard the key checks: membership on non-dict JSON (e.g. a bare
            # number) would raise TypeError; intent is "object with both keys".
            if isinstance(obj, dict) and "input" in obj and "output" in obj:
                rows.append(obj)
    return rows
rows = read_jsonl(DATA_JSONL)
print(f"Loaded {len(rows)} rows from {DATA_JSONL}")
print(rows[0])
# Deterministic shuffle so the 85/15 split is reproducible across runs.
random.Random(42).shuffle(rows)
split = int(len(rows) * 0.85)
train_rows = rows[:split]
# NOTE(review): for any non-empty dataset int(0.85*n) < n, so the fallback
# branch (reusing the last ~2% as validation) never fires — confirm intended.
val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows) // 50) :]
# Keep only the two fields the SFT formatting step consumes.
train_rows = [{"input": r["input"], "output": r["output"]} for r in train_rows]
val_rows = [{"input": r["input"], "output": r["output"]} for r in val_rows]
train_ds = Dataset.from_list(train_rows)
eval_ds = Dataset.from_list(val_rows) if val_rows else None
train_ds, eval_ds
# %% [markdown]
# ## 3) Prompt formatting
#
# %%
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
print(colored("Verifying eos and pad tokens...", "yellow"))
if tokenizer.pad_token_id != 2:
print(colored(f"Expected pad token to be 2, but got {tokenizer.pad_token}", "red"))
else:
print(colored("Pad token is ok", "green"))
if tokenizer.eos_token_id != 2:
print(colored(f"Expected eos token to be 2, but got {tokenizer.eos_token}", "red"))
else:
print(colored("Eos token is ok", "green"))
def format_example(ex):
    """Render one {"input", "output"} row as a chat-templated training string.

    Relies on the module-level ``tokenizer`` and ``SYSTEM_PREFIX``.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PREFIX},
        {"role": "user", "content": ex["input"]},
        {"role": "assistant", "content": ex["output"]},
    ]
    rendered = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return {"text": rendered}
train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names)
eval_ds_fmt = (
eval_ds.map(format_example, remove_columns=eval_ds.column_names)
if eval_ds
else None
)
for i in range(10):
print("👉 " + train_ds_fmt[i]["text"])
if train_ds_fmt[i]["text"][-4:] == tokenizer.eos_token:
print(f"{colored('EOS is fine.', 'green')}")
else:
print(f"{colored('EOS is missing.', 'red')}")
# %% [markdown]
# ## 4) Tokenize
#
# %%
IGNORE_INDEX = -100
def make_supervised_tensors(batch):
    """Tokenize chat-formatted texts and derive loss labels.

    Every example is padded/truncated to 2048 tokens. Label positions that
    fall on padding (attention mask 0) are set to IGNORE_INDEX so the loss
    skips them.
    """
    enc = tokenizer(
        batch["text"],
        truncation=True,
        max_length=2048,
        padding="max_length",
        return_tensors=None,
    )
    token_ids = enc["input_ids"]
    masks = enc["attention_mask"]
    # Labels mirror the input ids, with padding positions masked out.
    labels = [
        [tok if m != 0 else IGNORE_INDEX for tok, m in zip(ids, mask)]
        for ids, mask in zip(token_ids, masks)
    ]
    return {"input_ids": token_ids, "attention_mask": masks, "labels": labels}
train_tok = train_ds_fmt.map(
make_supervised_tensors, batched=True, remove_columns=train_ds_fmt.column_names
)
eval_tok = (
eval_ds_fmt.map(
make_supervised_tensors, batched=True, remove_columns=eval_ds_fmt.column_names
)
if eval_ds_fmt
else None
)
train_tok, eval_tok
train_ds_fmt["text"][0]
# %% [markdown]
# ## Setup sanity check
#
# %%
import transformers
import peft
import bitsandbytes as bnb
from bitsandbytes.nn import modules as bnb_modules
print(colored("Sanity check...", "yellow"))
print("CUDA available:", torch.cuda.is_available())
print("Torch version:", torch.__version__)
print("Transformers version:", transformers.__version__)
print(
"Compute capability:",
torch.cuda.get_device_capability(0) if torch.cuda.is_available() else "no cuda",
)
print("BitsAndbytes:", bnb.__version__)
print("PEFT:", peft.__version__)
print("Embedding4bit available:", hasattr(bnb_modules, "Embedding4bit"))
# %% [markdown]
# ## 5) Load base model with 4-bit quantization and prepare QLoRA
#
# %%
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
quantization_config=BNB_CONFIG,
dtype=torch.bfloat16,
device_map="auto",
)
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
r=8,
lora_alpha=16,
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# %% [markdown]
# ## 6) Train
#
# %%
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import math
# Collator for causal LM; mlm=False means no masked-token objective.
# NOTE(review): DataCollatorForLanguageModeling rebuilds `labels` from
# `input_ids` itself — confirm the labels precomputed in
# make_supervised_tensors are actually the ones the Trainer uses.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    run_name=RUN_NAME,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # effective train batch of 8 at batch size 1
    learning_rate=2e-4,
    warmup_ratio=0.05,
    weight_decay=0.01,
    logging_steps=25,
    eval_steps=50,
    save_steps=50,  # aligned with eval_steps so load_best_model_at_end works
    save_total_limit=2,
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,  # trade compute for VRAM
    report_to=["none"],
    seed=42,
    eval_strategy="steps",
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
)
train_result = trainer.train()
# Perplexity = exp(eval cross-entropy); None when there is no eval split.
metrics = trainer.evaluate() if eval_tok else {}
perplexity = (
    math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None
)
metrics, perplexity
# %% [markdown]
# | epochs | train_loss | eval_loss |
# | ------ | ---------- | --------- |
# | 50 | 4.377000 | 3.628506 |
# | 100 | 2.636800 | 2.558457 |
# | 150 | 2.428800 | 2.427239 |
# | 200 | 2.334800 | 2.193493 |
# | 250 | 2.188500 | 2.186310 |
# | 300 | 2.112400 | 2.173394 |
# | 350 | 2.122900 | 2.163947 |
# | 400 | 2.155400 | 2.162106 |
# | 450 | 2.072100 | 2.154830 |
# | 500 | 1.979900 | 2.165512 |
# | 550 | 1.935800 | 2.176313 |
# | 600 | 1.942800 | 2.170668 |
# | 650 | 1.968000 | 2.162810 |
# | 700 | 1.974100 | 2.167501 |
# | 750 | 1.801900 | 2.235841 |
# | 800 | 1.768000 | 2.233753 |
# | 850 | 1.779100 | 2.218278 |
# | 900 | 1.828900 | 2.220891 |
# | 950 | 1.854900 | 2.208387 |
# | 1000 | 1.653600 | 2.302763 |
# | 1050 | 1.663500 | 2.307982 |
# | 1100 | 1.673400 | 2.301423 |
# | 1150 | 1.608400 | 2.320958 |
# | 1200 | 1.683500 | 2.303580 |
# | 1250 | 1.532100 | 2.434277 |
# | 1300 | 1.558900 | 2.418276 |
# | 1350 | 1.508900 | 2.422347 |
# | 1400 | 1.535100 | 2.416650 |
# | 1450 | 1.529900 | 2.415497 |
#
# | Step | Training Loss | Evaluation Loss |
# | ---- | ------------- | --------------- |
# | 50 | 1.173100 | 1.040235 |
# | 100 | 0.882900 | 0.875235 |
# | 150 | 0.806600 | 0.820686 |
# | 200 | 0.785700 | 0.792914 |
# | 250 | 0.764300 | 0.761308 |
# | 300 | 0.733900 | 0.745976 |
# | 350 | 0.744000 | 0.732220 |
# | 400 | 0.712000 | 0.719414 |
# | 450 | 0.703800 | 0.709955 |
# | 500 | 0.684100 | 0.699460 |
# | 550 | 0.705900 | 0.691758 |
# | 600 | 0.683200 | 0.688031 |
# | 650 | 0.670100 | 0.680539 |
# | 700 | 0.681600 | 0.674205 |
# | 750 | 0.681500 | 0.671295 |
# | 800 | 0.651700 | 0.666133 |
# | 850 | 0.662900 | 0.660661 |
# | 900 | 0.651400 | 0.656359 |
# | 950 | 0.648100 | 0.653309 |
# | 1000 | 0.631500 | 0.648716 |
# | 1050 | 0.654200 | 0.643737 |
# | 1100 | 0.571100 | 0.648199 |
# | 1150 | 0.573500 | 0.648405 |
# | 1200 | 0.556000 | 0.644185 |
# | 1250 | 0.568100 | 0.642854 |
# | 1300 | 0.570200 | 0.640425 |
# | 1350 | 0.551100 | 0.636319 |
# | 1400 | 0.551400 | 0.634054 |
# | 1450 | 0.550100 | 0.631558 |
# | 1500 | 0.559800 | 0.630046 |
# | 1550 | 0.556600 | 0.626972 |
#
# %% [markdown]
# ## 7) Save LoRA adapters
#
# %%
ADAPTER_DIR.mkdir(parents=True, exist_ok=True)
model.save_pretrained(str(ADAPTER_DIR))
tokenizer.save_pretrained(str(ADAPTER_DIR))
print(f"Saved LoRA adapter to: {ADAPTER_DIR}")
# %% [markdown]
# ## 8) Save merged model
#
# %%
# this does not work on my system since I don't have enough VRAM.
# it should work though provided you have sufficient resources.
# my next step would have been to convert the merged model to llama.cpp GGUF format so I can run it in Ollama/OpenWebUI.
DO_MERGE = False
base_model = None
if DO_MERGE:
base_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
torch_dtype=torch.bfloat16,
device_map="auto",
)
merged = PeftModel.from_pretrained(
base_model, str(ADAPTER_DIR), offload_folder="offload/", is_trainable=False
).merge_and_unload()
merged_dir = OUTPUT_DIR / "merged_model"
merged.save_pretrained(str(merged_dir))
tokenizer.save_pretrained(str(merged_dir))
print(f"Merged full model saved to: {merged_dir}")
else:
print("Skipping merge (set DO_MERGE=True to enable).")
# %% [markdown]
# ## 9) Quick inference with the trained adapter
#
# %%
test_model = None
print(colored("Loading the base model + trained adapter.", "green"))
test_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
quantization_config=BNB_CONFIG,
dtype=torch.bfloat16,
device_map="auto",
)
test_model = PeftModel.from_pretrained(
test_model, str(ADAPTER_DIR), offload_folder="offload/", is_trainable=False
)
test_model.eval()
def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9):
    """Sample one reply from the adapter-loaded model for a single prompt.

    Uses the module-level ``tokenizer``, ``test_model`` and ``SYSTEM_PREFIX``;
    returns the full decoded sequence (prompt included, special tokens removed).
    """
    chat_input = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": SYSTEM_PREFIX},
            {"role": "user", "content": prompt},
        ],
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(test_model.device)
    with torch.no_grad():
        output_ids = test_model.generate(
            input_ids=chat_input,
            do_sample=True,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
sample_prompt = (
train_rows[0]["input"]
if len(train_rows) > 0
else "What are the visitor crowd levels like?"
)
for i in range(10):
print(generate_answer(train_rows[i]["input"])[:800])
print("---")
# %%
generate_answer("What are the visitor crowd levels like?")
# %%
def chat(
    user, system="You are a precise assistant.", temperature=0.0, max_new_tokens=256
):
    """Single-turn chat helper: greedy when temperature is 0, sampled otherwise."""
    prompt_ids = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(test_model.device)
    with torch.no_grad():
        generated = test_model.generate(
            input_ids=prompt_ids,
            max_new_tokens=max_new_tokens,
            do_sample=(temperature > 0),
            temperature=temperature,
            top_p=1.0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
for i in range(10):
prompt = train_rows[i]["input"]
out = chat(prompt, max_new_tokens=2000, temperature=0.2)
print("\n\n💬\n" + out)
# %% [markdown]
# ## PoS Gradio setup
#
# %%
# === Gradio chat for Mistral-Instruct (no self-replies) ===
# Assumes: `test_model` (HF AutoModelForCausalLM + PEFT adapter) and `BASE_MODEL` are defined.
import torch, threading
import gradio as gr
from transformers import (
AutoTokenizer,
TextIteratorStreamer,
StoppingCriteria,
StoppingCriteriaList,
)
# -- Tokenizer (use BASE model tokenizer) --
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Ensure pad/eos exist and are consistent
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
tokenizer.pad_token = tokenizer.eos_token
elif tokenizer.eos_token is None and tokenizer.pad_token is not None:
tokenizer.eos_token = tokenizer.pad_token
elif tokenizer.pad_token is None and tokenizer.eos_token is None:
tokenizer.add_special_tokens({"eos_token": "</s>"})
tokenizer.pad_token = tokenizer.eos_token
try:
test_model.resize_token_embeddings(len(tokenizer))
except Exception:
pass
DEVICE = getattr(test_model, "device", "cuda" if torch.cuda.is_available() else "cpu")
SYSTEM_PROMPT = "You are a helpful assistant."
# --- Custom stop: if the model starts a new user turn ([INST]) stop generation immediately.
# This prevents the model from “answering its own replies”.
class StopOnInst(StoppingCriteria):
    """Halt generation as soon as the model opens a new user turn.

    Mistral's chat format starts user turns with ``[INST]``; if the model
    emits that marker itself it is about to answer its own reply, so we
    stop immediately.
    """

    def __init__(self, tokenizer, trigger_text="[INST]"):
        # Token ids encoding the trigger text (no special tokens added).
        self.trigger_ids = tokenizer.encode(trigger_text, add_special_tokens=False)

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        trigger = self.trigger_ids
        if not trigger:
            return False
        generated = input_ids[0].tolist()
        n = len(trigger)
        # True only when the sequence currently ends with the trigger ids.
        return len(generated) >= n and generated[-n:] == trigger
STOPPING = StoppingCriteriaList([StopOnInst(tokenizer)])
def _build_inputs(pairs):
    """Build model-ready tensors from (user, assistant) history pairs.

    Completed prior turns are included verbatim; the chat template appends
    the assistant generation prompt for the newest user message. Empty
    turns are dropped.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_text, assistant_text in pairs:
        user_text = (user_text or "").strip()
        assistant_text = (assistant_text or "").strip()
        if not (user_text or assistant_text):
            continue
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    # Some Mistral tokenizers return a bare input-ids Tensor instead of a
    # mapping; normalize both shapes into a dict of tensors.
    templated = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )
    if isinstance(templated, torch.Tensor):
        batch = {"input_ids": templated, "attention_mask": torch.ones_like(templated)}
    else:
        batch = templated
    return {name: tensor.to(DEVICE) for name, tensor in batch.items()}
def stream_reply(history_pairs, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """Yield the growing assistant reply as tokens stream off the model.

    Generation runs in a background thread; each yielded value is the full
    text accumulated so far (not just the newest fragment).
    """
    model_inputs = _build_inputs(history_pairs)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,  # Mistral uses </s> as EOS
        streamer=streamer,
        stopping_criteria=STOPPING,  # halt if the model opens a new [INST] turn
    )
    with torch.inference_mode():
        worker = threading.Thread(target=test_model.generate, kwargs=generation_kwargs)
        worker.start()
        accumulated = ""
        for fragment in streamer:
            accumulated += fragment
            yield accumulated
        worker.join()
# --- Gradio handlers ---
def gr_respond(message, chat_history):
    """Gradio handler: append the user turn and stream the reply into it."""
    message = (message or "").strip()
    history = list(chat_history or [])
    # Reserve an empty assistant slot; streaming fills it in below.
    history.append((message, ""))
    pairs = [(u or "", a or "") for (u, a) in history]
    for partial in stream_reply(pairs):
        history[-1] = (message, partial)
        yield "", history  # clears the textbox, refreshes the chat widget
def gr_clear():
    """Reset the chat widget by returning an empty (None) history."""
    return None
with gr.Blocks() as demo:
gr.Markdown("## 💬 Chat with Touristral")
chat = gr.Chatbot(height=200, layout="bubble")
with gr.Row():
msg = gr.Textbox(placeholder="Type a message and press Enter…", scale=9)
send = gr.Button("Send", scale=1)
with gr.Row():
clear = gr.Button("Clear chat")
msg.submit(gr_respond, [msg, chat], [msg, chat])
send.click(gr_respond, [msg, chat], [msg, chat])
clear.click(gr_clear, None, chat, queue=False)
demo.queue().launch(share=False)
# %% [markdown]
# ## 10) Light evaluation on the validation set
#
# %%
import evaluate
if eval_ds:
rouge = evaluate.load("rouge")
preds, refs = [], []
for ex in val_rows[:50]:
preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.2))
refs.append(ex["output"])
results = rouge.compute(predictions=preds, references=refs)
print(results)
else:
print("No eval split available; skipped.")
# %% [markdown]
# ## 11) (Optional) Use with other runtimes
#
# - **Python Inference (PEFT)**: Load base model + adapter as shown in Section 9.
# - **Merged model**: Set `DO_MERGE=True` to create a standalone model directory; you can then convert to other runtimes (e.g., llama.cpp GGUF) using their conversion tools.
# - **Ollama**: If your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them. Need a concrete path? Tell me your base and target runtime and I'll add exact steps.
#

Binary file not shown.

File diff suppressed because it is too large Load Diff