mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
Compare commits
3 Commits
ef99f152ac
...
c98a1d0c6e
| Author | SHA1 | Date | |
|---|---|---|---|
|
c98a1d0c6e
|
|||
|
b2da597b18
|
|||
|
e3c9b7286f
|
14
README.md
Normal file
14
README.md
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# Masterthesis, praktischer Anteil
|
||||||
|
|
||||||
|
## Jupyter Notebooks "rehydrieren"
|
||||||
|
|
||||||
|
Damit keine unnötigen Jupyter Outputs etc. im Versionsmanagement landen, gibt es das Skript `convert_jupytext.sh`, welches nur den notwendigen Quelltext in ein `.py` File schreibt. Mit demselben Skript kann dieser Schritt wieder umgekehrt werden, also ein Jupyter Notebook aus dem Python-File geschrieben werden.
|
||||||
|
|
||||||
|
Das Skript sollte also immer vor dem Committen von Änderungen mit `py` als erstes Argument ausgeführt werden.
|
||||||
|
|
||||||
|
Verwendung:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./convert_jupytext.sh py # Jupyter Notebook -> Python
|
||||||
|
./convert_jupytext.sh nb # Python -> Jupyter Notebook
|
||||||
|
```
|
||||||
@@ -3,6 +3,8 @@ import traceback
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from bertopic.representation import KeyBERTInspired
|
||||||
|
from bertopic.vectorizers import ClassTfidfTransformer
|
||||||
from hdbscan import HDBSCAN
|
from hdbscan import HDBSCAN
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
@@ -12,55 +14,50 @@ from sklearn.model_selection import ParameterGrid
|
|||||||
from umap import UMAP
|
from umap import UMAP
|
||||||
|
|
||||||
from bertopic import BERTopic
|
from bertopic import BERTopic
|
||||||
from bertopic.representation import KeyBERTInspired
|
|
||||||
from bertopic.vectorizers import ClassTfidfTransformer
|
|
||||||
|
|
||||||
param_grid = {
|
param_grid = {
|
||||||
"nr_topics": [45, 50, 55],
|
"n_gram_max": [2, 3], # Vectorization
|
||||||
"min_topic_size": [30, 40, 50],
|
"min_document_frequency": [1], # Vectorization
|
||||||
"n_gram_max": [3],
|
"min_samples": [10, 25], # HDBSCAN
|
||||||
"min_document_frequency": [1, 2],
|
"min_topic_size": [10, 20, 30, 40, 50], # HDBSCAN
|
||||||
"n_neighbors": [15],
|
"n_neighbors": [15], # UMAP
|
||||||
"n_components": [2],
|
"n_components": [2, 5], # UMAP
|
||||||
"min_dist": [0.1],
|
"min_dist": [0.01, 0.1], # UMAP
|
||||||
"top_n_words": [10],
|
"nr_topics": ["auto"], # Topic Modeling
|
||||||
|
"top_n_words": [10, 13, 15, 17, 20], # Topic Modeling
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def calculate_metrics(topic_model, embedder, top_n_words=5):
|
def calculate_metrics(topic_model, embedder, top_n_words=10):
|
||||||
# Get topic words
|
# Get topic words
|
||||||
topic_words = []
|
topic_words = []
|
||||||
for topic_id in range(len(topic_model.get_topic_info()) - 1):
|
for topic_id in range(len(topic_model.get_topic_info()) - 1):
|
||||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||||
topic_words.append(words[:top_n_words])
|
topic_words.append(words[:top_n_words])
|
||||||
|
|
||||||
|
# Pre-compute embeddings for all unique words
|
||||||
|
all_words = list(set(word for words in topic_words for word in words))
|
||||||
|
word_embeddings = embedder.encode(all_words)
|
||||||
|
embedding_map = {word: emb for word, emb in zip(all_words, word_embeddings)}
|
||||||
|
|
||||||
# Coherence
|
# Coherence
|
||||||
coherence_scores = []
|
coherence_scores = []
|
||||||
for words in topic_words:
|
for words in topic_words:
|
||||||
embeddings = embedder.encode(words)
|
embeddings = np.array([embedding_map[word] for word in words])
|
||||||
sim_matrix = cosine_similarity(embeddings)
|
sim_matrix = cosine_similarity(embeddings)
|
||||||
np.fill_diagonal(sim_matrix, 0)
|
np.fill_diagonal(sim_matrix, 0)
|
||||||
coherence_scores.append(np.mean(sim_matrix))
|
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||||
|
coherence_scores.append(mean_sim)
|
||||||
overall_coherence = np.mean(coherence_scores)
|
overall_coherence = np.mean(coherence_scores)
|
||||||
|
|
||||||
# Diversity
|
# Diversity
|
||||||
all_topic_words = [word for topic in topic_words for word in topic]
|
all_topic_words = [word for topic in topic_words for word in topic]
|
||||||
diversity = len(set(all_topic_words)) / len(all_topic_words)
|
diversity = len(set(all_topic_words)) / len(all_topic_words)
|
||||||
|
|
||||||
# Inter-topic distance
|
|
||||||
topic_embeddings = [
|
|
||||||
np.mean(embedder.encode(words), axis=0) for words in topic_words
|
|
||||||
]
|
|
||||||
topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
|
|
||||||
avg_distance = np.mean(topic_distance[np.triu_indices_from(topic_distance, k=1)])
|
|
||||||
|
|
||||||
res = {
|
res = {
|
||||||
"coherence": float(str(overall_coherence)[:6]),
|
"coherence": float(str(overall_coherence)[:6]),
|
||||||
"diversity": float(str(diversity)[:6]),
|
"diversity": float(str(diversity)[:6]),
|
||||||
"inter_topic_distance": float(str(avg_distance)[:6]),
|
"combined_score": float(str(0.7 * overall_coherence + 0.3 * diversity)[:6]),
|
||||||
"combined_score": float(
|
|
||||||
str(0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance)[:6]
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
print(res)
|
print(res)
|
||||||
return res
|
return res
|
||||||
@@ -85,6 +82,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
|
|||||||
|
|
||||||
print(f"Total parameter combinations: {len(param_list)}")
|
print(f"Total parameter combinations: {len(param_list)}")
|
||||||
for params in param_list:
|
for params in param_list:
|
||||||
|
print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...")
|
||||||
try:
|
try:
|
||||||
print(f"Testing params: {params}")
|
print(f"Testing params: {params}")
|
||||||
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
||||||
@@ -143,18 +141,27 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return best_model, best_params, best_score, history
|
with open("output/autotune.json", "w") as f:
|
||||||
|
json.dump(history, f, indent=2)
|
||||||
|
|
||||||
|
return best_model, best_params, best_score
|
||||||
|
|
||||||
|
|
||||||
SPECIAL_CHARS = ["\n", "\\n"]
|
SPECIAL_CHARS = ["\n", "\\n"]
|
||||||
MIN_REVIEW_WORDS = 5
|
MIN_REVIEW_WORDS = 5
|
||||||
|
|
||||||
reviews = pd.read_csv("data.tab", sep="\t").review.to_list()
|
print("Loading reviews...")
|
||||||
|
reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
|
||||||
|
|
||||||
|
print("Running light preprocessing...")
|
||||||
for schar in SPECIAL_CHARS:
|
for schar in SPECIAL_CHARS:
|
||||||
reviews = [
|
reviews = [
|
||||||
review.replace(schar, " ") if isinstance(review, str) else review
|
review.replace(schar, " ") if isinstance(review, str) else review
|
||||||
for review in reviews
|
for review in reviews
|
||||||
]
|
]
|
||||||
|
|
||||||
|
print("Filtering short reviews...")
|
||||||
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
|
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
|
||||||
|
|
||||||
|
print("Staring auto-tuning...")
|
||||||
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
|
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
|
||||||
|
|||||||
@@ -2,12 +2,12 @@ import json
|
|||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
with open("history.json", "r") as f:
|
with open("output/autotune.json", "r") as f:
|
||||||
history = json.load(f)
|
history = json.load(f)
|
||||||
|
|
||||||
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)
|
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False)
|
||||||
|
|
||||||
with open("history_sorted.json", "w") as f:
|
with open("output/autotune_sorted.json", "w") as f:
|
||||||
json.dump(history, f, indent=2)
|
json.dump(history, f, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
BIN
bertopic/combined_score_distribution.png
Normal file
BIN
bertopic/combined_score_distribution.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 16 KiB |
@@ -23,7 +23,15 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
from bertopic import BERTopic
|
import json
|
||||||
|
import pickle
|
||||||
|
import re
|
||||||
|
|
||||||
|
import gensim.corpora as corpora
|
||||||
|
import nltk
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import spacy
|
||||||
from bertopic.representation import KeyBERTInspired
|
from bertopic.representation import KeyBERTInspired
|
||||||
from bertopic.vectorizers import ClassTfidfTransformer
|
from bertopic.vectorizers import ClassTfidfTransformer
|
||||||
from gensim.models.coherencemodel import CoherenceModel
|
from gensim.models.coherencemodel import CoherenceModel
|
||||||
@@ -34,14 +42,8 @@ from sentence_transformers import SentenceTransformer
|
|||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from umap import UMAP
|
from umap import UMAP
|
||||||
import gensim.corpora as corpora
|
|
||||||
import json
|
from bertopic import BERTopic
|
||||||
import nltk
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import re
|
|
||||||
import spacy
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
|
||||||
@@ -323,8 +325,8 @@ if REDUCE_OUTLIERS:
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
from pathlib import Path
|
|
||||||
import random
|
import random
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
# --- config ---
|
# --- config ---
|
||||||
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
|
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
|
||||||
@@ -468,7 +470,11 @@ topic_model.get_topic_info()
|
|||||||
|
|
||||||
# %%
|
# %%
|
||||||
topic_words = []
|
topic_words = []
|
||||||
for topic_id in range(len(topic_model.get_topic_info()) - 1):
|
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||||
|
# Skip outlier topic
|
||||||
|
if topic_id < 0:
|
||||||
|
continue
|
||||||
|
|
||||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||||
topic_words.append(words)
|
topic_words.append(words)
|
||||||
|
|
||||||
@@ -477,8 +483,10 @@ coherence_scores = []
|
|||||||
for words in topic_words:
|
for words in topic_words:
|
||||||
coherence_embeddings = embedding_model.encode(words)
|
coherence_embeddings = embedding_model.encode(words)
|
||||||
sim_matrix = cosine_similarity(coherence_embeddings)
|
sim_matrix = cosine_similarity(coherence_embeddings)
|
||||||
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
|
|
||||||
mean_sim = np.mean(sim_matrix)
|
# Ignore self-similarity
|
||||||
|
np.fill_diagonal(sim_matrix, 0)
|
||||||
|
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||||
coherence_scores.append(mean_sim)
|
coherence_scores.append(mean_sim)
|
||||||
|
|
||||||
overall_coherence = np.mean(coherence_scores)
|
overall_coherence = np.mean(coherence_scores)
|
||||||
@@ -518,8 +526,8 @@ if CALCULATE_COHERENCE:
|
|||||||
for topic in range(len(set(topics)) - 1)
|
for topic in range(len(set(topics)) - 1)
|
||||||
]
|
]
|
||||||
|
|
||||||
# %env TOKENIZERS_PARALLELISM=false
|
# %env TOKENIZERS_PARALLELISM=false
|
||||||
|
|
||||||
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
|
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
|
||||||
coherence_model = CoherenceModel(
|
coherence_model = CoherenceModel(
|
||||||
topics=topic_words,
|
topics=topic_words,
|
||||||
|
|||||||
@@ -23,7 +23,14 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
from bertopic import BERTopic
|
import pickle
|
||||||
|
import re
|
||||||
|
|
||||||
|
import gensim.corpora as corpora
|
||||||
|
import nltk
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import spacy
|
||||||
from bertopic.representation import KeyBERTInspired
|
from bertopic.representation import KeyBERTInspired
|
||||||
from bertopic.vectorizers import ClassTfidfTransformer
|
from bertopic.vectorizers import ClassTfidfTransformer
|
||||||
from gensim.models.coherencemodel import CoherenceModel
|
from gensim.models.coherencemodel import CoherenceModel
|
||||||
@@ -33,13 +40,8 @@ from sentence_transformers import SentenceTransformer
|
|||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from umap import UMAP
|
from umap import UMAP
|
||||||
import gensim.corpora as corpora
|
|
||||||
import nltk
|
from bertopic import BERTopic
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import re
|
|
||||||
import spacy
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
|
||||||
@@ -300,8 +302,8 @@ if REDUCE_OUTLIERS:
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
from pathlib import Path
|
|
||||||
import random
|
import random
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
# --- config ---
|
# --- config ---
|
||||||
topics_to_keep = {2, 4, 5, 9, 22, 26}
|
topics_to_keep = {2, 4, 5, 9, 22, 26}
|
||||||
@@ -445,7 +447,11 @@ topic_model.get_topic_info()
|
|||||||
|
|
||||||
# %%
|
# %%
|
||||||
topic_words = []
|
topic_words = []
|
||||||
for topic_id in range(len(topic_model.get_topic_info()) - 1):
|
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||||
|
# Skip outlier topic
|
||||||
|
if topic_id < 0:
|
||||||
|
continue
|
||||||
|
|
||||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||||
topic_words.append(words)
|
topic_words.append(words)
|
||||||
|
|
||||||
@@ -454,8 +460,10 @@ coherence_scores = []
|
|||||||
for words in topic_words:
|
for words in topic_words:
|
||||||
coherence_embeddings = embedding_model.encode(words)
|
coherence_embeddings = embedding_model.encode(words)
|
||||||
sim_matrix = cosine_similarity(coherence_embeddings)
|
sim_matrix = cosine_similarity(coherence_embeddings)
|
||||||
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
|
|
||||||
mean_sim = np.mean(sim_matrix)
|
# Ignore self-similarity
|
||||||
|
np.fill_diagonal(sim_matrix, 0)
|
||||||
|
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||||
coherence_scores.append(mean_sim)
|
coherence_scores.append(mean_sim)
|
||||||
|
|
||||||
overall_coherence = np.mean(coherence_scores)
|
overall_coherence = np.mean(coherence_scores)
|
||||||
@@ -492,10 +500,14 @@ if this_will_crash_your_pc_are_you_sure:
|
|||||||
tokens = [analyzer(doc) for doc in cleaned_docs]
|
tokens = [analyzer(doc) for doc in cleaned_docs]
|
||||||
dictionary = corpora.Dictionary(tokens)
|
dictionary = corpora.Dictionary(tokens)
|
||||||
corpus = [dictionary.doc2bow(token) for token in tokens]
|
corpus = [dictionary.doc2bow(token) for token in tokens]
|
||||||
topic_words = [
|
|
||||||
[words for words, _ in topic_model.get_topic(topic)]
|
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||||
for topic in range(len(set(topics)) - 1)
|
# Skip outlier topic
|
||||||
]
|
if topic_id < 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||||
|
topic_words.append(words)
|
||||||
|
|
||||||
# %env TOKENIZERS_PARALLELISM=false
|
# %env TOKENIZERS_PARALLELISM=false
|
||||||
|
|
||||||
|
|||||||
1298
bertopic/output/autotune.json
Normal file
1298
bertopic/output/autotune.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
@@ -29,8 +29,9 @@ from peft import PeftModel
|
|||||||
from transformers import AutoModelForCausalLM
|
from transformers import AutoModelForCausalLM
|
||||||
|
|
||||||
# Paths
|
# Paths
|
||||||
DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different
|
# DATA_JSONL = Path("./outputs/raft_dataset.jsonl") # change if different
|
||||||
RUN_NAME = "raft_qlora_tourist_0.2"
|
DATA_JSONL = Path("../raft/bali_culture_raft_dataset.jsonl")
|
||||||
|
RUN_NAME = "raft_qlora_tourist"
|
||||||
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
|
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
|
||||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
ADAPTER_DIR = OUTPUT_DIR / "lora_adapter"
|
ADAPTER_DIR = OUTPUT_DIR / "lora_adapter"
|
||||||
|
|||||||
677
raft/nb_raft_finetune_qlora_2.py
Normal file
677
raft/nb_raft_finetune_qlora_2.py
Normal file
@@ -0,0 +1,677 @@
|
|||||||
|
# ---
|
||||||
|
# jupyter:
|
||||||
|
# jupytext:
|
||||||
|
# text_representation:
|
||||||
|
# extension: .py
|
||||||
|
# format_name: percent
|
||||||
|
# format_version: '1.3'
|
||||||
|
# jupytext_version: 1.18.0
|
||||||
|
# kernelspec:
|
||||||
|
# display_name: .venv
|
||||||
|
# language: python
|
||||||
|
# name: python3
|
||||||
|
# ---
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# # QLoRA/RAFT Fine-Tuning
|
||||||
|
#
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## Configuration
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
from termcolor import colored
|
||||||
|
from pathlib import Path
|
||||||
|
from transformers import BitsAndBytesConfig
|
||||||
|
from torch import torch
|
||||||
|
from peft import PeftModel
|
||||||
|
from transformers import AutoModelForCausalLM
|
||||||
|
|
||||||
|
# Paths
|
||||||
|
DATA_JSONL = Path("../raft/remap_bali_raft_dataset.jsonl") # change if different
|
||||||
|
RUN_NAME = "raft_qlora_tourist"
|
||||||
|
OUTPUT_DIR = Path(f"./finetuned/{RUN_NAME}")
|
||||||
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
ADAPTER_DIR = OUTPUT_DIR / "checkpoint-1550"
|
||||||
|
|
||||||
|
# Base model — examples: "meta-llama/Llama-3.1-8B", "Qwen/Qwen2-7B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"
|
||||||
|
# Prefer an instruction-tuned base for better stability on SFT.
|
||||||
|
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
|
||||||
|
|
||||||
|
# Tokenization/prompt formatting
|
||||||
|
SYSTEM_PREFIX = "You are a helpful assistant. Answer concisely and truthfully based ONLY on the user's request."
|
||||||
|
USE_CHAT_TEMPLATE = True # if the tokenizer has a chat template, we'll leverage it
|
||||||
|
|
||||||
|
# BitsAndBytes config
|
||||||
|
BNB_CONFIG = BitsAndBytesConfig(
|
||||||
|
load_in_4bit=True,
|
||||||
|
bnb_4bit_use_double_quant=True,
|
||||||
|
bnb_4bit_quant_type="nf4",
|
||||||
|
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 2) Load dataset (JSONL)
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
from datasets import Dataset
|
||||||
|
|
||||||
|
|
||||||
|
def read_jsonl(p: Path):
|
||||||
|
rows = []
|
||||||
|
with p.open("r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
obj = json.loads(line)
|
||||||
|
if "input" in obj and "output" in obj:
|
||||||
|
rows.append(obj)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return rows
|
||||||
|
|
||||||
|
|
||||||
|
rows = read_jsonl(DATA_JSONL)
|
||||||
|
print(f"Loaded {len(rows)} rows from {DATA_JSONL}")
|
||||||
|
print(rows[0])
|
||||||
|
|
||||||
|
random.Random(42).shuffle(rows)
|
||||||
|
split = int(len(rows) * 0.85)
|
||||||
|
train_rows = rows[:split]
|
||||||
|
val_rows = rows[split:] if split < len(rows) else rows[-max(1, len(rows) // 50) :]
|
||||||
|
|
||||||
|
train_rows = [{"input": r["input"], "output": r["output"]} for r in train_rows]
|
||||||
|
val_rows = [{"input": r["input"], "output": r["output"]} for r in val_rows]
|
||||||
|
|
||||||
|
train_ds = Dataset.from_list(train_rows)
|
||||||
|
eval_ds = Dataset.from_list(val_rows) if val_rows else None
|
||||||
|
train_ds, eval_ds
|
||||||
|
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 3) Prompt formatting
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
|
||||||
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
|
|
||||||
|
print(colored("Verifying eos and pad tokens...", "yellow"))
|
||||||
|
if tokenizer.pad_token_id != 2:
|
||||||
|
print(colored(f"Expected pad token to be 2, but got {tokenizer.pad_token}", "red"))
|
||||||
|
else:
|
||||||
|
print(colored("Pad token is ok", "green"))
|
||||||
|
|
||||||
|
if tokenizer.eos_token_id != 2:
|
||||||
|
print(colored(f"Expected eos token to be 2, but got {tokenizer.eos_token}", "red"))
|
||||||
|
else:
|
||||||
|
print(colored("Eos token is ok", "green"))
|
||||||
|
|
||||||
|
|
||||||
|
def format_example(ex):
|
||||||
|
user = ex["input"]
|
||||||
|
assistant = ex["output"]
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{"role": "system", "content": SYSTEM_PREFIX},
|
||||||
|
{"role": "user", "content": user},
|
||||||
|
{"role": "assistant", "content": assistant},
|
||||||
|
]
|
||||||
|
text = tokenizer.apply_chat_template(
|
||||||
|
messages, tokenize=False, add_generation_prompt=False
|
||||||
|
)
|
||||||
|
return {"text": text}
|
||||||
|
|
||||||
|
|
||||||
|
train_ds_fmt = train_ds.map(format_example, remove_columns=train_ds.column_names)
|
||||||
|
eval_ds_fmt = (
|
||||||
|
eval_ds.map(format_example, remove_columns=eval_ds.column_names)
|
||||||
|
if eval_ds
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
for i in range(10):
|
||||||
|
print("👉 " + train_ds_fmt[i]["text"])
|
||||||
|
if train_ds_fmt[i]["text"][-4:] == tokenizer.eos_token:
|
||||||
|
print(f"✅ {colored('EOS is fine.', 'green')}")
|
||||||
|
else:
|
||||||
|
print(f"❌ {colored('EOS is missing.', 'red')}")
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 4) Tokenize
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
IGNORE_INDEX = -100
|
||||||
|
|
||||||
|
|
||||||
|
def make_supervised_tensors(batch):
|
||||||
|
enc = tokenizer(
|
||||||
|
batch["text"],
|
||||||
|
truncation=True,
|
||||||
|
max_length=2048,
|
||||||
|
padding="max_length",
|
||||||
|
return_tensors=None,
|
||||||
|
)
|
||||||
|
input_ids = enc["input_ids"]
|
||||||
|
attn_mask = enc["attention_mask"]
|
||||||
|
|
||||||
|
# Mask pads
|
||||||
|
labels = [ids[:] for ids in input_ids]
|
||||||
|
for i in range(len(labels)):
|
||||||
|
for j, m in enumerate(attn_mask[i]):
|
||||||
|
if m == 0:
|
||||||
|
labels[i][j] = IGNORE_INDEX
|
||||||
|
|
||||||
|
return {"input_ids": input_ids, "attention_mask": attn_mask, "labels": labels}
|
||||||
|
|
||||||
|
|
||||||
|
train_tok = train_ds_fmt.map(
|
||||||
|
make_supervised_tensors, batched=True, remove_columns=train_ds_fmt.column_names
|
||||||
|
)
|
||||||
|
eval_tok = (
|
||||||
|
eval_ds_fmt.map(
|
||||||
|
make_supervised_tensors, batched=True, remove_columns=eval_ds_fmt.column_names
|
||||||
|
)
|
||||||
|
if eval_ds_fmt
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
train_tok, eval_tok
|
||||||
|
|
||||||
|
train_ds_fmt["text"][0]
|
||||||
|
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## Setup sanity check
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
import transformers
|
||||||
|
import peft
|
||||||
|
import bitsandbytes as bnb
|
||||||
|
from bitsandbytes.nn import modules as bnb_modules
|
||||||
|
|
||||||
|
print(colored("Sanity check...", "yellow"))
|
||||||
|
print("CUDA available:", torch.cuda.is_available())
|
||||||
|
print("Torch version:", torch.__version__)
|
||||||
|
print("Transformers version:", transformers.__version__)
|
||||||
|
print(
|
||||||
|
"Compute capability:",
|
||||||
|
torch.cuda.get_device_capability(0) if torch.cuda.is_available() else "no cuda",
|
||||||
|
)
|
||||||
|
print("BitsAndbytes:", bnb.__version__)
|
||||||
|
print("PEFT:", peft.__version__)
|
||||||
|
|
||||||
|
|
||||||
|
print("Embedding4bit available:", hasattr(bnb_modules, "Embedding4bit"))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 5) Load base model with 4-bit quantization and prepare QLoRA
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
BASE_MODEL,
|
||||||
|
quantization_config=BNB_CONFIG,
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
device_map="auto",
|
||||||
|
)
|
||||||
|
|
||||||
|
model = prepare_model_for_kbit_training(model)
|
||||||
|
|
||||||
|
peft_config = LoraConfig(
|
||||||
|
r=8,
|
||||||
|
lora_alpha=16,
|
||||||
|
lora_dropout=0.05,
|
||||||
|
bias="none",
|
||||||
|
task_type="CAUSAL_LM",
|
||||||
|
target_modules=[
|
||||||
|
"q_proj",
|
||||||
|
"k_proj",
|
||||||
|
"v_proj",
|
||||||
|
"o_proj",
|
||||||
|
"gate_proj",
|
||||||
|
"up_proj",
|
||||||
|
"down_proj",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
model = get_peft_model(model, peft_config)
|
||||||
|
model.print_trainable_parameters()
|
||||||
|
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 6) Train
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
|
||||||
|
import math
|
||||||
|
|
||||||
|
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
||||||
|
|
||||||
|
args = TrainingArguments(
|
||||||
|
output_dir=str(OUTPUT_DIR),
|
||||||
|
run_name=RUN_NAME,
|
||||||
|
num_train_epochs=3,
|
||||||
|
per_device_train_batch_size=1,
|
||||||
|
per_device_eval_batch_size=1,
|
||||||
|
gradient_accumulation_steps=8,
|
||||||
|
learning_rate=2e-4,
|
||||||
|
warmup_ratio=0.05,
|
||||||
|
weight_decay=0.01,
|
||||||
|
logging_steps=25,
|
||||||
|
eval_steps=50,
|
||||||
|
save_steps=50,
|
||||||
|
save_total_limit=2,
|
||||||
|
bf16=True,
|
||||||
|
fp16=False,
|
||||||
|
gradient_checkpointing=True,
|
||||||
|
report_to=["none"],
|
||||||
|
seed=42,
|
||||||
|
eval_strategy="steps",
|
||||||
|
load_best_model_at_end=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
trainer = Trainer(
|
||||||
|
model=model,
|
||||||
|
args=args,
|
||||||
|
train_dataset=train_tok,
|
||||||
|
eval_dataset=eval_tok,
|
||||||
|
data_collator=data_collator,
|
||||||
|
)
|
||||||
|
|
||||||
|
train_result = trainer.train()
|
||||||
|
metrics = trainer.evaluate() if eval_tok else {}
|
||||||
|
perplexity = (
|
||||||
|
math.exp(metrics["eval_loss"]) if metrics and "eval_loss" in metrics else None
|
||||||
|
)
|
||||||
|
metrics, perplexity
|
||||||
|
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# | epochs | train_loss | eval_loss |
|
||||||
|
# | ------ | ---------- | --------- |
|
||||||
|
# | 50 | 4.377000 | 3.628506 |
|
||||||
|
# | 100 | 2.636800 | 2.558457 |
|
||||||
|
# | 150 | 2.428800 | 2.427239 |
|
||||||
|
# | 200 | 2.334800 | 2.193493 |
|
||||||
|
# | 250 | 2.188500 | 2.186310 |
|
||||||
|
# | 300 | 2.112400 | 2.173394 |
|
||||||
|
# | 350 | 2.122900 | 2.163947 |
|
||||||
|
# | 400 | 2.155400 | 2.162106 |
|
||||||
|
# | 450 | 2.072100 | 2.154830 |
|
||||||
|
# | 500 | 1.979900 | 2.165512 |
|
||||||
|
# | 550 | 1.935800 | 2.176313 |
|
||||||
|
# | 600 | 1.942800 | 2.170668 |
|
||||||
|
# | 650 | 1.968000 | 2.162810 |
|
||||||
|
# | 700 | 1.974100 | 2.167501 |
|
||||||
|
# | 750 | 1.801900 | 2.235841 |
|
||||||
|
# | 800 | 1.768000 | 2.233753 |
|
||||||
|
# | 850 | 1.779100 | 2.218278 |
|
||||||
|
# | 900 | 1.828900 | 2.220891 |
|
||||||
|
# | 950 | 1.854900 | 2.208387 |
|
||||||
|
# | 1000 | 1.653600 | 2.302763 |
|
||||||
|
# | 1050 | 1.663500 | 2.307982 |
|
||||||
|
# | 1100 | 1.673400 | 2.301423 |
|
||||||
|
# | 1150 | 1.608400 | 2.320958 |
|
||||||
|
# | 1200 | 1.683500 | 2.303580 |
|
||||||
|
# | 1250 | 1.532100 | 2.434277 |
|
||||||
|
# | 1300 | 1.558900 | 2.418276 |
|
||||||
|
# | 1350 | 1.508900 | 2.422347 |
|
||||||
|
# | 1400 | 1.535100 | 2.416650 |
|
||||||
|
# | 1450 | 1.529900 | 2.415497 |
|
||||||
|
#
|
||||||
|
# | Step | Training Loss | Evaluation Loss |
|
||||||
|
# | ---- | ------------- | --------------- |
|
||||||
|
# | 50 | 1.173100 | 1.040235 |
|
||||||
|
# | 100 | 0.882900 | 0.875235 |
|
||||||
|
# | 150 | 0.806600 | 0.820686 |
|
||||||
|
# | 200 | 0.785700 | 0.792914 |
|
||||||
|
# | 250 | 0.764300 | 0.761308 |
|
||||||
|
# | 300 | 0.733900 | 0.745976 |
|
||||||
|
# | 350 | 0.744000 | 0.732220 |
|
||||||
|
# | 400 | 0.712000 | 0.719414 |
|
||||||
|
# | 450 | 0.703800 | 0.709955 |
|
||||||
|
# | 500 | 0.684100 | 0.699460 |
|
||||||
|
# | 550 | 0.705900 | 0.691758 |
|
||||||
|
# | 600 | 0.683200 | 0.688031 |
|
||||||
|
# | 650 | 0.670100 | 0.680539 |
|
||||||
|
# | 700 | 0.681600 | 0.674205 |
|
||||||
|
# | 750 | 0.681500 | 0.671295 |
|
||||||
|
# | 800 | 0.651700 | 0.666133 |
|
||||||
|
# | 850 | 0.662900 | 0.660661 |
|
||||||
|
# | 900 | 0.651400 | 0.656359 |
|
||||||
|
# | 950 | 0.648100 | 0.653309 |
|
||||||
|
# | 1000 | 0.631500 | 0.648716 |
|
||||||
|
# | 1050 | 0.654200 | 0.643737 |
|
||||||
|
# | 1100 | 0.571100 | 0.648199 |
|
||||||
|
# | 1150 | 0.573500 | 0.648405 |
|
||||||
|
# | 1200 | 0.556000 | 0.644185 |
|
||||||
|
# | 1250 | 0.568100 | 0.642854 |
|
||||||
|
# | 1300 | 0.570200 | 0.640425 |
|
||||||
|
# | 1350 | 0.551100 | 0.636319 |
|
||||||
|
# | 1400 | 0.551400 | 0.634054 |
|
||||||
|
# | 1450 | 0.550100 | 0.631558 |
|
||||||
|
# | 1500 | 0.559800 | 0.630046 |
|
||||||
|
# | 1550 | 0.556600 | 0.626972 |
|
||||||
|
#
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 7) Save LoRA adapters
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
ADAPTER_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
model.save_pretrained(str(ADAPTER_DIR))
|
||||||
|
tokenizer.save_pretrained(str(ADAPTER_DIR))
|
||||||
|
|
||||||
|
print(f"Saved LoRA adapter to: {ADAPTER_DIR}")
|
||||||
|
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 8) Save merged model
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# this does not work on my system since I don't have enough VRAM.
|
||||||
|
# it should work though provided you have sufficient resources.
|
||||||
|
# my next step would have been to convert the merged model to llama.cpp GGUF format so I can run it in Ollama/OpenWebUI.
|
||||||
|
DO_MERGE = False
|
||||||
|
|
||||||
|
base_model = None
|
||||||
|
if DO_MERGE:
|
||||||
|
base_model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
BASE_MODEL,
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device_map="auto",
|
||||||
|
)
|
||||||
|
merged = PeftModel.from_pretrained(
|
||||||
|
base_model, str(ADAPTER_DIR), offload_folder="offload/", is_trainable=False
|
||||||
|
).merge_and_unload()
|
||||||
|
merged_dir = OUTPUT_DIR / "merged_model"
|
||||||
|
merged.save_pretrained(str(merged_dir))
|
||||||
|
tokenizer.save_pretrained(str(merged_dir))
|
||||||
|
print(f"Merged full model saved to: {merged_dir}")
|
||||||
|
else:
|
||||||
|
print("Skipping merge (set DO_MERGE=True to enable).")
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 9) Quick inference with the trained adapter
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
test_model = None
|
||||||
|
|
||||||
|
print(colored("Loading the base model + trained adapter.", "green"))
|
||||||
|
test_model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
BASE_MODEL,
|
||||||
|
quantization_config=BNB_CONFIG,
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
device_map="auto",
|
||||||
|
)
|
||||||
|
test_model = PeftModel.from_pretrained(
|
||||||
|
test_model, str(ADAPTER_DIR), offload_folder="offload/", is_trainable=False
|
||||||
|
)
|
||||||
|
test_model.eval()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9):
    """Generate one assistant reply for *prompt* with the fine-tuned model.

    Args:
        prompt: User message, appended after the SYSTEM_PREFIX system turn.
        max_new_tokens: Generation budget.
        temperature: Sampling temperature; 0 falls back to greedy decoding
            (the original always set do_sample=True, which crashes for
            temperature=0 in transformers).
        top_p: Nucleus-sampling cutoff (ignored when decoding greedily).

    Returns:
        The full decoded sequence (prompt + reply), special tokens stripped.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PREFIX},
        {"role": "user", "content": prompt},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to(test_model.device)

    gen_kwargs = {
        "input_ids": input_ids,
        # Explicit attention mask avoids the transformers warning/ambiguity
        # when pad_token_id == eos_token_id.
        "attention_mask": torch.ones_like(input_ids),
        "max_new_tokens": max_new_tokens,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    }
    # Only forward sampling knobs when sampling is actually enabled.
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        gen_kwargs["do_sample"] = False  # greedy decoding

    with torch.no_grad():
        out = test_model.generate(**gen_kwargs)
    return tokenizer.decode(out[0], skip_special_tokens=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Fall back to a canned question when no training rows are available.
sample_prompt = (
    train_rows[0]["input"]
    if len(train_rows) > 0
    else "What are the visitor crowd levels like?"
)

# Iterate over a slice instead of range(10): the original indexed
# train_rows[i] directly and raised IndexError whenever fewer than 10 rows
# existed.  With no rows at all, exercise the fallback prompt instead.
if train_rows:
    for row in train_rows[:10]:
        print(generate_answer(row["input"])[:800])
        print("---")
else:
    print(generate_answer(sample_prompt)[:800])
|
||||||
|
|
||||||
|
|
||||||
|
# %%
# Ad-hoc smoke test; in a notebook the returned string is displayed inline.
generate_answer("What are the visitor crowd levels like?")
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
def chat(
    user, system="You are a precise assistant.", temperature=0.0, max_new_tokens=256
):
    """Single-turn chat with the fine-tuned model.

    Args:
        user: User message for this turn.
        system: System prompt for this turn.
        temperature: 0 (default) decodes greedily; >0 enables sampling.
        max_new_tokens: Generation budget.

    Returns:
        The full decoded sequence (prompt + reply), special tokens stripped.
    """
    msgs = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    model_inputs = tokenizer.apply_chat_template(
        msgs, return_tensors="pt", add_generation_prompt=True
    ).to(test_model.device)

    gen_kwargs = {
        "input_ids": model_inputs,
        # Explicit mask avoids ambiguity when pad_token_id == eos_token_id.
        "attention_mask": torch.ones_like(model_inputs),
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    # Only pass sampling knobs when sampling: the original always forwarded
    # temperature/top_p, which triggers transformers warnings (and, on newer
    # versions, an error for temperature=0.0) when do_sample=False.
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=1.0)
    else:
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out = test_model.generate(**gen_kwargs)
    return tokenizer.decode(out[0], skip_special_tokens=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Slice instead of range(10): avoids IndexError when train_rows holds fewer
# than 10 entries.
for row in train_rows[:10]:
    prompt = row["input"]
    out = chat(prompt, max_new_tokens=2000, temperature=0.2)

    print("\n\n💬\n" + out)
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## PoS Gradio setup
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# === Gradio chat for Mistral-Instruct (no self-replies) ===
|
||||||
|
# Assumes: `test_model` (HF AutoModelForCausalLM + PEFT adapter) and `BASE_MODEL` are defined.
|
||||||
|
|
||||||
|
import torch, threading
|
||||||
|
import gradio as gr
|
||||||
|
from transformers import (
|
||||||
|
AutoTokenizer,
|
||||||
|
TextIteratorStreamer,
|
||||||
|
StoppingCriteria,
|
||||||
|
StoppingCriteriaList,
|
||||||
|
)
|
||||||
|
|
||||||
|
# -- Tokenizer (use BASE model tokenizer) --
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)

# Ensure pad/eos exist and are consistent
# generate() needs both a pad and an eos token: mirror whichever one exists,
# and only register a brand-new "</s>" eos when neither is defined.
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token
elif tokenizer.eos_token is None and tokenizer.pad_token is not None:
    tokenizer.eos_token = tokenizer.pad_token
elif tokenizer.pad_token is None and tokenizer.eos_token is None:
    tokenizer.add_special_tokens({"eos_token": "</s>"})
    tokenizer.pad_token = tokenizer.eos_token
    try:
        # Resize embeddings to cover the newly added token; best-effort since
        # quantized/PEFT-wrapped models may not support resizing.
        test_model.resize_token_embeddings(len(tokenizer))
    except Exception:
        pass
|
||||||
|
|
||||||
|
# Device the wrapped model lives on; falls back to CUDA/CPU detection when
# the model object does not expose a .device attribute.
DEVICE = getattr(test_model, "device", "cuda" if torch.cuda.is_available() else "cpu")
SYSTEM_PROMPT = "You are a helpful assistant."
|
||||||
|
|
||||||
|
|
||||||
|
# --- Custom stopping criterion: abort generation the moment the model opens
# a new user turn ("[INST]"), which keeps it from answering its own replies.
class StopOnInst(StoppingCriteria):
    def __init__(self, tokenizer, trigger_text="[INST]"):
        # Token ids that mark the start of a user turn in the chat template.
        self.trigger_ids = tokenizer.encode(trigger_text, add_special_tokens=False)

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        trigger = self.trigger_ids
        if not trigger:
            return False
        generated = input_ids[0].tolist()
        n = len(trigger)
        # Stop exactly when the newest n tokens spell out the trigger.
        return len(generated) >= n and generated[-n:] == trigger
|
||||||
|
|
||||||
|
|
||||||
|
# Shared stopping-criteria list: halt generation when "[INST]" is emitted.
STOPPING = StoppingCriteriaList([StopOnInst(tokenizer)])
|
||||||
|
|
||||||
|
|
||||||
|
def _build_inputs(pairs):
    """Convert (user, assistant) history pairs into model-ready tensors.

    Prior completed assistant replies are kept; leaving the latest turn
    without an assistant message makes the chat template append the
    generation prompt so the model continues as the assistant.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_text, assistant_text in pairs:
        user_text = (user_text or "").strip()
        assistant_text = (assistant_text or "").strip()
        if not (user_text or assistant_text):
            continue  # drop fully empty turns
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})

    templated = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )
    # Many Mistral tokenizers return a bare input_ids tensor rather than a
    # dict; normalize to a dict and synthesize the attention mask.
    if isinstance(templated, torch.Tensor):
        batch = {"input_ids": templated, "attention_mask": torch.ones_like(templated)}
    else:
        batch = templated
    return {key: tensor.to(DEVICE) for key, tensor in batch.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def stream_reply(history_pairs, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """Stream the assistant reply for the given (user, assistant) history.

    Yields the accumulated reply text after each decoded chunk.  Generation
    runs on a worker thread that feeds a TextIteratorStreamer; STOPPING cuts
    it off if the model starts a new "[INST]" user turn.
    """
    inputs = _build_inputs(history_pairs)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    gen_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,  # Mistral uses </s> as EOS
        streamer=streamer,
        stopping_criteria=STOPPING,  # prevents the model answering itself
    )

    # torch.inference_mode() is thread-local: entering it on this thread and
    # then launching generate() on a worker thread (as the original did) does
    # NOT apply inference mode to the generation.  Enter the context inside
    # the thread target instead.
    def _generate():
        with torch.inference_mode():
            test_model.generate(**gen_kwargs)

    worker = threading.Thread(target=_generate)
    worker.start()
    partial = ""
    for piece in streamer:
        partial += piece
        yield partial
    worker.join()
|
||||||
|
|
||||||
|
|
||||||
|
# --- Gradio handlers ---
|
||||||
|
|
||||||
|
|
||||||
|
def gr_respond(message, chat_history):
    """Gradio handler: append the user message and stream the reply into it."""
    text = (message or "").strip()
    history = list(chat_history or [])
    # Open a new turn with an empty assistant slot; we stream into it below.
    history = history + [(text, "")]
    turns = [(u or "", a or "") for (u, a) in history]

    for partial in stream_reply(turns):
        history[-1] = (text, partial)
        yield "", history  # clear the textbox, refresh the chat widget
|
||||||
|
|
||||||
|
|
||||||
|
def gr_clear():
    """Reset the chatbot component; returning None empties its history."""
    return None
|
||||||
|
|
||||||
|
|
||||||
|
# Minimal Gradio UI: chat window, input row, and a clear button.
with gr.Blocks() as demo:
    gr.Markdown("## 💬 Chat with Touristral")
    chat = gr.Chatbot(height=200, layout="bubble")
    with gr.Row():
        msg = gr.Textbox(placeholder="Type a message and press Enter…", scale=9)
        send = gr.Button("Send", scale=1)
    with gr.Row():
        clear = gr.Button("Clear chat")

    # Enter key and the Send button both stream via gr_respond; outputs are
    # (cleared textbox, updated chat history).
    msg.submit(gr_respond, [msg, chat], [msg, chat])
    send.click(gr_respond, [msg, chat], [msg, chat])
    clear.click(gr_clear, None, chat, queue=False)

# queue() enables streaming/generator handlers; share=False keeps it local.
demo.queue().launch(share=False)
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 10) Light evaluation on the validation set
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
import evaluate

# Light ROUGE evaluation on (up to) the first 50 validation rows.
if eval_ds:
    rouge = evaluate.load("rouge")
    preds, refs = [], []
    for ex in val_rows[:50]:
        # Low temperature for near-deterministic predictions.
        preds.append(generate_answer(ex["input"], max_new_tokens=192, temperature=0.2))
        refs.append(ex["output"])
    results = rouge.compute(predictions=preds, references=refs)
    print(results)
else:
    print("No eval split available; skipped.")
|
||||||
|
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## 11) (Optional) Use with other runtimes
|
||||||
|
#
|
||||||
|
# - **Python Inference (PEFT)**: Load base model + adapter as shown in Section 9.
|
||||||
|
# - **Merged model**: Set `DO_MERGE=True` to create a standalone model directory; you can then convert to other runtimes (e.g., llama.cpp GGUF) using their conversion tools.
|
||||||
|
# - **Ollama**: If your runtime supports adapters or merged weights for the chosen base model, create a `Modelfile` pointing to them; the exact steps depend on the base model and target runtime.
|
||||||
|
#
|
||||||
BIN
survey/human_eval_example_dataset_integer_only.ods
Normal file
BIN
survey/human_eval_example_dataset_integer_only.ods
Normal file
Binary file not shown.
1051
survey/human_eval_personalized_targets.csv
Normal file
1051
survey/human_eval_personalized_targets.csv
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user