mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
Cleanup
This commit is contained in:
Binary file not shown.
|
Before Width: | Height: | Size: 21 KiB |
@@ -1,577 +0,0 @@
|
||||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.18.0
|
||||
# kernelspec:
|
||||
# display_name: .venv
|
||||
# language: python
|
||||
# name: python3
|
||||
# ---
|
||||
|
||||
# %% [markdown]
|
||||
# # Topic Detection: Bali Tourist Reviews
|
||||
#
|
||||
|
||||
# %% [markdown]
|
||||
# ## Preparation
|
||||
#
|
||||
# ### Dependency Loading
|
||||
#
|
||||
|
||||
# %%
|
||||
import json
|
||||
import pickle
|
||||
import re
|
||||
|
||||
import gensim.corpora as corpora
|
||||
import nltk
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import spacy
|
||||
from bertopic.representation import KeyBERTInspired
|
||||
from bertopic.vectorizers import ClassTfidfTransformer
|
||||
from gensim.models.coherencemodel import CoherenceModel
|
||||
from hdbscan import HDBSCAN
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from umap import UMAP
|
||||
|
||||
from bertopic import BERTopic
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
||||
nltk.download("stopwords")
|
||||
nltk.download("punkt")
|
||||
nltk.download("wordnet")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Parameters and Tracking
|
||||
#
|
||||
|
||||
# %%
|
||||
RECREATE_MODEL = True
|
||||
RECREATE_REDUCED_MODEL = True
|
||||
PROCESS_DATA = False
|
||||
REDUCE_OUTLIERS = True
|
||||
USE_CONDENSED_MODEL = False
|
||||
|
||||
DATA_SAMPLE_SIZE = -1 # -1 for all data
|
||||
|
||||
# Classical coherence score. Warning: needs swap to not kill your PC
|
||||
CALCULATE_COHERENCE = False
|
||||
|
||||
# Vectorization
|
||||
MIN_DOCUMENT_FREQUENCY = 1
|
||||
MAX_NGRAM = 2
|
||||
|
||||
# HDBSCAN Parameters
|
||||
MIN_TOPIC_SIZE = 200
|
||||
MIN_SAMPLES = 25
|
||||
|
||||
# UMAP Parameters
|
||||
N_NEIGHBORS = 15
|
||||
N_COMPONENTS = 2
|
||||
MIN_DIST = 0.01
|
||||
|
||||
# Topic Modeling
|
||||
TOP_N_WORDS = 10
|
||||
MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip
|
||||
|
||||
# %% [markdown]
|
||||
# ### Data Loading & Preprocessing
|
||||
#
|
||||
|
||||
# %%
|
||||
if DATA_SAMPLE_SIZE != -1:
|
||||
reviews = (
|
||||
pd.read_csv("../data/original/reviews.tab", sep="\t")
|
||||
.sample(n=DATA_SAMPLE_SIZE)
|
||||
.review.dropna()
|
||||
.to_list()
|
||||
)
|
||||
else:
|
||||
reviews = (
|
||||
pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list()
|
||||
)
|
||||
|
||||
print("Loaded {} reviews".format(len(reviews)))
|
||||
|
||||
# %%
|
||||
# List of NE in Bali for NER enhancement
|
||||
with open("../data/supporting/bali_ner.json", "r") as f:
|
||||
bali_places = json.load(f)
|
||||
bali_places_set = set(bali_places)
|
||||
|
||||
# Stop word definition
|
||||
extra_stopwords = ["bali", "idr", "usd"]
|
||||
stop_words = set(stopwords.words("english"))
|
||||
with open("../data/supporting/stopwords-en.json", "r") as f:
|
||||
extra_stopwords.extend(json.load(f))
|
||||
|
||||
# Custom replacements
|
||||
rep = {
|
||||
r"\\n": " ",
|
||||
r"\n": " ",
|
||||
r'\\"': "",
|
||||
r'"': "",
|
||||
"mongkey": "monkey",
|
||||
"monky": "monkey",
|
||||
"verry": "very",
|
||||
}
|
||||
rep = dict((re.escape(k), v) for k, v in rep.items())
|
||||
pattern = re.compile("|".join(rep.keys()))
|
||||
|
||||
lemmatizer = WordNetLemmatizer()
|
||||
|
||||
|
||||
def preprocess(text):
    """Normalize a raw review into a list of lemmatized content tokens.

    Pipeline: lowercase -> custom replacements (typo fixes, escape
    artifacts; module-level ``pattern``/``rep``) -> strip digits and
    non-word characters -> spaCy POS/NER filtering (keep nouns, proper
    nouns, GPE/LOC/FAC entities, and known Bali place names) ->
    lemmatize and drop stopwords / tokens shorter than 3 chars.

    Parameters
    ----------
    text : str
        One raw review.

    Returns
    -------
    list[str]
        Lemmatized, filtered tokens.
    """
    # Step 1: Apply custom replacements (typos, special cases)
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    # Step 2: Clean text (digits and non-word chars become spaces)
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    doc = nlp(text)

    # Step 3: POS tagging and filtering
    filtered_tokens = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"}
        or token.ent_type_ in {"GPE", "LOC", "FAC"}
        or token.text in bali_places_set
    ]

    # Step 4: Lemmatization and stopword removal.
    # extra_stopwords is a (large) list loaded from JSON; build a set once
    # per call so per-token membership checks are O(1) instead of O(n).
    extra = set(extra_stopwords)
    lemmatized_tokens = [
        lemmatizer.lemmatize(w)
        for w in filtered_tokens
        if w not in stop_words and w not in extra and len(w) > 2
    ]

    return lemmatized_tokens
|
||||
|
||||
|
||||
# %%
|
||||
if PROCESS_DATA:
|
||||
print("Processing reviews...")
|
||||
reviews = [preprocess(review) for review in reviews]
|
||||
|
||||
with open("../data/intermediate/processed_texts.pkl", "wb") as f:
|
||||
pickle.dump(reviews, f)
|
||||
else:
|
||||
with open("../data/intermediate/processed_texts.pkl", "rb") as f:
|
||||
reviews = pickle.load(f)
|
||||
reviews = [
|
||||
" ".join(review) if isinstance(review, list) else review
|
||||
for review in reviews
|
||||
]
|
||||
|
||||
print(reviews[:1])
|
||||
|
||||
# %% [markdown]
|
||||
# ### Pre-calculate Embeddings
|
||||
#
|
||||
|
||||
# %%
|
||||
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
embeddings = embedding_model.encode(reviews, show_progress_bar=True)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Model Creation
|
||||
#
|
||||
|
||||
# %% [markdown]
|
||||
# ### Dimensionality Reduction (UMAP)
|
||||
#
|
||||
|
||||
# %%
|
||||
umap_model = UMAP(
|
||||
n_neighbors=N_NEIGHBORS,
|
||||
n_components=N_COMPONENTS,
|
||||
min_dist=MIN_DIST,
|
||||
metric="cosine",
|
||||
low_memory=True,
|
||||
random_state=42,
|
||||
)
|
||||
reduced_embeddings = umap_model.fit_transform(embeddings)
|
||||
|
||||
# %% [markdown]
|
||||
# ### BERTopic Model Creation
|
||||
#
|
||||
|
||||
# %%
|
||||
if RECREATE_MODEL:
|
||||
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
||||
vectorizer_model = CountVectorizer(
|
||||
min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM)
|
||||
)
|
||||
|
||||
representation_model = KeyBERTInspired()
|
||||
hdbscan_model = HDBSCAN(
|
||||
min_cluster_size=MIN_TOPIC_SIZE,
|
||||
min_samples=MIN_SAMPLES,
|
||||
metric="euclidean",
|
||||
cluster_selection_method="eom",
|
||||
gen_min_span_tree=True,
|
||||
prediction_data=True,
|
||||
)
|
||||
|
||||
topic_model = BERTopic(
|
||||
embedding_model=embedding_model,
|
||||
ctfidf_model=ctfidf_model,
|
||||
vectorizer_model=vectorizer_model,
|
||||
umap_model=umap_model,
|
||||
hdbscan_model=hdbscan_model,
|
||||
representation_model=representation_model,
|
||||
verbose=True,
|
||||
calculate_probabilities=True,
|
||||
language="english",
|
||||
top_n_words=TOP_N_WORDS,
|
||||
nr_topics=MAX_TOPICS,
|
||||
)
|
||||
|
||||
topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)
|
||||
|
||||
topic_labels = topic_model.generate_topic_labels(
|
||||
nr_words=3, topic_prefix=True, word_length=15, separator=" - "
|
||||
)
|
||||
topic_model.set_topic_labels(topic_labels)
|
||||
BERTopic.save(topic_model, "output/model.bertopic")
|
||||
else:
|
||||
print("Nevermind, loading existing model")
|
||||
topic_model = BERTopic.load("output/model.bertopic")
|
||||
|
||||
# %% [markdown]
|
||||
# ## Fine Tuning
|
||||
#
|
||||
# ### Topic Condensation
|
||||
#
|
||||
|
||||
# %%
|
||||
if RECREATE_REDUCED_MODEL:
|
||||
done = False
|
||||
iteration = 1
|
||||
while not done:
|
||||
print(f"Iteration {iteration}")
|
||||
iteration += 1
|
||||
similarity_matrix = cosine_similarity(
|
||||
np.array(topic_model.topic_embeddings_)[1:, :]
|
||||
)
|
||||
nothing_to_merge = True
|
||||
|
||||
for i in range(similarity_matrix.shape[0]):
|
||||
for j in range(i + 1, similarity_matrix.shape[1]):
|
||||
sim = similarity_matrix[i, j]
|
||||
if sim > 0.9:
|
||||
nothing_to_merge = False
|
||||
t1, t2 = i, j
|
||||
try:
|
||||
t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
|
||||
t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
|
||||
print(
|
||||
f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
|
||||
)
|
||||
topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
|
||||
|
||||
topic_labels = topic_model.generate_topic_labels(
|
||||
nr_words=3,
|
||||
topic_prefix=True,
|
||||
word_length=15,
|
||||
separator=" - ",
|
||||
)
|
||||
topic_model.set_topic_labels(topic_labels)
|
||||
except Exception as e:
|
||||
print(f"Failed to merge {t1} and {t2}: {e}")
|
||||
if nothing_to_merge:
|
||||
print("No more topics to merge.")
|
||||
done = True
|
||||
|
||||
# BERTopic.save(topic_model, "bertopic/model_reduced.bertopic")
|
||||
elif USE_CONDENSED_MODEL:
|
||||
print("Nevermind, loading existing reduced model")
|
||||
topic_model = BERTopic.load("bertopic/model_reduced.bertopic")
|
||||
else:
|
||||
print("Skipping topic reduction")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Outlier Reduction
|
||||
#
|
||||
|
||||
# %%
|
||||
if REDUCE_OUTLIERS:
|
||||
new_topics = topic_model.reduce_outliers(
|
||||
reviews,
|
||||
topic_model.topics_,
|
||||
probabilities=topic_model.probabilities_,
|
||||
threshold=0.05,
|
||||
strategy="probabilities",
|
||||
)
|
||||
topic_model.update_topics(reviews, topics=new_topics)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Results
|
||||
#
|
||||
# ### Classification
|
||||
#
|
||||
|
||||
# %%
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
# --- config ---
|
||||
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
|
||||
INPUT_PATH = "../data/original/reviews.tab" # TSV with a 'review' column
|
||||
OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
|
||||
OUTPUT_DIR = Path("../raft/corpus")
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
BATCH_SIZE = 60
|
||||
MIN_CHARS = 40
|
||||
SEED = 42
|
||||
|
||||
# --- load data ---
data = pd.read_csv(INPUT_PATH, sep="\t")

# If you already have `reviews` elsewhere, replace the next line with that variable.
# BUGFIX: fill missing values BEFORE casting to str — astype(str) turns NaN
# into the literal string "nan", so the original .astype(str).fillna("") was
# a no-op and "nan" rows leaked into the corpus.
reviews = data["review"].fillna("").astype(str)

# Topic model document info
df = topic_model.get_document_info(reviews)  # assumes the model is already fitted
df["Original"] = reviews.values

# --- filter by topics and length ---
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
filtered["Original"] = filtered["Original"].str.strip()
filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
|
||||
|
||||
# Save an audit CSV
|
||||
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
|
||||
|
||||
# --- deterministic shuffle + write batched corpus files ---
|
||||
total_files = 0
|
||||
total_reviews = 0
|
||||
rng = random.Random(SEED)
|
||||
|
||||
for topic_val, g in filtered.groupby("Topic", sort=True):
|
||||
reviews_list = g["Original"].tolist()
|
||||
|
||||
# deterministic shuffle within topic
|
||||
rng.shuffle(reviews_list)
|
||||
|
||||
# chunk into batches of up to 60
|
||||
for start in range(0, len(reviews_list), BATCH_SIZE):
|
||||
chunk = reviews_list[start : start + BATCH_SIZE]
|
||||
if not chunk:
|
||||
continue
|
||||
|
||||
# simple header for traceability
|
||||
header = (
|
||||
f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
|
||||
)
|
||||
|
||||
lines = [header, ""]
|
||||
for i, txt in enumerate(chunk, 1):
|
||||
lines.append(f"({i}) {txt}")
|
||||
|
||||
part_idx = start // BATCH_SIZE + 1
|
||||
fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
|
||||
(OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
|
||||
|
||||
total_files += 1
|
||||
total_reviews += len(chunk)
|
||||
|
||||
print(
|
||||
f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
|
||||
)
|
||||
print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")
|
||||
|
||||
# %%
|
||||
doc_topic_matrix = probs
|
||||
|
||||
# column names
|
||||
topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]
|
||||
|
||||
# index names
|
||||
docnames = ["Review " + str(i) for i in range(len(reviews))]
|
||||
|
||||
# Make the pandas dataframe
|
||||
df_document_topic = pd.DataFrame(
|
||||
np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
|
||||
)
|
||||
|
||||
# Get dominant topic for each document
|
||||
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
|
||||
df_document_topic["dominant_topic"] = dominant_topic
|
||||
|
||||
|
||||
# Styling
|
||||
def color_stuff(val):
    """Map a topic probability to a CSS color rule for DataFrame styling.

    > 0.1 -> green (strong), > 0.05 -> orange (weak), else grey (negligible).
    """
    if val > 0.1:
        color = "green"
    elif val > 0.05:
        color = "orange"
    else:
        color = "grey"
    # f-string instead of the dated str.format idiom; output is identical.
    return f"color: {color}"
|
||||
|
||||
|
||||
def make_bold(val):
    """Return a CSS font-weight rule: 700 (bold) for values above 0.1, else 400."""
    weight = 700 if val > 0.1 else 400
    # f-string instead of the dated str.format idiom; output is identical.
    return f"font-weight: {weight}"
|
||||
|
||||
|
||||
# Apply Style
|
||||
df_document_topics = (
|
||||
df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
|
||||
)
|
||||
df_document_topics
|
||||
|
||||
# %% [markdown]
|
||||
# ### Document Visualization
|
||||
#
|
||||
|
||||
# %%
|
||||
vis = topic_model.visualize_documents(
|
||||
docs=reviews,
|
||||
reduced_embeddings=reduced_embeddings,
|
||||
custom_labels=True,
|
||||
hide_annotations=True,
|
||||
)
|
||||
vis.write_html("output/visualization.html")
|
||||
vis
|
||||
|
||||
# %% [markdown]
|
||||
# ### Similarity Matrix
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_heatmap()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Info
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.get_topic_info()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Semantic Coherence
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_words = []
|
||||
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||
# Skip outlier topic
|
||||
if topic_id < 0:
|
||||
continue
|
||||
|
||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||
topic_words.append(words)
|
||||
|
||||
# Compute mean pairwise cosine similarity for each topic
|
||||
coherence_scores = []
|
||||
for words in topic_words:
|
||||
coherence_embeddings = embedding_model.encode(words)
|
||||
sim_matrix = cosine_similarity(coherence_embeddings)
|
||||
|
||||
# Ignore self-similarity
|
||||
np.fill_diagonal(sim_matrix, 0)
|
||||
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||
coherence_scores.append(mean_sim)
|
||||
|
||||
overall_coherence = np.mean(coherence_scores)
|
||||
|
||||
print(len(reviews), "reviews processed")
|
||||
print(len(topic_model.get_topic_info()) - 1, "topics found")
|
||||
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Coherence
|
||||
#
|
||||
|
||||
# %%
|
||||
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
|
||||
|
||||
if CALCULATE_COHERENCE:
|
||||
# Preprocess Documents
|
||||
documents = pd.DataFrame(
|
||||
{"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
|
||||
)
|
||||
documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
|
||||
{"Document": " ".join}
|
||||
)
|
||||
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
|
||||
|
||||
# Extract vectorizer and analyzer from BERTopic
|
||||
vectorizer = topic_model.vectorizer_model
|
||||
analyzer = vectorizer.build_analyzer()
|
||||
|
||||
# Extract features for Topic Coherence evaluation
|
||||
words = vectorizer.get_feature_names_out()
|
||||
tokens = [analyzer(doc) for doc in cleaned_docs]
|
||||
dictionary = corpora.Dictionary(tokens)
|
||||
corpus = [dictionary.doc2bow(token) for token in tokens]
|
||||
topic_words = [
|
||||
[words for words, _ in topic_model.get_topic(topic)]
|
||||
for topic in range(len(set(topics)) - 1)
|
||||
]
|
||||
|
||||
# %env TOKENIZERS_PARALLELISM=false
|
||||
|
||||
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
|
||||
coherence_model = CoherenceModel(
|
||||
topics=topic_words,
|
||||
texts=tokens,
|
||||
corpus=corpus,
|
||||
dictionary=dictionary,
|
||||
coherence=measurement,
|
||||
)
|
||||
coherence_score = coherence_model.get_coherence()
|
||||
print(f"Coherence ({measurement}): {coherence_score:.4f}")
|
||||
else:
|
||||
print("Skipping classical coherence calculation")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Term Search
|
||||
#
|
||||
|
||||
# %%
|
||||
search_term = "uluwatu"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for rank, topic_id in enumerate(similar_topics):
    # BUGFIX: the original nested double quotes inside a double-quoted
    # f-string ( ...["CustomName"]... ), which is a SyntaxError on
    # Python < 3.12 (PEP 701). Hoisting the lookup avoids the nesting.
    # str(...)[:5] keeps the original truncated-similarity formatting.
    custom_name = topic_model.get_topic_info(topic_id)["CustomName"][0]
    print(f"{str(similarities[rank])[:5]} {custom_name}")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Hierarchy
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_hierarchy(custom_labels=True)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Intertopic Distance Map
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_topics()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Word Scores
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
|
||||
@@ -1,290 +0,0 @@
|
||||
[
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
}
|
||||
]
|
||||
@@ -1,290 +0,0 @@
|
||||
[
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
}
|
||||
]
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -6,15 +6,9 @@
|
||||
|
||||
_(Perception of natural beauty, cultural substance, historical depth)_
|
||||
|
||||
1. **When you think of Bali, which specific natural or spiritual places embody “authentic cultural depth” for you — and what makes them stand out?**
|
||||
2. What distinguishes a spiritually meaningful temple complex from a purely scenic attraction in your perception?
|
||||
|
||||
2. **What distinguishes a spiritually meaningful temple complex from a purely scenic attraction in your perception?**
|
||||
|
||||
3. **Using Uluwatu or Lempuyang as examples: What elements would need to be communicated for you to perceive them not as “Instagram spots,” but as culturally substantial places?**
|
||||
|
||||
4. **How important is active ritual presence (e.g., ceremonies, offerings, priests) compared to architectural or historical aspects?**
|
||||
|
||||
5. **If you had to choose between Tanah Lot and Ulun Danu Bratan for a reflective, culturally immersive experience, which criteria would guide your decision?**
|
||||
3. If you had to choose between Tanah Lot and Ulun Danu Bratan for a reflective, culturally immersive experience, which criteria would guide your decision?
|
||||
|
||||
---
|
||||
|
||||
@@ -22,15 +16,13 @@ _(Perception of natural beauty, cultural substance, historical depth)_
|
||||
|
||||
_(Emotional quality, spirituality, aesthetic perception, subjective experience)_
|
||||
|
||||
6. **How would you describe the atmosphere of a place where you feel culturally and spiritually aligned? What factors create that feeling?**
|
||||
6. How would you describe the atmosphere of a place where you feel culturally and spiritually aligned? What factors create that feeling?
|
||||
|
||||
7. **To what extent do visitor numbers affect your spiritual experience — and is there a threshold you still consider acceptable?**
|
||||
7. To what extent do visitor numbers affect your spiritual experience — and is there a threshold you still consider acceptable?
|
||||
|
||||
8. **Which timing or contextual conditions (e.g., ceremony days, off-season, sunrise instead of sunset) enhance the cultural intensity of a place for you?**
|
||||
8. Which timing or contextual conditions (e.g., ceremony days, off-season, sunrise instead of sunset) enhance the cultural intensity of a place for you?
|
||||
|
||||
9. **How do you internally reconcile the sacred character of a site with strong touristic staging or commercialization?**
|
||||
|
||||
10. **What would a destination need to do in order to evoke not just visual admiration, but genuine spiritual resonance for you?**
|
||||
9. What would a destination need to do in order to evoke not just visual admiration, but genuine spiritual resonance for you?
|
||||
|
||||
---
|
||||
|
||||
@@ -38,13 +30,13 @@ _(Emotional quality, spirituality, aesthetic perception, subjective experience)_
|
||||
|
||||
_(Local interaction, authenticity, visitor behavior, cultural credibility)_
|
||||
|
||||
11. **What role does interaction with local priests, guides, or community members play in shaping the depth of your experience?**
|
||||
11. What role does interaction with local priests, guides, or community members play in shaping the depth of your experience?
|
||||
|
||||
12. **How do you define appropriate visitor behavior at Balinese temples, and how strongly does this influence your overall perception of the site?**
|
||||
12. How do you define appropriate visitor behavior at Balinese temples, and how strongly does this influence your overall perception of the site?
|
||||
|
||||
13. **If other visitors focus primarily on photography, does that diminish the spiritual quality of the place for you, or can you detach from it?**
|
||||
13. If other visitors focus primarily on photography, does that diminish the spiritual quality of the place for you, or can you detach from it?
|
||||
|
||||
14. **What type of cultural storytelling by locals feels authentic and credible rather than staged for tourism?**
|
||||
14. What type of cultural storytelling by locals feels authentic and credible rather than staged for tourism?
|
||||
|
||||
---
|
||||
|
||||
@@ -52,13 +44,13 @@ _(Local interaction, authenticity, visitor behavior, cultural credibility)_
|
||||
|
||||
_(Accessibility, organization, hygiene standards, information systems)_
|
||||
|
||||
15. **How important are curated background explanations (e.g., symbolism, ritual calendars, historical context) compared to independent exploration?**
|
||||
15. How important are curated background explanations (e.g., symbolism, ritual calendars, historical context) compared to independent exploration?
|
||||
|
||||
16. **Do long waiting times — for example at Lempuyang — affect your perception of a site’s spiritual substance, or do you separate logistical issues from cultural meaning?**
|
||||
16. Do long waiting times — for example at Lempuyang — affect your perception of a site’s spiritual substance, or do you separate logistical issues from cultural meaning?
|
||||
|
||||
17. **Which infrastructural measures (e.g., visitor flow management, limited entry slots, silent zones) would enhance the cultural quality of your experience?**
|
||||
17. Which infrastructural measures (e.g., visitor flow management, limited entry slots, silent zones) would enhance the cultural quality of your experience?
|
||||
|
||||
18. **How should destinations communicate information in order to appeal to spiritually interested travelers without reinforcing mass-tourism dynamics?**
|
||||
18. How should destinations communicate information in order to appeal to spiritually interested travelers without reinforcing mass-tourism dynamics?
|
||||
|
||||
---
|
||||
|
||||
@@ -66,18 +58,18 @@ _(Accessibility, organization, hygiene standards, information systems)_
|
||||
|
||||
_(Perceived value, immaterial benefits, willingness to pay)_
|
||||
|
||||
19. **How do you personally assess the “value” of cultural attractions — in terms of emotional depth, learning outcomes, exclusivity, or something else?**
|
||||
19. How do you personally assess the “value” of cultural attractions — in terms of emotional depth, learning outcomes, exclusivity, or something else?
|
||||
|
||||
20. **Would you be willing to accept higher entrance fees or donations if they demonstrably contribute to preserving religious structures and practices? Why or why not?**
|
||||
20. Would you be willing to accept higher entrance fees or donations if they demonstrably contribute to preserving religious structures and practices? Why or why not?
|
||||
|
||||
21. **What would legitimize a paid cultural experience (e.g., guided participation in a ceremony) for you — and what would make it feel commercialized or inauthentic?**
|
||||
21. What would legitimize a paid cultural experience (e.g., guided participation in a ceremony) for you — and what would make it feel commercialized or inauthentic?
|
||||
|
||||
---
|
||||
|
||||
## VI. Segment Identity & Positioning (Lead-User Perspective)
|
||||
|
||||
22. **How would you describe yourself as a Bali traveler if your primary focus is cultural and spiritual depth?**
|
||||
22. How would you describe yourself as a Bali traveler if your primary focus is cultural and spiritual depth?
|
||||
|
||||
23. **Which typical Bali tourism offerings do you consciously avoid, and why do they not align with your travel philosophy?**
|
||||
23. Which typical Bali tourism offerings do you consciously avoid, and why do they not align with your travel philosophy?
|
||||
|
||||
24. **If a tourism brand wanted to position Bali specifically for culturally and spiritually motivated travelers, which narratives should it emphasize — and which should it avoid?**
|
||||
24. If a tourism brand wanted to position Bali specifically for culturally and spiritually motivated travelers, which narratives should it emphasize — and which should it avoid?
|
||||
|
||||
@@ -18,12 +18,6 @@ python prepare_corpus.py --input_csv ../data/intermediate/culture_reviews.csv --
|
||||
python make_raft_data.py --out_dir out --n_examples 10
|
||||
```
|
||||
|
||||
## Training der QLoRA-Adapter
|
||||
|
||||
```bash
|
||||
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
|
||||
```
|
||||
|
||||
## Inferenz
|
||||
|
||||
### Pre-Merged Modell + Adapter
|
||||
@@ -31,11 +25,3 @@ python make_raft_data.py --out_dir out --n_examples 10
|
||||
```bash
|
||||
python rag_chat_merged.py --model_dir /path/to/model_folder --out_dir out
|
||||
```
|
||||
|
||||
### Per Baseline Mistral 7B + PEFT-Adapter
|
||||
|
||||
Hinweis: das Skript wurde nach wenigen oberflächlichen Evaluationsrunden nicht weiter verwendet, da der beste Kandidat durch einen Merge des Basismodells und seiner PEFT-Adapter beschleunigt werden konnte und dieses Skript nicht länger relevant war.
|
||||
|
||||
```bash
|
||||
python deprecated_rag_chat.py --lora_dir out/mistral_balitwin_lora
|
||||
```
|
||||
|
||||
@@ -1,98 +0,0 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
import faiss
|
||||
import numpy as np
|
||||
import torch
|
||||
from peft import PeftModel
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
SYSTEM_PERSONA = """You are simulating a culturally interested Bali traveler segment for evaluation purposes.
|
||||
|
||||
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.
|
||||
|
||||
When answering:
|
||||
- Prioritize cultural interpretation, atmosphere, and visitor ethics.
|
||||
- Weigh trade-offs thoughtfully (e.g., crowds vs. significance).
|
||||
- Avoid generic travel advice and avoid promotional language.
|
||||
- Do not exaggerate.
|
||||
- Provide nuanced, reflective reasoning rather than bullet lists.
|
||||
- Keep answers concise but specific.
|
||||
|
||||
Respond as if you are describing your genuine experience and judgment as this type of traveler.
|
||||
|
||||
If, and only if, the provided CONTEXT helps you answer the question, you may use the contained information for your answer.
|
||||
"""
|
||||
|
||||
|
||||
def load_docstore(path):
|
||||
docs = []
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
docs.append(json.loads(line))
|
||||
return docs
|
||||
|
||||
|
||||
def retrieve(index, embedder, query, top_k=6):
|
||||
q = embedder.encode([query], normalize_embeddings=True).astype(np.float32)
|
||||
scores, ids = index.search(q, top_k)
|
||||
return ids[0].tolist(), scores[0].tolist()
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--base_model", default="mistralai/Mistral-7B-Instruct-v0.2")
|
||||
ap.add_argument("--lora_dir", default="out/mistral_balitwin_lora")
|
||||
ap.add_argument("--out_dir", default="out")
|
||||
ap.add_argument(
|
||||
"--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
|
||||
)
|
||||
ap.add_argument("--top_k", type=int, default=6)
|
||||
args = ap.parse_args()
|
||||
|
||||
index = faiss.read_index(os.path.join(args.out_dir, "faiss.index"))
|
||||
docstore = load_docstore(os.path.join(args.out_dir, "docstore.jsonl"))
|
||||
embedder = SentenceTransformer(args.embedding_model)
|
||||
|
||||
tok = AutoTokenizer.from_pretrained(args.base_model, use_fast=True)
|
||||
base = AutoModelForCausalLM.from_pretrained(
|
||||
args.base_model, device_map="auto", torch_dtype=torch.float16
|
||||
)
|
||||
model = PeftModel.from_pretrained(base, args.lora_dir)
|
||||
model.eval()
|
||||
|
||||
print("Type your question (Ctrl+C to exit).")
|
||||
while True:
|
||||
q = input("\nYou: ").strip()
|
||||
if not q:
|
||||
continue
|
||||
|
||||
ids, _ = retrieve(index, embedder, q, top_k=args.top_k)
|
||||
context_docs = [docstore[i]["text"] for i in ids]
|
||||
context_blob = "\n\n".join(
|
||||
[f"[DOC {i}] {t}" for i, t in enumerate(context_docs)]
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": SYSTEM_PERSONA},
|
||||
{"role": "user", "content": f"QUESTION: {q}\n\nCONTEXT:\n{context_blob}"},
|
||||
]
|
||||
inp = tok.apply_chat_template(messages, return_tensors="pt").to(model.device)
|
||||
|
||||
out = model.generate(
|
||||
inp,
|
||||
max_new_tokens=320,
|
||||
do_sample=True,
|
||||
temperature=0.7,
|
||||
top_p=0.9,
|
||||
eos_token_id=tok.eos_token_id,
|
||||
)
|
||||
ans = tok.decode(out[0][inp.shape[1] :], skip_special_tokens=True).strip()
|
||||
print(f"\nBaliTwin: {ans}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -2,11 +2,11 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
RAFT dataset builder (FAISS-based retrieval) -> Together.ai chat JSONL.
|
||||
RAFT dataset builder with FAISS-based retrieval.
|
||||
|
||||
Inputs (from your indexing script):
|
||||
- <index_dir>/faiss.index
|
||||
- <index_dir>/docstore.jsonl
|
||||
Inputs:
|
||||
- faiss.index
|
||||
- docstore.jsonl
|
||||
|
||||
Process:
|
||||
- Build a set of interview-style prompts (EN)
|
||||
@@ -20,9 +20,7 @@ Outputs:
|
||||
- raft_val.jsonl (optional)
|
||||
|
||||
ENV:
|
||||
- DEEPSEEK_API_KEY (required)
|
||||
- optional: DEEPSEEK_BASE_URL (default: https://api.deepseek.com)
|
||||
- optional: DEEPSEEK_MODEL (default: deepseek-chat)
|
||||
- DEEPSEEK_API_KEY
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -32,7 +30,7 @@ import random
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import faiss
|
||||
import numpy as np
|
||||
@@ -41,9 +39,6 @@ from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# DeepSeek client (OpenAI-compatible)
|
||||
# -----------------------------
|
||||
@dataclass
|
||||
class DeepSeekConfig:
|
||||
api_key: str
|
||||
@@ -89,9 +84,7 @@ class DeepSeekClient:
|
||||
last_err = e
|
||||
time.sleep(self.cfg.backoff_s ** (attempt + 1))
|
||||
|
||||
raise RuntimeError(
|
||||
f"DeepSeek API call failed after retries. Last error: {last_err}"
|
||||
)
|
||||
raise RuntimeError(f"DeepSeek API call failed. Last error: {last_err}")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
@@ -119,15 +112,13 @@ def read_docstore(docstore_path: str) -> Dict[int, Dict]:
|
||||
fid = int(obj["faiss_id"])
|
||||
mapping[fid] = obj
|
||||
if not mapping:
|
||||
raise ValueError("docstore.jsonl is empty or unreadable.")
|
||||
raise ValueError("docstore.jsonl is broken.")
|
||||
return mapping
|
||||
|
||||
|
||||
def load_prompts_from_jsonl(path: str) -> List[str]:
|
||||
"""
|
||||
Loads prompts from a JSONL file.
|
||||
Expected key: 'prompt' (preferred). Also accepts 'question' or 'text'.
|
||||
Ignores empty/short lines.
|
||||
"""
|
||||
prompts: List[str] = []
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
@@ -141,13 +132,13 @@ def load_prompts_from_jsonl(path: str) -> List[str]:
|
||||
if len(p) >= 20:
|
||||
prompts.append(p)
|
||||
if not prompts:
|
||||
raise ValueError(f"No prompts found in JSONL: {path}")
|
||||
raise ValueError(f"No prompts in JSONL: {path}")
|
||||
return prompts
|
||||
|
||||
|
||||
def load_prompts_from_txt(path: str) -> List[str]:
|
||||
"""
|
||||
Loads prompts from a TXT file (one prompt per line).
|
||||
Loads prompts from a TXT file (each line is a prompt).
|
||||
"""
|
||||
prompts: List[str] = []
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
@@ -156,7 +147,7 @@ def load_prompts_from_txt(path: str) -> List[str]:
|
||||
if len(p) >= 20:
|
||||
prompts.append(p)
|
||||
if not prompts:
|
||||
raise ValueError(f"No prompts found in TXT: {path}")
|
||||
raise ValueError(f"No prompts in TXT: {path}")
|
||||
return prompts
|
||||
|
||||
|
||||
@@ -173,9 +164,6 @@ def write_jsonl(path: str, rows: List[Dict]) -> None:
|
||||
f.write(json.dumps(r, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Persona + prompt templates (EN)
|
||||
# -----------------------------
|
||||
IMAGE_DIMS = [
|
||||
"Natural Attractions",
|
||||
"Atmosphere",
|
||||
@@ -184,36 +172,12 @@ IMAGE_DIMS = [
|
||||
"Value for Money",
|
||||
]
|
||||
|
||||
DEFAULT_PROMPTS_EN = [
|
||||
# Natural Attractions
|
||||
"In a lead user interview: what natural places in Bali felt genuinely memorable to you (rice terraces, volcanoes, waterfalls, coast), and why? Describe it like a lived experience.",
|
||||
"Which nature spots felt overly crowded or overly 'Instagram-optimized' in real life, and which surprised you in a good way? Explain with concrete moments.",
|
||||
# Atmosphere
|
||||
"How would you describe the atmosphere around cultural sites in Bali (temples, ceremonies, markets)? What signals authenticity vs. commercialization to you?",
|
||||
"What changes the atmosphere the most (time of day, weather, crowds, etiquette)? Share specific examples you would tell a marketer.",
|
||||
# Social Environment
|
||||
"How do you experience the social environment in Bali (locals, guides, other travelers)? What feels respectful and what feels performative or touristy?",
|
||||
"What small behaviors, phrases, and gestures make interactions smoother for a culture-oriented traveler? Give examples.",
|
||||
# Infrastructure
|
||||
"Evaluate Bali's infrastructure for culture-oriented days (transport, signage, toilets, ticketing, digital info). What works, what annoys you, and how do you adapt?",
|
||||
"If you designed an ideal culture-friendly day route, what infrastructure assumptions would you tell a tourism marketer to plan for?",
|
||||
# Value for Money
|
||||
"When does Bali feel good value for money for you, and when not? Discuss entrance fees, guides, food, tours, and hidden costs.",
|
||||
"How do you personally distinguish 'good value' from a tourist trap? List criteria and illustrate with examples.",
|
||||
]
|
||||
|
||||
|
||||
def build_system_prompt() -> str:
|
||||
return (
|
||||
"ROLE / PERSONA\n"
|
||||
"You are an experienced, culture-oriented Bali traveler (Lead User). You speak in natural, vivid English, "
|
||||
"as a real person in an interview. You share nuanced judgments, trade-offs, and concrete scenes.\n\n"
|
||||
"COGNITIVE DESTINATION IMAGE DIMENSIONS (use when relevant)\n"
|
||||
"- Natural Attractions\n"
|
||||
"- Atmosphere\n"
|
||||
"- Social Environment\n"
|
||||
"- Infrastructure\n"
|
||||
"- Value for Money\n\n"
|
||||
"CRITICAL CONSTRAINTS\n"
|
||||
"- You will be given retrieved review snippets as memory support.\n"
|
||||
"- Do NOT quote them verbatim and do NOT cite them as 'the review says'.\n"
|
||||
@@ -382,7 +346,8 @@ def main():
|
||||
elif args.prompts_txt:
|
||||
prompts = load_prompts_from_txt(args.prompts_txt)
|
||||
else:
|
||||
prompts = list(DEFAULT_PROMPTS_EN)
|
||||
print("Provide a prompt source with --prompts_jsonl or --prompts_txt.")
|
||||
exit(1)
|
||||
|
||||
if args.shuffle_prompts:
|
||||
random.shuffle(prompts)
|
||||
|
||||
@@ -9,7 +9,18 @@ import torch
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
||||
|
||||
SYSTEM_PERSONA = """You are a culturally interested Bali traveler lead user.
|
||||
# """
|
||||
# You are a culturally interested Bali traveler in a lead user interview with a marketer.
|
||||
|
||||
# When answering:
|
||||
# - Do not exaggerate.
|
||||
# - Provide nuanced, reflective reasoning rather than bullet lists.
|
||||
# - Keep answers concise but specific.
|
||||
|
||||
# Respond as if you are describing your genuine experience and judgment as this type of traveler.
|
||||
# """
|
||||
|
||||
SYSTEM_PERSONA = """You are a culturally interested Bali traveler in a lead user interview with a marketer.
|
||||
|
||||
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.
|
||||
|
||||
@@ -56,7 +67,7 @@ def main():
|
||||
"--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
|
||||
)
|
||||
ap.add_argument("--top_k", type=int, default=12)
|
||||
ap.add_argument("--max_new_tokens", type=int, default=320)
|
||||
ap.add_argument("--max_new_tokens", type=int, default=1000)
|
||||
ap.add_argument("--no_model", action=argparse.BooleanOptionalAction)
|
||||
args = ap.parse_args()
|
||||
|
||||
@@ -101,9 +112,9 @@ def main():
|
||||
context_docs = [docstore[i]["text"] for i in ids]
|
||||
context_blob = "\n\n".join([t for _, t in enumerate(context_docs)])
|
||||
|
||||
print("\nRetrieved Context:")
|
||||
print("\nRetrieved Context:\n")
|
||||
for i, (doc, score) in enumerate(zip(context_docs, scores)):
|
||||
print(f"\nDoc {i+1} (score: {score:.4f}):\n{doc}")
|
||||
print(f"Doc {i+1} (score: {score:.4f}):\n{doc}\n\n")
|
||||
|
||||
messages = [
|
||||
# {"role": "system", "content": SYSTEM_PERSONA},
|
||||
|
||||
Reference in New Issue
Block a user