diff --git a/bertopic/nb_bertopic.py b/bertopic/nb_bertopic.py
new file mode 100644
index 0000000..dd2ea41
--- /dev/null
+++ b/bertopic/nb_bertopic.py
@@ -0,0 +1,577 @@
+# ---
+# jupyter:
+# jupytext:
+# text_representation:
+# extension: .py
+# format_name: percent
+# format_version: '1.3'
+# jupytext_version: 1.18.0
+# kernelspec:
+# display_name: .venv
+# language: python
+# name: python3
+# ---
+
+# %% [markdown]
+# # Topic Detection: Bali Tourist Reviews
+#
+
+# %% [markdown]
+# ## Preparation
+#
+# ### Dependency Loading
+#
+
+# %%
+import json
+import pickle
+import re
+
+import gensim.corpora as corpora
+import nltk
+import numpy as np
+import pandas as pd
+import spacy
+from bertopic.representation import KeyBERTInspired
+from bertopic.vectorizers import ClassTfidfTransformer
+from gensim.models.coherencemodel import CoherenceModel
+from hdbscan import HDBSCAN
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from umap import UMAP
+
+from bertopic import BERTopic
+
+nlp = spacy.load("en_core_web_sm")
+
+nltk.download("stopwords")
+nltk.download("punkt")
+nltk.download("wordnet")
+
+# %% [markdown]
+# ### Parameters and Tracking
+#
+
+# %%
+RECREATE_MODEL = True
+RECREATE_REDUCED_MODEL = True
+PROCESS_DATA = False
+REDUCE_OUTLIERS = True
+USE_CONDENSED_MODEL = False
+
+DATA_SAMPLE_SIZE = -1 # -1 for all data
+
+# Classical coherence score. Warning: needs swap to not kill your PC
+CALCULATE_COHERENCE = False
+
+# Vectorization
+MIN_DOCUMENT_FREQUENCY = 1
+MAX_NGRAM = 2
+
+# HDBSCAN Parameters
+MIN_TOPIC_SIZE = 200
+MIN_SAMPLES = 25
+
+# UMAP Parameters
+N_NEIGHBORS = 15
+N_COMPONENTS = 2
+MIN_DIST = 0.01
+
+# Topic Modeling
+TOP_N_WORDS = 10
+MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip
+
+# %% [markdown]
+# ### Data Loading & Preprocessing
+#
+
+# %%
+if DATA_SAMPLE_SIZE != -1:
+ reviews = (
+ pd.read_csv("../data/original/reviews.tab", sep="\t")
+ .sample(n=DATA_SAMPLE_SIZE)
+ .review.dropna()
+ .to_list()
+ )
+else:
+ reviews = (
+ pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list()
+ )
+
+print("Loaded {} reviews".format(len(reviews)))
+
+# %%
+# List of NE in Bali for NER enhancement
+with open("../data/supporting/bali_ner.json", "r") as f:
+ bali_places = json.load(f)
+bali_places_set = set(bali_places)
+
+# Stop word definition
+extra_stopwords = ["bali", "idr", "usd"]
+stop_words = set(stopwords.words("english"))
+with open("../data/supporting/stopwords-en.json", "r") as f:
+ extra_stopwords.extend(json.load(f))
+
+# Custom replacements
+rep = {
+ r"\\n": " ",
+ r"\n": " ",
+ r'\\"': "",
+ r'"': "",
+ "mongkey": "monkey",
+ "monky": "monkey",
+ "verry": "very",
+}
+rep = dict((re.escape(k), v) for k, v in rep.items())
+pattern = re.compile("|".join(rep.keys()))
+
+lemmatizer = WordNetLemmatizer()
+
+
+def preprocess(text):
+ # Step 1: Apply custom replacements (typos, special cases)
+ text = text.lower()
+ text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
+
+ # Step 2: Clean text
+ text = re.sub(r"\d+", " ", text)
+ text = re.sub(r"\W+", " ", text)
+
+ doc = nlp(text)
+
+ # Step 3: POS tagging and filtering
+ filtered_tokens = [
+ token.text
+ for token in doc
+ if token.pos_ in {"NOUN", "PROPN"}
+ or token.ent_type_ in {"GPE", "LOC", "FAC"}
+ or token.text in bali_places_set
+ ]
+
+ # Step 4: Lemmatization and stopword removal
+ lemmatized_tokens = [
+ lemmatizer.lemmatize(w)
+ for w in filtered_tokens
+ if w not in stop_words and w not in extra_stopwords and len(w) > 2
+ ]
+
+ return lemmatized_tokens
+
+
+# %%
+if PROCESS_DATA:
+ print("Processing reviews...")
+ reviews = [preprocess(review) for review in reviews]
+
+ with open("../data/intermediate/processed_texts.pkl", "wb") as f:
+ pickle.dump(reviews, f)
+else:
+ with open("../data/intermediate/processed_texts.pkl", "rb") as f:
+ reviews = pickle.load(f)
+ reviews = [
+ " ".join(review) if isinstance(review, list) else review
+ for review in reviews
+ ]
+
+print(reviews[:1])
+
+# %% [markdown]
+# ### Pre-calculate Embeddings
+#
+
+# %%
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+embeddings = embedding_model.encode(reviews, show_progress_bar=True)
+
+# %% [markdown]
+# ## Model Creation
+#
+
+# %% [markdown]
+# ### Dimensionality Reduction (UMAP)
+#
+
+# %%
+umap_model = UMAP(
+ n_neighbors=N_NEIGHBORS,
+ n_components=N_COMPONENTS,
+ min_dist=MIN_DIST,
+ metric="cosine",
+ low_memory=True,
+ random_state=42,
+)
+reduced_embeddings = umap_model.fit_transform(embeddings)
+
+# %% [markdown]
+# ### BERTopic Model Creation
+#
+
+# %%
+if RECREATE_MODEL:
+ ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
+ vectorizer_model = CountVectorizer(
+ min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM)
+ )
+
+ representation_model = KeyBERTInspired()
+ hdbscan_model = HDBSCAN(
+ min_cluster_size=MIN_TOPIC_SIZE,
+ min_samples=MIN_SAMPLES,
+ metric="euclidean",
+ cluster_selection_method="eom",
+ gen_min_span_tree=True,
+ prediction_data=True,
+ )
+
+ topic_model = BERTopic(
+ embedding_model=embedding_model,
+ ctfidf_model=ctfidf_model,
+ vectorizer_model=vectorizer_model,
+ umap_model=umap_model,
+ hdbscan_model=hdbscan_model,
+ representation_model=representation_model,
+ verbose=True,
+ calculate_probabilities=True,
+ language="english",
+ top_n_words=TOP_N_WORDS,
+ nr_topics=MAX_TOPICS,
+ )
+
+ topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)
+
+ topic_labels = topic_model.generate_topic_labels(
+ nr_words=3, topic_prefix=True, word_length=15, separator=" - "
+ )
+ topic_model.set_topic_labels(topic_labels)
+ BERTopic.save(topic_model, "output/model.bertopic")
+else:
+ print("Nevermind, loading existing model")
+ topic_model = BERTopic.load("output/model.bertopic")
+
+# %% [markdown]
+# ## Fine Tuning
+#
+# ### Topic Condensation
+#
+
+# %%
+if RECREATE_REDUCED_MODEL:
+ done = False
+ iteration = 1
+ while not done:
+ print(f"Iteration {iteration}")
+ iteration += 1
+ similarity_matrix = cosine_similarity(
+ np.array(topic_model.topic_embeddings_)[1:, :]
+ )
+ nothing_to_merge = True
+
+ for i in range(similarity_matrix.shape[0]):
+ for j in range(i + 1, similarity_matrix.shape[1]):
+ sim = similarity_matrix[i, j]
+ if sim > 0.9:
+ nothing_to_merge = False
+ t1, t2 = i, j
+ try:
+ t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
+ t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
+ print(
+ f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
+ )
+ topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
+
+ topic_labels = topic_model.generate_topic_labels(
+ nr_words=3,
+ topic_prefix=True,
+ word_length=15,
+ separator=" - ",
+ )
+ topic_model.set_topic_labels(topic_labels)
+ except Exception as e:
+ print(f"Failed to merge {t1} and {t2}: {e}")
+ if nothing_to_merge:
+ print("No more topics to merge.")
+ done = True
+
+ # BERTopic.save(topic_model, "bertopic/model_reduced.bertopic")
+elif USE_CONDENSED_MODEL:
+ print("Nevermind, loading existing reduced model")
+ topic_model = BERTopic.load("bertopic/model_reduced.bertopic")
+else:
+ print("Skipping topic reduction")
+
+# %% [markdown]
+# ### Outlier Reduction
+#
+
+# %%
+if REDUCE_OUTLIERS:
+ new_topics = topic_model.reduce_outliers(
+ reviews,
+ topic_model.topics_,
+ probabilities=topic_model.probabilities_,
+ threshold=0.05,
+ strategy="probabilities",
+ )
+ topic_model.update_topics(reviews, topics=new_topics)
+
+# %% [markdown]
+# ## Results
+#
+# ### Classification
+#
+
+# %%
+import random
+from pathlib import Path
+
+# --- config ---
+topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
+INPUT_PATH = "../data/original/reviews.tab" # TSV with a 'review' column
+OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
+OUTPUT_DIR = Path("../raft/corpus")
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+BATCH_SIZE = 60
+MIN_CHARS = 40
+SEED = 42
+
+# --- load data ---
+data = pd.read_csv(INPUT_PATH, sep="\t")
+
+# If you already have `reviews` elsewhere, replace the next line with that variable
+reviews = data["review"].astype(str).fillna("")
+
+# Topic model document info
+df = topic_model.get_document_info(reviews) # assumes your model is already fitted
+df["Original"] = reviews.values
+
+# --- filter by topics and length ---
+filtered = df[df["Topic"].isin(topics_to_keep)].copy()
+filtered["Original"] = filtered["Original"].str.strip()
+filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
+
+# Save an audit CSV
+filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
+
+# --- deterministic shuffle + write batched corpus files ---
+total_files = 0
+total_reviews = 0
+rng = random.Random(SEED)
+
+for topic_val, g in filtered.groupby("Topic", sort=True):
+ reviews_list = g["Original"].tolist()
+
+ # deterministic shuffle within topic
+ rng.shuffle(reviews_list)
+
+ # chunk into batches of up to 60
+ for start in range(0, len(reviews_list), BATCH_SIZE):
+ chunk = reviews_list[start : start + BATCH_SIZE]
+ if not chunk:
+ continue
+
+ # simple header for traceability
+ header = (
+ f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
+ )
+
+ lines = [header, ""]
+ for i, txt in enumerate(chunk, 1):
+ lines.append(f"({i}) {txt}")
+
+ part_idx = start // BATCH_SIZE + 1
+ fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
+ (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
+
+ total_files += 1
+ total_reviews += len(chunk)
+
+print(
+ f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
+)
+print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")
+
+# %%
+doc_topic_matrix = probs
+
+# column names
+topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]
+
+# index names
+docnames = ["Review " + str(i) for i in range(len(reviews))]
+
+# Make the pandas dataframe
+df_document_topic = pd.DataFrame(
+ np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
+)
+
+# Get dominant topic for each document
+dominant_topic = np.argmax(doc_topic_matrix, axis=1)
+df_document_topic["dominant_topic"] = dominant_topic
+
+
+# Styling
+def color_stuff(val):
+ if val > 0.1:
+ color = "green"
+ elif val > 0.05:
+ color = "orange"
+ else:
+ color = "grey"
+ return "color: {col}".format(col=color)
+
+
+def make_bold(val):
+ weight = 700 if val > 0.1 else 400
+ return "font-weight: {weight}".format(weight=weight)
+
+
+# Apply Style
+df_document_topics = (
+ df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
+)
+df_document_topics
+
+# %% [markdown]
+# ### Document Visualization
+#
+
+# %%
+vis = topic_model.visualize_documents(
+ docs=reviews,
+ reduced_embeddings=reduced_embeddings,
+ custom_labels=True,
+ hide_annotations=True,
+)
+vis.write_html("output/visualization.html")
+vis
+
+# %% [markdown]
+# ### Similarity Matrix
+#
+
+# %%
+topic_model.visualize_heatmap()
+
+# %% [markdown]
+# ### Topic Info
+#
+
+# %%
+topic_model.get_topic_info()
+
+# %% [markdown]
+# ### Semantic Coherence
+#
+
+# %%
+topic_words = []
+for topic_id in topic_model.get_topic_info()["Topic"]:
+ # Skip outlier topic
+ if topic_id < 0:
+ continue
+
+ words = [word for word, _ in topic_model.get_topic(topic_id)]
+ topic_words.append(words)
+
+# Compute mean pairwise cosine similarity for each topic
+coherence_scores = []
+for words in topic_words:
+ coherence_embeddings = embedding_model.encode(words)
+ sim_matrix = cosine_similarity(coherence_embeddings)
+
+ # Ignore self-similarity
+ np.fill_diagonal(sim_matrix, 0)
+ mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
+ coherence_scores.append(mean_sim)
+
+overall_coherence = np.mean(coherence_scores)
+
+print(len(reviews), "reviews processed")
+print(len(topic_model.get_topic_info()) - 1, "topics found")
+print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
+
+# %% [markdown]
+# ### Topic Coherence
+#
+
+# %%
+# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
+
+if CALCULATE_COHERENCE:
+ # Preprocess Documents
+ documents = pd.DataFrame(
+ {"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
+ )
+ documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
+ {"Document": " ".join}
+ )
+ cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
+
+ # Extract vectorizer and analyzer from BERTopic
+ vectorizer = topic_model.vectorizer_model
+ analyzer = vectorizer.build_analyzer()
+
+ # Extract features for Topic Coherence evaluation
+ words = vectorizer.get_feature_names_out()
+ tokens = [analyzer(doc) for doc in cleaned_docs]
+ dictionary = corpora.Dictionary(tokens)
+ corpus = [dictionary.doc2bow(token) for token in tokens]
+ topic_words = [
+ [words for words, _ in topic_model.get_topic(topic)]
+ for topic in range(len(set(topics)) - 1)
+ ]
+
+ # %env TOKENIZERS_PARALLELISM=false
+
+ for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
+ coherence_model = CoherenceModel(
+ topics=topic_words,
+ texts=tokens,
+ corpus=corpus,
+ dictionary=dictionary,
+ coherence=measurement,
+ )
+ coherence_score = coherence_model.get_coherence()
+ print(f"Coherence ({measurement}): {coherence_score:.4f}")
+else:
+ print("Skipping classical coherence calculation")
+
+# %% [markdown]
+# ### Term Search
+#
+
+# %%
+search_term = "uluwatu"
+
+similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
+for i in range(len(similar_topics)):
+ # \n{topic_model.get_topic(similar_topics[i])}\n
+ print(
+            f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])['CustomName'][0]}"
+ )
+
+# %% [markdown]
+# ### Topic Hierarchy
+#
+
+# %%
+topic_model.visualize_hierarchy(custom_labels=True)
+
+# %% [markdown]
+# ### Intertopic Distance Map
+#
+
+# %%
+topic_model.visualize_topics()
+
+# %% [markdown]
+# ### Topic Word Scores
+#
+
+# %%
+topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
diff --git a/bertopic/nb_bertopic_temples.py b/bertopic/nb_bertopic_temples.py
index 152cac5..66663be 100644
--- a/bertopic/nb_bertopic_temples.py
+++ b/bertopic/nb_bertopic_temples.py
@@ -70,8 +70,8 @@ MIN_SAMPLES = 15
# UMAP Parameters
N_NEIGHBORS = 15
-N_COMPONENTS = 2
-MIN_DIST = 0.01
+N_COMPONENTS = 5
+MIN_DIST = 0.1
# Topic Modeling
TOP_N_WORDS = 10
diff --git a/figures/bali_destinations_labeled.html b/figures/bali_destinations_labeled.html
deleted file mode 100644
index ac6bed1..0000000
--- a/figures/bali_destinations_labeled.html
+++ /dev/null
@@ -1,722 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/figures/bali_map.py b/figures/bali_map.py
deleted file mode 100644
index 38036d4..0000000
--- a/figures/bali_map.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# bali_map.py
-# Creates an interactive HTML map of Bali (and nearby islands) with readable, always-visible labels.
-
-import folium
-
-DESTINATIONS = {
- "Sacred Monkey Forest": (
- -8.5187511,
- 115.2585973,
- ), # :contentReference[oaicite:0]{index=0}
- "Uluwatu Temple": (
- -8.8291432,
- 115.0849069,
- ), # :contentReference[oaicite:1]{index=1}
- "Sanur Beach": (-8.673889, 115.263611), # :contentReference[oaicite:2]{index=2}
- "Tanah Lot Temple": (
- -8.618786,
- 115.086733,
- ), # :contentReference[oaicite:3]{index=3}
- "Seminyak Beach": (-8.6925, 115.158611), # :contentReference[oaicite:4]{index=4}
- "Nusa Dua": (-8.791918, 115.225375), # :contentReference[oaicite:5]{index=5}
- "Bali Zoo": (-8.59128, 115.26456), # :contentReference[oaicite:6]{index=6}
- "Mount Batur": (-8.23889, 115.37750), # :contentReference[oaicite:7]{index=7}
- "Ulun Danu Bratan": (
- -8.275177,
- 115.1668487,
- ), # :contentReference[oaicite:8]{index=8}
- "Tirta Gangga": (-8.411944, 115.5875), # :contentReference[oaicite:9]{index=9}
- "Pandawa Beach": (-8.84586, 115.18417), # :contentReference[oaicite:10]{index=10}
- "Jimbaran Bay": (-8.79093, 115.16006), # :contentReference[oaicite:11]{index=11}
- "Double Six Beach": (
- -8.6975074,
- 115.1610332,
- ), # :contentReference[oaicite:12]{index=12}
- "Devil Tears": (-8.6905650, 115.4302884), # :contentReference[oaicite:13]{index=13}
- "Kelingking Beach": (
- -8.750644,
- 115.474693,
- ), # :contentReference[oaicite:14]{index=14}
- "Lempuyang Temple": (
- -8.395195,
- 115.647885,
- ), # :contentReference[oaicite:15]{index=15}
- "Canggu Beach": (-8.639877, 115.140172), # :contentReference[oaicite:16]{index=16}
- "Mount Agung": (-8.340686, 115.503622), # :contentReference[oaicite:17]{index=17}
-}
-
-# --- Map base ---
-m = folium.Map(
- location=(-8.45, 115.20),
- zoom_start=9,
- tiles="CartoDB positron",
- control_scale=True,
- zoom_snap=0.1,
- zoom_delta=0.1,
- max_zoom=18,
-)
-
-# --- Label styling (readable, always visible) ---
-LABEL_STYLE = """
-padding: 3px 6px;
-font-size: 16px;
-font-weight: 600;
-color: #111;
-white-space: nowrap;
-"""
-
-# Per-label pixel offsets (x, y). Positive y moves the label down.
-LABEL_OFFSETS = {
- "Nusa Dua": (0, 20),
- "Double Six Beach": (0, 20),
-}
-
-
-def add_point_with_label(name: str, lat: float, lon: float):
- # Small dot at the exact coordinate
- folium.CircleMarker(
- location=(lat, lon),
- radius=4,
- weight=2,
- fill=True,
- fill_opacity=1.0,
- tooltip=name, # still useful on hover
- ).add_to(m)
-
- # Slightly offset label so it doesn't sit directly on the dot
- offset_x, offset_y = LABEL_OFFSETS.get(name, (0, 0))
- base_anchor_x, base_anchor_y = (-8, 12)
- folium.Marker(
- location=(lat, lon),
- icon=folium.DivIcon(
- icon_size=(1, 1),
- icon_anchor=(
- base_anchor_x + offset_x,
- base_anchor_y - offset_y,
- ), # pixel offset: left/up relative to point
- html=f'{name}
',
- ),
- ).add_to(m)
-
-
-# Add all destinations
-lats, lons = [], []
-for name, (lat, lon) in DESTINATIONS.items():
- add_point_with_label(name, lat, lon)
- lats.append(lat)
- lons.append(lon)
-
-# Fit map bounds to include Nusa Penida / Lembongan as well
-pad = 0.005
-m.fit_bounds([[min(lats) - pad, min(lons) - pad], [max(lats) + pad, max(lons) + pad]])
-
-# Output
-out_file = "bali_destinations_labeled.html"
-m.save(out_file)
-print(f"Saved: {out_file}")
diff --git a/figures/bargraph.py b/figures/bargraph.py
deleted file mode 100644
index 0078a7f..0000000
--- a/figures/bargraph.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import argparse
-import json
-import os
-import sys
-
-import matplotlib.pyplot as plt
-
-
-def load_json_data(file_path):
- """
- Load and validate JSON data from a file.
- Expected format:
- {
- "label1": value1,
- "label2": value2,
- ...
- }
- """
- if not os.path.exists(file_path):
- raise FileNotFoundError(f"File not found: {file_path}")
-
- with open(file_path, "r", encoding="utf-8") as f:
- data = json.load(f)
-
- if not isinstance(data, dict):
- raise ValueError(
- "JSON must be an object with key-value pairs (labels: values)."
- )
-
- for key, value in data.items():
- if not isinstance(key, str):
- raise ValueError("All keys must be strings (labels).")
- if not isinstance(value, (int, float)):
- raise ValueError("All values must be numeric (int or float).")
-
- return data
-
-
-def create_bar_graph(
- data, title="Bar Graph", x_label="Labels", y_label="Values", output=None
-):
- """
- Create a bar graph from a dictionary of data.
- """
- labels = list(data.keys())
- values = list(data.values())
-
- plt.figure(figsize=(10, 6))
- plt.bar(labels, values)
- plt.xlabel(x_label)
- plt.ylabel(y_label)
- plt.title(title)
- plt.xticks(rotation=45)
- plt.tight_layout()
-
- if output:
- plt.savefig(output)
- print(f"Graph saved to: {output}")
- else:
- plt.show()
-
-
-def main():
- parser = argparse.ArgumentParser(
- description="Generate a bar graph from a JSON file containing key-value pairs."
- )
- parser.add_argument(
- "json_path",
- type=str,
- help="Path to the JSON file (e.g., data.json)",
- )
- parser.add_argument(
- "--title",
- type=str,
- default="Bar Graph",
- help="Title of the bar graph",
- )
- parser.add_argument(
- "--x_label",
- type=str,
- default="Labels",
- help="Label for the x-axis",
- )
- parser.add_argument(
- "--y_label",
- type=str,
- default="Values",
- help="Label for the y-axis",
- )
- parser.add_argument(
- "--output",
- type=str,
- default=None,
- help="Optional output file path (e.g., graph.png). If not provided, the graph will be displayed.",
- )
-
- args = parser.parse_args()
-
- try:
- data = load_json_data(args.json_path)
- create_bar_graph(
- data,
- title=args.title,
- x_label=args.x_label,
- y_label=args.y_label,
- output=args.output,
- )
- except Exception as e:
- print(f"Error: {e}", file=sys.stderr)
- sys.exit(1)
-
-
-if __name__ == "__main__":
- main()
diff --git a/figures/requirements.txt b/figures/requirements.txt
deleted file mode 100644
index 9aad9da..0000000
--- a/figures/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-matplotlib
-folium
-pandas
diff --git a/figures/review_dist.py b/figures/review_dist.py
deleted file mode 100644
index b92d989..0000000
--- a/figures/review_dist.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-"""
-Read a .tab (TSV) file with a single column named 'review'.
-1) Print number of rows
-2) Drop exact duplicate reviews and print count again
-3) Build JSON describing the distribution of review length (in words) for remaining reviews
-"""
-
-import argparse
-import json
-import sys
-from collections import Counter
-from pathlib import Path
-
-import pandas as pd
-
-
-def word_count(text: str) -> int:
- # Count words by whitespace splitting after stripping.
- # Treat non-string / NaN as 0 words (you can change this if you want to drop them).
- if not isinstance(text, str):
- return 0
- s = text.strip()
- if not s:
- return 0
- return len(s.split())
-
-
-def main() -> int:
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "input_tab", help="Path to .tab/.tsv file with a 'review' column"
- )
- parser.add_argument(
- "--out",
- default="review_length_distribution.json",
- help="Output JSON path (default: review_length_distribution.json)",
- )
- args = parser.parse_args()
-
- in_path = Path(args.input_tab)
- if not in_path.exists():
- print(f"ERROR: file not found: {in_path}", file=sys.stderr)
- return 1
-
- # Read as TSV. Keep empty strings; pandas will use NaN for empty fields unless keep_default_na=False.
- df = pd.read_csv(in_path, sep="\t", dtype=str, keep_default_na=False)
-
- if "review" not in df.columns:
- print(
- f"ERROR: expected a column named 'review'. Found: {list(df.columns)}",
- file=sys.stderr,
- )
- return 1
-
- n_before = len(df)
- print(f"Rows before dedup: {n_before}")
-
- # Exact duplicates based on the full string in "review".
- # If you want to ignore leading/trailing spaces, do df['review']=df['review'].str.strip() first.
- df_dedup = df.drop_duplicates(subset=["review"], keep="first").reset_index(
- drop=True
- )
-
- n_after = len(df_dedup)
- print(f"Rows after dedup: {n_after}")
-
- # Compute word counts for remaining reviews
- lengths = df_dedup["review"].map(word_count)
-
- # Distribution (histogram): word_count -> number of reviews
- dist = Counter(lengths.tolist())
-
- result = {
- "file": str(in_path),
- "rows_before_dedup": n_before,
- "rows_after_dedup": n_after,
- "distribution_word_length": {
- # JSON keys must be strings; keep as strings for portability.
- str(k): v
- for k, v in sorted(dist.items(), key=lambda kv: int(kv[0]))
- },
- "summary": {
- "min_words": int(lengths.min()) if len(lengths) else 0,
- "max_words": int(lengths.max()) if len(lengths) else 0,
- "mean_words": float(lengths.mean()) if len(lengths) else 0.0,
- "median_words": float(lengths.median()) if len(lengths) else 0.0,
- },
- }
-
- out_path = Path(args.out)
- out_path.write_text(
- json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8"
- )
- print(f"Wrote JSON: {out_path}")
-
- return 0
-
-
-if __name__ == "__main__":
- raise SystemExit(main())
diff --git a/figures/review_length_info.json b/figures/review_length_info.json
deleted file mode 100644
index 9f55d2d..0000000
--- a/figures/review_length_info.json
+++ /dev/null
@@ -1,604 +0,0 @@
-{
- "file": "../data/original/reviews.tab",
- "rows_before_dedup": 56446,
- "rows_after_dedup": 55662,
- "distribution_word_length": {
- "8": 1,
- "9": 5,
- "10": 10,
- "11": 14,
- "12": 20,
- "13": 29,
- "14": 37,
- "15": 92,
- "16": 163,
- "17": 308,
- "18": 482,
- "19": 728,
- "20": 859,
- "21": 977,
- "22": 944,
- "23": 989,
- "24": 937,
- "25": 1032,
- "26": 946,
- "27": 927,
- "28": 928,
- "29": 920,
- "30": 926,
- "31": 879,
- "32": 897,
- "33": 856,
- "34": 759,
- "35": 829,
- "36": 774,
- "37": 708,
- "38": 771,
- "39": 717,
- "40": 693,
- "41": 737,
- "42": 734,
- "43": 655,
- "44": 616,
- "45": 630,
- "46": 680,
- "47": 609,
- "48": 588,
- "49": 586,
- "50": 598,
- "51": 562,
- "52": 543,
- "53": 563,
- "54": 549,
- "55": 551,
- "56": 478,
- "57": 522,
- "58": 450,
- "59": 515,
- "60": 509,
- "61": 461,
- "62": 453,
- "63": 451,
- "64": 483,
- "65": 403,
- "66": 442,
- "67": 404,
- "68": 418,
- "69": 389,
- "70": 394,
- "71": 355,
- "72": 357,
- "73": 389,
- "74": 360,
- "75": 356,
- "76": 338,
- "77": 330,
- "78": 308,
- "79": 327,
- "80": 303,
- "81": 302,
- "82": 306,
- "83": 273,
- "84": 276,
- "85": 265,
- "86": 268,
- "87": 263,
- "88": 264,
- "89": 229,
- "90": 244,
- "91": 239,
- "92": 212,
- "93": 267,
- "94": 211,
- "95": 226,
- "96": 247,
- "97": 219,
- "98": 239,
- "99": 201,
- "100": 220,
- "101": 213,
- "102": 180,
- "103": 194,
- "104": 204,
- "105": 201,
- "106": 200,
- "107": 149,
- "108": 189,
- "109": 196,
- "110": 178,
- "111": 140,
- "112": 157,
- "113": 150,
- "114": 160,
- "115": 130,
- "116": 151,
- "117": 159,
- "118": 151,
- "119": 118,
- "120": 138,
- "121": 115,
- "122": 107,
- "123": 121,
- "124": 99,
- "125": 135,
- "126": 126,
- "127": 125,
- "128": 97,
- "129": 99,
- "130": 95,
- "131": 92,
- "132": 86,
- "133": 108,
- "134": 115,
- "135": 101,
- "136": 101,
- "137": 103,
- "138": 91,
- "139": 81,
- "140": 92,
- "141": 91,
- "142": 95,
- "143": 76,
- "144": 84,
- "145": 91,
- "146": 84,
- "147": 87,
- "148": 92,
- "149": 73,
- "150": 78,
- "151": 71,
- "152": 76,
- "153": 87,
- "154": 60,
- "155": 67,
- "156": 67,
- "157": 88,
- "158": 56,
- "159": 66,
- "160": 41,
- "161": 56,
- "162": 61,
- "163": 68,
- "164": 62,
- "165": 67,
- "166": 52,
- "167": 62,
- "168": 47,
- "169": 41,
- "170": 49,
- "171": 47,
- "172": 43,
- "173": 39,
- "174": 61,
- "175": 56,
- "176": 55,
- "177": 47,
- "178": 34,
- "179": 44,
- "180": 43,
- "181": 37,
- "182": 48,
- "183": 47,
- "184": 39,
- "185": 38,
- "186": 42,
- "187": 42,
- "188": 35,
- "189": 43,
- "190": 39,
- "191": 38,
- "192": 37,
- "193": 27,
- "194": 28,
- "195": 40,
- "196": 33,
- "197": 36,
- "198": 40,
- "199": 35,
- "200": 30,
- "201": 28,
- "202": 28,
- "203": 26,
- "204": 28,
- "205": 32,
- "206": 31,
- "207": 36,
- "208": 36,
- "209": 24,
- "210": 20,
- "211": 34,
- "212": 26,
- "213": 31,
- "214": 27,
- "215": 25,
- "216": 23,
- "217": 26,
- "218": 20,
- "219": 20,
- "220": 20,
- "221": 28,
- "222": 15,
- "223": 18,
- "224": 17,
- "225": 22,
- "226": 16,
- "227": 29,
- "228": 27,
- "229": 23,
- "230": 14,
- "231": 23,
- "232": 22,
- "233": 21,
- "234": 23,
- "235": 16,
- "236": 18,
- "237": 14,
- "238": 11,
- "239": 17,
- "240": 8,
- "241": 16,
- "242": 12,
- "243": 18,
- "244": 15,
- "245": 11,
- "246": 24,
- "247": 14,
- "248": 18,
- "249": 15,
- "250": 11,
- "251": 17,
- "252": 17,
- "253": 15,
- "254": 17,
- "255": 18,
- "256": 14,
- "257": 21,
- "258": 13,
- "259": 16,
- "260": 10,
- "261": 20,
- "262": 8,
- "263": 9,
- "264": 11,
- "265": 16,
- "266": 6,
- "267": 14,
- "268": 14,
- "269": 12,
- "270": 11,
- "271": 12,
- "272": 9,
- "273": 5,
- "274": 7,
- "275": 4,
- "276": 6,
- "277": 10,
- "278": 11,
- "279": 13,
- "280": 7,
- "281": 9,
- "282": 6,
- "283": 9,
- "284": 10,
- "285": 9,
- "286": 11,
- "287": 8,
- "288": 5,
- "289": 6,
- "290": 8,
- "291": 4,
- "292": 11,
- "293": 6,
- "294": 11,
- "295": 11,
- "296": 7,
- "297": 4,
- "298": 7,
- "299": 13,
- "300": 7,
- "301": 15,
- "302": 10,
- "303": 7,
- "304": 11,
- "305": 3,
- "306": 7,
- "307": 8,
- "308": 6,
- "309": 4,
- "310": 7,
- "311": 4,
- "312": 8,
- "313": 5,
- "314": 1,
- "315": 8,
- "316": 8,
- "317": 9,
- "318": 8,
- "319": 6,
- "320": 8,
- "321": 2,
- "322": 8,
- "323": 6,
- "324": 9,
- "325": 6,
- "326": 8,
- "327": 3,
- "328": 8,
- "329": 7,
- "330": 5,
- "331": 8,
- "332": 7,
- "333": 2,
- "334": 1,
- "335": 9,
- "336": 4,
- "337": 6,
- "338": 4,
- "339": 3,
- "340": 6,
- "341": 5,
- "342": 3,
- "343": 4,
- "344": 3,
- "345": 5,
- "346": 3,
- "347": 5,
- "348": 3,
- "349": 3,
- "350": 3,
- "351": 2,
- "352": 8,
- "353": 4,
- "354": 4,
- "355": 4,
- "356": 3,
- "357": 4,
- "358": 3,
- "359": 3,
- "360": 8,
- "361": 6,
- "362": 5,
- "363": 8,
- "364": 4,
- "365": 6,
- "366": 3,
- "367": 7,
- "368": 4,
- "369": 8,
- "370": 2,
- "371": 2,
- "372": 7,
- "373": 5,
- "374": 4,
- "375": 1,
- "376": 1,
- "377": 3,
- "378": 1,
- "379": 2,
- "380": 2,
- "381": 2,
- "382": 3,
- "383": 2,
- "384": 1,
- "385": 1,
- "386": 2,
- "387": 4,
- "388": 6,
- "389": 4,
- "390": 4,
- "391": 3,
- "392": 3,
- "393": 2,
- "394": 2,
- "395": 7,
- "396": 6,
- "397": 2,
- "398": 2,
- "401": 1,
- "402": 5,
- "403": 1,
- "404": 3,
- "405": 4,
- "406": 1,
- "407": 1,
- "409": 3,
- "410": 2,
- "411": 1,
- "412": 1,
- "413": 2,
- "414": 3,
- "415": 4,
- "416": 2,
- "417": 2,
- "418": 3,
- "419": 1,
- "420": 2,
- "421": 4,
- "422": 1,
- "424": 3,
- "425": 4,
- "426": 4,
- "427": 1,
- "428": 1,
- "429": 2,
- "430": 2,
- "431": 4,
- "433": 1,
- "434": 1,
- "436": 1,
- "437": 1,
- "438": 5,
- "439": 1,
- "440": 2,
- "441": 1,
- "443": 4,
- "444": 3,
- "445": 1,
- "446": 5,
- "448": 1,
- "449": 4,
- "451": 2,
- "452": 1,
- "455": 3,
- "456": 1,
- "457": 1,
- "458": 1,
- "459": 1,
- "463": 2,
- "464": 1,
- "465": 2,
- "466": 2,
- "467": 2,
- "469": 1,
- "470": 1,
- "474": 1,
- "475": 5,
- "476": 1,
- "477": 1,
- "478": 1,
- "479": 3,
- "481": 1,
- "482": 1,
- "484": 1,
- "485": 2,
- "489": 1,
- "490": 1,
- "494": 3,
- "495": 1,
- "497": 1,
- "499": 1,
- "501": 1,
- "502": 1,
- "503": 1,
- "504": 1,
- "505": 1,
- "506": 1,
- "508": 3,
- "510": 2,
- "511": 4,
- "518": 1,
- "519": 2,
- "520": 1,
- "522": 1,
- "523": 1,
- "524": 1,
- "525": 1,
- "526": 1,
- "527": 1,
- "537": 1,
- "540": 1,
- "541": 1,
- "543": 1,
- "545": 2,
- "546": 3,
- "554": 1,
- "555": 1,
- "557": 2,
- "558": 1,
- "559": 1,
- "562": 1,
- "564": 3,
- "566": 1,
- "568": 1,
- "573": 1,
- "578": 2,
- "580": 2,
- "581": 1,
- "583": 1,
- "584": 1,
- "585": 1,
- "586": 1,
- "588": 1,
- "592": 1,
- "594": 2,
- "595": 1,
- "597": 2,
- "598": 1,
- "601": 1,
- "609": 1,
- "610": 1,
- "612": 1,
- "613": 2,
- "615": 1,
- "618": 2,
- "620": 2,
- "622": 1,
- "623": 1,
- "624": 1,
- "626": 1,
- "635": 1,
- "637": 1,
- "639": 1,
- "643": 2,
- "645": 1,
- "649": 2,
- "651": 1,
- "654": 1,
- "658": 1,
- "661": 1,
- "667": 1,
- "670": 1,
- "671": 1,
- "672": 1,
- "673": 1,
- "676": 1,
- "679": 2,
- "686": 1,
- "691": 1,
- "694": 2,
- "698": 1,
- "701": 1,
- "708": 1,
- "710": 1,
- "711": 1,
- "715": 1,
- "719": 1,
- "723": 1,
- "729": 2,
- "737": 1,
- "739": 1,
- "745": 1,
- "747": 1,
- "753": 1,
- "755": 1,
- "756": 1,
- "765": 1,
- "786": 1,
- "794": 1,
- "799": 1,
- "810": 1,
- "813": 1,
- "816": 2,
- "822": 1,
- "873": 1,
- "880": 1,
- "891": 1,
- "912": 1,
- "945": 1,
- "957": 1,
- "960": 1,
- "987": 1,
- "992": 1,
- "1005": 1,
- "1035": 1,
- "1046": 1,
- "1073": 1,
- "1096": 1,
- "1099": 1,
- "1196": 2,
- "1233": 1,
- "1263": 1,
- "1329": 1,
- "1597": 1,
- "1699": 1,
- "1893": 1,
- "2244": 1,
- "2537": 1
- },
- "summary": {
- "min_words": 8,
- "max_words": 2537,
- "mean_words": 72.6454133879487,
- "median_words": 53.0
- }
-}
diff --git a/figures/review_lengths.json b/figures/review_lengths.json
deleted file mode 100644
index 6c800d5..0000000
--- a/figures/review_lengths.json
+++ /dev/null
@@ -1,31 +0,0 @@
-{
- "<10": 6,
- "10-19": 1883,
- "20-29": 9459,
- "30-39": 8116,
- "40-49": 6528,
- "50-59": 5331,
- "60-69": 4413,
- "70-79": 3514,
- "80-89": 2749,
- "90-99": 2305,
- "100-109": 1946,
- "110-119": 1494,
- "120-129": 1162,
- "130-139": 973,
- "140-149": 865,
- "150-159": 716,
- "160-169": 557,
- "170-179": 475,
- "180-189": 414,
- "190-199": 353,
- "200-219": 551,
- "220-239": 394,
- "240-259": 310,
- "260-279": 208,
- "280-299": 162,
- "300-399": 479,
- "400-499": 145,
- "500-999": 138,
- "1000+": 16
-}
diff --git a/figures/reviews_attraktionen.json b/figures/reviews_attraktionen.json
deleted file mode 100644
index 4d59a36..0000000
--- a/figures/reviews_attraktionen.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "Sacred Monkey\nForest": 18542,
- "Uluwatu Temple": 5902,
- "Sanur Beach": 4526,
- "Tanah Lot Temple": 4218,
- "Seminyak Beach": 3761,
- "Nusa Dua": 3324,
- "Bali Zoo": 2640,
- "Mount Batur": 1815,
- "Ulun Danu Bratan": 1722,
- "Tirta Gangga": 1557,
- "Pandawa Beach": 1511,
- "Jimbaran Bay": 1430,
- "Double Six Beach": 1323,
- "Devil Tears": 1263,
- "Kelingking Beach": 713,
- "Lempuyang Temple": 596,
- "Canggu Beach": 555,
- "Mount Agung": 266
-}
diff --git a/figures/simplify_review_lengths.py b/figures/simplify_review_lengths.py
deleted file mode 100644
index 7810530..0000000
--- a/figures/simplify_review_lengths.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/env python3
-"""Aggregate review length counts into buckets."""
-from __future__ import annotations
-
-import argparse
-import json
-from pathlib import Path
-from typing import Dict, Iterable, Tuple
-
-Bucket = Tuple[int | None, int | None, str]
-
-
-DEFAULT_BUCKETS: Tuple[Bucket, ...] = (
- (None, 9, "<10"),
- (10, 19, "10-19"),
- (20, 29, "20-29"),
- (30, 39, "30-39"),
- (40, 49, "40-49"),
- (50, 59, "50-59"),
- (60, 69, "60-69"),
- (70, 79, "70-79"),
- (80, 89, "80-89"),
- (90, 99, "90-99"),
- (100, 109, "100-109"),
- (110, 119, "110-119"),
- (120, 129, "120-129"),
- (130, 139, "130-139"),
- (140, 149, "140-149"),
- (150, 159, "150-159"),
- (160, 169, "160-169"),
- (170, 179, "170-179"),
- (180, 189, "180-189"),
- (190, 199, "190-199"),
- (200, 219, "200-219"),
- (220, 239, "220-239"),
- (240, 259, "240-259"),
- (260, 279, "260-279"),
- (280, 299, "280-299"),
- (300, 399, "300-399"),
- (400, 499, "400-499"),
- (500, 999, "500-999"),
- (1000, None, "1000+"),
-)
-
-
-def load_counts(path: Path) -> Dict[int, int]:
- with path.open("r", encoding="utf-8") as handle:
- raw = json.load(handle)
- return {int(k): int(v) for k, v in raw.items()}
-
-
-def aggregate(counts: Dict[int, int], buckets: Iterable[Bucket]) -> Dict[str, int]:
- output: Dict[str, int] = {label: 0 for _, _, label in buckets}
- for length, count in counts.items():
- for start, end, label in buckets:
- if start is None and end is not None and length <= end:
- output[label] += count
- break
- if end is None and start is not None and length >= start:
- output[label] += count
- break
- if start is not None and end is not None and start <= length <= end:
- output[label] += count
- break
- else:
- raise ValueError(f"No bucket found for length {length}.")
- return output
-
-
-def write_output(path: Path, data: Dict[str, int]) -> None:
- with path.open("w", encoding="utf-8") as handle:
- json.dump(data, handle, indent=2, ensure_ascii=False)
- handle.write("\n")
-
-
-def main() -> int:
- parser = argparse.ArgumentParser(description="Bucket review length counts.")
- parser.add_argument(
- "input",
- type=Path,
- help="Path to review_lengths.json (mapping of length -> count).",
- )
- parser.add_argument(
- "output",
- type=Path,
- help="Path to write bucketed counts JSON.",
- )
- args = parser.parse_args()
-
- counts = load_counts(args.input)
- bucketed = aggregate(counts, DEFAULT_BUCKETS)
- write_output(args.output, bucketed)
- return 0
-
-
-if __name__ == "__main__":
- raise SystemExit(main())
diff --git a/questionnaire/questions.md b/questionnaire/questions.md
index a5f6412..2deaa6f 100644
--- a/questionnaire/questions.md
+++ b/questionnaire/questions.md
@@ -2,7 +2,7 @@
## I. Natural Attractions
-### What distinguishes a spiritually meaningful temple complex from a purely scenic attraction in your perception?
+### Frage 1: What distinguishes a spiritually meaningful temple complex from a purely scenic attraction in your perception?
**Answer:**
@@ -63,7 +63,7 @@ Doc 12 (score: 0.7158):
the temple itself was very nice to look at, amazing sunset views, and the temple itself is quite extraordinary. however there are some obvious let downs because it is a iconic tourist attraction there are tourist everywhere which make it a bit less enjoyable.
```
-### If you had to choose between Tanah Lot and Ulun Danu Bratan for a reflective, culturally immersive experience, which criteria would guide your decision?
+### Frage 2: If you had to choose between Tanah Lot and Ulun Danu Bratan for a reflective, culturally immersive experience, which criteria would guide your decision?
**Answer:**
@@ -126,7 +126,7 @@ tanah lot is better than i could have imagined. the temple is stunning, and you
## II. Atmosphere
-### How would you describe the atmosphere of a place where you feel culturally and spiritually aligned? What factors create that feeling?
+### Frage 3: How would you describe the atmosphere of a place where you feel culturally and spiritually aligned? What factors create that feeling?
**Answer:**
@@ -187,7 +187,7 @@ Doc 12 (score: 0.4899):
so glad to come here. breathtakingly beautiful and with the waves crashing below, you really feel spiritual. spend at least a couple of hours here as a minimum. beautiful temples and as usual a bit touristy. would very highly recommend.
```
-### To what extent do visitor numbers affect your spiritual experience — and is there a threshold you still consider acceptable?
+### Frage 4: To what extent do visitor numbers affect your spiritual experience — and is there a threshold you still consider acceptable?
**Answer:**
@@ -250,7 +250,7 @@ i have visited tanah lot three times. each successive visit i recall seeing more
## III. Social Environment
-### If other visitors focus primarily on photography, does that diminish the spiritual quality of the place for you, or can you detach from it?
+### Frage 5: If other visitors focus primarily on photography, does that diminish the spiritual quality of the place for you, or can you detach from it?
**Answer:**
@@ -309,7 +309,7 @@ Doc 12 (score: 0.5297):
you won't feel the magic or serenity since this place is very popular and setup for tourists. but it's still worth a visit, just imagine the place without tourists. get close to the water and you'll get a photo without any people in them. it's worth the hike up to the lookout, to see tanah lot rock below, but be prepared for the heat! you can make a $5 donation for a blessing by the \holy\ spring, why not :)
```
-### What type of cultural storytelling by locals feels authentic and credible rather than staged for tourism?
+### Frage 6: What type of cultural storytelling by locals feels authentic and credible rather than staged for tourism?
**Answer:**
@@ -372,7 +372,7 @@ as part of my trip to bali i really wanted to visit here. my husband and i booke
## IV. Infrastructure
-### Which infrastructural measures (e.g., visitor flow management, limited entry slots, silent zones) would enhance the cultural quality of your experience?
+### Frage 7: Which infrastructural measures (e.g., visitor flow management, limited entry slots, silent zones) would enhance the cultural quality of your experience?
**Answer:**
@@ -433,7 +433,7 @@ Doc 12 (score: 0.4663):
i rate this as one of the more beautiful temples to visit. it is an amazing setting with good shopping. it is very commercial but i think that is inevitable at these kind of attractions.
```
-### How should destinations communicate information in order to appeal to spiritually interested travelers without reinforcing mass###tourism dynamics?
+### Frage 8: How should destinations communicate information in order to appeal to spiritually interested travelers without reinforcing mass tourism dynamics?
**Answer:**
@@ -496,7 +496,7 @@ we wanted to visit a \temple of the sea\, well ok, the sight is beautiful, from
## V. Value for Money
-### Would you be willing to accept higher entrance fees or donations if they demonstrably contribute to preserving religious structures and practices? Why or why not?
+### Frage 9: Would you be willing to accept higher entrance fees or donations if they demonstrably contribute to preserving religious structures and practices? Why or why not?
**Answer:**
@@ -555,7 +555,7 @@ Doc 12 (score: 0.4613):
the views are stunning and the sea is at its best. you cannot enter the temples as normal people are not considered to be, i guess godly enough. this is religion at its worst, you can view it from afar be pestered by people taking your photograph or hawkers selling birds with rubber bands but in terms of understanding the religious significance, well it doesn't work. add to the zillion tourists that turn up (ok i'm exaggerating) and you have something that could be wonderful and surreal but ends up just another opportunity to extract money from the gullible tourist. it's a shame.
```
-### What would legitimize a paid cultural experience (e.g., guided participation in a ceremony) for you — and what would make it feel commercialized or inauthentic?
+### Frage 10: What would legitimize a paid cultural experience (e.g., guided participation in a ceremony) for you — and what would make it feel commercialized or inauthentic?
**Answer:**
@@ -616,7 +616,7 @@ that they would have no idea. whilst i was there, a ceremony had begun as part o
## VI. Segment Identity
-### Which typical Bali tourism offerings do you consciously avoid, and why do they not align with your travel philosophy?
+### Frage 11: Which typical Bali tourism offerings do you consciously avoid, and why do they not align with your travel philosophy?
**Answer:**
@@ -677,7 +677,7 @@ Doc 12 (score: 0.7242):
the hike down to the beach and the entire vibe is amazing. we do enjoy the view so much. so, visiting bali is not complete if you're not going here.
```
-### If a tourism brand wanted to position Bali specifically for culturally and spiritually motivated travelers, which narratives should it emphasize — and which should it avoid?
+### Frage 12: If a tourism brand wanted to position Bali specifically for culturally and spiritually motivated travelers, which narratives should it emphasize — and which should it avoid?
**Answer:**
diff --git a/raft/rag_chat_merged.py b/raft/rag_chat_merged.py
index 9492a0c..5024790 100644
--- a/raft/rag_chat_merged.py
+++ b/raft/rag_chat_merged.py
@@ -9,17 +9,6 @@ import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-# """
-# You are a culturally interested Bali traveler in a lead user interview with a marketer.
-
-# When answering:
-# - Do not exaggerate.
-# - Provide nuanced, reflective reasoning rather than bullet lists.
-# - Keep answers concise but specific.
-
-# Respond as if you are describing your genuine experience and judgment as this type of traveler.
-# """
-
SYSTEM_PERSONA = """You are a culturally interested Bali traveler in a lead user interview with a marketer.
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.
@@ -78,7 +67,7 @@ def main():
# Load your externally finetuned model directly from disk
tok = AutoTokenizer.from_pretrained(args.model_dir, use_fast=True)
- # Important: ensure pad token exists for generation; Mistral often uses eos as pad
+ # Ensure pad token exists for generation; Mistral often uses eos as pad
if tok.pad_token is None:
tok.pad_token = tok.eos_token