This commit is contained in:
2026-02-23 23:01:28 +01:00
parent a7efed86f9
commit a2967767d3
2 changed files with 63 additions and 32 deletions

View File

@@ -286,27 +286,6 @@ if REDUCE_OUTLIERS:
# %% [markdown] # %% [markdown]
# ## Results # ## Results
# #
# ### Classification
#
# %%
# Switch for the export step below: when True, the reviews assigned to the
# selected topics are written to a CSV file for auditing.
CLASSIFICATION = True

if CLASSIFICATION:
    # Topic ids whose documents are exported.
    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}

    # NOTE: INPUT_PATH is defined here but not read anywhere in this cell.
    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"

    # Per-document topic information from the fitted model, with the review
    # text attached in an extra "Original" column.
    df = topic_model.get_document_info(reviews)
    df["Original"] = reviews

    # Keep only the rows whose topic is in the selection, then trim
    # surrounding whitespace from the review text.
    filtered = df.loc[df["Topic"].isin(topics_to_keep)].copy()
    filtered = filtered.assign(Original=filtered["Original"].str.strip())

    # Write the (text, topic) pairs as a comma-separated audit file.
    filtered.loc[:, ["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
# %% # %%
doc_topic_matrix = probs doc_topic_matrix = probs
@@ -377,7 +356,45 @@ topic_model.visualize_heatmap()
# #
# %% # %%
topic_model.get_topic_info() topic_info = topic_model.get_topic_info()
topic_info
# %%
import matplotlib.pyplot as plt

# Horizontal bar chart of the number of documents per topic (log-scaled x axis).

# Drop topic -1 (BERTopic's outlier bucket). Take an explicit copy so that the
# column assignment below writes to an independent frame rather than to a
# slice of the original, which would raise pandas' SettingWithCopyWarning.
topic_info = topic_info[topic_info["Topic"] != -1].copy()

# Keep only the first three dash-separated parts of each custom label;
# labels with fewer dashes come through unchanged.
topic_info["ShortName"] = topic_info["CustomName"].apply(
    lambda name: "-".join(name.split("-")[:3])
)

# Sort ascending by count: barh places rows bottom-to-top, so the largest
# topic ends up at the top of the chart.
topic_info = topic_info.sort_values("Count", ascending=True)

plt.figure(figsize=(10, 6))
bars = plt.barh(topic_info["ShortName"], topic_info["Count"])

# Write each count just to the right of its bar, vertically centred on it.
for count, bar in zip(topic_info["Count"], bars):
    plt.text(
        count,
        bar.get_y() + bar.get_height() / 2,
        f" {count}",
        va="center",
        ha="left",
        fontsize=10,
        color="black",
    )

# Log scale keeps small topics visible next to very large ones.
plt.xscale("log")
plt.ylabel("Topic")
plt.xlabel("Anzahl der Dokumente")
plt.title("")
plt.tight_layout()
plt.show()
# %% [markdown] # %% [markdown]
# ### Semantic Coherence # ### Semantic Coherence
@@ -503,12 +520,35 @@ hierarchical_topics = topic_model.hierarchical_topics(reviews)
tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics) tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
print(tree) print(tree)
# %% [markdown]
# ### Classification
#
# %%
# Switch for the export step below: when True, the reviews assigned to the
# selected topics are written to a CSV file for auditing.
CLASSIFICATION = True

if CLASSIFICATION:
    # Topic ids whose documents are exported.
    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30}

    # NOTE: INPUT_PATH is defined here but not read anywhere in this cell.
    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"

    # Per-document topic information from the fitted model, with the review
    # text attached in an extra "Original" column.
    df = topic_model.get_document_info(reviews)
    df["Original"] = reviews

    # Keep only the rows whose topic is in the selection, then trim
    # surrounding whitespace from the review text.
    filtered = df.loc[df["Topic"].isin(topics_to_keep)].copy()
    filtered = filtered.assign(Original=filtered["Original"].str.strip())

    # Write the (text, topic) pairs as a comma-separated audit file.
    filtered.loc[:, ["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
# %% [markdown] # %% [markdown]
# ### Intertopic Distance Map # ### Intertopic Distance Map
# #
# %% # %%
topic_model.visualize_topics(use_ctfidf=True) topic_model.visualize_topics(use_ctfidf=True, custom_labels=True)
# %% [markdown] # %% [markdown]
# ### Topic Word Scores # ### Topic Word Scores

View File

@@ -93,15 +93,6 @@ CONTEXTS = [
"solo, when you can be more contemplative", "solo, when you can be more contemplative",
"as a repeat visitor, noticing subtler layers", "as a repeat visitor, noticing subtler layers",
] ]
# Free-text phrases, each describing one kind of traveler.
# NOTE(review): how this list is consumed is not visible in this hunk;
# presumably it is sampled alongside CONTEXTS / CONSTRAINTS when building
# prompts. Confirm against the callers.
TRAVELER_PROFILE = [
"a culture-first traveler",
"a spirituality-curious traveler",
"a respectful observer who avoids intrusive tourism",
"a slow traveler seeking depth over volume",
"a repeat visitor looking for subtler, less packaged experiences",
]
CONSTRAINTS = [ CONSTRAINTS = [
( (
"time", "time",