mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 08:22:43 +01:00
Cleanup
This commit is contained in:
@@ -286,27 +286,6 @@ if REDUCE_OUTLIERS:
|
|||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ## Results
|
# ## Results
|
||||||
#
|
#
|
||||||
# ### Classification
|
|
||||||
#
|
|
||||||
|
|
||||||
# %%
|
|
||||||
CLASSIFICATION = True
|
|
||||||
if CLASSIFICATION:
|
|
||||||
topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
|
|
||||||
INPUT_PATH = "../data/intermediate/preprocessed.tab" # TSV with a 'review' column
|
|
||||||
OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
|
|
||||||
|
|
||||||
# Topic model document info
|
|
||||||
df = topic_model.get_document_info(reviews)
|
|
||||||
df["Original"] = reviews
|
|
||||||
|
|
||||||
# --- filter by topics and strip surrounding whitespace ---
|
|
||||||
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
|
|
||||||
filtered["Original"] = filtered["Original"].str.strip()
|
|
||||||
|
|
||||||
# Save an audit CSV
|
|
||||||
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
|
|
||||||
print(f"Filtered CSV file saved to {OUTPUT_CSV}")
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
doc_topic_matrix = probs
|
doc_topic_matrix = probs
|
||||||
@@ -377,7 +356,45 @@ topic_model.visualize_heatmap()
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
topic_model.get_topic_info()
|
topic_info = topic_model.get_topic_info()
|
||||||
|
topic_info
|
||||||
|
|
||||||
|
# %%
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
topic_info = topic_info[topic_info["Topic"] != -1]
|
||||||
|
|
||||||
|
# Truncate labels at the third dash
|
||||||
|
topic_info["ShortName"] = topic_info["CustomName"].apply(
|
||||||
|
lambda x: "-".join(x.split("-")[:3]) if "-" in x else x
|
||||||
|
)
|
||||||
|
|
||||||
|
# Sort by count in ascending order so the largest topic ends up at the top of the horizontal bar chart
|
||||||
|
topic_info = topic_info.sort_values("Count", ascending=True)
|
||||||
|
|
||||||
|
plt.figure(figsize=(10, 6))
|
||||||
|
|
||||||
|
bars = plt.barh(topic_info["ShortName"], topic_info["Count"])
|
||||||
|
|
||||||
|
# Add count labels to each bar
|
||||||
|
for i, (count, bar) in enumerate(zip(topic_info["Count"], bars)):
|
||||||
|
plt.text(
|
||||||
|
count,
|
||||||
|
bar.get_y() + bar.get_height() / 2,
|
||||||
|
f" {count}",
|
||||||
|
va="center",
|
||||||
|
ha="left",
|
||||||
|
fontsize=10,
|
||||||
|
color="black",
|
||||||
|
)
|
||||||
|
|
||||||
|
plt.xscale("log")
|
||||||
|
plt.ylabel("Topic")
|
||||||
|
plt.xlabel("Anzahl der Dokumente")
|
||||||
|
plt.title("")
|
||||||
|
plt.tight_layout()
|
||||||
|
|
||||||
|
plt.show()
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ### Semantic Coherence
|
# ### Semantic Coherence
|
||||||
@@ -503,12 +520,35 @@ hierarchical_topics = topic_model.hierarchical_topics(reviews)
|
|||||||
tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
|
tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
|
||||||
print(tree)
|
print(tree)
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Classification
|
||||||
|
#
|
||||||
|
|
||||||
|
# %%
|
||||||
|
CLASSIFICATION = True
|
||||||
|
if CLASSIFICATION:
|
||||||
|
topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30}
|
||||||
|
INPUT_PATH = "../data/intermediate/preprocessed.tab" # TSV with a 'review' column
|
||||||
|
OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
|
||||||
|
|
||||||
|
# Topic model document info
|
||||||
|
df = topic_model.get_document_info(reviews)
|
||||||
|
df["Original"] = reviews
|
||||||
|
|
||||||
|
# --- filter by topics and strip surrounding whitespace ---
|
||||||
|
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
|
||||||
|
filtered["Original"] = filtered["Original"].str.strip()
|
||||||
|
|
||||||
|
# Save an audit CSV
|
||||||
|
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
|
||||||
|
print(f"Filtered CSV file saved to {OUTPUT_CSV}")
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ### Intertopic Distance Map
|
# ### Intertopic Distance Map
|
||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
topic_model.visualize_topics(use_ctfidf=True)
|
topic_model.visualize_topics(use_ctfidf=True, custom_labels=True)
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ### Topic Word Scores
|
# ### Topic Word Scores
|
||||||
|
|||||||
@@ -93,15 +93,6 @@ CONTEXTS = [
|
|||||||
"solo, when you can be more contemplative",
|
"solo, when you can be more contemplative",
|
||||||
"as a repeat visitor, noticing subtler layers",
|
"as a repeat visitor, noticing subtler layers",
|
||||||
]
|
]
|
||||||
|
|
||||||
TRAVELER_PROFILE = [
|
|
||||||
"a culture-first traveler",
|
|
||||||
"a spirituality-curious traveler",
|
|
||||||
"a respectful observer who avoids intrusive tourism",
|
|
||||||
"a slow traveler seeking depth over volume",
|
|
||||||
"a repeat visitor looking for subtler, less packaged experiences",
|
|
||||||
]
|
|
||||||
|
|
||||||
CONSTRAINTS = [
|
CONSTRAINTS = [
|
||||||
(
|
(
|
||||||
"time",
|
"time",
|
||||||
|
|||||||
Reference in New Issue
Block a user