Cleanup

2026-06-22 23:23:07 +02:00 · 2026-02-23 23:01:28 +01:00
parent a7efed86f9
commit a2967767d3
2 changed files with 63 additions and 32 deletions
@@ -286,27 +286,6 @@ if REDUCE_OUTLIERS:
 # %% [markdown]
 # ## Results
 #
 # ### Classification
 #
 # %%
 # Switch for the export below: when True, the documents assigned to the
 # selected topics are written to OUTPUT_CSV.
 CLASSIFICATION = True
 if CLASSIFICATION:
    # Topic ids whose documents are kept in the export.
    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
    # NOTE(review): INPUT_PATH is assigned but not read anywhere in this cell —
    # confirm whether another cell still depends on it.
    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
    # Per-document info from the topic model (provides the "Topic" column used below).
    df = topic_model.get_document_info(reviews)
    # Attach the input texts as their own column; assumes `reviews` lines up
    # one-to-one with the rows returned above — TODO confirm.
    df["Original"] = reviews
    # --- filter by topic only (no length filter is applied here) ---
    # .copy() so the strip() assignment below modifies an independent frame.
    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
    filtered["Original"] = filtered["Original"].str.strip()
    # Save an audit CSV containing just the text and its topic id.
    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
 # %%
 doc_topic_matrix = probs
@@ -377,7 +356,45 @@ topic_model.visualize_heatmap()
 #
 # %%
-topic_model.get_topic_info()
+topic_info = topic_model.get_topic_info()
 topic_info
 # %%
 import matplotlib.pyplot as plt
 # Horizontal bar chart of the number of documents per topic.
 # Topic -1 is dropped first (in BERTopic this id is the outlier bucket).
 # .copy() makes the filtered frame independent, so adding the ShortName
 # column below does not write into a slice of the original frame
 # (this is what raised pandas' SettingWithCopyWarning before).
 topic_info = topic_info[topic_info["Topic"] != -1].copy()
 # Keep at most the first three dash-separated parts of each custom label.
 # Labels with fewer than three dashes come back unchanged from split/join,
 # so no separate "contains a dash" check is needed.
 # NOTE(review): relies on a "CustomName" column being present — confirm that
 # custom topic labels are always set before this cell runs.
 topic_info["ShortName"] = topic_info["CustomName"].apply(
    lambda x: "-".join(x.split("-")[:3])
 )
 # Sort ascending by count: barh places the first row at the bottom, so the
 # largest topic ends up at the top of the chart.
 topic_info = topic_info.sort_values("Count", ascending=True)
 plt.figure(figsize=(10, 6))
 bars = plt.barh(topic_info["ShortName"], topic_info["Count"])
 # Write each count just past the end of its bar, vertically centred on it.
 for count, bar in zip(topic_info["Count"], bars):
    plt.text(
        count,
        bar.get_y() + bar.get_height() / 2,
        f" {count}",
        va="center",
        ha="left",
        fontsize=10,
        color="black",
    )
 # Log scale keeps small topics visible next to very large ones.
 plt.xscale("log")
 plt.ylabel("Topic")
 plt.xlabel("Anzahl der Dokumente")
 plt.title("")
 plt.tight_layout()
 plt.show()
 # %% [markdown]
 # ### Semantic Coherence
@@ -503,12 +520,35 @@ hierarchical_topics = topic_model.hierarchical_topics(reviews)
 tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
 print(tree)
 # %% [markdown]
 # ### Classification
 #
 # %%
 # Export switch: set to False to skip writing the filtered CSV.
 CLASSIFICATION = True
 if CLASSIFICATION:
    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
    # Ids of the topics whose documents go into the export.
    topics_to_keep = {2, 4, 8, 13, 14, 17, 18, 30}
    # Per-document topic assignments, with the input texts added as a column.
    df = topic_model.get_document_info(reviews)
    df["Original"] = reviews
    # Rows belonging to a selected topic, with surrounding whitespace trimmed
    # from the text; assign() returns a new frame, so `df` is left untouched.
    filtered = df.loc[df["Topic"].isin(topics_to_keep)].assign(
        Original=lambda frame: frame["Original"].str.strip()
    )
    # Two-column audit file: the text and its topic id.
    filtered.loc[:, ["Original", "Topic"]].to_csv(OUTPUT_CSV, sep=",", index=False)
    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
 # %% [markdown]
 # ### Intertopic Distance Map
 #
 # %%
-topic_model.visualize_topics(use_ctfidf=True)
+topic_model.visualize_topics(use_ctfidf=True, custom_labels=True)
 # %% [markdown]
 # ### Topic Word Scores
@@ -93,15 +93,6 @@ CONTEXTS = [
    "solo, when you can be more contemplative",
    "as a repeat visitor, noticing subtler layers",
 ]
 # Short English phrases, each describing one kind of traveler.
 # NOTE(review): presumably combined with CONTEXTS / CONSTRAINTS when building
 # text elsewhere in this file — the consumer is not visible here, confirm.
 TRAVELER_PROFILE = [
    "a culture-first traveler",
    "a spirituality-curious traveler",
    "a respectful observer who avoids intrusive tourism",
    "a slow traveler seeking depth over volume",
    "a repeat visitor looking for subtler, less packaged experiences",
 ]
 CONSTRAINTS = [
    (
        "time",