Cleanup

2026-03-22 08:22:43 +01:00 · 2026-02-23 23:01:28 +01:00
parent a7efed86f9
commit a2967767d3
2 changed files with 63 additions and 32 deletions
--- a/bertopic/nb_bertopic_lowprep.py
+++ b/bertopic/nb_bertopic_lowprep.py
@@ -286,27 +286,6 @@ if REDUCE_OUTLIERS:
 # %% [markdown]
 # ## Results
 #
-# ### Classification
-#
-
-# %%
-CLASSIFICATION = True
-if CLASSIFICATION:
-    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
-    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
-    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
-
-    # Topic model document info
-    df = topic_model.get_document_info(reviews)
-    df["Original"] = reviews
-
-    # --- filter by topics and length ---
-    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
-    filtered["Original"] = filtered["Original"].str.strip()
-
-    # Save an audit CSV
-    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
-    print(f"Filtered CSV file saved to {OUTPUT_CSV}")

 # %%
 doc_topic_matrix = probs
@@ -377,7 +356,45 @@ topic_model.visualize_heatmap()
 #

 # %%
-topic_model.get_topic_info()
+topic_info = topic_model.get_topic_info()
+topic_info
+
+# %%
+import matplotlib.pyplot as plt
+
+# Drop outlier topic -1; copy so the new column below is not written to a slice
+topic_info = topic_info[topic_info["Topic"] != -1].copy()
+
+# Keep at most the first three dash-separated parts of each custom label
+topic_info["ShortName"] = topic_info["CustomName"].apply(
+    lambda name: "-".join(name.split("-")[:3])
+)
+
+# Sort ascending by count so the largest topic ends up at the top of the chart
+topic_info = topic_info.sort_values("Count", ascending=True)
+
+plt.figure(figsize=(10, 6))
+bars = plt.barh(topic_info["ShortName"], topic_info["Count"])
+
+# Add the count as a text label at the right end of each bar
+for count, bar in zip(topic_info["Count"], bars):
+    plt.text(
+        count,
+        bar.get_y() + bar.get_height() / 2,
+        f" {count}",
+        va="center",
+        ha="left",
+        fontsize=10,
+        color="black",
+    )
+
+plt.xscale("log")
+plt.ylabel("Topic")
+plt.xlabel("Anzahl der Dokumente")
+plt.title("")
+plt.tight_layout()
+
+plt.show()

 # %% [markdown]
 # ### Semantic Coherence
@@ -503,12 +520,35 @@ hierarchical_topics = topic_model.hierarchical_topics(reviews)
 tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
 print(tree)

+# %% [markdown]
+# ### Classification
+#
+
+# %%
+CLASSIFICATION = True
+if CLASSIFICATION:
+    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30}
+    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
+    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
+
+    # Topic model document info, with the input reviews attached as 'Original'
+    df = topic_model.get_document_info(reviews)
+    df["Original"] = reviews
+
+    # --- keep only documents assigned to one of the selected topics ---
+    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
+    filtered["Original"] = filtered["Original"].str.strip()
+
+    # Save an audit CSV containing the stripped review text and its topic id
+    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
+    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
+
 # %% [markdown]
 # ### Intertopic Distance Map
 #

 # %%
-topic_model.visualize_topics(use_ctfidf=True)
+topic_model.visualize_topics(use_ctfidf=True, custom_labels=True)

 # %% [markdown]
 # ### Topic Word Scores