mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 08:22:43 +01:00
Cleanup
This commit is contained in:
@@ -286,27 +286,6 @@ if REDUCE_OUTLIERS:
|
||||
# %% [markdown]
|
||||
# ## Results
|
||||
#
|
||||
# ### Classification
|
||||
#
|
||||
|
||||
# %%
|
||||
# Toggle for the export step below; set to False to skip writing the CSV.
CLASSIFICATION = True

if CLASSIFICATION:
    # Relative paths. INPUT_PATH is defined here but not read in this cell.
    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"

    # IDs of the topics whose documents are exported.
    topics_to_keep = {2, 4, 8, 13, 14, 17, 18, 28, 30}

    # Per-document info from the topic model (its "Topic" column is used
    # below), with the review text attached as "Original".
    df = topic_model.get_document_info(reviews)
    df["Original"] = reviews

    # Keep only the documents assigned to one of the selected topics.
    # NOTE: no length filter is applied here, only the topic filter.
    filtered = df.loc[df["Topic"].isin(topics_to_keep)].copy()
    filtered["Original"] = filtered["Original"].str.strip()

    # Write review text and topic ID to a CSV for later inspection.
    filtered.to_csv(OUTPUT_CSV, columns=["Original", "Topic"], index=False, sep=",")
    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
|
||||
|
||||
# %%
|
||||
doc_topic_matrix = probs
|
||||
@@ -377,7 +356,45 @@ topic_model.visualize_heatmap()
|
||||
#
|
||||
|
||||
# %%
|
||||
# Fetch the per-topic summary table once, keep it in `topic_info` for later
# cells, and show it as the cell output.
topic_info = topic_model.get_topic_info()
topic_info
|
||||
|
||||
# %%
|
||||
import matplotlib.pyplot as plt

# Exclude topic -1 (presumably the outlier bucket -- confirm against the model).
# The explicit copy makes this an independent frame, so adding a column below
# does not write into a slice of the original (pandas SettingWithCopyWarning).
topic_info = topic_info[topic_info["Topic"] != -1].copy()

# Shorten each label to its first three dash-separated parts; labels with
# fewer parts are left unchanged.
topic_info["ShortName"] = topic_info["CustomName"].apply(
    lambda name: "-".join(name.split("-")[:3])
)

# Sort ascending by count: barh places the first row at the bottom, so the
# largest topic ends up at the top of the chart.
topic_info = topic_info.sort_values("Count", ascending=True)

plt.figure(figsize=(10, 6))

bars = plt.barh(topic_info["ShortName"], topic_info["Count"])

# Write the document count next to the end of each bar, vertically centered.
for count, bar in zip(topic_info["Count"], bars):
    plt.text(
        count,
        bar.get_y() + bar.get_height() / 2,
        f" {count}",
        va="center",
        ha="left",
        fontsize=10,
        color="black",
    )

# Log-scaled x axis for the document counts.
plt.xscale("log")
plt.ylabel("Topic")
plt.xlabel("Anzahl der Dokumente")
plt.title("")
plt.tight_layout()

plt.show()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Semantic Coherence
|
||||
@@ -503,12 +520,35 @@ hierarchical_topics = topic_model.hierarchical_topics(reviews)
|
||||
tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
|
||||
print(tree)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Classification
|
||||
#
|
||||
|
||||
# %%
|
||||
# Toggle for the export step below; set to False to skip writing the CSV.
CLASSIFICATION = True

if CLASSIFICATION:
    # Relative paths. INPUT_PATH is defined here but not read in this cell.
    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"

    # IDs of the topics whose documents are exported.
    topics_to_keep = {2, 4, 8, 13, 14, 17, 18, 30}

    # Per-document info from the topic model (its "Topic" column is used
    # below), with the review text attached as "Original".
    df = topic_model.get_document_info(reviews)
    df["Original"] = reviews

    # Keep only the documents assigned to one of the selected topics.
    # NOTE: no length filter is applied here, only the topic filter.
    filtered = df.loc[df["Topic"].isin(topics_to_keep)].copy()
    filtered["Original"] = filtered["Original"].str.strip()

    # Write review text and topic ID to a CSV for later inspection.
    filtered.to_csv(OUTPUT_CSV, columns=["Original", "Topic"], index=False, sep=",")
    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Intertopic Distance Map
|
||||
#
|
||||
|
||||
# %%
|
||||
# Intertopic distance map using c-TF-IDF representations and custom labels.
# Only the last expression of a cell is displayed, so a single call suffices.
topic_model.visualize_topics(use_ctfidf=True, custom_labels=True)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Word Scores
|
||||
|
||||
Reference in New Issue
Block a user