diff --git a/bertopic/nb_bertopic_lowprep.py b/bertopic/nb_bertopic_lowprep.py
index 1ce589f..a16da99 100644
--- a/bertopic/nb_bertopic_lowprep.py
+++ b/bertopic/nb_bertopic_lowprep.py
@@ -286,27 +286,6 @@ if REDUCE_OUTLIERS:
 # %% [markdown]
 # ## Results
 #
-# ### Classification
-#
-
-# %%
-CLASSIFICATION = True
-if CLASSIFICATION:
-    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
-    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
-    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
-
-    # Topic model document info
-    df = topic_model.get_document_info(reviews)
-    df["Original"] = reviews
-
-    # --- filter by topics and length ---
-    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
-    filtered["Original"] = filtered["Original"].str.strip()
-
-    # Save an audit CSV
-    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
-    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
 
 # %%
 doc_topic_matrix = probs
@@ -377,7 +356,49 @@ topic_model.visualize_heatmap()
 #
 
 # %%
-topic_model.get_topic_info()
+topic_info = topic_model.get_topic_info()
+topic_info
+
+# %%
+import matplotlib.pyplot as plt
+
+# Drop the outlier topic (-1). Copy the filtered frame so the column
+# assignment below writes to an independent DataFrame rather than a slice
+# of the original (avoids pandas' SettingWithCopyWarning).
+topic_info = topic_info[topic_info["Topic"] != -1].copy()
+
+# Truncate labels at the third dash (labels with fewer dashes are unchanged)
+topic_info["ShortName"] = topic_info["CustomName"].apply(
+    lambda x: "-".join(x.split("-")[:3])
+)
+
+# Sort ascending by count: barh places the first row at the bottom, so the
+# largest topic ends up at the top of the chart.
+topic_info = topic_info.sort_values("Count", ascending=True)
+
+plt.figure(figsize=(10, 6))
+
+bars = plt.barh(topic_info["ShortName"], topic_info["Count"])
+
+# Add count labels to each bar
+for count, bar in zip(topic_info["Count"], bars):
+    plt.text(
+        count,
+        bar.get_y() + bar.get_height() / 2,
+        f" {count}",
+        va="center",
+        ha="left",
+        fontsize=10,
+        color="black",
+    )
+
+plt.xscale("log")
+plt.ylabel("Topic")
+plt.xlabel("Anzahl der Dokumente")
+plt.title("")
+plt.tight_layout()
+
+plt.show()
 
 # %% [markdown]
 # ### Semantic Coherence
@@ -503,12 +524,35 @@ hierarchical_topics = topic_model.hierarchical_topics(reviews)
 tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
 print(tree)
 
+# %% [markdown]
+# ### Classification
+#
+
+# %%
+CLASSIFICATION = True
+if CLASSIFICATION:
+    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30}
+    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
+    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
+
+    # Topic model document info
+    df = topic_model.get_document_info(reviews)
+    df["Original"] = reviews
+
+    # --- filter by topics ---
+    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
+    filtered["Original"] = filtered["Original"].str.strip()
+
+    # Save an audit CSV
+    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
+    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
+
 # %% [markdown]
 # ### Intertopic Distance Map
 #
 
 # %%
-topic_model.visualize_topics(use_ctfidf=True)
+topic_model.visualize_topics(use_ctfidf=True, custom_labels=True)
 
 # %% [markdown]
 # ### Topic Word Scores
diff --git a/raft/generate_trainer_prompts.py b/raft/generate_trainer_prompts.py
index 5ef6c32..1a6d3fc 100644
--- a/raft/generate_trainer_prompts.py
+++ b/raft/generate_trainer_prompts.py
@@ -93,15 +93,6 @@ CONTEXTS = [
     "solo, when you can be more contemplative",
     "as a repeat visitor, noticing subtler layers",
 ]
-
-TRAVELER_PROFILE = [
-    "a culture-first traveler",
-    "a spirituality-curious traveler",
-    "a respectful observer who avoids intrusive tourism",
-    "a slow traveler seeking depth over volume",
-    "a repeat visitor looking for subtler, less packaged experiences",
-]
-
 CONSTRAINTS = [
     (
         "time",