22.02.

2026-06-22 07:13:08 +02:00 · 2026-02-22 23:52:26 +01:00
parent 61edb35f70
commit a7efed86f9
16 changed files with 1994 additions and 6420 deletions
@@ -16,10 +16,10 @@ from bertopic import BERTopic

 param_grid = {
    "n_gram_max": [2, 3],  # Vectorization
-    "min_document_frequency": [1],  # Vectorization
+    "min_document_frequency": [1, 2],  # Vectorization
    "min_samples": [10, 25],  # HDBSCAN
-    "min_topic_size": [10, 20, 30, 40, 50],  # HDBSCAN
-    "n_neighbors": [15],  # UMAP
+    "min_topic_size": [100, 200],  # HDBSCAN
+    "n_neighbors": [15, 25],  # UMAP
    "n_components": [2, 5],  # UMAP
    "min_dist": [0.01, 0.1],  # UMAP
    "nr_topics": ["auto"],  # Topic Modeling
@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
 with open("output/autotune.json", "r") as f:
    history = json.load(f)

-history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False)
+history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)

 with open("output/autotune_sorted.json", "w") as f:
    json.dump(history, f, indent=2)
@@ -360,7 +360,6 @@ vis = topic_model.visualize_documents(
    custom_labels=True,
    hide_annotations=True,
 )
-# vis.write_html("output/visualization.html")
 vis

 # %%
@@ -497,7 +496,12 @@ if CALCULATE_TOKEN_DISTRIBUTIONS:
 #

 # %%
-topic_model.visualize_hierarchy(custom_labels=True)
+topic_model.visualize_hierarchy(custom_labels=True, color_threshold=0.98)
+
+# %%
+hierarchical_topics = topic_model.hierarchical_topics(reviews)
+tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
+print(tree)

 # %% [markdown]
 # ### Intertopic Distance Map
@@ -512,3 +516,20 @@ topic_model.visualize_topics(use_ctfidf=True)

 # %%
 topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
+
+# %%
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+
+
+def create_wordcloud(model, topic):
+    """Show the words of one topic as a word cloud.
+
+    The (word, score) pairs returned by `model.get_topic(topic)` are used as
+    the word frequencies of the cloud, which is then displayed with
+    matplotlib (axes hidden).
+    """
+    frequencies = dict(model.get_topic(topic))
+    cloud = WordCloud(background_color="white", max_words=1000)
+    cloud.generate_from_frequencies(frequencies)
+    plt.imshow(cloud, interpolation="bilinear")
+    plt.axis("off")
+    plt.show()
+
+
+# Show wordcloud
+create_wordcloud(topic_model, topic=1)
@@ -0,0 +1,519 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.18.0
+#   kernelspec:
+#     display_name: .venv (3.12.3)
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Topic Detection: Bali Tourist Reviews
+#
+
+# %% [markdown]
+# ## Preparation
+#
+# ### Dependency Loading
+#
+
+# %%
+import pickle
+import re
+
+import gensim.corpora as corpora
+import nltk
+import numpy as np
+import pandas as pd
+from bertopic.representation import KeyBERTInspired
+from bertopic.vectorizers import ClassTfidfTransformer
+from gensim.models.coherencemodel import CoherenceModel
+from hdbscan import HDBSCAN
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction import text as skltext
+from sklearn.metrics.pairwise import cosine_similarity
+from umap import UMAP
+
+from bertopic import BERTopic
+
+nltk.download("stopwords")
+nltk.download("punkt")
+nltk.download("wordnet")
+
+# %% [markdown]
+# ### Hyperparameters and Settings
+#
+
+# %%
+# Switches for the expensive steps of the notebook
+RECREATE_MODEL = True  # fit a new BERTopic model
+RECREATE_REDUCED_MODEL = True  # run the similarity-based topic merge loop
+PROCESS_DATA = True  # re-run preprocess() and write the pickle; else load the pickle
+REDUCE_OUTLIERS = False  # reassign outlier documents via reduce_outliers
+CALCULATE_TOKEN_DISTRIBUTIONS = False  # compute token-level topic distributions
+
+# Data Sample Size, -1 for all data
+DATA_SAMPLE_SIZE = -1
+
+# Vectorization
+MIN_DOCUMENT_FREQUENCY = 1
+MAX_NGRAM = 3
+
+# HDBSCAN Parameters
+MIN_TOPIC_SIZE = 15
+MIN_SAMPLES = 15
+
+# UMAP Parameters
+N_NEIGHBORS = 15
+N_COMPONENTS = 2
+MIN_DIST = 0.01
+
+# Topic Modeling
+TOP_N_WORDS = 10
+MAX_TOPICS = None  # passed to BERTopic as nr_topics: "auto" to reduce topics, None to skip
+
+# Corpus-specific words added to the English stop word list of the vectorizer
+TF_IDF_STOP_WORDS = ["bali", "place", "visit", "visited", "visiting"]
+
+# %% [markdown]
+# ### Data Loading & Preprocessing
+#
+
+# %%
+# Load the reviews written by the general preprocessing step
+
+review_frame = pd.read_csv("../data/intermediate/culture_reviews.csv", sep=",")
+if DATA_SAMPLE_SIZE != -1:
+    # Work on a random subset instead of the full corpus
+    review_frame = review_frame.sample(n=DATA_SAMPLE_SIZE)
+reviews = review_frame.Original.to_list()
+
+print("Loaded {} reviews".format(len(reviews)))
+
+# %%
+# Cleanup rules as regular expression -> replacement, applied one after
+# another in insertion order. The keys are regex source strings and must NOT
+# be passed through re.escape: escaping turns r"\n" and r"\s+" into patterns
+# for those literal characters, so real newlines and whitespace runs would
+# never be matched.
+rep = {
+    r"\\n": " ",  # a literal backslash followed by "n"
+    r"\n": " ",  # a real newline character
+    r'\\"': "",  # a backslash-escaped double quote
+    r'"': "",  # a plain double quote
+    r"\s+": " ",  # whitespace runs; kept last so it also merges spaces added above
+}
+substitutions = [(re.compile(regex), repl) for regex, repl in rep.items()]
+
+
+def preprocess(text):
+    """Normalize one review text.
+
+    Lowercases the text, replaces newlines (real ones and those written out
+    as a backslash-n sequence) with spaces, removes double quotes, collapses
+    whitespace runs into a single space and strips both ends.
+
+    Args:
+        text: The raw review string.
+
+    Returns:
+        The cleaned, lowercased string.
+    """
+    text = text.lower()
+    for regex, repl in substitutions:
+        text = regex.sub(repl, text)
+    # Strip last: the substitutions can leave a space at either end
+    return text.strip()
+
+
+# %%
+print(
+    preprocess(
+        "Excellent. Definitely worth coming while in bali. Food and people were very nice.\n🌟 🤩 ⭐️ \nTrisna was our host"
+    )
+)
+
+# %%
+if PROCESS_DATA:
+    print("Processing reviews...")
+    reviews = [preprocess(review) for review in reviews]
+
+    with open("../data/intermediate/processed_texts_culture.pkl", "wb") as f:
+        pickle.dump(reviews, f)
+else:
+    with open("../data/intermediate/processed_texts_culture.pkl", "rb") as f:
+        reviews = pickle.load(f)
+
+print(reviews[:1])
+
+# %% [markdown]
+# ### Pre-calculate Embeddings
+#
+
+# %%
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+embeddings = embedding_model.encode(reviews, show_progress_bar=True)
+
+# %% [markdown]
+# ## Model Creation
+#
+
+# %% [markdown]
+# ### Dimensionality Reduction (UMAP)
+#
+
+# %%
+umap_model = UMAP(
+    n_neighbors=N_NEIGHBORS,
+    n_components=N_COMPONENTS,
+    min_dist=MIN_DIST,
+    metric="cosine",
+    low_memory=True,
+    random_state=42,
+)
+reduced_embeddings = umap_model.fit_transform(embeddings)
+
+# %% [markdown]
+# ### BERTopic Model Creation
+#
+
+# %%
+if RECREATE_MODEL:
+    # Stop words: scikit-learn's English list plus the corpus-specific terms
+    stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS))
+
+    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
+    vectorizer_model = CountVectorizer(
+        min_df=MIN_DOCUMENT_FREQUENCY,
+        ngram_range=(1, MAX_NGRAM),
+        stop_words=stop_words,
+    )
+
+    representation_model = KeyBERTInspired()
+    hdbscan_model = HDBSCAN(
+        min_cluster_size=MIN_TOPIC_SIZE,
+        min_samples=MIN_SAMPLES,
+        metric="euclidean",
+        cluster_selection_method="eom",
+        gen_min_span_tree=True,
+        prediction_data=True,
+    )
+
+    topic_model = BERTopic(
+        embedding_model=embedding_model,
+        ctfidf_model=ctfidf_model,
+        vectorizer_model=vectorizer_model,
+        umap_model=umap_model,
+        hdbscan_model=hdbscan_model,
+        representation_model=representation_model,
+        verbose=True,
+        calculate_probabilities=True,
+        language="english",
+        top_n_words=TOP_N_WORDS,
+        nr_topics=MAX_TOPICS,
+    )
+
+    # Fit on the reviews, handing over the pre-computed sentence embeddings
+    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)
+
+    # Generate custom labels and register them on the model
+    topic_labels = topic_model.generate_topic_labels(
+        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
+    )
+    topic_model.set_topic_labels(topic_labels)
+    # BERTopic.save(topic_model, "bertopic/model.bertopic")
+else:
+    # NOTE(review): saving and loading are both commented out, so this branch
+    # only prints. With RECREATE_MODEL = False no `topic_model`, `topics` or
+    # `probs` is defined by this cell; they must already exist in the session.
+    print("Nevermind, loading existing model")
+    # topic_model = BERTopic.load("bertopic/model.bertopic")
+
+# %% [markdown]
+# ## Fine Tuning
+#
+# ### Topic Condensation
+#
+
+# %%
+# Repeatedly merge pairs of topics whose topic embeddings have a cosine
+# similarity above 0.9, until one full pass over all pairs finds nothing to merge.
+if RECREATE_REDUCED_MODEL:
+    done = False
+    iteration = 1
+    while not done:
+        print(f"Iteration {iteration}")
+        iteration += 1
+        # The first embedding row is dropped - presumably the outlier topic -1,
+        # so that matrix index i lines up with topic id i.
+        # TODO confirm this still holds for a model without an outlier topic.
+        similarity_matrix = cosine_similarity(
+            np.array(topic_model.topic_embeddings_)[1:, :]
+        )
+        nothing_to_merge = True
+
+        # Only the upper triangle (j > i) is visited, so each pair is seen once.
+        # The loop bounds are taken from the matrix as it is at this point.
+        for i in range(similarity_matrix.shape[0]):
+            for j in range(i + 1, similarity_matrix.shape[1]):
+                try:
+                    sim = similarity_matrix[i, j]
+                    if sim > 0.9:
+                        nothing_to_merge = False
+                        t1, t2 = i, j
+                        try:
+                            t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
+                            t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
+                            print(
+                                f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
+                            )
+                            topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])
+
+                            # Regenerate the custom labels for the merged model
+                            topic_labels = topic_model.generate_topic_labels(
+                                nr_words=3,
+                                topic_prefix=True,
+                                word_length=15,
+                                separator=" - ",
+                            )
+                            topic_model.set_topic_labels(topic_labels)
+                            # Recompute similarities on the merged model; the
+                            # remaining (i, j) of this pass index the new matrix.
+                            similarity_matrix = cosine_similarity(
+                                np.array(topic_model.topic_embeddings_)[1:, :]
+                            )
+                        except Exception as e:
+                            # Best effort: report the failed merge and keep scanning
+                            print(f"Failed to merge {t1} and {t2}: {e}")
+                except IndexError:
+                    # The recomputed matrix can be smaller than the loop bounds
+                    # fixed above; out-of-range pairs are skipped.
+                    pass
+        if nothing_to_merge:
+            print("No more topics to merge.")
+            done = True
+else:
+    print("Skipping topic reduction")
+
+# %% [markdown]
+# ### Outlier Reduction
+#
+
+# %%
+if REDUCE_OUTLIERS:
+    new_topics = topic_model.reduce_outliers(
+        reviews,
+        topic_model.topics_,
+        probabilities=topic_model.probabilities_,
+        threshold=0.05,
+        strategy="probabilities",
+    )
+    topic_model.update_topics(reviews, topics=new_topics)
+
+# %% [markdown]
+# ## Results
+#
+# ### Classification
+#
+
+# %%
+CLASSIFICATION = False
+if CLASSIFICATION:
+    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
+    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
+    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
+
+    # Topic model document info
+    df = topic_model.get_document_info(reviews)
+    df["Original"] = reviews
+
+    # --- filter by topics (no length filter is applied here) ---
+    filtered = df[df["Topic"].isin(topics_to_keep)].copy()
+    filtered["Original"] = filtered["Original"].str.strip()
+
+    # Save an audit CSV
+    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
+    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
+
+# %%
+doc_topic_matrix = probs
+
+# column names
+topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]
+
+# index names
+docnames = ["Review " + str(i) for i in range(len(reviews))]
+
+# Make the pandas dataframe
+df_document_topic = pd.DataFrame(
+    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
+)
+
+# Get dominant topic for each document
+dominant_topic = np.argmax(doc_topic_matrix, axis=1)
+df_document_topic["dominant_topic"] = dominant_topic
+
+
+# Styling
+def color_stuff(val):
+    """Return the CSS color rule for one cell value.
+
+    Values above 0.1 are green, values above 0.05 are orange and everything
+    else is grey.
+    """
+    shade = "green" if val > 0.1 else ("orange" if val > 0.05 else "grey")
+    return "color: {col}".format(col=shade)
+
+
+def make_bold(val):
+    """Return the CSS font-weight rule: 700 for values above 0.1, else 400."""
+    if val > 0.1:
+        font_weight = 700
+    else:
+        font_weight = 400
+    return "font-weight: {weight}".format(weight=font_weight)
+
+
+# Apply Style
+df_document_topics = (
+    df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
+)
+df_document_topics
+
+# %% [markdown]
+# ### Document Visualization
+#
+
+# %%
+vis = topic_model.visualize_documents(
+    docs=reviews,
+    reduced_embeddings=reduced_embeddings,
+    custom_labels=True,
+    hide_annotations=True,
+)
+# vis.write_html("output/visualization.html")
+vis
+
+# %%
+topic_model.visualize_document_datamap(reviews, reduced_embeddings=reduced_embeddings)
+
+# %% [markdown]
+# ### Similarity Matrix
+#
+
+# %%
+topic_model.visualize_heatmap()
+
+# %% [markdown]
+# ### Topic Info
+#
+
+# %%
+topic_model.get_topic_info()
+
+# %% [markdown]
+# ### Semantic Coherence
+#
+
+# %%
+# Top words of every real topic; the outlier topic (negative id) is skipped
+topic_words = [
+    [word for word, _ in topic_model.get_topic(topic_id)]
+    for topic_id in topic_model.get_topic_info()["Topic"]
+    if topic_id >= 0
+]
+
+# Coherence of a topic = mean cosine similarity over all distinct pairs of
+# its top words, measured on the sentence embeddings of the words
+coherence_scores = []
+for words in topic_words:
+    sim_matrix = cosine_similarity(embedding_model.encode(words))
+
+    # The strict upper triangle holds every pair once and leaves out the
+    # diagonal, so self-similarity never enters the mean
+    pair_indices = np.triu_indices(sim_matrix.shape[0], k=1)
+    coherence_scores.append(np.mean(sim_matrix[pair_indices]))
+
+overall_coherence = np.mean(coherence_scores)
+
+print(len(reviews), "reviews processed")
+# Count the topics that were actually scored instead of assuming that the
+# topic info table always contains exactly one outlier row
+print(len(topic_words), "topics found")
+print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
+
+# %% [markdown]
+# ### Topic Coherence
+#
+
+# %%
+# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
+
+# This will most likely crash your PC
+this_will_crash_your_pc_are_you_sure = False
+if this_will_crash_your_pc_are_you_sure:
+    # Concatenate all reviews of a topic into one document per topic. The
+    # model's current assignments are used: the `topics` list returned by
+    # fit_transform no longer matches the model once topics were merged or
+    # outliers were reassigned.
+    documents = pd.DataFrame(
+        {
+            "Document": reviews,
+            "ID": range(len(reviews)),
+            "Topic": topic_model.topics_,
+        }
+    )
+    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
+        {"Document": " ".join}
+    )
+    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
+
+    # Extract vectorizer and analyzer from BERTopic
+    vectorizer = topic_model.vectorizer_model
+    analyzer = vectorizer.build_analyzer()
+
+    # Tokenize the per-topic documents and build the gensim dictionary/corpus
+    tokens = [analyzer(doc) for doc in cleaned_docs]
+    dictionary = corpora.Dictionary(tokens)
+    corpus = [dictionary.doc2bow(token) for token in tokens]
+
+    # Build the topic word lists from scratch (outlier topic skipped).
+    # Appending to a `topic_words` list that an earlier cell already filled
+    # would hand every topic to the coherence model twice.
+    topic_words = [
+        [word for word, _ in topic_model.get_topic(topic_id)]
+        for topic_id in topic_model.get_topic_info()["Topic"]
+        if topic_id >= 0
+    ]
+
+    # %env TOKENIZERS_PARALLELISM=false
+
+    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
+        coherence_model = CoherenceModel(
+            topics=topic_words,
+            texts=tokens,
+            corpus=corpus,
+            dictionary=dictionary,
+            coherence=measurement,
+        )
+        coherence_score = coherence_model.get_coherence()
+        print(f"Coherence ({measurement}): {coherence_score:.4f}")
+
+# %% [markdown]
+# ### Term Search
+#
+
+# %%
+# Print the topics closest to a search term, one "<similarity> <label>" per line
+search_term = "lempuyang"
+
+similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
+for topic_id, similarity in zip(similar_topics, similarities):
+    custom_name = topic_model.get_topic_info(topic_id)["CustomName"][0]
+    # The similarity is cut to its first five characters for display
+    print(f"{str(similarity)[:5]} {custom_name}")
+
+# %%
+# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
+# Calculate the topic distributions on a token-level
+
+if CALCULATE_TOKEN_DISTRIBUTIONS:
+    topic_distr, topic_token_distr = topic_model.approximate_distribution(
+        reviews, calculate_tokens=True, use_embedding_model=True
+    )
+
+# %%
+# Visualize the token-level distributions
+if CALCULATE_TOKEN_DISTRIBUTIONS:
+    DOC_INDEX = 1
+    df = topic_model.visualize_approximate_distribution(
+        reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
+    )
+    df
+
+# %% [markdown]
+# ### Topic Hierarchy
+#
+
+# %%
+topic_model.visualize_hierarchy(custom_labels=True)
+
+# %%
+hierarchical_topics = topic_model.hierarchical_topics(reviews)
+tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
+print(tree)
+
+# %% [markdown]
+# ### Intertopic Distance Map
+#
+
+# %%
+topic_model.visualize_topics(use_ctfidf=True)
+
+# %% [markdown]
+# ### Topic Word Scores
+#
+
+# %%
+topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
@@ -0,0 +1,290 @@
+[
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.498,
+      "diversity": 1.0,
+      "combined_score": 0.6486
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.498,
+      "diversity": 1.0,
+      "combined_score": 0.6486
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4915,
+      "diversity": 0.9666,
+      "combined_score": 0.634
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4915,
+      "diversity": 0.9666,
+      "combined_score": 0.634
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4531,
+      "diversity": 0.975,
+      "combined_score": 0.6096
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4531,
+      "diversity": 0.975,
+      "combined_score": 0.6096
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4617,
+      "diversity": 0.95,
+      "combined_score": 0.6082
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4617,
+      "diversity": 0.95,
+      "combined_score": 0.6082
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4287,
+      "diversity": 1.0,
+      "combined_score": 0.6001
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4287,
+      "diversity": 1.0,
+      "combined_score": 0.6001
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.427,
+      "diversity": 1.0,
+      "combined_score": 0.5989
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.1,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 5,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.427,
+      "diversity": 1.0,
+      "combined_score": 0.5989
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4462,
+      "diversity": 0.925,
+      "combined_score": 0.5898
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 3,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4462,
+      "diversity": 0.925,
+      "combined_score": 0.5898
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 10,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4456,
+      "diversity": 0.925,
+      "combined_score": 0.5894
+    }
+  },
+  {
+    "params": {
+      "min_dist": 0.01,
+      "min_document_frequency": 1,
+      "min_samples": 25,
+      "min_topic_size": 200,
+      "n_components": 2,
+      "n_gram_max": 2,
+      "n_neighbors": 15,
+      "nr_topics": "auto",
+      "top_n_words": 10
+    },
+    "metrics": {
+      "coherence": 0.4456,
+      "diversity": 0.925,
+      "combined_score": 0.5894
+    }
+  }
+]
@@ -131,3 +131,4 @@ spacy
 nbconvert
 jupytext
 datamapplot
+wordcloud