diff --git a/bertopic/bertopic_autotune.py b/bertopic/bertopic_autotune.py index e60064d..e7c3dd0 100644 --- a/bertopic/bertopic_autotune.py +++ b/bertopic/bertopic_autotune.py @@ -16,10 +16,10 @@ from bertopic import BERTopic param_grid = { "n_gram_max": [2, 3], # Vectorization - "min_document_frequency": [1], # Vectorization + "min_document_frequency": [1, 2], # Vectorization "min_samples": [10, 25], # HDBSCAN - "min_topic_size": [10, 20, 30, 40, 50], # HDBSCAN - "n_neighbors": [15], # UMAP + "min_topic_size": [100, 200], # HDBSCAN + "n_neighbors": [15, 25], # UMAP "n_components": [2, 5], # UMAP "min_dist": [0.01, 0.1], # UMAP "nr_topics": ["auto"], # Topic Modeling diff --git a/bertopic/bertopic_autotune_sorter.py b/bertopic/bertopic_autotune_sorter.py index 720f1b9..afa8fc7 100644 --- a/bertopic/bertopic_autotune_sorter.py +++ b/bertopic/bertopic_autotune_sorter.py @@ -5,7 +5,7 @@ import matplotlib.pyplot as plt with open("output/autotune.json", "r") as f: history = json.load(f) -history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False) +history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True) with open("output/autotune_sorted.json", "w") as f: json.dump(history, f, indent=2) diff --git a/bertopic/combined_score_distribution.png b/bertopic/combined_score_distribution.png index a18463f..0e45c43 100644 Binary files a/bertopic/combined_score_distribution.png and b/bertopic/combined_score_distribution.png differ diff --git a/bertopic/nb_bertopic_lowprep.py b/bertopic/nb_bertopic_lowprep.py index 855f44c..1ce589f 100644 --- a/bertopic/nb_bertopic_lowprep.py +++ b/bertopic/nb_bertopic_lowprep.py @@ -360,7 +360,6 @@ vis = topic_model.visualize_documents( custom_labels=True, hide_annotations=True, ) -# vis.write_html("output/visualization.html") vis # %% @@ -497,7 +496,12 @@ if CALCULATE_TOKEN_DISTRIBUTIONS: # # %% -topic_model.visualize_hierarchy(custom_labels=True) +topic_model.visualize_hierarchy(custom_labels=True, color_threshold=0.98) + +# %% +hierarchical_topics = topic_model.hierarchical_topics(reviews) +tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics) +print(tree) # %% [markdown] # ### Intertopic Distance Map @@ -512,3 +516,20 @@ topic_model.visualize_topics(use_ctfidf=True) # %% topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10) + +# %% +from wordcloud import WordCloud +import matplotlib.pyplot as plt + + +def create_wordcloud(model, topic): + text = {word: value for word, value in model.get_topic(topic)} + wc = WordCloud(background_color="white", max_words=1000) + wc.generate_from_frequencies(text) + plt.imshow(wc, interpolation="bilinear") + plt.axis("off") + plt.show() + + +# Show wordcloud +create_wordcloud(topic_model, topic=1) diff --git a/bertopic/nb_bertopic_temples.py b/bertopic/nb_bertopic_temples.py new file mode 100644 index 0000000..152cac5 --- /dev/null +++ b/bertopic/nb_bertopic_temples.py @@ -0,0 +1,519 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.18.0 +# kernelspec: +# display_name: .venv (3.12.3) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Topic Detection: Bali Tourist Reviews +# + +# %% [markdown] +# ## Preparation +# +# ### Dependency Loading +# + +# %% +import pickle +import re + +import gensim.corpora as corpora +import nltk +import numpy as np +import pandas as pd +from bertopic.representation import KeyBERTInspired +from bertopic.vectorizers import ClassTfidfTransformer +from gensim.models.coherencemodel import CoherenceModel +from hdbscan import HDBSCAN +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction import text as skltext +from sklearn.metrics.pairwise import cosine_similarity +from umap import UMAP + +from bertopic import BERTopic + +nltk.download("stopwords") +nltk.download("punkt") +nltk.download("wordnet") + +# %% [markdown] +# ### Hyperparameters and Settings +# + +# %% +RECREATE_MODEL = True +RECREATE_REDUCED_MODEL = True +PROCESS_DATA = True +REDUCE_OUTLIERS = False +CALCULATE_TOKEN_DISTRIBUTIONS = False + +# Data Sample Size, -1 for all data +DATA_SAMPLE_SIZE = -1 + +# Vectorization +MIN_DOCUMENT_FREQUENCY = 1 +MAX_NGRAM = 3 + +# HDBSCAN Parameters +MIN_TOPIC_SIZE = 15 +MIN_SAMPLES = 15 + +# UMAP Parameters +N_NEIGHBORS = 15 +N_COMPONENTS = 2 +MIN_DIST = 0.01 + +# Topic Modeling +TOP_N_WORDS = 10 +MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip + +TF_IDF_STOP_WORDS = ["bali", "place", "visit", "visited", "visiting"] + +# %% [markdown] +# ### Data Loading & Preprocessing +# + +# %% +# Import data after general preprocessing + +if DATA_SAMPLE_SIZE == -1: + reviews = pd.read_csv( + "../data/intermediate/culture_reviews.csv", sep="," + ).Original.to_list() +else: + reviews = ( + pd.read_csv("../data/intermediate/culture_reviews.csv", sep=",") + .sample(n=DATA_SAMPLE_SIZE) + .Original.to_list() + ) + +print("Loaded {} reviews".format(len(reviews))) + +# %% +rep = { + r"\\n": " ", + r"\n": " ", + r'\\"': "", + r'"': "", + r"\s+": " ", +} +rep = dict((re.escape(k), v) for k, v in rep.items()) +pattern = re.compile("|".join(rep.keys())) + + +def preprocess(text): + text = text.strip() + text = text.lower() + text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text) + return text + + +# %% +print( + preprocess( + "Excellent. Definitely worth coming while in bali. Food and people were very nice.\nđ 𤊠âď¸ \nTrisna was our host" + ) +) + +# %% +if PROCESS_DATA: + print("Processing reviews...") + reviews = [preprocess(review) for review in reviews] + + with open("../data/intermediate/processed_texts_culture.pkl", "wb") as f: + pickle.dump(reviews, f) +else: + with open("../data/intermediate/processed_texts_culture.pkl", "rb") as f: + reviews = pickle.load(f) + +print(reviews[:1]) + +# %% [markdown] +# ### Pre-calculate Embeddings +# + +# %% +embedding_model = SentenceTransformer("all-MiniLM-L6-v2") +embeddings = embedding_model.encode(reviews, show_progress_bar=True) + +# %% [markdown] +# ## Model Creation +# + +# %% [markdown] +# ### Dimensionality Reduction (UMAP) +# + +# %% +umap_model = UMAP( + n_neighbors=N_NEIGHBORS, + n_components=N_COMPONENTS, + min_dist=MIN_DIST, + metric="cosine", + low_memory=True, + random_state=42, +) +reduced_embeddings = umap_model.fit_transform(embeddings) + +# %% [markdown] +# ### BERTopic Model Creation +# + +# %% +if RECREATE_MODEL: + stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS)) + + ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) + vectorizer_model = CountVectorizer( + min_df=MIN_DOCUMENT_FREQUENCY, + ngram_range=(1, MAX_NGRAM), + stop_words=stop_words, + ) + + representation_model = KeyBERTInspired() + hdbscan_model = HDBSCAN( + min_cluster_size=MIN_TOPIC_SIZE, + min_samples=MIN_SAMPLES, + metric="euclidean", + cluster_selection_method="eom", + gen_min_span_tree=True, + prediction_data=True, + ) + + topic_model = BERTopic( + embedding_model=embedding_model, + ctfidf_model=ctfidf_model, + vectorizer_model=vectorizer_model, + umap_model=umap_model, + hdbscan_model=hdbscan_model, + representation_model=representation_model, + verbose=True, + calculate_probabilities=True, + language="english", + top_n_words=TOP_N_WORDS, + nr_topics=MAX_TOPICS, + ) + + topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings) + + topic_labels = topic_model.generate_topic_labels( + nr_words=3, topic_prefix=True, word_length=15, separator=" - " + ) + topic_model.set_topic_labels(topic_labels) + # BERTopic.save(topic_model, "bertopic/model.bertopic") +else: + print("Nevermind, loading existing model") + # topic_model = BERTopic.load("bertopic/model.bertopic") + +# %% [markdown] +# ## Fine Tuning +# +# ### Topic Condensation +# + +# %% +if RECREATE_REDUCED_MODEL: + done = False + iteration = 1 + while not done: + print(f"Iteration {iteration}") + iteration += 1 + similarity_matrix = cosine_similarity( + np.array(topic_model.topic_embeddings_)[1:, :] + ) + nothing_to_merge = True + + for i in range(similarity_matrix.shape[0]): + for j in range(i + 1, similarity_matrix.shape[1]): + try: + sim = similarity_matrix[i, j] + if sim > 0.9: + nothing_to_merge = False + t1, t2 = i, j + try: + t1_name = topic_model.get_topic_info(t1)["CustomName"][0] + t2_name = topic_model.get_topic_info(t2)["CustomName"][0] + print( + f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}" + ) + topic_model.merge_topics(reviews, topics_to_merge=[t1, t2]) + + topic_labels = topic_model.generate_topic_labels( + nr_words=3, + topic_prefix=True, + word_length=15, + separator=" - ", + ) + topic_model.set_topic_labels(topic_labels) + similarity_matrix = cosine_similarity( + np.array(topic_model.topic_embeddings_)[1:, :] + ) + except Exception as e: + print(f"Failed to merge {t1} and {t2}: {e}") + except IndexError: + pass + if nothing_to_merge: + print("No more topics to merge.") + done = True +else: + print("Skipping topic reduction") + +# %% [markdown] +# ### Outlier Reduction +# + +# %% +if REDUCE_OUTLIERS: + new_topics = topic_model.reduce_outliers( + reviews, + topic_model.topics_, + probabilities=topic_model.probabilities_, + threshold=0.05, + strategy="probabilities", + ) + topic_model.update_topics(reviews, topics=new_topics) + +# %% [markdown] +# ## Results +# +# ### Classification +# + +# %% +CLASSIFICATION = False +if CLASSIFICATION: + topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28} + INPUT_PATH = "../data/intermediate/preprocessed.tab" # TSV with a 'review' column + OUTPUT_CSV = "../data/intermediate/culture_reviews.csv" + + # Topic model document info + df = topic_model.get_document_info(reviews) + df["Original"] = reviews + + # --- filter by topics and length --- + filtered = df[df["Topic"].isin(topics_to_keep)].copy() + filtered["Original"] = filtered["Original"].str.strip() + + # Save an audit CSV + filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",") + print(f"Filtered CSV file saved to {OUTPUT_CSV}") + +# %% +doc_topic_matrix = probs + +# column names +topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)] + +# index names +docnames = ["Review " + str(i) for i in range(len(reviews))] + +# Make the pandas dataframe +df_document_topic = pd.DataFrame( + np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames +) + +# Get dominant topic for each document +dominant_topic = np.argmax(doc_topic_matrix, axis=1) +df_document_topic["dominant_topic"] = dominant_topic + + +# Styling +def color_stuff(val): + if val > 0.1: + color = "green" + elif val > 0.05: + color = "orange" + else: + color = "grey" + return "color: {col}".format(col=color) + + +def make_bold(val): + weight = 700 if val > 0.1 else 400 + return "font-weight: {weight}".format(weight=weight) + + +# Apply Style +df_document_topics = ( + df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold) +) +df_document_topics + +# %% [markdown] +# ### Document Visualization +# + +# %% +vis = topic_model.visualize_documents( + docs=reviews, + reduced_embeddings=reduced_embeddings, + custom_labels=True, + hide_annotations=True, +) +# vis.write_html("output/visualization.html") +vis + +# %% +topic_model.visualize_document_datamap(reviews, reduced_embeddings=reduced_embeddings) + +# %% [markdown] +# ### Similarity Matrix +# + +# %% +topic_model.visualize_heatmap() + +# %% [markdown] +# ### Topic Info +# + +# %% +topic_model.get_topic_info() + +# %% [markdown] +# ### Semantic Coherence +# + +# %% +topic_words = [] +for topic_id in topic_model.get_topic_info()["Topic"]: + # Skip outlier topic + if topic_id < 0: + continue + + words = [word for word, _ in topic_model.get_topic(topic_id)] + topic_words.append(words) + +# Compute mean pairwise cosine similarity for each topic +coherence_scores = [] +for words in topic_words: + coherence_embeddings = embedding_model.encode(words) + sim_matrix = cosine_similarity(coherence_embeddings) + + # Ignore self-similarity + np.fill_diagonal(sim_matrix, 0) + mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)]) + coherence_scores.append(mean_sim) + +overall_coherence = np.mean(coherence_scores) + +print(len(reviews), "reviews processed") +print(len(topic_model.get_topic_info()) - 1, "topics found") +print(f"BERT-based Topic Coherence: {overall_coherence:.4f}") + +# %% [markdown] +# ### Topic Coherence +# + +# %% +# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389 + +# This will most likely crash your PC +this_will_crash_your_pc_are_you_sure = False +if this_will_crash_your_pc_are_you_sure: + # Preprocess Documents + documents = pd.DataFrame( + {"Document": reviews, "ID": range(len(reviews)), "Topic": topics} + ) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg( + {"Document": " ".join} + ) + cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values) + + # Extract vectorizer and analyzer from BERTopic + vectorizer = topic_model.vectorizer_model + analyzer = vectorizer.build_analyzer() + + # Extract features for Topic Coherence evaluation + words = vectorizer.get_feature_names_out() + tokens = [analyzer(doc) for doc in cleaned_docs] + dictionary = corpora.Dictionary(tokens) + corpus = [dictionary.doc2bow(token) for token in tokens] + + for topic_id in topic_model.get_topic_info()["Topic"]: + # Skip outlier topic + if topic_id < 0: + continue + + words = [word for word, _ in topic_model.get_topic(topic_id)] + topic_words.append(words) + + # %env TOKENIZERS_PARALLELISM=false + + for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]: + coherence_model = CoherenceModel( + topics=topic_words, + texts=tokens, + corpus=corpus, + dictionary=dictionary, + coherence=measurement, + ) + coherence_score = coherence_model.get_coherence() + print(f"Coherence ({measurement}): {coherence_score:.4f}") + +# %% [markdown] +# ### Term Search +# + +# %% +search_term = "lempuyang" + +similar_topics, similarities = topic_model.find_topics(search_term, top_n=10) +for i in range(len(similar_topics)): + print( + f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])['CustomName'][0]}" + ) + +# %% +# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution +# Calculate the topic distributions on a token-level + +if CALCULATE_TOKEN_DISTRIBUTIONS: + topic_distr, topic_token_distr = topic_model.approximate_distribution( + reviews, calculate_tokens=True, use_embedding_model=True + ) + +# %% +# Visualize the token-level distributions +if CALCULATE_TOKEN_DISTRIBUTIONS: + DOC_INDEX = 1 + df = topic_model.visualize_approximate_distribution( + reviews[DOC_INDEX], topic_token_distr[DOC_INDEX] + ) + df + +# %% [markdown] +# ### Topic Hierarchy +# + +# %% +topic_model.visualize_hierarchy(custom_labels=True) + +# %% +hierarchical_topics = topic_model.hierarchical_topics(reviews) +tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics) +print(tree) + +# %% [markdown] +# ### Intertopic Distance Map +# + +# %% +topic_model.visualize_topics(use_ctfidf=True) + +# %% [markdown] +# ### Topic Word Scores +# + +# %% +topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10) diff --git a/bertopic/output/autotune.json b/bertopic/output/autotune.json index b292de6..775c5dc 100644 --- a/bertopic/output/autotune.json +++ b/bertopic/output/autotune.json @@ -3,1296 +3,288 @@ "params": { "min_dist": 0.01, "min_document_frequency": 1, - "min_topic_size": 30, + "min_samples": 10, + "min_topic_size": 200, "n_components": 2, "n_gram_max": 2, "n_neighbors": 15, - "nr_topics": 45, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.6977, - "diversity": 0.8681, - "inter_topic_distance": 0.6188, - "combined_score": 0.716 + "coherence": 0.4456, + "diversity": 0.925, + "combined_score": 0.5894 } }, { "params": { "min_dist": 0.01, "min_document_frequency": 1, - "min_topic_size": 30, + "min_samples": 10, + "min_topic_size": 200, "n_components": 2, - "n_gram_max": 2, + "n_gram_max": 3, "n_neighbors": 15, - "nr_topics": 50, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.6992, - "diversity": 0.8326, - "inter_topic_distance": 0.6125, - "combined_score": 0.7085 + "coherence": 0.4462, + "diversity": 0.925, + "combined_score": 0.5898 } }, { "params": { "min_dist": 0.01, "min_document_frequency": 1, - "min_topic_size": 30, - "n_components": 2, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 5, "n_gram_max": 2, "n_neighbors": 15, - "nr_topics": 55, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.7169, - "diversity": 0.8333, - "inter_topic_distance": 0.6094, - "combined_score": 0.7187 + "coherence": 0.4531, + "diversity": 0.975, + "combined_score": 0.6096 } }, { "params": { "min_dist": 0.01, "min_document_frequency": 1, - "min_topic_size": 30, - "n_components": 2, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 5, "n_gram_max": 3, "n_neighbors": 15, - "nr_topics": 45, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.7569, - "diversity": 0.9454, - "inter_topic_distance": 0.6408, - "combined_score": 0.7714 + "coherence": 0.4617, + "diversity": 0.95, + "combined_score": 0.6082 } }, { "params": { "min_dist": 0.01, "min_document_frequency": 1, - "min_topic_size": 30, + "min_samples": 25, + "min_topic_size": 200, "n_components": 2, - "n_gram_max": 3, + "n_gram_max": 2, "n_neighbors": 15, - "nr_topics": 50, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.7651, - "diversity": 0.9265, - "inter_topic_distance": 0.6415, - "combined_score": 0.7727 + "coherence": 0.4456, + "diversity": 0.925, + "combined_score": 0.5894 } }, { "params": { "min_dist": 0.01, "min_document_frequency": 1, - "min_topic_size": 30, + "min_samples": 25, + "min_topic_size": 200, "n_components": 2, "n_gram_max": 3, "n_neighbors": 15, - "nr_topics": 55, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.762, - "diversity": 0.9074, - "inter_topic_distance": 0.6316, - "combined_score": 0.765 + "coherence": 0.4462, + "diversity": 0.925, + "combined_score": 0.5898 } }, { "params": { "min_dist": 0.01, "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 5, "n_gram_max": 2, "n_neighbors": 15, - "nr_topics": 45, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.6986, - "diversity": 0.8681, - "inter_topic_distance": 0.6158, - "combined_score": 0.7159 + "coherence": 0.4531, + "diversity": 0.975, + "combined_score": 0.6096 } }, { "params": { "min_dist": 0.01, "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7115, - "diversity": 0.853, - "inter_topic_distance": 0.609, - "combined_score": 0.7193 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7274, - "diversity": 0.8111, - "inter_topic_distance": 0.5989, - "combined_score": 0.7184 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 5, "n_gram_max": 3, "n_neighbors": 15, - "nr_topics": 45, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.7412, - "diversity": 0.9318, - "inter_topic_distance": 0.6326, - "combined_score": 0.7576 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7633, - "diversity": 0.9346, - "inter_topic_distance": 0.6322, - "combined_score": 0.7713 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7663, - "diversity": 0.9148, - "inter_topic_distance": 0.6197, - "combined_score": 0.7667 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7071, - "diversity": 0.8681, - "inter_topic_distance": 0.5978, - "combined_score": 0.7174 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7107, - "diversity": 0.804, - "inter_topic_distance": 0.584, - "combined_score": 0.704 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7038, - "diversity": 0.7843, - "inter_topic_distance": 0.5799, - "combined_score": 0.6951 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7645, - "diversity": 0.8954, - "inter_topic_distance": 0.6187, - "combined_score": 0.7615 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7697, - "diversity": 0.8979, - "inter_topic_distance": 0.6101, - "combined_score": 0.7634 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7727, - "diversity": 0.8823, - "inter_topic_distance": 0.6038, - "combined_score": 0.7608 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.694, - "diversity": 0.859, - "inter_topic_distance": 0.6127, - "combined_score": 0.7107 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7012, - "diversity": 0.8244, - "inter_topic_distance": 0.6121, - "combined_score": 0.708 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7101, - "diversity": 0.8259, - "inter_topic_distance": 0.6051, - "combined_score": 0.7123 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7534, - "diversity": 0.9409, - "inter_topic_distance": 0.6375, - "combined_score": 0.7677 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7676, - "diversity": 0.9102, - "inter_topic_distance": 0.6424, - "combined_score": 0.7711 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7633, - "diversity": 0.9148, - "inter_topic_distance": 0.6367, - "combined_score": 0.7683 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.6954, - "diversity": 0.8727, - "inter_topic_distance": 0.6101, - "combined_score": 0.7138 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7147, - "diversity": 0.8693, - "inter_topic_distance": 0.6128, - "combined_score": 0.7253 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7173, - "diversity": 0.8148, - "inter_topic_distance": 0.5932, - "combined_score": 0.712 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7624, - "diversity": 0.9454, - "inter_topic_distance": 0.6307, - "combined_score": 0.7727 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7666, - "diversity": 0.9224, - "inter_topic_distance": 0.635, - "combined_score": 0.7714 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7865, - "diversity": 0.9074, - "inter_topic_distance": 0.6205, - "combined_score": 0.7775 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.715, - "diversity": 0.8545, - "inter_topic_distance": 0.5973, - "combined_score": 0.7194 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.713, - "diversity": 0.8081, - "inter_topic_distance": 0.5833, - "combined_score": 0.7061 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7079, - "diversity": 0.796, - "inter_topic_distance": 0.5778, - "combined_score": 0.6995 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7703, - "diversity": 0.8954, - "inter_topic_distance": 0.619, - "combined_score": 0.7651 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7723, - "diversity": 0.9142, - "inter_topic_distance": 0.6135, - "combined_score": 0.7689 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7764, - "diversity": 0.9019, - "inter_topic_distance": 0.6073, - "combined_score": 0.7677 + "coherence": 0.4617, + "diversity": 0.95, + "combined_score": 0.6082 } }, { "params": { "min_dist": 0.1, "min_document_frequency": 1, - "min_topic_size": 30, + "min_samples": 10, + "min_topic_size": 200, "n_components": 2, "n_gram_max": 2, "n_neighbors": 15, - "nr_topics": 45, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.6901, - "diversity": 0.8727, - "inter_topic_distance": 0.6055, - "combined_score": 0.7097 + "coherence": 0.498, + "diversity": 1.0, + "combined_score": 0.6486 } }, { "params": { "min_dist": 0.1, "min_document_frequency": 1, - "min_topic_size": 30, + "min_samples": 10, + "min_topic_size": 200, "n_components": 2, - "n_gram_max": 2, + "n_gram_max": 3, "n_neighbors": 15, - "nr_topics": 50, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.6928, - "diversity": 0.8489, - "inter_topic_distance": 0.5986, - "combined_score": 0.7052 + "coherence": 0.4915, + "diversity": 0.9666, + "combined_score": 0.634 } }, { "params": { "min_dist": 0.1, "min_document_frequency": 1, - "min_topic_size": 30, - "n_components": 2, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 5, "n_gram_max": 2, "n_neighbors": 15, - "nr_topics": 55, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.713, - "diversity": 0.8192, - "inter_topic_distance": 0.5992, - "combined_score": 0.7115 + "coherence": 0.4287, + "diversity": 1.0, + "combined_score": 0.6001 } }, { "params": { "min_dist": 0.1, "min_document_frequency": 1, - "min_topic_size": 30, - "n_components": 2, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 5, "n_gram_max": 3, "n_neighbors": 15, - "nr_topics": 45, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.7559, - "diversity": 0.9363, - "inter_topic_distance": 0.64, - "combined_score": 0.7688 + "coherence": 0.427, + "diversity": 1.0, + "combined_score": 0.5989 } }, { "params": { "min_dist": 0.1, "min_document_frequency": 1, - "min_topic_size": 30, + "min_samples": 25, + "min_topic_size": 200, "n_components": 2, - "n_gram_max": 3, + "n_gram_max": 2, "n_neighbors": 15, - "nr_topics": 50, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.7609, - "diversity": 0.9306, - "inter_topic_distance": 0.6287, - "combined_score": 0.7684 + "coherence": 0.498, + "diversity": 1.0, + "combined_score": 0.6486 } }, { "params": { "min_dist": 0.1, "min_document_frequency": 1, - "min_topic_size": 30, + "min_samples": 25, + "min_topic_size": 200, "n_components": 2, "n_gram_max": 3, "n_neighbors": 15, - "nr_topics": 55, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.7739, - "diversity": 0.9153, - "inter_topic_distance": 0.6167, - "combined_score": 0.7707 + "coherence": 0.4915, + "diversity": 0.9666, + "combined_score": 0.634 } }, { "params": { "min_dist": 0.1, "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 5, "n_gram_max": 2, "n_neighbors": 15, - "nr_topics": 45, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.7098, - "diversity": 0.8318, - "inter_topic_distance": 0.5839, - "combined_score": 0.709 + "coherence": 0.4287, + "diversity": 1.0, + "combined_score": 0.6001 } }, { "params": { "min_dist": 0.1, "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7076, - "diversity": 0.8304, - "inter_topic_distance": 0.585, - "combined_score": 0.7076 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7076, - "diversity": 0.8304, - "inter_topic_distance": 0.585, - "combined_score": 0.7076 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 5, "n_gram_max": 3, "n_neighbors": 15, - "nr_topics": 45, + "nr_topics": "auto", "top_n_words": 10 }, "metrics": { - "coherence": 0.7736, - "diversity": 0.9318, - "inter_topic_distance": 0.6133, - "combined_score": 0.7732 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7809, - "diversity": 0.9304, - "inter_topic_distance": 0.6066, - "combined_score": 0.776 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7809, - "diversity": 0.9304, - "inter_topic_distance": 0.6066, - "combined_score": 0.776 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.6938, - "diversity": 0.8545, - "inter_topic_distance": 0.5996, - "combined_score": 0.7071 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7049, - "diversity": 0.8244, - "inter_topic_distance": 0.5761, - "combined_score": 0.703 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7049, - "diversity": 0.8244, - "inter_topic_distance": 0.5761, - "combined_score": 0.703 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7604, - "diversity": 0.9363, - "inter_topic_distance": 0.6263, - "combined_score": 0.7687 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7722, - "diversity": 0.9142, - "inter_topic_distance": 0.6033, - "combined_score": 0.7668 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7722, - "diversity": 0.9142, - "inter_topic_distance": 0.6033, - "combined_score": 0.7668 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7003, - "diversity": 0.8636, - "inter_topic_distance": 0.6077, - "combined_score": 0.7145 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7026, - "diversity": 0.853, - "inter_topic_distance": 0.6005, - "combined_score": 0.7123 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7199, - "diversity": 0.8269, - "inter_topic_distance": 0.5979, - "combined_score": 0.7169 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7635, - "diversity": 0.9363, - "inter_topic_distance": 0.6376, - "combined_score": 0.7729 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7691, - "diversity": 0.9346, - "inter_topic_distance": 0.6297, - "combined_score": 0.7743 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 30, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7828, - "diversity": 0.923, - "inter_topic_distance": 0.6195, - "combined_score": 0.7782 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7203, - "diversity": 0.8227, - "inter_topic_distance": 0.5915, - "combined_score": 0.715 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7111, - "diversity": 0.8173, - "inter_topic_distance": 0.585, - "combined_score": 0.7071 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7111, - "diversity": 0.8173, - "inter_topic_distance": 0.585, - "combined_score": 0.7071 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7832, - "diversity": 0.9227, - "inter_topic_distance": 0.6168, - "combined_score": 0.7778 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7819, - "diversity": 0.926, - "inter_topic_distance": 0.6088, - "combined_score": 0.7761 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 40, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7819, - "diversity": 0.926, - "inter_topic_distance": 0.6088, - "combined_score": 0.7761 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7075, - "diversity": 0.8681, - "inter_topic_distance": 0.6035, - "combined_score": 0.7188 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7164, - "diversity": 0.8204, - "inter_topic_distance": 0.5796, - "combined_score": 0.7098 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7164, - "diversity": 0.8204, - "inter_topic_distance": 0.5796, - "combined_score": 0.7098 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 45, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7672, - "diversity": 0.9409, - "inter_topic_distance": 0.6252, - "combined_score": 0.7735 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 50, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7794, - "diversity": 0.9224, - "inter_topic_distance": 0.6077, - "combined_score": 0.7737 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 2, - "min_topic_size": 50, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": 55, - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.7794, - "diversity": 0.9224, - "inter_topic_distance": 0.6077, - "combined_score": 0.7737 + "coherence": 0.427, + "diversity": 1.0, + "combined_score": 0.5989 } } ] \ No newline at end of file diff --git a/bertopic/output/autotune_sorted.json b/bertopic/output/autotune_sorted.json new file mode 100644 index 0000000..7f30e44 --- /dev/null +++ b/bertopic/output/autotune_sorted.json @@ -0,0 +1,290 @@ +[ + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.498, + "diversity": 1.0, + "combined_score": 0.6486 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.498, + "diversity": 1.0, + "combined_score": 0.6486 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4915, + "diversity": 0.9666, + "combined_score": 0.634 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4915, + "diversity": 0.9666, + "combined_score": 0.634 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 5, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4531, + "diversity": 0.975, + "combined_score": 0.6096 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 5, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4531, + "diversity": 0.975, + "combined_score": 0.6096 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 5, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4617, + "diversity": 0.95, + "combined_score": 0.6082 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 5, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4617, + "diversity": 0.95, + "combined_score": 0.6082 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 5, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4287, + "diversity": 1.0, + "combined_score": 0.6001 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 5, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4287, + "diversity": 1.0, + "combined_score": 0.6001 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 5, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.427, + "diversity": 1.0, + "combined_score": 0.5989 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 5, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.427, + "diversity": 1.0, + "combined_score": 0.5989 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4462, + "diversity": 0.925, + "combined_score": 0.5898 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4462, + "diversity": 0.925, + "combined_score": 0.5898 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_samples": 10, + "min_topic_size": 200, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4456, + "diversity": 0.925, + "combined_score": 0.5894 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_samples": 25, + "min_topic_size": 200, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": "auto", + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.4456, + "diversity": 0.925, + "combined_score": 0.5894 + } + } +] \ No newline at end of file diff --git a/bertopic/output/visualization.html b/bertopic/output/visualization.html deleted file mode 100644 index 7b4a048..0000000 --- a/bertopic/output/visualization.html +++ /dev/null @@ -1,3885 +0,0 @@ - -
- -