diff --git a/README.md b/README.md new file mode 100644 index 0000000..c75509e --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# Masterthesis, praktischer Anteil + +## Jupyter Notebooks "rehydrieren" + +Damit keine unnötigen Jupyter Outputs etc. im Versionsmanagement landen, gibt es das Skript `convert_jupytext.sh`, welches nur den notwendigen Quelltext in ein `.py` File schreibt. Mit demselben Skript kann dieser Schritt wieder umgekehrt werden, also ein Jupyter Notebook aus dem Python-File geschrieben werden. + +Das Skript sollte also immer vor dem Committen von Änderungen mit `py` als erstem Argument ausgeführt werden. + +Verwendung: + +```bash +./convert_jupytext.sh py # Jupyter Notebook -> Python +./convert_jupytext.sh nb # Python -> Jupyter Notebook +``` diff --git a/bertopic/bertopic_autotune.py b/bertopic/bertopic_autotune.py index ada91fe..d4c21f9 100644 --- a/bertopic/bertopic_autotune.py +++ b/bertopic/bertopic_autotune.py @@ -3,6 +3,8 @@ import traceback import numpy as np import pandas as pd +from bertopic.representation import KeyBERTInspired +from bertopic.vectorizers import ClassTfidfTransformer from hdbscan import HDBSCAN from sentence_transformers import SentenceTransformer from sklearn.feature_extraction.text import CountVectorizer @@ -12,55 +14,50 @@ from sklearn.model_selection import ParameterGrid from umap import UMAP from bertopic import BERTopic -from bertopic.representation import KeyBERTInspired -from bertopic.vectorizers import ClassTfidfTransformer param_grid = { - "nr_topics": [45, 50, 55], - "min_topic_size": [30, 40, 50], - "n_gram_max": [3], - "min_document_frequency": [1, 2], - "n_neighbors": [15], - "n_components": [2], - "min_dist": [0.1], - "top_n_words": [10], + "n_gram_max": [2, 3], # Vectorization + "min_document_frequency": [1], # Vectorization + "min_samples": [10, 25], # HDBSCAN + "min_topic_size": [10, 20, 30, 40, 50], # HDBSCAN + "n_neighbors": [15], # UMAP + "n_components": [2, 5], # UMAP + "min_dist": [0.01, 0.1], # 
UMAP + "nr_topics": ["auto"], # Topic Modeling + "top_n_words": [10, 13, 15, 17, 20], # Topic Modeling } -def calculate_metrics(topic_model, embedder, top_n_words=5): +def calculate_metrics(topic_model, embedder, top_n_words=10): # Get topic words topic_words = [] for topic_id in range(len(topic_model.get_topic_info()) - 1): words = [word for word, _ in topic_model.get_topic(topic_id)] topic_words.append(words[:top_n_words]) + # Pre-compute embeddings for all unique words + all_words = list(set(word for words in topic_words for word in words)) + word_embeddings = embedder.encode(all_words) + embedding_map = {word: emb for word, emb in zip(all_words, word_embeddings)} + # Coherence coherence_scores = [] for words in topic_words: - embeddings = embedder.encode(words) + embeddings = np.array([embedding_map[word] for word in words]) sim_matrix = cosine_similarity(embeddings) np.fill_diagonal(sim_matrix, 0) - coherence_scores.append(np.mean(sim_matrix)) + mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)]) + coherence_scores.append(mean_sim) overall_coherence = np.mean(coherence_scores) # Diversity all_topic_words = [word for topic in topic_words for word in topic] diversity = len(set(all_topic_words)) / len(all_topic_words) - # Inter-topic distance - topic_embeddings = [ - np.mean(embedder.encode(words), axis=0) for words in topic_words - ] - topic_distance = pairwise_distances(topic_embeddings, metric="cosine") - avg_distance = np.mean(topic_distance[np.triu_indices_from(topic_distance, k=1)]) - res = { "coherence": float(str(overall_coherence)[:6]), "diversity": float(str(diversity)[:6]), - "inter_topic_distance": float(str(avg_distance)[:6]), - "combined_score": float( - str(0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance)[:6] - ), + "combined_score": float(str(0.7 * overall_coherence + 0.3 * diversity)[:6]), } print(res) return res @@ -85,6 +82,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid): print(f"Total 
parameter combinations: {len(param_list)}") for params in param_list: + print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...") try: print(f"Testing params: {params}") ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) @@ -143,18 +141,27 @@ def auto_tune_bertopic(texts, embedding_model, param_grid): traceback.print_exc() continue - return best_model, best_params, best_score, history + with open("output/autotune.json", "w") as f: + json.dump(history, f, indent=2) + + return best_model, best_params, best_score SPECIAL_CHARS = ["\n", "\\n"] MIN_REVIEW_WORDS = 5 -reviews = pd.read_csv("data.tab", sep="\t").review.to_list() +print("Loading reviews...") +reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list() +print("Running light preprocessing...") for schar in SPECIAL_CHARS: reviews = [ review.replace(schar, " ") if isinstance(review, str) else review for review in reviews ] + +print("Filtering short reviews...") reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS] + +print("Staring auto-tuning...") print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid)) diff --git a/bertopic/bertopic_autotune_sorter.py b/bertopic/bertopic_autotune_sorter.py index 9da3eca..720f1b9 100644 --- a/bertopic/bertopic_autotune_sorter.py +++ b/bertopic/bertopic_autotune_sorter.py @@ -2,12 +2,12 @@ import json import matplotlib.pyplot as plt -with open("history.json", "r") as f: +with open("output/autotune.json", "r") as f: history = json.load(f) -history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True) +history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False) -with open("history_sorted.json", "w") as f: +with open("output/autotune_sorted.json", "w") as f: json.dump(history, f, indent=2) diff --git a/bertopic/combined_score_distribution.png b/bertopic/combined_score_distribution.png new file mode 100644 index 0000000..a18463f 
Binary files /dev/null and b/bertopic/combined_score_distribution.png differ diff --git a/bertopic/nb_bertopic.py b/bertopic/nb_bertopic.py index 08042f6..dd2ea41 100644 --- a/bertopic/nb_bertopic.py +++ b/bertopic/nb_bertopic.py @@ -23,7 +23,15 @@ # # %% -from bertopic import BERTopic +import json +import pickle +import re + +import gensim.corpora as corpora +import nltk +import numpy as np +import pandas as pd +import spacy from bertopic.representation import KeyBERTInspired from bertopic.vectorizers import ClassTfidfTransformer from gensim.models.coherencemodel import CoherenceModel @@ -34,14 +42,8 @@ from sentence_transformers import SentenceTransformer from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity from umap import UMAP -import gensim.corpora as corpora -import json -import nltk -import numpy as np -import pandas as pd -import re -import spacy -import pickle + +from bertopic import BERTopic nlp = spacy.load("en_core_web_sm") @@ -323,8 +325,8 @@ if REDUCE_OUTLIERS: # # %% -from pathlib import Path import random +from pathlib import Path # --- config --- topics_to_keep = {2, 4, 6, 8, 10, 5, 7} @@ -468,7 +470,11 @@ topic_model.get_topic_info() # %% topic_words = [] -for topic_id in range(len(topic_model.get_topic_info()) - 1): +for topic_id in topic_model.get_topic_info()["Topic"]: + # Skip outlier topic + if topic_id < 0: + continue + words = [word for word, _ in topic_model.get_topic(topic_id)] topic_words.append(words) @@ -477,8 +483,10 @@ coherence_scores = [] for words in topic_words: coherence_embeddings = embedding_model.encode(words) sim_matrix = cosine_similarity(coherence_embeddings) - np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity - mean_sim = np.mean(sim_matrix) + + # Ignore self-similarity + np.fill_diagonal(sim_matrix, 0) + mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)]) coherence_scores.append(mean_sim) overall_coherence = 
np.mean(coherence_scores) @@ -518,8 +526,8 @@ if CALCULATE_COHERENCE: for topic in range(len(set(topics)) - 1) ] - # %env TOKENIZERS_PARALLELISM=false - + # %env TOKENIZERS_PARALLELISM=false + for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]: coherence_model = CoherenceModel( topics=topic_words, diff --git a/bertopic/nb_bertopic_lowprep.py b/bertopic/nb_bertopic_lowprep.py index 86d53c9..c080624 100644 --- a/bertopic/nb_bertopic_lowprep.py +++ b/bertopic/nb_bertopic_lowprep.py @@ -23,7 +23,14 @@ # # %% -from bertopic import BERTopic +import pickle +import re + +import gensim.corpora as corpora +import nltk +import numpy as np +import pandas as pd +import spacy from bertopic.representation import KeyBERTInspired from bertopic.vectorizers import ClassTfidfTransformer from gensim.models.coherencemodel import CoherenceModel @@ -33,13 +40,8 @@ from sentence_transformers import SentenceTransformer from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity from umap import UMAP -import gensim.corpora as corpora -import nltk -import numpy as np -import pandas as pd -import re -import spacy -import pickle + +from bertopic import BERTopic nlp = spacy.load("en_core_web_sm") @@ -300,8 +302,8 @@ if REDUCE_OUTLIERS: # # %% -from pathlib import Path import random +from pathlib import Path # --- config --- topics_to_keep = {2, 4, 5, 9, 22, 26} @@ -445,7 +447,11 @@ topic_model.get_topic_info() # %% topic_words = [] -for topic_id in range(len(topic_model.get_topic_info()) - 1): +for topic_id in topic_model.get_topic_info()["Topic"]: + # Skip outlier topic + if topic_id < 0: + continue + words = [word for word, _ in topic_model.get_topic(topic_id)] topic_words.append(words) @@ -454,8 +460,10 @@ coherence_scores = [] for words in topic_words: coherence_embeddings = embedding_model.encode(words) sim_matrix = cosine_similarity(coherence_embeddings) - np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity - mean_sim = 
np.mean(sim_matrix) + + # Ignore self-similarity + np.fill_diagonal(sim_matrix, 0) + mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)]) coherence_scores.append(mean_sim) overall_coherence = np.mean(coherence_scores) @@ -492,10 +500,14 @@ if this_will_crash_your_pc_are_you_sure: tokens = [analyzer(doc) for doc in cleaned_docs] dictionary = corpora.Dictionary(tokens) corpus = [dictionary.doc2bow(token) for token in tokens] - topic_words = [ - [words for words, _ in topic_model.get_topic(topic)] - for topic in range(len(set(topics)) - 1) - ] + + for topic_id in topic_model.get_topic_info()["Topic"]: + # Skip outlier topic + if topic_id < 0: + continue + + words = [word for word, _ in topic_model.get_topic(topic_id)] + topic_words.append(words) # %env TOKENIZERS_PARALLELISM=false diff --git a/bertopic/output/autotune.json b/bertopic/output/autotune.json new file mode 100644 index 0000000..b292de6 --- /dev/null +++ b/bertopic/output/autotune.json @@ -0,0 +1,1298 @@ +[ + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.6977, + "diversity": 0.8681, + "inter_topic_distance": 0.6188, + "combined_score": 0.716 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.6992, + "diversity": 0.8326, + "inter_topic_distance": 0.6125, + "combined_score": 0.7085 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7169, + "diversity": 0.8333, + "inter_topic_distance": 0.6094, + "combined_score": 0.7187 + } + }, + { + 
"params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7569, + "diversity": 0.9454, + "inter_topic_distance": 0.6408, + "combined_score": 0.7714 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7651, + "diversity": 0.9265, + "inter_topic_distance": 0.6415, + "combined_score": 0.7727 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.762, + "diversity": 0.9074, + "inter_topic_distance": 0.6316, + "combined_score": 0.765 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.6986, + "diversity": 0.8681, + "inter_topic_distance": 0.6158, + "combined_score": 0.7159 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7115, + "diversity": 0.853, + "inter_topic_distance": 0.609, + "combined_score": 0.7193 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7274, + "diversity": 0.8111, + "inter_topic_distance": 0.5989, + "combined_score": 0.7184 + } + }, + { + "params": { + "min_dist": 
0.01, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7412, + "diversity": 0.9318, + "inter_topic_distance": 0.6326, + "combined_score": 0.7576 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7633, + "diversity": 0.9346, + "inter_topic_distance": 0.6322, + "combined_score": 0.7713 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7663, + "diversity": 0.9148, + "inter_topic_distance": 0.6197, + "combined_score": 0.7667 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7071, + "diversity": 0.8681, + "inter_topic_distance": 0.5978, + "combined_score": 0.7174 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7107, + "diversity": 0.804, + "inter_topic_distance": 0.584, + "combined_score": 0.704 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7038, + "diversity": 0.7843, + "inter_topic_distance": 0.5799, + "combined_score": 0.6951 + } + }, + { + "params": { + "min_dist": 0.01, + 
"min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7645, + "diversity": 0.8954, + "inter_topic_distance": 0.6187, + "combined_score": 0.7615 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7697, + "diversity": 0.8979, + "inter_topic_distance": 0.6101, + "combined_score": 0.7634 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7727, + "diversity": 0.8823, + "inter_topic_distance": 0.6038, + "combined_score": 0.7608 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.694, + "diversity": 0.859, + "inter_topic_distance": 0.6127, + "combined_score": 0.7107 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7012, + "diversity": 0.8244, + "inter_topic_distance": 0.6121, + "combined_score": 0.708 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7101, + "diversity": 0.8259, + "inter_topic_distance": 0.6051, + "combined_score": 0.7123 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, 
+ "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7534, + "diversity": 0.9409, + "inter_topic_distance": 0.6375, + "combined_score": 0.7677 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7676, + "diversity": 0.9102, + "inter_topic_distance": 0.6424, + "combined_score": 0.7711 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7633, + "diversity": 0.9148, + "inter_topic_distance": 0.6367, + "combined_score": 0.7683 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.6954, + "diversity": 0.8727, + "inter_topic_distance": 0.6101, + "combined_score": 0.7138 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7147, + "diversity": 0.8693, + "inter_topic_distance": 0.6128, + "combined_score": 0.7253 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7173, + "diversity": 0.8148, + "inter_topic_distance": 0.5932, + "combined_score": 0.712 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 40, + 
"n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7624, + "diversity": 0.9454, + "inter_topic_distance": 0.6307, + "combined_score": 0.7727 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7666, + "diversity": 0.9224, + "inter_topic_distance": 0.635, + "combined_score": 0.7714 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7865, + "diversity": 0.9074, + "inter_topic_distance": 0.6205, + "combined_score": 0.7775 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.715, + "diversity": 0.8545, + "inter_topic_distance": 0.5973, + "combined_score": 0.7194 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.713, + "diversity": 0.8081, + "inter_topic_distance": 0.5833, + "combined_score": 0.7061 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7079, + "diversity": 0.796, + "inter_topic_distance": 0.5778, + "combined_score": 0.6995 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + 
"n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7703, + "diversity": 0.8954, + "inter_topic_distance": 0.619, + "combined_score": 0.7651 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7723, + "diversity": 0.9142, + "inter_topic_distance": 0.6135, + "combined_score": 0.7689 + } + }, + { + "params": { + "min_dist": 0.01, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7764, + "diversity": 0.9019, + "inter_topic_distance": 0.6073, + "combined_score": 0.7677 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.6901, + "diversity": 0.8727, + "inter_topic_distance": 0.6055, + "combined_score": 0.7097 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.6928, + "diversity": 0.8489, + "inter_topic_distance": 0.5986, + "combined_score": 0.7052 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.713, + "diversity": 0.8192, + "inter_topic_distance": 0.5992, + "combined_score": 0.7115 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + 
"n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7559, + "diversity": 0.9363, + "inter_topic_distance": 0.64, + "combined_score": 0.7688 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7609, + "diversity": 0.9306, + "inter_topic_distance": 0.6287, + "combined_score": 0.7684 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7739, + "diversity": 0.9153, + "inter_topic_distance": 0.6167, + "combined_score": 0.7707 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7098, + "diversity": 0.8318, + "inter_topic_distance": 0.5839, + "combined_score": 0.709 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7076, + "diversity": 0.8304, + "inter_topic_distance": 0.585, + "combined_score": 0.7076 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7076, + "diversity": 0.8304, + "inter_topic_distance": 0.585, + "combined_score": 0.7076 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 
45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7736, + "diversity": 0.9318, + "inter_topic_distance": 0.6133, + "combined_score": 0.7732 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7809, + "diversity": 0.9304, + "inter_topic_distance": 0.6066, + "combined_score": 0.776 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7809, + "diversity": 0.9304, + "inter_topic_distance": 0.6066, + "combined_score": 0.776 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.6938, + "diversity": 0.8545, + "inter_topic_distance": 0.5996, + "combined_score": 0.7071 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7049, + "diversity": 0.8244, + "inter_topic_distance": 0.5761, + "combined_score": 0.703 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7049, + "diversity": 0.8244, + "inter_topic_distance": 0.5761, + "combined_score": 0.703 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + 
"metrics": { + "coherence": 0.7604, + "diversity": 0.9363, + "inter_topic_distance": 0.6263, + "combined_score": 0.7687 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7722, + "diversity": 0.9142, + "inter_topic_distance": 0.6033, + "combined_score": 0.7668 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 1, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7722, + "diversity": 0.9142, + "inter_topic_distance": 0.6033, + "combined_score": 0.7668 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7003, + "diversity": 0.8636, + "inter_topic_distance": 0.6077, + "combined_score": 0.7145 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7026, + "diversity": 0.853, + "inter_topic_distance": 0.6005, + "combined_score": 0.7123 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7199, + "diversity": 0.8269, + "inter_topic_distance": 0.5979, + "combined_score": 0.7169 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 
0.7635, + "diversity": 0.9363, + "inter_topic_distance": 0.6376, + "combined_score": 0.7729 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7691, + "diversity": 0.9346, + "inter_topic_distance": 0.6297, + "combined_score": 0.7743 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 30, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7828, + "diversity": 0.923, + "inter_topic_distance": 0.6195, + "combined_score": 0.7782 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7203, + "diversity": 0.8227, + "inter_topic_distance": 0.5915, + "combined_score": 0.715 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7111, + "diversity": 0.8173, + "inter_topic_distance": 0.585, + "combined_score": 0.7071 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7111, + "diversity": 0.8173, + "inter_topic_distance": 0.585, + "combined_score": 0.7071 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7832, + "diversity": 0.9227, + 
"inter_topic_distance": 0.6168, + "combined_score": 0.7778 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7819, + "diversity": 0.926, + "inter_topic_distance": 0.6088, + "combined_score": 0.7761 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 40, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7819, + "diversity": 0.926, + "inter_topic_distance": 0.6088, + "combined_score": 0.7761 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7075, + "diversity": 0.8681, + "inter_topic_distance": 0.6035, + "combined_score": 0.7188 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7164, + "diversity": 0.8204, + "inter_topic_distance": 0.5796, + "combined_score": 0.7098 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 2, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7164, + "diversity": 0.8204, + "inter_topic_distance": 0.5796, + "combined_score": 0.7098 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 45, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7672, + "diversity": 0.9409, + "inter_topic_distance": 0.6252, + 
"combined_score": 0.7735 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 50, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7794, + "diversity": 0.9224, + "inter_topic_distance": 0.6077, + "combined_score": 0.7737 + } + }, + { + "params": { + "min_dist": 0.1, + "min_document_frequency": 2, + "min_topic_size": 50, + "n_components": 2, + "n_gram_max": 3, + "n_neighbors": 15, + "nr_topics": 55, + "top_n_words": 10 + }, + "metrics": { + "coherence": 0.7794, + "diversity": 0.9224, + "inter_topic_distance": 0.6077, + "combined_score": 0.7737 + } + } +] \ No newline at end of file diff --git a/bertopic/output/visualization.html b/bertopic/output/visualization.html index 36722f3..7b4a048 100644 --- a/bertopic/output/visualization.html +++ b/bertopic/output/visualization.html @@ -3880,6 +3880,6 @@ maplibre-gl/dist/maplibre-gl.js: window.Plotly = Plotly; return Plotly; -}));