diff --git a/bertopic/combined_score_distribution.png b/bertopic/combined_score_distribution.png deleted file mode 100644 index 0e45c43..0000000 Binary files a/bertopic/combined_score_distribution.png and /dev/null differ diff --git a/bertopic/nb_bertopic.py b/bertopic/nb_bertopic.py deleted file mode 100644 index dd2ea41..0000000 --- a/bertopic/nb_bertopic.py +++ /dev/null @@ -1,577 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.18.0 -# kernelspec: -# display_name: .venv -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Topic Detection: Bali Tourist Reviews -# - -# %% [markdown] -# ## Preparation -# -# ### Dependency Loading -# - -# %% -import json -import pickle -import re - -import gensim.corpora as corpora -import nltk -import numpy as np -import pandas as pd -import spacy -from bertopic.representation import KeyBERTInspired -from bertopic.vectorizers import ClassTfidfTransformer -from gensim.models.coherencemodel import CoherenceModel -from hdbscan import HDBSCAN -from nltk.corpus import stopwords -from nltk.stem import WordNetLemmatizer -from sentence_transformers import SentenceTransformer -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.metrics.pairwise import cosine_similarity -from umap import UMAP - -from bertopic import BERTopic - -nlp = spacy.load("en_core_web_sm") - -nltk.download("stopwords") -nltk.download("punkt") -nltk.download("wordnet") - -# %% [markdown] -# ### Parameters and Tracking -# - -# %% -RECREATE_MODEL = True -RECREATE_REDUCED_MODEL = True -PROCESS_DATA = False -REDUCE_OUTLIERS = True -USE_CONDENSED_MODEL = False - -DATA_SAMPLE_SIZE = -1 # -1 for all data - -# Classical coherence score. Warning: needs swap to not kill your PC -CALCULATE_COHERENCE = False - -# Vectorization -MIN_DOCUMENT_FREQUENCY = 1 -MAX_NGRAM = 2 - -# HDBSCAN Parameters -MIN_TOPIC_SIZE = 200 -MIN_SAMPLES = 25 - -# UMAP Parameters -N_NEIGHBORS = 15 -N_COMPONENTS = 2 -MIN_DIST = 0.01 - -# Topic Modeling -TOP_N_WORDS = 10 -MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip - -# %% [markdown] -# ### Data Loading & Preprocessing -# - -# %% -if DATA_SAMPLE_SIZE != -1: - reviews = ( - pd.read_csv("../data/original/reviews.tab", sep="\t") - .sample(n=DATA_SAMPLE_SIZE) - .review.dropna() - .to_list() - ) -else: - reviews = ( - pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list() - ) - -print("Loaded {} reviews".format(len(reviews))) - -# %% -# List of NE in Bali for NER enhancement -with open("../data/supporting/bali_ner.json", "r") as f: - bali_places = json.load(f) -bali_places_set = set(bali_places) - -# Stop word definition -extra_stopwords = ["bali", "idr", "usd"] -stop_words = set(stopwords.words("english")) -with open("../data/supporting/stopwords-en.json", "r") as f: - extra_stopwords.extend(json.load(f)) - -# Custom replacements -rep = { - r"\\n": " ", - r"\n": " ", - r'\\"': "", - r'"': "", - "mongkey": "monkey", - "monky": "monkey", - "verry": "very", -} -rep = dict((re.escape(k), v) for k, v in rep.items()) -pattern = re.compile("|".join(rep.keys())) - -lemmatizer = WordNetLemmatizer() - - -def preprocess(text): - # Step 1: Apply custom replacements (typos, special cases) - text = text.lower() - text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text) - - # Step 2: Clean text - text = re.sub(r"\d+", " ", text) - text = re.sub(r"\W+", " ", text) - - doc = nlp(text) - - # Step 3: POS tagging and filtering - filtered_tokens = [ - token.text - for token in doc - if token.pos_ in {"NOUN", "PROPN"} - or token.ent_type_ in {"GPE", "LOC", "FAC"} - or token.text in bali_places_set - ] - - # Step 4: Lemmatization and stopword removal - lemmatized_tokens = [ - lemmatizer.lemmatize(w) - for w in filtered_tokens - if w not in stop_words and w not in extra_stopwords and len(w) > 2 - ] - - return lemmatized_tokens - - -# %% -if PROCESS_DATA: - print("Processing reviews...") - reviews = [preprocess(review) for review in reviews] - - with open("../data/intermediate/processed_texts.pkl", "wb") as f: - pickle.dump(reviews, f) -else: - with open("../data/intermediate/processed_texts.pkl", "rb") as f: - reviews = pickle.load(f) - reviews = [ - " ".join(review) if isinstance(review, list) else review - for review in reviews - ] - -print(reviews[:1]) - -# %% [markdown] -# ### Pre-calculate Embeddings -# - -# %% -embedding_model = SentenceTransformer("all-MiniLM-L6-v2") -embeddings = embedding_model.encode(reviews, show_progress_bar=True) - -# %% [markdown] -# ## Model Creation -# - -# %% [markdown] -# ### Dimensionality Reduction (UMAP) -# - -# %% -umap_model = UMAP( - n_neighbors=N_NEIGHBORS, - n_components=N_COMPONENTS, - min_dist=MIN_DIST, - metric="cosine", - low_memory=True, - random_state=42, -) -reduced_embeddings = umap_model.fit_transform(embeddings) - -# %% [markdown] -# ### BERTopic Model Creation -# - -# %% -if RECREATE_MODEL: - ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) - vectorizer_model = CountVectorizer( - min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM) - ) - - representation_model = KeyBERTInspired() - hdbscan_model = HDBSCAN( - min_cluster_size=MIN_TOPIC_SIZE, - min_samples=MIN_SAMPLES, - metric="euclidean", - cluster_selection_method="eom", - gen_min_span_tree=True, - prediction_data=True, - ) - - topic_model = BERTopic( - embedding_model=embedding_model, - ctfidf_model=ctfidf_model, - vectorizer_model=vectorizer_model, - umap_model=umap_model, - hdbscan_model=hdbscan_model, - representation_model=representation_model, - verbose=True, - calculate_probabilities=True, - language="english", - top_n_words=TOP_N_WORDS, - nr_topics=MAX_TOPICS, - ) - - topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings) - - topic_labels = topic_model.generate_topic_labels( - nr_words=3, topic_prefix=True, word_length=15, separator=" - " - ) - topic_model.set_topic_labels(topic_labels) - BERTopic.save(topic_model, "output/model.bertopic") -else: - print("Nevermind, loading existing model") - topic_model = BERTopic.load("output/model.bertopic") - -# %% [markdown] -# ## Fine Tuning -# -# ### Topic Condensation -# - -# %% -if RECREATE_REDUCED_MODEL: - done = False - iteration = 1 - while not done: - print(f"Iteration {iteration}") - iteration += 1 - similarity_matrix = cosine_similarity( - np.array(topic_model.topic_embeddings_)[1:, :] - ) - nothing_to_merge = True - - for i in range(similarity_matrix.shape[0]): - for j in range(i + 1, similarity_matrix.shape[1]): - sim = similarity_matrix[i, j] - if sim > 0.9: - nothing_to_merge = False - t1, t2 = i, j - try: - t1_name = topic_model.get_topic_info(t1)["CustomName"][0] - t2_name = topic_model.get_topic_info(t2)["CustomName"][0] - print( - f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}" - ) - topic_model.merge_topics(reviews, topics_to_merge=[t1, t2]) - - topic_labels = topic_model.generate_topic_labels( - nr_words=3, - topic_prefix=True, - word_length=15, - separator=" - ", - ) - topic_model.set_topic_labels(topic_labels) - except Exception as e: - print(f"Failed to merge {t1} and {t2}: {e}") - if nothing_to_merge: - print("No more topics to merge.") - done = True - - # BERTopic.save(topic_model, "bertopic/model_reduced.bertopic") -elif USE_CONDENSED_MODEL: - print("Nevermind, loading existing reduced model") - topic_model = BERTopic.load("bertopic/model_reduced.bertopic") -else: - print("Skipping topic reduction") - -# %% [markdown] -# ### Outlier Reduction -# - -# %% -if REDUCE_OUTLIERS: - new_topics = topic_model.reduce_outliers( - reviews, - topic_model.topics_, - probabilities=topic_model.probabilities_, - threshold=0.05, - strategy="probabilities", - ) - topic_model.update_topics(reviews, topics=new_topics) - -# %% [markdown] -# ## Results -# -# ### Classification -# - -# %% -import random -from pathlib import Path - -# --- config --- -topics_to_keep = {2, 4, 6, 8, 10, 5, 7} -INPUT_PATH = "../data/original/reviews.tab" # TSV with a 'review' column -OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv" -OUTPUT_DIR = Path("../raft/corpus") -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - -BATCH_SIZE = 60 -MIN_CHARS = 40 -SEED = 42 - -# --- load data --- -data = pd.read_csv(INPUT_PATH, sep="\t") - -# If you already have `reviews` elsewhere, replace the next line with that variable -reviews = data["review"].astype(str).fillna("") - -# Topic model document info -df = topic_model.get_document_info(reviews) # assumes your model is already fitted -df["Original"] = reviews.values - -# --- filter by topics and length --- -filtered = df[df["Topic"].isin(topics_to_keep)].copy() -filtered["Original"] = filtered["Original"].str.strip() -filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS] - -# Save an audit CSV -filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False) - -# --- deterministic shuffle + write batched corpus files --- -total_files = 0 -total_reviews = 0 -rng = random.Random(SEED) - -for topic_val, g in filtered.groupby("Topic", sort=True): - reviews_list = g["Original"].tolist() - - # deterministic shuffle within topic - rng.shuffle(reviews_list) - - # chunk into batches of up to 60 - for start in range(0, len(reviews_list), BATCH_SIZE): - chunk = reviews_list[start : start + BATCH_SIZE] - if not chunk: - continue - - # simple header for traceability - header = ( - f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n" - ) - - lines = [header, ""] - for i, txt in enumerate(chunk, 1): - lines.append(f"({i}) {txt}") - - part_idx = start // BATCH_SIZE + 1 - fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt" - (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8") - - total_files += 1 - total_reviews += len(chunk) - -print( - f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]" -) -print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]") - -# %% -doc_topic_matrix = probs - -# column names -topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)] - -# index names -docnames = ["Review " + str(i) for i in range(len(reviews))] - -# Make the pandas dataframe -df_document_topic = pd.DataFrame( - np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames -) - -# Get dominant topic for each document -dominant_topic = np.argmax(doc_topic_matrix, axis=1) -df_document_topic["dominant_topic"] = dominant_topic - - -# Styling -def color_stuff(val): - if val > 0.1: - color = "green" - elif val > 0.05: - color = "orange" - else: - color = "grey" - return "color: {col}".format(col=color) - - -def make_bold(val): - weight = 700 if val > 0.1 else 400 - return "font-weight: {weight}".format(weight=weight) - - -# Apply Style -df_document_topics = ( - df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold) -) -df_document_topics - -# %% [markdown] -# ### Document Visualization -# - -# %% -vis = topic_model.visualize_documents( - docs=reviews, - reduced_embeddings=reduced_embeddings, - custom_labels=True, - hide_annotations=True, -) -vis.write_html("output/visualization.html") -vis - -# %% [markdown] -# ### Similarity Matrix -# - -# %% -topic_model.visualize_heatmap() - -# %% [markdown] -# ### Topic Info -# - -# %% -topic_model.get_topic_info() - -# %% [markdown] -# ### Semantic Coherence -# - -# %% -topic_words = [] -for topic_id in topic_model.get_topic_info()["Topic"]: - # Skip outlier topic - if topic_id < 0: - continue - - words = [word for word, _ in topic_model.get_topic(topic_id)] - topic_words.append(words) - -# Compute mean pairwise cosine similarity for each topic -coherence_scores = [] -for words in topic_words: - coherence_embeddings = embedding_model.encode(words) - sim_matrix = cosine_similarity(coherence_embeddings) - - # Ignore self-similarity - np.fill_diagonal(sim_matrix, 0) - mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)]) - coherence_scores.append(mean_sim) - -overall_coherence = np.mean(coherence_scores) - -print(len(reviews), "reviews processed") -print(len(topic_model.get_topic_info()) - 1, "topics found") -print(f"BERT-based Topic Coherence: {overall_coherence:.4f}") - -# %% [markdown] -# ### Topic Coherence -# - -# %% -# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389 - -if CALCULATE_COHERENCE: - # Preprocess Documents - documents = pd.DataFrame( - {"Document": reviews, "ID": range(len(reviews)), "Topic": topics} - ) - documents_per_topic = documents.groupby(["Topic"], as_index=False).agg( - {"Document": " ".join} - ) - cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values) - - # Extract vectorizer and analyzer from BERTopic - vectorizer = topic_model.vectorizer_model - analyzer = vectorizer.build_analyzer() - - # Extract features for Topic Coherence evaluation - words = vectorizer.get_feature_names_out() - tokens = [analyzer(doc) for doc in cleaned_docs] - dictionary = corpora.Dictionary(tokens) - corpus = [dictionary.doc2bow(token) for token in tokens] - topic_words = [ - [words for words, _ in topic_model.get_topic(topic)] - for topic in range(len(set(topics)) - 1) - ] - - # %env TOKENIZERS_PARALLELISM=false - - for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]: - coherence_model = CoherenceModel( - topics=topic_words, - texts=tokens, - corpus=corpus, - dictionary=dictionary, - coherence=measurement, - ) - coherence_score = coherence_model.get_coherence() - print(f"Coherence ({measurement}): {coherence_score:.4f}") -else: - print("Skipping classical coherence calculation") - -# %% [markdown] -# ### Term Search -# - -# %% -search_term = "uluwatu" - -similar_topics, similarities = topic_model.find_topics(search_term, top_n=10) -for i in range(len(similar_topics)): - # \n{topic_model.get_topic(similar_topics[i])}\n - print( - f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}" - ) - -# %% [markdown] -# ### Topic Hierarchy -# - -# %% -topic_model.visualize_hierarchy(custom_labels=True) - -# %% [markdown] -# ### Intertopic Distance Map -# - -# %% -topic_model.visualize_topics() - -# %% [markdown] -# ### Topic Word Scores -# - -# %% -topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10) diff --git a/bertopic/output/autotune.json b/bertopic/output/autotune.json deleted file mode 100644 index 775c5dc..0000000 --- a/bertopic/output/autotune.json +++ /dev/null @@ -1,290 +0,0 @@ -[ - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4456, - "diversity": 0.925, - "combined_score": 0.5894 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4462, - "diversity": 0.925, - "combined_score": 0.5898 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4531, - "diversity": 0.975, - "combined_score": 0.6096 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4617, - "diversity": 0.95, - "combined_score": 0.6082 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4456, - "diversity": 0.925, - "combined_score": 0.5894 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4462, - "diversity": 0.925, - "combined_score": 0.5898 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4531, - "diversity": 0.975, - "combined_score": 0.6096 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4617, - "diversity": 0.95, - "combined_score": 0.6082 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.498, - "diversity": 1.0, - "combined_score": 0.6486 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4915, - "diversity": 0.9666, - "combined_score": 0.634 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4287, - "diversity": 1.0, - "combined_score": 0.6001 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.427, - "diversity": 1.0, - "combined_score": 0.5989 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.498, - "diversity": 1.0, - "combined_score": 0.6486 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4915, - "diversity": 0.9666, - "combined_score": 0.634 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4287, - "diversity": 1.0, - "combined_score": 0.6001 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.427, - "diversity": 1.0, - "combined_score": 0.5989 - } - } -] \ No newline at end of file diff --git a/bertopic/output/autotune_sorted.json b/bertopic/output/autotune_sorted.json deleted file mode 100644 index 7f30e44..0000000 --- a/bertopic/output/autotune_sorted.json +++ /dev/null @@ -1,290 +0,0 @@ -[ - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.498, - "diversity": 1.0, - "combined_score": 0.6486 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.498, - "diversity": 1.0, - "combined_score": 0.6486 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4915, - "diversity": 0.9666, - "combined_score": 0.634 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4915, - "diversity": 0.9666, - "combined_score": 0.634 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4531, - "diversity": 0.975, - "combined_score": 0.6096 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4531, - "diversity": 0.975, - "combined_score": 0.6096 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4617, - "diversity": 0.95, - "combined_score": 0.6082 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4617, - "diversity": 0.95, - "combined_score": 0.6082 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4287, - "diversity": 1.0, - "combined_score": 0.6001 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4287, - "diversity": 1.0, - "combined_score": 0.6001 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.427, - "diversity": 1.0, - "combined_score": 0.5989 - } - }, - { - "params": { - "min_dist": 0.1, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 5, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.427, - "diversity": 1.0, - "combined_score": 0.5989 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4462, - "diversity": 0.925, - "combined_score": 0.5898 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 3, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4462, - "diversity": 0.925, - "combined_score": 0.5898 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 10, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4456, - "diversity": 0.925, - "combined_score": 0.5894 - } - }, - { - "params": { - "min_dist": 0.01, - "min_document_frequency": 1, - "min_samples": 25, - "min_topic_size": 200, - "n_components": 2, - "n_gram_max": 2, - "n_neighbors": 15, - "nr_topics": "auto", - "top_n_words": 10 - }, - "metrics": { - "coherence": 0.4456, - "diversity": 0.925, - "combined_score": 0.5894 - } - } -] \ No newline at end of file diff --git a/bertopic/output/heatmap.html b/bertopic/output/heatmap.html deleted file mode 100644 index 614e2dd..0000000 --- a/bertopic/output/heatmap.html +++ /dev/null @@ -1,3885 +0,0 @@ - -
- -