mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 08:22:43 +01:00
22.02.
This commit is contained in:
@@ -16,10 +16,10 @@ from bertopic import BERTopic
|
||||
|
||||
param_grid = {
|
||||
"n_gram_max": [2, 3], # Vectorization
|
||||
"min_document_frequency": [1], # Vectorization
|
||||
"min_document_frequency": [1, 2], # Vectorization
|
||||
"min_samples": [10, 25], # HDBSCAN
|
||||
"min_topic_size": [10, 20, 30, 40, 50], # HDBSCAN
|
||||
"n_neighbors": [15], # UMAP
|
||||
"min_topic_size": [100, 200], # HDBSCAN
|
||||
"n_neighbors": [15, 25], # UMAP
|
||||
"n_components": [2, 5], # UMAP
|
||||
"min_dist": [0.01, 0.1], # UMAP
|
||||
"nr_topics": ["auto"], # Topic Modeling
|
||||
|
||||
@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
|
||||
with open("output/autotune.json", "r") as f:
|
||||
history = json.load(f)
|
||||
|
||||
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False)
|
||||
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)
|
||||
|
||||
with open("output/autotune_sorted.json", "w") as f:
|
||||
json.dump(history, f, indent=2)
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 21 KiB |
@@ -360,7 +360,6 @@ vis = topic_model.visualize_documents(
|
||||
custom_labels=True,
|
||||
hide_annotations=True,
|
||||
)
|
||||
# vis.write_html("output/visualization.html")
|
||||
vis
|
||||
|
||||
# %%
|
||||
@@ -497,7 +496,12 @@ if CALCULATE_TOKEN_DISTRIBUTIONS:
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_hierarchy(custom_labels=True)
|
||||
topic_model.visualize_hierarchy(custom_labels=True, color_threshold=0.98)
|
||||
|
||||
# %%
|
||||
hierarchical_topics = topic_model.hierarchical_topics(reviews)
|
||||
tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
|
||||
print(tree)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Intertopic Distance Map
|
||||
@@ -512,3 +516,20 @@ topic_model.visualize_topics(use_ctfidf=True)
|
||||
|
||||
# %%
|
||||
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
|
||||
|
||||
# %%
|
||||
from wordcloud import WordCloud
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def create_wordcloud(model, topic):
    """Draw a word cloud for a single topic of ``model`` with matplotlib.

    Args:
        model: Object exposing ``get_topic(topic)``, whose result is iterated
            as ``(word, value)`` pairs.
        topic: Identifier of the topic to visualise.
    """
    # Each word's value is used as its frequency in the cloud.
    frequencies = dict(model.get_topic(topic))

    cloud = WordCloud(background_color="white", max_words=1000)
    cloud.generate_from_frequencies(frequencies)

    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
|
||||
|
||||
|
||||
# Show wordcloud
|
||||
create_wordcloud(topic_model, topic=1)
|
||||
|
||||
519
bertopic/nb_bertopic_temples.py
Normal file
519
bertopic/nb_bertopic_temples.py
Normal file
@@ -0,0 +1,519 @@
|
||||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.18.0
|
||||
# kernelspec:
|
||||
# display_name: .venv (3.12.3)
|
||||
# language: python
|
||||
# name: python3
|
||||
# ---
|
||||
|
||||
# %% [markdown]
|
||||
# # Topic Detection: Bali Tourist Reviews
|
||||
#
|
||||
|
||||
# %% [markdown]
|
||||
# ## Preparation
|
||||
#
|
||||
# ### Dependency Loading
|
||||
#
|
||||
|
||||
# %%
|
||||
import pickle
|
||||
import re
|
||||
|
||||
import gensim.corpora as corpora
|
||||
import nltk
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from bertopic.representation import KeyBERTInspired
|
||||
from bertopic.vectorizers import ClassTfidfTransformer
|
||||
from gensim.models.coherencemodel import CoherenceModel
|
||||
from hdbscan import HDBSCAN
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_extraction import text as skltext
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from umap import UMAP
|
||||
|
||||
from bertopic import BERTopic
|
||||
|
||||
nltk.download("stopwords")
|
||||
nltk.download("punkt")
|
||||
nltk.download("wordnet")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Hyperparameters and Settings
|
||||
#
|
||||
|
||||
# %%
|
||||
RECREATE_MODEL = True
|
||||
RECREATE_REDUCED_MODEL = True
|
||||
PROCESS_DATA = True
|
||||
REDUCE_OUTLIERS = False
|
||||
CALCULATE_TOKEN_DISTRIBUTIONS = False
|
||||
|
||||
# Data Sample Size, -1 for all data
|
||||
DATA_SAMPLE_SIZE = -1
|
||||
|
||||
# Vectorization
|
||||
MIN_DOCUMENT_FREQUENCY = 1
|
||||
MAX_NGRAM = 3
|
||||
|
||||
# HDBSCAN Parameters
|
||||
MIN_TOPIC_SIZE = 15
|
||||
MIN_SAMPLES = 15
|
||||
|
||||
# UMAP Parameters
|
||||
N_NEIGHBORS = 15
|
||||
N_COMPONENTS = 2
|
||||
MIN_DIST = 0.01
|
||||
|
||||
# Topic Modeling
|
||||
TOP_N_WORDS = 10
|
||||
MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip
|
||||
|
||||
TF_IDF_STOP_WORDS = ["bali", "place", "visit", "visited", "visiting"]
|
||||
|
||||
# %% [markdown]
|
||||
# ### Data Loading & Preprocessing
|
||||
#
|
||||
|
||||
# %%
|
||||
# Import data after general preprocessing
|
||||
|
||||
if DATA_SAMPLE_SIZE == -1:
|
||||
reviews = pd.read_csv(
|
||||
"../data/intermediate/culture_reviews.csv", sep=","
|
||||
).Original.to_list()
|
||||
else:
|
||||
reviews = (
|
||||
pd.read_csv("../data/intermediate/culture_reviews.csv", sep=",")
|
||||
.sample(n=DATA_SAMPLE_SIZE)
|
||||
.Original.to_list()
|
||||
)
|
||||
|
||||
print("Loaded {} reviews".format(len(reviews)))
|
||||
|
||||
# %%
|
||||
# Substitutions applied to every review. Keys are regular expressions and are
# compiled as written. They were previously passed through re.escape(), which
# turned them into literal strings, so real newlines and whitespace runs were
# never normalised.
rep = {
    r"\\+n": " ",  # newline that survived as literal text, e.g. \n or \\n
    r'\\*"': "",  # double quote, with or without escaping backslashes
}
# One capture group per entry, so Match.lastindex identifies which rule matched.
pattern = re.compile("|".join(f"({regex})" for regex in rep))
_replacements = tuple(rep.values())
# Applied after the substitutions above so the spaces they introduce collapse too.
_whitespace_run = re.compile(r"\s+")


def preprocess(text):
    """Normalise a raw review string for embedding and vectorisation.

    The text is lower-cased, literal ``\\n`` escape sequences become spaces,
    double quotes (optionally backslash-escaped) are removed, every run of
    whitespace (including real newlines and tabs) collapses to one space, and
    leading/trailing whitespace is stripped.

    Args:
        text: The raw review text.

    Returns:
        The cleaned, single-line, lower-case text.
    """
    text = text.lower()
    text = pattern.sub(lambda match: _replacements[match.lastindex - 1], text)
    # Strip last: a replaced escape sequence at either end leaves a space behind.
    return _whitespace_run.sub(" ", text).strip()
|
||||
|
||||
|
||||
# %%
|
||||
print(
|
||||
preprocess(
|
||||
"Excellent. Definitely worth coming while in bali. Food and people were very nice.\n🌟 🤩 ⭐️ \nTrisna was our host"
|
||||
)
|
||||
)
|
||||
|
||||
# %%
|
||||
if PROCESS_DATA:
|
||||
print("Processing reviews...")
|
||||
reviews = [preprocess(review) for review in reviews]
|
||||
|
||||
with open("../data/intermediate/processed_texts_culture.pkl", "wb") as f:
|
||||
pickle.dump(reviews, f)
|
||||
else:
|
||||
with open("../data/intermediate/processed_texts_culture.pkl", "rb") as f:
|
||||
reviews = pickle.load(f)
|
||||
|
||||
print(reviews[:1])
|
||||
|
||||
# %% [markdown]
|
||||
# ### Pre-calculate Embeddings
|
||||
#
|
||||
|
||||
# %%
|
||||
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
embeddings = embedding_model.encode(reviews, show_progress_bar=True)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Model Creation
|
||||
#
|
||||
|
||||
# %% [markdown]
|
||||
# ### Dimensionality Reduction (UMAP)
|
||||
#
|
||||
|
||||
# %%
|
||||
umap_model = UMAP(
|
||||
n_neighbors=N_NEIGHBORS,
|
||||
n_components=N_COMPONENTS,
|
||||
min_dist=MIN_DIST,
|
||||
metric="cosine",
|
||||
low_memory=True,
|
||||
random_state=42,
|
||||
)
|
||||
reduced_embeddings = umap_model.fit_transform(embeddings)
|
||||
|
||||
# %% [markdown]
|
||||
# ### BERTopic Model Creation
|
||||
#
|
||||
|
||||
# %%
|
||||
if RECREATE_MODEL:
|
||||
stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS))
|
||||
|
||||
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
||||
vectorizer_model = CountVectorizer(
|
||||
min_df=MIN_DOCUMENT_FREQUENCY,
|
||||
ngram_range=(1, MAX_NGRAM),
|
||||
stop_words=stop_words,
|
||||
)
|
||||
|
||||
representation_model = KeyBERTInspired()
|
||||
hdbscan_model = HDBSCAN(
|
||||
min_cluster_size=MIN_TOPIC_SIZE,
|
||||
min_samples=MIN_SAMPLES,
|
||||
metric="euclidean",
|
||||
cluster_selection_method="eom",
|
||||
gen_min_span_tree=True,
|
||||
prediction_data=True,
|
||||
)
|
||||
|
||||
topic_model = BERTopic(
|
||||
embedding_model=embedding_model,
|
||||
ctfidf_model=ctfidf_model,
|
||||
vectorizer_model=vectorizer_model,
|
||||
umap_model=umap_model,
|
||||
hdbscan_model=hdbscan_model,
|
||||
representation_model=representation_model,
|
||||
verbose=True,
|
||||
calculate_probabilities=True,
|
||||
language="english",
|
||||
top_n_words=TOP_N_WORDS,
|
||||
nr_topics=MAX_TOPICS,
|
||||
)
|
||||
|
||||
topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)
|
||||
|
||||
topic_labels = topic_model.generate_topic_labels(
|
||||
nr_words=3, topic_prefix=True, word_length=15, separator=" - "
|
||||
)
|
||||
topic_model.set_topic_labels(topic_labels)
|
||||
# BERTopic.save(topic_model, "bertopic/model.bertopic")
|
||||
else:
|
||||
print("Nevermind, loading existing model")
|
||||
# topic_model = BERTopic.load("bertopic/model.bertopic")
|
||||
|
||||
# %% [markdown]
|
||||
# ## Fine Tuning
|
||||
#
|
||||
# ### Topic Condensation
|
||||
#
|
||||
|
||||
# %%
|
||||
if RECREATE_REDUCED_MODEL:
    # Repeatedly merge the first pair of topics whose embeddings have a cosine
    # similarity above 0.9 until no such pair is left.
    #
    # Exactly one pair is merged per pass, and the similarity matrix is rebuilt
    # before the next pair is chosen. Topic ids are renumbered by a merge, so
    # indices taken from a matrix computed before the merge would no longer
    # point at the same topics.
    iteration = 1
    # Pairs whose merge raised. They are skipped so a pair that fails every
    # time cannot keep the loop running forever. Cleared after each successful
    # merge, because the ids then refer to different topics.
    failed_pairs = set()

    while True:
        print(f"Iteration {iteration}")
        iteration += 1

        # NOTE(review): row 0 is dropped on the assumption that it belongs to
        # the outlier topic (-1), so that matrix index i equals topic id i.
        # Confirm this holds when the model has no outliers.
        similarity_matrix = cosine_similarity(
            np.array(topic_model.topic_embeddings_)[1:, :]
        )

        # Upper triangle only (k=1): each unordered pair once, no self-pairs.
        rows, cols = np.where(np.triu(similarity_matrix, k=1) > 0.9)
        candidates = [
            (int(i), int(j))
            for i, j in zip(rows, cols)
            if (int(i), int(j)) not in failed_pairs
        ]

        if not candidates:
            print("No more topics to merge.")
            break

        t1, t2 = candidates[0]
        sim = similarity_matrix[t1, t2]

        try:
            t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
            t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
            print(
                f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
            )
            topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])

            # Labels are derived from topic contents; regenerate after merging.
            topic_labels = topic_model.generate_topic_labels(
                nr_words=3,
                topic_prefix=True,
                word_length=15,
                separator=" - ",
            )
            topic_model.set_topic_labels(topic_labels)
        except Exception as e:
            # Best effort: report the failure and move on to the next pair.
            print(f"Failed to merge {t1} and {t2}: {e}")
            failed_pairs.add((t1, t2))
        else:
            failed_pairs.clear()
else:
    print("Skipping topic reduction")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Outlier Reduction
|
||||
#
|
||||
|
||||
# %%
|
||||
if REDUCE_OUTLIERS:
|
||||
new_topics = topic_model.reduce_outliers(
|
||||
reviews,
|
||||
topic_model.topics_,
|
||||
probabilities=topic_model.probabilities_,
|
||||
threshold=0.05,
|
||||
strategy="probabilities",
|
||||
)
|
||||
topic_model.update_topics(reviews, topics=new_topics)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Results
|
||||
#
|
||||
# ### Classification
|
||||
#
|
||||
|
||||
# %%
|
||||
CLASSIFICATION = False
|
||||
if CLASSIFICATION:
|
||||
topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
|
||||
INPUT_PATH = "../data/intermediate/preprocessed.tab" # TSV with a 'review' column
|
||||
OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
|
||||
|
||||
# Topic model document info
|
||||
df = topic_model.get_document_info(reviews)
|
||||
df["Original"] = reviews
|
||||
|
||||
# --- filter by topics and length ---
|
||||
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
|
||||
filtered["Original"] = filtered["Original"].str.strip()
|
||||
|
||||
# Save an audit CSV
|
||||
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
|
||||
print(f"Filtered CSV file saved to {OUTPUT_CSV}")
|
||||
|
||||
# %%
|
||||
doc_topic_matrix = probs
|
||||
|
||||
# column names
|
||||
topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]
|
||||
|
||||
# index names
|
||||
docnames = ["Review " + str(i) for i in range(len(reviews))]
|
||||
|
||||
# Make the pandas dataframe
|
||||
df_document_topic = pd.DataFrame(
|
||||
np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
|
||||
)
|
||||
|
||||
# Get dominant topic for each document
|
||||
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
|
||||
df_document_topic["dominant_topic"] = dominant_topic
|
||||
|
||||
|
||||
# Styling
|
||||
def color_stuff(val):
    """Return a CSS ``color`` declaration chosen by the size of ``val``.

    Values above 0.1 are green, values above 0.05 are orange, and everything
    else is grey.
    """
    template = "color: {col}"
    # Thresholds are checked from highest to lowest; the first one exceeded wins.
    for lower_bound, shade in ((0.1, "green"), (0.05, "orange")):
        if val > lower_bound:
            return template.format(col=shade)
    return template.format(col="grey")
|
||||
|
||||
|
||||
def make_bold(val):
    """Return a CSS ``font-weight`` declaration: 700 above 0.1, otherwise 400."""
    if val > 0.1:
        thickness = 700
    else:
        thickness = 400
    return "font-weight: {weight}".format(weight=thickness)
|
||||
|
||||
|
||||
# Apply Style
|
||||
df_document_topics = (
|
||||
df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
|
||||
)
|
||||
df_document_topics
|
||||
|
||||
# %% [markdown]
|
||||
# ### Document Visualization
|
||||
#
|
||||
|
||||
# %%
|
||||
vis = topic_model.visualize_documents(
|
||||
docs=reviews,
|
||||
reduced_embeddings=reduced_embeddings,
|
||||
custom_labels=True,
|
||||
hide_annotations=True,
|
||||
)
|
||||
# vis.write_html("output/visualization.html")
|
||||
vis
|
||||
|
||||
# %%
|
||||
topic_model.visualize_document_datamap(reviews, reduced_embeddings=reduced_embeddings)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Similarity Matrix
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_heatmap()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Info
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.get_topic_info()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Semantic Coherence
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_words = []
|
||||
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||
# Skip outlier topic
|
||||
if topic_id < 0:
|
||||
continue
|
||||
|
||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||
topic_words.append(words)
|
||||
|
||||
# Compute mean pairwise cosine similarity for each topic
|
||||
coherence_scores = []
|
||||
for words in topic_words:
|
||||
coherence_embeddings = embedding_model.encode(words)
|
||||
sim_matrix = cosine_similarity(coherence_embeddings)
|
||||
|
||||
# Ignore self-similarity
|
||||
np.fill_diagonal(sim_matrix, 0)
|
||||
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||
coherence_scores.append(mean_sim)
|
||||
|
||||
overall_coherence = np.mean(coherence_scores)
|
||||
|
||||
print(len(reviews), "reviews processed")
|
||||
print(len(topic_model.get_topic_info()) - 1, "topics found")
|
||||
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Coherence
|
||||
#
|
||||
|
||||
# %%
|
||||
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389
|
||||
|
||||
# This will most likely crash your PC
|
||||
this_will_crash_your_pc_are_you_sure = False
|
||||
if this_will_crash_your_pc_are_you_sure:
|
||||
# Preprocess Documents
|
||||
documents = pd.DataFrame(
|
||||
{"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
|
||||
)
|
||||
documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
|
||||
{"Document": " ".join}
|
||||
)
|
||||
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
|
||||
|
||||
# Extract vectorizer and analyzer from BERTopic
|
||||
vectorizer = topic_model.vectorizer_model
|
||||
analyzer = vectorizer.build_analyzer()
|
||||
|
||||
# Extract features for Topic Coherence evaluation
|
||||
words = vectorizer.get_feature_names_out()
|
||||
tokens = [analyzer(doc) for doc in cleaned_docs]
|
||||
dictionary = corpora.Dictionary(tokens)
|
||||
corpus = [dictionary.doc2bow(token) for token in tokens]
|
||||
|
||||
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||
# Skip outlier topic
|
||||
if topic_id < 0:
|
||||
continue
|
||||
|
||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||
topic_words.append(words)
|
||||
|
||||
# %env TOKENIZERS_PARALLELISM=false
|
||||
|
||||
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
|
||||
coherence_model = CoherenceModel(
|
||||
topics=topic_words,
|
||||
texts=tokens,
|
||||
corpus=corpus,
|
||||
dictionary=dictionary,
|
||||
coherence=measurement,
|
||||
)
|
||||
coherence_score = coherence_model.get_coherence()
|
||||
print(f"Coherence ({measurement}): {coherence_score:.4f}")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Term Search
|
||||
#
|
||||
|
||||
# %%
|
||||
search_term = "lempuyang"
|
||||
|
||||
similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
|
||||
for i in range(len(similar_topics)):
|
||||
print(
|
||||
f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])['CustomName'][0]}"
|
||||
)
|
||||
|
||||
# %%
|
||||
# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
|
||||
# Calculate the topic distributions on a token-level
|
||||
|
||||
if CALCULATE_TOKEN_DISTRIBUTIONS:
|
||||
topic_distr, topic_token_distr = topic_model.approximate_distribution(
|
||||
reviews, calculate_tokens=True, use_embedding_model=True
|
||||
)
|
||||
|
||||
# %%
|
||||
# Visualize the token-level distributions
|
||||
if CALCULATE_TOKEN_DISTRIBUTIONS:
|
||||
DOC_INDEX = 1
|
||||
df = topic_model.visualize_approximate_distribution(
|
||||
reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
|
||||
)
|
||||
df
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Hierarchy
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_hierarchy(custom_labels=True)
|
||||
|
||||
# %%
|
||||
hierarchical_topics = topic_model.hierarchical_topics(reviews)
|
||||
tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
|
||||
print(tree)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Intertopic Distance Map
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_topics(use_ctfidf=True)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Topic Word Scores
|
||||
#
|
||||
|
||||
# %%
|
||||
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)
|
||||
File diff suppressed because it is too large
Load Diff
290
bertopic/output/autotune_sorted.json
Normal file
290
bertopic/output/autotune_sorted.json
Normal file
@@ -0,0 +1,290 @@
|
||||
[
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.498,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6486
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4915,
|
||||
"diversity": 0.9666,
|
||||
"combined_score": 0.634
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4531,
|
||||
"diversity": 0.975,
|
||||
"combined_score": 0.6096
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4617,
|
||||
"diversity": 0.95,
|
||||
"combined_score": 0.6082
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4287,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.6001
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.1,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 5,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.427,
|
||||
"diversity": 1.0,
|
||||
"combined_score": 0.5989
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 3,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4462,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5898
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 10,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
},
|
||||
{
|
||||
"params": {
|
||||
"min_dist": 0.01,
|
||||
"min_document_frequency": 1,
|
||||
"min_samples": 25,
|
||||
"min_topic_size": 200,
|
||||
"n_components": 2,
|
||||
"n_gram_max": 2,
|
||||
"n_neighbors": 15,
|
||||
"nr_topics": "auto",
|
||||
"top_n_words": 10
|
||||
},
|
||||
"metrics": {
|
||||
"coherence": 0.4456,
|
||||
"diversity": 0.925,
|
||||
"combined_score": 0.5894
|
||||
}
|
||||
}
|
||||
]
|
||||
File diff suppressed because one or more lines are too long
@@ -131,3 +131,4 @@ spacy
|
||||
nbconvert
|
||||
jupytext
|
||||
datamapplot
|
||||
wordcloud
|
||||
|
||||
Reference in New Issue
Block a user