mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
BERTopic cleanup
This commit is contained in:
14
README.md
Normal file
14
README.md
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# Masterthesis, praktischer Anteil
|
||||||
|
|
||||||
|
## Jupyter Notebooks "rehydrieren"
|
||||||
|
|
||||||
|
Damit keine unnötigen Jupyter-Outputs etc. im Versionsmanagement landen, gibt es das Skript `convert_jupytext.sh`, welches nur den notwendigen Quelltext in ein `.py`-File schreibt. Mit demselben Skript kann dieser Schritt wieder umgekehrt werden, also ein Jupyter Notebook aus dem Python-File geschrieben werden.
|
||||||
|
|
||||||
|
Das Skript sollte also immer vor dem Committen von Änderungen mit `py` als erstem Argument ausgeführt werden.
|
||||||
|
|
||||||
|
Verwendung:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./convert_jupytext.sh py # Jupyter Notebook -> Python
|
||||||
|
./convert_jupytext.sh nb # Python -> Jupyter Notebook
|
||||||
|
```
|
||||||
@@ -3,6 +3,8 @@ import traceback
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from bertopic.representation import KeyBERTInspired
|
||||||
|
from bertopic.vectorizers import ClassTfidfTransformer
|
||||||
from hdbscan import HDBSCAN
|
from hdbscan import HDBSCAN
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
@@ -12,55 +14,50 @@ from sklearn.model_selection import ParameterGrid
|
|||||||
from umap import UMAP
|
from umap import UMAP
|
||||||
|
|
||||||
from bertopic import BERTopic
|
from bertopic import BERTopic
|
||||||
from bertopic.representation import KeyBERTInspired
|
|
||||||
from bertopic.vectorizers import ClassTfidfTransformer
|
|
||||||
|
|
||||||
param_grid = {
|
param_grid = {
|
||||||
"nr_topics": [45, 50, 55],
|
"n_gram_max": [2, 3], # Vectorization
|
||||||
"min_topic_size": [30, 40, 50],
|
"min_document_frequency": [1], # Vectorization
|
||||||
"n_gram_max": [3],
|
"min_samples": [10, 25], # HDBSCAN
|
||||||
"min_document_frequency": [1, 2],
|
"min_topic_size": [10, 20, 30, 40, 50], # HDBSCAN
|
||||||
"n_neighbors": [15],
|
"n_neighbors": [15], # UMAP
|
||||||
"n_components": [2],
|
"n_components": [2, 5], # UMAP
|
||||||
"min_dist": [0.1],
|
"min_dist": [0.01, 0.1], # UMAP
|
||||||
"top_n_words": [10],
|
"nr_topics": ["auto"], # Topic Modeling
|
||||||
|
"top_n_words": [10, 13, 15, 17, 20], # Topic Modeling
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def calculate_metrics(topic_model, embedder, top_n_words=5):
|
def calculate_metrics(topic_model, embedder, top_n_words=10):
|
||||||
# Get topic words
|
# Get topic words
|
||||||
topic_words = []
|
topic_words = []
|
||||||
for topic_id in range(len(topic_model.get_topic_info()) - 1):
|
for topic_id in range(len(topic_model.get_topic_info()) - 1):
|
||||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||||
topic_words.append(words[:top_n_words])
|
topic_words.append(words[:top_n_words])
|
||||||
|
|
||||||
|
# Pre-compute embeddings for all unique words
|
||||||
|
all_words = list(set(word for words in topic_words for word in words))
|
||||||
|
word_embeddings = embedder.encode(all_words)
|
||||||
|
embedding_map = {word: emb for word, emb in zip(all_words, word_embeddings)}
|
||||||
|
|
||||||
# Coherence
|
# Coherence
|
||||||
coherence_scores = []
|
coherence_scores = []
|
||||||
for words in topic_words:
|
for words in topic_words:
|
||||||
embeddings = embedder.encode(words)
|
embeddings = np.array([embedding_map[word] for word in words])
|
||||||
sim_matrix = cosine_similarity(embeddings)
|
sim_matrix = cosine_similarity(embeddings)
|
||||||
np.fill_diagonal(sim_matrix, 0)
|
np.fill_diagonal(sim_matrix, 0)
|
||||||
coherence_scores.append(np.mean(sim_matrix))
|
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||||
|
coherence_scores.append(mean_sim)
|
||||||
overall_coherence = np.mean(coherence_scores)
|
overall_coherence = np.mean(coherence_scores)
|
||||||
|
|
||||||
# Diversity
|
# Diversity
|
||||||
all_topic_words = [word for topic in topic_words for word in topic]
|
all_topic_words = [word for topic in topic_words for word in topic]
|
||||||
diversity = len(set(all_topic_words)) / len(all_topic_words)
|
diversity = len(set(all_topic_words)) / len(all_topic_words)
|
||||||
|
|
||||||
# Inter-topic distance
|
|
||||||
topic_embeddings = [
|
|
||||||
np.mean(embedder.encode(words), axis=0) for words in topic_words
|
|
||||||
]
|
|
||||||
topic_distance = pairwise_distances(topic_embeddings, metric="cosine")
|
|
||||||
avg_distance = np.mean(topic_distance[np.triu_indices_from(topic_distance, k=1)])
|
|
||||||
|
|
||||||
res = {
|
res = {
|
||||||
"coherence": float(str(overall_coherence)[:6]),
|
"coherence": float(str(overall_coherence)[:6]),
|
||||||
"diversity": float(str(diversity)[:6]),
|
"diversity": float(str(diversity)[:6]),
|
||||||
"inter_topic_distance": float(str(avg_distance)[:6]),
|
"combined_score": float(str(0.7 * overall_coherence + 0.3 * diversity)[:6]),
|
||||||
"combined_score": float(
|
|
||||||
str(0.6 * overall_coherence + 0.2 * diversity + 0.2 * avg_distance)[:6]
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
print(res)
|
print(res)
|
||||||
return res
|
return res
|
||||||
@@ -85,6 +82,7 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
|
|||||||
|
|
||||||
print(f"Total parameter combinations: {len(param_list)}")
|
print(f"Total parameter combinations: {len(param_list)}")
|
||||||
for params in param_list:
|
for params in param_list:
|
||||||
|
print(f"Testing param combination no. {len(history) + 1}/{len(param_list)}...")
|
||||||
try:
|
try:
|
||||||
print(f"Testing params: {params}")
|
print(f"Testing params: {params}")
|
||||||
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
|
||||||
@@ -143,18 +141,27 @@ def auto_tune_bertopic(texts, embedding_model, param_grid):
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return best_model, best_params, best_score, history
|
with open("output/autotune.json", "w") as f:
|
||||||
|
json.dump(history, f, indent=2)
|
||||||
|
|
||||||
|
return best_model, best_params, best_score
|
||||||
|
|
||||||
|
|
||||||
SPECIAL_CHARS = ["\n", "\\n"]
|
SPECIAL_CHARS = ["\n", "\\n"]
|
||||||
MIN_REVIEW_WORDS = 5
|
MIN_REVIEW_WORDS = 5
|
||||||
|
|
||||||
reviews = pd.read_csv("data.tab", sep="\t").review.to_list()
|
print("Loading reviews...")
|
||||||
|
reviews = pd.read_csv("../data/original/reviews.tab", sep="\t").review.to_list()
|
||||||
|
|
||||||
|
print("Running light preprocessing...")
|
||||||
for schar in SPECIAL_CHARS:
|
for schar in SPECIAL_CHARS:
|
||||||
reviews = [
|
reviews = [
|
||||||
review.replace(schar, " ") if isinstance(review, str) else review
|
review.replace(schar, " ") if isinstance(review, str) else review
|
||||||
for review in reviews
|
for review in reviews
|
||||||
]
|
]
|
||||||
|
|
||||||
|
print("Filtering short reviews...")
|
||||||
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
|
reviews = [review for review in reviews if len(str(review).split()) >= MIN_REVIEW_WORDS]
|
||||||
|
|
||||||
|
print("Staring auto-tuning...")
|
||||||
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
|
print(auto_tune_bertopic(reviews, "all-MiniLM-L6-v2", param_grid))
|
||||||
|
|||||||
@@ -2,12 +2,12 @@ import json
|
|||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
with open("history.json", "r") as f:
|
with open("output/autotune.json", "r") as f:
|
||||||
history = json.load(f)
|
history = json.load(f)
|
||||||
|
|
||||||
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=True)
|
history = sorted(history, key=lambda x: x["metrics"]["combined_score"], reverse=False)
|
||||||
|
|
||||||
with open("history_sorted.json", "w") as f:
|
with open("output/autotune_sorted.json", "w") as f:
|
||||||
json.dump(history, f, indent=2)
|
json.dump(history, f, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
BIN
bertopic/combined_score_distribution.png
Normal file
BIN
bertopic/combined_score_distribution.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 16 KiB |
@@ -23,7 +23,15 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
from bertopic import BERTopic
|
import json
|
||||||
|
import pickle
|
||||||
|
import re
|
||||||
|
|
||||||
|
import gensim.corpora as corpora
|
||||||
|
import nltk
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import spacy
|
||||||
from bertopic.representation import KeyBERTInspired
|
from bertopic.representation import KeyBERTInspired
|
||||||
from bertopic.vectorizers import ClassTfidfTransformer
|
from bertopic.vectorizers import ClassTfidfTransformer
|
||||||
from gensim.models.coherencemodel import CoherenceModel
|
from gensim.models.coherencemodel import CoherenceModel
|
||||||
@@ -34,14 +42,8 @@ from sentence_transformers import SentenceTransformer
|
|||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from umap import UMAP
|
from umap import UMAP
|
||||||
import gensim.corpora as corpora
|
|
||||||
import json
|
from bertopic import BERTopic
|
||||||
import nltk
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import re
|
|
||||||
import spacy
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
|
||||||
@@ -323,8 +325,8 @@ if REDUCE_OUTLIERS:
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
from pathlib import Path
|
|
||||||
import random
|
import random
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
# --- config ---
|
# --- config ---
|
||||||
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
|
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
|
||||||
@@ -468,7 +470,11 @@ topic_model.get_topic_info()
|
|||||||
|
|
||||||
# %%
|
# %%
|
||||||
topic_words = []
|
topic_words = []
|
||||||
for topic_id in range(len(topic_model.get_topic_info()) - 1):
|
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||||
|
# Skip outlier topic
|
||||||
|
if topic_id < 0:
|
||||||
|
continue
|
||||||
|
|
||||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||||
topic_words.append(words)
|
topic_words.append(words)
|
||||||
|
|
||||||
@@ -477,8 +483,10 @@ coherence_scores = []
|
|||||||
for words in topic_words:
|
for words in topic_words:
|
||||||
coherence_embeddings = embedding_model.encode(words)
|
coherence_embeddings = embedding_model.encode(words)
|
||||||
sim_matrix = cosine_similarity(coherence_embeddings)
|
sim_matrix = cosine_similarity(coherence_embeddings)
|
||||||
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
|
|
||||||
mean_sim = np.mean(sim_matrix)
|
# Ignore self-similarity
|
||||||
|
np.fill_diagonal(sim_matrix, 0)
|
||||||
|
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||||
coherence_scores.append(mean_sim)
|
coherence_scores.append(mean_sim)
|
||||||
|
|
||||||
overall_coherence = np.mean(coherence_scores)
|
overall_coherence = np.mean(coherence_scores)
|
||||||
|
|||||||
@@ -23,7 +23,14 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
from bertopic import BERTopic
|
import pickle
|
||||||
|
import re
|
||||||
|
|
||||||
|
import gensim.corpora as corpora
|
||||||
|
import nltk
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import spacy
|
||||||
from bertopic.representation import KeyBERTInspired
|
from bertopic.representation import KeyBERTInspired
|
||||||
from bertopic.vectorizers import ClassTfidfTransformer
|
from bertopic.vectorizers import ClassTfidfTransformer
|
||||||
from gensim.models.coherencemodel import CoherenceModel
|
from gensim.models.coherencemodel import CoherenceModel
|
||||||
@@ -33,13 +40,8 @@ from sentence_transformers import SentenceTransformer
|
|||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from umap import UMAP
|
from umap import UMAP
|
||||||
import gensim.corpora as corpora
|
|
||||||
import nltk
|
from bertopic import BERTopic
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import re
|
|
||||||
import spacy
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
|
||||||
@@ -300,8 +302,8 @@ if REDUCE_OUTLIERS:
|
|||||||
#
|
#
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
from pathlib import Path
|
|
||||||
import random
|
import random
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
# --- config ---
|
# --- config ---
|
||||||
topics_to_keep = {2, 4, 5, 9, 22, 26}
|
topics_to_keep = {2, 4, 5, 9, 22, 26}
|
||||||
@@ -445,7 +447,11 @@ topic_model.get_topic_info()
|
|||||||
|
|
||||||
# %%
|
# %%
|
||||||
topic_words = []
|
topic_words = []
|
||||||
for topic_id in range(len(topic_model.get_topic_info()) - 1):
|
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||||
|
# Skip outlier topic
|
||||||
|
if topic_id < 0:
|
||||||
|
continue
|
||||||
|
|
||||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||||
topic_words.append(words)
|
topic_words.append(words)
|
||||||
|
|
||||||
@@ -454,8 +460,10 @@ coherence_scores = []
|
|||||||
for words in topic_words:
|
for words in topic_words:
|
||||||
coherence_embeddings = embedding_model.encode(words)
|
coherence_embeddings = embedding_model.encode(words)
|
||||||
sim_matrix = cosine_similarity(coherence_embeddings)
|
sim_matrix = cosine_similarity(coherence_embeddings)
|
||||||
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
|
|
||||||
mean_sim = np.mean(sim_matrix)
|
# Ignore self-similarity
|
||||||
|
np.fill_diagonal(sim_matrix, 0)
|
||||||
|
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||||
coherence_scores.append(mean_sim)
|
coherence_scores.append(mean_sim)
|
||||||
|
|
||||||
overall_coherence = np.mean(coherence_scores)
|
overall_coherence = np.mean(coherence_scores)
|
||||||
@@ -492,10 +500,14 @@ if this_will_crash_your_pc_are_you_sure:
|
|||||||
tokens = [analyzer(doc) for doc in cleaned_docs]
|
tokens = [analyzer(doc) for doc in cleaned_docs]
|
||||||
dictionary = corpora.Dictionary(tokens)
|
dictionary = corpora.Dictionary(tokens)
|
||||||
corpus = [dictionary.doc2bow(token) for token in tokens]
|
corpus = [dictionary.doc2bow(token) for token in tokens]
|
||||||
topic_words = [
|
|
||||||
[words for words, _ in topic_model.get_topic(topic)]
|
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||||
for topic in range(len(set(topics)) - 1)
|
# Skip outlier topic
|
||||||
]
|
if topic_id < 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||||
|
topic_words.append(words)
|
||||||
|
|
||||||
# %env TOKENIZERS_PARALLELISM=false
|
# %env TOKENIZERS_PARALLELISM=false
|
||||||
|
|
||||||
|
|||||||
1298
bertopic/output/autotune.json
Normal file
1298
bertopic/output/autotune.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user