mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
BERTopic cleanup
This commit is contained in:
@@ -23,7 +23,15 @@
|
||||
#
|
||||
|
||||
# %%
|
||||
from bertopic import BERTopic
|
||||
import json
|
||||
import pickle
|
||||
import re
|
||||
|
||||
import gensim.corpora as corpora
|
||||
import nltk
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import spacy
|
||||
from bertopic.representation import KeyBERTInspired
|
||||
from bertopic.vectorizers import ClassTfidfTransformer
|
||||
from gensim.models.coherencemodel import CoherenceModel
|
||||
@@ -34,14 +42,8 @@ from sentence_transformers import SentenceTransformer
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from umap import UMAP
|
||||
import gensim.corpora as corpora
|
||||
import json
|
||||
import nltk
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import re
|
||||
import spacy
|
||||
import pickle
|
||||
|
||||
from bertopic import BERTopic
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
||||
@@ -323,8 +325,8 @@ if REDUCE_OUTLIERS:
|
||||
#
|
||||
|
||||
# %%
|
||||
from pathlib import Path
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
# --- config ---
|
||||
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
|
||||
@@ -468,7 +470,11 @@ topic_model.get_topic_info()
|
||||
|
||||
# %%
|
||||
topic_words = []
|
||||
for topic_id in range(len(topic_model.get_topic_info()) - 1):
|
||||
for topic_id in topic_model.get_topic_info()["Topic"]:
|
||||
# Skip outlier topic
|
||||
if topic_id < 0:
|
||||
continue
|
||||
|
||||
words = [word for word, _ in topic_model.get_topic(topic_id)]
|
||||
topic_words.append(words)
|
||||
|
||||
@@ -477,8 +483,10 @@ coherence_scores = []
|
||||
for words in topic_words:
|
||||
coherence_embeddings = embedding_model.encode(words)
|
||||
sim_matrix = cosine_similarity(coherence_embeddings)
|
||||
np.fill_diagonal(sim_matrix, 0) # Ignore self-similarity
|
||||
mean_sim = np.mean(sim_matrix)
|
||||
|
||||
# Ignore self-similarity
|
||||
np.fill_diagonal(sim_matrix, 0)
|
||||
mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
|
||||
coherence_scores.append(mean_sim)
|
||||
|
||||
overall_coherence = np.mean(coherence_scores)
|
||||
@@ -518,8 +526,8 @@ if CALCULATE_COHERENCE:
|
||||
for topic in range(len(set(topics)) - 1)
|
||||
]
|
||||
|
||||
# %env TOKENIZERS_PARALLELISM=false
|
||||
|
||||
# %env TOKENIZERS_PARALLELISM=false
|
||||
|
||||
for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
|
||||
coherence_model = CoherenceModel(
|
||||
topics=topic_words,
|
||||
|
||||
Reference in New Issue
Block a user