mirror of https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 08:22:43 +01:00
RAFT updates, BERTopic config, cleanup
@@ -47,14 +47,15 @@ nltk.download("punkt")
 nltk.download("wordnet")
 
 # %% [markdown]
-# ### Parameters and Tracking
+# ### Hyperparameters and Settings
 #
 
 # %%
 RECREATE_MODEL = True
 RECREATE_REDUCED_MODEL = True
-PROCESS_DATA = True
+PROCESS_DATA = False
 REDUCE_OUTLIERS = False
+CALCULATE_TOKEN_DISTRIBUTIONS = False
 
 # Data Sample Size, -1 for all data
 DATA_SAMPLE_SIZE = -1
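Note: DATA_SAMPLE_SIZE = -1 keeps the full dataset. A minimal sketch of that sampling convention (the dataframe name and seed are assumptions, not code from this commit):

    # -1 keeps every review; any positive value draws a reproducible random sample
    if DATA_SAMPLE_SIZE > 0:
        df = df.sample(n=DATA_SAMPLE_SIZE, random_state=42)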
@@ -76,19 +77,7 @@ MIN_DIST = 0.01
 TOP_N_WORDS = 10
 MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip
 
-tracking = {
-    "input": {
-        "min_document_frequency": MIN_DOCUMENT_FREQUENCY,
-        "max_ngram": MAX_NGRAM,
-        "min_topic_size": MIN_TOPIC_SIZE,
-        "min_samples": MIN_SAMPLES,
-        "n_neighbors": N_NEIGHBORS,
-        "n_components": N_COMPONENTS,
-        "min_dist": MIN_DIST,
-        "top_n_words": TOP_N_WORDS,
-        "max_topics": MAX_TOPICS,
-    },
-}
+TF_IDF_STOP_WORDS = ["bali", "place", "visit", "visited", "visiting"]
 
 # %% [markdown]
 # ### Data Loading & Preprocessing
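Note: TF_IDF_STOP_WORDS is merged with scikit-learn's built-in English stop words further down. For context, a sketch of how the constants above typically assemble into a BERTopic pipeline (standard library API; the notebook's exact arguments may differ):

    from bertopic import BERTopic
    from bertopic.vectorizers import ClassTfidfTransformer
    from hdbscan import HDBSCAN
    from sklearn.feature_extraction import text as skltext
    from sklearn.feature_extraction.text import CountVectorizer
    from umap import UMAP

    # merge domain stop words with scikit-learn's built-in English list
    stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS))

    umap_model = UMAP(n_neighbors=N_NEIGHBORS, n_components=N_COMPONENTS, min_dist=MIN_DIST)
    hdbscan_model = HDBSCAN(min_cluster_size=MIN_TOPIC_SIZE, min_samples=MIN_SAMPLES)
    vectorizer_model = CountVectorizer(
        stop_words=stop_words,
        ngram_range=(1, MAX_NGRAM),
        min_df=MIN_DOCUMENT_FREQUENCY,
    )

    topic_model = BERTopic(
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True),
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,  # None keeps HDBSCAN's cluster count
    )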
@@ -116,21 +105,16 @@ rep = {
     r"\n": " ",
     r'\\"': "",
     r'"': "",
     "bali": "",
     r"\s+": " ",
 }
 rep = dict((re.escape(k), v) for k, v in rep.items())
 pattern = re.compile("|".join(rep.keys()))
 
 
-# def preprocess(text):
-#     text = text.strip()
-#     text = text.lower()
-#     text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
-#     return text
-
-
 def preprocess(text):
     text = text.strip()
     text = text.lower()
     text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
     return text
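Note: a quick check of the cleaning step (the sample string is illustrative). One subtlety: because re.escape() is applied to every key, the r"\s+" rule matches the literal characters \s+ rather than runs of whitespace, so doubled spaces left by the other substitutions survive:

    sample = '  Amazing temples!\\n We loved "Bali"'
    # strips, lowercases, turns literal "\n" escapes into spaces,
    # and drops quotes plus the word "bali"
    print(preprocess(sample))  # -> 'amazing temples!  we loved '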
@@ -187,7 +171,7 @@ reduced_embeddings = umap_model.fit_transform(embeddings)
 
 # %%
 if RECREATE_MODEL:
-    stop_words = list(skltext.ENGLISH_STOP_WORDS.union(["bali"]))
+    stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS))
 
     ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
     vectorizer_model = CountVectorizer(
@@ -306,72 +290,23 @@ if REDUCE_OUTLIERS:
 #
 
 # %%
-CLASSIFICATION = False
+CLASSIFICATION = True
 if CLASSIFICATION:
-    import random
-    from pathlib import Path
-
     # --- config ---
-    topics_to_keep = {2, 4, 5, 9, 22, 26}
-    INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
-    OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
-    OUTPUT_DIR = Path("../raft/corpus")
-    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-
-    BATCH_SIZE = 60
-    MIN_CHARS = 40
-    SEED = 42
+    topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
+    INPUT_PATH = "../data/intermediate/preprocessed.tab"  # TSV with a 'review' column
+    OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
 
     # Topic model document info
-    df = topic_model.get_document_info(reviews)  # assumes your model is already fitted
-    df["Original"] = reviews.values
+    df = topic_model.get_document_info(reviews)
+    df["Original"] = reviews
 
     # --- filter by topics and length ---
     filtered = df[df["Topic"].isin(topics_to_keep)].copy()
     filtered["Original"] = filtered["Original"].str.strip()
-    filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
 
-    # Save an audit CSV
-    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
-
-    # --- deterministic shuffle + write batched corpus files ---
-    total_files = 0
-    total_reviews = 0
-    rng = random.Random(SEED)
-
-    for topic_val, g in filtered.groupby("Topic", sort=True):
-        reviews_list = g["Original"].tolist()
-
-        # deterministic shuffle within topic
-        rng.shuffle(reviews_list)
-
-        # chunk into batches of up to 60
-        for start in range(0, len(reviews_list), BATCH_SIZE):
-            chunk = reviews_list[start : start + BATCH_SIZE]
-            if not chunk:
-                continue
-
-            # simple header for traceability
-            header = (
-                f"[TOPIC] {topic_val}\n"
-                + f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
-            )
-
-            lines = [header, ""]
-            for i, txt in enumerate(chunk, 1):
-                lines.append(f"({i}) {txt}")
-
-            part_idx = start // BATCH_SIZE + 1
-            fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
-            (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
-
-            total_files += 1
-            total_reviews += len(chunk)
-
-    print(
-        f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
-    )
-    print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")
+    filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
+    print(f"Filtered CSV file saved to {OUTPUT_CSV}")
 
 # %%
 doc_topic_matrix = probs
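Note: with the corpus-writing loop gone, the cell now only exports the audit CSV. A small sanity check on the result (a sketch; assumes pandas is imported as pd in the notebook):

    check = pd.read_csv("../data/intermediate/culture_reviews.csv")
    print(check["Topic"].value_counts())  # reviews kept per selected topic
    print(len(check), "reviews total")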
@@ -425,7 +360,7 @@ vis = topic_model.visualize_documents(
     custom_labels=True,
     hide_annotations=True,
 )
-vis.write_html("output/visualization.html")
+# vis.write_html("output/visualization.html")
 vis
 
 # %%
@@ -531,7 +466,7 @@ if this_will_crash_your_pc_are_you_sure:
 #
 
 # %%
-search_term = "spirituality"
+search_term = "lempuyang"
 
 similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
 for i in range(len(similar_topics)):
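Note: the loop body lies outside this hunk. A typical body for BERTopic's find_topics() output would be (hypothetical, not necessarily the notebook's code):

    for i in range(len(similar_topics)):
        topic_id = similar_topics[i]
        top_words = [w for w, _ in topic_model.get_topic(topic_id)[:5]]
        print(f"Topic {topic_id} (similarity {similarities[i]:.2f}): {top_words}")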
@@ -542,17 +477,20 @@ for i in range(len(similar_topics)):
 # %%
 # Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
 # Calculate the topic distributions on a token-level
-topic_distr, topic_token_distr = topic_model.approximate_distribution(
-    reviews, calculate_tokens=True, use_embedding_model=True
-)
+if CALCULATE_TOKEN_DISTRIBUTIONS:
+    topic_distr, topic_token_distr = topic_model.approximate_distribution(
+        reviews, calculate_tokens=True, use_embedding_model=True
+    )
 
 # %%
 # Visualize the token-level distributions
-DOC_INDEX = 6
-df = topic_model.visualize_approximate_distribution(
-    reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
-)
-df
+if CALCULATE_TOKEN_DISTRIBUTIONS:
+    DOC_INDEX = 1
+    df = topic_model.visualize_approximate_distribution(
+        reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
+    )
+    df
 
 # %% [markdown]
 # ### Topic Hierarchy
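Note: the hierarchy section continues beyond this hunk. For reference, BERTopic's standard hierarchy calls look like this (a sketch, not necessarily the commit's code):

    # build a topic hierarchy over the fitted model and plot it
    hierarchical_topics = topic_model.hierarchical_topics(reviews)
    topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)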