RAFT updates, BERTopic config, cleanup

This commit is contained in:
2026-02-21 01:57:14 +01:00
parent 8cadcb1f69
commit 1a99b53d44
12 changed files with 10750 additions and 9778 deletions

View File

@@ -47,14 +47,15 @@ nltk.download("punkt")
nltk.download("wordnet")
# %% [markdown]
# ### Parameters and Tracking
# ### Hyperparameters and Settings
#
# %%
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = True
PROCESS_DATA = True
PROCESS_DATA = False
REDUCE_OUTLIERS = False
CALCULATE_TOKEN_DISTRIBUTIONS = False
# Data Sample Size, -1 for all data
DATA_SAMPLE_SIZE = -1
@@ -76,19 +77,7 @@ MIN_DIST = 0.01
TOP_N_WORDS = 10
MAX_TOPICS = None # or "auto" to pass to HDBSCAN, None to skip
tracking = {
"input": {
"min_document_frequency": MIN_DOCUMENT_FREQUENCY,
"max_ngram": MAX_NGRAM,
"min_topic_size": MIN_TOPIC_SIZE,
"min_samples": MIN_SAMPLES,
"n_neighbors": N_NEIGHBORS,
"n_components": N_COMPONENTS,
"min_dist": MIN_DIST,
"top_n_words": TOP_N_WORDS,
"max_topics": MAX_TOPICS,
},
}
TF_IDF_STOP_WORDS = ["bali", "place", "visit", "visited", "visiting"]
# %% [markdown]
# ### Data Loading & Preprocessing
@@ -116,21 +105,16 @@ rep = {
r"\n": " ",
r'\\"': "",
r'"': "",
"bali": "",
r"\s+": " ",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))
# def preprocess(text):
# text = text.strip()
# text = text.lower()
# text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
# return text
def preprocess(text):
    """Normalize a raw review string before topic modeling.

    Trims surrounding whitespace, lowercases, then applies the compiled
    replacement pattern (built from the module-level ``rep`` mapping) to
    strip quotes/newlines and collapse repeated whitespace.
    """
    cleaned = text.strip().lower()
    return pattern.sub(lambda match: rep[re.escape(match.group(0))], cleaned)
@@ -187,7 +171,7 @@ reduced_embeddings = umap_model.fit_transform(embeddings)
# %%
if RECREATE_MODEL:
stop_words = list(skltext.ENGLISH_STOP_WORDS.union(["bali"]))
stop_words = list(skltext.ENGLISH_STOP_WORDS.union(TF_IDF_STOP_WORDS))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(
@@ -306,72 +290,23 @@ if REDUCE_OUTLIERS:
#
# %%
CLASSIFICATION = False
CLASSIFICATION = True
if CLASSIFICATION:
import random
from pathlib import Path
# --- config ---
topics_to_keep = {2, 4, 5, 9, 22, 26}
INPUT_PATH = "../data/original/reviews.tab" # TSV with a 'review' column
OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
OUTPUT_DIR = Path("../raft/corpus")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
BATCH_SIZE = 60
MIN_CHARS = 40
SEED = 42
topics_to_keep = {14, 8, 13, 18, 17, 4, 2, 30, 28}
INPUT_PATH = "../data/intermediate/preprocessed.tab" # TSV with a 'review' column
OUTPUT_CSV = "../data/intermediate/culture_reviews.csv"
# Topic model document info
df = topic_model.get_document_info(reviews) # assumes your model is already fitted
df["Original"] = reviews.values
df = topic_model.get_document_info(reviews)
df["Original"] = reviews
# --- filter by topics and length ---
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
filtered["Original"] = filtered["Original"].str.strip()
filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]
# Save an audit CSV
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)
# --- deterministic shuffle + write batched corpus files ---
total_files = 0
total_reviews = 0
rng = random.Random(SEED)
for topic_val, g in filtered.groupby("Topic", sort=True):
reviews_list = g["Original"].tolist()
# deterministic shuffle within topic
rng.shuffle(reviews_list)
# chunk into batches of up to 60
for start in range(0, len(reviews_list), BATCH_SIZE):
chunk = reviews_list[start : start + BATCH_SIZE]
if not chunk:
continue
# simple header for traceability
header = (
f"[TOPIC] {topic_val}\n"
+ f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
)
lines = [header, ""]
for i, txt in enumerate(chunk, 1):
lines.append(f"({i}) {txt}")
part_idx = start // BATCH_SIZE + 1
fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
(OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")
total_files += 1
total_reviews += len(chunk)
print(
f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
)
print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False, sep=",")
print(f"Filtered CSV file saved to {OUTPUT_CSV}")
# %%
doc_topic_matrix = probs
@@ -425,7 +360,7 @@ vis = topic_model.visualize_documents(
custom_labels=True,
hide_annotations=True,
)
vis.write_html("output/visualization.html")
# vis.write_html("output/visualization.html")
vis
# %%
@@ -531,7 +466,7 @@ if this_will_crash_your_pc_are_you_sure:
#
# %%
search_term = "spirituality"
search_term = "lempuyang"
similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for i in range(len(similar_topics)):
@@ -542,17 +477,20 @@ for i in range(len(similar_topics)):
# %%
# Source: https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(
reviews, calculate_tokens=True, use_embedding_model=True
)
if CALCULATE_TOKEN_DISTRIBUTIONS:
topic_distr, topic_token_distr = topic_model.approximate_distribution(
reviews, calculate_tokens=True, use_embedding_model=True
)
# %%
# Visualize the token-level distributions
DOC_INDEX = 6
df = topic_model.visualize_approximate_distribution(
reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
)
df
if CALCULATE_TOKEN_DISTRIBUTIONS:
DOC_INDEX = 1
df = topic_model.visualize_approximate_distribution(
reviews[DOC_INDEX], topic_token_distr[DOC_INDEX]
)
df
# %% [markdown]
# ### Topic Hierarchy