# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.18.0
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Topic Detection: Bali Tourist Reviews
#

# %% [markdown]
# ## Preparation
#
# ### Dependency Loading
#

# %%
import json
import pickle
import re

import gensim.corpora as corpora
import nltk
import numpy as np
import pandas as pd
import spacy
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP

from bertopic import BERTopic

nlp = spacy.load("en_core_web_sm")

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# %% [markdown]
# ### Parameters and Tracking
#

# %%
RECREATE_MODEL = True
RECREATE_REDUCED_MODEL = True
PROCESS_DATA = False
REDUCE_OUTLIERS = True
USE_CONDENSED_MODEL = False

DATA_SAMPLE_SIZE = -1  # -1 for all data

# Classical coherence score. Warning: needs swap to not kill your PC
CALCULATE_COHERENCE = False

# Vectorization
MIN_DOCUMENT_FREQUENCY = 1
MAX_NGRAM = 2

# HDBSCAN Parameters
MIN_TOPIC_SIZE = 200
MIN_SAMPLES = 25

# UMAP Parameters
N_NEIGHBORS = 15
N_COMPONENTS = 2
MIN_DIST = 0.01

# Topic Modeling
TOP_N_WORDS = 10
MAX_TOPICS = None  # or "auto" to pass to HDBSCAN, None to skip

# %% [markdown]
# ### Data Loading & Preprocessing
#

# %%
if DATA_SAMPLE_SIZE != -1:
    reviews = (
        pd.read_csv("../data/original/reviews.tab", sep="\t")
        .sample(n=DATA_SAMPLE_SIZE)
        .review.dropna()
        .to_list()
    )
else:
    reviews = (
        pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list()
    )

print("Loaded {} reviews".format(len(reviews)))

# %%
# List of NE in Bali for NER enhancement
with open("../data/supporting/bali_ner.json", "r") as f:
    bali_places = json.load(f)
bali_places_set = set(bali_places)

# Stop word definition
extra_stopwords = ["bali", "idr", "usd"]
stop_words = set(stopwords.words("english"))
with open("../data/supporting/stopwords-en.json", "r") as f:
    extra_stopwords.extend(json.load(f))

# Custom replacements
rep = {
    r"\\n": " ",
    r"\n": " ",
    r'\\"': "",
    r'"': "",
    "mongkey": "monkey",
    "monky": "monkey",
    "verry": "very",
}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))

lemmatizer = WordNetLemmatizer()


def preprocess(text):
    # Step 1: Apply custom replacements (typos, special cases)
    text = text.lower()
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    # Step 2: Clean text
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    doc = nlp(text)

    # Step 3: POS tagging and filtering
    filtered_tokens = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"}
        or token.ent_type_ in {"GPE", "LOC", "FAC"}
        or token.text in bali_places_set
    ]

    # Step 4: Lemmatization and stopword removal
    lemmatized_tokens = [
        lemmatizer.lemmatize(w)
        for w in filtered_tokens
        if w not in stop_words and w not in extra_stopwords and len(w) > 2
    ]

    return lemmatized_tokens


# %%
if PROCESS_DATA:
    print("Processing reviews...")
    reviews = [preprocess(review) for review in reviews]

    with open("../data/intermediate/processed_texts.pkl", "wb") as f:
        pickle.dump(reviews, f)
else:
    with open("../data/intermediate/processed_texts.pkl", "rb") as f:
        reviews = pickle.load(f)
        reviews = [
            " ".join(review) if isinstance(review, list) else review
            for review in reviews
        ]

print(reviews[:1])

# %% [markdown]
# ### Pre-calculate Embeddings
#

# %%
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews, show_progress_bar=True)

# %% [markdown]
# ## Model Creation
#

# %% [markdown]
# ### Dimensionality Reduction (UMAP)
#

# %%
umap_model = UMAP(
    n_neighbors=N_NEIGHBORS,
    n_components=N_COMPONENTS,
    min_dist=MIN_DIST,
    metric="cosine",
    low_memory=True,
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

# %% [markdown]
# ### BERTopic Model Creation
#

# %%
if RECREATE_MODEL:
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(
        min_df=MIN_DOCUMENT_FREQUENCY, ngram_range=(1, MAX_NGRAM)
    )

    representation_model = KeyBERTInspired()
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        min_samples=MIN_SAMPLES,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=True,
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        verbose=True,
        calculate_probabilities=True,
        language="english",
        top_n_words=TOP_N_WORDS,
        nr_topics=MAX_TOPICS,
    )

    topics, probs = topic_model.fit_transform(reviews, embeddings=embeddings)

    topic_labels = topic_model.generate_topic_labels(
        nr_words=3, topic_prefix=True, word_length=15, separator=" - "
    )
    topic_model.set_topic_labels(topic_labels)
    BERTopic.save(topic_model, "output/model.bertopic")
else:
    print("Nevermind, loading existing model")
    topic_model = BERTopic.load("output/model.bertopic")

# %% [markdown]
# ## Fine Tuning
#
# ### Topic Condensation
#

# %%
if RECREATE_REDUCED_MODEL:
    done = False
    iteration = 1
    while not done:
        print(f"Iteration {iteration}")
        iteration += 1
        similarity_matrix = cosine_similarity(
            np.array(topic_model.topic_embeddings_)[1:, :]
        )
        nothing_to_merge = True

        for i in range(similarity_matrix.shape[0]):
            for j in range(i + 1, similarity_matrix.shape[1]):
                sim = similarity_matrix[i, j]
                if sim > 0.9:
                    nothing_to_merge = False
                    t1, t2 = i, j
                    try:
                        t1_name = topic_model.get_topic_info(t1)["CustomName"][0]
                        t2_name = topic_model.get_topic_info(t2)["CustomName"][0]
                        print(
                            f"Merging topics {t1} ({t1_name}) and {t2} ({t2_name}) with similarity {sim:.2f}"
                        )
                        topic_model.merge_topics(reviews, topics_to_merge=[t1, t2])

                        topic_labels = topic_model.generate_topic_labels(
                            nr_words=3,
                            topic_prefix=True,
                            word_length=15,
                            separator=" - ",
                        )
                        topic_model.set_topic_labels(topic_labels)
                    except Exception as e:
                        print(f"Failed to merge {t1} and {t2}: {e}")
        if nothing_to_merge:
            print("No more topics to merge.")
            done = True

    # BERTopic.save(topic_model, "bertopic/model_reduced.bertopic")
elif USE_CONDENSED_MODEL:
    print("Nevermind, loading existing reduced model")
    topic_model = BERTopic.load("bertopic/model_reduced.bertopic")
else:
    print("Skipping topic reduction")

# %% [markdown]
# ### Outlier Reduction
#

# %%
if REDUCE_OUTLIERS:
    new_topics = topic_model.reduce_outliers(
        reviews,
        topic_model.topics_,
        probabilities=topic_model.probabilities_,
        threshold=0.05,
        strategy="probabilities",
    )
    topic_model.update_topics(reviews, topics=new_topics)

# %% [markdown]
# ## Results
#
# ### Classification
#

# %%
import random
from pathlib import Path

# --- config ---
topics_to_keep = {2, 4, 6, 8, 10, 5, 7}
INPUT_PATH = "../data/original/reviews.tab"  # TSV with a 'review' column
OUTPUT_CSV = "../data/intermediate/selected_topics_documents.csv"
OUTPUT_DIR = Path("../raft/corpus")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

BATCH_SIZE = 60
MIN_CHARS = 40
SEED = 42

# --- load data ---
data = pd.read_csv(INPUT_PATH, sep="\t")

# If you already have `reviews` elsewhere, replace the next line with that variable
reviews = data["review"].astype(str).fillna("")

# Topic model document info
df = topic_model.get_document_info(reviews)  # assumes your model is already fitted
df["Original"] = reviews.values

# --- filter by topics and length ---
filtered = df[df["Topic"].isin(topics_to_keep)].copy()
filtered["Original"] = filtered["Original"].str.strip()
filtered = filtered[filtered["Original"].str.len() >= MIN_CHARS]

# Save an audit CSV
filtered[["Original", "Topic"]].to_csv(OUTPUT_CSV, index=False)

# --- deterministic shuffle + write batched corpus files ---
total_files = 0
total_reviews = 0
rng = random.Random(SEED)

for topic_val, g in filtered.groupby("Topic", sort=True):
    reviews_list = g["Original"].tolist()

    # deterministic shuffle within topic
    rng.shuffle(reviews_list)

    # chunk into batches of up to 60
    for start in range(0, len(reviews_list), BATCH_SIZE):
        chunk = reviews_list[start : start + BATCH_SIZE]
        if not chunk:
            continue

        # simple header for traceability
        header = (
            f"[TOPIC] {topic_val}\n" f"[Stats] N={len(chunk)} | Source={INPUT_PATH}\n"
        )

        lines = [header, ""]
        for i, txt in enumerate(chunk, 1):
            lines.append(f"({i}) {txt}")

        part_idx = start // BATCH_SIZE + 1
        fname = f"topic={topic_val}__part={part_idx:03d}__n={len(chunk)}.txt"
        (OUTPUT_DIR / fname).write_text("\n".join(lines), encoding="utf-8")

        total_files += 1
        total_reviews += len(chunk)

print(
    f"[green]Wrote {total_files} docs with {total_reviews} reviews to {OUTPUT_DIR}[/green]"
)
print(f"[green]Filtered CSV saved to {OUTPUT_CSV}[/green]")

# %%
doc_topic_matrix = probs

# column names
topicnames = ["Topic " + str(i) for i in range(len(set(topics)) - 1)]

# index names
docnames = ["Review " + str(i) for i in range(len(reviews))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(
    np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames
)

# Get dominant topic for each document
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
df_document_topic["dominant_topic"] = dominant_topic


# Styling
def color_stuff(val):
    if val > 0.1:
        color = "green"
    elif val > 0.05:
        color = "orange"
    else:
        color = "grey"
    return "color: {col}".format(col=color)


def make_bold(val):
    weight = 700 if val > 0.1 else 400
    return "font-weight: {weight}".format(weight=weight)


# Apply Style
df_document_topics = (
    df_document_topic.head(15).style.applymap(color_stuff).applymap(make_bold)
)
df_document_topics

# %% [markdown]
# ### Document Visualization
#

# %%
vis = topic_model.visualize_documents(
    docs=reviews,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)
vis.write_html("output/visualization.html")
vis

# %% [markdown]
# ### Similarity Matrix
#

# %%
topic_model.visualize_heatmap()

# %% [markdown]
# ### Topic Info
#

# %%
topic_model.get_topic_info()

# %% [markdown]
# ### Semantic Coherence
#

# %%
topic_words = []
for topic_id in topic_model.get_topic_info()["Topic"]:
    # Skip outlier topic
    if topic_id < 0:
        continue

    words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_words.append(words)

# Compute mean pairwise cosine similarity for each topic
coherence_scores = []
for words in topic_words:
    coherence_embeddings = embedding_model.encode(words)
    sim_matrix = cosine_similarity(coherence_embeddings)

    # Ignore self-similarity
    np.fill_diagonal(sim_matrix, 0)
    mean_sim = np.mean(sim_matrix[np.triu_indices(sim_matrix.shape[0], k=1)])
    coherence_scores.append(mean_sim)

overall_coherence = np.mean(coherence_scores)

print(len(reviews), "reviews processed")
print(len(topic_model.get_topic_info()) - 1, "topics found")
print(f"BERT-based Topic Coherence: {overall_coherence:.4f}")

# %% [markdown]
# ### Topic Coherence
#

# %%
# https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-820915389

if CALCULATE_COHERENCE:
    # Preprocess Documents
    documents = pd.DataFrame(
        {"Document": reviews, "ID": range(len(reviews)), "Topic": topics}
    )
    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
        {"Document": " ".join}
    )
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names_out()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [
        [words for words, _ in topic_model.get_topic(topic)]
        for topic in range(len(set(topics)) - 1)
    ]

    # %env TOKENIZERS_PARALLELISM=false

    for measurement in ["c_v", "u_mass", "c_uci", "c_npmi"]:
        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=tokens,
            corpus=corpus,
            dictionary=dictionary,
            coherence=measurement,
        )
        coherence_score = coherence_model.get_coherence()
        print(f"Coherence ({measurement}): {coherence_score:.4f}")
else:
    print("Skipping classical coherence calculation")

# %% [markdown]
# ### Term Search
#

# %%
search_term = "uluwatu"

similar_topics, similarities = topic_model.find_topics(search_term, top_n=10)
for i in range(len(similar_topics)):
    # \n{topic_model.get_topic(similar_topics[i])}\n
    print(
        f"{str(similarities[i])[:5]} {topic_model.get_topic_info(similar_topics[i])["CustomName"][0]}"
    )

# %% [markdown]
# ### Topic Hierarchy
#

# %%
topic_model.visualize_hierarchy(custom_labels=True)

# %% [markdown]
# ### Intertopic Distance Map
#

# %%
topic_model.visualize_topics()

# %% [markdown]
# ### Topic Word Scores
#

# %%
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True, n_words=10)