# masterthesis-playground/preprocessing/preprocess.py

import re

import pandas as pd

# Load the raw reviews. Missing values are dropped: pandas parses empty fields
# and markers such as "NA" or "null" as float NaN, and a float would break the
# string handling (``review.split()``) applied to every review further down.
reviews = (
    pd.read_csv("../data/original/reviews.tab", sep="\t").review.dropna().to_list()
)

# Remove exact duplicates. ``dict.fromkeys`` keeps the first occurrence of each
# review in file order; ``set`` would iterate the strings in an order that can
# change between interpreter runs (hash randomization), which makes the
# generated output file irreproducible.
reviews = list(dict.fromkeys(reviews))

# Single pass over the reviews: anything shorter than 8 words is printed for
# inspection and left out, everything else is kept in its original order.
kept_reviews = []
for review in reviews:
    word_count = len(review.split())
    if word_count >= 8:
        kept_reviews.append(review)
        continue
    print("Short review ({} words):".format(word_count))
    print(review)
    print("-" * 60)

# From here on ``reviews`` only holds reviews with at least 8 words.
reviews = kept_reviews

# Matches an opening or closing tag-like span: "<" or "</", a letter, then
# everything up to the next ">".
html_tag_pattern = re.compile(r"</?[a-zA-Z][^>]*>")


def preprocess(text):
    """Return ``text`` without HTML-like tags and without outer whitespace.

    When at least one tag-like span is found, the unmodified text is printed
    first so that the match can be checked by eye, then every such span is
    removed. Text without a match is only stripped.
    """
    if html_tag_pattern.search(text) is None:
        return text.strip()

    # Show the raw text before it is altered.
    for line in ("Possible HTML tag:", text, "-" * 60):
        print(line)

    without_tags = html_tag_pattern.sub("", text)
    return without_tags.strip()


# Write the header line followed by one preprocessed review per line.
with open("../data/intermediate/preprocessed.tab", "w", encoding="utf-8") as out_file:
    out_file.write("review\n")
    out_file.writelines(preprocess(review) + "\n" for review in reviews)