"""Clean the raw review dump.

Reads the ``review`` column of a tab-separated file, removes exact duplicates,
drops reviews with fewer than ``MIN_WORDS`` whitespace-separated words, strips
HTML tags, and writes one cleaned review per line under a ``review`` header.
"""
import re

import pandas as pd

INPUT_PATH = "../data/original/reviews.tab"
OUTPUT_PATH = "../data/intermediate/preprocessed.tab"
MIN_WORDS = 8
SEPARATOR_LINE = "-" * 60

# An HTML tag: "<", then anything that is not ">", then the closing ">".
html_tag_pattern = re.compile(r"<[^>]*>")


def preprocess(text):
    """Return *text* with HTML tags removed and surrounding whitespace stripped.

    When a tag is found, the untouched text is printed first so the match can
    be inspected by eye.
    """
    if html_tag_pattern.search(text):
        print("Possible HTML tag:")
        print(text)
        print(SEPARATOR_LINE)
        text = html_tag_pattern.sub("", text)
    return text.strip()


def _load_reviews(path):
    """Read the ``review`` column from *path* and return its unique values."""
    column = pd.read_csv(path, sep="\t")["review"]
    # Empty cells are read as float NaN, which has no .split(); skip them.
    # dict.fromkeys removes exact duplicates while keeping first-seen order,
    # so the output file is identical from run to run (a set is not ordered).
    return list(dict.fromkeys(column.dropna().to_list()))


def _drop_short_reviews(reviews, min_words=MIN_WORDS):
    """Print every review shorter than *min_words* words; return the others."""
    kept = []
    for review in reviews:
        word_count = len(review.split())
        if word_count < min_words:
            print(f"Short review ({word_count} words):")
            print(review)
            print(SEPARATOR_LINE)
        else:
            kept.append(review)
    return kept


def _write_reviews(reviews, path):
    """Write a ``review`` header followed by one preprocessed review per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.write("review\n")
        # Word counts were checked before tag removal, so a written line may
        # hold fewer than MIN_WORDS words once its tags are stripped.
        f.writelines(preprocess(review) + "\n" for review in reviews)


def main():
    """Run the pipeline: load, deduplicate, filter, clean, and write."""
    reviews = _load_reviews(INPUT_PATH)
    reviews = _drop_short_reviews(reviews)
    _write_reviews(reviews, OUTPUT_PATH)


if __name__ == "__main__":
    main()