Files
masterthesis-playground/preprocessing/preprocess.py

36 lines
946 B
Python

import re
import pandas as pd
# Load the raw review texts. Rows whose review is missing are dropped up
# front: pandas represents them as float NaN, and the word counting below
# would fail on a non-string value.
reviews = (
    pd.read_csv("../data/original/reviews.tab", sep="\t")["review"]
    .dropna()
    .to_list()
)

# Remove exact duplicates while keeping the first occurrence of each review
# in file order. list(set(...)) would also deduplicate, but its order changes
# from run to run (string hashing is randomized), which makes the generated
# output file non-reproducible.
reviews = list(dict.fromkeys(reviews))

# Minimum number of whitespace-separated words a review needs to be kept.
MIN_WORDS = 8

# Single pass: keep the reviews that are long enough and print the ones that
# are discarded so they can be inspected by eye.
kept_reviews = []
for review in reviews:
    word_count = len(review.split())
    if word_count >= MIN_WORDS:
        kept_reviews.append(review)
        continue
    print("Short review ({} words):".format(word_count))
    print(review)
    print("-" * 60)

# Remove reviews that contain less than MIN_WORDS words
reviews = kept_reviews
# Matches an opening or closing HTML-like tag such as <br />, <i> or </i>.
# A "<" that is not directly followed by a letter (e.g. "a < b") is not a match.
html_tag_pattern = re.compile(r"</?[a-zA-Z][^>]*>")


def preprocess(text: str) -> str:
    """Strip HTML-like tags and surrounding whitespace from a review.

    When at least one tag is found, the unmodified input is printed to
    stdout so the removal can be checked by eye.

    Args:
        text: Raw review text.

    Returns:
        The text with every match of ``html_tag_pattern`` deleted and
        leading/trailing whitespace removed.
    """
    # subn returns the cleaned text together with the number of replacements,
    # so a separate search() pass over the same text is not needed.
    cleaned, tag_count = html_tag_pattern.subn("", text)
    if tag_count:
        print("Possible HTML tag:")
        print(text)
        print("-" * 60)
    # NOTE(review): tags are replaced by the empty string, so "a<br />b"
    # becomes "ab" (words on either side are joined) - confirm this is intended.
    return cleaned.strip()
# Write the cleaned corpus as a one-column table: a "review" header row
# followed by one preprocessed review per line.
with open("../data/intermediate/preprocessed.tab", "w", encoding="utf-8") as out_file:
    out_file.write("review\n")
    out_file.writelines(preprocess(entry) + "\n" for entry in reviews)