# Mirror of https://github.com/marvinscham/masterthesis-playground.git
# (synced 2026-03-22 08:22:43 +01:00; 36 lines, 946 B, Python)
import re
|
|
|
|
import pandas as pd
|
|
|
|
# Load the raw review texts. pandas reads an empty review cell as NaN (a
# float), which would crash the word counting below, so such rows are dropped.
reviews = (
    pd.read_csv("../data/original/reviews.tab", sep="\t")["review"]
    .dropna()
    .to_list()
)

# Remove exact duplicates. dict.fromkeys keeps the first occurrence of every
# review in file order; list(set(...)) would hand the reviews back in a
# different order on each run, making the output file non-reproducible.
reviews = list(dict.fromkeys(reviews))

# Reviews with fewer words than this are reported on stdout and discarded.
min_words = 8

# Split each review only once: short ones are printed for manual inspection,
# the rest are kept for the following steps.
long_enough = []
for review in reviews:
    word_count = len(review.split())
    if word_count >= min_words:
        long_enough.append(review)
        continue
    print(f"Short review ({word_count} words):")
    print(review)
    print("-" * 60)

reviews = long_enough
|
|
|
|
# Matches anything shaped like an opening or closing HTML tag: "<" or "</",
# one ASCII letter, then every character up to the next ">".
html_tag_pattern = re.compile(r"</?[a-zA-Z][^>]*>")


def preprocess(text: str) -> str:
    """Remove HTML-like tags and surrounding whitespace from a review.

    If at least one tag-shaped substring is found, the untouched input is
    printed to stdout under a "Possible HTML tag:" heading so the removal can
    be checked by eye.

    Args:
        text: Review text to clean.

    Returns:
        ``text`` with every match of ``html_tag_pattern`` deleted (replaced by
        the empty string) and leading/trailing whitespace stripped.
    """
    # subn removes the matches and reports how many there were, so the text
    # is scanned once instead of once for the check and once for the removal.
    cleaned, tag_count = html_tag_pattern.subn("", text)
    if tag_count:
        print("Possible HTML tag:")
        print(text)
        print("-" * 60)
    return cleaned.strip()
|
|
|
|
|
|
# Write the cleaned reviews as a one-column table: a "review" header line
# followed by exactly one review per line.
with open("../data/intermediate/preprocessed.tab", "w", encoding="utf-8") as f:
    f.write("review\n")
    for review in reviews:
        # A line break or tab left inside a review (possible when the source
        # field was quoted) would spread it over several rows or columns of
        # the output, so each run of those characters becomes a single space.
        row = re.sub(r"[\t\r\n]+", " ", preprocess(review))
        f.write(row + "\n")
|