mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
BERTopic cleanup and structuring
This commit is contained in:
35
preprocessing/preprocess.py
Normal file
35
preprocessing/preprocess.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Minimum number of whitespace-separated words a review needs to be kept.
MIN_WORDS = 8

# Load the "review" column of the tab-separated source file. dropna() discards
# empty cells: pandas reads them as float NaN, which has no .split() and would
# crash the word counting below.
reviews = (
    pd.read_csv("../data/original/reviews.tab", sep="\t")
    .review.dropna()
    .to_list()
)

# Remove exact duplicates. dict.fromkeys keeps the first occurrence of each
# review in file order, so the result is reproducible between runs (iterating
# a set of strings is not, because string hashing is randomized per process).
reviews = list(dict.fromkeys(reviews))

# Keep only reviews with at least MIN_WORDS words. Every dropped review is
# printed so the removed rows can be inspected; each review is split once.
kept_reviews = []
for review in reviews:
    word_count = len(review.split())
    if word_count >= MIN_WORDS:
        kept_reviews.append(review)
        continue
    print("Short review ({} words):".format(word_count))
    print(review)
    print("-" * 60)
reviews = kept_reviews
|
||||
|
||||
# Matches anything shaped like an opening or closing HTML tag, e.g. "<br>",
# "</p>" or "<a href='x'>": "<", an optional "/", a letter, then everything up
# to the next ">". Text such as "a < b" or "<3" does not match.
html_tag_pattern = re.compile(r"</?[a-zA-Z][^>]*>")


def preprocess(text):
    """Return *text* with HTML-like tags removed and outer whitespace stripped.

    When at least one tag-shaped span is found, the original text is printed
    to stdout (followed by a 60-dash separator) so it can be checked by hand.

    Args:
        text: The raw review string.

    Returns:
        The review without tag-shaped spans and without leading or trailing
        whitespace.
    """
    # subn removes the tags and reports how many it found in a single pass,
    # replacing the former search() followed by a second scan in re.sub().
    # NOTE(review): tags are replaced by the empty string, so words separated
    # only by a tag are fused ("good<br>food" -> "goodfood") -- confirm this
    # is intended.
    cleaned, tag_count = html_tag_pattern.subn("", text)

    if tag_count:
        print("Possible HTML tag:")
        print(text)
        print("-" * 60)

    return cleaned.strip()
|
||||
|
||||
|
||||
# Write the cleaned reviews as a single-column table: a "review" header line
# followed by one preprocessed review per line.
# NOTE(review): reviews are written unquoted, so a review containing a newline
# or tab would spill into extra rows/columns -- confirm the input cannot
# contain them.
with open(
    "../data/intermediate/preprocessed.tab", "w", encoding="utf-8"
) as out_file:
    out_file.write("review\n")
    out_file.writelines(preprocess(entry) + "\n" for entry in reviews)
|
||||
1
preprocessing/requirements.txt
Normal file
1
preprocessing/requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
pandas
|
||||
Reference in New Issue
Block a user