Spaces:

danielcd99
/

IMDB_Reviews

Sleeping

IMDB_Reviews / preprocess_data.py

feat:added main files

14536de over 1 year ago

2.17 kB

	import re
	import nltk
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer


	def lowercase_text(text):
	    """Return *text* converted to lowercase using ``str.lower``."""
	    lowered = text.lower()
	    return lowered

	def remove_html(text):
	    """Strip HTML-style tags from *text*.

	    A tag is an opening ``<``, at least one character that is not ``<``
	    (matched lazily), and a closing ``>``. Every match is replaced by the
	    empty string; text between tags is kept as-is.
	    """
	    tag_pattern = re.compile(r'<[^<]+?>')
	    return tag_pattern.sub('', text)

	def remove_url(text):
	    """Remove URLs from *text*.

	    Two forms are recognised: links starting with ``http://`` or
	    ``https://``, and bare links starting with ``www.``. Each match runs up
	    to the next whitespace character and is replaced by the empty string;
	    whitespace around the link is left untouched.
	    """
	    # The bar between the two alternatives is deliberately unescaped: an
	    # escaped bar would be a literal pipe character, and bare ``www.`` links
	    # would then never match on their own.
	    url_pattern = re.compile(r'https?://\S+|www\.\S+')
	    return url_pattern.sub('', text)

	def remove_punctuations(text):
	    """Replace every ASCII punctuation character in *text* with a space.

	    The characters replaced are the same 32 found in ``string.punctuation``.
	    Each one becomes exactly one space, so the length of the string is
	    preserved and runs of punctuation become runs of spaces.
	    """
	    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
	    # A translation table handles all characters in a single pass instead of
	    # re-scanning the whole string once per punctuation character found.
	    to_space = str.maketrans(punctuation, ' ' * len(punctuation))
	    return text.translate(to_space)

	def remove_emojis(text):
	    """Delete emoji and other symbol characters from *text*.

	    A single character class is built from the ranges below and every run
	    of matching characters is replaced by the empty string.

	    NOTE: the class is far broader than emoji. It contains U+24C2-U+1F251
	    and U+10000-U+10FFFF, so CJK ideographs and every supplementary-plane
	    character are removed as well.
	    """
	    symbol_ranges = (
	        "\U0001F600-\U0001F64F",  # emoticons
	        "\U0001F300-\U0001F5FF",  # miscellaneous symbols and pictographs
	        "\U0001F680-\U0001F6FF",  # transport and map symbols
	        "\U0001F1E0-\U0001F1FF",  # regional indicator symbols (flags)
	        "\U00002500-\U00002BEF",
	        "\U00002702-\U000027B0",  # dingbats
	        "\U00002702-\U000027B0",  # same range listed twice; harmless
	        "\U000024C2-\U0001F251",
	        "\U0001f926-\U0001f937",
	        "\U00010000-\U0010ffff",  # all supplementary planes
	        "\u2640-\u2642",
	        "\u2600-\u2B55",
	        "\u200d",  # zero-width joiner
	        "\u23cf",
	        "\u23e9",
	        "\u231a",
	        "\ufe0f",  # variation selector-16
	        "\u3030",
	    )
	    emoji_pattern = re.compile("[" + "".join(symbol_ranges) + "]+", re.UNICODE)
	    return emoji_pattern.sub('', text)

	def remove_stop_words(text):
	    """Drop English stop words from *text*.

	    The text is split on whitespace, every word found in NLTK's English
	    stop-word list is discarded, and the remaining words are joined with
	    single spaces. The comparison is exact, so callers that want
	    case-insensitive filtering should lowercase the text first.

	    Requires the NLTK ``stopwords`` corpus to be installed (NLTK raises
	    ``LookupError`` otherwise).
	    """
	    # A set gives constant-time membership tests; the raw corpus is a list.
	    stop_words = set(stopwords.words('english'))
	    kept = [word for word in text.split() if word not in stop_words]
	    return ' '.join(kept)

	def stem_words(text):
	    """Reduce every whitespace-separated word in *text* to its Porter stem.

	    Returns the stems joined by single spaces, with no leading or trailing
	    whitespace; empty or whitespace-only input yields an empty string.
	    """
	    stemmer = PorterStemmer()
	    # Joining (rather than appending "stem + space") avoids a dangling
	    # trailing space and matches the output shape of remove_stop_words.
	    return ' '.join(stemmer.stem(word) for word in text.split())

	def get_stopwords():
	    """Download the NLTK ``stopwords`` corpus via ``nltk.download``.

	    Despite the name, nothing is returned; the function is called only for
	    its side effect of making the corpus available to ``stopwords.words``.
	    """
	    corpus_name = 'stopwords'
	    nltk.download(corpus_name)

	def preprocess_text(text):
	    """Run *text* through the full cleaning pipeline and return the result.

	    Steps are applied in this order: lowercase, strip HTML tags, strip
	    URLs, replace punctuation with spaces, remove emojis, drop English
	    stop words, and stem the remaining words.
	    """
	    # Order matters: URLs must be removed before punctuation is blanked out,
	    # otherwise "://" and "." would already be gone and nothing would match.
	    pipeline = (
	        lowercase_text,
	        remove_html,
	        remove_url,
	        remove_punctuations,
	        remove_emojis,
	        remove_stop_words,
	        stem_words,
	    )
	    cleaned = text
	    for step in pipeline:
	        cleaned = step(cleaned)
	    return cleaned

	# This module only provides helper functions; running it directly as a
	# script intentionally does nothing.
	if __name__ == "__main__":
	    pass