Spaces:
Sleeping
Sleeping
| import re | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.stem import PorterStemmer | |
def lowercase_text(text: str) -> str:
    """Return *text* converted to lowercase."""
    lowered = text.lower()
    return lowered
def remove_html(text: str) -> str:
    """Strip HTML-like tags from *text*.

    A tag is ``<``, followed lazily by one or more characters that are not
    ``<``, followed by ``>``. Each match is deleted; nothing is inserted in
    its place.
    """
    tag_pattern = re.compile(r'<[^<]+?>')
    return tag_pattern.sub('', text)
def remove_url(text: str) -> str:
    """Delete URLs from *text*.

    Two shapes are recognised: ``http://`` / ``https://`` followed by a run
    of non-whitespace characters, and ``www.`` followed by a run of
    non-whitespace characters. Matching is case-sensitive.
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return url_pattern.sub('', text)
# ASCII punctuation characters that are replaced by a single space each.
_PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Translation table built once: every punctuation character maps to ' '.
_PUNCTUATION_TO_SPACE = str.maketrans(
    _PUNCTUATION_CHARS, ' ' * len(_PUNCTUATION_CHARS)
)


def remove_punctuations(text: str) -> str:
    """Replace every ASCII punctuation character in *text* with a space.

    The text length is unchanged: each punctuation character becomes exactly
    one space, so adjacent words separated only by punctuation stay apart
    (``"a-b"`` becomes ``"a b"``).
    """
    # One pass over the string instead of one full ``str.replace`` scan per
    # punctuation occurrence.
    return text.translate(_PUNCTUATION_TO_SPACE)
# Compiled once at import time; the character class lists the code points
# treated as emoji / pictographic symbols.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F926-\U0001F937"
    # NOTE: this range covers every supplementary-plane code point, so it
    # subsumes the astral ranges above; kept to preserve existing behaviour.
    "\U00010000-\U0010FFFF"
    "\u2500-\u2BEF"
    "\u2702-\u27B0"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    # These two were previously written as one range (U+24C2 to U+1F251),
    # which matched almost the whole BMP above U+24C2 (CJK, Hangul,
    # full-width forms, ...) and deleted ordinary non-emoji text.
    "\u24C2"
    "\U0001F251"
    "\u200d"  # zero-width joiner used inside emoji sequences
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+"
)


def remove_emojis(text: str) -> str:
    """Delete emoji and pictographic symbols from *text*.

    Runs of matching characters are removed without inserting a replacement.
    Characters outside the listed ranges, including non-Latin scripts in the
    Basic Multilingual Plane such as CJK, are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)
def remove_stop_words(text: str) -> str:
    """Drop English stop words from *text*.

    *text* is split on whitespace; every token that appears in NLTK's
    English stop-word list is discarded and the remaining tokens are joined
    with single spaces. The comparison is exact (case-sensitive), so callers
    that want case-insensitive filtering should lowercase the text first.

    The NLTK ``stopwords`` corpus must already be available locally
    (``get_stopwords`` downloads it).
    """
    # A set gives O(1) membership tests; the list returned by NLTK would be
    # scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop_words)
def stem_words(text: str) -> str:
    """Apply Porter stemming to every whitespace-separated token of *text*.

    Returns the stemmed tokens joined by single spaces, with no leading or
    trailing whitespace. An empty or whitespace-only input yields ``''``.
    """
    stemmer = PorterStemmer()
    # ``join`` puts separators only between tokens, so the result carries no
    # trailing space, matching what ``remove_stop_words`` returns.
    return ' '.join(stemmer.stem(word) for word in text.split())
def get_stopwords() -> None:
    """Fetch the NLTK ``stopwords`` corpus via ``nltk.download``.

    Despite the name, nothing is returned: the function only triggers the
    download so that ``stopwords.words(...)`` can be used afterwards.
    """
    corpus_name = 'stopwords'
    nltk.download(corpus_name)
def preprocess_text(text):
    """Run the full cleaning pipeline on *text* and return the result.

    The steps are applied in this fixed order: lowercase, strip HTML tags,
    strip URLs, replace punctuation with spaces, remove emojis, remove
    English stop words, stem the remaining words.
    """
    # Order matters: tags and URLs must be removed before punctuation is
    # blanked out, otherwise their delimiters would already be gone.
    pipeline = (
        lowercase_text,
        remove_html,
        remove_url,
        remove_punctuations,
        remove_emojis,
        remove_stop_words,
        stem_words,
    )
    for step in pipeline:
        text = step(text)
    return text
if __name__ == "__main__":
    # Nothing to do when executed as a script; the module only provides
    # functions for import.
    pass