| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
| from bs4 import BeautifulSoup | |
| import re | |
def clean(df):
    """Normalize the ``title`` column of *df* and drop duplicate titles.

    Each title is processed as follows:

    1. HTML markup is stripped with BeautifulSoup (``html.parser``).
    2. Every character that is not an ASCII letter, digit or whitespace is
       removed. Nothing is inserted in its place, so ``"well-known"``
       becomes ``"wellknown"`` and ``"don't"`` becomes ``"dont"``.
    3. The text is lower-cased and split on whitespace.
    4. English stop words are discarded and the remaining words are
       lemmatized with ``WordNetLemmatizer``.
    5. The surviving words are re-joined with single spaces.

    Missing titles (``None``/``NaN``) are cleaned to an empty string instead
    of raising, and other non-text values are converted with ``str()`` first.

    Args:
        df: DataFrame with a ``title`` column. It is modified in place: the
            column is overwritten with the cleaned text and rows whose
            cleaned title duplicates an earlier row are dropped.

    Returns:
        The same ``df`` object, after the in-place modifications.
    """
    stop_words = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize
    # Compiled once here rather than looked up again for every headline.
    disallowed_chars = re.compile(r'[^a-zA-Z0-9\s]')

    def normalize(raw):
        """Return the cleaned form of a single non-missing title."""
        if not isinstance(raw, (str, bytes)):
            # BeautifulSoup only accepts text/bytes markup.
            raw = str(raw)
        text = BeautifulSoup(raw, 'html.parser').get_text()
        text = disallowed_chars.sub('', text).lower()
        # str.split() with no argument already drops leading/trailing
        # whitespace and treats runs of whitespace as one separator, so no
        # separate whitespace-collapsing pass is needed.
        return ' '.join(
            lemmatize(word) for word in text.split() if word not in stop_words
        )

    titles = df['title']
    # isna() flags None/NaN/NA entries so they never reach the HTML parser.
    df['title'] = [
        '' if is_missing else normalize(raw)
        for raw, is_missing in zip(titles, titles.isna())
    ]
    df.drop_duplicates(subset=['title'], inplace=True)
    return df