|
|
|
|
|
import re |
|
import os |
|
import glob |
|
import string |
|
import pandas as pd |
|
from datetime import datetime |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
import contractions |
|
|
|
|
|
def remove_deleted(df):
    """
    Drop rows whose 'timestamp' is a purely numeric string.

    Rows with an all-digit timestamp are treated as deleted posts and
    removed. Rows whose timestamp is missing do not match that marker and
    are kept. The input dataframe is not modified.

    Args:
        df: dataframe of crawled website data with a string 'timestamp'
            column.

    Returns:
        A new dataframe without the deleted posts, re-indexed from 0.
    """
    # str.isnumeric() yields a missing value for a missing timestamp; on an
    # object-dtype result the unary ~ would then raise TypeError. Comparing
    # with True (and filling any remaining NA) gives a clean boolean mask.
    is_deleted = df['timestamp'].str.isnumeric().eq(True).fillna(False)
    return df[~is_deleted].reset_index(drop=True)
|
|
|
|
|
def remove_deleted_post(df):
    """
    Drop rows whose 'post' value is exactly the string 'del'.

    This covers deleted posts that are marked in the post body rather than
    in the timestamp.

    Args:
        df: dataframe of crawled website data with a 'post' column.

    Returns:
        dataframe without the rows marked 'del', re-indexed from 0.
    """
    not_deleted = df['post'].ne('del')
    surviving = df.loc[not_deleted]
    return surviving.reset_index(drop=True)
|
|
|
|
|
def update_lastEdit(df):
    """
    Fill missing 'last_edit' values with the row's 'timestamp'.

    The dataframe is updated in place and also returned.

    Args:
        df: dataframe of crawled website data with 'last_edit' and
            'timestamp' columns.

    Returns:
        The same dataframe object, with gaps in 'last_edit' filled.
    """
    edited = df['last_edit']
    posted = df['timestamp']
    # Assign through .loc so the existing column is overwritten in place.
    df.loc[:, 'last_edit'] = edited.fillna(posted)
    return df
|
|
|
|
|
def preprocess_date(date_str):
    """
    Replace the word 'Today' in a date string with the current date.

    The substitution only happens when the string contains 'Today ' (with a
    trailing space); otherwise the input is returned unchanged. The current
    date is rendered as '%B %d, %Y', e.g. 'May 01, 2020'.

    Args:
        date_str: str that contains date information.

    Returns:
        str with every 'Today' replaced by the current date, or the
        original string when 'Today ' is absent.
    """
    if "Today " not in date_str:
        return date_str

    today_label = datetime.now().strftime("%B %d, %Y")
    return date_str.replace("Today", today_label)
|
|
|
|
|
def convert_datetime_with_multiple_formats(date_str, formats):
    """
    Parse a date string by trying several formats in order.

    Args:
        date_str: str that contains date information.
        formats: list of candidate strftime-style formats, tried in order.

    Returns:
        The result of pd.to_datetime for the first format that parses.

    Raises:
        ValueError: if none of the formats match date_str.
    """
    for candidate in formats:
        try:
            parsed = pd.to_datetime(date_str, format=candidate)
        except ValueError:
            # This format does not fit; move on to the next one.
            continue
        break
    else:
        # The loop finished without a successful parse.
        raise ValueError(f"Time data {date_str} doesn't match provided formats")
    return parsed
|
|
|
|
|
def convert_to_datetime(df_):
    """
    Convert the 'timestamp' and 'last_edit' columns to date objects.

    Works on a copy of the input. Each column is first passed through
    preprocess_date, then parsed with
    convert_datetime_with_multiple_formats, and finally reduced to its date
    part via `.dt.date`.

    Args:
        df_: dataframe of crawled website data.

    Returns:
        df: copy of the dataframe with dates in the 'timestamp' and
            'last_edit' columns.
    """
    date_columns = ('timestamp', 'last_edit')
    datetime_formats = ["%B %d, %Y at %I:%M:%S %p", "%B %d, %Y, %I:%M:%S %p"]

    df = df_.copy()

    # Resolve 'Today' in both columns before any parsing starts.
    for column in date_columns:
        df[column] = df[column].apply(preprocess_date)

    for column in date_columns:
        parsed = df[column].apply(
            convert_datetime_with_multiple_formats, formats=datetime_formats)
        df[column] = parsed.dt.date

    return df
|
|
|
|
|
def remove_urls(text):
    """
    Strip URLs from a text.

    Every run of non-whitespace characters that starts with 'http' is
    deleted; the whitespace around it is left untouched.

    Args:
        text: str to clean.

    Returns:
        str without the matched URLs.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)
|
|
|
|
|
|
|
|
|
def remove_extra_whitespace(text):
    """
    Collapse whitespace in a text.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace becomes a single space.

    Args:
        text: str to normalise.

    Returns:
        str with words separated by exactly one space.
    """
    words = text.split()
    single_spaced = ' '.join(words)
    return single_spaced
|
|
|
|
|
def remove_special_characters(text):
    """
    Delete every character that is neither a word character nor whitespace.

    Word characters include letters, digits and the underscore, so those
    are all preserved.

    Args:
        text: str to clean.

    Returns:
        str containing only word characters and whitespace.
    """
    non_word_pattern = re.compile(r'[^\w\s]')
    return non_word_pattern.sub('', text)
|
|
|
|
|
def to_lowercase(text):
    """
    Lowercase a text.

    Args:
        text: str to convert.

    Returns:
        The result of text.lower().
    """
    lowered = text.lower()
    return lowered
|
|
|
|
|
def remove_meta_info(text):
    """
    Remove quote headers of the form 'Quote from: <user> on <date> AM|PM'.

    The input is converted with str() first, so non-string values are
    accepted.

    Args:
        text: value holding the post text; coerced to str.

    Returns:
        str with every quote header removed.
    """
    text = str(text)
    # The date part is matched lazily so the header ends at the first
    # ' AM'/' PM'. A greedy match would run on to the last ' AM'/' PM' on
    # the line and delete post content that follows the header.
    quote_header = r'Quote from: [a-zA-Z0-9_]+ on [a-zA-Z0-9, :]+? (?:AM|PM)'
    return re.sub(quote_header, '', text)
|
|
|
|
|
def tokenize(text):
    """
    Split a text into tokens on single space characters.

    Only the space character separates tokens, so consecutive spaces
    produce empty-string tokens and other whitespace stays inside tokens.

    Args:
        text: str to split.

    Returns:
        list of str tokens.
    """
    separator = ' '
    tokens = text.split(separator)
    return tokens
|
|
|
|
|
def remove_sentence_punctuation(text):
    """
    Replace sentence punctuation with spaces while keeping math symbols.

    Every character in string.punctuation that is not one of the listed
    math symbols is turned into a single space, so the text length is
    unchanged.

    Args:
        text: str to process.

    Returns:
        str with non-math punctuation replaced by spaces.
    """
    math_symbols = "+-×*÷/=()[]{},.<>%^"
    # Map each punctuation mark that is not a math symbol to a space.
    to_space = {
        mark: ' '
        for mark in string.punctuation
        if mark not in math_symbols
    }
    return text.translate(str.maketrans(to_space))
|
|
|
|
|
def lemmatize_text(text):
    """
    Lemmatize every whitespace-separated word of a text.

    Each word is passed to WordNetLemmatizer.lemmatize with its default
    arguments, and the results are joined with single spaces.

    Args:
        text: str to lemmatize.

    Returns:
        str of lemmatized words separated by single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, text.split())
    return ' '.join(lemmas)
|
|
|
|
|
def replace_numbers(text, replace_with="<NUM>"):
    """
    Replace standalone runs of digits in a text with a placeholder.

    A run of digits is replaced only when it is delimited by word
    boundaries, so digits embedded in a word (e.g. 'abc123') are left
    alone, while the two sides of a decimal such as '3.14' are replaced
    separately.

    Args:
        text: str to process.
        replace_with: str inserted verbatim for every number
            (default "<NUM>").

    Returns:
        str with each standalone number replaced by replace_with.
    """
    # A callable replacement makes re.sub insert the string literally.
    # Passing the string directly would treat its backslashes as template
    # escapes, which raises re.error or substitutes the wrong text.
    return re.sub(r'\b\d+\b', lambda _match: replace_with, text)
|
|
|
|
|
def remove_stopwords(tokens):
    """
    Filter English stopwords out of a list of tokens.

    Tokens are compared as-is against the NLTK English stopword list, so
    the match is exact.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list of the tokens that are not in the stopword list, in their
        original order.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept
|
|
|
|
|
def expand_contractions(tokens):
    """
    Expand contractions in each token (e.g. "isn't" to "is not").

    Every token is passed individually to contractions.fix.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list with one expanded string per input token.
    """
    expanded = map(contractions.fix, tokens)
    return list(expanded)
|
|
|
|
|
def remove_repeated_phrases(text):
    """
    Remove repeated words from a text, keeping the first occurrence.

    The text is split on whitespace and every word that has already
    appeared earlier is dropped, whether or not the repeat is adjacent.

    eg. "hello hello world" -> "hello world"

    Args:
        text: str to de-duplicate.

    Returns:
        str of the unique words in first-seen order, joined by single
        spaces.
    """
    # dict keys are unique and keep insertion order.
    first_occurrences = dict.fromkeys(text.split())
    return ' '.join(first_occurrences)
|
|