File size: 6,715 Bytes

3327b57

# preprocessing sub functions

import re
import os
import glob
import string
import pandas as pd
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions


def remove_deleted(df):
    r"""
    Drop rows whose 'timestamp' value is made up of numeric characters only.

    The original documentation describes such rows as deleted posts in the
    crawled data (NOTE(review): confirm against the crawler output).

    Args:
        df: dataframe of crawled website data with a 'timestamp' column.

    Returns:
        A new dataframe without the numeric-timestamp rows, re-indexed from 0.
        The input dataframe is left unmodified.
    """
    # Compare on the string form of each value: with missing or non-string
    # entries, `.str.isnumeric()` yields NaN/None for those rows and the `~`
    # negation then raises TypeError. Missing values are not numeric text,
    # so those rows are kept.
    is_numeric = df['timestamp'].astype(str).str.isnumeric()
    return df[~is_numeric].reset_index(drop=True)


def remove_deleted_post(df):
    r"""
    Drop rows whose 'post' value is exactly the string 'del'.

    Only an exact match is removed; posts that merely contain 'del' as a
    substring are kept.

    Args:
        df: dataframe of crawled website data with a 'post' column.

    Returns:
        A new dataframe without the 'del' rows, re-indexed from 0.
    """
    keep_mask = df['post'].ne('del')
    kept = df.loc[keep_mask]
    return kept.reset_index(drop=True)


def update_lastEdit(df):
    r"""
    Fill missing 'last_edit' values with the row's 'timestamp' value.

    The 'last_edit' column of the passed dataframe is overwritten in place;
    the same dataframe object is returned.

    Args:
        df: dataframe of crawled website data with 'timestamp' and
            'last_edit' columns.

    Returns:
        df: the input dataframe with NaN entries of 'last_edit' replaced.
    """
    fallback = df['timestamp']
    filled = df['last_edit'].fillna(fallback)
    # Assign through .loc so the existing column is overwritten on `df` itself.
    df.loc[:, 'last_edit'] = filled
    return df


def preprocess_date(date_str):
    r"""
    Replace the word 'Today' in a date string with the current date.

    The substitution happens only when the string contains 'Today ' (with a
    trailing space); the current date is taken from the local clock and
    rendered as e.g. 'January 05, 2024'.

    Args:
        date_str: str that contains date information. Non-string values
            (e.g. None or NaN from a missing cell) are returned unchanged.

    Returns:
        The date string with 'Today' replaced by the current date, or the
        input unchanged when there is nothing to replace.
    """
    # Missing cells arrive as None/float NaN; the `in` test below would raise
    # TypeError on them, so pass them through untouched.
    if not isinstance(date_str, str):
        return date_str
    if "Today " in date_str:
        current_date = datetime.now().strftime("%B %d, %Y")
        return date_str.replace("Today", current_date)
    return date_str


def convert_datetime_with_multiple_formats(date_str, formats):
    r"""
    Parse a date string by trying each candidate format in order.

    Args:
        date_str: str that contains date information.
        formats: iterable of strftime-style format strings, tried in order.

    Returns:
        The result of pd.to_datetime for the first format that parses.

    Raises:
        ValueError: if none of the formats matches (or `formats` is empty).
    """
    for candidate in formats:
        try:
            parsed = pd.to_datetime(date_str, format=candidate)
        except ValueError:
            # This format does not fit; move on to the next one.
            continue
        else:
            return parsed
    raise ValueError(f"Time data {date_str} doesn't match provided formats")


def convert_to_datetime(df_):
    r"""
    Convert the 'timestamp' and 'last_edit' columns to date values.

    Works on a copy of the input. Each column is first passed through
    preprocess_date (to resolve 'Today'), then parsed with the known
    datetime formats, and finally reduced to its date part via `.dt.date`.

    Args:
        df_: dataframe of crawled website data.

    Returns:
        df: copy of the dataframe with date values in the 'timestamp' and
            'last_edit' columns.
    """
    date_columns = ('timestamp', 'last_edit')
    # Formats the parser tries, in order.
    datetime_formats = ["%B %d, %Y at %I:%M:%S %p", "%B %d, %Y, %I:%M:%S %p"]

    df = df_.copy()

    # First pass: resolve 'Today' in both columns before any parsing.
    for column in date_columns:
        df[column] = df[column].apply(preprocess_date)

    # Second pass: parse each column and keep only the date component.
    for column in date_columns:
        parsed = df[column].apply(
            convert_datetime_with_multiple_formats, formats=datetime_formats)
        df[column] = parsed.dt.date

    return df


def remove_urls(text):
    r"""
    Strip URL-like tokens from a text.

    Every run that starts with 'http' and continues with at least one
    non-whitespace character is deleted; surrounding whitespace is kept.

    Args:
        text: str to clean.

    Returns:
        str with the matched runs removed.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

#


def remove_extra_whitespace(text):
    r"""
    Collapse every run of whitespace into a single space.

    Leading and trailing whitespace is dropped as well.

    Args:
        text: str to normalize.

    Returns:
        str whose words are separated by exactly one space.
    """
    words = text.split()
    return ' '.join(words)


def remove_special_characters(text):
    r"""
    Delete every character that is neither a word character nor whitespace.

    Word characters follow the regex `\w` class, so letters, digits and the
    underscore are kept.

    Args:
        text: str to clean.

    Returns:
        str without the special characters.
    """
    special_chars = re.compile(r'[^\w\s]')
    return special_chars.sub('', text)


def to_lowercase(text):
    r"""
    Return a lowercase copy of the text.

    Args:
        text: str to convert.

    Returns:
        str with all cased characters lowered.
    """
    lowered = text.lower()
    return lowered


def remove_meta_info(text):
    r"""
    Strip quote headers of the form 'Quote from: <user> on <date> AM|PM'.

    The input is converted with str() first, so non-string values are
    handled as their string representation.

    Args:
        text: value whose string form should be cleaned.

    Returns:
        str with every matching quote header removed.
    """
    quote_header = re.compile(
        r'Quote from: [a-zA-Z0-9_]+ on [a-zA-Z0-9, :]+ (AM|PM)')
    return quote_header.sub('', str(text))


def tokenize(text):
    r"""
    Split a text into whitespace-separated word tokens.

    Splitting is done on any run of whitespace, so consecutive, leading or
    trailing spaces (and tabs/newlines) never produce empty-string tokens,
    and an empty or blank text yields an empty list.

    Args:
        text: str to tokenize.

    Returns:
        list of non-empty str tokens, in their original order.
    """
    # split() without a separator avoids the '' tokens that split(' ')
    # emits for repeated or edge spaces.
    return text.split()


def remove_sentence_punctuation(text):
    r"""
    Replace punctuation with spaces while leaving math symbols untouched.

    Every character of string.punctuation that is not listed among the math
    symbols is mapped to a single space (not deleted), so the text length is
    preserved.

    Args:
        text: str to clean.

    Returns:
        str with the non-math punctuation blanked out.
    """
    math_symbols = "+-×*÷/=()[]{},.<>%^"
    # Translation table: code point of each unwanted punctuation mark -> ' '.
    blank_out = {
        ord(mark): ' '
        for mark in string.punctuation
        if mark not in math_symbols
    }
    return text.translate(blank_out)


def lemmatize_text(text):
    r"""
    Lemmatize every whitespace-separated word of a text.

    Each word is passed to WordNetLemmatizer.lemmatize with its default
    arguments; the results are joined with single spaces, so the original
    whitespace layout is not preserved.

    Args:
        text: str to lemmatize.

    Returns:
        str of lemmatized words separated by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)


def replace_numbers(text, replace_with="<NUM>"):
    r"""
    Replace standalone digit runs in a text with a placeholder string.

    A digit run is replaced only when it is delimited by word boundaries, so
    digits glued to letters or underscores (e.g. 'abc123') are left alone.

    Args:
        text: str to process.
        replace_with: placeholder inserted literally for each number
            (default "<NUM>"). Backslashes in it are NOT interpreted as
            regex escapes or group references.

    Returns:
        str with every standalone number replaced by `replace_with`.
    """
    # A callable replacement is inserted verbatim; passing the string directly
    # would make re.sub process backslash escapes in it (and raise on
    # invalid ones such as '\d').
    return re.sub(r'\b\d+\b', lambda _match: replace_with, text)


def remove_stopwords(tokens):
    r"""
    Filter NLTK's English stopwords out of a token sequence.

    Tokens are compared exactly as given (no case folding), and the order of
    the remaining tokens is preserved.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list of the tokens that are not in the English stopword list.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = [
        token
        for token in tokens
        if token not in english_stopwords
    ]
    return kept_tokens


def expand_contractions(tokens):
    r"""
    Expand contractions token by token (e.g., "isn't" to "is not").

    Each token is passed through contractions.fix on its own, so one input
    token may turn into a string containing several words; the list keeps
    one entry per input token.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list with the expanded form of each token, in input order.
    """
    expanded = []
    for token in tokens:
        expanded.append(contractions.fix(token))
    return expanded


def remove_repeated_phrases(text):
    r"""
    Keep only the first occurrence of each whitespace-separated word.

    Every later duplicate is dropped regardless of where it appears (not
    only directly adjacent repeats), and the surviving words are joined
    with single spaces.
    eg. "hello hello world" -> "hello world"

    Args:
        text: str to de-duplicate.

    Returns:
        str of the unique words in first-seen order.
    """
    # dict keys are unique and keep insertion order, which gives an ordered
    # de-duplication in one pass.
    first_seen = dict.fromkeys(text.split())
    return ' '.join(first_seen)