# Standard-library and third-party imports
import os
import glob
import argparse
import pandas as pd
from tqdm import tqdm
from pathlib import Path

# Additional preprocessing functions are imported from another module.
# If that module is unavailable, the minimal fallback sketches below are
# used instead, so this file stays readable and runnable on its own.
try:
    from preprocessing_sub_functions import *  # noqa: F401,F403
except ImportError:
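    # Fallback sketches: illustrative assumptions about what the helpers in
    # preprocessing_sub_functions do. Column names, markers, and word lists
    # here are guesses for demonstration, not the real implementations.
    import re
    import string

    def remove_meta_info(text):
        # Assumption: meta information means bracketed markers such as
        # "[deleted]" or "[removed]" embedded in the post text.
        return re.sub(r"\[(deleted|removed)\]", "", str(text))

    def remove_urls(text):
        # Strip http(s):// and www-style links.
        return re.sub(r"(https?://\S+|www\.\S+)", "", text)

    def to_lowercase(text):
        return text.lower()

    def remove_sentence_punctuation(text):
        # Drop all ASCII punctuation characters.
        return text.translate(str.maketrans("", "", string.punctuation))

    def remove_extra_whitespace(text):
        # Collapse runs of whitespace into single spaces.
        return re.sub(r"\s+", " ", text).strip()

    def tokenize(text):
        # Plain whitespace split; the real helper may use NLTK or spaCy.
        return text.split()

    def expand_contractions(tokens):
        # Tiny lookup table. Punctuation is stripped before tokenization in
        # text_preprocess, so the keys are apostrophe-less forms.
        table = {"cant": "cannot", "wont": "will not", "dont": "do not"}
        expanded = []
        for token in tokens:
            expanded.extend(table.get(token, token).split())
        return expanded

    def remove_stopwords(tokens):
        # Small built-in list; the real helper may use NLTK's stopword list.
        stopwords = {"a", "an", "and", "are", "is", "it", "of", "or", "the", "to", "you"}
        return [t for t in tokens if t not in stopwords]

    def remove_deleted(df):
        # Assumption: edits flagged in a boolean 'deleted' column are dropped.
        if "deleted" in df.columns:
            df = df[~df["deleted"].astype(bool)]
        return df

    def remove_deleted_post(df):
        # Assumption: a post whose body is the literal marker '[deleted]'
        # is dropped entirely.
        return df[df["post"] != "[deleted]"]

    def update_lastEdit(df):
        # Assumption: normalises the 'lastEdit' column name to 'last_edit',
        # the key loop_through_csvs later sorts on.
        return df.rename(columns={"lastEdit": "last_edit"})

    def convert_to_datetime(df):
        # Assumption: 'timestamp' and 'last_edit' hold parseable datetimes.
        for col in ("timestamp", "last_edit"):
            if col in df.columns:
                df[col] = pd.to_datetime(df[col])
        return df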


# This function returns a list of all CSV files in the given directory.
def get_files(path):
    return glob.glob(os.path.join(path, "*.csv"))


# This function removes meta information from the text.
# The specifics of what is removed depend on the helper 'remove_meta_info'.
def raw_preprocess(text):
    text = remove_meta_info(text)
    return text
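
# Example with the fallback sketch above (the real remove_meta_info may
# behave differently):
#   raw_preprocess("[deleted] some text") -> " some text"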


# A comprehensive text preprocessing function that applies several common steps, in order:
# - URLs are removed from the text.
# - The entire text is converted to lowercase to ensure uniformity.
# - Punctuation is stripped from the text.
# - Runs of extra whitespace are collapsed.
# - The text is tokenized (split into individual words or tokens).
# - Contractions are expanded to their full forms. Punctuation is already
#   gone at this point, so the helper must recognise apostrophe-less forms
#   such as "cant" or "wont".
# - Common words (stopwords) that add little meaning are removed.
# Finally, the cleaned tokens are joined back into a single string.
def text_preprocess(text):
    text = remove_urls(text)
    text = to_lowercase(text)
    text = remove_sentence_punctuation(text)
    text = remove_extra_whitespace(text)
    tokens = tokenize(text)
    tokens = expand_contractions(tokens)
    tokens = remove_stopwords(tokens)
    text = " ".join(tokens)
    return text
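
# Example with the fallback sketches above (the real helpers may differ):
#   text_preprocess("Check https://example.com, you can't miss it!")
#   -> "check cannot miss"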


# This function preprocesses a dataframe. Specific steps include:
# - Removing individual rows (edits) marked as 'deleted'.
# - Removing posts marked as 'deleted' in their entirety.
# - Updating the 'lastEdit' column.
# - Converting timestamps to a datetime format.
# - Renaming the 'timestamp' column to 'start_edit'.
def csv_preprocess(df):
    df = remove_deleted(df)
    df = remove_deleted_post(df)
    df = update_lastEdit(df)
    df = convert_to_datetime(df)
    df.rename(columns={"timestamp": "start_edit"}, inplace=True)
    return df
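
# With the fallback sketches above, a frame with columns
# ['post', 'timestamp', 'lastEdit'] leaves this function with columns
# ['post', 'start_edit', 'last_edit'] and datetime dtypes (an assumption;
# the real helpers define the actual schema).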


# This function processes a single CSV file:
# - Reads the CSV into a DataFrame.
# - Applies dataframe preprocessing.
# - Applies raw text preprocessing to the 'post' column.
# - Sorts the edits by the 'last_edit' column.
# - Saves the raw preprocessed data into a 'raw-data' folder.
# - Applies comprehensive text preprocessing, storing the result in a new
#   'preprocessed_post' column.
# - Saves the fully preprocessed data into a 'preprocessed-data' folder.
def loop_through_csvs(filePath):
    file = os.path.basename(filePath)
    folder = os.path.basename(os.path.dirname(filePath))
    df = pd.read_csv(filePath)
    df = csv_preprocess(df)

    # Create a directory for raw data if it doesn't exist.
    raw_folder = Path(f"raw-data/{folder}")
    raw_folder.mkdir(parents=True, exist_ok=True)

    # Apply raw preprocessing to the 'post' column of the dataframe.
    df["post"] = df["post"].apply(raw_preprocess)

    # Sort the dataframe by the 'last_edit' column.
    df.sort_values(by=["last_edit"], inplace=True)

    # Save the raw preprocessed dataframe to a CSV file.
    df.to_csv(f"{raw_folder}/{file}", index=False)

    # Create a directory for fully preprocessed data if it doesn't exist.
    clean_folder = Path(f"preprocessed-data/{folder}")
    clean_folder.mkdir(parents=True, exist_ok=True)

    # Apply the comprehensive text preprocessing to the 'post' column and store the result in a new column.
    df["preprocessed_post"] = df["post"].apply(text_preprocess)

    # Save the fully preprocessed dataframe to a CSV file.
    df.to_csv(f"{clean_folder}/{file}", index=False)

    return df
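
# Resulting layout: for an input file "data/sub1/file.csv", this writes
# "raw-data/sub1/file.csv" and "preprocessed-data/sub1/file.csv" relative
# to the current working directory.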


# A function to parse command-line arguments.
# The script expects a 'path' argument: the directory containing the raw
# CSV files to preprocess.
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="directory containing the raw CSV files")
    return vars(parser.parse_args())
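

# Example invocation (the script filename here is an assumption):
#   python preprocess.py path/to/raw-csvs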


# The main function of the script:
# - Retrieves all the CSV files from the specified directory.
# - Loops through each file, applying the preprocessing steps.
# - If processing a file fails, the error message is appended to
#   'error_log.txt' in that directory and the loop moves on to the next file.
def main(path):
    rawFiles = get_files(path)
    for filePath in tqdm(rawFiles):
        try:
            loop_through_csvs(filePath)
        except Exception as e:
            # Log the error and continue with the next file.
            with open(os.path.join(path, "error_log.txt"), "a") as f:
                f.write(f"{filePath} -- {e}\n")


if __name__ == "__main__":
    main(**parse_args())