# Standard-library and third-party imports
import os
import glob
import argparse
import pandas as pd
from tqdm import tqdm
from pathlib import Path

# Additional preprocessing functions are imported from another module.
# If that module is unavailable, the minimal fallback sketches below are
# used instead, so this file stays readable and runnable on its own.
try:
    from preprocessing_sub_functions import *  # noqa: F401,F403
except ImportError:
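    # Fallback sketches: illustrative assumptions about what the helpers in
    # preprocessing_sub_functions do. Column names, markers, and word lists
    # here are guesses for demonstration, not the real implementations.
    import re
    import string

    def remove_meta_info(text):
        # Assumption: meta information means bracketed markers such as
        # "[deleted]" or "[removed]" embedded in the post text.
        return re.sub(r"\[(deleted|removed)\]", "", str(text))

    def remove_urls(text):
        # Strip http(s):// and www-style links.
        return re.sub(r"(https?://\S+|www\.\S+)", "", text)

    def to_lowercase(text):
        return text.lower()

    def remove_sentence_punctuation(text):
        # Drop all ASCII punctuation characters.
        return text.translate(str.maketrans("", "", string.punctuation))

    def remove_extra_whitespace(text):
        # Collapse runs of whitespace into single spaces.
        return re.sub(r"\s+", " ", text).strip()

    def tokenize(text):
        # Plain whitespace split; the real helper may use NLTK or spaCy.
        return text.split()

    def expand_contractions(tokens):
        # Tiny lookup table. Punctuation is stripped before tokenization in
        # text_preprocess, so the keys are apostrophe-less forms.
        table = {"cant": "cannot", "wont": "will not", "dont": "do not"}
        expanded = []
        for token in tokens:
            expanded.extend(table.get(token, token).split())
        return expanded

    def remove_stopwords(tokens):
        # Small built-in list; the real helper may use NLTK's stopword list.
        stopwords = {"a", "an", "and", "are", "is", "it", "of", "or", "the", "to", "you"}
        return [t for t in tokens if t not in stopwords]

    def remove_deleted(df):
        # Assumption: edits flagged in a boolean 'deleted' column are dropped.
        if "deleted" in df.columns:
            df = df[~df["deleted"].astype(bool)]
        return df

    def remove_deleted_post(df):
        # Assumption: a post whose body is the literal marker '[deleted]'
        # is dropped entirely.
        return df[df["post"] != "[deleted]"]

    def update_lastEdit(df):
        # Assumption: normalises the 'lastEdit' column name to 'last_edit',
        # the key loop_through_csvs later sorts on.
        return df.rename(columns={"lastEdit": "last_edit"})

    def convert_to_datetime(df):
        # Assumption: 'timestamp' and 'last_edit' hold parseable datetimes.
        for col in ("timestamp", "last_edit"):
            if col in df.columns:
                df[col] = pd.to_datetime(df[col])
        return df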


# This function returns a list of all CSV files in the given directory.
def get_files(path):
    return glob.glob(os.path.join(path, "*.csv"))


# This function removes meta information from the text.
# The specifics of what is removed depend on the helper 'remove_meta_info'.
def raw_preprocess(text):
    text = remove_meta_info(text)
    return text
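
# Example with the fallback sketch above (the real remove_meta_info may
# behave differently):
#   raw_preprocess("[deleted] some text") -> " some text"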


# A comprehensive text preprocessing function that applies several common steps, in order:
# - URLs are removed from the text.
# - The entire text is converted to lowercase to ensure uniformity.
# - Punctuation is stripped from the text.
# - Runs of extra whitespace are collapsed.
# - The text is tokenized (split into individual words or tokens).
# - Contractions are expanded to their full forms. Punctuation is already
#   gone at this point, so the helper must recognise apostrophe-less forms
#   such as "cant" or "wont".
# - Common words (stopwords) that add little meaning are removed.
# Finally, the cleaned tokens are joined back into a single string.
def text_preprocess(text):
    text = remove_urls(text)
    text = to_lowercase(text)
    text = remove_sentence_punctuation(text)
    text = remove_extra_whitespace(text)
    tokens = tokenize(text)
    tokens = expand_contractions(tokens)
    tokens = remove_stopwords(tokens)
    text = " ".join(tokens)
    return text
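
# Example with the fallback sketches above (the real helpers may differ):
#   text_preprocess("Check https://example.com, you can't miss it!")
#   -> "check cannot miss"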


# This function preprocesses a dataframe. Specific steps include:
# - Removing individual rows (edits) marked as 'deleted'.
# - Removing posts marked as 'deleted' in their entirety.
# - Updating the 'lastEdit' column.
# - Converting timestamps to a datetime format.
# - Renaming the 'timestamp' column to 'start_edit'.
def csv_preprocess(df):
    df = remove_deleted(df)
    df = remove_deleted_post(df)
    df = update_lastEdit(df)
    df = convert_to_datetime(df)
    df.rename(columns={"timestamp": "start_edit"}, inplace=True)
    return df
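
# With the fallback sketches above, a frame with columns
# ['post', 'timestamp', 'lastEdit'] leaves this function with columns
# ['post', 'start_edit', 'last_edit'] and datetime dtypes (an assumption;
# the real helpers define the actual schema).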


# This function processes a single CSV file:
# - Reads the CSV into a DataFrame.
# - Applies dataframe preprocessing.
# - Applies raw text preprocessing to the 'post' column.
# - Sorts the edits by the 'last_edit' column.
# - Saves the raw preprocessed data into a 'raw-data' folder.
# - Applies comprehensive text preprocessing, storing the result in a new
#   'preprocessed_post' column.
# - Saves the fully preprocessed data into a 'preprocessed-data' folder.
def loop_through_csvs(filePath):
    file = os.path.basename(filePath)
    folder = os.path.basename(os.path.dirname(filePath))
    df = pd.read_csv(filePath)
    df = csv_preprocess(df)

    # Create a directory for raw data if it doesn't exist.
    raw_folder = Path(f"raw-data/{folder}")
    raw_folder.mkdir(parents=True, exist_ok=True)

    # Apply raw preprocessing to the 'post' column of the dataframe.
    df["post"] = df["post"].apply(raw_preprocess)

    # Sort the dataframe by the 'last_edit' column.
    df.sort_values(by=["last_edit"], inplace=True)

    # Save the raw preprocessed dataframe to a CSV file.
    df.to_csv(f"{raw_folder}/{file}", index=False)

    # Create a directory for fully preprocessed data if it doesn't exist.
    clean_folder = Path(f"preprocessed-data/{folder}")
    clean_folder.mkdir(parents=True, exist_ok=True)

    # Apply the comprehensive text preprocessing to the 'post' column and store the result in a new column.
    df["preprocessed_post"] = df["post"].apply(text_preprocess)

    # Save the fully preprocessed dataframe to a CSV file.
    df.to_csv(f"{clean_folder}/{file}", index=False)

    return df
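
# Resulting layout: for an input file "data/sub1/file.csv", this writes
# "raw-data/sub1/file.csv" and "preprocessed-data/sub1/file.csv" relative
# to the current working directory.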


# A function to parse command-line arguments.
# The script expects a 'path' argument: the directory containing the raw
# CSV files to preprocess.
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="directory containing the raw CSV files")
    return vars(parser.parse_args())
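

# Example invocation (the script filename here is an assumption):
#   python preprocess.py path/to/raw-csvs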


# The main function of the script:
# - Retrieves all the CSV files from the specified directory.
# - Loops through each file, applying the preprocessing steps.
# - If processing a file fails, the error message is appended to
#   'error_log.txt' in that directory and the loop moves on to the next file.
def main(path):
    rawFiles = get_files(path)
    for filePath in tqdm(rawFiles):
        try:
            loop_through_csvs(filePath)
        except Exception as e:
            # Log the error and continue with the next file.
            with open(os.path.join(path, "error_log.txt"), "a") as f:
                f.write(f"{filePath} -- {e}\n")


if __name__ == "__main__":
    main(**parse_args())