# Importing standard libraries
import os
import glob
import argparse
import pandas as pd
from tqdm import tqdm
from pathlib import Path
# Additional preprocessing functions are imported from another module.
from preprocessing_sub_functions import *
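# Names expected from that module (all of them are called below):
# remove_meta_info, remove_urls, to_lowercase, remove_sentence_punctuation,
# remove_extra_whitespace, tokenize, expand_contractions, remove_stopwords,
# remove_deleted, remove_deleted_post, update_lastEdit, convert_to_datetime.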
# This function returns a list of all CSV files in the given directory path.
def get_files(path):
    return glob.glob(path + "/*.csv")
# This function aims to remove meta information from the text.
# The specifics of what meta information is removed depends on the function 'remove_meta_info'.
def raw_preprocess(text):
    text = remove_meta_info(text)
    return text
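# A minimal, hypothetical sketch of what remove_meta_info might look like
# (the real implementation lives in preprocessing_sub_functions); for
# instance, it could strip metadata lines with a regex:
#
#     import re
#     def remove_meta_info(text):
#         # Drop lines such as "Edited by ..." or "Posted at ..." (illustrative patterns).
#         return re.sub(r"^(Edited by|Posted at).*$", "", text, flags=re.MULTILINE)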
# A comprehensive text preprocessing function that applies several common preprocessing steps:
# - URLs are removed from the text.
# - The entire text is converted to lowercase to ensure uniformity.
# - Punctuation is stripped from the text.
# - Extra whitespaces (if any) are removed.
# - The text is tokenized (split into individual words or tokens).
# - Contractions (like "can't" or "won't") are expanded to their full forms.
# - Common words (stopwords) that don't add significant meaning are removed.
# Finally, the cleaned tokens are joined back into a string.
def text_preprocess(text):
    text = remove_urls(text)
    text = to_lowercase(text)
    text = remove_sentence_punctuation(text)
    text = remove_extra_whitespace(text)
    tokens = tokenize(text)
    tokens = expand_contractions(tokens)
    tokens = remove_stopwords(tokens)
    text = " ".join(tokens)
    return text
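# Illustrative walk-through (assuming an NLTK-style stopword list and that
# remove_sentence_punctuation preserves apostrophes, so contractions survive
# until expand_contractions):
#
#     text_preprocess("I can't wait! See https://example.com")
#     # URL removed, lowercased, punctuation stripped, tokenized,
#     # "can't" expanded to "can not", stopwords dropped -> "wait see"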
# This function preprocesses a dataframe.
# Specific preprocessing steps include:
# - Removing rows marked as 'deleted' (remove_deleted).
# - Removing posts marked as 'deleted' (remove_deleted_post).
# - Updating the 'lastEdit' column.
# - Converting timestamps to a datetime format.
# - Renaming the 'timestamp' column to 'start_edit'.
def csv_preprocess(df):
    df = remove_deleted(df)
    df = remove_deleted_post(df)
    df = update_lastEdit(df)
    df = convert_to_datetime(df)
    df.rename(columns={"timestamp": "start_edit"}, inplace=True)
    return df
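# Hypothetical sketch of one of those helpers, convert_to_datetime, assuming
# the 'timestamp' column holds Unix epoch seconds (the real version lives in
# preprocessing_sub_functions):
#
#     def convert_to_datetime(df):
#         df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
#         return df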
# This function processes individual CSV files:
# - Reads the CSV into a DataFrame.
# - Applies dataframe preprocessing.
# - Applies raw text preprocessing to the 'post' column.
# - Saves the raw preprocessed data into a 'raw-data' folder.
# - Applies comprehensive text preprocessing to the 'post' column.
# - Saves the fully preprocessed data into a 'preprocessed-data' folder.
def loop_through_csvs(filePath):
    file = os.path.basename(filePath)
    folder = os.path.basename(os.path.dirname(filePath))
    df = pd.read_csv(filePath)
    df = csv_preprocess(df)
    # Create a directory for raw data if it doesn't exist.
    raw_folder = Path(f"raw-data/{folder}")
    raw_folder.mkdir(parents=True, exist_ok=True)
    # Apply raw preprocessing to the 'post' column of the dataframe.
    df["post"] = df["post"].apply(raw_preprocess)
    # Sort the dataframe by the 'last_edit' column.
    df.sort_values(by=["last_edit"], inplace=True)
    # Save the raw preprocessed dataframe to a CSV file.
    df.to_csv(f"{raw_folder}/{file}", index=False)
    # Create a directory for fully preprocessed data if it doesn't exist.
    clean_folder = Path(f"preprocessed-data/{folder}")
    clean_folder.mkdir(parents=True, exist_ok=True)
    # Apply the comprehensive text preprocessing to the 'post' column and
    # store the result in a new column.
    df["preprocessed_post"] = df["post"].apply(text_preprocess)
    # Sort the dataframe by the 'last_edit' column again.
    df.sort_values(by=["last_edit"], inplace=True)
    # Save the fully preprocessed dataframe to a CSV file.
    df.to_csv(f"{clean_folder}/{file}", index=False)
    return df
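# Resulting layout: for an input file "some/dir/topic/posts.csv", this writes
# "raw-data/topic/posts.csv" and "preprocessed-data/topic/posts.csv".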
# A function to parse command-line arguments.
# The script expects a 'path' argument which indicates the directory where the raw CSV files are located.
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="path to the directory containing the raw CSV files")
    return vars(parser.parse_args())
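# Example invocation (assuming this script is saved as preprocess.py):
#
#     python preprocess.py path/to/raw-csvs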
# The main function of the script:
# - It retrieves all the CSV files from the specified directory.
# - Loops through each file, applying the preprocessing steps.
# - If an error occurs during processing, the error message is appended to an 'error_log.txt' file.
def main(path):
    rawFiles = get_files(path)
    for filePath in tqdm(rawFiles):
        try:
            df = loop_through_csvs(filePath)
        except Exception as e:
            # If an error occurs, log the error message to a file.
            with open(f"{path}/error_log.txt", "a") as f:
                f.write(f"{filePath} -- {e}\n")
            continue
if __name__ == "__main__":
    main(**parse_args())