cgrumbach's picture
Upload 59 files
3327b57 verified
import os
import pandas as pd
from pathlib import Path
from tqdm import tqdm
# Function to process and sort CSV files within a given folder
def process_csvs(folder_path, new_folder_name):
# Extracting the name of the board from the folder path
board = os.path.basename(folder_path)
# Creating a new directory to store the sorted CSV files
sorted_folder = Path(new_folder_name)
sorted_folder.mkdir(parents=True, exist_ok=True)
# Retrieving all CSV files from the given folder path
all_files = [
os.path.join(folder_path, file)
for file in os.listdir(folder_path)
if file.endswith(".csv")
]
# Reading each CSV file into a dataframe
list_of_dataframes = [pd.read_csv(file) for file in all_files]
# Combining all dataframes into a single dataframe
combined_df = pd.concat(list_of_dataframes, ignore_index=True)
# Sorting the combined dataframe based on the "last_edit" column
combined_df = combined_df.sort_values(by="last_edit")
# Splitting the sorted dataframe into chunks of 10,000 rows each
num_chunks = len(combined_df) // 10000 + (1 if len(combined_df) % 10000 else 0)
chunks = [combined_df.iloc[i * 10000 : (i + 1) * 10000] for i in range(num_chunks)]
# Saving each chunk as a separate CSV with a filename based on date ranges
for idx, chunk in tqdm(enumerate(chunks)):
start_date = pd.to_datetime(chunk["last_edit"].iloc[0]).strftime("%d%m%y")
end_date = pd.to_datetime(chunk["last_edit"].iloc[-1]).strftime("%d%m%y")
filename = f"BitcoinForum_{board}_{start_date}_to_{end_date}.csv"
chunk.to_csv(os.path.join(sorted_folder, filename), index=False)
folder_paths = [
"./raw-data",
"./preprocessed-data",
]
# Iterating over each folder path and processing its CSV files
for folder_path in folder_paths:
folder_name = os.path.basename(folder_path)
new_folder_name = f"sorted-{folder_name}"
for folder in tqdm(os.listdir(folder_path)):
if os.path.isdir(os.path.join(folder_path, folder)):
process_csvs(os.path.join(folder_path, folder), new_folder_name)