|
import os |
|
import pandas as pd |
|
from pathlib import Path |
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
|
def process_csvs(folder_path, new_folder_name, chunk_size=10000):
    """Merge all CSVs of one board, sort them by ``last_edit`` and re-split them.

    Every ``*.csv`` file directly inside ``folder_path`` is read with pandas,
    the rows are concatenated and sorted on the ``last_edit`` column, and the
    result is written to ``new_folder_name`` in consecutive slices of at most
    ``chunk_size`` rows. Each output file is named
    ``BitcoinForum_<board>_<start>_to_<end>.csv`` where ``<board>`` is the last
    component of ``folder_path`` and the dates are the ``last_edit`` values of
    the slice's first and last row, formatted as ``ddmmyy``.

    Args:
        folder_path: Directory containing the CSV files of a single board.
        new_folder_name: Output directory; created (with parents) if missing.
        chunk_size: Maximum number of rows per output file. Defaults to 10000.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
        KeyError: If the combined data has no ``last_edit`` column.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    board = os.path.basename(folder_path)

    sorted_folder = Path(new_folder_name)
    sorted_folder.mkdir(parents=True, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting the names makes
    # the concatenation order (and therefore the output) reproducible.
    all_files = [
        os.path.join(folder_path, file)
        for file in sorted(os.listdir(folder_path))
        if file.endswith(".csv")
    ]

    # pd.concat raises ValueError on an empty sequence, which would abort the
    # whole run for a board folder that simply holds no CSV files.
    if not all_files:
        return

    combined_df = pd.concat(
        (pd.read_csv(file) for file in all_files), ignore_index=True
    )

    # A stable sort keeps rows with equal ``last_edit`` in file order, so
    # repeated runs split the data identically.
    combined_df = combined_df.sort_values(by="last_edit", kind="stable")

    chunks = [
        combined_df.iloc[start : start + chunk_size]
        for start in range(0, len(combined_df), chunk_size)
    ]

    # Several chunks can begin and end on the same calendar days; remember the
    # names already written so a later chunk never overwrites an earlier one.
    used_names = set()

    # Wrapping the list (not the enumerate iterator) lets tqdm show a total.
    for idx, chunk in enumerate(tqdm(chunks)):
        start_date = pd.to_datetime(chunk["last_edit"].iloc[0]).strftime("%d%m%y")
        end_date = pd.to_datetime(chunk["last_edit"].iloc[-1]).strftime("%d%m%y")
        stem = f"BitcoinForum_{board}_{start_date}_to_{end_date}"
        filename = f"{stem}.csv"
        if filename in used_names:
            # Disambiguate with the chunk index only when a clash occurs, so
            # non-colliding files keep their original names.
            filename = f"{stem}_{idx}.csv"
        used_names.add(filename)
        chunk.to_csv(sorted_folder / filename, index=False)
|
|
|
|
|
# Root directories whose immediate sub-directories are each handed to
# process_csvs.
folder_paths = [
    "./raw-data",
    "./preprocessed-data",
]

for root in folder_paths:
    # The output directory is named after the root's last path component,
    # e.g. "./raw-data" -> "sorted-raw-data".
    destination = "sorted-" + os.path.basename(root)
    for entry in tqdm(os.listdir(root)):
        candidate = os.path.join(root, entry)
        # Only directories are processed; loose files in the root are skipped.
        if not os.path.isdir(candidate):
            continue
        process_csvs(candidate, destination)
|
|