misinfo / src /data_loader /download_data.py
gyigit's picture
update
54e8a79
import os
import zipfile
import gdown
from getpass import getpass
import shutil
from pathlib import Path
from src.utils.path_utils import get_project_root
# Constants
# Project root as returned by the project-local helper; it is combined with
# "/" below, so it is presumably a pathlib.Path -- TODO confirm.
PROJECT_ROOT = get_project_root()
# Where the downloaded archive is stored (download_zip skips the download
# when a file already exists here).
ZIP_FILE_PATH = str(PROJECT_ROOT / "data/raw/factify/factify_data.zip")
# Final location of the dataset; extract_zip treats the presence of
# train.csv in this folder as "already extracted".
EXTRACTION_DIR = str(PROJECT_ROOT / "data/raw/factify/extracted")
# Folder that extract_zip expects next to the archive after unpacking and
# then renames to EXTRACTION_DIR (presumably the archive's top-level folder
# name -- verify against the archive contents).
TEMP_EXTRACTION_DIR = str(PROJECT_ROOT / "data/raw/factify/public_folder")
# Google Drive URL handed to gdown.download.
GDRIVE_FILE_URL = "https://drive.google.com/uc?id=1ig7XEYU1UKDHrHgDYgqiARWvNdswgFEX"
def ensure_directories():
    """Create the folder that will hold the downloaded archive.

    The folder is the parent directory of ``ZIP_FILE_PATH``; missing
    intermediate directories are created and an existing folder is left
    untouched.
    """
    archive_dir, _archive_name = os.path.split(ZIP_FILE_PATH)
    os.makedirs(archive_dir, exist_ok=True)
def download_zip():
    """Download the ZIP file if it doesn't already exist.

    The archive is fetched from ``GDRIVE_FILE_URL`` with ``gdown`` and stored
    at ``ZIP_FILE_PATH``. A file that is already present at that path is
    trusted as-is and the download is skipped.

    Raises:
        RuntimeError: If the download call returns without producing a file
            at ``ZIP_FILE_PATH``.
    """
    if os.path.exists(ZIP_FILE_PATH):
        print(f"Zip file already exists at {ZIP_FILE_PATH}. Skipping download...")
        return

    print("Downloading zip file from Google Drive...")
    downloaded_path = gdown.download(GDRIVE_FILE_URL, ZIP_FILE_PATH, quiet=False)

    # Only report success when the archive is really on disk. Some gdown
    # releases appear to signal failure by returning None rather than raising
    # (verify against the installed version), so check both the return value
    # and the file itself instead of printing a misleading success message.
    if downloaded_path is None or not os.path.exists(ZIP_FILE_PATH):
        raise RuntimeError(
            f"Download failed: no file was written to {ZIP_FILE_PATH}"
        )

    print(f"Downloaded zip file to {ZIP_FILE_PATH}")
def extract_zip():
    """Extract the ZIP file and handle folder and file renaming.

    Steps:
      1. Skip everything when ``train.csv`` already exists in
         ``EXTRACTION_DIR``.
      2. Prompt for the archive password and unpack the archive next to it.
      3. Replace ``EXTRACTION_DIR`` with the unpacked ``TEMP_EXTRACTION_DIR``.
      4. Rename ``val.csv`` to ``test.csv`` inside ``EXTRACTION_DIR``.

    Raises:
        SystemExit: With exit code 1 when the archive rejects the password.
        NotImplementedError: When the archive uses a compression or
            encryption scheme that ``zipfile`` does not support.
    """
    train_csv_path = os.path.join(EXTRACTION_DIR, "train.csv")
    if os.path.exists(train_csv_path):
        print(f"{train_csv_path} already exists. Skipping extraction...")
        return

    print("Extracting zip file...")
    # Get password for the zip file
    password = getpass("Enter the password for the zip file: ")

    with zipfile.ZipFile(ZIP_FILE_PATH, "r") as zip_ref:
        try:
            zip_ref.extractall(
                str(PROJECT_ROOT / "data/raw/factify/"), pwd=password.encode()
            )
        except NotImplementedError:
            # NotImplementedError is a RuntimeError subclass that zipfile
            # raises for unsupported compression/encryption; it must not be
            # reported as a wrong password, so let it propagate unchanged.
            raise
        except RuntimeError:
            print("Incorrect password. Exiting...")
            # Raise SystemExit directly: the exit() builtin is injected by the
            # site module and is not available in every interpreter setup.
            raise SystemExit(1)
        else:
            print(f"Extracted files to temporary folder: {TEMP_EXTRACTION_DIR}")

    if os.path.exists(TEMP_EXTRACTION_DIR):
        # Remove the existing extracted directory only when there is a freshly
        # unpacked folder to take its place, so data is never deleted without
        # a replacement.
        if os.path.exists(EXTRACTION_DIR):
            shutil.rmtree(EXTRACTION_DIR)
            print(f"Removed existing directory: {EXTRACTION_DIR}")

        # Rename extracted folder
        os.rename(TEMP_EXTRACTION_DIR, EXTRACTION_DIR)
        print(f"Renamed folder {TEMP_EXTRACTION_DIR} to {EXTRACTION_DIR}")
    else:
        print(
            f"Expected folder {TEMP_EXTRACTION_DIR} was not found after "
            f"extraction. Leaving {EXTRACTION_DIR} untouched."
        )

    # Rename val.csv to test.csv
    val_csv_path = os.path.join(EXTRACTION_DIR, "val.csv")
    test_csv_path = os.path.join(EXTRACTION_DIR, "test.csv")
    if os.path.exists(val_csv_path):
        os.rename(val_csv_path, test_csv_path)
        print(f"Renamed {val_csv_path} to {test_csv_path}")
def main():
    """Run the data preparation pipeline: directories, download, extraction."""
    # Order matters: the target folder must exist before the download, and
    # the archive must be present before it can be extracted.
    pipeline = (ensure_directories, download_zip, extract_zip)
    for step in pipeline:
        step()


if __name__ == "__main__":
    main()