|
import os |
|
import zipfile |
|
import gdown |
|
from getpass import getpass |
|
import shutil |
|
from pathlib import Path |
|
from src.utils.path_utils import get_project_root |
|
|
|
|
|
# Root of the repository, as reported by the project's path helper.
PROJECT_ROOT = get_project_root()

# Every Factify artefact lives under <project root>/data/raw/factify.
# NOTE(review): assumes PROJECT_ROOT is a pathlib.Path (it is combined with "/").
_FACTIFY_RAW_DIR = PROJECT_ROOT / "data" / "raw" / "factify"

# Where the downloaded archive is stored.
ZIP_FILE_PATH = str(_FACTIFY_RAW_DIR / "factify_data.zip")

# Final location of the unpacked dataset.
EXTRACTION_DIR = str(_FACTIFY_RAW_DIR / "extracted")

# Folder expected to appear after unpacking; it is renamed to EXTRACTION_DIR.
TEMP_EXTRACTION_DIR = str(_FACTIFY_RAW_DIR / "public_folder")

# Google Drive direct-download URL of the dataset archive.
GDRIVE_FILE_URL = "https://drive.google.com/uc?id=1ig7XEYU1UKDHrHgDYgqiARWvNdswgFEX"
|
|
|
|
|
def ensure_directories():
    """Create the folder that will hold the downloaded ZIP if it is missing."""
    zip_parent_dir = os.path.dirname(ZIP_FILE_PATH)
    # exist_ok=True makes repeated runs a no-op instead of an error.
    os.makedirs(zip_parent_dir, exist_ok=True)
|
|
|
|
|
def download_zip():
    """Download the dataset ZIP from Google Drive unless it is already on disk.

    The download is skipped when a file already exists at ``ZIP_FILE_PATH``.
    Otherwise ``GDRIVE_FILE_URL`` is fetched with ``gdown`` and written to
    ``ZIP_FILE_PATH``.

    Raises:
        RuntimeError: If ``gdown.download`` returns ``None`` or no file is
            present at ``ZIP_FILE_PATH`` once it has returned.
    """
    if os.path.exists(ZIP_FILE_PATH):
        print(f"Zip file already exists at {ZIP_FILE_PATH}. Skipping download...")
        return

    print("Downloading zip file from Google Drive...")
    downloaded_path = gdown.download(GDRIVE_FILE_URL, ZIP_FILE_PATH, quiet=False)

    # Some gdown releases report failure by returning None instead of raising;
    # without this check the success message below would be printed anyway and
    # the problem would only surface later as a confusing zipfile error.
    if downloaded_path is None or not os.path.exists(ZIP_FILE_PATH):
        raise RuntimeError(
            f"Failed to download zip file from {GDRIVE_FILE_URL} to {ZIP_FILE_PATH}"
        )

    print(f"Downloaded zip file to {ZIP_FILE_PATH}")
|
|
|
|
|
def extract_zip():
    """Extract the password-protected ZIP and normalise the resulting layout.

    Steps:
        1. Return immediately if ``train.csv`` already exists in
           ``EXTRACTION_DIR``.
        2. Prompt for the archive password and extract every member into
           ``data/raw/factify/`` under ``PROJECT_ROOT``.
        3. If ``TEMP_EXTRACTION_DIR`` exists after extraction, replace
           ``EXTRACTION_DIR`` with it (any previous ``EXTRACTION_DIR`` is
           deleted first).
        4. Rename ``val.csv`` to ``test.csv`` inside ``EXTRACTION_DIR`` when
           ``val.csv`` is present.

    Raises:
        SystemExit: With status 1 when ``extractall`` raises ``RuntimeError``,
            which is reported to the user as an incorrect password.
    """
    train_csv_path = os.path.join(EXTRACTION_DIR, "train.csv")
    if os.path.exists(train_csv_path):
        print(f"{train_csv_path} already exists. Skipping extraction...")
        return

    print("Extracting zip file...")
    password = getpass("Enter the password for the zip file: ")

    with zipfile.ZipFile(ZIP_FILE_PATH, "r") as zip_ref:
        try:
            zip_ref.extractall(
                str(PROJECT_ROOT / "data/raw/factify/"), pwd=password.encode()
            )
        except RuntimeError:
            print("Incorrect password. Exiting...")
            # Raise SystemExit directly: the bare ``exit`` helper is injected by
            # the ``site`` module and is not guaranteed to exist (e.g. with
            # ``python -S`` or in frozen executables).
            raise SystemExit(1) from None
        print(f"Extracted files to temporary folder: {TEMP_EXTRACTION_DIR}")

    if os.path.exists(TEMP_EXTRACTION_DIR):
        # Only discard the previous extraction once its replacement is known
        # to exist; otherwise an unexpected archive layout would delete the
        # old data and leave nothing in its place.
        if os.path.exists(EXTRACTION_DIR):
            shutil.rmtree(EXTRACTION_DIR)
            print(f"Removed existing directory: {EXTRACTION_DIR}")
        os.rename(TEMP_EXTRACTION_DIR, EXTRACTION_DIR)
        print(f"Renamed folder {TEMP_EXTRACTION_DIR} to {EXTRACTION_DIR}")
    else:
        print(
            f"Warning: {TEMP_EXTRACTION_DIR} not found after extraction; "
            f"leaving {EXTRACTION_DIR} untouched."
        )

    # The archive's validation split is used as the test split downstream.
    # NOTE(review): intent inferred from the rename itself; confirm with the
    # dataset consumers.
    val_csv_path = os.path.join(EXTRACTION_DIR, "val.csv")
    test_csv_path = os.path.join(EXTRACTION_DIR, "test.csv")
    if os.path.exists(val_csv_path):
        os.rename(val_csv_path, test_csv_path)
        print(f"Renamed {val_csv_path} to {test_csv_path}")
|
|
|
|
|
def main():
    """Run the pipeline: prepare directories, download the ZIP, extract it."""
    # Order matters: the target folder must exist before the download, and
    # the archive must be on disk before extraction.
    pipeline_steps = (ensure_directories, download_zip, extract_zip)
    for run_step in pipeline_steps:
        run_step()


# Execute the pipeline only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|
|