File size: 2,399 Bytes
54e8a79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import shutil
import tarfile
from typing import Optional

import requests
from tqdm import tqdm

# Remote location of the MOCHEG dataset archive fetched by this script.
DATA_URL: str = (
    "http://nlplab1.cs.vt.edu/~menglong/project/multimodal/fact_checking/MOCHEG/dataset/latest_dataset/mocheg_with_tweet_2023_03.tar.gz"
)

# Local layout: the archive is stored (and later unpacked) under RAW_DATA_DIR.
ARCHIVE_NAME: str = "mocheg_with_tweet_2023_03.tar.gz"
RAW_DATA_DIR: str = "data/raw"

# Number of bytes requested per streamed chunk while downloading (16 MiB).
CHUNK_SIZE: int = 16 << 20

# Full path of the archive on disk; the target directory is created at import
# time so later file operations can assume it exists.
archive_path: str = os.path.join(RAW_DATA_DIR, ARCHIVE_NAME)
os.makedirs(RAW_DATA_DIR, exist_ok=True)


def check_disk_space(required_space_gb: int, path: Optional[str] = None) -> bool:
    """Report whether the filesystem holding *path* has enough free space.

    Args:
        required_space_gb: Minimum free space needed, in GiB (1024**3 bytes).
        path: Directory whose filesystem is inspected. When omitted, the
            module-level ``RAW_DATA_DIR`` is used.

    Returns:
        ``True`` when at least ``required_space_gb`` GiB are free,
        ``False`` otherwise.

    Raises:
        OSError: If the path cannot be inspected (e.g. it does not exist).
    """
    target: str = RAW_DATA_DIR if path is None else path
    # shutil.disk_usage is available on both POSIX and Windows, whereas
    # os.statvfs exists only on Unix-like systems.
    free_space_gb: float = shutil.disk_usage(target).free / (1024**3)
    # ">=" so that exactly the required amount counts as sufficient, matching
    # the "at least N GB" wording used by the caller.
    return free_space_gb >= required_space_gb


def download_data() -> None:
    """Download the dataset archive if it is not already present, then extract it.

    Behaviour:
        * If ``archive_path`` already exists, a message is printed and nothing
          else happens (the archive is not re-extracted).
        * If ``check_disk_space`` reports insufficient space, a message is
          printed and the function returns without downloading.
        * Otherwise the archive is streamed to a temporary ``.part`` file next
          to ``archive_path`` and renamed into place only once the download has
          finished, after which ``extract_data`` is called.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        OSError: If the archive cannot be written to disk.
    """
    # Check if the data file already exists
    if os.path.exists(archive_path):
        print(f"Data already downloaded at {archive_path}. Skipping download.")
        return

    # Ensure enough disk space (approximate)
    required_space_gb: int = 80  # Adjust based on expected file size + extraction space
    if not check_disk_space(required_space_gb):
        print(f"Not enough disk space. At least {required_space_gb} GB required.")
        return

    # Write to a side file first: an interrupted run must not leave a
    # truncated archive at archive_path, because the existence check above
    # would then treat it as a finished download on the next run.
    partial_path: str = archive_path + ".part"
    # (connect, read) timeouts in seconds; without them a stalled server
    # would block this call indefinitely.
    request_timeout = (10, 60)

    print(f"Downloading data from {DATA_URL}...")
    try:
        # The context manager releases the connection even when an error occurs.
        with requests.get(DATA_URL, stream=True, timeout=request_timeout) as response:
            response.raise_for_status()  # Ensure the URL is accessible

            total_size: int = int(response.headers.get("content-length", 0))
            with open(partial_path, "wb") as file, tqdm(
                desc=ARCHIVE_NAME,
                # None lets tqdm show a plain counter when the size is unknown.
                total=total_size or None,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:
                        file.write(chunk)
                        progress_bar.update(len(chunk))

        # Publish the finished archive under its final name.
        os.replace(partial_path, archive_path)
    finally:
        # After a successful rename the side file no longer exists; on any
        # failure (including Ctrl-C) the incomplete data is discarded here.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Download completed: {archive_path}")

    # Extract the tar.gz file
    extract_data(archive_path)


def _iter_safe_members(tar: tarfile.TarFile, dest_root: str):
    """Yield members of *tar* whose target path stays inside *dest_root*."""
    for member in tar:
        resolved = os.path.realpath(os.path.join(dest_root, member.name))
        if os.path.commonpath([dest_root, resolved]) != dest_root:
            raise tarfile.TarError(
                f"Refusing to extract {member.name!r}: path escapes {dest_root}"
            )
        yield member


def extract_data(archive_path: str, dest_dir: Optional[str] = None) -> None:
    """Extract a gzip-compressed tar archive.

    Args:
        archive_path: Path of the ``.tar.gz`` file to unpack.
        dest_dir: Directory to extract into. When omitted, the module-level
            ``RAW_DATA_DIR`` is used.

    Raises:
        tarfile.TarError: If the archive cannot be read, or if a member would
            be written outside the destination directory.
        OSError: If the archive cannot be opened or files cannot be written.
    """
    target: str = RAW_DATA_DIR if dest_dir is None else dest_dir
    print(f"Extracting data from {archive_path}...")
    with tarfile.open(archive_path, "r:gz") as tar:
        # The archive comes from the network, so member names must not be
        # trusted: a name such as "../x" or "/etc/x" would otherwise be
        # written outside the destination directory.
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: use the built-in "data"
            # filter, which rejects unsafe members.
            tar.extractall(path=target, filter="data")
        else:
            # Older interpreters: best-effort check of each member's path
            # while streaming through the archive once. Link targets are not
            # inspected by this fallback.
            dest_root = os.path.realpath(target)
            tar.extractall(path=target, members=_iter_safe_members(tar, dest_root))
    print(f"Data extracted to {target}")


# Script entry point: start the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_data()