# misinfo/src/data_loader/download_images.py
# (header residue from the hosting page: "gyigit's picture / update / 54e8a79")
import os
import argparse
import pandas as pd
import requests
import json
import io
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
from PIL import Image
from src.utils.data_utils import HEADERS
from src.utils.path_utils import get_project_root
# Constants
# Repository root, resolved via the shared path helper (works regardless of CWD).
PROJECT_ROOT = get_project_root()
# Directory holding the extracted Factify CSVs (train.csv / test.csv and stats JSONs).
EXTRACTION_DIR = str(PROJECT_ROOT / "data/raw/factify/extracted")
# Downloaded images are stored per-split under this folder (images/train, images/test).
IMAGES_DIR = os.path.join(EXTRACTION_DIR, "images")
def ensure_directories(images_folder):
"""Ensure the image directory exists."""
os.makedirs(images_folder, exist_ok=True)
def download_image(url, save_path):
    """Download a single image to *save_path*, skipping files that already exist.

    The response body is decoded with Pillow, converted to RGB and re-saved,
    so corrupt or non-image responses are rejected here rather than written
    to disk as broken files.

    Args:
        url: HTTP(S) URL of the image to fetch.
        save_path: Destination file path (callers pass ``*.jpg`` names).

    Returns:
        bool: True if the file is present (already existed or was saved),
        False if the download or decode failed.
    """
    # Check if the image already exists so re-runs are resumable.
    if os.path.exists(save_path):
        print(f"Image already exists: {save_path}")
        return True
    # Some hosts reject the default requests User-Agent; impersonate a browser.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
        )
    }
    try:
        # Fix: the original passed stream=True and then read ``response.content``
        # anyway, which defeats streaming; a plain GET reads the whole body and
        # releases the connection deterministically.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an error for HTTP issues
        img = Image.open(io.BytesIO(response.content))
        img = img.convert("RGB")  # Ensure the image is in RGB format (JPEG has no alpha)
        img.save(save_path)
        print(f"Downloaded and saved image: {save_path}")
        return True
    except Exception as e:
        # Deliberately broad: one bad URL must not abort the whole crawl.
        print(f"Failed to download image from {url}: {e}")
        return False
def process_image(row, images_folder, stats, dataset_name):
    """Download the claim and evidence images for one dataset row.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with "id", "category",
            "claim_image" and "evidence_image" fields.
        images_folder: Directory receiving "<id>_claim.jpg" / "<id>_evidence.jpg".
        stats: Shared, mutated-in-place stats dict with "successful_claim",
            "successful_evidence" and per-category counters under "categories".
        dataset_name: Currently unused; kept for interface stability.
    """
    file_id = str(row["id"])
    category = row.get("category", "Unknown")
    claim_image_url = row.get("claim_image", "")
    evidence_image_url = row.get("evidence_image", "")
    # Fix: pandas represents missing cells as NaN, a *truthy* float, so the
    # original truthiness checks below would try to download "nan". Treat
    # anything that is not a non-empty string as "no URL".
    if not isinstance(claim_image_url, str):
        claim_image_url = ""
    if not isinstance(evidence_image_url, str):
        evidence_image_url = ""
    # Ensure category stats exist (stats["categories"] may be a plain dict).
    stats["categories"].setdefault(
        category,
        {
            "total_claim": 0,
            "successful_claim": 0,
            "total_evidence": 0,
            "successful_evidence": 0,
        },
    )
    # Totals count every row (matching the original semantics), not just rows
    # that actually carry a URL.
    # NOTE(review): these read-modify-write increments are shared across the
    # thread pool without a lock — presumably accepted as "close enough" for
    # stats; confirm if exact counts matter.
    stats["categories"][category]["total_claim"] += 1
    stats["categories"][category]["total_evidence"] += 1
    # Download claim image
    if claim_image_url:
        success = download_image(
            claim_image_url, os.path.join(images_folder, f"{file_id}_claim.jpg")
        )
        if success:
            stats["successful_claim"] += 1
            stats["categories"][category]["successful_claim"] += 1
    # Download evidence image
    if evidence_image_url:
        success = download_image(
            evidence_image_url, os.path.join(images_folder, f"{file_id}_evidence.jpg")
        )
        if success:
            stats["successful_evidence"] += 1
            stats["categories"][category]["successful_evidence"] += 1
def download_images(dataset, use_threading):
    """Download images for the specified dataset split ("train" or "test").

    Reads ``<EXTRACTION_DIR>/<dataset>.csv``, downloads every claim/evidence
    image into ``IMAGES_DIR/<dataset>``, and writes download statistics to
    ``<dataset>_image_download_stats.json``.

    Args:
        dataset: Split name, "train" or "test".
        use_threading: If True, fan downloads out over a 10-worker thread pool;
            otherwise download sequentially.
    """
    csv_path = os.path.join(EXTRACTION_DIR, f"{dataset}.csv")
    images_folder = os.path.join(IMAGES_DIR, dataset)
    stats_file_path = os.path.join(
        EXTRACTION_DIR, f"{dataset}_image_download_stats.json"
    )
    ensure_directories(images_folder)
    if not os.path.exists(csv_path):
        print(f"CSV file not found for {dataset}: {csv_path}")
        return
    stats = {
        "successful_claim": 0,
        "successful_evidence": 0,
        # defaultdict so per-category buckets appear on first touch;
        # json.dump serializes it like a plain dict.
        "categories": defaultdict(
            lambda: {
                "total_claim": 0,
                "successful_claim": 0,
                "total_evidence": 0,
                "successful_evidence": 0,
            }
        ),
    }
    # The extracted file is tab-separated with a header row we replace by the
    # canonical HEADERS column names.
    df = pd.read_csv(csv_path, names=HEADERS, header=None, sep="\t", skiprows=1)
    if use_threading:
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [
                executor.submit(process_image, row, images_folder, stats, dataset)
                for _, row in df.iterrows()
            ]
            for future in tqdm(
                as_completed(futures),
                total=len(futures),
                desc=f"Downloading {dataset} images",
            ):
                # Fix: the original discarded each future without inspecting it,
                # silently swallowing any exception raised inside a worker.
                # Surface (but do not re-raise) such failures so one crashed row
                # cannot hide bugs yet also cannot abort the whole run.
                try:
                    future.result()
                except Exception as e:
                    print(f"Image worker failed unexpectedly: {e}")
    else:
        for _, row in tqdm(
            df.iterrows(), total=len(df), desc=f"Downloading {dataset} images"
        ):
            process_image(row, images_folder, stats, dataset)
    with open(stats_file_path, "w") as stats_file:
        json.dump(stats, stats_file, indent=4)
    print(f"Image download stats saved to {stats_file_path}")
def build_arg_parser():
    """Build the CLI argument parser for the image-download script."""
    parser = argparse.ArgumentParser(description="Download images for Factify dataset.")
    parser.add_argument(
        "--dataset",
        choices=["train", "test"],
        help="Specify which dataset to download images for (train or test). If not specified, both will be downloaded.",
    )
    # Bug fix: the original declared --use-threading with action="store_true"
    # AND default=True, so the flag was a no-op and threading could never be
    # disabled. Keep --use-threading for backward compatibility and add
    # --no-threading to actually turn threading off.
    parser.add_argument(
        "--use-threading",
        dest="use_threading",
        action="store_true",
        default=True,
        help="Enable threading for image downloads (default: True).",
    )
    parser.add_argument(
        "--no-threading",
        dest="use_threading",
        action="store_false",
        help="Disable threading and download images sequentially.",
    )
    return parser


def main():
    """CLI entry point: download images for one split or for both."""
    args = build_arg_parser().parse_args()
    if args.dataset:
        # Run for the specified dataset
        download_images(args.dataset, args.use_threading)
    else:
        # Run for both train and test if no dataset is specified
        print("No dataset specified. Downloading images for both train and test...")
        for dataset in ["train", "test"]:
            download_images(dataset, args.use_threading)
if __name__ == "__main__":
main()