# misinfo/src/data_loader/download_images.py
# (header residue from the hosting page: "gyigit's picture / update / 54e8a79")
import os
import argparse
import pandas as pd
import requests
import json
import io
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
from PIL import Image
from src.utils.data_utils import HEADERS
from src.utils.path_utils import get_project_root
# Constants
# Repository root, resolved via the shared path helper (works regardless of CWD).
PROJECT_ROOT = get_project_root()
# Directory holding the extracted Factify CSVs (train.csv / test.csv and stats JSONs).
EXTRACTION_DIR = str(PROJECT_ROOT / "data/raw/factify/extracted")
# Downloaded images are stored per-split under this folder (images/train, images/test).
IMAGES_DIR = os.path.join(EXTRACTION_DIR, "images")
def ensure_directories(images_folder):
"""Ensure the image directory exists."""
os.makedirs(images_folder, exist_ok=True)
def download_image(url, save_path):
    """Download a single image to *save_path*, skipping files that already exist.

    The response body is decoded with Pillow, converted to RGB and re-saved,
    so corrupt or non-image responses are rejected here rather than written
    to disk as broken files.

    Args:
        url: HTTP(S) URL of the image to fetch.
        save_path: Destination file path (callers pass ``*.jpg`` names).

    Returns:
        bool: True if the file is present (already existed or was saved),
        False if the download or decode failed.
    """
    # Check if the image already exists so re-runs are resumable.
    if os.path.exists(save_path):
        print(f"Image already exists: {save_path}")
        return True
    # Some hosts reject the default requests User-Agent; impersonate a browser.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
        )
    }
    try:
        # Fix: the original passed stream=True and then read ``response.content``
        # anyway, which defeats streaming; a plain GET reads the whole body and
        # releases the connection deterministically.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an error for HTTP issues
        img = Image.open(io.BytesIO(response.content))
        img = img.convert("RGB")  # Ensure the image is in RGB format (JPEG has no alpha)
        img.save(save_path)
        print(f"Downloaded and saved image: {save_path}")
        return True
    except Exception as e:
        # Deliberately broad: one bad URL must not abort the whole crawl.
        print(f"Failed to download image from {url}: {e}")
        return False
def process_image(row, images_folder, stats, dataset_name):
    """Download the claim and evidence images for one dataset row.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with "id", "category",
            "claim_image" and "evidence_image" fields.
        images_folder: Directory receiving "<id>_claim.jpg" / "<id>_evidence.jpg".
        stats: Shared, mutated-in-place stats dict with "successful_claim",
            "successful_evidence" and per-category counters under "categories".
        dataset_name: Currently unused; kept for interface stability.
    """
    file_id = str(row["id"])
    category = row.get("category", "Unknown")
    claim_image_url = row.get("claim_image", "")
    evidence_image_url = row.get("evidence_image", "")
    # Fix: pandas represents missing cells as NaN, a *truthy* float, so the
    # original truthiness checks below would try to download "nan". Treat
    # anything that is not a non-empty string as "no URL".
    if not isinstance(claim_image_url, str):
        claim_image_url = ""
    if not isinstance(evidence_image_url, str):
        evidence_image_url = ""
    # Ensure category stats exist (stats["categories"] may be a plain dict).
    stats["categories"].setdefault(
        category,
        {
            "total_claim": 0,
            "successful_claim": 0,
            "total_evidence": 0,
            "successful_evidence": 0,
        },
    )
    # Totals count every row (matching the original semantics), not just rows
    # that actually carry a URL.
    # NOTE(review): these read-modify-write increments are shared across the
    # thread pool without a lock — presumably accepted as "close enough" for
    # stats; confirm if exact counts matter.
    stats["categories"][category]["total_claim"] += 1
    stats["categories"][category]["total_evidence"] += 1
    # Download claim image
    if claim_image_url:
        success = download_image(
            claim_image_url, os.path.join(images_folder, f"{file_id}_claim.jpg")
        )
        if success:
            stats["successful_claim"] += 1
            stats["categories"][category]["successful_claim"] += 1
    # Download evidence image
    if evidence_image_url:
        success = download_image(
            evidence_image_url, os.path.join(images_folder, f"{file_id}_evidence.jpg")
        )
        if success:
            stats["successful_evidence"] += 1
            stats["categories"][category]["successful_evidence"] += 1
def download_images(dataset, use_threading):
    """Download images for the specified dataset split ("train" or "test").

    Reads ``<EXTRACTION_DIR>/<dataset>.csv``, downloads every claim/evidence
    image into ``IMAGES_DIR/<dataset>``, and writes download statistics to
    ``<dataset>_image_download_stats.json``.

    Args:
        dataset: Split name, "train" or "test".
        use_threading: If True, fan downloads out over a 10-worker thread pool;
            otherwise download sequentially.
    """
    csv_path = os.path.join(EXTRACTION_DIR, f"{dataset}.csv")
    images_folder = os.path.join(IMAGES_DIR, dataset)
    stats_file_path = os.path.join(
        EXTRACTION_DIR, f"{dataset}_image_download_stats.json"
    )
    ensure_directories(images_folder)
    if not os.path.exists(csv_path):
        print(f"CSV file not found for {dataset}: {csv_path}")
        return
    stats = {
        "successful_claim": 0,
        "successful_evidence": 0,
        # defaultdict so per-category buckets appear on first touch;
        # json.dump serializes it like a plain dict.
        "categories": defaultdict(
            lambda: {
                "total_claim": 0,
                "successful_claim": 0,
                "total_evidence": 0,
                "successful_evidence": 0,
            }
        ),
    }
    # The extracted file is tab-separated with a header row we replace by the
    # canonical HEADERS column names.
    df = pd.read_csv(csv_path, names=HEADERS, header=None, sep="\t", skiprows=1)
    if use_threading:
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [
                executor.submit(process_image, row, images_folder, stats, dataset)
                for _, row in df.iterrows()
            ]
            for future in tqdm(
                as_completed(futures),
                total=len(futures),
                desc=f"Downloading {dataset} images",
            ):
                # Fix: the original discarded each future without inspecting it,
                # silently swallowing any exception raised inside a worker.
                # Surface (but do not re-raise) such failures so one crashed row
                # cannot hide bugs yet also cannot abort the whole run.
                try:
                    future.result()
                except Exception as e:
                    print(f"Image worker failed unexpectedly: {e}")
    else:
        for _, row in tqdm(
            df.iterrows(), total=len(df), desc=f"Downloading {dataset} images"
        ):
            process_image(row, images_folder, stats, dataset)
    with open(stats_file_path, "w") as stats_file:
        json.dump(stats, stats_file, indent=4)
    print(f"Image download stats saved to {stats_file_path}")
def build_arg_parser():
    """Build the CLI argument parser for the image-download script."""
    parser = argparse.ArgumentParser(description="Download images for Factify dataset.")
    parser.add_argument(
        "--dataset",
        choices=["train", "test"],
        help="Specify which dataset to download images for (train or test). If not specified, both will be downloaded.",
    )
    # Bug fix: the original declared --use-threading with action="store_true"
    # AND default=True, so the flag was a no-op and threading could never be
    # disabled. Keep --use-threading for backward compatibility and add
    # --no-threading to actually turn threading off.
    parser.add_argument(
        "--use-threading",
        dest="use_threading",
        action="store_true",
        default=True,
        help="Enable threading for image downloads (default: True).",
    )
    parser.add_argument(
        "--no-threading",
        dest="use_threading",
        action="store_false",
        help="Disable threading and download images sequentially.",
    )
    return parser


def main():
    """CLI entry point: download images for one split or for both."""
    args = build_arg_parser().parse_args()
    if args.dataset:
        # Run for the specified dataset
        download_images(args.dataset, args.use_threading)
    else:
        # Run for both train and test if no dataset is specified
        print("No dataset specified. Downloading images for both train and test...")
        for dataset in ["train", "test"]:
            download_images(dataset, args.use_threading)
if __name__ == "__main__":
main()