# 2. dataset_utils.py
# Dataset loading and preprocessing
import os
from functools import lru_cache
from typing import Optional

from datasets import load_dataset
from PIL import Image
from transformers import ViTImageProcessor

# Checkpoint whose image-processor configuration is used by default.
DEFAULT_VIT_CHECKPOINT = "google/vit-base-patch16-224"


class DatasetHandler:
    """Load records from a Hugging Face dataset and collect their descriptions."""

    def __init__(self, dataset_name: str = "Gharaee/BIOSCAN-5M"):
        """Remember which dataset to load; nothing is loaded here.

        Args:
            dataset_name: Identifier passed to ``datasets.load_dataset``.
        """
        self.dataset_name = dataset_name

    def load_descriptions(self, max_records: Optional[int] = 500) -> dict:
        """Map species names to descriptions for the first rows of ``train``.

        Args:
            max_records: Upper bound on the number of leading rows to read.
                ``None`` reads every row. Values larger than the split are
                clamped to the split size instead of raising ``IndexError``.

        Returns:
            A dict keyed by ``species_name`` with ``description`` as the value.
            When several rows share a species name the last one read wins.
        """
        train_split = load_dataset(self.dataset_name)["train"]

        available = len(train_split)
        if max_records is None:
            limit = available
        else:
            # Clamp so that asking for more rows than exist (or a negative
            # count) yields what is available rather than an out-of-range error.
            limit = max(0, min(max_records, available))

        descriptions = {}
        for record in train_split.select(range(limit)):
            species_name = record.get("species_name")
            if species_name is None:
                # A present-but-null column would bypass dict.get's default.
                species_name = "Unknown Species"

            description = record.get("description")
            if description is None:
                description = "No description available."

            descriptions[species_name] = description
        return descriptions


@lru_cache(maxsize=4)
def _get_image_processor(model_name):
    """Return the ViT image processor for *model_name*, loading it only once."""
    return ViTImageProcessor.from_pretrained(model_name)


def preprocess_image(image_path, model_name: str = DEFAULT_VIT_CHECKPOINT):
    """Read an image file and run it through a ViT image processor.

    Args:
        image_path: Location of the image, in any form ``PIL.Image.open``
            accepts.
        model_name: Checkpoint whose processor configuration is applied.
            Defaults to ``google/vit-base-patch16-224``.

    Returns:
        The processor's output for the RGB-converted image, requested with
        ``return_tensors="pt"``.
    """
    processor = _get_image_processor(model_name)
    # The context manager closes the underlying file; convert() returns an
    # independent, fully loaded image that stays valid afterwards.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    return processor(image, return_tensors="pt")