# 2. dataset_utils.py
# Dataset loading and preprocessing
import functools
import os

from datasets import load_dataset
from PIL import Image
from transformers import ViTImageProcessor

class DatasetHandler:
    """Load a species-name -> description mapping from a Hugging Face dataset."""

    def __init__(self, dataset_name="Gharaee/BIOSCAN-5M"):
        """Store the dataset identifier; this constructor only records the name."""
        self.dataset_name = dataset_name

    def load_descriptions(self, max_records=500):
        """Return ``{species_name: description}`` for the first records of "train".

        Args:
            max_records: Upper bound on how many leading records of the
                "train" split are read. ``None`` reads the whole split. Values
                larger than the split are capped at its length.

        Returns:
            A dict mapping species name to description. When several records
            share a species name, the last one read wins.

        NOTE(review): this reads the ``species_name`` and ``description``
        columns -- confirm the dataset actually exposes them; otherwise every
        record collapses onto the placeholder entry.
        """
        train_split = load_dataset(self.dataset_name)["train"]
        total = len(train_split)
        # ``select`` rejects indices past the end of the split, so cap the
        # requested count instead of letting a small dataset raise.
        limit = total if max_records is None else min(max_records, total)
        return self._build_description_map(train_split.select(range(limit)))

    @staticmethod
    def _build_description_map(records):
        """Fold an iterable of record mappings into ``{species: description}``."""
        descriptions = {}
        for record in records:
            species_name = record.get("species_name")
            description = record.get("description")
            # A column that exists but holds a null value comes back as None,
            # which ``dict.get``'s default does not cover; treat it as missing.
            if species_name is None:
                species_name = "Unknown Species"
            if description is None:
                description = "No description available."
            descriptions[species_name] = description
        return descriptions

@functools.lru_cache(maxsize=4)
def _load_image_processor(model_name):
    """Build the ViT image processor for *model_name* once and reuse it."""
    return ViTImageProcessor.from_pretrained(model_name)


def preprocess_image(image_path, model_name="google/vit-base-patch16-224"):
    """Open an image, convert it to RGB and run it through a ViT image processor.

    Args:
        image_path: Location of the image, passed straight to
            ``PIL.Image.open``.
        model_name: Checkpoint whose preprocessing configuration is used.
            Defaults to the checkpoint this function has always used.

    Returns:
        Whatever the processor returns for ``return_tensors="pt"``.
    """
    # Loading the processor is independent of the image, so it is cached
    # per checkpoint rather than rebuilt on every call.
    processor = _load_image_processor(model_name)
    # The context manager closes the underlying file promptly; ``convert``
    # produces a separate in-memory image that remains usable afterwards.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    return processor(image, return_tensors="pt")