# Source: Hugging Face Space (status when captured: sleeping)
# 2. dataset_utils.py
# Dataset loading and image preprocessing
import os
from functools import lru_cache

from datasets import load_dataset
from PIL import Image
from transformers import ViTImageProcessor
class DatasetHandler:
    """Build a species-name -> description lookup from a Hugging Face dataset.

    The handler only stores the dataset identifier; the dataset itself is
    fetched lazily by :meth:`load_descriptions`.
    """

    def __init__(self, dataset_name="Gharaee/BIOSCAN-5M"):
        """Store the identifier later passed to ``load_dataset``.

        Args:
            dataset_name: Dataset identifier handed to ``load_dataset``.
        """
        self.dataset_name = dataset_name

    def load_descriptions(self, max_records=500):
        """Return a ``{species_name: description}`` dict from the train split.

        Args:
            max_records: Maximum number of leading records of the ``"train"``
                split to scan. ``None`` scans the whole split. A value larger
                than the split is clamped to the split length, and a value
                of zero or less returns an empty dict without loading the
                dataset at all.

        Returns:
            A dict keyed by each record's ``"species_name"`` (or
            ``"Unknown Species"`` when the key is absent) whose value is the
            record's ``"description"`` (or ``"No description available."``).
            When several records share a species name, the last one scanned
            wins.
        """
        # Nothing was requested, so skip the (potentially very large) load.
        if max_records is not None and max_records <= 0:
            return {}

        train_split = load_dataset(self.dataset_name)["train"]

        if max_records is not None:
            # select() raises IndexError for out-of-range indices, so never
            # ask for more rows than the split actually holds.
            limit = min(max_records, len(train_split))
            train_split = train_split.select(range(limit))

        # NOTE(review): assumes records expose "species_name" and
        # "description" keys; records lacking them all fall back to the same
        # defaults and therefore collapse into one entry -- confirm against
        # the dataset schema.
        descriptions = {}
        for record in train_split:
            species_name = record.get("species_name", "Unknown Species")
            description = record.get("description", "No description available.")
            descriptions[species_name] = description
        return descriptions
@lru_cache(maxsize=4)
def _load_image_processor(model_name):
    """Return the ViT image processor for *model_name*, loaded once and cached."""
    return ViTImageProcessor.from_pretrained(model_name)


def preprocess_image(image_path, model_name="google/vit-base-patch16-224"):
    """Open an image file and run it through a ViT image processor.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        model_name: Pretrained checkpoint whose preprocessing configuration
            is used. Defaults to the checkpoint this function has always
            used, so existing callers are unaffected.

    Returns:
        Whatever the processor returns when called with
        ``return_tensors="pt"`` on the RGB-converted image.

    Raises:
        Any exception raised by ``Image.open`` or by loading the processor
        (for example, when the file does not exist) propagates unchanged.
    """
    # Loading the processor touches disk/network; reuse it across calls
    # instead of re-creating it for every image.
    processor = _load_image_processor(model_name)

    # convert() yields an independent in-memory image, so the underlying
    # file handle can be released as soon as the conversion is done.
    with Image.open(image_path) as source:
        rgb_image = source.convert("RGB")

    return processor(rgb_image, return_tensors="pt")