Spaces:
Sleeping
Sleeping
File size: 986 Bytes
72394c9 2e6655e 72394c9 2e6655e 90e019a 2e6655e 72394c9 2e6655e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# 2. dataset_utils.py
# Dataset loading and preprocessing
from datasets import load_dataset
from transformers import ViTImageProcessor
from PIL import Image
import os
class DatasetHandler:
    """Read species descriptions out of a Hugging Face dataset.

    The dataset is identified by name and is only loaded when
    ``load_descriptions`` is called.
    """

    def __init__(self, dataset_name="Gharaee/BIOSCAN-5M"):
        """Remember which dataset to load.

        Args:
            dataset_name: Identifier passed unchanged to ``load_dataset``.
        """
        self.dataset_name = dataset_name

    def load_descriptions(self, max_records=500):
        """Map species names to descriptions for the first rows of "train".

        Args:
            max_records: Upper bound on the number of rows to scan, counted
                from the start of the ``"train"`` split. Values larger than
                the split are clamped to its length. A non-positive value
                yields an empty mapping without loading the dataset.

        Returns:
            A dict keyed by each row's ``"species_name"`` with that row's
            ``"description"`` as value. Rows without those fields fall back
            to ``"Unknown Species"`` / ``"No description available."``.
            When several rows share a species name, the last one scanned
            wins.
        """
        if max_records <= 0:
            # Nothing would be read anyway, so skip loading the dataset.
            return {}

        train_split = load_dataset(self.dataset_name)["train"]

        # ``select`` rejects out-of-range indices, so never request more
        # rows than the split actually contains.
        row_count = min(max_records, len(train_split))

        descriptions = {}
        for record in train_split.select(range(row_count)):
            species_name = record.get("species_name", "Unknown Species")
            description = record.get("description", "No description available.")
            descriptions[species_name] = description
        return descriptions
# Processors already loaded in this process, keyed by checkpoint name, so that
# repeated calls reuse one instance instead of reloading it every time.
_PROCESSOR_CACHE = {}


def _get_processor(model_name):
    """Return the ViTImageProcessor for *model_name*, loading it on first use."""
    processor = _PROCESSOR_CACHE.get(model_name)
    if processor is None:
        processor = ViTImageProcessor.from_pretrained(model_name)
        _PROCESSOR_CACHE[model_name] = processor
    return processor


def preprocess_image(image_path, model_name="google/vit-base-patch16-224"):
    """Open an image file and run it through a ViT image processor.

    Args:
        image_path: Location of the image, in any form ``Image.open`` accepts.
        model_name: Checkpoint whose processor configuration is used.
            Defaults to the checkpoint this function has always used.

    Returns:
        The processor's output for the RGB-converted image, requested as
        PyTorch tensors (``return_tensors="pt"``).
    """
    # The context manager releases the file handle as soon as the converted
    # copy exists, instead of leaving cleanup to garbage collection.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    return _get_processor(model_name)(image, return_tensors="pt")