# Bug-O-Scope / dataset_utils.py
# Web file-viewer residue, kept as comments so the module parses:
# uploaded by dalybuilds, commit 90e019a ("Update dataset_utils.py"), 986 bytes.
# 2. dataset_utils.py
# Dataset loading and preprocessing
from datasets import load_dataset
from transformers import ViTImageProcessor
from PIL import Image
import os
class DatasetHandler:
    """Load a dataset via ``datasets.load_dataset`` and collect descriptions."""

    def __init__(self, dataset_name: str = "Gharaee/BIOSCAN-5M") -> None:
        """Store the dataset identifier; loading is deferred to the methods.

        Args:
            dataset_name: Identifier passed verbatim to ``load_dataset``.
        """
        self.dataset_name = dataset_name

    def load_descriptions(self, max_records: int = 500) -> dict:
        """Build a ``{species_name: description}`` mapping from the train split.

        Only the leading ``max_records`` records of ``dataset["train"]`` are
        scanned. If the split holds fewer records than requested, all of
        them are scanned instead of raising.

        Args:
            max_records: Upper bound on the number of records to scan.

        Returns:
            A dict keyed by each record's ``"species_name"``. Records without
            that key are grouped under ``"Unknown Species"``; records without
            a ``"description"`` key get ``"No description available."``.
            When several scanned records share a name, the last one wins.
        """
        dataset = load_dataset(self.dataset_name)
        train_split = dataset["train"]

        # ``select`` rejects indices past the end of the split, so never ask
        # for more rows than the split actually contains.
        limit = min(max_records, len(train_split))

        return {
            record.get("species_name", "Unknown Species"): record.get(
                "description", "No description available."
            )
            for record in train_split.select(range(limit))
        }
def preprocess_image(image_path, processor=None):
    """Open an image file, convert it to RGB, and run an image processor on it.

    Args:
        image_path: Location of the image, in any form ``PIL.Image.open``
            accepts.
        processor: Optional pre-loaded processor, called as
            ``processor(image, return_tensors="pt")``. When omitted, a
            ``ViTImageProcessor`` for ``"google/vit-base-patch16-224"`` is
            loaded on every call, so callers handling many images should
            load one processor up front and pass it in.

    Returns:
        Whatever the processor returns for ``return_tensors="pt"``.
    """
    if processor is None:
        processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")

    # The context manager releases the underlying file handle; ``convert``
    # yields a separate in-memory image that stays usable after the close.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    return processor(image, return_tensors="pt")