---
license: mit
base_model:
- facebook/vjepa2-vitl-fpc32-256-diving48
pipeline_tag: video-classification
---

## Usage

### ONNXRuntime
First, define the `read_gif_frames` helper function (click to expand):

<details>
<summary><code>read_gif_frames</code></summary>

```py
import numpy as np
from PIL import Image, ImageSequence
import requests
from io import BytesIO
import os

def read_gif_frames(path_or_url, shortest_edge=None, center_crop=None):
    # Load GIF from URL or local path
    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
        response = requests.get(path_or_url)
        gif = Image.open(BytesIO(response.content))
    elif os.path.exists(path_or_url):
        gif = Image.open(path_or_url)
    else:
        raise ValueError("Invalid URL or file path")

    # Ensure it's a GIF
    if gif.format != "GIF":
        raise ValueError("Not a GIF file")

    # Extract frames and convert to RGB
    frames = []
    for frame in ImageSequence.Iterator(gif):
        rgb_frame = frame.convert("RGB")  # Force 3 channels

        # Resize so the shortest edge matches `shortest_edge`, if specified
        if shortest_edge is not None:
            w, h = rgb_frame.size
            if h < w:
                new_h = shortest_edge
                new_w = int(w * shortest_edge / h)
            else:
                new_w = shortest_edge
                new_h = int(h * shortest_edge / w)
            rgb_frame = rgb_frame.resize((new_w, new_h), Image.LANCZOS)

        # Center crop to a square of size `center_crop`, if specified
        if center_crop is not None:
            w, h = rgb_frame.size
            left = (w - center_crop) // 2
            top = (h - center_crop) // 2
            right = left + center_crop
            bottom = top + center_crop
            rgb_frame = rgb_frame.crop((left, top, right, bottom))

        frame_np = np.array(rgb_frame, dtype=np.uint8)
        frame_np = np.transpose(frame_np, (2, 0, 1))  # HWC -> CHW
        frames.append(frame_np)

    return np.stack(frames)  # Shape: [num_frames, 3, height, width]
```

</details>
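As a quick sanity check of the helper (a minimal sketch using the same example clip and preprocessing parameters as the inference snippet below; the exact frame count depends on the GIF):

```py
# Load the example clip and confirm the array layout the model expects
video = read_gif_frames(
    "http://www.svcl.ucsd.edu/projects/resound/imgs/19.gif",
    shortest_edge=292,
    center_crop=256,
)
print(video.shape, video.dtype)  # e.g. (num_frames, 3, 256, 256) uint8
```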
You can then run the model as follows:

```py
import numpy as np
import onnxruntime as ort
from huggingface_hub import hf_hub_download
from transformers import AutoConfig

model_id = "onnx-community/vjepa2-vitl-fpc32-256-diving48-ONNX"
config = AutoConfig.from_pretrained(model_id)
path = hf_hub_download(
    repo_id=model_id,
    filename="onnx/model.onnx",
)
ort_session = ort.InferenceSession(path)

# Load and preprocess video frames
video = read_gif_frames(
    "http://www.svcl.ucsd.edu/projects/resound/imgs/19.gif",
    shortest_edge=292,
    center_crop=256,
)

# Normalize with ImageNet mean/std and add a batch dimension
mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
inputs = {
    "pixel_values_videos": ((video / 255 - mean) / std)[np.newaxis, ...].astype(np.float32)
}

# Run the model
logits = ort_session.run(
    None,
    input_feed=inputs,
)[0]

top_k = 5
indices = np.argsort(logits[0])[-top_k:][::-1]

# Calculate softmax probabilities
exp_logits = np.exp(logits[0] - np.max(logits[0]))
softmax_probs = exp_logits / np.sum(exp_logits)

print(f"Top {top_k} predicted class names:")
for idx in indices:
    text_label = config.id2label[idx]
    print(f"  - {text_label}: {softmax_probs[idx]:.2f}")
```

Example output:
```
Top 5 predicted class names:
  - ['Forward', '15som', 'NoTwis', 'PIKE']: 0.69
  - ['Reverse', 'Dive', 'NoTwis', 'PIKE']: 0.22
  - ['Inward', '15som', 'NoTwis', 'PIKE']: 0.06
  - ['Reverse', '15som', '05Twis', 'FREE']: 0.01
  - ['Forward', '25som', 'NoTwis', 'PIKE']: 0.00
```
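If you want to feed a fixed number of frames per clip (the base model name suggests 32 frames per clip, though whether the ONNX export requires exactly that is an assumption, not something documented here), one way is to uniformly subsample before normalization. The `sample_frames` helper below is hypothetical and not part of this repo:

```py
import numpy as np

def sample_frames(frames, num_frames=32):
    # Uniformly pick `num_frames` indices from a [T, 3, H, W] array;
    # indices repeat if the clip has fewer than `num_frames` frames.
    indices = np.linspace(0, len(frames) - 1, num_frames).astype(int)
    return frames[indices]

# Hypothetical usage, applied to `video` before the mean/std normalization step:
# video = sample_frames(video, num_frames=32)
```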