import spaces

import os

import cv2
import torch
import yt_dlp
import numpy as np
from PIL import Image
import gradio as gr

from transformers import AutoFeatureExtractor, AutoModelForVideoClassification

MODEL_NAME = "microsoft/timesformer-base-finetuned-k400" |


def extract_frames(video_path, num_frames=16, target_size=(224, 224)):
    """
    Extract up to `num_frames` uniformly sampled frames from the video.
    If the video has fewer frames than requested, all frames are returned.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    if total_frames <= 0:
        cap.release()
        return frames

    # Indices of the frames to keep, spread uniformly across the clip.
    indices = set(np.linspace(0, total_frames - 1, num_frames, dtype=int).tolist())

    current_frame = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if current_frame in indices:
            # OpenCV decodes frames as BGR; convert to RGB before wrapping in PIL.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, target_size)
            frames.append(Image.fromarray(frame))
        current_frame += 1

    cap.release()
    return frames


@spaces.GPU  # allocates a GPU for the duration of this call on ZeroGPU Spaces
def classify_video(video_path):
    """
    Loads the TimeSformer model and feature extractor inside the GPU context,
    extracts frames from the video, runs inference, and returns the top 5
    predicted actions.
    """
    feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
    model = AutoModelForVideoClassification.from_pretrained(MODEL_NAME)
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # The Kinetics-400 TimeSformer checkpoint was trained on 8-frame clips at 224x224.
    frames = extract_frames(video_path, num_frames=8, target_size=(224, 224))
    if len(frames) == 0:
        return "No frames extracted from video."

    inputs = feature_extractor(frames, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]

    top_probs, top_indices = torch.topk(probs, k=5)
    top_probs = top_probs.cpu().numpy()
    top_indices = top_indices.cpu().numpy()

    # The human-readable class names live on the model config, keyed by integer index.
    id2label = model.config.id2label
    results = []
    for idx, prob in zip(top_indices, top_probs):
        label = id2label.get(int(idx), f"Class {idx}")
        results.append(f"{label}: {prob:.3f}")

    return "\n".join(results)


def process_video(video_file):
    if video_file is None:
        return "No video provided."
    return classify_video(video_file)


demo = gr.Interface(
    fn=process_video,
    # `sources` is the Gradio 4.x argument name (it replaced `source` from 3.x).
    inputs=gr.Video(sources=["upload"], label="Upload Video Clip"),
    outputs=gr.Textbox(label="Predicted Actions"),
    title="Video Human Action Recognition Demo using TimeSformer",
    description=(
        "Upload a video clip to see the top predicted human action labels from the TimeSformer model "
        "(fine-tuned on Kinetics-400). The model and feature extractor are loaded inside the GPU context "
        "for Hugging Face ZeroGPU Spaces, and the demo also runs in CPU-only environments."
    ),
)


if __name__ == "__main__":
    demo.launch()