Spaces:

Luigi
/

Video-Human-Fall-Detection-with-TimeSformer

Paused

Video-Human-Fall-Detection-with-TimeSformer

File size: 3,838 Bytes

288784f

import spaces  # Import spaces immediately for HF ZeroGPU support.
import os
import cv2
import torch
import yt_dlp  # (Retained in requirements for potential video fetching use)
import numpy as np
from PIL import Image
import gradio as gr

from transformers import AutoFeatureExtractor, AutoModelForVideoClassification

# Specify the model checkpoint for TimeSformer.
MODEL_NAME = "microsoft/timesformer-base-finetuned-k400"

def extract_frames(video_path, num_frames=16, target_size=(224, 224)):
    """
    Extract up to `num_frames` uniformly-sampled frames from the video.
    If the video has fewer frames, all frames are returned.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    if total_frames <= 0:
        cap.release()
        return frames
    indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    current_frame = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if current_frame in indices:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, target_size)
            frames.append(Image.fromarray(frame))
        current_frame += 1
    cap.release()
    return frames

@spaces.GPU
def classify_video(video_path):
    """
    Loads the TimeSformer model and feature extractor inside the GPU context,
    extracts frames from the video, runs inference, and returns the top 5 predicted actions.
    """
    # Load the feature extractor and model inside the GPU context.
    feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
    model = AutoModelForVideoClassification.from_pretrained(MODEL_NAME)
    model.eval()

    # Determine the device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Extract frames from the video (here we sample 16 frames).
    frames = extract_frames(video_path, num_frames=16, target_size=(224, 224))
    if len(frames) == 0:
        return "No frames extracted from video."

    # Preprocess the frames.
    inputs = feature_extractor(frames, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    # Run inference.
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Compute probabilities from logits.
    logits = outputs.logits  # Shape: [batch_size, num_classes]; batch_size is 1 here.
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    
    # Get the top 5 predictions.
    top_probs, top_indices = torch.topk(probs, k=5)
    top_probs = top_probs.cpu().numpy()
    top_indices = top_indices.cpu().numpy()
    
    # Retrieve the label mapping from the feature extractor's config.
    id2label = feature_extractor.feature_extractor_config.get("id2label", {})
    results = []
    for idx, prob in zip(top_indices, top_probs):
        label = id2label.get(str(idx), f"Class {idx}")
        results.append(f"{label}: {prob:.3f}")
    
    return "\n".join(results)

def process_video(video_file):
    if video_file is None:
        return "No video provided."
    result = classify_video(video_file)
    return result

# Gradio interface definition.
demo = gr.Interface(
    fn=process_video,
    inputs=gr.Video(source="upload", label="Upload Video Clip"),
    outputs=gr.Textbox(label="Predicted Actions"),
    title="Video Human Detection Demo using TimeSformer",
    description=(
        "Upload a video clip to see the top predicted human action labels using the TimeSformer model "
        "(fine-tuned on Kinetics-400). This demo loads the model and feature extractor within the GPU context "
        "for optimized inference in Hugging Face ZeroGPU Spaces while also supporting CPU-only environments."
    )
)

if __name__ == "__main__":
    demo.launch()