Spaces:

Luigi
/

Video-Human-Fall-Detection-with-TimeSformer

Paused

App Files Files Community

Video-Human-Fall-Detection-with-TimeSformer / app.py

Luigi

initial commit

288784f 5 months ago

raw

history blame

3.84 kB

	import spaces # Import spaces immediately for HF ZeroGPU support.
	import os
	import cv2
	import torch
	import yt_dlp # (Retained in requirements for potential video fetching use)
	import numpy as np
	from PIL import Image
	import gradio as gr

	from transformers import AutoFeatureExtractor, AutoModelForVideoClassification

	# Specify the model checkpoint for TimeSformer.
	MODEL_NAME = "microsoft/timesformer-base-finetuned-k400"

	def extract_frames(video_path, num_frames=16, target_size=(224, 224)):
	"""
	Extract up to `num_frames` uniformly-sampled frames from the video.
	If the video has fewer frames, all frames are returned.
	"""
	cap = cv2.VideoCapture(video_path)
	total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
	frames = []
	if total_frames <= 0:
	cap.release()
	return frames
	indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
	current_frame = 0
	while True:
	ret, frame = cap.read()
	if not ret:
	break
	if current_frame in indices:
	frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	frame = cv2.resize(frame, target_size)
	frames.append(Image.fromarray(frame))
	current_frame += 1
	cap.release()
	return frames

	@spaces.GPU
	def classify_video(video_path):
	"""
	Loads the TimeSformer model and feature extractor inside the GPU context,
	extracts frames from the video, runs inference, and returns the top 5 predicted actions.
	"""
	# Load the feature extractor and model inside the GPU context.
	feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
	model = AutoModelForVideoClassification.from_pretrained(MODEL_NAME)
	model.eval()

	# Determine the device.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	model.to(device)

	# Extract frames from the video (here we sample 16 frames).
	frames = extract_frames(video_path, num_frames=16, target_size=(224, 224))
	if len(frames) == 0:
	return "No frames extracted from video."

	# Preprocess the frames.
	inputs = feature_extractor(frames, return_tensors="pt")
	inputs = {key: val.to(device) for key, val in inputs.items()}

	# Run inference.
	with torch.no_grad():
	outputs = model(**inputs)

	# Compute probabilities from logits.
	logits = outputs.logits # Shape: [batch_size, num_classes]; batch_size is 1 here.
	probs = torch.nn.functional.softmax(logits, dim=-1)[0]

	# Get the top 5 predictions.
	top_probs, top_indices = torch.topk(probs, k=5)
	top_probs = top_probs.cpu().numpy()
	top_indices = top_indices.cpu().numpy()

	# Retrieve the label mapping from the feature extractor's config.
	id2label = feature_extractor.feature_extractor_config.get("id2label", {})
	results = []
	for idx, prob in zip(top_indices, top_probs):
	label = id2label.get(str(idx), f"Class {idx}")
	results.append(f"{label}: {prob:.3f}")

	return "\n".join(results)

	def process_video(video_file):
	if video_file is None:
	return "No video provided."
	result = classify_video(video_file)
	return result

	# Gradio interface definition.
	demo = gr.Interface(
	fn=process_video,
	inputs=gr.Video(source="upload", label="Upload Video Clip"),
	outputs=gr.Textbox(label="Predicted Actions"),
	title="Video Human Detection Demo using TimeSformer",
	description=(
	"Upload a video clip to see the top predicted human action labels using the TimeSformer model "
	"(fine-tuned on Kinetics-400). This demo loads the model and feature extractor within the GPU context "
	"for optimized inference in Hugging Face ZeroGPU Spaces while also supporting CPU-only environments."
	)
	)

	if __name__ == "__main__":
	demo.launch()