import cv2
import gradio as gr
import numpy as np
from PIL import Image
from transformers import pipeline

# Load an image-classification model from Hugging Face.
# NOTE: google/vit-base-patch16-224-in21k is a generic ViT checkpoint, not a
# gesture recognizer; swap in a checkpoint fine-tuned on hand gestures to get
# labels such as "Thumbs up" or "Heart".
gesture_pipeline = pipeline(
    "image-classification",
    model="google/vit-base-patch16-224-in21k",
)

# Process a single video frame and overlay the predicted gesture
def process_frame(frame):
    # Convert the BGR frame from OpenCV to an RGB PIL image for the pipeline
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    gesture = gesture_pipeline(Image.fromarray(rgb_frame))

    # Take the top prediction
    gesture_name = gesture[0]["label"]
    gesture_confidence = gesture[0]["score"]

    # React to specific gestures (these labels assume a gesture-trained model)
    if "Thumbs up" in gesture_name:
        print("Gesture recognized: Thumbs Up!")
    if "Heart" in gesture_name:
        print("Gesture recognized: Heart!")

    # Update the frame with the recognized gesture and its confidence
    cv2.putText(
        frame,
        f"Gesture: {gesture_name} ({gesture_confidence:.2f})",
        (10, 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 255, 0),
        2,
    )
    return frame

# Gradio interface function: Gradio passes the recorded video as a file path,
# so open it with OpenCV and process it frame by frame
def video_input(video):
    cap = cv2.VideoCapture(video)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        processed_frame = process_frame(frame)
        # Yield RGB frames; Gradio streams each one to the Image output
        yield cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB)
    cap.release()

# Set up the Gradio interface with the webcam
# (Gradio 4.x syntax; older versions use gr.Video(source="webcam"))
iface = gr.Interface(
    fn=video_input,
    inputs=gr.Video(sources=["webcam"]),
    outputs=gr.Image(label="Processed frame"),
    live=True,
)
iface.launch()