import cv2
import gradio as gr
import numpy as np
from PIL import Image
from transformers import pipeline

# Load an image-classification model from Hugging Face.
# NOTE: google/vit-base-patch16-224-in21k is a generic ViT checkpoint, not a
# gesture recognizer; swap in a checkpoint fine-tuned on hand gestures to get
# labels such as "Thumbs up" or "Heart".
gesture_pipeline = pipeline(
    "image-classification",
    model="google/vit-base-patch16-224-in21k",
)

# Process a single video frame and overlay the predicted gesture
def process_frame(frame):
    # Convert the BGR frame from OpenCV to an RGB PIL image for the pipeline
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    gesture = gesture_pipeline(Image.fromarray(rgb_frame))

    # Take the top prediction
    gesture_name = gesture[0]["label"]
    gesture_confidence = gesture[0]["score"]

    # React to specific gestures (these labels assume a gesture-trained model)
    if "Thumbs up" in gesture_name:
        print("Gesture recognized: Thumbs Up!")
    if "Heart" in gesture_name:
        print("Gesture recognized: Heart!")

    # Update the frame with the recognized gesture and its confidence
    cv2.putText(
        frame,
        f"Gesture: {gesture_name} ({gesture_confidence:.2f})",
        (10, 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 255, 0),
        2,
    )
    return frame

# Gradio interface function: Gradio passes the recorded video as a file path,
# so open it with OpenCV and process it frame by frame
def video_input(video):
    cap = cv2.VideoCapture(video)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        processed_frame = process_frame(frame)
        # Yield RGB frames; Gradio streams each one to the Image output
        yield cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB)
    cap.release()

# Set up the Gradio interface with the webcam
# (Gradio 4.x syntax; older versions use gr.Video(source="webcam"))
iface = gr.Interface(
    fn=video_input,
    inputs=gr.Video(sources=["webcam"]),
    outputs=gr.Image(label="Processed frame"),
    live=True,
)
iface.launch()