import cv2
import gradio as gr
import numpy as np
from PIL import Image
from transformers import pipeline

# Load the YOLOS detection pipeline once at import time (the model download /
# weight load is slow, so it must not happen per frame).
model = pipeline("object-detection", model="hustvl/yolos-tiny")


def detect_objects(frame):
    """Run object detection on one webcam frame and draw the results.

    Args:
        frame: HxWx3 uint8 numpy array in RGB channel order — Gradio's
            Image/webcam components deliver RGB, not OpenCV-style BGR,
            so no channel swap is needed (the original code's BGR2RGB
            conversions actually corrupted the channel order).

    Returns:
        The same frame with bounding boxes and "label: score" text drawn
        on it, still in RGB order, or None if no frame was supplied.
    """
    # Streaming callbacks can fire before the webcam produces a frame.
    if frame is None:
        return None

    # The transformers pipeline expects a PIL image (a raw numpy array is
    # not accepted by all versions); convert once per frame.
    results = model(Image.fromarray(frame))

    for result in results:
        label = result["label"]
        score = result["score"]
        box = result["box"]
        x1, y1 = int(box["xmin"]), int(box["ymin"])
        x2, y2 = int(box["xmax"]), int(box["ymax"])

        # cv2 draws in place on the numpy array; (0, 255, 0) is green in
        # RGB order too, so no conversion is required for drawing.
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(
            frame,
            f"{label}: {score:.2f}",
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 255, 0),
            2,
        )

    return frame


def video_stream(frame):
    """Gradio streaming callback: annotate a single webcam frame."""
    return detect_objects(frame)


# Use a streaming webcam *Image* input: gr.Video streams a recorded file
# path, not per-frame arrays, so the original gr.Video(source=...) never
# reached the per-frame callback — and `source=`/`shape=` were removed in
# Gradio 4 (`sources=` list replaces `source=`).
webcam_interface = gr.Interface(
    fn=video_stream,
    inputs=gr.Image(sources=["webcam"], streaming=True),
    outputs=gr.Image(),
    live=True,
    description="Real-Time Object Detection with YOLO on Hugging Face",
)

# Launch the Gradio app only when run as a script.
if __name__ == "__main__":
    webcam_interface.launch()