Spaces:

dioarafl
/

assisTen

Runtime error

File size: 4,370 Bytes

8a4b9ae
f2fc28a
 
6c8e0a0
bf3e8dc
8a4b9ae
 
 
bf3e8dc
8a4b9ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c8e0a0
 
bf3e8dc
 
6c8e0a0
 
f2fc28a
 
 
6c8e0a0
bf3e8dc
 
 
 
 
 
 
6c8e0a0
f2fc28a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c8e0a0
8a4b9ae
6c8e0a0
f2fc28a
 
 
 
 
 
 
 
 
 
 
6c8e0a0

import gradio as gr
import subprocess
import cv2
import torch
import torchaudio
from torchvision.models.detection import fasterrcnn_resnet50_fpn
import torchvision.transforms as transforms
from PIL import Image
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

class FasterRCNNDetector:
    """Object detector that draws labelled boxes onto an image.

    Wraps torchvision's ``fasterrcnn_resnet50_fpn`` (loaded with pretrained
    weights and switched to evaluation mode) plus the table of class names
    used to label each detection.
    """

    def __init__(self, score_threshold=0.5):
        """Load the pretrained model and the class-name table.

        Args:
            score_threshold: Minimum confidence score a detection must have
                to be drawn by ``detect_objects``. Pass ``0.0`` to draw every
                detection the model returns.
        """
        self.score_threshold = score_threshold
        self.model = fasterrcnn_resnet50_fpn(pretrained=True)
        self.model.eval()
        # Indexed by the integer label the model returns. The "N/A" entries
        # are placeholders that keep the remaining names at their positions
        # (presumably matching the COCO category ids - verify against the
        # torchvision weights metadata).
        self.classes = [
            "__background__", "person", "bicycle", "car", "motorcycle", "airplane", "bus",
            "train", "truck", "boat", "traffic light", "fire hydrant", "N/A", "stop sign",
            "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "N/A", "backpack", "umbrella", "N/A", "N/A",
            "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
            "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
            "bottle", "N/A", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
            "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza",
            "donut", "cake", "chair", "couch", "potted plant", "bed", "N/A", "dining table",
            "N/A", "N/A", "toilet", "N/A", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "N/A", "book",
            "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
        ]

    def detect_objects(self, image):
        """Draw a box and class name on ``image`` for each confident detection.

        Args:
            image: Array accepted by ``PIL.Image.fromarray`` and by OpenCV's
                drawing functions. It is annotated in place.

        Returns:
            The same ``image`` object, with a rectangle and a text label
            drawn for every detection whose score is at least
            ``self.score_threshold``.
        """
        image_tensor = transforms.ToTensor()(Image.fromarray(image)).unsqueeze(0)

        with torch.no_grad():
            # One image in the batch, so only the first result is relevant.
            prediction = self.model(image_tensor)[0]

        # Convert to plain Python numbers once instead of touching tensor
        # elements one at a time inside the loop.
        boxes = prediction['boxes'].tolist()
        labels = prediction['labels'].tolist()
        scores = prediction['scores'].tolist()

        for box, label, score in zip(boxes, labels, scores):
            # Previously the score was ignored and every box was drawn,
            # however unlikely the detection.
            if score < self.score_threshold:
                continue
            left, top, right, bottom = (int(coord) for coord in box)
            cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 2)
            cv2.putText(image, self.classes[label], (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36,255,12), 2)

        return image

class JarvisModels:
    """Speech-to-text (Wav2Vec2) and text-generation (hosted GPT-2) helpers."""

    # Sample rate declared to the processor. The
    # "facebook/wav2vec2-base-960h" model card asks for 16 kHz input.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self):
        """Load the Wav2Vec2 processor and CTC model."""
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        # Loaded on first use by generate_response and then reused.
        self._text_generator = None

    @staticmethod
    def _prepare_waveform(waveform, sample_rate, target_rate=16000):
        """Return ``waveform`` as a 1-D tensor sampled at ``target_rate``.

        Args:
            waveform: Audio tensor, either 1-D or 2-D with channels on the
                first axis (the layout ``torchaudio.load`` returns).
            sample_rate: Sample rate of ``waveform`` in Hz.
            target_rate: Desired sample rate in Hz.
        """
        if waveform.dim() > 1:
            # Average the channels so stereo recordings become mono.
            waveform = waveform.mean(dim=0)
        if sample_rate != target_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)
        return waveform

    async def generate_response(self, prompt):
        """Generate a text reply for ``prompt`` with the hosted GPT-2 model.

        The remote interface is loaded on the first call and cached, so
        later calls do not reload it.
        """
        # Logic for generating the response.
        if self._text_generator is None:
            self._text_generator = gr.Interface.load("models/openai-community/gpt2")
        # A loaded interface is invoked by calling it like a function.
        return self._text_generator(prompt)

    async def transcribe_audio(self, audio_file):
        """Transcribe the audio file at ``audio_file`` and return the text.

        The recording is down-mixed to mono and resampled to
        ``TARGET_SAMPLE_RATE`` before it is handed to the processor, and
        the rate is passed explicitly.
        """
        waveform, sample_rate = torchaudio.load(audio_file)
        samples = self._prepare_waveform(waveform, sample_rate, self.TARGET_SAMPLE_RATE)
        input_values = self.processor(
            samples.numpy(),
            sampling_rate=self.TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_values
        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = self.model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)
        return transcription[0]

def transcribe(audio):
    """Build a placeholder chat transcript for a recording and speak the reply.

    Args:
        audio: Path to the recorded audio file. The file is opened to check
            that it is readable, but its contents are not used yet.

    Returns:
        A string with a "User: ..." line and a "System: ..." line, each
        followed by a blank line.

    Raises:
        OSError: If ``audio`` cannot be opened.
    """
    import logging

    # Keep the fail-fast check that the recording exists, but close the
    # handle straight away; previously it was left open.
    with open(audio, "rb"):
        pass

    # Local audio transcription (add real transcription logic as needed).
    transcript = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

    # Response logic (add real response generation as needed).
    system_message = {"role": "system", "content": "Lorem ipsum dolor sit amet, consectetur adipiscing elit."}

    try:
        subprocess.call(["say", system_message['content']])
    except FileNotFoundError:
        # Speaking the reply is a side effect only; a host without a `say`
        # executable should still get the transcript back.
        logging.getLogger(__name__).warning(
            "'say' command not found; skipping spoken output"
        )

    chat_transcript = "User: " + transcript + "\n\n" + "System: " + system_message['content'] + "\n\n"

    return chat_transcript

detector = FasterRCNNDetector()
# One shared instance: building JarvisModels loads the Wav2Vec2 weights, so
# constructing it once per use would load them repeatedly.
jarvis = JarvisModels()


def run_pipeline(frame, audio):
    """Run detection, transcription and response generation for one request.

    Args:
        frame: Webcam image as an array, or None if no image was captured.
        audio: Path to the recorded audio file, or None if nothing was
            recorded.

    Returns:
        A 4-tuple matching the interface outputs: the annotated image, the
        speech transcription, the generated reply, and the chat transcript
        produced by ``transcribe``. The three text values are empty strings
        when ``audio`` is None.
    """
    import asyncio

    annotated = detector.detect_objects(frame) if frame is not None else None
    if audio is None:
        return annotated, "", "", ""

    async def speech_to_reply():
        heard = await jarvis.transcribe_audio(audio)
        # The transcription is used as the prompt for the reply.
        reply = await jarvis.generate_response(heard)
        return heard, reply

    # The JarvisModels methods are coroutines; drive them to completion here
    # so the interface receives plain values.
    # NOTE(review): assumes Gradio calls this function outside a running
    # event loop (e.g. in a worker thread) - confirm for the installed version.
    heard, reply = asyncio.run(speech_to_reply())
    return annotated, heard, reply, transcribe(audio)


iface = gr.Interface(
    # A single callable that takes both inputs and returns all four outputs;
    # the previous list of callables did not match the input signature.
    fn=run_pipeline,
    inputs=[
        # detect_objects works on an image array, so capture a webcam still
        # rather than a video.
        gr.inputs.Image(source="webcam", label="Webcam"),
        gr.inputs.Audio(source="microphone", type="filepath"),
    ],
    outputs=[
        gr.outputs.Image(),
        "text",
        "text",
        "text",
    ],
    title="Vision and Speech Interface",
    description="This interface detects objects in the webcam feed and transcribes speech recorded through the microphone."
)
iface.launch()