# Gradio "aisatsu" (greeting) API: detects the closest person with YOLOv8
# and returns a Japanese greeting voice clip when a new person approaches.
from gtts import gTTS
from io import BytesIO
import base64
from PIL import Image
import cv2
import numpy as np
import gradio as gr
from ultralyticsplus import YOLO
from base64 import b64encode
from speech_recognition import AudioFile, Recognizer
import numpy as np
from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
from scipy.spatial import distance as dist
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names
defaul_bot_voice = "γγ―γγγγγγγΎγ"
area_thres = 0.3
def infer(image, last_seen):
    """Detect the closest person in *image* and decide whether to greet them.

    Runs YOLOv8 on the input, keeps the person (class 0) whose bounding box
    covers the largest fraction of the frame, and compares its color
    histogram against the previously seen person to avoid greeting the same
    person repeatedly.

    Args:
        image: PIL image from the camera feed.
        last_seen: base64-encoded PIL image of the last greeted person, or
            "" when nobody has been greeted yet.

    Returns:
        Tuple of (out_img, voice_bot):
        - out_img: 64x64 PIL crop of the closest person, or None if no
          person was detected.
        - voice_bot: base64 TTS audio of the greeting, or None when no
          greeting should be played.
    """
    results = model.predict(image, show=False)[0]
    boxes = results.boxes  # `results.masks` was previously unpacked but never used
    area_image = image.width * image.height
    voice_bot = None
    most_close = 0
    out_img = None
    # Histogram-distance threshold; also the default so that with no
    # `last_seen` the novelty check below passes and we greet.
    diff_thres = 0.5
    diff_value = diff_thres
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:  # class 0 = person; ignore every other class
                continue
            box = xyxy.tolist()
            # Fraction of the frame covered by this detection; a proxy for
            # how close the person is to the camera.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((64, 64))
                most_close = area_rate
    if last_seen != "":
        last_seen = base64_to_pil(last_seen)
        if out_img is not None:
            # Euclidean distance between color histograms: small distance
            # means we are likely still looking at the same person.
            diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))
    print(most_close, diff_value)
    # Greet only when the person is close enough AND looks new.
    if most_close >= area_thres and diff_value >= diff_thres:
        voice_bot = tts(defaul_bot_voice, language="ja")
    return out_img, voice_bot
# Wire the inference function into a Gradio web UI / API and launch it.
image_input = gr.Image(label="image", type="pil", shape=(320, 320))
last_seen_input = gr.Textbox(label="last seen", value="")
output_image = gr.Image(label="output image")
output_voice = gr.Textbox(label="output voice")
iface = gr.Interface(
    fn=infer,
    title="aisatsu api",
    inputs=[image_input, last_seen_input],
    outputs=[output_image, output_voice],
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
).launch(enable_queue=True, debug=True)