# Hugging Face Spaces app: "aisatsu" greeting bot — YOLOv8 person detection
# that speaks a Japanese greeting when a new person comes close to the camera.
from gtts import gTTS
from io import BytesIO
import base64
from base64 import b64encode

import cv2
import gradio as gr
import numpy as np
from PIL import Image
from scipy.spatial import distance as dist
from speech_recognition import AudioFile, Recognizer
from ultralyticsplus import YOLO

from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist

# Person detector; in the COCO label map used by YOLOv8, class id 0 is "person".
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names

# Greeting spoken by the bot (Japanese: "Hello").
# NOTE(review): name keeps the original "defaul" typo because infer() — and
# possibly external callers — reference it by this exact name.
defaul_bot_voice = "γγ―γγγγγγγΎγ"

# Minimum fraction of the frame a person's box must cover before greeting.
area_thres = 0.3
def infer(image, last_seen):
    """Detect the closest person in *image* and decide whether to greet them.

    Args:
        image: PIL.Image frame to run YOLO person detection on.
        last_seen: base64-encoded crop of the previously greeted person,
            or "" when nobody has been greeted yet.

    Returns:
        Tuple ``(out_img, voice_bot)``: the 128x128 crop of the closest
        detected person (or None if no person was found), and the TTS
        greeting produced by ``tts`` (or None when the person is too far
        away or appears to be the same one greeted last time).
    """
    results = model.predict(image, show=False)[0]
    boxes = results.boxes  # results.masks is not used by this app
    area_image = image.width * image.height

    voice_bot = None
    most_close = 0
    out_img = None
    # Histogram distance to the previously seen person. It defaults to the
    # threshold itself so that, with no history, the greeting check passes.
    same_person_thres = 0.5
    diff_value = same_person_thres

    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:  # keep only "person" detections
                continue
            box = xyxy.tolist()
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            # Track the detection covering the largest share of the frame.
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((128, 128))
                most_close = area_rate

    if last_seen != "":
        last_seen = base64_to_pil(last_seen)
        if out_img is not None:
            diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))

    print(most_close, diff_value)
    # Greet only when the person is close enough AND their color histogram
    # differs enough from the last greeted person (avoids repeat greetings).
    if most_close >= area_thres and diff_value >= same_person_thres:
        voice_bot = tts(defaul_bot_voice, language="ja")

    return out_img, voice_bot
# Assemble the Gradio interface and start serving it.
# NOTE(review): `shape=` on gr.Image and `enable_queue=` on launch() are
# removed in Gradio 4.x — confirm the pinned Gradio version before upgrading.
iface = gr.Interface(
    fn=infer,
    title="aisatsu api",
    inputs=[
        gr.Image(label="image", type="pil", shape=(960, 640)),
        gr.Textbox(label="last seen", value=""),
    ],
    outputs=[
        gr.Image(label="output image"),
        gr.Textbox(label="output voice"),
    ],
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
).launch(enable_queue=True, debug=True)