# Hugging Face Spaces app: "aisatsu" greeting bot — YOLOv8 person detection
# that speaks a Japanese greeting when a new person comes close to the camera.
from gtts import gTTS
from io import BytesIO
import base64
from base64 import b64encode

import cv2
import gradio as gr
import numpy as np
from PIL import Image
from scipy.spatial import distance as dist
from speech_recognition import AudioFile, Recognizer
from ultralyticsplus import YOLO

from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist

# Person detector; in the COCO label map used by YOLOv8, class id 0 is "person".
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names

# Greeting spoken by the bot (Japanese: "Hello").
# NOTE(review): name keeps the original "defaul" typo because infer() — and
# possibly external callers — reference it by this exact name.
defaul_bot_voice = "γγ―γγγγγγγΎγ"

# Minimum fraction of the frame a person's box must cover before greeting.
area_thres = 0.3
def infer(image, last_seen):
    """Detect the closest person in *image* and decide whether to greet them.

    Args:
        image: PIL.Image frame to run YOLO person detection on.
        last_seen: base64-encoded crop of the previously greeted person,
            or "" when nobody has been greeted yet.

    Returns:
        Tuple ``(out_img, voice_bot)``: the 128x128 crop of the closest
        detected person (or None if no person was found), and the TTS
        greeting produced by ``tts`` (or None when the person is too far
        away or appears to be the same one greeted last time).
    """
    results = model.predict(image, show=False)[0]
    boxes = results.boxes  # results.masks is not used by this app
    area_image = image.width * image.height

    voice_bot = None
    most_close = 0
    out_img = None
    # Histogram distance to the previously seen person. It defaults to the
    # threshold itself so that, with no history, the greeting check passes.
    same_person_thres = 0.5
    diff_value = same_person_thres

    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:  # keep only "person" detections
                continue
            box = xyxy.tolist()
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            # Track the detection covering the largest share of the frame.
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((128, 128))
                most_close = area_rate

    if last_seen != "":
        last_seen = base64_to_pil(last_seen)
        if out_img is not None:
            diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))

    print(most_close, diff_value)
    # Greet only when the person is close enough AND their color histogram
    # differs enough from the last greeted person (avoids repeat greetings).
    if most_close >= area_thres and diff_value >= same_person_thres:
        voice_bot = tts(defaul_bot_voice, language="ja")

    return out_img, voice_bot
# Assemble the Gradio interface and start serving it.
# NOTE(review): `shape=` on gr.Image and `enable_queue=` on launch() are
# removed in Gradio 4.x — confirm the pinned Gradio version before upgrading.
iface = gr.Interface(
    fn=infer,
    title="aisatsu api",
    inputs=[
        gr.Image(label="image", type="pil", shape=(960, 640)),
        gr.Textbox(label="last seen", value=""),
    ],
    outputs=[
        gr.Image(label="output image"),
        gr.Textbox(label="output voice"),
    ],
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
).launch(enable_queue=True, debug=True)