# Gradio demo: YOLOv8 person detection from a webcam stream with a TTS greeting.
import time
from base64 import b64encode
from io import BytesIO

import gradio as gr
import numpy as np
import torch
from gtts import gTTS
from mtranslate import translate
from PIL import Image
from sahi.prediction import ObjectPrediction, PredictionScore
from sahi.utils.cv import (
    get_bool_mask_from_coco_segmentation,
    read_image_as_pil,
    visualize_object_predictions,
)
from speech_recognition import AudioFile, Recognizer
from ultralyticsplus import YOLO
# Load the YOLOv8-small checkpoint once at import time (shared by all requests).
model = YOLO('ultralyticsplus/yolov8s')
# Mapping of class id -> class name (COCO labels; id 0 is "person").
CLASS = model.model.names
def tts(text: str, language="ja") -> str:
    """Convert *text* to speech and wrap it in an autoplaying HTML ``<audio>`` tag.

    Args:
        text (str): Text to synthesize.
        language (str): gTTS language code (default ``"ja"``, Japanese).

    Returns:
        str: HTML snippet embedding the synthesized audio as a base64 data URI.
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    bytes_object = BytesIO()
    tts_object.write_to_fp(bytes_object)
    bytes_object.seek(0)
    b64 = b64encode(bytes_object.getvalue()).decode()
    # gTTS produces MP3 data, so the MIME type must be audio/mpeg
    # (the original declared audio/wav, which some browsers reject).
    html = f"""
    <audio controls autoplay>
    <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
    return html
def yolov8_inference(
    image,
    area_thres=0.2,
    defaul_bot_voice="おはいようございます"
):
    """Detect persons with YOLOv8 and greet via TTS when one is close enough.

    Args:
        image: Input image (any format accepted by ``model.predict`` /
            ``read_image_as_pil`` — filepath, numpy array or PIL image).
        area_thres (float): Minimum bounding-box-area / image-area ratio for
            a person detection to count as "close enough".
        defaul_bot_voice (str): Greeting spoken when at least one person
            qualifies. (Parameter name kept as-is for backward compatibility
            despite the typo; text looks like it intends "おはようございます"
            — TODO confirm with the author.)

    Returns:
        tuple: (annotated ``PIL.Image``, autoplay-audio HTML string or "").
    """
    time.sleep(2)  # throttle the live webcam stream
    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image
    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes  # masks were unpacked before but never used
    area_image = image.width * image.height
    object_predictions = []
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:  # keep only the "person" class (COCO id 0)
                continue
            box = xyxy.tolist()
            # Fraction of the frame the detection occupies — a proximity proxy.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= area_thres:
                object_predictions.append(
                    ObjectPrediction(
                        bbox=box,
                        category_name=CLASS[int(cls)],
                        category_id=int(cls),
                        score=area_rate,
                    )
                )
    # Synthesize the greeting once, after the loop — the original re-ran
    # tts() inside the loop for every qualifying detection, producing the
    # same final value while doing redundant network/TTS work.
    html_bot_voice = (
        tts(defaul_bot_voice, language="ja") if object_predictions else ""
    )
    result = visualize_object_predictions(
        image=np_image,
        object_prediction_list=object_predictions,
        rect_th=2,
        text_th=2,
    )
    return Image.fromarray(result["image"]), html_bot_voice
# yolov8_inference returns a PIL image, so the output component must use
# type="pil" — the original type="filepath" expected a path string and
# would fail to render the returned array/PIL object.
outputs = [gr.Image(type="pil", label="Output Image"),
           gr.HTML()]
title = "State-of-the-Art YOLO Models for Object detection"
demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,  # re-run inference on every webcam frame
)
demo_app.launch(debug=True)