File size: 2,499 Bytes
f840717
06bad2e
 
 
 
b7f8699
06bad2e
ca0120c
 
06bad2e
cf598a0
06bad2e
cf598a0
 
 
 
 
06bad2e
08ce8d2
 
6a076a8
b7f8699
5138fea
 
 
 
 
 
ca0120c
5138fea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca0120c
5138fea
 
 
 
ca0120c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import zipfile
from base64 import b64encode
from io import BytesIO
from typing import Optional, Union

import numpy as np
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from sahi.utils.cv import read_image_as_pil
from scipy.spatial import distance as dist
from speech_recognition import AudioFile, Recognizer
from ultralytics import YOLO

from utils import tts, read_image_file, pil_to_base64, get_hist

# Download the YOLOv8s checkpoint from the Hugging Face Hub (cached locally)
# and load it as the detection model used by the /aisatsu_api/ endpoint.
model_path = hf_hub_download(repo_id="ultralyticsplus/yolov8s", filename='yolov8s.pt')
model = YOLO(model_path)
# Mapping of class index -> class name exposed by the loaded model.
CLASS = model.model.names
# Greeting spoken by the bot ("Good morning" in Japanese).
# NOTE(review): name has a typo ("defaul") — kept as-is for compatibility with callers.
defaul_bot_voice = "γŠγ―γ„γ‚ˆγ†γ”γ–γ„γΎγ™"
# Minimum (person box area / frame area) ratio for a detection to trigger a greeting.
area_thres = 0.3

app = FastAPI()

@app.get("/")
def read_root():
    """Health-check endpoint: confirms the application started successfully."""
    payload = {"Message": "Application startup complete"}
    return payload


@app.post("/aisatsu_api/")
async def predict_api(
        file: UploadFile = File(...),
        last_seen: Optional[UploadFile] = File(None)
):
    """Detect the closest person in an uploaded frame and decide whether to greet.

    Runs YOLO on ``file`` and keeps the person detection (class 0) whose
    bounding box covers the largest fraction of the frame. When ``last_seen``
    is supplied, the colour histograms of the current crop and the previous
    image are compared so the same person is not greeted twice in a row.

    Returns:
        A zip archive (greeting audio + 64x64 person crop) streamed to the
        client when a person covers at least ``area_thres`` of the frame and
        the histogram distance to ``last_seen`` is >= 0.5; otherwise a JSON
        ``{"message": "No face detected"}`` payload.
    """
    image = read_image_file(await file.read())
    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    # Fix: the original also unpacked results.masks into an unused variable.
    boxes = results.boxes
    area_image = image.width * image.height

    most_close = 0.0   # largest (person box / frame) area ratio seen so far
    out_img = None     # 64x64 crop of the closest person, if any
    diff_value = 0.5   # histogram distance to last_seen; default passes the gate
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:  # keep only class 0 — presumably "person"; confirm against CLASS
                continue
            box = xyxy.tolist()
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((64, 64))
                most_close = area_rate

    # Only bother reading/decoding last_seen when there is a crop to compare it to
    # (the original decoded it unconditionally and then discarded it).
    if last_seen is not None and out_img is not None:
        last_img = read_image_file(await last_seen.read())
        diff_value = dist.euclidean(get_hist(out_img), get_hist(last_img))

    if most_close >= area_thres and diff_value >= 0.5:
        voice_bot_path = tts(defaul_bot_voice, language="ja")
        image_bot_path = pil_to_base64(out_img)
        # NOTE(review): pil_to_base64 is assumed to return a file path, since its
        # result is passed to ZipFile.write — confirm against utils.
        buffer = BytesIO()
        zip_filename = "final_archive.zip"
        # The with-statement closes the archive on exit; the original's explicit
        # zf.close() inside the block was redundant and has been removed.
        with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
            for file_path in [voice_bot_path, image_bot_path]:
                zf.write(file_path)
        return StreamingResponse(
            iter([buffer.getvalue()]),
            media_type="application/x-zip-compressed",
            headers={"Content-Disposition": f"attachment;filename={zip_filename}"}
        )
    return {"message": "No face detected"}