# aisatsu-api / utils.py
# NOTE: the hosting-page header ("vumichien's picture / Update utils.py /
# cd09ca8 / raw / history blame / 3.13 kB") was scrape residue, not code;
# it is preserved here as a comment so the file parses as Python.
from gtts import gTTS
from io import BytesIO
import base64
from PIL import Image
import cv2
import numpy as np
import subprocess
from speech_recognition import AudioFile, Recognizer
def tts(text: str, language: str = "ja", encode: bool = False) -> str:
    """Convert text to speech (MP3) using Google Text-to-Speech.

    Args:
        text (str): text to synthesize (e.g. the bot's generated answer)
        language (str): gTTS language code (defaults to Japanese)
        encode (bool): if True, return the MP3 bytes as a base64 string
            instead of writing a file

    Returns:
        str: base64-encoded MP3 data when ``encode`` is True, otherwise
        the path ``"temp.mp3"`` of the file the audio was saved to
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    if encode:
        # Render the MP3 into memory and base64-encode it for embedding.
        bytes_object = BytesIO()
        tts_object.write_to_fp(bytes_object)
        bytes_object.seek(0)
        b64 = base64.b64encode(bytes_object.getvalue()).decode()
        return b64
    else:
        # NOTE(review): fixed filename is clobbered by concurrent calls —
        # presumably acceptable for this single-user demo; confirm.
        tts_object.save("temp.mp3")
        return "temp.mp3"
def stt(audio: object, language='ja') -> str:
    """Transcribe a recording of user speech into text.

    Args:
        audio: the recorded user speech (path or file-like object)
        language (str): language code forwarded to the recognizer

    Returns:
        str: the transcript recognized from the recording
    """
    recognizer = Recognizer()
    with AudioFile(audio) as audio_source:
        # Pull the entire recording into memory, then hand it to
        # Google's speech-to-text API for transcription.
        captured_audio = recognizer.record(audio_source)
        transcript = recognizer.recognize_google(captured_audio, language=language)
    return transcript
def read_image_file(file) -> Image.Image:
    """Decode raw image bytes into a PIL image.

    Args:
        file: raw bytes of an image file (e.g. an uploaded payload)

    Returns:
        Image.Image: the decoded image
    """
    return Image.open(BytesIO(file))
def pil_to_base64(img, format="jpeg", encode=False):
    """Serialize a PIL image to base64 text or to a temporary file.

    Args:
        img: PIL image to serialize
        format (str): output image format (default "jpeg")
        encode (bool): if True return a base64 string, else save to disk

    Returns:
        str: base64-encoded image data, or the temp-file path written
    """
    if not encode:
        # File mode: write "temp.<format>" and hand back its path.
        temp_path = f"temp.{format}"
        img.save(temp_path)
        return temp_path
    buffer = BytesIO()
    img.save(buffer, format)
    buffer.seek(0)
    return base64.b64encode(buffer.getvalue()).decode("ascii")
def base64_to_pil(img_str):
    """Decode a base64 image string into a PIL image.

    Accepts either bare base64 data or a data-URI style string with a
    "...;base64," prefix, which is stripped before decoding.
    """
    if "base64," in img_str:
        # Keep only the payload after the data-URI header.
        img_str = img_str.split(",")[1]
    return Image.open(BytesIO(base64.b64decode(img_str)))
def get_hist(image):
    """Compute a flattened, normalized 3-D color histogram of an image.

    Args:
        image: image convertible via ``np.array`` (e.g. a PIL image)

    Returns:
        np.ndarray: flattened normalized histogram (8x8x8 bins -> 512 values)
    """
    channels = [0, 1, 2]
    bins_per_channel = [8, 8, 8]
    value_ranges = [0, 256, 0, 256, 0, 256]
    histogram = cv2.calcHist([np.array(image)], channels, None,
                             bins_per_channel, value_ranges)
    return cv2.normalize(histogram, histogram).flatten()
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """Decode an audio byte payload to mono float32 PCM via ffmpeg.

    Args:
        bpayload (bytes): raw contents of an audio file in any format
            ffmpeg understands (wav, mp3, ogg, ...)
        sampling_rate (int): target sample rate in Hz for the decoded audio

    Returns:
        np.ndarray: 1-D float32 array of audio samples

    Raises:
        ValueError: if the ffmpeg executable is not installed
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i", "pipe:0",              # read input from stdin
        "-ac", "1",                  # downmix to a single (mono) channel
        "-ar", f"{sampling_rate}",   # resample to the requested rate
        "-f", "f32le",               # raw little-endian float32 output
        "-hide_banner",
        "-loglevel", "quiet",
        "pipe:1",                    # write decoded audio to stdout
    ]
    try:
        # Context manager guarantees the pipes are closed and the child
        # process is reaped even if communicate() raises.
        with subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        ) as ffmpeg_process:
            out_bytes = ffmpeg_process.communicate(bpayload)[0]
    except FileNotFoundError as err:
        # Chain the original exception so the missing-binary cause is visible.
        raise ValueError(
            "ffmpeg was not found but is required to load audio files from filename"
        ) from err
    return np.frombuffer(out_bytes, np.float32)