# NOTE: the following metadata was scrape residue from the Hugging Face web UI
# (author terry-li-hm, commit "Update" 96bcc68, 2.7 kB) and is kept here as a
# comment so the module remains valid Python.
# coding=utf-8
import base64
import io
import os
import re
import tempfile
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio
from funasr import AutoModel
from sv import clean_and_emoji_annotate_speech, process_audio
@spaces.GPU
def model_inference(input_wav, language, fs=16000):
    """Transcribe an audio input with sv.process_audio and return its result.

    Parameters
    ----------
    input_wav : tuple[int, np.ndarray] or np.ndarray
        Gradio audio payload: either a ``(sample_rate, int16 samples)`` tuple
        from the microphone/upload widget, or a bare float waveform assumed
        to already be at 16 kHz.
    language : str or None
        Language code; any empty or unrecognized value falls back to "auto".
    fs : int, optional
        Sample rate of ``input_wav`` when it is a bare array (default 16000).

    Returns
    -------
    The value returned by ``process_audio`` (transcription result).
    """
    valid_languages = {"auto", "zh", "en", "yue", "ja", "ko", "nospeech"}
    # Fall back to auto-detection for empty/None/unknown selections instead of
    # raising KeyError on an unexpected dropdown value.
    selected_language = language if language in valid_languages else "auto"

    if isinstance(input_wav, tuple):
        # Gradio delivers (rate, int16 PCM samples); normalize to float32 [-1, 1].
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if len(input_wav.shape) > 1:
            # Down-mix multi-channel audio to mono.
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            # process_audio expects 16 kHz audio; resample via torchaudio.
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()

    # process_audio takes a file path, so persist the waveform to a temp file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, input_wav, 16000)
        temp_audio_path = temp_audio.name
    try:
        result = process_audio(temp_audio_path, language=selected_language)
    finally:
        # Always clean up the temporary file, even if processing fails.
        os.remove(temp_audio_path)
    return result
# (audio file path, language code) pairs shown as clickable examples in the UI.
audio_examples = [
    ["example/mtr.mp3", "auto"],
]
def launch():
    """Assemble the Gradio Blocks interface and start serving it."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        with gr.Row():
            with gr.Column():
                audio_in = gr.Audio(label="Upload audio or use the microphone")
                with gr.Accordion("Configuration"):
                    lang_dropdown = gr.Dropdown(
                        choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                        value="auto",
                        label="Language",
                    )
                start_btn = gr.Button("Start", variant="primary")
                results_box = gr.Textbox(label="Results")
                # Pre-filled example inputs users can click to try the demo.
                gr.Examples(
                    examples=audio_examples,
                    inputs=[audio_in, lang_dropdown],
                    examples_per_page=20,
                )
        # Wire the button to the inference function.
        start_btn.click(
            model_inference,
            inputs=[audio_in, lang_dropdown],
            outputs=results_box,
        )
    demo.launch()
# Script entry point: start the Gradio app when run directly.
if __name__ == "__main__":
    launch()