# coding=utf-8
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import spaces  # noqa: F401  (Hugging Face Spaces runtime; must be imported before torch on ZeroGPU)
import torch
import torchaudio

from sv import process_audio


def model_inference(input_wav, language, fs=16000):
    """Run speech recognition on `input_wav` and return the transcription."""
    # Languages supported by the underlying model; anything empty or
    # unrecognized falls back to automatic detection.
    supported_languages = {"auto", "zh", "en", "yue", "ja", "ko", "nospeech"}
    selected_language = language if language in supported_languages else "auto"

    # Gradio's Audio component returns a (sample_rate, np.ndarray) tuple;
    # normalize 16-bit PCM to float32 in [-1, 1] and downmix stereo to mono.
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if input_wav.ndim > 1:
            input_wav = input_wav.mean(-1)

    # The recognizer expects 16 kHz input; resample anything else.
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(fs, 16000)
        input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
        input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
    # process_audio expects a file path, so write the buffer to a temp WAV.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, input_wav, 16000)
        temp_audio_path = temp_audio.name

    try:
        # Run the actual recognition pipeline from sv.py.
        result = process_audio(temp_audio_path, language=selected_language)
    finally:
        # Remove the temporary file whether or not recognition succeeded.
        os.remove(temp_audio_path)

    return result
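

# For reference: a minimal sketch of what sv.py's process_audio might do
# internally. This is an assumption, not the actual sv.py implementation;
# the FunASR SenseVoiceSmall checkpoint and the post-processing call are
# illustrative stand-ins, and this function is not wired into the app.
def _process_audio_sketch(audio_path, language="auto"):
    from funasr import AutoModel
    from funasr.utils.postprocess_utils import rich_transcription_postprocess

    model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True)
    res = model.generate(
        input=audio_path,
        cache={},
        language=language,  # "auto", "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=True,  # apply punctuation and inverse text normalization
    )
    return rich_transcription_postprocess(res[0]["text"])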


audio_examples = [
    ["example/mtr.mp3", "auto"],
]


def launch():
    """Build and launch the Gradio demo."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        with gr.Row():
            with gr.Column():
                audio_inputs = gr.Audio(label="Upload audio or use the microphone")
                with gr.Accordion("Configuration"):
                    language_inputs = gr.Dropdown(
                        choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                        value="auto",
                        label="Language",
                    )
                fn_button = gr.Button("Start", variant="primary")
                text_outputs = gr.Textbox(label="Results")
                gr.Examples(
                    examples=audio_examples,
                    inputs=[audio_inputs, language_inputs],
                    examples_per_page=20,
                )
        fn_button.click(
            model_inference,
            inputs=[audio_inputs, language_inputs],
            outputs=text_outputs,
        )
    demo.launch()
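

# Minimal headless smoke test (not part of the original app): feeds one
# second of 48 kHz silence through model_inference, exercising the tuple
# normalization and resampling paths without the Gradio UI.
def _smoke_test():
    sr = 48000
    silence = np.zeros(sr, dtype=np.int16)
    return model_inference((sr, silence), "auto")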


if __name__ == "__main__":
    launch()