# coding=utf-8
import base64
import io
import os
import re
import tempfile
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio
from funasr import AutoModel
from sv import clean_and_emoji_annotate_speech, process_audio
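

# process_audio (imported from sv.py) is expected to take the path of a 16 kHz
# mono WAV file plus a language code and return the transcription text that is
# displayed in the UI below.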
@spaces.GPU
def model_inference(input_wav, language, fs=16000):
    language_abbr = {
        "auto": "auto",
        "zh": "zh",
        "en": "en",
        "yue": "yue",
        "ja": "ja",
        "ko": "ko",
        "nospeech": "nospeech",
    }
    language = "auto" if len(language) < 1 else language
    selected_language = language_abbr[language]

    # Handle input_wav format
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if len(input_wav.shape) > 1:
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()

    # Save the input audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, input_wav, 16000)
        temp_audio_path = temp_audio.name

    try:
        # Process the audio using the function from sv.py
        result = process_audio(temp_audio_path, language=selected_language)
    finally:
        # Remove the temporary audio file
        os.remove(temp_audio_path)

    return result


audio_examples = [
    ["example/mtr.mp3", "auto"],
]
def launch():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        with gr.Row():
            with gr.Column():
                audio_inputs = gr.Audio(label="Upload audio or use the microphone")
                with gr.Accordion("Configuration"):
                    language_inputs = gr.Dropdown(
                        choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                        value="auto",
                        label="Language",
                    )
                fn_button = gr.Button("Start", variant="primary")
                text_outputs = gr.Textbox(label="Results")
            gr.Examples(
                examples=audio_examples,
                inputs=[audio_inputs, language_inputs],
                examples_per_page=20,
            )
        fn_button.click(
            model_inference,
            inputs=[audio_inputs, language_inputs],
            outputs=text_outputs,
        )
    demo.launch()
if __name__ == "__main__":
# iface.launch()
launch()