File size: 1,888 Bytes
5cb9c90 57d9268 5cb9c90 9ecefd1 57d9268 5cb9c90 103d57b 5cb9c90 27c943a 103d57b 57d9268 9ecefd1 57d9268 103d57b 57d9268 103d57b 57d9268 103d57b 57d9268 9ecefd1 5cb9c90 57d9268 103d57b 57d9268 103d57b 57d9268 5cb9c90 57d9268 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# coding=utf-8
import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio
from sv import process_audio
@spaces.GPU
def model_inference(input_wav, language):
    """Run ``process_audio`` on one audio input and return its result.

    Args:
        input_wav: Audio coming from the ``gr.Audio`` component. A
            ``(sample_rate, samples)`` tuple is converted to mono float32 at
            16 kHz before being written; any other value is written as-is.
        language: Language code chosen in the UI. Falsy values fall back to
            ``"auto"``.

    Returns:
        Whatever ``process_audio`` returns for the written WAV file.

    Raises:
        gr.Error: If no audio was provided.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    language = language or "auto"

    # Clicking "Start" without audio delivers None; fail with a readable
    # message instead of an opaque error from the audio writer.
    if input_wav is None:
        raise gr.Error("Please upload or record audio first.")

    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # NOTE(review): the scaling assumes int16 samples -- confirm that the
        # audio component never delivers another dtype.
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Down-mix multi-channel audio by averaging over the last axis.
        if input_wav.ndim > 1:
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            batched = torch.from_numpy(input_wav).float()[None, :]
            input_wav = resampler(batched)[0].numpy()

    # A unique temporary path per call: a fixed "temp.wav" would be
    # overwritten by concurrent requests and was never cleaned up.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        with sf.SoundFile(wav_path, "w", samplerate=16000, channels=1) as f:
            f.write(input_wav)
        return process_audio(wav_path, language=language)
    finally:
        os.remove(wav_path)
def launch():
    """Build the Gradio Blocks UI and start serving it.

    The page holds an audio input, a language dropdown, a start button, a
    results textbox and one example; the button sends the audio and the
    selected language to ``model_inference`` and shows the result.
    """
    language_choices = ["auto", "zh", "en", "yue", "ja", "ko", "nospeech"]
    example_rows = [["example/mtr.mp3", "yue"]]

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        with gr.Row():
            with gr.Column():
                audio_box = gr.Audio(label="Upload audio or use the microphone")
                language_box = gr.Dropdown(
                    choices=language_choices,
                    value="auto",
                    label="Language",
                )
                start_button = gr.Button("Start", variant="primary")
                result_box = gr.Textbox(label="Results")
            gr.Examples(
                examples=example_rows,
                inputs=[audio_box, language_box],
                examples_per_page=20,
            )
        # Wire the button: (audio, language) -> model_inference -> results.
        start_button.click(
            model_inference,
            inputs=[audio_box, language_box],
            outputs=result_box,
        )
    demo.launch()
# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    launch()
|