import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio

from sv import process_audio


@spaces.GPU
def model_inference(input_wav, language):
    """Run SenseVoice processing on the uploaded audio and return the text result."""
    language = language if language else "auto"

    if isinstance(input_wav, tuple):
        # Gradio delivers recorded/uploaded audio as (sample_rate, int16 ndarray).
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Down-mix multi-channel audio to mono.
        input_wav = input_wav.mean(-1) if len(input_wav.shape) > 1 else input_wav
        if fs != 16000:
            # Resample to the 16 kHz rate expected by the model.
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[0].numpy()

    # Write the prepared waveform to a temporary file and hand it to the backend.
    with sf.SoundFile("temp.wav", "w", samplerate=16000, channels=1) as f:
        f.write(input_wav)
    result = process_audio("temp.wav", language=language)

    return result


def launch():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# SenseVoice Audio Processing")

        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("## Input")
                audio_inputs = gr.Audio(label="Upload audio or use the microphone")
                language_inputs = gr.Dropdown(
                    choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                    value="auto",
                    label="Language",
                )
                fn_button = gr.Button("Process Audio", variant="primary")

            with gr.Column(scale=3):
                gr.Markdown("## Output")
                text_outputs = gr.Textbox(label="Results", lines=10)

        with gr.Row():
            gr.Markdown("## Examples")
            gr.Examples(
                examples=[["example/mtr.mp3", "yue"]],
                inputs=[audio_inputs, language_inputs],
                outputs=text_outputs,
                fn=model_inference,
            )

        fn_button.click(
            model_inference,
            inputs=[audio_inputs, language_inputs],
            outputs=text_outputs,
        )

    demo.launch()


if __name__ == "__main__":
    launch()