# coding=utf-8
# NOTE(review): the "Spaces / Sleeping / Sleeping" banner text that preceded this
# line was HuggingFace Spaces page residue from extraction, not source code.
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio

from sv import process_audio
def model_inference(input_wav, language):
    """Run SenseVoice processing on one audio clip.

    Args:
        input_wav: Either ``None`` (nothing recorded/uploaded) or a
            ``(sample_rate, samples)`` tuple as produced by ``gr.Audio``
            with numpy samples.
        language: Language code ("auto", "zh", "en", "yue", "ja", "ko",
            "nospeech"); any falsy value falls back to "auto".

    Returns:
        The string result of ``process_audio``, or an explanatory message
        when no audio was provided.
    """
    # Simplify language selection
    language = language if language else "auto"

    # Guard: gr.Audio hands back None when the user submits without audio.
    if input_wav is None:
        return "No audio provided."

    # Handle input_wav format: normalize to float32 mono at 16 kHz.
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        if np.issubdtype(input_wav.dtype, np.integer):
            # Integer PCM -> [-1, 1]; scale by the *actual* dtype's max
            # (the original hard-coded int16, which mis-scales other widths).
            input_wav = input_wav.astype(np.float32) / np.iinfo(input_wav.dtype).max
        else:
            # Already float samples (gradio may return float32/float64).
            input_wav = input_wav.astype(np.float32)
        # Down-mix multi-channel audio to mono.
        input_wav = input_wav.mean(-1) if len(input_wav.shape) > 1 else input_wav
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[
                0
            ].numpy()

    # Process audio via a unique temp file: the original fixed "temp.wav"
    # let concurrent requests clobber each other and was never removed.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_path = tmp.name
    tmp.close()
    try:
        with sf.SoundFile(tmp_path, "w", samplerate=16000, channels=1) as f:
            f.write(input_wav)
        result = process_audio(tmp_path, language=language)
    finally:
        os.remove(tmp_path)
    return result
def launch():
    """Assemble the Gradio Blocks UI and start the demo server."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# SenseVoice Audio Processing")

        with gr.Row():
            # Left pane: audio source and language selector.
            with gr.Column(scale=2):
                gr.Markdown("## Input")
                audio_in = gr.Audio(label="Upload audio or use the microphone")
                lang_in = gr.Dropdown(
                    choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                    value="auto",
                    label="Language",
                )
                run_btn = gr.Button("Process Audio", variant="primary")

            # Right pane: transcription / processing result.
            with gr.Column(scale=3):
                gr.Markdown("## Output")
                text_out = gr.Textbox(label="Results", lines=10)

        with gr.Row():
            gr.Markdown("## Examples")
            gr.Examples(
                examples=[["example/mtr.mp3", "yue"]],
                inputs=[audio_in, lang_in],
                outputs=text_out,
                fn=model_inference,
            )

        # Wire the button to the inference function.
        run_btn.click(
            model_inference,
            inputs=[audio_in, lang_in],
            outputs=text_out,
        )

    demo.launch()
if __name__ == "__main__":
    # Start the Gradio app only when executed as a script (not on import).
    launch()