# coding=utf-8
import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio

from sv import process_audio


@spaces.GPU
def model_inference(input_wav, language):
    # Default to automatic language detection when nothing is selected
    language = language if language else "auto"

    # Gradio's Audio component (type="numpy") yields (sample_rate, ndarray);
    # assume 16-bit PCM samples and scale them to floats in [-1.0, 1.0]
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Down-mix multi-channel audio to mono
        input_wav = input_wav.mean(-1) if len(input_wav.shape) > 1 else input_wav
        # Resample to the 16 kHz rate the model expects
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[
                0
            ].numpy()

    # Write the preprocessed audio to a 16 kHz mono WAV in the working
    # directory, then run inference on it
    with sf.SoundFile("temp.wav", "w", samplerate=16000, channels=1) as f:
        f.write(input_wav)

    result = process_audio("temp.wav", language=language)
    return result


def launch():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# SenseVoice Audio Processing")
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("## Input")
                audio_inputs = gr.Audio(label="Upload audio or use the microphone")
                language_inputs = gr.Dropdown(
                    choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                    value="auto",
                    label="Language",
                )
                fn_button = gr.Button("Process Audio", variant="primary")
            with gr.Column(scale=3):
                gr.Markdown("## Output")
                text_outputs = gr.Textbox(label="Results", lines=10)
        with gr.Row():
            gr.Markdown("## Examples")
            gr.Examples(
                examples=[["example/mtr.mp3", "yue"]],
                inputs=[audio_inputs, language_inputs],
                outputs=text_outputs,
                fn=model_inference,
            )
        fn_button.click(
            model_inference,
            inputs=[audio_inputs, language_inputs],
            outputs=text_outputs,
        )

    demo.launch()


if __name__ == "__main__":
    launch()