# coding=utf-8
import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio

from sv import process_audio


@spaces.GPU
def model_inference(input_wav, language=None):
    # Fall back to automatic language detection when none is selected
    language = language if language else "auto"

    # Gradio delivers recorded/uploaded audio as a (sample_rate, ndarray) tuple
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Convert 16-bit PCM samples to floats in [-1.0, 1.0]
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Downmix multi-channel audio to mono
        if len(input_wav.shape) > 1:
            input_wav = input_wav.mean(-1)
        # Resample to the 16 kHz rate the model expects
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(
                torch.from_numpy(input_wav).float()[None, :]
            )[0].numpy()

    # Persist the normalized audio to a temporary file and run the model on it
    with sf.SoundFile("temp.wav", "w", samplerate=16000, channels=1) as f:
        f.write(input_wav)

    result = process_audio("temp.wav", language=language)
    return result


def launch():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        # Build the input/output components first: gr.Examples references
        # them, so creating the examples row before them raises a NameError
        with gr.Row():
            with gr.Column(scale=2):
                audio_inputs = gr.Audio(label="Input")
                fn_button = gr.Button("Process Audio", variant="primary")
            with gr.Column(scale=3):
                text_outputs = gr.Textbox(lines=10, label="Output")

        with gr.Row():
            gr.Examples(
                examples=[["example/scb.mp3"]],
                inputs=[audio_inputs],
                outputs=text_outputs,
                fn=lambda x: model_inference(x, "yue"),
            )

        # Wire the button to the inference function; the UI has no language
        # selector, so `language` falls back to automatic detection
        fn_button.click(
            fn=model_inference,
            inputs=[audio_inputs],
            outputs=text_outputs,
        )

    demo.launch()


if __name__ == "__main__":
    launch()