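"""Gradio demo for the Cantonese Call Transcriber.

Accepts recorded or uploaded audio, converts it to 16 kHz mono float PCM,
and transcribes it with sv.process_audio.
"""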
import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio

from sv import process_audio
|
|
@spaces.GPU  # request GPU time when hosted on a Hugging Face ZeroGPU Space
def model_inference(input_wav, language):
    # Fall back to automatic language detection when no language is given.
    language = language if language else "auto"

    # Gradio's Audio component delivers numpy audio as a (sample_rate, data) tuple.
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Scale int16 PCM samples to float32 in [-1.0, 1.0].
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Downmix multi-channel audio to mono.
        input_wav = input_wav.mean(-1) if len(input_wav.shape) > 1 else input_wav
        # Resample to the 16 kHz rate the model expects.
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[0].numpy()

    # Persist the normalized audio so process_audio can read it from disk.
    with sf.SoundFile("temp.wav", "w", samplerate=16000, channels=1) as f:
        f.write(input_wav)

    return process_audio("temp.wav", language=language)
|
|
def launch():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Cantonese Call Transcriber")
        gr.Markdown("## Try an example:")

        # Create the components unrendered so gr.Examples can reference them;
        # they are placed into the layout below with .render(). A bare variable
        # reference inside a Column does not move an already-rendered component.
        audio_inputs = gr.Audio(label="Input", render=False)
        text_outputs = gr.Textbox(lines=10, label="Output", render=False)

        with gr.Row():
            gr.Examples(
                examples=[["example/scb.mp3"]],
                inputs=[audio_inputs],
                outputs=text_outputs,
                fn=lambda x: model_inference(x, "yue"),
                examples_per_page=1,
            )

        with gr.Row():
            with gr.Column(scale=2):
                audio_inputs.render()
                fn_button = gr.Button("Process Audio", variant="primary")
            with gr.Column(scale=3):
                text_outputs.render()

        # Transcribe in Cantonese ("yue") when the button is clicked.
        fn_button.click(
            fn=lambda x: model_inference(x, "yue"),
            inputs=audio_inputs,
            outputs=text_outputs,
        )

    demo.launch()
|
|
if __name__ == "__main__":
    launch()