# NOTE(review): removed non-code page residue ("Spaces:" / "Sleeping") that
# was captured with the source and made the file unparseable as Python.
# coding=utf-8
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio

from sv import process_audio
def model_inference(input_wav, language):
    """Transcribe an audio clip via the project's `process_audio` pipeline.

    Args:
        input_wav: Either a path-like/array accepted by `sf.write`, or the
            Gradio microphone tuple `(sample_rate, np.ndarray)` with int16
            samples — TODO confirm int16 is the only dtype Gradio delivers.
        language: Language code forwarded to `process_audio`; falsy values
            fall back to "auto".

    Returns:
        Whatever `process_audio` returns for the 16 kHz mono clip.
    """
    language = language if language else "auto"
    # Gradio's Audio component hands back (sample_rate, samples).
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Normalize int16 PCM to float32 in [-1, 1).
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Downmix multi-channel audio to mono.
        input_wav = input_wav.mean(-1) if len(input_wav.shape) > 1 else input_wav
        # The downstream model expects 16 kHz input.
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[
                0
            ].numpy()
    # BUG FIX: the original wrote to a hard-coded "temp.wav" in the CWD,
    # which races between concurrent requests and is never deleted. Use a
    # unique temp file and clean it up after processing.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    try:
        sf.write(tmp.name, input_wav, 16000)
        result = process_audio(tmp.name, language=language)
    finally:
        os.unlink(tmp.name)
    return result
def launch():
    """Build the Gradio Blocks UI and start the local server."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Cantonese Call Transcriber")
        gr.Markdown("## Try an example:")
        # Components are created (and rendered) here, then referenced below.
        audio_inputs = gr.Audio(label="Input")
        text_outputs = gr.Textbox(lines=10, label="Output")
        with gr.Row():
            gr.Examples(
                examples=[["example/scb.mp3"]],
                inputs=[audio_inputs],
                outputs=text_outputs,
                fn=lambda x: model_inference(x, "yue"),
                examples_per_page=1,
            )
        with gr.Row():
            with gr.Column(scale=2):
                # NOTE(review): a bare reference does not re-render a
                # component in Blocks — confirm whether `.render()` was
                # intended to move the widgets into these columns.
                audio_inputs
                fn_button = gr.Button("Process Audio", variant="primary")
            with gr.Column(scale=3):
                text_outputs
        # BUG FIX: the "Process Audio" button was never connected to any
        # callback, so clicking it did nothing. Wire it to the same
        # Cantonese ("yue") inference the Examples component uses.
        fn_button.click(
            fn=lambda x: model_inference(x, "yue"),
            inputs=audio_inputs,
            outputs=text_outputs,
        )
    demo.launch()
# Script entry point: only launch the UI when run directly, not on import.
if __name__ == "__main__":
    launch()