# coding=utf-8
import gradio as gr
import numpy as np
import soundfile as sf
import spaces  # Hugging Face Spaces runtime helper
import torch
import torchaudio

from sv import process_audio
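
# `process_audio` comes from the project-local `sv` module, which is not shown
# here. It is assumed to take a path to a 16 kHz mono WAV file plus a
# `language` keyword and to return the transcript as a string. A minimal
# stand-in with that assumed contract, handy when `sv` is unavailable locally:
#
#     def process_audio(audio_path: str, language: str = "auto") -> str:
#         ...  # load the ASR model, transcribe audio_path, return the text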


def model_inference(input_wav, language):
    # Fall back to automatic language detection when none is selected
    language = language if language else "auto"

    # Gradio's numpy-backed Audio component passes a (sample_rate, samples) tuple
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Scale int16 PCM samples to float32 in [-1.0, 1.0]
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Down-mix multi-channel recordings to mono
        input_wav = input_wav.mean(-1) if len(input_wav.shape) > 1 else input_wav
        # Resample to the 16 kHz rate the model expects
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[0].numpy()

    # Write the normalized audio to a scratch WAV file and transcribe it
    sf.write("temp.wav", input_wav, samplerate=16000)
    result = process_audio("temp.wav", language=language)
    return result
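
# A hypothetical smoke test for model_inference, mimicking the
# (sample_rate, int16 samples) tuple a Gradio numpy upload produces;
# "yue" requests Cantonese. Uncomment to try locally:
#
#     sr = 8000
#     tone = (np.sin(2 * np.pi * 440 * np.arange(sr) / sr) * 32767).astype(np.int16)
#     print(model_inference((sr, tone), "yue"))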


def launch():
    # Custom CSS tweak; no gradio theme object is needed for this
    custom_css = """
    .gradio-container {color: rgb(70, 70, 70);}
    """

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("# Cantonese Call Transcriber")
        gr.Markdown(
            """
            This tool transcribes Cantonese audio calls into text.

            ## How to use:
            1. Upload an audio file or use the example provided at the bottom of the page.
            2. Click the 'Process Audio' button.
            3. The transcription will appear in the output box.
            """
        )

        # Create the components unrendered so they can be placed inside the
        # Row/Column layout below
        audio_input = gr.Audio(label="Input", render=False)
        text_output = gr.Textbox(lines=10, label="Output", render=False)
        # Main interface: input and button on the left, transcript on the right
        with gr.Row():
            with gr.Column(scale=2):
                audio_input.render()
                fn_button = gr.Button("Process Audio", variant="primary")
            with gr.Column(scale=3):
                text_output.render()

        # Transcribe with Cantonese ("yue") forced as the language
        fn_button.click(
            fn=lambda x: model_inference(x, "yue"),
            inputs=[audio_input],
            outputs=[text_output],
        )

        # Example audio at the bottom of the page; run_on_click transcribes
        # the example as soon as it is selected
        gr.Examples(
            examples=[["example/scb.mp3"]],
            inputs=[audio_input],
            outputs=[text_output],
            fn=lambda x: model_inference(x, "yue"),
            run_on_click=True,
            examples_per_page=1,
        )

    demo.launch()


if __name__ == "__main__":
    launch()