# coding=utf-8
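"""Gradio demo that transcribes Cantonese call audio.

Input from the UI is converted to 16 kHz mono and handed to
``process_audio`` from the ``sv`` module, which performs the actual
transcription.
"""
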
import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio
from gradio.themes.utils import colors
from sv import process_audio


@spaces.GPU
def model_inference(input_wav, language):
    # Default to automatic language detection when none is selected
    language = language if language else "auto"

    # Gradio's Audio component yields a (sample_rate, array) tuple
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Normalise integer PCM to float32 in [-1, 1]; leave float input as-is
        if np.issubdtype(input_wav.dtype, np.integer):
            input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Mix multi-channel audio down to mono
        input_wav = input_wav.mean(-1) if input_wav.ndim > 1 else input_wav
        # Resample to the 16 kHz expected downstream
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[0].numpy()

    # Write to a temporary file and hand off to the transcription pipeline
    with sf.SoundFile("temp.wav", "w", samplerate=16000, channels=1) as f:
        f.write(input_wav)
    result = process_audio("temp.wav", language=language)
    return result


def launch():
    with gr.Blocks() as demo:
        gr.Markdown("# Cantonese Call Transcriber")
        gr.Markdown(
            """
This tool transcribes Cantonese audio calls into text.

## How to use:
1. Upload an audio file or use the provided example.
2. Click the 'Process Audio' button.
3. The transcription will appear in the output box.
"""
        )
        # Create the components up front with render=False so gr.Examples can
        # reference them; they are placed into the layout below via .render()
        audio_input = gr.Audio(label="Input", render=False)
        text_output = gr.Textbox(lines=10, label="Output", render=False)

        # Show the bundled example above the main interface
        gr.Examples(
            examples=[["example/scb.mp3"]],
            inputs=[audio_input],
            outputs=[text_output],
            fn=lambda x: model_inference(x, "yue"),
            examples_per_page=1,
        )

        # Main interface: input and button on the left, transcript on the right
        with gr.Row():
            with gr.Column(scale=2):
                audio_input.render()
                fn_button = gr.Button("Process Audio", variant="primary")
            with gr.Column(scale=3):
                text_output.render()

        # Run the transcription when the button is clicked
        fn_button.click(
            fn=lambda x: model_inference(x, "yue"),
            inputs=[audio_input],
            outputs=[text_output],
        )

    demo.launch()


if __name__ == "__main__":
launch()
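
# Minimal sketch of calling the pipeline directly, without the UI. It assumes
# the bundled example/scb.mp3 is decodable by soundfile and mimics the
# (sample_rate, int16 array) tuple the Gradio Audio component would deliver:
#
#     data, sr = sf.read("example/scb.mp3", dtype="int16")
#     print(model_inference((sr, data), language="yue"))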