# coding=utf-8

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio

from sv import process_audio


@spaces.GPU
def model_inference(input_wav, language):
    if input_wav is None:
        return "Please upload or record an audio clip first."

    # Default to automatic language detection when none is selected
    language = language if language else "auto"

    # Gradio's Audio component (type="numpy") yields a (sample_rate, data) tuple
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Normalise integer PCM (int16 by default) to float32 in [-1, 1]
        if np.issubdtype(input_wav.dtype, np.integer):
            input_wav = input_wav.astype(np.float32) / np.iinfo(input_wav.dtype).max
        else:
            input_wav = input_wav.astype(np.float32)
        # Down-mix multi-channel audio to mono
        input_wav = input_wav.mean(-1) if len(input_wav.shape) > 1 else input_wav
        # Resample to the 16 kHz expected by the model
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[
                0
            ].numpy()

    # Write the audio to a temporary 16 kHz mono WAV file and transcribe it
    sf.write("temp.wav", input_wav, samplerate=16000)
    return process_audio("temp.wav", language=language)


def launch():
    with gr.Blocks() as demo:
        gr.Markdown("# Cantonese Call Transcriber")
        gr.Markdown(
            """
            This tool transcribes Cantonese audio calls into text.

            ## How to use:
            1. Upload an audio file or use the example provided at the bottom of the page.
            2. Click the 'Process Audio' button.
            3. The transcription will appear in the output box.
            """
        )

        # Define the components without rendering them yet, so the Examples
        # block can reference them while they are laid out further down the page
        audio_input = gr.Audio(label="Input", render=False)
        text_output = gr.Textbox(lines=10, label="Output", render=False)

        # Place the Examples component first
        gr.Examples(
            examples=[["example/scb.mp3"]],
            inputs=[audio_input],
            outputs=[text_output],
            fn=lambda x: model_inference(x, "yue"),
            examples_per_page=1,
        )

        # Main interface
        with gr.Row():
            with gr.Column(scale=2):
                audio_input.render()
                fn_button = gr.Button("Process Audio", variant="primary")
            with gr.Column(scale=3):
                text_output.render()

        # Set up event handler
        fn_button.click(
            fn=lambda x: model_inference(x, "yue"),
            inputs=[audio_input],
            outputs=[text_output],
        )

    demo.launch()


if __name__ == "__main__":
    launch()
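

# ---------------------------------------------------------------------------
# Local testing notes (assumptions, not part of the Space's published API):
#
# `sv.process_audio` is this Space's own module; judging from the call in
# `model_inference`, it takes the path of a 16 kHz mono WAV file plus a
# `language` keyword and returns the transcript as a string. A hypothetical
# sv.py stub for exercising the UI without the real model might look like:
#
#     def process_audio(audio_path: str, language: str = "auto") -> str:
#         """Stub: echo the request instead of running ASR."""
#         return f"[stub] would transcribe {audio_path} (language={language})"
#
# With such a stub in place, the handler can also be sanity-checked without
# launching the UI, mimicking the (sample_rate, data) tuple Gradio produces:
#
#     data, fs = sf.read("example/scb.mp3", dtype="int16")
#     print(model_inference((fs, data), "yue"))
# ---------------------------------------------------------------------------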