# coding=utf-8
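"""Gradio demo that transcribes uploaded or microphone audio.

Input audio is normalised, resampled to 16 kHz, written to a temporary WAV
file, and passed to process_audio from the accompanying sv.py module.
"""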

import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio
from sv import process_audio


@spaces.GPU
def model_inference(input_wav, language, fs=16000):
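    """Run speech recognition on the given audio and return the result text.

    input_wav is either a (sample_rate, samples) tuple (as delivered by
    gr.Audio in numpy mode) or a float32 array already sampled at 16 kHz.
    """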
    # Supported language codes; the identity mapping doubles as input validation.
    language_abbr = {
        "auto": "auto",
        "zh": "zh",
        "en": "en",
        "yue": "yue",
        "ja": "ja",
        "ko": "ko",
        "nospeech": "nospeech",
    }

    # Fall back to automatic detection when no language is selected.
    language = language if language else "auto"
    selected_language = language_abbr[language]

    # Guard against clicking "Start" with no audio selected.
    if input_wav is None:
        return "No audio provided."

    # gr.Audio in numpy mode yields a (sample_rate, samples) tuple of int16 PCM.
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Normalise 16-bit PCM to float32 in [-1.0, 1.0].
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Down-mix multi-channel recordings to mono.
        if input_wav.ndim > 1:
            input_wav = input_wav.mean(-1)
        # Resample to the 16 kHz rate expected by process_audio.
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()

    # Save the input audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, input_wav, 16000)
        temp_audio_path = temp_audio.name

    try:
        # Process the audio using the function from sv.py
        result = process_audio(temp_audio_path, language=selected_language)
    finally:
        # Remove the temporary audio file
        os.remove(temp_audio_path)

    return result


audio_examples = [
    ["example/mtr.mp3", "auto"],
]


def launch():
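    """Build the Gradio Blocks UI and start the web server."""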
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        with gr.Row():
            with gr.Column():
                audio_inputs = gr.Audio(
                    label="Upload audio or use the microphone",
                    sources=["upload", "microphone"],
                )

                with gr.Accordion("Configuration"):
                    language_inputs = gr.Dropdown(
                        choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                        value="auto",
                        label="Language",
                    )
                fn_button = gr.Button("Start", variant="primary")
                text_outputs = gr.Textbox(label="Results")
            gr.Examples(
                examples=audio_examples,
                inputs=[audio_inputs, language_inputs],
                examples_per_page=20,
            )

        fn_button.click(
            model_inference,
            inputs=[audio_inputs, language_inputs],
            outputs=text_outputs,
        )

    demo.launch()


if __name__ == "__main__":
    launch()