# coding=utf-8
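"""Gradio demo for SenseVoice audio processing."""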

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio
from sv import process_audio


# Request a GPU for the duration of the call (Hugging Face Spaces decorator).
@spaces.GPU
def model_inference(input_wav, language):
    # Fall back to automatic language detection when nothing is selected.
    language = language if language else "auto"

    # gr.Audio delivers a (sample_rate, samples) tuple of int16 PCM.
    # Convert to mono float32 in [-1, 1] and resample to the 16 kHz
    # rate the model expects.
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if input_wav.ndim > 1:
            input_wav = input_wav.mean(-1)  # downmix multi-channel to mono
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[
                0
            ].numpy()

    # Round-trip through a temporary WAV file, since process_audio takes a
    # file path rather than an in-memory array.
    sf.write("temp.wav", input_wav, samplerate=16000)
    return process_audio("temp.wav", language=language)


def launch():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# SenseVoice Audio Processing")

        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("## Input")
                audio_inputs = gr.Audio(label="Upload audio or use the microphone")
                language_inputs = gr.Dropdown(
                    choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                    value="auto",
                    label="Language",
                )
                fn_button = gr.Button("Process Audio", variant="primary")

            with gr.Column(scale=3):
                gr.Markdown("## Output")
                text_outputs = gr.Textbox(label="Results", lines=10)

        with gr.Row():
            gr.Markdown("## Examples")
            gr.Examples(
                examples=[["example/mtr.mp3", "yue"]],
                inputs=[audio_inputs, language_inputs],
                outputs=text_outputs,
                fn=model_inference,
            )

        # Wire the button to the inference function.
        fn_button.click(
            model_inference,
            inputs=[audio_inputs, language_inputs],
            outputs=text_outputs,
        )

    demo.launch()


if __name__ == "__main__":
    launch()