terry-li-hm committed on
Commit 103d57b · 1 Parent(s): 794435e
Files changed (1)
  1. app.py +23 -55
app.py CHANGED
@@ -1,87 +1,56 @@
 # coding=utf-8

-import base64
-import io
-import os
-import re
-import tempfile
-
 import gradio as gr
-import librosa
 import numpy as np
 import soundfile as sf
 import spaces
 import torch
 import torchaudio
-from funasr import AutoModel
-from sv import clean_and_emoji_annotate_speech, process_audio
+from sv import process_audio


 @spaces.GPU
-def model_inference(input_wav, language, fs=16000):
-    language_abbr = {
-        "auto": "auto",
-        "zh": "zh",
-        "en": "en",
-        "yue": "yue",
-        "ja": "ja",
-        "ko": "ko",
-        "nospeech": "nospeech",
-    }
-
-    language = "auto" if len(language) < 1 else language
-    selected_language = language_abbr[language]
+def model_inference(input_wav, language):
+    # Simplify language selection
+    language = language if language else "auto"

     # Handle input_wav format
     if isinstance(input_wav, tuple):
         fs, input_wav = input_wav
         input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
-        if len(input_wav.shape) > 1:
-            input_wav = input_wav.mean(-1)
+        input_wav = input_wav.mean(-1) if len(input_wav.shape) > 1 else input_wav
     if fs != 16000:
         resampler = torchaudio.transforms.Resample(fs, 16000)
-        input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
-        input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
-
-    # Save the input audio to a temporary file
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
-        sf.write(temp_audio.name, input_wav, 16000)
-        temp_audio_path = temp_audio.name
+        input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[
+            0
+        ].numpy()

-    try:
-        # Process the audio using the function from sv.py
-        result = process_audio(temp_audio_path, language=selected_language)
-    finally:
-        # Remove the temporary audio file
-        os.remove(temp_audio_path)
+    # Process audio
+    with sf.SoundFile("temp.wav", "w", samplerate=16000, channels=1) as f:
+        f.write(input_wav)
+    result = process_audio("temp.wav", language=language)

     return result


-audio_examples = [
-    ["example/mtr.mp3", "auto"],
-]
-
-
 def launch():
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         with gr.Row():
             with gr.Column():
                 audio_inputs = gr.Audio(label="Upload audio or use the microphone")
-
-                with gr.Accordion("Configuration"):
-                    language_inputs = gr.Dropdown(
-                        choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
-                        value="auto",
-                        label="Language",
-                    )
+                language_inputs = gr.Dropdown(
+                    choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
+                    value="auto",
+                    label="Language",
+                )
                 fn_button = gr.Button("Start", variant="primary")
                 text_outputs = gr.Textbox(label="Results")
-        gr.Examples(
-            examples=audio_examples,
-            inputs=[audio_inputs, language_inputs],
-            examples_per_page=20,
-        )
+
+                gr.Examples(
+                    examples=[["example/mtr.mp3", "yue"]],
+                    inputs=[audio_inputs, language_inputs],
+                    examples_per_page=20,
+                )

         fn_button.click(
             model_inference,
@@ -93,5 +62,4 @@ def launch():


 if __name__ == "__main__":
-    # iface.launch()
     launch()
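
One behavior change worth flagging: the old signature model_inference(input_wav, language, fs=16000) gave fs a default, while the new model_inference(input_wav, language) only binds fs inside the isinstance(input_wav, tuple) branch, so any non-tuple input from gr.Audio reaches if fs != 16000: with fs unbound. A minimal sketch of the preprocessing with the default restored; the normalization, downmix, and resampling steps come from the diff, while the standalone helper and its name are illustrative:

import numpy as np
import torch
import torchaudio


def preprocess_audio(input_wav, fs=16000):
    # Gradio's numpy audio mode delivers (sample_rate, int16 array) tuples
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Scale int16 PCM into [-1.0, 1.0] float32
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Downmix multi-channel recordings by averaging channels
        if input_wav.ndim > 1:
            input_wav = input_wav.mean(-1)
    if fs != 16000:
        # torchaudio resamples along the last axis; add a channel dim, then drop it
        resampler = torchaudio.transforms.Resample(fs, 16000)
        input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[0].numpy()
    return input_wav

With fs defaulted, unexpected input still fails, but inside process_audio rather than as an UnboundLocalError in the resampling guard.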
 
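The rewrite also trades the tempfile.NamedTemporaryFile block for a hardcoded temp.wav, which is simpler but lets two concurrent requests on the Space overwrite each other's audio. A sketch that keeps the flat control flow of the new code while writing to a unique path again, assuming process_audio accepts any readable WAV path (that is how both versions call it); the transcribe wrapper is illustrative:

import os
import tempfile

import soundfile as sf

from sv import process_audio


def transcribe(input_wav, language="auto"):
    # Create a unique temp file per request instead of a shared "temp.wav"
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # sf.write opens the path itself; release the raw descriptor
    try:
        sf.write(path, input_wav, 16000)  # 16 kHz mono float32 from preprocessing
        return process_audio(path, language=language)
    finally:
        os.remove(path)  # clean up even if processing raises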