terry-li-hm committed
Commit 9ecefd1 · 1 Parent(s): 09b9a40
Files changed (1)
  1. app.py +15 -22
app.py CHANGED
@@ -4,14 +4,17 @@ import base64
 import io
 import os
 import re
+import tempfile
 
 import gradio as gr
 import librosa
 import numpy as np
+import soundfile as sf
 import spaces
 import torch
 import torchaudio
 from funasr import AutoModel
+from sv import clean_and_emoji_annotate_speech, process_audio
 
 model = "FunAudioLLM/SenseVoiceSmall"
 model = AutoModel(
@@ -145,7 +148,6 @@ def format_str_v3(s):
 
 @spaces.GPU
 def model_inference(input_wav, language, fs=16000):
-    # task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
     language_abbr = {
         "auto": "auto",
         "zh": "zh",
@@ -156,42 +158,33 @@ def model_inference(input_wav, language, fs=16000):
         "nospeech": "nospeech",
     }
 
-    # task = "Speech Recognition" if task is None else task
     language = "auto" if len(language) < 1 else language
    selected_language = language_abbr[language]
-    # selected_task = task_abbr.get(task)
-
-    # print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")
 
+    # Handle input_wav format
     if isinstance(input_wav, tuple):
         fs, input_wav = input_wav
         input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
         if len(input_wav.shape) > 1:
             input_wav = input_wav.mean(-1)
         if fs != 16000:
-            print(f"audio_fs: {fs}")
             resampler = torchaudio.transforms.Resample(fs, 16000)
             input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
             input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
 
-    merge_vad = True # False if selected_task == "ASR" else True
-    print(f"language: {language}, merge_vad: {merge_vad}")
-    text = model.generate(
-        input=input_wav,
-        cache={},
-        language=language,
-        use_itn=True,
-        batch_size_s=500,
-        merge_vad=merge_vad,
-    )
-
-    print(text)
-    text = text[0]["text"]
-    text = format_str_v3(text)
+    # Save the input audio to a temporary file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
+        sf.write(temp_audio.name, input_wav, 16000)
+        temp_audio_path = temp_audio.name
 
-    print(text)
+    try:
+        # Process the audio using the function from sv.py
+        result = process_audio(temp_audio_path, language=selected_language)
+    finally:
+        # Remove the temporary audio file
+        os.remove(temp_audio_path)
 
-    return text
+    return result
 
 
 audio_examples = [
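
sv.py itself is not part of this commit, so `process_audio` is only visible at its call site. Based on the inline code removed above, a minimal sketch of what it presumably wraps could look like the following; the signature, the module-level `model` handle, and the use of `clean_and_emoji_annotate_speech` as the formatting step are assumptions, not the confirmed implementation:

```python
# Hypothetical sketch of sv.py, inferred from the model.generate(...) call
# this commit removes from app.py. Not the actual file.
from funasr import AutoModel

model = AutoModel(model="FunAudioLLM/SenseVoiceSmall")  # assumed module-level handle


def clean_and_emoji_annotate_speech(text: str) -> str:
    # Body not shown in this commit; presumably the successor to app.py's
    # format_str_v3, mapping SenseVoice event/emotion tags to emoji.
    ...


def process_audio(audio_path: str, language: str = "auto") -> str:
    # FunASR's AutoModel.generate accepts a wav path as input; the keyword
    # arguments here mirror the inline call removed from model_inference.
    res = model.generate(
        input=audio_path,
        cache={},
        language=language,
        use_itn=True,
        batch_size_s=500,
        merge_vad=True,
    )
    return clean_and_emoji_annotate_speech(res[0]["text"])
```

Writing the temp file with `delete=False` and removing it in a `finally` block keeps the file alive after the `with` block closes it (so `process_audio` can reopen it by path) while still guaranteeing cleanup even if inference raises.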
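A quick way to exercise the new path locally, assuming Gradio's numpy audio format of a `(sample_rate, int16 ndarray)` tuple; the 440 Hz tone and 44.1 kHz rate are arbitrary test values, not anything this commit ships:

```python
# Hypothetical smoke test for model_inference with a synthetic tone,
# shaped like the (sample_rate, int16 array) tuple Gradio delivers.
import numpy as np

sr = 44100  # deliberately not 16 kHz, to exercise the resampling branch
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 440 * t) * np.iinfo(np.int16).max).astype(np.int16)

print(model_inference((sr, tone), "auto"))
```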