terry-li-hm committed
Commit: 9ecefd1
Parent(s): 09b9a40
Commit message: Update
app.py CHANGED
@@ -4,14 +4,17 @@ import base64
 import io
 import os
 import re
+import tempfile
 
 import gradio as gr
 import librosa
 import numpy as np
+import soundfile as sf
 import spaces
 import torch
 import torchaudio
 from funasr import AutoModel
+from sv import clean_and_emoji_annotate_speech, process_audio
 
 model = "FunAudioLLM/SenseVoiceSmall"
 model = AutoModel(
@@ -145,7 +148,6 @@ def format_str_v3(s):
 
 @spaces.GPU
 def model_inference(input_wav, language, fs=16000):
-    # task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
     language_abbr = {
         "auto": "auto",
         "zh": "zh",
@@ -156,42 +158,33 @@ def model_inference(input_wav, language, fs=16000):
         "nospeech": "nospeech",
     }
 
-    # task = "Speech Recognition" if task is None else task
     language = "auto" if len(language) < 1 else language
     selected_language = language_abbr[language]
-    # selected_task = task_abbr.get(task)
-
-    # print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")
 
+    # Handle input_wav format
     if isinstance(input_wav, tuple):
         fs, input_wav = input_wav
         input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
         if len(input_wav.shape) > 1:
             input_wav = input_wav.mean(-1)
         if fs != 16000:
-            print(f"audio_fs: {fs}")
             resampler = torchaudio.transforms.Resample(fs, 16000)
             input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
             input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
 
-    merge_vad = True
-    print(f"language: {language}, merge_vad: {merge_vad}")
-    text = model.generate(
-        input=input_wav,
-        cache={},
-        language=language,
-        use_itn=True,
-        batch_size_s=500,
-        merge_vad=merge_vad,
-    )
-
-    print(text)
-    text = text[0]["text"]
-    text = format_str_v3(text)
-
-    print(text)
-
-    return text
+    # Save the input audio to a temporary file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
+        sf.write(temp_audio.name, input_wav, 16000)
+        temp_audio_path = temp_audio.name
+
+    try:
+        # Process the audio using the function from sv.py
+        result = process_audio(temp_audio_path, language=selected_language)
+    finally:
+        # Remove the temporary audio file
+        os.remove(temp_audio_path)
+
+    return result
 
 
 audio_examples = [
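
Both sides of the last hunk share the same input-handling block: Gradio's audio component delivers a (sample_rate, int16 array) tuple, which is scaled to float32 in [-1, 1], averaged down to mono, and resampled to the 16 kHz the model expects. A minimal standalone sketch of that conversion (the 48 kHz stereo input is just an illustrative assumption):

import numpy as np
import torch
import torchaudio

def to_mono_16k(fs: int, pcm: np.ndarray) -> np.ndarray:
    """Convert Gradio-style int16 PCM to mono float32 at 16 kHz."""
    wav = pcm.astype(np.float32) / np.iinfo(np.int16).max  # scale to [-1, 1]
    if wav.ndim > 1:
        wav = wav.mean(-1)  # average channels down to mono
    if fs != 16000:
        # Same torchaudio transform the app uses
        resampler = torchaudio.transforms.Resample(fs, 16000)
        wav = resampler(torch.from_numpy(wav)[None, :])[0].numpy()
    return wav

# One second of 48 kHz stereo noise becomes 16000 mono samples
pcm = (np.random.randn(48000, 2) * 1000).astype(np.int16)
print(to_mono_16k(48000, pcm).shape)  # (16000,)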
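The new flow writes the normalized waveform to a temporary WAV because process_audio evidently takes a file path rather than an array. delete=False keeps the file on disk after the with block closes the handle, so the try/finally is what actually guarantees cleanup even if processing raises. A small round-trip sketch of the same pattern, with sf.info standing in for process_audio:

import os
import tempfile

import numpy as np
import soundfile as sf

audio = np.zeros(16000, dtype=np.float32)  # one second of 16 kHz silence

# delete=False: the file must survive after the handle is closed
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
    sf.write(tmp.name, audio, 16000)
    path = tmp.name

try:
    info = sf.info(path)  # stand-in for process_audio(path, ...)
    print(info.samplerate, info.frames)  # 16000 16000
finally:
    os.remove(path)  # cleanup runs even if the consumer raises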
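sv.py itself is not part of this commit, so the exact shape of process_audio is unknown here. Given the inline model.generate(...) call this diff deletes, a plausible reading is that it wraps the same FunASR call behind a file-path interface, with clean_and_emoji_annotate_speech replacing the old format_str_v3 post-processing. A hypothetical sketch of that contract, not the repository's actual implementation:

# Hypothetical reconstruction of the sv.py interface assumed above;
# the real module may differ.
def process_audio(audio_path: str, language: str = "auto") -> str:
    result = model.generate(
        input=audio_path,  # FunASR's AutoModel.generate also accepts file paths
        cache={},
        language=language,
        use_itn=True,
        batch_size_s=500,
        merge_vad=True,
    )
    return clean_and_emoji_annotate_speech(result[0]["text"])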