Spaces: Running on Zero
Update app.py
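This update decorates the GPU-bound entry points of the app (model loading, transcription, alignment, and the main generation call) with @spaces.GPU(duration=...) so the Space can run on ZeroGPU hardware, which attaches a GPU only while a decorated call is executing. For context, here is a minimal, hypothetical sketch of that pattern on a ZeroGPU Space; the text-generation pipeline, model name, and generate function are illustrative placeholders, not code from this app:

    import gradio as gr
    import spaces
    from transformers import pipeline

    # Loaded once at startup; on ZeroGPU the process starts without a GPU attached.
    generator = pipeline("text-generation", model="gpt2")

    @spaces.GPU(duration=60)  # a GPU is granted only while this call runs, for at most 60 s
    def generate(prompt):
        generator.model.to("cuda")   # move weights onto the GPU granted for this call
        text = generator(prompt, max_new_tokens=32)[0]["generated_text"]
        generator.model.to("cpu")    # free GPU memory before the allocation is released
        return text

    demo = gr.Interface(fn=generate, inputs="text", outputs="text")

    if __name__ == "__main__":
        demo.launch()

The duration argument caps how long a single call may hold the GPU, which is why the patch below uses a small value (30 s) for a cheap helper like seed_everything and larger ones (60 to 120 s) for transcription, alignment, model loading, and generation.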
app.py CHANGED
@@ -33,7 +33,7 @@ _whitespace_re = re.compile(r"\s+")
 def get_random_string():
     return "".join(str(uuid.uuid4()).split("-"))
 
-
+@spaces.GPU(duration=30)
 def seed_everything(seed):
     if seed != -1:
         os.environ['PYTHONHASHSEED'] = str(seed)
@@ -71,7 +71,8 @@ def get_mask_interval(transcribe_state, word_span)
     end = float(data[e][0]) if e < len(data) else float(data[-1][1])
 
     return (start, end)
-
+
+@spaces.GPU(duration=120)
 class WhisperxAlignModel:
     def __init__(self):
         from whisperx import load_align_model
@@ -82,7 +83,7 @@ class WhisperxAlignModel:
         audio = load_audio(audio_path)
         return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
 
-
+@spaces.GPU(duration=120)
 class WhisperModel:
     def __init__(self, model_name):
         from whisper import load_model
@@ -99,7 +100,7 @@ class WhisperModel:
     def transcribe(self, audio_path):
         return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
 
-
+@spaces.GPU(duration=120)
 class WhisperxModel:
     def __init__(self, model_name, align_model: WhisperxAlignModel):
         from whisperx import load_model
@@ -112,7 +113,7 @@ class WhisperxModel:
             segment['text'] = replace_numbers_with_words(segment['text'])
         return self.align_model.align(segments, audio_path)
 
-
+@spaces.GPU(duration=120)
 def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, ssrspeech_model_name):
     global transcribe_model, align_model, ssrspeech_model
 
@@ -172,7 +173,7 @@ def get_transcribe_state(segments):
         "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
     }
 
-
+@spaces.GPU(duration=60)
 def transcribe(seed, audio_path):
     if transcribe_model is None:
         raise gr.Error("Transcription model not loaded")
@@ -187,7 +188,7 @@ def transcribe(seed, audio_path):
         state, success_message
     ]
 
-
+@spaces.GPU(duration=60)
 def align_segments(transcript, audio_path):
     from aeneas.executetask import ExecuteTask
     from aeneas.task import Task
@@ -209,7 +210,7 @@ def align_segments(transcript, audio_path):
     with open(tmp_sync_map_path, "r") as f:
         return json.load(f)
 
-
+@spaces.GPU(duration=90)
 def align(seed, transcript, audio_path):
     if align_model is None:
         raise gr.Error("Align model not loaded")
@@ -248,6 +249,7 @@ def replace_numbers_with_words(sentence):
             return num # In case num2words fails (unlikely with digits but just to be safe)
     return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
 
+@spaces.GPU(duration=90)
 def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_k, top_p, temperature,
         stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef,
         audio_path, transcribe_state, original_transcript, transcript,