OpenSound committed on
Commit
68683f4
·
1 Parent(s): a11f6a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -8
app.py CHANGED
@@ -33,7 +33,7 @@ _whitespace_re = re.compile(r"\s+")
33
  def get_random_string():
34
  return "".join(str(uuid.uuid4()).split("-"))
35
 
36
-
37
  def seed_everything(seed):
38
  if seed != -1:
39
  os.environ['PYTHONHASHSEED'] = str(seed)
@@ -71,7 +71,8 @@ def get_mask_interval(transcribe_state, word_span):
71
  end = float(data[e][0]) if e < len(data) else float(data[-1][1])
72
 
73
  return (start, end)
74
-
 
75
  class WhisperxAlignModel:
76
  def __init__(self):
77
  from whisperx import load_align_model
@@ -82,7 +83,7 @@ class WhisperxAlignModel:
82
  audio = load_audio(audio_path)
83
  return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
84
 
85
-
86
  class WhisperModel:
87
  def __init__(self, model_name):
88
  from whisper import load_model
@@ -99,7 +100,7 @@ class WhisperModel:
99
  def transcribe(self, audio_path):
100
  return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
101
 
102
-
103
  class WhisperxModel:
104
  def __init__(self, model_name, align_model: WhisperxAlignModel):
105
  from whisperx import load_model
@@ -112,7 +113,7 @@ class WhisperxModel:
112
  segment['text'] = replace_numbers_with_words(segment['text'])
113
  return self.align_model.align(segments, audio_path)
114
 
115
-
116
  def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, ssrspeech_model_name):
117
  global transcribe_model, align_model, ssrspeech_model
118
 
@@ -172,7 +173,7 @@ def get_transcribe_state(segments):
172
  "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
173
  }
174
 
175
-
176
  def transcribe(seed, audio_path):
177
  if transcribe_model is None:
178
  raise gr.Error("Transcription model not loaded")
@@ -187,7 +188,7 @@ def transcribe(seed, audio_path):
187
  state, success_message
188
  ]
189
 
190
-
191
  def align_segments(transcript, audio_path):
192
  from aeneas.executetask import ExecuteTask
193
  from aeneas.task import Task
@@ -209,7 +210,7 @@ def align_segments(transcript, audio_path):
209
  with open(tmp_sync_map_path, "r") as f:
210
  return json.load(f)
211
 
212
-
213
  def align(seed, transcript, audio_path):
214
  if align_model is None:
215
  raise gr.Error("Align model not loaded")
@@ -248,6 +249,7 @@ def replace_numbers_with_words(sentence):
248
  return num # In case num2words fails (unlikely with digits but just to be safe)
249
  return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
250
 
 
251
  def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_k, top_p, temperature,
252
  stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef,
253
  audio_path, transcribe_state, original_transcript, transcript,
 
33
  def get_random_string():
34
  return "".join(str(uuid.uuid4()).split("-"))
35
 
36
+ @spaces.GPU(duration=30)
37
  def seed_everything(seed):
38
  if seed != -1:
39
  os.environ['PYTHONHASHSEED'] = str(seed)
 
71
  end = float(data[e][0]) if e < len(data) else float(data[-1][1])
72
 
73
  return (start, end)
74
+
75
+ @spaces.GPU(duration=120)
76
  class WhisperxAlignModel:
77
  def __init__(self):
78
  from whisperx import load_align_model
 
83
  audio = load_audio(audio_path)
84
  return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
85
 
86
+ @spaces.GPU(duration=120)
87
  class WhisperModel:
88
  def __init__(self, model_name):
89
  from whisper import load_model
 
100
  def transcribe(self, audio_path):
101
  return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
102
 
103
+ @spaces.GPU(duration=120)
104
  class WhisperxModel:
105
  def __init__(self, model_name, align_model: WhisperxAlignModel):
106
  from whisperx import load_model
 
113
  segment['text'] = replace_numbers_with_words(segment['text'])
114
  return self.align_model.align(segments, audio_path)
115
 
116
+ @spaces.GPU(duration=120)
117
  def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, ssrspeech_model_name):
118
  global transcribe_model, align_model, ssrspeech_model
119
 
 
173
  "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
174
  }
175
 
176
+ @spaces.GPU(duration=60)
177
  def transcribe(seed, audio_path):
178
  if transcribe_model is None:
179
  raise gr.Error("Transcription model not loaded")
 
188
  state, success_message
189
  ]
190
 
191
+ @spaces.GPU(duration=60)
192
  def align_segments(transcript, audio_path):
193
  from aeneas.executetask import ExecuteTask
194
  from aeneas.task import Task
 
210
  with open(tmp_sync_map_path, "r") as f:
211
  return json.load(f)
212
 
213
+ @spaces.GPU(duration=90)
214
  def align(seed, transcript, audio_path):
215
  if align_model is None:
216
  raise gr.Error("Align model not loaded")
 
249
  return num # In case num2words fails (unlikely with digits but just to be safe)
250
  return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
251
 
252
+ @spaces.GPU(duration=90)
253
  def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_k, top_p, temperature,
254
  stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef,
255
  audio_path, transcribe_state, original_transcript, transcript,