OpenSound committed on
Commit
68683f4
·
1 Parent(s): a11f6a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -8
app.py CHANGED
@@ -33,7 +33,7 @@ _whitespace_re = re.compile(r"\s+")
33
  def get_random_string():
34
  return "".join(str(uuid.uuid4()).split("-"))
35
 
36
-
37
  def seed_everything(seed):
38
  if seed != -1:
39
  os.environ['PYTHONHASHSEED'] = str(seed)
@@ -71,7 +71,8 @@ def get_mask_interval(transcribe_state, word_span):
71
  end = float(data[e][0]) if e < len(data) else float(data[-1][1])
72
 
73
  return (start, end)
74
-
 
75
  class WhisperxAlignModel:
76
  def __init__(self):
77
  from whisperx import load_align_model
@@ -82,7 +83,7 @@ class WhisperxAlignModel:
82
  audio = load_audio(audio_path)
83
  return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
84
 
85
-
86
  class WhisperModel:
87
  def __init__(self, model_name):
88
  from whisper import load_model
@@ -99,7 +100,7 @@ class WhisperModel:
99
  def transcribe(self, audio_path):
100
  return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
101
 
102
-
103
  class WhisperxModel:
104
  def __init__(self, model_name, align_model: WhisperxAlignModel):
105
  from whisperx import load_model
@@ -112,7 +113,7 @@ class WhisperxModel:
112
  segment['text'] = replace_numbers_with_words(segment['text'])
113
  return self.align_model.align(segments, audio_path)
114
 
115
-
116
  def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, ssrspeech_model_name):
117
  global transcribe_model, align_model, ssrspeech_model
118
 
@@ -172,7 +173,7 @@ def get_transcribe_state(segments):
172
  "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
173
  }
174
 
175
-
176
  def transcribe(seed, audio_path):
177
  if transcribe_model is None:
178
  raise gr.Error("Transcription model not loaded")
@@ -187,7 +188,7 @@ def transcribe(seed, audio_path):
187
  state, success_message
188
  ]
189
 
190
-
191
  def align_segments(transcript, audio_path):
192
  from aeneas.executetask import ExecuteTask
193
  from aeneas.task import Task
@@ -209,7 +210,7 @@ def align_segments(transcript, audio_path):
209
  with open(tmp_sync_map_path, "r") as f:
210
  return json.load(f)
211
 
212
-
213
  def align(seed, transcript, audio_path):
214
  if align_model is None:
215
  raise gr.Error("Align model not loaded")
@@ -248,6 +249,7 @@ def replace_numbers_with_words(sentence):
248
  return num # In case num2words fails (unlikely with digits but just to be safe)
249
  return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
250
 
 
251
  def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_k, top_p, temperature,
252
  stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef,
253
  audio_path, transcribe_state, original_transcript, transcript,
 
33
  def get_random_string():
34
  return "".join(str(uuid.uuid4()).split("-"))
35
 
36
+ @spaces.GPU(duration=30)
37
  def seed_everything(seed):
38
  if seed != -1:
39
  os.environ['PYTHONHASHSEED'] = str(seed)
 
71
  end = float(data[e][0]) if e < len(data) else float(data[-1][1])
72
 
73
  return (start, end)
74
+
75
+ @spaces.GPU(duration=120)
76
  class WhisperxAlignModel:
77
  def __init__(self):
78
  from whisperx import load_align_model
 
83
  audio = load_audio(audio_path)
84
  return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
85
 
86
+ @spaces.GPU(duration=120)
87
  class WhisperModel:
88
  def __init__(self, model_name):
89
  from whisper import load_model
 
100
  def transcribe(self, audio_path):
101
  return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
102
 
103
+ @spaces.GPU(duration=120)
104
  class WhisperxModel:
105
  def __init__(self, model_name, align_model: WhisperxAlignModel):
106
  from whisperx import load_model
 
113
  segment['text'] = replace_numbers_with_words(segment['text'])
114
  return self.align_model.align(segments, audio_path)
115
 
116
+ @spaces.GPU(duration=120)
117
  def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, ssrspeech_model_name):
118
  global transcribe_model, align_model, ssrspeech_model
119
 
 
173
  "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
174
  }
175
 
176
+ @spaces.GPU(duration=60)
177
  def transcribe(seed, audio_path):
178
  if transcribe_model is None:
179
  raise gr.Error("Transcription model not loaded")
 
188
  state, success_message
189
  ]
190
 
191
+ @spaces.GPU(duration=60)
192
  def align_segments(transcript, audio_path):
193
  from aeneas.executetask import ExecuteTask
194
  from aeneas.task import Task
 
210
  with open(tmp_sync_map_path, "r") as f:
211
  return json.load(f)
212
 
213
+ @spaces.GPU(duration=90)
214
  def align(seed, transcript, audio_path):
215
  if align_model is None:
216
  raise gr.Error("Align model not loaded")
 
249
  return num # In case num2words fails (unlikely with digits but just to be safe)
250
  return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
251
 
252
+ @spaces.GPU(duration=90)
253
  def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_k, top_p, temperature,
254
  stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef,
255
  audio_path, transcribe_state, original_transcript, transcript,