Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -18,13 +18,8 @@ def write_file(output_file, subtitle):
         f.write(subtitle)
 
 def create_pipe(model, flash):
-    if torch.cuda.is_available():
-        device = "cuda:0"
-    elif platform == "darwin":
-        device = "mps"
-    else:
-        device = "cpu"
-    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    # Load the model into RAM first
+    torch_dtype = torch.float32  # Load onto CPU with float32 precision
     model_id = model
 
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
@@ -33,13 +28,8 @@ def create_pipe(model, flash):
         low_cpu_mem_usage=True,
         use_safetensors=True,
         attn_implementation="flash_attention_2" if flash and is_flash_attn_2_available() else "sdpa",
-        # eager (manual attention implementation)
-        # flash_attention_2 (implementation using flash attention 2)
-        # sdpa (implementation using torch.nn.functional.scaled_dot_product_attention)
-        # PyTorch SDPA requirements in Transformers are not met. Please install torch>=2.1.1.
     )
-
-
+
     processor = AutoProcessor.from_pretrained(model_id)
 
     pipe = pipeline(
@@ -47,15 +37,25 @@
         model=model,
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
-        #
-        #
-        # batch_size=16,
-        torch_dtype=torch_dtype,
-        device=device,
+        torch_dtype=torch_dtype,  # Keep in CPU until GPU is requested
+        device="cpu",  # Initially stay on CPU
     )
-    return pipe
+    return pipe, model  # Return both pipe and model for later GPU switch
 
 @spaces.GPU
+def move_to_gpu(model):
+    if torch.cuda.is_available():
+        device = "cuda:0"
+        torch_dtype = torch.float16  # Use float16 precision on GPU
+        model.to(device, dtype=torch_dtype)
+    elif platform == "darwin":
+        device = "mps"
+        model.to(device)
+    else:
+        device = "cpu"
+
+    return device
+
 def transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task, flash,
                                      chunk_length_s, batch_size, progress=gr.Progress()):
     global last_model
@@ -73,16 +73,24 @@ def transcribe_webui_simple_progress(modelName, languageName, urlData, multipleF
     if last_model is None:
         logging.info("first model")
         progress(0.1, desc="Loading Model..")
-        pipe = create_pipe(modelName, flash)
+        pipe, model = create_pipe(modelName, flash)
     elif modelName != last_model:
         logging.info("new model")
         torch.cuda.empty_cache()
         progress(0.1, desc="Loading Model..")
-        pipe = create_pipe(modelName, flash)
+        pipe, model = create_pipe(modelName, flash)
     else:
         logging.info("Model not changed")
+
     last_model = modelName
 
+    # Now move the model to GPU after the pipe is created
+    device = move_to_gpu(pipe.model)
+
+    # Update pipe's device
+    pipe.device = torch.device(device)
+    pipe.model.to(pipe.device)
+
     srt_sub = Subtitle("srt")
     vtt_sub = Subtitle("vtt")
     txt_sub = Subtitle("txt")
@@ -176,4 +184,4 @@ with gr.Blocks(title="Insanely Fast Whisper") as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
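
The net effect of the change: create_pipe no longer picks a device at load time. The checkpoint is loaded on the CPU in float32, and the new move_to_gpu helper, decorated with @spaces.GPU, moves the weights to CUDA (in float16) or to MPS only once a request is being served, which is the usual shape of a ZeroGPU Space. Below is a condensed, self-contained sketch of that pattern, not the app's exact code: the "automatic-speech-recognition" task string, the whisper-tiny placeholder checkpoint, and the pipe.torch_dtype sync line are assumptions of the sketch, not lines from this diff.

# A minimal sketch of the deferred-GPU-placement pattern, assuming the
# `spaces` (ZeroGPU), `torch`, and `transformers` packages. The task string,
# placeholder checkpoint, and dtype sync are illustrative, not from app.py.
from sys import platform

import spaces
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


def create_pipe(model_id: str):
    # Stage 1: load entirely on the CPU in float32, so building the
    # pipeline at startup never claims a GPU.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, low_cpu_mem_usage=True, use_safetensors=True
    )
    processor = AutoProcessor.from_pretrained(model_id)
    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch.float32,
        device="cpu",
    )


@spaces.GPU  # ZeroGPU attaches a GPU only while this function executes
def transcribe(pipe, audio_path: str):
    # Stage 2: move the weights to the accelerator right before inference.
    if torch.cuda.is_available():
        pipe.model.to("cuda:0", dtype=torch.float16)
        pipe.device = torch.device("cuda:0")
        pipe.torch_dtype = torch.float16  # keep input casting in sync with weights
    elif platform == "darwin":
        pipe.model.to("mps")
        pipe.device = torch.device("mps")
    return pipe(audio_path)


if __name__ == "__main__":
    pipe = create_pipe("openai/whisper-tiny")  # placeholder checkpoint
    print(transcribe(pipe, "sample.wav"))      # assumes a local audio file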
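
A note on the design choice: on ZeroGPU hardware the Space holds no GPU between requests, and CUDA is meant to be touched only inside a @spaces.GPU-decorated call, which is why the commit defers every .to(device) until move_to_gpu runs. The sketch above folds the pipe.device bookkeeping that the diff does by hand (pipe.device = torch.device(device)) into the decorated function itself.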