nb-whisper-demo

Running on T4

App Files Community

pere committed on Oct 8, 2024

Commit

99d9b3e

1 Parent(s): ac1dacb

update test

Browse files

Files changed (1) hide show

app.py +21 -25

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ import os
 import torch
 import gradio as gr
-import pytube as pt
 import spaces
 from transformers import AutoFeatureExtractor, AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor, pipeline
 from huggingface_hub import model_info
@@ -14,6 +13,7 @@ try:
 except ImportError:
     FLASH_ATTENTION = False
 MODEL_NAME = "NbAiLab/nb-whisper-large"
 lang = "no"
@@ -25,16 +25,9 @@ print(f"Using device: {device}")
 @spaces.GPU(duration=60 * 2)
 def pipe(file, return_timestamps=False):
-    # model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True)
-    # model.to(device)
-    # processor = WhisperProcessor.from_pretrained(MODEL_NAME)
-    # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
-    # model.generation_config.cache_implementation = "static"
     asr = pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
-        # tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
-        # feature_extractor=AutoFeatureExtractor.from_pretrained(MODEL_NAME),
         chunk_length_s=30,
         device=device,
         token=auth_token,
@@ -46,7 +39,6 @@ def pipe(file, return_timestamps=False):
         task="transcribe",
         no_timestamps=not return_timestamps,
     )
-    # asr.model.config.no_timestamps_token_id = asr.tokenizer.encode("<|notimestamps|>", add_special_tokens=False)[0]
     return asr(file, return_timestamps=return_timestamps, batch_size=24)
 def transcribe(file, return_timestamps=False):
@@ -63,7 +55,6 @@ def transcribe(file, return_timestamps=False):
         text = "\n".join(text)
     return text
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
@@ -72,18 +63,26 @@ def _return_yt_html_embed(yt_url):
     )
     return HTML_str
 def yt_transcribe(yt_url, return_timestamps=False):
-    yt = pt.YouTube(yt_url)
     html_embed_str = _return_yt_html_embed(yt_url)
-    stream = yt.streams.filter(only_audio=True)[0]
-    stream.download(filename="audio.mp3")
     text = transcribe("audio.mp3", return_timestamps=return_timestamps)
     return html_embed_str, text
 demo = gr.Blocks()
 mf_transcribe = gr.Interface(
@@ -102,7 +101,7 @@ mf_transcribe = gr.Interface(
     allow_flagging="never",
 )
-yt_transcribe = gr.Interface(
     fn=yt_transcribe,
     inputs=[
         gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
@@ -120,12 +119,9 @@ yt_transcribe = gr.Interface(
 )
 with demo:
-    gr.TabbedInterface([
-        mf_transcribe,
-        yt_transcribe
-    ], [
-        "Transcribe Audio",
-        "Transcribe YouTube"
-    ])
-demo.launch(share=share).queue()

 import torch
 import gradio as gr
 import spaces
 from transformers import AutoFeatureExtractor, AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor, pipeline
 from huggingface_hub import model_info
 except ImportError:
     FLASH_ATTENTION = False
+import yt_dlp  # Added import for yt-dlp
 MODEL_NAME = "NbAiLab/nb-whisper-large"
 lang = "no"
 @spaces.GPU(duration=60 * 2)
 def pipe(file, return_timestamps=False):
     asr = pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
         chunk_length_s=30,
         device=device,
         token=auth_token,
         task="transcribe",
         no_timestamps=not return_timestamps,
     )
     return asr(file, return_timestamps=return_timestamps, batch_size=24)
 def transcribe(file, return_timestamps=False):
         text = "\n".join(text)
     return text
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
     )
     return HTML_str
 def yt_transcribe(yt_url, return_timestamps=False):
     html_embed_str = _return_yt_html_embed(yt_url)
+    ydl_opts = {
+        'format': 'bestaudio/best',
+        'outtmpl': 'audio.%(ext)s',
+        'postprocessors': [{
+            'key': 'FFmpegExtractAudio',
+            'preferredcodec': 'mp3',
+            'preferredquality': '192',
+        }],
+        'quiet': True,
+    }
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        ydl.download([yt_url])
     text = transcribe("audio.mp3", return_timestamps=return_timestamps)
     return html_embed_str, text
 demo = gr.Blocks()
 mf_transcribe = gr.Interface(
     allow_flagging="never",
 )
+yt_transcribe_interface = gr.Interface(
     fn=yt_transcribe,
     inputs=[
         gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
 )
 with demo:
+    gr.TabbedInterface(
+        [mf_transcribe, yt_transcribe_interface],
+        ["Transcribe Audio", "Transcribe YouTube"]
+    )
+demo.launch(share=share).queue()