anzorq committed
Commit 1c4ba6c · verified · 1 parent: 550d732

Update app.py

Files changed (1)
  1. app.py +74 -39
app.py CHANGED
@@ -1,9 +1,10 @@
  import spaces
+ import os
  import gradio as gr
  import torch
  import torchaudio
  from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
- import yt_dlp
+ from pytube import YouTube

  model = AutoModelForCTC.from_pretrained("anzorq/w2v-bert-2.0-kbd")
  processor = Wav2Vec2BertProcessor.from_pretrained("anzorq/w2v-bert-2.0-kbd")
@@ -11,9 +12,15 @@ processor = Wav2Vec2BertProcessor.from_pretrained("anzorq/w2v-bert-2.0-kbd")
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model.to(device)

+ # Chunk processing parameters
+ chunk_length_s = 10  # Chunk length in seconds
+ stride_length_s = (4, 2)  # Stride lengths in seconds
+
  @spaces.GPU
  def transcribe_speech(audio):
-     # Load the audio file
+     if audio is None:  # Handle the NoneType error for microphone input
+         return "No audio received."
+
      waveform, sr = torchaudio.load(audio)

      # Resample the audio if needed
@@ -23,57 +30,82 @@ def transcribe_speech(audio):

      # Convert to mono if needed
      if waveform.dim() > 1:
-         waveform = torchaudio.transforms.DownmixMono()(waveform)
+         waveform = torch.mean(waveform, dim=0)
+
+     # Ensure the waveform is a 2D tensor for chunking
+     waveform = waveform.unsqueeze(0)  # Add a dimension if it's mono

      # Normalize the audio
      waveform = waveform / torch.max(torch.abs(waveform))

-     # Extract input features
-     with torch.no_grad():
-         input_features = processor(waveform.unsqueeze(0), sampling_rate=16000).input_features
-         input_features = torch.from_numpy(input_features).to(device)
+     # Chunk the audio
+     chunks = torch.split(waveform, int(chunk_length_s * sr), dim=1)

-     # Generate logits using the model
-     logits = model(input_features).logits
+     # Process each chunk with striding
+     full_transcription = ""
+     for i, chunk in enumerate(chunks):
+         with torch.no_grad():
+             # Calculate stride lengths in frames
+             left_stride_frames = int(stride_length_s[0] * sr)
+             right_stride_frames = int(stride_length_s[1] * sr)

-     # Decode the predicted ids to text
-     pred_ids = torch.argmax(logits, dim=-1)[0]
-     pred_text = processor.decode(pred_ids)
+             # Extract the effective chunk with stride
+             start_frame = max(0, left_stride_frames * (i - 1))
+             end_frame = min(chunk.size(1), chunk.size(1) - right_stride_frames * i)

-     return pred_text
+             # Check for negative duration before processing
+             if end_frame <= start_frame:
+                 continue  # Skip this chunk
+
+             effective_chunk = chunk[:, start_frame:end_frame]
+
+             # Extract input features
+             input_features = processor(effective_chunk, sampling_rate=16000).input_features
+             input_features = torch.from_numpy(input_features).to(device)
+
+             # Generate logits using the model
+             logits = model(input_features).logits
+
+             # Decode the predicted ids to text
+             pred_ids = torch.argmax(logits, dim=-1)[0]
+             pred_text = processor.decode(pred_ids)
+
+             # Append the chunk's transcription to the full transcription
+             full_transcription += pred_text
+
+     return full_transcription

- @spaces.GPU
  def transcribe_from_youtube(url):
-     # Download audio from YouTube using yt-dlp
-     audio_path = f"downloaded_audio_{url.split('=')[-1]}.wav"
-     ydl_opts = {
-         'format': 'bestaudio/best',
-         'outtmpl': audio_path,
-         'postprocessors': [{
-             'key': 'FFmpegExtractAudio',
-             'preferredcodec': 'wav',
-             'preferredquality': '192',
-         }],
-         'postprocessor_args': ['-ar', '16000'],  # Ensure audio is at 16000 Hz
-         'prefer_ffmpeg': True,
-     }
-
-     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-         ydl.download([url])
-
-     # # Check if the file exists
-     # if not os.path.exists(audio_path):
-     #     raise FileNotFoundError(f"Failed to find the audio file {audio_path}")
-
+     # Download audio from YouTube using pytube
+     yt = YouTube(url)
+     audio_path = yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
+
      # Transcribe the downloaded audio
      transcription = transcribe_speech(audio_path)
-
-     # Optionally, clean up the downloaded file
+
+     # Clean up the downloaded file
      os.remove(audio_path)

      return transcription

+ def populate_metadata(url):
+     yt = YouTube(url)
+     return yt.thumbnail_url, yt.title
+
  with gr.Blocks() as demo:
+     gr.HTML(
+         """
+         <div style="text-align: center; max-width: 500px; margin: 0 auto;">
+           <div>
+             <h1>Youtube Speech Transcription</h1>
+           </div>
+           <p style="margin-bottom: 10px; font-size: 94%">
+             Speech to text transcription of Youtube videos using Wav2Vec2-BERT
+           </p>
+         </div>
+         """
+     )
+
      with gr.Tab("Microphone Input"):
          gr.Markdown("## Transcribe speech from microphone")
          mic_audio = gr.Audio(sources="microphone", type="filepath", label="Speak into your microphone")
@@ -85,9 +117,12 @@ with gr.Blocks() as demo:
      with gr.Tab("YouTube URL"):
          gr.Markdown("## Transcribe speech from YouTube video")
          youtube_url = gr.Textbox(label="Enter YouTube video URL")
+         title = gr.Label(label="Video Title")
+         img = gr.Image(label="Thumbnail")
          transcribe_button = gr.Button("Transcribe")
-         transcription_output = gr.Textbox(label="Transcription")
+         transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)

          transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
+         youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

- demo.launch()
+ demo.launch(debug=True)
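
A minimal sketch (not part of the commit) of the frame arithmetic the new chunking loop in transcribe_speech performs, assuming a hypothetical 25-second mono clip already at 16 kHz; chunk_length_s, stride_length_s, and the start/end formulas are copied from the diff above, everything else is an illustrative assumption:

sr = 16000
chunk_length_s = 10
stride_length_s = (4, 2)

total_frames = 25 * sr                    # assumed example: a 25-second mono clip
chunk_frames = int(chunk_length_s * sr)   # 160000 frames per chunk, as in the diff

# Sizes that torch.split(waveform, chunk_frames, dim=1) would produce for this input
chunk_sizes = [min(chunk_frames, total_frames - start)
               for start in range(0, total_frames, chunk_frames)]

left_stride_frames = int(stride_length_s[0] * sr)    # 64000
right_stride_frames = int(stride_length_s[1] * sr)   # 32000

for i, size in enumerate(chunk_sizes):
    start_frame = max(0, left_stride_frames * (i - 1))
    end_frame = min(size, size - right_stride_frames * i)
    if end_frame <= start_frame:
        print(f"chunk {i}: size={size} frames, skipped")
    else:
        print(f"chunk {i}: size={size} frames, transcribed slice {start_frame}:{end_frame}")

With these example numbers the loop transcribes chunk 0 in full, trims the last 2 seconds off chunk 1, and skips the final 5-second chunk entirely; the exact slices depend on the clip length.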