Spaces:

anzorq
/

w2v-bert-2.0-kbd

Sleeping

App Files Community

anzorq committed on May 17, 2024

Commit

0863f8c

verified ·

1 Parent(s): b4959b1

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -64

app.py CHANGED Viewed

@@ -5,75 +5,16 @@ import torch
 import torchaudio
 from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
 from pytube import YouTube
-model = AutoModelForCTC.from_pretrained("anzorq/w2v-bert-2.0-kbd")
-processor = Wav2Vec2BertProcessor.from_pretrained("anzorq/w2v-bert-2.0-kbd")
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-# Chunk processing parameters
-chunk_length_s = 10  # Chunk length in seconds
-stride_length_s = (4, 2)  # Stride lengths in seconds
 @spaces.GPU
 def transcribe_speech(audio):
     if audio is None:  # Handle the NoneType error for microphone input
         return "No audio received."
-    waveform, sr = torchaudio.load(audio)
-    # Resample the audio if needed
-    if sr != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-        waveform = resampler(waveform)
-    # Convert to mono if needed
-    if waveform.dim() > 1:
-        waveform = torch.mean(waveform, dim=0)
-    # Ensure the waveform is a 2D tensor for chunking
-    waveform = waveform.unsqueeze(0)  # Add a dimension if it's mono
-    # Normalize the audio
-    waveform = waveform / torch.max(torch.abs(waveform))
-    # Chunk the audio
-    chunks = torch.split(waveform, int(chunk_length_s * sr), dim=1)
-    # Process each chunk with striding
-    full_transcription = ""
-    for i, chunk in enumerate(chunks):
-        with torch.no_grad():
-            # Calculate stride lengths in frames
-            left_stride_frames = int(stride_length_s[0] * sr)
-            right_stride_frames = int(stride_length_s[1] * sr)
-            # Extract the effective chunk with stride
-            start_frame = max(0, left_stride_frames * (i - 1))
-            end_frame = min(chunk.size(1), chunk.size(1) - right_stride_frames * i)
-            # Check for negative duration before processing
-            if end_frame <= start_frame:
-                continue  # Skip this chunk
-            effective_chunk = chunk[:, start_frame:end_frame]
-            # Extract input features
-            input_features = processor(effective_chunk, sampling_rate=16000).input_features
-            input_features = torch.from_numpy(input_features).to(device)
-            # Generate logits using the model
-            logits = model(input_features).logits
-        # Decode the predicted ids to text
-        pred_ids = torch.argmax(logits, dim=-1)[0]
-        pred_text = processor.decode(pred_ids)
-        # Append the chunk's transcription to the full transcription
-        full_transcription += pred_text
-    return full_transcription
 def transcribe_from_youtube(url):
     # Download audio from YouTube using pytube
@@ -118,11 +59,11 @@ with gr.Blocks() as demo:
         gr.Markdown("## Transcribe speech from YouTube video")
         youtube_url = gr.Textbox(label="Enter YouTube video URL")
         title = gr.Label(label="Video Title")
-        img = gr.Image(label="Thumbnail")
         transcribe_button = gr.Button("Transcribe")
         transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
         transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
         youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
-demo.launch(debug=True)

 import torchaudio
 from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
 from pytube import YouTube
+from transformers import pipeline
+pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0)
 @spaces.GPU
 def transcribe_speech(audio):
     if audio is None:  # Handle the NoneType error for microphone input
         return "No audio received."
+    return pipe(audio, chunk_length_s=10)['text']#, return_timestamps='word')
 def transcribe_from_youtube(url):
     # Download audio from YouTube using pytube
         gr.Markdown("## Transcribe speech from YouTube video")
         youtube_url = gr.Textbox(label="Enter YouTube video URL")
         title = gr.Label(label="Video Title")
+        img = gr.Image(label="Thumbnail", height=120, width=120)
         transcribe_button = gr.Button("Transcribe")
         transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
         transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
         youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
+demo.launch()