File size: 2,780 Bytes
bfb5ccb
8ca2e83
 
 
 
eaed2c2
8ca2e83
1c803c5
 
8ca2e83
0c872e7
 
 
bfb5ccb
8ca2e83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c872e7
8ca2e83
 
 
 
 
 
 
 
 
 
 
eaed2c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ca2e83
eaed2c2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import spaces
import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
import yt_dlp

# Pick the compute device first, then load the fine-tuned Kabardian (kbd)
# CTC model straight onto it; PreTrainedModel.to() returns the model itself,
# so the chained call binds the moved model.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCTC.from_pretrained("anzorq/w2v-bert-2.0-kbd").to(device)
processor = Wav2Vec2BertProcessor.from_pretrained("anzorq/w2v-bert-2.0-kbd")

@spaces.GPU
def transcribe_speech(audio):
    """Transcribe a speech recording to Kabardian text.

    Args:
        audio: Filesystem path to an audio file readable by torchaudio
            (this is what Gradio's ``type="filepath"`` components pass).
            May be ``None`` when the user submits without recording.

    Returns:
        The decoded transcription string; ``""`` when no audio was given.
    """
    # Gradio hands us None if the user never recorded anything — fail soft.
    if not audio:
        return ""

    # waveform is (channels, samples) as loaded by torchaudio.
    waveform, sr = torchaudio.load(audio)

    # Downmix to mono first so any resampling below runs on one channel.
    if waveform.dim() > 1:
        waveform = torch.mean(waveform, dim=0)

    # Resample only when the source rate differs from the model's 16 kHz.
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)

    # Peak-normalize; skip on silent clips to avoid a 0/0 NaN blowup.
    peak = torch.max(torch.abs(waveform))
    if peak > 0:
        waveform = waveform / peak

    # Extract input features (processor returns numpy) and move to device.
    input_features = processor(waveform.unsqueeze(0), sampling_rate=16000).input_features
    input_features = torch.from_numpy(input_features).to(device)

    # Forward pass without autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_features).logits

    # Greedy CTC decode: argmax over the vocab, then collapse via processor.
    pred_ids = torch.argmax(logits, dim=-1)[0]
    return processor.decode(pred_ids)

@spaces.GPU
def transcribe_from_youtube(url):
    """Download a YouTube video's audio track and transcribe it.

    Args:
        url: A YouTube video URL accepted by yt-dlp.

    Returns:
        The transcription string produced by ``transcribe_speech``.
    """
    # NOTE: the output template must NOT carry a ".wav" extension. yt-dlp
    # first saves the raw bestaudio stream (usually webm/m4a) under the
    # template name; if that name already ends in ".wav", the
    # FFmpegExtractAudio postprocessor can consider the file already in the
    # target format and skip conversion, leaving a mislabeled non-WAV file.
    # With an extension-less template the postprocessor writes a real
    # "downloaded_audio.wav".
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'downloaded_audio',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'postprocessor_args': ['-ar', '16000'],  # resample to the model's 16 kHz
        'prefer_ffmpeg': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # Transcribe the converted WAV produced by the postprocessor.
    return transcribe_speech("downloaded_audio.wav")

# UI: two tabs sharing the same transcription backend — one for a live
# microphone recording, one for a YouTube URL.
# NOTE(review): gr.Audio(source=...) is the pre-4.x Gradio spelling; if the
# Space pins Gradio >= 4 this kwarg may need to become sources=["microphone"]
# — confirm against the installed version.
with gr.Blocks() as demo:
    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(source="microphone", type="filepath", label="Speak into your microphone")
        mic_button = gr.Button("Transcribe")
        mic_output = gr.Textbox(label="Transcription")
        mic_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=mic_output)

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        yt_button = gr.Button("Transcribe")
        yt_output = gr.Textbox(label="Transcription")
        yt_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=yt_output)

demo.launch()