Spaces:
Sleeping
Sleeping
import spaces | |
import os | |
import gradio as gr | |
import torch | |
import torchaudio | |
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor | |
from pytube import YouTube | |
from transformers import pipeline | |
import re | |
import numpy as np | |
# Load the Kabardian speech-recognition pipeline once, at import time.
# Use the first CUDA device when one is visible; otherwise fall back to the CPU
# (device=-1) so the app still starts on hosts without a GPU instead of
# failing while moving the model to "cuda:0".
pipe = pipeline(
    model="anzorq/w2v-bert-2.0-kbd-v2",
    device=0 if torch.cuda.is_available() else -1,
)
# Pairs of (original Cyrillic spelling, stand-in symbol). Transcriptions come
# back containing the stand-in symbols and are mapped to the original
# spellings by the reverse table below.
replacements = [
    ('гъ', 'ɣ'),
    ('дж', 'j'),
    ('дз', 'ӡ'),
    ('жь', 'ʐ'),
    ('кӏ', 'қ'),
    ('кхъ', 'qҳ'),
    ('къ', 'q'),
    ('лъ', 'ɬ'),
    ('лӏ', 'ԯ'),
    ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'),
    ('фӏ', 'ჶ'),
    ('хь', 'h'),
    ('хъ', 'ҳ'),
    ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'),
    ('я', 'йа'),
]

# Stand-in symbol -> original spelling, in the same order as the table above.
reverse_replacements = dict((symbol, spelling) for spelling, symbol in replacements)

# Regex alternation tries alternatives left to right, so the two-character
# stand-in 'qҳ' must stay ahead of its prefix 'q' (it does, by table order).
reverse_pattern = re.compile('|'.join(map(re.escape, reverse_replacements)))
def replace_symbols_back(text):
    """Return ``text`` with every stand-in symbol restored to its original spelling.

    Each match of ``reverse_pattern`` is looked up in ``reverse_replacements``;
    all other characters are left untouched.
    """
    def _restore(match):
        return reverse_replacements[match.group(0)]

    return reverse_pattern.sub(_restore, text)
def transcribe_speech(audio):
    """Transcribe ``audio`` and restore the original spellings in the result.

    ``audio`` is passed to the speech-recognition pipeline unchanged and is
    processed in 10-second chunks. When no audio is supplied, a fixed
    placeholder message is returned instead of a transcription.
    """
    # Nothing to transcribe (e.g. no recording or upload was provided).
    if audio is not None:
        result = pipe(audio, chunk_length_s=10)
        return replace_symbols_back(result['text'])
    return "No audio received."
def transcribe_streaming(stream, new_chunk):
    """Append a microphone chunk to the running stream and transcribe all of it.

    Args:
        stream: Audio accumulated by earlier calls (mono float32 samples), or
            ``None`` when nothing has been accumulated yet.
        new_chunk: ``(sampling_rate, samples)`` for the latest chunk, or
            ``None`` when no audio was captured.

    Returns:
        ``(stream, text)``: the updated accumulated audio, followed by the
        transcription of the whole stream so far.
    """
    if new_chunk is None:
        # The first return slot is the accumulated audio and the second is the
        # displayed text: keep the audio as it was and put the message in the
        # text slot (not the other way round).
        return stream, "No audio received."

    sampling_rate, audio_data = new_chunk
    audio_data = audio_data.astype(np.float32)

    # Peak-normalise the chunk. An empty chunk has no peak, and an all-zero
    # (silent) chunk would divide by zero and fill the stream with NaN, so
    # both are left unscaled.
    peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
    if peak > 0:
        audio_data /= peak

    # Convert audio data to mono if it has multiple channels
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    stream = audio_data if stream is None else np.concatenate([stream, audio_data])

    transcription = pipe({"sampling_rate": sampling_rate, "raw": stream})['text']
    return stream, replace_symbols_back(transcription)
def transcribe_from_youtube(url, progress=gr.Progress()):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        url: URL of the YouTube video.
        progress: Gradio progress tracker used to report the two stages
            (download, transcription). The ``gr.Progress()`` default is the
            form Gradio expects for progress-reporting handlers.

    Returns:
        ``(audio_path, transcription)``: path of the downloaded audio file and
        the transcription of that file.

    Raises:
        gr.Error: If ``url`` is blank or the video has no audio-only stream.
    """
    if not url or not url.strip():
        raise gr.Error("Please enter a YouTube video URL.")

    progress(0, "Downloading YouTube audio...")
    # Download audio from YouTube using pytube. first() yields None when the
    # filter matches nothing, where indexing with [0] would raise IndexError.
    audio_stream = YouTube(url.strip()).streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise gr.Error("No audio stream is available for this video.")
    audio_path = audio_stream.download(filename="tmp.mp4")

    progress(0.5, "Transcribing audio...")
    transcription = transcribe_speech(audio_path)
    return audio_path, transcription
def populate_metadata(url):
    """Return ``(thumbnail_url, title)`` for the YouTube video at ``url``.

    Returns ``(None, None)`` when ``url`` is blank or no video id can be
    extracted from it, so that calling this on partially typed input yields an
    empty preview rather than an error.
    """
    if not url or not url.strip():
        return None, None

    # Imported locally: only this function needs the exception type.
    from pytube.exceptions import RegexMatchError

    try:
        yt = YouTube(url.strip())
    except RegexMatchError:
        # The text is not (yet) a recognisable YouTube URL.
        return None, None
    return yt.thumbnail_url, yt.title
def _stream_step(stream, new_chunk):
    """Run one streaming step without letting a missing chunk clobber state."""
    if new_chunk is None:
        # Keep the accumulated audio exactly as it is; only the text changes.
        return stream, "No audio received."
    return transcribe_streaming(stream, new_chunk)


# Three-tab UI: live microphone streaming, file upload, and YouTube URL.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 500px; margin: 0 auto;">
          <div>
            <h1>Kabardian Speech Transcription</h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%">
            Kabardian speech to text transcription using a fine-tuned Wav2Vec2-BERT model
          </p>
        </div>
        """
    )

    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(sources='microphone', streaming=True)
        mic_transcription_output = gr.Textbox(label="Transcription", lines=10)
        # The same State object must be both input and output: two separate
        # gr.State() instances would never feed the accumulated audio back
        # into the next call, so every chunk would start from scratch.
        stream_state = gr.State()
        mic_audio.stream(
            fn=_stream_step,
            inputs=[stream_state, mic_audio],
            outputs=[stream_state, mic_transcription_output],
        )

    with gr.Tab("File Upload"):
        gr.Markdown("## Transcribe speech from uploaded file")
        upload_audio = gr.Audio(sources="upload", type="filepath")
        file_transcribe_button = gr.Button("Transcribe")
        file_transcription_output = gr.Textbox(label="Transcription")
        file_transcribe_button.click(
            fn=transcribe_speech,
            inputs=upload_audio,
            outputs=file_transcription_output,
        )

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        with gr.Row():
            img = gr.Image(label="Thumbnail", height=240, width=240, scale=1)
            title = gr.Label(label="Video Title", scale=2)
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(
            label="Transcription",
            placeholder="Transcription Output",
            lines=10,
        )
        youtube_audio_output = gr.Audio(label="Downloaded Audio", type="filepath")
        transcribe_button.click(
            fn=transcribe_from_youtube,
            inputs=youtube_url,
            outputs=[youtube_audio_output, transcription_output],
        )
        # Refresh the thumbnail/title preview whenever the URL text changes.
        youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()