Spaces:
Sleeping
Sleeping
File size: 4,395 Bytes
bfb5ccb 1c4ba6c 8ca2e83 1c4ba6c 0863f8c 3f40220 d1e3f48 8ca2e83 9485209 3f40220 1c4ba6c bfb5ccb d1e3f48 9485209 9f4fdf3 8e25cf3 9485209 d22f729 9485209 8e25cf3 9485209 8e25cf3 9485209 8e25cf3 9f4fdf3 9485209 8ca2e83 9f4fdf3 6e5c3ef 1c4ba6c 3f40220 d562ee1 d1e3f48 9f4fdf3 d562ee1 eaed2c2 1c4ba6c 14f36e9 1c4ba6c 29c16a4 1c4ba6c 6e35142 1c4ba6c 6e35142 1c4ba6c eaed2c2 dbc9269 9485209 eaed2c2 d1e3f48 eaed2c2 fd5f6db 578d71a 030116b fd5f6db eaed2c2 1c4ba6c d562ee1 eaed2c2 d562ee1 1c4ba6c 8ca2e83 0863f8c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import spaces
import os
import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from pytube import YouTube
from transformers import pipeline
import re
import numpy as np
# Load the ASR pipeline once at import time; device=0 pins inference to the first GPU.
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)
# Table mapping Kabardian Cyrillic digraphs/trigraphs to the single-codepoint
# stand-ins the acoustic model was trained on.  Order matters: 'кхъ' is listed
# before 'къ' so its stand-in 'qҳ' wins over 'q' in the regex alternation.
replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'), ('я', 'йа')
]

# Inverted table: model stand-in -> original Cyrillic sequence.
reverse_replacements = dict((stand_in, cyrillic) for cyrillic, stand_in in replacements)
# Compile one alternation over every stand-in (insertion order preserved).
reverse_pattern = re.compile('|'.join(map(re.escape, reverse_replacements)))


def replace_symbols_back(text):
    """Map model stand-in symbols in *text* back to Kabardian Cyrillic."""
    return reverse_pattern.sub(lambda m: reverse_replacements[m.group(0)], text)
@spaces.GPU
def transcribe_speech(audio):
    """Transcribe an audio file/array with the ASR pipeline.

    Returns the transcription converted back to Kabardian Cyrillic, or a
    notice string when no audio was provided (microphone can yield None).
    """
    if audio is None:
        return "No audio received."
    result = pipe(audio, chunk_length_s=10)
    return replace_symbols_back(result['text'])
@spaces.GPU
def transcribe_streaming(stream, new_chunk):
    """Incrementally transcribe streamed microphone audio.

    Parameters:
        stream: accumulated mono float32 samples so far (Gradio state), or
            None on the first chunk.
        new_chunk: (sampling_rate, samples) tuple from the microphone, or
            None when no audio was delivered.

    Returns:
        (updated stream state, transcription text).
    """
    if new_chunk is None:
        # BUG FIX: the original returned the notice string as the *stream*
        # state (first output), corrupting accumulation on the next chunk.
        # Keep the state intact and surface the message in the textbox slot.
        return stream, "No audio received."
    sampling_rate, audio_data = new_chunk
    audio_data = audio_data.astype(np.float32)
    # Peak-normalize; guard against an all-silent chunk, which previously
    # caused a division by zero and filled the buffer with NaNs.
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak
    # Down-mix multi-channel audio to mono.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)
    stream = audio_data if stream is None else np.concatenate([stream, audio_data])
    transcription = pipe({"sampling_rate": sampling_rate, "raw": stream})['text']
    return stream, replace_symbols_back(transcription)
def transcribe_from_youtube(url, progress=gr.Progress()):
    """Download the audio track of a YouTube video and transcribe it.

    Returns (local audio file path, transcription text).
    """
    progress(0, "Downloading YouTube audio...")
    # Grab the first audio-only stream and save it locally.
    audio_stream = YouTube(url).streams.filter(only_audio=True)[0]
    audio_path = audio_stream.download(filename="tmp.mp4")
    progress(0.5, "Transcribing audio...")
    return audio_path, transcribe_speech(audio_path)
def populate_metadata(url):
    """Fetch the thumbnail URL and title of a YouTube video."""
    video = YouTube(url)
    return video.thumbnail_url, video.title
# Build the three-tab Gradio UI: live microphone, file upload, and YouTube URL.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 500px; margin: 0 auto;">
          <div>
            <h1>Kabardian Speech Transcription</h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%">
            Kabardian speech to text transcription using a fine-tuned Wav2Vec2-BERT model
          </p>
        </div>
        """
    )
    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(sources='microphone', streaming=True)
        mic_transcription_output = gr.Textbox(label="Transcription", lines=10)
        # BUG FIX: the original passed two *distinct* inline gr.State()
        # instances as input and output, so the accumulated audio stream was
        # never persisted between chunks.  A single shared State component
        # must be used for both sides of the event.
        mic_stream_state = gr.State()
        mic_audio.stream(
            fn=transcribe_streaming,
            inputs=[mic_stream_state, mic_audio],
            outputs=[mic_stream_state, mic_transcription_output],
        )

    with gr.Tab("File Upload"):
        gr.Markdown("## Transcribe speech from uploaded file")
        upload_audio = gr.Audio(sources="upload", type="filepath")
        upload_transcribe_button = gr.Button("Transcribe")
        file_transcription_output = gr.Textbox(label="Transcription")
        upload_transcribe_button.click(
            fn=transcribe_speech,
            inputs=upload_audio,
            outputs=file_transcription_output,
        )

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        with gr.Row():
            img = gr.Image(label="Thumbnail", height=240, width=240, scale=1)
            title = gr.Label(label="Video Title", scale=2)
        yt_transcribe_button = gr.Button("Transcribe")
        yt_transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
        youtube_audio_output = gr.Audio(label="Downloaded Audio", type="filepath")
        yt_transcribe_button.click(
            fn=transcribe_from_youtube,
            inputs=youtube_url,
            outputs=[youtube_audio_output, yt_transcription_output],
        )
        # Refresh thumbnail and title whenever the URL changes.
        youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()