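"""Gradio Space for Kabardian speech-to-text transcription.

Offers microphone streaming, file upload, and YouTube URL input, backed by
the fine-tuned Wav2Vec2-BERT model anzorq/w2v-bert-2.0-kbd-v2.
"""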
import re

import gradio as gr
import numpy as np
import spaces
from pytube import YouTube
from transformers import pipeline
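
# Load the fine-tuned Kabardian Wav2Vec2-BERT ASR pipeline on the first GPU.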
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)
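
# The model's vocabulary uses single-character placeholders in place of
# Kabardian digraphs/trigraphs; this table pairs each Cyrillic sequence
# with its placeholder so model output can be mapped back.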
replacements = [
('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
('щӏ', 'ɕ'), ('я', 'йа')
]
reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))
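
# Convert the model's placeholder symbols back to standard Kabardian orthography.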
def replace_symbols_back(text):
    return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
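
# Transcribe a complete recording (file path or array) in 10-second chunks.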
@spaces.GPU
def transcribe_speech(audio):
    if audio is None:  # Handle the NoneType error for microphone input
        return "No audio received."
    transcription = pipe(audio, chunk_length_s=10)['text']
    return replace_symbols_back(transcription)
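
# Streaming callback: accumulate microphone chunks in `stream` and
# re-transcribe the whole utterance so far on each new chunk.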
@spaces.GPU
def transcribe_streaming(stream, new_chunk):
    if new_chunk is None:  # Handle the NoneType error for microphone input
        return stream, "No audio received."
    sampling_rate, audio_data = new_chunk
    audio_data = audio_data.astype(np.float32)
    # Normalize to [-1, 1]; guard against division by zero on silent chunks
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak
    # Convert audio data to mono if it has multiple channels
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)
    if stream is not None:
        stream = np.concatenate([stream, audio_data])
    else:
        stream = audio_data
    transcription = pipe({"sampling_rate": sampling_rate, "raw": stream})['text']
    return stream, replace_symbols_back(transcription)
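
# Download a video's audio track with pytube, then reuse transcribe_speech.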
def transcribe_from_youtube(url, progress=gr.Progress()):
    progress(0, "Downloading YouTube audio...")
    # Download audio from YouTube using pytube
    audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
    progress(0.5, "Transcribing audio...")
    transcription = transcribe_speech(audio_path)
    return audio_path, transcription
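
# Fetch the video thumbnail and title to preview the entered URL.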
def populate_metadata(url):
    yt = YouTube(url)
    return yt.thumbnail_url, yt.title
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 500px; margin: 0 auto;">
          <div>
            <h1>Kabardian Speech Transcription</h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%">
            Kabardian speech to text transcription using a fine-tuned Wav2Vec2-BERT model
          </p>
        </div>
        """
    )
with gr.Tab("Microphone Input"):
gr.Markdown("## Transcribe speech from microphone")
mic_audio = gr.Audio(sources='microphone', streaming=True)
transcription_output = gr.Textbox(label="Transcription", lines=10)
mic_audio.stream(fn=transcribe_streaming, inputs=[gr.State(), mic_audio], outputs=[gr.State(), transcription_output])
with gr.Tab("File Upload"):
gr.Markdown("## Transcribe speech from uploaded file")
upload_audio = gr.Audio(sources="upload", type="filepath")
transcribe_button = gr.Button("Transcribe")
file_transcription_output = gr.Textbox(label="Transcription")
transcribe_button.click(fn=transcribe_speech, inputs=upload_audio, outputs=file_transcription_output)
with gr.Tab("YouTube URL"):
gr.Markdown("## Transcribe speech from YouTube video")
youtube_url = gr.Textbox(label="Enter YouTube video URL")
with gr.Row():
img = gr.Image(label="Thumbnail", height=240, width=240, scale=1)
title = gr.Label(label="Video Title", scale=2)
transcribe_button = gr.Button("Transcribe")
transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
youtube_audio_output = gr.Audio(label="Downloaded Audio", type="filepath")
transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=[youtube_audio_output, transcription_output])
youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
demo.launch()