File size: 3,369 Bytes
bfb5ccb
1c4ba6c
8ca2e83
 
 
 
1c4ba6c
0863f8c
3f40220
8ca2e83
beedcb4
 
3f40220
 
 
 
 
 
 
 
 
 
 
 
 
1c4ba6c
bfb5ccb
beedcb4
d1e3f48
 
 
beedcb4
d1e3f48
 
 
 
9f4fdf3
6e5c3ef
1c4ba6c
3f40220
beedcb4
d1e3f48
beedcb4
 
9f4fdf3
beedcb4
eaed2c2
1c4ba6c
14f36e9
 
1c4ba6c
29c16a4
1c4ba6c
 
 
 
6e35142
1c4ba6c
 
6e35142
1c4ba6c
 
 
 
 
eaed2c2
 
beedcb4
d1e3f48
beedcb4
 
 
eaed2c2
 
 
 
fd5f6db
578d71a
030116b
 
fd5f6db
eaed2c2
1c4ba6c
eaed2c2
beedcb4
1c4ba6c
8ca2e83
0863f8c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import spaces
import os
import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from pytube import YouTube
from transformers import pipeline
import re

# Module-level ASR pipeline; loads model weights at import time (network required).
# pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0) # old model
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0) # new model with a new tokenizer

# Cyrillic digraph -> single-character token mapping used by the model's tokenizer.
replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'), ('я', 'йа')
]

# Inverse mapping: model output tokens -> Kabardian Cyrillic digraphs.
reverse_replacements = {v: k for k, v in replacements}
# Sort keys longest-first so the multi-character token 'qҳ' is matched before
# its prefix 'q'. Previously correctness relied on dict insertion order
# mirroring the list above, which would silently break if it were reordered.
reverse_pattern = re.compile(
    '|'.join(re.escape(key) for key in sorted(reverse_replacements, key=len, reverse=True))
)

def replace_symbols_back(text):
    """Restore Kabardian Cyrillic digraphs from the model's single-char tokens."""
    def _restore(match):
        return reverse_replacements[match.group(0)]

    return reverse_pattern.sub(_restore, text)

@spaces.GPU
def transcribe_speech(audio, progress=gr.Progress()):
    """Run ASR on an audio file path and return Cyrillic text.

    Returns a placeholder message when no audio was captured.
    """
    # Microphone input can deliver None when nothing was recorded.
    if audio is None:
        return "No audio received."

    progress(0.5, desc="Transcribing audio...")
    result = pipe(audio, chunk_length_s=10)
    # Map the tokenizer's single-char symbols back to standard digraphs.
    return replace_symbols_back(result['text'])

def transcribe_from_youtube(url, progress=gr.Progress()):
    """Download a YouTube video's audio track, transcribe it, and clean up.

    Args:
        url: YouTube video URL.
        progress: Gradio progress tracker.

    Returns:
        The transcription string from transcribe_speech.
    """
    progress(0, "Downloading YouTube audio...")
    # Download audio from YouTube using pytube
    audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4")

    try:
        return transcribe_speech(audio_path)
    finally:
        # Remove the temp file even if transcription raises; the original
        # leaked tmp.mp4 on any transcription error.
        os.remove(audio_path)

def populate_metadata(url):
    """Fetch the thumbnail URL and title for a YouTube video."""
    video = YouTube(url)
    return video.thumbnail_url, video.title

# Gradio UI: two tabs (microphone/upload and YouTube URL), each wired to its
# own transcription callback. Statement order defines on-screen layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
            <div style="text-align: center; max-width: 500px; margin: 0 auto;">
              <div>
                <h1>Kabardian Speech Transcription</h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                Kabardian speech to text transcription using a fine-tuned Wav2Vec2-BERT model
              </p>
            </div>
        """
    )
    
    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        # type="filepath" so the callback receives a path usable by the pipeline.
        mic_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label="Record or upload an audio")
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription")
        
        transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=transcription_output)

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        
        with gr.Row():
            img = gr.Image(label="Thumbnail", height=240, width=240, scale=1)
            title = gr.Label(label="Video Title", scale=2)
        
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
        
        transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
        # Auto-populate thumbnail and title whenever the URL field changes.
        youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()