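"""Gradio demo: Kabardian speech-to-text transcription with a fine-tuned Wav2Vec2-BERT model.

Supports streaming microphone input, file upload, and YouTube URL transcription.
"""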
import re

import gradio as gr
import numpy as np
import spaces
from pytube import YouTube
from transformers import pipeline

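# ASR pipeline backed by a Wav2Vec2-BERT CTC model fine-tuned for Kabardian;
# device=0 targets the first GPU, which the @spaces.GPU decorator provides.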
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)

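# The model's output vocabulary uses single-character stand-ins for Kabardian
# digraphs/trigraphs; this table pairs each Cyrillic sequence with its stand-in.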
replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'), ('я', 'йа')
]

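# Invert the table to map stand-in characters back to standard orthography.
# Multi-character values like 'qҳ' were inserted before their substrings ('q', 'ҳ'),
# so the regex alternation matches the longest sequence first.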
reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))

def replace_symbols_back(text):
    """Map stand-in characters back to Kabardian Cyrillic, e.g. 'j' -> 'дж', 'q' -> 'къ'."""
    return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)

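# chunk_length_s=10 enables the pipeline's chunked long-form inference,
# splitting long inputs into 10 s windows so they fit in memory.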
@spaces.GPU
def transcribe_speech(audio):
    if audio is None:  # Handle the NoneType error for microphone input
        return "No audio received."
    
    transcription = pipe(audio, chunk_length_s=10)['text']
    
    return replace_symbols_back(transcription)

@spaces.GPU
def transcribe_streaming(stream, new_chunk):
    if new_chunk is None:  # Handle the NoneType error for microphone input
        return stream, "No audio received."

    sampling_rate, audio_data = new_chunk
    audio_data = audio_data.astype(np.float32)

    # Convert audio data to mono if it has multiple channels
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Normalize to [-1, 1]; guard against division by zero on silent chunks
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak

    # Accumulate chunks so each pass transcribes the full utterance so far
    if stream is not None:
        stream = np.concatenate([stream, audio_data])
    else:
        stream = audio_data

    transcription = pipe({"sampling_rate": sampling_rate, "raw": stream})['text']

    return stream, replace_symbols_back(transcription)

def transcribe_from_youtube(url, progress=gr.Progress()):
    progress(0, "Downloading YouTube audio...")
    # Download the audio-only stream from YouTube using pytube
    audio_path = YouTube(url).streams.filter(only_audio=True).first().download(filename="tmp.mp4")

    progress(0.5, "Transcribing audio...")
    transcription = transcribe_speech(audio_path)

    return audio_path, transcription

def populate_metadata(url):
    try:  # the change event also fires on partial URLs, so fail quietly
        yt = YouTube(url)
        return yt.thumbnail_url, yt.title
    except Exception:
        return None, None

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
            <div style="text-align: center; max-width: 500px; margin: 0 auto;">
              <div>
                <h1>Kabardian Speech Transcription</h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                Kabardian speech to text transcription using a fine-tuned Wav2Vec2-BERT model
              </p>
            </div>
        """
    )
    
    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(sources='microphone', streaming=True)
        transcription_output = gr.Textbox(label="Transcription", lines=10)
        
        mic_audio.stream(fn=transcribe_streaming, inputs=[gr.State(), mic_audio], outputs=[gr.State(), transcription_output])

    with gr.Tab("File Upload"):
        gr.Markdown("## Transcribe speech from uploaded file")
        upload_audio = gr.Audio(sources=["upload"], type="filepath")
        transcribe_button = gr.Button("Transcribe")
        file_transcription_output = gr.Textbox(label="Transcription")

        transcribe_button.click(fn=transcribe_speech, inputs=upload_audio, outputs=file_transcription_output)

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        
        with gr.Row():
            img = gr.Image(label="Thumbnail", height=240, width=240, scale=1)
            title = gr.Label(label="Video Title", scale=2)
        
        yt_transcribe_button = gr.Button("Transcribe")
        yt_transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
        youtube_audio_output = gr.Audio(label="Downloaded Audio", type="filepath")

        yt_transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=[youtube_audio_output, yt_transcription_output])
        youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()