File size: 2,410 Bytes
6b9c021
 
c3ed104
6b9c021
 
ea2f95f
81a412a
c27a48e
6b9c021
ea2f95f
cd94084
6b9c021
ea2f95f
 
 
ae4f86f
81a412a
ea2f95f
81a412a
a61a05c
c27a48e
 
 
ff8667c
a4340f7
 
 
c27a48e
a4340f7
c27a48e
 
dc74cbe
 
5721619
 
6b9c021
 
a4340f7
6b9c021
 
ea2f95f
 
 
6b9c021
 
 
 
 
 
a4340f7
4c81c7d
a4340f7
6b9c021
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import gradio as gr
import torch
from transformers import pipeline


# Shared title shown on both Gradio interface tabs below.
title = "Transcribe speech in several languages"
# Run inference on the first CUDA GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# German-only wav2vec2 CTC model for German transcription.
# NOTE(review): no `device=` is passed here, so this pipeline always runs on
# CPU even when a GPU is available — confirm whether that is intentional.
asr_pipe_audio2Text_Ge = pipeline(task="automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-german")
# Multilingual Whisper model, used for English transcription and for
# speech-to-English translation (see translateAudio below).
asr_pipe_whisper = pipeline(task="automatic-speech-recognition", model="openai/whisper-base", device=device)

def transcribeFile(inputlang, audio_path: str) -> str:
    """Transcribe an uploaded audio file in the selected source language.

    Args:
        inputlang: Source language selected in the UI; one of "English"
            or "German" (the values offered by the gr.Radio input).
        audio_path: Filesystem path of the uploaded audio file.

    Returns:
        The transcribed text.

    Raises:
        ValueError: If ``inputlang`` is not a supported language.
            (Previously an unsupported value crashed with
            UnboundLocalError because ``transcription`` was never assigned.)
    """
    if inputlang == "English":
        # Whisper: chunk long audio into 25 s windows with 5 s of left
        # context so the model can transcribe inputs of arbitrary length.
        transcription = asr_pipe_whisper(audio_path, chunk_length_s=25, stride_length_s=(5, 0), generate_kwargs={"task":"transcribe"})
    elif inputlang == "German":
        transcription = asr_pipe_audio2Text_Ge(audio_path, chunk_length_s=25, stride_length_s=(5, 0))
    else:
        # Fail fast with a clear message instead of an UnboundLocalError.
        raise ValueError(f"Unsupported source language: {inputlang!r}")
    return transcription["text"]

def translateAudio(audio_path):
    """Translate the speech in *audio_path* into English via Whisper.

    Returns the raw pipeline output dict; callers read its "text" key.
    """
    return asr_pipe_whisper(
        audio_path,
        max_new_tokens=256,
        generate_kwargs={"task":"translate"},
    )

def transcribeFileMulti(inputlang, audio_path: str) -> str:
    """Transcribe microphone audio; for German, append an English translation.

    Args:
        inputlang: Source language selected in the UI; one of "English"
            or "German" (the values offered by the gr.Radio input).
        audio_path: Filesystem path of the recorded audio.

    Returns:
        English: the Whisper transcription text.
        German: the German transcription immediately followed by its English
        translation (concatenated with no separator, matching the original
        German-path behavior).

    Raises:
        ValueError: If ``inputlang`` is not a supported language.
    """
    if inputlang == "English":
        transcription = asr_pipe_whisper(audio_path)
        # BUG FIX: the original fell through to `return output` here, but
        # `output` is only assigned on the German branch, so English input
        # always crashed with UnboundLocalError.
        return transcription["text"]
    elif inputlang == "German":
        transcription = asr_pipe_audio2Text_Ge(audio_path)
        translation = translateAudio(audio_path)
        return transcription["text"] + translation["text"]
    raise ValueError(f"Unsupported source language: {inputlang!r}")


    
# Tab 1: transcribe an uploaded audio file via transcribeFile.
app1 = gr.Interface(
    fn=transcribeFile,
    #inputs=gr.inputs.Audio(label="Upload audio file", type="filepath"),
    inputs=[gr.Radio(["English", "German"], value="German", label="Source Language", info="Select the language of the speech you want to transcribe"),
                     gr.Audio(source="upload", type="filepath",label="Upload audio file")],     
    outputs="text",
    title=title
)


# Tab 2: transcribe live microphone input via transcribeFileMulti
# (German input additionally gets an English translation appended).
app2 = gr.Interface(
    fn=transcribeFileMulti,
    inputs=[gr.Radio(["English", "German"], value="German", label="Source Language", info="Select the language of the speech you want to transcribe"),
                     gr.Audio(source="microphone", type="filepath")], 
    outputs="text",
    title=title
)


# NOTE(review): gr.Audio(source=...) is the Gradio 3.x argument name;
# Gradio 4 renamed it to sources=[...]. Confirm the pinned gradio version
# before upgrading this dependency.
demo = gr.TabbedInterface([app1, app2], ["Audio File", "Microphone"])

if __name__ == "__main__":
    demo.launch()