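"""Gradio demo for multilingual speech recognition.

Tab 1 transcribes Persian audio with one of three wav2vec2 checkpoints and
renders word-level timestamps in a clickable HTML player; tab 2 sends the
audio to the Google Web Speech API via the speech_recognition package.
"""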
import json
import os

import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
from transformers import pipeline
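# Self-contained HTML player (adapted from Gentle's transcript viewer).
# predict_fa assembles a page as:
#   html_seeker + <audio file path> + html_seeker1 + <transcription JSON> + html_seeker2
# The JS highlights the word under the playhead and seeks the audio when a
# word is clicked.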
html_seeker = '''<html> <head> <meta charset="utf-8" /> <title>Gentle</title> <style> html, body { margin: 0; padding: 0; min-width: 900px; } #header { position: fixed; top: 0; left: 0; height: 50px; min-width: 900px; line-height: 50px; width: 100%; background-color: #999; box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5); font-family: Helvetica, sans-serif; } #header, #header a { color: white; } .home { margin: 0; font-weight: bold; text-transform: lowercase; width: 100px; } h4.home { margin: 0; background: #666; padding-left: 25px; padding-right: 30px; margin-right: 20px; float: left; text-decoration: none; } .home:hover a { background: #555; } #audio { margin-top: 9px; width: 500px; display: inline-block; } #transcript { margin: 0 15px; margin-bottom: 5em; white-space: pre-wrap; line-height: 2em; max-width: 600px; color: #999; clear: both; margin-top: 75px; /*direction: rtl;*/ } .success { color: black; } .success:hover { text-decoration: underline; } .active { color: magenta; background-color: yellow; } #preloader { visibility: hidden; } </style> </head> <body> <div id="header"> <h4 class="home">Model name</h4> <audio id="audio" controls src="'''
html_seeker1 = '''"></audio> </div> <div id="transcript" dir="auto"></div> <script> var $a = document.getElementById("audio"); window.onkeydown = function(ev) { if(ev.keyCode == 32) { ev.preventDefault(); $a.pause(); } } var $trans = document.getElementById("transcript"); var wds = []; var cur_wd; function highlight_word() { var t = $a.currentTime; /* XXX: O(N); use binary search */ var hits = wds.filter(function(x) { return (t - x['timestamp']['0']) > 0.01 && (x['timestamp']['1'] - t) > 0.01; }, wds); var next_wd = hits[hits.length - 1]; if(cur_wd != next_wd) { var active = document.querySelectorAll('.active'); for(var i = 0; i < active.length; i++) { active[i].classList.remove('active'); } if(next_wd && next_wd.$div) { next_wd.$div.classList.add('active'); /*render_phones(next_wd);*/ } } cur_wd = next_wd; /*highlight_phone(t);*/ window.requestAnimationFrame(highlight_word); } window.requestAnimationFrame(highlight_word); $trans.innerHTML = "Loading..."; function render(ret) { wds = ret['chunks'] || []; transcript = ret['text']; $trans.innerHTML = ''; wds.forEach(function(wd) { var $wd = document.createElement('span'); var txt = wd['text']; var $wdText = document.createTextNode(txt); $wd.appendChild($wdText); wd.$div = $wd; $wd.className = 'success'; $wd.onclick = function() { console.log(wd['timestamp']['0']); $a.currentTime = wd['timestamp']['0']; $a.play(); }; $trans.appendChild($wd); $trans.appendChild(document.createTextNode(' ')); }); } function update() { if(INLINE_JSON) { /* We want this to work from file:/// domains, so we provide a mechanism for inlining the alignment data. */ render(INLINE_JSON); } } var INLINE_JSON='''
html_seeker2 = ''';update();
</script> </body> </html>'''
# One ASR pipeline per selectable checkpoint; each model is downloaded from
# the Hugging Face Hub on first use.
asr_models = {
    name: pipeline(task="automatic-speech-recognition", model=name)
    for name in ("voidful/wav2vec2-xlsr-multilingual-56",
                 "SLPL/Sharif-wav2vec2",
                 "ghofrani/common8")
}
def predict_fa(speech, model):
    """Transcribe `speech` (an audio file path) with the selected model."""
    if model not in asr_models:
        raise ValueError(f"Unknown model: {model}")
    result = asr_models[model](speech, return_timestamps="word")
    # Note: depending on the Gradio version, `speech` may need a "/file="
    # prefix before the browser can fetch it as the <audio> source.
    return [result["text"],
            json.dumps(result),
            html_seeker + speech + html_seeker1 + json.dumps(result) + html_seeker2]
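# With return_timestamps="word" the pipeline returns a dict shaped like
# {"text": "...", "chunks": [{"text": "...", "timestamp": (0.3, 0.8)}, ...]};
# after json.dumps the tuples become arrays, which the player's JS indexes
# as chunk['timestamp']['0'] / ['1'].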
def convert_to_wav(filename):
    """Convert an audio file to .wav with pydub, avoiding name collisions."""
    base, ext = os.path.splitext(filename)
    audio = AudioSegment.from_file(filename, format=ext.lstrip("."))
    new_filename = base + ".wav"
    counter = 1
    while os.path.exists(new_filename):
        new_filename = f"{base}({counter}).wav"
        counter += 1
    print(f"Converting {filename} to {new_filename}...")
    audio.export(new_filename, format="wav")
    return new_filename
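# Example: convert_to_wav("clip.mp3") writes "clip.wav" (or "clip(1).wav" if
# that name is taken) and returns the new path.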
def g_rec(audio_file, language):
    """Transcribe an audio file with the Google Web Speech API."""
    r = sr.Recognizer()
    print(audio_file)
    # speech_recognition reads WAV/AIFF/FLAC natively; uncomment to convert
    # other formats first:
    # if not os.path.splitext(audio_file)[1] == ".wav":
    #     audio_file = convert_to_wav(audio_file)
    with sr.AudioFile(audio_file) as source:
        audio = r.record(source)
    try:
        return "Text: " + r.recognize_google(audio, language=language)
    except Exception as e:
        return "Exception: " + str(e)
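# recognize_google uses SpeechRecognition's built-in, rate-limited demo API
# key unless one is passed via its `key` argument; fine for demos only.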
# Note: the wav2vec2 checkpoints expect 16 kHz input; the transformers
# pipeline resamples uploaded files to that rate via ffmpeg.
with gr.Blocks() as demo:
    gr.Markdown("Multilingual Speech Recognition")
    with gr.Tab("Persian models"):
        inputs_speech_fa = gr.Audio(source="upload", type="filepath",
                                    label="Upload your audio:")
        inputs_model_fa = gr.Radio(label="Model",
                                   choices=["ghofrani/common8",
                                            "SLPL/Sharif-wav2vec2",
                                            "voidful/wav2vec2-xlsr-multilingual-56"])
        output_transcribe1_fa = gr.Textbox(label="Transcribed text:")
        output_transcribe1_fa1 = gr.Textbox(label="Transcribed text with timestamps:")
        output_transcribe1_fa2 = gr.HTML("")
        transcribe_audio1_fa = gr.Button("Submit")
with gr.Tab("google"):
gr.Markdown("set your speech language")
inputs_speech1 =[
gr.Audio(source="upload", type="filepath"),
gr.Dropdown(choices=["af-ZA","am-ET","ar-AE","ar-BH","ar-DZ","ar-EG","ar-IL","ar-IQ","ar-JO","ar-KW","ar-LB","ar-MA","ar-MR","ar-OM","ar-PS","ar-QA","ar-SA","ar-TN","ar-YE","az-AZ","bg-BG","bn-BD","bn-IN","bs-BA","ca-ES","cs-CZ","da-DK","de-AT","de-CH","de-DE","el-GR","en-AU","en-CA","en-GB","en-GH","en-HK","en-IE","en-IN","en-KE","en-NG","en-NZ","en-PH","en-PK","en-SG","en-TZ","en-US","en-ZA","es-AR","es-BO","es-CL","es-CO","es-CR","es-DO","es-EC","es-ES","es-GT","es-HN","es-MX","es-NI","es-PA","es-PE","es-PR","es-PY","es-SV","es-US","es-UY","es-VE","et-EE","eu-ES","fa-IR","fi-FI","fil-PH","fr-BE","fr-CA","fr-CH","fr-FR","gl-ES","gu-IN","hi-IN","hr-HR","hu-HU","hy-AM","id-ID","is-IS","it-CH","it-IT","iw-IL","ja-JP","jv-ID","ka-GE","kk-KZ","km-KH","kn-IN","ko-KR","lo-LA","lt-LT","lv-LV","mk-MK","ml-IN","mn-MN","mr-IN","ms-MY","my-MM","ne-NP","nl-BE","nl-NL","no-NO","pa-Guru-IN","pl-PL","pt-BR","pt-PT","ro-RO","ru-RU","si-LK","sk-SK","sl-SI","sq-AL","sr-RS","su-ID","sv-SE","sw-KE","sw-TZ","ta-IN","ta-LK","ta-MY","ta-SG","te-IN","th-TH","tr-TR","uk-UA","ur-IN","ur-PK","uz-UZ","vi-VN","yue-Hant-HK","zh (cmn-Hans-CN)","zh-TW (cmn-Hant-TW)","zu-ZA"]
,value="fa-IR",label="language code")
]
output_transcribe1 = gr.Textbox(label="output")
transcribe_audio1_go= gr.Button("Submit")
    transcribe_audio1_fa.click(fn=predict_fa,
                               inputs=[inputs_speech_fa, inputs_model_fa],
                               outputs=[output_transcribe1_fa,
                                        output_transcribe1_fa1,
                                        output_transcribe1_fa2])
    transcribe_audio1_go.click(fn=g_rec,
                               inputs=inputs_speech1,
                               outputs=output_transcribe1)
if __name__ == "__main__":
    demo.launch()