import os

import gradio as gr
from huggingsound import SpeechRecognitionModel
from transformers import logging
from transformers import pipeline
from transformers import BertTokenizer, BertModel
from pydub import AudioSegment

unmasker = pipeline('fill-mask', model='bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
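# Note: the fill-mask pipeline returns, per [MASK], a list of candidate dicts with keys
# 'score', 'token' (vocab id), 'token_str' and 'sequence' (top 5 by default); the disabled
# correction passes further down index into these fields.
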
def levenshtein_distance(s, t):
    m, n = len(s), len(t)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        d[i][0] = i
    for j in range(n + 1):
        d[0][j] = j
    for j in range(1, n + 1):
        for i in range(1, m + 1):
            if s[i - 1] == t[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                d[i][j] = 1 + min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1])
    return d[m][n]
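# Example: levenshtein_distance("kitten", "sitting") == 3
# (two substitutions and one insertion).
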
def collate(tokens):
    # Joins a list of word/punctuation tokens back into readable text: no space before
    # punctuation, capitalisation after sentence-opening marks, and no space after
    # joining marks like "-" and "'".
    pun_marks = [",", ".", "?", "!", ";", ":", "-", "—", "(", ")", "[", "]", "{", "}", "'", "\"", "`"]
    output = ""
    capitalize_next = True
    no_space = False
    for tok in tokens:
        if tok in pun_marks:
            output += tok
            if tok in [".", "("]:
                capitalize_next = True
            no_space = tok in ["-", "'"]
        else:
            piece = ""
            if not no_space and output:
                piece = " "
            if capitalize_next:
                piece += tok.capitalize()
                capitalize_next = False
            else:
                piece += tok
            output += piece
            no_space = False  # regular words are followed by a space
    return output
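# Illustrative example of collate():
#   collate(["hello", ",", "how", "are", "you", "?"]) -> "Hello, how are you?"
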
def everything(audio_file):
    # The wav2vec2 model is (re)loaded on every call; fine for a demo, but slow.
    w2vmodel = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    logging.set_verbosity_error()  # change 'error' to 'warning' or remove this if you want to see the warnings
    # https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
    # https://huggingface.co/bert-base-uncased
    # gr.UploadButton hands the callback a temp file object; transcribe() expects a list of paths.
    path = audio_file.name if hasattr(audio_file, "name") else audio_file
    transcriptions = w2vmodel.transcribe([path])
    return transcriptions[0]["transcription"]

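# transcribe() returns one result dict per audio path; its "transcription" value is the raw
# decoded text (for this wav2vec2 model, typically lower-case and unpunctuated), which is what
# the disabled BERT-based correction and collate() passes below were drafted to clean up.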
# input = transcriptions[0]["transcription"]
# input = input.split()
# #(1) is a strategy where token ids are used to gauge word distance (would need numpy imported as np)
# #(2) is a strategy where replaced words are collected in a separate output list
# for t in range(1):
#     # output = [] #(2)
#     for i in range(len(input)):
#         temp = input[i]
#         token = tokenizer(temp)['input_ids'][1]
#         input[i] = "[MASK]"
#         apiint = unmasker(' '.join(input))
#         dist = []
#         for r in range(5):
#             # if (np.abs((apiint[r]['token'] - token)) < 2): #(1)
#             dist.append(levenshtein_distance(temp, apiint[r]['token_str']))
#         lindex = 0
#         l = dist[0]
#         for r in range(5):
#             if dist[r] < l:
#                 lindex = r
#                 l = dist[r]
#         if l <= 2:
#             input[i] = apiint[lindex]['token_str']
#             # output.append(apiint[lindex]['token_str']) #(2)
#         else:
#             input[i] = temp
#             # output.append(temp) #(2)
#             # input[i] = temp #(2)
# for t in range(1):
#     inndex = 1
#     for i in range(len(input)):
#         input.insert(inndex, "[MASK]")
#         # print(' '.join(input))
#         apiint = unmasker(' '.join(input))
#         if (apiint[0]['token'] < 1500):
#             input[inndex] = apiint[0]["token_str"]
#             inndex += 2
#         else:
#             del input[inndex]
#             inndex += 1
# st.write(collate(input))  # (st.write is a Streamlit call; in this Gradio app the text would be returned instead)
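# Summary of the two disabled passes above: pass 1 masks each transcribed word, picks the BERT
# candidate closest to it in Levenshtein distance, and substitutes it only when that distance is
# at most 2; pass 2 inserts a [MASK] between consecutive words and keeps BERT's top suggestion
# only when its token id is below 1500, which in bert-base-uncased's vocabulary is mostly
# punctuation and other single-character tokens.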
# # In comparison, a plain autocorrect gives this output:
# # "The b-movie by Jerry Sinclair, the sound of buzzing
# # bees, can be heard according to all known laws of
# # aviation that is no way for b to be able to fly its
# # wings are too small to get its start little body off
# # the ground, the be, of course, flies anyway because ``
# # bees don't care what humans think is possible.
# # Barbuda is guaranteed one member of the House of
# # Representatives and two members of the Senate."
# # - https://huggingface.co/oliverguhr/spelling-correction-english-base?text=lets+do+a+comparsion
demo = gr.Interface(fn=everything,
                    inputs=gr.UploadButton(label="Upload an audio file"),
                    outputs=["text"])

demo.launch()