Spaces: Sleeping
Commit · 023650f
Parent(s): 6f91997
Create app.py
app.py ADDED
@@ -0,0 +1,160 @@
import streamlit as st
import json
from urllib.request import urlopen
from thefuzz import fuzz
from itertools import combinations
from keras_transformer import get_model, decode
from keras.utils.data_utils import get_file

####################################################################################################
# FUNCTIONS
def search_fit(word, data, threshold=50, fraction=2/3):
    # This function also works for n-word phrases: just strip the spaces
    # between words before comparing.
    target = ''
    original = ''
    best_score = 0

    for item in data.keys():
        for i in range(len(data[item])):
            data_item = data[item][i].replace(' ', '')
            score = fuzz.ratio(word, data_item)
            # Keep the best match above the threshold whose length stays
            # within a factor of `fraction` of the query's length
            if score > best_score and score >= threshold \
                    and fraction*len(word) <= len(data_item) <= len(word)/fraction:
                best_score = score
                target = item
                original = data_item

    return target, best_score, original
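
# A minimal usage sketch (toy dictionary with hypothetical entries; the real app
# uses the es_nah.json loaded below):
#   data = {'flor': ['xochitl']}
#   search_fit('xochit', data)  # -> ('flor', 92, 'xochitl'): fuzz.ratio scores
#                               #    about 92, and 7 chars sits within the
#                               #    2/3..3/2 length band around 6 chars.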

def find_longest_phrase(data):
    # Length, in words, of the longest phrase across all dictionary entries
    return max(len(phrase.split()) for phrases in data.values() for phrase in phrases)
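
# With a dictionary such as {'flor': ['xochitl'], 'canto florido': ['in xochitl in cuicatl']}
# (hypothetical entries), find_longest_phrase returns 4: the longest value has four words.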

def create_tuples(sample_list, tuple_size):
    # Sliding windows of consecutive indices over sample_list
    tuple_list = [tuple(i + j for j in range(tuple_size))
                  for i in range(len(sample_list) - tuple_size + 1)]
    return tuple_list
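
# Example: create_tuples(['in', 'xochitl', 'in', 'cuicatl'], 2)
# -> [(0, 1), (1, 2), (2, 3)], i.e. windows of consecutive positions,
# not the words themselves.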

# NOTE: replace the combinations-style windows with something that generates
# cyclic permutations.
def make_translation(transcription, data, threshold=50, fraction=2/3):

    # Limits for the comparison window size
    data_len = find_longest_phrase(data)
    transcription_len = len(transcription.split())
    biggest_len = min(data_len, transcription_len)

    # Positions of the transcription that are still free to translate
    index_translation = list(range(transcription_len))

    translation_dict = {}
    translation = transcription
    transcription_split = transcription.split()

    # Try the longest windows first. biggest_len is presumably the intended
    # upper bound: the committed code looped over range(1, 0, -1), which only
    # ever tried single words and left the multi-word branch unreachable.
    for i in range(biggest_len, 0, -1):
        # Score every window of size i against the dictionary (multi-word
        # windows are compared with their spaces stripped, as search_fit expects)
        if i > 1:
            translation_dict.update({combination: search_fit(''.join(transcription_split[combination[0]:combination[-1]+1]), data, threshold, fraction)
                                     for combination in create_tuples(transcription_split, i)})
        else:
            translation_dict.update({combination: search_fit(transcription_split[combination[0]], data, threshold, fraction)
                                     for combination in create_tuples(transcription_split, i)})

        # Keep the best matches, prioritizing the longest phrases (the search
        # could be improved by ranking on score instead of sequential order)
        for combination in create_tuples(transcription_split, i):
            # True only if every position in the window is still free
            clear_index = all(item in index_translation for item in combination)
            if clear_index and translation_dict[combination][1] > threshold:
                taken = False
                translation_split = translation.split()
                for number, word in enumerate(translation_split):
                    if number in combination:
                        if not taken:
                            # Hyphenate multi-word targets so they keep occupying
                            # a single slot until the final cleanup
                            if len(translation_dict[combination][0].split()) > 1:
                                translation_split[number] = '-'.join(translation_dict[combination][0].split())
                            else:
                                translation_split[number] = translation_dict[combination][0]
                            taken = True
                        else:
                            translation_split[number] = '<>'
                translation = ' '.join(translation_split)

                # Mark the consumed positions; None is used as the marker
                # because 0 is itself a valid position value
                index_translation = [item if item not in combination else None for item in index_translation]

    # Drop the placeholders and squeeze the leftover whitespace
    return ' '.join(translation.replace('-', ' ').replace('<>', '').split())
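
# Behavior sketch (hypothetical data; the exact output depends on the dictionary
# and the fuzzy scores):
#   make_translation('xochit cuicatl', {'flor': ['xochitl'], 'canto': ['cuicatl']}, threshold=80)
#   replaces each word with the key of its best dictionary match, giving 'flor canto'.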


def remover(my_string=""):
    # Strip every character that is not in the module-level `values` whitelist
    # defined below (note that accented characters are removed too)
    for item in my_string:
        if item not in values:
            my_string = my_string.replace(item, "")
    return my_string
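
# For instance, with the whitelist defined below, remover('¡hola!') -> 'hola':
# neither '¡' nor '!' is in `values`.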

def translate(oracion, model):
    sentence = oracion[:]  # make_translation(oracion.strip().lower(), dictionary, threshold=90, fraction=4/5)
    # Tokenize, append the end/padding markers, and map tokens to ids
    # (tokens missing from the vocabulary fall back to <UNK>)
    sentence_tokens = [tokens + ['<END>', '<PAD>'] for tokens in [sentence.split(' ')]]
    tr_input = [[source_token_dict.get(token, source_token_dict['<UNK>']) for token in tokens]
                for tokens in sentence_tokens][0]
    decoded = decode(
        model,
        tr_input,
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>']
    )

    # Map the generated ids back to tokens, dropping <START> and <END>
    return ' '.join(map(lambda x: target_token_dict_inv[x], decoded[1:-1]))
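
# decode here is keras_transformer's decoding helper (greedy by default): it
# consumes the encoder input ids and emits target ids until <END>. Something
# like translate('la flor', model) returns the model's hypothesis as a plain
# string; the actual output depends on the downloaded weights.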

####################################################################################################
# MAIN APP
path_dict = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/raw/main/Dictionaries/'

# Token -> id mapping, shared by the encoder and decoder vocabularies
response = urlopen(path_dict+'uncased_tokens_pretrained.json')
source_token_dict = json.loads(response.read())
target_token_dict = source_token_dict.copy()

# id -> token mapping (JSON object keys are strings, so cast them back to int)
response = urlopen(path_dict+'uncased_tokens_inv_pretrained.json')
target_token_dict_inv = json.loads(response.read())
target_token_dict_inv = {int(k): v for k, v in target_token_dict_inv.items()}

# Spanish/Nahuatl phrase dictionary used by make_translation
response = urlopen(path_dict+'es_nah.json')
dictionary = json.loads(response.read())

# The hyperparameters must match the checkpoint the weights were saved from
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=256,
    encoder_num=2,
    decoder_num=2,
    head_num=32,
    hidden_dim=2048,
    dropout_rate=0.1,
    use_same_embed=False,
)

# Fetch the pretrained weights from the Space's repo and load them
path_model = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/resolve/main/Models/'
filename = path_model+'uncased_translator_espanol2nahuatl+hybrid.h5'
weights_path = get_file(
    '.././model.h5',
    filename)
model.load_weights(weights_path)

# Characters allowed through remover; everything else is stripped from the input
values = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")
text = st.text_area('Escriba una frase a traducir: ')  # "Write a phrase to translate:"
if text:
    out = translate(remover(text.lower()), model)
    st.text(out)
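
# To try the Space locally, assuming streamlit, thefuzz, keras-transformer and a
# compatible Keras/TensorFlow are installed:
#     streamlit run app.py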