gilesitorr committed
Commit 023650f · Parent(s): 6f91997

Create app.py

Files changed (1): app.py +160 -0
app.py ADDED
import streamlit as st
import json
from urllib.request import urlopen
from thefuzz import fuzz
from keras_transformer import get_model, decode
from keras.utils.data_utils import get_file

####################################################################################################
# FUNCTIONS
def search_fit(word, data, threshold=50, fraction=2/3):
    # This function also works for n-word phrases: just strip the spaces
    # between the words before comparing.
    target = ''
    original = ''
    best_score = 0

    for item in data.keys():
        for i in range(len(data[item])):
            data_item = data[item][i].replace(' ', '')
            score = fuzz.ratio(word, data_item)
            # Keep the best-scoring match above the threshold whose length is
            # within a factor of `fraction` of the query's length.
            if score > best_score and score >= threshold \
                    and fraction*len(word) <= len(data_item) <= len(word)/fraction:
                best_score = score
                target = item
                original = data_item

    return target, best_score, original
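
# Illustrative usage (hypothetical data, assuming `data` maps each output-language
# entry to a list of input-language spellings): with
#     data = {'hello': ['ola', 'buenos dias']}
# search_fit('hola', data) compares 'hola' against 'ola' and 'buenosdias'
# and returns approximately ('hello', 86, 'ola'); 'buenosdias' is rejected by
# the length filter before its score is considered.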

def find_longest_phrase(data):
    # Length, in words, of the longest phrase stored anywhere in the dictionary.
    biggest_len = max([max([len(data[item][i].split()) for i in range(len(data[item]))]) for item in data.keys()])
    return biggest_len
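
# Illustrative usage (hypothetical data):
#     find_longest_phrase({'a': ['one two three'], 'b': ['four']})  # -> 3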

def create_tuples(sample_list, tuple_size):
    # Index windows for every run of `tuple_size` consecutive items in `sample_list`.
    tuple_list = [tuple([i+j for j in range(tuple_size)])
                  for i in range(len(sample_list)-tuple_size+1)]
    return tuple_list
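
# Example: create_tuples(['a', 'b', 'c'], 2) returns [(0, 1), (1, 2)] -- the
# positions of each pair of consecutive tokens, not the tokens themselves.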

# NOTE: replace the consecutive-window function with something that generates
# cyclic permutations.
def make_translation(transcription, data, threshold=50, fraction=2/3):
    # To set limits for comparison size
    data_len = find_longest_phrase(data)
    transcription_len = len(transcription.split())
    biggest_len = min(data_len, transcription_len)

    # Positions of the transcription that have not been translated yet.
    index_translation = list(range(transcription_len))

    translation_dict = {}
    translation = transcription
    transcription_split = transcription.split()

    # Try the longest index windows first, down to single words.
    for i in range(biggest_len, 0, -1):
        # Match comparisons
        if i > 1:
            translation_dict.update({combination: search_fit(''.join(transcription_split[combination[0]:combination[-1]+1]), data, threshold, fraction) for combination in create_tuples(transcription_split, i)})
        else:
            translation_dict.update({combination: search_fit(transcription_split[combination[0]], data, threshold, fraction) for combination in create_tuples(transcription_split, i)})

        # Get the best translation, prioritizing the longest phrases.
        # The search could be improved here by taking windows in descending
        # score order instead of sequential order.
        for combination in create_tuples(transcription_split, i):
            clear_index = min([1*(item in index_translation) for item in combination])  # 1 if all indexes are free
            if clear_index and i > 1 and translation_dict[combination][1] > threshold:
                taken = False
                translation_split = translation.split()
                for number, word in enumerate(translation_split):
                    if number in combination:
                        if not taken:
                            if len(translation_dict[combination][0].split()) > 1:
                                # Hyphenate multi-word targets so they keep occupying one slot.
                                translation_split[number] = '-'.join(translation_dict[combination][0].split())
                            else:
                                translation_split[number] = translation_dict[combination][0]
                            taken = True
                        else:
                            translation_split[number] = '<>'
                translation = ' '.join(translation_split)

                # Mark consumed positions with -1 (0 is itself a valid position).
                index_translation = [item if item not in combination else -1 for item in index_translation]

            elif index_translation[combination[0]] != -1 and i == 1 and translation_dict[combination][1] > threshold:
                taken = False
                translation_split = translation.split()
                for number, word in enumerate(translation_split):
                    if number in combination:
                        if not taken:
                            if len(translation_dict[combination][0].split()) > 1:
                                translation_split[number] = '-'.join(translation_dict[combination][0].split())
                            else:
                                translation_split[number] = translation_dict[combination][0]
                            taken = True
                        else:
                            translation_split[number] = '<>'
                translation = ' '.join(translation_split)
                index_translation = [item if item not in combination else -1 for item in index_translation]

    # Drop the hyphen placeholders and the '<>' markers, then collapse whitespace.
    return ' '.join(translation.replace('-', ' ').replace('<>', '').split())
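
# Illustrative usage (hypothetical dictionary): with
#     data = {'water': ['atl'], 'good water': ['cualli atl']}
# make_translation('cualli atl', data) matches the whole two-word window
# against 'cualli atl' first and returns 'good water'.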

def remover(my_string=""):
    # Strip every character that is not in the `values` whitelist defined below.
    for item in my_string:
        if item not in values:
            my_string = my_string.replace(item, "")
    return my_string

def translate(oracion, model):
    sentence = oracion[:]  # make_translation(oracion.strip().lower(), dictionary, threshold=90, fraction=4/5)
    # Append end/padding markers and map each token to its id, falling back to '<UNK>'.
    sentence_tokens = [tokens + ['<END>', '<PAD>'] for tokens in [sentence.split(' ')]]
    tr_input = [list(map(lambda x: source_token_dict[x] if x in source_token_dict.keys() else source_token_dict['<UNK>'], tokens)) for tokens in sentence_tokens][0]
    decoded = decode(
        model,
        tr_input,
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>']
    )

    # Drop the <START>/<END> markers and map the ids back to tokens.
    return ' '.join(map(lambda x: target_token_dict_inv[x], decoded[1:-1]))
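
# Illustrative usage inside the app: translate(remover('cualli atl'), model)
# returns the model's translation as a plain string, ready for st.text().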

####################################################################################################
# MAIN APP
path_dict = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/raw/main/Dictionaries/'

# Token-to-id vocabulary, shared by the encoder and the decoder.
response = urlopen(path_dict+'uncased_tokens_pretrained.json')
source_token_dict = json.loads(response.read())
target_token_dict = source_token_dict.copy()

# Id-to-token vocabulary (JSON keys arrive as strings, so cast them back to int).
response = urlopen(path_dict+'uncased_tokens_inv_pretrained.json')
target_token_dict_inv = json.loads(response.read())
target_token_dict_inv = {int(k): v for k, v in target_token_dict_inv.items()}

# Spanish-Nahuatl phrase dictionary (used by the make_translation fallback).
response = urlopen(path_dict+'es_nah.json')
dictionary = json.loads(response.read())
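
# Assumed shapes (illustrative, not verified against the hosted files; the
# special tokens are the ones translate() and decode() rely on):
#     source_token_dict ~ {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3, ...}
#     dictionary ~ {phrase_in_one_language: [spellings_in_the_other, ...]}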

# Transformer with 2 encoder and 2 decoder blocks and separate source/target embeddings.
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=256,
    encoder_num=2,
    decoder_num=2,
    head_num=32,
    hidden_dim=2048,
    dropout_rate=0.1,
    use_same_embed=False,
)

# Download the pretrained weights and load them into the model.
path_model = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/resolve/main/Models/'
filename = path_model+'uncased_translator_espanol2nahuatl+hybrid.h5'
weights_path = get_file(
    '.././model.h5',
    filename)
model.load_weights(weights_path)

# Characters kept by remover(); everything else is stripped from the input.
values = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")
text = st.text_area('Escriba una frase a traducir: ')  # "Write a phrase to translate:"
if text:
    out = translate(remover(text.lower()), model)
    st.text(out)