Spaces: Sleeping
Commit · 023650f
Parent(s): 6f91997
Create app.py
app.py ADDED
@@ -0,0 +1,160 @@
import streamlit as st
import json
from urllib.request import urlopen
from thefuzz import fuzz
from itertools import combinations
from keras_transformer import get_model, decode
from keras.utils.data_utils import get_file

####################################################################################################
# FUNCTIONS
def search_fit(word, data, threshold=50, fraction=2/3):
    # This function also works for n-word phrases: just strip the spaces
    # between words before comparing.
    target = ''
    original = ''
    best_score = 0

    for item in data.keys():
        for i in range(len(data[item])):
            data_item = data[item][i].replace(' ', '')
            score = fuzz.ratio(word, data_item)
            # Keep the best match above the threshold whose length stays
            # within a factor of `fraction` of the query's length
            if score > best_score and score >= threshold \
                    and fraction*len(word) <= len(data_item) <= len(word)/fraction:
                best_score = score
                target = item
                original = data_item

    return target, best_score, original
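
# A minimal usage sketch (toy dictionary with hypothetical entries; the real app
# uses the es_nah.json loaded below):
#   data = {'flor': ['xochitl']}
#   search_fit('xochit', data)  # -> ('flor', 92, 'xochitl'): fuzz.ratio scores
#                               #    about 92, and 7 chars sits within the
#                               #    2/3..3/2 length band around 6 chars.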

def find_longest_phrase(data):
    # Length, in words, of the longest phrase across all dictionary entries
    return max(len(phrase.split()) for phrases in data.values() for phrase in phrases)
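
# With a dictionary such as {'flor': ['xochitl'], 'canto florido': ['in xochitl in cuicatl']}
# (hypothetical entries), find_longest_phrase returns 4: the longest value has four words.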

def create_tuples(sample_list, tuple_size):
    # Sliding windows of consecutive indices over sample_list
    tuple_list = [tuple(i + j for j in range(tuple_size))
                  for i in range(len(sample_list) - tuple_size + 1)]
    return tuple_list
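
# Example: create_tuples(['in', 'xochitl', 'in', 'cuicatl'], 2)
# -> [(0, 1), (1, 2), (2, 3)], i.e. windows of consecutive positions,
# not the words themselves.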

# NOTE: replace the combinations-style windows with something that generates
# cyclic permutations.
def make_translation(transcription, data, threshold=50, fraction=2/3):

    # Limits for the comparison window size
    data_len = find_longest_phrase(data)
    transcription_len = len(transcription.split())
    biggest_len = min(data_len, transcription_len)

    # Positions of the transcription that are still free to translate
    index_translation = list(range(transcription_len))

    translation_dict = {}
    translation = transcription
    transcription_split = transcription.split()

    # Try the longest windows first. biggest_len is presumably the intended
    # upper bound: the committed code looped over range(1, 0, -1), which only
    # ever tried single words and left the multi-word branch unreachable.
    for i in range(biggest_len, 0, -1):
        # Score every window of size i against the dictionary (multi-word
        # windows are compared with their spaces stripped, as search_fit expects)
        if i > 1:
            translation_dict.update({combination: search_fit(''.join(transcription_split[combination[0]:combination[-1]+1]), data, threshold, fraction)
                                     for combination in create_tuples(transcription_split, i)})
        else:
            translation_dict.update({combination: search_fit(transcription_split[combination[0]], data, threshold, fraction)
                                     for combination in create_tuples(transcription_split, i)})

        # Keep the best matches, prioritizing the longest phrases (the search
        # could be improved by ranking on score instead of sequential order)
        for combination in create_tuples(transcription_split, i):
            # True only if every position in the window is still free
            clear_index = all(item in index_translation for item in combination)
            if clear_index and translation_dict[combination][1] > threshold:
                taken = False
                translation_split = translation.split()
                for number, word in enumerate(translation_split):
                    if number in combination:
                        if not taken:
                            # Hyphenate multi-word targets so they keep occupying
                            # a single slot until the final cleanup
                            if len(translation_dict[combination][0].split()) > 1:
                                translation_split[number] = '-'.join(translation_dict[combination][0].split())
                            else:
                                translation_split[number] = translation_dict[combination][0]
                            taken = True
                        else:
                            translation_split[number] = '<>'
                translation = ' '.join(translation_split)

                # Mark the consumed positions; None is used as the marker
                # because 0 is itself a valid position value
                index_translation = [item if item not in combination else None for item in index_translation]

    # Drop the placeholders and squeeze the leftover whitespace
    return ' '.join(translation.replace('-', ' ').replace('<>', '').split())
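
# Behavior sketch (hypothetical data; the exact output depends on the dictionary
# and the fuzzy scores):
#   make_translation('xochit cuicatl', {'flor': ['xochitl'], 'canto': ['cuicatl']}, threshold=80)
#   replaces each word with the key of its best dictionary match, giving 'flor canto'.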


def remover(my_string=""):
    # Strip every character that is not in the module-level `values` whitelist
    # defined below (note that accented characters are removed too)
    for item in my_string:
        if item not in values:
            my_string = my_string.replace(item, "")
    return my_string
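
# For instance, with the whitelist defined below, remover('¡hola!') -> 'hola':
# neither '¡' nor '!' is in `values`.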

def translate(oracion, model):
    sentence = oracion[:]  # make_translation(oracion.strip().lower(), dictionary, threshold=90, fraction=4/5)
    # Tokenize, append the end/padding markers, and map tokens to ids
    # (tokens missing from the vocabulary fall back to <UNK>)
    sentence_tokens = [tokens + ['<END>', '<PAD>'] for tokens in [sentence.split(' ')]]
    tr_input = [[source_token_dict.get(token, source_token_dict['<UNK>']) for token in tokens]
                for tokens in sentence_tokens][0]
    decoded = decode(
        model,
        tr_input,
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>']
    )

    # Map the generated ids back to tokens, dropping <START> and <END>
    return ' '.join(map(lambda x: target_token_dict_inv[x], decoded[1:-1]))
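
# decode here is keras_transformer's decoding helper (greedy by default): it
# consumes the encoder input ids and emits target ids until <END>. Something
# like translate('la flor', model) returns the model's hypothesis as a plain
# string; the actual output depends on the downloaded weights.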

####################################################################################################
# MAIN APP
path_dict = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/raw/main/Dictionaries/'

# Token -> id mapping, shared by the encoder and decoder vocabularies
response = urlopen(path_dict+'uncased_tokens_pretrained.json')
source_token_dict = json.loads(response.read())
target_token_dict = source_token_dict.copy()

# id -> token mapping (JSON object keys are strings, so cast them back to int)
response = urlopen(path_dict+'uncased_tokens_inv_pretrained.json')
target_token_dict_inv = json.loads(response.read())
target_token_dict_inv = {int(k): v for k, v in target_token_dict_inv.items()}

# Spanish/Nahuatl phrase dictionary used by make_translation
response = urlopen(path_dict+'es_nah.json')
dictionary = json.loads(response.read())

# The hyperparameters must match the checkpoint the weights were saved from
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=256,
    encoder_num=2,
    decoder_num=2,
    head_num=32,
    hidden_dim=2048,
    dropout_rate=0.1,
    use_same_embed=False,
)

# Fetch the pretrained weights from the Space's repo and load them
path_model = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/resolve/main/Models/'
filename = path_model+'uncased_translator_espanol2nahuatl+hybrid.h5'
weights_path = get_file(
    '.././model.h5',
    filename)
model.load_weights(weights_path)

# Characters allowed through remover; everything else is stripped from the input
values = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")
text = st.text_area('Escriba una frase a traducir: ')  # "Write a phrase to translate:"
if text:
    out = translate(remover(text.lower()), model)
    st.text(out)
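
# To try the Space locally, assuming streamlit, thefuzz, keras-transformer and a
# compatible Keras/TensorFlow are installed:
#     streamlit run app.py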