import streamlit as st
import json
from urllib.request import urlopen
from thefuzz import fuzz
from keras.utils.data_utils import get_file
from keras_transformer import get_model, decode

####################################################################################################
# FUNCTIONS

def search_fit(word, data, threshold=50, fraction=2/3):
    # This also works for n-word phrases: just remove the spaces between the words first.
    target = ''
    original = ''
    best_score = 0
    for item, variants in data.items():
        for variant in variants:
            data_item = variant.replace(' ', '')
            score = fuzz.ratio(word, data_item)
            # Keep the best-scoring candidate whose length is within
            # [fraction * len(word), len(word) / fraction]
            if (score > best_score and score >= threshold
                    and fraction * len(word) <= len(data_item) <= len(word) / fraction):
                best_score = score
                target = item
                original = data_item
    return target, best_score, original


def find_longest_phrase(data):
    # Length, in words, of the longest entry across all dictionary variants
    return max(len(variant.split()) for variants in data.values() for variant in variants)


def create_tuples(sample_list, tuple_size):
    # Sliding windows of consecutive indexes, e.g. size 2 over 3 items -> [(0, 1), (1, 2)]
    return [tuple(i + j for j in range(tuple_size))
            for i in range(len(sample_list) - tuple_size + 1)]


# TODO: replace the sliding-window tuples with something that generates cyclic permutations
def make_translation(transcription, data, threshold=50, fraction=2/3):
    # Upper bound for the phrase-window size
    data_len = find_longest_phrase(data)
    transcription_len = len(transcription.split())
    biggest_len = min(data_len, transcription_len)

    # index_translation tracks which word positions are still free; -1 marks a consumed one
    index_translation = list(range(transcription_len))
    translation_dict = {}
    translation = transcription
    transcription_split = transcription.split()

    # Scan windows from the longest phrases down to single words
    for i in range(biggest_len, 0, -1):
        # Score every window of i consecutive words against the dictionary
        for combination in create_tuples(transcription_split, i):
            if i > 1:
                query = ''.join(transcription_split[combination[0]:combination[-1] + 1])
            else:
                query = transcription_split[combination[0]]
            translation_dict[combination] = search_fit(query, data, threshold, fraction)

        # Substitute the matches, prioritizing the longest phrases
        # TODO: the search could be improved by prioritizing the highest score
        # instead of sequential order
        for combination in create_tuples(transcription_split, i):
            all_free = all(index_translation[j] != -1 for j in combination)
            if all_free and translation_dict[combination][1] > threshold:
                taken = False
                translation_split = translation.split()
                for number in combination:
                    if not taken:
                        # Hyphenate multi-word targets so they survive later split/join cycles
                        translation_split[number] = '-'.join(translation_dict[combination][0].split())
                        taken = True
                    else:
                        translation_split[number] = '<>'  # placeholder absorbed by the phrase
                translation = ' '.join(translation_split)
                index_translation = [-1 if j in combination else item
                                     for j, item in enumerate(index_translation)]

    # Undo the hyphenation, drop the placeholders, and normalize whitespace
    return ' '.join(translation.replace('-', ' ').replace('<>', '').split())
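
# A minimal sketch of the dictionary-matching step above, on a toy dictionary. The
# entries are illustrative assumptions, not taken from the real nah_es.json, and the
# scores are approximate:
#
#   toy = {'casa': ['kalli'], 'agua': ['atl'], 'buenos dias': ['cualli tonalli']}
#   search_fit('kali', toy)                          # -> ('casa', 89, 'kalli')
#   make_translation('kali atl', toy, threshold=80)  # -> 'casa agua'
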
def remover(my_string=""):
    # Strip every character that is not in the allowed alphabet (`values`, defined below)
    for item in my_string:
        if item not in values:
            my_string = my_string.replace(item, "")
    return my_string


def translate(oracion, model):
    # The dictionary-based hybrid step can be enabled instead of the raw sentence:
    # sentence = make_translation(oracion.strip().lower(), dictionary, threshold=90, fraction=4/5)
    sentence = oracion[:]
    # The special-token names ('<START>', '<END>', '<PAD>', '<UNK>') are assumed to follow
    # the keras-transformer convention; they must match the keys stored in
    # uncased_tokens_pretrained.json.
    sentence_tokens = [tokens + ['<END>', '<PAD>'] for tokens in [sentence.split(' ')]]
    tr_input = [[source_token_dict.get(token, source_token_dict['<UNK>']) for token in tokens]
                for tokens in sentence_tokens][0]
    decoded = decode(
        model,
        tr_input,
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
    )
    # Drop the start/end tokens and map ids back to words
    return ' '.join(target_token_dict_inv[token_id] for token_id in decoded[1:-1])

####################################################################################################
# MAIN APP

path_dict = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/raw/main/Dictionaries/'

# Token dictionaries (word -> id and id -> word) and the Nahuatl-Spanish dictionary
response = urlopen(path_dict + 'uncased_tokens_pretrained.json')
source_token_dict = json.loads(response.read())
target_token_dict = source_token_dict.copy()

response = urlopen(path_dict + 'uncased_tokens_inv_pretrained.json')
target_token_dict_inv = json.loads(response.read())
target_token_dict_inv = {int(k): v for k, v in target_token_dict_inv.items()}

response = urlopen(path_dict + 'nah_es.json')
dictionary = json.loads(response.read())

model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=256,
    encoder_num=2,
    decoder_num=2,
    head_num=32,
    hidden_dim=2048,
    dropout_rate=0.1,
    use_same_embed=False,
)

path_model = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/resolve/main/Models/'
filename = path_model + 'uncased_translator_nahuatl2espanol+hybrid.h5'
weights_path = get_file('.././model.h5', filename)
model.load_weights(weights_path)

# Characters allowed through the input sanitizer
values = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")

text = st.text_area('Escriba una frase a traducir: ')
if text:
    out = translate(remover(text.lower()), model)
    st.text(out)
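
# To try the app locally (assuming this file is saved as app.py and that streamlit,
# thefuzz, keras, and keras-transformer are installed):
#
#   streamlit run app.py
#
# The token dictionaries and model weights are fetched from the Hugging Face Hub at startup.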