import gradio as gr
import torch
import json
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    RobertaForTokenClassification,
)
from faker import Faker
from keras.utils import pad_sequences
import calendar
# Simple container for a (word, label) pair used in the JSON output.
class out_json():
    def __init__(self, w, l):
        self.word = w
        self.label = l

# Custom encoder so json.dumps can serialize out_json objects.
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        return {
            'word': obj.word,
            'label': obj.label
        }
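# Illustrative (hypothetical) example of the encoder above:
#   json.dumps([out_json("Maria", "B-PER")], cls=MyEncoder, ensure_ascii=False)
#   -> '[{"word": "Maria", "label": "B-PER"}]'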
class Model:
    def __init__(self):
        self.texto = ""
        self.idioma = ""
        self.modelo_ner = ""
        self.categoria_texto = ""

    ###
    ### Applies a language-identification model and stores the detected language
    ###
    def identificacion_idioma(self, text):
        self.texto = text
        tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
        model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        preds = torch.softmax(logits, dim=-1)
        id2lang = model.config.id2label
        vals, idxs = torch.max(preds, dim=1)
        # keep the language with the highest probability
        maximo = vals.max()
        idioma = ''
        porcentaje = 0
        for k, v in zip(idxs, vals):
            if v.item() == maximo:
                idioma, porcentaje = id2lang[k.item()], v.item()
        if idioma == 'es':
            self.idioma = "es"
            self.modelo_ner = 'BSC-LT/roberta_model_for_anonimization'
            self.faker_ = Faker('es_MX')
            self.model = RobertaForTokenClassification.from_pretrained(self.modelo_ner)
        else:
            self.idioma = "en"
            self.faker_ = Faker('en_US')
            self.modelo_ner = "dayannex/distilbert-tuned-4labels"
            self.model = AutoModelForTokenClassification.from_pretrained(self.modelo_ner)
        self.categorizar_texto(self.texto)
    def reordenacion_tokens(self, tokens, caracter):
        # Merge WordPiece-style subwords: tokens starting with `caracter`
        # (e.g. '##') are appended to the previous token. Returns the merged
        # tokens and the indices of the labels that must be ignored.
        i = 0
        new_tokens = []
        ig_tokens = []
        for token in tokens:
            ind = len(new_tokens)
            if i < len(tokens):
                if not token.startswith(caracter):
                    new_tokens.append(token)
                    i = i + 1
                else:
                    new_tokens[ind - 1] = new_tokens[ind - 1] + token.replace(caracter, '')
                    ig_tokens.append(i)
                    i = i + 1
        return (
            new_tokens,
            ig_tokens
        )

    def reordenacion_tokens_es(self, tokens, caracter):
        # RoBERTa-style variant: tokens starting with `caracter` (e.g. 'Ġ')
        # begin a new word; any other token continues the previous word.
        i = 0
        new_tokens = []
        ig_tokens = []  # indices to ignore in the label array
        for token in tokens:
            ind = len(new_tokens)
            if i < len(tokens):
                if token.startswith(caracter):
                    new_tokens.append(token)
                    i = i + 1
                else:
                    new_tokens[ind - 1] = new_tokens[ind - 1] + token.replace(caracter, '')
                    ig_tokens.append(i)
                    i = i + 1
        return (
            new_tokens,
            ig_tokens
        )
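    # Illustrative (hypothetical) example of the merging above, using reordenacion_tokens with '#':
    #   tokens = ['Mar', '##ia', 'lives', 'in', 'Mad', '##rid']
    #   ->       (['Maria', 'lives', 'in', 'Madrid'], ig_tokens=[1, 5])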
    def reordenacion_identificadores(self, ig_tokens, predicted_tokens_classes):
        # Drop the predicted labels whose index belongs to a merged subword.
        x = 0
        new_identificadores = []
        for token in predicted_tokens_classes:
            if x not in ig_tokens:
                new_identificadores.append(token)
                x = x + 1
            else:
                x = x + 1
        return new_identificadores
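    # Illustrative (hypothetical) example, matching the merging example above:
    #   reordenacion_identificadores([1, 5], ['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC'])
    #   -> ['B-PER', 'O', 'O', 'B-LOC']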
    def salida_json(self, tokens, pre_tokens):
        # Build a JSON array with every token whose predicted label is not 'O'.
        lista = []
        i = 0
        for t in tokens:
            if pre_tokens[i] != 'O':
                a = out_json(t.replace('##', '').replace('Ġ', '').replace('Ċ', ''), pre_tokens[i].replace('▁', ''))
                lista.append(a)
            i = i + 1
        return json.dumps(lista, cls=MyEncoder, ensure_ascii=False)

    def tokens_identificados(self, tokens, pre_tokens):
        # Return the cleaned tokens whose predicted label is not 'O'.
        lista = []
        i = 0
        for t in tokens:
            if pre_tokens[i] != 'O':
                a = t.replace('##', '').replace('Ġ', '').replace('Ċ', '')
                lista.append(a)
            i = i + 1
        return lista
    def metricas_anonimizacion(self, _f, t, id):
        # Count how many identified tokens were left unchanged by the fake
        # replacement, reported as "unchanged/total identified tokens".
        i = 0
        coincidencia = 0
        Z = ['O']
        _fake_filter = [x for x in _f if x not in Z]
        new_tokens_filter = self.tokens_identificados(t, id)
        for token in new_tokens_filter:
            if token == _fake_filter[i]:
                coincidencia = coincidencia + 1
            i = i + 1
        return str(coincidencia) + "/" + str(len(_fake_filter))
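    # Illustrative example: if 2 of the 3 identified tokens are identical to
    # their replacement (i.e. were not actually anonymized), this returns '2/3'.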
    def salida_texto(self, tokens, pre_tokens):
        # Rebuild the text, replacing each token by its label unless the label
        # is 'O' (or a MISC entity), in which case the original token is kept.
        new_labels = []
        i = 0
        for token in tokens:
            if pre_tokens[i] == 'O' or 'MISC' in pre_tokens[i]:
                new_labels.append(' ' + token.replace('##', '').replace('Ġ', ''))
            else:
                new_labels.append(' ' + pre_tokens[i])
            i = i + 1
        return ''.join(new_labels)

    def salida_texto_anonimizado(self, ids, pre_tokens):
        # Same idea as salida_texto, but decodes the original token ids.
        new_labels = []
        i = 0
        for identificador in pre_tokens:
            if identificador == 'O' or 'OTH' in identificador:
                new_labels.append(self.tokenizer.decode(ids[i]))
            else:
                new_labels.append(' ' + identificador)
            i = i + 1
        return ''.join(new_labels)
    def is_integer_string(self, value):
        try:
            int(value)
            return True
        except ValueError:
            return False

    def formato_salida(self, out):
        a = ""
        for i in out:
            a = a + i.replace('▁', '').replace(' ', '') + ' '
        return a

    def fake_pers(self):
        return self.faker_.name()

    def fake_word(self):
        return self.faker_.word()

    def fake_first_name(self):
        return self.faker_.first_name()

    def fake_last_name(self):
        return self.faker_.last_name()

    def fake_address(self):
        return self.faker_.address()

    def fake_sentence(self, n):
        return self.faker_.sentence(nb_words=n)

    def fake_text(self):
        return self.faker_.text()

    def fake_company(self):
        return self.faker_.company()

    def fake_city(self):
        return self.faker_.city()
    def get_day_of(self, month_name, year=2024):
        # Return the number of days of the given month (Spanish or English name).
        months = {
            'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
            'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12,
            'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
            'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
        }
        month = months[month_name]
        _, num_days = calendar.monthrange(year, month)
        return str(num_days)
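    # Illustrative example: get_day_of('february') with the default year=2024
    # returns '29', since 2024 is a leap year.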
    def reemplazo_fake(self, identificadores, new_tokens):
        # Replace each labelled token with synthetic data from Faker, keeping
        # punctuation and unrecognized values as they are.
        a = ['Enero', 'January', 'February', 'Febrero', 'Marzo', 'March', 'Abril', 'April', 'Mayo', 'May', 'Junio', 'June', 'Julio', 'July', 'Agosto', 'August', 'Septiembre', 'September', 'Octubre', 'October', 'Noviembre', 'November', 'Diciembre', 'December']
        b = ['Ene', 'Jan', 'Feb', 'Mar', 'Mar', 'Abr', 'Apr', 'May', 'May', 'Jun', 'Jun', 'Jul', 'Jul', 'Ago', 'Aug', 'Sep', 'Oct', 'Nov', 'Dic', 'Dec']
        i = 0
        new_iden = []
        for id in identificadores:
            if 'PER' in id:
                new_iden.append(self.fake_first_name())
            elif 'ORG' in id:
                new_iden.append(self.fake_company())
            elif 'LOC' in id:
                new_iden.append(self.fake_city())
            elif 'DATE' in id:
                if self.is_integer_string(new_tokens[i]):
                    # numeric date fragment: year (4 digits), full date (10 chars) or day number
                    match len(new_tokens[i]):
                        case 4:
                            new_iden.append(self.faker_.date()[:4])
                        case 10:
                            new_iden.append(self.faker_.date())
                        case 1:
                            new_iden.append(self.get_day_of('february'))
                        case 2:
                            new_iden.append(self.get_day_of('february'))
                        case _:
                            new_iden.append(id)
                else:
                    # textual date fragment: month name, abbreviation or separator
                    match new_tokens[i]:
                        case w if w in a:
                            new_iden.append(self.faker_.month_name())
                        case w if w in b:
                            new_iden.append(self.faker_.month_name()[:3])
                        case "-":
                            new_iden.append("-")
                        case ".":
                            new_iden.append(".")
                        case ",":
                            new_iden.append(",")
                        case "/":
                            new_iden.append("/")
                        case _:
                            new_iden.append(id)
            else:
                new_iden.append(id)
            i = i + 1
        return new_iden
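    # Illustrative (hypothetical) example:
    #   reemplazo_fake(['B-PER', 'O', 'B-LOC'], ['Maria', 'vive', 'Madrid'])
    #   -> [<fake first name>, 'O', <fake city>]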
    ###
    ### Applies a text-classification model to categorize the text by its context
    ###
    def categorizar_texto(self, texto):
        name = "elozano/bert-base-cased-news-category"
        tokenizer = AutoTokenizer.from_pretrained(name)
        model_ = AutoModelForSequenceClassification.from_pretrained(name)
        inputs_ = tokenizer(texto, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model_(**inputs_).logits
        preds = torch.softmax(logits, dim=-1)
        id2lang = model_.config.id2label
        vals, idxs = torch.max(preds, dim=1)
        # keep the category with the highest probability
        maximo = vals.max()
        cat = ''
        self.categoria_texto = ''
        porcentaje = 0
        for k, v in zip(idxs, vals):
            if v.item() == maximo:
                cat, porcentaje = id2lang[k.item()], v.item()
                self.categoria_texto = cat
        return cat, porcentaje
    ###
    ### Applies the NER model to the input text
    ###
    def predict(self, etiquetas):
        categoria, porcentaje = self.categorizar_texto(self.texto)
        print(categoria, porcentaje)
        self.tokenizer = AutoTokenizer.from_pretrained(self.modelo_ner)
        if self.idioma == 'es':
            inputs = self.tokenizer(self.texto, return_tensors="pt", max_length=512, truncation=True)
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)
            predicted_token_class_ids = predictions[0].tolist()
            predicted_tokens_classes = [self.model.config.id2label[label_id] for label_id in predicted_token_class_ids]
            tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
            # drop the special tokens and their predictions
            predicted_tokens_classes.pop(0)
            predicted_tokens_classes.pop()
            tokens.pop(0)
            tokens.pop()
            new_tokens, ig_tokens = self.reordenacion_tokens_es(tokens, 'Ġ')
        else:
            inputs = self.tokenizer(self.texto, return_tensors="pt")
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)
            predicted_token_class_ids = predictions[0].tolist()
            predicted_tokens_classes = [self.model.config.id2label[label_id] for label_id in predicted_token_class_ids]
            tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
            # drop the special tokens and their predictions
            predicted_tokens_classes.pop(0)
            predicted_tokens_classes.pop()
            tokens.pop(0)
            tokens.pop()
            new_tokens, ig_tokens = self.reordenacion_tokens(tokens, '#')

        new_identificadores = self.reordenacion_identificadores(ig_tokens, predicted_tokens_classes)
        out1 = self.salida_json(new_tokens, new_identificadores)
        if etiquetas:
            # label-only output: the text with entity labels in place of the entities
            out2 = self.salida_texto(new_tokens, new_identificadores)
            out3 = ""
            coincidencia = ""
        else:
            # anonymized output: entities replaced by fake values
            _fake = self.reemplazo_fake(new_identificadores, new_tokens)
            coincidencia = self.metricas_anonimizacion(_fake, new_tokens, new_identificadores)
            out2 = self.salida_texto(new_tokens, _fake)
            out3 = self.salida_json(_fake, new_identificadores)
        return (
            out1,
            str(out2),
            out3,
            coincidencia
        )
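    # Illustrative (hypothetical) shape of the return value when etiquetas=False:
    #   out1         -> '[{"word": "Maria", "label": "B-PER"}]'   (entities found)
    #   out2         -> the text with entities replaced by fake values
    #   out3         -> JSON with the fake values and their labels
    #   coincidencia -> '0/1'  (unchanged / total identified tokens)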
# Model applied column by column to CSV/JSON datasets.
class ModeloDataset:
    def __init__(self):
        self.texto = ""
        self.idioma = ""
        self.modelo_ner = ""
        self.categoria_texto = ""
    def reordenacion_tokens(self, tokens, caracter):
        # Merge WordPiece-style subwords ('##' continuations) into whole words.
        i = 0
        new_tokens = []
        ig_tokens = []
        for token in tokens:
            ind = len(new_tokens)
            if i < len(tokens):
                if not token.startswith(caracter):
                    new_tokens.append(token)
                    i = i + 1
                else:
                    new_tokens[ind - 1] = new_tokens[ind - 1] + token.replace(caracter, '')
                    ig_tokens.append(i)
                    i = i + 1
        return (
            new_tokens,
            ig_tokens
        )

    def reordenacion_tokens_es(self, tokens, caracter):
        # RoBERTa-style variant: 'Ġ' marks the start of a new word.
        i = 0
        new_tokens = []
        ig_tokens = []  # indices to ignore in the label array
        for token in tokens:
            ind = len(new_tokens)
            if i < len(tokens):
                if token.startswith(caracter):
                    new_tokens.append(token)
                    i = i + 1
                else:
                    new_tokens[ind - 1] = new_tokens[ind - 1] + token.replace(caracter, '')
                    ig_tokens.append(i)
                    i = i + 1
        return (
            new_tokens,
            ig_tokens
        )

    def reordenacion_identificadores(self, ig_tokens, predicted_tokens_classes, tamano):
        # Same as in Model, but capped at `tamano` labels (one per merged token).
        x = 0
        new_identificadores = []
        for token in predicted_tokens_classes:
            if x not in ig_tokens:
                if len(new_identificadores) < tamano:
                    new_identificadores.append(token)
                x = x + 1
            else:
                x = x + 1
        return new_identificadores

    def is_integer_string(self, value):
        try:
            int(value)
            return True
        except ValueError:
            return False

    def get_day_of(self, month_name, year=2024):
        # Return the number of days of the given month (Spanish or English name).
        months = {
            'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
            'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12,
            'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
            'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
        }
        month = months[month_name]
        _, num_days = calendar.monthrange(year, month)
        return str(num_days)
    ###
    ### Functions to generate different kinds of fake data depending on the category
    ###
    def fake_pers(self):
        return self.faker_.name()

    def fake_word(self):
        return self.faker_.word()

    def fake_first_name(self):
        return self.faker_.first_name()

    def fake_last_name(self):
        return self.faker_.last_name()

    def fake_address(self):
        return self.faker_.address()

    def fake_sentence(self, n):
        return self.faker_.sentence(nb_words=n)

    def fake_text(self):
        return self.faker_.text()

    def fake_company(self):
        return self.faker_.company()

    def fake_city(self):
        return self.faker_.city()
    def reemplazo_fake(self, identificadores, new_tokens):
        a = ['Enero', 'January', 'February', 'Febrero', 'Marzo', 'March', 'Abril', 'April', 'Mayo', 'May', 'Junio', 'June', 'Julio', 'July', 'Agosto', 'August', 'Septiembre', 'September', 'Octubre', 'October', 'Noviembre', 'November', 'Diciembre', 'December']
        b = ['Ene', 'Jan', 'Feb', 'Mar', 'Mar', 'Abr', 'Apr', 'May', 'May', 'Jun', 'Jun', 'Jul', 'Jul', 'Ago', 'Aug', 'Sep', 'Oct', 'Nov', 'Dic', 'Dec']
        i = 0
        if self.idioma == 'es':
            self.faker_ = Faker('es_MX')
        else:
            self.faker_ = Faker('en_US')
        new_iden = []
        for id in identificadores:
            if 'PER' in id:
                new_iden.append(self.fake_first_name())
            elif 'ORG' in id:
                new_iden.append(self.fake_company())
            elif 'LOC' in id:
                new_iden.append(self.fake_city())
            elif 'DATE' in id:
                if self.is_integer_string(new_tokens[i]):
                    match len(new_tokens[i]):
                        case 4:
                            new_iden.append(self.faker_.date()[:4])
                        case 10:
                            new_iden.append(self.faker_.date())
                        case 1:
                            new_iden.append(self.get_day_of('february'))
                        case 2:
                            new_iden.append(self.get_day_of('february'))
                        case _:
                            new_iden.append(id)
                else:
                    match new_tokens[i]:
                        case w if w in a:
                            new_iden.append(self.faker_.month_name())
                        case w if w in b:
                            new_iden.append(self.faker_.month_name()[:3])
                        case "-":
                            new_iden.append("-")
                        case ".":
                            new_iden.append(".")
                        case ",":
                            new_iden.append(",")
                        case "/":
                            new_iden.append("/")
                        case _:
                            new_iden.append(id)
            else:
                new_iden.append(id)
            i = i + 1
        return new_iden
    ###
    ### Applies the NER model that matches the detected language
    ###
    def aplicar_modelo(self, _sentences, idioma, etiquetas):
        if idioma == "es":
            self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
            tokenized_text = [self.tokenizer.tokenize(sentence[:500]) for sentence in _sentences]
            ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
            MAX_LEN = 128
            ids = pad_sequences(ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
            input_ids = torch.tensor(ids)
            self.model = RobertaForTokenClassification.from_pretrained("BSC-LT/roberta_model_for_anonimization")
            with torch.no_grad():
                logits = self.model(input_ids).logits
            predicted_token_class_ids = logits.argmax(-1)
            i = 0
            _predicted_tokens_classes = []
            for a in predicted_token_class_ids:
                _predicted_tokens_classes.append([self.model.config.id2label[t.item()] for t in predicted_token_class_ids[i]])
                i = i + 1
            new_tokens = []
            ig_tok = []
            i = 0
            new_identificadores = []
            for item in tokenized_text:
                aux1, aux2 = self.reordenacion_tokens_es(item, "Ġ")
                new_tokens.append(aux1)
                ig_tok.append(aux2)
            for items in _predicted_tokens_classes:
                aux = self.reordenacion_identificadores(ig_tok[i], items, len(new_tokens[i]))
                new_identificadores.append(aux)
                i = i + 1
            return new_identificadores, new_tokens
        else:
            print('idioma:', idioma)
            self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
            self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
            sentences_list = _sentences.tolist()
            inputs = self.tokenizer(sentences_list, padding=True, truncation=True, return_tensors="pt", max_length=512)
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)
            id2label = self.model.config.id2label
            all_tokens = []
            all_label_ids = []
            all_labels = []
            for i, sentence in enumerate(sentences_list):
                tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[i])
                label_ids = predictions[i].tolist()
                labels = [id2label[label_id] for label_id in label_ids]
                all_tokens.append(tokens)
                all_label_ids.append(label_ids)
                all_labels.append(labels)
            # drop the first and last element (special tokens) of every sentence
            for item in all_tokens:
                item.pop(0)
                item.pop()
            for item in all_labels:
                item.pop(0)
                item.pop()
            new_tokens = []
            ig_tok = []
            i = 0
            new_identificadores = []
            for item in all_tokens:
                aux1, aux2 = self.reordenacion_tokens(item, "#")
                new_tokens.append(aux1)
                ig_tok.append(aux2)
            print('ig_tok')
            print(ig_tok)
            i = 0
            for items in all_labels:
                aux = self.reordenacion_identificadores(ig_tok[i], items, len(new_tokens[i]))
                new_identificadores.append(aux)
                i = i + 1
            # discard any remaining special tokens ([PAD], etc.) and their labels
            special_tokens = self.tokenizer.all_special_tokens
            tok_new = []
            lab_new = []
            for token_linea, label_linea in zip(new_tokens, new_identificadores):
                filtered_tokens = []
                filtered_labels = []
                for token, label in zip(token_linea, label_linea):
                    if token not in special_tokens:
                        filtered_tokens.append(token)
                        filtered_labels.append(label)
                tok_new.append(filtered_tokens)
                lab_new.append(filtered_labels)
            return lab_new, tok_new
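    # Illustrative (hypothetical) shape of the return value for two input sentences:
    #   labels -> [['B-PER', 'O'], ['O', 'B-LOC']]
    #   tokens -> [['Maria', 'came'], ['to', 'Madrid']]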
    ###
    ### Combines the tokens produced from the input text with the predicted labels
    ### to rebuild the per-word output
    ###
    def salida_texto(self, tokens, pre_tokens):
        new_labels = []
        i = 0
        for token in tokens:
            if pre_tokens[i] == 'O' or 'MISC' in pre_tokens[i]:
                new_labels.append(' ' + token.replace('▁', '').replace('Ġ', ''))
            else:
                new_labels.append(' ' + pre_tokens[i])
            i = i + 1
        return ''.join(new_labels)
    def salida_texto2(self, tokens, labels, etiquetas):
        # Note: despite the parameter names, `tokens` holds the label lists and
        # `labels` holds the token lists (see the call in procesar()).
        i = 0
        out = []
        for iden in labels:
            if etiquetas:
                out.append(self.salida_texto(iden, np.array(tokens[i])))
            else:
                out.append(self.salida_texto(iden, self.reemplazo_fake(np.array(tokens[i]), labels[i])))
            i = i + 1
        return out

    def unir_array(self, _out):
        # Join every per-sentence list of strings into a single string.
        salida = []
        for item in _out:
            salida.append("".join(str(x) for x in item))
        return salida

    def unir_columna_valores(self, df, columna):
        out = ','.join(df[columna])
        return out
###
### Helper class to process JSON files; receives the file path
###
class utilJSON:
    def __init__(self, archivo):
        with open(archivo, encoding='utf-8') as f:
            self.data = json.load(f)

    def obtener_keys_json(self, data):
        out = []
        for key in data:
            out.append(key)
        return out
    ###
    ### The "flatten_json" function is taken from https://levelup.gitconnected.com/a-deep-dive-into-nested-json-to-data-frame-with-python-69bdabb41938
    ### Renu Khandelwal, Jul 23, 2023
    ###
    def flatten_json(self, y):
        try:
            out = {}

            def flatten(x, name=''):
                if type(x) is dict:
                    for a in x:
                        flatten(x[a], name + a + '_')
                elif type(x) is list:
                    i = 0
                    for a in x:
                        flatten(a, name + str(i) + '_')
                        i += 1
                else:
                    out[name[:-1]] = x

            flatten(y)
            return out
        except json.JSONDecodeError:
            print("Error: The JSON document could not be decoded.")
        except TypeError:
            print("Error: Invalid operation or function argument type.")
        except KeyError:
            print("Error: One or more keys do not exist.")
        except ValueError:
            print("Error: Invalid value detected.")
        except Exception as e:
            print(f"An unexpected error occurred: {str(e)}")
    def obtener_dataframe(self, data):
        # Flatten the JSON into a DataFrame; if there is a single top-level key,
        # its value is assumed to contain the list of records.
        claves = self.obtener_keys_json(data)
        if len(claves) == 1:
            data_flattened = [self.flatten_json(class_info) for class_info in data[claves[0]]]
            df = pd.DataFrame(data_flattened)
        else:
            data_flattened = [self.flatten_json(class_info) for class_info in data]
            df = pd.DataFrame(data_flattened)
        return df
modelo = ModeloDataset()
model = Model()

def get_model():
    return model

###
### Function wired to the Gradio interface to process plain text, CSV or JSON input
###
def procesar(texto, archivo, etiquetas):
    # Plain text input: detect the language, run NER and return the processed text.
    if len(texto) > 0:
        print('text')
        model.identificacion_idioma(texto[:1700])
        labels, textoProcesado, labels_fake, coincidencia = model.predict(etiquetas)
        return model.idioma + "/" + model.categoria_texto, labels, textoProcesado, gr.Dataframe(), gr.File(), labels_fake, coincidencia
    else:
        if archivo.name.split(".")[-1] == "csv":
            print('csv')
            # df = pd.read_csv(archivo.name, delimiter=";", encoding='latin-1')
            df = pd.read_csv(archivo.name, delimiter=";")
            df_new = pd.DataFrame(columns=df.columns.values)
            model.identificacion_idioma(df.iloc[0, 0])
            modelo.idioma = model.idioma
            print(model.idioma)
            for item in df.columns.values:
                sentences = df[item]
                ides, predicted = modelo.aplicar_modelo(sentences, model.idioma, etiquetas)
                out = modelo.salida_texto2(ides, predicted, etiquetas)
                print('out csv:', out)
                df_new[item] = modelo.unir_array(out)
            return modelo.idioma, "", "", df_new, df_new.to_csv(sep='\t', encoding='utf-8', index=False), "", ""
        else:
            print('json')
            if archivo.name.split(".")[-1] == "json":
                util = utilJSON(archivo.name)
                df = util.obtener_dataframe(util.data)
                df_new = pd.DataFrame(columns=df.columns.values)
                model.identificacion_idioma(df.iloc[0, 0])
                modelo.idioma = model.idioma
                for item in df.columns.values:
                    sentences = df[item]
                    ides, predicted = modelo.aplicar_modelo(sentences, modelo.idioma, etiquetas)
                    out = modelo.salida_texto2(ides, predicted, etiquetas)
                    print('out json:', out)
                    df_new[item] = modelo.unir_array(out)
                return modelo.idioma, "", "", df_new, df_new.to_csv(sep='\t', encoding='utf-8', index=False), "", ""
demo = gr.Interface(
    fn=procesar,
    inputs=["text", gr.File(), "checkbox"],
    outputs=[
        gr.Label(label="idioma/categoría"),
        gr.Textbox(label="etiquetas"),
        gr.Textbox(label="texto procesado"),
        gr.Dataframe(label="Datos procesados en dataframe", interactive=False),
        gr.Textbox(label="datos csv"),
        gr.Textbox(label="etiquetas anonimizadas"),
        gr.Label(label="coincidencia tokens originales vs anonimizados"),
    ],
)

demo.launch(share=True)