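# Gradio demo for text anonymization:
#   1. Detect the language of the input text (papluca/xlm-roberta-base-language-detection).
#   2. Load a language-specific NER model: BSC-LT/roberta_model_for_anonimization for Spanish,
#      FacebookAI/xlm-roberta-large-finetuned-conll03-english otherwise.
#   3. Classify the text into a news category (elozano/bert-base-cased-news-category).
#   4. Replace detected PER/ORG/LOC entities with Faker-generated values and return
#      the original text, a JSON list of detected entities, and the anonymized text.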
import gradio as gr
import torch
import json
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    RobertaForTokenClassification,
)
from json import JSONEncoder
from faker import Faker
class out_json():
    """Container for a (word, label) pair emitted in the JSON output."""
    def __init__(self, w, l):
        self.word = w
        self.label = l

class MyEncoder(JSONEncoder):
    """JSON encoder that serializes plain objects through their __dict__."""
    def default(self, o):
        return o.__dict__
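# For example (hypothetical values), MyEncoder().encode([out_json("John", "B-PER")])
# produces '[{"word": "John", "label": "B-PER"}]'.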
class Model:
    def __init__(self):
        self.texto = ""
        self.idioma = ""
        self.modelo_ner = ""
        self.categoria_texto = ""
    def identificacion_idioma(self, text):
        """Detect the language of `text` and pick the NER model and Faker locale accordingly."""
        self.texto = text
        tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
        model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        preds = torch.softmax(logits, dim=-1)
        id2lang = model.config.id2label
        vals, idxs = torch.max(preds, dim=1)
        # keep the language with the highest probability
        maximo = vals.max()
        idioma = ''
        porcentaje = 0
        for k, v in zip(idxs, vals):
            if v.item() == maximo:
                idioma, porcentaje = id2lang[k.item()], v.item()
        if idioma == 'es':
            self.idioma = "es"
            self.modelo_ner = 'BSC-LT/roberta_model_for_anonimization'
            self.faker_ = Faker('es_MX')
            self.model = RobertaForTokenClassification.from_pretrained(self.modelo_ner)
        else:
            self.idioma = "en"
            self.faker_ = Faker('en_US')
            self.modelo_ner = "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
            self.model = AutoModelForTokenClassification.from_pretrained(self.modelo_ner)
        self.categorizar_texto(self.texto)
    def reordenacion_tokens(self, tokens):
        """Merge sub-word pieces back into whole words and record the indices of the absorbed pieces."""
        i = 0
        new_tokens = []
        ig_tokens = []  # indices to ignore in the label array
        for token in tokens:
            ind = len(new_tokens)
            if i < len(tokens):
                if token.startswith("▁"):
                    new_tokens.append(token)
                    i = i + 1
                else:
                    new_tokens[ind - 1] = new_tokens[ind - 1] + token
                    ig_tokens.append(i)
                    i = i + 1
        return (
            new_tokens,
            ig_tokens
        )
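    # For example, the SentencePiece pieces ['▁New', '▁York', 'er'] are merged into
    # ['▁New', '▁Yorker'], with ig_tokens == [2] marking the absorbed piece.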
    def reordenacion_identificadores(self, ig_tokens, predicted_tokens_classes):
        """Drop the predicted labels whose index appears in ig_tokens."""
        x = 0
        new_identificadores = []
        for token in predicted_tokens_classes:
            if x not in ig_tokens:
                new_identificadores.append(token)
            x = x + 1
        return new_identificadores
    def salida_json(self, tokens, pre_tokens):
        """Return a JSON string containing only the tokens labelled as sensitive (label != 'O')."""
        lista = []
        i = 0
        for t in tokens:
            if pre_tokens[i] != 'O':
                a = out_json(t.replace('▁', '').replace('Ġ', ''), pre_tokens[i].replace('▁', ''))
                lista.append(a)
            i = i + 1
        return MyEncoder().encode(lista)
    def salida_texto(self, tokens, pre_tokens):
        """Rebuild the text, keeping 'O' and MISC tokens and replacing the rest with their label."""
        new_labels = []
        i = 0
        for token in tokens:
            if pre_tokens[i] == 'O' or 'MISC' in pre_tokens[i]:
                new_labels.append(' ' + token.replace('▁', ''))
            else:
                new_labels.append(' ' + pre_tokens[i])
            i = i + 1
        return ''.join(new_labels)
    def salida_texto_anonimizado(self, ids, pre_tokens):
        """Rebuild the text from token ids, substituting every labelled token with its replacement."""
        new_labels = []
        i = 0
        for identificador in pre_tokens:
            if identificador == 'O' or 'OTH' in identificador:
                new_labels.append(self.tokenizer.decode(ids[i]))
            else:
                new_labels.append(' ' + identificador)
            i = i + 1
        return ''.join(new_labels)
    def formato_salida(self, out):
        a = ""
        for i in out:
            a = a + i.replace('▁', '').replace(' ', '') + ' '
        return a
    def fake_pers(self):
        return self.faker_.name()
    def fake_word(self):
        return self.faker_.word()
    def fake_first_name(self):
        return self.faker_.first_name()
    def fake_last_name(self):
        return self.faker_.last_name()
    def fake_address(self):
        return self.faker_.address()
    def fake_sentence(self, n):
        return self.faker_.sentence(nb_words=n)
    def fake_text(self):
        return self.faker_.text()
    def fake_company(self):
        return self.faker_.company()
    def fake_city(self):
        return self.faker_.city()
    def reemplazo_fake(self, identificadores):
        """Swap each entity label for a fake value of the matching type (person, company, city)."""
        new_iden = []
        for iden in identificadores:
            if 'PER' in iden:
                new_iden.append(self.fake_first_name())
            elif 'ORG' in iden:
                new_iden.append(self.fake_company())
            elif 'LOC' in iden:
                new_iden.append(self.fake_city())
            else:
                new_iden.append(iden)
        return new_iden
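    # For example, ['B-PER', 'O', 'B-LOC'] comes back as a list such as
    # ['Lucia', 'O', 'Guadalajara'] (hypothetical values from the active Faker locale;
    # they change on every run).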
    def categorizar_texto(self, texto):
        """Classify the text into a news category and return (category, probability)."""
        name = "elozano/bert-base-cased-news-category"
        tokenizer = AutoTokenizer.from_pretrained(name)
        model_ = AutoModelForSequenceClassification.from_pretrained(name)
        inputs_ = tokenizer(texto, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model_(**inputs_).logits
        preds = torch.softmax(logits, dim=-1)
        id2lang = model_.config.id2label
        vals, idxs = torch.max(preds, dim=1)
        # keep the category with the highest probability
        maximo = vals.max()
        cat = ''
        self.categoria_texto = ''
        porcentaje = 0
        for k, v in zip(idxs, vals):
            if v.item() == maximo:
                cat, porcentaje = id2lang[k.item()], v.item()
                self.categoria_texto = cat
        return cat, porcentaje
    def predict(self):
        categoria, porcentaje = self.categorizar_texto(self.texto)
        print(categoria, porcentaje)
        self.tokenizer = AutoTokenizer.from_pretrained(self.modelo_ner)
        tokens = self.tokenizer.tokenize(self.texto)
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = torch.tensor([ids])
        with torch.no_grad():
            logits = self.model(input_ids).logits
        predicted_token_class_ids = logits.argmax(-1)
        predicted_tokens_classes = [self.model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        labels = predicted_token_class_ids
        loss = self.model(input_ids, labels=labels).loss
        if self.idioma == 'es':
            out1 = self.salida_json(tokens, predicted_tokens_classes)  # Spanish: only the sensitive words
            out2 = self.salida_texto_anonimizado(ids, self.reemplazo_fake(predicted_tokens_classes))  # Spanish: full anonymized text
        else:
            new_tokens, ig_tokens = self.reordenacion_tokens(tokens)
            new_identificadores = self.reordenacion_identificadores(ig_tokens, predicted_tokens_classes)
            out1 = self.salida_json(new_tokens, new_identificadores)
            out2 = self.salida_texto(new_tokens, self.reemplazo_fake(new_identificadores))
        return (
            self.texto,
            out1,
            str(out2)
        )
model = Model()

def get_model():
    return model

def procesar(texto):
    model.identificacion_idioma(texto)
    return model.predict()

# procesar returns three values (original text, entity JSON, anonymized text),
# so the interface exposes three text outputs.
demo = gr.Interface(fn=procesar, inputs="text", outputs=["text", "text", "text"])
demo.launch()
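# A minimal sketch of exercising the pipeline without the web UI (the input string is a
# hypothetical example; assumes the models above can be downloaded):
#
#     m = Model()
#     m.identificacion_idioma("John Smith works for Acme Corp in Boston.")
#     original, entities_json, anonymized = m.predict()
#     print(entities_json)
#     print(anonymized)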