# urchade's picture
# Update app.py
# df65a41 verified
from typing import Dict, Union
from gliner import GLiNER
import gradio as gr
# Load the GLiNER PII checkpoint once at module import; `ner` calls
# `model.predict_entities` on this shared instance for every request.
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
# (text, comma-separated labels) pairs shown as clickable demo examples.
_EXAMPLE_INPUTS = (
    (
        "Pierre Dubois, résident de Paris, a fondé sa propre entreprise, Le Petit Café, située au 15 Rue de la Paix. Son numéro d'entreprise est FR-987654321-1, et il utilise le compte bancaire 9876543210 pour les transactions.",
        "person, organization, address, company registration number, bank account number",
    ),
    (
        "Leticia Ramírez, una habitante de Barcelona, tiene una cita médica programada en el Hospital General de Cataluña, situado en 10 Calle de los Ángeles. Su número de la seguridad social es ES-123456789-A y su grupo sanguíneo es AB+.",
        "person, location, address, social security number, blood type",
    ),
    (
        "John Smith, from London, teaches mathematics at Royal Academy located at 25 King’s Road. His employee ID is UK-987654-321 and he has been working there since 2015.",
        "person, profession, organization, address, employee ID number",
    ),
    (
        "In Frankfurt, Claudia Weber frequently visits her local bank branch, Deutsche Bank, at 48 Hauptstraße. Her account number is DE-1234567890123456, used primarily for her mortgage payments.",
        "person, location, address, bank account number",
    ),
    (
        "Marta Rossi, residente a Roma, ha acquistato un appartamento al 123 Via Condotti. Il numero di registrazione della proprietà è IT-654321-2018 e il mutuo è gestito tramite la Banca d'Italia con numero di conto 3216549870.",
        "person, address, property registration number, bank account number",
    ),
    (
        "Paulo Coelho, um turista do Brasil, fez um seguro de viagem com a empresa Seguros PT antes de sua viagem para Lisboa. O número da apólice é BR-987654321-123 e inclui cobertura médica.",
        "person, nationality, company, insurance policy number, coverage",
    ),
    (
        "Julia Fischer, eine Kundin aus München, hat bei der BayWa AG, einem großen Anbieter von Baustoffen mit Sitz am 77 Industriestraße, einen Kredit aufgenommen. Die Kreditnummer lautet DE-12345678.",
        "person, city, organization, address, loan number",
    ),
    (
        "Carlos Sánchez, profesor en la Universidad de Madrid, reside en el 5 Calle de Alcalá. Su número de identificación de profesor es ES-192837465 y tiene un doctorado en filosofía.",
        "person, profession, address, teacher ID number, degree",
    ),
    (
        "Sophie Dupont, une journaliste française, travaille pour Le Monde, basé au 33 rue des Écoles à Paris. Son numéro d'identification de presse est FR-75649023.",
        "person, profession, organization, address, press ID number",
    ),
    (
        "Manuel Oliveira, um agricultor em Porto, possui uma grande plantação de vinhas na Rua da Estrada, 120. O número de registro agrícola é PT-5678912345.",
        "person, profession, address, agricultural registration number",
    ),
    (
        "Elisa Müller, eine Künstlerin aus Berlin, hat ihre neueste Skulptur im öffentlichen Park am Alexanderplatz ausgestellt. Ihre Künstlernummer lautet DE-112233445.",
        "person, profession, location, artist ID number",
    ),
    (
        "Federico García, un jugador de fútbol de Sevilla, ha firmado un contrato de tres años con el club Real Betis. Su número de licencia deportiva es ES-9876543210.",
        "person, profession, organization, sports license number",
    ),
    (
        "Sarah White, a London-based actress, will be performing in 'Hamlet' at the Globe Theatre located at 21 New Globe Walk. Her Equity membership number is UK-1234567.",
        "person, profession, location, address, membership number",
    ),
    (
        "Ricardo Mello, engenheiro civil, trabalha na construção da nova barragem no Rio Douro, Portugal. Seu número de registro profissional é PT-987654321.",
        "person, profession, project location, professional registration number",
    ),
    (
        "Giuseppe Conti, un cliente di Milano, ha fatto un acquisto presso il negozio La Rinascente situato in Piazza Duomo. Il numero della sua carta di credito è IT-4567891234567891.",
        "person, location, address, credit card number",
    ),
)

# Each example row is [text, labels, threshold, nested_ner]; every row uses
# the same threshold (0.5) and the same nested-NER flag (False).
examples = [
    [sample_text, sample_labels, 0.5, False]
    for sample_text, sample_labels in _EXAMPLE_INPUTS
]
def ner(
    text, labels: str, threshold: float, nested_ner: bool
) -> Dict[str, Union[str, list]]:
    """Run the GLiNER model on ``text`` and shape the result for the UI.

    Args:
        text: Input text to scan for entities.
        labels: Comma-separated entity labels, e.g. ``"person, address"``.
            Whitespace around each label is stripped and empty items are
            dropped, so ``"person, address"`` yields ``["person", "address"]``.
        threshold: Passed to ``model.predict_entities`` as ``threshold``.
        nested_ner: When true, the model is called with ``flat_ner=False``.

    Returns:
        A dict with the original ``"text"`` and an ``"entities"`` list. Each
        entity is a dict with the keys ``"entity"``, ``"word"``, ``"start"``,
        ``"end"`` and ``"score"``. If no usable label is supplied, the
        entity list is empty and the model is not called.
    """
    # Splitting on "," alone keeps the space after each comma, so the model
    # would receive labels such as " organization"; strip and drop blanks.
    label_list = [label.strip() for label in labels.split(",")]
    label_list = [label for label in label_list if label]
    if not label_list:
        return {"text": text, "entities": []}

    predictions = model.predict_entities(
        text, label_list, flat_ner=not nested_ner, threshold=threshold
    )
    return {
        "text": text,
        "entities": [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                # The score is emitted as a constant 0, not a model confidence.
                "score": 0,
            }
            for entity in predictions
        ],
    }
# Build the demo page: model description, local-usage notes, input controls,
# example rows, and the event wiring that calls `ner`.
# NOTE(review): the page title reads "GLiNER-M-v2.1" while the heading and the
# loaded checkpoint refer to the PII model — confirm the title is intentional.
with gr.Blocks(title="GLiNER-M-v2.1") as demo:
    gr.Markdown(
        """
# GLiNER-PII (Personally Identifiable Information extraction)
GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
The model has been trained by fine-tuning urchade/gliner_multi-v2.1 on the urchade/synthetic-pii-ner-mistral-v1 dataset.
## Links
* Model: https://huggingface.co/urchade/gliner_multi_pii-v1
* All GLiNER models: https://huggingface.co/models?library=gliner
* Paper: https://arxiv.org/abs/2311.08526
* Repository: https://github.com/urchade/GLiNER
"""
    )
    with gr.Accordion("How to run this model locally", open=False):
        gr.Markdown(
            """
## Installation
To use this model, you must install the GLiNER Python library:
```
!pip install gliner
```
## Usage
Once you've downloaded the GLiNER library, you can import the GLiNER class. You can then load this model using `GLiNER.from_pretrained` and predict entities with `predict_entities`.
"""
        )
        # NOTE(review): "[email protected]" in the two snippets below looks like
        # an e-mail obfuscation placeholder rather than a sample address —
        # TODO restore the intended address.
        # The `for` body in the displayed sample is indented so the snippet
        # is valid Python when copied.
        gr.Code(
            '''
from gliner import GLiNER
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
text = """
Harilala Rasoanaivo, un homme d'affaires local d'Antananarivo, a enregistré une nouvelle société nommée "Rasoanaivo Enterprises" au Lot II M 92 Antohomadinika. Son numéro est le +261 32 22 345 67, et son adresse électronique est [email protected]. Il a fourni son numéro de sécu 501-02-1234 pour l'enregistrement.
"""
labels = ["work", "booking number", "personally identifiable information", "driver licence", "person", "book", "full address", "company", "actor", "character", "email", "passport number", "Social Security Number", "phone number"]
entities = model.predict_entities(text, labels)
for entity in entities:
    print(entity["text"], "=>", entity["label"])
''',
            language="python",
        )
        gr.Code(
            """
Harilala Rasoanaivo => person
Rasoanaivo Enterprises => company
Lot II M 92 Antohomadinika => full address
+261 32 22 345 67 => phone number
[email protected] => email
501-02-1234 => Social Security Number
"""
        )

    # Controls are pre-filled from the first example row, which is laid out
    # as [text, labels, threshold, nested_ner].
    input_text = gr.Textbox(
        value=examples[0][0], label="Text input", placeholder="Enter your text here"
    )
    with gr.Row() as row:
        labels = gr.Textbox(
            value=examples[0][1],
            label="Labels",
            placeholder="Enter your labels here (comma separated)",
            scale=2,
        )
        threshold = gr.Slider(
            0,
            1,
            value=0.3,
            step=0.01,
            label="Threshold",
            info="Lower the threshold to increase how many entities get predicted.",
            scale=1,
        )
        nested_ner = gr.Checkbox(
            # Index 3 is the nested-NER flag; index 2 is the 0.5 threshold,
            # which is not a valid checkbox default.
            value=examples[0][3],
            label="Nested NER",
            info="Allow for nested NER?",
            scale=0,
        )
    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")
    # NOTE: this rebinds the module-level name `examples` from the list of
    # rows to the gr.Examples component; the list is passed in as the first
    # argument before the rebinding takes effect.
    examples = gr.Examples(
        examples,
        fn=ner,
        inputs=[input_text, labels, threshold, nested_ner],
        outputs=output,
        cache_examples=True,
    )

    # Submitting: every trigger re-runs `ner` with the same inputs and output.
    for trigger in (
        input_text.submit,
        labels.submit,
        threshold.release,
        submit_btn.click,
        nested_ner.change,
    ):
        trigger(
            fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
        )

demo.queue()
demo.launch(debug=True)