import streamlit as st
import pandas as pd
import re
import json
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

st.set_page_config(
    page_title="Named Entity Recognition Wolof",
    page_icon="🌍"
)


def convert_df(df: pd.DataFrame):
    """Serialize the results DataFrame to CSV bytes for download."""
    return df.to_csv(index=False).encode('utf-8')


def convert_json(df: pd.DataFrame):
    """Serialize the results DataFrame to a JSON string keyed by row index."""
    result = df.to_json(orient="index")
    parsed = json.loads(result)
    json_string = json.dumps(parsed)
    return json_string


@st.cache_resource
def load_model():
    """Load the fine-tuned Wolof NER model and its tokenizer (cached across reruns)."""
    model = AutoModelForTokenClassification.from_pretrained("vonewman/wolof-finetuned-ner")
    tokenizer = AutoTokenizer.from_pretrained("vonewman/wolof-finetuned-ner")
    return model, tokenizer


def align_word_ids(texts, tokenizer):
    """Mark the first sub-token of each word with 1 and everything else with -100."""
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=218, truncation=True)
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            # Special tokens and padding are ignored.
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # Keep only the first sub-token of each word.
            label_ids.append(1)
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def predict_ner_labels(model, tokenizer, sentence):
    """Run the model on one sentence and return the predicted tag for each word."""
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = model.to(device)
    model.eval()

    text = tokenizer(sentence, padding='max_length', max_length=218, truncation=True, return_tensors="pt")
    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence, tokenizer)).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(input_ids=input_id, attention_mask=mask).logits

    # Keep only the logits of the first sub-token of each word, then take the argmax.
    logits_clean = logits[label_ids != -100]
    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [id2tag[i] for i in predictions]
    return prediction_label


# Mapping from model output indices to BIO entity tags.
id2tag = {0: 'O', 1: 'B-LOC', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-DATE', 6: 'B-DATE', 7: 'I-ORG', 8: 'I-LOC'}


def tag_sentence(text):
    """Tag every word of the input sentence and return the result as a DataFrame."""
    model, tokenizer = load_model()
    predictions = predict_ner_labels(model, tokenizer, text)
    df = pd.DataFrame({'words': text.split(), 'tags': predictions})
    return df


st.title("🌍 Named Entity Recognition Wolof")

with st.form(key='my_form'):
    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
    submit_button = st.form_submit_button(label='🏷️ Create tags')


if submit_button:
    if re.sub(r'\s+', '', x1) == '':
        st.error('Please enter a non-empty sentence.')
    elif re.match(r'\A\s*\w+\s*\Z', x1):
        st.error("Please enter a sentence with more than one word.")
    else:
        st.markdown("### Tagged Sentence")
        st.header("")

        results = tag_sentence(x1)

        cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
        with c1:
            csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(results),
                                           file_name="results.csv", mime='text/csv', key='csv')
        with c2:
            textbutton = st.download_button(label="📥 Download .txt", data=convert_df(results),
                                            file_name="results.txt", mime='text/plain', key='text')
        with c3:
            jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(results),
                                            file_name="results.json", mime='application/json', key='json')

        st.header("")

        c1, c2, c3 = st.columns([1, 3, 1])
        with c2:
            st.table(results[['words', 'tags']])


st.header("")
st.header("")
st.header("")

with st.expander("ℹ️ - About this app", expanded=True):
    st.write(
        """
        - The **Named Entity Recognition Wolof** app is a tool that performs named entity recognition on Wolof text.
        - The supported entity types are *organization*, *location*, *person*, and *date*.
        - The app uses an [XLM-RoBERTa model](https://huggingface.co/xlm-roberta-base) fine-tuned on the [MasakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
        - The model relies on a **SentencePiece subword tokenizer**: each sentence is first split into subword tokens before tagging.
        """
    )
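

# For reference only: a minimal, hedged sketch (not used by the app above) of how the
# same checkpoint could be queried directly through the transformers pipeline API,
# e.g. for quick experiments outside Streamlit. The function name and the
# aggregation_strategy choice are illustrative assumptions, not part of this app.
def pipeline_demo(sentence: str):
    from transformers import pipeline

    ner = pipeline(
        "token-classification",
        model="vonewman/wolof-finetuned-ner",
        aggregation_strategy="simple",  # merge sub-word pieces into word-level entities
    )
    return ner(sentence)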