|
--- |
|
license: mit |
|
base_model: xlm-roberta-base |
|
tags: |
|
- generated_from_trainer |
|
metrics: |
|
- precision |
|
- recall |
|
- f1 |
|
- accuracy |
|
model-index: |
|
- name: xlm-roberta-base-kyrgyzNER |
|
results: [] |
|
language: |
|
- ky |
|
--- |
|
|
|
|
|
|
|
# kyrgyzNER model (xlm-roberta-base) by The_Cramer_Project |
|
|
|
|
|
- The original repository: https://github.com/Akyl-AI/KyrgyzNER |
|
- Paper will be uploaded soon |
|
- The KyrgyzNER dataset and code will be uploaded soon |
|
|
|
|
|
This model is a fine-tuned version of [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) on the KyrgyzNER dataset. |
|
It achieves the following results on the evaluation set: |
|
- Loss: 0.3273 |
|
- Precision: 0.7090 |
|
- Recall: 0.6946 |
|
- F1: 0.7017 |
|
- Accuracy: 0.9119 |
|
|
|
|
|
## How to use |
|
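You can use this model with the Transformers `pipeline` for NER. The pipeline returns generic `LABEL_<n>` names, so the example below maps them to the actual entity tags with an `id2label` dictionary. |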
|
|
|
```python |
|
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig |
|
from transformers import pipeline |
|
|
|
id2label = { |
|
'LABEL_0': 'B-NATIONAL', |
|
'LABEL_1': 'I-PLANT', |
|
'LABEL_2': 'I-ORGANISATION', |
|
'LABEL_3': 'B-ORGANISATION', |
|
'LABEL_4': 'B-MEDIA', |
|
'LABEL_5': 'I-ARTIFACT', |
|
'LABEL_6': 'B-AWARD', |
|
'LABEL_7': 'B-UNKNOWN', |
|
'LABEL_8': 'I-LOCATION', |
|
'LABEL_9': 'B-PERSON', |
|
'LABEL_10': 'I-LEGAL', |
|
'LABEL_11': 'B-BUSINESS', |
|
'LABEL_12': 'B-ACRONYM', |
|
'LABEL_13': 'I-PERIOD', |
|
'LABEL_14': 'B-INSTITUTION', |
|
'LABEL_15': 'I-MEASURE', |
|
'LABEL_16': 'B-CREATION', |
|
'LABEL_17': 'I-ACRONYM', |
|
'LABEL_18': 'I-AWARD', |
|
'LABEL_19': 'I-WEBSITE', |
|
'LABEL_20': 'B-PERIOD', |
|
'LABEL_21': 'I-PERSON', |
|
'LABEL_22': 'I-PERSON_TYPE', |
|
'LABEL_23': 'B-SUBSTANCE', |
|
'LABEL_24': 'O', |
|
'LABEL_25': 'B-PLANT', |
|
'LABEL_26': 'I-INSTITUTION', |
|
'LABEL_27': 'I-SUBSTANCE', |
|
'LABEL_28': 'I-INSTALLATION', |
|
'LABEL_29': 'B-CONCEPT', |
|
'LABEL_30': 'B-TITLE', |
|
'LABEL_31': 'I-EVENT', |
|
'LABEL_32': 'B-ARTIFACT', |
|
'LABEL_33': 'B-MEASURE', |
|
'LABEL_34': 'B-LOCATION', |
|
'LABEL_35': 'I-BUSINESS', |
|
'LABEL_36': 'B-ANIMAL', |
|
'LABEL_37': 'B-PERSON_TYPE', |
|
'LABEL_38': 'B-INSTALLATION', |
|
'LABEL_39': 'I-TITLE', |
|
'LABEL_40': 'B-IDENTIFIER', |
|
'LABEL_41': 'I-IDENTIFIER', |
|
'LABEL_42': 'B-LEGAL', |
|
'LABEL_43': 'I-MEDIA', |
|
'LABEL_44': 'I-CONCEPT', |
|
'LABEL_45': 'I-UNKNOWN', |
|
'LABEL_46': 'B-EVENT', |
|
'LABEL_47': 'B-WEBSITE', |
|
'LABEL_48': 'I-NATIONAL', |
|
'LABEL_49': 'I-CREATION', |
|
'LABEL_50': 'I-ANIMAL'} |
|
|
|
model_ckpt = "TTimur/xlm-roberta-base-kyrgyzNER" |
|
|
|
config = AutoConfig.from_pretrained(model_ckpt) |
|
tokenizer = AutoTokenizer.from_pretrained(model_ckpt) |
|
model = AutoModelForTokenClassification.from_pretrained(model_ckpt, config = config) |
|
|
|
# aggregation_strategy = "none" |
|
nlp = pipeline("ner", model = model, tokenizer = tokenizer, aggregation_strategy = "none") |
|
|
|
example = "Кыргызстан Орто Азиянын түндүк-чыгышында орун алган мамлекет." |
|
ner_results = nlp(example) |
|
for result in ner_results: |
|
result.update({'entity': id2label[result['entity']]}) |
|
print(result) |
|
|
|
# output: |
|
# {'entity': 'B-LOCATION', 'score': 0.95103735, 'index': 1, 'word': '▁Кыргызстан', 'start': 0, 'end': 10} |
|
# {'entity': 'B-LOCATION', 'score': 0.79447913, 'index': 2, 'word': '▁Ор', 'start': 11, 'end': 13} |
|
# {'entity': 'I-LOCATION', 'score': 0.8703734, 'index': 3, 'word': 'то', 'start': 13, 'end': 15} |
|
# {'entity': 'I-LOCATION', 'score': 0.942387, 'index': 4, 'word': '▁Азия', 'start': 16, 'end': 20} |
|
# {'entity': 'I-LOCATION', 'score': 0.8542615, 'index': 5, 'word': 'нын', 'start': 20, 'end': 23} |
|
# {'entity': 'I-LOCATION', 'score': 0.70930535, 'index': 6, 'word': '▁түн', 'start': 24, 'end': 27} |
|
# {'entity': 'I-LOCATION', 'score': 0.6540094, 'index': 7, 'word': 'дүк', 'start': 27, 'end': 30} |
|
# {'entity': 'I-LOCATION', 'score': 0.63446337, 'index': 8, 'word': '-', 'start': 30, 'end': 31} |
|
# {'entity': 'I-LOCATION', 'score': 0.6204858, 'index': 9, 'word': 'чы', 'start': 31, 'end': 33} |
|
# {'entity': 'I-LOCATION', 'score': 0.6786872, 'index': 10, 'word': 'г', 'start': 33, 'end': 34} |
|
# {'entity': 'I-LOCATION', 'score': 0.64190257, 'index': 11, 'word': 'ыш', 'start': 34, 'end': 36} |
|
# {'entity': 'O', 'score': 0.64438057, 'index': 12, 'word': 'ында', 'start': 36, 'end': 40} |
|
# {'entity': 'O', 'score': 0.9916931, 'index': 13, 'word': '▁орун', 'start': 41, 'end': 45} |
|
# {'entity': 'O', 'score': 0.9953047, 'index': 14, 'word': '▁алган', 'start': 46, 'end': 51} |
|
# {'entity': 'O', 'score': 0.9901377, 'index': 15, 'word': '▁мамлекет', 'start': 52, 'end': 60} |
|
# {'entity': 'O', 'score': 0.99605453, 'index': 16, 'word': '.', 'start': 60, 'end': 61} |
|
|
|
|
|
token = "" |
|
label_list = [] |
|
token_list = [] |
|
|
|
for result in ner_results: |
|
if result["word"].startswith("▁"): |
|
if token: |
|
token_list.append(token.replace("▁", "")) |
|
token = result["word"] |
|
label_list.append(result["entity"]) |
|
else: |
|
token += result["word"] |
|
|
|
token_list.append(token.replace("▁", "")) |
|
|
|
for token, label in zip(token_list, label_list): |
|
print(f"{token}\t{label}") |
|
|
|
|
|
# output: |
|
# Кыргызстан B-LOCATION |
|
# Орто B-LOCATION |
|
# Азиянын I-LOCATION |
|
# түндүк-чыгышында I-LOCATION |
|
# орун O |
|
# алган O |
|
# мамлекет. O |
|
|
|
# aggregation_strategy = "simple" |
|
nlp = pipeline("ner", model = model, tokenizer = tokenizer, aggregation_strategy = "simple") |
|
example = "Кыргызстан Орто Азиянын түндүк-чыгышында орун алган мамлекет." |
|
|
|
ner_results = nlp(example) |
|
for result in ner_results: |
|
result.update({'entity_group': id2label[result['entity_group']]}) |
|
print(result) |
|
|
|
# output: |
|
# {'entity_group': 'B-LOCATION', 'score': 0.87275827, 'word': 'Кыргызстан Ор', 'start': 0, 'end': 13} |
|
# {'entity_group': 'I-LOCATION', 'score': 0.73398614, 'word': 'то Азиянын түндүк-чыгыш', 'start': 13, 'end': 36} |
|
# {'entity_group': 'O', 'score': 0.92351407, 'word': 'ында орун алган мамлекет.', 'start': 36, 'end': 61} |
|
|
|
``` |
|
|
|
|
|
# NE classes |
|
|
|
**PERSON**, **LOCATION**, **MEASURE**, **INSTITUTION**, **PERIOD**, **ORGANISATION**, **MEDIA**, **TITLE**, **BUSINESS**, **LEGAL**, **EVENT**, **ARTIFACT**, **INSTALLATION**, **PERSON_TYPE**, **NATIONAL**, **CONCEPT**, **CREATION**, **WEBSITE**, **SUBSTANCE**, **ACRONYM**, **IDENTIFIER**, **UNKNOWN**, **AWARD**, **ANIMAL**, **PLANT** |