Update config and tokenizer

Files changed:

- README.md (+25 -4)
- config.json (+2 -1)
- merges.txt (+0 -0)
- special_tokens_map.json (+1 -1)
- tokenizer.json (+0 -0)
- tokenizer_config.json (+1 -1)
- vocab.json (+0 -0)

README.md
````diff
@@ -1,14 +1,14 @@
 ---
 language: ti
 widget:
-- text: "ዓቕሚ
+- text: "ዓቕሚ መንእሰይ ኤርትራ <mask> ተራእዩ"
 ---
 
-# RoBERTa Pretrained for Tigrinya Language
+# TiRoBERTa: RoBERTa Pretrained for the Tigrinya Language
 
 We pretrain a RoBERTa base model for Tigrinya on a dataset of 40 million tokens, trained for 40 epochs.
 
-Contained in this repo
+Contained in this repo are the original pretrained Flax model, trained on a TPU v3-8, and its corresponding PyTorch version.
 
 
 ## Hyperparameters
@@ -17,6 +17,27 @@ The hyperparameters corresponding to model sizes mentioned above are as follows:
 
 | Model Size | L  | AH | HS  | FFN  | P    | Seq |
 |------------|----|----|-----|------|------|-----|
-| BASE       | 12 | 12 | 768 | 3072 | 125M |
+| BASE       | 12 | 12 | 768 | 3072 | 125M | 512 |
 
 (L = number of layers; AH = number of attention heads; HS = hidden size; FFN = feedforward network dimension; P = number of parameters; Seq = maximum sequence length.)
+
+### Framework versions
+
+- Transformers 4.12.0.dev0
+- PyTorch 1.9.0+cu111
+- Datasets 1.13.3
+- Tokenizers 0.10.3
+
+
+## Citation
+
+If you use this model in your product or research, please cite as follows:
+
+```
+@article{Fitsum2021TiPLMs,
+  author={Fitsum Gaim and Wonsuk Yang and Jong C. Park},
+  title={Monolingual Pre-trained Language Models for Tigrinya},
+  year=2021,
+  publisher={WiNLP 2021 at EMNLP 2021}
+}
+```
````
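The widget line added above is a ready-made fill-mask query. The snippet below is a minimal sketch of running it locally through the transformers `fill-mask` pipeline; `your-namespace/TiRoBERTa` is a placeholder repo id, since the commit page does not show the actual hub path.

```python
# Sketch: run the README's widget example through the fill-mask pipeline.
# "your-namespace/TiRoBERTa" is a placeholder, not the real hub path.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="your-namespace/TiRoBERTa")

# The Tigrinya sentence from the widget, with one <mask> slot to fill.
for pred in fill_mask("ዓቕሚ መንእሰይ ኤርትራ <mask> ተራእዩ"):
    print(f"{pred['token_str']!r}  score={pred['score']:.3f}")
```

The pipeline returns the top candidate tokens sorted by score, exactly what the hub widget renders.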
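For reference, the BASE row of the hyperparameter table maps onto a transformers `RobertaConfig` as sketched below. This is an illustration, not a file from the repo; the 514 position embeddings follow RoBERTa's convention of reserving two extra slots beyond the 512-token sequence length, which the table itself does not state.

```python
# Sketch only: a RobertaConfig matching the BASE row of the table above.
from transformers import RobertaConfig

config = RobertaConfig(
    num_hidden_layers=12,         # L
    num_attention_heads=12,       # AH
    hidden_size=768,              # HS
    intermediate_size=3072,       # FFN
    max_position_embeddings=514,  # Seq = 512; RoBERTa reserves 2 extra slots
    vocab_size=50265,             # from config.json below
)
```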
config.json

```diff
@@ -5,6 +5,7 @@
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
+  "classifier_dropout": null,
   "eos_token_id": 2,
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
@@ -20,7 +21,7 @@
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.12.0.dev0",
   "type_vocab_size": 1,
   "use_cache": true,
   "vocab_size": 50265
```
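The new `classifier_dropout` field is set to `null`; in transformers classification heads, a `None` value falls back to `hidden_dropout_prob`, so the addition preserves the previous behavior. A small sketch, assuming the repo files are checked out in the current directory:

```python
# Sketch: how a null classifier_dropout is resolved by classification heads.
from transformers import RobertaConfig

config = RobertaConfig.from_pretrained(".")  # assumes a local checkout

dropout = (
    config.classifier_dropout
    if config.classifier_dropout is not None
    else config.hidden_dropout_prob
)
print(dropout)  # 0.1 with the standard RoBERTa hidden_dropout_prob
```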
merges.txt

The diff for this file is too large to render.
special_tokens_map.json

```diff
@@ -1 +1 @@
-{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
+{"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
```
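The rewrite expands each special token from a bare string into a full dict, making the `lstrip`/`rstrip`/`normalized` flags explicit instead of leaving them to loader defaults. A quick inspection sketch, assuming a local copy of the file:

```python
# Sketch: confirm every special token now carries explicit flags.
import json

with open("special_tokens_map.json") as f:
    tokens = json.load(f)

for name, spec in tokens.items():
    print(f"{name}: {spec['content']!r} lstrip={spec['lstrip']}")
# Only mask_token has lstrip=True, so "<mask>" absorbs the space before it.
```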
tokenizer.json

The diff for this file is too large to render.
tokenizer_config.json

```diff
@@ -1 +1 @@
-{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "
+{"errors": "replace", "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "special_tokens_map_file": null, "name_or_path": "./", "tokenizer_class": "RobertaTokenizer"}
```
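Because the mask token is declared with `lstrip: true`, encoding a sentence that contains `<mask>` lets the token absorb the preceding space instead of producing a stray space piece. A round-trip sketch, assuming the repo is available locally:

```python
# Sketch: load the updated tokenizer and check the mask token's behaviour.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(".")  # assumes a local checkout

ids = tokenizer("ዓቕሚ <mask>")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
# Expected shape: ['<s>', ..., '<mask>', '</s>']; the space before <mask>
# is stripped into the mask token rather than tokenized separately.
```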
vocab.json

The diff for this file is too large to render.