nie3e committed on
Commit
7eedecd
1 Parent(s): 361fb91

Initial commit

Files changed (8)
  1. README.md +285 -3
  2. config.json +223 -0
  3. merges.txt +0 -0
  4. model.safetensors +3 -0
  5. special_tokens_map.json +6 -0
  6. tokenizer.json +0 -0
  7. tokenizer_config.json +20 -0
  8. vocab.json +0 -0
README.md CHANGED
@@ -1,3 +1,285 @@
1
- ---
2
- license: mit
3
- ---
1
+ ---
2
+ license: mit
3
+ base_model: openai-community/gpt2
4
+ tags:
5
+ - generated_from_trainer
6
+ metrics:
7
+ - accuracy
8
+ model-index:
9
+ - name: gpt2-lang-ident
10
+ results: []
11
+ ---
12
+
13
+ # gpt2-lang-ident
14
+
15
+ This model is a fine-tuned version of [openai-community/gpt2](https://huggingface.co/openai-community/gpt2) on sentences sampled from the `stanford-oval/ccnews` and `qanastek/EMEA-V3` datasets.
16
+ It achieves the following results on the evaluation set:
17
+ - Loss: 0.1210
18
+ - Accuracy: 0.9721
19
+
20
+ ## Model description
21
+
22
+ This model is trained to predict the language of the input text.
23
+
24
+ ## Intended uses & limitations
25
+
26
+ The model can predict the following languages:
27
+ ```
28
+ ["af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "ca", "ceb", "ckb", "cs", "cy", "da", "de", "dv", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "he", "hi", "hr", "hu", "hy", "id", "is", "it", "ja", "ka", "kk", "kn", "ku", "ky", "la", "lb", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "mt", "my", "nds", "ne", "nl", "nn", "no", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sah", "sd", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "tg", "th", "tk", "tl", "tr", "tt", "ug", "uk", "ur", "vi", "yi"]
29
+ ```
30
+
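If you need the label set programmatically, the same codes are stored in the checkpoint's `config.json` under `id2label`. A minimal sketch (only the config file is downloaded, not the weights):

```python
from transformers import AutoConfig

# id2label maps class indices to the language codes listed above.
config = AutoConfig.from_pretrained("nie3e/gpt2-lang-ident")
languages = [label for _, label in
             sorted(config.id2label.items(), key=lambda kv: int(kv[0]))]
print(len(languages))   # 90
print(languages[:5])    # ['af', 'am', 'ar', 'as', 'az']
```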
31
+ How to use:
32
+
33
+ ```python
34
+ from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
35
+ pipeline)
36
+
37
+ checkpoint = "nie3e/gpt2-lang-ident"
38
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
39
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
40
+
41
+ pipe = pipeline(
42
+ task="text-classification",
43
+ model=model,
44
+ tokenizer=tokenizer,
45
+ top_k=5
46
+ )
47
+
48
+ result = pipe("To jest model służący do identyfikacji języka!")
49
+ print(result)
50
+ ```
51
+ ```
52
+ [[{'label': 'pl', 'score': 0.9999653100967407}, {'label': 'sr', 'score': 1.5228776646836195e-05}, {'label': 'hr', 'score': 1.057955432770541e-05}, {'label': 'bn', 'score': 1.590750912328076e-06}, {'label': 'cs', 'score': 1.3942196801508544e-06}]]
53
+ ```
54
+
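The `pipeline` call above is all you need in practice; for reference, here is a minimal sketch of the same prediction done with a raw forward pass, mapping the highest logit back to a language code via `id2label` (assumes `torch` is installed):

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "nie3e/gpt2-lang-ident"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model.eval()

inputs = tokenizer("To jest model służący do identyfikacji języka!", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, 90): one score per language

probs = logits.softmax(dim=-1)[0]
pred_id = int(probs.argmax())
print(model.config.id2label[pred_id], round(float(probs[pred_id]), 4))  # 'pl' with a score close to 1.0
```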
55
+ ## Training and evaluation data
56
+
57
+ <details><summary>Training data ([lang]: count)</summary>
58
+
59
+ [bn]: 1947
60
+ [ar]: 1947
61
+ [vi]: 1947
62
+ [uk]: 1947
63
+ [kn]: 1947
64
+ [mr]: 1947
65
+ [id]: 1947
66
+ [te]: 1947
67
+ [no]: 1947
68
+ [ru]: 1947
69
+ [he]: 1947
70
+ [az]: 1947
71
+ [ca]: 1946
72
+ [fa]: 1946
73
+ [hi]: 1946
74
+ [th]: 1946
75
+ [tr]: 1946
76
+ [mk]: 1946
77
+ [ta]: 1945
78
+ [sq]: 1945
79
+ [ur]: 1942
80
+ [gu]: 1939
81
+ [ml]: 1936
82
+ [is]: 1738
83
+ [de]: 1543
84
+ [da]: 1521
85
+ [fi]: 1461
86
+ [el]: 1431
87
+ [nl]: 1424
88
+ [fr]: 1408
89
+ [cs]: 1401
90
+ [es]: 1397
91
+ [en]: 1394
92
+ [lt]: 1392
93
+ [hu]: 1379
94
+ [pt]: 1375
95
+ [lv]: 1373
96
+ [it]: 1360
97
+ [pl]: 1355
98
+ [sk]: 1355
99
+ [et]: 1348
100
+ [sl]: 1328
101
+ [sv]: 1300
102
+ [bg]: 1278
103
+ [mt]: 1234
104
+ [ro]: 1218
105
+ [kk]: 1179
106
+ [hy]: 1176
107
+ [or]: 1112
108
+ [pa]: 780
109
+ [sr]: 744
110
+ [as]: 735
111
+ [hr]: 722
112
+ [ne]: 626
113
+ [gl]: 566
114
+ [ckb]: 563
115
+ [ka]: 560
116
+ [ug]: 485
117
+ [ky]: 453
118
+ [eu]: 351
119
+ [ps]: 311
120
+ [tl]: 307
121
+ [fy]: 290
122
+ [mn]: 289
123
+ [si]: 244
124
+ [cy]: 214
125
+ [nn]: 212
126
+ [ku]: 195
127
+ [tg]: 176
128
+ [am]: 141
129
+ [tt]: 121
130
+ [ja]: 104
131
+ [lb]: 93
132
+ [tk]: 72
133
+ [be]: 64
134
+ [sw]: 45
135
+ [af]: 44
136
+ [my]: 40
137
+ [ceb]: 35
138
+ [la]: 33
139
+ [dv]: 20
140
+ [ba]: 19
141
+ [ga]: 19
142
+ [eo]: 19
143
+ [gd]: 16
144
+ [mg]: 15
145
+ [yi]: 14
146
+ [sah]: 14
147
+ [sd]: 11
148
+ [nds]: 11
149
+ </details>
150
+
151
+ <details><summary>Eval data ([lang]: count)</summary>
152
+
153
+ [te]: 195
154
+ [mk]: 195
155
+ [bn]: 195
156
+ [uk]: 195
157
+ [hi]: 195
158
+ [ar]: 195
159
+ [sq]: 195
160
+ [kn]: 195
161
+ [tr]: 195
162
+ [ca]: 195
163
+ [az]: 195
164
+ [fa]: 195
165
+ [ru]: 195
166
+ [mr]: 195
167
+ [id]: 195
168
+ [no]: 195
169
+ [vi]: 195
170
+ [th]: 195
171
+ [he]: 195
172
+ [gu]: 194
173
+ [ml]: 194
174
+ [ta]: 194
175
+ [ur]: 194
176
+ [is]: 174
177
+ [de]: 154
178
+ [da]: 152
179
+ [fi]: 146
180
+ [el]: 143
181
+ [nl]: 142
182
+ [fr]: 141
183
+ [es]: 140
184
+ [cs]: 140
185
+ [en]: 139
186
+ [lt]: 139
187
+ [hu]: 138
188
+ [lv]: 137
189
+ [pt]: 137
190
+ [it]: 136
191
+ [et]: 135
192
+ [pl]: 135
193
+ [sk]: 135
194
+ [sl]: 133
195
+ [sv]: 130
196
+ [bg]: 128
197
+ [mt]: 123
198
+ [ro]: 122
199
+ [hy]: 118
200
+ [kk]: 118
201
+ [or]: 111
202
+ [pa]: 78
203
+ [sr]: 74
204
+ [as]: 74
205
+ [hr]: 72
206
+ [ne]: 63
207
+ [gl]: 57
208
+ [ckb]: 56
209
+ [ka]: 56
210
+ [ug]: 49
211
+ [ky]: 45
212
+ [eu]: 35
213
+ [ps]: 31
214
+ [tl]: 31
215
+ [mn]: 29
216
+ [fy]: 29
217
+ [si]: 24
218
+ [nn]: 21
219
+ [cy]: 21
220
+ [ku]: 19
221
+ [tg]: 18
222
+ [am]: 14
223
+ [tt]: 12
224
+ [ja]: 10
225
+ [lb]: 9
226
+ [tk]: 7
227
+ [be]: 6
228
+ [my]: 4
229
+ [sw]: 4
230
+ [af]: 4
231
+ [ceb]: 3
232
+ [la]: 3
233
+ [ba]: 2
234
+ [dv]: 2
235
+ [eo]: 2
236
+ [gd]: 2
237
+ [ga]: 2
238
+ [mg]: 1
239
+ [sd]: 1
240
+ [nds]: 1
241
+ [yi]: 1
242
+ [sah]: 1
243
+ </details>
244
+
245
+ ### Training procedure
246
+
247
+ GPU: RTX 3090 \
248
+ Training time: 1h 53min
249
+
250
+ ### Training hyperparameters
251
+
252
+ The following hyperparameters were used during training (a reconstruction sketch follows the list):
253
+ - learning_rate: 2e-05
254
+ - train_batch_size: 8
255
+ - eval_batch_size: 4
256
+ - seed: 42
257
+ - gradient_accumulation_steps: 4
258
+ - total_train_batch_size: 32
259
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
260
+ - lr_scheduler_type: linear
261
+ - num_epochs: 10
262
+ - mixed_precision_training: Native AMP
263
+
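The list above can be expressed as `TrainingArguments` for anyone reproducing a similar run. This is a hedged reconstruction, not the original training script: `output_dir` and the evaluation cadence are assumptions, the remaining values mirror the card.

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="gpt2-lang-ident",      # assumption: not stated in the card
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,     # 8 * 4 = 32 effective train batch size
    num_train_epochs=10,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,                         # "Native AMP" mixed precision
    evaluation_strategy="epoch",       # assumption: results are reported once per epoch
)
# Adam betas (0.9, 0.999) and epsilon 1e-08 are the TrainingArguments defaults.
```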
264
+ ### Training results
265
+
266
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
267
+ |:-------------:|:-----:|:-----:|:---------------:|:--------:|
268
+ | 0.2833 | 1.0 | 2812 | 0.2004 | 0.94 |
269
+ | 0.168 | 2.0 | 5625 | 0.1567 | 0.954 |
270
+ | 0.1131 | 3.0 | 8437 | 0.1429 | 0.9586 |
271
+ | 0.0832 | 4.0 | 11250 | 0.1257 | 0.967 |
272
+ | 0.0635 | 5.0 | 14062 | 0.1222 | 0.9682 |
273
+ | 0.0479 | 6.0 | 16875 | 0.1214 | 0.9704 |
274
+ | 0.0361 | 7.0 | 19687 | 0.1255 | 0.9712 |
275
+ | 0.0258 | 8.0 | 22500 | 0.1178 | 0.9712 |
276
+ | 0.0243 | 9.0 | 25312 | 0.1223 | 0.9724 |
277
+ | 0.0171 | 10.0 | 28120 | 0.1210 | 0.9721 |
278
+
279
+
280
+ ### Framework versions
281
+
282
+ - Transformers 4.36.2
283
+ - Pytorch 2.1.2+cu121
284
+ - Datasets 2.16.1
285
+ - Tokenizers 0.15.0
config.json ADDED
@@ -0,0 +1,223 @@
1
+ {
2
+ "_name_or_path": "openai-community/gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2ForSequenceClassification"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "id2label": {"0": "af",
12
+ "1": "am",
13
+ "2": "ar",
14
+ "3": "as",
15
+ "4": "az",
16
+ "5": "ba",
17
+ "6": "be",
18
+ "7": "bg",
19
+ "8": "bn",
20
+ "9": "ca",
21
+ "10": "ceb",
22
+ "11": "ckb",
23
+ "12": "cs",
24
+ "13": "cy",
25
+ "14": "da",
26
+ "15": "de",
27
+ "16": "dv",
28
+ "17": "el",
29
+ "18": "en",
30
+ "19": "eo",
31
+ "20": "es",
32
+ "21": "et",
33
+ "22": "eu",
34
+ "23": "fa",
35
+ "24": "fi",
36
+ "25": "fr",
37
+ "26": "fy",
38
+ "27": "ga",
39
+ "28": "gd",
40
+ "29": "gl",
41
+ "30": "gu",
42
+ "31": "he",
43
+ "32": "hi",
44
+ "33": "hr",
45
+ "34": "hu",
46
+ "35": "hy",
47
+ "36": "id",
48
+ "37": "is",
49
+ "38": "it",
50
+ "39": "ja",
51
+ "40": "ka",
52
+ "41": "kk",
53
+ "42": "kn",
54
+ "43": "ku",
55
+ "44": "ky",
56
+ "45": "la",
57
+ "46": "lb",
58
+ "47": "lt",
59
+ "48": "lv",
60
+ "49": "mg",
61
+ "50": "mk",
62
+ "51": "ml",
63
+ "52": "mn",
64
+ "53": "mr",
65
+ "54": "mt",
66
+ "55": "my",
67
+ "56": "nds",
68
+ "57": "ne",
69
+ "58": "nl",
70
+ "59": "nn",
71
+ "60": "no",
72
+ "61": "or",
73
+ "62": "pa",
74
+ "63": "pl",
75
+ "64": "ps",
76
+ "65": "pt",
77
+ "66": "ro",
78
+ "67": "ru",
79
+ "68": "sah",
80
+ "69": "sd",
81
+ "70": "si",
82
+ "71": "sk",
83
+ "72": "sl",
84
+ "73": "sq",
85
+ "74": "sr",
86
+ "75": "sv",
87
+ "76": "sw",
88
+ "77": "ta",
89
+ "78": "te",
90
+ "79": "tg",
91
+ "80": "th",
92
+ "81": "tk",
93
+ "82": "tl",
94
+ "83": "tr",
95
+ "84": "tt",
96
+ "85": "ug",
97
+ "86": "uk",
98
+ "87": "ur",
99
+ "88": "vi",
100
+ "89": "yi"},
101
+ "initializer_range": 0.02,
102
+ "label2id": {
103
+ "af": 0,
104
+ "am": 1,
105
+ "ar": 2,
106
+ "as": 3,
107
+ "az": 4,
108
+ "ba": 5,
109
+ "be": 6,
110
+ "bg": 7,
111
+ "bn": 8,
112
+ "ca": 9,
113
+ "ceb": 10,
114
+ "ckb": 11,
115
+ "cs": 12,
116
+ "cy": 13,
117
+ "da": 14,
118
+ "de": 15,
119
+ "dv": 16,
120
+ "el": 17,
121
+ "en": 18,
122
+ "eo": 19,
123
+ "es": 20,
124
+ "et": 21,
125
+ "eu": 22,
126
+ "fa": 23,
127
+ "fi": 24,
128
+ "fr": 25,
129
+ "fy": 26,
130
+ "ga": 27,
131
+ "gd": 28,
132
+ "gl": 29,
133
+ "gu": 30,
134
+ "he": 31,
135
+ "hi": 32,
136
+ "hr": 33,
137
+ "hu": 34,
138
+ "hy": 35,
139
+ "id": 36,
140
+ "is": 37,
141
+ "it": 38,
142
+ "ja": 39,
143
+ "ka": 40,
144
+ "kk": 41,
145
+ "kn": 42,
146
+ "ku": 43,
147
+ "ky": 44,
148
+ "la": 45,
149
+ "lb": 46,
150
+ "lt": 47,
151
+ "lv": 48,
152
+ "mg": 49,
153
+ "mk": 50,
154
+ "ml": 51,
155
+ "mn": 52,
156
+ "mr": 53,
157
+ "mt": 54,
158
+ "my": 55,
159
+ "nds": 56,
160
+ "ne": 57,
161
+ "nl": 58,
162
+ "nn": 59,
163
+ "no": 60,
164
+ "or": 61,
165
+ "pa": 62,
166
+ "pl": 63,
167
+ "ps": 64,
168
+ "pt": 65,
169
+ "ro": 66,
170
+ "ru": 67,
171
+ "sah": 68,
172
+ "sd": 69,
173
+ "si": 70,
174
+ "sk": 71,
175
+ "sl": 72,
176
+ "sq": 73,
177
+ "sr": 74,
178
+ "sv": 75,
179
+ "sw": 76,
180
+ "ta": 77,
181
+ "te": 78,
182
+ "tg": 79,
183
+ "th": 80,
184
+ "tk": 81,
185
+ "tl": 82,
186
+ "tr": 83,
187
+ "tt": 84,
188
+ "ug": 85,
189
+ "uk": 86,
190
+ "ur": 87,
191
+ "vi": 88,
192
+ "yi": 89
193
+ },
194
+ "layer_norm_epsilon": 1e-05,
195
+ "model_type": "gpt2",
196
+ "n_ctx": 1024,
197
+ "n_embd": 768,
198
+ "n_head": 12,
199
+ "n_inner": null,
200
+ "n_layer": 12,
201
+ "n_positions": 1024,
202
+ "pad_token_id": 50256,
203
+ "problem_type": "single_label_classification",
204
+ "reorder_and_upcast_attn": false,
205
+ "resid_pdrop": 0.1,
206
+ "scale_attn_by_inverse_layer_idx": false,
207
+ "scale_attn_weights": true,
208
+ "summary_activation": null,
209
+ "summary_first_dropout": 0.1,
210
+ "summary_proj_to_labels": true,
211
+ "summary_type": "cls_index",
212
+ "summary_use_proj": true,
213
+ "task_specific_params": {
214
+ "text-generation": {
215
+ "do_sample": true,
216
+ "max_length": 50
217
+ }
218
+ },
219
+ "torch_dtype": "float32",
220
+ "transformers_version": "4.36.2",
221
+ "use_cache": true,
222
+ "vocab_size": 50257
223
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:041d6a825692c31bad97df5b268b6f8ab6976cd971c33366fbf0c738fa61546a
3
+ size 498050768
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": true,
15
+ "eos_token": "<|endoftext|>",
16
+ "model_max_length": 1024,
17
+ "pad_token": "<|endoftext|>",
18
+ "tokenizer_class": "GPT2Tokenizer",
19
+ "unk_token": "<|endoftext|>"
20
+ }
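GPT-2 has no dedicated padding token, so this checkpoint reuses `<|endoftext|>` (id 50256) for `bos`, `eos`, `pad` and `unk`, matching `pad_token_id` in `config.json`. A quick sanity check:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nie3e/gpt2-lang-ident")
print(tokenizer.pad_token, tokenizer.pad_token_id)  # <|endoftext|> 50256
print(tokenizer.model_max_length)                   # 1024
```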
vocab.json ADDED
The diff for this file is too large to render. See raw diff