jinymusim committed on
Commit
3bf0e27
1 Parent(s): 87de783

Upload 6 files

Browse files

UNICODE tokenizer model

config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "lchaloupsky/czech-gpt2-oscar",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 0,
11
+ "eos_token_ids": 0,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "output_hidden_states": true,
22
+ "output_past": true,
23
+ "pad_token_id": 0,
24
+ "reorder_and_upcast_attn": false,
25
+ "resid_pdrop": 0.1,
26
+ "scale_attn_by_inverse_layer_idx": false,
27
+ "scale_attn_weights": true,
28
+ "summary_activation": null,
29
+ "summary_first_dropout": 0.1,
30
+ "summary_proj_to_labels": true,
31
+ "summary_type": "cls_index",
32
+ "summary_use_proj": true,
33
+ "task_specific_params": {
34
+ "text-generation": {
35
+ "do_sample": true,
36
+ "max_length": 50
37
+ }
38
+ },
39
+ "torch_dtype": "float32",
40
+ "transformers_version": "4.34.1",
41
+ "use_cache": true,
42
+ "vocab_size": 106
43
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 0,
5
+ "output_hidden_states": true,
6
+ "pad_token_id": 0,
7
+ "transformers_version": "4.34.1"
8
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72f1e7ac5a6928a0a60b0d77606ccb47955187da2efb3c066ddcfad651047996
3
+ size 343741722
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "<|EOS|>",
3
+ "pad_token": "<|PAD|>",
4
+ "unk_token": "<|UNK|>"
5
+ }
tokenizer.json ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 1024,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": "BatchLongest",
11
+ "direction": "Right",
12
+ "pad_to_multiple_of": null,
13
+ "pad_id": 1,
14
+ "pad_type_id": 0,
15
+ "pad_token": "<|PAD|>"
16
+ },
17
+ "added_tokens": [
18
+ {
19
+ "id": 0,
20
+ "content": "<|EOS|>",
21
+ "single_word": false,
22
+ "lstrip": false,
23
+ "rstrip": false,
24
+ "normalized": false,
25
+ "special": true
26
+ },
27
+ {
28
+ "id": 1,
29
+ "content": "<|PAD|>",
30
+ "single_word": false,
31
+ "lstrip": false,
32
+ "rstrip": false,
33
+ "normalized": false,
34
+ "special": true
35
+ },
36
+ {
37
+ "id": 2,
38
+ "content": "<|UNK|>",
39
+ "single_word": false,
40
+ "lstrip": false,
41
+ "rstrip": false,
42
+ "normalized": false,
43
+ "special": true
44
+ }
45
+ ],
46
+ "normalizer": null,
47
+ "pre_tokenizer": {
48
+ "type": "ByteLevel",
49
+ "add_prefix_space": false,
50
+ "trim_offsets": true,
51
+ "use_regex": true
52
+ },
53
+ "post_processor": {
54
+ "type": "ByteLevel",
55
+ "add_prefix_space": true,
56
+ "trim_offsets": false,
57
+ "use_regex": true
58
+ },
59
+ "decoder": {
60
+ "type": "ByteLevel",
61
+ "add_prefix_space": true,
62
+ "trim_offsets": true,
63
+ "use_regex": true
64
+ },
65
+ "model": {
66
+ "type": "Unigram",
67
+ "unk_id": 2,
68
+ "vocab": [
69
+ [
70
+ "<|EOS|>",
71
+ 0.0
72
+ ],
73
+ [
74
+ "<|PAD|>",
75
+ 0.0
76
+ ],
77
+ [
78
+ "<|UNK|>",
79
+ 0.0
80
+ ],
81
+ [
82
+ "Ġ",
83
+ -1.5260435611730827
84
+ ],
85
+ [
86
+ "#",
87
+ -2.7402009810912062
88
+ ],
89
+ [
90
+ "Ã",
91
+ -3.1166905774882707
92
+ ],
93
+ [
94
+ "e",
95
+ -3.2260840326309896
96
+ ],
97
+ [
98
+ "a",
99
+ -3.400272151145682
100
+ ],
101
+ [
102
+ "o",
103
+ -3.4247111498875125
104
+ ],
105
+ [
106
+ "n",
107
+ -3.4843012141328966
108
+ ],
109
+ [
110
+ "t",
111
+ -3.5851466950828534
112
+ ],
113
+ [
114
+ "l",
115
+ -3.5966397727159105
116
+ ],
117
+ [
118
+ "s",
119
+ -3.606519701447205
120
+ ],
121
+ [
122
+ "v",
123
+ -3.673389625897381
124
+ ],
125
+ [
126
+ "Å",
127
+ -3.738661921889596
128
+ ],
129
+ [
130
+ "k",
131
+ -3.88985675826423
132
+ ],
133
+ [
134
+ "¡",
135
+ -3.896324504449853
136
+ ],
137
+ [
138
+ "m",
139
+ -3.900019204288416
140
+ ],
141
+ [
142
+ "d",
143
+ -3.924142000578403
144
+ ],
145
+ [
146
+ "i",
147
+ -3.957515863848771
148
+ ],
149
+ [
150
+ "u",
151
+ -3.98427965340516
152
+ ],
153
+ [
154
+ "Ń",
155
+ -4.01885777375824
156
+ ],
157
+ [
158
+ "1",
159
+ -4.076782718271231
160
+ ],
161
+ [
162
+ "r",
163
+ -4.101788234787666
164
+ ],
165
+ [
166
+ "h",
167
+ -4.165955307243209
168
+ ],
169
+ [
170
+ "Ä",
171
+ -4.209926280752244
172
+ ],
173
+ [
174
+ ",",
175
+ -4.2650454975175265
176
+ ],
177
+ [
178
+ "c",
179
+ -4.2986020434927
180
+ ],
181
+ [
182
+ "j",
183
+ -4.299016883023702
184
+ ],
185
+ [
186
+ "Ċ",
187
+ -4.318158151676027
188
+ ],
189
+ [
190
+ "p",
191
+ -4.340959953844402
192
+ ],
193
+ [
194
+ "z",
195
+ -4.458983744584648
196
+ ],
197
+ [
198
+ "y",
199
+ -4.51828856430196
200
+ ],
201
+ [
202
+ "Ľ",
203
+ -4.625728639873191
204
+ ],
205
+ [
206
+ "J",
207
+ -4.632495838144841
208
+ ],
209
+ [
210
+ "b",
211
+ -4.702284800331416
212
+ ],
213
+ [
214
+ "0",
215
+ -4.89877498638786
216
+ ],
217
+ [
218
+ "T",
219
+ -4.930942842309104
220
+ ],
221
+ [
222
+ "¾",
223
+ -4.9475363370353715
224
+ ],
225
+ [
226
+ "½",
227
+ -5.112839051696183
228
+ ],
229
+ [
230
+ "Ļ",
231
+ -5.1232783823983485
232
+ ],
233
+ [
234
+ "8",
235
+ -5.165680149039558
236
+ ],
237
+ [
238
+ "©",
239
+ -5.1662964199731185
240
+ ],
241
+ [
242
+ ".",
243
+ -5.311316866688184
244
+ ],
245
+ [
246
+ "į",
247
+ -5.390915371298602
248
+ ],
249
+ [
250
+ "X",
251
+ -5.441727913115729
252
+ ],
253
+ [
254
+ "A",
255
+ -5.5452444992227115
256
+ ],
257
+ [
258
+ "9",
259
+ -5.700158070211225
260
+ ],
261
+ [
262
+ "¯",
263
+ -5.794693725474856
264
+ ],
265
+ [
266
+ "B",
267
+ -5.946768892374564
268
+ ],
269
+ [
270
+ "N",
271
+ -6.231750626230964
272
+ ],
273
+ [
274
+ "!",
275
+ -6.43383380299835
276
+ ],
277
+ [
278
+ "2",
279
+ -6.458671934150379
280
+ ],
281
+ [
282
+ "7",
283
+ -6.481553328401322
284
+ ],
285
+ [
286
+ "6",
287
+ -6.54739707067993
288
+ ],
289
+ [
290
+ "¥",
291
+ -7.119259719177025
292
+ ],
293
+ [
294
+ "3",
295
+ -7.1764773118059555
296
+ ],
297
+ [
298
+ "4",
299
+ -7.283631800958723
300
+ ],
301
+ [
302
+ "?",
303
+ -7.301649946289903
304
+ ],
305
+ [
306
+ "5",
307
+ -7.427897072063589
308
+ ],
309
+ [
310
+ "D",
311
+ -7.4577806463041565
312
+ ],
313
+ [
314
+ "Ī",
315
+ -7.530995391628789
316
+ ],
317
+ [
318
+ "C",
319
+ -7.594917277581869
320
+ ],
321
+ [
322
+ "ı",
323
+ -7.619961419829636
324
+ ],
325
+ [
326
+ "º",
327
+ -7.878503588175192
328
+ ],
329
+ [
330
+ "f",
331
+ -7.964820780829486
332
+ ],
333
+ [
334
+ "g",
335
+ -8.277706569992713
336
+ ],
337
+ [
338
+ "³",
339
+ -8.34045981977625
340
+ ],
341
+ [
342
+ "H",
343
+ -10.272558688607978
344
+ ],
345
+ [
346
+ "Y",
347
+ -10.559397898241222
348
+ ],
349
+ [
350
+ "x",
351
+ -10.567971957971364
352
+ ],
353
+ [
354
+ "w",
355
+ -11.220052439924926
356
+ ],
357
+ [
358
+ "q",
359
+ -12.08126913846109
360
+ ],
361
+ [
362
+ "P",
363
+ -12.337988972147176
364
+ ],
365
+ [
366
+ "¶",
367
+ -12.928795639896862
368
+ ],
369
+ [
370
+ "ĩ",
371
+ -12.979709743960514
372
+ ],
373
+ [
374
+ "¼",
375
+ -13.307792559933008
376
+ ],
377
+ [
378
+ "¤",
379
+ -13.509243173516644
380
+ ],
381
+ [
382
+ "â",
383
+ -13.57425863274756
384
+ ],
385
+ [
386
+ "§",
387
+ -13.57425863274756
388
+ ],
389
+ [
390
+ "Ģ",
391
+ -13.59274912950999
392
+ ],
393
+ [
394
+ "ļ",
395
+ -13.677077762284064
396
+ ],
397
+ [
398
+ "¨",
399
+ -13.925495429629638
400
+ ],
401
+ [
402
+ "´",
403
+ -14.030614768906831
404
+ ],
405
+ [
406
+ "«",
407
+ -14.15899565226756
408
+ ],
409
+ [
410
+ "ł",
411
+ -14.358448819224304
412
+ ],
413
+ [
414
+ "¢",
415
+ -14.479162057573522
416
+ ],
417
+ [
418
+ "Â",
419
+ -14.660728142957538
420
+ ],
421
+ [
422
+ "ª",
423
+ -14.775686233662904
424
+ ],
425
+ [
426
+ "®",
427
+ -14.905597093757487
428
+ ],
429
+ [
430
+ "»",
431
+ -15.23056985336876
432
+ ],
433
+ [
434
+ "Ĥ",
435
+ -15.296708147458816
436
+ ],
437
+ [
438
+ "+",
439
+ -16.127004563448008
440
+ ],
441
+ [
442
+ "¦",
443
+ -16.252149490984237
444
+ ],
445
+ [
446
+ "Ħ",
447
+ -16.691400414639208
448
+ ],
449
+ [
450
+ "²",
451
+ -17.225105598348073
452
+ ],
453
+ [
454
+ "=",
455
+ -17.350105598357175
456
+ ],
457
+ [
458
+ "Ĵ",
459
+ -17.659629407920946
460
+ ],
461
+ [
462
+ "Ł",
463
+ -18.109629407920945
464
+ ],
465
+ [
466
+ "/",
467
+ -18.109629407920945
468
+ ],
469
+ [
470
+ "¹",
471
+ -18.44296274125428
472
+ ],
473
+ [
474
+ "`",
475
+ -18.44296274125428
476
+ ],
477
+ [
478
+ "Ĭ",
479
+ -18.942962741214338
480
+ ],
481
+ [
482
+ "ĵ",
483
+ -18.94296274125428
484
+ ],
485
+ [
486
+ "ħ",
487
+ -18.94296274125428
488
+ ],
489
+ [
490
+ "Ï",
491
+ -18.94296274125428
492
+ ]
493
+ ],
494
+ "byte_fallback": false
495
+ }
496
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<|EOS|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<|PAD|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<|UNK|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "eos_token": "<|EOS|>",
30
+ "model_max_length": 1024,
31
+ "pad_token": "<|PAD|>",
32
+ "tokenizer_class": "PreTrainedTokenizerFast",
33
+ "unk_token": "<|UNK|>"
34
+ }