Xenova HF staff committed on
Commit
dc6502a
1 Parent(s): 7f1db52

Upload tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +102 -0
tokenizer.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 44,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ }
15
+ ],
16
+ "normalizer": {
17
+ "type": "Sequence",
18
+ "normalizers": [
19
+ {
20
+ "type": "Lowercase"
21
+ },
22
+ {
23
+ "type": "Replace",
24
+ "pattern": {
25
+ "Regex": "[^\u0447 \u0434\u044c\u044f\u0439\u0441\u0443\u0430\u043e20\u044a\u044b\u043d\u043b\u0436\u0445\u0448\u0437c_\u043fm\u044e\u044d\u04404\u0449o\u2013\u0444q\u0438\u0446\u043a1\u0435\u0431\\-\u0442\u0432\u0433\u043c]"
26
+ },
27
+ "content": ""
28
+ },
29
+ {
30
+ "type": "Strip",
31
+ "strip_left": true,
32
+ "strip_right": true
33
+ },
34
+ {
35
+ "type": "Replace",
36
+ "pattern": {
37
+ "Regex": "(?=.)|(?<!^)$"
38
+ },
39
+ "content": "\u0447"
40
+ }
41
+ ]
42
+ },
43
+ "pre_tokenizer": {
44
+ "type": "Split",
45
+ "pattern": {
46
+ "Regex": ""
47
+ },
48
+ "behavior": "Isolated",
49
+ "invert": false
50
+ },
51
+ "post_processor": null,
52
+ "decoder": null,
53
+ "model": {
54
+ "vocab": {
55
+ "\u0447": 0,
56
+ " ": 1,
57
+ "\u0434": 2,
58
+ "\u044c": 3,
59
+ "\u044f": 4,
60
+ "\u0439": 5,
61
+ "\u0441": 6,
62
+ "\u0443": 7,
63
+ "\u0430": 8,
64
+ "\u043e": 9,
65
+ "2": 10,
66
+ "0": 11,
67
+ "\u044a": 12,
68
+ "\u044b": 13,
69
+ "\u043d": 14,
70
+ "\u043b": 15,
71
+ "\u0436": 16,
72
+ "\u0445": 17,
73
+ "\u0448": 18,
74
+ "\u0437": 19,
75
+ "c": 20,
76
+ "_": 21,
77
+ "\u043f": 22,
78
+ "m": 23,
79
+ "\u044e": 24,
80
+ "\u044d": 25,
81
+ "\u0440": 26,
82
+ "4": 27,
83
+ "\u0449": 28,
84
+ "o": 29,
85
+ "\u2013": 30,
86
+ "\u0444": 31,
87
+ "q": 32,
88
+ "\u0438": 33,
89
+ "\u0446": 34,
90
+ "\u043a": 35,
91
+ "1": 36,
92
+ "\u0435": 37,
93
+ "\u0431": 38,
94
+ "-": 39,
95
+ "\u0442": 40,
96
+ "\u0432": 41,
97
+ "\u0433": 42,
98
+ "\u043c": 43,
99
+ "<unk>": 44
100
+ }
101
+ }
102
+ }