Thunpitcha committed on
Commit
1130606
1 Parent(s): 8d69fe4

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +2 -2
  2. tokenizer_config.json +4 -4
  3. vocab.json +67 -86
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "</s>": 88,
3
- "<s>": 87
4
  }
 
1
  {
2
+ "</s>": 69,
3
+ "<s>": 68
4
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "added_tokens_decoder": {
3
- "85": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": false
10
  },
11
- "86": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": false
18
  },
19
- "87": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": true,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "88": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": true,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "66": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
 
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "67": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "68": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": true,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "69": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": true,
vocab.json CHANGED
@@ -1,89 +1,70 @@
1
  {
2
- "'": 69,
3
- "[PAD]": 86,
4
- "[UNK]": 85,
5
- "_": 14,
6
- "a": 55,
7
- "c": 54,
8
- "e": 62,
9
- "h": 82,
10
- "i": 75,
11
- "j": 27,
12
- "k": 50,
13
- "l": 18,
14
- "m": 83,
15
- "n": 76,
16
- "o": 32,
17
- "r": 12,
18
- "s": 20,
19
- "t": 80,
20
- "y": 15,
21
- "|": 37,
22
- "~": 52,
23
- "ก": 3,
24
- "ข": 68,
25
- "ค": 59,
26
- "ฆ": 57,
27
- "ง": 67,
28
- "จ": 38,
29
- "ฉ": 17,
30
- "ช": 70,
31
- "ซ": 7,
32
- "ฌ": 61,
33
- "ญ": 77,
34
- "ฎ": 39,
35
- "ฏ": 8,
36
- "ฐ": 0,
37
- "ฑ": 9,
38
- "ฒ": 49,
39
- "ณ": 56,
40
  "ด": 47,
41
- "ต": 30,
42
- "ถ": 81,
43
- "ท": 16,
44
- "ธ": 84,
45
- "น": 66,
46
- "บ": 19,
47
- "ป": 40,
48
- "ผ": 78,
49
- "ฝ": 28,
50
- "พ": 73,
51
- "ฟ": 44,
52
- "ภ": 24,
53
- "ม": 64,
54
- "ย": 29,
55
- "ร": 11,
56
- "ฤ": 2,
57
- "ล": 33,
58
- "ว": 1,
59
- "ศ": 25,
60
- "ษ": 10,
61
- "ส": 74,
62
- "ห": 23,
63
- "ฬ": 31,
64
- "อ": 58,
65
- "ฮ": 65,
66
- "ะ": 6,
67
- "ั": 72,
68
- "า": 51,
69
- "ำ": 63,
70
- "ิ": 26,
71
- "ี": 48,
72
- "ึ": 46,
73
- "ื": 13,
74
- "ุ": 34,
75
- "ู": 35,
76
- "เ": 79,
77
- "แ": 21,
78
- "โ": 42,
79
- "ใ": 41,
80
- "ไ": 4,
81
- "ๆ": 60,
82
- "็": 5,
83
- "่": 45,
84
- "้": 36,
85
- "๊": 53,
86
- "๋": 22,
87
- "์": 43,
88
- "’": 71
89
  }
 
1
  {
2
+ "'": 12,
3
+ "[PAD]": 67,
4
+ "[UNK]": 66,
5
+ "|": 31,
6
+ "": 48,
7
+ "": 37,
8
+ "": 6,
9
+ "": 65,
10
+ "": 45,
11
+ "": 46,
12
+ "": 64,
13
+ "": 20,
14
+ "": 61,
15
+ "": 10,
16
+ "": 43,
17
+ "": 25,
18
+ "": 56,
19
+ "": 55,
20
+ "": 4,
21
+ "": 40,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "ด": 47,
23
+ "ต": 21,
24
+ "ถ": 62,
25
+ "ท": 9,
26
+ "ธ": 59,
27
+ "น": 30,
28
+ "บ": 41,
29
+ "ป": 2,
30
+ "ผ": 17,
31
+ "ฝ": 52,
32
+ "พ": 32,
33
+ "ฟ": 13,
34
+ "ภ": 27,
35
+ "ม": 49,
36
+ "ย": 33,
37
+ "ร": 53,
38
+ "ฤ": 44,
39
+ "ล": 7,
40
+ "ว": 38,
41
+ "ศ": 39,
42
+ "ษ": 63,
43
+ "ส": 23,
44
+ "ห": 36,
45
+ "ฬ": 3,
46
+ "อ": 5,
47
+ "ฮ": 51,
48
+ "ะ": 34,
49
+ "ั": 60,
50
+ "า": 0,
51
+ "ำ": 16,
52
+ "ิ": 15,
53
+ "ี": 24,
54
+ "ึ": 54,
55
+ "ื": 14,
56
+ "ุ": 22,
57
+ "ู": 42,
58
+ "เ": 50,
59
+ "แ": 28,
60
+ "โ": 11,
61
+ "ใ": 19,
62
+ "ไ": 18,
63
+ "ๆ": 26,
64
+ "็": 29,
65
+ "่": 58,
66
+ "้": 1,
67
+ "๊": 8,
68
+ "๋": 35,
69
+ "์": 57
 
70
  }