Upload tokenizer
Browse files
- merges.txt +4 -4
- tokenizer.json +25 -37
- tokenizer_config.json +1 -1
- vocab.json +0 -0
merges.txt
CHANGED
@@ -2056,8 +2056,8 @@ d o
|
|
2056 |
Ġchan ged
|
2057 |
Ġcontro l
|
2058 |
Ġs ense
|
2059 |
-
it ch
|
2060 |
as ure
|
|
|
2061 |
Ġfe bruary
|
2062 |
Ġ3 0
|
2063 |
Ġdad dy
|
@@ -2569,9 +2569,9 @@ in c
|
|
2569 |
Ġm ir
|
2570 |
Ġf em
|
2571 |
Ġbec om
|
2572 |
-
Ġ q
|
2573 |
em pt
|
2574 |
a f
|
|
|
2575 |
ri age
|
2576 |
Ġbr own
|
2577 |
od e
|
@@ -3068,10 +3068,10 @@ ur ity
|
|
3068 |
Ġb ath
|
3069 |
Ġl ength
|
3070 |
Ġtr ade
|
3071 |
-
bo ard
|
3072 |
Ġkn ock
|
3073 |
ast ic
|
3074 |
Ġjo ined
|
|
|
3075 |
b a
|
3076 |
ter day
|
3077 |
Ġco ffee
|
@@ -4739,7 +4739,6 @@ u ly
|
|
4739 |
Ġcom ment
|
4740 |
Ġtr ave
|
4741 |
Ġdis play
|
4742 |
-
Ġsw itch
|
4743 |
Ġass um
|
4744 |
Ġadv ice
|
4745 |
Ġstep s
|
@@ -4749,6 +4748,7 @@ u ly
|
|
4749 |
Ġk im
|
4750 |
im um
|
4751 |
Ġrec om
|
|
|
4752 |
Ġsign al
|
4753 |
Ġill ust
|
4754 |
ipp ing
|
|
|
2056 |
Ġchan ged
|
2057 |
Ġcontro l
|
2058 |
Ġs ense
|
|
|
2059 |
as ure
|
2060 |
+
it ch
|
2061 |
Ġfe bruary
|
2062 |
Ġ3 0
|
2063 |
Ġdad dy
|
|
|
2569 |
Ġm ir
|
2570 |
Ġf em
|
2571 |
Ġbec om
|
|
|
2572 |
em pt
|
2573 |
a f
|
2574 |
+
Ġ q
|
2575 |
ri age
|
2576 |
Ġbr own
|
2577 |
od e
|
|
|
3068 |
Ġb ath
|
3069 |
Ġl ength
|
3070 |
Ġtr ade
|
|
|
3071 |
Ġkn ock
|
3072 |
ast ic
|
3073 |
Ġjo ined
|
3074 |
+
bo ard
|
3075 |
b a
|
3076 |
ter day
|
3077 |
Ġco ffee
|
|
|
4739 |
Ġcom ment
|
4740 |
Ġtr ave
|
4741 |
Ġdis play
|
|
|
4742 |
Ġass um
|
4743 |
Ġadv ice
|
4744 |
Ġstep s
|
|
|
4748 |
Ġk im
|
4749 |
im um
|
4750 |
Ġrec om
|
4751 |
+
Ġsw itch
|
4752 |
Ġsign al
|
4753 |
Ġill ust
|
4754 |
ipp ing
|
tokenizer.json
CHANGED
@@ -1,19 +1,7 @@
|
|
1 |
{
|
2 |
"version": "1.0",
|
3 |
-
"truncation": {
|
4 |
-
|
5 |
-
"max_length": 128,
|
6 |
-
"strategy": "LongestFirst",
|
7 |
-
"stride": 0
|
8 |
-
},
|
9 |
-
"padding": {
|
10 |
-
"strategy": "BatchLongest",
|
11 |
-
"direction": "Right",
|
12 |
-
"pad_to_multiple_of": null,
|
13 |
-
"pad_id": 1,
|
14 |
-
"pad_type_id": 0,
|
15 |
-
"pad_token": "<pad>"
|
16 |
-
},
|
17 |
"added_tokens": [
|
18 |
{
|
19 |
"id": 0,
|
@@ -2415,8 +2403,8 @@
|
|
2415 |
"Ġchanged": 2315,
|
2416 |
"Ġcontrol": 2316,
|
2417 |
"Ġsense": 2317,
|
2418 |
-
"itch": 2318,
|
2419 |
-
"asure": 2319,
|
2420 |
"Ġfebruary": 2320,
|
2421 |
"Ġ30": 2321,
|
2422 |
"Ġdaddy": 2322,
|
@@ -2928,9 +2916,9 @@
|
|
2928 |
"Ġmir": 2828,
|
2929 |
"Ġfem": 2829,
|
2930 |
"Ġbecom": 2830,
|
2931 |
-
"Ġq": 2831,
|
2932 |
-
"empt": 2832,
|
2933 |
-
"af": 2833,
|
2934 |
"riage": 2834,
|
2935 |
"Ġbrown": 2835,
|
2936 |
"ode": 2836,
|
@@ -3427,10 +3415,10 @@
|
|
3427 |
"Ġbath": 3327,
|
3428 |
"Ġlength": 3328,
|
3429 |
"Ġtrade": 3329,
|
3430 |
-
"board": 3330,
|
3431 |
-
"Ġknock": 3331,
|
3432 |
-
"astic": 3332,
|
3433 |
-
"Ġjoined": 3333,
|
3434 |
"ba": 3334,
|
3435 |
"terday": 3335,
|
3436 |
"Ġcoffee": 3336,
|
@@ -5098,16 +5086,16 @@
|
|
5098 |
"Ġcomment": 4998,
|
5099 |
"Ġtrave": 4999,
|
5100 |
"Ġdisplay": 5000,
|
5101 |
-
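"Ġswitch": 5001,
|
5102 |
-
"Ġassum": 5002,
|
5103 |
-
"Ġadvice": 5003,
|
5104 |
-
"Ġsteps": 5004,
|
5105 |
-
"Ġdefeated": 5005,
|
5106 |
-
"Ġresources": 5006,
|
5107 |
-
"Ġrick": 5007,
|
5108 |
-
"Ġkim": 5008,
|
5109 |
-
"imum": 5009,
|
5110 |
-
"Ġrecom": 5010,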
|
5111 |
"Ġsignal": 5011,
|
5112 |
"Ġillust": 5012,
|
5113 |
"ipping": 5013,
|
@@ -18540,8 +18528,8 @@
|
|
18540 |
"Ġchan ged",
|
18541 |
"Ġcontro l",
|
18542 |
"Ġs ense",
|
18543 |
-
"it ch",
|
18544 |
"as ure",
|
|
|
18545 |
"Ġfe bruary",
|
18546 |
"Ġ3 0",
|
18547 |
"Ġdad dy",
|
@@ -19053,9 +19041,9 @@
|
|
19053 |
"Ġm ir",
|
19054 |
"Ġf em",
|
19055 |
"Ġbec om",
|
19056 |
-
"Ġ q",
|
19057 |
"em pt",
|
19058 |
"a f",
|
|
|
19059 |
"ri age",
|
19060 |
"Ġbr own",
|
19061 |
"od e",
|
@@ -19552,10 +19540,10 @@
|
|
19552 |
"Ġb ath",
|
19553 |
"Ġl ength",
|
19554 |
"Ġtr ade",
|
19555 |
-
"bo ard",
|
19556 |
"Ġkn ock",
|
19557 |
"ast ic",
|
19558 |
"Ġjo ined",
|
|
|
19559 |
"b a",
|
19560 |
"ter day",
|
19561 |
"Ġco ffee",
|
@@ -21223,7 +21211,6 @@
|
|
21223 |
"Ġcom ment",
|
21224 |
"Ġtr ave",
|
21225 |
"Ġdis play",
|
21226 |
-
"Ġsw itch",
|
21227 |
"Ġass um",
|
21228 |
"Ġadv ice",
|
21229 |
"Ġstep s",
|
@@ -21233,6 +21220,7 @@
|
|
21233 |
"Ġk im",
|
21234 |
"im um",
|
21235 |
"Ġrec om",
|
|
|
21236 |
"Ġsign al",
|
21237 |
"Ġill ust",
|
21238 |
"ipp ing",
|
|
|
1 |
{
|
2 |
"version": "1.0",
|
3 |
+
"truncation": null,
|
4 |
+
"padding": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
"added_tokens": [
|
6 |
{
|
7 |
"id": 0,
|
|
|
2403 |
"Ġchanged": 2315,
|
2404 |
"Ġcontrol": 2316,
|
2405 |
"Ġsense": 2317,
|
2406 |
+
"asure": 2318,
|
2407 |
+
"itch": 2319,
|
2408 |
"Ġfebruary": 2320,
|
2409 |
"Ġ30": 2321,
|
2410 |
"Ġdaddy": 2322,
|
|
|
2916 |
"Ġmir": 2828,
|
2917 |
"Ġfem": 2829,
|
2918 |
"Ġbecom": 2830,
|
2919 |
+
"empt": 2831,
|
2920 |
+
"af": 2832,
|
2921 |
+
"Ġq": 2833,
|
2922 |
"riage": 2834,
|
2923 |
"Ġbrown": 2835,
|
2924 |
"ode": 2836,
|
|
|
3415 |
"Ġbath": 3327,
|
3416 |
"Ġlength": 3328,
|
3417 |
"Ġtrade": 3329,
|
3418 |
+
"Ġknock": 3330,
|
3419 |
+
"astic": 3331,
|
3420 |
+
"Ġjoined": 3332,
|
3421 |
+
"board": 3333,
|
3422 |
"ba": 3334,
|
3423 |
"terday": 3335,
|
3424 |
"Ġcoffee": 3336,
|
|
|
5086 |
"Ġcomment": 4998,
|
5087 |
"Ġtrave": 4999,
|
5088 |
"Ġdisplay": 5000,
|
5089 |
+
"Ġassum": 5001,
|
5090 |
+
"Ġadvice": 5002,
|
5091 |
+
"Ġsteps": 5003,
|
5092 |
+
"Ġdefeated": 5004,
|
5093 |
+
"Ġresources": 5005,
|
5094 |
+
"Ġrick": 5006,
|
5095 |
+
"Ġkim": 5007,
|
5096 |
+
"imum": 5008,
|
5097 |
+
"Ġrecom": 5009,
|
5098 |
+
"Ġswitch": 5010,
|
5099 |
"Ġsignal": 5011,
|
5100 |
"Ġillust": 5012,
|
5101 |
"ipping": 5013,
|
|
|
18528 |
"Ġchan ged",
|
18529 |
"Ġcontro l",
|
18530 |
"Ġs ense",
|
|
|
18531 |
"as ure",
|
18532 |
+
"it ch",
|
18533 |
"Ġfe bruary",
|
18534 |
"Ġ3 0",
|
18535 |
"Ġdad dy",
|
|
|
19041 |
"Ġm ir",
|
19042 |
"Ġf em",
|
19043 |
"Ġbec om",
|
|
|
19044 |
"em pt",
|
19045 |
"a f",
|
19046 |
+
"Ġ q",
|
19047 |
"ri age",
|
19048 |
"Ġbr own",
|
19049 |
"od e",
|
|
|
19540 |
"Ġb ath",
|
19541 |
"Ġl ength",
|
19542 |
"Ġtr ade",
|
|
|
19543 |
"Ġkn ock",
|
19544 |
"ast ic",
|
19545 |
"Ġjo ined",
|
19546 |
+
"bo ard",
|
19547 |
"b a",
|
19548 |
"ter day",
|
19549 |
"Ġco ffee",
|
|
|
21211 |
"Ġcom ment",
|
21212 |
"Ġtr ave",
|
21213 |
"Ġdis play",
|
|
|
21214 |
"Ġass um",
|
21215 |
"Ġadv ice",
|
21216 |
"Ġstep s",
|
|
|
21220 |
"Ġk im",
|
21221 |
"im um",
|
21222 |
"Ġrec om",
|
21223 |
+
"Ġsw itch",
|
21224 |
"Ġsign al",
|
21225 |
"Ġill ust",
|
21226 |
"ipp ing",
|
tokenizer_config.json
CHANGED
@@ -13,7 +13,7 @@
|
|
13 |
"single_word": false
|
14 |
},
|
15 |
"model_max_length": 1000000000000000019884624838656,
|
16 |
-
"name_or_path": "CamBabyTrainers/
|
17 |
"pad_token": "<pad>",
|
18 |
"sep_token": "</s>",
|
19 |
"special_tokens_map_file": null,
|
|
|
13 |
"single_word": false
|
14 |
},
|
15 |
"model_max_length": 1000000000000000019884624838656,
|
16 |
+
"name_or_path": "CamBabyTrainers/CamBabyTokenizer-8192",
|
17 |
"pad_token": "<pad>",
|
18 |
"sep_token": "</s>",
|
19 |
"special_tokens_map_file": null,
|
vocab.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|