codebyzeb commited on
Commit
53206a1
1 Parent(s): 71de7f5

Upload tokenizer

Browse files
Files changed (4) hide show
  1. merges.txt +4 -4
  2. tokenizer.json +25 -37
  3. tokenizer_config.json +1 -1
  4. vocab.json +0 -0
merges.txt CHANGED
@@ -2056,8 +2056,8 @@ d o
2056
  Ġchan ged
2057
  Ġcontro l
2058
  Ġs ense
2059
- it ch
2060
  as ure
 
2061
  Ġfe bruary
2062
  Ġ3 0
2063
  Ġdad dy
@@ -2569,9 +2569,9 @@ in c
2569
  Ġm ir
2570
  Ġf em
2571
  Ġbec om
2572
- Ġ q
2573
  em pt
2574
  a f
 
2575
  ri age
2576
  Ġbr own
2577
  od e
@@ -3068,10 +3068,10 @@ ur ity
3068
  Ġb ath
3069
  Ġl ength
3070
  Ġtr ade
3071
- bo ard
3072
  Ġkn ock
3073
  ast ic
3074
  Ġjo ined
 
3075
  b a
3076
  ter day
3077
  Ġco ffee
@@ -4739,7 +4739,6 @@ u ly
4739
  Ġcom ment
4740
  Ġtr ave
4741
  Ġdis play
4742
- Ġsw itch
4743
  Ġass um
4744
  Ġadv ice
4745
  Ġstep s
@@ -4749,6 +4748,7 @@ u ly
4749
  Ġk im
4750
  im um
4751
  Ġrec om
 
4752
  Ġsign al
4753
  Ġill ust
4754
  ipp ing
 
2056
  Ġchan ged
2057
  Ġcontro l
2058
  Ġs ense
 
2059
  as ure
2060
+ it ch
2061
  Ġfe bruary
2062
  Ġ3 0
2063
  Ġdad dy
 
2569
  Ġm ir
2570
  Ġf em
2571
  Ġbec om
 
2572
  em pt
2573
  a f
2574
+ Ġ q
2575
  ri age
2576
  Ġbr own
2577
  od e
 
3068
  Ġb ath
3069
  Ġl ength
3070
  Ġtr ade
 
3071
  Ġkn ock
3072
  ast ic
3073
  Ġjo ined
3074
+ bo ard
3075
  b a
3076
  ter day
3077
  Ġco ffee
 
4739
  Ġcom ment
4740
  Ġtr ave
4741
  Ġdis play
 
4742
  Ġass um
4743
  Ġadv ice
4744
  Ġstep s
 
4748
  Ġk im
4749
  im um
4750
  Ġrec om
4751
+ Ġsw itch
4752
  Ġsign al
4753
  Ġill ust
4754
  ipp ing
tokenizer.json CHANGED
@@ -1,19 +1,7 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 128,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
- "padding": {
10
- "strategy": "BatchLongest",
11
- "direction": "Right",
12
- "pad_to_multiple_of": null,
13
- "pad_id": 1,
14
- "pad_type_id": 0,
15
- "pad_token": "<pad>"
16
- },
17
  "added_tokens": [
18
  {
19
  "id": 0,
@@ -2415,8 +2403,8 @@
2415
  "Ġchanged": 2315,
2416
  "Ġcontrol": 2316,
2417
  "Ġsense": 2317,
2418
- "itch": 2318,
2419
- "asure": 2319,
2420
  "Ġfebruary": 2320,
2421
  "Ġ30": 2321,
2422
  "Ġdaddy": 2322,
@@ -2928,9 +2916,9 @@
2928
  "Ġmir": 2828,
2929
  "Ġfem": 2829,
2930
  "Ġbecom": 2830,
2931
- "Ġq": 2831,
2932
- "empt": 2832,
2933
- "af": 2833,
2934
  "riage": 2834,
2935
  "Ġbrown": 2835,
2936
  "ode": 2836,
@@ -3427,10 +3415,10 @@
3427
  "Ġbath": 3327,
3428
  "Ġlength": 3328,
3429
  "Ġtrade": 3329,
3430
- "board": 3330,
3431
- "Ġknock": 3331,
3432
- "astic": 3332,
3433
- "Ġjoined": 3333,
3434
  "ba": 3334,
3435
  "terday": 3335,
3436
  "Ġcoffee": 3336,
@@ -5098,16 +5086,16 @@
5098
  "Ġcomment": 4998,
5099
  "Ġtrave": 4999,
5100
  "Ġdisplay": 5000,
5101
- "Ġswitch": 5001,
5102
- "Ġassum": 5002,
5103
- "Ġadvice": 5003,
5104
- "Ġsteps": 5004,
5105
- "Ġdefeated": 5005,
5106
- "Ġresources": 5006,
5107
- "Ġrick": 5007,
5108
- "Ġkim": 5008,
5109
- "imum": 5009,
5110
- "Ġrecom": 5010,
5111
  "Ġsignal": 5011,
5112
  "Ġillust": 5012,
5113
  "ipping": 5013,
@@ -18540,8 +18528,8 @@
18540
  "Ġchan ged",
18541
  "Ġcontro l",
18542
  "Ġs ense",
18543
- "it ch",
18544
  "as ure",
 
18545
  "Ġfe bruary",
18546
  "Ġ3 0",
18547
  "Ġdad dy",
@@ -19053,9 +19041,9 @@
19053
  "Ġm ir",
19054
  "Ġf em",
19055
  "Ġbec om",
19056
- "Ġ q",
19057
  "em pt",
19058
  "a f",
 
19059
  "ri age",
19060
  "Ġbr own",
19061
  "od e",
@@ -19552,10 +19540,10 @@
19552
  "Ġb ath",
19553
  "Ġl ength",
19554
  "Ġtr ade",
19555
- "bo ard",
19556
  "Ġkn ock",
19557
  "ast ic",
19558
  "Ġjo ined",
 
19559
  "b a",
19560
  "ter day",
19561
  "Ġco ffee",
@@ -21223,7 +21211,6 @@
21223
  "Ġcom ment",
21224
  "Ġtr ave",
21225
  "Ġdis play",
21226
- "Ġsw itch",
21227
  "Ġass um",
21228
  "Ġadv ice",
21229
  "Ġstep s",
@@ -21233,6 +21220,7 @@
21233
  "Ġk im",
21234
  "im um",
21235
  "Ġrec om",
 
21236
  "Ġsign al",
21237
  "Ġill ust",
21238
  "ipp ing",
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
2403
  "Ġchanged": 2315,
2404
  "Ġcontrol": 2316,
2405
  "Ġsense": 2317,
2406
+ "asure": 2318,
2407
+ "itch": 2319,
2408
  "Ġfebruary": 2320,
2409
  "Ġ30": 2321,
2410
  "Ġdaddy": 2322,
 
2916
  "Ġmir": 2828,
2917
  "Ġfem": 2829,
2918
  "Ġbecom": 2830,
2919
+ "empt": 2831,
2920
+ "af": 2832,
2921
+ "Ġq": 2833,
2922
  "riage": 2834,
2923
  "Ġbrown": 2835,
2924
  "ode": 2836,
 
3415
  "Ġbath": 3327,
3416
  "Ġlength": 3328,
3417
  "Ġtrade": 3329,
3418
+ "Ġknock": 3330,
3419
+ "astic": 3331,
3420
+ "Ġjoined": 3332,
3421
+ "board": 3333,
3422
  "ba": 3334,
3423
  "terday": 3335,
3424
  "Ġcoffee": 3336,
 
5086
  "Ġcomment": 4998,
5087
  "Ġtrave": 4999,
5088
  "Ġdisplay": 5000,
5089
+ "Ġassum": 5001,
5090
+ "Ġadvice": 5002,
5091
+ "Ġsteps": 5003,
5092
+ "Ġdefeated": 5004,
5093
+ "Ġresources": 5005,
5094
+ "Ġrick": 5006,
5095
+ "Ġkim": 5007,
5096
+ "imum": 5008,
5097
+ "Ġrecom": 5009,
5098
+ "Ġswitch": 5010,
5099
  "Ġsignal": 5011,
5100
  "Ġillust": 5012,
5101
  "ipping": 5013,
 
18528
  "Ġchan ged",
18529
  "Ġcontro l",
18530
  "Ġs ense",
 
18531
  "as ure",
18532
+ "it ch",
18533
  "Ġfe bruary",
18534
  "Ġ3 0",
18535
  "Ġdad dy",
 
19041
  "Ġm ir",
19042
  "Ġf em",
19043
  "Ġbec om",
 
19044
  "em pt",
19045
  "a f",
19046
+ "Ġ q",
19047
  "ri age",
19048
  "Ġbr own",
19049
  "od e",
 
19540
  "Ġb ath",
19541
  "Ġl ength",
19542
  "Ġtr ade",
 
19543
  "Ġkn ock",
19544
  "ast ic",
19545
  "Ġjo ined",
19546
+ "bo ard",
19547
  "b a",
19548
  "ter day",
19549
  "Ġco ffee",
 
21211
  "Ġcom ment",
21212
  "Ġtr ave",
21213
  "Ġdis play",
 
21214
  "Ġass um",
21215
  "Ġadv ice",
21216
  "Ġstep s",
 
21220
  "Ġk im",
21221
  "im um",
21222
  "Ġrec om",
21223
+ "Ġsw itch",
21224
  "Ġsign al",
21225
  "Ġill ust",
21226
  "ipp ing",
tokenizer_config.json CHANGED
@@ -13,7 +13,7 @@
13
  "single_word": false
14
  },
15
  "model_max_length": 1000000000000000019884624838656,
16
- "name_or_path": "CamBabyTrainers/BabyBERTa-3-8192-tokenizer",
17
  "pad_token": "<pad>",
18
  "sep_token": "</s>",
19
  "special_tokens_map_file": null,
 
13
  "single_word": false
14
  },
15
  "model_max_length": 1000000000000000019884624838656,
16
+ "name_or_path": "CamBabyTrainers/CamBabyTokenizer-8192",
17
  "pad_token": "<pad>",
18
  "sep_token": "</s>",
19
  "special_tokens_map_file": null,
vocab.json CHANGED
The diff for this file is too large to render. See raw diff