new tokenizer 38400
Browse files
- merges.txt +0 -0
- scripts/model.yaml +10 -10
- scripts/prepare_pretrain_dataset.py +85 -50
- scripts/train_tokenizer.py +15 -6
- tokenizer.json +0 -0
- tokenizer_config.json +0 -384
- vocab.json +0 -0
merges.txt
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
scripts/model.yaml
CHANGED
@@ -5,13 +5,13 @@ model_name: "tiny-llama-1.1b"
|
|
5 |
# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
|
6 |
# ``model_name``. (type: Optional[Config], default: null)
|
7 |
model_config:
|
8 |
-
padded_vocab_size:
|
9 |
-
vocab_size:
|
10 |
block_size: 131072
|
11 |
-
n_layer:
|
12 |
n_head: 32
|
13 |
head_size: null
|
14 |
-
n_embd:
|
15 |
n_query_groups: 8
|
16 |
rotary_percentage: 1.0
|
17 |
parallel_residual: false
|
@@ -19,7 +19,7 @@ model_config:
|
|
19 |
norm_class_name: "RMSNorm"
|
20 |
norm_eps: 1e-05
|
21 |
mlp_class_name: "LLaMAMLP"
|
22 |
-
intermediate_size:
|
23 |
rope_base: 1000000
|
24 |
|
25 |
# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
|
@@ -52,7 +52,7 @@ data:
|
|
52 |
# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
|
53 |
train:
|
54 |
# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
|
55 |
-
save_interval:
|
56 |
|
57 |
# Number of iterations between logging calls (type: int, default: 1)
|
58 |
log_interval: 1
|
@@ -61,7 +61,7 @@ train:
|
|
61 |
global_batch_size: 512
|
62 |
|
63 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
64 |
-
micro_batch_size:
|
65 |
|
66 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
67 |
lr_warmup_steps: 2000
|
@@ -71,13 +71,13 @@ train:
|
|
71 |
|
72 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
73 |
# max_tokens: 3000000000000
|
74 |
-
max_tokens:
|
75 |
|
76 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
77 |
max_steps:
|
78 |
|
79 |
# Limits the length of samples. Off by default (type: Optional[int], default: null)
|
80 |
-
max_seq_length:
|
81 |
|
82 |
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
83 |
tie_embeddings:
|
@@ -86,7 +86,7 @@ train:
|
|
86 |
max_norm: 1.0
|
87 |
|
88 |
# (type: float, default: 4e-05)
|
89 |
-
min_lr:
|
90 |
|
91 |
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
|
92 |
eval:
|
|
|
5 |
# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
|
6 |
# ``model_name``. (type: Optional[Config], default: null)
|
7 |
model_config:
|
8 |
+
padded_vocab_size: 38400
|
9 |
+
vocab_size: 38400
|
10 |
block_size: 131072
|
11 |
+
n_layer: 5
|
12 |
n_head: 32
|
13 |
head_size: null
|
14 |
+
n_embd: 1024
|
15 |
n_query_groups: 8
|
16 |
rotary_percentage: 1.0
|
17 |
parallel_residual: false
|
|
|
19 |
norm_class_name: "RMSNorm"
|
20 |
norm_eps: 1e-05
|
21 |
mlp_class_name: "LLaMAMLP"
|
22 |
+
intermediate_size: 3584
|
23 |
rope_base: 1000000
|
24 |
|
25 |
# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
|
|
|
52 |
# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
|
53 |
train:
|
54 |
# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
|
55 |
+
save_interval: 100
|
56 |
|
57 |
# Number of iterations between logging calls (type: int, default: 1)
|
58 |
log_interval: 1
|
|
|
61 |
global_batch_size: 512
|
62 |
|
63 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
64 |
+
micro_batch_size: 8
|
65 |
|
66 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
67 |
lr_warmup_steps: 2000
|
|
|
71 |
|
72 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
73 |
# max_tokens: 3000000000000
|
74 |
+
max_tokens: ??? # ? * 8193 * 3
|
75 |
|
76 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
77 |
max_steps:
|
78 |
|
79 |
# Limits the length of samples. Off by default (type: Optional[int], default: null)
|
80 |
+
max_seq_length: 4096
|
81 |
|
82 |
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
83 |
tie_embeddings:
|
|
|
86 |
max_norm: 1.0
|
87 |
|
88 |
# (type: float, default: 4e-05)
|
89 |
+
min_lr: 1e-4
|
90 |
|
91 |
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
|
92 |
eval:
|
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -7,11 +7,43 @@ from functools import partial
|
|
7 |
|
8 |
|
9 |
def batch_iterator(name=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
# text
|
11 |
if name in (None, 'xu-song/cc100-samples'):
|
12 |
dataset = (
|
13 |
load_dataset(name, lang, split='train')
|
14 |
-
for lang in [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
)
|
16 |
|
17 |
for d in dataset:
|
@@ -21,19 +53,48 @@ def batch_iterator(name=None):
|
|
21 |
del dataset
|
22 |
gc.collect()
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
# code
|
25 |
if name in (None, 'bigcode/the-stack-smol-xs'):
|
26 |
dataset = (
|
27 |
load_dataset(name, lang, split='train', trust_remote_code=True)
|
28 |
for lang in [
|
29 |
-
'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
|
30 |
-
'
|
31 |
-
'
|
32 |
-
'
|
33 |
-
'
|
34 |
-
'
|
35 |
-
'
|
36 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
]
|
38 |
)
|
39 |
|
@@ -44,17 +105,17 @@ def batch_iterator(name=None):
|
|
44 |
del dataset
|
45 |
gc.collect()
|
46 |
|
47 |
-
#
|
48 |
-
if name in (None, 'nampdn-ai/tiny-
|
49 |
dataset = load_dataset(name, split='train')
|
50 |
-
|
51 |
for row in dataset:
|
52 |
-
yield row['
|
53 |
-
|
54 |
del dataset
|
55 |
gc.collect()
|
56 |
|
57 |
-
# code
|
58 |
if name in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
|
59 |
dataset = load_dataset(name, split='train')
|
60 |
|
@@ -64,12 +125,12 @@ def batch_iterator(name=None):
|
|
64 |
del dataset
|
65 |
gc.collect()
|
66 |
|
67 |
-
#
|
68 |
-
if name in (None, '
|
69 |
dataset = load_dataset(name, split='train')
|
70 |
|
71 |
for row in dataset:
|
72 |
-
yield row['
|
73 |
|
74 |
del dataset
|
75 |
gc.collect()
|
@@ -114,29 +175,6 @@ def batch_iterator(name=None):
|
|
114 |
del dataset
|
115 |
gc.collect()
|
116 |
|
117 |
-
# instructions
|
118 |
-
alpaca_datasets_names = [
|
119 |
-
'saillab/alpaca-english-cleaned',
|
120 |
-
'saillab/alpaca-serbian-cleaned',
|
121 |
-
'saillab/alpaca-croatian-cleaned',
|
122 |
-
'saillab/alpaca-bosnian-cleaned',
|
123 |
-
'saillab/alpaca-macedonian-cleaned',
|
124 |
-
'saillab/alpaca-slovenian-cleaned',
|
125 |
-
]
|
126 |
-
|
127 |
-
if name in (None, *alpaca_datasets_names):
|
128 |
-
for split in ['train', 'test']:
|
129 |
-
dataset = load_dataset(name, split=split)
|
130 |
-
|
131 |
-
for row in dataset:
|
132 |
-
if row['input'] in (None, '', 'nan'):
|
133 |
-
yield row['instruction'] + '\n' + row['output']
|
134 |
-
else:
|
135 |
-
yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
|
136 |
-
|
137 |
-
del dataset
|
138 |
-
gc.collect()
|
139 |
-
|
140 |
|
141 |
def tokenize_fn(dataset_name, tokenizer=None):
|
142 |
for text in batch_iterator(dataset_name):
|
@@ -145,21 +183,18 @@ def tokenize_fn(dataset_name, tokenizer=None):
|
|
145 |
|
146 |
|
147 |
datasets_names = [
|
|
|
148 |
'xu-song/cc100-samples',
|
149 |
-
'
|
150 |
'nampdn-ai/tiny-textbooks',
|
151 |
-
'
|
152 |
'nampdn-ai/tiny-codes',
|
|
|
|
|
153 |
'ajibawa-2023/Maths-College',
|
154 |
'microsoft/orca-math-word-problems-200k',
|
155 |
'datatab/orca_math_world_problem_200k_serbian',
|
156 |
'badrex/llm-emoji-dataset',
|
157 |
-
'saillab/alpaca-english-cleaned',
|
158 |
-
'saillab/alpaca-serbian-cleaned',
|
159 |
-
'saillab/alpaca-croatian-cleaned',
|
160 |
-
'saillab/alpaca-bosnian-cleaned',
|
161 |
-
'saillab/alpaca-macedonian-cleaned',
|
162 |
-
'saillab/alpaca-slovenian-cleaned',
|
163 |
]
|
164 |
|
165 |
outputs = optimize(
|
@@ -167,6 +202,6 @@ outputs = optimize(
|
|
167 |
inputs=datasets_names,
|
168 |
output_dir='../data/',
|
169 |
# Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
|
170 |
-
chunk_size=(
|
171 |
num_workers=16,
|
172 |
)
|
|
|
7 |
|
8 |
|
9 |
def batch_iterator(name=None):
|
10 |
+
# text
|
11 |
+
if name in (None, 'saillab/taco-datasets'):
|
12 |
+
dataset = (
|
13 |
+
load_dataset(name, data_dir=data_dir, split='train')
|
14 |
+
for data_dir in [
|
15 |
+
'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
|
16 |
+
'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
|
17 |
+
]
|
18 |
+
)
|
19 |
+
|
20 |
+
for d in dataset:
|
21 |
+
for row in d:
|
22 |
+
for n in row:
|
23 |
+
yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
|
24 |
+
|
25 |
+
del dataset
|
26 |
+
gc.collect()
|
27 |
+
|
28 |
# text
|
29 |
if name in (None, 'xu-song/cc100-samples'):
|
30 |
dataset = (
|
31 |
load_dataset(name, lang, split='train')
|
32 |
+
for lang in [
|
33 |
+
'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
|
34 |
+
'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
|
35 |
+
'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
|
36 |
+
'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
|
37 |
+
'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
|
38 |
+
'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
|
39 |
+
'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
|
40 |
+
'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
|
41 |
+
'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
|
42 |
+
'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
|
43 |
+
'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
|
44 |
+
'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
|
45 |
+
'zh-Hans', 'zh-Hant', 'zu',
|
46 |
+
]
|
47 |
)
|
48 |
|
49 |
for d in dataset:
|
|
|
53 |
del dataset
|
54 |
gc.collect()
|
55 |
|
56 |
+
# text
|
57 |
+
if name in (None, 'ontocord/fineweb-permissive-multilingual-2m'):
|
58 |
+
dataset = load_dataset(name, split='train')
|
59 |
+
|
60 |
+
for row in dataset:
|
61 |
+
yield row['text']
|
62 |
+
|
63 |
+
del dataset
|
64 |
+
gc.collect()
|
65 |
+
|
66 |
+
# text
|
67 |
+
if name in (None, 'nampdn-ai/tiny-textbooks'):
|
68 |
+
for split in ['train', 'test']:
|
69 |
+
dataset = load_dataset(name, split=split)
|
70 |
+
|
71 |
+
for row in dataset:
|
72 |
+
yield row['textbook']
|
73 |
+
|
74 |
+
del dataset
|
75 |
+
gc.collect()
|
76 |
+
|
77 |
# code
|
78 |
if name in (None, 'bigcode/the-stack-smol-xs'):
|
79 |
dataset = (
|
80 |
load_dataset(name, lang, split='train', trust_remote_code=True)
|
81 |
for lang in [
|
82 |
+
'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
|
83 |
+
'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
|
84 |
+
'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
|
85 |
+
'css', 'cuda', 'dart', 'dockerfile', 'elixir',
|
86 |
+
'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
|
87 |
+
'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
|
88 |
+
'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
|
89 |
+
'literate-agda', 'literate-coffeescript', 'literate-haskell',
|
90 |
+
'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
|
91 |
+
'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
|
92 |
+
'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
|
93 |
+
'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
|
94 |
+
'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
|
95 |
+
'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
|
96 |
+
'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
|
97 |
+
'yacc', 'zig',
|
98 |
]
|
99 |
)
|
100 |
|
|
|
105 |
del dataset
|
106 |
gc.collect()
|
107 |
|
108 |
+
# code
|
109 |
+
if name in (None, 'nampdn-ai/tiny-codes'):
|
110 |
dataset = load_dataset(name, split='train')
|
111 |
+
|
112 |
for row in dataset:
|
113 |
+
yield row['prompt'] + '\n' + row['response']
|
114 |
+
|
115 |
del dataset
|
116 |
gc.collect()
|
117 |
|
118 |
+
# text + code
|
119 |
if name in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
|
120 |
dataset = load_dataset(name, split='train')
|
121 |
|
|
|
125 |
del dataset
|
126 |
gc.collect()
|
127 |
|
128 |
+
# math
|
129 |
+
if name in (None, 'gair-prox/open-web-math-pro'):
|
130 |
dataset = load_dataset(name, split='train')
|
131 |
|
132 |
for row in dataset:
|
133 |
+
yield row['text']
|
134 |
|
135 |
del dataset
|
136 |
gc.collect()
|
|
|
175 |
del dataset
|
176 |
gc.collect()
|
177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
|
179 |
def tokenize_fn(dataset_name, tokenizer=None):
|
180 |
for text in batch_iterator(dataset_name):
|
|
|
183 |
|
184 |
|
185 |
datasets_names = [
|
186 |
+
'saillab/taco-datasets',
|
187 |
'xu-song/cc100-samples',
|
188 |
+
'ontocord/fineweb-permissive-multilingual-2m',
|
189 |
'nampdn-ai/tiny-textbooks',
|
190 |
+
'bigcode/the-stack-smol-xs',
|
191 |
'nampdn-ai/tiny-codes',
|
192 |
+
'm-a-p/CodeFeedback-Filtered-Instruction',
|
193 |
+
'gair-prox/open-web-math-pro',
|
194 |
'ajibawa-2023/Maths-College',
|
195 |
'microsoft/orca-math-word-problems-200k',
|
196 |
'datatab/orca_math_world_problem_200k_serbian',
|
197 |
'badrex/llm-emoji-dataset',
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
]
|
199 |
|
200 |
outputs = optimize(
|
|
|
202 |
inputs=datasets_names,
|
203 |
output_dir='../data/',
|
204 |
# Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
|
205 |
+
chunk_size=(4097 * 4006),
|
206 |
num_workers=16,
|
207 |
)
|
scripts/train_tokenizer.py
CHANGED
@@ -110,11 +110,11 @@ def batch_iterator():
|
|
110 |
gc.collect()
|
111 |
|
112 |
# math
|
113 |
-
dataset = load_dataset('
|
114 |
-
|
115 |
for row in dataset:
|
116 |
-
yield row['
|
117 |
-
|
118 |
del dataset
|
119 |
gc.collect()
|
120 |
|
@@ -127,6 +127,15 @@ def batch_iterator():
|
|
127 |
del dataset
|
128 |
gc.collect()
|
129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
# emoji
|
131 |
dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
|
132 |
|
@@ -206,7 +215,7 @@ special_tokens = [
|
|
206 |
for i in range(2, 25):
|
207 |
special_tokens.append(' ' * i)
|
208 |
|
209 |
-
for i in range(
|
210 |
special_tokens.append(f'<|reserved_{i}|>')
|
211 |
|
212 |
# emoji
|
@@ -235,7 +244,7 @@ tokenizer.post_processor = TemplateProcessing(
|
|
235 |
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
|
236 |
|
237 |
trainer = BpeTrainer(
|
238 |
-
vocab_size=
|
239 |
min_frequency=2,
|
240 |
special_tokens=special_tokens,
|
241 |
initial_alphabet=emoji_chars + programming_languages + code_keywords,
|
|
|
110 |
gc.collect()
|
111 |
|
112 |
# math
|
113 |
+
dataset = load_dataset('gair-prox/open-web-math-pro', split='train')
|
114 |
+
|
115 |
for row in dataset:
|
116 |
+
yield row['text']
|
117 |
+
|
118 |
del dataset
|
119 |
gc.collect()
|
120 |
|
|
|
127 |
del dataset
|
128 |
gc.collect()
|
129 |
|
130 |
+
# math
|
131 |
+
dataset = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
|
132 |
+
|
133 |
+
for row in dataset:
|
134 |
+
yield row['question'] + '\n' + row['answer']
|
135 |
+
|
136 |
+
del dataset
|
137 |
+
gc.collect()
|
138 |
+
|
139 |
# emoji
|
140 |
dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
|
141 |
|
|
|
215 |
for i in range(2, 25):
|
216 |
special_tokens.append(' ' * i)
|
217 |
|
218 |
+
for i in range(64 - len(special_tokens)):
|
219 |
special_tokens.append(f'<|reserved_{i}|>')
|
220 |
|
221 |
# emoji
|
|
|
244 |
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
|
245 |
|
246 |
trainer = BpeTrainer(
|
247 |
+
vocab_size=38400, # 32768 chars + 5034 emojis
|
248 |
min_frequency=2,
|
249 |
special_tokens=special_tokens,
|
250 |
initial_alphabet=emoji_chars + programming_languages + code_keywords,
|
tokenizer.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
CHANGED
@@ -639,390 +639,6 @@
|
|
639 |
"rstrip": false,
|
640 |
"single_word": false,
|
641 |
"special": true
|
642 |
-
},
|
643 |
-
"80": {
|
644 |
-
"content": "<|reserved_0|>",
|
645 |
-
"lstrip": false,
|
646 |
-
"normalized": false,
|
647 |
-
"rstrip": false,
|
648 |
-
"single_word": false,
|
649 |
-
"special": true
|
650 |
-
},
|
651 |
-
"81": {
|
652 |
-
"content": "<|reserved_1|>",
|
653 |
-
"lstrip": false,
|
654 |
-
"normalized": false,
|
655 |
-
"rstrip": false,
|
656 |
-
"single_word": false,
|
657 |
-
"special": true
|
658 |
-
},
|
659 |
-
"82": {
|
660 |
-
"content": "<|reserved_2|>",
|
661 |
-
"lstrip": false,
|
662 |
-
"normalized": false,
|
663 |
-
"rstrip": false,
|
664 |
-
"single_word": false,
|
665 |
-
"special": true
|
666 |
-
},
|
667 |
-
"83": {
|
668 |
-
"content": "<|reserved_3|>",
|
669 |
-
"lstrip": false,
|
670 |
-
"normalized": false,
|
671 |
-
"rstrip": false,
|
672 |
-
"single_word": false,
|
673 |
-
"special": true
|
674 |
-
},
|
675 |
-
"84": {
|
676 |
-
"content": "<|reserved_4|>",
|
677 |
-
"lstrip": false,
|
678 |
-
"normalized": false,
|
679 |
-
"rstrip": false,
|
680 |
-
"single_word": false,
|
681 |
-
"special": true
|
682 |
-
},
|
683 |
-
"85": {
|
684 |
-
"content": "<|reserved_5|>",
|
685 |
-
"lstrip": false,
|
686 |
-
"normalized": false,
|
687 |
-
"rstrip": false,
|
688 |
-
"single_word": false,
|
689 |
-
"special": true
|
690 |
-
},
|
691 |
-
"86": {
|
692 |
-
"content": "<|reserved_6|>",
|
693 |
-
"lstrip": false,
|
694 |
-
"normalized": false,
|
695 |
-
"rstrip": false,
|
696 |
-
"single_word": false,
|
697 |
-
"special": true
|
698 |
-
},
|
699 |
-
"87": {
|
700 |
-
"content": "<|reserved_7|>",
|
701 |
-
"lstrip": false,
|
702 |
-
"normalized": false,
|
703 |
-
"rstrip": false,
|
704 |
-
"single_word": false,
|
705 |
-
"special": true
|
706 |
-
},
|
707 |
-
"88": {
|
708 |
-
"content": "<|reserved_8|>",
|
709 |
-
"lstrip": false,
|
710 |
-
"normalized": false,
|
711 |
-
"rstrip": false,
|
712 |
-
"single_word": false,
|
713 |
-
"special": true
|
714 |
-
},
|
715 |
-
"89": {
|
716 |
-
"content": "<|reserved_9|>",
|
717 |
-
"lstrip": false,
|
718 |
-
"normalized": false,
|
719 |
-
"rstrip": false,
|
720 |
-
"single_word": false,
|
721 |
-
"special": true
|
722 |
-
},
|
723 |
-
"90": {
|
724 |
-
"content": "<|reserved_10|>",
|
725 |
-
"lstrip": false,
|
726 |
-
"normalized": false,
|
727 |
-
"rstrip": false,
|
728 |
-
"single_word": false,
|
729 |
-
"special": true
|
730 |
-
},
|
731 |
-
"91": {
|
732 |
-
"content": "<|reserved_11|>",
|
733 |
-
"lstrip": false,
|
734 |
-
"normalized": false,
|
735 |
-
"rstrip": false,
|
736 |
-
"single_word": false,
|
737 |
-
"special": true
|
738 |
-
},
|
739 |
-
"92": {
|
740 |
-
"content": "<|reserved_12|>",
|
741 |
-
"lstrip": false,
|
742 |
-
"normalized": false,
|
743 |
-
"rstrip": false,
|
744 |
-
"single_word": false,
|
745 |
-
"special": true
|
746 |
-
},
|
747 |
-
"93": {
|
748 |
-
"content": "<|reserved_13|>",
|
749 |
-
"lstrip": false,
|
750 |
-
"normalized": false,
|
751 |
-
"rstrip": false,
|
752 |
-
"single_word": false,
|
753 |
-
"special": true
|
754 |
-
},
|
755 |
-
"94": {
|
756 |
-
"content": "<|reserved_14|>",
|
757 |
-
"lstrip": false,
|
758 |
-
"normalized": false,
|
759 |
-
"rstrip": false,
|
760 |
-
"single_word": false,
|
761 |
-
"special": true
|
762 |
-
},
|
763 |
-
"95": {
|
764 |
-
"content": "<|reserved_15|>",
|
765 |
-
"lstrip": false,
|
766 |
-
"normalized": false,
|
767 |
-
"rstrip": false,
|
768 |
-
"single_word": false,
|
769 |
-
"special": true
|
770 |
-
},
|
771 |
-
"96": {
|
772 |
-
"content": "<|reserved_16|>",
|
773 |
-
"lstrip": false,
|
774 |
-
"normalized": false,
|
775 |
-
"rstrip": false,
|
776 |
-
"single_word": false,
|
777 |
-
"special": true
|
778 |
-
},
|
779 |
-
"97": {
|
780 |
-
"content": "<|reserved_17|>",
|
781 |
-
"lstrip": false,
|
782 |
-
"normalized": false,
|
783 |
-
"rstrip": false,
|
784 |
-
"single_word": false,
|
785 |
-
"special": true
|
786 |
-
},
|
787 |
-
"98": {
|
788 |
-
"content": "<|reserved_18|>",
|
789 |
-
"lstrip": false,
|
790 |
-
"normalized": false,
|
791 |
-
"rstrip": false,
|
792 |
-
"single_word": false,
|
793 |
-
"special": true
|
794 |
-
},
|
795 |
-
"99": {
|
796 |
-
"content": "<|reserved_19|>",
|
797 |
-
"lstrip": false,
|
798 |
-
"normalized": false,
|
799 |
-
"rstrip": false,
|
800 |
-
"single_word": false,
|
801 |
-
"special": true
|
802 |
-
},
|
803 |
-
"100": {
|
804 |
-
"content": "<|reserved_20|>",
|
805 |
-
"lstrip": false,
|
806 |
-
"normalized": false,
|
807 |
-
"rstrip": false,
|
808 |
-
"single_word": false,
|
809 |
-
"special": true
|
810 |
-
},
|
811 |
-
"101": {
|
812 |
-
"content": "<|reserved_21|>",
|
813 |
-
"lstrip": false,
|
814 |
-
"normalized": false,
|
815 |
-
"rstrip": false,
|
816 |
-
"single_word": false,
|
817 |
-
"special": true
|
818 |
-
},
|
819 |
-
"102": {
|
820 |
-
"content": "<|reserved_22|>",
|
821 |
-
"lstrip": false,
|
822 |
-
"normalized": false,
|
823 |
-
"rstrip": false,
|
824 |
-
"single_word": false,
|
825 |
-
"special": true
|
826 |
-
},
|
827 |
-
"103": {
|
828 |
-
"content": "<|reserved_23|>",
|
829 |
-
"lstrip": false,
|
830 |
-
"normalized": false,
|
831 |
-
"rstrip": false,
|
832 |
-
"single_word": false,
|
833 |
-
"special": true
|
834 |
-
},
|
835 |
-
"104": {
|
836 |
-
"content": "<|reserved_24|>",
|
837 |
-
"lstrip": false,
|
838 |
-
"normalized": false,
|
839 |
-
"rstrip": false,
|
840 |
-
"single_word": false,
|
841 |
-
"special": true
|
842 |
-
},
|
843 |
-
"105": {
|
844 |
-
"content": "<|reserved_25|>",
|
845 |
-
"lstrip": false,
|
846 |
-
"normalized": false,
|
847 |
-
"rstrip": false,
|
848 |
-
"single_word": false,
|
849 |
-
"special": true
|
850 |
-
},
|
851 |
-
"106": {
|
852 |
-
"content": "<|reserved_26|>",
|
853 |
-
"lstrip": false,
|
854 |
-
"normalized": false,
|
855 |
-
"rstrip": false,
|
856 |
-
"single_word": false,
|
857 |
-
"special": true
|
858 |
-
},
|
859 |
-
"107": {
|
860 |
-
"content": "<|reserved_27|>",
|
861 |
-
"lstrip": false,
|
862 |
-
"normalized": false,
|
863 |
-
"rstrip": false,
|
864 |
-
"single_word": false,
|
865 |
-
"special": true
|
866 |
-
},
|
867 |
-
"108": {
|
868 |
-
"content": "<|reserved_28|>",
|
869 |
-
"lstrip": false,
|
870 |
-
"normalized": false,
|
871 |
-
"rstrip": false,
|
872 |
-
"single_word": false,
|
873 |
-
"special": true
|
874 |
-
},
|
875 |
-
"109": {
|
876 |
-
"content": "<|reserved_29|>",
|
877 |
-
"lstrip": false,
|
878 |
-
"normalized": false,
|
879 |
-
"rstrip": false,
|
880 |
-
"single_word": false,
|
881 |
-
"special": true
|
882 |
-
},
|
883 |
-
"110": {
|
884 |
-
"content": "<|reserved_30|>",
|
885 |
-
"lstrip": false,
|
886 |
-
"normalized": false,
|
887 |
-
"rstrip": false,
|
888 |
-
"single_word": false,
|
889 |
-
"special": true
|
890 |
-
},
|
891 |
-
"111": {
|
892 |
-
"content": "<|reserved_31|>",
|
893 |
-
"lstrip": false,
|
894 |
-
"normalized": false,
|
895 |
-
"rstrip": false,
|
896 |
-
"single_word": false,
|
897 |
-
"special": true
|
898 |
-
},
|
899 |
-
"112": {
|
900 |
-
"content": "<|reserved_32|>",
|
901 |
-
"lstrip": false,
|
902 |
-
"normalized": false,
|
903 |
-
"rstrip": false,
|
904 |
-
"single_word": false,
|
905 |
-
"special": true
|
906 |
-
},
|
907 |
-
"113": {
|
908 |
-
"content": "<|reserved_33|>",
|
909 |
-
"lstrip": false,
|
910 |
-
"normalized": false,
|
911 |
-
"rstrip": false,
|
912 |
-
"single_word": false,
|
913 |
-
"special": true
|
914 |
-
},
|
915 |
-
"114": {
|
916 |
-
"content": "<|reserved_34|>",
|
917 |
-
"lstrip": false,
|
918 |
-
"normalized": false,
|
919 |
-
"rstrip": false,
|
920 |
-
"single_word": false,
|
921 |
-
"special": true
|
922 |
-
},
|
923 |
-
"115": {
|
924 |
-
"content": "<|reserved_35|>",
|
925 |
-
"lstrip": false,
|
926 |
-
"normalized": false,
|
927 |
-
"rstrip": false,
|
928 |
-
"single_word": false,
|
929 |
-
"special": true
|
930 |
-
},
|
931 |
-
"116": {
|
932 |
-
"content": "<|reserved_36|>",
|
933 |
-
"lstrip": false,
|
934 |
-
"normalized": false,
|
935 |
-
"rstrip": false,
|
936 |
-
"single_word": false,
|
937 |
-
"special": true
|
938 |
-
},
|
939 |
-
"117": {
|
940 |
-
"content": "<|reserved_37|>",
|
941 |
-
"lstrip": false,
|
942 |
-
"normalized": false,
|
943 |
-
"rstrip": false,
|
944 |
-
"single_word": false,
|
945 |
-
"special": true
|
946 |
-
},
|
947 |
-
"118": {
|
948 |
-
"content": "<|reserved_38|>",
|
949 |
-
"lstrip": false,
|
950 |
-
"normalized": false,
|
951 |
-
"rstrip": false,
|
952 |
-
"single_word": false,
|
953 |
-
"special": true
|
954 |
-
},
|
955 |
-
"119": {
|
956 |
-
"content": "<|reserved_39|>",
|
957 |
-
"lstrip": false,
|
958 |
-
"normalized": false,
|
959 |
-
"rstrip": false,
|
960 |
-
"single_word": false,
|
961 |
-
"special": true
|
962 |
-
},
|
963 |
-
"120": {
|
964 |
-
"content": "<|reserved_40|>",
|
965 |
-
"lstrip": false,
|
966 |
-
"normalized": false,
|
967 |
-
"rstrip": false,
|
968 |
-
"single_word": false,
|
969 |
-
"special": true
|
970 |
-
},
|
971 |
-
"121": {
|
972 |
-
"content": "<|reserved_41|>",
|
973 |
-
"lstrip": false,
|
974 |
-
"normalized": false,
|
975 |
-
"rstrip": false,
|
976 |
-
"single_word": false,
|
977 |
-
"special": true
|
978 |
-
},
|
979 |
-
"122": {
|
980 |
-
"content": "<|reserved_42|>",
|
981 |
-
"lstrip": false,
|
982 |
-
"normalized": false,
|
983 |
-
"rstrip": false,
|
984 |
-
"single_word": false,
|
985 |
-
"special": true
|
986 |
-
},
|
987 |
-
"123": {
|
988 |
-
"content": "<|reserved_43|>",
|
989 |
-
"lstrip": false,
|
990 |
-
"normalized": false,
|
991 |
-
"rstrip": false,
|
992 |
-
"single_word": false,
|
993 |
-
"special": true
|
994 |
-
},
|
995 |
-
"124": {
|
996 |
-
"content": "<|reserved_44|>",
|
997 |
-
"lstrip": false,
|
998 |
-
"normalized": false,
|
999 |
-
"rstrip": false,
|
1000 |
-
"single_word": false,
|
1001 |
-
"special": true
|
1002 |
-
},
|
1003 |
-
"125": {
|
1004 |
-
"content": "<|reserved_45|>",
|
1005 |
-
"lstrip": false,
|
1006 |
-
"normalized": false,
|
1007 |
-
"rstrip": false,
|
1008 |
-
"single_word": false,
|
1009 |
-
"special": true
|
1010 |
-
},
|
1011 |
-
"126": {
|
1012 |
-
"content": "<|reserved_46|>",
|
1013 |
-
"lstrip": false,
|
1014 |
-
"normalized": false,
|
1015 |
-
"rstrip": false,
|
1016 |
-
"single_word": false,
|
1017 |
-
"special": true
|
1018 |
-
},
|
1019 |
-
"127": {
|
1020 |
-
"content": "<|reserved_47|>",
|
1021 |
-
"lstrip": false,
|
1022 |
-
"normalized": false,
|
1023 |
-
"rstrip": false,
|
1024 |
-
"single_word": false,
|
1025 |
-
"special": true
|
1026 |
}
|
1027 |
},
|
1028 |
"bos_token": "<s>",
|
|
|
639 |
"rstrip": false,
|
640 |
"single_word": false,
|
641 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
642 |
}
|
643 |
},
|
644 |
"bos_token": "<s>",
|
vocab.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|