new tokenizer 38400

Browse files

Files changed (7) hide show

merges.txt +0 -0
scripts/model.yaml +10 -10
scripts/prepare_pretrain_dataset.py +85 -50
scripts/train_tokenizer.py +15 -6
tokenizer.json +0 -0
tokenizer_config.json +0 -384
vocab.json +0 -0

merges.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff

scripts/model.yaml CHANGED Viewed

@@ -5,13 +5,13 @@ model_name: "tiny-llama-1.1b"
 # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
 # ``model_config``. (type: Optional[Config], default: null)
 model_config:
-  padded_vocab_size: 32768
-  vocab_size: 32768
   block_size: 131072
-  n_layer: 10
   n_head: 32
   head_size: null
-  n_embd: 320
   n_query_groups: 8
   rotary_percentage: 1.0
   parallel_residual: false
@@ -19,7 +19,7 @@ model_config:
   norm_class_name: "RMSNorm"
   norm_eps: 1e-05
   mlp_class_name: "LLaMAMLP"
-  intermediate_size: 1120
   rope_base: 1000000
 # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
@@ -52,7 +52,7 @@ data:
 # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
 train:
   # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
-  save_interval: 1000
   # Number of iterations between logging calls (type: int, default: 1)
   log_interval: 1
@@ -61,7 +61,7 @@ train:
   global_batch_size: 512
   # Number of samples per data-parallel rank (type: int, default: 4)
-  micro_batch_size: 4
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
   lr_warmup_steps: 2000
@@ -71,13 +71,13 @@ train:
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  max_tokens: 8628998688 # 351072 * 8193 * 3
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
-  max_seq_length: 8192
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
   tie_embeddings:
@@ -86,7 +86,7 @@ train:
   max_norm: 1.0
   #   (type: float, default: 4e-05)
-  min_lr: 4e-05
 # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
 eval:

 # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
 # ``model_name``. (type: Optional[Config], default: null)
 model_config:
+  padded_vocab_size: 38400
+  vocab_size: 38400
   block_size: 131072
+  n_layer: 5
   n_head: 32
   head_size: null
+  n_embd: 1024
   n_query_groups: 8
   rotary_percentage: 1.0
   parallel_residual: false
   norm_class_name: "RMSNorm"
   norm_eps: 1e-05
   mlp_class_name: "LLaMAMLP"
+  intermediate_size: 3584
   rope_base: 1000000
 # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
 # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
 train:
   # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 100
   # Number of iterations between logging calls (type: int, default: 1)
   log_interval: 1
   global_batch_size: 512
   # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 8
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
   lr_warmup_steps: 2000
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
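+  max_tokens: ??? # TODO: fill in as <num_blocks> * 4097 * 3; the old factor 8193 went with max_seq_length 8192, which is now 4096 — confirm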
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 4096
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
   tie_embeddings:
   max_norm: 1.0
   #   (type: float, default: 4e-05)
+  min_lr: 1e-4
 # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
 eval:

scripts/prepare_pretrain_dataset.py CHANGED Viewed

@@ -7,11 +7,43 @@ from functools import partial
 def batch_iterator(name=None):
     # text
     if name in (None, 'xu-song/cc100-samples'):
         dataset = (
             load_dataset(name, lang, split='train')
-            for lang in ['am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw', 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom', 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur', 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zh-Hans', 'zh-Hant', 'zu']
         )
         for d in dataset:
@@ -21,19 +53,48 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
     # code
     if name in (None, 'bigcode/the-stack-smol-xs'):
         dataset = (
             load_dataset(name, lang, split='train', trust_remote_code=True)
             for lang in [
-                'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly', 'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
-                'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp', 'css', 'cuda', 'dart', 'dockerfile', 'elixir',
-                'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go', 'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
-                'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean', 'literate-agda', 'literate-coffeescript', 'literate-haskell',
-                'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab', 'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
-                'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext', 'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
-                'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan', 'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
-                'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt', 'yacc', 'zig'
             ]
         )
@@ -44,17 +105,17 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
-    # text
-    if name in (None, 'nampdn-ai/tiny-textbooks'):
         dataset = load_dataset(name, split='train')
         for row in dataset:
-            yield row['textbook']
         del dataset
         gc.collect()
-    # code
     if name in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
         dataset = load_dataset(name, split='train')
@@ -64,12 +125,12 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
-    # code
-    if name in (None, 'nampdn-ai/tiny-codes'):
         dataset = load_dataset(name, split='train')
         for row in dataset:
-            yield row['prompt'] + '\n' + row['response']
         del dataset
         gc.collect()
@@ -114,29 +175,6 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
-    # instructions
-    alpaca_datasets_names = [
-        'saillab/alpaca-english-cleaned',
-        'saillab/alpaca-serbian-cleaned',
-        'saillab/alpaca-croatian-cleaned',
-        'saillab/alpaca-bosnian-cleaned',
-        'saillab/alpaca-macedonian-cleaned',
-        'saillab/alpaca-slovenian-cleaned',
-    ]
-    if name in (None, *alpaca_datasets_names):
-        for split in ['train', 'test']:
-            dataset = load_dataset(name, split=split)
-            for row in dataset:
-                if row['input'] in (None, '', 'nan'):
-                    yield row['instruction'] + '\n' + row['output']
-                else:
-                    yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
-            del dataset
-            gc.collect()
 def tokenize_fn(dataset_name, tokenizer=None):
     for text in batch_iterator(dataset_name):
@@ -145,21 +183,18 @@ def tokenize_fn(dataset_name, tokenizer=None):
 datasets_names = [
     'xu-song/cc100-samples',
-    'bigcode/the-stack-smol-xs',
     'nampdn-ai/tiny-textbooks',
-    'm-a-p/CodeFeedback-Filtered-Instruction',
     'nampdn-ai/tiny-codes',
     'ajibawa-2023/Maths-College',
     'microsoft/orca-math-word-problems-200k',
     'datatab/orca_math_world_problem_200k_serbian',
     'badrex/llm-emoji-dataset',
-    'saillab/alpaca-english-cleaned',
-    'saillab/alpaca-serbian-cleaned',
-    'saillab/alpaca-croatian-cleaned',
-    'saillab/alpaca-bosnian-cleaned',
-    'saillab/alpaca-macedonian-cleaned',
-    'saillab/alpaca-slovenian-cleaned',
 ]
 outputs = optimize(
@@ -167,6 +202,6 @@ outputs = optimize(
     inputs=datasets_names,
     output_dir='../data/',
     # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
-    chunk_size=(8193 * 2003),
     num_workers=16,
 )

 def batch_iterator(name=None):
+    # text
+    if name in (None, 'saillab/taco-datasets'):
+        dataset = (
+            load_dataset(name, data_dir=data_dir, split='train')
+            for data_dir in [
+                'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+                'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+            ]
+        )
+        for d in dataset:
+            for row in d:
+                for n in row:
+                    yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
+        del dataset
+        gc.collect()
     # text
     if name in (None, 'xu-song/cc100-samples'):
         dataset = (
             load_dataset(name, lang, split='train')
+            for lang in [
+                'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+                'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+                'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+                'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+                'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+                'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+                'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+                'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+                'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+                'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+                'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+                'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+                'zh-Hans', 'zh-Hant', 'zu',
+            ]
         )
         for d in dataset:
         del dataset
         gc.collect()
+    # text
+    if name in (None, 'ontocord/fineweb-permissive-multilingual-2m'):
+        dataset = load_dataset(name, split='train')
+        for row in dataset:
+            yield row['text']
+        del dataset
+        gc.collect()
+    # text
+    if name in (None, 'nampdn-ai/tiny-textbooks'):
+        for split in ['train', 'test']:
+            dataset = load_dataset(name, split=split)
+            for row in dataset:
+                yield row['textbook']
+            del dataset
+            gc.collect()
     # code
     if name in (None, 'bigcode/the-stack-smol-xs'):
         dataset = (
             load_dataset(name, lang, split='train', trust_remote_code=True)
             for lang in [
+                'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
+                'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
+                'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
+                'css', 'cuda', 'dart', 'dockerfile', 'elixir',
+                'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
+                'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
+                'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
+                'literate-agda', 'literate-coffeescript', 'literate-haskell',
+                'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
+                'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
+                'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
+                'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
+                'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
+                'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
+                'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
+                'yacc', 'zig',
             ]
         )
         del dataset
         gc.collect()
+    # code
+    if name in (None, 'nampdn-ai/tiny-codes'):
         dataset = load_dataset(name, split='train')
         for row in dataset:
+            yield row['prompt'] + '\n' + row['response']
         del dataset
         gc.collect()
+    # text + code
     if name in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
         dataset = load_dataset(name, split='train')
         del dataset
         gc.collect()
+    # math
+    if name in (None, 'gair-prox/open-web-math-pro'):
         dataset = load_dataset(name, split='train')
         for row in dataset:
+            yield row['text']
         del dataset
         gc.collect()
         del dataset
         gc.collect()
 def tokenize_fn(dataset_name, tokenizer=None):
     for text in batch_iterator(dataset_name):
 datasets_names = [
+    'saillab/taco-datasets',
     'xu-song/cc100-samples',
+    'ontocord/fineweb-permissive-multilingual-2m',
     'nampdn-ai/tiny-textbooks',
+    'bigcode/the-stack-smol-xs',
     'nampdn-ai/tiny-codes',
+    'm-a-p/CodeFeedback-Filtered-Instruction',
+    'gair-prox/open-web-math-pro',
     'ajibawa-2023/Maths-College',
     'microsoft/orca-math-word-problems-200k',
     'datatab/orca_math_world_problem_200k_serbian',
     'badrex/llm-emoji-dataset',
 ]
 outputs = optimize(
     inputs=datasets_names,
     output_dir='../data/',
     # Number of tokens stored in each chunk; this is roughly 64MB of tokens.
+    chunk_size=(4097 * 4006),
     num_workers=16,
 )

scripts/train_tokenizer.py CHANGED Viewed

@@ -110,11 +110,11 @@ def batch_iterator():
     gc.collect()
     # math
-    dataset = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
     for row in dataset:
-        yield row['question'] + '\n' + row['answer']
     del dataset
     gc.collect()
@@ -127,6 +127,15 @@ def batch_iterator():
     del dataset
     gc.collect()
     # emoji
     dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
@@ -206,7 +215,7 @@ special_tokens = [
 for i in range(2, 25):
     special_tokens.append(' ' * i)
-for i in range(128 - len(special_tokens)):
     special_tokens.append(f'<|reserved_{i}|>')
 # emoji
@@ -235,7 +244,7 @@ tokenizer.post_processor = TemplateProcessing(
 tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
 trainer = BpeTrainer(
-    vocab_size=131072, # 2 ** 17, 128k
     min_frequency=2,
     special_tokens=special_tokens,
     initial_alphabet=emoji_chars + programming_languages + code_keywords,

     gc.collect()
     # math
+    dataset = load_dataset('gair-prox/open-web-math-pro', split='train')
     for row in dataset:
+        yield row['text']
     del dataset
     gc.collect()
     del dataset
     gc.collect()
+    # math
+    dataset = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
+    for row in dataset:
+        yield row['question'] + '\n' + row['answer']
+    del dataset
+    gc.collect()
     # emoji
     dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
 for i in range(2, 25):
     special_tokens.append(' ' * i)
+for i in range(64 - len(special_tokens)):
     special_tokens.append(f'<|reserved_{i}|>')
 # emoji
 tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
 trainer = BpeTrainer(
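+    vocab_size=38400, # NOTE(review): was annotated "32768 chars + 5034 emojis", but 32768 + 5034 = 37802, not 38400 — confirm the intended breakdown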
     min_frequency=2,
     special_tokens=special_tokens,
     initial_alphabet=emoji_chars + programming_languages + code_keywords,

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json CHANGED Viewed

@@ -639,390 +639,6 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "80": {
-      "content": "<|reserved_0|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "81": {
-      "content": "<|reserved_1|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "82": {
-      "content": "<|reserved_2|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "83": {
-      "content": "<|reserved_3|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "84": {
-      "content": "<|reserved_4|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "85": {
-      "content": "<|reserved_5|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "86": {
-      "content": "<|reserved_6|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "87": {
-      "content": "<|reserved_7|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "88": {
-      "content": "<|reserved_8|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "89": {
-      "content": "<|reserved_9|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "90": {
-      "content": "<|reserved_10|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "91": {
-      "content": "<|reserved_11|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "92": {
-      "content": "<|reserved_12|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "93": {
-      "content": "<|reserved_13|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "94": {
-      "content": "<|reserved_14|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "95": {
-      "content": "<|reserved_15|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "96": {
-      "content": "<|reserved_16|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "97": {
-      "content": "<|reserved_17|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "98": {
-      "content": "<|reserved_18|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "99": {
-      "content": "<|reserved_19|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "100": {
-      "content": "<|reserved_20|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "101": {
-      "content": "<|reserved_21|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "102": {
-      "content": "<|reserved_22|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "103": {
-      "content": "<|reserved_23|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "104": {
-      "content": "<|reserved_24|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "105": {
-      "content": "<|reserved_25|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "106": {
-      "content": "<|reserved_26|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "107": {
-      "content": "<|reserved_27|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "108": {
-      "content": "<|reserved_28|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "109": {
-      "content": "<|reserved_29|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "110": {
-      "content": "<|reserved_30|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "111": {
-      "content": "<|reserved_31|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "112": {
-      "content": "<|reserved_32|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "113": {
-      "content": "<|reserved_33|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "114": {
-      "content": "<|reserved_34|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "115": {
-      "content": "<|reserved_35|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "116": {
-      "content": "<|reserved_36|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "117": {
-      "content": "<|reserved_37|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "118": {
-      "content": "<|reserved_38|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "119": {
-      "content": "<|reserved_39|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "120": {
-      "content": "<|reserved_40|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "121": {
-      "content": "<|reserved_41|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "122": {
-      "content": "<|reserved_42|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "123": {
-      "content": "<|reserved_43|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "124": {
-      "content": "<|reserved_44|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "125": {
-      "content": "<|reserved_45|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "126": {
-      "content": "<|reserved_46|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "127": {
-      "content": "<|reserved_47|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
   "bos_token": "<s>",

       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
   "bos_token": "<s>",

vocab.json CHANGED Viewed

The diff for this file is too large to render. See raw diff