prince-canuma committed on
Commit
7f296f1
1 Parent(s): d664188

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +3 -2
  2. tokenizer.json +61 -4
  3. tokenizer_config.json +14 -3
special_tokens_map.json CHANGED
@@ -7,10 +7,11 @@
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<|eot_id|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
- }
 
16
  }
 
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "<|end_of_text|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
+ },
16
+ "unk_token": "<unk>"
17
  }
tokenizer.json CHANGED
@@ -2306,6 +2306,15 @@
2306
  "rstrip": false,
2307
  "normalized": false,
2308
  "special": true
 
 
 
 
 
 
 
 
 
2309
  }
2310
  ],
2311
  "normalizer": null,
@@ -2329,10 +2338,58 @@
2329
  ]
2330
  },
2331
  "post_processor": {
2332
- "type": "ByteLevel",
2333
- "add_prefix_space": true,
2334
- "trim_offsets": false,
2335
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2336
  },
2337
  "decoder": {
2338
  "type": "ByteLevel",
 
2306
  "rstrip": false,
2307
  "normalized": false,
2308
  "special": true
2309
+ },
2310
+ {
2311
+ "id": 128256,
2312
+ "content": "<unk>",
2313
+ "single_word": false,
2314
+ "lstrip": false,
2315
+ "rstrip": false,
2316
+ "normalized": false,
2317
+ "special": true
2318
  }
2319
  ],
2320
  "normalizer": null,
 
2338
  ]
2339
  },
2340
  "post_processor": {
2341
+ "type": "TemplateProcessing",
2342
+ "single": [
2343
+ {
2344
+ "SpecialToken": {
2345
+ "id": "<|begin_of_text|>",
2346
+ "type_id": 0
2347
+ }
2348
+ },
2349
+ {
2350
+ "Sequence": {
2351
+ "id": "A",
2352
+ "type_id": 0
2353
+ }
2354
+ }
2355
+ ],
2356
+ "pair": [
2357
+ {
2358
+ "SpecialToken": {
2359
+ "id": "<|begin_of_text|>",
2360
+ "type_id": 0
2361
+ }
2362
+ },
2363
+ {
2364
+ "Sequence": {
2365
+ "id": "A",
2366
+ "type_id": 0
2367
+ }
2368
+ },
2369
+ {
2370
+ "SpecialToken": {
2371
+ "id": "<|begin_of_text|>",
2372
+ "type_id": 1
2373
+ }
2374
+ },
2375
+ {
2376
+ "Sequence": {
2377
+ "id": "B",
2378
+ "type_id": 1
2379
+ }
2380
+ }
2381
+ ],
2382
+ "special_tokens": {
2383
+ "<|begin_of_text|>": {
2384
+ "id": "<|begin_of_text|>",
2385
+ "ids": [
2386
+ 128000
2387
+ ],
2388
+ "tokens": [
2389
+ "<|begin_of_text|>"
2390
+ ]
2391
+ }
2392
+ }
2393
  },
2394
  "decoder": {
2395
  "type": "ByteLevel",
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
1
  {
 
 
2
  "added_tokens_decoder": {
3
  "128000": {
4
  "content": "<|begin_of_text|>",
@@ -2047,16 +2049,25 @@
2047
  "rstrip": false,
2048
  "single_word": false,
2049
  "special": true
 
 
 
 
 
 
 
 
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
- "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
2054
  "clean_up_tokenization_spaces": true,
2055
- "eos_token": "<|eot_id|>",
2056
  "model_input_names": [
2057
  "input_ids",
2058
  "attention_mask"
2059
  ],
2060
  "model_max_length": 1000000000000000019884624838656,
2061
- "tokenizer_class": "PreTrainedTokenizerFast"
 
 
2062
  }
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
  "added_tokens_decoder": {
5
  "128000": {
6
  "content": "<|begin_of_text|>",
 
2049
  "rstrip": false,
2050
  "single_word": false,
2051
  "special": true
2052
+ },
2053
+ "128256": {
2054
+ "content": "<unk>",
2055
+ "lstrip": false,
2056
+ "normalized": false,
2057
+ "rstrip": false,
2058
+ "single_word": false,
2059
+ "special": true
2060
  }
2061
  },
2062
  "bos_token": "<|begin_of_text|>",
 
2063
  "clean_up_tokenization_spaces": true,
2064
+ "eos_token": "<|end_of_text|>",
2065
  "model_input_names": [
2066
  "input_ids",
2067
  "attention_mask"
2068
  ],
2069
  "model_max_length": 1000000000000000019884624838656,
2070
+ "tokenizer_class": "LlamaTokenizer",
2071
+ "unk_token": "<unk>",
2072
+ "use_default_system_prompt": false
2073
  }