ruggsea committed
Commit 4146ec5
1 Parent(s): 21ce69d

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json  +19 -21
  2. tokenizer.json           +10 -1
  3. tokenizer_config.json    +17 -4
special_tokens_map.json CHANGED
@@ -1,23 +1,21 @@
 {
-  "bos_token": {
-    "content": "<|begin_of_text|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|end_of_text|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "<pad>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
+  "additional_special_tokens": [
+    {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": "<|im_start|>",
+  "eos_token": "<|im_end|>",
+  "pad_token": "<|im_end|>"
 }
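Note: this hunk swaps the Llama-3-style control tokens (<|begin_of_text|>, <|end_of_text|>, <pad>) for ChatML role markers, and registers both markers under additional_special_tokens so the tokenizer never splits them. A minimal sketch of the resulting behavior, assuming the updated files are loaded through transformers.AutoTokenizer (the repo id below is a placeholder, not taken from this commit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ruggsea/<this-repo>")  # placeholder repo id

# After this commit, the ChatML markers double as the control tokens:
assert tokenizer.bos_token == "<|im_start|>"
assert tokenizer.eos_token == "<|im_end|>"
assert tokenizer.pad_token == "<|im_end|>"

Reusing <|im_end|> as the pad token avoids adding another embedding row; the usual caveat is that padding positions must then be masked out of the loss so they are not trained as end-of-turn signals.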
tokenizer.json CHANGED
@@ -2309,7 +2309,16 @@
     },
     {
       "id": 128256,
-      "content": "<pad>",
+      "content": "<|im_start|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 128257,
+      "content": "<|im_end|>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
tokenizer_config.json CHANGED
@@ -2049,7 +2049,15 @@
       "special": true
     },
     "128256": {
-      "content": "<pad>",
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128257": {
+      "content": "<|im_end|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -2057,14 +2065,19 @@
       "special": true
     }
   },
-  "bos_token": "<|begin_of_text|>",
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<|im_start|>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": true,
-  "eos_token": "<|end_of_text|>",
+  "eos_token": "<|im_end|>",
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<pad>",
+  "pad_token": "<|im_end|>",
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
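Note: the new chat_template is the standard ChatML Jinja template: each message is rendered as <|im_start|>{role}\n{content}<|im_end|>\n, and add_generation_prompt opens an assistant turn for the model to complete. A sketch of how it renders (placeholder repo id again):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ruggsea/<this-repo>")  # placeholder repo id

messages = [{"role": "user", "content": "Hello!"}]

# tokenize=False returns the rendered prompt string instead of token ids;
# add_generation_prompt=True appends the opening "<|im_start|>assistant\n" header.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant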