mlabonne committed on
Commit
adc761e
1 Parent(s): d7cd5a8

Upload tokenizer

Browse files
special_tokens_map.json CHANGED
@@ -7,7 +7,7 @@
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<|im_end|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
 
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "<|end_of_text|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
tokenizer.json CHANGED
@@ -14,7 +14,7 @@
14
  },
15
  {
16
  "id": 128001,
17
- "content": "<|im_end|>",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
 
14
  },
15
  {
16
  "id": 128001,
17
+ "content": "<|end_of_text|>",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -9,7 +9,7 @@
9
  "special": true
10
  },
11
  "128001": {
12
- "content": "<|im_end|>",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
@@ -2050,12 +2050,14 @@
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
- "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
2054
  "clean_up_tokenization_spaces": true,
2055
- "eos_token": "<|im_end|>",
 
 
 
 
2056
  "model_max_length": 131072,
2057
  "pad_token": "<|finetune_right_pad_id|>",
2058
  "padding_side": "left",
2059
- "tokenizer_class": "PreTrainedTokenizerFast",
2060
- "unk_token": null
2061
  }
 
9
  "special": true
10
  },
11
  "128001": {
12
+ "content": "<|end_of_text|>",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
 
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
 
2053
  "clean_up_tokenization_spaces": true,
2054
+ "eos_token": "<|end_of_text|>",
2055
+ "model_input_names": [
2056
+ "input_ids",
2057
+ "attention_mask"
2058
+ ],
2059
  "model_max_length": 131072,
2060
  "pad_token": "<|finetune_right_pad_id|>",
2061
  "padding_side": "left",
2062
+ "tokenizer_class": "PreTrainedTokenizerFast"
 
2063
  }