Locutusque committed
Commit: a0d791a
Parent: 8857129

Upload tokenizer

Files changed (4):
  1. README.md +4 -4
  2. special_tokens_map.json +11 -1
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +7 -2
README.md CHANGED
@@ -1,7 +1,7 @@
 ---
-license: apache-2.0
-language:
-- en
 datasets:
 - Locutusque/UltraTextbooks
----
+language:
+- en
+license: apache-2.0
+---
special_tokens_map.json CHANGED
@@ -1,4 +1,8 @@
 {
+  "additional_special_tokens": [
+    "<|ASSISTANT|>",
+    "<|USER|>"
+  ],
   "bos_token": {
     "content": "<|bos|>",
     "lstrip": false,
@@ -13,7 +17,13 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|endoftext|>",
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,6 +1,7 @@
 {
   "add_bos_token": true,
   "add_eos_token": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
@@ -67,14 +68,18 @@
       "special": true
     }
   },
-  "additional_special_tokens": [],
+  "additional_special_tokens": [
+    "<|ASSISTANT|>",
+    "<|USER|>"
+  ],
   "bos_token": "<|bos|>",
+  "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "legacy": true,
   "max_length": 1536,
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<|endoftext|>",
+  "pad_token": "[PAD]",
   "sp_model_kwargs": {},
   "spaces_between_special_tokens": false,
   "stride": 0,