rhysjones committed on
Commit
1fd1d1b
1 Parent(s): 8f1055c

Upload folder using huggingface_hub

Browse files
added_tokens.json CHANGED
@@ -36,5 +36,8 @@
36
  " ": 50260,
37
  " ": 50259,
38
  " ": 50258,
39
- " ": 50257
 
 
 
40
  }
 
36
  " ": 50260,
37
  " ": 50259,
38
  " ": 50258,
39
+ " ": 50257,
40
+ "<|im_end|>": 50295,
41
+ "<|im_start|>": 50296,
42
+ "<|startoftext|>": 50297
43
  }
config.json CHANGED
@@ -6,7 +6,7 @@
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 50256,
8
  "embd_pdrop": 0.0,
9
- "eos_token_id": 50256,
10
  "hidden_act": "gelu_new",
11
  "hidden_size": 2560,
12
  "initializer_range": 0.02,
 
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 50256,
8
  "embd_pdrop": 0.0,
9
+ "eos_token_id": 50295,
10
  "hidden_act": "gelu_new",
11
  "hidden_size": 2560,
12
  "initializer_range": 0.02,
generation_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "_from_model_config": true,
3
  "bos_token_id": 50256,
4
- "eos_token_id": 50256,
5
  "transformers_version": "4.37.0"
6
  }
 
1
  {
2
  "_from_model_config": true,
3
  "bos_token_id": 50256,
4
+ "eos_token_id": 50295,
5
  "transformers_version": "4.37.0"
6
  }
special_tokens_map.json CHANGED
@@ -1,5 +1,30 @@
1
  {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
4
- "unk_token": "<|endoftext|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
  }
tokenizer.json CHANGED
@@ -353,6 +353,33 @@
353
  "rstrip": false,
354
  "normalized": true,
355
  "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  }
357
  ],
358
  "normalizer": null,
@@ -100644,4 +100671,4 @@
100644
  "Ġg azed"
100645
  ]
100646
  }
100647
- }
 
353
  "rstrip": false,
354
  "normalized": true,
355
  "special": false
356
+ },
357
+ {
358
+ "id": 50295,
359
+ "content": "<|im_end|>",
360
+ "single_word": false,
361
+ "lstrip": false,
362
+ "rstrip": false,
363
+ "normalized": false,
364
+ "special": true
365
+ },
366
+ {
367
+ "id": 50296,
368
+ "content": "<|im_start|>",
369
+ "single_word": false,
370
+ "lstrip": false,
371
+ "rstrip": false,
372
+ "normalized": false,
373
+ "special": false
374
+ },
375
+ {
376
+ "id": 50297,
377
+ "content": "<|startoftext|>",
378
+ "single_word": false,
379
+ "lstrip": false,
380
+ "rstrip": false,
381
+ "normalized": false,
382
+ "special": false
383
  }
384
  ],
385
  "normalizer": null,
 
100671
  "Ġg azed"
100672
  ]
100673
  }
100674
+ }
tokenizer_config.json CHANGED
@@ -312,13 +312,37 @@
312
  "rstrip": false,
313
  "single_word": false,
314
  "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  }
316
  },
317
- "bos_token": "<|endoftext|>",
318
  "clean_up_tokenization_spaces": true,
319
- "eos_token": "<|endoftext|>",
320
  "model_max_length": 2048,
 
321
  "tokenizer_class": "CodeGenTokenizer",
322
- "unk_token": "<|endoftext|>",
323
- "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
324
- }
 
312
  "rstrip": false,
313
  "single_word": false,
314
  "special": false
315
+ },
316
+ "50295": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "50296": {
325
+ "content": "<|im_start|>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": false
331
+ },
332
+ "50297": {
333
+ "content": "<|startoftext|>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": false
339
  }
340
  },
341
+ "bos_token": "<|startoftext|>",
342
  "clean_up_tokenization_spaces": true,
343
+ "eos_token": "<|im_end|>",
344
  "model_max_length": 2048,
345
+ "pad_token": "<|endoftext|>",
346
  "tokenizer_class": "CodeGenTokenizer",
347
+ "unk_token": "<|endoftext|>"
348
+ }
 
vocab.json CHANGED
The diff for this file is too large to render. See raw diff