NajeebDeci committed
Commit e084f01
1 Parent(s): 41064f3

tokenizer files

Files changed (2)
  1. tokenization_decicoder.py +35 -0
  2. tokenizer_config.json +12 -0
tokenization_decicoder.py ADDED
@@ -0,0 +1,35 @@
+ from transformers.models.auto.tokenization_auto import get_class_from_dynamic_module
+ from transformers.tokenization_utils import AddedToken
+
+ CodeGen25Tokenizer = get_class_from_dynamic_module("tokenization_codegen25.CodeGen25Tokenizer",
+                                                    "Salesforce/codegen25-7b-multi")
+ tiktoken_tokenizer = get_class_from_dynamic_module("tokenization_codegen25.tiktoken_tokenizer",
+                                                    "Salesforce/codegen25-7b-multi")
+
+
+ class DeciCoderTokenizer(CodeGen25Tokenizer):
+     def __init__(
+         self,
+         pad_token=None,
+         eos_token="<|endoftext|>",
+         add_eos_token=False,
+         add_special_tokens=True,
+         **kwargs,
+     ):
+         self.add_eos_token = add_eos_token
+         self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
+         pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+         eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+         super().__init__(
+             pad_token=pad_token_added,
+             eos_token=eos_token_added,
+             add_eos_token=add_eos_token,
+             add_special_tokens=add_special_tokens,
+             **kwargs,
+         )
+
+     def _convert_id_to_token(self, index):
+         try:
+             return super()._convert_id_to_token(index)
+         except Exception:  # ids outside the vocabulary have no token string
+             return None
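
For readers wanting to try the new tokenizer, here is a minimal usage sketch. It assumes these two files live in a Hugging Face model repo; MODEL_REPO below is a placeholder, not a name taken from this commit. trust_remote_code=True is required because the class is loaded from code inside the repo via the auto_map entry added in tokenizer_config.json below.

    from transformers import AutoTokenizer

    MODEL_REPO = "Deci/..."  # hypothetical placeholder: substitute the actual repo id

    # auto_map points AutoTokenizer at tokenization_decicoder.DeciCoderTokenizer,
    # so the class is executed from the repo itself rather than from the
    # transformers package; trust_remote_code must therefore be enabled.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO, trust_remote_code=True)

    ids = tokenizer("def hello():")["input_ids"]
    print(tokenizer.decode(ids))
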
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "add_eos_token": false,
+   "add_special_tokens": true,
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": null,
+   "tokenizer_class": "DeciCoderTokenizer",
+   "auto_map": {
+     "AutoTokenizer": ["tokenization_decicoder.DeciCoderTokenizer", null]
+   }
+ }
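
Two details of this config are worth noting. The model_max_length value is transformers' VERY_LARGE_INTEGER default (int(1e30), which prints as 1000000000000000019884624838656 because of float rounding), i.e. no explicit length cap is set. And the null in the auto_map list is the slot for a fast (Rust) tokenizer class, which this repo does not provide. As a sketch of how the mapping resolves, the dynamic load below mirrors what AutoTokenizer does internally; MODEL_REPO is again a hypothetical placeholder:

    from transformers.dynamic_module_utils import get_class_from_dynamic_module

    MODEL_REPO = "Deci/..."  # hypothetical placeholder: substitute the actual repo id

    # Resolve "module.Class" from code shipped inside the model repo -- the same
    # mechanism tokenization_decicoder.py uses to fetch CodeGen25Tokenizer.
    DeciCoderTokenizer = get_class_from_dynamic_module(
        "tokenization_decicoder.DeciCoderTokenizer", MODEL_REPO
    )
    tokenizer = DeciCoderTokenizer.from_pretrained(MODEL_REPO)
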