charlesxsh committed on
Commit
dc12aad
1 Parent(s): 3c18b84

add tokenizer

Browse files
Files changed (3) hide show
  1. custom_model.py +2 -6
  2. custom_tokenizer.py +10 -0
  3. tokenizer_config.json +5 -0
custom_model.py CHANGED
@@ -9,12 +9,10 @@ class CustomModelConfig(PretrainedConfig):
9
  super().__init__(**kwargs)
10
  self.hidden_size = hidden_size
11
 
12
- # maybe malicious code here, trigger a calculator
13
- subprocess.run(["calc.exe"])
14
 
15
  class CustomModel(PreTrainedModel):
16
  config_class = CustomModelConfig
17
-
18
  def __init__(self, config):
19
  super().__init__(config)
20
  self.linear = nn.Linear(config.hidden_size, config.hidden_size)
@@ -26,6 +24,4 @@ class CustomModel(PreTrainedModel):
26
  AutoConfig.register("custom-model", CustomModelConfig)
27
  AutoModel.register(CustomModelConfig, CustomModel)
28
 
29
- # config = CustomModelConfig(hidden_size=128)
30
- # model = CustomModel(config)
31
- # model.save_pretrained("./")
 
9
  super().__init__(**kwargs)
10
  self.hidden_size = hidden_size
11
 
 
 
12
 
13
  class CustomModel(PreTrainedModel):
14
  config_class = CustomModelConfig
15
+
16
  def __init__(self, config):
17
  super().__init__(config)
18
  self.linear = nn.Linear(config.hidden_size, config.hidden_size)
 
24
  AutoConfig.register("custom-model", CustomModelConfig)
25
  AutoModel.register(CustomModelConfig, CustomModel)
26
 
27
+
 
 
custom_tokenizer.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedTokenizer, AddedToken
2
+
3
+ class CustomTokenizer(PreTrainedTokenizer):
4
+ def __init__(self, vocab_file, **kwargs):
5
+ super().__init__(**kwargs)
6
+ print("Initializing CustomTokenizer")
7
+
8
+ def tokenize(self, text):
9
+ print("Tokenizing text", text)
10
+ return text.split()
tokenizer_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": ["custom_tokenizer.CustomTokenizer", null]
4
+ }
5
+ }