Matt committed on
Commit
f2847d8
1 Parent(s): 023e8a0

Re-add custom tokenizer

Files changed (2)
  1. tokenization_internlm.py +57 -0
  2. tokenizer_config.json +8 -2
tokenization_internlm.py ADDED
@@ -0,0 +1,57 @@
+ # coding=utf-8
+ # Copyright 2023 Shanghai Artificial Intelligence Laboratory and the
+ # HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Tokenization classes for InternLM."""
+ from transformers import LlamaTokenizer
+
+
+ class InternLMTokenizer(LlamaTokenizer):
+
+     # Lazily populated cache backing the no_prefix_space_tokens property.
+     _no_prefix_space_tokens = None
+
+     @property
+     def no_prefix_space_tokens(self):
+         # IDs of vocabulary entries that do not start with the SentencePiece
+         # word-boundary marker "▁".
+         if self._no_prefix_space_tokens is None:
+             vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+             self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+         return self._no_prefix_space_tokens
+
+     def _maybe_add_prefix_space(self, tokens, decoded):
+         if tokens and tokens[0] not in self.no_prefix_space_tokens:
+             return " " + decoded
+         else:
+             return decoded
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (string) into a single string."""
+         current_sub_tokens = []
+         out_string = ""
+         prev_is_special = False
+         for token in tokens:
+             # make sure that special tokens are not decoded using sentencepiece model
+             if token in self.all_special_tokens:
+                 if not prev_is_special:
+                     out_string += " "
+                 out_string += self.sp_model.decode(current_sub_tokens) + token
+                 prev_is_special = True
+                 current_sub_tokens = []
+             else:
+                 current_sub_tokens.append(token)
+                 prev_is_special = False
+         out_string += self.sp_model.decode(current_sub_tokens)
+         out_string = self.clean_up_tokenization(out_string)
+         out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+         return out_string[1:]
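The overrides above deal with SentencePiece's "▁" convention: word-initial pieces carry the marker, and the decoder has to decide whether a leading space belongs in front of the reassembled text. A minimal sketch, not part of the commit and only meant to show the piece-to-text convention rather than the class's exact behavior:

pieces = ["▁Hello", ",", "▁world", "!"]            # SentencePiece-style pieces
text = "".join(pieces).replace("▁", " ").lstrip()  # swap markers for spaces, drop the leading one
print(text)                                        # -> "Hello, world!"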
tokenizer_config.json CHANGED
@@ -1,13 +1,19 @@
  {
    "add_bos_token": true,
    "add_eos_token": false,
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_internlm.InternLMTokenizer",
+       null
+     ]
+   },
    "bos_token": "<s>",
-   "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '<eoh>\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:' + message['content'] + '<eoa>\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}",
    "clean_up_tokenization_spaces": false,
    "eos_token": "</s>",
+   "use_fast": false,
    "legacy": true,
    "model_max_length": 1000000000000000019884624838656,
    "pad_token": "</s>",
-   "tokenizer_class": "LlamaTokenizer",
+   "tokenizer_class": "InternLMTokenizer",
    "unk_token": "<unk>"
  }
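The config changes are what let AutoTokenizer pick up the custom class: auto_map points the slow-tokenizer slot at tokenization_internlm.InternLMTokenizer (the null leaves the fast slot empty), and tokenizer_class now names the custom class. A minimal loading sketch, assuming a hypothetical repo id ("your-org/your-internlm-model") since the commit does not name the repository:

from transformers import AutoTokenizer

# trust_remote_code=True is required so that tokenization_internlm.py from the
# repo is downloaded and executed to provide InternLMTokenizer.
tok = AutoTokenizer.from_pretrained(
    "your-org/your-internlm-model",  # hypothetical repo id, for illustration only
    trust_remote_code=True,
)
print(type(tok).__name__)  # expected to print "InternLMTokenizer"

ids = tok("Hello, world!").input_ids
print(tok.decode(ids, skip_special_tokens=True))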