Matt
committed on
Commit f2847d8
1 Parent(s): 023e8a0
Re-add custom tokenizer
- tokenization_internlm.py +57 -0
- tokenizer_config.json +8 -2
tokenization_internlm.py
ADDED
@@ -0,0 +1,57 @@
# coding=utf-8
# Copyright 2023 Shanghai Artificial Intelligence Laboratory and the
# HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization classes for InternLM."""
from transformers import LlamaTokenizer


class InternLMTokenizer(LlamaTokenizer):
    # LlamaTokenizer never initializes this cache attribute, so define it on
    # the class to avoid an AttributeError on first property access.
    _no_prefix_space_tokens = None

    @property
    def no_prefix_space_tokens(self):
        # Cache the token strings that should NOT have a space restored in
        # front of them on decode: everything that does not start with the
        # SentencePiece word-boundary marker "▁".
        if self._no_prefix_space_tokens is None:
            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {tok for tok in vocab if not tok.startswith("▁")}
        return self._no_prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens, decoded):
        if tokens and tokens[0] not in self.no_prefix_space_tokens:
            return " " + decoded
        else:
            return decoded

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        out_string = self.clean_up_tokenization(out_string)
        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
        # Strip only the artificial leading space added above (before the first
        # special token or by _maybe_add_prefix_space); a bare out_string[1:]
        # would chop a real character when no space was added.
        return out_string[1:] if out_string.startswith(" ") else out_string
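A minimal standalone sketch (not part of the commit) of the prefix-space rule above: SentencePiece marks word-initial pieces with "▁", so a leading space is restored on decode only when the first token carries that marker. The toy vocabulary here is hypothetical.

# Illustration only: mirrors the no_prefix_space_tokens logic with a toy vocab.
toy_vocab = ["<s>", "</s>", ",", ".", "▁Hello", "▁world"]

# Tokens that should NOT get a space restored in front of them on decode.
no_prefix_space_tokens = {tok for tok in toy_vocab if not tok.startswith("▁")}

def maybe_add_prefix_space(tokens, decoded):
    # Mirrors InternLMTokenizer._maybe_add_prefix_space.
    if tokens and tokens[0] not in no_prefix_space_tokens:
        return " " + decoded
    return decoded

print(repr(maybe_add_prefix_space(["▁Hello", "▁world"], "Hello world")))  # ' Hello world'
print(repr(maybe_add_prefix_space([",", "▁and"], ", and")))               # ', and'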
tokenizer_config.json
CHANGED
@@ -1,13 +1,19 @@
 {
   "add_bos_token": true,
   "add_eos_token": false,
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_internlm.InternLMTokenizer",
+      null
+    ]
+  },
   "bos_token": "<s>",
-  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '<eoh>\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:' + message['content'] + '<eoa>\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
+  "use_fast": false,
   "legacy": true,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "</s>",
-  "tokenizer_class": "
+  "tokenizer_class": "InternLMTokenizer",
   "unk_token": "<unk>"
 }
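A hedged usage sketch (not part of the commit): the new "auto_map" entry points AutoTokenizer at the slow tokenizer class in tokenization_internlm.py, with null in the fast-class slot (matching "use_fast": false), so loading needs trust_remote_code=True. The repo id below is an assumption for illustration; this page does not name the repository.

# Hedged sketch; the repo id is hypothetical, substitute the repository this
# commit belongs to.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "internlm/internlm-chat-7b",  # assumption, not named by this commit page
    trust_remote_code=True,  # required to run the repo's tokenization_internlm.py
)
print(type(tok).__name__)  # expected: InternLMTokenizer

# This commit also removes the "chat_template" entry, so a prompt in the format
# the old template produced would now be assembled by hand, e.g.:
prompt = "<|User|>:Hi there!<eoh>\n<|Bot|>:"  # BOS is prepended automatically ("add_bos_token": true)
ids = tok(prompt).input_ids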