import regex as re
import base64
import tiktoken
import os
import json
from transformers import PreTrainedTokenizer
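
# A tiktoken-based BPE tokenizer exposed through the Hugging Face
# PreTrainedTokenizer interface: BaseTokenizer defines the abstract surface,
# TikTokenizer implements it on top of a tiktoken Encoding.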
class BaseTokenizer(PreTrainedTokenizer):
    """Abstract base class for tokenizers."""

    def __init__(self, **kwargs):
        super().__init__()

    @property
    def add_prefix_space(self):
        return False

    @property
    def vocab_size(self):
        raise NotImplementedError

    def tokenize(self, text):
        raise NotImplementedError

    def detokenize(self, token_ids, ignore_special_tokens=True):
        raise NotImplementedError

    def build_single_message(self, role, metadata, message):
        assert role in ["system", "user", "assistant", "observation"], role
        role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
        message_tokens = self.tokenizer.encode(message, disallowed_special=())
        tokens = role_tokens + message_tokens
        return tokens
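
    # A chat prompt is a concatenation of <|role|>{metadata}\n{message} blocks,
    # one per turn, terminated by <|assistant|> so the model continues with the
    # assistant reply.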
    def build_chat_input(self, query, history=None, role="user", metadata=""):
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
        input_ids.extend(self.build_single_message(role, metadata, query))
        input_ids.extend([self.get_command("<|assistant|>")])
        return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)

    @property
    def eos_id(self):
        raise NotImplementedError

    def get_command(self, token):
        raise NotImplementedError
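

# TikTokenizer wraps a tiktoken Encoding built from a vocabulary file of
# base64-encoded byte tokens and their BPE merge ranks, with a fixed set of
# special tokens appended after the regular vocabulary.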
class TikTokenizer(BaseTokenizer):
    vocab_files_names = {"vocab_file": "tokenizer.tiktoken"}

    def __init__(self, vocab_file, **kwargs):
        # Pre-tokenization pattern: contractions, letter runs, digit groups of
        # up to three characters, punctuation runs, and whitespace.
        pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
        self.pat_str = re.compile(pat_str)
        self.b64_vocab = {}
        mergeable_ranks = {}
        # Each line of the vocab file is "<base64-encoded token> <rank>".
        with open(vocab_file) as f:
            for line in f:
                token, rank = line.strip().split()
                rank = int(rank)
                token = base64.b64decode(token)
                mergeable_ranks[token] = rank
                # Keyed by the repr of the token bytes (e.g. "b'hello'"), the
                # same surface form produced by convert_id_to_token.
                self.b64_vocab['%s' % token] = rank
        self.special_tokens = ["<|endoftext|>", "[MASK]", "[gMASK]", "[sMASK]", "<sop>", "<eop>", "<|system|>",
                               "<|user|>", "<|assistant|>", "<|observation|>"]
        self.special_tokens = {
            token: idx for idx, token in enumerate(self.special_tokens, start=len(mergeable_ranks))
        }
        self.special_token_ids = {idx: token for token, idx in self.special_tokens.items()}
        self.tokenizer = tiktoken.Encoding(
            name="my_tokenizer",
            pat_str=pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens
        )
        self.decoder = {rank: token for token, rank in mergeable_ranks.items()}
        self.n_words = len(self.decoder) + len(self.special_tokens)
        super().__init__()

    @property
    def add_prefix_space(self):
        return False

    def tokenize(self, text, add_special_tokens=True):
        ids = self.encode(text, add_special_tokens=add_special_tokens)
        return [self.convert_id_to_token(_id) for _id in ids]

    def detokenize(self, ids, ignore_special_tokens=True):
        if ignore_special_tokens:
            ids = [idx for idx in ids if idx not in self.special_token_ids]
        return self.tokenizer.decode(ids)

    def encode(self, text, add_special_tokens=True):
        ids = self.tokenizer.encode(text, disallowed_special=(), allowed_special="all")
        if add_special_tokens:
            ids = [self.special_tokens["[gMASK]"], self.special_tokens["<sop>"]] + ids
        return ids

    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
        if isinstance(ids, int):
            ids = [ids]
        return self.detokenize(ids, ignore_special_tokens=skip_special_tokens)

    def encode_pieces(self, text):
        ids = self.tokenizer.encode(text, disallowed_special=())
        # Decode each token's bytes to text, replacing invalid UTF-8 sequences.
        return [self.decoder[x].decode('utf-8', errors='replace') for x in ids]

    @property
    def vocab_size(self):
        return self.n_words

    @property
    def eos_token_id(self):
        return self.special_tokens["<|endoftext|>"]

    def convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.b64_vocab:
            return self.b64_vocab[token]
        raise RuntimeError(f"{token} is not a single token")

    def _convert_token_to_id(self, token):
        return self.convert_token_to_id(token)

    def convert_id_to_token(self, index):
        if index in self.special_token_ids:
            return self.special_token_ids[index]
        # Regular tokens are returned as the repr of their raw bytes, matching
        # the keys of b64_vocab.
        return '%s' % self.decoder[index]

    def _convert_id_to_token(self, index):
        return self.convert_id_to_token(index)

    def get_command(self, token):
        return self.special_tokens[token]

    def get_vocab(self):
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        return vocab
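

# Minimal usage sketch, not part of the original module. It assumes a
# tiktoken-format vocabulary file is available at "tokenizer.tiktoken" (the
# path named in vocab_files_names); adjust it to wherever the file actually
# lives. build_chat_input returns PyTorch tensors, so torch must be installed.
if __name__ == "__main__":
    tokenizer = TikTokenizer(vocab_file="tokenizer.tiktoken")

    ids = tokenizer.encode("Hello, world!")  # prepends [gMASK] and <sop>
    print(ids)
    print(tokenizer.decode(ids, skip_special_tokens=True))

    # Build a chat prompt ending in <|assistant|>, ready for generation.
    inputs = tokenizer.build_chat_input("What is byte pair encoding?", role="user")
    print(inputs["input_ids"])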