frankminors123 committed
Commit 4bcd72d
1 Parent(s): 3414c75

Upload 6 files

config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "max_position_embeddings": 8192,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 500000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.40.0.dev0",
+   "use_cache": true,
+   "vocab_size": 128256
+ }
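
The config above describes the Llama-3-8B shape: 32 layers, 32 attention heads with 8 key/value heads (grouped-query attention), a 14336-wide MLP, RoPE theta 500000, and a 128256-token vocabulary. A minimal sketch of reading it with Transformers and building the matching, randomly initialized model skeleton; the local path is a placeholder for wherever this repo is checked out, and no weights are needed for this check:

from transformers import AutoConfig, AutoModelForCausalLM

# Assumes config.json sits in the current directory (placeholder path).
config = AutoConfig.from_pretrained("./")
print(config.num_hidden_layers, config.num_key_value_heads, config.vocab_size)  # 32 8 128256

# Random weights with the same architecture; useful as a quick shape sanity check.
model = AutoModelForCausalLM.from_config(config)
print(f"{sum(p.numel() for p in model.parameters()) / 1e9:.1f}B parameters")
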
configuration.json ADDED
@@ -0,0 +1 @@
+ {"framework":"Pytorch","task":"text-generation"}
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token_id": 128000,
+   "eos_token_id": 128001,
+   "do_sample": true,
+   "temperature": 0.6,
+   "max_length": 4096,
+   "top_p": 0.9,
+   "transformers_version": "4.40.0.dev0"
+ }
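
These defaults enable sampling (temperature 0.6, top_p 0.9) between the Llama-3 <|begin_of_text|>/<|end_of_text|> ids (128000/128001), capped at max_length 4096. A small sketch of how generate() picks them up, using the Transformers GenerationConfig helper; the local path is again a placeholder:

from transformers import GenerationConfig

# generation_config.json is read automatically by from_pretrained and used as the
# default for model.generate(); here we just load it back explicitly to inspect it.
gen_cfg = GenerationConfig.from_pretrained("./")
print(gen_cfg.do_sample, gen_cfg.temperature, gen_cfg.top_p)  # True 0.6 0.9

# model.generate(**inputs, generation_config=gen_cfg)  # equivalent to the implicit default
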
llama.tiktoken ADDED
The diff for this file is too large to render.
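
Although the vocabulary itself is not rendered, its on-disk format follows from _load_tiktoken_bpe and save_vocabulary in tokenization_llama.py below: one token per line, written as the base64-encoded token bytes, a space, and the integer rank. A tiny illustrative sketch with made-up entries (these are not the real first lines of llama.tiktoken):

import base64

# Hypothetical file contents in the llama.tiktoken format: "<base64(token bytes)> <rank>"
sample = b"IQ== 0\nIg== 1\nIyM= 2\n"
ranks = {
    base64.b64decode(token): int(rank)
    for token, rank in (line.split() for line in sample.splitlines() if line)
}
print(ranks)  # {b'!': 0, b'"': 1, b'##': 2}
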
 
tokenization_llama.py ADDED
@@ -0,0 +1,243 @@
+ """Tokenization classes for LLaMA-3."""
+
+ import os
+ import base64
+ import logging
+ import unicodedata
+ from typing import Collection, Dict, List, Set, Tuple, Union
+
+ import tiktoken
+ from transformers import PreTrainedTokenizer, AddedToken
+
+ logger = logging.getLogger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "llama.tiktoken"}
+
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+
+ NUM_RESERVED_SPECIAL_TOKENS = 256
+
+ SPECIAL_TOKENS = [
+     "<|begin_of_text|>",
+     "<|end_of_text|>",
+     "<|reserved_special_token_0|>",
+     "<|reserved_special_token_1|>",
+     "<|reserved_special_token_2|>",
+     "<|reserved_special_token_3|>",
+     "<|start_header_id|>",
+     "<|end_header_id|>",
+     "<|reserved_special_token_4|>",
+     "<|eot_id|>",
+ ] + [f"<|reserved_special_token_{i}|>" for i in range(5, NUM_RESERVED_SPECIAL_TOKENS - 5)]
+
+
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+     with open(tiktoken_bpe_file, "rb") as f:
+         contents = f.read()
+     return {
+         base64.b64decode(token): int(rank)
+         for token, rank in (line.split() for line in contents.splitlines() if line)
+     }
+
+
+ class LLaMATokenizer(PreTrainedTokenizer):
+     """LLaMA tokenizer."""
+
+     vocab_files_names = VOCAB_FILES_NAMES
+
+     def __init__(
+         self,
+         vocab_file,
+         errors="replace",
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.errors = errors  # how to handle errors in decoding
+
+         self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+         self.special_tokens = {
+             token: index
+             for index, token in enumerate(
+                 SPECIAL_TOKENS, start=len(self.mergeable_ranks)
+             )
+         }
+
+         enc = tiktoken.Encoding(
+             "LLaMA",
+             pat_str=PAT_STR,
+             mergeable_ranks=self.mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         assert (
+             len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+
+         self.decoder = {
+             v: k for k, v in self.mergeable_ranks.items()
+         }  # type: dict[int, bytes|str]
+         self.decoder.update({v: k for k, v in self.special_tokens.items()})
+
+         self.tokenizer = enc  # type: tiktoken.Encoding
+
+         self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
+         self.eos_id: int = self.special_tokens["<|end_of_text|>"]
+         self.pad_id: int = -1
+         self.stop_tokens = {
+             self.special_tokens["<|end_of_text|>"],
+             self.special_tokens["<|eot_id|>"],
+         }
+
+     def __getstate__(self):
+         # the tiktoken.Encoding object cannot be pickled; drop it and rebuild in __setstate__
+         state = self.__dict__.copy()
+         del state["tokenizer"]
+         return state
+
+     def __setstate__(self, state):
+         # tokenizer is not python native; don't pass it; rebuild it
+         self.__dict__.update(state)
+         enc = tiktoken.Encoding(
+             "LLaMA",
+             pat_str=PAT_STR,
+             mergeable_ranks=self.mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         self.tokenizer = enc
+
+     def __len__(self) -> int:
+         return self.tokenizer.n_vocab
+
+     def get_vocab(self) -> Dict[bytes, int]:
+         return self.mergeable_ranks
+
+     def convert_tokens_to_ids(
+         self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+     ) -> Union[int, List[int]]:
+         ids = []
+         if isinstance(tokens, (str, bytes)):
+             if tokens in self.special_tokens:
+                 return self.special_tokens[tokens]
+             else:
+                 return self.mergeable_ranks.get(tokens)
+         for token in tokens:
+             if token in self.special_tokens:
+                 ids.append(self.special_tokens[token])
+             else:
+                 ids.append(self.mergeable_ranks.get(token))
+         return ids
+
+     def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+         if not special_tokens and new_tokens:
+             raise ValueError("Adding regular tokens is not supported")
+         for token in new_tokens:
+             surface_form = token.content if isinstance(token, AddedToken) else token
+             if surface_form not in SPECIAL_TOKENS:
+                 raise ValueError("Adding unknown special tokens is not supported")
+         return 0
+
+     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+         """
+         Save only the vocabulary of the tokenizer (the tiktoken BPE ranks).
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         file_path = os.path.join(save_directory, "llama.tiktoken")
+         with open(file_path, "w", encoding="utf8") as w:
+             for k, v in self.mergeable_ranks.items():
+                 line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                 w.write(line)
+         return (file_path,)
+
+     def tokenize(
+         self,
+         text: str,
+         allowed_special: Union[Set, str] = "all",
+         disallowed_special: Union[Collection, str] = (),
+         **kwargs,
+     ) -> List[Union[bytes, str]]:
+         """
+         Converts a string into a sequence of tokens.
+         Args:
+             text (`str`):
+                 The sequence to be encoded.
+             allowed_special (`Literal["all"]` or `set`):
+                 The surface forms of the tokens to be encoded as special tokens in regular texts.
+                 Defaults to "all".
+             disallowed_special (`Literal["all"]` or `Collection`):
+                 The surface forms of the tokens that should not appear in regular texts and trigger errors.
+                 Defaults to an empty tuple.
+             kwargs (additional keyword arguments, *optional*):
+                 Will be passed to the underlying model specific encode method.
+         Returns:
+             `List[bytes|str]`: The list of tokens.
+         """
+         tokens = []
+         text = unicodedata.normalize("NFC", text)
+
+         # this implementation takes a detour: text -> token id -> token surface forms
+         for t in self.tokenizer.encode(
+             text, allowed_special=allowed_special, disallowed_special=disallowed_special
+         ):
+             tokens.append(self.decoder[t])
+         return tokens
+
+     def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+         """
+         Converts a sequence of tokens into a single string.
+         """
+         text = ""
+         temp = b""
+         for t in tokens:
+             if isinstance(t, str):
+                 if temp:
+                     text += temp.decode("utf-8", errors=self.errors)
+                     temp = b""
+                 text += t
+             elif isinstance(t, bytes):
+                 temp += t
+             else:
+                 raise TypeError("token should only be of type bytes or str")
+         if temp:
+             text += temp.decode("utf-8", errors=self.errors)
+         return text
+
+     @property
+     def vocab_size(self):
+         return self.tokenizer.n_vocab
+
+     def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+         """Converts an id to a token, special tokens included"""
+         if index in self.decoder:
+             return self.decoder[index]
+         raise ValueError("unknown ids")
+
+     def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+         """Converts a token to an id using the vocab, special tokens included"""
+         if token in self.special_tokens:
+             return self.special_tokens[token]
+         if token in self.mergeable_ranks:
+             return self.mergeable_ranks[token]
+         raise ValueError("unknown token")
+
+     def _tokenize(self, text: str, **kwargs):
+         """
+         Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
+         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).
+         Does NOT take care of added tokens.
+         """
+         raise NotImplementedError
+
+     def _decode(
+         self,
+         token_ids: Union[int, List[int]],
+         skip_special_tokens: bool = False,
+         errors: str = None,
+         **kwargs,
+     ) -> str:
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+         if skip_special_tokens:
+             token_ids = [i for i in token_ids if i < self.eos_id]  # drops ids at or above eos_id, i.e. all specials except <|begin_of_text|>
+         return self.tokenizer.decode(token_ids, errors=errors or self.errors)
+
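
A short, hedged usage sketch of the class above, exercising the tiktoken-backed round trip directly; the vocab path is a placeholder, and the behaviour shown is only what the code implies, not verified against the uploaded files:

from tokenization_llama import LLaMATokenizer

tok = LLaMATokenizer(vocab_file="llama.tiktoken")  # placeholder path

text = "Hello, Llama-3!"
tokens = tok.tokenize(text)              # byte-level token surface forms
ids = tok.convert_tokens_to_ids(tokens)  # integer ranks
print(len(tok), tok.bos_id, tok.eos_id)  # vocab size and special-token ids
print(tok.convert_tokens_to_string(tokens))  # reassembles "Hello, Llama-3!"
print(tok.decode(ids))                   # decodes straight through tiktoken
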
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_llama.LLaMATokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "tokenizer_class": "LLaMATokenizer"
+ }
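
The auto_map entry is what lets AutoTokenizer resolve tokenization_llama.LLaMATokenizer when the repo is loaded with trust_remote_code. An end-to-end sketch; the repo id is a placeholder, tiktoken must be installed, and it assumes the repository also holds the model weights (this commit only uploads config and tokenizer files), which take on the order of 16 GB in bfloat16:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

repo_id = "your-namespace/your-llama3-repo"  # placeholder, not this repo's actual id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)

inputs = tokenizer("The quick brown fox", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)  # sampling defaults come from generation_config.json
print(tokenizer.decode(output[0], skip_special_tokens=True))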