zgce committed
Commit 14e13d7
1 Parent(s): 097376b

Upload 9 files

adapter_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "F:/models/Skywork-13B-Base-8bit",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16.0,
+   "lora_dropout": 0.05,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "k_proj",
+     "down_proj",
+     "q_proj",
+     "o_proj",
+     "gate_proj",
+     "v_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
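
This is a standard PEFT LoRA configuration: rank-32 adapters (alpha 16.0, dropout 0.05) on all seven Llama-style attention and MLP projections of the 8-bit Skywork-13B base. For orientation, a minimal sketch of how PEFT consumes a config like this when the adapter is attached to its base model; the adapter path is a placeholder for wherever this repo is checked out, and the base path is the uploader's local Windows path from the config above, so treat both as assumptions:

    # Sketch only: attach the LoRA adapter described by adapter_config.json.
    # Both paths are assumptions (one local to the uploader), not a verified setup.
    from transformers import AutoModelForCausalLM
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained(
        "F:/models/Skywork-13B-Base-8bit",  # base_model_name_or_path above
        trust_remote_code=True,             # Skywork ships custom modeling code
    )
    model = PeftModel.from_pretrained(base, "path/to/this/adapter/repo")
    model.eval()  # the config records inference_mode: true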
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84eea2a3984b682b18065e1d6afe72fa7aa84a72ff3ae3983068d21907ec9822
+ size 14244
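
rng_state.pth (like scheduler.pt, tokenizer.model, and training_args.bin below) is stored as a Git LFS pointer: the repo tracks only the object's sha256 and byte size, and the binary itself is fetched on checkout. A small sketch for verifying a fetched blob against its pointer, assuming the file sits in the working directory:

    # Recompute the LFS oid (sha256) of a downloaded blob and compare it
    # with the pointer above; streams in 1 MiB chunks to handle large files.
    import hashlib

    def lfs_sha256(path: str) -> str:
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    assert lfs_sha256("rng_state.pth") == (
        "84eea2a3984b682b18065e1d6afe72fa7aa84a72ff3ae3983068d21907ec9822"
    )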
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc96be171124899b6c8872f1c0f1be2fe6adb73f5c5f69a7f8fb5a44d218108e
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "</s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenization_skywork.py ADDED
@@ -0,0 +1,250 @@
+ # Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+ # This code is built upon Huggingface's transformers repository.
+
+ """Tokenization classes for Skywork."""
+ import os
+ from shutil import copyfile
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+ import sentencepiece as spm
+
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+ from transformers.utils import logging
+
+ if TYPE_CHECKING:
+     from transformers.pipelines.conversational import Conversation
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+
+ SPIECE_UNDERLINE = "▁"
+
+ B_INST, E_INST = "[INST]", "[/INST]"
+ B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+ DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
+  that your responses are socially unbiased and positive in nature.
+
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+ class SkyworkTokenizer(PreTrainedTokenizer):
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file,
+         unk_token="<unk>",
+         bos_token="<s>",
+         eos_token="</s>",
+         pad_token=None,
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         add_bos_token=True,
+         add_eos_token=False,
+         clean_up_tokenization_spaces=False,
+         legacy=True,
+         **kwargs,
+     ):
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+         pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+         self.legacy = legacy
+         self.vocab_file = vocab_file
+         self.add_bos_token = add_bos_token
+         self.add_eos_token = add_eos_token
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(vocab_file)
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             add_bos_token=add_bos_token,
+             add_eos_token=add_eos_token,
+             sp_model_kwargs=self.sp_model_kwargs,
+             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+             legacy=legacy,
+             **kwargs,
+         )
+         if legacy:
+             logger.warning_once(
+                 f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. "
+             )
+
+
+     def __getstate__(self):
+         state = self.__dict__.copy()
+         state["sp_model"] = None
+         state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+         return state
+
+     def __setstate__(self, d):
+         self.__dict__ = d
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+     @property
+     def vocab_size(self):
+         """Returns vocab size"""
+         return self.sp_model.get_piece_size()
+
+     def get_vocab(self):
+         """Returns vocab as a dict"""
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+     def tokenize(self, text, **kwargs) -> List[str]:
+         # Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
+         # the beginning of the text
+         if not self.legacy:
+             text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
+         return super().tokenize(text, **kwargs)
+
+     # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+     def _tokenize(self, text):
+         if not self.legacy:
+             is_first = text.startswith(SPIECE_UNDERLINE)
+             if is_first:
+                 text = text[1:]
+
+         tokens = self.sp_model.encode(text, out_type=str)
+
+         if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
+             tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
+         return tokens
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) in an id using the vocab."""
+         return self.sp_model.piece_to_id(token)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) in a token (str) using the vocab."""
+         token = self.sp_model.IdToPiece(index)
+         return token
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (string) in a single string."""
+         current_sub_tokens = []
+         out_string = ""
+         prev_is_special = False
+         for i, token in enumerate(tokens):
+             # make sure that special tokens are not decoded using sentencepiece model
+             if token in self.all_special_tokens:
+                 if not prev_is_special and i != 0:
+                     out_string += " "
+                 out_string += self.sp_model.decode(current_sub_tokens) + token
+                 prev_is_special = True
+                 current_sub_tokens = []
+             else:
+                 current_sub_tokens.append(token)
+                 prev_is_special = False
+         out_string += self.sp_model.decode(current_sub_tokens)
+         return out_string
+
+     def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         return (out_vocab_file,)
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+         output = bos_token_id + token_ids_0 + eos_token_id
+
+         if token_ids_1 is not None:
+             output = output + bos_token_id + token_ids_1 + eos_token_id
+
+         return output
+
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         bos_token_id = [1] if self.add_bos_token else []
+         eos_token_id = [1] if self.add_eos_token else []
+
+         if token_ids_1 is None:
+             return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+         return (
+             bos_token_id
+             + ([0] * len(token_ids_0))
+             + eos_token_id
+             + bos_token_id
+             + ([0] * len(token_ids_1))
+             + eos_token_id
+         )
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+         output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+         if token_ids_1 is not None:
+             output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+         return output
+
+     def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+         dialogue = list(conversation.iter_texts())
+         if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
+             [not is_user for is_user, msg in dialogue[1::2]]
+         ):
+             raise ValueError(
+                 "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
+             )
+
+         dialog_tokens: List[int] = []
+         if len(conversation.past_user_inputs) > 0:
+             if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
+                 conversation.past_user_inputs[0] = (
+                     B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+                 )
+         elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
+             dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])
+
+         dialog_tokens += sum(
+             [
+                 [self.bos_token_id]
+                 + self.encode(
+                     f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
+                 )
+                 + [self.eos_token_id]
+                 for prompt, answer in zip(dialogue[::2], dialogue[1::2])
+             ],
+             [],
+         )
+         if not (dialogue[-1][0]):
+             raise ValueError(f"Last message must be from user, got {dialogue[-1]}")  # entries are (is_user, text) tuples, not dicts
+         dialog_tokens += [self.bos_token_id] + self.encode(
+             f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
+         )
+         return dialog_tokens
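
Functionally this is the Llama SentencePiece tokenizer under a Skywork name: BOS is prepended by default, EOS is not, and the [INST]/<<SYS>> constants mirror the Llama-2 chat format. A brief usage sketch against the class above, assuming this commit is checked out locally so tokenization_skywork.py and tokenizer.model sit side by side:

    # Assumes the current directory contains tokenization_skywork.py and
    # tokenizer.model; requires the sentencepiece and transformers packages.
    from tokenization_skywork import SkyworkTokenizer

    tok = SkyworkTokenizer(vocab_file="tokenizer.model")
    ids = tok("Skywork test")["input_ids"]
    assert ids[0] == tok.bos_token_id   # add_bos_token=True by default
    assert ids[-1] != tok.eos_token_id  # add_eos_token=False by default
    print(tok.convert_ids_to_tokens(ids))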
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36ec9a4d6fd7cc78fbb9e4afd89fb04cba0381b08a842ca0b60826073821f594
+ size 994250
tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_skywork.SkyworkTokenizer",
+       null
+     ]
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "split_special_tokens": false,
+   "tokenizer_class": "SkyworkTokenizer",
+   "unk_token": "<unk>"
+ }
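
Two details worth flagging: model_max_length is the transformers "unset" sentinel int(1e30) rather than a real context limit, and auto_map is what lets AutoTokenizer resolve the custom class from tokenization_skywork.py, which only activates under trust_remote_code=True. A sketch, with the repo path a placeholder:

    # auto_map routes AutoTokenizer to tokenization_skywork.SkyworkTokenizer;
    # "path/to/this/repo" stands in for wherever this commit is checked out.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("path/to/this/repo", trust_remote_code=True)
    assert type(tok).__name__ == "SkyworkTokenizer"
    assert tok.pad_token == tok.eos_token == "</s>"  # padding reuses the EOS token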
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98c3b3fa3bf45f46c10a314adbb9b805d535a5d5c81d57cf17d6f17332b87519
+ size 4728