x54-729 committed
Commit: acd3787 • Parent(s): b0460b5

update modeling file to newest

- modeling_internlm2.py +11 -3
- tokenization_internlm2_fast.py +14 -22
modeling_internlm2.py
CHANGED

@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch InternLM2
+"""PyTorch InternLM2 model."""
 import math
 import queue
 import threading
@@ -59,6 +59,10 @@ try:
 except:
     pass
 
+try:
+    support_bf16_triu = torch.__version__ >= "2.1.0"
+except Exception:
+    support_bf16_triu = False
 
 logger = logging.get_logger(__name__)
 
@@ -1093,7 +1097,11 @@ class InternLM2Model(InternLM2PreTrainedModel):
         else:
             causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
             if sequence_length != 1:
-                causal_mask = torch.triu(causal_mask, diagonal=1)
+                if support_bf16_triu or dtype == torch.float32:
+                    causal_mask = torch.triu(causal_mask, diagonal=1)
+                else:
+                    triu_mask = torch.triu(torch.ones(causal_mask.size(), device=device), diagonal=1).bool()
+                    causal_mask.masked_fill_(~triu_mask, 0)
             causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
             causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
             if attention_mask is not None:
@@ -1797,4 +1805,4 @@ class InternLM2ForTokenClassification(InternLM2PreTrainedModel):
             logits=logits,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-        )
+        )
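For context on the modeling change: the new `support_bf16_triu` flag only uses `torch.triu` directly on the (possibly bfloat16) mask when PyTorch is at least 2.1.0 or the dtype is float32; otherwise it falls back to a boolean upper-triangular selector plus an in-place `masked_fill_`. Below is a minimal standalone sketch, not part of the committed file, with `build_causal_bias` as a hypothetical helper name, showing that the two paths produce the same additive causal bias:

import torch

def build_causal_bias(sequence_length, target_length, dtype, device, use_triu):
    # Additive causal bias: disallowed positions get the most negative finite value.
    min_dtype = torch.finfo(dtype).min
    causal_mask = torch.full(
        (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
    )
    if use_triu:
        # Fast path (taken when support_bf16_triu or dtype == torch.float32):
        # keep only the strictly upper triangle.
        causal_mask = torch.triu(causal_mask, diagonal=1)
    else:
        # Fallback path: boolean upper-triangular selector, zero out everything else in place.
        triu_mask = torch.triu(torch.ones(causal_mask.size(), device=device), diagonal=1).bool()
        causal_mask.masked_fill_(~triu_mask, 0)
    return causal_mask

fast = build_causal_bias(4, 4, torch.float32, "cpu", use_triu=True)
slow = build_causal_bias(4, 4, torch.float32, "cpu", use_triu=False)
assert torch.equal(fast, slow)  # both paths yield the same mask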
tokenization_internlm2_fast.py
CHANGED

@@ -20,15 +20,17 @@ import os
 from shutil import copyfile
 from typing import Any, Dict, Optional, Tuple
 
-from tokenizers import
+from tokenizers import processors, decoders, Tokenizer, normalizers
 from tokenizers.models import BPE
+
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import logging
+
 from transformers.convert_slow_tokenizer import (
     SLOW_TO_FAST_CONVERTERS,
-    SentencePieceExtractor,
     SpmConverter,
+    SentencePieceExtractor,
 )
-from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
-from transformers.utils import logging
 
 from .tokenization_internlm2 import InternLM2Tokenizer
 
@@ -36,13 +38,8 @@ logger = logging.get_logger(__name__)
 
 VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
 
-
 # Modified from transformers.convert_slow_tokenizer.LlamaConverter
 class InternLM2Converter(SpmConverter):
-    """
-    Fast tokenizer converter for InternLM2.
-    """
-
     handle_byte_fallback = True
 
     def vocab(self, proto):
@@ -54,11 +51,11 @@ class InternLM2Converter(SpmConverter):
         vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
         return vocab
 
-    def unk_id(self, proto):
+    def unk_id(self, proto):
         unk_id = 0
         return unk_id
 
-    def decoder(self, replacement, add_prefix_space):
+    def decoder(self, replacement, add_prefix_space):
         decoders_sequence = [
             decoders.Replace("▁", " "),
             decoders.ByteFallback(),
@@ -74,7 +71,7 @@ class InternLM2Converter(SpmConverter):
         # special tokens
         added_tokens = self.original_tokenizer.added_tokens_decoder
         for i in range(len(vocab_scores)):
-
+            piece, score = vocab_scores[i]
             if i in added_tokens:
                 vocab_scores[i] = (added_tokens[i].content, score)
         if model_type == 1:
@@ -86,7 +83,9 @@ class InternLM2Converter(SpmConverter):
             tokenizer = Tokenizer(
                 BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
             )
-            tokenizer.add_special_tokens(
+            tokenizer.add_special_tokens(
+                [ added_token for index, added_token in added_tokens.items()]
+            )
         else:
             raise Exception(
                 "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
@@ -101,19 +100,14 @@ class InternLM2Converter(SpmConverter):
         normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
         return normalizers.Sequence(normalizers_list)
 
-    def pre_tokenizer(self, replacement, add_prefix_space):
+    def pre_tokenizer(self, replacement, add_prefix_space):
         return None
 
-
 SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
 
 
 # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
 class InternLM2TokenizerFast(PreTrainedTokenizerFast):
-    """
-    Fast tokenizer for InternLM2.
-    """
-
     vocab_files_names = VOCAB_FILES_NAMES
     slow_tokenizer_class = InternLM2Tokenizer
     padding_side = "left"
@@ -171,9 +165,7 @@ class InternLM2TokenizerFast(PreTrainedTokenizerFast):
             raise ValueError("add_eos_token = True but eos_token = None")
 
         single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
-        pair = (
-            f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
-        )
+        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
 
         special_tokens = []
         if self.add_bos_token: