Commit 2c190bb by msr2000 (1 parent: c10690d)

Update model names

Changed files:
- config.json (+5 / -5)
- configuration_deepseek.py (+8 / -8)
- modeling_deepseek.py (+69 / -93)
config.json CHANGED

@@ -1,13 +1,13 @@
 {
   "architectures": [
-    "DeepseekForCausalLM"
+    "DeepseekV2ForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_deepseek.DeepseekConfig",
-    "AutoModel": "modeling_deepseek.DeepseekModel",
-    "AutoModelForCausalLM": "modeling_deepseek.DeepseekForCausalLM"
+    "AutoConfig": "configuration_deepseek.DeepseekV2Config",
+    "AutoModel": "modeling_deepseek.DeepseekV2Model",
+    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV2ForCausalLM"
   },
   "aux_loss_alpha": 0.001,
   "bos_token_id": 100000,
@@ -19,7 +19,7 @@
   "intermediate_size": 12288,
   "kv_lora_rank": 512,
   "max_position_embeddings": 163840,
-  "model_type": "deepseek",
+  "model_type": "deepseek_v2",
   "moe_intermediate_size": 1536,
   "moe_layer_freq": 1,
   "n_group": 8,
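The `auto_map` block is what lets the `Auto*` classes resolve to the custom code shipped in this repository, so its entries have to point at the renamed classes. A minimal loading sketch; the Hub repo id below is an assumption, not part of this commit:

```python
# Sketch: resolve the renamed remote-code classes through the auto_map above.
# trust_remote_code=True fetches configuration_deepseek.py / modeling_deepseek.py
# from the repo; the repo id is assumed.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "deepseek-ai/DeepSeek-V2"  # assumed repo id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type)        # "deepseek_v2", matching config.json
print(type(config).__name__)    # "DeepseekV2Config", via the AutoConfig mapping

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype="auto",
)
print(type(model).__name__)     # "DeepseekV2ForCausalLM", via the AutoModelForCausalLM mapping
```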
configuration_deepseek.py CHANGED

@@ -4,11 +4,11 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
 
 DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-class DeepseekConfig(PretrainedConfig):
+class DeepseekV2Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`DeepseekModel`]. It is used to instantiate an DeepSeek
+    This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate an DeepSeek
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the DeepSeek-
+    defaults will yield a similar configuration to that of the DeepSeek-V2.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -17,7 +17,7 @@ class DeepseekConfig(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 102400):
             Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`DeepseekModel`]
+            `inputs_ids` passed when calling [`DeepseekV2Model`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -100,16 +100,16 @@ class DeepseekConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
 
     ```python
-    >>> from transformers import DeepseekModel, DeepseekConfig
+    >>> from transformers import DeepseekV2Model, DeepseekV2Config
 
-    >>> # Initializing a Deepseek
-    >>> configuration = DeepseekConfig()
+    >>> # Initializing a Deepseek-V2 style configuration
+    >>> configuration = DeepseekV2Config()
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
 
-    model_type = "deepseek"
+    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
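The `model_type` string here is what a saved `config.json` carries and what the Auto classes key on, so it is renamed together with the class. A small round-trip sketch, assuming the `configuration_deepseek.py` from this commit is importable locally:

```python
# Sketch: construct the renamed config directly and round-trip it through
# save_pretrained / from_pretrained. Assumes configuration_deepseek.py from
# this commit is on the Python path; directory name is arbitrary.
from configuration_deepseek import DeepseekV2Config

configuration = DeepseekV2Config()                  # defaults similar to DeepSeek-V2
assert configuration.model_type == "deepseek_v2"    # matches "model_type" in config.json

configuration.save_pretrained("./deepseek_v2_cfg")  # writes a config.json with the new model_type
reloaded = DeepseekV2Config.from_pretrained("./deepseek_v2_cfg")
assert reloaded.vocab_size == configuration.vocab_size
```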
modeling_deepseek.py CHANGED

@@ -55,7 +55,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_deepseek import DeepseekConfig
+from .configuration_deepseek import DeepseekV2Config
 import torch.distributed as dist
 import numpy as np
 
@@ -75,7 +75,7 @@ if is_torch_fx_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "DeepseekConfig"
+_CONFIG_FOR_DOC = "DeepseekV2Config"
 
 
 def _get_unpad_data(attention_mask):
@@ -92,34 +92,10 @@ def _get_unpad_data(attention_mask):
     )
 
 
-def _prepare_4d_attention_mask(mask, dtype, tgt_len=None):
-    warnings.warn(
-        "Calling `transformers.models.Deepseek.modeling_Deepseek._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
-    )
-    return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
-
-
-def _make_causal_mask(
-    input_ids_shape: torch.Size,
-    dtype: torch.dtype,
-    device: torch.device,
-    past_key_values_length: int = 0,
-):
-    warnings.warn(
-        "Calling `transformers.models.Deepseek.modeling_Deepseek._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.Deepseek.modeling_Deepseek.AttentionMaskConverter._make_causal_mask"
-    )
-    return AttentionMaskConverter._make_causal_mask(
-        input_ids_shape=input_ids_shape,
-        dtype=dtype,
-        device=device,
-        past_key_values_length=past_key_values_length,
-    )
-
-
-class DeepseekRMSNorm(nn.Module):
+class DeepseekV2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        DeepseekRMSNorm is equivalent to T5LayerNorm
+        DeepseekV2RMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -133,10 +109,10 @@ class DeepseekRMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
 
 
-ALL_LAYERNORM_LAYERS.append(DeepseekRMSNorm)
+ALL_LAYERNORM_LAYERS.append(DeepseekV2RMSNorm)
 
 
-class DeepseekRotaryEmbedding(nn.Module):
+class DeepseekV2RotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 
@@ -179,9 +155,9 @@ class DeepseekRotaryEmbedding(nn.Module):
     )
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Deepseek
-class DeepseekLinearScalingRotaryEmbedding(DeepseekRotaryEmbedding):
-    """DeepseekRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV2
+class DeepseekV2LinearScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
+    """DeepseekV2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def __init__(
         self,
@@ -208,9 +184,9 @@ class DeepseekLinearScalingRotaryEmbedding(DeepseekRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Deepseek
-class DeepseekDynamicNTKScalingRotaryEmbedding(DeepseekRotaryEmbedding):
-    """DeepseekRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV2
+class DeepseekV2DynamicNTKScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
+    """DeepseekV2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
     def __init__(
         self,
@@ -284,7 +260,7 @@ def yarn_linear_ramp_mask(min, max, dim):
     return ramp_func
 
 
-class DeepseekYarnRotaryEmbedding(DeepseekRotaryEmbedding):
+class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
 
     def __init__(
         self,
@@ -396,7 +372,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
-class DeepseekMLP(nn.Module):
+class DeepseekV2MLP(nn.Module):
     def __init__(self, config, hidden_size=None, intermediate_size=None):
         super().__init__()
         self.config = config
@@ -543,7 +519,7 @@ class AddAuxiliaryLoss(torch.autograd.Function):
         return grad_output, grad_loss
 
 
-class DeepseekMoE(nn.Module):
+class DeepseekV2MoE(nn.Module):
     """
     A mixed expert module containing shared experts.
     """
@@ -561,7 +537,7 @@ class DeepseekMoE(nn.Module):
             self.experts = nn.ModuleList(
                 [
                     (
-                        DeepseekMLP(
+                        DeepseekV2MLP(
                             config, intermediate_size=config.moe_intermediate_size
                         )
                         if i >= self.ep_rank * self.experts_per_rank
@@ -577,14 +553,14 @@ class DeepseekMoE(nn.Module):
             self.ep_rank = 0
             self.experts = nn.ModuleList(
                 [
-                    DeepseekMLP(config, intermediate_size=config.moe_intermediate_size)
+                    DeepseekV2MLP(config, intermediate_size=config.moe_intermediate_size)
                     for i in range(config.n_routed_experts)
                 ]
             )
         self.gate = MoEGate(config)
         if config.n_shared_experts is not None:
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
-            self.shared_experts = DeepseekMLP(
+            self.shared_experts = DeepseekV2MLP(
                 config=config, intermediate_size=intermediate_size
             )
 
@@ -702,11 +678,11 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Deepseek
-class DeepseekAttention(nn.Module):
+# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV2
+class DeepseekV2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: DeepseekConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -735,7 +711,7 @@ class DeepseekAttention(nn.Module):
             self.q_a_proj = nn.Linear(
                 self.hidden_size, config.q_lora_rank, bias=config.attention_bias
             )
-            self.q_a_layernorm = DeepseekRMSNorm(config.q_lora_rank)
+            self.q_a_layernorm = DeepseekV2RMSNorm(config.q_lora_rank)
             self.q_b_proj = nn.Linear(
                 config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
             )
@@ -745,7 +721,7 @@ class DeepseekAttention(nn.Module):
             config.kv_lora_rank + config.qk_rope_head_dim,
             bias=config.attention_bias,
         )
-        self.kv_a_layernorm = DeepseekRMSNorm(config.kv_lora_rank)
+        self.kv_a_layernorm = DeepseekV2RMSNorm(config.kv_lora_rank)
         self.kv_b_proj = nn.Linear(
             config.kv_lora_rank,
             self.num_heads
@@ -770,7 +746,7 @@ class DeepseekAttention(nn.Module):
 
     def _init_rope(self):
         if self.config.rope_scaling is None:
-            self.rotary_emb = DeepseekRotaryEmbedding(
+            self.rotary_emb = DeepseekV2RotaryEmbedding(
                 self.qk_rope_head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -779,14 +755,14 @@ class DeepseekAttention(nn.Module):
             scaling_type = self.config.rope_scaling["type"]
             scaling_factor = self.config.rope_scaling["factor"]
             if scaling_type == "linear":
-                self.rotary_emb = DeepseekLinearScalingRotaryEmbedding(
+                self.rotary_emb = DeepseekV2LinearScalingRotaryEmbedding(
                     self.qk_rope_head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     scaling_factor=scaling_factor,
                     base=self.rope_theta,
                 )
             elif scaling_type == "dynamic":
-                self.rotary_emb = DeepseekDynamicNTKScalingRotaryEmbedding(
+                self.rotary_emb = DeepseekV2DynamicNTKScalingRotaryEmbedding(
                     self.qk_rope_head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     scaling_factor=scaling_factor,
@@ -804,7 +780,7 @@ class DeepseekAttention(nn.Module):
                     ]
                     if key in self.config.rope_scaling
                 }
-                self.rotary_emb = DeepseekYarnRotaryEmbedding(
+                self.rotary_emb = DeepseekV2YarnRotaryEmbedding(
                     self.qk_rope_head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     scaling_factor=scaling_factor,
@@ -927,10 +903,10 @@ class DeepseekAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Deepseek
-class DeepseekFlashAttention2(DeepseekAttention):
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV2
+class DeepseekV2FlashAttention2(DeepseekV2Attention):
     """
-    Deepseek flash attention module. This module inherits from `DeepseekAttention` as the weights of the module stays
+    DeepseekV2 flash attention module. This module inherits from `DeepseekV2Attention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -953,7 +929,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # DeepseekFlashAttention2 attention does not support output_attentions
+        # DeepseekV2FlashAttention2 attention does not support output_attentions
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
@@ -1027,7 +1003,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
             # therefore the input hidden states gets silently casted in float32. Hence, we need
             # cast them back in the correct dtype just to be sure everything works as expected.
             # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-            # in fp32. (DeepseekRMSNorm handles it correctly)
+            # in fp32. (DeepseekV2RMSNorm handles it correctly)
 
             input_dtype = query_states.dtype
             if input_dtype == torch.float32:
@@ -1103,7 +1079,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
         if not self._flash_attn_uses_top_left_mask:
             causal = self.is_causal
         else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekFlashAttention2 __init__.
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV2FlashAttention2 __init__.
             causal = self.is_causal and query_length != 1
 
         # Contains at least one padding token in the sequence
@@ -1198,13 +1174,13 @@ class DeepseekFlashAttention2(DeepseekAttention):
 
 
 ATTENTION_CLASSES = {
-    "eager": DeepseekAttention,
-    "flash_attention_2": DeepseekFlashAttention2,
+    "eager": DeepseekV2Attention,
+    "flash_attention_2": DeepseekV2FlashAttention2,
 }
 
 
-class DeepseekDecoderLayer(nn.Module):
-    def __init__(self, config: DeepseekConfig, layer_idx: int):
+class DeepseekV2DecoderLayer(nn.Module):
+    def __init__(self, config: DeepseekV2Config, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -1213,18 +1189,18 @@ class DeepseekDecoderLayer(nn.Module):
         )
 
         self.mlp = (
-            DeepseekMoE(config)
+            DeepseekV2MoE(config)
             if (
                 config.n_routed_experts is not None
                 and layer_idx >= config.first_k_dense_replace
                 and layer_idx % config.moe_layer_freq == 0
             )
-            else DeepseekMLP(config)
+            else DeepseekV2MLP(config)
         )
-        self.input_layernorm = DeepseekRMSNorm(
+        self.input_layernorm = DeepseekV2RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps
         )
-        self.post_attention_layernorm = DeepseekRMSNorm(
+        self.post_attention_layernorm = DeepseekV2RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps
         )
 
@@ -1291,7 +1267,7 @@ class DeepseekDecoderLayer(nn.Module):
         return outputs
 
 
-Deepseek_START_DOCSTRING = r"""
+DeepseekV2_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -1301,7 +1277,7 @@ Deepseek_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`DeepseekConfig`]):
+        config ([`DeepseekV2Config`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1309,14 +1285,14 @@ Deepseek_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Deepseek Model outputting raw hidden-states without any specific head on top.",
-    Deepseek_START_DOCSTRING,
+    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
+    DeepseekV2_START_DOCSTRING,
 )
-class DeepseekPreTrainedModel(PreTrainedModel):
-    config_class = DeepseekConfig
+class DeepseekV2PreTrainedModel(PreTrainedModel):
+    config_class = DeepseekV2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["DeepseekDecoderLayer"]
+    _no_split_modules = ["DeepseekV2DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -1334,7 +1310,7 @@ class DeepseekPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-Deepseek_INPUTS_DOCSTRING = r"""
+DeepseekV2_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1405,18 +1381,18 @@ Deepseek_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Deepseek Model outputting raw hidden-states without any specific head on top.",
-    Deepseek_START_DOCSTRING,
+    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
+    DeepseekV2_START_DOCSTRING,
 )
-class DeepseekModel(DeepseekPreTrainedModel):
+class DeepseekV2Model(DeepseekV2PreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
 
     Args:
-        config: DeepseekConfig
+        config: DeepseekV2Config
     """
 
-    def __init__(self, config: DeepseekConfig):
+    def __init__(self, config: DeepseekV2Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -1426,13 +1402,13 @@ class DeepseekModel(DeepseekPreTrainedModel):
         )
         self.layers = nn.ModuleList(
             [
-                DeepseekDecoderLayer(config, layer_idx)
+                DeepseekV2DecoderLayer(config, layer_idx)
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
         self._use_sdpa = config._attn_implementation == "sdpa"
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
-        self.norm = DeepseekRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1444,7 +1420,7 @@ class DeepseekModel(DeepseekPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1604,12 +1580,12 @@ class DeepseekModel(DeepseekPreTrainedModel):
         )
 
 
-class DeepseekForCausalLM(DeepseekPreTrainedModel):
+class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
-        self.model = DeepseekModel(config)
+        self.model = DeepseekV2Model(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1634,7 +1610,7 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     @replace_return_docstrings(
         output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
     )
@@ -1663,9 +1639,9 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, DeepseekForCausalLM
+        >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
 
-        >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1811,9 +1787,9 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
 
 @add_start_docstrings(
     """
-    The Deepseek Model transformer with a sequence classification head on top (linear layer).
+    The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).
 
-    [`DeepseekForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1822,13 +1798,13 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    Deepseek_START_DOCSTRING,
+    DeepseekV2_START_DOCSTRING,
 )
-class DeepseekForSequenceClassification(DeepseekPreTrainedModel):
+class DeepseekV2ForSequenceClassification(DeepseekV2PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = DeepseekModel(config)
+        self.model = DeepseekV2Model(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1840,7 +1816,7 @@ class DeepseekForSequenceClassification(DeepseekPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
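For reference, an end-to-end generation sketch in the spirit of the docstring example above. The checkpoint path is a placeholder, and since the `DeepseekV2*` classes live in this repository's remote code rather than in the transformers library itself, they are reached through the Auto classes:

```python
# Sketch: generate with the renamed model classes via remote code.
# The checkpoint path is a placeholder, not part of this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "path/to/deepseek-v2-checkpoint"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, trust_remote_code=True, torch_dtype=torch.bfloat16
)

prompt = "Hey, are you conscious? Can you talk to me?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```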