damerajee committed
Commit 3792fda
1 Parent(s): d5806c5

Update modeling_Llamoe.py

Files changed (1):
  1. modeling_Llamoe.py +9 -10
modeling_Llamoe.py CHANGED
@@ -646,14 +646,14 @@ class LlamoeFlashAttention2(LlamoeAttention):
         )
 
 
-class LlamoeSdpaAttention(LlamoeAttention):
+class LlamoeSdpaAttention(GemmoeAttention):
     """
-    Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    Gemmoe attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `GemmoeAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
 
-    # Adapted from LlamaAttention.forward
+    # Ignore copy
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -667,7 +667,7 @@ class LlamoeSdpaAttention(LlamoeAttention):
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
-                "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                "LlamoeModel is using LlamoeSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
@@ -690,10 +690,9 @@ class LlamoeSdpaAttention(LlamoeAttention):
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
-        cos, sin = self.rotary_emb(value_states, position_ids)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None)
 
-        # In case static cache is used, it is an instance attribute.
         past_key_value = getattr(self, "past_key_value", past_key_value)
 
         if past_key_value is not None:
@@ -724,12 +723,12 @@ class LlamoeSdpaAttention(LlamoeAttention):
         )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.view(bsz, q_len, -1)
 
         attn_output = self.o_proj(attn_output)
 
         return attn_output, None, past_key_value
-
+
 
 LLAMOE_ATTENTION_CLASSES = {
     "eager": LlamoeAttention,