damerajee committed on
Commit 0c54f9c
1 Parent(s): af76b52

Update modeling_Llamoe.py

Files changed (1):
  1. modeling_Llamoe.py +4 -11
modeling_Llamoe.py CHANGED
@@ -525,7 +525,7 @@ class LlamoeFlashAttention2(LlamoeAttention):
 
 
 # Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Gemmoe
-class LlamoeSdpaAttention(LlamoeAttention):
+class LlamoeSdpaAttention(GemmoeAttention):
     """
     Gemmoe attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
     `GemmoeAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
@@ -564,17 +564,10 @@ class LlamoeSdpaAttention(LlamoeAttention):
         query_states = self.q_proj(hidden_states)
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
-        print("query :",query_states.shape)
-        print("key :",key_states.shape)
-        print("value :",value_states.shape)
-
-        query_states = query_states.view(bsz, self.num_heads, q_len, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, self.num_key_value_heads, q_len, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, self.num_key_value_heads, q_len, self.head_dim).transpose(1, 2)
-        print("queryafter :",query_states.shape)
-        print("ketafter :",key_states.shape)
-        print("valueafter :",value_states.shape)
 
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
         cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None)
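
A note on why the reshape fix above matters: the q/k/v projections emit tensors of shape (bsz, q_len, num_heads * head_dim), so the last axis must be split into heads first, via view(bsz, q_len, num_heads, head_dim), and only then transposed into the (bsz, num_heads, q_len, head_dim) layout attention expects. The removed view(bsz, num_heads, q_len, head_dim) has the same element count, so PyTorch raises no error, but it silently interleaves token and head data. A minimal standalone sketch (the small dimensions are illustrative, not taken from the model config):

    import torch

    bsz, q_len, num_heads, head_dim = 2, 5, 4, 8

    # A projection output is laid out as (bsz, q_len, num_heads * head_dim):
    # each token's per-head slices sit contiguously along the last axis.
    hidden = torch.randn(bsz, q_len, num_heads * head_dim)

    # Fixed version: split the last axis into heads, then swap the token and
    # head axes to reach the (bsz, num_heads, q_len, head_dim) layout.
    fixed = hidden.view(bsz, q_len, num_heads, head_dim).transpose(1, 2)

    # Removed version: the element count matches, so .view() does not error,
    # but token and head data are scrambled and the final shape is wrong too.
    buggy = hidden.view(bsz, num_heads, q_len, head_dim).transpose(1, 2)

    # Head 0 of token 1 in batch 0 should be that token's first head_dim features.
    expected = hidden[0, 1, :head_dim]
    print(torch.equal(fixed[0, 0, 1], expected))   # True
    print(fixed.shape)                             # torch.Size([2, 4, 5, 8])
    print(buggy.shape)                             # torch.Size([2, 5, 4, 8])

The same reasoning applies to key_states and value_states, except that under grouped-query attention they use num_key_value_heads in place of num_heads.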