Use attention dropout during training (#10)
- fix: use attention dropout with torch SDPA implementation (5bc298762e07eccbf5cd43fa33904d6dfdc1c601)
- feat: set self.dropout_p in constructor (02ebe529e85a23d23c837d4e7a1041a8c955af77)
- modeling_bert.py +4 -2
modeling_bert.py
@@ -281,7 +281,8 @@ class JinaBertSelfAttention(nn.Module):
         self.key = nn.Linear(config.hidden_size, self.all_head_size)
         self.value = nn.Linear(config.hidden_size, self.all_head_size)
 
-        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.dropout_p = config.attention_probs_dropout_prob
+        self.dropout = nn.Dropout(self.dropout_p)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -356,7 +357,8 @@ class JinaBertSelfAttention(nn.Module):
         if self.attn_implementation == 'torch' and scaled_dot_product_attention is not None:
             b, _, s, _ = query_layer.shape
             new_bias = attention_mask + bias
-            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias)
+            dropout_p = self.dropout_p if self.training else 0.0
+            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias, dropout_p=dropout_p)
             attn = attn.permute(0, 2, 1, 3).contiguous()
             return (attn.view(b, s, self.all_head_size),)
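For reference, the pattern this commit adopts can be reproduced in isolation. The sketch below is a minimal, hypothetical attention module, not code from this repository: names such as ToyAttention, qkv, hidden_size, num_heads, and bias are illustrative. It shows why both halves of the change matter: the dropout probability from the config is stored once in the constructor, and it is passed to torch.nn.functional.scaled_dot_product_attention only while the module is in training mode, so inference stays deterministic.

# Minimal sketch (not from the repository): attention dropout that is active
# only while the module is training, mirroring the pattern in this commit.
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


class ToyAttention(nn.Module):  # hypothetical module, for illustration only
    def __init__(self, hidden_size: int, num_heads: int, attention_probs_dropout_prob: float):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.qkv = nn.Linear(hidden_size, 3 * hidden_size)
        # Store the probability once so the eager path (nn.Dropout) and the
        # SDPA path (dropout_p=...) both read the same config value.
        self.dropout_p = attention_probs_dropout_prob
        self.dropout = nn.Dropout(self.dropout_p)  # kept for a non-SDPA fallback path

    def forward(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        b, s, _ = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # (b, s, h*d) -> (b, h, s, d)
        q, k, v = (t.view(b, s, self.num_heads, self.head_dim).transpose(1, 2) for t in (q, k, v))
        # SDPA applies dropout internally and does not consult self.training on its own,
        # so it has to be disabled explicitly at eval time.
        dropout_p = self.dropout_p if self.training else 0.0
        attn = F.scaled_dot_product_attention(q, k, v, attn_mask=bias, dropout_p=dropout_p)
        return attn.transpose(1, 2).reshape(b, s, -1)


layer = ToyAttention(hidden_size=64, num_heads=4, attention_probs_dropout_prob=0.1)
x = torch.randn(2, 8, 64)
layer.train()
out_train = layer(x)        # attention weights are randomly dropped
layer.eval()
with torch.no_grad():
    out_eval = layer(x)     # dropout_p == 0.0, deterministic

Gating on self.training rather than removing the dropout argument keeps a single code path for train and eval, which is the same choice the diff above makes.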