liminghong committed
Commit e228d76 • 1 Parent(s): ff974d2
convert bias type
bert_layers.py CHANGED (+3 -3)
@@ -171,18 +171,18 @@ class BertUnpadSelfAttention(nn.Module):
                                               3)  # b s h d
         else:
             # Triton implementation only supports 0 attention dropout
+            bias_dtype = bias.dtype
+            bias = bias.to(torch.float16)
             convert_dtype = qkv.dtype not in [torch.float16, torch.bfloat16]
             if convert_dtype:
                 # Triton implementation only supports fp16 and bf16
                 orig_dtype = qkv.dtype
                 qkv = qkv.to(torch.float16)
-                bias_dtype = bias.dtype
-                bias = bias.to(torch.float16)
                 attention = flash_attn_qkvpacked_func(qkv, bias)
                 attention = attention.to(orig_dtype)
-                bias = bias.to(bias_dtype)
             else:
                 attention = flash_attn_qkvpacked_func(qkv, bias)
+            bias = bias.to(bias_dtype)

         # attn_mask is 1 for attend and 0 for don't
         attention = unpad_input_only(attention, torch.squeeze(attn_mask) == 1)
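For context, the effect of the change is that the bias is now cast to fp16 before every call into the Triton kernel, not only when qkv itself needs conversion, and cast back to its original dtype afterwards. Below is a minimal, self-contained sketch of that pattern; the wrapper name flash_attention_with_dtype_guard and the attn_fn stand-in for the repo's flash_attn_qkvpacked_func are illustrative assumptions, not code from the repository.

import torch


def flash_attention_with_dtype_guard(qkv, bias, attn_fn):
    # Sketch only: attn_fn stands in for flash_attn_qkvpacked_func, i.e. any
    # attention kernel that accepts only fp16/bf16 inputs.

    # Cast the bias to fp16 unconditionally and remember its dtype, as in the commit.
    bias_dtype = bias.dtype
    bias = bias.to(torch.float16)

    convert_dtype = qkv.dtype not in [torch.float16, torch.bfloat16]
    if convert_dtype:
        # The kernel only supports fp16/bf16: cast qkv down for the call,
        # then cast the result back to the caller's dtype.
        orig_dtype = qkv.dtype
        qkv = qkv.to(torch.float16)
        attention = attn_fn(qkv, bias)
        attention = attention.to(orig_dtype)
    else:
        attention = attn_fn(qkv, bias)

    # Cast the local bias back to its original dtype, mirroring the moved line.
    bias = bias.to(bias_dtype)
    return attention

Hoisting the bias cast out of the convert_dtype branch presumably matters when the bias is kept in fp32 while qkv already arrives in fp16 or bf16; in the pre-change code that combination would reach the kernel with an unconverted bias.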