ydshieh committed
Commit a01b02a
1 Parent(s): f082d66

remove only_self_attn

Files changed (1)
  1. vit_gpt2/modeling_flax_gpt2.py +9 -10
vit_gpt2/modeling_flax_gpt2.py CHANGED
@@ -299,15 +299,13 @@ class FlaxGPT2Block(nn.Module):
 
     def setup(self):
 
-        self.only_self_attn = not self.config.add_cross_attention
-
         hidden_size = self.config.hidden_size
         inner_dim = self.config.n_inner if self.config.n_inner is not None else 4 * hidden_size
 
         self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
         self.attn = FlaxGPT2Attention(self.config, dtype=self.dtype)
 
-        if not self.only_self_attn:
+        if self.config.add_cross_attention:
             self.cross_attn_ln = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
             # [IMPORTANT] Cross attention requires ``causal=False``! This is a bug I made previously.
             self.cross_attn = FlaxGPT2Attention(config=self.config, dtype=self.dtype, causal=False, self_attn=False)
@@ -343,16 +341,17 @@
         attn_output = outputs[0]
         hidden_states = attn_output + residual
 
-        # sanity check
-        if not self.only_self_attn:
-            assert encoder_hidden_states is not None
-        else:
-            assert encoder_hidden_states is None
-
         # Cross-Attention Block
         cross_attn_weights = None
         if encoder_hidden_states is not None:
 
+            # add one self-attention block for cross-attention
+            if not hasattr(self, "cross_attn"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+
             project_encoder = getattr(self.config, "project_encoder", None)
             if project_encoder:
                 residual = encoder_hidden_states
@@ -393,7 +392,7 @@
         if output_attentions:
             self_attn_weights = attn_output[1]
             outputs += (self_attn_weights,)
-            if not self.only_self_attn:
+            if cross_attn_weights is not None:
                 outputs += (cross_attn_weights,)
 
         return outputs
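
For context, a minimal, self-contained sketch of the pattern this commit moves to. This is not code from the repository: ToyConfig, ToyBlock and the nn.Dense layers are illustrative stand-ins for GPT2Config, FlaxGPT2Block and the attention modules. It shows the three pieces of the change: cross-attention sub-modules are created in setup() only when config.add_cross_attention is set, a ValueError replaces the old flag-based asserts when encoder states reach a block without those sub-modules, and the extra output is appended only when it was actually computed.

# Illustrative sketch only -- ToyConfig/ToyBlock/nn.Dense stand in for
# GPT2Config/FlaxGPT2Block/FlaxGPT2Attention; this is not the repo's code.
from dataclasses import dataclass

import jax
import jax.numpy as jnp
import flax.linen as nn


@dataclass
class ToyConfig:
    hidden_size: int = 8
    add_cross_attention: bool = False


class ToyBlock(nn.Module):
    config: ToyConfig

    def setup(self):
        self.dense = nn.Dense(self.config.hidden_size)
        # As in the new setup(): read the config directly, no cached only_self_attn flag,
        # and only build the cross-attention sub-module when it is requested.
        if self.config.add_cross_attention:
            self.cross_dense = nn.Dense(self.config.hidden_size)

    def __call__(self, hidden_states, encoder_hidden_states=None, output_attentions=False):
        hidden_states = self.dense(hidden_states)
        cross_out = None
        if encoder_hidden_states is not None:
            # As in the new __call__: raise instead of asserting on a flag when encoder
            # states are passed to a block that was built without cross-attention layers.
            if not hasattr(self, "cross_dense"):
                raise ValueError(
                    "`encoder_hidden_states` were passed, but this block was built with "
                    "`config.add_cross_attention=False`."
                )
            cross_out = self.cross_dense(encoder_hidden_states)
            hidden_states = hidden_states + cross_out

        outputs = (hidden_states,)
        if output_attentions:
            # As in the last hunk: append the cross term only when it was actually computed.
            if cross_out is not None:
                outputs += (cross_out,)
        return outputs


x = jnp.ones((1, 4, 8))
enc = jnp.ones((1, 4, 8))

block = ToyBlock(ToyConfig(add_cross_attention=True))
params = block.init(jax.random.PRNGKey(0), x, enc)
out = block.apply(params, x, enc, output_attentions=True)  # two outputs: hidden + cross term

plain = ToyBlock(ToyConfig())                  # add_cross_attention=False
plain_params = plain.init(jax.random.PRNGKey(0), x)
# plain.apply(plain_params, x, enc)            # would raise the ValueError above

The hasattr check works in Flax linen because setup() only registers the attributes it actually assigns, so a block built with add_cross_attention=False simply has no cross-attention field at call time; that lets the runtime guard replace the removed only_self_attn bookkeeping.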