ydshieh committed
Commit 87485e5
Parent(s): 3a17811
final ver

vit_gpt2/modeling_flax_gpt2.py CHANGED (+159 -37)
@@ -24,7 +24,7 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
 from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
-from ...modeling_flax_outputs import
+from ...modeling_flax_outputs import FlaxBaseModelOutputWithPastAndCrossAttentions, FlaxCausalLMOutputWithCrossAttentions
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
 from ...utils import logging
 from .configuration_gpt2 import GPT2Config

@@ -117,6 +117,8 @@ class FlaxConv1D(nn.Module):
 class FlaxGPT2Attention(nn.Module):
     config: GPT2Config
     dtype: jnp.dtype = jnp.float32
+    causal: bool = True
+    is_cross_attention: bool = False
 
     def setup(self):
         config = self.config

@@ -124,10 +126,19 @@ class FlaxGPT2Attention(nn.Module):
         self.num_heads = config.num_attention_heads
         self.head_dim = self.embed_dim // self.num_heads
 
-
+        if self.is_cross_attention:
+            self.c_attn = FlaxConv1D(2 * self.embed_dim, dtype=self.dtype)
+            self.q_attn = FlaxConv1D(self.embed_dim, dtype=self.dtype)
+        else:
+            self.c_attn = FlaxConv1D(3 * self.embed_dim, dtype=self.dtype)
         self.c_proj = FlaxConv1D(self.embed_dim, dtype=self.dtype)
+
         self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
-
+
+        if self.causal:
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool"
+            )
 
     def _split_heads(self, hidden_states):
         return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
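The two projection layouts defined in setup() feed the split done later in __call__: plain self-attention projects the decoder states to width 3*embed_dim and splits it into query/key/value, while cross-attention projects the decoder states to a query and the encoder states to a 2*embed_dim key/value pair. A minimal shape sketch (illustrative names, not the module itself):

import jax.numpy as jnp

batch, tgt_len, src_len, embed = 2, 5, 7, 8

# self-attention path: one width-3*embed projection, split into query, key, value
qkv_out = jnp.zeros((batch, tgt_len, 3 * embed))
query, key, value = jnp.split(qkv_out, 3, axis=2)
assert query.shape == key.shape == value.shape == (batch, tgt_len, embed)

# cross-attention path: query from the decoder states, key/value from the encoder states
q_out = jnp.zeros((batch, tgt_len, embed))
kv_out = jnp.zeros((batch, src_len, 2 * embed))
key, value = jnp.split(kv_out, 2, axis=2)
assert key.shape == value.shape == (batch, src_len, embed)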
@@ -170,13 +181,26 @@ class FlaxGPT2Attention(nn.Module):
     def __call__(
         self,
         hidden_states,
+        key_value_states: Optional[jnp.ndarray] = None,
         attention_mask=None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
     ):
-
-
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+
+        if not is_cross_attention:
+            qkv_out = self.c_attn(hidden_states)
+            query, key, value = jnp.split(qkv_out, 3, axis=2)
+        else:
+            q_out = self.q_attn(hidden_states)
+            (query,) = jnp.split(q_out, 1, axis=2)
+            kv_out = self.c_attn(key_value_states)
+            key, value = jnp.split(kv_out, 2, axis=2)
 
         query = self._split_heads(query)
         key = self._split_heads(key)

@@ -184,20 +208,25 @@ class FlaxGPT2Attention(nn.Module):
 
         query_length, key_length = query.shape[1], key.shape[1]
 
-        if self.
-
-
-
-
-
-
-
-
-
-
-
-        attention_mask
-
+        if self.causal:
+            if self.has_variable("cache", "cached_key"):
+                mask_shift = self.variables["cache"]["cache_index"]
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
 
         dropout_rng = None
         if not deterministic and self.config.attn_pdrop > 0.0:
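The mask handling added above broadcasts the stored causal mask to the batch and ANDs it with the 2D padding mask. A small sketch, assuming the same flax.linen helpers the file uses (make_causal_mask, combine_masks) and illustrative sizes:

import jax.numpy as jnp
from flax.linen import combine_masks, make_causal_mask

batch, seq = 1, 4
causal_mask = make_causal_mask(jnp.ones((1, seq), dtype="bool"), dtype="bool")  # (1, 1, 4, 4), lower-triangular
causal_mask = jnp.broadcast_to(causal_mask, (batch,) + causal_mask.shape[1:])

padding_mask = jnp.array([[1, 1, 1, 0]])  # last position is padding
padding_mask = jnp.broadcast_to(jnp.expand_dims(padding_mask, axis=(-3, -2)), causal_mask.shape)

attention_mask = combine_masks(padding_mask, causal_mask, dtype="bool")  # elementwise AND, shape (1, 1, 4, 4)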
@@ -205,15 +234,18 @@ class FlaxGPT2Attention(nn.Module):
 
         # During fast autoregressive decoding, we feed one position at a time,
         # and cache the keys and values step by step.
-        if self.has_variable("cache", "cached_key") or init_cache:
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
             key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
 
         # transform boolean mask into float mask
-
-
-
-
-
+        if attention_mask is not None:
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, -1e4).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
 
         # usual dot product attention
         attn_weights = dot_product_attention_weights(
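The boolean-to-float conversion above can be sketched in isolation: allowed positions become a 0.0 bias and masked positions a large negative value, which flax's dot_product_attention_weights accepts through its bias argument and adds to the attention logits before the softmax. Illustrative values only:

import jax.numpy as jnp
from jax import lax

attention_mask = jnp.array([[[[True, True, False, False]]]])  # (batch, 1, q_len, kv_len)
attention_bias = lax.select(
    attention_mask > 0,
    jnp.full(attention_mask.shape, 0.0),
    jnp.full(attention_mask.shape, -1e4),
)
# attention_bias == [[[[0., 0., -10000., -10000.]]]]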
@@ -267,19 +299,31 @@ class FlaxGPT2Block(nn.Module):
         self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
         self.attn = FlaxGPT2Attention(self.config, dtype=self.dtype)
         self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+
+        if self.config.add_cross_attention:
+            self.crossattention = FlaxGPT2Attention(config=self.config, dtype=self.dtype, causal=False, is_cross_attention=True)
+            self.ln_cross_attn = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+
+            project_encoder = getattr(self.config, "project_encoder", None)
+            if project_encoder:
+                self.encoder_projection_ln = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+                self.encoder_projection_mlp = FlaxGPT2MLP(self.config, self.config.hidden_size, dtype=self.dtype)
+
         self.mlp = FlaxGPT2MLP(self.config, inner_dim, dtype=self.dtype)
 
     def __call__(
         self,
         hidden_states,
         attention_mask=None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
     ):
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
-
+        attn_outputs = self.attn(
             hidden_states,
             attention_mask=attention_mask,
             deterministic=deterministic,

@@ -287,16 +331,53 @@ class FlaxGPT2Block(nn.Module):
             output_attentions=output_attentions,
         )
         # residual connection
-        attn_output =
+        attn_output = attn_outputs[0]  # output_attn: a, (attentions)
+        outputs = attn_outputs[1:]
+        # residual connection
         hidden_states = attn_output + residual
 
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            # add one self-attention block for cross-attention
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+
+            project_encoder = getattr(self.config, "project_encoder", None)
+            if project_encoder:
+                encoder_hidden_states = self.encoder_projection_ln(encoder_hidden_states)
+                feed_forward_hidden_states = self.encoder_projection_mlp(
+                    encoder_hidden_states, deterministic=deterministic
+                )
+                # residual connection
+                encoder_hidden_states = feed_forward_hidden_states
+
+            residual = hidden_states
+            hidden_states = self.ln_cross_attn(hidden_states)
+            cross_attn_outputs = self.crossattention(
+                hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                deterministic=deterministic,
+                output_attentions=output_attentions,
+            )
+            attn_output = cross_attn_outputs[0]
+            # residual connection
+            hidden_states = residual + attn_output
+            outputs = outputs + cross_attn_outputs[1:]  # add cross attentions if we output attention weights
+
         residual = hidden_states
         hidden_states = self.ln_2(hidden_states)
         feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
         # residual connection
         hidden_states = residual + feed_forward_hidden_states
 
-
+        outputs = (hidden_states,) + outputs
+
+        return outputs
 
 
 class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
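At block level the change amounts to inserting an optional pre-LayerNorm cross-attention step between the self-attention and the MLP, each wrapped in a residual connection. A dataflow sketch with stand-in callables (not the actual Flax modules):

import jax.numpy as jnp

def identity(h):
    return h

def cross_identity(h, enc):
    return h

def block_sketch(x, encoder_states, ln_1, attn, ln_cross, cross_attn, ln_2, mlp):
    x = x + attn(ln_1(x))                                 # self-attention + residual
    if encoder_states is not None:                        # only when config.add_cross_attention is set
        x = x + cross_attn(ln_cross(x), encoder_states)   # cross-attention over encoder states + residual
    x = x + mlp(ln_2(x))                                  # feed-forward + residual
    return x

out = block_sketch(
    jnp.zeros((2, 5, 8)),   # decoder hidden states
    jnp.zeros((2, 7, 8)),   # encoder hidden states (e.g. image features)
    identity, identity, identity, cross_identity, identity, identity,
)
assert out.shape == (2, 5, 8)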
@@ -328,7 +409,19 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
 
-
+        if self.config.add_cross_attention:
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
+            encoder_attention_mask = attention_mask
+            module_init_outputs = self.module.init(
+                rngs, input_ids, attention_mask, position_ids,
+                encoder_hidden_states, encoder_attention_mask, return_dict=False
+            )
+        else:
+            module_init_outputs = self.module.init(
+                rngs, input_ids, attention_mask, position_ids, return_dict=False
+            )
+
+        return module_init_outputs["params"]
 
     def init_cache(self, batch_size, max_length):
         r"""

@@ -355,6 +448,8 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
         input_ids,
         attention_mask=None,
         position_ids=None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
         params: dict = None,
         past_key_values: dict = None,
         dropout_rng: jax.random.PRNGKey = None,

@@ -369,6 +464,10 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.return_dict
 
+        if encoder_hidden_states is not None and encoder_attention_mask is None:
+            batch_size, sequence_length = encoder_hidden_states.shape[:2]
+            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
         batch_size, sequence_length = input_ids.shape
 
         if position_ids is None:

@@ -399,6 +498,8 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
             jnp.array(input_ids, dtype="i4"),
             jnp.array(attention_mask, dtype="i4"),
             jnp.array(position_ids, dtype="i4"),
+            encoder_hidden_states,
+            encoder_attention_mask,
             not train,
             False,
             output_attentions,
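Two small conveniences in the wrapper are worth spelling out: init_weights passes zero-valued dummy encoder states so the cross-attention parameters get created, and __call__ defaults the encoder attention mask to all ones when encoder states are given without one. Illustrative shapes only (768 is the GPT-2 default hidden size):

import jax.numpy as jnp

# init_weights: dummy encoder states shaped like the decoder init inputs plus the hidden size
input_shape, n_embd = (1, 1), 768
dummy_encoder_hidden_states = jnp.zeros(input_shape + (n_embd,))  # (1, 1, 768)

# __call__: encoder states supplied without a mask default to attending over every position
encoder_hidden_states = jnp.ones((1, 50, n_embd))
encoder_attention_mask = jnp.ones(encoder_hidden_states.shape[:2])  # (1, 50)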
@@ -433,6 +534,8 @@ class FlaxGPT2BlockCollection(nn.Module):
         self,
         hidden_states,
         attention_mask=None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,

@@ -441,6 +544,7 @@ class FlaxGPT2BlockCollection(nn.Module):
     ):
         all_attentions = () if output_attentions else None
         all_hidden_states = () if output_hidden_states else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
 
         for block in self.blocks:
             if output_hidden_states:

@@ -449,6 +553,8 @@ class FlaxGPT2BlockCollection(nn.Module):
             layer_outputs = block(
                 hidden_states,
                 attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
                 deterministic=deterministic,
                 init_cache=init_cache,
                 output_attentions=output_attentions,

@@ -458,19 +564,22 @@ class FlaxGPT2BlockCollection(nn.Module):
             if output_attentions:
                 all_attentions += (layer_outputs[1],)
 
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        outputs =
+        outputs = [hidden_states, all_hidden_states, all_attentions, all_cross_attentions]
 
         if not return_dict:
             return tuple(v for v in outputs if v is not None)
 
-        return
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
             last_hidden_state=hidden_states,
-            past_key_values=None,
             hidden_states=all_hidden_states,
             attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
         )
 
 
@@ -502,6 +611,8 @@ class FlaxGPT2Module(nn.Module):
         input_ids,
         attention_mask,
         position_ids,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
         deterministic=True,
         init_cache: bool = False,
         output_attentions: bool = False,

@@ -517,6 +628,8 @@ class FlaxGPT2Module(nn.Module):
         outputs = self.h(
             hidden_states,
             attention_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
             deterministic=deterministic,
             init_cache=init_cache,
             output_attentions=output_attentions,

@@ -530,10 +643,11 @@ class FlaxGPT2Module(nn.Module):
         if not return_dict:
             return (hidden_states,) + outputs[1:]
 
-        return
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
             last_hidden_state=hidden_states,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
         )
 
 

@@ -546,7 +660,7 @@ class FlaxGPT2Model(FlaxGPT2PreTrainedModel):
 
 
 append_call_sample_docstring(
-    FlaxGPT2Model, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC,
+    FlaxGPT2Model, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPastAndCrossAttentions, _CONFIG_FOR_DOC
 )
 
 

@@ -568,6 +682,8 @@ class FlaxGPT2LMHeadModule(nn.Module):
         input_ids,
         attention_mask,
         position_ids,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,

@@ -578,6 +694,8 @@ class FlaxGPT2LMHeadModule(nn.Module):
             input_ids,
             attention_mask,
             position_ids,
+            encoder_hidden_states,
+            encoder_attention_mask,
             deterministic=deterministic,
             init_cache=init_cache,
             output_attentions=output_attentions,

@@ -596,8 +714,12 @@ class FlaxGPT2LMHeadModule(nn.Module):
         if not return_dict:
             return (lm_logits,) + outputs[1:]
 
-        return
-
+        return FlaxCausalLMOutputWithCrossAttentions(
+            logits=lm_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions
+        )
 
 @add_start_docstrings(
     """

@@ -637,5 +759,5 @@ class FlaxGPT2LMHeadModel(FlaxGPT2PreTrainedModel):
 
 
 append_call_sample_docstring(
-    FlaxGPT2LMHeadModel, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC,
+    FlaxGPT2LMHeadModel, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutputWithCrossAttentions, _CONFIG_FOR_DOC
 )
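A hedged end-to-end sketch of the decoder this commit produces. The import path below is an assumption taken from the repo layout shown in the diff (vit_gpt2/modeling_flax_gpt2.py) and may need adjusting to the actual packaging; everything else uses standard GPT2Config fields:

import jax.numpy as jnp
from transformers import GPT2Config

# hypothetical import path for the class defined in this file; adjust to the actual packaging
from vit_gpt2.modeling_flax_gpt2 import FlaxGPT2LMHeadModel

config = GPT2Config(add_cross_attention=True)
model = FlaxGPT2LMHeadModel(config)

input_ids = jnp.ones((1, 10), dtype="i4")                  # decoder token ids
encoder_hidden_states = jnp.ones((1, 50, config.n_embd))   # e.g. ViT patch embeddings
outputs = model(
    input_ids,
    encoder_hidden_states=encoder_hidden_states,
    output_attentions=True,
)

outputs.logits.shape        # (1, 10, config.vocab_size)
outputs.cross_attentions    # one (batch, heads, 10, 50) array per block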