Spaces:

flax-community
/

dalle-mini

Running

App Files Community

boris committed on Dec 19, 2021

Commit

a96f4dc

•

1 Parent(s): a11892f

fix: adjust training script + dataloader

Browse files

Files changed (7) hide show

dalle_mini/data.py +4 -6
dalle_mini/model.py +0 -64
dalle_mini/model/__init__.py +2 -0
dalle_mini/{configuration_bart.py → model/configuration.py} +8 -4
dalle_mini/{modeling_bart_flax.py → model/modeling.py} +24 -24
dalle_mini/{partitions.py → model/partitions.py} +1 -1
tools/train/train.py +6 -63

dalle_mini/data.py CHANGED Viewed

@@ -15,12 +15,10 @@ class Dataset:
     dataset_repo_or_path: str
     train_file: str = None
     validation_file: str = None
-    dataset_type: str = "dataset"
     streaming: bool = True
     use_auth_token: bool = False
     text_column: str = "caption"
     encoding_column: str = "encoding"
-    max_source_length: int = 128
     max_train_samples: int = None
     max_eval_samples: int = None
     preprocessing_num_workers: int = None
@@ -70,7 +68,7 @@ class Dataset:
                     else self.eval_dataset.select(range(self.max_eval_samples))
                 )
-    def preprocess(self, tokenizer, decoder_start_token_id, normalize_text):
         if self.streaming:
             # we need to shuffle early in streaming mode
             if hasattr(self, "train_dataset"):
@@ -112,7 +110,7 @@ class Dataset:
             tokenizer=tokenizer,
             text_column=self.text_column,
             encoding_column=self.encoding_column,
-            max_source_length=self.max_source_length,
             decoder_start_token_id=decoder_start_token_id,
         )
         for ds in ["train_dataset", "eval_dataset"]:
@@ -232,14 +230,14 @@ def preprocess_function(
     tokenizer,
     text_column,
     encoding_column,
-    max_source_length,
     decoder_start_token_id,
 ):
     inputs = examples[text_column]
     # Setting padding="max_length" as we need fixed length inputs for jitted functions
     model_inputs = tokenizer(
         inputs,
-        max_length=max_source_length,
         padding="max_length",
         truncation=True,
         return_tensors="np",

     dataset_repo_or_path: str
     train_file: str = None
     validation_file: str = None
     streaming: bool = True
     use_auth_token: bool = False
     text_column: str = "caption"
     encoding_column: str = "encoding"
     max_train_samples: int = None
     max_eval_samples: int = None
     preprocessing_num_workers: int = None
                     else self.eval_dataset.select(range(self.max_eval_samples))
                 )
+    def preprocess(self, tokenizer, decoder_start_token_id, normalize_text, max_length):
         if self.streaming:
             # we need to shuffle early in streaming mode
             if hasattr(self, "train_dataset"):
             tokenizer=tokenizer,
             text_column=self.text_column,
             encoding_column=self.encoding_column,
+            max_length=max_length,
             decoder_start_token_id=decoder_start_token_id,
         )
         for ds in ["train_dataset", "eval_dataset"]:
     tokenizer,
     text_column,
     encoding_column,
+    max_length,
     decoder_start_token_id,
 ):
     inputs = examples[text_column]
     # Setting padding="max_length" as we need fixed length inputs for jitted functions
     model_inputs = tokenizer(
         inputs,
+        max_length=max_length,
         padding="max_length",
         truncation=True,
         return_tensors="np",

dalle_mini/model.py DELETED Viewed

@@ -1,64 +0,0 @@
-import flax.linen as nn
-import jax
-from transformers import BartConfig
-from transformers.models.bart.modeling_flax_bart import (
-    FlaxBartDecoder,
-    FlaxBartEncoder,
-    FlaxBartForConditionalGeneration,
-    FlaxBartForConditionalGenerationModule,
-    FlaxBartModule,
-)
-class CustomFlaxBartModule(FlaxBartModule):
-    def setup(self):
-        # we keep shared to easily load pre-trained weights
-        self.shared = nn.Embed(
-            self.config.vocab_size,
-            self.config.d_model,
-            embedding_init=jax.nn.initializers.normal(self.config.init_std),
-        )
-        # a separate embedding is used for the decoder
-        self.decoder_embed = nn.Embed(
-            self.config.image_vocab_size + 1,
-            self.config.d_model,
-            embedding_init=jax.nn.initializers.normal(self.config.init_std),
-        )
-        self.encoder = FlaxBartEncoder(
-            self.config, dtype=self.dtype, embed_tokens=self.shared
-        )
-        # the decoder has a different config
-        # TODO: should not be needed once we have custom config/module
-        decoder_config = BartConfig(self.config.to_dict())
-        decoder_config.max_position_embeddings = (
-            self.config.image_length + 1  # image tokens + BOS
-        )
-        decoder_config.vocab_size = self.config.image_vocab_size + 1
-        self.decoder = FlaxBartDecoder(
-            decoder_config, dtype=self.dtype, embed_tokens=self.decoder_embed
-        )
-class CustomFlaxBartForConditionalGenerationModule(
-    FlaxBartForConditionalGenerationModule
-):
-    def setup(self):
-        # set default config
-        self.config.normalize_text = getattr(self.config, "normalize_text", False)
-        self.config.image_length = getattr(self.config, "image_length", 256)
-        self.config.image_vocab_size = getattr(self.config, "image_vocab_size", 16384)
-        self.model = CustomFlaxBartModule(config=self.config, dtype=self.dtype)
-        self.lm_head = nn.Dense(
-            self.config.image_vocab_size + 1,  # encoded image token space + 1 for bos
-            use_bias=False,
-            kernel_init=jax.nn.initializers.normal(self.config.init_std),
-        )
-        self.final_logits_bias = self.param(
-            "final_logits_bias", self.bias_init, (1, self.config.image_vocab_size + 1)
-        )
-class CustomFlaxBartForConditionalGeneration(FlaxBartForConditionalGeneration):
-    module_class = CustomFlaxBartForConditionalGenerationModule

dalle_mini/model/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .configuration import DalleBartConfig
2	+ from .modeling import DalleBartForConditionalGeneration

dalle_mini/{configuration_bart.py → model/configuration.py} RENAMED Viewed

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" BART model configuration """
 import warnings
 from transformers.configuration_utils import PretrainedConfig
@@ -123,7 +123,7 @@ class DalleBartConfig(PretrainedConfig):
     ):
         self.normalize_text = normalize_text
         self.encoder_vocab_size = encoder_vocab_size
-        self.decoder_vocab_size = image_vocab_size
         self.image_length = image_length
         self.max_text_length = max_text_length
         self.d_model = d_model
@@ -145,17 +145,21 @@ class DalleBartConfig(PretrainedConfig):
         self.num_hidden_layers = encoder_layers
         self.gradient_checkpointing = gradient_checkpointing
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
-        self.decoder_start_token_id = image_vocab_size,  # BOS appended to vocab
         self.min_length = image_length + 1
         self.max_length = image_length + 1
         super().__init__(
             num_labels=num_labels,
             pad_token_id=image_vocab_size + 1,  # needed to avoid errors during generation (converted to jnp.array)
             bos_token_id=image_vocab_size + 1,  # set to unreachable values
             eos_token_id=image_vocab_size + 1,
             is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
             forced_eos_token_id=forced_eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+""" DalleBart model configuration """
 import warnings
 from transformers.configuration_utils import PretrainedConfig
     ):
         self.normalize_text = normalize_text
         self.encoder_vocab_size = encoder_vocab_size
+        self.image_vocab_size = image_vocab_size
         self.image_length = image_length
         self.max_text_length = max_text_length
         self.d_model = d_model
         self.num_hidden_layers = encoder_layers
         self.gradient_checkpointing = gradient_checkpointing
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        self.decoder_start_token_id = image_vocab_size  # BOS appended to vocab
         self.min_length = image_length + 1
         self.max_length = image_length + 1
+        # remove keys we are about to set to prevent errors
+        for k in ['bos_token_id', 'eos_token_id', 'pad_token_id', 'decoder_start_token_id', 'forced_eos_token_id']:
+            kwargs.pop(k, None)
         super().__init__(
             num_labels=num_labels,
             pad_token_id=image_vocab_size + 1,  # needed to avoid errors during generation (converted to jnp.array)
             bos_token_id=image_vocab_size + 1,  # set to unreachable values
             eos_token_id=image_vocab_size + 1,
             is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=self.decoder_start_token_id,
             forced_eos_token_id=forced_eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,

dalle_mini/{modeling_bart_flax.py → model/modeling.py} RENAMED Viewed

@@ -45,7 +45,7 @@ from transformers.modeling_flax_utils import (
 from transformers.utils import logging
-from .configuration_bart import BartConfig
 logger = logging.get_logger(__name__)
@@ -64,7 +64,7 @@ def shift_tokens_right(input_ids: np.array, pad_token_id: int, decoder_start_tok
 class FlaxBartAttention(nn.Module):
-    config: BartConfig
     embed_dim: int
     num_heads: int
     dropout: float = 0.0
@@ -93,7 +93,7 @@ class FlaxBartAttention(nn.Module):
         if self.causal:
             self.causal_mask = make_causal_mask(
-                jnp.ones((1, embed_dim), dtype="bool"), dtype="bool"
             )
     def _split_heads(self, hidden_states):
@@ -224,7 +224,7 @@ class FlaxBartAttention(nn.Module):
 class FlaxBartEncoderLayer(nn.Module):
-    config: BartConfig
     dtype: jnp.dtype = jnp.float32
     def setup(self) -> None:
@@ -279,7 +279,7 @@ class FlaxBartEncoderLayer(nn.Module):
 class FlaxBartEncoderLayerCollection(nn.Module):
-    config: BartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
@@ -306,7 +306,7 @@ class FlaxBartEncoderLayerCollection(nn.Module):
 class FlaxBartDecoderLayer(nn.Module):
-    config: BartConfig
     dtype: jnp.dtype = jnp.float32
     def setup(self) -> None:
@@ -390,7 +390,7 @@ class FlaxBartDecoderLayer(nn.Module):
 class FlaxBartDecoderLayerCollection(nn.Module):
-    config: BartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
@@ -422,8 +422,8 @@ class FlaxBartDecoderLayerCollection(nn.Module):
         return FlaxBaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states)
-class FlaxBartEncoder(nn.Module):
-    config: BartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
@@ -479,8 +479,8 @@ class FlaxBartEncoder(nn.Module):
         )
-class FlaxBartDecoder(nn.Module):
-    config: BartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
@@ -550,13 +550,13 @@ class FlaxBartDecoder(nn.Module):
         )
-class FlaxBartModule(nn.Module):
-    config: BartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
-        self.encoder = FlaxBartEncoder(self.config, dtype=self.dtype)
-        self.decoder = FlaxBartDecoder(self.config, dtype=self.dtype)
     def _get_encoder_module(self):
         return self.encoder
@@ -605,14 +605,14 @@ class FlaxBartModule(nn.Module):
         )
-class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
-    config_class = BartConfig
-    base_model_prefix: str = "model"
     module_class: nn.Module = None
     def __init__(
         self,
-        config: BartConfig,
         input_shape: Tuple[int] = (1, 1),
         seed: int = 0,
         dtype: jnp.dtype = jnp.float32,
@@ -792,13 +792,13 @@ class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
         )
-class FlaxBartForConditionalGenerationModule(nn.Module):
-    config: BartConfig
     dtype: jnp.dtype = jnp.float32
     bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros
     def setup(self):
-        self.model = FlaxBartModule(config=self.config, dtype=self.dtype)
         self.lm_head = nn.Dense(
             self.config.image_vocab_size + 1,  # image vocab size + 1 for BOS
             use_bias=False,
@@ -854,8 +854,8 @@ class FlaxBartForConditionalGenerationModule(nn.Module):
         )
-class FlaxBartForConditionalGeneration(FlaxBartPreTrainedModel):
-    module_class = FlaxBartForConditionalGenerationModule
     dtype: jnp.dtype = jnp.float32
     def decode(

 from transformers.utils import logging
+from .configuration import DalleBartConfig
 logger = logging.get_logger(__name__)
 class FlaxBartAttention(nn.Module):
+    config: DalleBartConfig
     embed_dim: int
     num_heads: int
     dropout: float = 0.0
         if self.causal:
             self.causal_mask = make_causal_mask(
+                jnp.ones((1, self.embed_dim), dtype="bool"), dtype="bool"
             )
     def _split_heads(self, hidden_states):
 class FlaxBartEncoderLayer(nn.Module):
+    config: DalleBartConfig
     dtype: jnp.dtype = jnp.float32
     def setup(self) -> None:
 class FlaxBartEncoderLayerCollection(nn.Module):
+    config: DalleBartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
 class FlaxBartDecoderLayer(nn.Module):
+    config: DalleBartConfig
     dtype: jnp.dtype = jnp.float32
     def setup(self) -> None:
 class FlaxBartDecoderLayerCollection(nn.Module):
+    config: DalleBartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
         return FlaxBaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states)
+class DalleBartEncoder(nn.Module):
+    config: DalleBartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
         )
+class DalleBartDecoder(nn.Module):
+    config: DalleBartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
         )
+class DalleBartModule(nn.Module):
+    config: DalleBartConfig
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
+        self.encoder = DalleBartEncoder(self.config, dtype=self.dtype)
+        self.decoder = DalleBartDecoder(self.config, dtype=self.dtype)
     def _get_encoder_module(self):
         return self.encoder
         )
+class DalleBartPreTrainedModel(FlaxPreTrainedModel):
+    config_class = DalleBartConfig
+    base_model_prefix: str = "dallebart"
     module_class: nn.Module = None
     def __init__(
         self,
+        config: DalleBartConfig,
         input_shape: Tuple[int] = (1, 1),
         seed: int = 0,
         dtype: jnp.dtype = jnp.float32,
         )
+class DalleBartForConditionalGenerationModule(nn.Module):
+    config: DalleBartConfig
     dtype: jnp.dtype = jnp.float32
     bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros
     def setup(self):
+        self.model = DalleBartModule(config=self.config, dtype=self.dtype)
         self.lm_head = nn.Dense(
             self.config.image_vocab_size + 1,  # image vocab size + 1 for BOS
             use_bias=False,
         )
+class DalleBartForConditionalGeneration(DalleBartPreTrainedModel):
+    module_class = DalleBartForConditionalGenerationModule
     dtype: jnp.dtype = jnp.float32
     def decode(

dalle_mini/{partitions.py → model/partitions.py} RENAMED Viewed

@@ -5,7 +5,7 @@ from flax.traverse_util import flatten_dict, unflatten_dict
 from jax.experimental import PartitionSpec as P
-# utils adapted from https://gitihub.com/google-research/google-research/blob/master/flax_models/t5x/partitions.py
 # Sentinels
 _unmatched = object()

 from jax.experimental import PartitionSpec as P
+# utils adapted from https://github.com/google-research/google-research/blob/master/flax_models/t5x/partitions.py
 # Sentinels
 _unmatched = object()

tools/train/train.py CHANGED Viewed

@@ -44,7 +44,7 @@ from transformers import AutoTokenizer, HfArgumentParser
 from transformers.models.bart.modeling_flax_bart import BartConfig
 from dalle_mini.data import Dataset
-from dalle_mini.model import CustomFlaxBartForConditionalGeneration
 logger = logging.getLogger(__name__)
@@ -68,26 +68,12 @@ class ModelArguments:
             "help": "Pretrained config name or path if not the same as model_name"
         },
     )
-    image_vocab_size: Optional[int] = field(
-        default=None,
-        metadata={"help": "Vocab size of image encoder"},
-    )
-    image_length: Optional[int] = field(
-        default=None,
-        metadata={"help": "Number of tokens per image"},
-    )
     tokenizer_name: Optional[str] = field(
         default=None,
         metadata={
             "help": "Pretrained tokenizer name or path if not the same as model_name_or_path"
         },
     )
-    normalize_text: Optional[bool] = field(
-        default=None,
-        metadata={
-            "help": "Whether to normalize text or not. By default, we refer to base model or don't normalize for new models."
-        },
-    )
     dtype: Optional[str] = field(
         default="float32",
         metadata={
@@ -126,10 +112,6 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "An optional input evaluation data file (glob acceptable)."},
     )
-    dataset_type: str = field(
-        default="datasets",
-        metadata={"help": "Either 🤗 'dataset' (default) or 'webdataset'."},
-    )
     # data loading should not be a bottleneck so we use "streaming" mode by default
     streaming: bool = field(
         default=True,
@@ -141,13 +123,6 @@ class DataTrainingArguments:
             "help": "Whether to use the authentication token for private datasets."
         },
     )
-    max_source_length: Optional[int] = field(
-        default=128,
-        metadata={
-            "help": "The maximum total input sequence length after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded."
-        },
-    )
     max_train_samples: Optional[int] = field(
         default=None,
         metadata={
@@ -436,47 +411,14 @@ def main():
     else:
         # Set up our new model config
-        # TODO: simplify with custom config class
         if model_args.config_name:
-            config = BartConfig.from_pretrained(model_args.config_name)
-        else:
-            config = BartConfig.from_pretrained(model_args.model_name_or_path)
-        if model_args.image_vocab_size:
-            config.image_vocab_size = model_args.image_vocab_size
-        assert (
-            getattr(config, "image_vocab_size") is not None
-        ), "image_vocab_size must be specified when not present in base model/config"
-        if model_args.image_length:
-            config.image_length = model_args.image_length
-        assert (
-            getattr(config, "image_length") is not None
-        ), "image_length must be specified when not present in base model/config"
-        # we append decoder bos to image vocab
-        config.decoder_start_token_id = config.image_vocab_size
-        # ensure we don't generate bos (in addition to decoder start token)
-        config.force_bos_token_to_be_generated = False
-        config.forced_bos_token_id = None  # we don't need this token
-        config.forced_eos_token_id = None  # we don't need this token
-        config.tie_word_embeddings = False
-        config.min_length = config.image_length + 1
-        config.max_length = config.image_length + 1
-        # below tokens need to be set to avoid error during generation (converted to jnp.array)
-        # they are not expected to be used and are set to unreachable token id
-        config.bos_token_id = config.image_vocab_size + 1
-        config.pos_token_id = config.image_vocab_size + 1
-        config.eos_token_id = config.image_vocab_size + 1
-        # save whether we normalize the text
-        if model_args.normalize_text is not None:
-            config.normalize_text = model_args.normalize_text
         else:
-            config.normalize_text = getattr(config, "normalize_text", False)
         # Load or create new model
         if model_args.model_name_or_path:
-            model = CustomFlaxBartForConditionalGeneration.from_pretrained(
                 model_args.model_name_or_path,
                 config=config,
                 seed=training_args.seed_model,
@@ -485,7 +427,7 @@ def main():
             # avoid OOM on TPU: see https://github.com/google/flax/issues/1658
             print(model.params)
         else:
-            model = CustomFlaxBartForConditionalGeneration(
                 config,
                 seed=training_args.seed_model,
                 dtype=getattr(jnp, model_args.dtype),
@@ -512,6 +454,7 @@ def main():
         tokenizer=tokenizer,
         decoder_start_token_id=model.config.decoder_start_token_id,
         normalize_text=model.config.normalize_text,
     )
     # Initialize our training

 from transformers.models.bart.modeling_flax_bart import BartConfig
 from dalle_mini.data import Dataset
+from dalle_mini.model import DalleBartConfig, DalleBartForConditionalGeneration
 logger = logging.getLogger(__name__)
             "help": "Pretrained config name or path if not the same as model_name"
         },
     )
     tokenizer_name: Optional[str] = field(
         default=None,
         metadata={
             "help": "Pretrained tokenizer name or path if not the same as model_name_or_path"
         },
     )
     dtype: Optional[str] = field(
         default="float32",
         metadata={
         default=None,
         metadata={"help": "An optional input evaluation data file (glob acceptable)."},
     )
     # data loading should not be a bottleneck so we use "streaming" mode by default
     streaming: bool = field(
         default=True,
             "help": "Whether to use the authentication token for private datasets."
         },
     )
     max_train_samples: Optional[int] = field(
         default=None,
         metadata={
     else:
         # Set up our new model config
         if model_args.config_name:
+            config = DalleBartConfig.from_pretrained(model_args.config_name)
         else:
+            config = DalleBartConfig.from_pretrained(model_args.model_name_or_path)
         # Load or create new model
         if model_args.model_name_or_path:
+            model = DalleBartForConditionalGeneration.from_pretrained(
                 model_args.model_name_or_path,
                 config=config,
                 seed=training_args.seed_model,
             # avoid OOM on TPU: see https://github.com/google/flax/issues/1658
             print(model.params)
         else:
+            model = DalleBartForConditionalGeneration(
                 config,
                 seed=training_args.seed_model,
                 dtype=getattr(jnp, model_args.dtype),
         tokenizer=tokenizer,
         decoder_start_token_id=model.config.decoder_start_token_id,
         normalize_text=model.config.normalize_text,
+        max_length=model.config.max_text_length,
     )
     # Initialize our training