boris committed on
Commit
0f2cf98
1 Parent(s): 42968cf

feat: layernorm > rmsnorm in long runs

Browse files
src/dalle_mini/model/configuration.py CHANGED
@@ -60,7 +60,7 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
60
  do_sample=True,
61
  # transformer variants
62
  use_bias=False, # use bias in attention and dense layers (except for lm_head)
63
- ln_type="rmsnorm", # layer normalization type, "rmsnorm", "layernorm"
64
  ln_positions="normformer", # layer normalization positions, "normformer", "swinv2", "cogview", "postln", "preln", "deepnet" (same as postln)
65
  use_head_scale=False, # used in NormFormer
66
  use_cosine_attention=False, # used in Swin v2
 
60
  do_sample=True,
61
  # transformer variants
62
  use_bias=False, # use bias in attention and dense layers (except for lm_head)
63
+ ln_type="layernorm", # layer normalization type, "rmsnorm", "layernorm"
64
  ln_positions="normformer", # layer normalization positions, "normformer", "swinv2", "cogview", "postln", "preln", "deepnet" (same as postln)
65
  use_head_scale=False, # used in NormFormer
66
  use_cosine_attention=False, # used in Swin v2