feat: layernorm > rmsnorm in long runs
src/dalle_mini/model/configuration.py
@@ -60,7 +60,7 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
         do_sample=True,
         # transformer variants
         use_bias=False, # use bias in attention and dense layers (except for lm_head)
-        ln_type="rmsnorm", # layer normalization type, "rmsnorm", "layernorm"
+        ln_type="layernorm", # layer normalization type, "rmsnorm", "layernorm"
         ln_positions="normformer", # layer normalization positions, "normformer", "swinv2", "cogview", "postln", "preln", "deepnet" (same as postln)
         use_head_scale=False, # used in NormFormer
         use_cosine_attention=False, # used in Swin v2
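For reference, the two ln_type values compute different statistics: LayerNorm subtracts the per-feature mean and divides by the standard deviation (with an optional bias), while RMSNorm only rescales by the root mean square, with no mean-centering. Below is a minimal sketch of that difference, assuming Flax (which this model is built on); RMSNorm and make_norm here are hypothetical illustrations, not the modules the repo actually dispatches to.

import jax.numpy as jnp
import flax.linen as nn


class RMSNorm(nn.Module):
    # Hypothetical RMSNorm module: rescale by root mean square,
    # no mean-centering and no bias term.
    eps: float = 1e-6

    @nn.compact
    def __call__(self, x):
        scale = self.param("scale", nn.initializers.ones, (x.shape[-1],))
        rms = jnp.sqrt(jnp.mean(jnp.square(x), axis=-1, keepdims=True) + self.eps)
        return x / rms * scale


def make_norm(ln_type="layernorm", use_bias=False):
    # Hypothetical dispatch on the config value this commit changes.
    if ln_type == "layernorm":
        # Mean-centering plus optional bias (the new default above).
        return nn.LayerNorm(use_bias=use_bias)
    if ln_type == "rmsnorm":
        return RMSNorm()
    raise ValueError(f"unknown ln_type: {ln_type}")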