# umup-research-3b-fp8 / config.yml
optimizer:
  allreduce_bucket_size: 500000000
  beta1: 0.9
  beta2: 0.95
  debug_log: false
  eps: 1e-08
  gradient_clipping: 0.0
  zero: true
  zero_save_static: false
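# Note: beta1/beta2/eps above are standard Adam(W) settings; gradient_clipping: 0.0
# disables clipping, and zero: true presumably enables ZeRO-style sharding of
# optimizer state across data-parallel ranks (inferred from the key names).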
topology:
  activation_checkpointing_type: disabled
  global_batch_size: 1024
  gradient_accumulation_steps: 2
  micro_batch_size: 2
  model_parallel_size: 1
  pipe_parallel_size: 1
  pipe_partition_method: balanced
  pipe_partition_overwrite: null
  sequence_parallel: false
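# Implied data-parallel degree, assuming the usual decomposition
# global = micro * grad_accum * data_parallel and no model/pipe parallelism:
# 1024 / (2 * 2) = 256 replicas.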
trainer:
  seed: 42
  train_iterations: 72000
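# Total training tokens = train_iterations * global_batch_size * sequence_length
# = 72000 * 1024 * 4096 ≈ 3.0e11 (about 302B tokens), using values from this file.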
training:
  allow_missing_params_in_optimizer: true
  training_groups:
  - group_name: param_group
    independent_weight_decay: true
    learning_rate_scheduler:
      learning_rate: 11.313708498984761
      learning_rate_decay_iters: 72000
      learning_rate_decay_style: cosine
      learning_rate_minimum: 1.131370849898476
      learning_rate_warmup_steps: 500
    parameters_exclude:
    - norm
    weight_decay: 0.0001221
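# Peak learning rate 11.3137... = 2**3.5, with cosine decay to 10% of peak
# (1.1314 ≈ 2**3.5 / 10) over all 72000 iterations after 500 warmup steps.
# A learning rate this large is far above typical Adam values and is expected
# here because the unit-scaled u-μP parametrization is enabled below
# (transformer_architecture.umup); weight_decay 0.0001221 ≈ 2**-13.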
transformer_architecture:
  attention_bias: false
  attention_num_kv_heads: null
  attention_qkv_in_one: true
  dropout_after_attention: 0.0
  dropout_after_mlp: 0.0
  dropout_attention_probs: 0.0
  dropout_embedding: 0.0
  dropout_image_encoder: 0.0
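  # The fp8_config_* blocks below choose per-GEMM dtypes. e4m3 (more mantissa)
  # is used for the forward operands and for the weight/activation operands in
  # backward, while e5m2 (more exponent range) is used for the incoming-gradient
  # operand, matching the common FP8 training recipe (reading of the key names).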
  fp8_config_attention:
    dtypes_forward:
      left_dtype: e4m3
      right_dtype: e4m3
    dtypes_grad_input:
      left_dtype: e5m2
      right_dtype: e4m3
    dtypes_grad_weight:
      left_dtype: e4m3
      right_dtype: e5m2
  fp8_config_mlp:
    dtypes_forward:
      left_dtype: e4m3
      right_dtype: e4m3
    dtypes_grad_input:
      left_dtype: e5m2
      right_dtype: e4m3
    dtypes_grad_weight:
      left_dtype: e4m3
      right_dtype: e5m2
  hidden_size: 3072
  image_encoder: false
  key_query_norm: false
  layernorm:
    layernorm_epsilon: 1e-05
    optimization_type: torch
  local_attention_window_size: null
  masked_softmax:
    kernel: flash_attention
    scale: 1.0
    softmax_in_fp32: false
  mlp_bias: false
  mlp_factor: 2.6666666666666665
  mlp_type: swiglu
  norm_type: rms
  num_attention_heads: 24
  num_layers: 24
  num_local_attention_heads: 0
  precision: bfloat16
  relative_position_embedding_type: rotary_complex
  reset_attention_mask: false
  reset_position_ids: false
  rotary_embedding_base: 10000
  rotary_percentage: 1.0
  sequence_length: 4096
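  # umup below holds the u-μP (unit-scaled maximal update parametrization)
  # settings; the *_mult values appear to be the scheme's interpretable
  # multipliers, left at the neutral 1.0 except residual_attn_ratio.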
  umup:
    act_mult: 1.0
    attn_mult: 1.0
    enable: true
    loss_mult: 1.0
    normalize_depth_to_num_layers: true
    residual_attn_ratio: 0.25
    residual_mult: 1.0
  vocab_file: null
  vocab_size: 65536
  weight_tying: false
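# Rough size estimate from the values above, assuming mlp_factor scales the SwiGLU
# intermediate width (3072 * 8/3 = 8192) and untied input/output embeddings:
# 24 layers * (4*3072^2 attention + 3*3072*8192 MLP) ≈ 2.7B parameters, plus
# 2 * 65536 * 3072 ≈ 0.4B embedding parameters, for ≈ 3.1B total, consistent with
# the "3b" in the model name. head_dim = 3072 / 24 = 128; attention_num_kv_heads:
# null presumably falls back to full multi-head attention (no GQA).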