File size: 2,009 Bytes
aa41939 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# Optimizer settings. Child keys are nested under `optimizer`; at column 0
# they would parse as unrelated top-level keys and `optimizer` would be null.
optimizer:
  allreduce_bucket_size: 500000000
  beta1: 0.9
  beta2: 0.95
  debug_log: false
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-08` as a string rather than a float.
  eps: 1.0e-08
  gradient_clipping: 0.0
  zero: true
  zero_save_static: false
# Parallelism layout and batch sizing. Child keys are nested under
# `topology` so they are not read as top-level keys.
topology:
  activation_checkpointing_type: disabled
  # NOTE(review): if global = micro x accumulation x data-parallel replicas,
  # then 1024 / (2 * 4) implies 128 replicas -- confirm against the launch
  # setup.
  global_batch_size: 1024
  gradient_accumulation_steps: 4
  micro_batch_size: 2
  model_parallel_size: 1
  pipe_parallel_size: 2
  pipe_partition_method: balanced
  pipe_partition_overwrite: null
  sequence_parallel: false
# Seed and iteration count for the run, nested under `trainer`.
trainer:
  seed: 42
  train_iterations: 72000
# General training switches, nested under `training`.
training:
  allow_missing_params_in_optimizer: true
# Parameter groups: a list with a single entry, `param_group`.
# NOTE(review): kept as a top-level key rather than a child of `training`;
# the sorted key order is consistent with either -- confirm against the
# consumer's schema.
training_groups:
  - group_name: param_group
    independent_weight_decay: true
    learning_rate_scheduler:
      # NOTE(review): unusually large for a conventional parametrization;
      # presumably intentional because `umup.enable` is true -- confirm.
      learning_rate: 11.313708498984761
      # Same value as `trainer.train_iterations` (72000).
      learning_rate_decay_iters: 72000
      learning_rate_decay_style: cosine
      # One tenth of `learning_rate`.
      learning_rate_minimum: 1.131370849898476
      learning_rate_warmup_steps: 500
    parameters_exclude:
      - norm
    weight_decay: 0.0001221
# Model architecture. Child keys are nested under
# `transformer_architecture`; `layernorm`, `masked_softmax` and `umup` are
# sub-mappings of it.
transformer_architecture:
  attention_bias: false
  attention_num_kv_heads: null
  attention_qkv_in_one: true
  dropout_after_attention: 0.0
  dropout_after_mlp: 0.0
  dropout_attention_probs: 0.0
  dropout_embedding: 0.0
  dropout_image_encoder: 0.0
  hidden_size: 4096
  image_encoder: false
  key_query_norm: false
  layernorm:
    # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
    # resolve a bare `1e-05` as a string rather than a float.
    layernorm_epsilon: 1.0e-05
    optimization_type: torch
  local_attention_window_size: null
  masked_softmax:
    kernel: flash_attention
    scale: 1.0
    softmax_in_fp32: false
  mlp_bias: false
  # 2.66796875 * hidden_size (4096) = 10928 exactly; presumably the MLP
  # inner width -- confirm.
  mlp_factor: 2.66796875
  mlp_type: swiglu
  norm_type: rms
  num_attention_heads: 32
  num_layers: 32
  num_local_attention_heads: 0
  precision: bfloat16
  relative_position_embedding_type: rotary_complex
  reset_attention_mask: false
  reset_position_ids: false
  rotary_embedding_base: 10000
  rotary_percentage: 1.0
  sequence_length: 4096
  umup:
    act_mult: 1.0
    attn_mult: 1.0
    enable: true
    loss_mult: 1.0
    normalize_depth_to_num_layers: true
    residual_attn_ratio: 0.25
    residual_mult: 1.0
  vocab_file: null
  vocab_size: 65536
  weight_tying: false
|