optimizer:
  allreduce_bucket_size: 500000000
  beta1: 0.9
  beta2: 0.95
  debug_log: false
  eps: 1e-08
  gradient_clipping: 0.0
  zero: true
  zero_save_static: false
topology:
  activation_checkpointing_type: disabled
  global_batch_size: 1024
  gradient_accumulation_steps: 4
  micro_batch_size: 2
  model_parallel_size: 1
  pipe_parallel_size: 2
  pipe_partition_method: balanced
  pipe_partition_overwrite: null
  sequence_parallel: false
trainer:
  seed: 42
  train_iterations: 72000
training:
  allow_missing_params_in_optimizer: true
  training_groups:
  - group_name: param_group
    independent_weight_decay: true
    learning_rate_scheduler:
      learning_rate: 11.313708498984761
      learning_rate_decay_iters: 72000
      learning_rate_decay_style: cosine
      learning_rate_minimum: 1.131370849898476
      learning_rate_warmup_steps: 500
    parameters_exclude:
    - norm
    weight_decay: 0.0001221
transformer_architecture:
  attention_bias: false
  attention_num_kv_heads: null
  attention_qkv_in_one: true
  dropout_after_attention: 0.0
  dropout_after_mlp: 0.0
  dropout_attention_probs: 0.0
  dropout_embedding: 0.0
  dropout_image_encoder: 0.0
  fp8_config_attention:
    dtypes_forward:
      left_dtype: e4m3
      right_dtype: e4m3
    dtypes_grad_input:
      left_dtype: e5m2
      right_dtype: e4m3
    dtypes_grad_weight:
      left_dtype: e4m3
      right_dtype: e5m2
  fp8_config_mlp:
    dtypes_forward:
      left_dtype: e4m3
      right_dtype: e4m3
    dtypes_grad_input:
      left_dtype: e5m2
      right_dtype: e4m3
    dtypes_grad_weight:
      left_dtype: e4m3
      right_dtype: e5m2
  hidden_size: 4096
  image_encoder: false
  key_query_norm: false
  layernorm:
    layernorm_epsilon: 1e-05
    optimization_type: torch
  local_attention_window_size: null
  masked_softmax:
    kernel: flash_attention
    scale: 1.0
    softmax_in_fp32: false
  mlp_bias: false
  mlp_factor: 2.66796875
  mlp_type: swiglu
  norm_type: rms
  num_attention_heads: 32
  num_layers: 32
  num_local_attention_heads: 0
  precision: bfloat16
  relative_position_embedding_type: rotary_complex
  reset_attention_mask: false
  reset_position_ids: false
  rotary_embedding_base: 10000
  rotary_percentage: 1.0
  sequence_length: 4096
  umup:
    act_mult: 1.0
    attn_mult: 1.0
    enable: true
    loss_mult: 1.0
    normalize_depth_to_num_layers: true
    residual_attn_ratio: 0.25
    residual_mult: 1.0
  vocab_file: null
  vocab_size: 65536
  weight_tying: false
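
To make the quantities implied by this configuration concrete, the following minimal sketch derives the data-parallel degree, the token budget, and the learning-rate curve from the values above. It assumes the usual Megatron-style relation global_batch_size = micro_batch_size x gradient_accumulation_steps x data_parallel_size and a standard linear-warmup-plus-cosine schedule; the framework's exact warmup and decay formula may differ slightly, so treat this as an illustration rather than the implementation.

import math

# Values copied from the configuration above.
GLOBAL_BATCH_SIZE = 1024
MICRO_BATCH_SIZE = 2
GRAD_ACCUM_STEPS = 4
SEQUENCE_LENGTH = 4096
TRAIN_ITERATIONS = 72_000

LR_MAX = 11.313708498984761   # peak learning rate (numerically 8 * sqrt(2), a u-muP-scale value)
LR_MIN = 1.131370849898476    # cosine floor, one tenth of the peak
WARMUP_STEPS = 500
DECAY_ITERS = 72_000

# Implied data-parallel degree, assuming
# global_batch = micro_batch * grad_accum * data_parallel_size.
data_parallel_size = GLOBAL_BATCH_SIZE // (MICRO_BATCH_SIZE * GRAD_ACCUM_STEPS)  # -> 128

tokens_per_step = GLOBAL_BATCH_SIZE * SEQUENCE_LENGTH   # ~4.19M tokens per optimizer step
total_tokens = tokens_per_step * TRAIN_ITERATIONS       # ~302B tokens over the full run


def learning_rate(step: int) -> float:
    """Warmup-then-cosine schedule reconstructed from the scheduler fields.

    Assumes linear warmup to LR_MAX over WARMUP_STEPS, then cosine decay to
    LR_MIN by DECAY_ITERS; the framework's exact step accounting is an assumption.
    """
    if step < WARMUP_STEPS:
        return LR_MAX * step / WARMUP_STEPS
    progress = min(1.0, (step - WARMUP_STEPS) / (DECAY_ITERS - WARMUP_STEPS))
    return LR_MIN + 0.5 * (LR_MAX - LR_MIN) * (1.0 + math.cos(math.pi * progress))


if __name__ == "__main__":
    print(f"data-parallel size : {data_parallel_size}")
    print(f"tokens per step    : {tokens_per_step:,}")
    print(f"total tokens       : {total_tokens:,}")
    for s in (0, 500, 36_000, 72_000):
        print(f"lr at step {s:>6} : {learning_rate(s):.4f}")

Running the sketch gives a data-parallel degree of 128 replicas, roughly 4.2 million tokens per optimizer step, and about 302 billion tokens over the 72,000 training iterations, with the learning rate peaking at the end of warmup and settling at the configured minimum when decay completes.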