---
# Training run configuration: optimizer, parallel topology, trainer schedule,
# parameter groups, and transformer architecture.
#
# NOTE(review): this file was reconstructed from a copy whose line breaks and
# indentation had been lost. Nesting was inferred from the alphabetical key
# order of the original dump; confirm against the consumer's schema.

optimizer:
  allreduce_bucket_size: 500000000
  beta1: 0.9
  beta2: 0.95
  debug_log: false
  # Written with an explicit mantissa decimal point so YAML 1.1 loaders
  # (e.g. PyYAML) resolve it as a float rather than the string "1e-08".
  eps: 1.0e-08
  gradient_clipping: 0.0
  zero: true
  zero_save_static: false

topology:
  activation_checkpointing_type: disabled
  global_batch_size: 1024
  gradient_accumulation_steps: 2
  micro_batch_size: 2
  model_parallel_size: 1
  pipe_parallel_size: 1
  pipe_partition_method: balanced
  pipe_partition_overwrite: null
  sequence_parallel: false

trainer:
  seed: 42
  train_iterations: 72000

training:
  allow_missing_params_in_optimizer: true

# NOTE(review): placed at top level; the flattened original is also consistent
# with this list being nested under `training` — verify against the consumer.
training_groups:
  - group_name: param_group
    independent_weight_decay: false
    learning_rate_scheduler:
      learning_rate: 0.0004
      # Same value as trainer.train_iterations above.
      learning_rate_decay_iters: 72000
      learning_rate_decay_style: cosine
      # Explicit decimal point for portable float parsing (see optimizer.eps).
      learning_rate_minimum: 4.0e-05
      learning_rate_warmup_steps: 500
    parameters_exclude: []
    weight_decay: 0.1

transformer_architecture:
  attention_bias: false
  attention_num_kv_heads: null
  attention_qkv_in_one: true
  dropout_after_attention: 0.0
  dropout_after_mlp: 0.0
  dropout_attention_probs: 0.0
  dropout_embedding: 0.0
  dropout_image_encoder: 0.0
  hidden_size: 3072
  image_encoder: false
  key_query_norm: false
  layernorm:
    # Explicit decimal point for portable float parsing (see optimizer.eps).
    layernorm_epsilon: 1.0e-05
    optimization_type: torch
  local_attention_window_size: null
  masked_softmax:
    kernel: flash_attention
    scale: 1.0
    softmax_in_fp32: false
  mlp_bias: false
  mlp_factor: 2.6666666666666665
  mlp_type: swiglu
  norm_type: rms
  num_attention_heads: 24
  num_layers: 24
  num_local_attention_heads: 0
  precision: bfloat16
  relative_position_embedding_type: rotary_complex
  reset_attention_mask: false
  reset_position_ids: false
  rotary_embedding_base: 10000
  rotary_percentage: 1.0
  sequence_length: 4096
  umup:
    act_mult: 1.0
    attn_mult: 1.0
    enable: false
    loss_mult: 1.0
    residual_attn_ratio: 1.0
    residual_mult: 1.0
  vocab_file: null
  vocab_size: 65536
  weight_tying: false