tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: false
context_parallel_size: 1
expert_model_parallel_size: 1
moe_extended_tp: false
perform_initialization: true
use_cpu_initialization: false
fp16: false
bf16: false
params_dtype: float32
timers: null
finalize_model_grads_func: null
grad_scale_func: null
no_sync_func: null
grad_sync_func: null
param_sync_func: null
deterministic_mode: false
enable_autocast: false
autocast_dtype: float32
num_microbatches_with_partial_activation_checkpoints: null
gradient_accumulation_fusion: false
async_tensor_model_parallel_allreduce: false
use_te_rng_tracker: false
tp_comm_overlap: false
tp_comm_bulk_wgrad: true
tp_comm_bulk_dgrad: true
tp_comm_overlap_ag: true
tp_comm_overlap_rs: true
tp_comm_overlap_rs_dgrad: false
tp_comm_split_ag: true
tp_comm_atomic_ag: false
tp_comm_split_rs: true
tp_comm_atomic_rs: false
pipeline_dtype: null
variable_seq_lengths: false
overlap_p2p_comm: false
batch_p2p_comm: true
batch_p2p_sync: true
use_ring_exchange_p2p: false
deallocate_pipeline_outputs: false
defer_embedding_wgrad_compute: false
pipeline_model_parallel_split_rank: null
cpu_offloading: false
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_activations: true
cpu_offloading_weights: true
barrier_with_L1_time: true
fp16_lm_cross_entropy: false
parallel_output: true
share_embeddings_and_output_weights: false
make_vocab_size_divisible_by: 128
position_embedding_type: learned_absolute
rotary_base: 10000
rotary_percent: 1.0
seq_len_interpolation_factor: null
seq_length: 2048
optim:
  name: fused_adam
  sched: null
optimizer_fn: null
tokenizer_filepath: null
num_layers: 4
hidden_size: 256
num_attention_heads: 4
num_query_groups: 4
ffn_hidden_size: 256
kv_channels: 64
hidden_dropout: 0.1
attention_dropout: 0.1
fp32_residual_connection: false
apply_residual_connection_post_layernorm: false
layernorm_epsilon: 1.0e-05
layernorm_zero_centered_gamma: false
add_bias_linear: true
add_qkv_bias: false
gated_linear_unit: false
activation_func: gelu
activation_func_fp8_input_store: false
num_moe_experts: null
rotary_interleaved: false
window_size: null
normalization: LayerNorm
qk_layernorm: false
test_mode: false
calculate_per_token_loss: false
init_method: init_
output_layer_init_method: init_
init_method_std: 0.02
apply_query_key_layer_scaling: false
attention_softmax_in_fp32: true
bias_activation_fusion: false
masked_softmax_fusion: false
persist_layer_norm: false
memory_efficient_layer_norm: false
bias_dropout_fusion: false
apply_rope_fusion: false
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: most_recent
fp8_wgrad: true
fp8_dot_product_attention: false
fp8_multi_head_attention: false
moe_router_load_balancing_type: aux_loss
moe_router_topk: 2
moe_grouped_gemm: false
moe_aux_loss_coeff: 0.0
moe_z_loss_coeff: null
moe_input_jitter_eps: null
moe_token_dropping: false
moe_token_dispatcher_type: allgather
moe_per_layer_logging: false
moe_expert_capacity_factor: null
moe_pad_expert_input_to_capacity: false
moe_token_drop_policy: probs
moe_layer_recompute: false
clone_scatter_output_in_embedding: true
disable_parameter_transpose_cache: false
enable_cuda_graph: false
target: nemo.collections.llm.gpt.model.base_v2.GPTModelV2
nemo_version: 2.0.0rc1
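
Since this dump is plain YAML, it can be round-tripped with PyYAML to inspect or tweak individual fields before passing it to a trainer. The sketch below is a minimal illustration, not a NeMo-specific loader; the filenames `gpt_config.yaml` and `gpt_config_bf16.yaml` are assumptions for the example, and the bf16 tweak assumes downstream code honors those two keys.

```python
# Minimal sketch: load the config dump above, sanity-check a few values,
# and write back a modified copy. Assumes the dump was saved as
# "gpt_config.yaml" (hypothetical filename); uses plain PyYAML only.
import yaml

with open("gpt_config.yaml") as f:
    cfg = yaml.safe_load(f)

# Values taken directly from the dump above.
assert cfg["num_layers"] == 4
assert cfg["hidden_size"] == 256
assert cfg["target"] == "nemo.collections.llm.gpt.model.base_v2.GPTModelV2"

# Example tweak: switch the config to bf16 (assumption: the consumer of
# this YAML reads the "bf16" and "params_dtype" keys as shown in the dump).
cfg["bf16"] = True
cfg["params_dtype"] = "bfloat16"

with open("gpt_config_bf16.yaml", "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)
```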