tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: false
context_parallel_size: 1
expert_model_parallel_size: 1
moe_extended_tp: false
perform_initialization: true
use_cpu_initialization: false
fp16: false
bf16: false
params_dtype: float32
timers: null
finalize_model_grads_func: null
grad_scale_func: null
no_sync_func: null
grad_sync_func: null
param_sync_func: null
deterministic_mode: false
enable_autocast: false
autocast_dtype: float32
num_microbatches_with_partial_activation_checkpoints: null
gradient_accumulation_fusion: false
async_tensor_model_parallel_allreduce: false
use_te_rng_tracker: false
tp_comm_overlap: false
tp_comm_bulk_wgrad: true
tp_comm_bulk_dgrad: true
tp_comm_overlap_ag: true
tp_comm_overlap_rs: true
tp_comm_overlap_rs_dgrad: false
tp_comm_split_ag: true
tp_comm_atomic_ag: false
tp_comm_split_rs: true
tp_comm_atomic_rs: false
pipeline_dtype: null
variable_seq_lengths: false
overlap_p2p_comm: false
batch_p2p_comm: true
batch_p2p_sync: true
use_ring_exchange_p2p: false
deallocate_pipeline_outputs: false
defer_embedding_wgrad_compute: false
pipeline_model_parallel_split_rank: null
cpu_offloading: false
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_activations: true
cpu_offloading_weights: true
barrier_with_L1_time: true
fp16_lm_cross_entropy: false
parallel_output: true
share_embeddings_and_output_weights: false
make_vocab_size_divisible_by: 128
position_embedding_type: learned_absolute
rotary_base: 10000
rotary_percent: 1.0
seq_len_interpolation_factor: null
seq_length: 2048
optim:
  name: fused_adam
  sched: null
optimizer_fn: null
tokenizer_filepath: null
num_layers: 4
hidden_size: 256
num_attention_heads: 4
num_query_groups: 4
ffn_hidden_size: 256
kv_channels: 64
hidden_dropout: 0.1
attention_dropout: 0.1
fp32_residual_connection: false
apply_residual_connection_post_layernorm: false
layernorm_epsilon: 1.0e-05
layernorm_zero_centered_gamma: false
add_bias_linear: true
add_qkv_bias: false
gated_linear_unit: false
activation_func: gelu
activation_func_fp8_input_store: false
num_moe_experts: null
rotary_interleaved: false
window_size: null
normalization: LayerNorm
qk_layernorm: false
test_mode: false
calculate_per_token_loss: false
init_method: init_
output_layer_init_method: init_
init_method_std: 0.02
apply_query_key_layer_scaling: false
attention_softmax_in_fp32: true
bias_activation_fusion: false
masked_softmax_fusion: false
persist_layer_norm: false
memory_efficient_layer_norm: false
bias_dropout_fusion: false
apply_rope_fusion: false
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: most_recent
fp8_wgrad: true
fp8_dot_product_attention: false
fp8_multi_head_attention: false
moe_router_load_balancing_type: aux_loss
moe_router_topk: 2
moe_grouped_gemm: false
moe_aux_loss_coeff: 0.0
moe_z_loss_coeff: null
moe_input_jitter_eps: null
moe_token_dropping: false
moe_token_dispatcher_type: allgather
moe_per_layer_logging: false
moe_expert_capacity_factor: null
moe_pad_expert_input_to_capacity: false
moe_token_drop_policy: probs
moe_layer_recompute: false
clone_scatter_output_in_embedding: true
disable_parameter_transpose_cache: false
enable_cuda_graph: false
target: nemo.collections.llm.gpt.model.base_v2.GPTModelV2
nemo_version: 2.0.0rc1
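
Since this dump is plain YAML, it can be round-tripped with PyYAML to inspect or tweak individual fields before passing it to a trainer. The sketch below is a minimal illustration, not a NeMo-specific loader; the filenames `gpt_config.yaml` and `gpt_config_bf16.yaml` are assumptions for the example, and the bf16 tweak assumes downstream code honors those two keys.

```python
# Minimal sketch: load the config dump above, sanity-check a few values,
# and write back a modified copy. Assumes the dump was saved as
# "gpt_config.yaml" (hypothetical filename); uses plain PyYAML only.
import yaml

with open("gpt_config.yaml") as f:
    cfg = yaml.safe_load(f)

# Values taken directly from the dump above.
assert cfg["num_layers"] == 4
assert cfg["hidden_size"] == 256
assert cfg["target"] == "nemo.collections.llm.gpt.model.base_v2.GPTModelV2"

# Example tweak: switch the config to bf16 (assumption: the consumer of
# this YAML reads the "bf16" and "params_dtype" keys as shown in the dump).
cfg["bf16"] = True
cfg["params_dtype"] = "bfloat16"

with open("gpt_config_bf16.yaml", "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)
```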