# robinq's picture
# Upload nemo_train_params.yaml with huggingface_hub
# eb25f2f verified
# Training parameters, all grouped under the single top-level `cfg` mapping.
# NOTE(review): the nesting below was reconstructed from key grouping because
# the source had lost its indentation (which left `name` duplicated at one
# level and every section header null) — verify against the original file.
cfg:
  micro_batch_size: 20
  global_batch_size: 8000
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  encoder_seq_length: 512
  max_position_embeddings: 512
  num_layers: 24
  hidden_size: 1024
  ffn_hidden_size: 4096
  num_attention_heads: 16
  init_method_std: 0.02
  hidden_dropout: 0.1
  kv_channels: null
  apply_query_key_layer_scaling: true
  layernorm_epsilon: 1.0e-05
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  bert_binary_head: true

  tokenizer:
    library: huggingface
    type: KBLab/wordpiece-64k-pretok-small_data-tokenizer
    model: null
    vocab_file: null
    merge_file: null

  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: false
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: false
  seed: 666
  use_cpu_initialization: false
  onnx_safe: false
  gradient_as_bucket_view: true
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false

  data:
    # Entries alternate between an integer and a dataset path; presumably
    # (weight, path) pairs — confirm against the consumer.
    # NOTE(review): `$PID` appears literally in every path; YAML does not
    # expand it, so the consumer must substitute it — confirm.
    data_prefix:
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/wikipedia-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/edepos_html-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/oscar-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/kw3-2017-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/issues-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/mc4-wordpiece-64k-pretok-small_data_text_sentence
    index_mapping_dir: /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/npy_files/
    data_impl: mmap
    # Quoted so the comma-separated value is unambiguously a string.
    splits_string: '980,10,10'
    seq_length: 512
    skip_warmup: true
    num_workers: 32
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    masked_lm_prob: 0.15
    short_seq_prob: 0.1

  optim:
    name: fused_adam
    lr: 0.0006
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    # `sched` has its own `name`, so it must be a separate mapping from
    # `optim`'s scalar keys; nested under `optim` here — confirm.
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 500
      min_lr: 2.0e-05

  # NOTE(review): the original nesting level of `precision` could not be
  # recovered from the flattened source; placed directly under `cfg` — confirm.
  precision: 16