---
# Training hyperparameters, rooted at the single top-level key `cfg`.
#
# NOTE(review): this file was recovered from a web file-viewer export
# (revision eb25f2f, reported size 2,618 bytes). The export had dropped all
# indentation and added non-YAML residue (a size banner, a line-number gutter
# and stray `|` markers). The nesting below was reconstructed from the key
# names: everything lives under `cfg`, with `tokenizer`, `data` and `optim`
# as sub-mappings and `sched` nested inside `optim`.
# TODO confirm the grouping against the original file before relying on it.
cfg:
  # Batch sizes and parallelism.
  micro_batch_size: 20
  global_batch_size: 8000
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1

  # Model dimensions; `encoder_seq_length` has the same value as
  # `data.seq_length` further down.
  encoder_seq_length: 512
  max_position_embeddings: 512
  num_layers: 24
  hidden_size: 1024
  ffn_hidden_size: 4096
  num_attention_heads: 16
  init_method_std: 0.02
  hidden_dropout: 0.1
  kv_channels: null
  apply_query_key_layer_scaling: true
  layernorm_epsilon: 1.0e-05
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  bert_binary_head: true

  # Tokenizer selection; `model`, `vocab_file` and `merge_file` are unset.
  tokenizer:
    library: huggingface
    type: KBLab/wordpiece-64k-pretok-small_data-tokenizer
    model: null
    vocab_file: null
    merge_file: null

  # Mixed-precision and gradient-handling switches.
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: false
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: false
  seed: 666
  use_cpu_initialization: false
  onnx_safe: false
  gradient_as_bucket_view: true

  # Activation checkpointing is fully unset (all null).
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false

  data:
    # Flat list alternating a number with a dataset path prefix.
    # NOTE(review): presumably (weight, path) pairs, all weighted 1 —
    # confirm against the consumer. `$PID` is kept literally; whether it is
    # expanded at load time is not visible here.
    data_prefix:
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/wikipedia-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/edepos_html-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/oscar-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/kw3-2017-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/issues-wordpiece-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/mc4-wordpiece-64k-pretok-small_data_text_sentence
    index_mapping_dir: /project/scratch/$PID/data/wordpiece-64k-pretok-small_data/npy_files/
    data_impl: mmap
    # Quoted so no parser can reinterpret the comma-separated digits as a
    # number; the value is the same string as before.
    splits_string: '980,10,10'
    seq_length: 512
    skip_warmup: true
    num_workers: 32
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    masked_lm_prob: 0.15
    short_seq_prob: 0.1

  optim:
    name: fused_adam
    lr: 0.0006
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    # Scheduler settings sit inside `optim` so that this `name` does not
    # collide with the optimizer's `name` above.
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 500
      min_lr: 2.0e-05

  precision: 16