robinq committed on
Commit
1d2499d
1 Parent(s): 01a40d9

Upload nemo_train_params.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. nemo_train_params.yaml +82 -0
nemo_train_params.yaml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cfg:
2
+ micro_batch_size: 62
3
+ global_batch_size: 7936
4
+ tensor_model_parallel_size: 1
5
+ pipeline_model_parallel_size: 1
6
+ encoder_seq_length: 512
7
+ max_position_embeddings: 512
8
+ num_layers: 12
9
+ hidden_size: 768
10
+ ffn_hidden_size: 3072
11
+ num_attention_heads: 12
12
+ init_method_std: 0.02
13
+ hidden_dropout: 0.1
14
+ kv_channels: null
15
+ apply_query_key_layer_scaling: true
16
+ layernorm_epsilon: 1.0e-05
17
+ make_vocab_size_divisible_by: 128
18
+ pre_process: true
19
+ post_process: true
20
+ bert_binary_head: true
21
+ tokenizer:
22
+ library: huggingface
23
+ type: KBLab/wordpiece-32k-no_pretok-small_data-tokenizer
24
+ model: null
25
+ vocab_file: null
26
+ merge_file: null
27
+ native_amp_init_scale: 4294967296
28
+ native_amp_growth_interval: 1000
29
+ fp32_residual_connection: false
30
+ fp16_lm_cross_entropy: false
31
+ megatron_amp_O2: false
32
+ grad_allreduce_chunk_size_mb: 125
33
+ grad_div_ar_fusion: false
34
+ seed: 666
35
+ use_cpu_initialization: false
36
+ onnx_safe: false
37
+ gradient_as_bucket_view: true
38
+ activations_checkpoint_granularity: null
39
+ activations_checkpoint_method: null
40
+ activations_checkpoint_num_layers: null
41
+ num_micro_batches_with_partial_activation_checkpoints: null
42
+ activations_checkpoint_layers_per_pipeline: null
43
+ sequence_parallel: false
44
+ data:
45
+ data_prefix:
46
+ - 1
47
+ - /project/scratch/$PID/data/wordpiece-32k-no_pretok-small_data/wikipedia-wordpiece-32k-no_pretok-small_data_text_sentence
48
+ - 1
49
+ - /project/scratch/$PID/data/wordpiece-32k-no_pretok-small_data/edepos_html-wordpiece-32k-no_pretok-small_data_text_sentence
50
+ - 1
51
+ - /project/scratch/$PID/data/wordpiece-32k-no_pretok-small_data/oscar-wordpiece-32k-no_pretok-small_data_text_sentence
52
+ - 1
53
+ - /project/scratch/$PID/data/wordpiece-32k-no_pretok-small_data/kw3-2017-wordpiece-32k-no_pretok-small_data_text_sentence
54
+ - 1
55
+ - /project/scratch/$PID/data/wordpiece-32k-no_pretok-small_data/issues-wordpiece-32k-no_pretok-small_data_text_sentence
56
+ - 1
57
+ - /project/scratch/$PID/data/wordpiece-32k-no_pretok-small_data/mc4-wordpiece-32k-no_pretok-small_data_text_sentence
58
+ index_mapping_dir: /project/scratch/$PID/data/wordpiece-32k-no_pretok-small_data/npy_files/
59
+ data_impl: mmap
60
+ splits_string: 980,10,10
61
+ seq_length: 512
62
+ skip_warmup: true
63
+ num_workers: 32
64
+ dataloader_type: single
65
+ reset_position_ids: false
66
+ reset_attention_mask: false
67
+ eod_mask_loss: false
68
+ masked_lm_prob: 0.15
69
+ short_seq_prob: 0.1
70
+ optim:
71
+ name: fused_adam
72
+ lr: 0.0006
73
+ weight_decay: 0.01
74
+ betas:
75
+ - 0.9
76
+ - 0.98
77
+ sched:
78
+ name: CosineAnnealing
79
+ warmup_steps: 500
80
+ constant_steps: 500
81
+ min_lr: 2.0e-05
82
+ precision: 16