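# nanotron DoReMi training configuration (big-run-02). Per the run name and
# checkpoint path below, this presumably trains the 2.8B LLaMA-style model on
# domain weights tuned by a 100k-step proxy run.
# checkpoints: write a full checkpoint every 5000 steps to a shared filesystem;
# no resume checkpoint, and the initial state is not saved.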
checkpoints:
  checkpoint_interval: 5000
  checkpoints_path: /fsx/phuc/checkpoints/doremi/big-run-02/reference-2.8b-llama-tuned-weights_with_100k_proxy
  checkpoints_path_is_shared_file_system: true
  resume_checkpoint_path: null
  save_initial_state: false
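# data: a local, pre-tokenized copy of The Pile (a filesystem path rather than
# a Hub dataset id), train split, one loading worker per process, seed 42.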
data:
  dataset:
    dataset_overwrite_cache: false
    dataset_processing_num_proc_per_process: 1
    hf_dataset_config_name: null
    hf_dataset_or_datasets: /fsx/phuc/project_data/doremi/datasets/the_pile_raw/tokenized_data/train
    hf_dataset_splits: train
    text_column_name: text
  num_loading_workers: 1
  seed: 42
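# doremi: the 22 Pile domains and their sampling weights. The two lists are
# parallel (weight i belongs to domain i) and the weights sum to ~1.0
# (0.9999 as listed). Both ref_model_* paths are null, so no reference-model
# checkpoint is loaded, presumably because the weights are used here as fixed,
# already-tuned values.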
doremi:
  domain_names:
  - Pile-CC
  - Github
  - OpenWebText2
  - StackExchange
  - Wikipedia (en)
  - PubMed Abstracts
  - USPTO Backgrounds
  - FreeLaw
  - PubMed Central
  - Enron Emails
  - HackerNews
  - NIH ExPorter
  - Books3
  - ArXiv
  - DM Mathematics
  - OpenSubtitles
  - Gutenberg (PG-19)
  - Ubuntu IRC
  - BookCorpus2
  - EuroParl
  - YoutubeSubtitles
  - PhilPapers
  domain_weights:
  - 0.2333
  - 0.07
  - 0.1154
  - 0.0528
  - 0.0665
  - 0.067
  - 0.0366
  - 0.0571
  - 0.0451
  - 0.0036
  - 0.0087
  - 0.0078
  - 0.0708
  - 0.0656
  - 0.0034
  - 0.0048
  - 0.0222
  - 0.0084
  - 0.0038
  - 0.0186
  - 0.0149
  - 0.0235
  ref_model_checkpoint_path: null
  ref_model_resume_checkpoint_path: null
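# general: run bookkeeping. consumed_train_samples matches the batching
# settings further down: 70,000 steps x 64 micro-batch x 8 data-parallel
# replicas x 1 accumulation step = 35,840,000 samples.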
general:
  benchmark_csv_path: null
  consumed_train_samples: 35840000
  ignore_sanity_checks: true
  project: nanotron
  run: train_tuned_2.8b_model
  seed: 42
  step: 70000
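# logging: report step info every iteration, `info` log level on all replicas.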
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
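# model: a 6-layer LLaMA-style decoder (the "2.8b" label comes from the run
# name), bfloat16, hidden size 4096, SiLU-gated MLP of width 24576, 32 query
# heads with 16 key/value heads (grouped-query attention), RMSNorm eps 1e-05,
# 1024-token context, 49152-token vocabulary, tied input/output embeddings.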
model:
  ddp_bucket_cap_mb: 120
  dtype: bfloat16
  init_method:
    std: 0.025
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 4096
    initializer_range: 0.02
    intermediate_size: 24576
    is_llama_config: true
    max_position_embeddings: 1024
    num_attention_heads: 32
    num_hidden_layers: 6
    num_key_value_heads: 16
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
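# optimizer: fused Adam with beta1 0.9 / beta2 0.95, weight decay 0.01,
# gradient clipping at 1.0, gradients accumulated in fp32, no ZeRO sharding.
# The schedule warms up linearly and then follows a cosine decay from 3.0e-4
# down to a floor of 1.0e-5.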
optimizer:
  accumulate_grad_in_fp32: true
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0003
    lr_decay_steps: 8
    lr_decay_style: cosine
    lr_warmup_steps: 2
    lr_warmup_style: linear
    min_decay_lr: 1.0e-05
  torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
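# parallelism: 8-way data parallelism x 8-way tensor parallelism x 1 pipeline
# stage = 64 ranks, 1F1B pipeline engine, reduce-scatter tensor-parallel mode
# with async linear communication, selective activation recomputation.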
parallelism:
  dp: 8
  pp: 1
  pp_engine: 1f1b
  recompute_granularity: SELECTIVE
  tp: 8
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
profiler: null
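# tokenizer: the gpt2 tokenizer from the Hugging Face Hub, no length cap and
# no pinned revision.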
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: gpt2
  tokenizer_revision: null
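# tokens: per step, 64 sequences per replica x 8 replicas x 1024 tokens
# = 524,288 tokens; over 70,000 steps that is roughly 36.7B tokens.
# Validation uses at most 8 batches, and val_check_interval: -1 presumably
# disables periodic validation.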
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 8
  micro_batch_size: 64
  sequence_length: 1024
  train_steps: 70000
  val_check_interval: -1