pretrain model: extend training from 5 to 8 epochs
Files changed:
- scripts/TRAIN.md (+13 -3)
- scripts/pretrain-model.yaml (+7 -4)
scripts/TRAIN.md

````diff
@@ -46,9 +46,19 @@ save_file(state_dict, 'out/converted_model/model.safetensors')
 ## Evaluate
 
 ```bash
-litgpt evaluate --tasks '
+litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-quick/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
 
-litgpt evaluate --tasks '
+litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-leaderboard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
 
-litgpt evaluate --tasks '
+litgpt evaluate --tasks 'bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot' --out_dir 'evaluate-bigbenchhard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,siqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'mmlu_multilingual,mgsm' --out_dir 'evaluate-multilinguals/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'qasper' --out_dir 'evaluate-long/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
 ```
````
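The eight suites above all share the same checkpoint, batch size, and dtype, so they can also be driven from one loop. The sketch below is not part of the commit; it only assumes `litgpt` is on the PATH and reuses the checkpoint path `out/pretrain/final/` and the `evaluate-<suite>/` output naming from the commands above.

```bash
#!/usr/bin/env bash
# Sketch: run every evaluation suite added in TRAIN.md from one loop.
# Task lists and flags are copied verbatim from the commands above;
# the suite names double as the --out_dir prefixes.
set -euo pipefail

ckpt=out/pretrain/final/

declare -A suites=(
  [quick]='hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge'
  [leaderboard]='leaderboard'
  [bigbenchhard]='bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot'
  [mmlu]='mmlu,mmlu_pro'
  [reasoning]='arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,siqa,truthfulqa_mc2,winogrande'
  [multilinguals]='mmlu_multilingual,mgsm'
  [math]='gsm8k,mathqa'
  [long]='qasper'
)

# Note: bash associative arrays iterate in unspecified order.
for name in "${!suites[@]}"; do
  litgpt evaluate \
    --tasks "${suites[$name]}" \
    --out_dir "evaluate-${name}/" \
    --batch_size 4 \
    --dtype 'bfloat16' \
    "$ckpt"
done
```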
scripts/pretrain-model.yaml

```diff
@@ -57,7 +57,7 @@ data:
 # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
 train:
   # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
-  save_interval:
+  save_interval: 200
 
   # Number of iterations between logging calls (type: int, default: 1)
   log_interval: 1
@@ -77,7 +77,8 @@ train:
 
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  max_tokens: 8159107755 # 796399 * 2049 * 5
+  # max_tokens: 8159107755 # 796399 * 2049 * 5
+  max_tokens: 13054572408 # 796399 * 2049 * 8
 
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
@@ -120,10 +121,12 @@ optimizer:
 
   init_args:
     # (type: float, default: 0.001)
-    lr: 1e-3
+    # lr: 1e-3
+    lr: 1e-4
 
     # (type: float, default: 0.01)
-    weight_decay: 0.01
+    # weight_decay: 0.01
+    weight_decay: 0.1
 
     # (type: tuple, default: (0.9,0.999))
     betas:
```
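The new token budget follows directly from the numbers recorded in the config comments: 796,399 sequences × 2,049 tokens per sequence × 8 passes over the data. A quick shell check of both the old 5-epoch and the new 8-epoch values (a sketch; the sequence count and block size are taken from the comments above, not recomputed from the dataset):

```bash
# Verify the max_tokens budgets encoded in pretrain-model.yaml.
echo $((796399 * 2049 * 5))   # 8159107755  -> previous 5-epoch budget
echo $((796399 * 2049 * 8))   # 13054572408 -> new 8-epoch budget
```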