pretrain model
scripts/pretrain-model.yaml
CHANGED
```diff
@@ -52,7 +52,7 @@ data:
 
   init_args:
     data_path: "../pretrain-data/"
-    num_workers:
+    num_workers: 32
 
 # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
 train:
@@ -77,7 +77,8 @@ train:
 
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  max_tokens: 36852166560 # 3597088 * 2049 * 5
+  # max_tokens: 36852166560 # 3597088 * 2049 * 5
+  max_tokens: 22111299936 # 3597088 * 2049 * 3
 
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
@@ -114,8 +115,8 @@ eval:
 # Optimizer-related arguments
 optimizer:
   # class_path: torch.optim.AdamW
-
-  class_path: bitsandbytes.optim.AdamW8bit
+  class_path: grokadamw.GrokAdamW
+  # class_path: bitsandbytes.optim.AdamW8bit
   # class_path: bitsandbytes.optim.PagedAdamW8bit
 
   init_args:
```
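The first hunk pins `num_workers`, which was previously left unset, to 32 loader processes. Assuming litgpt's data module forwards this value to the PyTorch `DataLoader` unchanged, the knob behaves like the plain-PyTorch setting below (the toy dataset is illustrative only):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Toy stand-in for the packed pretraining sequences; the point here is
# only num_workers, which spawns 32 background worker processes for
# loading and preprocessing batches.
dataset = TensorDataset(torch.arange(8192).view(-1, 8))
loader = DataLoader(dataset, batch_size=4, num_workers=32)
```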
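The second hunk trims the token budget from five passes over the data to three, keeping the old value as a comment. The inline comments give the arithmetic; assuming 3,597,088 is the number of packed training sequences and 2,049 the tokens per sequence (a 2,048-token block plus one shifted target token), both budgets check out:

```python
# Verify the token-budget arithmetic from the config comments.
# Assumption: 3,597,088 packed sequences x 2,049 tokens each.
sequences = 3_597_088
tokens_per_seq = 2_049

assert sequences * tokens_per_seq * 5 == 36_852_166_560  # old budget (5 epochs)
assert sequences * tokens_per_seq * 3 == 22_111_299_936  # new max_tokens (3 epochs)
```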
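The last hunk retires the active 8-bit bitsandbytes AdamW in favor of `grokadamw.GrokAdamW`, keeping the earlier choices as comments for easy rollback. litgpt resolves such `class_path`/`init_args` pairs through jsonargparse; the mechanics boil down to a dynamic import plus keyword instantiation, roughly as in this sketch (the helper name and the `lr` value are placeholders, not taken from this config, and litgpt itself supplies the model parameters):

```python
import importlib

def instantiate_optimizer(class_path: str, params, **init_args):
    """Hypothetical helper: resolve a dotted class path and build the
    optimizer from it, mirroring what jsonargparse does for litgpt."""
    module_name, class_name = class_path.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), class_name)
    return cls(params, **init_args)

# e.g. instantiate_optimizer("grokadamw.GrokAdamW", model.parameters(), lr=1e-3)
```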