{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9991235758106923,
  "eval_steps": 500,
  "global_step": 285,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017528483786152498,
      "grad_norm": 2.3260412216186523,
      "learning_rate": 2e-05,
      "loss": 0.4745,
      "step": 5
    },
    {
      "epoch": 0.035056967572304996,
      "grad_norm": 2.4670772552490234,
      "learning_rate": 2e-05,
      "loss": 0.5049,
      "step": 10
    },
    {
      "epoch": 0.05258545135845749,
      "grad_norm": 2.598940372467041,
      "learning_rate": 2e-05,
      "loss": 0.509,
      "step": 15
    },
    {
      "epoch": 0.07011393514460999,
      "grad_norm": 2.3951876163482666,
      "learning_rate": 2e-05,
      "loss": 0.4642,
      "step": 20
    },
    {
      "epoch": 0.0876424189307625,
      "grad_norm": 2.50634765625,
      "learning_rate": 2e-05,
      "loss": 0.4899,
      "step": 25
    },
    {
      "epoch": 0.10517090271691498,
      "grad_norm": 2.51271390914917,
      "learning_rate": 2e-05,
      "loss": 0.5057,
      "step": 30
    },
    {
      "epoch": 0.12269938650306748,
      "grad_norm": 2.2455062866210938,
      "learning_rate": 2e-05,
      "loss": 0.4374,
      "step": 35
    },
    {
      "epoch": 0.14022787028921999,
      "grad_norm": 2.7102108001708984,
      "learning_rate": 2e-05,
      "loss": 0.4747,
      "step": 40
    },
    {
      "epoch": 0.15775635407537247,
      "grad_norm": 2.5008888244628906,
      "learning_rate": 2e-05,
      "loss": 0.4686,
      "step": 45
    },
    {
      "epoch": 0.175284837861525,
      "grad_norm": 2.638371467590332,
      "learning_rate": 2e-05,
      "loss": 0.4867,
      "step": 50
    },
    {
      "epoch": 0.19281332164767748,
      "grad_norm": 2.447908878326416,
      "learning_rate": 2e-05,
      "loss": 0.4578,
      "step": 55
    },
    {
      "epoch": 0.21034180543382996,
      "grad_norm": 3.2303571701049805,
      "learning_rate": 2e-05,
      "loss": 0.4797,
      "step": 60
    },
    {
      "epoch": 0.22787028921998248,
      "grad_norm": 2.4729976654052734,
      "learning_rate": 2e-05,
      "loss": 0.4778,
      "step": 65
    },
    {
      "epoch": 0.24539877300613497,
      "grad_norm": 2.7793662548065186,
      "learning_rate": 2e-05,
      "loss": 0.4679,
      "step": 70
    },
    {
      "epoch": 0.26292725679228746,
      "grad_norm": 2.6131370067596436,
      "learning_rate": 2e-05,
      "loss": 0.4866,
      "step": 75
    },
    {
      "epoch": 0.28045574057843997,
      "grad_norm": 2.5630428791046143,
      "learning_rate": 2e-05,
      "loss": 0.4802,
      "step": 80
    },
    {
      "epoch": 0.2979842243645925,
      "grad_norm": 2.7526726722717285,
      "learning_rate": 2e-05,
      "loss": 0.4572,
      "step": 85
    },
    {
      "epoch": 0.31551270815074495,
      "grad_norm": 2.934720993041992,
      "learning_rate": 2e-05,
      "loss": 0.4557,
      "step": 90
    },
    {
      "epoch": 0.33304119193689746,
      "grad_norm": 2.7996726036071777,
      "learning_rate": 2e-05,
      "loss": 0.4692,
      "step": 95
    },
    {
      "epoch": 0.35056967572305,
      "grad_norm": 3.004939079284668,
      "learning_rate": 2e-05,
      "loss": 0.4786,
      "step": 100
    },
    {
      "epoch": 0.36809815950920244,
      "grad_norm": 2.67604660987854,
      "learning_rate": 2e-05,
      "loss": 0.4955,
      "step": 105
    },
    {
      "epoch": 0.38562664329535495,
      "grad_norm": 2.9146041870117188,
      "learning_rate": 2e-05,
      "loss": 0.4509,
      "step": 110
    },
    {
      "epoch": 0.40315512708150747,
      "grad_norm": 2.9048011302948,
      "learning_rate": 2e-05,
      "loss": 0.4457,
      "step": 115
    },
    {
      "epoch": 0.42068361086765993,
      "grad_norm": 2.4492969512939453,
      "learning_rate": 2e-05,
      "loss": 0.4572,
      "step": 120
    },
    {
      "epoch": 0.43821209465381245,
      "grad_norm": 2.833976984024048,
      "learning_rate": 2e-05,
      "loss": 0.4538,
      "step": 125
    },
    {
      "epoch": 0.45574057843996496,
      "grad_norm": 2.866576671600342,
      "learning_rate": 2e-05,
      "loss": 0.4817,
      "step": 130
    },
    {
      "epoch": 0.4732690622261174,
      "grad_norm": 3.0097880363464355,
      "learning_rate": 2e-05,
      "loss": 0.4555,
      "step": 135
    },
    {
      "epoch": 0.49079754601226994,
      "grad_norm": 2.912531852722168,
      "learning_rate": 2e-05,
      "loss": 0.5023,
      "step": 140
    },
    {
      "epoch": 0.5083260297984225,
      "grad_norm": 2.936351776123047,
      "learning_rate": 2e-05,
      "loss": 0.48,
      "step": 145
    },
    {
      "epoch": 0.5258545135845749,
      "grad_norm": 2.962674856185913,
      "learning_rate": 2e-05,
      "loss": 0.4526,
      "step": 150
    },
    {
      "epoch": 0.5433829973707275,
      "grad_norm": 2.743187189102173,
      "learning_rate": 2e-05,
      "loss": 0.4774,
      "step": 155
    },
    {
      "epoch": 0.5609114811568799,
      "grad_norm": 2.742067575454712,
      "learning_rate": 2e-05,
      "loss": 0.4789,
      "step": 160
    },
    {
      "epoch": 0.5784399649430324,
      "grad_norm": 3.111201047897339,
      "learning_rate": 2e-05,
      "loss": 0.4877,
      "step": 165
    },
    {
      "epoch": 0.595968448729185,
      "grad_norm": 2.6774637699127197,
      "learning_rate": 2e-05,
      "loss": 0.4867,
      "step": 170
    },
    {
      "epoch": 0.6134969325153374,
      "grad_norm": 3.2346479892730713,
      "learning_rate": 2e-05,
      "loss": 0.4781,
      "step": 175
    },
    {
      "epoch": 0.6310254163014899,
      "grad_norm": 2.714986801147461,
      "learning_rate": 2e-05,
      "loss": 0.5008,
      "step": 180
    },
    {
      "epoch": 0.6485539000876425,
      "grad_norm": 2.6662895679473877,
      "learning_rate": 2e-05,
      "loss": 0.4053,
      "step": 185
    },
    {
      "epoch": 0.6660823838737949,
      "grad_norm": 3.0469818115234375,
      "learning_rate": 2e-05,
      "loss": 0.4482,
      "step": 190
    },
    {
      "epoch": 0.6836108676599474,
      "grad_norm": 2.78334379196167,
      "learning_rate": 2e-05,
      "loss": 0.4731,
      "step": 195
    },
    {
      "epoch": 0.7011393514461,
      "grad_norm": 2.9470932483673096,
      "learning_rate": 2e-05,
      "loss": 0.444,
      "step": 200
    },
    {
      "epoch": 0.7186678352322524,
      "grad_norm": 3.256516218185425,
      "learning_rate": 2e-05,
      "loss": 0.4826,
      "step": 205
    },
    {
      "epoch": 0.7361963190184049,
      "grad_norm": 3.080620765686035,
      "learning_rate": 2e-05,
      "loss": 0.4561,
      "step": 210
    },
    {
      "epoch": 0.7537248028045574,
      "grad_norm": 3.4312586784362793,
      "learning_rate": 2e-05,
      "loss": 0.4471,
      "step": 215
    },
    {
      "epoch": 0.7712532865907099,
      "grad_norm": 2.649163246154785,
      "learning_rate": 2e-05,
      "loss": 0.4449,
      "step": 220
    },
    {
      "epoch": 0.7887817703768624,
      "grad_norm": 2.7402210235595703,
      "learning_rate": 2e-05,
      "loss": 0.469,
      "step": 225
    },
    {
      "epoch": 0.8063102541630149,
      "grad_norm": 3.1415464878082275,
      "learning_rate": 2e-05,
      "loss": 0.4706,
      "step": 230
    },
    {
      "epoch": 0.8238387379491674,
      "grad_norm": 3.1218481063842773,
      "learning_rate": 2e-05,
      "loss": 0.4754,
      "step": 235
    },
    {
      "epoch": 0.8413672217353199,
      "grad_norm": 3.197633743286133,
      "learning_rate": 2e-05,
      "loss": 0.4614,
      "step": 240
    },
    {
      "epoch": 0.8588957055214724,
      "grad_norm": 2.9509947299957275,
      "learning_rate": 2e-05,
      "loss": 0.5037,
      "step": 245
    },
    {
      "epoch": 0.8764241893076249,
      "grad_norm": 3.126260995864868,
      "learning_rate": 2e-05,
      "loss": 0.4472,
      "step": 250
    },
    {
      "epoch": 0.8939526730937774,
      "grad_norm": 3.562556028366089,
      "learning_rate": 2e-05,
      "loss": 0.4515,
      "step": 255
    },
    {
      "epoch": 0.9114811568799299,
      "grad_norm": 3.086526870727539,
      "learning_rate": 2e-05,
      "loss": 0.5097,
      "step": 260
    },
    {
      "epoch": 0.9290096406660824,
      "grad_norm": 3.190669059753418,
      "learning_rate": 2e-05,
      "loss": 0.462,
      "step": 265
    },
    {
      "epoch": 0.9465381244522348,
      "grad_norm": 3.133270263671875,
      "learning_rate": 2e-05,
      "loss": 0.445,
      "step": 270
    },
    {
      "epoch": 0.9640666082383874,
      "grad_norm": 3.3561840057373047,
      "learning_rate": 2e-05,
      "loss": 0.4711,
      "step": 275
    },
    {
      "epoch": 0.9815950920245399,
      "grad_norm": 3.056903123855591,
      "learning_rate": 2e-05,
      "loss": 0.5028,
      "step": 280
    },
    {
      "epoch": 0.9991235758106923,
      "grad_norm": 2.7743754386901855,
      "learning_rate": 2e-05,
      "loss": 0.4612,
      "step": 285
    }
  ],
  "logging_steps": 5,
  "max_steps": 285,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 285,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.111992390689751e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}