|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 8336, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2399232245681382, |
|
"grad_norm": 2.465574026107788, |
|
"learning_rate": 2.8200575815738963e-05, |
|
"loss": 1.1212, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4798464491362764, |
|
"grad_norm": 3.261763572692871, |
|
"learning_rate": 2.640115163147793e-05, |
|
"loss": 0.9721, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7197696737044146, |
|
"grad_norm": 3.3122525215148926, |
|
"learning_rate": 2.4601727447216888e-05, |
|
"loss": 0.9282, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.9596928982725528, |
|
"grad_norm": 2.439178466796875, |
|
"learning_rate": 2.2802303262955854e-05, |
|
"loss": 0.9108, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.199616122840691, |
|
"grad_norm": 2.484846830368042, |
|
"learning_rate": 2.1002879078694817e-05, |
|
"loss": 0.8742, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.4395393474088292, |
|
"grad_norm": 3.605393409729004, |
|
"learning_rate": 1.9203454894433783e-05, |
|
"loss": 0.8873, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.6794625719769674, |
|
"grad_norm": 2.1433281898498535, |
|
"learning_rate": 1.7404030710172745e-05, |
|
"loss": 0.8618, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.9193857965451055, |
|
"grad_norm": 3.013890504837036, |
|
"learning_rate": 1.5604606525911708e-05, |
|
"loss": 0.8517, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.159309021113244, |
|
"grad_norm": 3.020458936691284, |
|
"learning_rate": 1.3805182341650672e-05, |
|
"loss": 0.8473, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.399232245681382, |
|
"grad_norm": 2.4140872955322266, |
|
"learning_rate": 1.2005758157389636e-05, |
|
"loss": 0.8312, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.63915547024952, |
|
"grad_norm": 2.538238763809204, |
|
"learning_rate": 1.0206333973128599e-05, |
|
"loss": 0.8291, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.8790786948176583, |
|
"grad_norm": 2.649840831756592, |
|
"learning_rate": 8.406909788867563e-06, |
|
"loss": 0.8218, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.1190019193857967, |
|
"grad_norm": 2.9255142211914062, |
|
"learning_rate": 6.607485604606526e-06, |
|
"loss": 0.8212, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.3589251439539347, |
|
"grad_norm": 2.9272096157073975, |
|
"learning_rate": 4.8080614203454895e-06, |
|
"loss": 0.8067, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.5988483685220727, |
|
"grad_norm": 3.7314858436584473, |
|
"learning_rate": 3.008637236084453e-06, |
|
"loss": 0.818, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.838771593090211, |
|
"grad_norm": 2.498305320739746, |
|
"learning_rate": 1.2092130518234166e-06, |
|
"loss": 0.8026, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 8336, |
|
"total_flos": 5.57671514112e+16, |
|
"train_loss": 0.8718166570974631, |
|
"train_runtime": 11124.9814, |
|
"train_samples_per_second": 35.955, |
|
"train_steps_per_second": 0.749 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 8336, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.57671514112e+16, |
|
"train_batch_size": 48, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|