|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9968, |
|
"eval_steps": 500, |
|
"global_step": 312, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 2.304359943215424, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.5192, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 3.367745922034929, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.1579, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.3120002563540976, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 0.1235, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.34393102439669215, |
|
"learning_rate": 1.9959742939952393e-05, |
|
"loss": 0.1141, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.22846734971092744, |
|
"learning_rate": 1.9796753984232357e-05, |
|
"loss": 0.1154, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.22187942259321225, |
|
"learning_rate": 1.9510565162951538e-05, |
|
"loss": 0.1013, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.1937516417577725, |
|
"learning_rate": 1.9104775466588162e-05, |
|
"loss": 0.0975, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.18776075554849062, |
|
"learning_rate": 1.8584487936018663e-05, |
|
"loss": 0.1003, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.16374083531357114, |
|
"learning_rate": 1.795624548881781e-05, |
|
"loss": 0.1011, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.3083074608208316, |
|
"learning_rate": 1.7227948638273918e-05, |
|
"loss": 0.1019, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.14996630876365036, |
|
"learning_rate": 1.6408756139850243e-05, |
|
"loss": 0.096, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.10713257524125, |
|
"learning_rate": 1.5508969814521026e-05, |
|
"loss": 0.0918, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.16430611401390374, |
|
"learning_rate": 1.4539904997395468e-05, |
|
"loss": 0.0938, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.13018547178323414, |
|
"learning_rate": 1.3513748240813429e-05, |
|
"loss": 0.0974, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.249248518955414, |
|
"learning_rate": 1.2443404061378941e-05, |
|
"loss": 0.0994, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9984, |
|
"eval_loss": 0.09072314202785492, |
|
"eval_runtime": 40.5957, |
|
"eval_samples_per_second": 24.633, |
|
"eval_steps_per_second": 1.552, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 0.13728098111933706, |
|
"learning_rate": 1.1342332658176556e-05, |
|
"loss": 0.088, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 0.16275246072291852, |
|
"learning_rate": 1.0224380642958052e-05, |
|
"loss": 0.0947, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 0.15001498024621265, |
|
"learning_rate": 9.103606910965666e-06, |
|
"loss": 0.0939, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 0.13963224848380856, |
|
"learning_rate": 7.994105842167274e-06, |
|
"loss": 0.0876, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.137367259845856, |
|
"learning_rate": 6.909830056250527e-06, |
|
"loss": 0.0869, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 0.11133790927990313, |
|
"learning_rate": 5.864414950334796e-06, |
|
"loss": 0.0907, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 0.09323151899431767, |
|
"learning_rate": 4.87100722594094e-06, |
|
"loss": 0.0953, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 0.15783290675271067, |
|
"learning_rate": 3.942099561591802e-06, |
|
"loss": 0.0911, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 0.13377672988354583, |
|
"learning_rate": 3.089373510131354e-06, |
|
"loss": 0.0867, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.1192199124767211, |
|
"learning_rate": 2.323552596419889e-06, |
|
"loss": 0.0871, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 0.11020679845316991, |
|
"learning_rate": 1.6542674627869738e-06, |
|
"loss": 0.0896, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 0.09198353508041567, |
|
"learning_rate": 1.0899347581163222e-06, |
|
"loss": 0.0928, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 0.11654532424334699, |
|
"learning_rate": 6.37651293602628e-07, |
|
"loss": 0.0917, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 0.10356962879053153, |
|
"learning_rate": 3.0310479623313125e-07, |
|
"loss": 0.0847, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.0958617515134905, |
|
"learning_rate": 9.0502382320653e-08, |
|
"loss": 0.0868, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.984, |
|
"grad_norm": 0.08860455805163223, |
|
"learning_rate": 2.5176505749346937e-09, |
|
"loss": 0.0916, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.9968, |
|
"eval_loss": 0.08628984540700912, |
|
"eval_runtime": 40.3884, |
|
"eval_samples_per_second": 24.76, |
|
"eval_steps_per_second": 1.56, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.9968, |
|
"step": 312, |
|
"total_flos": 101696396132352.0, |
|
"train_loss": 0.11124454142573552, |
|
"train_runtime": 7872.3532, |
|
"train_samples_per_second": 5.081, |
|
"train_steps_per_second": 0.04 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 312, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 101696396132352.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|