|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 21994, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.045466945530599254, |
|
"grad_norm": 39.23893737792969, |
|
"learning_rate": 6.818181818181818e-06, |
|
"loss": 3.9076, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09093389106119851, |
|
"grad_norm": 18.871334075927734, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 1.415, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.13640083659179777, |
|
"grad_norm": 14.570379257202148, |
|
"learning_rate": 2.0454545454545454e-05, |
|
"loss": 1.2569, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.18186778212239701, |
|
"grad_norm": 16.230432510375977, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 1.2395, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.22733472765299628, |
|
"grad_norm": 15.570598602294922, |
|
"learning_rate": 2.954531676265535e-05, |
|
"loss": 1.2152, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.27280167318359555, |
|
"grad_norm": 14.000185012817383, |
|
"learning_rate": 2.8787511367080933e-05, |
|
"loss": 1.1669, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.31826861871419476, |
|
"grad_norm": 19.740062713623047, |
|
"learning_rate": 2.802970597150652e-05, |
|
"loss": 1.1665, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.36373556424479403, |
|
"grad_norm": 13.226868629455566, |
|
"learning_rate": 2.7271900575932103e-05, |
|
"loss": 1.1422, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4092025097753933, |
|
"grad_norm": 8.243535041809082, |
|
"learning_rate": 2.6514095180357682e-05, |
|
"loss": 1.115, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.45466945530599256, |
|
"grad_norm": 23.291852951049805, |
|
"learning_rate": 2.575628978478327e-05, |
|
"loss": 1.1059, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5001364008365918, |
|
"grad_norm": 14.429057121276855, |
|
"learning_rate": 2.499848438920885e-05, |
|
"loss": 1.1121, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.5456033463671911, |
|
"grad_norm": 9.61043930053711, |
|
"learning_rate": 2.4240678993634438e-05, |
|
"loss": 1.0893, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5910702918977903, |
|
"grad_norm": 7.90361213684082, |
|
"learning_rate": 2.3482873598060018e-05, |
|
"loss": 1.0813, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.6365372374283895, |
|
"grad_norm": 11.351470947265625, |
|
"learning_rate": 2.27250682024856e-05, |
|
"loss": 1.0667, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6820041829589888, |
|
"grad_norm": 15.591328620910645, |
|
"learning_rate": 2.1967262806911187e-05, |
|
"loss": 1.0682, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.7274711284895881, |
|
"grad_norm": 9.37597942352295, |
|
"learning_rate": 2.120945741133677e-05, |
|
"loss": 1.0462, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.7729380740201873, |
|
"grad_norm": 17.270828247070312, |
|
"learning_rate": 2.0451652015762353e-05, |
|
"loss": 1.0626, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.8184050195507866, |
|
"grad_norm": 14.909830093383789, |
|
"learning_rate": 1.9693846620187936e-05, |
|
"loss": 1.0214, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.8638719650813859, |
|
"grad_norm": 7.522629261016846, |
|
"learning_rate": 1.893604122461352e-05, |
|
"loss": 1.0459, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.9093389106119851, |
|
"grad_norm": 47.28670883178711, |
|
"learning_rate": 1.8178235829039105e-05, |
|
"loss": 1.0277, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.9548058561425843, |
|
"grad_norm": 14.066143989562988, |
|
"learning_rate": 1.7420430433464688e-05, |
|
"loss": 1.0183, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.0002728016731837, |
|
"grad_norm": 10.882994651794434, |
|
"learning_rate": 1.6662625037890268e-05, |
|
"loss": 0.9878, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.0457397472037828, |
|
"grad_norm": 8.7605562210083, |
|
"learning_rate": 1.5904819642315854e-05, |
|
"loss": 0.8088, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.0912066927343822, |
|
"grad_norm": 5.1129326820373535, |
|
"learning_rate": 1.5147014246741437e-05, |
|
"loss": 0.8296, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.1366736382649814, |
|
"grad_norm": 11.089341163635254, |
|
"learning_rate": 1.438920885116702e-05, |
|
"loss": 0.8104, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.1821405837955807, |
|
"grad_norm": 10.97964096069336, |
|
"learning_rate": 1.3631403455592605e-05, |
|
"loss": 0.8171, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.22760752932618, |
|
"grad_norm": 21.10997200012207, |
|
"learning_rate": 1.2873598060018188e-05, |
|
"loss": 0.8129, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.273074474856779, |
|
"grad_norm": 1.2680716514587402, |
|
"learning_rate": 1.211579266444377e-05, |
|
"loss": 0.7884, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.3185414203873784, |
|
"grad_norm": 11.013956069946289, |
|
"learning_rate": 1.1357987268869355e-05, |
|
"loss": 0.8036, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.3640083659179776, |
|
"grad_norm": 4.082338333129883, |
|
"learning_rate": 1.0600181873294938e-05, |
|
"loss": 0.7772, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.409475311448577, |
|
"grad_norm": 5.8771071434021, |
|
"learning_rate": 9.842376477720523e-06, |
|
"loss": 0.8058, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.4549422569791761, |
|
"grad_norm": 10.076021194458008, |
|
"learning_rate": 9.084571082146104e-06, |
|
"loss": 0.7847, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.5004092025097755, |
|
"grad_norm": 7.645974636077881, |
|
"learning_rate": 8.326765686571689e-06, |
|
"loss": 0.7793, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.5458761480403747, |
|
"grad_norm": 15.3417329788208, |
|
"learning_rate": 7.568960290997272e-06, |
|
"loss": 0.7844, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.5913430935709738, |
|
"grad_norm": 6.472328186035156, |
|
"learning_rate": 6.8111548954228554e-06, |
|
"loss": 0.7767, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.6368100391015732, |
|
"grad_norm": 10.42813777923584, |
|
"learning_rate": 6.053349499848439e-06, |
|
"loss": 0.7792, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.6822769846321726, |
|
"grad_norm": 3.3688242435455322, |
|
"learning_rate": 5.295544104274023e-06, |
|
"loss": 0.7702, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.7277439301627715, |
|
"grad_norm": 5.880104064941406, |
|
"learning_rate": 4.537738708699606e-06, |
|
"loss": 0.8026, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.7732108756933709, |
|
"grad_norm": 20.736509323120117, |
|
"learning_rate": 3.7799333131251894e-06, |
|
"loss": 0.7483, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.8186778212239703, |
|
"grad_norm": 5.447836399078369, |
|
"learning_rate": 3.0221279175507728e-06, |
|
"loss": 0.7301, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.8641447667545694, |
|
"grad_norm": 6.6951141357421875, |
|
"learning_rate": 2.264322521976356e-06, |
|
"loss": 0.7655, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.9096117122851686, |
|
"grad_norm": 5.314289093017578, |
|
"learning_rate": 1.50651712640194e-06, |
|
"loss": 0.7631, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.955078657815768, |
|
"grad_norm": 9.385791778564453, |
|
"learning_rate": 7.487117308275235e-07, |
|
"loss": 0.744, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 21994, |
|
"total_flos": 1.2277795031512474e+17, |
|
"train_loss": 1.0133790219709247, |
|
"train_runtime": 87315.1804, |
|
"train_samples_per_second": 3.023, |
|
"train_steps_per_second": 0.252 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 21994, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2277795031512474e+17, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|