|
{ |
|
"best_metric": 1.5669885873794556, |
|
"best_model_checkpoint": "/scratch/czm5kz/llama2-7b_8_50_0.0003_sg_finetuned_with_output/checkpoint-180", |
|
"epoch": 47.407407407407405, |
|
"eval_steps": 20, |
|
"global_step": 640, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.398926019668579, |
|
"learning_rate": 0.0002986153846153846, |
|
"loss": 5.3458, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.079797387123108, |
|
"learning_rate": 0.0002963076923076923, |
|
"loss": 4.6876, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 7.786762237548828, |
|
"learning_rate": 0.000294, |
|
"loss": 4.3658, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.545445442199707, |
|
"learning_rate": 0.0002916923076923077, |
|
"loss": 3.4792, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"eval_loss": 3.3610048294067383, |
|
"eval_runtime": 0.4367, |
|
"eval_samples_per_second": 61.822, |
|
"eval_steps_per_second": 9.159, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 1.6737030744552612, |
|
"learning_rate": 0.0002893846153846154, |
|
"loss": 3.0828, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 2.414368152618408, |
|
"learning_rate": 0.00028707692307692305, |
|
"loss": 2.6784, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 3.006044864654541, |
|
"learning_rate": 0.00028476923076923075, |
|
"loss": 2.4781, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.7223751544952393, |
|
"learning_rate": 0.00028246153846153845, |
|
"loss": 2.2626, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"eval_loss": 2.527669906616211, |
|
"eval_runtime": 0.4336, |
|
"eval_samples_per_second": 62.265, |
|
"eval_steps_per_second": 9.224, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 4.4172210693359375, |
|
"learning_rate": 0.00028015384615384615, |
|
"loss": 1.8627, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 3.698852062225342, |
|
"learning_rate": 0.0002778461538461538, |
|
"loss": 1.6018, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 3.051743984222412, |
|
"learning_rate": 0.0002755384615384615, |
|
"loss": 1.4827, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 5.160757064819336, |
|
"learning_rate": 0.0002732307692307692, |
|
"loss": 1.0585, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"eval_loss": 2.1469380855560303, |
|
"eval_runtime": 0.4338, |
|
"eval_samples_per_second": 62.246, |
|
"eval_steps_per_second": 9.222, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 4.5085062980651855, |
|
"learning_rate": 0.0002709230769230769, |
|
"loss": 1.0433, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 3.886779546737671, |
|
"learning_rate": 0.00026861538461538456, |
|
"loss": 0.9697, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 5.56, |
|
"grad_norm": 4.663851261138916, |
|
"learning_rate": 0.00026630769230769226, |
|
"loss": 0.6188, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 5.93, |
|
"grad_norm": 5.576120853424072, |
|
"learning_rate": 0.00026399999999999997, |
|
"loss": 0.7594, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 5.93, |
|
"eval_loss": 1.739160418510437, |
|
"eval_runtime": 0.4327, |
|
"eval_samples_per_second": 62.404, |
|
"eval_steps_per_second": 9.245, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 2.55850887298584, |
|
"learning_rate": 0.00026169230769230767, |
|
"loss": 0.5985, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 3.429755687713623, |
|
"learning_rate": 0.00025938461538461537, |
|
"loss": 0.4957, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 4.753056526184082, |
|
"learning_rate": 0.000257076923076923, |
|
"loss": 0.6527, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 7.41, |
|
"grad_norm": 2.638728380203247, |
|
"learning_rate": 0.0002547692307692307, |
|
"loss": 0.4148, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 7.41, |
|
"eval_loss": 1.5832433700561523, |
|
"eval_runtime": 0.4351, |
|
"eval_samples_per_second": 62.055, |
|
"eval_steps_per_second": 9.193, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 7.78, |
|
"grad_norm": 2.906188726425171, |
|
"learning_rate": 0.0002524615384615384, |
|
"loss": 0.4678, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 8.15, |
|
"grad_norm": 2.0787110328674316, |
|
"learning_rate": 0.00025015384615384613, |
|
"loss": 0.4415, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"grad_norm": 2.6470298767089844, |
|
"learning_rate": 0.00024784615384615383, |
|
"loss": 0.3638, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 8.89, |
|
"grad_norm": 3.3667097091674805, |
|
"learning_rate": 0.00024553846153846154, |
|
"loss": 0.4925, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 8.89, |
|
"eval_loss": 1.6094781160354614, |
|
"eval_runtime": 0.4339, |
|
"eval_samples_per_second": 62.231, |
|
"eval_steps_per_second": 9.219, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 9.26, |
|
"grad_norm": 1.8297497034072876, |
|
"learning_rate": 0.0002432307692307692, |
|
"loss": 0.3612, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 9.63, |
|
"grad_norm": 2.8483166694641113, |
|
"learning_rate": 0.0002409230769230769, |
|
"loss": 0.39, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 3.550173282623291, |
|
"learning_rate": 0.0002386153846153846, |
|
"loss": 0.4645, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 10.37, |
|
"grad_norm": 1.982654333114624, |
|
"learning_rate": 0.0002363076923076923, |
|
"loss": 0.3114, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 10.37, |
|
"eval_loss": 1.7044697999954224, |
|
"eval_runtime": 0.4349, |
|
"eval_samples_per_second": 62.08, |
|
"eval_steps_per_second": 9.197, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 10.74, |
|
"grad_norm": 2.9532089233398438, |
|
"learning_rate": 0.000234, |
|
"loss": 0.4241, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 11.11, |
|
"grad_norm": 1.4060415029525757, |
|
"learning_rate": 0.0002316923076923077, |
|
"loss": 0.3897, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 11.48, |
|
"grad_norm": 1.5898511409759521, |
|
"learning_rate": 0.00022938461538461535, |
|
"loss": 0.3231, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 11.85, |
|
"grad_norm": 2.111783266067505, |
|
"learning_rate": 0.00022707692307692305, |
|
"loss": 0.4056, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 11.85, |
|
"eval_loss": 1.7500680685043335, |
|
"eval_runtime": 0.4341, |
|
"eval_samples_per_second": 62.203, |
|
"eval_steps_per_second": 9.215, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 12.22, |
|
"grad_norm": 1.6774641275405884, |
|
"learning_rate": 0.00022476923076923075, |
|
"loss": 0.3562, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 12.59, |
|
"grad_norm": 1.5955278873443604, |
|
"learning_rate": 0.00022246153846153846, |
|
"loss": 0.3484, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"grad_norm": 1.2137044668197632, |
|
"learning_rate": 0.00022015384615384613, |
|
"loss": 0.3616, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 13.33, |
|
"grad_norm": 1.3046234846115112, |
|
"learning_rate": 0.00021784615384615383, |
|
"loss": 0.3443, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 13.33, |
|
"eval_loss": 1.5669885873794556, |
|
"eval_runtime": 0.4348, |
|
"eval_samples_per_second": 62.098, |
|
"eval_steps_per_second": 9.2, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 13.7, |
|
"grad_norm": 2.3079042434692383, |
|
"learning_rate": 0.00021553846153846154, |
|
"loss": 0.3503, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 14.07, |
|
"grad_norm": 0.9355862736701965, |
|
"learning_rate": 0.00021323076923076921, |
|
"loss": 0.3417, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 14.44, |
|
"grad_norm": 1.706770420074463, |
|
"learning_rate": 0.0002109230769230769, |
|
"loss": 0.3187, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 14.81, |
|
"grad_norm": 1.9932715892791748, |
|
"learning_rate": 0.0002086153846153846, |
|
"loss": 0.3489, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 14.81, |
|
"eval_loss": 1.6050857305526733, |
|
"eval_runtime": 0.4347, |
|
"eval_samples_per_second": 62.115, |
|
"eval_steps_per_second": 9.202, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 15.19, |
|
"grad_norm": 0.947511613368988, |
|
"learning_rate": 0.0002063076923076923, |
|
"loss": 0.3556, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 15.56, |
|
"grad_norm": 1.3406760692596436, |
|
"learning_rate": 0.000204, |
|
"loss": 0.3065, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 15.93, |
|
"grad_norm": 1.2684600353240967, |
|
"learning_rate": 0.00020169230769230767, |
|
"loss": 0.3413, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 16.3, |
|
"grad_norm": 1.7013931274414062, |
|
"learning_rate": 0.00019938461538461538, |
|
"loss": 0.3156, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 16.3, |
|
"eval_loss": 1.7684317827224731, |
|
"eval_runtime": 0.4353, |
|
"eval_samples_per_second": 62.027, |
|
"eval_steps_per_second": 9.189, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 16.67, |
|
"grad_norm": 1.1557570695877075, |
|
"learning_rate": 0.00019707692307692305, |
|
"loss": 0.3084, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 17.04, |
|
"grad_norm": 1.1206583976745605, |
|
"learning_rate": 0.00019476923076923076, |
|
"loss": 0.4039, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 17.41, |
|
"grad_norm": 1.4177170991897583, |
|
"learning_rate": 0.00019246153846153843, |
|
"loss": 0.3247, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 17.78, |
|
"grad_norm": 1.4315319061279297, |
|
"learning_rate": 0.00019015384615384613, |
|
"loss": 0.3107, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 17.78, |
|
"eval_loss": 1.6817841529846191, |
|
"eval_runtime": 0.4335, |
|
"eval_samples_per_second": 62.287, |
|
"eval_steps_per_second": 9.228, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 18.15, |
|
"grad_norm": 1.2824641466140747, |
|
"learning_rate": 0.00018784615384615384, |
|
"loss": 0.3149, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 18.52, |
|
"grad_norm": 1.095780849456787, |
|
"learning_rate": 0.00018553846153846154, |
|
"loss": 0.2916, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 18.89, |
|
"grad_norm": 1.2812676429748535, |
|
"learning_rate": 0.00018323076923076922, |
|
"loss": 0.3353, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 19.26, |
|
"grad_norm": 1.171350359916687, |
|
"learning_rate": 0.0001809230769230769, |
|
"loss": 0.309, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 19.26, |
|
"eval_loss": 1.7549753189086914, |
|
"eval_runtime": 0.4347, |
|
"eval_samples_per_second": 62.113, |
|
"eval_steps_per_second": 9.202, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 19.63, |
|
"grad_norm": 1.1714686155319214, |
|
"learning_rate": 0.0001786153846153846, |
|
"loss": 0.3119, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 1.0897107124328613, |
|
"learning_rate": 0.0001763076923076923, |
|
"loss": 0.326, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 20.37, |
|
"grad_norm": 1.1124438047409058, |
|
"learning_rate": 0.00017399999999999997, |
|
"loss": 0.2985, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 20.74, |
|
"grad_norm": 0.9445765018463135, |
|
"learning_rate": 0.00017169230769230768, |
|
"loss": 0.2918, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 20.74, |
|
"eval_loss": 1.7201834917068481, |
|
"eval_runtime": 0.4514, |
|
"eval_samples_per_second": 59.813, |
|
"eval_steps_per_second": 8.861, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 21.11, |
|
"grad_norm": 0.9067970514297485, |
|
"learning_rate": 0.00016938461538461538, |
|
"loss": 0.3118, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 21.48, |
|
"grad_norm": 1.0378901958465576, |
|
"learning_rate": 0.00016707692307692308, |
|
"loss": 0.3016, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 21.85, |
|
"grad_norm": 1.2347774505615234, |
|
"learning_rate": 0.00016476923076923073, |
|
"loss": 0.2904, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 22.22, |
|
"grad_norm": 0.9320012927055359, |
|
"learning_rate": 0.00016246153846153843, |
|
"loss": 0.3348, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 22.22, |
|
"eval_loss": 1.7654225826263428, |
|
"eval_runtime": 0.4352, |
|
"eval_samples_per_second": 62.036, |
|
"eval_steps_per_second": 9.191, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 22.59, |
|
"grad_norm": 0.8344219326972961, |
|
"learning_rate": 0.00016015384615384614, |
|
"loss": 0.2905, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 22.96, |
|
"grad_norm": 1.3457179069519043, |
|
"learning_rate": 0.00015784615384615384, |
|
"loss": 0.3244, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 23.33, |
|
"grad_norm": 1.1215949058532715, |
|
"learning_rate": 0.00015553846153846152, |
|
"loss": 0.2964, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 23.7, |
|
"grad_norm": 0.8459953665733337, |
|
"learning_rate": 0.00015323076923076922, |
|
"loss": 0.3206, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 23.7, |
|
"eval_loss": 1.8162420988082886, |
|
"eval_runtime": 0.4339, |
|
"eval_samples_per_second": 62.219, |
|
"eval_steps_per_second": 9.218, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 24.07, |
|
"grad_norm": 0.8673954010009766, |
|
"learning_rate": 0.00015092307692307692, |
|
"loss": 0.2768, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 24.44, |
|
"grad_norm": 0.9475287795066833, |
|
"learning_rate": 0.0001486153846153846, |
|
"loss": 0.2818, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 24.81, |
|
"grad_norm": 0.9035760164260864, |
|
"learning_rate": 0.0001463076923076923, |
|
"loss": 0.3097, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 25.19, |
|
"grad_norm": 0.8320503830909729, |
|
"learning_rate": 0.00014399999999999998, |
|
"loss": 0.2968, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 25.19, |
|
"eval_loss": 1.8249504566192627, |
|
"eval_runtime": 0.4343, |
|
"eval_samples_per_second": 62.165, |
|
"eval_steps_per_second": 9.21, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 25.56, |
|
"grad_norm": 1.0484280586242676, |
|
"learning_rate": 0.00014169230769230768, |
|
"loss": 0.2661, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 25.93, |
|
"grad_norm": 1.3034342527389526, |
|
"learning_rate": 0.00013938461538461536, |
|
"loss": 0.3249, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 26.3, |
|
"grad_norm": 0.7918898463249207, |
|
"learning_rate": 0.00013707692307692306, |
|
"loss": 0.2881, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 26.67, |
|
"grad_norm": 0.8644436001777649, |
|
"learning_rate": 0.00013476923076923076, |
|
"loss": 0.3108, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 26.67, |
|
"eval_loss": 1.859883189201355, |
|
"eval_runtime": 0.4339, |
|
"eval_samples_per_second": 62.222, |
|
"eval_steps_per_second": 9.218, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 27.04, |
|
"grad_norm": 0.8299930095672607, |
|
"learning_rate": 0.00013246153846153846, |
|
"loss": 0.2873, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 27.41, |
|
"grad_norm": 0.7016355991363525, |
|
"learning_rate": 0.00013015384615384614, |
|
"loss": 0.3059, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 27.78, |
|
"grad_norm": 0.9215915203094482, |
|
"learning_rate": 0.00012784615384615384, |
|
"loss": 0.2854, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 28.15, |
|
"grad_norm": 0.8328156471252441, |
|
"learning_rate": 0.00012553846153846152, |
|
"loss": 0.274, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 28.15, |
|
"eval_loss": 1.9108076095581055, |
|
"eval_runtime": 0.4355, |
|
"eval_samples_per_second": 62.003, |
|
"eval_steps_per_second": 9.186, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 28.52, |
|
"grad_norm": 0.9325290322303772, |
|
"learning_rate": 0.00012323076923076922, |
|
"loss": 0.2775, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 28.89, |
|
"grad_norm": 0.882301926612854, |
|
"learning_rate": 0.00012092307692307691, |
|
"loss": 0.3149, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 29.26, |
|
"grad_norm": 0.7358686923980713, |
|
"learning_rate": 0.0001186153846153846, |
|
"loss": 0.2858, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 29.63, |
|
"grad_norm": 1.316838264465332, |
|
"learning_rate": 0.00011630769230769229, |
|
"loss": 0.2914, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 29.63, |
|
"eval_loss": 1.9297478199005127, |
|
"eval_runtime": 0.4351, |
|
"eval_samples_per_second": 62.054, |
|
"eval_steps_per_second": 9.193, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 1.2439055442810059, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 0.3069, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 30.37, |
|
"grad_norm": 0.6521609425544739, |
|
"learning_rate": 0.00011169230769230768, |
|
"loss": 0.2713, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 30.74, |
|
"grad_norm": 1.044469952583313, |
|
"learning_rate": 0.00010938461538461537, |
|
"loss": 0.2722, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 31.11, |
|
"grad_norm": 0.9478164911270142, |
|
"learning_rate": 0.00010707692307692306, |
|
"loss": 0.3216, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 31.11, |
|
"eval_loss": 1.9086456298828125, |
|
"eval_runtime": 0.4351, |
|
"eval_samples_per_second": 62.057, |
|
"eval_steps_per_second": 9.194, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 31.48, |
|
"grad_norm": 0.8392044305801392, |
|
"learning_rate": 0.00010476923076923076, |
|
"loss": 0.2754, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 31.85, |
|
"grad_norm": 0.9244445562362671, |
|
"learning_rate": 0.00010246153846153844, |
|
"loss": 0.2899, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 32.22, |
|
"grad_norm": 0.7770041227340698, |
|
"learning_rate": 0.00010015384615384614, |
|
"loss": 0.3074, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 32.59, |
|
"grad_norm": 0.9049687385559082, |
|
"learning_rate": 9.784615384615383e-05, |
|
"loss": 0.2837, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 32.59, |
|
"eval_loss": 1.9283045530319214, |
|
"eval_runtime": 0.436, |
|
"eval_samples_per_second": 61.931, |
|
"eval_steps_per_second": 9.175, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 32.96, |
|
"grad_norm": 0.9280600547790527, |
|
"learning_rate": 9.553846153846153e-05, |
|
"loss": 0.2832, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 33.33, |
|
"grad_norm": 1.0627598762512207, |
|
"learning_rate": 9.323076923076921e-05, |
|
"loss": 0.3019, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 33.7, |
|
"grad_norm": 0.8776496052742004, |
|
"learning_rate": 9.092307692307691e-05, |
|
"loss": 0.2691, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 34.07, |
|
"grad_norm": 1.0023120641708374, |
|
"learning_rate": 8.861538461538462e-05, |
|
"loss": 0.2908, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 34.07, |
|
"eval_loss": 1.9639641046524048, |
|
"eval_runtime": 0.4354, |
|
"eval_samples_per_second": 62.01, |
|
"eval_steps_per_second": 9.187, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 34.44, |
|
"grad_norm": 0.7977310419082642, |
|
"learning_rate": 8.63076923076923e-05, |
|
"loss": 0.2448, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 34.81, |
|
"grad_norm": 0.941286563873291, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.3116, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 35.19, |
|
"grad_norm": 0.8777848482131958, |
|
"learning_rate": 8.169230769230768e-05, |
|
"loss": 0.297, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 35.56, |
|
"grad_norm": 1.213904857635498, |
|
"learning_rate": 7.938461538461539e-05, |
|
"loss": 0.2789, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 35.56, |
|
"eval_loss": 1.9745168685913086, |
|
"eval_runtime": 0.4348, |
|
"eval_samples_per_second": 62.101, |
|
"eval_steps_per_second": 9.2, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 35.93, |
|
"grad_norm": 0.9805024266242981, |
|
"learning_rate": 7.707692307692306e-05, |
|
"loss": 0.2894, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 36.3, |
|
"grad_norm": 0.829560399055481, |
|
"learning_rate": 7.476923076923077e-05, |
|
"loss": 0.2597, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 36.67, |
|
"grad_norm": 1.1455888748168945, |
|
"learning_rate": 7.246153846153846e-05, |
|
"loss": 0.3081, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 37.04, |
|
"grad_norm": 0.9539237022399902, |
|
"learning_rate": 7.015384615384615e-05, |
|
"loss": 0.2773, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 37.04, |
|
"eval_loss": 1.9331037998199463, |
|
"eval_runtime": 0.4366, |
|
"eval_samples_per_second": 61.836, |
|
"eval_steps_per_second": 9.161, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 37.41, |
|
"grad_norm": 0.8768445253372192, |
|
"learning_rate": 6.784615384615383e-05, |
|
"loss": 0.2595, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 37.78, |
|
"grad_norm": 0.9345956444740295, |
|
"learning_rate": 6.553846153846154e-05, |
|
"loss": 0.287, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 38.15, |
|
"grad_norm": 1.006366491317749, |
|
"learning_rate": 6.323076923076923e-05, |
|
"loss": 0.2988, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 38.52, |
|
"grad_norm": 0.999914824962616, |
|
"learning_rate": 6.0923076923076916e-05, |
|
"loss": 0.2734, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 38.52, |
|
"eval_loss": 1.9785383939743042, |
|
"eval_runtime": 0.4355, |
|
"eval_samples_per_second": 62.002, |
|
"eval_steps_per_second": 9.186, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 38.89, |
|
"grad_norm": 0.9857865571975708, |
|
"learning_rate": 5.8615384615384606e-05, |
|
"loss": 0.288, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 39.26, |
|
"grad_norm": 1.0521814823150635, |
|
"learning_rate": 5.63076923076923e-05, |
|
"loss": 0.2844, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 39.63, |
|
"grad_norm": 0.8729674220085144, |
|
"learning_rate": 5.399999999999999e-05, |
|
"loss": 0.2641, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 1.0058977603912354, |
|
"learning_rate": 5.169230769230769e-05, |
|
"loss": 0.2916, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 2.0248498916625977, |
|
"eval_runtime": 0.4342, |
|
"eval_samples_per_second": 62.187, |
|
"eval_steps_per_second": 9.213, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 40.37, |
|
"grad_norm": 1.0700985193252563, |
|
"learning_rate": 4.938461538461538e-05, |
|
"loss": 0.2675, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 40.74, |
|
"grad_norm": 0.9227583408355713, |
|
"learning_rate": 4.707692307692307e-05, |
|
"loss": 0.2607, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 41.11, |
|
"grad_norm": 0.658815324306488, |
|
"learning_rate": 4.476923076923076e-05, |
|
"loss": 0.2806, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 41.48, |
|
"grad_norm": 0.849854052066803, |
|
"learning_rate": 4.246153846153846e-05, |
|
"loss": 0.2703, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 41.48, |
|
"eval_loss": 1.9876518249511719, |
|
"eval_runtime": 0.4356, |
|
"eval_samples_per_second": 61.988, |
|
"eval_steps_per_second": 9.183, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 41.85, |
|
"grad_norm": 1.0666935443878174, |
|
"learning_rate": 4.015384615384615e-05, |
|
"loss": 0.2946, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 42.22, |
|
"grad_norm": 1.0425372123718262, |
|
"learning_rate": 3.784615384615384e-05, |
|
"loss": 0.2973, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 42.59, |
|
"grad_norm": 1.135412335395813, |
|
"learning_rate": 3.553846153846153e-05, |
|
"loss": 0.2846, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 42.96, |
|
"grad_norm": 0.9318748712539673, |
|
"learning_rate": 3.323076923076923e-05, |
|
"loss": 0.2608, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 42.96, |
|
"eval_loss": 2.014052391052246, |
|
"eval_runtime": 0.4365, |
|
"eval_samples_per_second": 61.863, |
|
"eval_steps_per_second": 9.165, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 43.33, |
|
"grad_norm": 0.740696370601654, |
|
"learning_rate": 3.092307692307692e-05, |
|
"loss": 0.2745, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 43.7, |
|
"grad_norm": 1.0200663805007935, |
|
"learning_rate": 2.8615384615384615e-05, |
|
"loss": 0.2722, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 44.07, |
|
"grad_norm": 0.9688575863838196, |
|
"learning_rate": 2.6307692307692304e-05, |
|
"loss": 0.2811, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 44.44, |
|
"grad_norm": 1.2867329120635986, |
|
"learning_rate": 2.3999999999999997e-05, |
|
"loss": 0.262, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 44.44, |
|
"eval_loss": 2.0364649295806885, |
|
"eval_runtime": 0.4343, |
|
"eval_samples_per_second": 62.168, |
|
"eval_steps_per_second": 9.21, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 44.81, |
|
"grad_norm": 1.0685030221939087, |
|
"learning_rate": 2.169230769230769e-05, |
|
"loss": 0.2679, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 45.19, |
|
"grad_norm": 0.9567782282829285, |
|
"learning_rate": 1.9384615384615383e-05, |
|
"loss": 0.2855, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 45.56, |
|
"grad_norm": 0.8821234703063965, |
|
"learning_rate": 1.7076923076923076e-05, |
|
"loss": 0.2807, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 45.93, |
|
"grad_norm": 1.212229609489441, |
|
"learning_rate": 1.4769230769230768e-05, |
|
"loss": 0.2767, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 45.93, |
|
"eval_loss": 2.0518877506256104, |
|
"eval_runtime": 0.4334, |
|
"eval_samples_per_second": 62.291, |
|
"eval_steps_per_second": 9.228, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 46.3, |
|
"grad_norm": 1.0351111888885498, |
|
"learning_rate": 1.2461538461538461e-05, |
|
"loss": 0.2699, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 46.67, |
|
"grad_norm": 1.1969187259674072, |
|
"learning_rate": 1.0153846153846152e-05, |
|
"loss": 0.277, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 47.04, |
|
"grad_norm": 0.7708118557929993, |
|
"learning_rate": 7.846153846153845e-06, |
|
"loss": 0.2637, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 47.41, |
|
"grad_norm": 0.8004487752914429, |
|
"learning_rate": 5.5384615384615385e-06, |
|
"loss": 0.2642, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 47.41, |
|
"eval_loss": 2.0601537227630615, |
|
"eval_runtime": 0.4355, |
|
"eval_samples_per_second": 61.996, |
|
"eval_steps_per_second": 9.185, |
|
"step": 640 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 650, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 20, |
|
"total_flos": 5533698562129920.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|