{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.996825396825397,
  "eval_steps": 500,
  "global_step": 118000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012698412698412698,
      "grad_norm": 0.4761015474796295,
      "learning_rate": 1.9915343915343918e-05,
      "loss": 0.0794,
      "step": 500
    },
    {
      "epoch": 0.025396825396825397,
      "grad_norm": 0.43550318479537964,
      "learning_rate": 1.983068783068783e-05,
      "loss": 0.0811,
      "step": 1000
    },
    {
      "epoch": 0.0380952380952381,
      "grad_norm": 0.4672704339027405,
      "learning_rate": 1.9746031746031748e-05,
      "loss": 0.0819,
      "step": 1500
    },
    {
      "epoch": 0.050793650793650794,
      "grad_norm": 0.5426394939422607,
      "learning_rate": 1.9661375661375664e-05,
      "loss": 0.0829,
      "step": 2000
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 0.3974975645542145,
      "learning_rate": 1.9576719576719577e-05,
      "loss": 0.0816,
      "step": 2500
    },
    {
      "epoch": 0.0761904761904762,
      "grad_norm": 0.6599302887916565,
      "learning_rate": 1.9492063492063494e-05,
      "loss": 0.0815,
      "step": 3000
    },
    {
      "epoch": 0.08888888888888889,
      "grad_norm": 0.35329556465148926,
      "learning_rate": 1.9407407407407407e-05,
      "loss": 0.0841,
      "step": 3500
    },
    {
      "epoch": 0.10158730158730159,
      "grad_norm": 0.42421749234199524,
      "learning_rate": 1.9322751322751327e-05,
      "loss": 0.0833,
      "step": 4000
    },
    {
      "epoch": 0.11428571428571428,
      "grad_norm": 0.4479866325855255,
      "learning_rate": 1.923809523809524e-05,
      "loss": 0.0839,
      "step": 4500
    },
    {
      "epoch": 0.12698412698412698,
      "grad_norm": 0.372086763381958,
      "learning_rate": 1.9153439153439156e-05,
      "loss": 0.0835,
      "step": 5000
    },
    {
      "epoch": 0.13968253968253969,
      "grad_norm": 0.38730981945991516,
      "learning_rate": 1.906878306878307e-05,
      "loss": 0.0841,
      "step": 5500
    },
    {
      "epoch": 0.1523809523809524,
      "grad_norm": 0.5003937482833862,
      "learning_rate": 1.8984126984126986e-05,
      "loss": 0.0829,
      "step": 6000
    },
    {
      "epoch": 0.16507936507936508,
      "grad_norm": 0.42826735973358154,
      "learning_rate": 1.8899470899470903e-05,
      "loss": 0.0835,
      "step": 6500
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 0.49070820212364197,
      "learning_rate": 1.8814814814814816e-05,
      "loss": 0.0827,
      "step": 7000
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 0.4903796911239624,
      "learning_rate": 1.8730158730158732e-05,
      "loss": 0.0823,
      "step": 7500
    },
    {
      "epoch": 0.20317460317460317,
      "grad_norm": 0.4144362211227417,
      "learning_rate": 1.8645502645502645e-05,
      "loss": 0.0842,
      "step": 8000
    },
    {
      "epoch": 0.21587301587301588,
      "grad_norm": 0.6519999504089355,
      "learning_rate": 1.8560846560846562e-05,
      "loss": 0.0827,
      "step": 8500
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 0.37082576751708984,
      "learning_rate": 1.8476190476190478e-05,
      "loss": 0.0835,
      "step": 9000
    },
    {
      "epoch": 0.24126984126984127,
      "grad_norm": 0.319024920463562,
      "learning_rate": 1.8391534391534395e-05,
      "loss": 0.0829,
      "step": 9500
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 0.4173873960971832,
      "learning_rate": 1.8306878306878308e-05,
      "loss": 0.0814,
      "step": 10000
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 0.4521333873271942,
      "learning_rate": 1.8222222222222224e-05,
      "loss": 0.0825,
      "step": 10500
    },
    {
      "epoch": 0.27936507936507937,
      "grad_norm": 0.4372086822986603,
      "learning_rate": 1.8137566137566137e-05,
      "loss": 0.0844,
      "step": 11000
    },
    {
      "epoch": 0.2920634920634921,
      "grad_norm": 0.40673378109931946,
      "learning_rate": 1.8052910052910054e-05,
      "loss": 0.0846,
      "step": 11500
    },
    {
      "epoch": 0.3047619047619048,
      "grad_norm": 0.524502694606781,
      "learning_rate": 1.796825396825397e-05,
      "loss": 0.0843,
      "step": 12000
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 0.36854442954063416,
      "learning_rate": 1.7883597883597884e-05,
      "loss": 0.0838,
      "step": 12500
    },
    {
      "epoch": 0.33015873015873015,
      "grad_norm": 0.4694221019744873,
      "learning_rate": 1.77989417989418e-05,
      "loss": 0.0834,
      "step": 13000
    },
    {
      "epoch": 0.34285714285714286,
      "grad_norm": 0.384512335062027,
      "learning_rate": 1.7714285714285717e-05,
      "loss": 0.0825,
      "step": 13500
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 0.3776947855949402,
      "learning_rate": 1.7629629629629633e-05,
      "loss": 0.081,
      "step": 14000
    },
    {
      "epoch": 0.3682539682539683,
      "grad_norm": 0.44691145420074463,
      "learning_rate": 1.7544973544973546e-05,
      "loss": 0.0844,
      "step": 14500
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.38754552602767944,
      "learning_rate": 1.7460317460317463e-05,
      "loss": 0.0834,
      "step": 15000
    },
    {
      "epoch": 0.39365079365079364,
      "grad_norm": 0.3924926221370697,
      "learning_rate": 1.7375661375661376e-05,
      "loss": 0.0836,
      "step": 15500
    },
    {
      "epoch": 0.40634920634920635,
      "grad_norm": 0.41219380497932434,
      "learning_rate": 1.7291005291005292e-05,
      "loss": 0.0827,
      "step": 16000
    },
    {
      "epoch": 0.41904761904761906,
      "grad_norm": 0.36697277426719666,
      "learning_rate": 1.720634920634921e-05,
      "loss": 0.0833,
      "step": 16500
    },
    {
      "epoch": 0.43174603174603177,
      "grad_norm": 0.37833482027053833,
      "learning_rate": 1.7121693121693125e-05,
      "loss": 0.0831,
      "step": 17000
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.33408552408218384,
      "learning_rate": 1.7037037037037038e-05,
      "loss": 0.0818,
      "step": 17500
    },
    {
      "epoch": 0.45714285714285713,
      "grad_norm": 0.4245634377002716,
      "learning_rate": 1.6952380952380955e-05,
      "loss": 0.0838,
      "step": 18000
    },
    {
      "epoch": 0.46984126984126984,
      "grad_norm": 0.4424809217453003,
      "learning_rate": 1.6867724867724868e-05,
      "loss": 0.0828,
      "step": 18500
    },
    {
      "epoch": 0.48253968253968255,
      "grad_norm": 0.47369641065597534,
      "learning_rate": 1.6783068783068784e-05,
      "loss": 0.0828,
      "step": 19000
    },
    {
      "epoch": 0.49523809523809526,
      "grad_norm": 0.417057603597641,
      "learning_rate": 1.66984126984127e-05,
      "loss": 0.0839,
      "step": 19500
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 0.450612336397171,
      "learning_rate": 1.6613756613756614e-05,
      "loss": 0.0832,
      "step": 20000
    },
    {
      "epoch": 0.5206349206349207,
      "grad_norm": 0.35937097668647766,
      "learning_rate": 1.652910052910053e-05,
      "loss": 0.0816,
      "step": 20500
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 0.4366040527820587,
      "learning_rate": 1.6444444444444444e-05,
      "loss": 0.0817,
      "step": 21000
    },
    {
      "epoch": 0.546031746031746,
      "grad_norm": 0.3630824387073517,
      "learning_rate": 1.6359788359788363e-05,
      "loss": 0.0823,
      "step": 21500
    },
    {
      "epoch": 0.5587301587301587,
      "grad_norm": 0.45653077960014343,
      "learning_rate": 1.6275132275132277e-05,
      "loss": 0.0814,
      "step": 22000
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.4124685525894165,
      "learning_rate": 1.6190476190476193e-05,
      "loss": 0.0828,
      "step": 22500
    },
    {
      "epoch": 0.5841269841269842,
      "grad_norm": 0.4182330071926117,
      "learning_rate": 1.6105820105820106e-05,
      "loss": 0.0825,
      "step": 23000
    },
    {
      "epoch": 0.5968253968253968,
      "grad_norm": 0.7457558512687683,
      "learning_rate": 1.6021164021164023e-05,
      "loss": 0.0828,
      "step": 23500
    },
    {
      "epoch": 0.6095238095238096,
      "grad_norm": 0.41049671173095703,
      "learning_rate": 1.5936507936507936e-05,
      "loss": 0.0831,
      "step": 24000
    },
    {
      "epoch": 0.6222222222222222,
      "grad_norm": 0.4230283498764038,
      "learning_rate": 1.5851851851851852e-05,
      "loss": 0.0823,
      "step": 24500
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 0.38568949699401855,
      "learning_rate": 1.576719576719577e-05,
      "loss": 0.0811,
      "step": 25000
    },
    {
      "epoch": 0.6476190476190476,
      "grad_norm": 0.42709481716156006,
      "learning_rate": 1.5682539682539685e-05,
      "loss": 0.0818,
      "step": 25500
    },
    {
      "epoch": 0.6603174603174603,
      "grad_norm": 0.37508589029312134,
      "learning_rate": 1.55978835978836e-05,
      "loss": 0.0828,
      "step": 26000
    },
    {
      "epoch": 0.6730158730158731,
      "grad_norm": 0.43134260177612305,
      "learning_rate": 1.5513227513227515e-05,
      "loss": 0.0824,
      "step": 26500
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 0.37693992257118225,
      "learning_rate": 1.542857142857143e-05,
      "loss": 0.0811,
      "step": 27000
    },
    {
      "epoch": 0.6984126984126984,
      "grad_norm": 0.34098678827285767,
      "learning_rate": 1.5343915343915344e-05,
      "loss": 0.0819,
      "step": 27500
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 0.47179728746414185,
      "learning_rate": 1.525925925925926e-05,
      "loss": 0.082,
      "step": 28000
    },
    {
      "epoch": 0.7238095238095238,
      "grad_norm": 0.4184609651565552,
      "learning_rate": 1.5174603174603176e-05,
      "loss": 0.0825,
      "step": 28500
    },
    {
      "epoch": 0.7365079365079366,
      "grad_norm": 0.3582792282104492,
      "learning_rate": 1.508994708994709e-05,
      "loss": 0.0821,
      "step": 29000
    },
    {
      "epoch": 0.7492063492063492,
      "grad_norm": 0.5200299620628357,
      "learning_rate": 1.5005291005291007e-05,
      "loss": 0.0817,
      "step": 29500
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.4461567997932434,
      "learning_rate": 1.4920634920634922e-05,
      "loss": 0.0814,
      "step": 30000
    },
    {
      "epoch": 0.7746031746031746,
      "grad_norm": 0.3920634388923645,
      "learning_rate": 1.4835978835978837e-05,
      "loss": 0.0819,
      "step": 30500
    },
    {
      "epoch": 0.7873015873015873,
      "grad_norm": 0.41001540422439575,
      "learning_rate": 1.4751322751322751e-05,
      "loss": 0.0802,
      "step": 31000
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.4187995493412018,
      "learning_rate": 1.4666666666666666e-05,
      "loss": 0.0816,
      "step": 31500
    },
    {
      "epoch": 0.8126984126984127,
      "grad_norm": 0.39321765303611755,
      "learning_rate": 1.4582010582010584e-05,
      "loss": 0.0824,
      "step": 32000
    },
    {
      "epoch": 0.8253968253968254,
      "grad_norm": 0.3958302140235901,
      "learning_rate": 1.44973544973545e-05,
      "loss": 0.0801,
      "step": 32500
    },
    {
      "epoch": 0.8380952380952381,
      "grad_norm": 0.3932056725025177,
      "learning_rate": 1.4412698412698414e-05,
      "loss": 0.0808,
      "step": 33000
    },
    {
      "epoch": 0.8507936507936508,
      "grad_norm": 0.3314465284347534,
      "learning_rate": 1.4328042328042329e-05,
      "loss": 0.0827,
      "step": 33500
    },
    {
      "epoch": 0.8634920634920635,
      "grad_norm": 0.43675485253334045,
      "learning_rate": 1.4243386243386244e-05,
      "loss": 0.0811,
      "step": 34000
    },
    {
      "epoch": 0.8761904761904762,
      "grad_norm": 0.6284595131874084,
      "learning_rate": 1.415873015873016e-05,
      "loss": 0.0805,
      "step": 34500
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.39293691515922546,
      "learning_rate": 1.4074074074074075e-05,
      "loss": 0.0803,
      "step": 35000
    },
    {
      "epoch": 0.9015873015873016,
      "grad_norm": 0.4092639088630676,
      "learning_rate": 1.398941798941799e-05,
      "loss": 0.0813,
      "step": 35500
    },
    {
      "epoch": 0.9142857142857143,
      "grad_norm": 0.41005492210388184,
      "learning_rate": 1.3904761904761905e-05,
      "loss": 0.0811,
      "step": 36000
    },
    {
      "epoch": 0.926984126984127,
      "grad_norm": 0.5190646052360535,
      "learning_rate": 1.3820105820105821e-05,
      "loss": 0.0811,
      "step": 36500
    },
    {
      "epoch": 0.9396825396825397,
      "grad_norm": 0.32034316658973694,
      "learning_rate": 1.3735449735449738e-05,
      "loss": 0.0812,
      "step": 37000
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.4857613742351532,
      "learning_rate": 1.3650793650793652e-05,
      "loss": 0.0813,
      "step": 37500
    },
    {
      "epoch": 0.9650793650793651,
      "grad_norm": 0.4523787796497345,
      "learning_rate": 1.3566137566137567e-05,
      "loss": 0.0816,
      "step": 38000
    },
    {
      "epoch": 0.9777777777777777,
      "grad_norm": 0.4204433262348175,
      "learning_rate": 1.3481481481481482e-05,
      "loss": 0.0806,
      "step": 38500
    },
    {
      "epoch": 0.9904761904761905,
      "grad_norm": 0.4313475787639618,
      "learning_rate": 1.3396825396825397e-05,
      "loss": 0.0806,
      "step": 39000
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.07647726684808731,
      "eval_runtime": 270.8786,
      "eval_samples_per_second": 516.837,
      "eval_steps_per_second": 64.605,
      "step": 39375
    },
    {
      "epoch": 1.0031746031746032,
      "grad_norm": 0.44939786195755005,
      "learning_rate": 1.3312169312169313e-05,
      "loss": 0.0795,
      "step": 39500
    },
    {
      "epoch": 1.0158730158730158,
      "grad_norm": 0.48013949394226074,
      "learning_rate": 1.322751322751323e-05,
      "loss": 0.0751,
      "step": 40000
    },
    {
      "epoch": 1.0285714285714285,
      "grad_norm": 0.4185923635959625,
      "learning_rate": 1.3142857142857145e-05,
      "loss": 0.0736,
      "step": 40500
    },
    {
      "epoch": 1.0412698412698413,
      "grad_norm": 0.397386759519577,
      "learning_rate": 1.305820105820106e-05,
      "loss": 0.0748,
      "step": 41000
    },
    {
      "epoch": 1.053968253968254,
      "grad_norm": 0.39524805545806885,
      "learning_rate": 1.2973544973544974e-05,
      "loss": 0.0735,
      "step": 41500
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 0.34505075216293335,
      "learning_rate": 1.288888888888889e-05,
      "loss": 0.0746,
      "step": 42000
    },
    {
      "epoch": 1.0793650793650793,
      "grad_norm": 0.37381839752197266,
      "learning_rate": 1.2804232804232805e-05,
      "loss": 0.0728,
      "step": 42500
    },
    {
      "epoch": 1.0920634920634922,
      "grad_norm": 0.6797782182693481,
      "learning_rate": 1.271957671957672e-05,
      "loss": 0.0741,
      "step": 43000
    },
    {
      "epoch": 1.1047619047619048,
      "grad_norm": 0.41272956132888794,
      "learning_rate": 1.2634920634920635e-05,
      "loss": 0.0738,
      "step": 43500
    },
    {
      "epoch": 1.1174603174603175,
      "grad_norm": 0.382468044757843,
      "learning_rate": 1.255026455026455e-05,
      "loss": 0.0738,
      "step": 44000
    },
    {
      "epoch": 1.1301587301587301,
      "grad_norm": 0.3978229761123657,
      "learning_rate": 1.2465608465608468e-05,
      "loss": 0.074,
      "step": 44500
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.3431326746940613,
      "learning_rate": 1.2380952380952383e-05,
      "loss": 0.0745,
      "step": 45000
    },
    {
      "epoch": 1.1555555555555554,
      "grad_norm": 0.3610997200012207,
      "learning_rate": 1.2296296296296298e-05,
      "loss": 0.0729,
      "step": 45500
    },
    {
      "epoch": 1.1682539682539683,
      "grad_norm": 0.49680083990097046,
      "learning_rate": 1.2211640211640212e-05,
      "loss": 0.0732,
      "step": 46000
    },
    {
      "epoch": 1.180952380952381,
      "grad_norm": 0.3833047151565552,
      "learning_rate": 1.2126984126984127e-05,
      "loss": 0.0732,
      "step": 46500
    },
    {
      "epoch": 1.1936507936507936,
      "grad_norm": 0.2808152139186859,
      "learning_rate": 1.2042328042328044e-05,
      "loss": 0.0733,
      "step": 47000
    },
    {
      "epoch": 1.2063492063492063,
      "grad_norm": 0.5429581999778748,
      "learning_rate": 1.1957671957671959e-05,
      "loss": 0.0729,
      "step": 47500
    },
    {
      "epoch": 1.2190476190476192,
      "grad_norm": 0.34248363971710205,
      "learning_rate": 1.1873015873015873e-05,
      "loss": 0.0746,
      "step": 48000
    },
    {
      "epoch": 1.2317460317460318,
      "grad_norm": 0.5099675059318542,
      "learning_rate": 1.1788359788359788e-05,
      "loss": 0.0739,
      "step": 48500
    },
    {
      "epoch": 1.2444444444444445,
      "grad_norm": 0.3858914375305176,
      "learning_rate": 1.1703703703703703e-05,
      "loss": 0.0721,
      "step": 49000
    },
    {
      "epoch": 1.2571428571428571,
      "grad_norm": 0.3453405201435089,
      "learning_rate": 1.1619047619047621e-05,
      "loss": 0.0737,
      "step": 49500
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 0.4647195637226105,
      "learning_rate": 1.1534391534391536e-05,
      "loss": 0.0736,
      "step": 50000
    },
    {
      "epoch": 1.2825396825396824,
      "grad_norm": 0.4548490345478058,
      "learning_rate": 1.144973544973545e-05,
      "loss": 0.0742,
      "step": 50500
    },
    {
      "epoch": 1.2952380952380953,
      "grad_norm": 0.4145970046520233,
      "learning_rate": 1.1365079365079366e-05,
      "loss": 0.0748,
      "step": 51000
    },
    {
      "epoch": 1.307936507936508,
      "grad_norm": 0.4032251536846161,
      "learning_rate": 1.128042328042328e-05,
      "loss": 0.073,
      "step": 51500
    },
    {
      "epoch": 1.3206349206349206,
      "grad_norm": 0.5053452849388123,
      "learning_rate": 1.1195767195767197e-05,
      "loss": 0.0742,
      "step": 52000
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.42281991243362427,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 0.0728,
      "step": 52500
    },
    {
      "epoch": 1.3460317460317461,
      "grad_norm": 0.4088720679283142,
      "learning_rate": 1.1026455026455028e-05,
      "loss": 0.0737,
      "step": 53000
    },
    {
      "epoch": 1.3587301587301588,
      "grad_norm": 0.4682016968727112,
      "learning_rate": 1.0941798941798943e-05,
      "loss": 0.0754,
      "step": 53500
    },
    {
      "epoch": 1.3714285714285714,
      "grad_norm": 0.35886242985725403,
      "learning_rate": 1.0857142857142858e-05,
      "loss": 0.0739,
      "step": 54000
    },
    {
      "epoch": 1.384126984126984,
      "grad_norm": 0.5034026503562927,
      "learning_rate": 1.0772486772486774e-05,
      "loss": 0.0744,
      "step": 54500
    },
    {
      "epoch": 1.3968253968253967,
      "grad_norm": 0.6038418412208557,
      "learning_rate": 1.0687830687830689e-05,
      "loss": 0.073,
      "step": 55000
    },
    {
      "epoch": 1.4095238095238094,
      "grad_norm": 0.4263134002685547,
      "learning_rate": 1.0603174603174604e-05,
      "loss": 0.0743,
      "step": 55500
    },
    {
      "epoch": 1.4222222222222223,
      "grad_norm": 0.3092331886291504,
      "learning_rate": 1.0518518518518519e-05,
      "loss": 0.0747,
      "step": 56000
    },
    {
      "epoch": 1.434920634920635,
      "grad_norm": 0.41775885224342346,
      "learning_rate": 1.0433862433862433e-05,
      "loss": 0.0736,
      "step": 56500
    },
    {
      "epoch": 1.4476190476190476,
      "grad_norm": 0.3818839192390442,
      "learning_rate": 1.0349206349206352e-05,
      "loss": 0.0736,
      "step": 57000
    },
    {
      "epoch": 1.4603174603174602,
      "grad_norm": 0.42527565360069275,
      "learning_rate": 1.0264550264550266e-05,
      "loss": 0.0741,
      "step": 57500
    },
    {
      "epoch": 1.4730158730158731,
      "grad_norm": 0.37903305888175964,
      "learning_rate": 1.0179894179894181e-05,
      "loss": 0.0727,
      "step": 58000
    },
    {
      "epoch": 1.4857142857142858,
      "grad_norm": 0.41770797967910767,
      "learning_rate": 1.0095238095238096e-05,
      "loss": 0.0733,
      "step": 58500
    },
    {
      "epoch": 1.4984126984126984,
      "grad_norm": 0.6334396600723267,
      "learning_rate": 1.001058201058201e-05,
      "loss": 0.073,
      "step": 59000
    },
    {
      "epoch": 1.511111111111111,
      "grad_norm": 0.3735711872577667,
      "learning_rate": 9.925925925925927e-06,
      "loss": 0.0739,
      "step": 59500
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 0.40507185459136963,
      "learning_rate": 9.841269841269842e-06,
      "loss": 0.0731,
      "step": 60000
    },
    {
      "epoch": 1.5365079365079364,
      "grad_norm": 0.4952349066734314,
      "learning_rate": 9.756613756613757e-06,
      "loss": 0.0741,
      "step": 60500
    },
    {
      "epoch": 1.5492063492063493,
      "grad_norm": 0.4670361280441284,
      "learning_rate": 9.671957671957672e-06,
      "loss": 0.0736,
      "step": 61000
    },
    {
      "epoch": 1.561904761904762,
      "grad_norm": 0.2984641492366791,
      "learning_rate": 9.587301587301588e-06,
      "loss": 0.0732,
      "step": 61500
    },
    {
      "epoch": 1.5746031746031746,
      "grad_norm": 0.5101374983787537,
      "learning_rate": 9.502645502645503e-06,
      "loss": 0.0759,
      "step": 62000
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 0.38656944036483765,
      "learning_rate": 9.417989417989418e-06,
      "loss": 0.0741,
      "step": 62500
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.508953869342804,
      "learning_rate": 9.333333333333334e-06,
      "loss": 0.0737,
      "step": 63000
    },
    {
      "epoch": 1.6126984126984127,
      "grad_norm": 0.49415382742881775,
      "learning_rate": 9.248677248677249e-06,
      "loss": 0.0736,
      "step": 63500
    },
    {
      "epoch": 1.6253968253968254,
      "grad_norm": 0.48334264755249023,
      "learning_rate": 9.164021164021166e-06,
      "loss": 0.0739,
      "step": 64000
    },
    {
      "epoch": 1.638095238095238,
      "grad_norm": 0.3960755467414856,
      "learning_rate": 9.07936507936508e-06,
      "loss": 0.0723,
      "step": 64500
    },
    {
      "epoch": 1.6507936507936507,
      "grad_norm": 0.4537145495414734,
      "learning_rate": 8.994708994708995e-06,
      "loss": 0.0723,
      "step": 65000
    },
    {
      "epoch": 1.6634920634920634,
      "grad_norm": 0.4759564697742462,
      "learning_rate": 8.910052910052912e-06,
      "loss": 0.0737,
      "step": 65500
    },
    {
      "epoch": 1.6761904761904762,
      "grad_norm": 0.564620316028595,
      "learning_rate": 8.825396825396827e-06,
      "loss": 0.0726,
      "step": 66000
    },
    {
      "epoch": 1.6888888888888889,
      "grad_norm": 0.3793913424015045,
      "learning_rate": 8.740740740740741e-06,
      "loss": 0.0725,
      "step": 66500
    },
    {
      "epoch": 1.7015873015873015,
      "grad_norm": 0.3748345673084259,
      "learning_rate": 8.656084656084656e-06,
      "loss": 0.0734,
      "step": 67000
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.31550857424736023,
      "learning_rate": 8.571428571428571e-06,
      "loss": 0.0728,
      "step": 67500
    },
    {
      "epoch": 1.726984126984127,
      "grad_norm": 0.39485469460487366,
      "learning_rate": 8.486772486772487e-06,
      "loss": 0.074,
      "step": 68000
    },
    {
      "epoch": 1.7396825396825397,
      "grad_norm": 0.3833816647529602,
      "learning_rate": 8.402116402116402e-06,
      "loss": 0.0727,
      "step": 68500
    },
    {
      "epoch": 1.7523809523809524,
      "grad_norm": 0.45526403188705444,
      "learning_rate": 8.317460317460319e-06,
      "loss": 0.0721,
      "step": 69000
    },
    {
      "epoch": 1.765079365079365,
      "grad_norm": 0.4437309801578522,
      "learning_rate": 8.232804232804234e-06,
      "loss": 0.0714,
      "step": 69500
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.3827795386314392,
      "learning_rate": 8.148148148148148e-06,
      "loss": 0.0736,
      "step": 70000
    },
    {
      "epoch": 1.7904761904761903,
      "grad_norm": 0.3821280896663666,
      "learning_rate": 8.063492063492065e-06,
      "loss": 0.0742,
      "step": 70500
    },
    {
      "epoch": 1.8031746031746032,
      "grad_norm": 0.3558200001716614,
      "learning_rate": 7.97883597883598e-06,
      "loss": 0.0733,
      "step": 71000
    },
    {
      "epoch": 1.8158730158730159,
      "grad_norm": 0.35507771372795105,
      "learning_rate": 7.894179894179896e-06,
      "loss": 0.073,
      "step": 71500
    },
    {
      "epoch": 1.8285714285714287,
      "grad_norm": 0.4878668785095215,
      "learning_rate": 7.809523809523811e-06,
      "loss": 0.0726,
      "step": 72000
    },
    {
      "epoch": 1.8412698412698414,
      "grad_norm": 0.46924230456352234,
      "learning_rate": 7.724867724867726e-06,
      "loss": 0.0729,
      "step": 72500
    },
    {
      "epoch": 1.853968253968254,
      "grad_norm": 0.5545886158943176,
      "learning_rate": 7.64021164021164e-06,
      "loss": 0.0728,
      "step": 73000
    },
    {
      "epoch": 1.8666666666666667,
      "grad_norm": 0.33820512890815735,
      "learning_rate": 7.555555555555556e-06,
      "loss": 0.0727,
      "step": 73500
    },
    {
      "epoch": 1.8793650793650793,
      "grad_norm": 0.4180295169353485,
      "learning_rate": 7.470899470899472e-06,
      "loss": 0.0722,
      "step": 74000
    },
    {
      "epoch": 1.892063492063492,
      "grad_norm": 0.41895756125450134,
      "learning_rate": 7.386243386243387e-06,
      "loss": 0.0721,
      "step": 74500
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.37801581621170044,
      "learning_rate": 7.301587301587301e-06,
      "loss": 0.0754,
      "step": 75000
    },
    {
      "epoch": 1.9174603174603173,
      "grad_norm": 0.42890599370002747,
      "learning_rate": 7.216931216931218e-06,
      "loss": 0.0727,
      "step": 75500
    },
    {
      "epoch": 1.9301587301587302,
      "grad_norm": 0.36311328411102295,
      "learning_rate": 7.132275132275133e-06,
      "loss": 0.0732,
      "step": 76000
    },
    {
      "epoch": 1.9428571428571428,
      "grad_norm": 0.4069361090660095,
      "learning_rate": 7.047619047619048e-06,
      "loss": 0.0731,
      "step": 76500
    },
    {
      "epoch": 1.9555555555555557,
      "grad_norm": 0.38275209069252014,
      "learning_rate": 6.962962962962964e-06,
      "loss": 0.0729,
      "step": 77000
    },
    {
      "epoch": 1.9682539682539684,
      "grad_norm": 0.3496081829071045,
      "learning_rate": 6.878306878306879e-06,
      "loss": 0.0725,
      "step": 77500
    },
    {
      "epoch": 1.980952380952381,
      "grad_norm": 0.37429070472717285,
      "learning_rate": 6.7936507936507944e-06,
      "loss": 0.0726,
      "step": 78000
    },
    {
      "epoch": 1.9936507936507937,
      "grad_norm": 0.4195725619792938,
      "learning_rate": 6.708994708994709e-06,
      "loss": 0.0724,
      "step": 78500
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.0749795213341713,
      "eval_runtime": 269.1515,
      "eval_samples_per_second": 520.153,
      "eval_steps_per_second": 65.019,
      "step": 78750
    },
    {
      "epoch": 2.0063492063492063,
      "grad_norm": 0.4257189631462097,
      "learning_rate": 6.624338624338626e-06,
      "loss": 0.07,
      "step": 79000
    },
    {
      "epoch": 2.019047619047619,
      "grad_norm": 0.37472862005233765,
      "learning_rate": 6.5396825396825405e-06,
      "loss": 0.0664,
      "step": 79500
    },
    {
      "epoch": 2.0317460317460316,
      "grad_norm": 0.4728703796863556,
      "learning_rate": 6.455026455026455e-06,
      "loss": 0.0664,
      "step": 80000
    },
    {
      "epoch": 2.0444444444444443,
      "grad_norm": 0.42774897813796997,
      "learning_rate": 6.370370370370371e-06,
      "loss": 0.0661,
      "step": 80500
    },
    {
      "epoch": 2.057142857142857,
      "grad_norm": 0.4025447368621826,
      "learning_rate": 6.285714285714286e-06,
      "loss": 0.0679,
      "step": 81000
    },
    {
      "epoch": 2.06984126984127,
      "grad_norm": 0.41302409768104553,
      "learning_rate": 6.201058201058202e-06,
      "loss": 0.0662,
      "step": 81500
    },
    {
      "epoch": 2.0825396825396827,
      "grad_norm": 0.4339478611946106,
      "learning_rate": 6.116402116402117e-06,
      "loss": 0.0662,
      "step": 82000
    },
    {
      "epoch": 2.0952380952380953,
      "grad_norm": 0.38711288571357727,
      "learning_rate": 6.031746031746032e-06,
      "loss": 0.0677,
      "step": 82500
    },
    {
      "epoch": 2.107936507936508,
      "grad_norm": 0.44815394282341003,
      "learning_rate": 5.9470899470899475e-06,
      "loss": 0.0674,
      "step": 83000
    },
    {
      "epoch": 2.1206349206349207,
      "grad_norm": 0.4252176582813263,
      "learning_rate": 5.862433862433863e-06,
      "loss": 0.067,
      "step": 83500
    },
    {
      "epoch": 2.1333333333333333,
      "grad_norm": 0.4019823670387268,
      "learning_rate": 5.777777777777778e-06,
      "loss": 0.0676,
      "step": 84000
    },
    {
      "epoch": 2.146031746031746,
      "grad_norm": 0.37775805592536926,
      "learning_rate": 5.693121693121694e-06,
      "loss": 0.0671,
      "step": 84500
    },
    {
      "epoch": 2.1587301587301586,
      "grad_norm": 0.5179104208946228,
      "learning_rate": 5.6084656084656084e-06,
      "loss": 0.0671,
      "step": 85000
    },
    {
      "epoch": 2.1714285714285713,
      "grad_norm": 0.37160980701446533,
      "learning_rate": 5.523809523809525e-06,
      "loss": 0.0677,
      "step": 85500
    },
    {
      "epoch": 2.1841269841269844,
      "grad_norm": 0.4610843360424042,
      "learning_rate": 5.43915343915344e-06,
      "loss": 0.0671,
      "step": 86000
    },
    {
      "epoch": 2.196825396825397,
      "grad_norm": 0.4135109484195709,
      "learning_rate": 5.3544973544973545e-06,
      "loss": 0.0678,
      "step": 86500
    },
    {
      "epoch": 2.2095238095238097,
      "grad_norm": 0.38079920411109924,
      "learning_rate": 5.26984126984127e-06,
      "loss": 0.0678,
      "step": 87000
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.39888954162597656,
      "learning_rate": 5.185185185185185e-06,
      "loss": 0.0669,
      "step": 87500
    },
    {
      "epoch": 2.234920634920635,
      "grad_norm": 0.37562116980552673,
      "learning_rate": 5.1005291005291015e-06,
      "loss": 0.0661,
      "step": 88000
    },
    {
      "epoch": 2.2476190476190476,
      "grad_norm": 0.4394863247871399,
      "learning_rate": 5.015873015873016e-06,
      "loss": 0.0671,
      "step": 88500
    },
    {
      "epoch": 2.2603174603174603,
      "grad_norm": 0.4748270511627197,
      "learning_rate": 4.931216931216932e-06,
      "loss": 0.067,
      "step": 89000
    },
    {
      "epoch": 2.273015873015873,
      "grad_norm": 0.4593636095523834,
      "learning_rate": 4.846560846560847e-06,
      "loss": 0.067,
      "step": 89500
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.3517415225505829,
      "learning_rate": 4.761904761904762e-06,
      "loss": 0.0669,
      "step": 90000
    },
    {
      "epoch": 2.2984126984126982,
      "grad_norm": 0.40983742475509644,
      "learning_rate": 4.677248677248677e-06,
      "loss": 0.0681,
      "step": 90500
    },
    {
      "epoch": 2.311111111111111,
      "grad_norm": 0.46570950746536255,
      "learning_rate": 4.592592592592593e-06,
      "loss": 0.0672,
      "step": 91000
    },
    {
      "epoch": 2.323809523809524,
      "grad_norm": 0.4733307957649231,
      "learning_rate": 4.5079365079365085e-06,
      "loss": 0.0671,
      "step": 91500
    },
    {
      "epoch": 2.3365079365079366,
      "grad_norm": 0.38432806730270386,
      "learning_rate": 4.423280423280424e-06,
      "loss": 0.0672,
      "step": 92000
    },
    {
      "epoch": 2.3492063492063493,
      "grad_norm": 0.31346267461776733,
      "learning_rate": 4.338624338624339e-06,
      "loss": 0.066,
      "step": 92500
    },
    {
      "epoch": 2.361904761904762,
      "grad_norm": 0.5612916350364685,
      "learning_rate": 4.2539682539682546e-06,
      "loss": 0.0666,
      "step": 93000
    },
    {
      "epoch": 2.3746031746031746,
      "grad_norm": 0.3445761501789093,
      "learning_rate": 4.169312169312169e-06,
      "loss": 0.0675,
      "step": 93500
    },
    {
      "epoch": 2.3873015873015873,
      "grad_norm": 0.41335174441337585,
      "learning_rate": 4.084656084656085e-06,
      "loss": 0.0676,
      "step": 94000
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.42691895365715027,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.0669,
      "step": 94500
    },
    {
      "epoch": 2.4126984126984126,
      "grad_norm": 0.44459807872772217,
      "learning_rate": 3.9153439153439155e-06,
      "loss": 0.0661,
      "step": 95000
    },
    {
      "epoch": 2.425396825396825,
      "grad_norm": 0.39611610770225525,
      "learning_rate": 3.830687830687831e-06,
      "loss": 0.0665,
      "step": 95500
    },
    {
      "epoch": 2.4380952380952383,
      "grad_norm": 0.41603508591651917,
      "learning_rate": 3.7460317460317463e-06,
      "loss": 0.067,
      "step": 96000
    },
    {
      "epoch": 2.450793650793651,
      "grad_norm": 0.45685020089149475,
      "learning_rate": 3.661375661375662e-06,
      "loss": 0.0664,
      "step": 96500
    },
    {
      "epoch": 2.4634920634920636,
      "grad_norm": 0.41426390409469604,
      "learning_rate": 3.5767195767195772e-06,
      "loss": 0.0665,
      "step": 97000
    },
    {
      "epoch": 2.4761904761904763,
      "grad_norm": 0.4311801791191101,
      "learning_rate": 3.492063492063492e-06,
      "loss": 0.0673,
      "step": 97500
    },
    {
      "epoch": 2.488888888888889,
      "grad_norm": 0.39366066455841064,
      "learning_rate": 3.4074074074074077e-06,
      "loss": 0.0676,
      "step": 98000
    },
    {
      "epoch": 2.5015873015873016,
      "grad_norm": 0.46240171790122986,
      "learning_rate": 3.322751322751323e-06,
      "loss": 0.0674,
      "step": 98500
    },
    {
      "epoch": 2.5142857142857142,
      "grad_norm": 0.45865318179130554,
      "learning_rate": 3.2380952380952385e-06,
      "loss": 0.0671,
      "step": 99000
    },
    {
      "epoch": 2.526984126984127,
      "grad_norm": 0.38405075669288635,
      "learning_rate": 3.1534391534391538e-06,
      "loss": 0.0678,
      "step": 99500
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 0.48667874932289124,
      "learning_rate": 3.068783068783069e-06,
      "loss": 0.0661,
      "step": 100000
    },
    {
      "epoch": 2.552380952380952,
      "grad_norm": 0.3919212818145752,
      "learning_rate": 2.984126984126984e-06,
      "loss": 0.0654,
      "step": 100500
    },
    {
      "epoch": 2.565079365079365,
      "grad_norm": 0.4081352651119232,
      "learning_rate": 2.8994708994709e-06,
      "loss": 0.0669,
      "step": 101000
    },
    {
      "epoch": 2.5777777777777775,
      "grad_norm": 0.33449599146842957,
      "learning_rate": 2.814814814814815e-06,
      "loss": 0.0655,
      "step": 101500
    },
    {
      "epoch": 2.5904761904761906,
      "grad_norm": 0.37508487701416016,
      "learning_rate": 2.7301587301587303e-06,
      "loss": 0.0659,
      "step": 102000
    },
    {
      "epoch": 2.6031746031746033,
      "grad_norm": 0.43301156163215637,
      "learning_rate": 2.6455026455026455e-06,
      "loss": 0.0684,
      "step": 102500
    },
    {
      "epoch": 2.615873015873016,
      "grad_norm": 0.31652727723121643,
      "learning_rate": 2.560846560846561e-06,
      "loss": 0.0674,
      "step": 103000
    },
    {
      "epoch": 2.6285714285714286,
      "grad_norm": 0.38132810592651367,
      "learning_rate": 2.4761904761904764e-06,
      "loss": 0.0665,
      "step": 103500
    },
    {
      "epoch": 2.641269841269841,
      "grad_norm": 0.4249517023563385,
      "learning_rate": 2.3915343915343916e-06,
      "loss": 0.0678,
      "step": 104000
    },
    {
      "epoch": 2.653968253968254,
      "grad_norm": 0.42605915665626526,
      "learning_rate": 2.3068783068783073e-06,
      "loss": 0.0659,
      "step": 104500
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.4002751111984253,
      "learning_rate": 2.222222222222222e-06,
      "loss": 0.0665,
      "step": 105000
    },
    {
      "epoch": 2.6793650793650796,
      "grad_norm": 0.5232521891593933,
      "learning_rate": 2.1375661375661377e-06,
      "loss": 0.0676,
      "step": 105500
    },
    {
      "epoch": 2.6920634920634923,
      "grad_norm": 0.409422367811203,
      "learning_rate": 2.0529100529100534e-06,
      "loss": 0.0658,
      "step": 106000
    },
    {
      "epoch": 2.704761904761905,
      "grad_norm": 0.3971617519855499,
      "learning_rate": 1.968253968253968e-06,
      "loss": 0.0655,
      "step": 106500
    },
    {
      "epoch": 2.7174603174603176,
      "grad_norm": 0.35877570509910583,
      "learning_rate": 1.8835978835978838e-06,
      "loss": 0.0673,
      "step": 107000
    },
    {
      "epoch": 2.7301587301587302,
      "grad_norm": 0.36749425530433655,
      "learning_rate": 1.798941798941799e-06,
      "loss": 0.0681,
      "step": 107500
    },
    {
      "epoch": 2.742857142857143,
      "grad_norm": 0.3727457821369171,
      "learning_rate": 1.7142857142857145e-06,
      "loss": 0.0665,
      "step": 108000
    },
    {
      "epoch": 2.7555555555555555,
      "grad_norm": 0.40977808833122253,
      "learning_rate": 1.62962962962963e-06,
      "loss": 0.0672,
      "step": 108500
    },
    {
      "epoch": 2.768253968253968,
      "grad_norm": 0.4265407621860504,
      "learning_rate": 1.5449735449735451e-06,
      "loss": 0.0666,
      "step": 109000
    },
    {
      "epoch": 2.780952380952381,
      "grad_norm": 0.3894596993923187,
      "learning_rate": 1.4603174603174606e-06,
      "loss": 0.0673,
      "step": 109500
    },
    {
      "epoch": 2.7936507936507935,
      "grad_norm": 0.526606023311615,
      "learning_rate": 1.3756613756613758e-06,
      "loss": 0.0676,
      "step": 110000
    },
    {
      "epoch": 2.806349206349206,
      "grad_norm": 0.2910812497138977,
      "learning_rate": 1.2910052910052912e-06,
      "loss": 0.0671,
      "step": 110500
    },
    {
      "epoch": 2.819047619047619,
      "grad_norm": 0.3701234757900238,
      "learning_rate": 1.2063492063492065e-06,
      "loss": 0.0666,
      "step": 111000
    },
    {
      "epoch": 2.831746031746032,
      "grad_norm": 0.3969452679157257,
      "learning_rate": 1.1216931216931217e-06,
      "loss": 0.0668,
      "step": 111500
    },
    {
      "epoch": 2.8444444444444446,
      "grad_norm": 0.4415270686149597,
      "learning_rate": 1.0370370370370371e-06,
      "loss": 0.0661,
      "step": 112000
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.3490103483200073,
      "learning_rate": 9.523809523809525e-07,
      "loss": 0.0658,
      "step": 112500
    },
    {
      "epoch": 2.86984126984127,
      "grad_norm": 0.35733526945114136,
      "learning_rate": 8.677248677248679e-07,
      "loss": 0.0661,
      "step": 113000
    },
    {
      "epoch": 2.8825396825396825,
      "grad_norm": 0.4992692172527313,
      "learning_rate": 7.830687830687832e-07,
      "loss": 0.068,
      "step": 113500
    },
    {
      "epoch": 2.895238095238095,
      "grad_norm": 0.4047030508518219,
      "learning_rate": 6.984126984126984e-07,
      "loss": 0.0683,
      "step": 114000
    },
    {
      "epoch": 2.907936507936508,
      "grad_norm": 0.4468993544578552,
      "learning_rate": 6.137566137566138e-07,
      "loss": 0.0664,
      "step": 114500
    },
    {
      "epoch": 2.9206349206349205,
      "grad_norm": 0.41356751322746277,
      "learning_rate": 5.291005291005291e-07,
      "loss": 0.067,
      "step": 115000
    },
    {
      "epoch": 2.9333333333333336,
      "grad_norm": 0.4459340572357178,
      "learning_rate": 4.444444444444445e-07,
      "loss": 0.0671,
      "step": 115500
    },
    {
      "epoch": 2.9460317460317462,
      "grad_norm": 0.42610964179039,
      "learning_rate": 3.597883597883598e-07,
      "loss": 0.0664,
      "step": 116000
    },
    {
      "epoch": 2.958730158730159,
      "grad_norm": 0.5059521794319153,
      "learning_rate": 2.7513227513227515e-07,
      "loss": 0.0658,
      "step": 116500
    },
    {
      "epoch": 2.9714285714285715,
      "grad_norm": 0.3404170572757721,
      "learning_rate": 1.904761904761905e-07,
      "loss": 0.0667,
      "step": 117000
    },
    {
      "epoch": 2.984126984126984,
      "grad_norm": 0.4388870894908905,
      "learning_rate": 1.0582010582010582e-07,
      "loss": 0.0658,
      "step": 117500
    },
    {
      "epoch": 2.996825396825397,
      "grad_norm": 0.39170539379119873,
      "learning_rate": 2.1164021164021167e-08,
      "loss": 0.0665,
      "step": 118000
    }
  ],
  "logging_steps": 500,
  "max_steps": 118125,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4389780414464e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}