{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.996825396825397,
"eval_steps": 500,
"global_step": 118000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012698412698412698,
"grad_norm": 0.4761015474796295,
"learning_rate": 1.9915343915343918e-05,
"loss": 0.0794,
"step": 500
},
{
"epoch": 0.025396825396825397,
"grad_norm": 0.43550318479537964,
"learning_rate": 1.983068783068783e-05,
"loss": 0.0811,
"step": 1000
},
{
"epoch": 0.0380952380952381,
"grad_norm": 0.4672704339027405,
"learning_rate": 1.9746031746031748e-05,
"loss": 0.0819,
"step": 1500
},
{
"epoch": 0.050793650793650794,
"grad_norm": 0.5426394939422607,
"learning_rate": 1.9661375661375664e-05,
"loss": 0.0829,
"step": 2000
},
{
"epoch": 0.06349206349206349,
"grad_norm": 0.3974975645542145,
"learning_rate": 1.9576719576719577e-05,
"loss": 0.0816,
"step": 2500
},
{
"epoch": 0.0761904761904762,
"grad_norm": 0.6599302887916565,
"learning_rate": 1.9492063492063494e-05,
"loss": 0.0815,
"step": 3000
},
{
"epoch": 0.08888888888888889,
"grad_norm": 0.35329556465148926,
"learning_rate": 1.9407407407407407e-05,
"loss": 0.0841,
"step": 3500
},
{
"epoch": 0.10158730158730159,
"grad_norm": 0.42421749234199524,
"learning_rate": 1.9322751322751327e-05,
"loss": 0.0833,
"step": 4000
},
{
"epoch": 0.11428571428571428,
"grad_norm": 0.4479866325855255,
"learning_rate": 1.923809523809524e-05,
"loss": 0.0839,
"step": 4500
},
{
"epoch": 0.12698412698412698,
"grad_norm": 0.372086763381958,
"learning_rate": 1.9153439153439156e-05,
"loss": 0.0835,
"step": 5000
},
{
"epoch": 0.13968253968253969,
"grad_norm": 0.38730981945991516,
"learning_rate": 1.906878306878307e-05,
"loss": 0.0841,
"step": 5500
},
{
"epoch": 0.1523809523809524,
"grad_norm": 0.5003937482833862,
"learning_rate": 1.8984126984126986e-05,
"loss": 0.0829,
"step": 6000
},
{
"epoch": 0.16507936507936508,
"grad_norm": 0.42826735973358154,
"learning_rate": 1.8899470899470903e-05,
"loss": 0.0835,
"step": 6500
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.49070820212364197,
"learning_rate": 1.8814814814814816e-05,
"loss": 0.0827,
"step": 7000
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.4903796911239624,
"learning_rate": 1.8730158730158732e-05,
"loss": 0.0823,
"step": 7500
},
{
"epoch": 0.20317460317460317,
"grad_norm": 0.4144362211227417,
"learning_rate": 1.8645502645502645e-05,
"loss": 0.0842,
"step": 8000
},
{
"epoch": 0.21587301587301588,
"grad_norm": 0.6519999504089355,
"learning_rate": 1.8560846560846562e-05,
"loss": 0.0827,
"step": 8500
},
{
"epoch": 0.22857142857142856,
"grad_norm": 0.37082576751708984,
"learning_rate": 1.8476190476190478e-05,
"loss": 0.0835,
"step": 9000
},
{
"epoch": 0.24126984126984127,
"grad_norm": 0.319024920463562,
"learning_rate": 1.8391534391534395e-05,
"loss": 0.0829,
"step": 9500
},
{
"epoch": 0.25396825396825395,
"grad_norm": 0.4173873960971832,
"learning_rate": 1.8306878306878308e-05,
"loss": 0.0814,
"step": 10000
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.4521333873271942,
"learning_rate": 1.8222222222222224e-05,
"loss": 0.0825,
"step": 10500
},
{
"epoch": 0.27936507936507937,
"grad_norm": 0.4372086822986603,
"learning_rate": 1.8137566137566137e-05,
"loss": 0.0844,
"step": 11000
},
{
"epoch": 0.2920634920634921,
"grad_norm": 0.40673378109931946,
"learning_rate": 1.8052910052910054e-05,
"loss": 0.0846,
"step": 11500
},
{
"epoch": 0.3047619047619048,
"grad_norm": 0.524502694606781,
"learning_rate": 1.796825396825397e-05,
"loss": 0.0843,
"step": 12000
},
{
"epoch": 0.31746031746031744,
"grad_norm": 0.36854442954063416,
"learning_rate": 1.7883597883597884e-05,
"loss": 0.0838,
"step": 12500
},
{
"epoch": 0.33015873015873015,
"grad_norm": 0.4694221019744873,
"learning_rate": 1.77989417989418e-05,
"loss": 0.0834,
"step": 13000
},
{
"epoch": 0.34285714285714286,
"grad_norm": 0.384512335062027,
"learning_rate": 1.7714285714285717e-05,
"loss": 0.0825,
"step": 13500
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.3776947855949402,
"learning_rate": 1.7629629629629633e-05,
"loss": 0.081,
"step": 14000
},
{
"epoch": 0.3682539682539683,
"grad_norm": 0.44691145420074463,
"learning_rate": 1.7544973544973546e-05,
"loss": 0.0844,
"step": 14500
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.38754552602767944,
"learning_rate": 1.7460317460317463e-05,
"loss": 0.0834,
"step": 15000
},
{
"epoch": 0.39365079365079364,
"grad_norm": 0.3924926221370697,
"learning_rate": 1.7375661375661376e-05,
"loss": 0.0836,
"step": 15500
},
{
"epoch": 0.40634920634920635,
"grad_norm": 0.41219380497932434,
"learning_rate": 1.7291005291005292e-05,
"loss": 0.0827,
"step": 16000
},
{
"epoch": 0.41904761904761906,
"grad_norm": 0.36697277426719666,
"learning_rate": 1.720634920634921e-05,
"loss": 0.0833,
"step": 16500
},
{
"epoch": 0.43174603174603177,
"grad_norm": 0.37833482027053833,
"learning_rate": 1.7121693121693125e-05,
"loss": 0.0831,
"step": 17000
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.33408552408218384,
"learning_rate": 1.7037037037037038e-05,
"loss": 0.0818,
"step": 17500
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.4245634377002716,
"learning_rate": 1.6952380952380955e-05,
"loss": 0.0838,
"step": 18000
},
{
"epoch": 0.46984126984126984,
"grad_norm": 0.4424809217453003,
"learning_rate": 1.6867724867724868e-05,
"loss": 0.0828,
"step": 18500
},
{
"epoch": 0.48253968253968255,
"grad_norm": 0.47369641065597534,
"learning_rate": 1.6783068783068784e-05,
"loss": 0.0828,
"step": 19000
},
{
"epoch": 0.49523809523809526,
"grad_norm": 0.417057603597641,
"learning_rate": 1.66984126984127e-05,
"loss": 0.0839,
"step": 19500
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.450612336397171,
"learning_rate": 1.6613756613756614e-05,
"loss": 0.0832,
"step": 20000
},
{
"epoch": 0.5206349206349207,
"grad_norm": 0.35937097668647766,
"learning_rate": 1.652910052910053e-05,
"loss": 0.0816,
"step": 20500
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.4366040527820587,
"learning_rate": 1.6444444444444444e-05,
"loss": 0.0817,
"step": 21000
},
{
"epoch": 0.546031746031746,
"grad_norm": 0.3630824387073517,
"learning_rate": 1.6359788359788363e-05,
"loss": 0.0823,
"step": 21500
},
{
"epoch": 0.5587301587301587,
"grad_norm": 0.45653077960014343,
"learning_rate": 1.6275132275132277e-05,
"loss": 0.0814,
"step": 22000
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.4124685525894165,
"learning_rate": 1.6190476190476193e-05,
"loss": 0.0828,
"step": 22500
},
{
"epoch": 0.5841269841269842,
"grad_norm": 0.4182330071926117,
"learning_rate": 1.6105820105820106e-05,
"loss": 0.0825,
"step": 23000
},
{
"epoch": 0.5968253968253968,
"grad_norm": 0.7457558512687683,
"learning_rate": 1.6021164021164023e-05,
"loss": 0.0828,
"step": 23500
},
{
"epoch": 0.6095238095238096,
"grad_norm": 0.41049671173095703,
"learning_rate": 1.5936507936507936e-05,
"loss": 0.0831,
"step": 24000
},
{
"epoch": 0.6222222222222222,
"grad_norm": 0.4230283498764038,
"learning_rate": 1.5851851851851852e-05,
"loss": 0.0823,
"step": 24500
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.38568949699401855,
"learning_rate": 1.576719576719577e-05,
"loss": 0.0811,
"step": 25000
},
{
"epoch": 0.6476190476190476,
"grad_norm": 0.42709481716156006,
"learning_rate": 1.5682539682539685e-05,
"loss": 0.0818,
"step": 25500
},
{
"epoch": 0.6603174603174603,
"grad_norm": 0.37508589029312134,
"learning_rate": 1.55978835978836e-05,
"loss": 0.0828,
"step": 26000
},
{
"epoch": 0.6730158730158731,
"grad_norm": 0.43134260177612305,
"learning_rate": 1.5513227513227515e-05,
"loss": 0.0824,
"step": 26500
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.37693992257118225,
"learning_rate": 1.542857142857143e-05,
"loss": 0.0811,
"step": 27000
},
{
"epoch": 0.6984126984126984,
"grad_norm": 0.34098678827285767,
"learning_rate": 1.5343915343915344e-05,
"loss": 0.0819,
"step": 27500
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.47179728746414185,
"learning_rate": 1.525925925925926e-05,
"loss": 0.082,
"step": 28000
},
{
"epoch": 0.7238095238095238,
"grad_norm": 0.4184609651565552,
"learning_rate": 1.5174603174603176e-05,
"loss": 0.0825,
"step": 28500
},
{
"epoch": 0.7365079365079366,
"grad_norm": 0.3582792282104492,
"learning_rate": 1.508994708994709e-05,
"loss": 0.0821,
"step": 29000
},
{
"epoch": 0.7492063492063492,
"grad_norm": 0.5200299620628357,
"learning_rate": 1.5005291005291007e-05,
"loss": 0.0817,
"step": 29500
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.4461567997932434,
"learning_rate": 1.4920634920634922e-05,
"loss": 0.0814,
"step": 30000
},
{
"epoch": 0.7746031746031746,
"grad_norm": 0.3920634388923645,
"learning_rate": 1.4835978835978837e-05,
"loss": 0.0819,
"step": 30500
},
{
"epoch": 0.7873015873015873,
"grad_norm": 0.41001540422439575,
"learning_rate": 1.4751322751322751e-05,
"loss": 0.0802,
"step": 31000
},
{
"epoch": 0.8,
"grad_norm": 0.4187995493412018,
"learning_rate": 1.4666666666666666e-05,
"loss": 0.0816,
"step": 31500
},
{
"epoch": 0.8126984126984127,
"grad_norm": 0.39321765303611755,
"learning_rate": 1.4582010582010584e-05,
"loss": 0.0824,
"step": 32000
},
{
"epoch": 0.8253968253968254,
"grad_norm": 0.3958302140235901,
"learning_rate": 1.44973544973545e-05,
"loss": 0.0801,
"step": 32500
},
{
"epoch": 0.8380952380952381,
"grad_norm": 0.3932056725025177,
"learning_rate": 1.4412698412698414e-05,
"loss": 0.0808,
"step": 33000
},
{
"epoch": 0.8507936507936508,
"grad_norm": 0.3314465284347534,
"learning_rate": 1.4328042328042329e-05,
"loss": 0.0827,
"step": 33500
},
{
"epoch": 0.8634920634920635,
"grad_norm": 0.43675485253334045,
"learning_rate": 1.4243386243386244e-05,
"loss": 0.0811,
"step": 34000
},
{
"epoch": 0.8761904761904762,
"grad_norm": 0.6284595131874084,
"learning_rate": 1.415873015873016e-05,
"loss": 0.0805,
"step": 34500
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.39293691515922546,
"learning_rate": 1.4074074074074075e-05,
"loss": 0.0803,
"step": 35000
},
{
"epoch": 0.9015873015873016,
"grad_norm": 0.4092639088630676,
"learning_rate": 1.398941798941799e-05,
"loss": 0.0813,
"step": 35500
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.41005492210388184,
"learning_rate": 1.3904761904761905e-05,
"loss": 0.0811,
"step": 36000
},
{
"epoch": 0.926984126984127,
"grad_norm": 0.5190646052360535,
"learning_rate": 1.3820105820105821e-05,
"loss": 0.0811,
"step": 36500
},
{
"epoch": 0.9396825396825397,
"grad_norm": 0.32034316658973694,
"learning_rate": 1.3735449735449738e-05,
"loss": 0.0812,
"step": 37000
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.4857613742351532,
"learning_rate": 1.3650793650793652e-05,
"loss": 0.0813,
"step": 37500
},
{
"epoch": 0.9650793650793651,
"grad_norm": 0.4523787796497345,
"learning_rate": 1.3566137566137567e-05,
"loss": 0.0816,
"step": 38000
},
{
"epoch": 0.9777777777777777,
"grad_norm": 0.4204433262348175,
"learning_rate": 1.3481481481481482e-05,
"loss": 0.0806,
"step": 38500
},
{
"epoch": 0.9904761904761905,
"grad_norm": 0.4313475787639618,
"learning_rate": 1.3396825396825397e-05,
"loss": 0.0806,
"step": 39000
},
{
"epoch": 1.0,
"eval_loss": 0.07647726684808731,
"eval_runtime": 270.8786,
"eval_samples_per_second": 516.837,
"eval_steps_per_second": 64.605,
"step": 39375
},
{
"epoch": 1.0031746031746032,
"grad_norm": 0.44939786195755005,
"learning_rate": 1.3312169312169313e-05,
"loss": 0.0795,
"step": 39500
},
{
"epoch": 1.0158730158730158,
"grad_norm": 0.48013949394226074,
"learning_rate": 1.322751322751323e-05,
"loss": 0.0751,
"step": 40000
},
{
"epoch": 1.0285714285714285,
"grad_norm": 0.4185923635959625,
"learning_rate": 1.3142857142857145e-05,
"loss": 0.0736,
"step": 40500
},
{
"epoch": 1.0412698412698413,
"grad_norm": 0.397386759519577,
"learning_rate": 1.305820105820106e-05,
"loss": 0.0748,
"step": 41000
},
{
"epoch": 1.053968253968254,
"grad_norm": 0.39524805545806885,
"learning_rate": 1.2973544973544974e-05,
"loss": 0.0735,
"step": 41500
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.34505075216293335,
"learning_rate": 1.288888888888889e-05,
"loss": 0.0746,
"step": 42000
},
{
"epoch": 1.0793650793650793,
"grad_norm": 0.37381839752197266,
"learning_rate": 1.2804232804232805e-05,
"loss": 0.0728,
"step": 42500
},
{
"epoch": 1.0920634920634922,
"grad_norm": 0.6797782182693481,
"learning_rate": 1.271957671957672e-05,
"loss": 0.0741,
"step": 43000
},
{
"epoch": 1.1047619047619048,
"grad_norm": 0.41272956132888794,
"learning_rate": 1.2634920634920635e-05,
"loss": 0.0738,
"step": 43500
},
{
"epoch": 1.1174603174603175,
"grad_norm": 0.382468044757843,
"learning_rate": 1.255026455026455e-05,
"loss": 0.0738,
"step": 44000
},
{
"epoch": 1.1301587301587301,
"grad_norm": 0.3978229761123657,
"learning_rate": 1.2465608465608468e-05,
"loss": 0.074,
"step": 44500
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.3431326746940613,
"learning_rate": 1.2380952380952383e-05,
"loss": 0.0745,
"step": 45000
},
{
"epoch": 1.1555555555555554,
"grad_norm": 0.3610997200012207,
"learning_rate": 1.2296296296296298e-05,
"loss": 0.0729,
"step": 45500
},
{
"epoch": 1.1682539682539683,
"grad_norm": 0.49680083990097046,
"learning_rate": 1.2211640211640212e-05,
"loss": 0.0732,
"step": 46000
},
{
"epoch": 1.180952380952381,
"grad_norm": 0.3833047151565552,
"learning_rate": 1.2126984126984127e-05,
"loss": 0.0732,
"step": 46500
},
{
"epoch": 1.1936507936507936,
"grad_norm": 0.2808152139186859,
"learning_rate": 1.2042328042328044e-05,
"loss": 0.0733,
"step": 47000
},
{
"epoch": 1.2063492063492063,
"grad_norm": 0.5429581999778748,
"learning_rate": 1.1957671957671959e-05,
"loss": 0.0729,
"step": 47500
},
{
"epoch": 1.2190476190476192,
"grad_norm": 0.34248363971710205,
"learning_rate": 1.1873015873015873e-05,
"loss": 0.0746,
"step": 48000
},
{
"epoch": 1.2317460317460318,
"grad_norm": 0.5099675059318542,
"learning_rate": 1.1788359788359788e-05,
"loss": 0.0739,
"step": 48500
},
{
"epoch": 1.2444444444444445,
"grad_norm": 0.3858914375305176,
"learning_rate": 1.1703703703703703e-05,
"loss": 0.0721,
"step": 49000
},
{
"epoch": 1.2571428571428571,
"grad_norm": 0.3453405201435089,
"learning_rate": 1.1619047619047621e-05,
"loss": 0.0737,
"step": 49500
},
{
"epoch": 1.2698412698412698,
"grad_norm": 0.4647195637226105,
"learning_rate": 1.1534391534391536e-05,
"loss": 0.0736,
"step": 50000
},
{
"epoch": 1.2825396825396824,
"grad_norm": 0.4548490345478058,
"learning_rate": 1.144973544973545e-05,
"loss": 0.0742,
"step": 50500
},
{
"epoch": 1.2952380952380953,
"grad_norm": 0.4145970046520233,
"learning_rate": 1.1365079365079366e-05,
"loss": 0.0748,
"step": 51000
},
{
"epoch": 1.307936507936508,
"grad_norm": 0.4032251536846161,
"learning_rate": 1.128042328042328e-05,
"loss": 0.073,
"step": 51500
},
{
"epoch": 1.3206349206349206,
"grad_norm": 0.5053452849388123,
"learning_rate": 1.1195767195767197e-05,
"loss": 0.0742,
"step": 52000
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.42281991243362427,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.0728,
"step": 52500
},
{
"epoch": 1.3460317460317461,
"grad_norm": 0.4088720679283142,
"learning_rate": 1.1026455026455028e-05,
"loss": 0.0737,
"step": 53000
},
{
"epoch": 1.3587301587301588,
"grad_norm": 0.4682016968727112,
"learning_rate": 1.0941798941798943e-05,
"loss": 0.0754,
"step": 53500
},
{
"epoch": 1.3714285714285714,
"grad_norm": 0.35886242985725403,
"learning_rate": 1.0857142857142858e-05,
"loss": 0.0739,
"step": 54000
},
{
"epoch": 1.384126984126984,
"grad_norm": 0.5034026503562927,
"learning_rate": 1.0772486772486774e-05,
"loss": 0.0744,
"step": 54500
},
{
"epoch": 1.3968253968253967,
"grad_norm": 0.6038418412208557,
"learning_rate": 1.0687830687830689e-05,
"loss": 0.073,
"step": 55000
},
{
"epoch": 1.4095238095238094,
"grad_norm": 0.4263134002685547,
"learning_rate": 1.0603174603174604e-05,
"loss": 0.0743,
"step": 55500
},
{
"epoch": 1.4222222222222223,
"grad_norm": 0.3092331886291504,
"learning_rate": 1.0518518518518519e-05,
"loss": 0.0747,
"step": 56000
},
{
"epoch": 1.434920634920635,
"grad_norm": 0.41775885224342346,
"learning_rate": 1.0433862433862433e-05,
"loss": 0.0736,
"step": 56500
},
{
"epoch": 1.4476190476190476,
"grad_norm": 0.3818839192390442,
"learning_rate": 1.0349206349206352e-05,
"loss": 0.0736,
"step": 57000
},
{
"epoch": 1.4603174603174602,
"grad_norm": 0.42527565360069275,
"learning_rate": 1.0264550264550266e-05,
"loss": 0.0741,
"step": 57500
},
{
"epoch": 1.4730158730158731,
"grad_norm": 0.37903305888175964,
"learning_rate": 1.0179894179894181e-05,
"loss": 0.0727,
"step": 58000
},
{
"epoch": 1.4857142857142858,
"grad_norm": 0.41770797967910767,
"learning_rate": 1.0095238095238096e-05,
"loss": 0.0733,
"step": 58500
},
{
"epoch": 1.4984126984126984,
"grad_norm": 0.6334396600723267,
"learning_rate": 1.001058201058201e-05,
"loss": 0.073,
"step": 59000
},
{
"epoch": 1.511111111111111,
"grad_norm": 0.3735711872577667,
"learning_rate": 9.925925925925927e-06,
"loss": 0.0739,
"step": 59500
},
{
"epoch": 1.5238095238095237,
"grad_norm": 0.40507185459136963,
"learning_rate": 9.841269841269842e-06,
"loss": 0.0731,
"step": 60000
},
{
"epoch": 1.5365079365079364,
"grad_norm": 0.4952349066734314,
"learning_rate": 9.756613756613757e-06,
"loss": 0.0741,
"step": 60500
},
{
"epoch": 1.5492063492063493,
"grad_norm": 0.4670361280441284,
"learning_rate": 9.671957671957672e-06,
"loss": 0.0736,
"step": 61000
},
{
"epoch": 1.561904761904762,
"grad_norm": 0.2984641492366791,
"learning_rate": 9.587301587301588e-06,
"loss": 0.0732,
"step": 61500
},
{
"epoch": 1.5746031746031746,
"grad_norm": 0.5101374983787537,
"learning_rate": 9.502645502645503e-06,
"loss": 0.0759,
"step": 62000
},
{
"epoch": 1.5873015873015874,
"grad_norm": 0.38656944036483765,
"learning_rate": 9.417989417989418e-06,
"loss": 0.0741,
"step": 62500
},
{
"epoch": 1.6,
"grad_norm": 0.508953869342804,
"learning_rate": 9.333333333333334e-06,
"loss": 0.0737,
"step": 63000
},
{
"epoch": 1.6126984126984127,
"grad_norm": 0.49415382742881775,
"learning_rate": 9.248677248677249e-06,
"loss": 0.0736,
"step": 63500
},
{
"epoch": 1.6253968253968254,
"grad_norm": 0.48334264755249023,
"learning_rate": 9.164021164021166e-06,
"loss": 0.0739,
"step": 64000
},
{
"epoch": 1.638095238095238,
"grad_norm": 0.3960755467414856,
"learning_rate": 9.07936507936508e-06,
"loss": 0.0723,
"step": 64500
},
{
"epoch": 1.6507936507936507,
"grad_norm": 0.4537145495414734,
"learning_rate": 8.994708994708995e-06,
"loss": 0.0723,
"step": 65000
},
{
"epoch": 1.6634920634920634,
"grad_norm": 0.4759564697742462,
"learning_rate": 8.910052910052912e-06,
"loss": 0.0737,
"step": 65500
},
{
"epoch": 1.6761904761904762,
"grad_norm": 0.564620316028595,
"learning_rate": 8.825396825396827e-06,
"loss": 0.0726,
"step": 66000
},
{
"epoch": 1.6888888888888889,
"grad_norm": 0.3793913424015045,
"learning_rate": 8.740740740740741e-06,
"loss": 0.0725,
"step": 66500
},
{
"epoch": 1.7015873015873015,
"grad_norm": 0.3748345673084259,
"learning_rate": 8.656084656084656e-06,
"loss": 0.0734,
"step": 67000
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.31550857424736023,
"learning_rate": 8.571428571428571e-06,
"loss": 0.0728,
"step": 67500
},
{
"epoch": 1.726984126984127,
"grad_norm": 0.39485469460487366,
"learning_rate": 8.486772486772487e-06,
"loss": 0.074,
"step": 68000
},
{
"epoch": 1.7396825396825397,
"grad_norm": 0.3833816647529602,
"learning_rate": 8.402116402116402e-06,
"loss": 0.0727,
"step": 68500
},
{
"epoch": 1.7523809523809524,
"grad_norm": 0.45526403188705444,
"learning_rate": 8.317460317460319e-06,
"loss": 0.0721,
"step": 69000
},
{
"epoch": 1.765079365079365,
"grad_norm": 0.4437309801578522,
"learning_rate": 8.232804232804234e-06,
"loss": 0.0714,
"step": 69500
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.3827795386314392,
"learning_rate": 8.148148148148148e-06,
"loss": 0.0736,
"step": 70000
},
{
"epoch": 1.7904761904761903,
"grad_norm": 0.3821280896663666,
"learning_rate": 8.063492063492065e-06,
"loss": 0.0742,
"step": 70500
},
{
"epoch": 1.8031746031746032,
"grad_norm": 0.3558200001716614,
"learning_rate": 7.97883597883598e-06,
"loss": 0.0733,
"step": 71000
},
{
"epoch": 1.8158730158730159,
"grad_norm": 0.35507771372795105,
"learning_rate": 7.894179894179896e-06,
"loss": 0.073,
"step": 71500
},
{
"epoch": 1.8285714285714287,
"grad_norm": 0.4878668785095215,
"learning_rate": 7.809523809523811e-06,
"loss": 0.0726,
"step": 72000
},
{
"epoch": 1.8412698412698414,
"grad_norm": 0.46924230456352234,
"learning_rate": 7.724867724867726e-06,
"loss": 0.0729,
"step": 72500
},
{
"epoch": 1.853968253968254,
"grad_norm": 0.5545886158943176,
"learning_rate": 7.64021164021164e-06,
"loss": 0.0728,
"step": 73000
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.33820512890815735,
"learning_rate": 7.555555555555556e-06,
"loss": 0.0727,
"step": 73500
},
{
"epoch": 1.8793650793650793,
"grad_norm": 0.4180295169353485,
"learning_rate": 7.470899470899472e-06,
"loss": 0.0722,
"step": 74000
},
{
"epoch": 1.892063492063492,
"grad_norm": 0.41895756125450134,
"learning_rate": 7.386243386243387e-06,
"loss": 0.0721,
"step": 74500
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.37801581621170044,
"learning_rate": 7.301587301587301e-06,
"loss": 0.0754,
"step": 75000
},
{
"epoch": 1.9174603174603173,
"grad_norm": 0.42890599370002747,
"learning_rate": 7.216931216931218e-06,
"loss": 0.0727,
"step": 75500
},
{
"epoch": 1.9301587301587302,
"grad_norm": 0.36311328411102295,
"learning_rate": 7.132275132275133e-06,
"loss": 0.0732,
"step": 76000
},
{
"epoch": 1.9428571428571428,
"grad_norm": 0.4069361090660095,
"learning_rate": 7.047619047619048e-06,
"loss": 0.0731,
"step": 76500
},
{
"epoch": 1.9555555555555557,
"grad_norm": 0.38275209069252014,
"learning_rate": 6.962962962962964e-06,
"loss": 0.0729,
"step": 77000
},
{
"epoch": 1.9682539682539684,
"grad_norm": 0.3496081829071045,
"learning_rate": 6.878306878306879e-06,
"loss": 0.0725,
"step": 77500
},
{
"epoch": 1.980952380952381,
"grad_norm": 0.37429070472717285,
"learning_rate": 6.7936507936507944e-06,
"loss": 0.0726,
"step": 78000
},
{
"epoch": 1.9936507936507937,
"grad_norm": 0.4195725619792938,
"learning_rate": 6.708994708994709e-06,
"loss": 0.0724,
"step": 78500
},
{
"epoch": 2.0,
"eval_loss": 0.0749795213341713,
"eval_runtime": 269.1515,
"eval_samples_per_second": 520.153,
"eval_steps_per_second": 65.019,
"step": 78750
},
{
"epoch": 2.0063492063492063,
"grad_norm": 0.4257189631462097,
"learning_rate": 6.624338624338626e-06,
"loss": 0.07,
"step": 79000
},
{
"epoch": 2.019047619047619,
"grad_norm": 0.37472862005233765,
"learning_rate": 6.5396825396825405e-06,
"loss": 0.0664,
"step": 79500
},
{
"epoch": 2.0317460317460316,
"grad_norm": 0.4728703796863556,
"learning_rate": 6.455026455026455e-06,
"loss": 0.0664,
"step": 80000
},
{
"epoch": 2.0444444444444443,
"grad_norm": 0.42774897813796997,
"learning_rate": 6.370370370370371e-06,
"loss": 0.0661,
"step": 80500
},
{
"epoch": 2.057142857142857,
"grad_norm": 0.4025447368621826,
"learning_rate": 6.285714285714286e-06,
"loss": 0.0679,
"step": 81000
},
{
"epoch": 2.06984126984127,
"grad_norm": 0.41302409768104553,
"learning_rate": 6.201058201058202e-06,
"loss": 0.0662,
"step": 81500
},
{
"epoch": 2.0825396825396827,
"grad_norm": 0.4339478611946106,
"learning_rate": 6.116402116402117e-06,
"loss": 0.0662,
"step": 82000
},
{
"epoch": 2.0952380952380953,
"grad_norm": 0.38711288571357727,
"learning_rate": 6.031746031746032e-06,
"loss": 0.0677,
"step": 82500
},
{
"epoch": 2.107936507936508,
"grad_norm": 0.44815394282341003,
"learning_rate": 5.9470899470899475e-06,
"loss": 0.0674,
"step": 83000
},
{
"epoch": 2.1206349206349207,
"grad_norm": 0.4252176582813263,
"learning_rate": 5.862433862433863e-06,
"loss": 0.067,
"step": 83500
},
{
"epoch": 2.1333333333333333,
"grad_norm": 0.4019823670387268,
"learning_rate": 5.777777777777778e-06,
"loss": 0.0676,
"step": 84000
},
{
"epoch": 2.146031746031746,
"grad_norm": 0.37775805592536926,
"learning_rate": 5.693121693121694e-06,
"loss": 0.0671,
"step": 84500
},
{
"epoch": 2.1587301587301586,
"grad_norm": 0.5179104208946228,
"learning_rate": 5.6084656084656084e-06,
"loss": 0.0671,
"step": 85000
},
{
"epoch": 2.1714285714285713,
"grad_norm": 0.37160980701446533,
"learning_rate": 5.523809523809525e-06,
"loss": 0.0677,
"step": 85500
},
{
"epoch": 2.1841269841269844,
"grad_norm": 0.4610843360424042,
"learning_rate": 5.43915343915344e-06,
"loss": 0.0671,
"step": 86000
},
{
"epoch": 2.196825396825397,
"grad_norm": 0.4135109484195709,
"learning_rate": 5.3544973544973545e-06,
"loss": 0.0678,
"step": 86500
},
{
"epoch": 2.2095238095238097,
"grad_norm": 0.38079920411109924,
"learning_rate": 5.26984126984127e-06,
"loss": 0.0678,
"step": 87000
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.39888954162597656,
"learning_rate": 5.185185185185185e-06,
"loss": 0.0669,
"step": 87500
},
{
"epoch": 2.234920634920635,
"grad_norm": 0.37562116980552673,
"learning_rate": 5.1005291005291015e-06,
"loss": 0.0661,
"step": 88000
},
{
"epoch": 2.2476190476190476,
"grad_norm": 0.4394863247871399,
"learning_rate": 5.015873015873016e-06,
"loss": 0.0671,
"step": 88500
},
{
"epoch": 2.2603174603174603,
"grad_norm": 0.4748270511627197,
"learning_rate": 4.931216931216932e-06,
"loss": 0.067,
"step": 89000
},
{
"epoch": 2.273015873015873,
"grad_norm": 0.4593636095523834,
"learning_rate": 4.846560846560847e-06,
"loss": 0.067,
"step": 89500
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.3517415225505829,
"learning_rate": 4.761904761904762e-06,
"loss": 0.0669,
"step": 90000
},
{
"epoch": 2.2984126984126982,
"grad_norm": 0.40983742475509644,
"learning_rate": 4.677248677248677e-06,
"loss": 0.0681,
"step": 90500
},
{
"epoch": 2.311111111111111,
"grad_norm": 0.46570950746536255,
"learning_rate": 4.592592592592593e-06,
"loss": 0.0672,
"step": 91000
},
{
"epoch": 2.323809523809524,
"grad_norm": 0.4733307957649231,
"learning_rate": 4.5079365079365085e-06,
"loss": 0.0671,
"step": 91500
},
{
"epoch": 2.3365079365079366,
"grad_norm": 0.38432806730270386,
"learning_rate": 4.423280423280424e-06,
"loss": 0.0672,
"step": 92000
},
{
"epoch": 2.3492063492063493,
"grad_norm": 0.31346267461776733,
"learning_rate": 4.338624338624339e-06,
"loss": 0.066,
"step": 92500
},
{
"epoch": 2.361904761904762,
"grad_norm": 0.5612916350364685,
"learning_rate": 4.2539682539682546e-06,
"loss": 0.0666,
"step": 93000
},
{
"epoch": 2.3746031746031746,
"grad_norm": 0.3445761501789093,
"learning_rate": 4.169312169312169e-06,
"loss": 0.0675,
"step": 93500
},
{
"epoch": 2.3873015873015873,
"grad_norm": 0.41335174441337585,
"learning_rate": 4.084656084656085e-06,
"loss": 0.0676,
"step": 94000
},
{
"epoch": 2.4,
"grad_norm": 0.42691895365715027,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0669,
"step": 94500
},
{
"epoch": 2.4126984126984126,
"grad_norm": 0.44459807872772217,
"learning_rate": 3.9153439153439155e-06,
"loss": 0.0661,
"step": 95000
},
{
"epoch": 2.425396825396825,
"grad_norm": 0.39611610770225525,
"learning_rate": 3.830687830687831e-06,
"loss": 0.0665,
"step": 95500
},
{
"epoch": 2.4380952380952383,
"grad_norm": 0.41603508591651917,
"learning_rate": 3.7460317460317463e-06,
"loss": 0.067,
"step": 96000
},
{
"epoch": 2.450793650793651,
"grad_norm": 0.45685020089149475,
"learning_rate": 3.661375661375662e-06,
"loss": 0.0664,
"step": 96500
},
{
"epoch": 2.4634920634920636,
"grad_norm": 0.41426390409469604,
"learning_rate": 3.5767195767195772e-06,
"loss": 0.0665,
"step": 97000
},
{
"epoch": 2.4761904761904763,
"grad_norm": 0.4311801791191101,
"learning_rate": 3.492063492063492e-06,
"loss": 0.0673,
"step": 97500
},
{
"epoch": 2.488888888888889,
"grad_norm": 0.39366066455841064,
"learning_rate": 3.4074074074074077e-06,
"loss": 0.0676,
"step": 98000
},
{
"epoch": 2.5015873015873016,
"grad_norm": 0.46240171790122986,
"learning_rate": 3.322751322751323e-06,
"loss": 0.0674,
"step": 98500
},
{
"epoch": 2.5142857142857142,
"grad_norm": 0.45865318179130554,
"learning_rate": 3.2380952380952385e-06,
"loss": 0.0671,
"step": 99000
},
{
"epoch": 2.526984126984127,
"grad_norm": 0.38405075669288635,
"learning_rate": 3.1534391534391538e-06,
"loss": 0.0678,
"step": 99500
},
{
"epoch": 2.5396825396825395,
"grad_norm": 0.48667874932289124,
"learning_rate": 3.068783068783069e-06,
"loss": 0.0661,
"step": 100000
},
{
"epoch": 2.552380952380952,
"grad_norm": 0.3919212818145752,
"learning_rate": 2.984126984126984e-06,
"loss": 0.0654,
"step": 100500
},
{
"epoch": 2.565079365079365,
"grad_norm": 0.4081352651119232,
"learning_rate": 2.8994708994709e-06,
"loss": 0.0669,
"step": 101000
},
{
"epoch": 2.5777777777777775,
"grad_norm": 0.33449599146842957,
"learning_rate": 2.814814814814815e-06,
"loss": 0.0655,
"step": 101500
},
{
"epoch": 2.5904761904761906,
"grad_norm": 0.37508487701416016,
"learning_rate": 2.7301587301587303e-06,
"loss": 0.0659,
"step": 102000
},
{
"epoch": 2.6031746031746033,
"grad_norm": 0.43301156163215637,
"learning_rate": 2.6455026455026455e-06,
"loss": 0.0684,
"step": 102500
},
{
"epoch": 2.615873015873016,
"grad_norm": 0.31652727723121643,
"learning_rate": 2.560846560846561e-06,
"loss": 0.0674,
"step": 103000
},
{
"epoch": 2.6285714285714286,
"grad_norm": 0.38132810592651367,
"learning_rate": 2.4761904761904764e-06,
"loss": 0.0665,
"step": 103500
},
{
"epoch": 2.641269841269841,
"grad_norm": 0.4249517023563385,
"learning_rate": 2.3915343915343916e-06,
"loss": 0.0678,
"step": 104000
},
{
"epoch": 2.653968253968254,
"grad_norm": 0.42605915665626526,
"learning_rate": 2.3068783068783073e-06,
"loss": 0.0659,
"step": 104500
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.4002751111984253,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0665,
"step": 105000
},
{
"epoch": 2.6793650793650796,
"grad_norm": 0.5232521891593933,
"learning_rate": 2.1375661375661377e-06,
"loss": 0.0676,
"step": 105500
},
{
"epoch": 2.6920634920634923,
"grad_norm": 0.409422367811203,
"learning_rate": 2.0529100529100534e-06,
"loss": 0.0658,
"step": 106000
},
{
"epoch": 2.704761904761905,
"grad_norm": 0.3971617519855499,
"learning_rate": 1.968253968253968e-06,
"loss": 0.0655,
"step": 106500
},
{
"epoch": 2.7174603174603176,
"grad_norm": 0.35877570509910583,
"learning_rate": 1.8835978835978838e-06,
"loss": 0.0673,
"step": 107000
},
{
"epoch": 2.7301587301587302,
"grad_norm": 0.36749425530433655,
"learning_rate": 1.798941798941799e-06,
"loss": 0.0681,
"step": 107500
},
{
"epoch": 2.742857142857143,
"grad_norm": 0.3727457821369171,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0665,
"step": 108000
},
{
"epoch": 2.7555555555555555,
"grad_norm": 0.40977808833122253,
"learning_rate": 1.62962962962963e-06,
"loss": 0.0672,
"step": 108500
},
{
"epoch": 2.768253968253968,
"grad_norm": 0.4265407621860504,
"learning_rate": 1.5449735449735451e-06,
"loss": 0.0666,
"step": 109000
},
{
"epoch": 2.780952380952381,
"grad_norm": 0.3894596993923187,
"learning_rate": 1.4603174603174606e-06,
"loss": 0.0673,
"step": 109500
},
{
"epoch": 2.7936507936507935,
"grad_norm": 0.526606023311615,
"learning_rate": 1.3756613756613758e-06,
"loss": 0.0676,
"step": 110000
},
{
"epoch": 2.806349206349206,
"grad_norm": 0.2910812497138977,
"learning_rate": 1.2910052910052912e-06,
"loss": 0.0671,
"step": 110500
},
{
"epoch": 2.819047619047619,
"grad_norm": 0.3701234757900238,
"learning_rate": 1.2063492063492065e-06,
"loss": 0.0666,
"step": 111000
},
{
"epoch": 2.831746031746032,
"grad_norm": 0.3969452679157257,
"learning_rate": 1.1216931216931217e-06,
"loss": 0.0668,
"step": 111500
},
{
"epoch": 2.8444444444444446,
"grad_norm": 0.4415270686149597,
"learning_rate": 1.0370370370370371e-06,
"loss": 0.0661,
"step": 112000
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.3490103483200073,
"learning_rate": 9.523809523809525e-07,
"loss": 0.0658,
"step": 112500
},
{
"epoch": 2.86984126984127,
"grad_norm": 0.35733526945114136,
"learning_rate": 8.677248677248679e-07,
"loss": 0.0661,
"step": 113000
},
{
"epoch": 2.8825396825396825,
"grad_norm": 0.4992692172527313,
"learning_rate": 7.830687830687832e-07,
"loss": 0.068,
"step": 113500
},
{
"epoch": 2.895238095238095,
"grad_norm": 0.4047030508518219,
"learning_rate": 6.984126984126984e-07,
"loss": 0.0683,
"step": 114000
},
{
"epoch": 2.907936507936508,
"grad_norm": 0.4468993544578552,
"learning_rate": 6.137566137566138e-07,
"loss": 0.0664,
"step": 114500
},
{
"epoch": 2.9206349206349205,
"grad_norm": 0.41356751322746277,
"learning_rate": 5.291005291005291e-07,
"loss": 0.067,
"step": 115000
},
{
"epoch": 2.9333333333333336,
"grad_norm": 0.4459340572357178,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0671,
"step": 115500
},
{
"epoch": 2.9460317460317462,
"grad_norm": 0.42610964179039,
"learning_rate": 3.597883597883598e-07,
"loss": 0.0664,
"step": 116000
},
{
"epoch": 2.958730158730159,
"grad_norm": 0.5059521794319153,
"learning_rate": 2.7513227513227515e-07,
"loss": 0.0658,
"step": 116500
},
{
"epoch": 2.9714285714285715,
"grad_norm": 0.3404170572757721,
"learning_rate": 1.904761904761905e-07,
"loss": 0.0667,
"step": 117000
},
{
"epoch": 2.984126984126984,
"grad_norm": 0.4388870894908905,
"learning_rate": 1.0582010582010582e-07,
"loss": 0.0658,
"step": 117500
},
{
"epoch": 2.996825396825397,
"grad_norm": 0.39170539379119873,
"learning_rate": 2.1164021164021167e-08,
"loss": 0.0665,
"step": 118000
}
],
"logging_steps": 500,
"max_steps": 118125,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4389780414464e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}