|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 63.298904538341155, |
|
"eval_steps": 1000, |
|
"global_step": 5056, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.12519561815336464, |
|
"grad_norm": 27.375, |
|
"learning_rate": 1.9762845849802374e-07, |
|
"loss": 2.599, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.25039123630672927, |
|
"grad_norm": 24.625, |
|
"learning_rate": 3.9525691699604747e-07, |
|
"loss": 2.5019, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3755868544600939, |
|
"grad_norm": 23.875, |
|
"learning_rate": 5.928853754940712e-07, |
|
"loss": 2.4051, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5007824726134585, |
|
"grad_norm": 15.9375, |
|
"learning_rate": 7.905138339920949e-07, |
|
"loss": 2.4794, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6259780907668232, |
|
"grad_norm": 37.25, |
|
"learning_rate": 9.881422924901187e-07, |
|
"loss": 2.3233, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7511737089201878, |
|
"grad_norm": 26.375, |
|
"learning_rate": 1.1857707509881424e-06, |
|
"loss": 2.2146, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8763693270735524, |
|
"grad_norm": 22.875, |
|
"learning_rate": 1.3833992094861662e-06, |
|
"loss": 2.259, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.001564945226917, |
|
"grad_norm": 20.625, |
|
"learning_rate": 1.5810276679841899e-06, |
|
"loss": 2.1342, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.1267605633802817, |
|
"grad_norm": 17.375, |
|
"learning_rate": 1.7786561264822136e-06, |
|
"loss": 2.1221, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.2519561815336462, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.9762845849802374e-06, |
|
"loss": 2.0237, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.3771517996870108, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.173913043478261e-06, |
|
"loss": 1.9913, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.5023474178403755, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.371541501976285e-06, |
|
"loss": 2.0106, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.6275430359937402, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 2.5691699604743086e-06, |
|
"loss": 1.9704, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.7527386541471048, |
|
"grad_norm": 5.0, |
|
"learning_rate": 2.7667984189723323e-06, |
|
"loss": 1.9563, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.8779342723004695, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 2.964426877470356e-06, |
|
"loss": 1.939, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.003129890453834, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 3.1620553359683798e-06, |
|
"loss": 1.9485, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.128325508607199, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.3596837944664035e-06, |
|
"loss": 1.8826, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.2535211267605635, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.5573122529644273e-06, |
|
"loss": 1.7508, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.378716744913928, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 3.754940711462451e-06, |
|
"loss": 1.7644, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.5039123630672924, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 3.952569169960475e-06, |
|
"loss": 1.9122, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.629107981220657, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 4.150197628458498e-06, |
|
"loss": 1.7887, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.7543035993740217, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 4.347826086956522e-06, |
|
"loss": 1.8224, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.8794992175273864, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 1.8029, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.004694835680751, |
|
"grad_norm": 5.25, |
|
"learning_rate": 4.74308300395257e-06, |
|
"loss": 1.8384, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.1298904538341157, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 4.940711462450593e-06, |
|
"loss": 1.6869, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.2550860719874803, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 5.138339920948617e-06, |
|
"loss": 1.6394, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.380281690140845, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 5.335968379446641e-06, |
|
"loss": 1.6657, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.5054773082942097, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 5.533596837944665e-06, |
|
"loss": 1.6843, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.6306729264475743, |
|
"grad_norm": 4.125, |
|
"learning_rate": 5.731225296442689e-06, |
|
"loss": 1.6496, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.755868544600939, |
|
"grad_norm": 3.875, |
|
"learning_rate": 5.928853754940712e-06, |
|
"loss": 1.7035, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.8810641627543037, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 6.126482213438736e-06, |
|
"loss": 1.6954, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.006259780907668, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 6.3241106719367596e-06, |
|
"loss": 1.7039, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.131455399061033, |
|
"grad_norm": 4.125, |
|
"learning_rate": 6.521739130434783e-06, |
|
"loss": 1.5767, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.256651017214398, |
|
"grad_norm": 5.0, |
|
"learning_rate": 6.719367588932807e-06, |
|
"loss": 1.5438, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.381846635367762, |
|
"grad_norm": 3.125, |
|
"learning_rate": 6.91699604743083e-06, |
|
"loss": 1.4615, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.507042253521127, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 7.1146245059288545e-06, |
|
"loss": 1.5243, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.632237871674492, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 7.312252964426878e-06, |
|
"loss": 1.5667, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.757433489827856, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 7.509881422924902e-06, |
|
"loss": 1.548, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.882629107981221, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 7.707509881422925e-06, |
|
"loss": 1.5454, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.007824726134586, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 7.90513833992095e-06, |
|
"loss": 1.526, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.13302034428795, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 8.102766798418974e-06, |
|
"loss": 1.4051, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 5.258215962441315, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 8.300395256916996e-06, |
|
"loss": 1.4057, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 5.383411580594679, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 8.49802371541502e-06, |
|
"loss": 1.3852, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.508607198748043, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 8.695652173913044e-06, |
|
"loss": 1.3708, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.633802816901408, |
|
"grad_norm": 3.125, |
|
"learning_rate": 8.893280632411067e-06, |
|
"loss": 1.3361, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.758998435054773, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 1.4118, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.884194053208137, |
|
"grad_norm": 3.5, |
|
"learning_rate": 9.288537549407115e-06, |
|
"loss": 1.3362, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 6.009389671361502, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 9.48616600790514e-06, |
|
"loss": 1.477, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 6.134585289514867, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 9.683794466403162e-06, |
|
"loss": 1.2172, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 6.259780907668231, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 9.881422924901186e-06, |
|
"loss": 1.1359, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 6.384976525821596, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 9.999980930615864e-06, |
|
"loss": 1.2047, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 6.510172143974961, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 9.999766401714795e-06, |
|
"loss": 1.1727, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 6.635367762128325, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 9.999313517443876e-06, |
|
"loss": 1.3046, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.76056338028169, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 9.998622299393598e-06, |
|
"loss": 1.2101, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.885758998435055, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.997692780516608e-06, |
|
"loss": 1.1944, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 7.010954616588419, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 9.996525005126135e-06, |
|
"loss": 1.2125, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 7.136150234741784, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 9.995119028893888e-06, |
|
"loss": 1.0638, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 7.261345852895149, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 9.993474918847401e-06, |
|
"loss": 1.0207, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 7.386541471048513, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 9.991592753366822e-06, |
|
"loss": 1.0174, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 7.511737089201878, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 9.989472622181194e-06, |
|
"loss": 1.0496, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 7.636932707355243, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 9.987114626364172e-06, |
|
"loss": 0.9966, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 7.762128325508607, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 9.984518878329197e-06, |
|
"loss": 0.9866, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 7.887323943661972, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.98168550182415e-06, |
|
"loss": 1.0372, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 8.012519561815337, |
|
"grad_norm": 3.125, |
|
"learning_rate": 9.978614631925442e-06, |
|
"loss": 0.9753, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 8.137715179968701, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 9.975306415031577e-06, |
|
"loss": 0.8432, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 8.262910798122066, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 9.97176100885618e-06, |
|
"loss": 0.8252, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 8.38810641627543, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 9.967978582420463e-06, |
|
"loss": 0.8158, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 8.513302034428795, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 9.963959316045185e-06, |
|
"loss": 0.833, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 8.63849765258216, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 9.959703401342037e-06, |
|
"loss": 0.8328, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 8.763693270735525, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 9.955211041204529e-06, |
|
"loss": 0.7759, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 9.950482449798295e-06, |
|
"loss": 0.8572, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 9.014084507042254, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 9.9455178525509e-06, |
|
"loss": 0.7794, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 9.139280125195619, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 9.940317486141084e-06, |
|
"loss": 0.6747, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 9.264475743348983, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 9.934881598487478e-06, |
|
"loss": 0.6434, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 9.389671361502348, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 9.929210448736797e-06, |
|
"loss": 0.6149, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 9.514866979655713, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 9.923304307251467e-06, |
|
"loss": 0.6818, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 9.640062597809077, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.917163455596753e-06, |
|
"loss": 0.6376, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 9.765258215962442, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 9.910788186527325e-06, |
|
"loss": 0.6487, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 9.890453834115807, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.904178803973306e-06, |
|
"loss": 0.6511, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 10.015649452269171, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 9.89733562302578e-06, |
|
"loss": 0.6073, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 10.140845070422536, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 9.890258969921777e-06, |
|
"loss": 0.4669, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 10.2660406885759, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.882949182028709e-06, |
|
"loss": 0.5051, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 10.391236306729265, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 9.8754066078283e-06, |
|
"loss": 0.4987, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 10.51643192488263, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 9.867631606899957e-06, |
|
"loss": 0.5019, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 10.641627543035995, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 9.859624549903646e-06, |
|
"loss": 0.5047, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 10.766823161189357, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 9.851385818562204e-06, |
|
"loss": 0.5325, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 10.892018779342724, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 9.842915805643156e-06, |
|
"loss": 0.5332, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 11.017214397496087, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 9.834214914939977e-06, |
|
"loss": 0.4857, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 11.142410015649451, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 9.82528356125285e-06, |
|
"loss": 0.3812, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 11.267605633802816, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 9.816122170368891e-06, |
|
"loss": 0.3738, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 11.39280125195618, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 9.806731179041849e-06, |
|
"loss": 0.4202, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 11.517996870109545, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 9.797111034971278e-06, |
|
"loss": 0.3592, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 11.64319248826291, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 9.787262196781208e-06, |
|
"loss": 0.3406, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 11.768388106416275, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 9.777185133998268e-06, |
|
"loss": 0.3592, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 11.89358372456964, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 9.76688032702931e-06, |
|
"loss": 0.3725, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 12.018779342723004, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 9.756348267138497e-06, |
|
"loss": 0.4022, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 12.143974960876369, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 9.745589456423897e-06, |
|
"loss": 0.2901, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 12.269170579029733, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 9.734604407793529e-06, |
|
"loss": 0.3043, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 12.394366197183098, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.72339364494093e-06, |
|
"loss": 0.2778, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 12.519561815336463, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 9.711957702320176e-06, |
|
"loss": 0.2795, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 12.519561815336463, |
|
"eval_loss": 2.0127437114715576, |
|
"eval_runtime": 3.1921, |
|
"eval_samples_per_second": 22.556, |
|
"eval_steps_per_second": 22.556, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 12.644757433489827, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 9.7002971251204e-06, |
|
"loss": 0.2932, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 12.769953051643192, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 9.688412469239812e-06, |
|
"loss": 0.3021, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 12.895148669796557, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.676304301259196e-06, |
|
"loss": 0.2861, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 13.020344287949921, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 9.663973198414888e-06, |
|
"loss": 0.2959, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 13.145539906103286, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 9.651419748571272e-06, |
|
"loss": 0.2115, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 13.27073552425665, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 9.638644550192741e-06, |
|
"loss": 0.2322, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 13.395931142410015, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 9.625648212315177e-06, |
|
"loss": 0.2443, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 13.52112676056338, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.612431354516912e-06, |
|
"loss": 0.2237, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 13.646322378716745, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 9.598994606889187e-06, |
|
"loss": 0.2261, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 13.77151799687011, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 9.585338610006122e-06, |
|
"loss": 0.2163, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 13.896713615023474, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 9.571464014894168e-06, |
|
"loss": 0.2223, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 14.021909233176839, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 9.557371483001078e-06, |
|
"loss": 0.2216, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 14.147104851330203, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 9.543061686164374e-06, |
|
"loss": 0.1752, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 14.272300469483568, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 9.528535306579306e-06, |
|
"loss": 0.1694, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 14.397496087636933, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.513793036766345e-06, |
|
"loss": 0.1597, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 14.522691705790297, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 9.498835579538164e-06, |
|
"loss": 0.1627, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 14.647887323943662, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 9.483663647966124e-06, |
|
"loss": 0.187, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 14.773082942097027, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 9.468277965346292e-06, |
|
"loss": 0.168, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 14.898278560250391, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 9.452679265164951e-06, |
|
"loss": 0.1605, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 15.023474178403756, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 9.43686829106363e-06, |
|
"loss": 0.1667, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 15.14866979655712, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 9.42084579680366e-06, |
|
"loss": 0.1442, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 15.273865414710485, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 9.404612546230244e-06, |
|
"loss": 0.1222, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 15.39906103286385, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.38816931323602e-06, |
|
"loss": 0.1281, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 15.524256651017215, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 9.371516881724192e-06, |
|
"loss": 0.1207, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 15.64945226917058, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 9.35465604557114e-06, |
|
"loss": 0.1191, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 15.774647887323944, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 9.337587608588588e-06, |
|
"loss": 0.1207, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 15.899843505477309, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.320312384485274e-06, |
|
"loss": 0.1312, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 16.025039123630673, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 9.30283119682816e-06, |
|
"loss": 0.1173, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 16.150234741784036, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.285144879003173e-06, |
|
"loss": 0.0862, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 16.275430359937403, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 9.267254274175467e-06, |
|
"loss": 0.0801, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 16.400625978090765, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.24916023524924e-06, |
|
"loss": 0.0852, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 16.525821596244132, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 9.23086362482706e-06, |
|
"loss": 0.113, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 16.651017214397495, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 9.212365315168743e-06, |
|
"loss": 0.0951, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 16.77621283255086, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.193666188149782e-06, |
|
"loss": 0.0917, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 16.901408450704224, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.174767135219291e-06, |
|
"loss": 0.0849, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 17.02660406885759, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.155669057357515e-06, |
|
"loss": 0.0907, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 17.151799687010953, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 9.136372865032871e-06, |
|
"loss": 0.0577, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 17.27699530516432, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 9.116879478158552e-06, |
|
"loss": 0.0689, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 17.402190923317683, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 9.09718982604866e-06, |
|
"loss": 0.0723, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 17.52738654147105, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.077304847373913e-06, |
|
"loss": 0.0639, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 17.652582159624412, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 9.057225490116887e-06, |
|
"loss": 0.0594, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 17.77777777777778, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.036952711526834e-06, |
|
"loss": 0.0752, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 17.90297339593114, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.016487478074032e-06, |
|
"loss": 0.0627, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 18.028169014084508, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 8.995830765403721e-06, |
|
"loss": 0.0656, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 18.15336463223787, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 8.974983558289586e-06, |
|
"loss": 0.0413, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 18.278560250391237, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 8.953946850586813e-06, |
|
"loss": 0.0448, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 18.4037558685446, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 8.932721645184707e-06, |
|
"loss": 0.0438, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 18.528951486697967, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 8.911308953958875e-06, |
|
"loss": 0.0477, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 18.65414710485133, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 8.889709797723002e-06, |
|
"loss": 0.0478, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 18.779342723004696, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 8.867925206180166e-06, |
|
"loss": 0.0505, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 18.90453834115806, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 8.845956217873763e-06, |
|
"loss": 0.0463, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 19.029733959311425, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 8.823803880137993e-06, |
|
"loss": 0.0517, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 19.154929577464788, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 8.801469249047923e-06, |
|
"loss": 0.0342, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 19.280125195618155, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 8.77895338936915e-06, |
|
"loss": 0.0287, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 19.405320813771517, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 8.756257374507036e-06, |
|
"loss": 0.0333, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 19.530516431924884, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 8.733382286455536e-06, |
|
"loss": 0.0313, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 19.655712050078247, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 8.710329215745612e-06, |
|
"loss": 0.0274, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 19.780907668231613, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 8.687099261393249e-06, |
|
"loss": 0.0351, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 19.906103286384976, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 8.663693530847056e-06, |
|
"loss": 0.0331, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 20.031298904538342, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 8.640113139935484e-06, |
|
"loss": 0.0275, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 20.156494522691705, |
|
"grad_norm": 13.75, |
|
"learning_rate": 8.616359212813607e-06, |
|
"loss": 0.0466, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 20.281690140845072, |
|
"grad_norm": 16.375, |
|
"learning_rate": 8.592432881909548e-06, |
|
"loss": 0.0779, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 20.406885758998435, |
|
"grad_norm": 16.75, |
|
"learning_rate": 8.568335287870488e-06, |
|
"loss": 0.116, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 20.5320813771518, |
|
"grad_norm": 11.25, |
|
"learning_rate": 8.544067579508292e-06, |
|
"loss": 0.1198, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 20.657276995305164, |
|
"grad_norm": 38.5, |
|
"learning_rate": 8.519630913744726e-06, |
|
"loss": 0.1259, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 20.78247261345853, |
|
"grad_norm": 26.875, |
|
"learning_rate": 8.495026455556318e-06, |
|
"loss": 0.1304, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 20.907668231611893, |
|
"grad_norm": 30.375, |
|
"learning_rate": 8.470255377918821e-06, |
|
"loss": 0.1338, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 21.03286384976526, |
|
"grad_norm": 31.75, |
|
"learning_rate": 8.445318861751278e-06, |
|
"loss": 0.1232, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 21.158059467918623, |
|
"grad_norm": 28.25, |
|
"learning_rate": 8.420218095859735e-06, |
|
"loss": 0.154, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 21.28325508607199, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 8.394954276880568e-06, |
|
"loss": 0.1363, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 21.408450704225352, |
|
"grad_norm": 8.25, |
|
"learning_rate": 8.36952860922343e-06, |
|
"loss": 0.13, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 21.53364632237872, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 8.343942305013833e-06, |
|
"loss": 0.1398, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 21.65884194053208, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 8.318196584035367e-06, |
|
"loss": 0.1428, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 21.784037558685448, |
|
"grad_norm": 8.5, |
|
"learning_rate": 8.292292673671542e-06, |
|
"loss": 0.1451, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 21.90923317683881, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 8.266231808847284e-06, |
|
"loss": 0.1157, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 22.034428794992174, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 8.24001523197005e-06, |
|
"loss": 0.1287, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 22.15962441314554, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 8.213644192870609e-06, |
|
"loss": 0.1034, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 22.284820031298903, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 8.18711994874345e-06, |
|
"loss": 0.1126, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 22.41001564945227, |
|
"grad_norm": 9.375, |
|
"learning_rate": 8.160443764086855e-06, |
|
"loss": 0.1295, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 22.535211267605632, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 8.13361691064261e-06, |
|
"loss": 0.1149, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 22.660406885759, |
|
"grad_norm": 7.875, |
|
"learning_rate": 8.10664066733538e-06, |
|
"loss": 0.1125, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 22.78560250391236, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 8.079516320211746e-06, |
|
"loss": 0.1538, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 22.910798122065728, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 8.052245162378871e-06, |
|
"loss": 0.1213, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 23.03599374021909, |
|
"grad_norm": 5.125, |
|
"learning_rate": 8.024828493942882e-06, |
|
"loss": 0.1065, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 23.161189358372457, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 7.997267621946871e-06, |
|
"loss": 0.0972, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 23.28638497652582, |
|
"grad_norm": 7.71875, |
|
"learning_rate": 7.96956386030859e-06, |
|
"loss": 0.0994, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 23.411580594679187, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 7.94171852975782e-06, |
|
"loss": 0.1061, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 23.53677621283255, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 7.913732957773385e-06, |
|
"loss": 0.1005, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 23.661971830985916, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 7.885608478519894e-06, |
|
"loss": 0.0963, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 23.78716744913928, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 7.857346432784116e-06, |
|
"loss": 0.1074, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 23.912363067292645, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 7.828948167911073e-06, |
|
"loss": 0.0948, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 24.037558685446008, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 7.800415037739802e-06, |
|
"loss": 0.1132, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 24.162754303599375, |
|
"grad_norm": 6.5, |
|
"learning_rate": 7.771748402538808e-06, |
|
"loss": 0.0783, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 24.287949921752737, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 7.742949628941232e-06, |
|
"loss": 0.0743, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 24.413145539906104, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 7.714020089879683e-06, |
|
"loss": 0.092, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 24.538341158059467, |
|
"grad_norm": 5.0, |
|
"learning_rate": 7.684961164520792e-06, |
|
"loss": 0.0822, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 24.663536776212833, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 7.655774238199459e-06, |
|
"loss": 0.0989, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 24.788732394366196, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 7.6264607023528135e-06, |
|
"loss": 0.1003, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 24.913928012519563, |
|
"grad_norm": 5.25, |
|
"learning_rate": 7.597021954453887e-06, |
|
"loss": 0.089, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 25.039123630672925, |
|
"grad_norm": 4.375, |
|
"learning_rate": 7.567459397944972e-06, |
|
"loss": 0.0784, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 25.039123630672925, |
|
"eval_loss": 2.359957456588745, |
|
"eval_runtime": 3.1779, |
|
"eval_samples_per_second": 22.657, |
|
"eval_steps_per_second": 22.657, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 25.164319248826292, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 7.537774442170731e-06, |
|
"loss": 0.0569, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 25.289514866979655, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 7.507968502311005e-06, |
|
"loss": 0.0682, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 25.41471048513302, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 7.478042999313342e-06, |
|
"loss": 0.0679, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 25.539906103286384, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 7.447999359825263e-06, |
|
"loss": 0.0564, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 25.66510172143975, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 7.417839016126242e-06, |
|
"loss": 0.0661, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 25.790297339593113, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 7.387563406059433e-06, |
|
"loss": 0.0728, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 25.91549295774648, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 7.357173972963112e-06, |
|
"loss": 0.0758, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 26.040688575899843, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 7.32667216560188e-06, |
|
"loss": 0.0679, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 26.16588419405321, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 7.296059438097589e-06, |
|
"loss": 0.0469, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 26.291079812206572, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 7.265337249860015e-06, |
|
"loss": 0.0479, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 26.41627543035994, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 7.234507065517297e-06, |
|
"loss": 0.0499, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 26.5414710485133, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 7.2035703548461e-06, |
|
"loss": 0.0529, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 7.17252859270155e-06, |
|
"loss": 0.0557, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 26.79186228482003, |
|
"grad_norm": 2.625, |
|
"learning_rate": 7.141383258946926e-06, |
|
"loss": 0.0492, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 26.917057902973397, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 7.110135838383105e-06, |
|
"loss": 0.0541, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 27.04225352112676, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 7.078787820677784e-06, |
|
"loss": 0.0528, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 27.167449139280127, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 7.047340700294454e-06, |
|
"loss": 0.0453, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 27.29264475743349, |
|
"grad_norm": 3.5, |
|
"learning_rate": 7.015795976421156e-06, |
|
"loss": 0.036, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 27.417840375586856, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 6.984155152899016e-06, |
|
"loss": 0.0427, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 27.54303599374022, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 6.952419738150546e-06, |
|
"loss": 0.0424, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 27.668231611893585, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 6.9205912451077305e-06, |
|
"loss": 0.0336, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 27.793427230046948, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 6.88867119113991e-06, |
|
"loss": 0.0462, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 27.918622848200314, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 6.856661097981433e-06, |
|
"loss": 0.0492, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 28.043818466353677, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 6.824562491659112e-06, |
|
"loss": 0.0363, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 28.169014084507044, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 6.792376902419478e-06, |
|
"loss": 0.03, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 28.294209702660407, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 6.7601058646558195e-06, |
|
"loss": 0.0331, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 28.419405320813773, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 6.7277509168350445e-06, |
|
"loss": 0.0364, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 28.544600938967136, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 6.695313601424326e-06, |
|
"loss": 0.0296, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 28.669796557120502, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 6.662795464817573e-06, |
|
"loss": 0.0323, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 28.794992175273865, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 6.63019805726171e-06, |
|
"loss": 0.0312, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 28.920187793427232, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 6.597522932782765e-06, |
|
"loss": 0.0318, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 29.045383411580595, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 6.564771649111792e-06, |
|
"loss": 0.0269, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 29.170579029733958, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 6.531945767610604e-06, |
|
"loss": 0.0205, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 29.295774647887324, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 6.499046853197338e-06, |
|
"loss": 0.024, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 29.420970266040687, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 6.46607647427185e-06, |
|
"loss": 0.024, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 29.546165884194053, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 6.4330362026409506e-06, |
|
"loss": 0.0262, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 29.671361502347416, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 6.3999276134434595e-06, |
|
"loss": 0.0252, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 29.796557120500783, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 6.366752285075125e-06, |
|
"loss": 0.0224, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 29.921752738654146, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 6.33351179911337e-06, |
|
"loss": 0.0204, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 30.046948356807512, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 6.300207740241895e-06, |
|
"loss": 0.0195, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 30.172143974960875, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 6.266841696175132e-06, |
|
"loss": 0.0164, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 30.29733959311424, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 6.233415257582551e-06, |
|
"loss": 0.0167, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 30.422535211267604, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 6.19993001801283e-06, |
|
"loss": 0.0149, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 30.54773082942097, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 6.166387573817881e-06, |
|
"loss": 0.0178, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 30.672926447574334, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 6.132789524076751e-06, |
|
"loss": 0.0181, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 30.7981220657277, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 6.0991374705193866e-06, |
|
"loss": 0.0147, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 30.923317683881063, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 6.065433017450276e-06, |
|
"loss": 0.0182, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 31.04851330203443, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 6.031677771671962e-06, |
|
"loss": 0.0149, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 31.173708920187792, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 5.997873342408446e-06, |
|
"loss": 0.0135, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 31.29890453834116, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 5.964021341228468e-06, |
|
"loss": 0.0138, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 31.42410015649452, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 5.930123381968677e-06, |
|
"loss": 0.0138, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 31.549295774647888, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 5.8961810806567e-06, |
|
"loss": 0.0158, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 31.67449139280125, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 5.862196055434089e-06, |
|
"loss": 0.0116, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 31.799687010954617, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 5.828169926479191e-06, |
|
"loss": 0.0124, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 31.92488262910798, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 5.794104315929904e-06, |
|
"loss": 0.0144, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 32.05007824726135, |
|
"grad_norm": 0.25, |
|
"learning_rate": 5.760000847806337e-06, |
|
"loss": 0.0127, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 32.17527386541471, |
|
"grad_norm": 0.12060546875, |
|
"learning_rate": 5.725861147933403e-06, |
|
"loss": 0.0116, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 32.30046948356807, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 5.6916868438632976e-06, |
|
"loss": 0.0133, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 32.42566510172144, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 5.657479564797914e-06, |
|
"loss": 0.0091, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 32.550860719874805, |
|
"grad_norm": 0.1123046875, |
|
"learning_rate": 5.623240941511173e-06, |
|
"loss": 0.01, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 32.67605633802817, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 5.588972606271276e-06, |
|
"loss": 0.0091, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 32.80125195618153, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 5.554676192762891e-06, |
|
"loss": 0.0111, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 32.9264475743349, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 5.520353336009274e-06, |
|
"loss": 0.0102, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 33.051643192488264, |
|
"grad_norm": 0.0849609375, |
|
"learning_rate": 5.48600567229431e-06, |
|
"loss": 0.0084, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 33.17683881064163, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 5.451634839084523e-06, |
|
"loss": 0.009, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 33.30203442879499, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.417242474950999e-06, |
|
"loss": 0.0083, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 33.42723004694836, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 5.382830219491271e-06, |
|
"loss": 0.0091, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 33.55242566510172, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 5.348399713251163e-06, |
|
"loss": 0.0115, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 33.677621283255085, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 5.3139525976465675e-06, |
|
"loss": 0.0089, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 33.80281690140845, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 5.279490514885207e-06, |
|
"loss": 0.0075, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 33.92801251956182, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 5.245015107888335e-06, |
|
"loss": 0.0095, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 34.05320813771518, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 5.210528020212412e-06, |
|
"loss": 0.0081, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 34.178403755868544, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 5.176030895970761e-06, |
|
"loss": 0.007, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 34.30359937402191, |
|
"grad_norm": 0.087890625, |
|
"learning_rate": 5.141525379755178e-06, |
|
"loss": 0.0053, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 34.42879499217528, |
|
"grad_norm": 0.09326171875, |
|
"learning_rate": 5.10701311655753e-06, |
|
"loss": 0.0065, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 34.55399061032864, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 5.072495751691338e-06, |
|
"loss": 0.0072, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 34.679186228482, |
|
"grad_norm": 0.10546875, |
|
"learning_rate": 5.037974930713338e-06, |
|
"loss": 0.0058, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 34.804381846635366, |
|
"grad_norm": 0.0859375, |
|
"learning_rate": 5.003452299345024e-06, |
|
"loss": 0.0055, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 34.929577464788736, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 4.968929503394206e-06, |
|
"loss": 0.0061, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 35.0547730829421, |
|
"grad_norm": 0.056396484375, |
|
"learning_rate": 4.934408188676531e-06, |
|
"loss": 0.0062, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 35.17996870109546, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 4.8998900009370366e-06, |
|
"loss": 0.0049, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 35.305164319248824, |
|
"grad_norm": 0.06298828125, |
|
"learning_rate": 4.865376585771687e-06, |
|
"loss": 0.0043, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 35.430359937402194, |
|
"grad_norm": 0.056640625, |
|
"learning_rate": 4.830869588548918e-06, |
|
"loss": 0.006, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 35.55555555555556, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 4.796370654331205e-06, |
|
"loss": 0.0055, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 35.68075117370892, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 4.7618814277966325e-06, |
|
"loss": 0.0052, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 35.80594679186228, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 4.727403553160484e-06, |
|
"loss": 0.0057, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 35.93114241001565, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 4.692938674096867e-06, |
|
"loss": 0.005, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 36.056338028169016, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 4.658488433660341e-06, |
|
"loss": 0.0047, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 36.18153364632238, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 4.624054474207597e-06, |
|
"loss": 0.0048, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 36.30672926447574, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 4.589638437319157e-06, |
|
"loss": 0.0054, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 36.431924882629104, |
|
"grad_norm": 0.11962890625, |
|
"learning_rate": 4.555241963721118e-06, |
|
"loss": 0.0041, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 36.557120500782474, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 4.5208666932069255e-06, |
|
"loss": 0.0043, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 36.68231611893584, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 4.486514264559206e-06, |
|
"loss": 0.0045, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 36.8075117370892, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 4.452186315471641e-06, |
|
"loss": 0.0039, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 36.93270735524256, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 4.417884482470887e-06, |
|
"loss": 0.0042, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 37.05790297339593, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 4.383610400838561e-06, |
|
"loss": 0.0039, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 37.183098591549296, |
|
"grad_norm": 0.034423828125, |
|
"learning_rate": 4.349365704533285e-06, |
|
"loss": 0.0038, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 37.30829420970266, |
|
"grad_norm": 0.03173828125, |
|
"learning_rate": 4.31515202611278e-06, |
|
"loss": 0.0037, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 37.43348982785602, |
|
"grad_norm": 0.033447265625, |
|
"learning_rate": 4.2809709966560435e-06, |
|
"loss": 0.0031, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 37.55868544600939, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 4.246824245685591e-06, |
|
"loss": 0.0037, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 37.55868544600939, |
|
"eval_loss": 2.755915641784668, |
|
"eval_runtime": 3.2484, |
|
"eval_samples_per_second": 22.165, |
|
"eval_steps_per_second": 22.165, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 37.683881064162755, |
|
"grad_norm": 0.031005859375, |
|
"learning_rate": 4.2127134010897695e-06, |
|
"loss": 0.003, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 37.80907668231612, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 4.178640089045147e-06, |
|
"loss": 0.0031, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 37.93427230046948, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 4.144605933938993e-06, |
|
"loss": 0.0032, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 38.05946791862285, |
|
"grad_norm": 0.02783203125, |
|
"learning_rate": 4.1106125582918385e-06, |
|
"loss": 0.0032, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 38.18466353677621, |
|
"grad_norm": 0.03125, |
|
"learning_rate": 4.07666158268012e-06, |
|
"loss": 0.003, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 38.309859154929576, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 4.042754625658929e-06, |
|
"loss": 0.0028, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 38.43505477308294, |
|
"grad_norm": 0.0303955078125, |
|
"learning_rate": 4.008893303684837e-06, |
|
"loss": 0.0035, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 38.56025039123631, |
|
"grad_norm": 0.05908203125, |
|
"learning_rate": 3.975079231038848e-06, |
|
"loss": 0.0025, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 38.68544600938967, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 3.941314019749438e-06, |
|
"loss": 0.0036, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 38.810641627543035, |
|
"grad_norm": 0.033203125, |
|
"learning_rate": 3.9075992795156916e-06, |
|
"loss": 0.003, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 38.9358372456964, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 3.873936617630578e-06, |
|
"loss": 0.0028, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 39.06103286384977, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 3.840327638904321e-06, |
|
"loss": 0.0026, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 39.18622848200313, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 3.8067739455878844e-06, |
|
"loss": 0.0026, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 39.31142410015649, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 3.7732771372965987e-06, |
|
"loss": 0.0024, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 39.436619718309856, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 3.7398388109338984e-06, |
|
"loss": 0.0026, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 39.561815336463226, |
|
"grad_norm": 0.02783203125, |
|
"learning_rate": 3.7064605606151866e-06, |
|
"loss": 0.0026, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 39.68701095461659, |
|
"grad_norm": 0.0703125, |
|
"learning_rate": 3.6731439775918467e-06, |
|
"loss": 0.0024, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 39.81220657276995, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 3.639890650175379e-06, |
|
"loss": 0.0027, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 39.937402190923315, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 3.6067021636616793e-06, |
|
"loss": 0.002, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 40.062597809076685, |
|
"grad_norm": 0.1357421875, |
|
"learning_rate": 3.5735801002554615e-06, |
|
"loss": 0.002, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 40.18779342723005, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 3.540526038994834e-06, |
|
"loss": 0.0018, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 40.31298904538341, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 3.5075415556760157e-06, |
|
"loss": 0.0025, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 40.438184663536774, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 3.4746282227782164e-06, |
|
"loss": 0.0037, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 40.563380281690144, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 3.4417876093886705e-06, |
|
"loss": 0.0042, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 40.68857589984351, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 3.409021281127835e-06, |
|
"loss": 0.0085, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 40.81377151799687, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 3.3763308000747453e-06, |
|
"loss": 0.0072, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 40.93896713615023, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 3.3437177246925547e-06, |
|
"loss": 0.0108, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 41.0641627543036, |
|
"grad_norm": 11.125, |
|
"learning_rate": 3.31118360975423e-06, |
|
"loss": 0.0087, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 41.189358372456965, |
|
"grad_norm": 15.875, |
|
"learning_rate": 3.278730006268432e-06, |
|
"loss": 0.0112, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 41.31455399061033, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 3.246358461405579e-06, |
|
"loss": 0.0105, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 41.43974960876369, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 3.2140705184240783e-06, |
|
"loss": 0.0098, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 41.56494522691706, |
|
"grad_norm": 3.125, |
|
"learning_rate": 3.181867716596765e-06, |
|
"loss": 0.0071, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 41.690140845070424, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 3.1497515911375113e-06, |
|
"loss": 0.0124, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 41.81533646322379, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.11772367312804e-06, |
|
"loss": 0.0075, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 41.94053208137715, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 3.085785489444936e-06, |
|
"loss": 0.0093, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 42.06572769953052, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.05393856268685e-06, |
|
"loss": 0.0073, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 42.19092331768388, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 3.0221844111019166e-06, |
|
"loss": 0.0039, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 42.316118935837245, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.99052454851537e-06, |
|
"loss": 0.0065, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 42.44131455399061, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 2.9589604842573762e-06, |
|
"loss": 0.0064, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 42.56651017214398, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.927493723091078e-06, |
|
"loss": 0.0058, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 42.69170579029734, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 2.8961257651408627e-06, |
|
"loss": 0.0094, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 42.816901408450704, |
|
"grad_norm": 5.5, |
|
"learning_rate": 2.8648581058208387e-06, |
|
"loss": 0.0053, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 42.94209702660407, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 2.8336922357635464e-06, |
|
"loss": 0.0084, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 43.06729264475744, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 2.802629640748898e-06, |
|
"loss": 0.0044, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 43.1924882629108, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.7716718016333432e-06, |
|
"loss": 0.0069, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 43.31768388106416, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 2.7408201942792755e-06, |
|
"loss": 0.0061, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 43.442879499217526, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.7100762894846633e-06, |
|
"loss": 0.0065, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 43.568075117370896, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.6794415529129402e-06, |
|
"loss": 0.0052, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 43.69327073552426, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.6489174450231353e-06, |
|
"loss": 0.0056, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 43.81846635367762, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 2.618505421000237e-06, |
|
"loss": 0.005, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 43.943661971830984, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 2.588206930685827e-06, |
|
"loss": 0.0069, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 44.06885758998435, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 2.5580234185089647e-06, |
|
"loss": 0.0043, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 44.19405320813772, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 2.5279563234173177e-06, |
|
"loss": 0.0054, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 44.31924882629108, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.4980070788085655e-06, |
|
"loss": 0.0043, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 44.44444444444444, |
|
"grad_norm": 1.25, |
|
"learning_rate": 2.4681771124620716e-06, |
|
"loss": 0.005, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 44.569640062597806, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 2.4384678464708077e-06, |
|
"loss": 0.0042, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 44.694835680751176, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.4088806971735584e-06, |
|
"loss": 0.0056, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 44.82003129890454, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.3794170750874094e-06, |
|
"loss": 0.0052, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 44.9452269170579, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.3500783848404906e-06, |
|
"loss": 0.0055, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 45.070422535211264, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 2.320866025105016e-06, |
|
"loss": 0.0041, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 45.195618153364634, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 2.2917813885306196e-06, |
|
"loss": 0.0035, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 45.320813771518, |
|
"grad_norm": 0.1181640625, |
|
"learning_rate": 2.262825861677938e-06, |
|
"loss": 0.0026, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 45.44600938967136, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 2.234000824952525e-06, |
|
"loss": 0.0046, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 45.57120500782472, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.2053076525390434e-06, |
|
"loss": 0.004, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 45.69640062597809, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 2.1767477123357424e-06, |
|
"loss": 0.0041, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 45.821596244131456, |
|
"grad_norm": 2.875, |
|
"learning_rate": 2.1483223658892545e-06, |
|
"loss": 0.0041, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 45.94679186228482, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 2.120032968329687e-06, |
|
"loss": 0.004, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 46.07198748043818, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 2.091880868306011e-06, |
|
"loss": 0.003, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 46.19718309859155, |
|
"grad_norm": 0.0869140625, |
|
"learning_rate": 2.0638674079217687e-06, |
|
"loss": 0.0023, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 46.322378716744915, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.0359939226711002e-06, |
|
"loss": 0.0025, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 46.44757433489828, |
|
"grad_norm": 0.08251953125, |
|
"learning_rate": 2.008261741375063e-06, |
|
"loss": 0.0026, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 46.57276995305164, |
|
"grad_norm": 0.0810546875, |
|
"learning_rate": 1.9806721861182907e-06, |
|
"loss": 0.0026, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 46.69796557120501, |
|
"grad_norm": 0.095703125, |
|
"learning_rate": 1.95322657218596e-06, |
|
"loss": 0.0037, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 46.82316118935837, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.9259262080010938e-06, |
|
"loss": 0.0028, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 46.948356807511736, |
|
"grad_norm": 0.08544921875, |
|
"learning_rate": 1.8987723950621805e-06, |
|
"loss": 0.0024, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 47.0735524256651, |
|
"grad_norm": 0.06640625, |
|
"learning_rate": 1.8717664278811198e-06, |
|
"loss": 0.0023, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 47.19874804381847, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.844909593921525e-06, |
|
"loss": 0.0021, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 47.32394366197183, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 1.8182031735373302e-06, |
|
"loss": 0.002, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 47.449139280125195, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 1.7916484399117579e-06, |
|
"loss": 0.0038, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 47.57433489827856, |
|
"grad_norm": 0.08056640625, |
|
"learning_rate": 1.7652466589966271e-06, |
|
"loss": 0.004, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 47.69953051643193, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 1.738999089451991e-06, |
|
"loss": 0.0032, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 47.82472613458529, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 1.7129069825861388e-06, |
|
"loss": 0.0023, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 47.94992175273865, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 1.6869715822959437e-06, |
|
"loss": 0.0021, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 48.075117370892016, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 1.6611941250075558e-06, |
|
"loss": 0.002, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 48.200312989045386, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 1.6355758396174603e-06, |
|
"loss": 0.0023, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 48.32550860719875, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 1.610117947433897e-06, |
|
"loss": 0.002, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 48.45070422535211, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.5848216621186268e-06, |
|
"loss": 0.0026, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 48.575899843505475, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.55968818962908e-06, |
|
"loss": 0.0018, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 48.701095461658845, |
|
"grad_norm": 0.06640625, |
|
"learning_rate": 1.5347187281608622e-06, |
|
"loss": 0.0019, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 48.82629107981221, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 1.5099144680906348e-06, |
|
"loss": 0.0022, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 48.95148669796557, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.4852765919193584e-06, |
|
"loss": 0.0019, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 49.076682316118934, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.460806274215924e-06, |
|
"loss": 0.0019, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 49.201877934272304, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 1.4365046815611622e-06, |
|
"loss": 0.0017, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 49.32707355242567, |
|
"grad_norm": 0.052734375, |
|
"learning_rate": 1.4123729724922198e-06, |
|
"loss": 0.0017, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 49.45226917057903, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 1.3884122974473307e-06, |
|
"loss": 0.0019, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 49.57746478873239, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 1.3646237987109772e-06, |
|
"loss": 0.0018, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 49.70266040688576, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 1.3410086103594256e-06, |
|
"loss": 0.0018, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 49.827856025039125, |
|
"grad_norm": 0.03466796875, |
|
"learning_rate": 1.317567858206661e-06, |
|
"loss": 0.002, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 49.95305164319249, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.2943026597507268e-06, |
|
"loss": 0.0017, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 50.07824726134585, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 1.2712141241204352e-06, |
|
"loss": 0.0016, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 50.07824726134585, |
|
"eval_loss": 2.970012903213501, |
|
"eval_runtime": 3.2782, |
|
"eval_samples_per_second": 21.963, |
|
"eval_steps_per_second": 21.963, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 50.20344287949922, |
|
"grad_norm": 0.0296630859375, |
|
"learning_rate": 1.2483033520224996e-06, |
|
"loss": 0.0016, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 50.328638497652584, |
|
"grad_norm": 0.0302734375, |
|
"learning_rate": 1.225571435689062e-06, |
|
"loss": 0.0016, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 50.45383411580595, |
|
"grad_norm": 0.02880859375, |
|
"learning_rate": 1.2030194588256183e-06, |
|
"loss": 0.0016, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 50.57902973395931, |
|
"grad_norm": 0.0322265625, |
|
"learning_rate": 1.1806484965593546e-06, |
|
"loss": 0.0016, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 50.70422535211267, |
|
"grad_norm": 0.0284423828125, |
|
"learning_rate": 1.1584596153878923e-06, |
|
"loss": 0.0016, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 50.82942097026604, |
|
"grad_norm": 0.030029296875, |
|
"learning_rate": 1.1364538731284514e-06, |
|
"loss": 0.0021, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 50.954616588419405, |
|
"grad_norm": 0.0262451171875, |
|
"learning_rate": 1.1146323188674102e-06, |
|
"loss": 0.0015, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 51.07981220657277, |
|
"grad_norm": 0.024658203125, |
|
"learning_rate": 1.0929959929102968e-06, |
|
"loss": 0.0016, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 51.20500782472613, |
|
"grad_norm": 0.0286865234375, |
|
"learning_rate": 1.0715459267321998e-06, |
|
"loss": 0.0014, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 51.3302034428795, |
|
"grad_norm": 0.0255126953125, |
|
"learning_rate": 1.0502831429285842e-06, |
|
"loss": 0.0014, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 51.455399061032864, |
|
"grad_norm": 0.024658203125, |
|
"learning_rate": 1.0292086551665464e-06, |
|
"loss": 0.0015, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 51.58059467918623, |
|
"grad_norm": 0.0238037109375, |
|
"learning_rate": 1.0083234681364934e-06, |
|
"loss": 0.0015, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 51.70579029733959, |
|
"grad_norm": 0.0233154296875, |
|
"learning_rate": 9.87628577504236e-07, |
|
"loss": 0.0015, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 51.83098591549296, |
|
"grad_norm": 0.0250244140625, |
|
"learning_rate": 9.671249698635294e-07, |
|
"loss": 0.0014, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 51.95618153364632, |
|
"grad_norm": 0.0260009765625, |
|
"learning_rate": 9.468136226890384e-07, |
|
"loss": 0.0014, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 52.081377151799686, |
|
"grad_norm": 0.0172119140625, |
|
"learning_rate": 9.266955042897357e-07, |
|
"loss": 0.0014, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 52.20657276995305, |
|
"grad_norm": 0.017578125, |
|
"learning_rate": 9.067715737627391e-07, |
|
"loss": 0.0014, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 52.33176838810642, |
|
"grad_norm": 0.0181884765625, |
|
"learning_rate": 8.870427809475907e-07, |
|
"loss": 0.0014, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 52.45696400625978, |
|
"grad_norm": 0.0167236328125, |
|
"learning_rate": 8.675100663809766e-07, |
|
"loss": 0.0013, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 52.582159624413144, |
|
"grad_norm": 0.019287109375, |
|
"learning_rate": 8.481743612518795e-07, |
|
"loss": 0.0014, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 52.70735524256651, |
|
"grad_norm": 0.02001953125, |
|
"learning_rate": 8.290365873571954e-07, |
|
"loss": 0.0014, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 52.83255086071988, |
|
"grad_norm": 0.022705078125, |
|
"learning_rate": 8.100976570577856e-07, |
|
"loss": 0.0013, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 52.95774647887324, |
|
"grad_norm": 0.0206298828125, |
|
"learning_rate": 7.913584732349788e-07, |
|
"loss": 0.0013, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 53.0829420970266, |
|
"grad_norm": 0.0181884765625, |
|
"learning_rate": 7.728199292475297e-07, |
|
"loss": 0.0014, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 53.208137715179966, |
|
"grad_norm": 0.017333984375, |
|
"learning_rate": 7.544829088890326e-07, |
|
"loss": 0.0013, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 53.333333333333336, |
|
"grad_norm": 0.09130859375, |
|
"learning_rate": 7.363482863457821e-07, |
|
"loss": 0.0013, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 53.4585289514867, |
|
"grad_norm": 0.017333984375, |
|
"learning_rate": 7.184169261551005e-07, |
|
"loss": 0.0013, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 53.58372456964006, |
|
"grad_norm": 0.02978515625, |
|
"learning_rate": 7.006896831641257e-07, |
|
"loss": 0.0013, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 53.708920187793424, |
|
"grad_norm": 0.01611328125, |
|
"learning_rate": 6.831674024890533e-07, |
|
"loss": 0.0012, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 53.834115805946794, |
|
"grad_norm": 0.015869140625, |
|
"learning_rate": 6.658509194748463e-07, |
|
"loss": 0.0013, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 53.95931142410016, |
|
"grad_norm": 0.018310546875, |
|
"learning_rate": 6.487410596554178e-07, |
|
"loss": 0.0013, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 54.08450704225352, |
|
"grad_norm": 0.0184326171875, |
|
"learning_rate": 6.3183863871427e-07, |
|
"loss": 0.0013, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 54.20970266040688, |
|
"grad_norm": 0.01470947265625, |
|
"learning_rate": 6.15144462445606e-07, |
|
"loss": 0.0013, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 54.33489827856025, |
|
"grad_norm": 0.0147705078125, |
|
"learning_rate": 5.986593267159224e-07, |
|
"loss": 0.0013, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 54.460093896713616, |
|
"grad_norm": 0.01348876953125, |
|
"learning_rate": 5.823840174260603e-07, |
|
"loss": 0.0012, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 54.58528951486698, |
|
"grad_norm": 0.01611328125, |
|
"learning_rate": 5.663193104737413e-07, |
|
"loss": 0.0014, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 54.71048513302034, |
|
"grad_norm": 0.01409912109375, |
|
"learning_rate": 5.504659717165812e-07, |
|
"loss": 0.0012, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 54.83568075117371, |
|
"grad_norm": 0.01953125, |
|
"learning_rate": 5.348247569355736e-07, |
|
"loss": 0.0013, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 54.960876369327075, |
|
"grad_norm": 0.01806640625, |
|
"learning_rate": 5.193964117990625e-07, |
|
"loss": 0.0013, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 55.08607198748044, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 5.041816718271925e-07, |
|
"loss": 0.0012, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 55.2112676056338, |
|
"grad_norm": 0.01409912109375, |
|
"learning_rate": 4.891812623568476e-07, |
|
"loss": 0.0012, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 55.33646322378717, |
|
"grad_norm": 0.0135498046875, |
|
"learning_rate": 4.743958985070662e-07, |
|
"loss": 0.0013, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 55.46165884194053, |
|
"grad_norm": 0.0142822265625, |
|
"learning_rate": 4.598262851449525e-07, |
|
"loss": 0.0013, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 55.586854460093896, |
|
"grad_norm": 0.01171875, |
|
"learning_rate": 4.454731168520754e-07, |
|
"loss": 0.0013, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 55.71205007824726, |
|
"grad_norm": 0.01202392578125, |
|
"learning_rate": 4.3133707789134895e-07, |
|
"loss": 0.0011, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 55.83724569640063, |
|
"grad_norm": 0.01226806640625, |
|
"learning_rate": 4.174188421744174e-07, |
|
"loss": 0.0013, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 55.96244131455399, |
|
"grad_norm": 0.01300048828125, |
|
"learning_rate": 4.0371907322952654e-07, |
|
"loss": 0.0012, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 56.087636932707355, |
|
"grad_norm": 0.01226806640625, |
|
"learning_rate": 3.902384241698876e-07, |
|
"loss": 0.0012, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 56.21283255086072, |
|
"grad_norm": 0.01202392578125, |
|
"learning_rate": 3.769775376625423e-07, |
|
"loss": 0.0012, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 56.33802816901409, |
|
"grad_norm": 0.01318359375, |
|
"learning_rate": 3.639370458977304e-07, |
|
"loss": 0.0013, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 56.46322378716745, |
|
"grad_norm": 0.01171875, |
|
"learning_rate": 3.511175705587433e-07, |
|
"loss": 0.0012, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 56.58841940532081, |
|
"grad_norm": 0.01385498046875, |
|
"learning_rate": 3.3851972279228983e-07, |
|
"loss": 0.0012, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 56.713615023474176, |
|
"grad_norm": 0.0177001953125, |
|
"learning_rate": 3.261441031793638e-07, |
|
"loss": 0.0013, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 56.838810641627546, |
|
"grad_norm": 0.01214599609375, |
|
"learning_rate": 3.139913017066054e-07, |
|
"loss": 0.0012, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 56.96400625978091, |
|
"grad_norm": 0.012939453125, |
|
"learning_rate": 3.0206189773818005e-07, |
|
"loss": 0.0013, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 57.08920187793427, |
|
"grad_norm": 0.01507568359375, |
|
"learning_rate": 2.903564599881586e-07, |
|
"loss": 0.0013, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 57.214397496087635, |
|
"grad_norm": 0.0189208984375, |
|
"learning_rate": 2.788755464934001e-07, |
|
"loss": 0.0012, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 57.339593114241005, |
|
"grad_norm": 0.01275634765625, |
|
"learning_rate": 2.676197045869511e-07, |
|
"loss": 0.0012, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 57.46478873239437, |
|
"grad_norm": 0.01129150390625, |
|
"learning_rate": 2.565894708719552e-07, |
|
"loss": 0.0013, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 57.58998435054773, |
|
"grad_norm": 0.01361083984375, |
|
"learning_rate": 2.457853711960673e-07, |
|
"loss": 0.0012, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 57.715179968701094, |
|
"grad_norm": 0.0135498046875, |
|
"learning_rate": 2.3520792062638576e-07, |
|
"loss": 0.0012, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 57.840375586854464, |
|
"grad_norm": 0.01385498046875, |
|
"learning_rate": 2.248576234248967e-07, |
|
"loss": 0.0012, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 57.96557120500783, |
|
"grad_norm": 0.0155029296875, |
|
"learning_rate": 2.1473497302443857e-07, |
|
"loss": 0.0012, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 58.09076682316119, |
|
"grad_norm": 0.01300048828125, |
|
"learning_rate": 2.0484045200517222e-07, |
|
"loss": 0.0013, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 58.21596244131455, |
|
"grad_norm": 0.0137939453125, |
|
"learning_rate": 1.9517453207157865e-07, |
|
"loss": 0.0012, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 58.341158059467915, |
|
"grad_norm": 0.0125732421875, |
|
"learning_rate": 1.8573767402997155e-07, |
|
"loss": 0.0012, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 58.466353677621285, |
|
"grad_norm": 0.01531982421875, |
|
"learning_rate": 1.7653032776652702e-07, |
|
"loss": 0.0012, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 58.59154929577465, |
|
"grad_norm": 0.014404296875, |
|
"learning_rate": 1.675529322258368e-07, |
|
"loss": 0.0013, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 58.71674491392801, |
|
"grad_norm": 0.0142822265625, |
|
"learning_rate": 1.5880591538998292e-07, |
|
"loss": 0.0012, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 58.841940532081374, |
|
"grad_norm": 0.0128173828125, |
|
"learning_rate": 1.50289694258135e-07, |
|
"loss": 0.0011, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 58.967136150234744, |
|
"grad_norm": 0.01123046875, |
|
"learning_rate": 1.420046748266668e-07, |
|
"loss": 0.0012, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 59.09233176838811, |
|
"grad_norm": 0.0145263671875, |
|
"learning_rate": 1.3395125206980774e-07, |
|
"loss": 0.0012, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 59.21752738654147, |
|
"grad_norm": 0.0126953125, |
|
"learning_rate": 1.261298099208047e-07, |
|
"loss": 0.0012, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 59.34272300469483, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 1.185407212536277e-07, |
|
"loss": 0.0013, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 59.4679186228482, |
|
"grad_norm": 0.0198974609375, |
|
"learning_rate": 1.1118434786518473e-07, |
|
"loss": 0.0013, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 59.593114241001565, |
|
"grad_norm": 0.01470947265625, |
|
"learning_rate": 1.0406104045808274e-07, |
|
"loss": 0.0012, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 59.71830985915493, |
|
"grad_norm": 0.015869140625, |
|
"learning_rate": 9.717113862389993e-08, |
|
"loss": 0.0012, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 59.84350547730829, |
|
"grad_norm": 0.024169921875, |
|
"learning_rate": 9.051497082700256e-08, |
|
"loss": 0.0012, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 59.96870109546166, |
|
"grad_norm": 0.0205078125, |
|
"learning_rate": 8.40928543888836e-08, |
|
"loss": 0.0012, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 60.093896713615024, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 7.790509547303427e-08, |
|
"loss": 0.0011, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 60.21909233176839, |
|
"grad_norm": 0.06787109375, |
|
"learning_rate": 7.195198907034906e-08, |
|
"loss": 0.0013, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 60.34428794992175, |
|
"grad_norm": 0.08154296875, |
|
"learning_rate": 6.623381898506365e-08, |
|
"loss": 0.0012, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 60.46948356807512, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 6.075085782122237e-08, |
|
"loss": 0.0012, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 60.59467918622848, |
|
"grad_norm": 0.053466796875, |
|
"learning_rate": 5.550336696968472e-08, |
|
"loss": 0.0012, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 60.719874804381845, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 5.0491596595663714e-08, |
|
"loss": 0.0014, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 60.84507042253521, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 4.571578562679757e-08, |
|
"loss": 0.0012, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 60.97026604068858, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 4.1176161741760535e-08, |
|
"loss": 0.0012, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 61.09546165884194, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 3.687294135941044e-08, |
|
"loss": 0.0013, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 61.220657276995304, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 3.280632962846919e-08, |
|
"loss": 0.0013, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 61.34585289514867, |
|
"grad_norm": 0.0322265625, |
|
"learning_rate": 2.8976520417742794e-08, |
|
"loss": 0.0012, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 61.47104851330204, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.5383696306878756e-08, |
|
"loss": 0.0012, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 61.5962441314554, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 2.202802857766362e-08, |
|
"loss": 0.0012, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 61.72143974960876, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 1.8909677205856682e-08, |
|
"loss": 0.0012, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 61.846635367762126, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 1.6028790853561126e-08, |
|
"loss": 0.0013, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 61.971830985915496, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 1.3385506862140795e-08, |
|
"loss": 0.0013, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 62.09702660406886, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 1.0979951245669307e-08, |
|
"loss": 0.0013, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 62.22222222222222, |
|
"grad_norm": 0.05419921875, |
|
"learning_rate": 8.812238684923758e-09, |
|
"loss": 0.0012, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 62.347417840375584, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 6.882472521919093e-09, |
|
"loss": 0.0012, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 62.472613458528954, |
|
"grad_norm": 0.06982421875, |
|
"learning_rate": 5.190744754978716e-09, |
|
"loss": 0.0012, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 62.59780907668232, |
|
"grad_norm": 0.033447265625, |
|
"learning_rate": 3.737136034349109e-09, |
|
"loss": 0.0013, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 62.59780907668232, |
|
"eval_loss": 3.0162200927734375, |
|
"eval_runtime": 3.1935, |
|
"eval_samples_per_second": 22.546, |
|
"eval_steps_per_second": 22.546, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 62.72300469483568, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 2.5217156583579037e-09, |
|
"loss": 0.0012, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 62.84820031298904, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 1.5445415701065281e-09, |
|
"loss": 0.0013, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 62.97339593114241, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 8.056603547090813e-10, |
|
"loss": 0.0012, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 63.098591549295776, |
|
"grad_norm": 0.02685546875, |
|
"learning_rate": 3.0510723707299907e-10, |
|
"loss": 0.0013, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 63.22378716744914, |
|
"grad_norm": 0.060546875, |
|
"learning_rate": 4.290608021706444e-11, |
|
"loss": 0.0012, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 63.298904538341155, |
|
"step": 5056, |
|
"total_flos": 5.606358848176128e+17, |
|
"train_loss": 0.27379138119090674, |
|
"train_runtime": 5351.3398, |
|
"train_samples_per_second": 7.642, |
|
"train_steps_per_second": 0.945 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5056, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 64, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.606358848176128e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|