{ "best_metric": null, "best_model_checkpoint": null, "epoch": 63.298904538341155, "eval_steps": 1000, "global_step": 5056, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12519561815336464, "grad_norm": 27.375, "learning_rate": 1.9762845849802374e-07, "loss": 2.599, "step": 10 }, { "epoch": 0.25039123630672927, "grad_norm": 24.625, "learning_rate": 3.9525691699604747e-07, "loss": 2.5019, "step": 20 }, { "epoch": 0.3755868544600939, "grad_norm": 23.875, "learning_rate": 5.928853754940712e-07, "loss": 2.4051, "step": 30 }, { "epoch": 0.5007824726134585, "grad_norm": 15.9375, "learning_rate": 7.905138339920949e-07, "loss": 2.4794, "step": 40 }, { "epoch": 0.6259780907668232, "grad_norm": 37.25, "learning_rate": 9.881422924901187e-07, "loss": 2.3233, "step": 50 }, { "epoch": 0.7511737089201878, "grad_norm": 26.375, "learning_rate": 1.1857707509881424e-06, "loss": 2.2146, "step": 60 }, { "epoch": 0.8763693270735524, "grad_norm": 22.875, "learning_rate": 1.3833992094861662e-06, "loss": 2.259, "step": 70 }, { "epoch": 1.001564945226917, "grad_norm": 20.625, "learning_rate": 1.5810276679841899e-06, "loss": 2.1342, "step": 80 }, { "epoch": 1.1267605633802817, "grad_norm": 17.375, "learning_rate": 1.7786561264822136e-06, "loss": 2.1221, "step": 90 }, { "epoch": 1.2519561815336462, "grad_norm": 4.8125, "learning_rate": 1.9762845849802374e-06, "loss": 2.0237, "step": 100 }, { "epoch": 1.3771517996870108, "grad_norm": 4.5625, "learning_rate": 2.173913043478261e-06, "loss": 1.9913, "step": 110 }, { "epoch": 1.5023474178403755, "grad_norm": 4.4375, "learning_rate": 2.371541501976285e-06, "loss": 2.0106, "step": 120 }, { "epoch": 1.6275430359937402, "grad_norm": 4.09375, "learning_rate": 2.5691699604743086e-06, "loss": 1.9704, "step": 130 }, { "epoch": 1.7527386541471048, "grad_norm": 5.0, "learning_rate": 2.7667984189723323e-06, "loss": 1.9563, "step": 140 }, { "epoch": 1.8779342723004695, "grad_norm": 3.84375, "learning_rate": 2.964426877470356e-06, "loss": 1.939, "step": 150 }, { "epoch": 2.003129890453834, "grad_norm": 4.40625, "learning_rate": 3.1620553359683798e-06, "loss": 1.9485, "step": 160 }, { "epoch": 2.128325508607199, "grad_norm": 4.625, "learning_rate": 3.3596837944664035e-06, "loss": 1.8826, "step": 170 }, { "epoch": 2.2535211267605635, "grad_norm": 4.53125, "learning_rate": 3.5573122529644273e-06, "loss": 1.7508, "step": 180 }, { "epoch": 2.378716744913928, "grad_norm": 5.21875, "learning_rate": 3.754940711462451e-06, "loss": 1.7644, "step": 190 }, { "epoch": 2.5039123630672924, "grad_norm": 4.34375, "learning_rate": 3.952569169960475e-06, "loss": 1.9122, "step": 200 }, { "epoch": 2.629107981220657, "grad_norm": 4.8125, "learning_rate": 4.150197628458498e-06, "loss": 1.7887, "step": 210 }, { "epoch": 2.7543035993740217, "grad_norm": 4.21875, "learning_rate": 4.347826086956522e-06, "loss": 1.8224, "step": 220 }, { "epoch": 2.8794992175273864, "grad_norm": 3.8125, "learning_rate": 4.5454545454545455e-06, "loss": 1.8029, "step": 230 }, { "epoch": 3.004694835680751, "grad_norm": 5.25, "learning_rate": 4.74308300395257e-06, "loss": 1.8384, "step": 240 }, { "epoch": 3.1298904538341157, "grad_norm": 4.15625, "learning_rate": 4.940711462450593e-06, "loss": 1.6869, "step": 250 }, { "epoch": 3.2550860719874803, "grad_norm": 3.71875, "learning_rate": 5.138339920948617e-06, "loss": 1.6394, "step": 260 }, { "epoch": 3.380281690140845, "grad_norm": 4.0625, "learning_rate": 5.335968379446641e-06, "loss": 1.6657, "step": 270 }, { "epoch": 3.5054773082942097, "grad_norm": 3.953125, "learning_rate": 5.533596837944665e-06, "loss": 1.6843, "step": 280 }, { "epoch": 3.6306729264475743, "grad_norm": 4.125, "learning_rate": 5.731225296442689e-06, "loss": 1.6496, "step": 290 }, { "epoch": 3.755868544600939, "grad_norm": 3.875, "learning_rate": 5.928853754940712e-06, "loss": 1.7035, "step": 300 }, { "epoch": 3.8810641627543037, "grad_norm": 3.453125, "learning_rate": 6.126482213438736e-06, "loss": 1.6954, "step": 310 }, { "epoch": 4.006259780907668, "grad_norm": 3.390625, "learning_rate": 6.3241106719367596e-06, "loss": 1.7039, "step": 320 }, { "epoch": 4.131455399061033, "grad_norm": 4.125, "learning_rate": 6.521739130434783e-06, "loss": 1.5767, "step": 330 }, { "epoch": 4.256651017214398, "grad_norm": 5.0, "learning_rate": 6.719367588932807e-06, "loss": 1.5438, "step": 340 }, { "epoch": 4.381846635367762, "grad_norm": 3.125, "learning_rate": 6.91699604743083e-06, "loss": 1.4615, "step": 350 }, { "epoch": 4.507042253521127, "grad_norm": 4.28125, "learning_rate": 7.1146245059288545e-06, "loss": 1.5243, "step": 360 }, { "epoch": 4.632237871674492, "grad_norm": 3.46875, "learning_rate": 7.312252964426878e-06, "loss": 1.5667, "step": 370 }, { "epoch": 4.757433489827856, "grad_norm": 3.5625, "learning_rate": 7.509881422924902e-06, "loss": 1.548, "step": 380 }, { "epoch": 4.882629107981221, "grad_norm": 3.421875, "learning_rate": 7.707509881422925e-06, "loss": 1.5454, "step": 390 }, { "epoch": 5.007824726134586, "grad_norm": 3.03125, "learning_rate": 7.90513833992095e-06, "loss": 1.526, "step": 400 }, { "epoch": 5.13302034428795, "grad_norm": 3.234375, "learning_rate": 8.102766798418974e-06, "loss": 1.4051, "step": 410 }, { "epoch": 5.258215962441315, "grad_norm": 2.953125, "learning_rate": 8.300395256916996e-06, "loss": 1.4057, "step": 420 }, { "epoch": 5.383411580594679, "grad_norm": 2.90625, "learning_rate": 8.49802371541502e-06, "loss": 1.3852, "step": 430 }, { "epoch": 5.508607198748043, "grad_norm": 3.78125, "learning_rate": 8.695652173913044e-06, "loss": 1.3708, "step": 440 }, { "epoch": 5.633802816901408, "grad_norm": 3.125, "learning_rate": 8.893280632411067e-06, "loss": 1.3361, "step": 450 }, { "epoch": 5.758998435054773, "grad_norm": 3.3125, "learning_rate": 9.090909090909091e-06, "loss": 1.4118, "step": 460 }, { "epoch": 5.884194053208137, "grad_norm": 3.5, "learning_rate": 9.288537549407115e-06, "loss": 1.3362, "step": 470 }, { "epoch": 6.009389671361502, "grad_norm": 3.453125, "learning_rate": 9.48616600790514e-06, "loss": 1.477, "step": 480 }, { "epoch": 6.134585289514867, "grad_norm": 4.46875, "learning_rate": 9.683794466403162e-06, "loss": 1.2172, "step": 490 }, { "epoch": 6.259780907668231, "grad_norm": 3.5625, "learning_rate": 9.881422924901186e-06, "loss": 1.1359, "step": 500 }, { "epoch": 6.384976525821596, "grad_norm": 3.359375, "learning_rate": 9.999980930615864e-06, "loss": 1.2047, "step": 510 }, { "epoch": 6.510172143974961, "grad_norm": 3.515625, "learning_rate": 9.999766401714795e-06, "loss": 1.1727, "step": 520 }, { "epoch": 6.635367762128325, "grad_norm": 3.4375, "learning_rate": 9.999313517443876e-06, "loss": 1.3046, "step": 530 }, { "epoch": 6.76056338028169, "grad_norm": 3.640625, "learning_rate": 9.998622299393598e-06, "loss": 1.2101, "step": 540 }, { "epoch": 6.885758998435055, "grad_norm": 3.25, "learning_rate": 9.997692780516608e-06, "loss": 1.1944, "step": 550 }, { "epoch": 7.010954616588419, "grad_norm": 3.53125, "learning_rate": 9.996525005126135e-06, "loss": 1.2125, "step": 560 }, { "epoch": 7.136150234741784, "grad_norm": 2.96875, "learning_rate": 9.995119028893888e-06, "loss": 1.0638, "step": 570 }, { "epoch": 7.261345852895149, "grad_norm": 4.03125, "learning_rate": 9.993474918847401e-06, "loss": 1.0207, "step": 580 }, { "epoch": 7.386541471048513, "grad_norm": 3.484375, "learning_rate": 9.991592753366822e-06, "loss": 1.0174, "step": 590 }, { "epoch": 7.511737089201878, "grad_norm": 3.484375, "learning_rate": 9.989472622181194e-06, "loss": 1.0496, "step": 600 }, { "epoch": 7.636932707355243, "grad_norm": 3.359375, "learning_rate": 9.987114626364172e-06, "loss": 0.9966, "step": 610 }, { "epoch": 7.762128325508607, "grad_norm": 3.484375, "learning_rate": 9.984518878329197e-06, "loss": 0.9866, "step": 620 }, { "epoch": 7.887323943661972, "grad_norm": 3.25, "learning_rate": 9.98168550182415e-06, "loss": 1.0372, "step": 630 }, { "epoch": 8.012519561815337, "grad_norm": 3.125, "learning_rate": 9.978614631925442e-06, "loss": 0.9753, "step": 640 }, { "epoch": 8.137715179968701, "grad_norm": 3.234375, "learning_rate": 9.975306415031577e-06, "loss": 0.8432, "step": 650 }, { "epoch": 8.262910798122066, "grad_norm": 3.390625, "learning_rate": 9.97176100885618e-06, "loss": 0.8252, "step": 660 }, { "epoch": 8.38810641627543, "grad_norm": 3.484375, "learning_rate": 9.967978582420463e-06, "loss": 0.8158, "step": 670 }, { "epoch": 8.513302034428795, "grad_norm": 3.34375, "learning_rate": 9.963959316045185e-06, "loss": 0.833, "step": 680 }, { "epoch": 8.63849765258216, "grad_norm": 3.078125, "learning_rate": 9.959703401342037e-06, "loss": 0.8328, "step": 690 }, { "epoch": 8.763693270735525, "grad_norm": 2.703125, "learning_rate": 9.955211041204529e-06, "loss": 0.7759, "step": 700 }, { "epoch": 8.88888888888889, "grad_norm": 3.015625, "learning_rate": 9.950482449798295e-06, "loss": 0.8572, "step": 710 }, { "epoch": 9.014084507042254, "grad_norm": 2.296875, "learning_rate": 9.9455178525509e-06, "loss": 0.7794, "step": 720 }, { "epoch": 9.139280125195619, "grad_norm": 3.84375, "learning_rate": 9.940317486141084e-06, "loss": 0.6747, "step": 730 }, { "epoch": 9.264475743348983, "grad_norm": 3.03125, "learning_rate": 9.934881598487478e-06, "loss": 0.6434, "step": 740 }, { "epoch": 9.389671361502348, "grad_norm": 2.296875, "learning_rate": 9.929210448736797e-06, "loss": 0.6149, "step": 750 }, { "epoch": 9.514866979655713, "grad_norm": 2.8125, "learning_rate": 9.923304307251467e-06, "loss": 0.6818, "step": 760 }, { "epoch": 9.640062597809077, "grad_norm": 2.640625, "learning_rate": 9.917163455596753e-06, "loss": 0.6376, "step": 770 }, { "epoch": 9.765258215962442, "grad_norm": 2.953125, "learning_rate": 9.910788186527325e-06, "loss": 0.6487, "step": 780 }, { "epoch": 9.890453834115807, "grad_norm": 2.640625, "learning_rate": 9.904178803973306e-06, "loss": 0.6511, "step": 790 }, { "epoch": 10.015649452269171, "grad_norm": 1.8046875, "learning_rate": 9.89733562302578e-06, "loss": 0.6073, "step": 800 }, { "epoch": 10.140845070422536, "grad_norm": 2.1875, "learning_rate": 9.890258969921777e-06, "loss": 0.4669, "step": 810 }, { "epoch": 10.2660406885759, "grad_norm": 2.359375, "learning_rate": 9.882949182028709e-06, "loss": 0.5051, "step": 820 }, { "epoch": 10.391236306729265, "grad_norm": 2.140625, "learning_rate": 9.8754066078283e-06, "loss": 0.4987, "step": 830 }, { "epoch": 10.51643192488263, "grad_norm": 2.046875, "learning_rate": 9.867631606899957e-06, "loss": 0.5019, "step": 840 }, { "epoch": 10.641627543035995, "grad_norm": 1.90625, "learning_rate": 9.859624549903646e-06, "loss": 0.5047, "step": 850 }, { "epoch": 10.766823161189357, "grad_norm": 1.7109375, "learning_rate": 9.851385818562204e-06, "loss": 0.5325, "step": 860 }, { "epoch": 10.892018779342724, "grad_norm": 1.890625, "learning_rate": 9.842915805643156e-06, "loss": 0.5332, "step": 870 }, { "epoch": 11.017214397496087, "grad_norm": 1.484375, "learning_rate": 9.834214914939977e-06, "loss": 0.4857, "step": 880 }, { "epoch": 11.142410015649451, "grad_norm": 1.84375, "learning_rate": 9.82528356125285e-06, "loss": 0.3812, "step": 890 }, { "epoch": 11.267605633802816, "grad_norm": 1.234375, "learning_rate": 9.816122170368891e-06, "loss": 0.3738, "step": 900 }, { "epoch": 11.39280125195618, "grad_norm": 1.5234375, "learning_rate": 9.806731179041849e-06, "loss": 0.4202, "step": 910 }, { "epoch": 11.517996870109545, "grad_norm": 1.3984375, "learning_rate": 9.797111034971278e-06, "loss": 0.3592, "step": 920 }, { "epoch": 11.64319248826291, "grad_norm": 1.359375, "learning_rate": 9.787262196781208e-06, "loss": 0.3406, "step": 930 }, { "epoch": 11.768388106416275, "grad_norm": 1.3828125, "learning_rate": 9.777185133998268e-06, "loss": 0.3592, "step": 940 }, { "epoch": 11.89358372456964, "grad_norm": 1.3046875, "learning_rate": 9.76688032702931e-06, "loss": 0.3725, "step": 950 }, { "epoch": 12.018779342723004, "grad_norm": 1.296875, "learning_rate": 9.756348267138497e-06, "loss": 0.4022, "step": 960 }, { "epoch": 12.143974960876369, "grad_norm": 1.265625, "learning_rate": 9.745589456423897e-06, "loss": 0.2901, "step": 970 }, { "epoch": 12.269170579029733, "grad_norm": 1.5625, "learning_rate": 9.734604407793529e-06, "loss": 0.3043, "step": 980 }, { "epoch": 12.394366197183098, "grad_norm": 1.4453125, "learning_rate": 9.72339364494093e-06, "loss": 0.2778, "step": 990 }, { "epoch": 12.519561815336463, "grad_norm": 1.234375, "learning_rate": 9.711957702320176e-06, "loss": 0.2795, "step": 1000 }, { "epoch": 12.519561815336463, "eval_loss": 2.0127437114715576, "eval_runtime": 3.1921, "eval_samples_per_second": 22.556, "eval_steps_per_second": 22.556, "step": 1000 }, { "epoch": 12.644757433489827, "grad_norm": 1.203125, "learning_rate": 9.7002971251204e-06, "loss": 0.2932, "step": 1010 }, { "epoch": 12.769953051643192, "grad_norm": 1.1484375, "learning_rate": 9.688412469239812e-06, "loss": 0.3021, "step": 1020 }, { "epoch": 12.895148669796557, "grad_norm": 1.0390625, "learning_rate": 9.676304301259196e-06, "loss": 0.2861, "step": 1030 }, { "epoch": 13.020344287949921, "grad_norm": 0.921875, "learning_rate": 9.663973198414888e-06, "loss": 0.2959, "step": 1040 }, { "epoch": 13.145539906103286, "grad_norm": 1.1953125, "learning_rate": 9.651419748571272e-06, "loss": 0.2115, "step": 1050 }, { "epoch": 13.27073552425665, "grad_norm": 1.1640625, "learning_rate": 9.638644550192741e-06, "loss": 0.2322, "step": 1060 }, { "epoch": 13.395931142410015, "grad_norm": 1.3359375, "learning_rate": 9.625648212315177e-06, "loss": 0.2443, "step": 1070 }, { "epoch": 13.52112676056338, "grad_norm": 1.3125, "learning_rate": 9.612431354516912e-06, "loss": 0.2237, "step": 1080 }, { "epoch": 13.646322378716745, "grad_norm": 1.7578125, "learning_rate": 9.598994606889187e-06, "loss": 0.2261, "step": 1090 }, { "epoch": 13.77151799687011, "grad_norm": 1.0625, "learning_rate": 9.585338610006122e-06, "loss": 0.2163, "step": 1100 }, { "epoch": 13.896713615023474, "grad_norm": 1.1640625, "learning_rate": 9.571464014894168e-06, "loss": 0.2223, "step": 1110 }, { "epoch": 14.021909233176839, "grad_norm": 0.8828125, "learning_rate": 9.557371483001078e-06, "loss": 0.2216, "step": 1120 }, { "epoch": 14.147104851330203, "grad_norm": 0.921875, "learning_rate": 9.543061686164374e-06, "loss": 0.1752, "step": 1130 }, { "epoch": 14.272300469483568, "grad_norm": 0.953125, "learning_rate": 9.528535306579306e-06, "loss": 0.1694, "step": 1140 }, { "epoch": 14.397496087636933, "grad_norm": 0.94921875, "learning_rate": 9.513793036766345e-06, "loss": 0.1597, "step": 1150 }, { "epoch": 14.522691705790297, "grad_norm": 1.1328125, "learning_rate": 9.498835579538164e-06, "loss": 0.1627, "step": 1160 }, { "epoch": 14.647887323943662, "grad_norm": 0.8359375, "learning_rate": 9.483663647966124e-06, "loss": 0.187, "step": 1170 }, { "epoch": 14.773082942097027, "grad_norm": 1.015625, "learning_rate": 9.468277965346292e-06, "loss": 0.168, "step": 1180 }, { "epoch": 14.898278560250391, "grad_norm": 1.140625, "learning_rate": 9.452679265164951e-06, "loss": 0.1605, "step": 1190 }, { "epoch": 15.023474178403756, "grad_norm": 0.74609375, "learning_rate": 9.43686829106363e-06, "loss": 0.1667, "step": 1200 }, { "epoch": 15.14866979655712, "grad_norm": 0.8046875, "learning_rate": 9.42084579680366e-06, "loss": 0.1442, "step": 1210 }, { "epoch": 15.273865414710485, "grad_norm": 1.03125, "learning_rate": 9.404612546230244e-06, "loss": 0.1222, "step": 1220 }, { "epoch": 15.39906103286385, "grad_norm": 0.640625, "learning_rate": 9.38816931323602e-06, "loss": 0.1281, "step": 1230 }, { "epoch": 15.524256651017215, "grad_norm": 0.765625, "learning_rate": 9.371516881724192e-06, "loss": 0.1207, "step": 1240 }, { "epoch": 15.64945226917058, "grad_norm": 0.68359375, "learning_rate": 9.35465604557114e-06, "loss": 0.1191, "step": 1250 }, { "epoch": 15.774647887323944, "grad_norm": 0.75390625, "learning_rate": 9.337587608588588e-06, "loss": 0.1207, "step": 1260 }, { "epoch": 15.899843505477309, "grad_norm": 0.59765625, "learning_rate": 9.320312384485274e-06, "loss": 0.1312, "step": 1270 }, { "epoch": 16.025039123630673, "grad_norm": 0.447265625, "learning_rate": 9.30283119682816e-06, "loss": 0.1173, "step": 1280 }, { "epoch": 16.150234741784036, "grad_norm": 0.7734375, "learning_rate": 9.285144879003173e-06, "loss": 0.0862, "step": 1290 }, { "epoch": 16.275430359937403, "grad_norm": 0.53125, "learning_rate": 9.267254274175467e-06, "loss": 0.0801, "step": 1300 }, { "epoch": 16.400625978090765, "grad_norm": 0.609375, "learning_rate": 9.24916023524924e-06, "loss": 0.0852, "step": 1310 }, { "epoch": 16.525821596244132, "grad_norm": 0.796875, "learning_rate": 9.23086362482706e-06, "loss": 0.113, "step": 1320 }, { "epoch": 16.651017214397495, "grad_norm": 0.66015625, "learning_rate": 9.212365315168743e-06, "loss": 0.0951, "step": 1330 }, { "epoch": 16.77621283255086, "grad_norm": 0.625, "learning_rate": 9.193666188149782e-06, "loss": 0.0917, "step": 1340 }, { "epoch": 16.901408450704224, "grad_norm": 0.640625, "learning_rate": 9.174767135219291e-06, "loss": 0.0849, "step": 1350 }, { "epoch": 17.02660406885759, "grad_norm": 0.61328125, "learning_rate": 9.155669057357515e-06, "loss": 0.0907, "step": 1360 }, { "epoch": 17.151799687010953, "grad_norm": 0.828125, "learning_rate": 9.136372865032871e-06, "loss": 0.0577, "step": 1370 }, { "epoch": 17.27699530516432, "grad_norm": 0.4921875, "learning_rate": 9.116879478158552e-06, "loss": 0.0689, "step": 1380 }, { "epoch": 17.402190923317683, "grad_norm": 0.51953125, "learning_rate": 9.09718982604866e-06, "loss": 0.0723, "step": 1390 }, { "epoch": 17.52738654147105, "grad_norm": 0.578125, "learning_rate": 9.077304847373913e-06, "loss": 0.0639, "step": 1400 }, { "epoch": 17.652582159624412, "grad_norm": 0.68359375, "learning_rate": 9.057225490116887e-06, "loss": 0.0594, "step": 1410 }, { "epoch": 17.77777777777778, "grad_norm": 0.640625, "learning_rate": 9.036952711526834e-06, "loss": 0.0752, "step": 1420 }, { "epoch": 17.90297339593114, "grad_norm": 0.59375, "learning_rate": 9.016487478074032e-06, "loss": 0.0627, "step": 1430 }, { "epoch": 18.028169014084508, "grad_norm": 0.462890625, "learning_rate": 8.995830765403721e-06, "loss": 0.0656, "step": 1440 }, { "epoch": 18.15336463223787, "grad_norm": 0.5078125, "learning_rate": 8.974983558289586e-06, "loss": 0.0413, "step": 1450 }, { "epoch": 18.278560250391237, "grad_norm": 0.53125, "learning_rate": 8.953946850586813e-06, "loss": 0.0448, "step": 1460 }, { "epoch": 18.4037558685446, "grad_norm": 0.451171875, "learning_rate": 8.932721645184707e-06, "loss": 0.0438, "step": 1470 }, { "epoch": 18.528951486697967, "grad_norm": 0.578125, "learning_rate": 8.911308953958875e-06, "loss": 0.0477, "step": 1480 }, { "epoch": 18.65414710485133, "grad_norm": 0.6875, "learning_rate": 8.889709797723002e-06, "loss": 0.0478, "step": 1490 }, { "epoch": 18.779342723004696, "grad_norm": 0.40234375, "learning_rate": 8.867925206180166e-06, "loss": 0.0505, "step": 1500 }, { "epoch": 18.90453834115806, "grad_norm": 0.43359375, "learning_rate": 8.845956217873763e-06, "loss": 0.0463, "step": 1510 }, { "epoch": 19.029733959311425, "grad_norm": 0.37109375, "learning_rate": 8.823803880137993e-06, "loss": 0.0517, "step": 1520 }, { "epoch": 19.154929577464788, "grad_norm": 0.455078125, "learning_rate": 8.801469249047923e-06, "loss": 0.0342, "step": 1530 }, { "epoch": 19.280125195618155, "grad_norm": 0.302734375, "learning_rate": 8.77895338936915e-06, "loss": 0.0287, "step": 1540 }, { "epoch": 19.405320813771517, "grad_norm": 0.62890625, "learning_rate": 8.756257374507036e-06, "loss": 0.0333, "step": 1550 }, { "epoch": 19.530516431924884, "grad_norm": 0.41796875, "learning_rate": 8.733382286455536e-06, "loss": 0.0313, "step": 1560 }, { "epoch": 19.655712050078247, "grad_norm": 0.515625, "learning_rate": 8.710329215745612e-06, "loss": 0.0274, "step": 1570 }, { "epoch": 19.780907668231613, "grad_norm": 0.4296875, "learning_rate": 8.687099261393249e-06, "loss": 0.0351, "step": 1580 }, { "epoch": 19.906103286384976, "grad_norm": 0.380859375, "learning_rate": 8.663693530847056e-06, "loss": 0.0331, "step": 1590 }, { "epoch": 20.031298904538342, "grad_norm": 1.96875, "learning_rate": 8.640113139935484e-06, "loss": 0.0275, "step": 1600 }, { "epoch": 20.156494522691705, "grad_norm": 13.75, "learning_rate": 8.616359212813607e-06, "loss": 0.0466, "step": 1610 }, { "epoch": 20.281690140845072, "grad_norm": 16.375, "learning_rate": 8.592432881909548e-06, "loss": 0.0779, "step": 1620 }, { "epoch": 20.406885758998435, "grad_norm": 16.75, "learning_rate": 8.568335287870488e-06, "loss": 0.116, "step": 1630 }, { "epoch": 20.5320813771518, "grad_norm": 11.25, "learning_rate": 8.544067579508292e-06, "loss": 0.1198, "step": 1640 }, { "epoch": 20.657276995305164, "grad_norm": 38.5, "learning_rate": 8.519630913744726e-06, "loss": 0.1259, "step": 1650 }, { "epoch": 20.78247261345853, "grad_norm": 26.875, "learning_rate": 8.495026455556318e-06, "loss": 0.1304, "step": 1660 }, { "epoch": 20.907668231611893, "grad_norm": 30.375, "learning_rate": 8.470255377918821e-06, "loss": 0.1338, "step": 1670 }, { "epoch": 21.03286384976526, "grad_norm": 31.75, "learning_rate": 8.445318861751278e-06, "loss": 0.1232, "step": 1680 }, { "epoch": 21.158059467918623, "grad_norm": 28.25, "learning_rate": 8.420218095859735e-06, "loss": 0.154, "step": 1690 }, { "epoch": 21.28325508607199, "grad_norm": 9.3125, "learning_rate": 8.394954276880568e-06, "loss": 0.1363, "step": 1700 }, { "epoch": 21.408450704225352, "grad_norm": 8.25, "learning_rate": 8.36952860922343e-06, "loss": 0.13, "step": 1710 }, { "epoch": 21.53364632237872, "grad_norm": 11.9375, "learning_rate": 8.343942305013833e-06, "loss": 0.1398, "step": 1720 }, { "epoch": 21.65884194053208, "grad_norm": 8.6875, "learning_rate": 8.318196584035367e-06, "loss": 0.1428, "step": 1730 }, { "epoch": 21.784037558685448, "grad_norm": 8.5, "learning_rate": 8.292292673671542e-06, "loss": 0.1451, "step": 1740 }, { "epoch": 21.90923317683881, "grad_norm": 9.3125, "learning_rate": 8.266231808847284e-06, "loss": 0.1157, "step": 1750 }, { "epoch": 22.034428794992174, "grad_norm": 5.78125, "learning_rate": 8.24001523197005e-06, "loss": 0.1287, "step": 1760 }, { "epoch": 22.15962441314554, "grad_norm": 5.84375, "learning_rate": 8.213644192870609e-06, "loss": 0.1034, "step": 1770 }, { "epoch": 22.284820031298903, "grad_norm": 7.6875, "learning_rate": 8.18711994874345e-06, "loss": 0.1126, "step": 1780 }, { "epoch": 22.41001564945227, "grad_norm": 9.375, "learning_rate": 8.160443764086855e-06, "loss": 0.1295, "step": 1790 }, { "epoch": 22.535211267605632, "grad_norm": 10.0625, "learning_rate": 8.13361691064261e-06, "loss": 0.1149, "step": 1800 }, { "epoch": 22.660406885759, "grad_norm": 7.875, "learning_rate": 8.10664066733538e-06, "loss": 0.1125, "step": 1810 }, { "epoch": 22.78560250391236, "grad_norm": 8.3125, "learning_rate": 8.079516320211746e-06, "loss": 0.1538, "step": 1820 }, { "epoch": 22.910798122065728, "grad_norm": 6.6875, "learning_rate": 8.052245162378871e-06, "loss": 0.1213, "step": 1830 }, { "epoch": 23.03599374021909, "grad_norm": 5.125, "learning_rate": 8.024828493942882e-06, "loss": 0.1065, "step": 1840 }, { "epoch": 23.161189358372457, "grad_norm": 6.65625, "learning_rate": 7.997267621946871e-06, "loss": 0.0972, "step": 1850 }, { "epoch": 23.28638497652582, "grad_norm": 7.71875, "learning_rate": 7.96956386030859e-06, "loss": 0.0994, "step": 1860 }, { "epoch": 23.411580594679187, "grad_norm": 7.21875, "learning_rate": 7.94171852975782e-06, "loss": 0.1061, "step": 1870 }, { "epoch": 23.53677621283255, "grad_norm": 5.9375, "learning_rate": 7.913732957773385e-06, "loss": 0.1005, "step": 1880 }, { "epoch": 23.661971830985916, "grad_norm": 6.3125, "learning_rate": 7.885608478519894e-06, "loss": 0.0963, "step": 1890 }, { "epoch": 23.78716744913928, "grad_norm": 6.4375, "learning_rate": 7.857346432784116e-06, "loss": 0.1074, "step": 1900 }, { "epoch": 23.912363067292645, "grad_norm": 5.5625, "learning_rate": 7.828948167911073e-06, "loss": 0.0948, "step": 1910 }, { "epoch": 24.037558685446008, "grad_norm": 5.71875, "learning_rate": 7.800415037739802e-06, "loss": 0.1132, "step": 1920 }, { "epoch": 24.162754303599375, "grad_norm": 6.5, "learning_rate": 7.771748402538808e-06, "loss": 0.0783, "step": 1930 }, { "epoch": 24.287949921752737, "grad_norm": 5.46875, "learning_rate": 7.742949628941232e-06, "loss": 0.0743, "step": 1940 }, { "epoch": 24.413145539906104, "grad_norm": 5.34375, "learning_rate": 7.714020089879683e-06, "loss": 0.092, "step": 1950 }, { "epoch": 24.538341158059467, "grad_norm": 5.0, "learning_rate": 7.684961164520792e-06, "loss": 0.0822, "step": 1960 }, { "epoch": 24.663536776212833, "grad_norm": 5.3125, "learning_rate": 7.655774238199459e-06, "loss": 0.0989, "step": 1970 }, { "epoch": 24.788732394366196, "grad_norm": 7.59375, "learning_rate": 7.6264607023528135e-06, "loss": 0.1003, "step": 1980 }, { "epoch": 24.913928012519563, "grad_norm": 5.25, "learning_rate": 7.597021954453887e-06, "loss": 0.089, "step": 1990 }, { "epoch": 25.039123630672925, "grad_norm": 4.375, "learning_rate": 7.567459397944972e-06, "loss": 0.0784, "step": 2000 }, { "epoch": 25.039123630672925, "eval_loss": 2.359957456588745, "eval_runtime": 3.1779, "eval_samples_per_second": 22.657, "eval_steps_per_second": 22.657, "step": 2000 }, { "epoch": 25.164319248826292, "grad_norm": 5.46875, "learning_rate": 7.537774442170731e-06, "loss": 0.0569, "step": 2010 }, { "epoch": 25.289514866979655, "grad_norm": 5.4375, "learning_rate": 7.507968502311005e-06, "loss": 0.0682, "step": 2020 }, { "epoch": 25.41471048513302, "grad_norm": 5.28125, "learning_rate": 7.478042999313342e-06, "loss": 0.0679, "step": 2030 }, { "epoch": 25.539906103286384, "grad_norm": 3.234375, "learning_rate": 7.447999359825263e-06, "loss": 0.0564, "step": 2040 }, { "epoch": 25.66510172143975, "grad_norm": 4.6875, "learning_rate": 7.417839016126242e-06, "loss": 0.0661, "step": 2050 }, { "epoch": 25.790297339593113, "grad_norm": 5.78125, "learning_rate": 7.387563406059433e-06, "loss": 0.0728, "step": 2060 }, { "epoch": 25.91549295774648, "grad_norm": 4.4375, "learning_rate": 7.357173972963112e-06, "loss": 0.0758, "step": 2070 }, { "epoch": 26.040688575899843, "grad_norm": 2.90625, "learning_rate": 7.32667216560188e-06, "loss": 0.0679, "step": 2080 }, { "epoch": 26.16588419405321, "grad_norm": 3.546875, "learning_rate": 7.296059438097589e-06, "loss": 0.0469, "step": 2090 }, { "epoch": 26.291079812206572, "grad_norm": 2.984375, "learning_rate": 7.265337249860015e-06, "loss": 0.0479, "step": 2100 }, { "epoch": 26.41627543035994, "grad_norm": 2.859375, "learning_rate": 7.234507065517297e-06, "loss": 0.0499, "step": 2110 }, { "epoch": 26.5414710485133, "grad_norm": 5.3125, "learning_rate": 7.2035703548461e-06, "loss": 0.0529, "step": 2120 }, { "epoch": 26.666666666666668, "grad_norm": 3.078125, "learning_rate": 7.17252859270155e-06, "loss": 0.0557, "step": 2130 }, { "epoch": 26.79186228482003, "grad_norm": 2.625, "learning_rate": 7.141383258946926e-06, "loss": 0.0492, "step": 2140 }, { "epoch": 26.917057902973397, "grad_norm": 3.921875, "learning_rate": 7.110135838383105e-06, "loss": 0.0541, "step": 2150 }, { "epoch": 27.04225352112676, "grad_norm": 2.640625, "learning_rate": 7.078787820677784e-06, "loss": 0.0528, "step": 2160 }, { "epoch": 27.167449139280127, "grad_norm": 2.765625, "learning_rate": 7.047340700294454e-06, "loss": 0.0453, "step": 2170 }, { "epoch": 27.29264475743349, "grad_norm": 3.5, "learning_rate": 7.015795976421156e-06, "loss": 0.036, "step": 2180 }, { "epoch": 27.417840375586856, "grad_norm": 4.96875, "learning_rate": 6.984155152899016e-06, "loss": 0.0427, "step": 2190 }, { "epoch": 27.54303599374022, "grad_norm": 4.21875, "learning_rate": 6.952419738150546e-06, "loss": 0.0424, "step": 2200 }, { "epoch": 27.668231611893585, "grad_norm": 3.6875, "learning_rate": 6.9205912451077305e-06, "loss": 0.0336, "step": 2210 }, { "epoch": 27.793427230046948, "grad_norm": 2.609375, "learning_rate": 6.88867119113991e-06, "loss": 0.0462, "step": 2220 }, { "epoch": 27.918622848200314, "grad_norm": 2.296875, "learning_rate": 6.856661097981433e-06, "loss": 0.0492, "step": 2230 }, { "epoch": 28.043818466353677, "grad_norm": 1.3359375, "learning_rate": 6.824562491659112e-06, "loss": 0.0363, "step": 2240 }, { "epoch": 28.169014084507044, "grad_norm": 1.9609375, "learning_rate": 6.792376902419478e-06, "loss": 0.03, "step": 2250 }, { "epoch": 28.294209702660407, "grad_norm": 2.15625, "learning_rate": 6.7601058646558195e-06, "loss": 0.0331, "step": 2260 }, { "epoch": 28.419405320813773, "grad_norm": 2.046875, "learning_rate": 6.7277509168350445e-06, "loss": 0.0364, "step": 2270 }, { "epoch": 28.544600938967136, "grad_norm": 2.890625, "learning_rate": 6.695313601424326e-06, "loss": 0.0296, "step": 2280 }, { "epoch": 28.669796557120502, "grad_norm": 2.0625, "learning_rate": 6.662795464817573e-06, "loss": 0.0323, "step": 2290 }, { "epoch": 28.794992175273865, "grad_norm": 2.578125, "learning_rate": 6.63019805726171e-06, "loss": 0.0312, "step": 2300 }, { "epoch": 28.920187793427232, "grad_norm": 1.109375, "learning_rate": 6.597522932782765e-06, "loss": 0.0318, "step": 2310 }, { "epoch": 29.045383411580595, "grad_norm": 0.859375, "learning_rate": 6.564771649111792e-06, "loss": 0.0269, "step": 2320 }, { "epoch": 29.170579029733958, "grad_norm": 1.578125, "learning_rate": 6.531945767610604e-06, "loss": 0.0205, "step": 2330 }, { "epoch": 29.295774647887324, "grad_norm": 1.640625, "learning_rate": 6.499046853197338e-06, "loss": 0.024, "step": 2340 }, { "epoch": 29.420970266040687, "grad_norm": 1.4765625, "learning_rate": 6.46607647427185e-06, "loss": 0.024, "step": 2350 }, { "epoch": 29.546165884194053, "grad_norm": 0.65234375, "learning_rate": 6.4330362026409506e-06, "loss": 0.0262, "step": 2360 }, { "epoch": 29.671361502347416, "grad_norm": 1.4140625, "learning_rate": 6.3999276134434595e-06, "loss": 0.0252, "step": 2370 }, { "epoch": 29.796557120500783, "grad_norm": 1.8203125, "learning_rate": 6.366752285075125e-06, "loss": 0.0224, "step": 2380 }, { "epoch": 29.921752738654146, "grad_norm": 0.515625, "learning_rate": 6.33351179911337e-06, "loss": 0.0204, "step": 2390 }, { "epoch": 30.046948356807512, "grad_norm": 0.70703125, "learning_rate": 6.300207740241895e-06, "loss": 0.0195, "step": 2400 }, { "epoch": 30.172143974960875, "grad_norm": 0.796875, "learning_rate": 6.266841696175132e-06, "loss": 0.0164, "step": 2410 }, { "epoch": 30.29733959311424, "grad_norm": 1.2109375, "learning_rate": 6.233415257582551e-06, "loss": 0.0167, "step": 2420 }, { "epoch": 30.422535211267604, "grad_norm": 0.87890625, "learning_rate": 6.19993001801283e-06, "loss": 0.0149, "step": 2430 }, { "epoch": 30.54773082942097, "grad_norm": 1.1015625, "learning_rate": 6.166387573817881e-06, "loss": 0.0178, "step": 2440 }, { "epoch": 30.672926447574334, "grad_norm": 0.66015625, "learning_rate": 6.132789524076751e-06, "loss": 0.0181, "step": 2450 }, { "epoch": 30.7981220657277, "grad_norm": 0.6484375, "learning_rate": 6.0991374705193866e-06, "loss": 0.0147, "step": 2460 }, { "epoch": 30.923317683881063, "grad_norm": 0.96875, "learning_rate": 6.065433017450276e-06, "loss": 0.0182, "step": 2470 }, { "epoch": 31.04851330203443, "grad_norm": 0.68359375, "learning_rate": 6.031677771671962e-06, "loss": 0.0149, "step": 2480 }, { "epoch": 31.173708920187792, "grad_norm": 0.23046875, "learning_rate": 5.997873342408446e-06, "loss": 0.0135, "step": 2490 }, { "epoch": 31.29890453834116, "grad_norm": 0.25390625, "learning_rate": 5.964021341228468e-06, "loss": 0.0138, "step": 2500 }, { "epoch": 31.42410015649452, "grad_norm": 1.8203125, "learning_rate": 5.930123381968677e-06, "loss": 0.0138, "step": 2510 }, { "epoch": 31.549295774647888, "grad_norm": 0.453125, "learning_rate": 5.8961810806567e-06, "loss": 0.0158, "step": 2520 }, { "epoch": 31.67449139280125, "grad_norm": 0.58984375, "learning_rate": 5.862196055434089e-06, "loss": 0.0116, "step": 2530 }, { "epoch": 31.799687010954617, "grad_norm": 0.7265625, "learning_rate": 5.828169926479191e-06, "loss": 0.0124, "step": 2540 }, { "epoch": 31.92488262910798, "grad_norm": 0.498046875, "learning_rate": 5.794104315929904e-06, "loss": 0.0144, "step": 2550 }, { "epoch": 32.05007824726135, "grad_norm": 0.25, "learning_rate": 5.760000847806337e-06, "loss": 0.0127, "step": 2560 }, { "epoch": 32.17527386541471, "grad_norm": 0.12060546875, "learning_rate": 5.725861147933403e-06, "loss": 0.0116, "step": 2570 }, { "epoch": 32.30046948356807, "grad_norm": 0.279296875, "learning_rate": 5.6916868438632976e-06, "loss": 0.0133, "step": 2580 }, { "epoch": 32.42566510172144, "grad_norm": 0.435546875, "learning_rate": 5.657479564797914e-06, "loss": 0.0091, "step": 2590 }, { "epoch": 32.550860719874805, "grad_norm": 0.1123046875, "learning_rate": 5.623240941511173e-06, "loss": 0.01, "step": 2600 }, { "epoch": 32.67605633802817, "grad_norm": 0.1806640625, "learning_rate": 5.588972606271276e-06, "loss": 0.0091, "step": 2610 }, { "epoch": 32.80125195618153, "grad_norm": 0.400390625, "learning_rate": 5.554676192762891e-06, "loss": 0.0111, "step": 2620 }, { "epoch": 32.9264475743349, "grad_norm": 0.5859375, "learning_rate": 5.520353336009274e-06, "loss": 0.0102, "step": 2630 }, { "epoch": 33.051643192488264, "grad_norm": 0.0849609375, "learning_rate": 5.48600567229431e-06, "loss": 0.0084, "step": 2640 }, { "epoch": 33.17683881064163, "grad_norm": 0.18359375, "learning_rate": 5.451634839084523e-06, "loss": 0.009, "step": 2650 }, { "epoch": 33.30203442879499, "grad_norm": 0.349609375, "learning_rate": 5.417242474950999e-06, "loss": 0.0083, "step": 2660 }, { "epoch": 33.42723004694836, "grad_norm": 0.34765625, "learning_rate": 5.382830219491271e-06, "loss": 0.0091, "step": 2670 }, { "epoch": 33.55242566510172, "grad_norm": 0.447265625, "learning_rate": 5.348399713251163e-06, "loss": 0.0115, "step": 2680 }, { "epoch": 33.677621283255085, "grad_norm": 0.275390625, "learning_rate": 5.3139525976465675e-06, "loss": 0.0089, "step": 2690 }, { "epoch": 33.80281690140845, "grad_norm": 0.263671875, "learning_rate": 5.279490514885207e-06, "loss": 0.0075, "step": 2700 }, { "epoch": 33.92801251956182, "grad_norm": 0.236328125, "learning_rate": 5.245015107888335e-06, "loss": 0.0095, "step": 2710 }, { "epoch": 34.05320813771518, "grad_norm": 0.23828125, "learning_rate": 5.210528020212412e-06, "loss": 0.0081, "step": 2720 }, { "epoch": 34.178403755868544, "grad_norm": 0.310546875, "learning_rate": 5.176030895970761e-06, "loss": 0.007, "step": 2730 }, { "epoch": 34.30359937402191, "grad_norm": 0.087890625, "learning_rate": 5.141525379755178e-06, "loss": 0.0053, "step": 2740 }, { "epoch": 34.42879499217528, "grad_norm": 0.09326171875, "learning_rate": 5.10701311655753e-06, "loss": 0.0065, "step": 2750 }, { "epoch": 34.55399061032864, "grad_norm": 0.06689453125, "learning_rate": 5.072495751691338e-06, "loss": 0.0072, "step": 2760 }, { "epoch": 34.679186228482, "grad_norm": 0.10546875, "learning_rate": 5.037974930713338e-06, "loss": 0.0058, "step": 2770 }, { "epoch": 34.804381846635366, "grad_norm": 0.0859375, "learning_rate": 5.003452299345024e-06, "loss": 0.0055, "step": 2780 }, { "epoch": 34.929577464788736, "grad_norm": 0.24609375, "learning_rate": 4.968929503394206e-06, "loss": 0.0061, "step": 2790 }, { "epoch": 35.0547730829421, "grad_norm": 0.056396484375, "learning_rate": 4.934408188676531e-06, "loss": 0.0062, "step": 2800 }, { "epoch": 35.17996870109546, "grad_norm": 0.1435546875, "learning_rate": 4.8998900009370366e-06, "loss": 0.0049, "step": 2810 }, { "epoch": 35.305164319248824, "grad_norm": 0.06298828125, "learning_rate": 4.865376585771687e-06, "loss": 0.0043, "step": 2820 }, { "epoch": 35.430359937402194, "grad_norm": 0.056640625, "learning_rate": 4.830869588548918e-06, "loss": 0.006, "step": 2830 }, { "epoch": 35.55555555555556, "grad_norm": 0.048828125, "learning_rate": 4.796370654331205e-06, "loss": 0.0055, "step": 2840 }, { "epoch": 35.68075117370892, "grad_norm": 0.171875, "learning_rate": 4.7618814277966325e-06, "loss": 0.0052, "step": 2850 }, { "epoch": 35.80594679186228, "grad_norm": 0.045166015625, "learning_rate": 4.727403553160484e-06, "loss": 0.0057, "step": 2860 }, { "epoch": 35.93114241001565, "grad_norm": 0.048583984375, "learning_rate": 4.692938674096867e-06, "loss": 0.005, "step": 2870 }, { "epoch": 36.056338028169016, "grad_norm": 0.03857421875, "learning_rate": 4.658488433660341e-06, "loss": 0.0047, "step": 2880 }, { "epoch": 36.18153364632238, "grad_norm": 0.357421875, "learning_rate": 4.624054474207597e-06, "loss": 0.0048, "step": 2890 }, { "epoch": 36.30672926447574, "grad_norm": 0.185546875, "learning_rate": 4.589638437319157e-06, "loss": 0.0054, "step": 2900 }, { "epoch": 36.431924882629104, "grad_norm": 0.11962890625, "learning_rate": 4.555241963721118e-06, "loss": 0.0041, "step": 2910 }, { "epoch": 36.557120500782474, "grad_norm": 0.2158203125, "learning_rate": 4.5208666932069255e-06, "loss": 0.0043, "step": 2920 }, { "epoch": 36.68231611893584, "grad_norm": 0.154296875, "learning_rate": 4.486514264559206e-06, "loss": 0.0045, "step": 2930 }, { "epoch": 36.8075117370892, "grad_norm": 0.0400390625, "learning_rate": 4.452186315471641e-06, "loss": 0.0039, "step": 2940 }, { "epoch": 36.93270735524256, "grad_norm": 0.396484375, "learning_rate": 4.417884482470887e-06, "loss": 0.0042, "step": 2950 }, { "epoch": 37.05790297339593, "grad_norm": 0.189453125, "learning_rate": 4.383610400838561e-06, "loss": 0.0039, "step": 2960 }, { "epoch": 37.183098591549296, "grad_norm": 0.034423828125, "learning_rate": 4.349365704533285e-06, "loss": 0.0038, "step": 2970 }, { "epoch": 37.30829420970266, "grad_norm": 0.03173828125, "learning_rate": 4.31515202611278e-06, "loss": 0.0037, "step": 2980 }, { "epoch": 37.43348982785602, "grad_norm": 0.033447265625, "learning_rate": 4.2809709966560435e-06, "loss": 0.0031, "step": 2990 }, { "epoch": 37.55868544600939, "grad_norm": 0.2275390625, "learning_rate": 4.246824245685591e-06, "loss": 0.0037, "step": 3000 }, { "epoch": 37.55868544600939, "eval_loss": 2.755915641784668, "eval_runtime": 3.2484, "eval_samples_per_second": 22.165, "eval_steps_per_second": 22.165, "step": 3000 }, { "epoch": 37.683881064162755, "grad_norm": 0.031005859375, "learning_rate": 4.2127134010897695e-06, "loss": 0.003, "step": 3010 }, { "epoch": 37.80907668231612, "grad_norm": 0.043701171875, "learning_rate": 4.178640089045147e-06, "loss": 0.0031, "step": 3020 }, { "epoch": 37.93427230046948, "grad_norm": 0.1708984375, "learning_rate": 4.144605933938993e-06, "loss": 0.0032, "step": 3030 }, { "epoch": 38.05946791862285, "grad_norm": 0.02783203125, "learning_rate": 4.1106125582918385e-06, "loss": 0.0032, "step": 3040 }, { "epoch": 38.18466353677621, "grad_norm": 0.03125, "learning_rate": 4.07666158268012e-06, "loss": 0.003, "step": 3050 }, { "epoch": 38.309859154929576, "grad_norm": 0.037109375, "learning_rate": 4.042754625658929e-06, "loss": 0.0028, "step": 3060 }, { "epoch": 38.43505477308294, "grad_norm": 0.0303955078125, "learning_rate": 4.008893303684837e-06, "loss": 0.0035, "step": 3070 }, { "epoch": 38.56025039123631, "grad_norm": 0.05908203125, "learning_rate": 3.975079231038848e-06, "loss": 0.0025, "step": 3080 }, { "epoch": 38.68544600938967, "grad_norm": 0.050048828125, "learning_rate": 3.941314019749438e-06, "loss": 0.0036, "step": 3090 }, { "epoch": 38.810641627543035, "grad_norm": 0.033203125, "learning_rate": 3.9075992795156916e-06, "loss": 0.003, "step": 3100 }, { "epoch": 38.9358372456964, "grad_norm": 0.04931640625, "learning_rate": 3.873936617630578e-06, "loss": 0.0028, "step": 3110 }, { "epoch": 39.06103286384977, "grad_norm": 0.03369140625, "learning_rate": 3.840327638904321e-06, "loss": 0.0026, "step": 3120 }, { "epoch": 39.18622848200313, "grad_norm": 0.328125, "learning_rate": 3.8067739455878844e-06, "loss": 0.0026, "step": 3130 }, { "epoch": 39.31142410015649, "grad_norm": 0.0341796875, "learning_rate": 3.7732771372965987e-06, "loss": 0.0024, "step": 3140 }, { "epoch": 39.436619718309856, "grad_norm": 0.0458984375, "learning_rate": 3.7398388109338984e-06, "loss": 0.0026, "step": 3150 }, { "epoch": 39.561815336463226, "grad_norm": 0.02783203125, "learning_rate": 3.7064605606151866e-06, "loss": 0.0026, "step": 3160 }, { "epoch": 39.68701095461659, "grad_norm": 0.0703125, "learning_rate": 3.6731439775918467e-06, "loss": 0.0024, "step": 3170 }, { "epoch": 39.81220657276995, "grad_norm": 0.052978515625, "learning_rate": 3.639890650175379e-06, "loss": 0.0027, "step": 3180 }, { "epoch": 39.937402190923315, "grad_norm": 0.04296875, "learning_rate": 3.6067021636616793e-06, "loss": 0.002, "step": 3190 }, { "epoch": 40.062597809076685, "grad_norm": 0.1357421875, "learning_rate": 3.5735801002554615e-06, "loss": 0.002, "step": 3200 }, { "epoch": 40.18779342723005, "grad_norm": 0.2412109375, "learning_rate": 3.540526038994834e-06, "loss": 0.0018, "step": 3210 }, { "epoch": 40.31298904538341, "grad_norm": 1.7890625, "learning_rate": 3.5075415556760157e-06, "loss": 0.0025, "step": 3220 }, { "epoch": 40.438184663536774, "grad_norm": 0.69921875, "learning_rate": 3.4746282227782164e-06, "loss": 0.0037, "step": 3230 }, { "epoch": 40.563380281690144, "grad_norm": 3.21875, "learning_rate": 3.4417876093886705e-06, "loss": 0.0042, "step": 3240 }, { "epoch": 40.68857589984351, "grad_norm": 1.9375, "learning_rate": 3.409021281127835e-06, "loss": 0.0085, "step": 3250 }, { "epoch": 40.81377151799687, "grad_norm": 5.21875, "learning_rate": 3.3763308000747453e-06, "loss": 0.0072, "step": 3260 }, { "epoch": 40.93896713615023, "grad_norm": 9.6875, "learning_rate": 3.3437177246925547e-06, "loss": 0.0108, "step": 3270 }, { "epoch": 41.0641627543036, "grad_norm": 11.125, "learning_rate": 3.31118360975423e-06, "loss": 0.0087, "step": 3280 }, { "epoch": 41.189358372456965, "grad_norm": 15.875, "learning_rate": 3.278730006268432e-06, "loss": 0.0112, "step": 3290 }, { "epoch": 41.31455399061033, "grad_norm": 0.46484375, "learning_rate": 3.246358461405579e-06, "loss": 0.0105, "step": 3300 }, { "epoch": 41.43974960876369, "grad_norm": 1.9609375, "learning_rate": 3.2140705184240783e-06, "loss": 0.0098, "step": 3310 }, { "epoch": 41.56494522691706, "grad_norm": 3.125, "learning_rate": 3.181867716596765e-06, "loss": 0.0071, "step": 3320 }, { "epoch": 41.690140845070424, "grad_norm": 3.71875, "learning_rate": 3.1497515911375113e-06, "loss": 0.0124, "step": 3330 }, { "epoch": 41.81533646322379, "grad_norm": 0.84375, "learning_rate": 3.11772367312804e-06, "loss": 0.0075, "step": 3340 }, { "epoch": 41.94053208137715, "grad_norm": 10.6875, "learning_rate": 3.085785489444936e-06, "loss": 0.0093, "step": 3350 }, { "epoch": 42.06572769953052, "grad_norm": 0.90625, "learning_rate": 3.05393856268685e-06, "loss": 0.0073, "step": 3360 }, { "epoch": 42.19092331768388, "grad_norm": 2.8125, "learning_rate": 3.0221844111019166e-06, "loss": 0.0039, "step": 3370 }, { "epoch": 42.316118935837245, "grad_norm": 1.1015625, "learning_rate": 2.99052454851537e-06, "loss": 0.0065, "step": 3380 }, { "epoch": 42.44131455399061, "grad_norm": 1.140625, "learning_rate": 2.9589604842573762e-06, "loss": 0.0064, "step": 3390 }, { "epoch": 42.56651017214398, "grad_norm": 2.140625, "learning_rate": 2.927493723091078e-06, "loss": 0.0058, "step": 3400 }, { "epoch": 42.69170579029734, "grad_norm": 0.9140625, "learning_rate": 2.8961257651408627e-06, "loss": 0.0094, "step": 3410 }, { "epoch": 42.816901408450704, "grad_norm": 5.5, "learning_rate": 2.8648581058208387e-06, "loss": 0.0053, "step": 3420 }, { "epoch": 42.94209702660407, "grad_norm": 2.84375, "learning_rate": 2.8336922357635464e-06, "loss": 0.0084, "step": 3430 }, { "epoch": 43.06729264475744, "grad_norm": 1.0625, "learning_rate": 2.802629640748898e-06, "loss": 0.0044, "step": 3440 }, { "epoch": 43.1924882629108, "grad_norm": 2.078125, "learning_rate": 2.7716718016333432e-06, "loss": 0.0069, "step": 3450 }, { "epoch": 43.31768388106416, "grad_norm": 3.09375, "learning_rate": 2.7408201942792755e-06, "loss": 0.0061, "step": 3460 }, { "epoch": 43.442879499217526, "grad_norm": 2.109375, "learning_rate": 2.7100762894846633e-06, "loss": 0.0065, "step": 3470 }, { "epoch": 43.568075117370896, "grad_norm": 0.80859375, "learning_rate": 2.6794415529129402e-06, "loss": 0.0052, "step": 3480 }, { "epoch": 43.69327073552426, "grad_norm": 0.34375, "learning_rate": 2.6489174450231353e-06, "loss": 0.0056, "step": 3490 }, { "epoch": 43.81846635367762, "grad_norm": 0.6328125, "learning_rate": 2.618505421000237e-06, "loss": 0.005, "step": 3500 }, { "epoch": 43.943661971830984, "grad_norm": 0.62109375, "learning_rate": 2.588206930685827e-06, "loss": 0.0069, "step": 3510 }, { "epoch": 44.06885758998435, "grad_norm": 1.5234375, "learning_rate": 2.5580234185089647e-06, "loss": 0.0043, "step": 3520 }, { "epoch": 44.19405320813772, "grad_norm": 1.3125, "learning_rate": 2.5279563234173177e-06, "loss": 0.0054, "step": 3530 }, { "epoch": 44.31924882629108, "grad_norm": 0.7265625, "learning_rate": 2.4980070788085655e-06, "loss": 0.0043, "step": 3540 }, { "epoch": 44.44444444444444, "grad_norm": 1.25, "learning_rate": 2.4681771124620716e-06, "loss": 0.005, "step": 3550 }, { "epoch": 44.569640062597806, "grad_norm": 2.84375, "learning_rate": 2.4384678464708077e-06, "loss": 0.0042, "step": 3560 }, { "epoch": 44.694835680751176, "grad_norm": 2.5, "learning_rate": 2.4088806971735584e-06, "loss": 0.0056, "step": 3570 }, { "epoch": 44.82003129890454, "grad_norm": 1.6875, "learning_rate": 2.3794170750874094e-06, "loss": 0.0052, "step": 3580 }, { "epoch": 44.9452269170579, "grad_norm": 2.171875, "learning_rate": 2.3500783848404906e-06, "loss": 0.0055, "step": 3590 }, { "epoch": 45.070422535211264, "grad_norm": 0.412109375, "learning_rate": 2.320866025105016e-06, "loss": 0.0041, "step": 3600 }, { "epoch": 45.195618153364634, "grad_norm": 0.4375, "learning_rate": 2.2917813885306196e-06, "loss": 0.0035, "step": 3610 }, { "epoch": 45.320813771518, "grad_norm": 0.1181640625, "learning_rate": 2.262825861677938e-06, "loss": 0.0026, "step": 3620 }, { "epoch": 45.44600938967136, "grad_norm": 0.84765625, "learning_rate": 2.234000824952525e-06, "loss": 0.0046, "step": 3630 }, { "epoch": 45.57120500782472, "grad_norm": 1.40625, "learning_rate": 2.2053076525390434e-06, "loss": 0.004, "step": 3640 }, { "epoch": 45.69640062597809, "grad_norm": 0.1923828125, "learning_rate": 2.1767477123357424e-06, "loss": 0.0041, "step": 3650 }, { "epoch": 45.821596244131456, "grad_norm": 2.875, "learning_rate": 2.1483223658892545e-06, "loss": 0.0041, "step": 3660 }, { "epoch": 45.94679186228482, "grad_norm": 0.55078125, "learning_rate": 2.120032968329687e-06, "loss": 0.004, "step": 3670 }, { "epoch": 46.07198748043818, "grad_norm": 1.1484375, "learning_rate": 2.091880868306011e-06, "loss": 0.003, "step": 3680 }, { "epoch": 46.19718309859155, "grad_norm": 0.0869140625, "learning_rate": 2.0638674079217687e-06, "loss": 0.0023, "step": 3690 }, { "epoch": 46.322378716744915, "grad_norm": 0.9921875, "learning_rate": 2.0359939226711002e-06, "loss": 0.0025, "step": 3700 }, { "epoch": 46.44757433489828, "grad_norm": 0.08251953125, "learning_rate": 2.008261741375063e-06, "loss": 0.0026, "step": 3710 }, { "epoch": 46.57276995305164, "grad_norm": 0.0810546875, "learning_rate": 1.9806721861182907e-06, "loss": 0.0026, "step": 3720 }, { "epoch": 46.69796557120501, "grad_norm": 0.095703125, "learning_rate": 1.95322657218596e-06, "loss": 0.0037, "step": 3730 }, { "epoch": 46.82316118935837, "grad_norm": 1.1328125, "learning_rate": 1.9259262080010938e-06, "loss": 0.0028, "step": 3740 }, { "epoch": 46.948356807511736, "grad_norm": 0.08544921875, "learning_rate": 1.8987723950621805e-06, "loss": 0.0024, "step": 3750 }, { "epoch": 47.0735524256651, "grad_norm": 0.06640625, "learning_rate": 1.8717664278811198e-06, "loss": 0.0023, "step": 3760 }, { "epoch": 47.19874804381847, "grad_norm": 0.5703125, "learning_rate": 1.844909593921525e-06, "loss": 0.0021, "step": 3770 }, { "epoch": 47.32394366197183, "grad_norm": 0.271484375, "learning_rate": 1.8182031735373302e-06, "loss": 0.002, "step": 3780 }, { "epoch": 47.449139280125195, "grad_norm": 0.07373046875, "learning_rate": 1.7916484399117579e-06, "loss": 0.0038, "step": 3790 }, { "epoch": 47.57433489827856, "grad_norm": 0.08056640625, "learning_rate": 1.7652466589966271e-06, "loss": 0.004, "step": 3800 }, { "epoch": 47.69953051643193, "grad_norm": 0.439453125, "learning_rate": 1.738999089451991e-06, "loss": 0.0032, "step": 3810 }, { "epoch": 47.82472613458529, "grad_norm": 0.0693359375, "learning_rate": 1.7129069825861388e-06, "loss": 0.0023, "step": 3820 }, { "epoch": 47.94992175273865, "grad_norm": 0.1552734375, "learning_rate": 1.6869715822959437e-06, "loss": 0.0021, "step": 3830 }, { "epoch": 48.075117370892016, "grad_norm": 0.07568359375, "learning_rate": 1.6611941250075558e-06, "loss": 0.002, "step": 3840 }, { "epoch": 48.200312989045386, "grad_norm": 0.0751953125, "learning_rate": 1.6355758396174603e-06, "loss": 0.0023, "step": 3850 }, { "epoch": 48.32550860719875, "grad_norm": 0.83203125, "learning_rate": 1.610117947433897e-06, "loss": 0.002, "step": 3860 }, { "epoch": 48.45070422535211, "grad_norm": 0.044189453125, "learning_rate": 1.5848216621186268e-06, "loss": 0.0026, "step": 3870 }, { "epoch": 48.575899843505475, "grad_norm": 0.04638671875, "learning_rate": 1.55968818962908e-06, "loss": 0.0018, "step": 3880 }, { "epoch": 48.701095461658845, "grad_norm": 0.06640625, "learning_rate": 1.5347187281608622e-06, "loss": 0.0019, "step": 3890 }, { "epoch": 48.82629107981221, "grad_norm": 0.04833984375, "learning_rate": 1.5099144680906348e-06, "loss": 0.0022, "step": 3900 }, { "epoch": 48.95148669796557, "grad_norm": 0.045166015625, "learning_rate": 1.4852765919193584e-06, "loss": 0.0019, "step": 3910 }, { "epoch": 49.076682316118934, "grad_norm": 0.043701171875, "learning_rate": 1.460806274215924e-06, "loss": 0.0019, "step": 3920 }, { "epoch": 49.201877934272304, "grad_norm": 0.041748046875, "learning_rate": 1.4365046815611622e-06, "loss": 0.0017, "step": 3930 }, { "epoch": 49.32707355242567, "grad_norm": 0.052734375, "learning_rate": 1.4123729724922198e-06, "loss": 0.0017, "step": 3940 }, { "epoch": 49.45226917057903, "grad_norm": 0.03955078125, "learning_rate": 1.3884122974473307e-06, "loss": 0.0019, "step": 3950 }, { "epoch": 49.57746478873239, "grad_norm": 0.048828125, "learning_rate": 1.3646237987109772e-06, "loss": 0.0018, "step": 3960 }, { "epoch": 49.70266040688576, "grad_norm": 0.048828125, "learning_rate": 1.3410086103594256e-06, "loss": 0.0018, "step": 3970 }, { "epoch": 49.827856025039125, "grad_norm": 0.03466796875, "learning_rate": 1.317567858206661e-06, "loss": 0.002, "step": 3980 }, { "epoch": 49.95305164319249, "grad_norm": 0.044189453125, "learning_rate": 1.2943026597507268e-06, "loss": 0.0017, "step": 3990 }, { "epoch": 50.07824726134585, "grad_norm": 0.0380859375, "learning_rate": 1.2712141241204352e-06, "loss": 0.0016, "step": 4000 }, { "epoch": 50.07824726134585, "eval_loss": 2.970012903213501, "eval_runtime": 3.2782, "eval_samples_per_second": 21.963, "eval_steps_per_second": 21.963, "step": 4000 }, { "epoch": 50.20344287949922, "grad_norm": 0.0296630859375, "learning_rate": 1.2483033520224996e-06, "loss": 0.0016, "step": 4010 }, { "epoch": 50.328638497652584, "grad_norm": 0.0302734375, "learning_rate": 1.225571435689062e-06, "loss": 0.0016, "step": 4020 }, { "epoch": 50.45383411580595, "grad_norm": 0.02880859375, "learning_rate": 1.2030194588256183e-06, "loss": 0.0016, "step": 4030 }, { "epoch": 50.57902973395931, "grad_norm": 0.0322265625, "learning_rate": 1.1806484965593546e-06, "loss": 0.0016, "step": 4040 }, { "epoch": 50.70422535211267, "grad_norm": 0.0284423828125, "learning_rate": 1.1584596153878923e-06, "loss": 0.0016, "step": 4050 }, { "epoch": 50.82942097026604, "grad_norm": 0.030029296875, "learning_rate": 1.1364538731284514e-06, "loss": 0.0021, "step": 4060 }, { "epoch": 50.954616588419405, "grad_norm": 0.0262451171875, "learning_rate": 1.1146323188674102e-06, "loss": 0.0015, "step": 4070 }, { "epoch": 51.07981220657277, "grad_norm": 0.024658203125, "learning_rate": 1.0929959929102968e-06, "loss": 0.0016, "step": 4080 }, { "epoch": 51.20500782472613, "grad_norm": 0.0286865234375, "learning_rate": 1.0715459267321998e-06, "loss": 0.0014, "step": 4090 }, { "epoch": 51.3302034428795, "grad_norm": 0.0255126953125, "learning_rate": 1.0502831429285842e-06, "loss": 0.0014, "step": 4100 }, { "epoch": 51.455399061032864, "grad_norm": 0.024658203125, "learning_rate": 1.0292086551665464e-06, "loss": 0.0015, "step": 4110 }, { "epoch": 51.58059467918623, "grad_norm": 0.0238037109375, "learning_rate": 1.0083234681364934e-06, "loss": 0.0015, "step": 4120 }, { "epoch": 51.70579029733959, "grad_norm": 0.0233154296875, "learning_rate": 9.87628577504236e-07, "loss": 0.0015, "step": 4130 }, { "epoch": 51.83098591549296, "grad_norm": 0.0250244140625, "learning_rate": 9.671249698635294e-07, "loss": 0.0014, "step": 4140 }, { "epoch": 51.95618153364632, "grad_norm": 0.0260009765625, "learning_rate": 9.468136226890384e-07, "loss": 0.0014, "step": 4150 }, { "epoch": 52.081377151799686, "grad_norm": 0.0172119140625, "learning_rate": 9.266955042897357e-07, "loss": 0.0014, "step": 4160 }, { "epoch": 52.20657276995305, "grad_norm": 0.017578125, "learning_rate": 9.067715737627391e-07, "loss": 0.0014, "step": 4170 }, { "epoch": 52.33176838810642, "grad_norm": 0.0181884765625, "learning_rate": 8.870427809475907e-07, "loss": 0.0014, "step": 4180 }, { "epoch": 52.45696400625978, "grad_norm": 0.0167236328125, "learning_rate": 8.675100663809766e-07, "loss": 0.0013, "step": 4190 }, { "epoch": 52.582159624413144, "grad_norm": 0.019287109375, "learning_rate": 8.481743612518795e-07, "loss": 0.0014, "step": 4200 }, { "epoch": 52.70735524256651, "grad_norm": 0.02001953125, "learning_rate": 8.290365873571954e-07, "loss": 0.0014, "step": 4210 }, { "epoch": 52.83255086071988, "grad_norm": 0.022705078125, "learning_rate": 8.100976570577856e-07, "loss": 0.0013, "step": 4220 }, { "epoch": 52.95774647887324, "grad_norm": 0.0206298828125, "learning_rate": 7.913584732349788e-07, "loss": 0.0013, "step": 4230 }, { "epoch": 53.0829420970266, "grad_norm": 0.0181884765625, "learning_rate": 7.728199292475297e-07, "loss": 0.0014, "step": 4240 }, { "epoch": 53.208137715179966, "grad_norm": 0.017333984375, "learning_rate": 7.544829088890326e-07, "loss": 0.0013, "step": 4250 }, { "epoch": 53.333333333333336, "grad_norm": 0.09130859375, "learning_rate": 7.363482863457821e-07, "loss": 0.0013, "step": 4260 }, { "epoch": 53.4585289514867, "grad_norm": 0.017333984375, "learning_rate": 7.184169261551005e-07, "loss": 0.0013, "step": 4270 }, { "epoch": 53.58372456964006, "grad_norm": 0.02978515625, "learning_rate": 7.006896831641257e-07, "loss": 0.0013, "step": 4280 }, { "epoch": 53.708920187793424, "grad_norm": 0.01611328125, "learning_rate": 6.831674024890533e-07, "loss": 0.0012, "step": 4290 }, { "epoch": 53.834115805946794, "grad_norm": 0.015869140625, "learning_rate": 6.658509194748463e-07, "loss": 0.0013, "step": 4300 }, { "epoch": 53.95931142410016, "grad_norm": 0.018310546875, "learning_rate": 6.487410596554178e-07, "loss": 0.0013, "step": 4310 }, { "epoch": 54.08450704225352, "grad_norm": 0.0184326171875, "learning_rate": 6.3183863871427e-07, "loss": 0.0013, "step": 4320 }, { "epoch": 54.20970266040688, "grad_norm": 0.01470947265625, "learning_rate": 6.15144462445606e-07, "loss": 0.0013, "step": 4330 }, { "epoch": 54.33489827856025, "grad_norm": 0.0147705078125, "learning_rate": 5.986593267159224e-07, "loss": 0.0013, "step": 4340 }, { "epoch": 54.460093896713616, "grad_norm": 0.01348876953125, "learning_rate": 5.823840174260603e-07, "loss": 0.0012, "step": 4350 }, { "epoch": 54.58528951486698, "grad_norm": 0.01611328125, "learning_rate": 5.663193104737413e-07, "loss": 0.0014, "step": 4360 }, { "epoch": 54.71048513302034, "grad_norm": 0.01409912109375, "learning_rate": 5.504659717165812e-07, "loss": 0.0012, "step": 4370 }, { "epoch": 54.83568075117371, "grad_norm": 0.01953125, "learning_rate": 5.348247569355736e-07, "loss": 0.0013, "step": 4380 }, { "epoch": 54.960876369327075, "grad_norm": 0.01806640625, "learning_rate": 5.193964117990625e-07, "loss": 0.0013, "step": 4390 }, { "epoch": 55.08607198748044, "grad_norm": 0.0400390625, "learning_rate": 5.041816718271925e-07, "loss": 0.0012, "step": 4400 }, { "epoch": 55.2112676056338, "grad_norm": 0.01409912109375, "learning_rate": 4.891812623568476e-07, "loss": 0.0012, "step": 4410 }, { "epoch": 55.33646322378717, "grad_norm": 0.0135498046875, "learning_rate": 4.743958985070662e-07, "loss": 0.0013, "step": 4420 }, { "epoch": 55.46165884194053, "grad_norm": 0.0142822265625, "learning_rate": 4.598262851449525e-07, "loss": 0.0013, "step": 4430 }, { "epoch": 55.586854460093896, "grad_norm": 0.01171875, "learning_rate": 4.454731168520754e-07, "loss": 0.0013, "step": 4440 }, { "epoch": 55.71205007824726, "grad_norm": 0.01202392578125, "learning_rate": 4.3133707789134895e-07, "loss": 0.0011, "step": 4450 }, { "epoch": 55.83724569640063, "grad_norm": 0.01226806640625, "learning_rate": 4.174188421744174e-07, "loss": 0.0013, "step": 4460 }, { "epoch": 55.96244131455399, "grad_norm": 0.01300048828125, "learning_rate": 4.0371907322952654e-07, "loss": 0.0012, "step": 4470 }, { "epoch": 56.087636932707355, "grad_norm": 0.01226806640625, "learning_rate": 3.902384241698876e-07, "loss": 0.0012, "step": 4480 }, { "epoch": 56.21283255086072, "grad_norm": 0.01202392578125, "learning_rate": 3.769775376625423e-07, "loss": 0.0012, "step": 4490 }, { "epoch": 56.33802816901409, "grad_norm": 0.01318359375, "learning_rate": 3.639370458977304e-07, "loss": 0.0013, "step": 4500 }, { "epoch": 56.46322378716745, "grad_norm": 0.01171875, "learning_rate": 3.511175705587433e-07, "loss": 0.0012, "step": 4510 }, { "epoch": 56.58841940532081, "grad_norm": 0.01385498046875, "learning_rate": 3.3851972279228983e-07, "loss": 0.0012, "step": 4520 }, { "epoch": 56.713615023474176, "grad_norm": 0.0177001953125, "learning_rate": 3.261441031793638e-07, "loss": 0.0013, "step": 4530 }, { "epoch": 56.838810641627546, "grad_norm": 0.01214599609375, "learning_rate": 3.139913017066054e-07, "loss": 0.0012, "step": 4540 }, { "epoch": 56.96400625978091, "grad_norm": 0.012939453125, "learning_rate": 3.0206189773818005e-07, "loss": 0.0013, "step": 4550 }, { "epoch": 57.08920187793427, "grad_norm": 0.01507568359375, "learning_rate": 2.903564599881586e-07, "loss": 0.0013, "step": 4560 }, { "epoch": 57.214397496087635, "grad_norm": 0.0189208984375, "learning_rate": 2.788755464934001e-07, "loss": 0.0012, "step": 4570 }, { "epoch": 57.339593114241005, "grad_norm": 0.01275634765625, "learning_rate": 2.676197045869511e-07, "loss": 0.0012, "step": 4580 }, { "epoch": 57.46478873239437, "grad_norm": 0.01129150390625, "learning_rate": 2.565894708719552e-07, "loss": 0.0013, "step": 4590 }, { "epoch": 57.58998435054773, "grad_norm": 0.01361083984375, "learning_rate": 2.457853711960673e-07, "loss": 0.0012, "step": 4600 }, { "epoch": 57.715179968701094, "grad_norm": 0.0135498046875, "learning_rate": 2.3520792062638576e-07, "loss": 0.0012, "step": 4610 }, { "epoch": 57.840375586854464, "grad_norm": 0.01385498046875, "learning_rate": 2.248576234248967e-07, "loss": 0.0012, "step": 4620 }, { "epoch": 57.96557120500783, "grad_norm": 0.0155029296875, "learning_rate": 2.1473497302443857e-07, "loss": 0.0012, "step": 4630 }, { "epoch": 58.09076682316119, "grad_norm": 0.01300048828125, "learning_rate": 2.0484045200517222e-07, "loss": 0.0013, "step": 4640 }, { "epoch": 58.21596244131455, "grad_norm": 0.0137939453125, "learning_rate": 1.9517453207157865e-07, "loss": 0.0012, "step": 4650 }, { "epoch": 58.341158059467915, "grad_norm": 0.0125732421875, "learning_rate": 1.8573767402997155e-07, "loss": 0.0012, "step": 4660 }, { "epoch": 58.466353677621285, "grad_norm": 0.01531982421875, "learning_rate": 1.7653032776652702e-07, "loss": 0.0012, "step": 4670 }, { "epoch": 58.59154929577465, "grad_norm": 0.014404296875, "learning_rate": 1.675529322258368e-07, "loss": 0.0013, "step": 4680 }, { "epoch": 58.71674491392801, "grad_norm": 0.0142822265625, "learning_rate": 1.5880591538998292e-07, "loss": 0.0012, "step": 4690 }, { "epoch": 58.841940532081374, "grad_norm": 0.0128173828125, "learning_rate": 1.50289694258135e-07, "loss": 0.0011, "step": 4700 }, { "epoch": 58.967136150234744, "grad_norm": 0.01123046875, "learning_rate": 1.420046748266668e-07, "loss": 0.0012, "step": 4710 }, { "epoch": 59.09233176838811, "grad_norm": 0.0145263671875, "learning_rate": 1.3395125206980774e-07, "loss": 0.0012, "step": 4720 }, { "epoch": 59.21752738654147, "grad_norm": 0.0126953125, "learning_rate": 1.261298099208047e-07, "loss": 0.0012, "step": 4730 }, { "epoch": 59.34272300469483, "grad_norm": 0.048828125, "learning_rate": 1.185407212536277e-07, "loss": 0.0013, "step": 4740 }, { "epoch": 59.4679186228482, "grad_norm": 0.0198974609375, "learning_rate": 1.1118434786518473e-07, "loss": 0.0013, "step": 4750 }, { "epoch": 59.593114241001565, "grad_norm": 0.01470947265625, "learning_rate": 1.0406104045808274e-07, "loss": 0.0012, "step": 4760 }, { "epoch": 59.71830985915493, "grad_norm": 0.015869140625, "learning_rate": 9.717113862389993e-08, "loss": 0.0012, "step": 4770 }, { "epoch": 59.84350547730829, "grad_norm": 0.024169921875, "learning_rate": 9.051497082700256e-08, "loss": 0.0012, "step": 4780 }, { "epoch": 59.96870109546166, "grad_norm": 0.0205078125, "learning_rate": 8.40928543888836e-08, "loss": 0.0012, "step": 4790 }, { "epoch": 60.093896713615024, "grad_norm": 0.050048828125, "learning_rate": 7.790509547303427e-08, "loss": 0.0011, "step": 4800 }, { "epoch": 60.21909233176839, "grad_norm": 0.06787109375, "learning_rate": 7.195198907034906e-08, "loss": 0.0013, "step": 4810 }, { "epoch": 60.34428794992175, "grad_norm": 0.08154296875, "learning_rate": 6.623381898506365e-08, "loss": 0.0012, "step": 4820 }, { "epoch": 60.46948356807512, "grad_norm": 0.1455078125, "learning_rate": 6.075085782122237e-08, "loss": 0.0012, "step": 4830 }, { "epoch": 60.59467918622848, "grad_norm": 0.053466796875, "learning_rate": 5.550336696968472e-08, "loss": 0.0012, "step": 4840 }, { "epoch": 60.719874804381845, "grad_norm": 0.1455078125, "learning_rate": 5.0491596595663714e-08, "loss": 0.0014, "step": 4850 }, { "epoch": 60.84507042253521, "grad_norm": 0.51171875, "learning_rate": 4.571578562679757e-08, "loss": 0.0012, "step": 4860 }, { "epoch": 60.97026604068858, "grad_norm": 0.138671875, "learning_rate": 4.1176161741760535e-08, "loss": 0.0012, "step": 4870 }, { "epoch": 61.09546165884194, "grad_norm": 0.142578125, "learning_rate": 3.687294135941044e-08, "loss": 0.0013, "step": 4880 }, { "epoch": 61.220657276995304, "grad_norm": 0.1298828125, "learning_rate": 3.280632962846919e-08, "loss": 0.0013, "step": 4890 }, { "epoch": 61.34585289514867, "grad_norm": 0.0322265625, "learning_rate": 2.8976520417742794e-08, "loss": 0.0012, "step": 4900 }, { "epoch": 61.47104851330204, "grad_norm": 0.043212890625, "learning_rate": 2.5383696306878756e-08, "loss": 0.0012, "step": 4910 }, { "epoch": 61.5962441314554, "grad_norm": 0.042724609375, "learning_rate": 2.202802857766362e-08, "loss": 0.0012, "step": 4920 }, { "epoch": 61.72143974960876, "grad_norm": 0.04052734375, "learning_rate": 1.8909677205856682e-08, "loss": 0.0012, "step": 4930 }, { "epoch": 61.846635367762126, "grad_norm": 0.039794921875, "learning_rate": 1.6028790853561126e-08, "loss": 0.0013, "step": 4940 }, { "epoch": 61.971830985915496, "grad_norm": 0.03564453125, "learning_rate": 1.3385506862140795e-08, "loss": 0.0013, "step": 4950 }, { "epoch": 62.09702660406886, "grad_norm": 0.040771484375, "learning_rate": 1.0979951245669307e-08, "loss": 0.0013, "step": 4960 }, { "epoch": 62.22222222222222, "grad_norm": 0.05419921875, "learning_rate": 8.812238684923758e-09, "loss": 0.0012, "step": 4970 }, { "epoch": 62.347417840375584, "grad_norm": 0.0400390625, "learning_rate": 6.882472521919093e-09, "loss": 0.0012, "step": 4980 }, { "epoch": 62.472613458528954, "grad_norm": 0.06982421875, "learning_rate": 5.190744754978716e-09, "loss": 0.0012, "step": 4990 }, { "epoch": 62.59780907668232, "grad_norm": 0.033447265625, "learning_rate": 3.737136034349109e-09, "loss": 0.0013, "step": 5000 }, { "epoch": 62.59780907668232, "eval_loss": 3.0162200927734375, "eval_runtime": 3.1935, "eval_samples_per_second": 22.546, "eval_steps_per_second": 22.546, "step": 5000 }, { "epoch": 62.72300469483568, "grad_norm": 0.03369140625, "learning_rate": 2.5217156583579037e-09, "loss": 0.0012, "step": 5010 }, { "epoch": 62.84820031298904, "grad_norm": 0.041259765625, "learning_rate": 1.5445415701065281e-09, "loss": 0.0013, "step": 5020 }, { "epoch": 62.97339593114241, "grad_norm": 0.03564453125, "learning_rate": 8.056603547090813e-10, "loss": 0.0012, "step": 5030 }, { "epoch": 63.098591549295776, "grad_norm": 0.02685546875, "learning_rate": 3.0510723707299907e-10, "loss": 0.0013, "step": 5040 }, { "epoch": 63.22378716744914, "grad_norm": 0.060546875, "learning_rate": 4.290608021706444e-11, "loss": 0.0012, "step": 5050 }, { "epoch": 63.298904538341155, "step": 5056, "total_flos": 5.606358848176128e+17, "train_loss": 0.27379138119090674, "train_runtime": 5351.3398, "train_samples_per_second": 7.642, "train_steps_per_second": 0.945 } ], "logging_steps": 10, "max_steps": 5056, "num_input_tokens_seen": 0, "num_train_epochs": 64, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.606358848176128e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }