diff --git "a/checkpoint-3506/trainer_state.json" "b/checkpoint-3506/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3506/trainer_state.json" @@ -0,0 +1,24965 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6000342289919562, + "eval_steps": 877, + "global_step": 3506, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017114495978093444, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 9.5502, + "step": 1 + }, + { + "epoch": 0.0003422899195618689, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 17.5546, + "step": 2 + }, + { + "epoch": 0.0005134348793428033, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 25.9988, + "step": 3 + }, + { + "epoch": 0.0006845798391237378, + "grad_norm": 60.18782043457031, + "learning_rate": 5.704506560182544e-09, + "loss": 9.6042, + "step": 4 + }, + { + "epoch": 0.0008557247989046722, + "grad_norm": Infinity, + "learning_rate": 5.704506560182544e-09, + "loss": 17.9169, + "step": 5 + }, + { + "epoch": 0.0010268697586856067, + "grad_norm": 84.36212158203125, + "learning_rate": 1.1409013120365088e-08, + "loss": 10.1633, + "step": 6 + }, + { + "epoch": 0.001198014718466541, + "grad_norm": 82.20382690429688, + "learning_rate": 1.711351968054763e-08, + "loss": 8.4392, + "step": 7 + }, + { + "epoch": 0.0013691596782474755, + "grad_norm": 16.687606811523438, + "learning_rate": 2.2818026240730176e-08, + "loss": 6.4113, + "step": 8 + }, + { + "epoch": 0.00154030463802841, + "grad_norm": NaN, + "learning_rate": 2.2818026240730176e-08, + "loss": 17.185, + "step": 9 + }, + { + "epoch": 0.0017114495978093444, + "grad_norm": 48.527183532714844, + "learning_rate": 2.852253280091272e-08, + "loss": 7.868, + "step": 10 + }, + { + "epoch": 0.0018825945575902789, + "grad_norm": 76.19969177246094, + "learning_rate": 3.422703936109526e-08, + "loss": 25.5097, + "step": 11 + }, + { + "epoch": 0.0020537395173712133, + "grad_norm": 44.624080657958984, + "learning_rate": 3.9931545921277814e-08, + "loss": 8.533, + "step": 12 + }, + { + "epoch": 0.002224884477152148, + "grad_norm": 68.30242156982422, + "learning_rate": 4.563605248146035e-08, + "loss": 12.1618, + "step": 13 + }, + { + "epoch": 0.002396029436933082, + "grad_norm": 77.02323913574219, + "learning_rate": 5.1340559041642904e-08, + "loss": 10.1531, + "step": 14 + }, + { + "epoch": 0.002567174396714017, + "grad_norm": 37.27943420410156, + "learning_rate": 5.704506560182544e-08, + "loss": 8.2236, + "step": 15 + }, + { + "epoch": 0.002738319356494951, + "grad_norm": 47.14276123046875, + "learning_rate": 6.274957216200798e-08, + "loss": 6.6764, + "step": 16 + }, + { + "epoch": 0.0029094643162758858, + "grad_norm": 68.288818359375, + "learning_rate": 6.845407872219053e-08, + "loss": 9.6404, + "step": 17 + }, + { + "epoch": 0.00308060927605682, + "grad_norm": 72.5563735961914, + "learning_rate": 7.415858528237308e-08, + "loss": 7.2553, + "step": 18 + }, + { + "epoch": 0.0032517542358377546, + "grad_norm": 133.2047576904297, + "learning_rate": 7.986309184255563e-08, + "loss": 17.6761, + "step": 19 + }, + { + "epoch": 0.003422899195618689, + "grad_norm": 96.22279357910156, + "learning_rate": 8.556759840273816e-08, + "loss": 23.6993, + "step": 20 + }, + { + "epoch": 0.0035940441553996235, + "grad_norm": 72.40164947509766, + "learning_rate": 9.12721049629207e-08, + "loss": 12.5069, + "step": 21 + }, + { + "epoch": 0.0037651891151805577, + "grad_norm": 58.689125061035156, + "learning_rate": 9.697661152310325e-08, + "loss": 10.4915, + "step": 22 + }, + { + "epoch": 0.003936334074961492, + "grad_norm": 41.391170501708984, + "learning_rate": 1.0268111808328581e-07, + "loss": 8.2323, + "step": 23 + }, + { + "epoch": 0.004107479034742427, + "grad_norm": 60.9368896484375, + "learning_rate": 1.0838562464346835e-07, + "loss": 9.4007, + "step": 24 + }, + { + "epoch": 0.004278623994523361, + "grad_norm": 132.59597778320312, + "learning_rate": 1.1409013120365088e-07, + "loss": 16.9119, + "step": 25 + }, + { + "epoch": 0.004449768954304296, + "grad_norm": 72.2205581665039, + "learning_rate": 1.1979463776383346e-07, + "loss": 12.5137, + "step": 26 + }, + { + "epoch": 0.00462091391408523, + "grad_norm": 69.2486572265625, + "learning_rate": 1.2549914432401596e-07, + "loss": 10.2477, + "step": 27 + }, + { + "epoch": 0.004792058873866164, + "grad_norm": 30.01091194152832, + "learning_rate": 1.3120365088419852e-07, + "loss": 6.6456, + "step": 28 + }, + { + "epoch": 0.0049632038336470995, + "grad_norm": 75.28530883789062, + "learning_rate": 1.3690815744438105e-07, + "loss": 9.7946, + "step": 29 + }, + { + "epoch": 0.005134348793428034, + "grad_norm": 70.37921142578125, + "learning_rate": 1.426126640045636e-07, + "loss": 12.4969, + "step": 30 + }, + { + "epoch": 0.005305493753208968, + "grad_norm": 90.83671569824219, + "learning_rate": 1.4831717056474617e-07, + "loss": 7.6589, + "step": 31 + }, + { + "epoch": 0.005476638712989902, + "grad_norm": 65.92588806152344, + "learning_rate": 1.540216771249287e-07, + "loss": 9.7764, + "step": 32 + }, + { + "epoch": 0.005647783672770837, + "grad_norm": 86.14967346191406, + "learning_rate": 1.5972618368511126e-07, + "loss": 8.2129, + "step": 33 + }, + { + "epoch": 0.0058189286325517715, + "grad_norm": 145.2432098388672, + "learning_rate": 1.654306902452938e-07, + "loss": 17.7377, + "step": 34 + }, + { + "epoch": 0.005990073592332706, + "grad_norm": 64.69364929199219, + "learning_rate": 1.7113519680547632e-07, + "loss": 9.9994, + "step": 35 + }, + { + "epoch": 0.00616121855211364, + "grad_norm": 58.662803649902344, + "learning_rate": 1.7683970336565888e-07, + "loss": 11.459, + "step": 36 + }, + { + "epoch": 0.006332363511894575, + "grad_norm": 123.47699737548828, + "learning_rate": 1.825442099258414e-07, + "loss": 16.278, + "step": 37 + }, + { + "epoch": 0.006503508471675509, + "grad_norm": 40.24553680419922, + "learning_rate": 1.8824871648602397e-07, + "loss": 8.1959, + "step": 38 + }, + { + "epoch": 0.0066746534314564435, + "grad_norm": 71.55098724365234, + "learning_rate": 1.939532230462065e-07, + "loss": 12.4429, + "step": 39 + }, + { + "epoch": 0.006845798391237378, + "grad_norm": 73.94329833984375, + "learning_rate": 1.9965772960638906e-07, + "loss": 12.4672, + "step": 40 + }, + { + "epoch": 0.007016943351018313, + "grad_norm": 78.10585021972656, + "learning_rate": 2.0536223616657162e-07, + "loss": 12.6372, + "step": 41 + }, + { + "epoch": 0.007188088310799247, + "grad_norm": 154.09982299804688, + "learning_rate": 2.1106674272675415e-07, + "loss": 18.2102, + "step": 42 + }, + { + "epoch": 0.007359233270580181, + "grad_norm": 69.93523406982422, + "learning_rate": 2.167712492869367e-07, + "loss": 10.2355, + "step": 43 + }, + { + "epoch": 0.0075303782303611155, + "grad_norm": 53.971500396728516, + "learning_rate": 2.224757558471192e-07, + "loss": 11.0821, + "step": 44 + }, + { + "epoch": 0.007701523190142051, + "grad_norm": 79.0840835571289, + "learning_rate": 2.2818026240730177e-07, + "loss": 12.838, + "step": 45 + }, + { + "epoch": 0.007872668149922985, + "grad_norm": 68.8064956665039, + "learning_rate": 2.3388476896748433e-07, + "loss": 11.9172, + "step": 46 + }, + { + "epoch": 0.00804381310970392, + "grad_norm": 114.59835815429688, + "learning_rate": 2.395892755276669e-07, + "loss": 16.4603, + "step": 47 + }, + { + "epoch": 0.008214958069484853, + "grad_norm": 76.0896987915039, + "learning_rate": 2.452937820878494e-07, + "loss": 12.7007, + "step": 48 + }, + { + "epoch": 0.008386103029265788, + "grad_norm": 45.47982406616211, + "learning_rate": 2.509982886480319e-07, + "loss": 10.2213, + "step": 49 + }, + { + "epoch": 0.008557247989046722, + "grad_norm": 136.47836303710938, + "learning_rate": 2.567027952082145e-07, + "loss": 17.2777, + "step": 50 + }, + { + "epoch": 0.008728392948827657, + "grad_norm": 57.07033920288086, + "learning_rate": 2.6240730176839704e-07, + "loss": 10.8704, + "step": 51 + }, + { + "epoch": 0.008899537908608592, + "grad_norm": 50.97236633300781, + "learning_rate": 2.681118083285796e-07, + "loss": 10.8266, + "step": 52 + }, + { + "epoch": 0.009070682868389525, + "grad_norm": 166.14840698242188, + "learning_rate": 2.738163148887621e-07, + "loss": 17.2345, + "step": 53 + }, + { + "epoch": 0.00924182782817046, + "grad_norm": 154.5965576171875, + "learning_rate": 2.795208214489447e-07, + "loss": 18.5922, + "step": 54 + }, + { + "epoch": 0.009412972787951395, + "grad_norm": 61.19700622558594, + "learning_rate": 2.852253280091272e-07, + "loss": 9.3514, + "step": 55 + }, + { + "epoch": 0.009584117747732329, + "grad_norm": 64.54351806640625, + "learning_rate": 2.909298345693098e-07, + "loss": 7.0867, + "step": 56 + }, + { + "epoch": 0.009755262707513264, + "grad_norm": 41.97494888305664, + "learning_rate": 2.9663434112949233e-07, + "loss": 8.319, + "step": 57 + }, + { + "epoch": 0.009926407667294199, + "grad_norm": 158.86936950683594, + "learning_rate": 3.023388476896748e-07, + "loss": 18.7347, + "step": 58 + }, + { + "epoch": 0.010097552627075132, + "grad_norm": 93.42990112304688, + "learning_rate": 3.080433542498574e-07, + "loss": 24.4121, + "step": 59 + }, + { + "epoch": 0.010268697586856067, + "grad_norm": 40.078529357910156, + "learning_rate": 3.1374786081003993e-07, + "loss": 8.4254, + "step": 60 + }, + { + "epoch": 0.010439842546637, + "grad_norm": 172.25357055664062, + "learning_rate": 3.194523673702225e-07, + "loss": 17.8112, + "step": 61 + }, + { + "epoch": 0.010610987506417936, + "grad_norm": 64.05378723144531, + "learning_rate": 3.2515687393040504e-07, + "loss": 11.648, + "step": 62 + }, + { + "epoch": 0.010782132466198871, + "grad_norm": 31.915542602539062, + "learning_rate": 3.308613804905876e-07, + "loss": 6.576, + "step": 63 + }, + { + "epoch": 0.010953277425979804, + "grad_norm": 97.90230560302734, + "learning_rate": 3.365658870507701e-07, + "loss": 24.4094, + "step": 64 + }, + { + "epoch": 0.01112442238576074, + "grad_norm": 49.965126037597656, + "learning_rate": 3.4227039361095264e-07, + "loss": 10.7296, + "step": 65 + }, + { + "epoch": 0.011295567345541675, + "grad_norm": 103.74539947509766, + "learning_rate": 3.479749001711352e-07, + "loss": 24.3446, + "step": 66 + }, + { + "epoch": 0.011466712305322608, + "grad_norm": 100.20292663574219, + "learning_rate": 3.5367940673131776e-07, + "loss": 24.4799, + "step": 67 + }, + { + "epoch": 0.011637857265103543, + "grad_norm": 87.23709869384766, + "learning_rate": 3.593839132915003e-07, + "loss": 7.8589, + "step": 68 + }, + { + "epoch": 0.011809002224884476, + "grad_norm": 105.97203063964844, + "learning_rate": 3.650884198516828e-07, + "loss": 24.5711, + "step": 69 + }, + { + "epoch": 0.011980147184665411, + "grad_norm": 139.98709106445312, + "learning_rate": 3.707929264118654e-07, + "loss": 18.0228, + "step": 70 + }, + { + "epoch": 0.012151292144446347, + "grad_norm": 32.724159240722656, + "learning_rate": 3.7649743297204793e-07, + "loss": 8.2503, + "step": 71 + }, + { + "epoch": 0.01232243710422728, + "grad_norm": 55.843509674072266, + "learning_rate": 3.822019395322305e-07, + "loss": 7.1578, + "step": 72 + }, + { + "epoch": 0.012493582064008215, + "grad_norm": 183.3024444580078, + "learning_rate": 3.87906446092413e-07, + "loss": 18.0044, + "step": 73 + }, + { + "epoch": 0.01266472702378915, + "grad_norm": 118.88136291503906, + "learning_rate": 3.9361095265259553e-07, + "loss": 15.586, + "step": 74 + }, + { + "epoch": 0.012835871983570083, + "grad_norm": 64.66754150390625, + "learning_rate": 3.993154592127781e-07, + "loss": 12.3289, + "step": 75 + }, + { + "epoch": 0.013007016943351019, + "grad_norm": 62.58869552612305, + "learning_rate": 4.0501996577296065e-07, + "loss": 8.9949, + "step": 76 + }, + { + "epoch": 0.013178161903131952, + "grad_norm": 66.82809448242188, + "learning_rate": 4.1072447233314323e-07, + "loss": 12.1411, + "step": 77 + }, + { + "epoch": 0.013349306862912887, + "grad_norm": 144.4269256591797, + "learning_rate": 4.164289788933257e-07, + "loss": 18.5364, + "step": 78 + }, + { + "epoch": 0.013520451822693822, + "grad_norm": 60.626834869384766, + "learning_rate": 4.221334854535083e-07, + "loss": 11.9262, + "step": 79 + }, + { + "epoch": 0.013691596782474755, + "grad_norm": 139.23158264160156, + "learning_rate": 4.278379920136908e-07, + "loss": 17.5537, + "step": 80 + }, + { + "epoch": 0.01386274174225569, + "grad_norm": 74.82394409179688, + "learning_rate": 4.335424985738734e-07, + "loss": 12.6047, + "step": 81 + }, + { + "epoch": 0.014033886702036626, + "grad_norm": 128.57716369628906, + "learning_rate": 4.3924700513405594e-07, + "loss": 17.2671, + "step": 82 + }, + { + "epoch": 0.014205031661817559, + "grad_norm": 50.15757369995117, + "learning_rate": 4.449515116942384e-07, + "loss": 9.8308, + "step": 83 + }, + { + "epoch": 0.014376176621598494, + "grad_norm": 46.99615478515625, + "learning_rate": 4.50656018254421e-07, + "loss": 8.6575, + "step": 84 + }, + { + "epoch": 0.01454732158137943, + "grad_norm": 61.31080627441406, + "learning_rate": 4.5636052481460354e-07, + "loss": 6.8446, + "step": 85 + }, + { + "epoch": 0.014718466541160363, + "grad_norm": 60.068443298339844, + "learning_rate": 4.620650313747861e-07, + "loss": 11.2968, + "step": 86 + }, + { + "epoch": 0.014889611500941298, + "grad_norm": 67.9156265258789, + "learning_rate": 4.6776953793496865e-07, + "loss": 12.672, + "step": 87 + }, + { + "epoch": 0.015060756460722231, + "grad_norm": 49.77825164794922, + "learning_rate": 4.734740444951512e-07, + "loss": 9.5329, + "step": 88 + }, + { + "epoch": 0.015231901420503166, + "grad_norm": 57.99795913696289, + "learning_rate": 4.791785510553338e-07, + "loss": 11.3883, + "step": 89 + }, + { + "epoch": 0.015403046380284101, + "grad_norm": 54.070491790771484, + "learning_rate": 4.848830576155162e-07, + "loss": 11.282, + "step": 90 + }, + { + "epoch": 0.015574191340065035, + "grad_norm": 46.44948959350586, + "learning_rate": 4.905875641756988e-07, + "loss": 10.4933, + "step": 91 + }, + { + "epoch": 0.01574533629984597, + "grad_norm": 91.90750885009766, + "learning_rate": 4.962920707358814e-07, + "loss": 14.1568, + "step": 92 + }, + { + "epoch": 0.015916481259626903, + "grad_norm": 45.950286865234375, + "learning_rate": 5.019965772960638e-07, + "loss": 8.6589, + "step": 93 + }, + { + "epoch": 0.01608762621940784, + "grad_norm": 172.71951293945312, + "learning_rate": 5.077010838562465e-07, + "loss": 16.7382, + "step": 94 + }, + { + "epoch": 0.016258771179188773, + "grad_norm": 54.993247985839844, + "learning_rate": 5.13405590416429e-07, + "loss": 11.4407, + "step": 95 + }, + { + "epoch": 0.016429916138969707, + "grad_norm": 110.25296020507812, + "learning_rate": 5.191100969766115e-07, + "loss": 23.8689, + "step": 96 + }, + { + "epoch": 0.016601061098750643, + "grad_norm": 43.9764289855957, + "learning_rate": 5.248146035367941e-07, + "loss": 7.3984, + "step": 97 + }, + { + "epoch": 0.016772206058531577, + "grad_norm": 72.5784912109375, + "learning_rate": 5.305191100969766e-07, + "loss": 7.2256, + "step": 98 + }, + { + "epoch": 0.01694335101831251, + "grad_norm": 35.58343505859375, + "learning_rate": 5.362236166571592e-07, + "loss": 7.8513, + "step": 99 + }, + { + "epoch": 0.017114495978093443, + "grad_norm": 69.7222900390625, + "learning_rate": 5.419281232173417e-07, + "loss": 11.8726, + "step": 100 + }, + { + "epoch": 0.01728564093787438, + "grad_norm": 54.240943908691406, + "learning_rate": 5.476326297775242e-07, + "loss": 11.6433, + "step": 101 + }, + { + "epoch": 0.017456785897655314, + "grad_norm": 40.061763763427734, + "learning_rate": 5.533371363377068e-07, + "loss": 6.452, + "step": 102 + }, + { + "epoch": 0.017627930857436247, + "grad_norm": 43.3102912902832, + "learning_rate": 5.590416428978894e-07, + "loss": 10.9229, + "step": 103 + }, + { + "epoch": 0.017799075817217184, + "grad_norm": 48.96671676635742, + "learning_rate": 5.647461494580719e-07, + "loss": 10.9523, + "step": 104 + }, + { + "epoch": 0.017970220776998117, + "grad_norm": 107.66687774658203, + "learning_rate": 5.704506560182544e-07, + "loss": 15.756, + "step": 105 + }, + { + "epoch": 0.01814136573677905, + "grad_norm": 50.87533950805664, + "learning_rate": 5.76155162578437e-07, + "loss": 9.8941, + "step": 106 + }, + { + "epoch": 0.018312510696559987, + "grad_norm": 142.70115661621094, + "learning_rate": 5.818596691386196e-07, + "loss": 16.205, + "step": 107 + }, + { + "epoch": 0.01848365565634092, + "grad_norm": 62.69704818725586, + "learning_rate": 5.87564175698802e-07, + "loss": 9.7933, + "step": 108 + }, + { + "epoch": 0.018654800616121854, + "grad_norm": 52.710227966308594, + "learning_rate": 5.932686822589847e-07, + "loss": 10.7189, + "step": 109 + }, + { + "epoch": 0.01882594557590279, + "grad_norm": 131.87474060058594, + "learning_rate": 5.989731888191672e-07, + "loss": 24.6335, + "step": 110 + }, + { + "epoch": 0.018997090535683724, + "grad_norm": 105.79902648925781, + "learning_rate": 6.046776953793496e-07, + "loss": 16.133, + "step": 111 + }, + { + "epoch": 0.019168235495464658, + "grad_norm": 56.011474609375, + "learning_rate": 6.103822019395323e-07, + "loss": 11.9402, + "step": 112 + }, + { + "epoch": 0.019339380455245594, + "grad_norm": 97.4761962890625, + "learning_rate": 6.160867084997148e-07, + "loss": 14.1356, + "step": 113 + }, + { + "epoch": 0.019510525415026528, + "grad_norm": 52.7200813293457, + "learning_rate": 6.217912150598974e-07, + "loss": 11.4511, + "step": 114 + }, + { + "epoch": 0.01968167037480746, + "grad_norm": 42.6909065246582, + "learning_rate": 6.274957216200799e-07, + "loss": 8.5194, + "step": 115 + }, + { + "epoch": 0.019852815334588398, + "grad_norm": 44.77908706665039, + "learning_rate": 6.332002281802624e-07, + "loss": 11.437, + "step": 116 + }, + { + "epoch": 0.02002396029436933, + "grad_norm": 136.7108612060547, + "learning_rate": 6.38904734740445e-07, + "loss": 23.712, + "step": 117 + }, + { + "epoch": 0.020195105254150265, + "grad_norm": 44.484893798828125, + "learning_rate": 6.446092413006275e-07, + "loss": 10.4753, + "step": 118 + }, + { + "epoch": 0.020366250213931198, + "grad_norm": 57.66374206542969, + "learning_rate": 6.503137478608101e-07, + "loss": 11.2803, + "step": 119 + }, + { + "epoch": 0.020537395173712135, + "grad_norm": 110.59872436523438, + "learning_rate": 6.560182544209926e-07, + "loss": 15.7956, + "step": 120 + }, + { + "epoch": 0.02070854013349307, + "grad_norm": 33.806732177734375, + "learning_rate": 6.617227609811752e-07, + "loss": 10.327, + "step": 121 + }, + { + "epoch": 0.020879685093274, + "grad_norm": 52.41442108154297, + "learning_rate": 6.674272675413577e-07, + "loss": 10.5716, + "step": 122 + }, + { + "epoch": 0.02105083005305494, + "grad_norm": 40.95213317871094, + "learning_rate": 6.731317741015402e-07, + "loss": 11.6353, + "step": 123 + }, + { + "epoch": 0.021221975012835872, + "grad_norm": 43.268775939941406, + "learning_rate": 6.788362806617229e-07, + "loss": 11.2876, + "step": 124 + }, + { + "epoch": 0.021393119972616805, + "grad_norm": 102.84829711914062, + "learning_rate": 6.845407872219053e-07, + "loss": 15.5444, + "step": 125 + }, + { + "epoch": 0.021564264932397742, + "grad_norm": 56.55605697631836, + "learning_rate": 6.902452937820878e-07, + "loss": 10.2011, + "step": 126 + }, + { + "epoch": 0.021735409892178675, + "grad_norm": 37.294795989990234, + "learning_rate": 6.959498003422704e-07, + "loss": 8.1014, + "step": 127 + }, + { + "epoch": 0.02190655485195961, + "grad_norm": 55.67061233520508, + "learning_rate": 7.01654306902453e-07, + "loss": 11.638, + "step": 128 + }, + { + "epoch": 0.022077699811740546, + "grad_norm": 67.4786605834961, + "learning_rate": 7.073588134626355e-07, + "loss": 6.8779, + "step": 129 + }, + { + "epoch": 0.02224884477152148, + "grad_norm": 30.9260196685791, + "learning_rate": 7.13063320022818e-07, + "loss": 9.4357, + "step": 130 + }, + { + "epoch": 0.022419989731302412, + "grad_norm": 100.22219848632812, + "learning_rate": 7.187678265830006e-07, + "loss": 14.6476, + "step": 131 + }, + { + "epoch": 0.02259113469108335, + "grad_norm": 29.24936294555664, + "learning_rate": 7.244723331431832e-07, + "loss": 6.5599, + "step": 132 + }, + { + "epoch": 0.022762279650864282, + "grad_norm": 31.59239959716797, + "learning_rate": 7.301768397033656e-07, + "loss": 6.6734, + "step": 133 + }, + { + "epoch": 0.022933424610645216, + "grad_norm": 42.93860626220703, + "learning_rate": 7.358813462635483e-07, + "loss": 9.125, + "step": 134 + }, + { + "epoch": 0.023104569570426153, + "grad_norm": 46.8751335144043, + "learning_rate": 7.415858528237308e-07, + "loss": 10.3878, + "step": 135 + }, + { + "epoch": 0.023275714530207086, + "grad_norm": 106.5069351196289, + "learning_rate": 7.472903593839132e-07, + "loss": 14.3693, + "step": 136 + }, + { + "epoch": 0.02344685948998802, + "grad_norm": 126.05512237548828, + "learning_rate": 7.529948659440959e-07, + "loss": 22.8488, + "step": 137 + }, + { + "epoch": 0.023618004449768953, + "grad_norm": 89.99185180664062, + "learning_rate": 7.586993725042784e-07, + "loss": 13.3307, + "step": 138 + }, + { + "epoch": 0.02378914940954989, + "grad_norm": 35.95622253417969, + "learning_rate": 7.64403879064461e-07, + "loss": 9.7673, + "step": 139 + }, + { + "epoch": 0.023960294369330823, + "grad_norm": 31.504724502563477, + "learning_rate": 7.701083856246435e-07, + "loss": 9.0915, + "step": 140 + }, + { + "epoch": 0.024131439329111756, + "grad_norm": 74.6131591796875, + "learning_rate": 7.75812892184826e-07, + "loss": 12.7378, + "step": 141 + }, + { + "epoch": 0.024302584288892693, + "grad_norm": 40.63880920410156, + "learning_rate": 7.815173987450086e-07, + "loss": 8.9323, + "step": 142 + }, + { + "epoch": 0.024473729248673626, + "grad_norm": 101.69804382324219, + "learning_rate": 7.872219053051911e-07, + "loss": 14.547, + "step": 143 + }, + { + "epoch": 0.02464487420845456, + "grad_norm": 131.046142578125, + "learning_rate": 7.929264118653737e-07, + "loss": 23.0012, + "step": 144 + }, + { + "epoch": 0.024816019168235497, + "grad_norm": 74.94658660888672, + "learning_rate": 7.986309184255562e-07, + "loss": 12.8286, + "step": 145 + }, + { + "epoch": 0.02498716412801643, + "grad_norm": 50.227718353271484, + "learning_rate": 8.043354249857388e-07, + "loss": 10.9684, + "step": 146 + }, + { + "epoch": 0.025158309087797363, + "grad_norm": 162.12603759765625, + "learning_rate": 8.100399315459213e-07, + "loss": 23.0181, + "step": 147 + }, + { + "epoch": 0.0253294540475783, + "grad_norm": 34.11660385131836, + "learning_rate": 8.157444381061038e-07, + "loss": 9.2808, + "step": 148 + }, + { + "epoch": 0.025500599007359234, + "grad_norm": 34.07155990600586, + "learning_rate": 8.214489446662865e-07, + "loss": 11.3059, + "step": 149 + }, + { + "epoch": 0.025671743967140167, + "grad_norm": 42.31085968017578, + "learning_rate": 8.271534512264689e-07, + "loss": 8.7746, + "step": 150 + }, + { + "epoch": 0.025842888926921104, + "grad_norm": 132.6522216796875, + "learning_rate": 8.328579577866514e-07, + "loss": 16.6408, + "step": 151 + }, + { + "epoch": 0.026014033886702037, + "grad_norm": 21.736328125, + "learning_rate": 8.385624643468341e-07, + "loss": 9.6907, + "step": 152 + }, + { + "epoch": 0.02618517884648297, + "grad_norm": 64.54568481445312, + "learning_rate": 8.442669709070166e-07, + "loss": 12.8573, + "step": 153 + }, + { + "epoch": 0.026356323806263904, + "grad_norm": 32.484703063964844, + "learning_rate": 8.499714774671991e-07, + "loss": 7.4299, + "step": 154 + }, + { + "epoch": 0.02652746876604484, + "grad_norm": 86.55378723144531, + "learning_rate": 8.556759840273817e-07, + "loss": 13.347, + "step": 155 + }, + { + "epoch": 0.026698613725825774, + "grad_norm": 32.97962188720703, + "learning_rate": 8.613804905875642e-07, + "loss": 9.2004, + "step": 156 + }, + { + "epoch": 0.026869758685606707, + "grad_norm": 66.87654113769531, + "learning_rate": 8.670849971477468e-07, + "loss": 6.6107, + "step": 157 + }, + { + "epoch": 0.027040903645387644, + "grad_norm": 56.53002166748047, + "learning_rate": 8.727895037079292e-07, + "loss": 6.0957, + "step": 158 + }, + { + "epoch": 0.027212048605168578, + "grad_norm": 37.223453521728516, + "learning_rate": 8.784940102681119e-07, + "loss": 8.9968, + "step": 159 + }, + { + "epoch": 0.02738319356494951, + "grad_norm": 30.637619018554688, + "learning_rate": 8.841985168282944e-07, + "loss": 8.9597, + "step": 160 + }, + { + "epoch": 0.027554338524730448, + "grad_norm": 22.8154354095459, + "learning_rate": 8.899030233884768e-07, + "loss": 9.2794, + "step": 161 + }, + { + "epoch": 0.02772548348451138, + "grad_norm": 64.24419403076172, + "learning_rate": 8.956075299486595e-07, + "loss": 12.9209, + "step": 162 + }, + { + "epoch": 0.027896628444292314, + "grad_norm": 27.159826278686523, + "learning_rate": 9.01312036508842e-07, + "loss": 10.7092, + "step": 163 + }, + { + "epoch": 0.02806777340407325, + "grad_norm": 29.741992950439453, + "learning_rate": 9.070165430690246e-07, + "loss": 10.1098, + "step": 164 + }, + { + "epoch": 0.028238918363854185, + "grad_norm": 61.4916877746582, + "learning_rate": 9.127210496292071e-07, + "loss": 12.5023, + "step": 165 + }, + { + "epoch": 0.028410063323635118, + "grad_norm": 21.36608123779297, + "learning_rate": 9.184255561893896e-07, + "loss": 7.2161, + "step": 166 + }, + { + "epoch": 0.028581208283416055, + "grad_norm": 51.13070297241211, + "learning_rate": 9.241300627495722e-07, + "loss": 5.5324, + "step": 167 + }, + { + "epoch": 0.028752353243196988, + "grad_norm": 27.232070922851562, + "learning_rate": 9.298345693097547e-07, + "loss": 9.3162, + "step": 168 + }, + { + "epoch": 0.02892349820297792, + "grad_norm": 51.84492111206055, + "learning_rate": 9.355390758699373e-07, + "loss": 6.0306, + "step": 169 + }, + { + "epoch": 0.02909464316275886, + "grad_norm": 24.21738052368164, + "learning_rate": 9.412435824301197e-07, + "loss": 6.6994, + "step": 170 + }, + { + "epoch": 0.02926578812253979, + "grad_norm": 27.428897857666016, + "learning_rate": 9.469480889903024e-07, + "loss": 10.5412, + "step": 171 + }, + { + "epoch": 0.029436933082320725, + "grad_norm": 123.71875762939453, + "learning_rate": 9.526525955504849e-07, + "loss": 15.9849, + "step": 172 + }, + { + "epoch": 0.02960807804210166, + "grad_norm": 34.90501403808594, + "learning_rate": 9.583571021106676e-07, + "loss": 9.2574, + "step": 173 + }, + { + "epoch": 0.029779223001882595, + "grad_norm": 26.623390197753906, + "learning_rate": 9.6406160867085e-07, + "loss": 8.7904, + "step": 174 + }, + { + "epoch": 0.02995036796166353, + "grad_norm": 21.868566513061523, + "learning_rate": 9.697661152310325e-07, + "loss": 9.2638, + "step": 175 + }, + { + "epoch": 0.030121512921444462, + "grad_norm": 28.389110565185547, + "learning_rate": 9.754706217912152e-07, + "loss": 5.9086, + "step": 176 + }, + { + "epoch": 0.0302926578812254, + "grad_norm": 51.29762649536133, + "learning_rate": 9.811751283513976e-07, + "loss": 5.9646, + "step": 177 + }, + { + "epoch": 0.030463802841006332, + "grad_norm": 28.91325569152832, + "learning_rate": 9.8687963491158e-07, + "loss": 6.0877, + "step": 178 + }, + { + "epoch": 0.030634947800787266, + "grad_norm": 66.74105834960938, + "learning_rate": 9.925841414717628e-07, + "loss": 12.4348, + "step": 179 + }, + { + "epoch": 0.030806092760568202, + "grad_norm": 19.138124465942383, + "learning_rate": 9.982886480319452e-07, + "loss": 9.5496, + "step": 180 + }, + { + "epoch": 0.030977237720349136, + "grad_norm": 43.17308044433594, + "learning_rate": 1.0039931545921277e-06, + "loss": 5.5641, + "step": 181 + }, + { + "epoch": 0.03114838268013007, + "grad_norm": 32.97599411010742, + "learning_rate": 1.0096976611523104e-06, + "loss": 9.0529, + "step": 182 + }, + { + "epoch": 0.031319527639911006, + "grad_norm": 56.315521240234375, + "learning_rate": 1.015402167712493e-06, + "loss": 12.0747, + "step": 183 + }, + { + "epoch": 0.03149067259969194, + "grad_norm": 76.77662658691406, + "learning_rate": 1.0211066742726755e-06, + "loss": 13.0892, + "step": 184 + }, + { + "epoch": 0.03166181755947287, + "grad_norm": 25.544397354125977, + "learning_rate": 1.026811180832858e-06, + "loss": 7.7117, + "step": 185 + }, + { + "epoch": 0.031832962519253806, + "grad_norm": 24.205764770507812, + "learning_rate": 1.0325156873930406e-06, + "loss": 6.6426, + "step": 186 + }, + { + "epoch": 0.03200410747903474, + "grad_norm": 25.586280822753906, + "learning_rate": 1.038220193953223e-06, + "loss": 10.4785, + "step": 187 + }, + { + "epoch": 0.03217525243881568, + "grad_norm": 68.83911895751953, + "learning_rate": 1.0439247005134056e-06, + "loss": 12.2132, + "step": 188 + }, + { + "epoch": 0.03234639739859661, + "grad_norm": 24.825489044189453, + "learning_rate": 1.0496292070735881e-06, + "loss": 6.3336, + "step": 189 + }, + { + "epoch": 0.032517542358377546, + "grad_norm": 28.293699264526367, + "learning_rate": 1.0553337136337707e-06, + "loss": 8.5374, + "step": 190 + }, + { + "epoch": 0.03268868731815848, + "grad_norm": 28.26664924621582, + "learning_rate": 1.0610382201939532e-06, + "loss": 9.7218, + "step": 191 + }, + { + "epoch": 0.03285983227793941, + "grad_norm": 84.32862854003906, + "learning_rate": 1.0667427267541357e-06, + "loss": 12.782, + "step": 192 + }, + { + "epoch": 0.033030977237720346, + "grad_norm": 26.818071365356445, + "learning_rate": 1.0724472333143185e-06, + "loss": 7.3125, + "step": 193 + }, + { + "epoch": 0.03320212219750129, + "grad_norm": 16.650196075439453, + "learning_rate": 1.0781517398745008e-06, + "loss": 9.0232, + "step": 194 + }, + { + "epoch": 0.03337326715728222, + "grad_norm": 22.659135818481445, + "learning_rate": 1.0838562464346833e-06, + "loss": 6.2787, + "step": 195 + }, + { + "epoch": 0.03354441211706315, + "grad_norm": 24.644168853759766, + "learning_rate": 1.089560752994866e-06, + "loss": 6.0047, + "step": 196 + }, + { + "epoch": 0.03371555707684409, + "grad_norm": 32.078712463378906, + "learning_rate": 1.0952652595550484e-06, + "loss": 7.5748, + "step": 197 + }, + { + "epoch": 0.03388670203662502, + "grad_norm": 55.345855712890625, + "learning_rate": 1.1009697661152311e-06, + "loss": 11.8703, + "step": 198 + }, + { + "epoch": 0.034057846996405954, + "grad_norm": 70.49486541748047, + "learning_rate": 1.1066742726754137e-06, + "loss": 11.7983, + "step": 199 + }, + { + "epoch": 0.03422899195618689, + "grad_norm": 29.946758270263672, + "learning_rate": 1.112378779235596e-06, + "loss": 7.6286, + "step": 200 + }, + { + "epoch": 0.03440013691596783, + "grad_norm": 278.3395080566406, + "learning_rate": 1.1180832857957787e-06, + "loss": 17.6192, + "step": 201 + }, + { + "epoch": 0.03457128187574876, + "grad_norm": 310.682861328125, + "learning_rate": 1.1237877923559613e-06, + "loss": 17.6315, + "step": 202 + }, + { + "epoch": 0.034742426835529694, + "grad_norm": 46.159568786621094, + "learning_rate": 1.1294922989161438e-06, + "loss": 11.6001, + "step": 203 + }, + { + "epoch": 0.03491357179531063, + "grad_norm": 20.635892868041992, + "learning_rate": 1.1351968054763263e-06, + "loss": 9.4128, + "step": 204 + }, + { + "epoch": 0.03508471675509156, + "grad_norm": 143.4097137451172, + "learning_rate": 1.1409013120365089e-06, + "loss": 16.3943, + "step": 205 + }, + { + "epoch": 0.035255861714872494, + "grad_norm": 265.5577087402344, + "learning_rate": 1.1466058185966914e-06, + "loss": 18.6869, + "step": 206 + }, + { + "epoch": 0.035427006674653434, + "grad_norm": 19.766063690185547, + "learning_rate": 1.152310325156874e-06, + "loss": 8.6515, + "step": 207 + }, + { + "epoch": 0.03559815163443437, + "grad_norm": 43.8801383972168, + "learning_rate": 1.1580148317170565e-06, + "loss": 11.424, + "step": 208 + }, + { + "epoch": 0.0357692965942153, + "grad_norm": 12.928386688232422, + "learning_rate": 1.1637193382772392e-06, + "loss": 5.5902, + "step": 209 + }, + { + "epoch": 0.035940441553996234, + "grad_norm": 123.55076599121094, + "learning_rate": 1.1694238448374215e-06, + "loss": 15.6958, + "step": 210 + }, + { + "epoch": 0.03611158651377717, + "grad_norm": 44.79010772705078, + "learning_rate": 1.175128351397604e-06, + "loss": 11.1894, + "step": 211 + }, + { + "epoch": 0.0362827314735581, + "grad_norm": 26.461137771606445, + "learning_rate": 1.1808328579577868e-06, + "loss": 7.3237, + "step": 212 + }, + { + "epoch": 0.03645387643333904, + "grad_norm": 24.63947296142578, + "learning_rate": 1.1865373645179693e-06, + "loss": 5.7252, + "step": 213 + }, + { + "epoch": 0.036625021393119975, + "grad_norm": 17.151113510131836, + "learning_rate": 1.1922418710781517e-06, + "loss": 9.0419, + "step": 214 + }, + { + "epoch": 0.03679616635290091, + "grad_norm": 26.69593620300293, + "learning_rate": 1.1979463776383344e-06, + "loss": 9.4836, + "step": 215 + }, + { + "epoch": 0.03696731131268184, + "grad_norm": 50.901573181152344, + "learning_rate": 1.203650884198517e-06, + "loss": 11.2858, + "step": 216 + }, + { + "epoch": 0.037138456272462775, + "grad_norm": 48.110328674316406, + "learning_rate": 1.2093553907586992e-06, + "loss": 11.5594, + "step": 217 + }, + { + "epoch": 0.03730960123224371, + "grad_norm": 51.77389907836914, + "learning_rate": 1.215059897318882e-06, + "loss": 11.6974, + "step": 218 + }, + { + "epoch": 0.03748074619202464, + "grad_norm": 23.52347183227539, + "learning_rate": 1.2207644038790645e-06, + "loss": 9.5737, + "step": 219 + }, + { + "epoch": 0.03765189115180558, + "grad_norm": 20.402074813842773, + "learning_rate": 1.2264689104392468e-06, + "loss": 6.1995, + "step": 220 + }, + { + "epoch": 0.037823036111586515, + "grad_norm": 18.76962661743164, + "learning_rate": 1.2321734169994296e-06, + "loss": 7.1013, + "step": 221 + }, + { + "epoch": 0.03799418107136745, + "grad_norm": 21.817501068115234, + "learning_rate": 1.2378779235596121e-06, + "loss": 9.3332, + "step": 222 + }, + { + "epoch": 0.03816532603114838, + "grad_norm": 11.452000617980957, + "learning_rate": 1.2435824301197949e-06, + "loss": 6.2887, + "step": 223 + }, + { + "epoch": 0.038336470990929315, + "grad_norm": 22.69776153564453, + "learning_rate": 1.2492869366799772e-06, + "loss": 7.9947, + "step": 224 + }, + { + "epoch": 0.03850761595071025, + "grad_norm": 25.39488410949707, + "learning_rate": 1.2549914432401597e-06, + "loss": 5.1894, + "step": 225 + }, + { + "epoch": 0.03867876091049119, + "grad_norm": 17.65719223022461, + "learning_rate": 1.2606959498003425e-06, + "loss": 7.4931, + "step": 226 + }, + { + "epoch": 0.03884990587027212, + "grad_norm": 23.45711898803711, + "learning_rate": 1.2664004563605248e-06, + "loss": 9.6157, + "step": 227 + }, + { + "epoch": 0.039021050830053056, + "grad_norm": 29.114194869995117, + "learning_rate": 1.2721049629207073e-06, + "loss": 10.4857, + "step": 228 + }, + { + "epoch": 0.03919219578983399, + "grad_norm": 46.365013122558594, + "learning_rate": 1.27780946948089e-06, + "loss": 11.9216, + "step": 229 + }, + { + "epoch": 0.03936334074961492, + "grad_norm": 23.066879272460938, + "learning_rate": 1.2835139760410724e-06, + "loss": 9.4344, + "step": 230 + }, + { + "epoch": 0.039534485709395856, + "grad_norm": 15.414644241333008, + "learning_rate": 1.289218482601255e-06, + "loss": 6.4409, + "step": 231 + }, + { + "epoch": 0.039705630669176796, + "grad_norm": 16.58795166015625, + "learning_rate": 1.2949229891614376e-06, + "loss": 7.3307, + "step": 232 + }, + { + "epoch": 0.03987677562895773, + "grad_norm": 36.44779968261719, + "learning_rate": 1.3006274957216202e-06, + "loss": 11.1388, + "step": 233 + }, + { + "epoch": 0.04004792058873866, + "grad_norm": 20.902912139892578, + "learning_rate": 1.3063320022818027e-06, + "loss": 7.378, + "step": 234 + }, + { + "epoch": 0.040219065548519596, + "grad_norm": 20.50259017944336, + "learning_rate": 1.3120365088419852e-06, + "loss": 6.156, + "step": 235 + }, + { + "epoch": 0.04039021050830053, + "grad_norm": 22.57229995727539, + "learning_rate": 1.3177410154021678e-06, + "loss": 7.0029, + "step": 236 + }, + { + "epoch": 0.04056135546808146, + "grad_norm": 25.610868453979492, + "learning_rate": 1.3234455219623503e-06, + "loss": 8.7721, + "step": 237 + }, + { + "epoch": 0.040732500427862396, + "grad_norm": 278.795654296875, + "learning_rate": 1.3291500285225328e-06, + "loss": 15.9633, + "step": 238 + }, + { + "epoch": 0.040903645387643336, + "grad_norm": 11.644048690795898, + "learning_rate": 1.3348545350827154e-06, + "loss": 6.3707, + "step": 239 + }, + { + "epoch": 0.04107479034742427, + "grad_norm": 36.32057189941406, + "learning_rate": 1.340559041642898e-06, + "loss": 10.7901, + "step": 240 + }, + { + "epoch": 0.0412459353072052, + "grad_norm": 22.911476135253906, + "learning_rate": 1.3462635482030804e-06, + "loss": 9.4097, + "step": 241 + }, + { + "epoch": 0.04141708026698614, + "grad_norm": 24.35552406311035, + "learning_rate": 1.351968054763263e-06, + "loss": 9.081, + "step": 242 + }, + { + "epoch": 0.04158822522676707, + "grad_norm": 18.466432571411133, + "learning_rate": 1.3576725613234457e-06, + "loss": 7.4805, + "step": 243 + }, + { + "epoch": 0.041759370186548, + "grad_norm": 44.41029357910156, + "learning_rate": 1.363377067883628e-06, + "loss": 11.2131, + "step": 244 + }, + { + "epoch": 0.041930515146328944, + "grad_norm": 15.328824043273926, + "learning_rate": 1.3690815744438106e-06, + "loss": 8.2706, + "step": 245 + }, + { + "epoch": 0.04210166010610988, + "grad_norm": 274.3642578125, + "learning_rate": 1.3747860810039933e-06, + "loss": 15.8791, + "step": 246 + }, + { + "epoch": 0.04227280506589081, + "grad_norm": 18.105318069458008, + "learning_rate": 1.3804905875641756e-06, + "loss": 8.9079, + "step": 247 + }, + { + "epoch": 0.042443950025671744, + "grad_norm": 22.90168571472168, + "learning_rate": 1.3861950941243584e-06, + "loss": 6.6905, + "step": 248 + }, + { + "epoch": 0.04261509498545268, + "grad_norm": 16.96687126159668, + "learning_rate": 1.391899600684541e-06, + "loss": 8.5567, + "step": 249 + }, + { + "epoch": 0.04278623994523361, + "grad_norm": 283.76409912109375, + "learning_rate": 1.3976041072447232e-06, + "loss": 14.4204, + "step": 250 + }, + { + "epoch": 0.04295738490501455, + "grad_norm": 22.41378402709961, + "learning_rate": 1.403308613804906e-06, + "loss": 9.6063, + "step": 251 + }, + { + "epoch": 0.043128529864795484, + "grad_norm": 23.26137924194336, + "learning_rate": 1.4090131203650885e-06, + "loss": 9.9569, + "step": 252 + }, + { + "epoch": 0.04329967482457642, + "grad_norm": 19.40400505065918, + "learning_rate": 1.414717626925271e-06, + "loss": 6.4322, + "step": 253 + }, + { + "epoch": 0.04347081978435735, + "grad_norm": 21.541933059692383, + "learning_rate": 1.4204221334854536e-06, + "loss": 4.5325, + "step": 254 + }, + { + "epoch": 0.043641964744138284, + "grad_norm": 17.52275276184082, + "learning_rate": 1.426126640045636e-06, + "loss": 8.3479, + "step": 255 + }, + { + "epoch": 0.04381310970391922, + "grad_norm": 125.6756591796875, + "learning_rate": 1.4318311466058186e-06, + "loss": 15.4145, + "step": 256 + }, + { + "epoch": 0.04398425466370015, + "grad_norm": 18.166152954101562, + "learning_rate": 1.4375356531660011e-06, + "loss": 4.2531, + "step": 257 + }, + { + "epoch": 0.04415539962348109, + "grad_norm": 25.4247989654541, + "learning_rate": 1.4432401597261837e-06, + "loss": 10.4856, + "step": 258 + }, + { + "epoch": 0.044326544583262024, + "grad_norm": 17.259897232055664, + "learning_rate": 1.4489446662863664e-06, + "loss": 8.6032, + "step": 259 + }, + { + "epoch": 0.04449768954304296, + "grad_norm": 23.197059631347656, + "learning_rate": 1.4546491728465487e-06, + "loss": 7.8062, + "step": 260 + }, + { + "epoch": 0.04466883450282389, + "grad_norm": 43.4500617980957, + "learning_rate": 1.4603536794067313e-06, + "loss": 11.1986, + "step": 261 + }, + { + "epoch": 0.044839979462604825, + "grad_norm": 122.06368255615234, + "learning_rate": 1.466058185966914e-06, + "loss": 15.5832, + "step": 262 + }, + { + "epoch": 0.04501112442238576, + "grad_norm": 16.506317138671875, + "learning_rate": 1.4717626925270965e-06, + "loss": 8.7747, + "step": 263 + }, + { + "epoch": 0.0451822693821667, + "grad_norm": 19.03982162475586, + "learning_rate": 1.4774671990872789e-06, + "loss": 7.6134, + "step": 264 + }, + { + "epoch": 0.04535341434194763, + "grad_norm": 33.20307540893555, + "learning_rate": 1.4831717056474616e-06, + "loss": 10.3325, + "step": 265 + }, + { + "epoch": 0.045524559301728565, + "grad_norm": 16.946876525878906, + "learning_rate": 1.4888762122076441e-06, + "loss": 8.2866, + "step": 266 + }, + { + "epoch": 0.0456957042615095, + "grad_norm": 25.170318603515625, + "learning_rate": 1.4945807187678265e-06, + "loss": 10.2958, + "step": 267 + }, + { + "epoch": 0.04586684922129043, + "grad_norm": 16.860721588134766, + "learning_rate": 1.5002852253280092e-06, + "loss": 8.5198, + "step": 268 + }, + { + "epoch": 0.046037994181071365, + "grad_norm": 18.003284454345703, + "learning_rate": 1.5059897318881917e-06, + "loss": 8.8484, + "step": 269 + }, + { + "epoch": 0.046209139140852305, + "grad_norm": 17.796016693115234, + "learning_rate": 1.511694238448374e-06, + "loss": 6.2495, + "step": 270 + }, + { + "epoch": 0.04638028410063324, + "grad_norm": 23.97182846069336, + "learning_rate": 1.5173987450085568e-06, + "loss": 7.0879, + "step": 271 + }, + { + "epoch": 0.04655142906041417, + "grad_norm": 213.1482696533203, + "learning_rate": 1.5231032515687393e-06, + "loss": 12.8754, + "step": 272 + }, + { + "epoch": 0.046722574020195105, + "grad_norm": 25.503662109375, + "learning_rate": 1.528807758128922e-06, + "loss": 7.9125, + "step": 273 + }, + { + "epoch": 0.04689371897997604, + "grad_norm": 19.832860946655273, + "learning_rate": 1.5345122646891044e-06, + "loss": 9.0794, + "step": 274 + }, + { + "epoch": 0.04706486393975697, + "grad_norm": 32.311920166015625, + "learning_rate": 1.540216771249287e-06, + "loss": 10.648, + "step": 275 + }, + { + "epoch": 0.047236008899537905, + "grad_norm": 39.916603088378906, + "learning_rate": 1.5459212778094697e-06, + "loss": 10.9246, + "step": 276 + }, + { + "epoch": 0.047407153859318846, + "grad_norm": 21.337602615356445, + "learning_rate": 1.551625784369652e-06, + "loss": 9.2191, + "step": 277 + }, + { + "epoch": 0.04757829881909978, + "grad_norm": 25.114675521850586, + "learning_rate": 1.5573302909298345e-06, + "loss": 10.2576, + "step": 278 + }, + { + "epoch": 0.04774944377888071, + "grad_norm": 14.945568084716797, + "learning_rate": 1.5630347974900173e-06, + "loss": 8.4857, + "step": 279 + }, + { + "epoch": 0.047920588738661646, + "grad_norm": 33.542449951171875, + "learning_rate": 1.5687393040501996e-06, + "loss": 10.9193, + "step": 280 + }, + { + "epoch": 0.04809173369844258, + "grad_norm": 27.331628799438477, + "learning_rate": 1.5744438106103821e-06, + "loss": 9.9441, + "step": 281 + }, + { + "epoch": 0.04826287865822351, + "grad_norm": 17.784677505493164, + "learning_rate": 1.5801483171705649e-06, + "loss": 6.4105, + "step": 282 + }, + { + "epoch": 0.04843402361800445, + "grad_norm": 46.38033676147461, + "learning_rate": 1.5858528237307474e-06, + "loss": 10.5075, + "step": 283 + }, + { + "epoch": 0.048605168577785386, + "grad_norm": 13.535309791564941, + "learning_rate": 1.59155733029093e-06, + "loss": 4.4568, + "step": 284 + }, + { + "epoch": 0.04877631353756632, + "grad_norm": 27.45166015625, + "learning_rate": 1.5972618368511125e-06, + "loss": 10.2344, + "step": 285 + }, + { + "epoch": 0.04894745849734725, + "grad_norm": 16.50087547302246, + "learning_rate": 1.602966343411295e-06, + "loss": 8.5428, + "step": 286 + }, + { + "epoch": 0.049118603457128186, + "grad_norm": 42.31341552734375, + "learning_rate": 1.6086708499714775e-06, + "loss": 10.0868, + "step": 287 + }, + { + "epoch": 0.04928974841690912, + "grad_norm": 17.977153778076172, + "learning_rate": 1.61437535653166e-06, + "loss": 9.012, + "step": 288 + }, + { + "epoch": 0.04946089337669006, + "grad_norm": 104.7464828491211, + "learning_rate": 1.6200798630918426e-06, + "loss": 14.6671, + "step": 289 + }, + { + "epoch": 0.04963203833647099, + "grad_norm": 17.432056427001953, + "learning_rate": 1.6257843696520251e-06, + "loss": 6.8872, + "step": 290 + }, + { + "epoch": 0.04980318329625193, + "grad_norm": 242.7275390625, + "learning_rate": 1.6314888762122076e-06, + "loss": 11.2526, + "step": 291 + }, + { + "epoch": 0.04997432825603286, + "grad_norm": 15.779862403869629, + "learning_rate": 1.6371933827723902e-06, + "loss": 8.7887, + "step": 292 + }, + { + "epoch": 0.05014547321581379, + "grad_norm": 13.621806144714355, + "learning_rate": 1.642897889332573e-06, + "loss": 7.0578, + "step": 293 + }, + { + "epoch": 0.05031661817559473, + "grad_norm": 14.4631986618042, + "learning_rate": 1.6486023958927552e-06, + "loss": 8.2147, + "step": 294 + }, + { + "epoch": 0.05048776313537566, + "grad_norm": 18.11038589477539, + "learning_rate": 1.6543069024529378e-06, + "loss": 6.4308, + "step": 295 + }, + { + "epoch": 0.0506589080951566, + "grad_norm": 16.797258377075195, + "learning_rate": 1.6600114090131205e-06, + "loss": 6.3738, + "step": 296 + }, + { + "epoch": 0.050830053054937534, + "grad_norm": 17.457462310791016, + "learning_rate": 1.6657159155733028e-06, + "loss": 6.3681, + "step": 297 + }, + { + "epoch": 0.05100119801471847, + "grad_norm": 14.502140045166016, + "learning_rate": 1.6714204221334856e-06, + "loss": 7.1297, + "step": 298 + }, + { + "epoch": 0.0511723429744994, + "grad_norm": 14.4544677734375, + "learning_rate": 1.6771249286936681e-06, + "loss": 8.5584, + "step": 299 + }, + { + "epoch": 0.051343487934280334, + "grad_norm": 13.313618659973145, + "learning_rate": 1.6828294352538504e-06, + "loss": 8.1348, + "step": 300 + }, + { + "epoch": 0.05151463289406127, + "grad_norm": 91.8434829711914, + "learning_rate": 1.6885339418140332e-06, + "loss": 13.9421, + "step": 301 + }, + { + "epoch": 0.05168577785384221, + "grad_norm": 39.31818389892578, + "learning_rate": 1.6942384483742157e-06, + "loss": 10.3291, + "step": 302 + }, + { + "epoch": 0.05185692281362314, + "grad_norm": 16.320667266845703, + "learning_rate": 1.6999429549343982e-06, + "loss": 4.6866, + "step": 303 + }, + { + "epoch": 0.052028067773404074, + "grad_norm": 13.367071151733398, + "learning_rate": 1.7056474614945808e-06, + "loss": 8.1535, + "step": 304 + }, + { + "epoch": 0.05219921273318501, + "grad_norm": 186.96824645996094, + "learning_rate": 1.7113519680547633e-06, + "loss": 10.6341, + "step": 305 + }, + { + "epoch": 0.05237035769296594, + "grad_norm": 28.400169372558594, + "learning_rate": 1.7170564746149458e-06, + "loss": 9.7369, + "step": 306 + }, + { + "epoch": 0.052541502652746874, + "grad_norm": 15.559652328491211, + "learning_rate": 1.7227609811751284e-06, + "loss": 7.1427, + "step": 307 + }, + { + "epoch": 0.05271264761252781, + "grad_norm": 5.730342864990234, + "learning_rate": 1.728465487735311e-06, + "loss": 5.4861, + "step": 308 + }, + { + "epoch": 0.05288379257230875, + "grad_norm": 19.06242561340332, + "learning_rate": 1.7341699942954936e-06, + "loss": 9.0657, + "step": 309 + }, + { + "epoch": 0.05305493753208968, + "grad_norm": 18.580720901489258, + "learning_rate": 1.739874500855676e-06, + "loss": 5.9947, + "step": 310 + }, + { + "epoch": 0.053226082491870615, + "grad_norm": 13.939530372619629, + "learning_rate": 1.7455790074158585e-06, + "loss": 7.1715, + "step": 311 + }, + { + "epoch": 0.05339722745165155, + "grad_norm": 12.347646713256836, + "learning_rate": 1.7512835139760412e-06, + "loss": 4.5087, + "step": 312 + }, + { + "epoch": 0.05356837241143248, + "grad_norm": 16.251863479614258, + "learning_rate": 1.7569880205362238e-06, + "loss": 8.7544, + "step": 313 + }, + { + "epoch": 0.053739517371213415, + "grad_norm": 18.887571334838867, + "learning_rate": 1.762692527096406e-06, + "loss": 7.1006, + "step": 314 + }, + { + "epoch": 0.053910662330994355, + "grad_norm": 29.57771873474121, + "learning_rate": 1.7683970336565888e-06, + "loss": 10.2554, + "step": 315 + }, + { + "epoch": 0.05408180729077529, + "grad_norm": 215.26080322265625, + "learning_rate": 1.7741015402167714e-06, + "loss": 10.6589, + "step": 316 + }, + { + "epoch": 0.05425295225055622, + "grad_norm": 6.18715763092041, + "learning_rate": 1.7798060467769537e-06, + "loss": 5.3794, + "step": 317 + }, + { + "epoch": 0.054424097210337155, + "grad_norm": 30.351348876953125, + "learning_rate": 1.7855105533371364e-06, + "loss": 10.3749, + "step": 318 + }, + { + "epoch": 0.05459524217011809, + "grad_norm": 16.978347778320312, + "learning_rate": 1.791215059897319e-06, + "loss": 6.2012, + "step": 319 + }, + { + "epoch": 0.05476638712989902, + "grad_norm": 19.239072799682617, + "learning_rate": 1.7969195664575015e-06, + "loss": 9.1925, + "step": 320 + }, + { + "epoch": 0.05493753208967996, + "grad_norm": 20.378984451293945, + "learning_rate": 1.802624073017684e-06, + "loss": 8.7484, + "step": 321 + }, + { + "epoch": 0.055108677049460895, + "grad_norm": 11.863981246948242, + "learning_rate": 1.8083285795778666e-06, + "loss": 6.308, + "step": 322 + }, + { + "epoch": 0.05527982200924183, + "grad_norm": 15.815791130065918, + "learning_rate": 1.8140330861380493e-06, + "loss": 8.9935, + "step": 323 + }, + { + "epoch": 0.05545096696902276, + "grad_norm": 31.865665435791016, + "learning_rate": 1.8197375926982316e-06, + "loss": 10.1397, + "step": 324 + }, + { + "epoch": 0.055622111928803696, + "grad_norm": 160.87301635742188, + "learning_rate": 1.8254420992584141e-06, + "loss": 9.2965, + "step": 325 + }, + { + "epoch": 0.05579325688858463, + "grad_norm": 16.763856887817383, + "learning_rate": 1.8311466058185969e-06, + "loss": 6.6638, + "step": 326 + }, + { + "epoch": 0.05596440184836556, + "grad_norm": 12.291769981384277, + "learning_rate": 1.8368511123787792e-06, + "loss": 8.2182, + "step": 327 + }, + { + "epoch": 0.0561355468081465, + "grad_norm": 20.839473724365234, + "learning_rate": 1.8425556189389617e-06, + "loss": 5.9446, + "step": 328 + }, + { + "epoch": 0.056306691767927436, + "grad_norm": 41.371337890625, + "learning_rate": 1.8482601254991445e-06, + "loss": 10.0738, + "step": 329 + }, + { + "epoch": 0.05647783672770837, + "grad_norm": 12.416519165039062, + "learning_rate": 1.8539646320593268e-06, + "loss": 7.9372, + "step": 330 + }, + { + "epoch": 0.0566489816874893, + "grad_norm": 12.856998443603516, + "learning_rate": 1.8596691386195093e-06, + "loss": 8.5894, + "step": 331 + }, + { + "epoch": 0.056820126647270236, + "grad_norm": 28.67165184020996, + "learning_rate": 1.865373645179692e-06, + "loss": 9.856, + "step": 332 + }, + { + "epoch": 0.05699127160705117, + "grad_norm": 17.425006866455078, + "learning_rate": 1.8710781517398746e-06, + "loss": 7.6487, + "step": 333 + }, + { + "epoch": 0.05716241656683211, + "grad_norm": 29.102951049804688, + "learning_rate": 1.8767826583000571e-06, + "loss": 9.7985, + "step": 334 + }, + { + "epoch": 0.05733356152661304, + "grad_norm": 15.120597839355469, + "learning_rate": 1.8824871648602395e-06, + "loss": 8.856, + "step": 335 + }, + { + "epoch": 0.057504706486393976, + "grad_norm": 188.02642822265625, + "learning_rate": 1.8881916714204222e-06, + "loss": 9.3274, + "step": 336 + }, + { + "epoch": 0.05767585144617491, + "grad_norm": 14.4713134765625, + "learning_rate": 1.8938961779806047e-06, + "loss": 8.8408, + "step": 337 + }, + { + "epoch": 0.05784699640595584, + "grad_norm": 27.848546981811523, + "learning_rate": 1.8996006845407875e-06, + "loss": 10.1598, + "step": 338 + }, + { + "epoch": 0.058018141365736776, + "grad_norm": 12.024163246154785, + "learning_rate": 1.9053051911009698e-06, + "loss": 6.2088, + "step": 339 + }, + { + "epoch": 0.05818928632551772, + "grad_norm": 11.968954086303711, + "learning_rate": 1.9110096976611523e-06, + "loss": 7.3791, + "step": 340 + }, + { + "epoch": 0.05836043128529865, + "grad_norm": 27.01519775390625, + "learning_rate": 1.9167142042213353e-06, + "loss": 10.5111, + "step": 341 + }, + { + "epoch": 0.05853157624507958, + "grad_norm": 13.136455535888672, + "learning_rate": 1.9224187107815174e-06, + "loss": 4.512, + "step": 342 + }, + { + "epoch": 0.05870272120486052, + "grad_norm": 16.26902198791504, + "learning_rate": 1.9281232173417e-06, + "loss": 6.8285, + "step": 343 + }, + { + "epoch": 0.05887386616464145, + "grad_norm": 16.47487449645996, + "learning_rate": 1.933827723901883e-06, + "loss": 8.8793, + "step": 344 + }, + { + "epoch": 0.059045011124422384, + "grad_norm": 32.750850677490234, + "learning_rate": 1.939532230462065e-06, + "loss": 10.0536, + "step": 345 + }, + { + "epoch": 0.05921615608420332, + "grad_norm": 18.996196746826172, + "learning_rate": 1.9452367370222475e-06, + "loss": 6.1966, + "step": 346 + }, + { + "epoch": 0.05938730104398426, + "grad_norm": 24.546964645385742, + "learning_rate": 1.9509412435824305e-06, + "loss": 9.4677, + "step": 347 + }, + { + "epoch": 0.05955844600376519, + "grad_norm": 84.20301055908203, + "learning_rate": 1.9566457501426126e-06, + "loss": 14.0007, + "step": 348 + }, + { + "epoch": 0.059729590963546124, + "grad_norm": 18.845518112182617, + "learning_rate": 1.962350256702795e-06, + "loss": 6.1715, + "step": 349 + }, + { + "epoch": 0.05990073592332706, + "grad_norm": 32.177085876464844, + "learning_rate": 1.968054763262978e-06, + "loss": 10.5934, + "step": 350 + }, + { + "epoch": 0.06007188088310799, + "grad_norm": 24.051923751831055, + "learning_rate": 1.97375926982316e-06, + "loss": 9.6501, + "step": 351 + }, + { + "epoch": 0.060243025842888924, + "grad_norm": 13.522736549377441, + "learning_rate": 1.9794637763833427e-06, + "loss": 8.8967, + "step": 352 + }, + { + "epoch": 0.060414170802669864, + "grad_norm": 21.437868118286133, + "learning_rate": 1.9851682829435257e-06, + "loss": 9.8243, + "step": 353 + }, + { + "epoch": 0.0605853157624508, + "grad_norm": 30.177589416503906, + "learning_rate": 1.9908727895037078e-06, + "loss": 9.5401, + "step": 354 + }, + { + "epoch": 0.06075646072223173, + "grad_norm": 12.939532279968262, + "learning_rate": 1.9965772960638903e-06, + "loss": 6.4143, + "step": 355 + }, + { + "epoch": 0.060927605682012664, + "grad_norm": 18.022136688232422, + "learning_rate": 2.0022818026240733e-06, + "loss": 9.6522, + "step": 356 + }, + { + "epoch": 0.0610987506417936, + "grad_norm": 12.483067512512207, + "learning_rate": 2.0079863091842554e-06, + "loss": 9.2254, + "step": 357 + }, + { + "epoch": 0.06126989560157453, + "grad_norm": 19.432615280151367, + "learning_rate": 2.0136908157444383e-06, + "loss": 6.2483, + "step": 358 + }, + { + "epoch": 0.06144104056135547, + "grad_norm": 177.3258819580078, + "learning_rate": 2.019395322304621e-06, + "loss": 9.2975, + "step": 359 + }, + { + "epoch": 0.061612185521136405, + "grad_norm": 14.458636283874512, + "learning_rate": 2.025099828864803e-06, + "loss": 8.5874, + "step": 360 + }, + { + "epoch": 0.06178333048091734, + "grad_norm": 21.112350463867188, + "learning_rate": 2.030804335424986e-06, + "loss": 9.4896, + "step": 361 + }, + { + "epoch": 0.06195447544069827, + "grad_norm": 15.956084251403809, + "learning_rate": 2.0365088419851685e-06, + "loss": 9.3311, + "step": 362 + }, + { + "epoch": 0.062125620400479205, + "grad_norm": 11.96216869354248, + "learning_rate": 2.042213348545351e-06, + "loss": 8.2885, + "step": 363 + }, + { + "epoch": 0.06229676536026014, + "grad_norm": 16.588687896728516, + "learning_rate": 2.0479178551055335e-06, + "loss": 8.5745, + "step": 364 + }, + { + "epoch": 0.06246791032004107, + "grad_norm": 20.95501708984375, + "learning_rate": 2.053622361665716e-06, + "loss": 9.5327, + "step": 365 + }, + { + "epoch": 0.06263905527982201, + "grad_norm": 14.255351066589355, + "learning_rate": 2.0593268682258986e-06, + "loss": 9.1372, + "step": 366 + }, + { + "epoch": 0.06281020023960295, + "grad_norm": 17.529571533203125, + "learning_rate": 2.065031374786081e-06, + "loss": 6.9098, + "step": 367 + }, + { + "epoch": 0.06298134519938388, + "grad_norm": 23.381641387939453, + "learning_rate": 2.0707358813462636e-06, + "loss": 9.4994, + "step": 368 + }, + { + "epoch": 0.06315249015916481, + "grad_norm": 152.30535888671875, + "learning_rate": 2.076440387906446e-06, + "loss": 8.5952, + "step": 369 + }, + { + "epoch": 0.06332363511894575, + "grad_norm": 15.447931289672852, + "learning_rate": 2.0821448944666287e-06, + "loss": 7.1287, + "step": 370 + }, + { + "epoch": 0.06349478007872668, + "grad_norm": 13.553053855895996, + "learning_rate": 2.0878494010268112e-06, + "loss": 8.0622, + "step": 371 + }, + { + "epoch": 0.06366592503850761, + "grad_norm": 13.198517799377441, + "learning_rate": 2.0935539075869938e-06, + "loss": 8.3527, + "step": 372 + }, + { + "epoch": 0.06383706999828855, + "grad_norm": 21.851369857788086, + "learning_rate": 2.0992584141471763e-06, + "loss": 6.7771, + "step": 373 + }, + { + "epoch": 0.06400821495806948, + "grad_norm": 30.56134605407715, + "learning_rate": 2.104962920707359e-06, + "loss": 9.666, + "step": 374 + }, + { + "epoch": 0.06417935991785043, + "grad_norm": 18.76494026184082, + "learning_rate": 2.1106674272675414e-06, + "loss": 9.3941, + "step": 375 + }, + { + "epoch": 0.06435050487763136, + "grad_norm": 19.92658805847168, + "learning_rate": 2.116371933827724e-06, + "loss": 9.3741, + "step": 376 + }, + { + "epoch": 0.06452164983741229, + "grad_norm": 10.430363655090332, + "learning_rate": 2.1220764403879064e-06, + "loss": 7.8113, + "step": 377 + }, + { + "epoch": 0.06469279479719323, + "grad_norm": 18.093847274780273, + "learning_rate": 2.1277809469480894e-06, + "loss": 6.1706, + "step": 378 + }, + { + "epoch": 0.06486393975697416, + "grad_norm": 21.807714462280273, + "learning_rate": 2.1334854535082715e-06, + "loss": 9.471, + "step": 379 + }, + { + "epoch": 0.06503508471675509, + "grad_norm": 10.38511848449707, + "learning_rate": 2.139189960068454e-06, + "loss": 4.2784, + "step": 380 + }, + { + "epoch": 0.06520622967653603, + "grad_norm": 18.564613342285156, + "learning_rate": 2.144894466628637e-06, + "loss": 9.548, + "step": 381 + }, + { + "epoch": 0.06537737463631696, + "grad_norm": 13.890935897827148, + "learning_rate": 2.150598973188819e-06, + "loss": 7.9354, + "step": 382 + }, + { + "epoch": 0.06554851959609789, + "grad_norm": 18.593252182006836, + "learning_rate": 2.1563034797490016e-06, + "loss": 6.34, + "step": 383 + }, + { + "epoch": 0.06571966455587883, + "grad_norm": 10.455931663513184, + "learning_rate": 2.1620079863091846e-06, + "loss": 6.2716, + "step": 384 + }, + { + "epoch": 0.06589080951565976, + "grad_norm": 21.231943130493164, + "learning_rate": 2.1677124928693667e-06, + "loss": 5.9761, + "step": 385 + }, + { + "epoch": 0.06606195447544069, + "grad_norm": 11.568195343017578, + "learning_rate": 2.173416999429549e-06, + "loss": 4.4776, + "step": 386 + }, + { + "epoch": 0.06623309943522163, + "grad_norm": 23.829204559326172, + "learning_rate": 2.179121505989732e-06, + "loss": 9.648, + "step": 387 + }, + { + "epoch": 0.06640424439500257, + "grad_norm": 10.398987770080566, + "learning_rate": 2.1848260125499147e-06, + "loss": 4.7062, + "step": 388 + }, + { + "epoch": 0.0665753893547835, + "grad_norm": 11.396307945251465, + "learning_rate": 2.190530519110097e-06, + "loss": 8.1087, + "step": 389 + }, + { + "epoch": 0.06674653431456444, + "grad_norm": 18.780866622924805, + "learning_rate": 2.1962350256702798e-06, + "loss": 6.196, + "step": 390 + }, + { + "epoch": 0.06691767927434537, + "grad_norm": 18.36736488342285, + "learning_rate": 2.2019395322304623e-06, + "loss": 6.2459, + "step": 391 + }, + { + "epoch": 0.0670888242341263, + "grad_norm": 18.681446075439453, + "learning_rate": 2.2076440387906444e-06, + "loss": 9.4161, + "step": 392 + }, + { + "epoch": 0.06725996919390724, + "grad_norm": 15.113629341125488, + "learning_rate": 2.2133485453508274e-06, + "loss": 6.3517, + "step": 393 + }, + { + "epoch": 0.06743111415368817, + "grad_norm": 11.273137092590332, + "learning_rate": 2.21905305191101e-06, + "loss": 8.0886, + "step": 394 + }, + { + "epoch": 0.06760225911346911, + "grad_norm": 17.580646514892578, + "learning_rate": 2.224757558471192e-06, + "loss": 6.5059, + "step": 395 + }, + { + "epoch": 0.06777340407325004, + "grad_norm": 15.864416122436523, + "learning_rate": 2.230462065031375e-06, + "loss": 8.5624, + "step": 396 + }, + { + "epoch": 0.06794454903303097, + "grad_norm": 11.407431602478027, + "learning_rate": 2.2361665715915575e-06, + "loss": 7.853, + "step": 397 + }, + { + "epoch": 0.06811569399281191, + "grad_norm": 28.192079544067383, + "learning_rate": 2.24187107815174e-06, + "loss": 9.4467, + "step": 398 + }, + { + "epoch": 0.06828683895259284, + "grad_norm": 19.4180965423584, + "learning_rate": 2.2475755847119225e-06, + "loss": 5.6605, + "step": 399 + }, + { + "epoch": 0.06845798391237377, + "grad_norm": 19.75929069519043, + "learning_rate": 2.253280091272105e-06, + "loss": 9.4512, + "step": 400 + }, + { + "epoch": 0.06862912887215472, + "grad_norm": 10.311906814575195, + "learning_rate": 2.2589845978322876e-06, + "loss": 7.9644, + "step": 401 + }, + { + "epoch": 0.06880027383193565, + "grad_norm": 20.4741268157959, + "learning_rate": 2.26468910439247e-06, + "loss": 9.4577, + "step": 402 + }, + { + "epoch": 0.06897141879171659, + "grad_norm": 25.65606117248535, + "learning_rate": 2.2703936109526527e-06, + "loss": 9.6371, + "step": 403 + }, + { + "epoch": 0.06914256375149752, + "grad_norm": 26.26441192626953, + "learning_rate": 2.276098117512835e-06, + "loss": 9.5365, + "step": 404 + }, + { + "epoch": 0.06931370871127845, + "grad_norm": 14.249612808227539, + "learning_rate": 2.2818026240730177e-06, + "loss": 8.5897, + "step": 405 + }, + { + "epoch": 0.06948485367105939, + "grad_norm": 17.306989669799805, + "learning_rate": 2.2875071306332003e-06, + "loss": 6.9065, + "step": 406 + }, + { + "epoch": 0.06965599863084032, + "grad_norm": 10.925597190856934, + "learning_rate": 2.293211637193383e-06, + "loss": 4.4132, + "step": 407 + }, + { + "epoch": 0.06982714359062125, + "grad_norm": 20.995426177978516, + "learning_rate": 2.2989161437535653e-06, + "loss": 9.6018, + "step": 408 + }, + { + "epoch": 0.06999828855040219, + "grad_norm": 13.343510627746582, + "learning_rate": 2.304620650313748e-06, + "loss": 8.3354, + "step": 409 + }, + { + "epoch": 0.07016943351018312, + "grad_norm": 21.461809158325195, + "learning_rate": 2.3103251568739304e-06, + "loss": 9.3101, + "step": 410 + }, + { + "epoch": 0.07034057846996405, + "grad_norm": 25.428903579711914, + "learning_rate": 2.316029663434113e-06, + "loss": 9.4155, + "step": 411 + }, + { + "epoch": 0.07051172342974499, + "grad_norm": 22.469390869140625, + "learning_rate": 2.3217341699942955e-06, + "loss": 6.4331, + "step": 412 + }, + { + "epoch": 0.07068286838952594, + "grad_norm": 157.02752685546875, + "learning_rate": 2.3274386765544784e-06, + "loss": 7.6313, + "step": 413 + }, + { + "epoch": 0.07085401334930687, + "grad_norm": 12.20741081237793, + "learning_rate": 2.3331431831146605e-06, + "loss": 4.2273, + "step": 414 + }, + { + "epoch": 0.0710251583090878, + "grad_norm": 19.81876564025879, + "learning_rate": 2.338847689674843e-06, + "loss": 9.5364, + "step": 415 + }, + { + "epoch": 0.07119630326886874, + "grad_norm": 17.362276077270508, + "learning_rate": 2.344552196235026e-06, + "loss": 9.4605, + "step": 416 + }, + { + "epoch": 0.07136744822864967, + "grad_norm": 22.898147583007812, + "learning_rate": 2.350256702795208e-06, + "loss": 9.5846, + "step": 417 + }, + { + "epoch": 0.0715385931884306, + "grad_norm": 17.685535430908203, + "learning_rate": 2.3559612093553906e-06, + "loss": 8.0604, + "step": 418 + }, + { + "epoch": 0.07170973814821154, + "grad_norm": 16.97225570678711, + "learning_rate": 2.3616657159155736e-06, + "loss": 9.0822, + "step": 419 + }, + { + "epoch": 0.07188088310799247, + "grad_norm": 21.690431594848633, + "learning_rate": 2.3673702224757557e-06, + "loss": 5.9587, + "step": 420 + }, + { + "epoch": 0.0720520280677734, + "grad_norm": 20.209810256958008, + "learning_rate": 2.3730747290359387e-06, + "loss": 6.1507, + "step": 421 + }, + { + "epoch": 0.07222317302755434, + "grad_norm": 19.15233039855957, + "learning_rate": 2.378779235596121e-06, + "loss": 9.5222, + "step": 422 + }, + { + "epoch": 0.07239431798733527, + "grad_norm": 15.19393539428711, + "learning_rate": 2.3844837421563033e-06, + "loss": 8.3063, + "step": 423 + }, + { + "epoch": 0.0725654629471162, + "grad_norm": 14.138923645019531, + "learning_rate": 2.3901882487164863e-06, + "loss": 8.3802, + "step": 424 + }, + { + "epoch": 0.07273660790689714, + "grad_norm": 23.83425521850586, + "learning_rate": 2.395892755276669e-06, + "loss": 9.554, + "step": 425 + }, + { + "epoch": 0.07290775286667808, + "grad_norm": 19.778850555419922, + "learning_rate": 2.401597261836851e-06, + "loss": 6.0866, + "step": 426 + }, + { + "epoch": 0.07307889782645902, + "grad_norm": 12.418360710144043, + "learning_rate": 2.407301768397034e-06, + "loss": 7.7723, + "step": 427 + }, + { + "epoch": 0.07325004278623995, + "grad_norm": 21.105587005615234, + "learning_rate": 2.4130062749572164e-06, + "loss": 6.011, + "step": 428 + }, + { + "epoch": 0.07342118774602088, + "grad_norm": 18.78055763244629, + "learning_rate": 2.4187107815173985e-06, + "loss": 6.4389, + "step": 429 + }, + { + "epoch": 0.07359233270580182, + "grad_norm": 17.227916717529297, + "learning_rate": 2.4244152880775814e-06, + "loss": 6.6973, + "step": 430 + }, + { + "epoch": 0.07376347766558275, + "grad_norm": 21.845876693725586, + "learning_rate": 2.430119794637764e-06, + "loss": 5.9158, + "step": 431 + }, + { + "epoch": 0.07393462262536368, + "grad_norm": 14.355096817016602, + "learning_rate": 2.435824301197946e-06, + "loss": 8.6576, + "step": 432 + }, + { + "epoch": 0.07410576758514462, + "grad_norm": 149.28054809570312, + "learning_rate": 2.441528807758129e-06, + "loss": 7.7649, + "step": 433 + }, + { + "epoch": 0.07427691254492555, + "grad_norm": 18.152389526367188, + "learning_rate": 2.4472333143183116e-06, + "loss": 6.6434, + "step": 434 + }, + { + "epoch": 0.07444805750470648, + "grad_norm": 17.05584716796875, + "learning_rate": 2.4529378208784937e-06, + "loss": 9.1462, + "step": 435 + }, + { + "epoch": 0.07461920246448742, + "grad_norm": 11.82278060913086, + "learning_rate": 2.4586423274386766e-06, + "loss": 8.2832, + "step": 436 + }, + { + "epoch": 0.07479034742426835, + "grad_norm": 17.951648712158203, + "learning_rate": 2.464346833998859e-06, + "loss": 8.4052, + "step": 437 + }, + { + "epoch": 0.07496149238404928, + "grad_norm": 31.258188247680664, + "learning_rate": 2.4700513405590417e-06, + "loss": 9.4477, + "step": 438 + }, + { + "epoch": 0.07513263734383023, + "grad_norm": 138.91761779785156, + "learning_rate": 2.4757558471192242e-06, + "loss": 8.3869, + "step": 439 + }, + { + "epoch": 0.07530378230361116, + "grad_norm": 17.930551528930664, + "learning_rate": 2.4814603536794068e-06, + "loss": 9.1768, + "step": 440 + }, + { + "epoch": 0.0754749272633921, + "grad_norm": 10.999883651733398, + "learning_rate": 2.4871648602395897e-06, + "loss": 4.1341, + "step": 441 + }, + { + "epoch": 0.07564607222317303, + "grad_norm": 19.707490921020508, + "learning_rate": 2.492869366799772e-06, + "loss": 6.0241, + "step": 442 + }, + { + "epoch": 0.07581721718295396, + "grad_norm": 19.63069725036621, + "learning_rate": 2.4985738733599544e-06, + "loss": 9.5659, + "step": 443 + }, + { + "epoch": 0.0759883621427349, + "grad_norm": 19.783658981323242, + "learning_rate": 2.5042783799201373e-06, + "loss": 6.632, + "step": 444 + }, + { + "epoch": 0.07615950710251583, + "grad_norm": 11.193924903869629, + "learning_rate": 2.5099828864803194e-06, + "loss": 4.213, + "step": 445 + }, + { + "epoch": 0.07633065206229676, + "grad_norm": 65.09992218017578, + "learning_rate": 2.515687393040502e-06, + "loss": 13.1721, + "step": 446 + }, + { + "epoch": 0.0765017970220777, + "grad_norm": 19.081214904785156, + "learning_rate": 2.521391899600685e-06, + "loss": 8.7605, + "step": 447 + }, + { + "epoch": 0.07667294198185863, + "grad_norm": 17.08602523803711, + "learning_rate": 2.527096406160867e-06, + "loss": 8.4352, + "step": 448 + }, + { + "epoch": 0.07684408694163956, + "grad_norm": 11.796391487121582, + "learning_rate": 2.5328009127210495e-06, + "loss": 7.9838, + "step": 449 + }, + { + "epoch": 0.0770152319014205, + "grad_norm": 17.306316375732422, + "learning_rate": 2.5385054192812325e-06, + "loss": 7.9123, + "step": 450 + }, + { + "epoch": 0.07718637686120144, + "grad_norm": 11.991724014282227, + "learning_rate": 2.5442099258414146e-06, + "loss": 7.904, + "step": 451 + }, + { + "epoch": 0.07735752182098238, + "grad_norm": 18.394563674926758, + "learning_rate": 2.549914432401597e-06, + "loss": 6.7541, + "step": 452 + }, + { + "epoch": 0.07752866678076331, + "grad_norm": 21.436811447143555, + "learning_rate": 2.55561893896178e-06, + "loss": 5.5488, + "step": 453 + }, + { + "epoch": 0.07769981174054424, + "grad_norm": 15.822162628173828, + "learning_rate": 2.561323445521962e-06, + "loss": 7.7392, + "step": 454 + }, + { + "epoch": 0.07787095670032518, + "grad_norm": 19.68645668029785, + "learning_rate": 2.5670279520821447e-06, + "loss": 6.6529, + "step": 455 + }, + { + "epoch": 0.07804210166010611, + "grad_norm": 18.808198928833008, + "learning_rate": 2.5727324586423277e-06, + "loss": 8.784, + "step": 456 + }, + { + "epoch": 0.07821324661988704, + "grad_norm": 131.1753692626953, + "learning_rate": 2.57843696520251e-06, + "loss": 7.8706, + "step": 457 + }, + { + "epoch": 0.07838439157966798, + "grad_norm": 11.708639144897461, + "learning_rate": 2.5841414717626923e-06, + "loss": 7.7402, + "step": 458 + }, + { + "epoch": 0.07855553653944891, + "grad_norm": 15.965631484985352, + "learning_rate": 2.5898459783228753e-06, + "loss": 8.301, + "step": 459 + }, + { + "epoch": 0.07872668149922984, + "grad_norm": 14.710309982299805, + "learning_rate": 2.5955504848830574e-06, + "loss": 7.9566, + "step": 460 + }, + { + "epoch": 0.07889782645901078, + "grad_norm": 15.00783634185791, + "learning_rate": 2.6012549914432404e-06, + "loss": 8.488, + "step": 461 + }, + { + "epoch": 0.07906897141879171, + "grad_norm": 13.231627464294434, + "learning_rate": 2.606959498003423e-06, + "loss": 8.1184, + "step": 462 + }, + { + "epoch": 0.07924011637857264, + "grad_norm": 170.4566192626953, + "learning_rate": 2.6126640045636054e-06, + "loss": 8.1805, + "step": 463 + }, + { + "epoch": 0.07941126133835359, + "grad_norm": 23.66990089416504, + "learning_rate": 2.618368511123788e-06, + "loss": 9.2852, + "step": 464 + }, + { + "epoch": 0.07958240629813453, + "grad_norm": 20.218496322631836, + "learning_rate": 2.6240730176839705e-06, + "loss": 6.264, + "step": 465 + }, + { + "epoch": 0.07975355125791546, + "grad_norm": 27.905323028564453, + "learning_rate": 2.629777524244153e-06, + "loss": 10.0002, + "step": 466 + }, + { + "epoch": 0.07992469621769639, + "grad_norm": 22.043649673461914, + "learning_rate": 2.6354820308043355e-06, + "loss": 8.6303, + "step": 467 + }, + { + "epoch": 0.08009584117747733, + "grad_norm": 20.095890045166016, + "learning_rate": 2.641186537364518e-06, + "loss": 8.7857, + "step": 468 + }, + { + "epoch": 0.08026698613725826, + "grad_norm": 30.715435028076172, + "learning_rate": 2.6468910439247006e-06, + "loss": 9.6486, + "step": 469 + }, + { + "epoch": 0.08043813109703919, + "grad_norm": 18.83611488342285, + "learning_rate": 2.652595550484883e-06, + "loss": 7.9544, + "step": 470 + }, + { + "epoch": 0.08060927605682013, + "grad_norm": 20.929931640625, + "learning_rate": 2.6583000570450657e-06, + "loss": 6.2772, + "step": 471 + }, + { + "epoch": 0.08078042101660106, + "grad_norm": 18.414594650268555, + "learning_rate": 2.664004563605248e-06, + "loss": 6.1477, + "step": 472 + }, + { + "epoch": 0.08095156597638199, + "grad_norm": 18.188846588134766, + "learning_rate": 2.6697090701654307e-06, + "loss": 7.099, + "step": 473 + }, + { + "epoch": 0.08112271093616293, + "grad_norm": 8.666217803955078, + "learning_rate": 2.6754135767256133e-06, + "loss": 5.0929, + "step": 474 + }, + { + "epoch": 0.08129385589594386, + "grad_norm": 15.457167625427246, + "learning_rate": 2.681118083285796e-06, + "loss": 7.8706, + "step": 475 + }, + { + "epoch": 0.08146500085572479, + "grad_norm": 17.11892318725586, + "learning_rate": 2.6868225898459783e-06, + "loss": 8.5293, + "step": 476 + }, + { + "epoch": 0.08163614581550574, + "grad_norm": 28.18759536743164, + "learning_rate": 2.692527096406161e-06, + "loss": 5.7448, + "step": 477 + }, + { + "epoch": 0.08180729077528667, + "grad_norm": 19.842830657958984, + "learning_rate": 2.6982316029663434e-06, + "loss": 8.5854, + "step": 478 + }, + { + "epoch": 0.0819784357350676, + "grad_norm": 59.76820373535156, + "learning_rate": 2.703936109526526e-06, + "loss": 12.4879, + "step": 479 + }, + { + "epoch": 0.08214958069484854, + "grad_norm": 15.530830383300781, + "learning_rate": 2.7096406160867085e-06, + "loss": 8.191, + "step": 480 + }, + { + "epoch": 0.08232072565462947, + "grad_norm": 21.211435317993164, + "learning_rate": 2.7153451226468914e-06, + "loss": 9.4326, + "step": 481 + }, + { + "epoch": 0.0824918706144104, + "grad_norm": 16.38536834716797, + "learning_rate": 2.7210496292070735e-06, + "loss": 6.3342, + "step": 482 + }, + { + "epoch": 0.08266301557419134, + "grad_norm": 30.17742919921875, + "learning_rate": 2.726754135767256e-06, + "loss": 5.5068, + "step": 483 + }, + { + "epoch": 0.08283416053397227, + "grad_norm": 27.44713020324707, + "learning_rate": 2.732458642327439e-06, + "loss": 9.4586, + "step": 484 + }, + { + "epoch": 0.0830053054937532, + "grad_norm": 59.46120071411133, + "learning_rate": 2.738163148887621e-06, + "loss": 12.3889, + "step": 485 + }, + { + "epoch": 0.08317645045353414, + "grad_norm": 26.801589965820312, + "learning_rate": 2.7438676554478036e-06, + "loss": 5.3141, + "step": 486 + }, + { + "epoch": 0.08334759541331507, + "grad_norm": 32.20411682128906, + "learning_rate": 2.7495721620079866e-06, + "loss": 5.4274, + "step": 487 + }, + { + "epoch": 0.083518740373096, + "grad_norm": 16.14412498474121, + "learning_rate": 2.755276668568169e-06, + "loss": 8.3447, + "step": 488 + }, + { + "epoch": 0.08368988533287694, + "grad_norm": 16.79600715637207, + "learning_rate": 2.7609811751283512e-06, + "loss": 7.7737, + "step": 489 + }, + { + "epoch": 0.08386103029265789, + "grad_norm": 171.59872436523438, + "learning_rate": 2.766685681688534e-06, + "loss": 8.277, + "step": 490 + }, + { + "epoch": 0.08403217525243882, + "grad_norm": 29.80289649963379, + "learning_rate": 2.7723901882487167e-06, + "loss": 5.273, + "step": 491 + }, + { + "epoch": 0.08420332021221975, + "grad_norm": 15.38176155090332, + "learning_rate": 2.778094694808899e-06, + "loss": 7.8611, + "step": 492 + }, + { + "epoch": 0.08437446517200069, + "grad_norm": 19.766082763671875, + "learning_rate": 2.783799201369082e-06, + "loss": 7.7926, + "step": 493 + }, + { + "epoch": 0.08454561013178162, + "grad_norm": 13.274962425231934, + "learning_rate": 2.7895037079292643e-06, + "loss": 4.1215, + "step": 494 + }, + { + "epoch": 0.08471675509156255, + "grad_norm": 29.015403747558594, + "learning_rate": 2.7952082144894464e-06, + "loss": 5.4146, + "step": 495 + }, + { + "epoch": 0.08488790005134349, + "grad_norm": 22.243703842163086, + "learning_rate": 2.8009127210496294e-06, + "loss": 5.753, + "step": 496 + }, + { + "epoch": 0.08505904501112442, + "grad_norm": 23.75475311279297, + "learning_rate": 2.806617227609812e-06, + "loss": 5.7119, + "step": 497 + }, + { + "epoch": 0.08523018997090535, + "grad_norm": 19.524032592773438, + "learning_rate": 2.812321734169994e-06, + "loss": 8.9719, + "step": 498 + }, + { + "epoch": 0.08540133493068629, + "grad_norm": 22.207155227661133, + "learning_rate": 2.818026240730177e-06, + "loss": 8.5433, + "step": 499 + }, + { + "epoch": 0.08557247989046722, + "grad_norm": 20.369564056396484, + "learning_rate": 2.8237307472903595e-06, + "loss": 9.2212, + "step": 500 + }, + { + "epoch": 0.08574362485024815, + "grad_norm": 12.617632865905762, + "learning_rate": 2.829435253850542e-06, + "loss": 7.5878, + "step": 501 + }, + { + "epoch": 0.0859147698100291, + "grad_norm": 16.92389678955078, + "learning_rate": 2.8351397604107246e-06, + "loss": 8.1394, + "step": 502 + }, + { + "epoch": 0.08608591476981003, + "grad_norm": 52.22781753540039, + "learning_rate": 2.840844266970907e-06, + "loss": 11.9304, + "step": 503 + }, + { + "epoch": 0.08625705972959097, + "grad_norm": 19.299196243286133, + "learning_rate": 2.8465487735310896e-06, + "loss": 7.5487, + "step": 504 + }, + { + "epoch": 0.0864282046893719, + "grad_norm": 25.007366180419922, + "learning_rate": 2.852253280091272e-06, + "loss": 6.4953, + "step": 505 + }, + { + "epoch": 0.08659934964915283, + "grad_norm": 44.58477020263672, + "learning_rate": 2.8579577866514547e-06, + "loss": 11.543, + "step": 506 + }, + { + "epoch": 0.08677049460893377, + "grad_norm": 18.95302963256836, + "learning_rate": 2.8636622932116372e-06, + "loss": 7.7713, + "step": 507 + }, + { + "epoch": 0.0869416395687147, + "grad_norm": 15.56648063659668, + "learning_rate": 2.8693667997718198e-06, + "loss": 8.5567, + "step": 508 + }, + { + "epoch": 0.08711278452849563, + "grad_norm": 20.78284454345703, + "learning_rate": 2.8750713063320023e-06, + "loss": 7.7135, + "step": 509 + }, + { + "epoch": 0.08728392948827657, + "grad_norm": 23.176607131958008, + "learning_rate": 2.880775812892185e-06, + "loss": 8.2685, + "step": 510 + }, + { + "epoch": 0.0874550744480575, + "grad_norm": 25.212718963623047, + "learning_rate": 2.8864803194523674e-06, + "loss": 8.9983, + "step": 511 + }, + { + "epoch": 0.08762621940783843, + "grad_norm": 27.220836639404297, + "learning_rate": 2.89218482601255e-06, + "loss": 6.6334, + "step": 512 + }, + { + "epoch": 0.08779736436761937, + "grad_norm": 13.128168106079102, + "learning_rate": 2.897889332572733e-06, + "loss": 3.847, + "step": 513 + }, + { + "epoch": 0.0879685093274003, + "grad_norm": 19.84160614013672, + "learning_rate": 2.903593839132915e-06, + "loss": 8.0045, + "step": 514 + }, + { + "epoch": 0.08813965428718125, + "grad_norm": 15.77076530456543, + "learning_rate": 2.9092983456930975e-06, + "loss": 7.8019, + "step": 515 + }, + { + "epoch": 0.08831079924696218, + "grad_norm": 158.41465759277344, + "learning_rate": 2.9150028522532804e-06, + "loss": 8.6448, + "step": 516 + }, + { + "epoch": 0.08848194420674312, + "grad_norm": 23.563339233398438, + "learning_rate": 2.9207073588134625e-06, + "loss": 8.8163, + "step": 517 + }, + { + "epoch": 0.08865308916652405, + "grad_norm": 30.82549476623535, + "learning_rate": 2.926411865373645e-06, + "loss": 8.627, + "step": 518 + }, + { + "epoch": 0.08882423412630498, + "grad_norm": 24.138612747192383, + "learning_rate": 2.932116371933828e-06, + "loss": 6.2112, + "step": 519 + }, + { + "epoch": 0.08899537908608592, + "grad_norm": 42.6961784362793, + "learning_rate": 2.93782087849401e-06, + "loss": 11.5101, + "step": 520 + }, + { + "epoch": 0.08916652404586685, + "grad_norm": 16.58330726623535, + "learning_rate": 2.943525385054193e-06, + "loss": 7.885, + "step": 521 + }, + { + "epoch": 0.08933766900564778, + "grad_norm": 17.490467071533203, + "learning_rate": 2.9492298916143756e-06, + "loss": 7.6631, + "step": 522 + }, + { + "epoch": 0.08950881396542872, + "grad_norm": 24.303665161132812, + "learning_rate": 2.9549343981745577e-06, + "loss": 8.5512, + "step": 523 + }, + { + "epoch": 0.08967995892520965, + "grad_norm": 14.5447416305542, + "learning_rate": 2.9606389047347407e-06, + "loss": 3.9844, + "step": 524 + }, + { + "epoch": 0.08985110388499058, + "grad_norm": 28.421756744384766, + "learning_rate": 2.9663434112949232e-06, + "loss": 9.2179, + "step": 525 + }, + { + "epoch": 0.09002224884477152, + "grad_norm": 20.097034454345703, + "learning_rate": 2.9720479178551053e-06, + "loss": 9.2422, + "step": 526 + }, + { + "epoch": 0.09019339380455245, + "grad_norm": 20.862869262695312, + "learning_rate": 2.9777524244152883e-06, + "loss": 8.2303, + "step": 527 + }, + { + "epoch": 0.0903645387643334, + "grad_norm": 30.980390548706055, + "learning_rate": 2.983456930975471e-06, + "loss": 9.1253, + "step": 528 + }, + { + "epoch": 0.09053568372411433, + "grad_norm": 29.973567962646484, + "learning_rate": 2.989161437535653e-06, + "loss": 4.8928, + "step": 529 + }, + { + "epoch": 0.09070682868389526, + "grad_norm": 35.399349212646484, + "learning_rate": 2.994865944095836e-06, + "loss": 4.5559, + "step": 530 + }, + { + "epoch": 0.0908779736436762, + "grad_norm": 21.178098678588867, + "learning_rate": 3.0005704506560184e-06, + "loss": 8.8876, + "step": 531 + }, + { + "epoch": 0.09104911860345713, + "grad_norm": 24.755205154418945, + "learning_rate": 3.0062749572162005e-06, + "loss": 7.5809, + "step": 532 + }, + { + "epoch": 0.09122026356323806, + "grad_norm": 23.76934051513672, + "learning_rate": 3.0119794637763835e-06, + "loss": 8.5706, + "step": 533 + }, + { + "epoch": 0.091391408523019, + "grad_norm": 40.431190490722656, + "learning_rate": 3.017683970336566e-06, + "loss": 11.3342, + "step": 534 + }, + { + "epoch": 0.09156255348279993, + "grad_norm": 22.674354553222656, + "learning_rate": 3.023388476896748e-06, + "loss": 8.8756, + "step": 535 + }, + { + "epoch": 0.09173369844258086, + "grad_norm": 33.92606735229492, + "learning_rate": 3.029092983456931e-06, + "loss": 4.738, + "step": 536 + }, + { + "epoch": 0.0919048434023618, + "grad_norm": 27.1170711517334, + "learning_rate": 3.0347974900171136e-06, + "loss": 6.4223, + "step": 537 + }, + { + "epoch": 0.09207598836214273, + "grad_norm": 25.11066246032715, + "learning_rate": 3.040501996577296e-06, + "loss": 9.0946, + "step": 538 + }, + { + "epoch": 0.09224713332192366, + "grad_norm": 23.894901275634766, + "learning_rate": 3.0462065031374787e-06, + "loss": 6.3395, + "step": 539 + }, + { + "epoch": 0.09241827828170461, + "grad_norm": 20.199861526489258, + "learning_rate": 3.051911009697661e-06, + "loss": 6.0524, + "step": 540 + }, + { + "epoch": 0.09258942324148554, + "grad_norm": 22.757362365722656, + "learning_rate": 3.057615516257844e-06, + "loss": 7.1293, + "step": 541 + }, + { + "epoch": 0.09276056820126648, + "grad_norm": 22.62543487548828, + "learning_rate": 3.0633200228180263e-06, + "loss": 7.5053, + "step": 542 + }, + { + "epoch": 0.09293171316104741, + "grad_norm": 16.598411560058594, + "learning_rate": 3.069024529378209e-06, + "loss": 7.8322, + "step": 543 + }, + { + "epoch": 0.09310285812082834, + "grad_norm": 20.656627655029297, + "learning_rate": 3.0747290359383917e-06, + "loss": 8.8013, + "step": 544 + }, + { + "epoch": 0.09327400308060928, + "grad_norm": 20.95423126220703, + "learning_rate": 3.080433542498574e-06, + "loss": 6.1923, + "step": 545 + }, + { + "epoch": 0.09344514804039021, + "grad_norm": 175.26722717285156, + "learning_rate": 3.0861380490587564e-06, + "loss": 10.2252, + "step": 546 + }, + { + "epoch": 0.09361629300017114, + "grad_norm": 21.737558364868164, + "learning_rate": 3.0918425556189393e-06, + "loss": 7.7486, + "step": 547 + }, + { + "epoch": 0.09378743795995208, + "grad_norm": 41.67558288574219, + "learning_rate": 3.0975470621791215e-06, + "loss": 11.1347, + "step": 548 + }, + { + "epoch": 0.09395858291973301, + "grad_norm": 24.20724868774414, + "learning_rate": 3.103251568739304e-06, + "loss": 8.1228, + "step": 549 + }, + { + "epoch": 0.09412972787951394, + "grad_norm": 23.995750427246094, + "learning_rate": 3.108956075299487e-06, + "loss": 8.1871, + "step": 550 + }, + { + "epoch": 0.09430087283929488, + "grad_norm": 18.58646583557129, + "learning_rate": 3.114660581859669e-06, + "loss": 7.4311, + "step": 551 + }, + { + "epoch": 0.09447201779907581, + "grad_norm": 26.01420021057129, + "learning_rate": 3.1203650884198516e-06, + "loss": 9.3426, + "step": 552 + }, + { + "epoch": 0.09464316275885676, + "grad_norm": 18.335588455200195, + "learning_rate": 3.1260695949800345e-06, + "loss": 8.9575, + "step": 553 + }, + { + "epoch": 0.09481430771863769, + "grad_norm": 21.414621353149414, + "learning_rate": 3.1317741015402166e-06, + "loss": 7.744, + "step": 554 + }, + { + "epoch": 0.09498545267841862, + "grad_norm": 15.28297233581543, + "learning_rate": 3.137478608100399e-06, + "loss": 8.0683, + "step": 555 + }, + { + "epoch": 0.09515659763819956, + "grad_norm": 20.182992935180664, + "learning_rate": 3.143183114660582e-06, + "loss": 7.5161, + "step": 556 + }, + { + "epoch": 0.09532774259798049, + "grad_norm": 22.94892120361328, + "learning_rate": 3.1488876212207642e-06, + "loss": 7.728, + "step": 557 + }, + { + "epoch": 0.09549888755776142, + "grad_norm": 16.93927764892578, + "learning_rate": 3.1545921277809468e-06, + "loss": 7.7731, + "step": 558 + }, + { + "epoch": 0.09567003251754236, + "grad_norm": 21.27535629272461, + "learning_rate": 3.1602966343411297e-06, + "loss": 8.7629, + "step": 559 + }, + { + "epoch": 0.09584117747732329, + "grad_norm": 20.056377410888672, + "learning_rate": 3.166001140901312e-06, + "loss": 7.3943, + "step": 560 + }, + { + "epoch": 0.09601232243710422, + "grad_norm": 37.84750747680664, + "learning_rate": 3.1717056474614948e-06, + "loss": 10.5613, + "step": 561 + }, + { + "epoch": 0.09618346739688516, + "grad_norm": 19.577177047729492, + "learning_rate": 3.1774101540216773e-06, + "loss": 7.6761, + "step": 562 + }, + { + "epoch": 0.09635461235666609, + "grad_norm": 22.209712982177734, + "learning_rate": 3.18311466058186e-06, + "loss": 6.9584, + "step": 563 + }, + { + "epoch": 0.09652575731644703, + "grad_norm": 25.258302688598633, + "learning_rate": 3.1888191671420424e-06, + "loss": 8.0565, + "step": 564 + }, + { + "epoch": 0.09669690227622796, + "grad_norm": 15.993329048156738, + "learning_rate": 3.194523673702225e-06, + "loss": 4.2519, + "step": 565 + }, + { + "epoch": 0.0968680472360089, + "grad_norm": 18.609046936035156, + "learning_rate": 3.2002281802624074e-06, + "loss": 8.0901, + "step": 566 + }, + { + "epoch": 0.09703919219578984, + "grad_norm": 39.24065017700195, + "learning_rate": 3.20593268682259e-06, + "loss": 4.2716, + "step": 567 + }, + { + "epoch": 0.09721033715557077, + "grad_norm": 158.3350067138672, + "learning_rate": 3.2116371933827725e-06, + "loss": 9.5332, + "step": 568 + }, + { + "epoch": 0.0973814821153517, + "grad_norm": 59.29450607299805, + "learning_rate": 3.217341699942955e-06, + "loss": 10.343, + "step": 569 + }, + { + "epoch": 0.09755262707513264, + "grad_norm": 16.113664627075195, + "learning_rate": 3.2230462065031376e-06, + "loss": 3.4944, + "step": 570 + }, + { + "epoch": 0.09772377203491357, + "grad_norm": 23.105350494384766, + "learning_rate": 3.22875071306332e-06, + "loss": 8.2848, + "step": 571 + }, + { + "epoch": 0.0978949169946945, + "grad_norm": 21.425796508789062, + "learning_rate": 3.2344552196235026e-06, + "loss": 8.7655, + "step": 572 + }, + { + "epoch": 0.09806606195447544, + "grad_norm": 22.587278366088867, + "learning_rate": 3.240159726183685e-06, + "loss": 5.394, + "step": 573 + }, + { + "epoch": 0.09823720691425637, + "grad_norm": 37.69017028808594, + "learning_rate": 3.2458642327438677e-06, + "loss": 9.9902, + "step": 574 + }, + { + "epoch": 0.0984083518740373, + "grad_norm": 25.255393981933594, + "learning_rate": 3.2515687393040502e-06, + "loss": 8.3951, + "step": 575 + }, + { + "epoch": 0.09857949683381824, + "grad_norm": 18.790040969848633, + "learning_rate": 3.2572732458642328e-06, + "loss": 8.2647, + "step": 576 + }, + { + "epoch": 0.09875064179359917, + "grad_norm": 18.215757369995117, + "learning_rate": 3.2629777524244153e-06, + "loss": 7.8027, + "step": 577 + }, + { + "epoch": 0.09892178675338012, + "grad_norm": 17.263294219970703, + "learning_rate": 3.268682258984598e-06, + "loss": 7.7728, + "step": 578 + }, + { + "epoch": 0.09909293171316105, + "grad_norm": 18.384496688842773, + "learning_rate": 3.2743867655447804e-06, + "loss": 7.9191, + "step": 579 + }, + { + "epoch": 0.09926407667294199, + "grad_norm": 32.68930435180664, + "learning_rate": 3.280091272104963e-06, + "loss": 7.8591, + "step": 580 + }, + { + "epoch": 0.09943522163272292, + "grad_norm": 31.514266967773438, + "learning_rate": 3.285795778665146e-06, + "loss": 5.5621, + "step": 581 + }, + { + "epoch": 0.09960636659250385, + "grad_norm": 18.912736892700195, + "learning_rate": 3.291500285225328e-06, + "loss": 7.6952, + "step": 582 + }, + { + "epoch": 0.09977751155228479, + "grad_norm": 37.68309783935547, + "learning_rate": 3.2972047917855105e-06, + "loss": 4.1392, + "step": 583 + }, + { + "epoch": 0.09994865651206572, + "grad_norm": 31.56082534790039, + "learning_rate": 3.3029092983456934e-06, + "loss": 4.3801, + "step": 584 + }, + { + "epoch": 0.10011980147184665, + "grad_norm": 24.57911491394043, + "learning_rate": 3.3086138049058755e-06, + "loss": 8.6316, + "step": 585 + }, + { + "epoch": 0.10029094643162759, + "grad_norm": 23.80208396911621, + "learning_rate": 3.314318311466058e-06, + "loss": 7.9077, + "step": 586 + }, + { + "epoch": 0.10046209139140852, + "grad_norm": 16.849803924560547, + "learning_rate": 3.320022818026241e-06, + "loss": 7.6992, + "step": 587 + }, + { + "epoch": 0.10063323635118945, + "grad_norm": 18.981300354003906, + "learning_rate": 3.3257273245864236e-06, + "loss": 8.6023, + "step": 588 + }, + { + "epoch": 0.10080438131097039, + "grad_norm": 24.398378372192383, + "learning_rate": 3.3314318311466057e-06, + "loss": 7.0319, + "step": 589 + }, + { + "epoch": 0.10097552627075132, + "grad_norm": 14.639533996582031, + "learning_rate": 3.3371363377067886e-06, + "loss": 4.7698, + "step": 590 + }, + { + "epoch": 0.10114667123053227, + "grad_norm": 25.046255111694336, + "learning_rate": 3.342840844266971e-06, + "loss": 7.1782, + "step": 591 + }, + { + "epoch": 0.1013178161903132, + "grad_norm": 20.012542724609375, + "learning_rate": 3.3485453508271533e-06, + "loss": 7.3167, + "step": 592 + }, + { + "epoch": 0.10148896115009413, + "grad_norm": 27.766891479492188, + "learning_rate": 3.3542498573873362e-06, + "loss": 4.6521, + "step": 593 + }, + { + "epoch": 0.10166010610987507, + "grad_norm": 30.79694175720215, + "learning_rate": 3.3599543639475188e-06, + "loss": 5.1196, + "step": 594 + }, + { + "epoch": 0.101831251069656, + "grad_norm": 24.9854736328125, + "learning_rate": 3.365658870507701e-06, + "loss": 7.8056, + "step": 595 + }, + { + "epoch": 0.10200239602943693, + "grad_norm": 30.63117218017578, + "learning_rate": 3.371363377067884e-06, + "loss": 4.7903, + "step": 596 + }, + { + "epoch": 0.10217354098921787, + "grad_norm": 34.852256774902344, + "learning_rate": 3.3770678836280663e-06, + "loss": 4.1994, + "step": 597 + }, + { + "epoch": 0.1023446859489988, + "grad_norm": 26.979557037353516, + "learning_rate": 3.3827723901882485e-06, + "loss": 8.5955, + "step": 598 + }, + { + "epoch": 0.10251583090877973, + "grad_norm": 21.797626495361328, + "learning_rate": 3.3884768967484314e-06, + "loss": 7.5743, + "step": 599 + }, + { + "epoch": 0.10268697586856067, + "grad_norm": 37.774139404296875, + "learning_rate": 3.394181403308614e-06, + "loss": 6.7935, + "step": 600 + }, + { + "epoch": 0.1028581208283416, + "grad_norm": 27.917823791503906, + "learning_rate": 3.3998859098687965e-06, + "loss": 7.9018, + "step": 601 + }, + { + "epoch": 0.10302926578812253, + "grad_norm": 28.479934692382812, + "learning_rate": 3.405590416428979e-06, + "loss": 4.566, + "step": 602 + }, + { + "epoch": 0.10320041074790347, + "grad_norm": 33.35675811767578, + "learning_rate": 3.4112949229891615e-06, + "loss": 4.1986, + "step": 603 + }, + { + "epoch": 0.10337155570768441, + "grad_norm": 17.17736053466797, + "learning_rate": 3.416999429549344e-06, + "loss": 4.4831, + "step": 604 + }, + { + "epoch": 0.10354270066746535, + "grad_norm": 33.52507781982422, + "learning_rate": 3.4227039361095266e-06, + "loss": 5.2033, + "step": 605 + }, + { + "epoch": 0.10371384562724628, + "grad_norm": 38.001678466796875, + "learning_rate": 3.428408442669709e-06, + "loss": 4.2796, + "step": 606 + }, + { + "epoch": 0.10388499058702722, + "grad_norm": 27.487375259399414, + "learning_rate": 3.4341129492298917e-06, + "loss": 4.3677, + "step": 607 + }, + { + "epoch": 0.10405613554680815, + "grad_norm": 43.33926010131836, + "learning_rate": 3.439817455790074e-06, + "loss": 8.1044, + "step": 608 + }, + { + "epoch": 0.10422728050658908, + "grad_norm": 19.231143951416016, + "learning_rate": 3.4455219623502567e-06, + "loss": 3.8195, + "step": 609 + }, + { + "epoch": 0.10439842546637002, + "grad_norm": 51.54021453857422, + "learning_rate": 3.4512264689104393e-06, + "loss": 8.4844, + "step": 610 + }, + { + "epoch": 0.10456957042615095, + "grad_norm": 18.752532958984375, + "learning_rate": 3.456930975470622e-06, + "loss": 4.7959, + "step": 611 + }, + { + "epoch": 0.10474071538593188, + "grad_norm": 31.644916534423828, + "learning_rate": 3.4626354820308043e-06, + "loss": 3.5838, + "step": 612 + }, + { + "epoch": 0.10491186034571282, + "grad_norm": 29.887203216552734, + "learning_rate": 3.4683399885909873e-06, + "loss": 8.1693, + "step": 613 + }, + { + "epoch": 0.10508300530549375, + "grad_norm": 26.583890914916992, + "learning_rate": 3.4740444951511694e-06, + "loss": 6.6237, + "step": 614 + }, + { + "epoch": 0.10525415026527468, + "grad_norm": 30.845338821411133, + "learning_rate": 3.479749001711352e-06, + "loss": 3.4824, + "step": 615 + }, + { + "epoch": 0.10542529522505562, + "grad_norm": 38.40910339355469, + "learning_rate": 3.485453508271535e-06, + "loss": 3.9889, + "step": 616 + }, + { + "epoch": 0.10559644018483656, + "grad_norm": 39.16193389892578, + "learning_rate": 3.491158014831717e-06, + "loss": 8.4244, + "step": 617 + }, + { + "epoch": 0.1057675851446175, + "grad_norm": 17.97920036315918, + "learning_rate": 3.4968625213918995e-06, + "loss": 3.2418, + "step": 618 + }, + { + "epoch": 0.10593873010439843, + "grad_norm": 176.26966857910156, + "learning_rate": 3.5025670279520825e-06, + "loss": 7.2639, + "step": 619 + }, + { + "epoch": 0.10610987506417936, + "grad_norm": 31.25491714477539, + "learning_rate": 3.5082715345122646e-06, + "loss": 7.428, + "step": 620 + }, + { + "epoch": 0.1062810200239603, + "grad_norm": 291.8013916015625, + "learning_rate": 3.5139760410724475e-06, + "loss": 12.9756, + "step": 621 + }, + { + "epoch": 0.10645216498374123, + "grad_norm": 34.713497161865234, + "learning_rate": 3.51968054763263e-06, + "loss": 4.2338, + "step": 622 + }, + { + "epoch": 0.10662330994352216, + "grad_norm": 30.151296615600586, + "learning_rate": 3.525385054192812e-06, + "loss": 8.7988, + "step": 623 + }, + { + "epoch": 0.1067944549033031, + "grad_norm": 36.128414154052734, + "learning_rate": 3.531089560752995e-06, + "loss": 8.0574, + "step": 624 + }, + { + "epoch": 0.10696559986308403, + "grad_norm": 15.85501480102539, + "learning_rate": 3.5367940673131777e-06, + "loss": 3.4627, + "step": 625 + }, + { + "epoch": 0.10713674482286496, + "grad_norm": 296.1280212402344, + "learning_rate": 3.5424985738733598e-06, + "loss": 11.8753, + "step": 626 + }, + { + "epoch": 0.1073078897826459, + "grad_norm": 30.480113983154297, + "learning_rate": 3.5482030804335427e-06, + "loss": 6.966, + "step": 627 + }, + { + "epoch": 0.10747903474242683, + "grad_norm": 34.42314529418945, + "learning_rate": 3.5539075869937253e-06, + "loss": 6.8004, + "step": 628 + }, + { + "epoch": 0.10765017970220778, + "grad_norm": 239.69007873535156, + "learning_rate": 3.5596120935539074e-06, + "loss": 10.727, + "step": 629 + }, + { + "epoch": 0.10782132466198871, + "grad_norm": 42.74559783935547, + "learning_rate": 3.5653166001140903e-06, + "loss": 7.948, + "step": 630 + }, + { + "epoch": 0.10799246962176964, + "grad_norm": 24.176240921020508, + "learning_rate": 3.571021106674273e-06, + "loss": 5.4348, + "step": 631 + }, + { + "epoch": 0.10816361458155058, + "grad_norm": 32.64130783081055, + "learning_rate": 3.576725613234455e-06, + "loss": 8.4, + "step": 632 + }, + { + "epoch": 0.10833475954133151, + "grad_norm": 32.354248046875, + "learning_rate": 3.582430119794638e-06, + "loss": 6.4397, + "step": 633 + }, + { + "epoch": 0.10850590450111244, + "grad_norm": 25.767475128173828, + "learning_rate": 3.5881346263548204e-06, + "loss": 5.7136, + "step": 634 + }, + { + "epoch": 0.10867704946089338, + "grad_norm": 28.90591812133789, + "learning_rate": 3.593839132915003e-06, + "loss": 5.9131, + "step": 635 + }, + { + "epoch": 0.10884819442067431, + "grad_norm": 32.62278747558594, + "learning_rate": 3.5995436394751855e-06, + "loss": 7.6547, + "step": 636 + }, + { + "epoch": 0.10901933938045524, + "grad_norm": 30.387760162353516, + "learning_rate": 3.605248146035368e-06, + "loss": 4.7063, + "step": 637 + }, + { + "epoch": 0.10919048434023618, + "grad_norm": 33.034420013427734, + "learning_rate": 3.6109526525955506e-06, + "loss": 6.8851, + "step": 638 + }, + { + "epoch": 0.10936162930001711, + "grad_norm": 31.42691421508789, + "learning_rate": 3.616657159155733e-06, + "loss": 4.1348, + "step": 639 + }, + { + "epoch": 0.10953277425979804, + "grad_norm": 32.439395904541016, + "learning_rate": 3.6223616657159156e-06, + "loss": 6.3162, + "step": 640 + }, + { + "epoch": 0.10970391921957898, + "grad_norm": 26.49324607849121, + "learning_rate": 3.6280661722760986e-06, + "loss": 5.1818, + "step": 641 + }, + { + "epoch": 0.10987506417935992, + "grad_norm": 25.558427810668945, + "learning_rate": 3.6337706788362807e-06, + "loss": 6.7631, + "step": 642 + }, + { + "epoch": 0.11004620913914086, + "grad_norm": 24.655729293823242, + "learning_rate": 3.6394751853964632e-06, + "loss": 7.4925, + "step": 643 + }, + { + "epoch": 0.11021735409892179, + "grad_norm": 28.129770278930664, + "learning_rate": 3.645179691956646e-06, + "loss": 7.4969, + "step": 644 + }, + { + "epoch": 0.11038849905870272, + "grad_norm": 14.367050170898438, + "learning_rate": 3.6508841985168283e-06, + "loss": 2.9942, + "step": 645 + }, + { + "epoch": 0.11055964401848366, + "grad_norm": 17.681976318359375, + "learning_rate": 3.656588705077011e-06, + "loss": 3.4156, + "step": 646 + }, + { + "epoch": 0.11073078897826459, + "grad_norm": 16.25703239440918, + "learning_rate": 3.6622932116371938e-06, + "loss": 4.095, + "step": 647 + }, + { + "epoch": 0.11090193393804552, + "grad_norm": 26.604623794555664, + "learning_rate": 3.667997718197376e-06, + "loss": 7.7521, + "step": 648 + }, + { + "epoch": 0.11107307889782646, + "grad_norm": 24.250492095947266, + "learning_rate": 3.6737022247575584e-06, + "loss": 5.5501, + "step": 649 + }, + { + "epoch": 0.11124422385760739, + "grad_norm": 31.94316864013672, + "learning_rate": 3.6794067313177414e-06, + "loss": 6.24, + "step": 650 + }, + { + "epoch": 0.11141536881738832, + "grad_norm": 18.14836883544922, + "learning_rate": 3.6851112378779235e-06, + "loss": 3.024, + "step": 651 + }, + { + "epoch": 0.11158651377716926, + "grad_norm": 25.239274978637695, + "learning_rate": 3.690815744438106e-06, + "loss": 7.4699, + "step": 652 + }, + { + "epoch": 0.11175765873695019, + "grad_norm": 66.97354125976562, + "learning_rate": 3.696520250998289e-06, + "loss": 11.3565, + "step": 653 + }, + { + "epoch": 0.11192880369673112, + "grad_norm": 30.029356002807617, + "learning_rate": 3.702224757558471e-06, + "loss": 7.0383, + "step": 654 + }, + { + "epoch": 0.11209994865651207, + "grad_norm": 22.021820068359375, + "learning_rate": 3.7079292641186536e-06, + "loss": 5.1655, + "step": 655 + }, + { + "epoch": 0.112271093616293, + "grad_norm": 40.79402160644531, + "learning_rate": 3.7136337706788366e-06, + "loss": 7.3738, + "step": 656 + }, + { + "epoch": 0.11244223857607394, + "grad_norm": 49.726810455322266, + "learning_rate": 3.7193382772390187e-06, + "loss": 10.1751, + "step": 657 + }, + { + "epoch": 0.11261338353585487, + "grad_norm": 34.322078704833984, + "learning_rate": 3.725042783799201e-06, + "loss": 8.127, + "step": 658 + }, + { + "epoch": 0.1127845284956358, + "grad_norm": 31.094890594482422, + "learning_rate": 3.730747290359384e-06, + "loss": 7.2578, + "step": 659 + }, + { + "epoch": 0.11295567345541674, + "grad_norm": 17.61489486694336, + "learning_rate": 3.7364517969195667e-06, + "loss": 2.9985, + "step": 660 + }, + { + "epoch": 0.11312681841519767, + "grad_norm": 31.467206954956055, + "learning_rate": 3.7421563034797492e-06, + "loss": 3.7049, + "step": 661 + }, + { + "epoch": 0.1132979633749786, + "grad_norm": 24.94162368774414, + "learning_rate": 3.7478608100399318e-06, + "loss": 5.2144, + "step": 662 + }, + { + "epoch": 0.11346910833475954, + "grad_norm": 61.16570281982422, + "learning_rate": 3.7535653166001143e-06, + "loss": 10.6124, + "step": 663 + }, + { + "epoch": 0.11364025329454047, + "grad_norm": 30.18357276916504, + "learning_rate": 3.7592698231602964e-06, + "loss": 7.2241, + "step": 664 + }, + { + "epoch": 0.1138113982543214, + "grad_norm": 40.68777847290039, + "learning_rate": 3.764974329720479e-06, + "loss": 7.604, + "step": 665 + }, + { + "epoch": 0.11398254321410234, + "grad_norm": 24.30128288269043, + "learning_rate": 3.7706788362806623e-06, + "loss": 4.6842, + "step": 666 + }, + { + "epoch": 0.11415368817388329, + "grad_norm": 33.77325439453125, + "learning_rate": 3.7763833428408444e-06, + "loss": 7.3903, + "step": 667 + }, + { + "epoch": 0.11432483313366422, + "grad_norm": 30.10031509399414, + "learning_rate": 3.782087849401027e-06, + "loss": 7.0533, + "step": 668 + }, + { + "epoch": 0.11449597809344515, + "grad_norm": 34.8586540222168, + "learning_rate": 3.7877923559612095e-06, + "loss": 7.5631, + "step": 669 + }, + { + "epoch": 0.11466712305322609, + "grad_norm": 33.20988082885742, + "learning_rate": 3.7934968625213916e-06, + "loss": 3.7029, + "step": 670 + }, + { + "epoch": 0.11483826801300702, + "grad_norm": 31.075176239013672, + "learning_rate": 3.799201369081575e-06, + "loss": 8.2182, + "step": 671 + }, + { + "epoch": 0.11500941297278795, + "grad_norm": 30.962139129638672, + "learning_rate": 3.8049058756417575e-06, + "loss": 6.5288, + "step": 672 + }, + { + "epoch": 0.11518055793256889, + "grad_norm": 37.01807403564453, + "learning_rate": 3.8106103822019396e-06, + "loss": 7.8449, + "step": 673 + }, + { + "epoch": 0.11535170289234982, + "grad_norm": 35.002742767333984, + "learning_rate": 3.816314888762122e-06, + "loss": 6.4509, + "step": 674 + }, + { + "epoch": 0.11552284785213075, + "grad_norm": 51.06761169433594, + "learning_rate": 3.822019395322305e-06, + "loss": 10.6236, + "step": 675 + }, + { + "epoch": 0.11569399281191169, + "grad_norm": 37.48448181152344, + "learning_rate": 3.827723901882487e-06, + "loss": 6.7785, + "step": 676 + }, + { + "epoch": 0.11586513777169262, + "grad_norm": 35.638832092285156, + "learning_rate": 3.8334284084426706e-06, + "loss": 7.6172, + "step": 677 + }, + { + "epoch": 0.11603628273147355, + "grad_norm": 35.00564956665039, + "learning_rate": 3.839132915002852e-06, + "loss": 7.1866, + "step": 678 + }, + { + "epoch": 0.11620742769125449, + "grad_norm": 31.42662811279297, + "learning_rate": 3.844837421563035e-06, + "loss": 3.2075, + "step": 679 + }, + { + "epoch": 0.11637857265103543, + "grad_norm": 16.111412048339844, + "learning_rate": 3.850541928123217e-06, + "loss": 4.0132, + "step": 680 + }, + { + "epoch": 0.11654971761081637, + "grad_norm": 29.7305850982666, + "learning_rate": 3.8562464346834e-06, + "loss": 7.1253, + "step": 681 + }, + { + "epoch": 0.1167208625705973, + "grad_norm": 28.033987045288086, + "learning_rate": 3.861950941243582e-06, + "loss": 3.1947, + "step": 682 + }, + { + "epoch": 0.11689200753037823, + "grad_norm": 31.460405349731445, + "learning_rate": 3.867655447803766e-06, + "loss": 7.1427, + "step": 683 + }, + { + "epoch": 0.11706315249015917, + "grad_norm": 269.6858825683594, + "learning_rate": 3.8733599543639474e-06, + "loss": 11.9947, + "step": 684 + }, + { + "epoch": 0.1172342974499401, + "grad_norm": 35.384727478027344, + "learning_rate": 3.87906446092413e-06, + "loss": 6.8334, + "step": 685 + }, + { + "epoch": 0.11740544240972103, + "grad_norm": 25.98334312438965, + "learning_rate": 3.8847689674843125e-06, + "loss": 7.2384, + "step": 686 + }, + { + "epoch": 0.11757658736950197, + "grad_norm": 33.84842300415039, + "learning_rate": 3.890473474044495e-06, + "loss": 5.9906, + "step": 687 + }, + { + "epoch": 0.1177477323292829, + "grad_norm": 41.04487609863281, + "learning_rate": 3.8961779806046776e-06, + "loss": 6.2759, + "step": 688 + }, + { + "epoch": 0.11791887728906383, + "grad_norm": 16.468915939331055, + "learning_rate": 3.901882487164861e-06, + "loss": 3.8545, + "step": 689 + }, + { + "epoch": 0.11809002224884477, + "grad_norm": 27.10782241821289, + "learning_rate": 3.907586993725043e-06, + "loss": 7.596, + "step": 690 + }, + { + "epoch": 0.1182611672086257, + "grad_norm": 25.684919357299805, + "learning_rate": 3.913291500285225e-06, + "loss": 4.2235, + "step": 691 + }, + { + "epoch": 0.11843231216840663, + "grad_norm": 27.2288761138916, + "learning_rate": 3.918996006845408e-06, + "loss": 6.9975, + "step": 692 + }, + { + "epoch": 0.11860345712818758, + "grad_norm": 33.26142120361328, + "learning_rate": 3.92470051340559e-06, + "loss": 6.5592, + "step": 693 + }, + { + "epoch": 0.11877460208796851, + "grad_norm": 38.89694595336914, + "learning_rate": 3.930405019965774e-06, + "loss": 7.2757, + "step": 694 + }, + { + "epoch": 0.11894574704774945, + "grad_norm": 27.99795150756836, + "learning_rate": 3.936109526525956e-06, + "loss": 6.7422, + "step": 695 + }, + { + "epoch": 0.11911689200753038, + "grad_norm": 24.109289169311523, + "learning_rate": 3.941814033086138e-06, + "loss": 4.9175, + "step": 696 + }, + { + "epoch": 0.11928803696731131, + "grad_norm": 15.462040901184082, + "learning_rate": 3.94751853964632e-06, + "loss": 2.7684, + "step": 697 + }, + { + "epoch": 0.11945918192709225, + "grad_norm": 39.17838668823242, + "learning_rate": 3.953223046206503e-06, + "loss": 7.1518, + "step": 698 + }, + { + "epoch": 0.11963032688687318, + "grad_norm": 30.83951759338379, + "learning_rate": 3.958927552766685e-06, + "loss": 3.8832, + "step": 699 + }, + { + "epoch": 0.11980147184665411, + "grad_norm": 26.964744567871094, + "learning_rate": 3.964632059326869e-06, + "loss": 7.224, + "step": 700 + }, + { + "epoch": 0.11997261680643505, + "grad_norm": 36.607975006103516, + "learning_rate": 3.970336565887051e-06, + "loss": 7.3389, + "step": 701 + }, + { + "epoch": 0.12014376176621598, + "grad_norm": 37.18532180786133, + "learning_rate": 3.976041072447234e-06, + "loss": 6.1083, + "step": 702 + }, + { + "epoch": 0.12031490672599691, + "grad_norm": 29.550649642944336, + "learning_rate": 3.9817455790074155e-06, + "loss": 5.1898, + "step": 703 + }, + { + "epoch": 0.12048605168577785, + "grad_norm": 24.146198272705078, + "learning_rate": 3.987450085567598e-06, + "loss": 4.6196, + "step": 704 + }, + { + "epoch": 0.1206571966455588, + "grad_norm": 25.126737594604492, + "learning_rate": 3.993154592127781e-06, + "loss": 2.7422, + "step": 705 + }, + { + "epoch": 0.12082834160533973, + "grad_norm": 18.79334259033203, + "learning_rate": 3.998859098687964e-06, + "loss": 2.6716, + "step": 706 + }, + { + "epoch": 0.12099948656512066, + "grad_norm": 33.249168395996094, + "learning_rate": 4.0045636052481465e-06, + "loss": 5.3053, + "step": 707 + }, + { + "epoch": 0.1211706315249016, + "grad_norm": 26.934682846069336, + "learning_rate": 4.010268111808329e-06, + "loss": 4.514, + "step": 708 + }, + { + "epoch": 0.12134177648468253, + "grad_norm": 44.88846206665039, + "learning_rate": 4.015972618368511e-06, + "loss": 6.4733, + "step": 709 + }, + { + "epoch": 0.12151292144446346, + "grad_norm": 41.93711471557617, + "learning_rate": 4.021677124928693e-06, + "loss": 7.4558, + "step": 710 + }, + { + "epoch": 0.1216840664042444, + "grad_norm": 41.59209060668945, + "learning_rate": 4.027381631488877e-06, + "loss": 7.0372, + "step": 711 + }, + { + "epoch": 0.12185521136402533, + "grad_norm": 41.47358703613281, + "learning_rate": 4.033086138049059e-06, + "loss": 7.2868, + "step": 712 + }, + { + "epoch": 0.12202635632380626, + "grad_norm": 41.380741119384766, + "learning_rate": 4.038790644609242e-06, + "loss": 7.6225, + "step": 713 + }, + { + "epoch": 0.1221975012835872, + "grad_norm": 40.343788146972656, + "learning_rate": 4.044495151169424e-06, + "loss": 7.2916, + "step": 714 + }, + { + "epoch": 0.12236864624336813, + "grad_norm": 30.69339370727539, + "learning_rate": 4.050199657729606e-06, + "loss": 5.8809, + "step": 715 + }, + { + "epoch": 0.12253979120314906, + "grad_norm": 25.84669303894043, + "learning_rate": 4.0559041642897885e-06, + "loss": 2.8596, + "step": 716 + }, + { + "epoch": 0.12271093616293, + "grad_norm": 37.5709114074707, + "learning_rate": 4.061608670849972e-06, + "loss": 6.5536, + "step": 717 + }, + { + "epoch": 0.12288208112271094, + "grad_norm": 44.87430191040039, + "learning_rate": 4.067313177410154e-06, + "loss": 7.8952, + "step": 718 + }, + { + "epoch": 0.12305322608249188, + "grad_norm": 29.630413055419922, + "learning_rate": 4.073017683970337e-06, + "loss": 7.2852, + "step": 719 + }, + { + "epoch": 0.12322437104227281, + "grad_norm": 38.17768096923828, + "learning_rate": 4.0787221905305194e-06, + "loss": 7.2025, + "step": 720 + }, + { + "epoch": 0.12339551600205374, + "grad_norm": 31.9378719329834, + "learning_rate": 4.084426697090702e-06, + "loss": 3.3231, + "step": 721 + }, + { + "epoch": 0.12356666096183468, + "grad_norm": 15.323390007019043, + "learning_rate": 4.090131203650884e-06, + "loss": 3.5441, + "step": 722 + }, + { + "epoch": 0.12373780592161561, + "grad_norm": 32.09744644165039, + "learning_rate": 4.095835710211067e-06, + "loss": 5.9737, + "step": 723 + }, + { + "epoch": 0.12390895088139654, + "grad_norm": 32.49777603149414, + "learning_rate": 4.1015402167712496e-06, + "loss": 5.6116, + "step": 724 + }, + { + "epoch": 0.12408009584117748, + "grad_norm": 32.568031311035156, + "learning_rate": 4.107244723331432e-06, + "loss": 6.8512, + "step": 725 + }, + { + "epoch": 0.12425124080095841, + "grad_norm": 27.68449592590332, + "learning_rate": 4.112949229891615e-06, + "loss": 3.8807, + "step": 726 + }, + { + "epoch": 0.12442238576073934, + "grad_norm": 28.595746994018555, + "learning_rate": 4.118653736451797e-06, + "loss": 2.7513, + "step": 727 + }, + { + "epoch": 0.12459353072052028, + "grad_norm": 40.44917678833008, + "learning_rate": 4.124358243011979e-06, + "loss": 6.9441, + "step": 728 + }, + { + "epoch": 0.12476467568030121, + "grad_norm": 34.75537872314453, + "learning_rate": 4.130062749572162e-06, + "loss": 6.2836, + "step": 729 + }, + { + "epoch": 0.12493582064008214, + "grad_norm": 32.49576950073242, + "learning_rate": 4.135767256132345e-06, + "loss": 3.6985, + "step": 730 + }, + { + "epoch": 0.1251069655998631, + "grad_norm": 33.09941482543945, + "learning_rate": 4.141471762692527e-06, + "loss": 6.414, + "step": 731 + }, + { + "epoch": 0.12527811055964402, + "grad_norm": 33.988101959228516, + "learning_rate": 4.14717626925271e-06, + "loss": 6.8846, + "step": 732 + }, + { + "epoch": 0.12544925551942496, + "grad_norm": 34.69337844848633, + "learning_rate": 4.152880775812892e-06, + "loss": 5.9908, + "step": 733 + }, + { + "epoch": 0.1256204004792059, + "grad_norm": 42.33815383911133, + "learning_rate": 4.158585282373075e-06, + "loss": 7.2186, + "step": 734 + }, + { + "epoch": 0.12579154543898682, + "grad_norm": 21.35869598388672, + "learning_rate": 4.164289788933257e-06, + "loss": 3.2239, + "step": 735 + }, + { + "epoch": 0.12596269039876776, + "grad_norm": 34.62517166137695, + "learning_rate": 4.16999429549344e-06, + "loss": 6.2926, + "step": 736 + }, + { + "epoch": 0.1261338353585487, + "grad_norm": 32.758544921875, + "learning_rate": 4.1756988020536225e-06, + "loss": 6.2203, + "step": 737 + }, + { + "epoch": 0.12630498031832962, + "grad_norm": 17.39285659790039, + "learning_rate": 4.181403308613805e-06, + "loss": 3.2563, + "step": 738 + }, + { + "epoch": 0.12647612527811056, + "grad_norm": 32.22175598144531, + "learning_rate": 4.1871078151739875e-06, + "loss": 5.3068, + "step": 739 + }, + { + "epoch": 0.1266472702378915, + "grad_norm": 38.13700485229492, + "learning_rate": 4.19281232173417e-06, + "loss": 7.6128, + "step": 740 + }, + { + "epoch": 0.12681841519767242, + "grad_norm": 35.74038314819336, + "learning_rate": 4.198516828294353e-06, + "loss": 6.6528, + "step": 741 + }, + { + "epoch": 0.12698956015745336, + "grad_norm": 12.027849197387695, + "learning_rate": 4.204221334854535e-06, + "loss": 2.255, + "step": 742 + }, + { + "epoch": 0.1271607051172343, + "grad_norm": 36.75061798095703, + "learning_rate": 4.209925841414718e-06, + "loss": 6.2444, + "step": 743 + }, + { + "epoch": 0.12733185007701522, + "grad_norm": 43.853187561035156, + "learning_rate": 4.2156303479749e-06, + "loss": 6.3509, + "step": 744 + }, + { + "epoch": 0.12750299503679616, + "grad_norm": 31.670143127441406, + "learning_rate": 4.221334854535083e-06, + "loss": 7.3242, + "step": 745 + }, + { + "epoch": 0.1276741399965771, + "grad_norm": 24.049455642700195, + "learning_rate": 4.227039361095265e-06, + "loss": 4.5557, + "step": 746 + }, + { + "epoch": 0.12784528495635802, + "grad_norm": 22.603431701660156, + "learning_rate": 4.232743867655448e-06, + "loss": 4.3686, + "step": 747 + }, + { + "epoch": 0.12801642991613896, + "grad_norm": 33.28196716308594, + "learning_rate": 4.23844837421563e-06, + "loss": 7.897, + "step": 748 + }, + { + "epoch": 0.1281875748759199, + "grad_norm": 14.154582023620605, + "learning_rate": 4.244152880775813e-06, + "loss": 2.5184, + "step": 749 + }, + { + "epoch": 0.12835871983570085, + "grad_norm": 34.31758117675781, + "learning_rate": 4.249857387335995e-06, + "loss": 6.4663, + "step": 750 + }, + { + "epoch": 0.12852986479548179, + "grad_norm": 29.4487361907959, + "learning_rate": 4.255561893896179e-06, + "loss": 6.4908, + "step": 751 + }, + { + "epoch": 0.12870100975526272, + "grad_norm": 26.144145965576172, + "learning_rate": 4.261266400456361e-06, + "loss": 2.7209, + "step": 752 + }, + { + "epoch": 0.12887215471504365, + "grad_norm": 32.20002746582031, + "learning_rate": 4.266970907016543e-06, + "loss": 5.3474, + "step": 753 + }, + { + "epoch": 0.12904329967482459, + "grad_norm": 22.889114379882812, + "learning_rate": 4.2726754135767255e-06, + "loss": 4.439, + "step": 754 + }, + { + "epoch": 0.12921444463460552, + "grad_norm": 29.033794403076172, + "learning_rate": 4.278379920136908e-06, + "loss": 3.1768, + "step": 755 + }, + { + "epoch": 0.12938558959438645, + "grad_norm": 36.977718353271484, + "learning_rate": 4.2840844266970906e-06, + "loss": 6.725, + "step": 756 + }, + { + "epoch": 0.12955673455416739, + "grad_norm": 24.76682472229004, + "learning_rate": 4.289788933257274e-06, + "loss": 2.3437, + "step": 757 + }, + { + "epoch": 0.12972787951394832, + "grad_norm": 16.016826629638672, + "learning_rate": 4.2954934398174565e-06, + "loss": 2.8201, + "step": 758 + }, + { + "epoch": 0.12989902447372925, + "grad_norm": 14.587915420532227, + "learning_rate": 4.301197946377638e-06, + "loss": 3.5146, + "step": 759 + }, + { + "epoch": 0.13007016943351019, + "grad_norm": 26.081321716308594, + "learning_rate": 4.306902452937821e-06, + "loss": 2.6582, + "step": 760 + }, + { + "epoch": 0.13024131439329112, + "grad_norm": 16.497404098510742, + "learning_rate": 4.312606959498003e-06, + "loss": 2.6039, + "step": 761 + }, + { + "epoch": 0.13041245935307205, + "grad_norm": 30.642013549804688, + "learning_rate": 4.318311466058186e-06, + "loss": 5.6791, + "step": 762 + }, + { + "epoch": 0.13058360431285299, + "grad_norm": 78.80982971191406, + "learning_rate": 4.324015972618369e-06, + "loss": 7.0474, + "step": 763 + }, + { + "epoch": 0.13075474927263392, + "grad_norm": 31.678878784179688, + "learning_rate": 4.329720479178552e-06, + "loss": 7.2185, + "step": 764 + }, + { + "epoch": 0.13092589423241485, + "grad_norm": 67.14193725585938, + "learning_rate": 4.335424985738733e-06, + "loss": 11.1752, + "step": 765 + }, + { + "epoch": 0.13109703919219579, + "grad_norm": 30.7507381439209, + "learning_rate": 4.341129492298916e-06, + "loss": 5.3445, + "step": 766 + }, + { + "epoch": 0.13126818415197672, + "grad_norm": 295.94195556640625, + "learning_rate": 4.346833998859098e-06, + "loss": 14.8463, + "step": 767 + }, + { + "epoch": 0.13143932911175765, + "grad_norm": 31.96709442138672, + "learning_rate": 4.352538505419281e-06, + "loss": 5.9573, + "step": 768 + }, + { + "epoch": 0.13161047407153859, + "grad_norm": 21.086137771606445, + "learning_rate": 4.358243011979464e-06, + "loss": 2.51, + "step": 769 + }, + { + "epoch": 0.13178161903131952, + "grad_norm": 23.69211196899414, + "learning_rate": 4.363947518539647e-06, + "loss": 3.9384, + "step": 770 + }, + { + "epoch": 0.13195276399110045, + "grad_norm": 29.09503173828125, + "learning_rate": 4.369652025099829e-06, + "loss": 2.6477, + "step": 771 + }, + { + "epoch": 0.13212390895088139, + "grad_norm": 34.086483001708984, + "learning_rate": 4.375356531660011e-06, + "loss": 6.8362, + "step": 772 + }, + { + "epoch": 0.13229505391066232, + "grad_norm": 22.358131408691406, + "learning_rate": 4.381061038220194e-06, + "loss": 2.5553, + "step": 773 + }, + { + "epoch": 0.13246619887044325, + "grad_norm": 32.83020782470703, + "learning_rate": 4.386765544780377e-06, + "loss": 5.6526, + "step": 774 + }, + { + "epoch": 0.1326373438302242, + "grad_norm": 32.111629486083984, + "learning_rate": 4.3924700513405595e-06, + "loss": 7.0077, + "step": 775 + }, + { + "epoch": 0.13280848879000515, + "grad_norm": 28.587032318115234, + "learning_rate": 4.398174557900742e-06, + "loss": 3.9449, + "step": 776 + }, + { + "epoch": 0.13297963374978608, + "grad_norm": 28.547178268432617, + "learning_rate": 4.403879064460925e-06, + "loss": 5.1286, + "step": 777 + }, + { + "epoch": 0.133150778709567, + "grad_norm": 31.409543991088867, + "learning_rate": 4.409583571021106e-06, + "loss": 5.3343, + "step": 778 + }, + { + "epoch": 0.13332192366934795, + "grad_norm": 33.33236312866211, + "learning_rate": 4.415288077581289e-06, + "loss": 6.5061, + "step": 779 + }, + { + "epoch": 0.13349306862912888, + "grad_norm": 226.51580810546875, + "learning_rate": 4.420992584141472e-06, + "loss": 13.5725, + "step": 780 + }, + { + "epoch": 0.1336642135889098, + "grad_norm": 29.707599639892578, + "learning_rate": 4.426697090701655e-06, + "loss": 3.5054, + "step": 781 + }, + { + "epoch": 0.13383535854869075, + "grad_norm": 29.84592628479004, + "learning_rate": 4.432401597261837e-06, + "loss": 5.5461, + "step": 782 + }, + { + "epoch": 0.13400650350847168, + "grad_norm": 23.87710189819336, + "learning_rate": 4.43810610382202e-06, + "loss": 2.7581, + "step": 783 + }, + { + "epoch": 0.1341776484682526, + "grad_norm": 27.90047264099121, + "learning_rate": 4.4438106103822015e-06, + "loss": 5.0602, + "step": 784 + }, + { + "epoch": 0.13434879342803355, + "grad_norm": 229.59202575683594, + "learning_rate": 4.449515116942384e-06, + "loss": 8.4299, + "step": 785 + }, + { + "epoch": 0.13451993838781448, + "grad_norm": 35.904483795166016, + "learning_rate": 4.455219623502567e-06, + "loss": 6.6261, + "step": 786 + }, + { + "epoch": 0.13469108334759541, + "grad_norm": 13.451172828674316, + "learning_rate": 4.46092413006275e-06, + "loss": 3.6844, + "step": 787 + }, + { + "epoch": 0.13486222830737635, + "grad_norm": 220.5408172607422, + "learning_rate": 4.4666286366229324e-06, + "loss": 12.0786, + "step": 788 + }, + { + "epoch": 0.13503337326715728, + "grad_norm": 30.378768920898438, + "learning_rate": 4.472333143183115e-06, + "loss": 6.4301, + "step": 789 + }, + { + "epoch": 0.13520451822693821, + "grad_norm": 24.894784927368164, + "learning_rate": 4.478037649743297e-06, + "loss": 4.0456, + "step": 790 + }, + { + "epoch": 0.13537566318671915, + "grad_norm": 63.11225509643555, + "learning_rate": 4.48374215630348e-06, + "loss": 10.8823, + "step": 791 + }, + { + "epoch": 0.13554680814650008, + "grad_norm": 30.484046936035156, + "learning_rate": 4.4894466628636626e-06, + "loss": 4.5215, + "step": 792 + }, + { + "epoch": 0.13571795310628101, + "grad_norm": 33.15967559814453, + "learning_rate": 4.495151169423845e-06, + "loss": 6.1466, + "step": 793 + }, + { + "epoch": 0.13588909806606195, + "grad_norm": 31.415679931640625, + "learning_rate": 4.500855675984028e-06, + "loss": 5.0997, + "step": 794 + }, + { + "epoch": 0.13606024302584288, + "grad_norm": 29.878276824951172, + "learning_rate": 4.50656018254421e-06, + "loss": 6.9375, + "step": 795 + }, + { + "epoch": 0.13623138798562381, + "grad_norm": 33.10092544555664, + "learning_rate": 4.512264689104393e-06, + "loss": 6.2231, + "step": 796 + }, + { + "epoch": 0.13640253294540475, + "grad_norm": 21.412826538085938, + "learning_rate": 4.517969195664575e-06, + "loss": 1.8474, + "step": 797 + }, + { + "epoch": 0.13657367790518568, + "grad_norm": 31.297100067138672, + "learning_rate": 4.523673702224758e-06, + "loss": 5.4762, + "step": 798 + }, + { + "epoch": 0.13674482286496661, + "grad_norm": 234.58111572265625, + "learning_rate": 4.52937820878494e-06, + "loss": 11.739, + "step": 799 + }, + { + "epoch": 0.13691596782474755, + "grad_norm": 204.88748168945312, + "learning_rate": 4.535082715345123e-06, + "loss": 13.5482, + "step": 800 + }, + { + "epoch": 0.1370871127845285, + "grad_norm": 33.66855239868164, + "learning_rate": 4.540787221905305e-06, + "loss": 5.9551, + "step": 801 + }, + { + "epoch": 0.13725825774430944, + "grad_norm": 30.423555374145508, + "learning_rate": 4.546491728465488e-06, + "loss": 6.3931, + "step": 802 + }, + { + "epoch": 0.13742940270409038, + "grad_norm": 30.737445831298828, + "learning_rate": 4.55219623502567e-06, + "loss": 4.7871, + "step": 803 + }, + { + "epoch": 0.1376005476638713, + "grad_norm": 95.86985778808594, + "learning_rate": 4.557900741585853e-06, + "loss": 6.8129, + "step": 804 + }, + { + "epoch": 0.13777169262365224, + "grad_norm": 36.5138053894043, + "learning_rate": 4.5636052481460355e-06, + "loss": 6.4333, + "step": 805 + }, + { + "epoch": 0.13794283758343318, + "grad_norm": 31.310596466064453, + "learning_rate": 4.569309754706218e-06, + "loss": 6.1982, + "step": 806 + }, + { + "epoch": 0.1381139825432141, + "grad_norm": 32.40011978149414, + "learning_rate": 4.5750142612664005e-06, + "loss": 6.5281, + "step": 807 + }, + { + "epoch": 0.13828512750299504, + "grad_norm": 33.58089828491211, + "learning_rate": 4.580718767826583e-06, + "loss": 5.0059, + "step": 808 + }, + { + "epoch": 0.13845627246277598, + "grad_norm": 46.53955841064453, + "learning_rate": 4.586423274386766e-06, + "loss": 10.3345, + "step": 809 + }, + { + "epoch": 0.1386274174225569, + "grad_norm": 23.006080627441406, + "learning_rate": 4.592127780946948e-06, + "loss": 2.1468, + "step": 810 + }, + { + "epoch": 0.13879856238233784, + "grad_norm": 21.113685607910156, + "learning_rate": 4.597832287507131e-06, + "loss": 2.0972, + "step": 811 + }, + { + "epoch": 0.13896970734211878, + "grad_norm": 29.228193283081055, + "learning_rate": 4.603536794067313e-06, + "loss": 2.9408, + "step": 812 + }, + { + "epoch": 0.1391408523018997, + "grad_norm": 39.542686462402344, + "learning_rate": 4.609241300627496e-06, + "loss": 6.4624, + "step": 813 + }, + { + "epoch": 0.13931199726168064, + "grad_norm": 42.17389678955078, + "learning_rate": 4.614945807187679e-06, + "loss": 7.4244, + "step": 814 + }, + { + "epoch": 0.13948314222146158, + "grad_norm": 31.26105308532715, + "learning_rate": 4.620650313747861e-06, + "loss": 6.5606, + "step": 815 + }, + { + "epoch": 0.1396542871812425, + "grad_norm": 40.22693634033203, + "learning_rate": 4.626354820308043e-06, + "loss": 6.2725, + "step": 816 + }, + { + "epoch": 0.13982543214102344, + "grad_norm": 25.14350700378418, + "learning_rate": 4.632059326868226e-06, + "loss": 4.0754, + "step": 817 + }, + { + "epoch": 0.13999657710080438, + "grad_norm": 23.578937530517578, + "learning_rate": 4.637763833428408e-06, + "loss": 4.1309, + "step": 818 + }, + { + "epoch": 0.1401677220605853, + "grad_norm": 37.57481002807617, + "learning_rate": 4.643468339988591e-06, + "loss": 5.8135, + "step": 819 + }, + { + "epoch": 0.14033886702036624, + "grad_norm": 35.21710205078125, + "learning_rate": 4.649172846548774e-06, + "loss": 6.6982, + "step": 820 + }, + { + "epoch": 0.14051001198014718, + "grad_norm": 14.915112495422363, + "learning_rate": 4.654877353108957e-06, + "loss": 2.1068, + "step": 821 + }, + { + "epoch": 0.1406811569399281, + "grad_norm": 27.366252899169922, + "learning_rate": 4.6605818596691385e-06, + "loss": 3.2475, + "step": 822 + }, + { + "epoch": 0.14085230189970904, + "grad_norm": 36.40489196777344, + "learning_rate": 4.666286366229321e-06, + "loss": 6.7448, + "step": 823 + }, + { + "epoch": 0.14102344685948998, + "grad_norm": 37.40996551513672, + "learning_rate": 4.6719908727895036e-06, + "loss": 6.8328, + "step": 824 + }, + { + "epoch": 0.1411945918192709, + "grad_norm": 255.09320068359375, + "learning_rate": 4.677695379349686e-06, + "loss": 12.0992, + "step": 825 + }, + { + "epoch": 0.14136573677905187, + "grad_norm": 41.39365768432617, + "learning_rate": 4.6833998859098695e-06, + "loss": 6.1908, + "step": 826 + }, + { + "epoch": 0.1415368817388328, + "grad_norm": 14.086997032165527, + "learning_rate": 4.689104392470052e-06, + "loss": 3.3856, + "step": 827 + }, + { + "epoch": 0.14170802669861374, + "grad_norm": 33.170352935791016, + "learning_rate": 4.694808899030234e-06, + "loss": 6.9479, + "step": 828 + }, + { + "epoch": 0.14187917165839467, + "grad_norm": 37.625064849853516, + "learning_rate": 4.700513405590416e-06, + "loss": 7.6713, + "step": 829 + }, + { + "epoch": 0.1420503166181756, + "grad_norm": 25.476303100585938, + "learning_rate": 4.706217912150599e-06, + "loss": 4.2481, + "step": 830 + }, + { + "epoch": 0.14222146157795654, + "grad_norm": 27.399072647094727, + "learning_rate": 4.711922418710781e-06, + "loss": 5.508, + "step": 831 + }, + { + "epoch": 0.14239260653773747, + "grad_norm": 31.020893096923828, + "learning_rate": 4.717626925270965e-06, + "loss": 5.8831, + "step": 832 + }, + { + "epoch": 0.1425637514975184, + "grad_norm": 26.108135223388672, + "learning_rate": 4.723331431831147e-06, + "loss": 3.5932, + "step": 833 + }, + { + "epoch": 0.14273489645729934, + "grad_norm": 35.8662109375, + "learning_rate": 4.729035938391329e-06, + "loss": 5.1499, + "step": 834 + }, + { + "epoch": 0.14290604141708027, + "grad_norm": 34.714324951171875, + "learning_rate": 4.734740444951511e-06, + "loss": 5.9969, + "step": 835 + }, + { + "epoch": 0.1430771863768612, + "grad_norm": 34.023067474365234, + "learning_rate": 4.740444951511694e-06, + "loss": 6.4575, + "step": 836 + }, + { + "epoch": 0.14324833133664214, + "grad_norm": 17.601118087768555, + "learning_rate": 4.746149458071877e-06, + "loss": 2.5208, + "step": 837 + }, + { + "epoch": 0.14341947629642307, + "grad_norm": 19.672815322875977, + "learning_rate": 4.75185396463206e-06, + "loss": 2.1216, + "step": 838 + }, + { + "epoch": 0.143590621256204, + "grad_norm": 25.771137237548828, + "learning_rate": 4.757558471192242e-06, + "loss": 2.6146, + "step": 839 + }, + { + "epoch": 0.14376176621598494, + "grad_norm": 32.17550277709961, + "learning_rate": 4.763262977752424e-06, + "loss": 5.8516, + "step": 840 + }, + { + "epoch": 0.14393291117576587, + "grad_norm": 72.34523010253906, + "learning_rate": 4.768967484312607e-06, + "loss": 11.0212, + "step": 841 + }, + { + "epoch": 0.1441040561355468, + "grad_norm": 22.756717681884766, + "learning_rate": 4.774671990872789e-06, + "loss": 2.3166, + "step": 842 + }, + { + "epoch": 0.14427520109532774, + "grad_norm": 22.13291358947754, + "learning_rate": 4.7803764974329725e-06, + "loss": 2.2079, + "step": 843 + }, + { + "epoch": 0.14444634605510867, + "grad_norm": 65.32748413085938, + "learning_rate": 4.786081003993155e-06, + "loss": 6.3309, + "step": 844 + }, + { + "epoch": 0.1446174910148896, + "grad_norm": 242.9714813232422, + "learning_rate": 4.791785510553338e-06, + "loss": 11.9379, + "step": 845 + }, + { + "epoch": 0.14478863597467054, + "grad_norm": 21.737802505493164, + "learning_rate": 4.79749001711352e-06, + "loss": 3.806, + "step": 846 + }, + { + "epoch": 0.14495978093445147, + "grad_norm": 29.438758850097656, + "learning_rate": 4.803194523673702e-06, + "loss": 5.7729, + "step": 847 + }, + { + "epoch": 0.1451309258942324, + "grad_norm": 25.701087951660156, + "learning_rate": 4.808899030233884e-06, + "loss": 2.8536, + "step": 848 + }, + { + "epoch": 0.14530207085401334, + "grad_norm": 130.01524353027344, + "learning_rate": 4.814603536794068e-06, + "loss": 7.5391, + "step": 849 + }, + { + "epoch": 0.14547321581379427, + "grad_norm": 30.284828186035156, + "learning_rate": 4.82030804335425e-06, + "loss": 3.806, + "step": 850 + }, + { + "epoch": 0.1456443607735752, + "grad_norm": 23.351642608642578, + "learning_rate": 4.826012549914433e-06, + "loss": 4.2263, + "step": 851 + }, + { + "epoch": 0.14581550573335617, + "grad_norm": 216.2431182861328, + "learning_rate": 4.831717056474615e-06, + "loss": 9.159, + "step": 852 + }, + { + "epoch": 0.1459866506931371, + "grad_norm": 35.071754455566406, + "learning_rate": 4.837421563034797e-06, + "loss": 6.503, + "step": 853 + }, + { + "epoch": 0.14615779565291803, + "grad_norm": 34.0211296081543, + "learning_rate": 4.84312606959498e-06, + "loss": 6.4636, + "step": 854 + }, + { + "epoch": 0.14632894061269897, + "grad_norm": 17.20896339416504, + "learning_rate": 4.848830576155163e-06, + "loss": 2.7218, + "step": 855 + }, + { + "epoch": 0.1465000855724799, + "grad_norm": 136.72647094726562, + "learning_rate": 4.8545350827153454e-06, + "loss": 7.7082, + "step": 856 + }, + { + "epoch": 0.14667123053226083, + "grad_norm": 53.50956344604492, + "learning_rate": 4.860239589275528e-06, + "loss": 10.0171, + "step": 857 + }, + { + "epoch": 0.14684237549204177, + "grad_norm": 21.030473709106445, + "learning_rate": 4.8659440958357105e-06, + "loss": 4.1916, + "step": 858 + }, + { + "epoch": 0.1470135204518227, + "grad_norm": 34.38727569580078, + "learning_rate": 4.871648602395892e-06, + "loss": 5.969, + "step": 859 + }, + { + "epoch": 0.14718466541160363, + "grad_norm": 22.703882217407227, + "learning_rate": 4.8773531089560756e-06, + "loss": 2.4073, + "step": 860 + }, + { + "epoch": 0.14735581037138457, + "grad_norm": 33.388858795166016, + "learning_rate": 4.883057615516258e-06, + "loss": 5.7571, + "step": 861 + }, + { + "epoch": 0.1475269553311655, + "grad_norm": 35.79853820800781, + "learning_rate": 4.888762122076441e-06, + "loss": 5.9363, + "step": 862 + }, + { + "epoch": 0.14769810029094643, + "grad_norm": 20.656721115112305, + "learning_rate": 4.894466628636623e-06, + "loss": 2.0406, + "step": 863 + }, + { + "epoch": 0.14786924525072737, + "grad_norm": 35.20976638793945, + "learning_rate": 4.900171135196806e-06, + "loss": 5.8613, + "step": 864 + }, + { + "epoch": 0.1480403902105083, + "grad_norm": 22.342880249023438, + "learning_rate": 4.905875641756987e-06, + "loss": 4.0119, + "step": 865 + }, + { + "epoch": 0.14821153517028923, + "grad_norm": 33.253292083740234, + "learning_rate": 4.911580148317171e-06, + "loss": 4.62, + "step": 866 + }, + { + "epoch": 0.14838268013007017, + "grad_norm": 186.65093994140625, + "learning_rate": 4.917284654877353e-06, + "loss": 11.2662, + "step": 867 + }, + { + "epoch": 0.1485538250898511, + "grad_norm": 15.842426300048828, + "learning_rate": 4.922989161437536e-06, + "loss": 2.0607, + "step": 868 + }, + { + "epoch": 0.14872497004963203, + "grad_norm": 26.70699119567871, + "learning_rate": 4.928693667997718e-06, + "loss": 3.1737, + "step": 869 + }, + { + "epoch": 0.14889611500941297, + "grad_norm": 33.37158966064453, + "learning_rate": 4.934398174557901e-06, + "loss": 4.7352, + "step": 870 + }, + { + "epoch": 0.1490672599691939, + "grad_norm": 26.4490966796875, + "learning_rate": 4.940102681118083e-06, + "loss": 4.2178, + "step": 871 + }, + { + "epoch": 0.14923840492897483, + "grad_norm": 33.25678634643555, + "learning_rate": 4.945807187678266e-06, + "loss": 5.0764, + "step": 872 + }, + { + "epoch": 0.14940954988875577, + "grad_norm": 38.204769134521484, + "learning_rate": 4.9515116942384485e-06, + "loss": 5.8078, + "step": 873 + }, + { + "epoch": 0.1495806948485367, + "grad_norm": 27.79875946044922, + "learning_rate": 4.957216200798631e-06, + "loss": 5.6432, + "step": 874 + }, + { + "epoch": 0.14975183980831763, + "grad_norm": 32.442115783691406, + "learning_rate": 4.9629207073588135e-06, + "loss": 5.7378, + "step": 875 + }, + { + "epoch": 0.14992298476809857, + "grad_norm": 57.06877517700195, + "learning_rate": 4.968625213918996e-06, + "loss": 10.3136, + "step": 876 + }, + { + "epoch": 0.15009412972787953, + "grad_norm": 32.131187438964844, + "learning_rate": 4.9743297204791794e-06, + "loss": 4.6921, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_nli-pairs_loss": 5.535374164581299, + "eval_nli-pairs_runtime": 4.3709, + "eval_nli-pairs_samples_per_second": 45.757, + "eval_nli-pairs_steps_per_second": 1.601, + "eval_sts-test_pearson_cosine": 0.6147169012893178, + "eval_sts-test_pearson_dot": 0.4334302941897573, + "eval_sts-test_pearson_euclidean": 0.6082490673246602, + "eval_sts-test_pearson_manhattan": 0.616700428941834, + "eval_sts-test_pearson_max": 0.616700428941834, + "eval_sts-test_spearman_cosine": 0.5972327557562241, + "eval_sts-test_spearman_dot": 0.41946207508864325, + "eval_sts-test_spearman_euclidean": 0.5959187544369754, + "eval_sts-test_spearman_manhattan": 0.6029031731511296, + "eval_sts-test_spearman_max": 0.6029031731511296, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_vitaminc-pairs_loss": 3.619838237762451, + "eval_vitaminc-pairs_runtime": 2.7372, + "eval_vitaminc-pairs_samples_per_second": 73.068, + "eval_vitaminc-pairs_steps_per_second": 2.557, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_qnli-contrastive_loss": 12.3779878616333, + "eval_qnli-contrastive_runtime": 0.6382, + "eval_qnli-contrastive_samples_per_second": 313.373, + "eval_qnli-contrastive_steps_per_second": 10.968, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_scitail-pairs-qa_loss": 1.6706750392913818, + "eval_scitail-pairs-qa_runtime": 1.6279, + "eval_scitail-pairs-qa_samples_per_second": 122.855, + "eval_scitail-pairs-qa_steps_per_second": 4.3, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_scitail-pairs-pos_loss": 3.0242857933044434, + "eval_scitail-pairs-pos_runtime": 2.6188, + "eval_scitail-pairs-pos_samples_per_second": 76.369, + "eval_scitail-pairs-pos_steps_per_second": 2.673, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_xsum-pairs_loss": 3.0581634044647217, + "eval_xsum-pairs_runtime": 2.6458, + "eval_xsum-pairs_samples_per_second": 66.142, + "eval_xsum-pairs_steps_per_second": 2.268, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_compression-pairs_loss": 1.9685934782028198, + "eval_compression-pairs_runtime": 0.5084, + "eval_compression-pairs_samples_per_second": 393.398, + "eval_compression-pairs_steps_per_second": 13.769, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_sciq_pairs_loss": 6.824851989746094, + "eval_sciq_pairs_runtime": 9.1685, + "eval_sciq_pairs_samples_per_second": 21.814, + "eval_sciq_pairs_steps_per_second": 0.763, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_qasc_pairs_loss": 10.253314018249512, + "eval_qasc_pairs_runtime": 2.6538, + "eval_qasc_pairs_samples_per_second": 75.363, + "eval_qasc_pairs_steps_per_second": 2.638, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_openbookqa_pairs_loss": 5.933743953704834, + "eval_openbookqa_pairs_runtime": 0.6418, + "eval_openbookqa_pairs_samples_per_second": 107.513, + "eval_openbookqa_pairs_steps_per_second": 4.674, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_msmarco_pairs_loss": 5.185385704040527, + "eval_msmarco_pairs_runtime": 3.9947, + "eval_msmarco_pairs_samples_per_second": 50.067, + "eval_msmarco_pairs_steps_per_second": 1.752, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_nq_pairs_loss": 6.44993782043457, + "eval_nq_pairs_runtime": 8.638, + "eval_nq_pairs_samples_per_second": 23.153, + "eval_nq_pairs_steps_per_second": 0.81, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_trivia_pairs_loss": 6.129721641540527, + "eval_trivia_pairs_runtime": 12.8296, + "eval_trivia_pairs_samples_per_second": 15.589, + "eval_trivia_pairs_steps_per_second": 0.546, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_quora_pairs_loss": 1.7218067646026611, + "eval_quora_pairs_runtime": 1.5931, + "eval_quora_pairs_samples_per_second": 125.544, + "eval_quora_pairs_steps_per_second": 4.394, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_gooaq_pairs_loss": 4.168159008026123, + "eval_gooaq_pairs_runtime": 2.6679, + "eval_gooaq_pairs_samples_per_second": 74.966, + "eval_gooaq_pairs_steps_per_second": 2.624, + "step": 877 + }, + { + "epoch": 0.15026527468766046, + "grad_norm": 29.085119247436523, + "learning_rate": 4.980034227039361e-06, + "loss": 5.8249, + "step": 878 + }, + { + "epoch": 0.1504364196474414, + "grad_norm": 35.45232009887695, + "learning_rate": 4.985738733599544e-06, + "loss": 6.378, + "step": 879 + }, + { + "epoch": 0.15060756460722233, + "grad_norm": 34.018470764160156, + "learning_rate": 4.991443240159726e-06, + "loss": 5.326, + "step": 880 + }, + { + "epoch": 0.15077870956700326, + "grad_norm": 22.30814552307129, + "learning_rate": 4.997147746719909e-06, + "loss": 2.6674, + "step": 881 + }, + { + "epoch": 0.1509498545267842, + "grad_norm": 36.679046630859375, + "learning_rate": 5.002852253280091e-06, + "loss": 6.6655, + "step": 882 + }, + { + "epoch": 0.15112099948656513, + "grad_norm": 36.78900146484375, + "learning_rate": 5.008556759840275e-06, + "loss": 4.5851, + "step": 883 + }, + { + "epoch": 0.15129214444634606, + "grad_norm": 46.770057678222656, + "learning_rate": 5.014261266400456e-06, + "loss": 9.9308, + "step": 884 + }, + { + "epoch": 0.151463289406127, + "grad_norm": 27.262338638305664, + "learning_rate": 5.019965772960639e-06, + "loss": 2.2515, + "step": 885 + }, + { + "epoch": 0.15163443436590793, + "grad_norm": 193.24122619628906, + "learning_rate": 5.025670279520821e-06, + "loss": 10.7631, + "step": 886 + }, + { + "epoch": 0.15180557932568886, + "grad_norm": 30.53336524963379, + "learning_rate": 5.031374786081004e-06, + "loss": 3.9297, + "step": 887 + }, + { + "epoch": 0.1519767242854698, + "grad_norm": 13.035544395446777, + "learning_rate": 5.0370792926411864e-06, + "loss": 3.16, + "step": 888 + }, + { + "epoch": 0.15214786924525073, + "grad_norm": 27.65202522277832, + "learning_rate": 5.04278379920137e-06, + "loss": 3.1012, + "step": 889 + }, + { + "epoch": 0.15231901420503166, + "grad_norm": 28.412954330444336, + "learning_rate": 5.0484883057615515e-06, + "loss": 2.4251, + "step": 890 + }, + { + "epoch": 0.1524901591648126, + "grad_norm": 35.567386627197266, + "learning_rate": 5.054192812321734e-06, + "loss": 5.1793, + "step": 891 + }, + { + "epoch": 0.15266130412459353, + "grad_norm": 31.945302963256836, + "learning_rate": 5.0598973188819166e-06, + "loss": 4.9138, + "step": 892 + }, + { + "epoch": 0.15283244908437446, + "grad_norm": 30.31682014465332, + "learning_rate": 5.065601825442099e-06, + "loss": 4.8582, + "step": 893 + }, + { + "epoch": 0.1530035940441554, + "grad_norm": 22.3225040435791, + "learning_rate": 5.0713063320022825e-06, + "loss": 2.003, + "step": 894 + }, + { + "epoch": 0.15317473900393633, + "grad_norm": 23.375139236450195, + "learning_rate": 5.077010838562465e-06, + "loss": 2.3547, + "step": 895 + }, + { + "epoch": 0.15334588396371726, + "grad_norm": 32.41263198852539, + "learning_rate": 5.0827153451226475e-06, + "loss": 6.2287, + "step": 896 + }, + { + "epoch": 0.1535170289234982, + "grad_norm": 20.43022346496582, + "learning_rate": 5.088419851682829e-06, + "loss": 2.1189, + "step": 897 + }, + { + "epoch": 0.15368817388327913, + "grad_norm": 37.203250885009766, + "learning_rate": 5.094124358243012e-06, + "loss": 6.3629, + "step": 898 + }, + { + "epoch": 0.15385931884306006, + "grad_norm": 19.725624084472656, + "learning_rate": 5.099828864803194e-06, + "loss": 2.2277, + "step": 899 + }, + { + "epoch": 0.154030463802841, + "grad_norm": 27.29782485961914, + "learning_rate": 5.105533371363378e-06, + "loss": 2.8851, + "step": 900 + }, + { + "epoch": 0.15420160876262193, + "grad_norm": 172.8111572265625, + "learning_rate": 5.11123787792356e-06, + "loss": 9.9783, + "step": 901 + }, + { + "epoch": 0.1543727537224029, + "grad_norm": 56.5546875, + "learning_rate": 5.116942384483743e-06, + "loss": 10.3301, + "step": 902 + }, + { + "epoch": 0.15454389868218382, + "grad_norm": 32.12007522583008, + "learning_rate": 5.122646891043924e-06, + "loss": 3.3146, + "step": 903 + }, + { + "epoch": 0.15471504364196476, + "grad_norm": 197.39170837402344, + "learning_rate": 5.128351397604107e-06, + "loss": 11.016, + "step": 904 + }, + { + "epoch": 0.1548861886017457, + "grad_norm": 36.48847579956055, + "learning_rate": 5.1340559041642895e-06, + "loss": 4.8215, + "step": 905 + }, + { + "epoch": 0.15505733356152662, + "grad_norm": 31.014644622802734, + "learning_rate": 5.139760410724473e-06, + "loss": 4.7237, + "step": 906 + }, + { + "epoch": 0.15522847852130756, + "grad_norm": 31.436952590942383, + "learning_rate": 5.145464917284655e-06, + "loss": 4.6175, + "step": 907 + }, + { + "epoch": 0.1553996234810885, + "grad_norm": 27.38591194152832, + "learning_rate": 5.151169423844838e-06, + "loss": 4.0958, + "step": 908 + }, + { + "epoch": 0.15557076844086942, + "grad_norm": 31.732324600219727, + "learning_rate": 5.15687393040502e-06, + "loss": 4.4682, + "step": 909 + }, + { + "epoch": 0.15574191340065036, + "grad_norm": 15.360635757446289, + "learning_rate": 5.162578436965202e-06, + "loss": 2.4148, + "step": 910 + }, + { + "epoch": 0.1559130583604313, + "grad_norm": 172.3378448486328, + "learning_rate": 5.168282943525385e-06, + "loss": 9.8466, + "step": 911 + }, + { + "epoch": 0.15608420332021222, + "grad_norm": 31.59737777709961, + "learning_rate": 5.173987450085568e-06, + "loss": 6.1221, + "step": 912 + }, + { + "epoch": 0.15625534827999316, + "grad_norm": 20.06523323059082, + "learning_rate": 5.179691956645751e-06, + "loss": 2.0035, + "step": 913 + }, + { + "epoch": 0.1564264932397741, + "grad_norm": 25.82581329345703, + "learning_rate": 5.185396463205933e-06, + "loss": 4.7388, + "step": 914 + }, + { + "epoch": 0.15659763819955502, + "grad_norm": 13.644715309143066, + "learning_rate": 5.191100969766115e-06, + "loss": 2.1442, + "step": 915 + }, + { + "epoch": 0.15676878315933596, + "grad_norm": 36.4990119934082, + "learning_rate": 5.196805476326297e-06, + "loss": 6.2552, + "step": 916 + }, + { + "epoch": 0.1569399281191169, + "grad_norm": 35.6190185546875, + "learning_rate": 5.202509982886481e-06, + "loss": 6.3529, + "step": 917 + }, + { + "epoch": 0.15711107307889782, + "grad_norm": 13.495047569274902, + "learning_rate": 5.208214489446663e-06, + "loss": 3.5731, + "step": 918 + }, + { + "epoch": 0.15728221803867876, + "grad_norm": 236.7681121826172, + "learning_rate": 5.213918996006846e-06, + "loss": 10.5726, + "step": 919 + }, + { + "epoch": 0.1574533629984597, + "grad_norm": 34.39946746826172, + "learning_rate": 5.219623502567028e-06, + "loss": 6.0673, + "step": 920 + }, + { + "epoch": 0.15762450795824062, + "grad_norm": 12.590995788574219, + "learning_rate": 5.225328009127211e-06, + "loss": 2.77, + "step": 921 + }, + { + "epoch": 0.15779565291802156, + "grad_norm": 31.968891143798828, + "learning_rate": 5.2310325156873925e-06, + "loss": 4.1677, + "step": 922 + }, + { + "epoch": 0.1579667978778025, + "grad_norm": 31.067489624023438, + "learning_rate": 5.236737022247576e-06, + "loss": 4.716, + "step": 923 + }, + { + "epoch": 0.15813794283758342, + "grad_norm": 36.08390808105469, + "learning_rate": 5.2424415288077584e-06, + "loss": 6.528, + "step": 924 + }, + { + "epoch": 0.15830908779736436, + "grad_norm": 34.2723274230957, + "learning_rate": 5.248146035367941e-06, + "loss": 6.4655, + "step": 925 + }, + { + "epoch": 0.1584802327571453, + "grad_norm": 43.43145751953125, + "learning_rate": 5.2538505419281235e-06, + "loss": 5.6795, + "step": 926 + }, + { + "epoch": 0.15865137771692622, + "grad_norm": 32.78499221801758, + "learning_rate": 5.259555048488306e-06, + "loss": 5.6396, + "step": 927 + }, + { + "epoch": 0.15882252267670718, + "grad_norm": 35.156925201416016, + "learning_rate": 5.265259555048488e-06, + "loss": 4.7143, + "step": 928 + }, + { + "epoch": 0.15899366763648812, + "grad_norm": 34.6341552734375, + "learning_rate": 5.270964061608671e-06, + "loss": 5.6931, + "step": 929 + }, + { + "epoch": 0.15916481259626905, + "grad_norm": 35.668331146240234, + "learning_rate": 5.276668568168854e-06, + "loss": 5.6404, + "step": 930 + }, + { + "epoch": 0.15933595755604998, + "grad_norm": 34.62514877319336, + "learning_rate": 5.282373074729036e-06, + "loss": 5.0469, + "step": 931 + }, + { + "epoch": 0.15950710251583092, + "grad_norm": 37.79499435424805, + "learning_rate": 5.288077581289219e-06, + "loss": 5.3761, + "step": 932 + }, + { + "epoch": 0.15967824747561185, + "grad_norm": 40.4017333984375, + "learning_rate": 5.293782087849401e-06, + "loss": 5.6738, + "step": 933 + }, + { + "epoch": 0.15984939243539278, + "grad_norm": 35.31856155395508, + "learning_rate": 5.299486594409584e-06, + "loss": 6.4936, + "step": 934 + }, + { + "epoch": 0.16002053739517372, + "grad_norm": 126.11963653564453, + "learning_rate": 5.305191100969766e-06, + "loss": 9.9326, + "step": 935 + }, + { + "epoch": 0.16019168235495465, + "grad_norm": 34.740753173828125, + "learning_rate": 5.310895607529949e-06, + "loss": 2.0987, + "step": 936 + }, + { + "epoch": 0.16036282731473558, + "grad_norm": 34.9671745300293, + "learning_rate": 5.316600114090131e-06, + "loss": 6.2338, + "step": 937 + }, + { + "epoch": 0.16053397227451652, + "grad_norm": 21.198925018310547, + "learning_rate": 5.322304620650314e-06, + "loss": 3.5463, + "step": 938 + }, + { + "epoch": 0.16070511723429745, + "grad_norm": 30.98229217529297, + "learning_rate": 5.328009127210496e-06, + "loss": 4.7342, + "step": 939 + }, + { + "epoch": 0.16087626219407838, + "grad_norm": 41.88993835449219, + "learning_rate": 5.333713633770679e-06, + "loss": 6.5058, + "step": 940 + }, + { + "epoch": 0.16104740715385932, + "grad_norm": 24.218576431274414, + "learning_rate": 5.3394181403308615e-06, + "loss": 2.0172, + "step": 941 + }, + { + "epoch": 0.16121855211364025, + "grad_norm": 32.891719818115234, + "learning_rate": 5.345122646891044e-06, + "loss": 5.893, + "step": 942 + }, + { + "epoch": 0.16138969707342118, + "grad_norm": 38.93867874145508, + "learning_rate": 5.3508271534512265e-06, + "loss": 5.8157, + "step": 943 + }, + { + "epoch": 0.16156084203320212, + "grad_norm": 31.02938461303711, + "learning_rate": 5.356531660011409e-06, + "loss": 5.529, + "step": 944 + }, + { + "epoch": 0.16173198699298305, + "grad_norm": 36.240440368652344, + "learning_rate": 5.362236166571592e-06, + "loss": 4.7931, + "step": 945 + }, + { + "epoch": 0.16190313195276398, + "grad_norm": 23.227556228637695, + "learning_rate": 5.367940673131775e-06, + "loss": 2.1265, + "step": 946 + }, + { + "epoch": 0.16207427691254492, + "grad_norm": 40.07374954223633, + "learning_rate": 5.373645179691957e-06, + "loss": 5.8823, + "step": 947 + }, + { + "epoch": 0.16224542187232585, + "grad_norm": 29.960735321044922, + "learning_rate": 5.379349686252139e-06, + "loss": 4.6281, + "step": 948 + }, + { + "epoch": 0.16241656683210678, + "grad_norm": 173.5910186767578, + "learning_rate": 5.385054192812322e-06, + "loss": 10.3282, + "step": 949 + }, + { + "epoch": 0.16258771179188772, + "grad_norm": 37.48442840576172, + "learning_rate": 5.390758699372504e-06, + "loss": 6.1584, + "step": 950 + }, + { + "epoch": 0.16275885675166865, + "grad_norm": 39.48939514160156, + "learning_rate": 5.396463205932687e-06, + "loss": 5.655, + "step": 951 + }, + { + "epoch": 0.16293000171144958, + "grad_norm": 34.57015609741211, + "learning_rate": 5.40216771249287e-06, + "loss": 5.4251, + "step": 952 + }, + { + "epoch": 0.16310114667123055, + "grad_norm": 51.02991485595703, + "learning_rate": 5.407872219053052e-06, + "loss": 10.2283, + "step": 953 + }, + { + "epoch": 0.16327229163101148, + "grad_norm": 31.77302360534668, + "learning_rate": 5.413576725613234e-06, + "loss": 4.0174, + "step": 954 + }, + { + "epoch": 0.1634434365907924, + "grad_norm": 31.242929458618164, + "learning_rate": 5.419281232173417e-06, + "loss": 5.5883, + "step": 955 + }, + { + "epoch": 0.16361458155057335, + "grad_norm": 31.789701461791992, + "learning_rate": 5.4249857387335994e-06, + "loss": 4.5646, + "step": 956 + }, + { + "epoch": 0.16378572651035428, + "grad_norm": 34.09980392456055, + "learning_rate": 5.430690245293783e-06, + "loss": 4.9872, + "step": 957 + }, + { + "epoch": 0.1639568714701352, + "grad_norm": 31.57735252380371, + "learning_rate": 5.436394751853965e-06, + "loss": 5.158, + "step": 958 + }, + { + "epoch": 0.16412801642991615, + "grad_norm": 32.941917419433594, + "learning_rate": 5.442099258414147e-06, + "loss": 5.4497, + "step": 959 + }, + { + "epoch": 0.16429916138969708, + "grad_norm": 200.919921875, + "learning_rate": 5.4478037649743296e-06, + "loss": 9.7888, + "step": 960 + }, + { + "epoch": 0.164470306349478, + "grad_norm": 28.78856658935547, + "learning_rate": 5.453508271534512e-06, + "loss": 5.0757, + "step": 961 + }, + { + "epoch": 0.16464145130925895, + "grad_norm": 22.877927780151367, + "learning_rate": 5.459212778094695e-06, + "loss": 3.6177, + "step": 962 + }, + { + "epoch": 0.16481259626903988, + "grad_norm": 24.904977798461914, + "learning_rate": 5.464917284654878e-06, + "loss": 4.2287, + "step": 963 + }, + { + "epoch": 0.1649837412288208, + "grad_norm": 35.849124908447266, + "learning_rate": 5.4706217912150605e-06, + "loss": 5.1121, + "step": 964 + }, + { + "epoch": 0.16515488618860175, + "grad_norm": 31.580976486206055, + "learning_rate": 5.476326297775242e-06, + "loss": 4.4859, + "step": 965 + }, + { + "epoch": 0.16532603114838268, + "grad_norm": 30.3056697845459, + "learning_rate": 5.482030804335425e-06, + "loss": 4.5076, + "step": 966 + }, + { + "epoch": 0.1654971761081636, + "grad_norm": 34.674468994140625, + "learning_rate": 5.487735310895607e-06, + "loss": 5.7789, + "step": 967 + }, + { + "epoch": 0.16566832106794455, + "grad_norm": 28.0445556640625, + "learning_rate": 5.49343981745579e-06, + "loss": 2.7613, + "step": 968 + }, + { + "epoch": 0.16583946602772548, + "grad_norm": 33.28575134277344, + "learning_rate": 5.499144324015973e-06, + "loss": 5.1032, + "step": 969 + }, + { + "epoch": 0.1660106109875064, + "grad_norm": 35.53700637817383, + "learning_rate": 5.504848830576156e-06, + "loss": 5.2129, + "step": 970 + }, + { + "epoch": 0.16618175594728735, + "grad_norm": 33.2183952331543, + "learning_rate": 5.510553337136338e-06, + "loss": 5.6908, + "step": 971 + }, + { + "epoch": 0.16635290090706828, + "grad_norm": 30.640926361083984, + "learning_rate": 5.51625784369652e-06, + "loss": 4.4325, + "step": 972 + }, + { + "epoch": 0.1665240458668492, + "grad_norm": 24.672338485717773, + "learning_rate": 5.5219623502567025e-06, + "loss": 3.9552, + "step": 973 + }, + { + "epoch": 0.16669519082663015, + "grad_norm": 33.66337585449219, + "learning_rate": 5.527666856816886e-06, + "loss": 5.4014, + "step": 974 + }, + { + "epoch": 0.16686633578641108, + "grad_norm": 32.082942962646484, + "learning_rate": 5.533371363377068e-06, + "loss": 5.9258, + "step": 975 + }, + { + "epoch": 0.167037480746192, + "grad_norm": 37.91094970703125, + "learning_rate": 5.539075869937251e-06, + "loss": 5.717, + "step": 976 + }, + { + "epoch": 0.16720862570597295, + "grad_norm": 20.26280975341797, + "learning_rate": 5.5447803764974335e-06, + "loss": 2.2263, + "step": 977 + }, + { + "epoch": 0.16737977066575388, + "grad_norm": 48.14308547973633, + "learning_rate": 5.550484883057615e-06, + "loss": 9.6938, + "step": 978 + }, + { + "epoch": 0.16755091562553484, + "grad_norm": 22.81192970275879, + "learning_rate": 5.556189389617798e-06, + "loss": 3.7015, + "step": 979 + }, + { + "epoch": 0.16772206058531577, + "grad_norm": 27.474571228027344, + "learning_rate": 5.561893896177981e-06, + "loss": 2.9404, + "step": 980 + }, + { + "epoch": 0.1678932055450967, + "grad_norm": 25.376007080078125, + "learning_rate": 5.567598402738164e-06, + "loss": 2.3926, + "step": 981 + }, + { + "epoch": 0.16806435050487764, + "grad_norm": 31.575468063354492, + "learning_rate": 5.573302909298346e-06, + "loss": 4.7349, + "step": 982 + }, + { + "epoch": 0.16823549546465857, + "grad_norm": 194.93817138671875, + "learning_rate": 5.579007415858529e-06, + "loss": 9.7172, + "step": 983 + }, + { + "epoch": 0.1684066404244395, + "grad_norm": 31.26558494567871, + "learning_rate": 5.58471192241871e-06, + "loss": 3.9837, + "step": 984 + }, + { + "epoch": 0.16857778538422044, + "grad_norm": 32.1373405456543, + "learning_rate": 5.590416428978893e-06, + "loss": 5.0026, + "step": 985 + }, + { + "epoch": 0.16874893034400137, + "grad_norm": 37.07416915893555, + "learning_rate": 5.596120935539076e-06, + "loss": 5.8572, + "step": 986 + }, + { + "epoch": 0.1689200753037823, + "grad_norm": 35.09983825683594, + "learning_rate": 5.601825442099259e-06, + "loss": 5.6302, + "step": 987 + }, + { + "epoch": 0.16909122026356324, + "grad_norm": 46.96855926513672, + "learning_rate": 5.607529948659441e-06, + "loss": 9.6255, + "step": 988 + }, + { + "epoch": 0.16926236522334417, + "grad_norm": 36.15262985229492, + "learning_rate": 5.613234455219624e-06, + "loss": 5.5484, + "step": 989 + }, + { + "epoch": 0.1694335101831251, + "grad_norm": 33.642967224121094, + "learning_rate": 5.6189389617798055e-06, + "loss": 5.5827, + "step": 990 + }, + { + "epoch": 0.16960465514290604, + "grad_norm": 27.581716537475586, + "learning_rate": 5.624643468339988e-06, + "loss": 2.9652, + "step": 991 + }, + { + "epoch": 0.16977580010268697, + "grad_norm": 19.107044219970703, + "learning_rate": 5.6303479749001714e-06, + "loss": 1.7442, + "step": 992 + }, + { + "epoch": 0.1699469450624679, + "grad_norm": 165.6937255859375, + "learning_rate": 5.636052481460354e-06, + "loss": 10.2439, + "step": 993 + }, + { + "epoch": 0.17011809002224884, + "grad_norm": 171.38658142089844, + "learning_rate": 5.6417569880205365e-06, + "loss": 10.7544, + "step": 994 + }, + { + "epoch": 0.17028923498202977, + "grad_norm": 29.20503807067871, + "learning_rate": 5.647461494580719e-06, + "loss": 4.176, + "step": 995 + }, + { + "epoch": 0.1704603799418107, + "grad_norm": 29.09612274169922, + "learning_rate": 5.6531660011409016e-06, + "loss": 4.1945, + "step": 996 + }, + { + "epoch": 0.17063152490159164, + "grad_norm": 39.78682327270508, + "learning_rate": 5.658870507701084e-06, + "loss": 6.4205, + "step": 997 + }, + { + "epoch": 0.17080266986137257, + "grad_norm": 13.687639236450195, + "learning_rate": 5.664575014261267e-06, + "loss": 3.468, + "step": 998 + }, + { + "epoch": 0.1709738148211535, + "grad_norm": 41.89799118041992, + "learning_rate": 5.670279520821449e-06, + "loss": 7.13, + "step": 999 + }, + { + "epoch": 0.17114495978093444, + "grad_norm": 22.78835678100586, + "learning_rate": 5.675984027381632e-06, + "loss": 2.7249, + "step": 1000 + }, + { + "epoch": 0.17131610474071537, + "grad_norm": 26.538780212402344, + "learning_rate": 5.681688533941814e-06, + "loss": 3.2385, + "step": 1001 + }, + { + "epoch": 0.1714872497004963, + "grad_norm": 24.171205520629883, + "learning_rate": 5.687393040501997e-06, + "loss": 3.7183, + "step": 1002 + }, + { + "epoch": 0.17165839466027724, + "grad_norm": 35.46499252319336, + "learning_rate": 5.693097547062179e-06, + "loss": 5.4996, + "step": 1003 + }, + { + "epoch": 0.1718295396200582, + "grad_norm": 15.119646072387695, + "learning_rate": 5.698802053622362e-06, + "loss": 2.4476, + "step": 1004 + }, + { + "epoch": 0.17200068457983914, + "grad_norm": 43.560546875, + "learning_rate": 5.704506560182544e-06, + "loss": 9.1856, + "step": 1005 + }, + { + "epoch": 0.17217182953962007, + "grad_norm": 42.41808319091797, + "learning_rate": 5.710211066742727e-06, + "loss": 5.6756, + "step": 1006 + }, + { + "epoch": 0.172342974499401, + "grad_norm": 34.344207763671875, + "learning_rate": 5.715915573302909e-06, + "loss": 5.2383, + "step": 1007 + }, + { + "epoch": 0.17251411945918194, + "grad_norm": 19.511310577392578, + "learning_rate": 5.721620079863092e-06, + "loss": 3.3214, + "step": 1008 + }, + { + "epoch": 0.17268526441896287, + "grad_norm": 33.06563949584961, + "learning_rate": 5.7273245864232745e-06, + "loss": 5.6944, + "step": 1009 + }, + { + "epoch": 0.1728564093787438, + "grad_norm": 38.382041931152344, + "learning_rate": 5.733029092983457e-06, + "loss": 5.9898, + "step": 1010 + }, + { + "epoch": 0.17302755433852474, + "grad_norm": 28.5861759185791, + "learning_rate": 5.7387335995436395e-06, + "loss": 5.2048, + "step": 1011 + }, + { + "epoch": 0.17319869929830567, + "grad_norm": 31.76646614074707, + "learning_rate": 5.744438106103822e-06, + "loss": 6.0811, + "step": 1012 + }, + { + "epoch": 0.1733698442580866, + "grad_norm": 37.81482696533203, + "learning_rate": 5.750142612664005e-06, + "loss": 4.8642, + "step": 1013 + }, + { + "epoch": 0.17354098921786754, + "grad_norm": 45.32394790649414, + "learning_rate": 5.755847119224188e-06, + "loss": 9.5803, + "step": 1014 + }, + { + "epoch": 0.17371213417764847, + "grad_norm": 35.39071273803711, + "learning_rate": 5.76155162578437e-06, + "loss": 4.3758, + "step": 1015 + }, + { + "epoch": 0.1738832791374294, + "grad_norm": 31.971323013305664, + "learning_rate": 5.767256132344552e-06, + "loss": 4.2616, + "step": 1016 + }, + { + "epoch": 0.17405442409721034, + "grad_norm": 29.855161666870117, + "learning_rate": 5.772960638904735e-06, + "loss": 5.5371, + "step": 1017 + }, + { + "epoch": 0.17422556905699127, + "grad_norm": 21.00974464416504, + "learning_rate": 5.778665145464917e-06, + "loss": 1.9809, + "step": 1018 + }, + { + "epoch": 0.1743967140167722, + "grad_norm": 23.60835075378418, + "learning_rate": 5.7843696520251e-06, + "loss": 2.5916, + "step": 1019 + }, + { + "epoch": 0.17456785897655314, + "grad_norm": 36.11520767211914, + "learning_rate": 5.790074158585283e-06, + "loss": 4.9198, + "step": 1020 + }, + { + "epoch": 0.17473900393633407, + "grad_norm": 21.838703155517578, + "learning_rate": 5.795778665145466e-06, + "loss": 2.1235, + "step": 1021 + }, + { + "epoch": 0.174910148896115, + "grad_norm": 28.41387367248535, + "learning_rate": 5.801483171705647e-06, + "loss": 5.0401, + "step": 1022 + }, + { + "epoch": 0.17508129385589594, + "grad_norm": 28.482187271118164, + "learning_rate": 5.80718767826583e-06, + "loss": 4.7167, + "step": 1023 + }, + { + "epoch": 0.17525243881567687, + "grad_norm": 33.954307556152344, + "learning_rate": 5.8128921848260124e-06, + "loss": 4.9666, + "step": 1024 + }, + { + "epoch": 0.1754235837754578, + "grad_norm": 33.401920318603516, + "learning_rate": 5.818596691386195e-06, + "loss": 6.3783, + "step": 1025 + }, + { + "epoch": 0.17559472873523874, + "grad_norm": 37.047691345214844, + "learning_rate": 5.824301197946378e-06, + "loss": 5.5925, + "step": 1026 + }, + { + "epoch": 0.17576587369501967, + "grad_norm": 30.060083389282227, + "learning_rate": 5.830005704506561e-06, + "loss": 3.8415, + "step": 1027 + }, + { + "epoch": 0.1759370186548006, + "grad_norm": 30.832544326782227, + "learning_rate": 5.8357102110667426e-06, + "loss": 4.9379, + "step": 1028 + }, + { + "epoch": 0.17610816361458156, + "grad_norm": 30.651966094970703, + "learning_rate": 5.841414717626925e-06, + "loss": 3.9393, + "step": 1029 + }, + { + "epoch": 0.1762793085743625, + "grad_norm": 12.284616470336914, + "learning_rate": 5.847119224187108e-06, + "loss": 2.7979, + "step": 1030 + }, + { + "epoch": 0.17645045353414343, + "grad_norm": 25.138864517211914, + "learning_rate": 5.85282373074729e-06, + "loss": 3.6294, + "step": 1031 + }, + { + "epoch": 0.17662159849392436, + "grad_norm": 19.136524200439453, + "learning_rate": 5.8585282373074735e-06, + "loss": 1.5926, + "step": 1032 + }, + { + "epoch": 0.1767927434537053, + "grad_norm": 36.646968841552734, + "learning_rate": 5.864232743867656e-06, + "loss": 5.8265, + "step": 1033 + }, + { + "epoch": 0.17696388841348623, + "grad_norm": 17.363170623779297, + "learning_rate": 5.869937250427838e-06, + "loss": 1.7465, + "step": 1034 + }, + { + "epoch": 0.17713503337326716, + "grad_norm": 29.55439567565918, + "learning_rate": 5.87564175698802e-06, + "loss": 3.617, + "step": 1035 + }, + { + "epoch": 0.1773061783330481, + "grad_norm": 203.16549682617188, + "learning_rate": 5.881346263548203e-06, + "loss": 7.9826, + "step": 1036 + }, + { + "epoch": 0.17747732329282903, + "grad_norm": 17.790836334228516, + "learning_rate": 5.887050770108386e-06, + "loss": 2.1574, + "step": 1037 + }, + { + "epoch": 0.17764846825260996, + "grad_norm": 40.40040969848633, + "learning_rate": 5.892755276668569e-06, + "loss": 5.5116, + "step": 1038 + }, + { + "epoch": 0.1778196132123909, + "grad_norm": 30.316959381103516, + "learning_rate": 5.898459783228751e-06, + "loss": 4.4268, + "step": 1039 + }, + { + "epoch": 0.17799075817217183, + "grad_norm": 34.86418151855469, + "learning_rate": 5.904164289788933e-06, + "loss": 4.9673, + "step": 1040 + }, + { + "epoch": 0.17816190313195276, + "grad_norm": 198.34268188476562, + "learning_rate": 5.9098687963491155e-06, + "loss": 10.3881, + "step": 1041 + }, + { + "epoch": 0.1783330480917337, + "grad_norm": 29.608211517333984, + "learning_rate": 5.915573302909298e-06, + "loss": 3.9641, + "step": 1042 + }, + { + "epoch": 0.17850419305151463, + "grad_norm": 28.76857566833496, + "learning_rate": 5.921277809469481e-06, + "loss": 4.0211, + "step": 1043 + }, + { + "epoch": 0.17867533801129556, + "grad_norm": 26.37080955505371, + "learning_rate": 5.926982316029664e-06, + "loss": 4.6642, + "step": 1044 + }, + { + "epoch": 0.1788464829710765, + "grad_norm": 32.01490020751953, + "learning_rate": 5.9326868225898464e-06, + "loss": 5.5217, + "step": 1045 + }, + { + "epoch": 0.17901762793085743, + "grad_norm": 22.62516212463379, + "learning_rate": 5.938391329150029e-06, + "loss": 1.9563, + "step": 1046 + }, + { + "epoch": 0.17918877289063836, + "grad_norm": 40.089229583740234, + "learning_rate": 5.944095835710211e-06, + "loss": 5.9567, + "step": 1047 + }, + { + "epoch": 0.1793599178504193, + "grad_norm": 22.854562759399414, + "learning_rate": 5.949800342270393e-06, + "loss": 1.9063, + "step": 1048 + }, + { + "epoch": 0.17953106281020023, + "grad_norm": 99.86076354980469, + "learning_rate": 5.9555048488305766e-06, + "loss": 6.6872, + "step": 1049 + }, + { + "epoch": 0.17970220776998116, + "grad_norm": 42.04011154174805, + "learning_rate": 5.961209355390759e-06, + "loss": 6.4974, + "step": 1050 + }, + { + "epoch": 0.1798733527297621, + "grad_norm": 26.85508155822754, + "learning_rate": 5.966913861950942e-06, + "loss": 4.3443, + "step": 1051 + }, + { + "epoch": 0.18004449768954303, + "grad_norm": 29.8301944732666, + "learning_rate": 5.972618368511124e-06, + "loss": 5.0599, + "step": 1052 + }, + { + "epoch": 0.18021564264932396, + "grad_norm": 50.89991760253906, + "learning_rate": 5.978322875071306e-06, + "loss": 9.764, + "step": 1053 + }, + { + "epoch": 0.1803867876091049, + "grad_norm": 32.19784927368164, + "learning_rate": 5.984027381631489e-06, + "loss": 4.1811, + "step": 1054 + }, + { + "epoch": 0.18055793256888586, + "grad_norm": 46.780487060546875, + "learning_rate": 5.989731888191672e-06, + "loss": 9.4505, + "step": 1055 + }, + { + "epoch": 0.1807290775286668, + "grad_norm": 17.571828842163086, + "learning_rate": 5.995436394751854e-06, + "loss": 1.8957, + "step": 1056 + }, + { + "epoch": 0.18090022248844773, + "grad_norm": 30.740095138549805, + "learning_rate": 6.001140901312037e-06, + "loss": 4.0522, + "step": 1057 + }, + { + "epoch": 0.18107136744822866, + "grad_norm": 36.38762283325195, + "learning_rate": 6.006845407872219e-06, + "loss": 5.546, + "step": 1058 + }, + { + "epoch": 0.1812425124080096, + "grad_norm": 37.66824722290039, + "learning_rate": 6.012549914432401e-06, + "loss": 4.7406, + "step": 1059 + }, + { + "epoch": 0.18141365736779053, + "grad_norm": 33.9829216003418, + "learning_rate": 6.018254420992584e-06, + "loss": 4.8123, + "step": 1060 + }, + { + "epoch": 0.18158480232757146, + "grad_norm": 25.99117088317871, + "learning_rate": 6.023958927552767e-06, + "loss": 4.6063, + "step": 1061 + }, + { + "epoch": 0.1817559472873524, + "grad_norm": 29.198394775390625, + "learning_rate": 6.0296634341129495e-06, + "loss": 5.0514, + "step": 1062 + }, + { + "epoch": 0.18192709224713333, + "grad_norm": 14.127655982971191, + "learning_rate": 6.035367940673132e-06, + "loss": 1.3962, + "step": 1063 + }, + { + "epoch": 0.18209823720691426, + "grad_norm": 12.10257339477539, + "learning_rate": 6.0410724472333145e-06, + "loss": 2.0181, + "step": 1064 + }, + { + "epoch": 0.1822693821666952, + "grad_norm": 19.635854721069336, + "learning_rate": 6.046776953793496e-06, + "loss": 1.7151, + "step": 1065 + }, + { + "epoch": 0.18244052712647613, + "grad_norm": 189.35772705078125, + "learning_rate": 6.05248146035368e-06, + "loss": 9.8327, + "step": 1066 + }, + { + "epoch": 0.18261167208625706, + "grad_norm": 34.833229064941406, + "learning_rate": 6.058185966913862e-06, + "loss": 5.6448, + "step": 1067 + }, + { + "epoch": 0.182782817046038, + "grad_norm": 24.17336654663086, + "learning_rate": 6.063890473474045e-06, + "loss": 3.8977, + "step": 1068 + }, + { + "epoch": 0.18295396200581893, + "grad_norm": 32.84638214111328, + "learning_rate": 6.069594980034227e-06, + "loss": 5.7649, + "step": 1069 + }, + { + "epoch": 0.18312510696559986, + "grad_norm": 46.32835388183594, + "learning_rate": 6.07529948659441e-06, + "loss": 9.2569, + "step": 1070 + }, + { + "epoch": 0.1832962519253808, + "grad_norm": 15.697673797607422, + "learning_rate": 6.081003993154592e-06, + "loss": 1.6445, + "step": 1071 + }, + { + "epoch": 0.18346739688516173, + "grad_norm": 31.891868591308594, + "learning_rate": 6.086708499714775e-06, + "loss": 5.4669, + "step": 1072 + }, + { + "epoch": 0.18363854184494266, + "grad_norm": 29.735248565673828, + "learning_rate": 6.092413006274957e-06, + "loss": 5.0552, + "step": 1073 + }, + { + "epoch": 0.1838096868047236, + "grad_norm": 15.486328125, + "learning_rate": 6.09811751283514e-06, + "loss": 2.2292, + "step": 1074 + }, + { + "epoch": 0.18398083176450453, + "grad_norm": 24.518693923950195, + "learning_rate": 6.103822019395322e-06, + "loss": 3.5355, + "step": 1075 + }, + { + "epoch": 0.18415197672428546, + "grad_norm": 27.474645614624023, + "learning_rate": 6.109526525955505e-06, + "loss": 2.0704, + "step": 1076 + }, + { + "epoch": 0.1843231216840664, + "grad_norm": 21.003856658935547, + "learning_rate": 6.115231032515688e-06, + "loss": 2.0773, + "step": 1077 + }, + { + "epoch": 0.18449426664384733, + "grad_norm": 12.948555946350098, + "learning_rate": 6.12093553907587e-06, + "loss": 1.9105, + "step": 1078 + }, + { + "epoch": 0.18466541160362826, + "grad_norm": 28.35967254638672, + "learning_rate": 6.1266400456360525e-06, + "loss": 5.1778, + "step": 1079 + }, + { + "epoch": 0.18483655656340922, + "grad_norm": 28.59235954284668, + "learning_rate": 6.132344552196235e-06, + "loss": 3.9724, + "step": 1080 + }, + { + "epoch": 0.18500770152319015, + "grad_norm": 32.077518463134766, + "learning_rate": 6.138049058756418e-06, + "loss": 4.2397, + "step": 1081 + }, + { + "epoch": 0.1851788464829711, + "grad_norm": 34.8428955078125, + "learning_rate": 6.1437535653166e-06, + "loss": 4.3906, + "step": 1082 + }, + { + "epoch": 0.18534999144275202, + "grad_norm": 36.8244743347168, + "learning_rate": 6.1494580718767835e-06, + "loss": 4.6433, + "step": 1083 + }, + { + "epoch": 0.18552113640253295, + "grad_norm": 34.37318420410156, + "learning_rate": 6.155162578436965e-06, + "loss": 4.7285, + "step": 1084 + }, + { + "epoch": 0.1856922813623139, + "grad_norm": 34.02301025390625, + "learning_rate": 6.160867084997148e-06, + "loss": 5.1995, + "step": 1085 + }, + { + "epoch": 0.18586342632209482, + "grad_norm": 15.779897689819336, + "learning_rate": 6.16657159155733e-06, + "loss": 1.5138, + "step": 1086 + }, + { + "epoch": 0.18603457128187575, + "grad_norm": 45.183841705322266, + "learning_rate": 6.172276098117513e-06, + "loss": 6.6194, + "step": 1087 + }, + { + "epoch": 0.1862057162416567, + "grad_norm": 15.437774658203125, + "learning_rate": 6.177980604677695e-06, + "loss": 1.4242, + "step": 1088 + }, + { + "epoch": 0.18637686120143762, + "grad_norm": 246.0555419921875, + "learning_rate": 6.183685111237879e-06, + "loss": 10.7677, + "step": 1089 + }, + { + "epoch": 0.18654800616121855, + "grad_norm": 8.7081937789917, + "learning_rate": 6.18938961779806e-06, + "loss": 2.3527, + "step": 1090 + }, + { + "epoch": 0.1867191511209995, + "grad_norm": 35.0928840637207, + "learning_rate": 6.195094124358243e-06, + "loss": 5.4856, + "step": 1091 + }, + { + "epoch": 0.18689029608078042, + "grad_norm": 36.24078369140625, + "learning_rate": 6.2007986309184254e-06, + "loss": 5.1105, + "step": 1092 + }, + { + "epoch": 0.18706144104056135, + "grad_norm": 41.07029724121094, + "learning_rate": 6.206503137478608e-06, + "loss": 5.543, + "step": 1093 + }, + { + "epoch": 0.1872325860003423, + "grad_norm": 36.27534484863281, + "learning_rate": 6.212207644038791e-06, + "loss": 4.4058, + "step": 1094 + }, + { + "epoch": 0.18740373096012322, + "grad_norm": 34.61309814453125, + "learning_rate": 6.217912150598974e-06, + "loss": 4.9065, + "step": 1095 + }, + { + "epoch": 0.18757487591990415, + "grad_norm": 36.856388092041016, + "learning_rate": 6.223616657159156e-06, + "loss": 4.8059, + "step": 1096 + }, + { + "epoch": 0.1877460208796851, + "grad_norm": 39.40951156616211, + "learning_rate": 6.229321163719338e-06, + "loss": 5.8853, + "step": 1097 + }, + { + "epoch": 0.18791716583946602, + "grad_norm": 30.013790130615234, + "learning_rate": 6.235025670279521e-06, + "loss": 4.1051, + "step": 1098 + }, + { + "epoch": 0.18808831079924695, + "grad_norm": 27.43667984008789, + "learning_rate": 6.240730176839703e-06, + "loss": 3.661, + "step": 1099 + }, + { + "epoch": 0.1882594557590279, + "grad_norm": 22.01202964782715, + "learning_rate": 6.2464346833998865e-06, + "loss": 2.0165, + "step": 1100 + }, + { + "epoch": 0.18843060071880882, + "grad_norm": 23.981887817382812, + "learning_rate": 6.252139189960069e-06, + "loss": 1.8586, + "step": 1101 + }, + { + "epoch": 0.18860174567858976, + "grad_norm": 221.93540954589844, + "learning_rate": 6.257843696520252e-06, + "loss": 8.0869, + "step": 1102 + }, + { + "epoch": 0.1887728906383707, + "grad_norm": 32.2524299621582, + "learning_rate": 6.263548203080433e-06, + "loss": 4.6553, + "step": 1103 + }, + { + "epoch": 0.18894403559815162, + "grad_norm": 14.555329322814941, + "learning_rate": 6.269252709640616e-06, + "loss": 2.0657, + "step": 1104 + }, + { + "epoch": 0.18911518055793256, + "grad_norm": 27.233903884887695, + "learning_rate": 6.274957216200798e-06, + "loss": 3.7143, + "step": 1105 + }, + { + "epoch": 0.18928632551771352, + "grad_norm": 15.294402122497559, + "learning_rate": 6.280661722760982e-06, + "loss": 1.4409, + "step": 1106 + }, + { + "epoch": 0.18945747047749445, + "grad_norm": 223.1316375732422, + "learning_rate": 6.286366229321164e-06, + "loss": 9.676, + "step": 1107 + }, + { + "epoch": 0.18962861543727538, + "grad_norm": 36.643463134765625, + "learning_rate": 6.292070735881347e-06, + "loss": 4.7202, + "step": 1108 + }, + { + "epoch": 0.18979976039705632, + "grad_norm": 37.47721481323242, + "learning_rate": 6.2977752424415285e-06, + "loss": 4.8366, + "step": 1109 + }, + { + "epoch": 0.18997090535683725, + "grad_norm": 34.74982833862305, + "learning_rate": 6.303479749001711e-06, + "loss": 4.6667, + "step": 1110 + }, + { + "epoch": 0.19014205031661818, + "grad_norm": 38.055728912353516, + "learning_rate": 6.3091842555618935e-06, + "loss": 5.3396, + "step": 1111 + }, + { + "epoch": 0.19031319527639912, + "grad_norm": 33.44966506958008, + "learning_rate": 6.314888762122077e-06, + "loss": 5.0909, + "step": 1112 + }, + { + "epoch": 0.19048434023618005, + "grad_norm": 34.397132873535156, + "learning_rate": 6.3205932686822594e-06, + "loss": 5.3514, + "step": 1113 + }, + { + "epoch": 0.19065548519596098, + "grad_norm": 39.06338119506836, + "learning_rate": 6.326297775242442e-06, + "loss": 6.3797, + "step": 1114 + }, + { + "epoch": 0.19082663015574192, + "grad_norm": 40.017799377441406, + "learning_rate": 6.332002281802624e-06, + "loss": 5.5943, + "step": 1115 + }, + { + "epoch": 0.19099777511552285, + "grad_norm": 11.964347839355469, + "learning_rate": 6.337706788362806e-06, + "loss": 1.8095, + "step": 1116 + }, + { + "epoch": 0.19116892007530378, + "grad_norm": 12.956400871276855, + "learning_rate": 6.3434112949229896e-06, + "loss": 1.3529, + "step": 1117 + }, + { + "epoch": 0.19134006503508472, + "grad_norm": 36.93289566040039, + "learning_rate": 6.349115801483172e-06, + "loss": 6.0492, + "step": 1118 + }, + { + "epoch": 0.19151120999486565, + "grad_norm": 33.92202377319336, + "learning_rate": 6.354820308043355e-06, + "loss": 5.9093, + "step": 1119 + }, + { + "epoch": 0.19168235495464658, + "grad_norm": 37.51108169555664, + "learning_rate": 6.360524814603537e-06, + "loss": 5.5156, + "step": 1120 + }, + { + "epoch": 0.19185349991442752, + "grad_norm": 23.369075775146484, + "learning_rate": 6.36622932116372e-06, + "loss": 3.9585, + "step": 1121 + }, + { + "epoch": 0.19202464487420845, + "grad_norm": 27.76898765563965, + "learning_rate": 6.371933827723901e-06, + "loss": 4.0578, + "step": 1122 + }, + { + "epoch": 0.19219578983398938, + "grad_norm": 21.719980239868164, + "learning_rate": 6.377638334284085e-06, + "loss": 1.6746, + "step": 1123 + }, + { + "epoch": 0.19236693479377032, + "grad_norm": 32.65765380859375, + "learning_rate": 6.383342840844267e-06, + "loss": 4.4355, + "step": 1124 + }, + { + "epoch": 0.19253807975355125, + "grad_norm": 31.302228927612305, + "learning_rate": 6.38904734740445e-06, + "loss": 4.3111, + "step": 1125 + }, + { + "epoch": 0.19270922471333218, + "grad_norm": 36.785396575927734, + "learning_rate": 6.394751853964632e-06, + "loss": 5.3737, + "step": 1126 + }, + { + "epoch": 0.19288036967311312, + "grad_norm": 32.185787200927734, + "learning_rate": 6.400456360524815e-06, + "loss": 4.2842, + "step": 1127 + }, + { + "epoch": 0.19305151463289405, + "grad_norm": 49.154666900634766, + "learning_rate": 6.4061608670849966e-06, + "loss": 8.8989, + "step": 1128 + }, + { + "epoch": 0.19322265959267498, + "grad_norm": 31.552207946777344, + "learning_rate": 6.41186537364518e-06, + "loss": 4.2685, + "step": 1129 + }, + { + "epoch": 0.19339380455245592, + "grad_norm": 21.41136932373047, + "learning_rate": 6.4175698802053625e-06, + "loss": 2.3051, + "step": 1130 + }, + { + "epoch": 0.19356494951223688, + "grad_norm": 13.525940895080566, + "learning_rate": 6.423274386765545e-06, + "loss": 2.1123, + "step": 1131 + }, + { + "epoch": 0.1937360944720178, + "grad_norm": 37.48530960083008, + "learning_rate": 6.4289788933257275e-06, + "loss": 4.8037, + "step": 1132 + }, + { + "epoch": 0.19390723943179874, + "grad_norm": 38.14132308959961, + "learning_rate": 6.43468339988591e-06, + "loss": 6.2294, + "step": 1133 + }, + { + "epoch": 0.19407838439157968, + "grad_norm": 33.01750183105469, + "learning_rate": 6.440387906446093e-06, + "loss": 4.9204, + "step": 1134 + }, + { + "epoch": 0.1942495293513606, + "grad_norm": 36.364158630371094, + "learning_rate": 6.446092413006275e-06, + "loss": 4.5797, + "step": 1135 + }, + { + "epoch": 0.19442067431114154, + "grad_norm": 46.81378173828125, + "learning_rate": 6.451796919566458e-06, + "loss": 6.538, + "step": 1136 + }, + { + "epoch": 0.19459181927092248, + "grad_norm": 23.135957717895508, + "learning_rate": 6.45750142612664e-06, + "loss": 4.3991, + "step": 1137 + }, + { + "epoch": 0.1947629642307034, + "grad_norm": 25.031917572021484, + "learning_rate": 6.463205932686823e-06, + "loss": 2.3886, + "step": 1138 + }, + { + "epoch": 0.19493410919048434, + "grad_norm": 35.31920623779297, + "learning_rate": 6.468910439247005e-06, + "loss": 6.0172, + "step": 1139 + }, + { + "epoch": 0.19510525415026528, + "grad_norm": 36.97047424316406, + "learning_rate": 6.474614945807188e-06, + "loss": 5.4822, + "step": 1140 + }, + { + "epoch": 0.1952763991100462, + "grad_norm": 31.77883529663086, + "learning_rate": 6.48031945236737e-06, + "loss": 4.7072, + "step": 1141 + }, + { + "epoch": 0.19544754406982714, + "grad_norm": 28.897930145263672, + "learning_rate": 6.486023958927553e-06, + "loss": 3.7105, + "step": 1142 + }, + { + "epoch": 0.19561868902960808, + "grad_norm": 29.99696922302246, + "learning_rate": 6.491728465487735e-06, + "loss": 4.5102, + "step": 1143 + }, + { + "epoch": 0.195789833989389, + "grad_norm": 25.783557891845703, + "learning_rate": 6.497432972047918e-06, + "loss": 3.6023, + "step": 1144 + }, + { + "epoch": 0.19596097894916994, + "grad_norm": 35.004642486572266, + "learning_rate": 6.5031374786081005e-06, + "loss": 4.1587, + "step": 1145 + }, + { + "epoch": 0.19613212390895088, + "grad_norm": 173.46754455566406, + "learning_rate": 6.508841985168284e-06, + "loss": 7.5547, + "step": 1146 + }, + { + "epoch": 0.1963032688687318, + "grad_norm": 18.749853134155273, + "learning_rate": 6.5145464917284655e-06, + "loss": 1.7298, + "step": 1147 + }, + { + "epoch": 0.19647441382851275, + "grad_norm": 31.15353012084961, + "learning_rate": 6.520250998288648e-06, + "loss": 5.4053, + "step": 1148 + }, + { + "epoch": 0.19664555878829368, + "grad_norm": 21.659912109375, + "learning_rate": 6.525955504848831e-06, + "loss": 1.8891, + "step": 1149 + }, + { + "epoch": 0.1968167037480746, + "grad_norm": 23.412139892578125, + "learning_rate": 6.531660011409013e-06, + "loss": 3.8619, + "step": 1150 + }, + { + "epoch": 0.19698784870785555, + "grad_norm": 22.16069221496582, + "learning_rate": 6.537364517969196e-06, + "loss": 2.0106, + "step": 1151 + }, + { + "epoch": 0.19715899366763648, + "grad_norm": 33.494136810302734, + "learning_rate": 6.543069024529379e-06, + "loss": 5.4958, + "step": 1152 + }, + { + "epoch": 0.1973301386274174, + "grad_norm": 32.96882629394531, + "learning_rate": 6.548773531089561e-06, + "loss": 4.5927, + "step": 1153 + }, + { + "epoch": 0.19750128358719835, + "grad_norm": 36.14384078979492, + "learning_rate": 6.554478037649743e-06, + "loss": 5.6357, + "step": 1154 + }, + { + "epoch": 0.19767242854697928, + "grad_norm": 23.875118255615234, + "learning_rate": 6.560182544209926e-06, + "loss": 3.158, + "step": 1155 + }, + { + "epoch": 0.19784357350676024, + "grad_norm": 23.001026153564453, + "learning_rate": 6.565887050770108e-06, + "loss": 1.8949, + "step": 1156 + }, + { + "epoch": 0.19801471846654117, + "grad_norm": 46.26600646972656, + "learning_rate": 6.571591557330292e-06, + "loss": 9.1329, + "step": 1157 + }, + { + "epoch": 0.1981858634263221, + "grad_norm": 16.32296371459961, + "learning_rate": 6.577296063890474e-06, + "loss": 1.5302, + "step": 1158 + }, + { + "epoch": 0.19835700838610304, + "grad_norm": 26.114614486694336, + "learning_rate": 6.583000570450656e-06, + "loss": 2.3763, + "step": 1159 + }, + { + "epoch": 0.19852815334588397, + "grad_norm": 37.42622756958008, + "learning_rate": 6.5887050770108384e-06, + "loss": 5.5999, + "step": 1160 + }, + { + "epoch": 0.1986992983056649, + "grad_norm": 21.48786735534668, + "learning_rate": 6.594409583571021e-06, + "loss": 3.4369, + "step": 1161 + }, + { + "epoch": 0.19887044326544584, + "grad_norm": 24.472808837890625, + "learning_rate": 6.6001140901312035e-06, + "loss": 2.0175, + "step": 1162 + }, + { + "epoch": 0.19904158822522677, + "grad_norm": 25.275909423828125, + "learning_rate": 6.605818596691387e-06, + "loss": 2.6992, + "step": 1163 + }, + { + "epoch": 0.1992127331850077, + "grad_norm": 29.439197540283203, + "learning_rate": 6.611523103251569e-06, + "loss": 4.4373, + "step": 1164 + }, + { + "epoch": 0.19938387814478864, + "grad_norm": 224.64663696289062, + "learning_rate": 6.617227609811751e-06, + "loss": 10.3737, + "step": 1165 + }, + { + "epoch": 0.19955502310456957, + "grad_norm": 34.043575286865234, + "learning_rate": 6.622932116371934e-06, + "loss": 5.0921, + "step": 1166 + }, + { + "epoch": 0.1997261680643505, + "grad_norm": 11.060107231140137, + "learning_rate": 6.628636622932116e-06, + "loss": 1.2996, + "step": 1167 + }, + { + "epoch": 0.19989731302413144, + "grad_norm": 32.19368362426758, + "learning_rate": 6.634341129492299e-06, + "loss": 4.2537, + "step": 1168 + }, + { + "epoch": 0.20006845798391237, + "grad_norm": 48.267578125, + "learning_rate": 6.640045636052482e-06, + "loss": 9.335, + "step": 1169 + }, + { + "epoch": 0.2002396029436933, + "grad_norm": 19.327762603759766, + "learning_rate": 6.645750142612665e-06, + "loss": 1.8859, + "step": 1170 + }, + { + "epoch": 0.20041074790347424, + "grad_norm": 28.81614875793457, + "learning_rate": 6.651454649172847e-06, + "loss": 3.8125, + "step": 1171 + }, + { + "epoch": 0.20058189286325517, + "grad_norm": 24.971960067749023, + "learning_rate": 6.657159155733029e-06, + "loss": 3.0816, + "step": 1172 + }, + { + "epoch": 0.2007530378230361, + "grad_norm": 154.4432373046875, + "learning_rate": 6.662863662293211e-06, + "loss": 8.568, + "step": 1173 + }, + { + "epoch": 0.20092418278281704, + "grad_norm": 47.04978942871094, + "learning_rate": 6.668568168853395e-06, + "loss": 5.1816, + "step": 1174 + }, + { + "epoch": 0.20109532774259797, + "grad_norm": 24.374345779418945, + "learning_rate": 6.674272675413577e-06, + "loss": 2.6078, + "step": 1175 + }, + { + "epoch": 0.2012664727023789, + "grad_norm": 36.597232818603516, + "learning_rate": 6.67997718197376e-06, + "loss": 5.5402, + "step": 1176 + }, + { + "epoch": 0.20143761766215984, + "grad_norm": 36.612060546875, + "learning_rate": 6.685681688533942e-06, + "loss": 5.17, + "step": 1177 + }, + { + "epoch": 0.20160876262194077, + "grad_norm": 39.452117919921875, + "learning_rate": 6.691386195094124e-06, + "loss": 6.2861, + "step": 1178 + }, + { + "epoch": 0.2017799075817217, + "grad_norm": 35.985816955566406, + "learning_rate": 6.6970907016543065e-06, + "loss": 5.7763, + "step": 1179 + }, + { + "epoch": 0.20195105254150264, + "grad_norm": 11.960805892944336, + "learning_rate": 6.70279520821449e-06, + "loss": 2.7312, + "step": 1180 + }, + { + "epoch": 0.20212219750128357, + "grad_norm": 154.7554168701172, + "learning_rate": 6.7084997147746724e-06, + "loss": 9.5806, + "step": 1181 + }, + { + "epoch": 0.20229334246106453, + "grad_norm": 31.713943481445312, + "learning_rate": 6.714204221334855e-06, + "loss": 4.9006, + "step": 1182 + }, + { + "epoch": 0.20246448742084547, + "grad_norm": 11.431591987609863, + "learning_rate": 6.7199087278950375e-06, + "loss": 3.1028, + "step": 1183 + }, + { + "epoch": 0.2026356323806264, + "grad_norm": 208.2880859375, + "learning_rate": 6.725613234455219e-06, + "loss": 8.5447, + "step": 1184 + }, + { + "epoch": 0.20280677734040733, + "grad_norm": 32.78763198852539, + "learning_rate": 6.731317741015402e-06, + "loss": 5.0437, + "step": 1185 + }, + { + "epoch": 0.20297792230018827, + "grad_norm": 31.15655517578125, + "learning_rate": 6.737022247575585e-06, + "loss": 4.1921, + "step": 1186 + }, + { + "epoch": 0.2031490672599692, + "grad_norm": 12.072607040405273, + "learning_rate": 6.742726754135768e-06, + "loss": 1.9291, + "step": 1187 + }, + { + "epoch": 0.20332021221975013, + "grad_norm": 46.76679992675781, + "learning_rate": 6.74843126069595e-06, + "loss": 9.0577, + "step": 1188 + }, + { + "epoch": 0.20349135717953107, + "grad_norm": 28.912738800048828, + "learning_rate": 6.754135767256133e-06, + "loss": 4.3274, + "step": 1189 + }, + { + "epoch": 0.203662502139312, + "grad_norm": 151.7112579345703, + "learning_rate": 6.759840273816315e-06, + "loss": 8.1049, + "step": 1190 + }, + { + "epoch": 0.20383364709909293, + "grad_norm": 19.557729721069336, + "learning_rate": 6.765544780376497e-06, + "loss": 1.6717, + "step": 1191 + }, + { + "epoch": 0.20400479205887387, + "grad_norm": 37.28075408935547, + "learning_rate": 6.77124928693668e-06, + "loss": 5.6393, + "step": 1192 + }, + { + "epoch": 0.2041759370186548, + "grad_norm": 33.639183044433594, + "learning_rate": 6.776953793496863e-06, + "loss": 4.9937, + "step": 1193 + }, + { + "epoch": 0.20434708197843574, + "grad_norm": 16.514705657958984, + "learning_rate": 6.782658300057045e-06, + "loss": 2.2396, + "step": 1194 + }, + { + "epoch": 0.20451822693821667, + "grad_norm": 29.29157066345215, + "learning_rate": 6.788362806617228e-06, + "loss": 4.5062, + "step": 1195 + }, + { + "epoch": 0.2046893718979976, + "grad_norm": 24.25420570373535, + "learning_rate": 6.79406731317741e-06, + "loss": 2.5282, + "step": 1196 + }, + { + "epoch": 0.20486051685777854, + "grad_norm": 21.87625503540039, + "learning_rate": 6.799771819737593e-06, + "loss": 2.2101, + "step": 1197 + }, + { + "epoch": 0.20503166181755947, + "grad_norm": 29.727163314819336, + "learning_rate": 6.8054763262977755e-06, + "loss": 3.5679, + "step": 1198 + }, + { + "epoch": 0.2052028067773404, + "grad_norm": 23.502267837524414, + "learning_rate": 6.811180832857958e-06, + "loss": 3.9821, + "step": 1199 + }, + { + "epoch": 0.20537395173712134, + "grad_norm": 31.961931228637695, + "learning_rate": 6.8168853394181405e-06, + "loss": 4.6, + "step": 1200 + }, + { + "epoch": 0.20554509669690227, + "grad_norm": 27.584300994873047, + "learning_rate": 6.822589845978323e-06, + "loss": 3.389, + "step": 1201 + }, + { + "epoch": 0.2057162416566832, + "grad_norm": 34.41096115112305, + "learning_rate": 6.828294352538506e-06, + "loss": 4.722, + "step": 1202 + }, + { + "epoch": 0.20588738661646414, + "grad_norm": 41.341312408447266, + "learning_rate": 6.833998859098688e-06, + "loss": 6.7225, + "step": 1203 + }, + { + "epoch": 0.20605853157624507, + "grad_norm": 160.5906982421875, + "learning_rate": 6.839703365658871e-06, + "loss": 9.8412, + "step": 1204 + }, + { + "epoch": 0.206229676536026, + "grad_norm": 23.49472999572754, + "learning_rate": 6.845407872219053e-06, + "loss": 3.6378, + "step": 1205 + }, + { + "epoch": 0.20640082149580694, + "grad_norm": 31.307947158813477, + "learning_rate": 6.851112378779236e-06, + "loss": 3.6813, + "step": 1206 + }, + { + "epoch": 0.2065719664555879, + "grad_norm": 27.893850326538086, + "learning_rate": 6.856816885339418e-06, + "loss": 4.5216, + "step": 1207 + }, + { + "epoch": 0.20674311141536883, + "grad_norm": 32.200157165527344, + "learning_rate": 6.862521391899601e-06, + "loss": 4.5525, + "step": 1208 + }, + { + "epoch": 0.20691425637514976, + "grad_norm": 31.765216827392578, + "learning_rate": 6.868225898459783e-06, + "loss": 5.2865, + "step": 1209 + }, + { + "epoch": 0.2070854013349307, + "grad_norm": 35.562294006347656, + "learning_rate": 6.873930405019966e-06, + "loss": 5.0758, + "step": 1210 + }, + { + "epoch": 0.20725654629471163, + "grad_norm": 44.582786560058594, + "learning_rate": 6.879634911580148e-06, + "loss": 8.7973, + "step": 1211 + }, + { + "epoch": 0.20742769125449256, + "grad_norm": 29.667964935302734, + "learning_rate": 6.885339418140331e-06, + "loss": 3.7483, + "step": 1212 + }, + { + "epoch": 0.2075988362142735, + "grad_norm": 33.826454162597656, + "learning_rate": 6.8910439247005135e-06, + "loss": 5.321, + "step": 1213 + }, + { + "epoch": 0.20776998117405443, + "grad_norm": 36.56757354736328, + "learning_rate": 6.896748431260697e-06, + "loss": 4.6366, + "step": 1214 + }, + { + "epoch": 0.20794112613383536, + "grad_norm": 21.483030319213867, + "learning_rate": 6.9024529378208785e-06, + "loss": 1.7844, + "step": 1215 + }, + { + "epoch": 0.2081122710936163, + "grad_norm": 22.398630142211914, + "learning_rate": 6.908157444381061e-06, + "loss": 2.9002, + "step": 1216 + }, + { + "epoch": 0.20828341605339723, + "grad_norm": 16.41680145263672, + "learning_rate": 6.913861950941244e-06, + "loss": 1.5466, + "step": 1217 + }, + { + "epoch": 0.20845456101317816, + "grad_norm": 22.448949813842773, + "learning_rate": 6.919566457501426e-06, + "loss": 3.4011, + "step": 1218 + }, + { + "epoch": 0.2086257059729591, + "grad_norm": 35.074989318847656, + "learning_rate": 6.925270964061609e-06, + "loss": 4.4769, + "step": 1219 + }, + { + "epoch": 0.20879685093274003, + "grad_norm": 29.737442016601562, + "learning_rate": 6.930975470621792e-06, + "loss": 4.6152, + "step": 1220 + }, + { + "epoch": 0.20896799589252096, + "grad_norm": 29.097299575805664, + "learning_rate": 6.9366799771819746e-06, + "loss": 3.8591, + "step": 1221 + }, + { + "epoch": 0.2091391408523019, + "grad_norm": 22.356008529663086, + "learning_rate": 6.942384483742156e-06, + "loss": 3.6379, + "step": 1222 + }, + { + "epoch": 0.20931028581208283, + "grad_norm": 29.412656784057617, + "learning_rate": 6.948088990302339e-06, + "loss": 3.5976, + "step": 1223 + }, + { + "epoch": 0.20948143077186376, + "grad_norm": 19.5412654876709, + "learning_rate": 6.953793496862521e-06, + "loss": 2.0718, + "step": 1224 + }, + { + "epoch": 0.2096525757316447, + "grad_norm": 17.43561363220215, + "learning_rate": 6.959498003422704e-06, + "loss": 1.5389, + "step": 1225 + }, + { + "epoch": 0.20982372069142563, + "grad_norm": 34.85890579223633, + "learning_rate": 6.965202509982887e-06, + "loss": 4.4105, + "step": 1226 + }, + { + "epoch": 0.20999486565120656, + "grad_norm": 33.83147430419922, + "learning_rate": 6.97090701654307e-06, + "loss": 4.108, + "step": 1227 + }, + { + "epoch": 0.2101660106109875, + "grad_norm": 33.77149963378906, + "learning_rate": 6.9766115231032514e-06, + "loss": 4.4198, + "step": 1228 + }, + { + "epoch": 0.21033715557076843, + "grad_norm": 12.30455207824707, + "learning_rate": 6.982316029663434e-06, + "loss": 1.7759, + "step": 1229 + }, + { + "epoch": 0.21050830053054936, + "grad_norm": 34.55380630493164, + "learning_rate": 6.9880205362236165e-06, + "loss": 4.4813, + "step": 1230 + }, + { + "epoch": 0.2106794454903303, + "grad_norm": 23.975025177001953, + "learning_rate": 6.993725042783799e-06, + "loss": 3.728, + "step": 1231 + }, + { + "epoch": 0.21085059045011123, + "grad_norm": 190.6012725830078, + "learning_rate": 6.999429549343982e-06, + "loss": 10.1602, + "step": 1232 + }, + { + "epoch": 0.2110217354098922, + "grad_norm": 34.527076721191406, + "learning_rate": 7.005134055904165e-06, + "loss": 4.7483, + "step": 1233 + }, + { + "epoch": 0.21119288036967312, + "grad_norm": 35.65943908691406, + "learning_rate": 7.010838562464347e-06, + "loss": 5.5499, + "step": 1234 + }, + { + "epoch": 0.21136402532945406, + "grad_norm": 34.03565216064453, + "learning_rate": 7.016543069024529e-06, + "loss": 4.7829, + "step": 1235 + }, + { + "epoch": 0.211535170289235, + "grad_norm": 20.10201072692871, + "learning_rate": 7.022247575584712e-06, + "loss": 2.9853, + "step": 1236 + }, + { + "epoch": 0.21170631524901593, + "grad_norm": 72.77118682861328, + "learning_rate": 7.027952082144895e-06, + "loss": 6.8184, + "step": 1237 + }, + { + "epoch": 0.21187746020879686, + "grad_norm": 32.084381103515625, + "learning_rate": 7.033656588705078e-06, + "loss": 5.0572, + "step": 1238 + }, + { + "epoch": 0.2120486051685778, + "grad_norm": 28.180423736572266, + "learning_rate": 7.03936109526526e-06, + "loss": 3.8185, + "step": 1239 + }, + { + "epoch": 0.21221975012835873, + "grad_norm": 20.687843322753906, + "learning_rate": 7.045065601825443e-06, + "loss": 2.1643, + "step": 1240 + }, + { + "epoch": 0.21239089508813966, + "grad_norm": 15.380537033081055, + "learning_rate": 7.050770108385624e-06, + "loss": 1.6453, + "step": 1241 + }, + { + "epoch": 0.2125620400479206, + "grad_norm": 38.16814422607422, + "learning_rate": 7.056474614945807e-06, + "loss": 5.8775, + "step": 1242 + }, + { + "epoch": 0.21273318500770153, + "grad_norm": 43.55405807495117, + "learning_rate": 7.06217912150599e-06, + "loss": 5.1528, + "step": 1243 + }, + { + "epoch": 0.21290432996748246, + "grad_norm": 30.40400505065918, + "learning_rate": 7.067883628066173e-06, + "loss": 4.155, + "step": 1244 + }, + { + "epoch": 0.2130754749272634, + "grad_norm": 39.55487823486328, + "learning_rate": 7.073588134626355e-06, + "loss": 6.8649, + "step": 1245 + }, + { + "epoch": 0.21324661988704433, + "grad_norm": 46.886600494384766, + "learning_rate": 7.079292641186538e-06, + "loss": 4.8251, + "step": 1246 + }, + { + "epoch": 0.21341776484682526, + "grad_norm": 35.842594146728516, + "learning_rate": 7.0849971477467195e-06, + "loss": 5.3382, + "step": 1247 + }, + { + "epoch": 0.2135889098066062, + "grad_norm": 10.459444999694824, + "learning_rate": 7.090701654306902e-06, + "loss": 1.1781, + "step": 1248 + }, + { + "epoch": 0.21376005476638713, + "grad_norm": 31.134531021118164, + "learning_rate": 7.0964061608670854e-06, + "loss": 3.3419, + "step": 1249 + }, + { + "epoch": 0.21393119972616806, + "grad_norm": 32.50645065307617, + "learning_rate": 7.102110667427268e-06, + "loss": 4.1592, + "step": 1250 + }, + { + "epoch": 0.214102344685949, + "grad_norm": 38.065643310546875, + "learning_rate": 7.1078151739874505e-06, + "loss": 6.1903, + "step": 1251 + }, + { + "epoch": 0.21427348964572993, + "grad_norm": 32.13066482543945, + "learning_rate": 7.113519680547633e-06, + "loss": 3.8917, + "step": 1252 + }, + { + "epoch": 0.21444463460551086, + "grad_norm": 22.333932876586914, + "learning_rate": 7.119224187107815e-06, + "loss": 3.308, + "step": 1253 + }, + { + "epoch": 0.2146157795652918, + "grad_norm": 8.437789916992188, + "learning_rate": 7.124928693667997e-06, + "loss": 2.2375, + "step": 1254 + }, + { + "epoch": 0.21478692452507273, + "grad_norm": 32.72603225708008, + "learning_rate": 7.130633200228181e-06, + "loss": 4.8237, + "step": 1255 + }, + { + "epoch": 0.21495806948485366, + "grad_norm": 34.640647888183594, + "learning_rate": 7.136337706788363e-06, + "loss": 5.2757, + "step": 1256 + }, + { + "epoch": 0.2151292144446346, + "grad_norm": 20.100618362426758, + "learning_rate": 7.142042213348546e-06, + "loss": 2.961, + "step": 1257 + }, + { + "epoch": 0.21530035940441555, + "grad_norm": 43.29427719116211, + "learning_rate": 7.147746719908728e-06, + "loss": 8.933, + "step": 1258 + }, + { + "epoch": 0.2154715043641965, + "grad_norm": 33.56546401977539, + "learning_rate": 7.15345122646891e-06, + "loss": 4.6558, + "step": 1259 + }, + { + "epoch": 0.21564264932397742, + "grad_norm": 33.7791633605957, + "learning_rate": 7.159155733029093e-06, + "loss": 4.183, + "step": 1260 + }, + { + "epoch": 0.21581379428375835, + "grad_norm": 33.235233306884766, + "learning_rate": 7.164860239589276e-06, + "loss": 3.7487, + "step": 1261 + }, + { + "epoch": 0.2159849392435393, + "grad_norm": 140.30621337890625, + "learning_rate": 7.170564746149458e-06, + "loss": 9.0381, + "step": 1262 + }, + { + "epoch": 0.21615608420332022, + "grad_norm": 20.70719337463379, + "learning_rate": 7.176269252709641e-06, + "loss": 1.7769, + "step": 1263 + }, + { + "epoch": 0.21632722916310115, + "grad_norm": 36.93478012084961, + "learning_rate": 7.181973759269823e-06, + "loss": 4.5665, + "step": 1264 + }, + { + "epoch": 0.2164983741228821, + "grad_norm": 81.26618957519531, + "learning_rate": 7.187678265830006e-06, + "loss": 7.0141, + "step": 1265 + }, + { + "epoch": 0.21666951908266302, + "grad_norm": 33.15439224243164, + "learning_rate": 7.1933827723901885e-06, + "loss": 4.5814, + "step": 1266 + }, + { + "epoch": 0.21684066404244395, + "grad_norm": 26.268171310424805, + "learning_rate": 7.199087278950371e-06, + "loss": 3.0891, + "step": 1267 + }, + { + "epoch": 0.2170118090022249, + "grad_norm": 35.35780715942383, + "learning_rate": 7.2047917855105535e-06, + "loss": 4.8355, + "step": 1268 + }, + { + "epoch": 0.21718295396200582, + "grad_norm": 21.87150764465332, + "learning_rate": 7.210496292070736e-06, + "loss": 1.7614, + "step": 1269 + }, + { + "epoch": 0.21735409892178675, + "grad_norm": 36.49989318847656, + "learning_rate": 7.216200798630919e-06, + "loss": 5.8824, + "step": 1270 + }, + { + "epoch": 0.2175252438815677, + "grad_norm": 11.613662719726562, + "learning_rate": 7.221905305191101e-06, + "loss": 1.7057, + "step": 1271 + }, + { + "epoch": 0.21769638884134862, + "grad_norm": 28.447458267211914, + "learning_rate": 7.227609811751284e-06, + "loss": 4.3815, + "step": 1272 + }, + { + "epoch": 0.21786753380112955, + "grad_norm": 34.95615005493164, + "learning_rate": 7.233314318311466e-06, + "loss": 4.7223, + "step": 1273 + }, + { + "epoch": 0.2180386787609105, + "grad_norm": 36.12034606933594, + "learning_rate": 7.239018824871649e-06, + "loss": 5.4639, + "step": 1274 + }, + { + "epoch": 0.21820982372069142, + "grad_norm": 29.200042724609375, + "learning_rate": 7.244723331431831e-06, + "loss": 3.9203, + "step": 1275 + }, + { + "epoch": 0.21838096868047235, + "grad_norm": 173.54055786132812, + "learning_rate": 7.250427837992014e-06, + "loss": 9.2819, + "step": 1276 + }, + { + "epoch": 0.2185521136402533, + "grad_norm": 30.67865562438965, + "learning_rate": 7.256132344552197e-06, + "loss": 4.7412, + "step": 1277 + }, + { + "epoch": 0.21872325860003422, + "grad_norm": 35.703468322753906, + "learning_rate": 7.261836851112379e-06, + "loss": 5.3418, + "step": 1278 + }, + { + "epoch": 0.21889440355981515, + "grad_norm": 35.29546356201172, + "learning_rate": 7.267541357672561e-06, + "loss": 5.1735, + "step": 1279 + }, + { + "epoch": 0.2190655485195961, + "grad_norm": 20.382551193237305, + "learning_rate": 7.273245864232744e-06, + "loss": 1.8851, + "step": 1280 + }, + { + "epoch": 0.21923669347937702, + "grad_norm": 20.68045997619629, + "learning_rate": 7.2789503707929265e-06, + "loss": 2.681, + "step": 1281 + }, + { + "epoch": 0.21940783843915795, + "grad_norm": 37.52497482299805, + "learning_rate": 7.284654877353109e-06, + "loss": 5.9113, + "step": 1282 + }, + { + "epoch": 0.21957898339893892, + "grad_norm": 154.6285858154297, + "learning_rate": 7.290359383913292e-06, + "loss": 8.0077, + "step": 1283 + }, + { + "epoch": 0.21975012835871985, + "grad_norm": 28.380836486816406, + "learning_rate": 7.296063890473474e-06, + "loss": 3.5758, + "step": 1284 + }, + { + "epoch": 0.21992127331850078, + "grad_norm": 13.987469673156738, + "learning_rate": 7.301768397033657e-06, + "loss": 1.4051, + "step": 1285 + }, + { + "epoch": 0.22009241827828172, + "grad_norm": 21.18030548095703, + "learning_rate": 7.307472903593839e-06, + "loss": 3.1844, + "step": 1286 + }, + { + "epoch": 0.22026356323806265, + "grad_norm": 13.61611270904541, + "learning_rate": 7.313177410154022e-06, + "loss": 1.4008, + "step": 1287 + }, + { + "epoch": 0.22043470819784358, + "grad_norm": 32.63056182861328, + "learning_rate": 7.318881916714204e-06, + "loss": 5.485, + "step": 1288 + }, + { + "epoch": 0.22060585315762452, + "grad_norm": 12.39704704284668, + "learning_rate": 7.3245864232743876e-06, + "loss": 2.8362, + "step": 1289 + }, + { + "epoch": 0.22077699811740545, + "grad_norm": 160.39300537109375, + "learning_rate": 7.33029092983457e-06, + "loss": 9.3207, + "step": 1290 + }, + { + "epoch": 0.22094814307718638, + "grad_norm": 35.63487243652344, + "learning_rate": 7.335995436394752e-06, + "loss": 4.3364, + "step": 1291 + }, + { + "epoch": 0.22111928803696732, + "grad_norm": 18.865745544433594, + "learning_rate": 7.341699942954934e-06, + "loss": 1.9152, + "step": 1292 + }, + { + "epoch": 0.22129043299674825, + "grad_norm": 34.95203399658203, + "learning_rate": 7.347404449515117e-06, + "loss": 4.2394, + "step": 1293 + }, + { + "epoch": 0.22146157795652918, + "grad_norm": 32.99889373779297, + "learning_rate": 7.353108956075299e-06, + "loss": 5.7603, + "step": 1294 + }, + { + "epoch": 0.22163272291631012, + "grad_norm": 31.541820526123047, + "learning_rate": 7.358813462635483e-06, + "loss": 4.7464, + "step": 1295 + }, + { + "epoch": 0.22180386787609105, + "grad_norm": 22.86473274230957, + "learning_rate": 7.364517969195665e-06, + "loss": 3.2885, + "step": 1296 + }, + { + "epoch": 0.22197501283587198, + "grad_norm": 34.75326919555664, + "learning_rate": 7.370222475755847e-06, + "loss": 4.4337, + "step": 1297 + }, + { + "epoch": 0.22214615779565292, + "grad_norm": 33.42300796508789, + "learning_rate": 7.3759269823160295e-06, + "loss": 4.8641, + "step": 1298 + }, + { + "epoch": 0.22231730275543385, + "grad_norm": 40.14048385620117, + "learning_rate": 7.381631488876212e-06, + "loss": 5.3092, + "step": 1299 + }, + { + "epoch": 0.22248844771521478, + "grad_norm": 33.59206008911133, + "learning_rate": 7.387335995436395e-06, + "loss": 4.6114, + "step": 1300 + }, + { + "epoch": 0.22265959267499572, + "grad_norm": 32.96902084350586, + "learning_rate": 7.393040501996578e-06, + "loss": 4.9559, + "step": 1301 + }, + { + "epoch": 0.22283073763477665, + "grad_norm": 76.84076690673828, + "learning_rate": 7.3987450085567605e-06, + "loss": 7.2409, + "step": 1302 + }, + { + "epoch": 0.22300188259455758, + "grad_norm": 29.227497100830078, + "learning_rate": 7.404449515116942e-06, + "loss": 3.4494, + "step": 1303 + }, + { + "epoch": 0.22317302755433852, + "grad_norm": 34.10039520263672, + "learning_rate": 7.410154021677125e-06, + "loss": 4.6513, + "step": 1304 + }, + { + "epoch": 0.22334417251411945, + "grad_norm": 43.62645721435547, + "learning_rate": 7.415858528237307e-06, + "loss": 6.1141, + "step": 1305 + }, + { + "epoch": 0.22351531747390038, + "grad_norm": 29.59916877746582, + "learning_rate": 7.421563034797491e-06, + "loss": 4.5189, + "step": 1306 + }, + { + "epoch": 0.22368646243368132, + "grad_norm": 32.00434494018555, + "learning_rate": 7.427267541357673e-06, + "loss": 3.7625, + "step": 1307 + }, + { + "epoch": 0.22385760739346225, + "grad_norm": 12.214600563049316, + "learning_rate": 7.432972047917856e-06, + "loss": 1.6093, + "step": 1308 + }, + { + "epoch": 0.2240287523532432, + "grad_norm": 13.289321899414062, + "learning_rate": 7.438676554478037e-06, + "loss": 1.8433, + "step": 1309 + }, + { + "epoch": 0.22419989731302414, + "grad_norm": 12.391509056091309, + "learning_rate": 7.44438106103822e-06, + "loss": 1.8211, + "step": 1310 + }, + { + "epoch": 0.22437104227280508, + "grad_norm": 31.827852249145508, + "learning_rate": 7.450085567598402e-06, + "loss": 3.5607, + "step": 1311 + }, + { + "epoch": 0.224542187232586, + "grad_norm": 172.93185424804688, + "learning_rate": 7.455790074158586e-06, + "loss": 9.5445, + "step": 1312 + }, + { + "epoch": 0.22471333219236694, + "grad_norm": 18.688396453857422, + "learning_rate": 7.461494580718768e-06, + "loss": 1.9759, + "step": 1313 + }, + { + "epoch": 0.22488447715214788, + "grad_norm": 26.364185333251953, + "learning_rate": 7.467199087278951e-06, + "loss": 3.2682, + "step": 1314 + }, + { + "epoch": 0.2250556221119288, + "grad_norm": 8.573413848876953, + "learning_rate": 7.472903593839133e-06, + "loss": 1.3051, + "step": 1315 + }, + { + "epoch": 0.22522676707170974, + "grad_norm": 24.913686752319336, + "learning_rate": 7.478608100399315e-06, + "loss": 2.4598, + "step": 1316 + }, + { + "epoch": 0.22539791203149068, + "grad_norm": 30.283504486083984, + "learning_rate": 7.4843126069594984e-06, + "loss": 4.1503, + "step": 1317 + }, + { + "epoch": 0.2255690569912716, + "grad_norm": 18.146724700927734, + "learning_rate": 7.490017113519681e-06, + "loss": 1.8957, + "step": 1318 + }, + { + "epoch": 0.22574020195105254, + "grad_norm": 11.016623497009277, + "learning_rate": 7.4957216200798635e-06, + "loss": 2.636, + "step": 1319 + }, + { + "epoch": 0.22591134691083348, + "grad_norm": 35.766883850097656, + "learning_rate": 7.501426126640046e-06, + "loss": 4.3588, + "step": 1320 + }, + { + "epoch": 0.2260824918706144, + "grad_norm": 24.76753807067871, + "learning_rate": 7.5071306332002286e-06, + "loss": 3.2106, + "step": 1321 + }, + { + "epoch": 0.22625363683039534, + "grad_norm": 35.969505310058594, + "learning_rate": 7.51283513976041e-06, + "loss": 4.5488, + "step": 1322 + }, + { + "epoch": 0.22642478179017628, + "grad_norm": 13.215656280517578, + "learning_rate": 7.518539646320593e-06, + "loss": 1.7273, + "step": 1323 + }, + { + "epoch": 0.2265959267499572, + "grad_norm": 32.75537872314453, + "learning_rate": 7.524244152880775e-06, + "loss": 4.442, + "step": 1324 + }, + { + "epoch": 0.22676707170973814, + "grad_norm": 13.069498062133789, + "learning_rate": 7.529948659440958e-06, + "loss": 1.2864, + "step": 1325 + }, + { + "epoch": 0.22693821666951908, + "grad_norm": 29.5541934967041, + "learning_rate": 7.535653166001142e-06, + "loss": 3.5993, + "step": 1326 + }, + { + "epoch": 0.2271093616293, + "grad_norm": 36.506736755371094, + "learning_rate": 7.541357672561325e-06, + "loss": 4.7108, + "step": 1327 + }, + { + "epoch": 0.22728050658908094, + "grad_norm": 30.510953903198242, + "learning_rate": 7.547062179121506e-06, + "loss": 4.168, + "step": 1328 + }, + { + "epoch": 0.22745165154886188, + "grad_norm": 11.754740715026855, + "learning_rate": 7.552766685681689e-06, + "loss": 2.7865, + "step": 1329 + }, + { + "epoch": 0.2276227965086428, + "grad_norm": 31.793643951416016, + "learning_rate": 7.558471192241871e-06, + "loss": 3.4931, + "step": 1330 + }, + { + "epoch": 0.22779394146842374, + "grad_norm": 23.95293426513672, + "learning_rate": 7.564175698802054e-06, + "loss": 3.0252, + "step": 1331 + }, + { + "epoch": 0.22796508642820468, + "grad_norm": 28.809511184692383, + "learning_rate": 7.569880205362236e-06, + "loss": 4.2144, + "step": 1332 + }, + { + "epoch": 0.2281362313879856, + "grad_norm": 34.645267486572266, + "learning_rate": 7.575584711922419e-06, + "loss": 4.5155, + "step": 1333 + }, + { + "epoch": 0.22830737634776657, + "grad_norm": 31.90658950805664, + "learning_rate": 7.581289218482601e-06, + "loss": 3.6445, + "step": 1334 + }, + { + "epoch": 0.2284785213075475, + "grad_norm": 26.37479591369629, + "learning_rate": 7.586993725042783e-06, + "loss": 2.6728, + "step": 1335 + }, + { + "epoch": 0.22864966626732844, + "grad_norm": 29.64954376220703, + "learning_rate": 7.592698231602966e-06, + "loss": 4.0421, + "step": 1336 + }, + { + "epoch": 0.22882081122710937, + "grad_norm": 28.596891403198242, + "learning_rate": 7.59840273816315e-06, + "loss": 3.3059, + "step": 1337 + }, + { + "epoch": 0.2289919561868903, + "grad_norm": 36.07052993774414, + "learning_rate": 7.6041072447233325e-06, + "loss": 4.2618, + "step": 1338 + }, + { + "epoch": 0.22916310114667124, + "grad_norm": 50.589454650878906, + "learning_rate": 7.609811751283515e-06, + "loss": 9.3326, + "step": 1339 + }, + { + "epoch": 0.22933424610645217, + "grad_norm": 31.4276180267334, + "learning_rate": 7.6155162578436975e-06, + "loss": 4.6035, + "step": 1340 + }, + { + "epoch": 0.2295053910662331, + "grad_norm": 32.5452766418457, + "learning_rate": 7.621220764403879e-06, + "loss": 3.9264, + "step": 1341 + }, + { + "epoch": 0.22967653602601404, + "grad_norm": 32.74778747558594, + "learning_rate": 7.626925270964062e-06, + "loss": 4.6618, + "step": 1342 + }, + { + "epoch": 0.22984768098579497, + "grad_norm": 11.447990417480469, + "learning_rate": 7.632629777524244e-06, + "loss": 1.2467, + "step": 1343 + }, + { + "epoch": 0.2300188259455759, + "grad_norm": 19.261301040649414, + "learning_rate": 7.638334284084426e-06, + "loss": 1.4108, + "step": 1344 + }, + { + "epoch": 0.23018997090535684, + "grad_norm": 17.838138580322266, + "learning_rate": 7.64403879064461e-06, + "loss": 1.4796, + "step": 1345 + }, + { + "epoch": 0.23036111586513777, + "grad_norm": 36.09761047363281, + "learning_rate": 7.649743297204791e-06, + "loss": 4.8769, + "step": 1346 + }, + { + "epoch": 0.2305322608249187, + "grad_norm": 17.18463706970215, + "learning_rate": 7.655447803764974e-06, + "loss": 1.6009, + "step": 1347 + }, + { + "epoch": 0.23070340578469964, + "grad_norm": 20.603784561157227, + "learning_rate": 7.661152310325156e-06, + "loss": 3.0856, + "step": 1348 + }, + { + "epoch": 0.23087455074448057, + "grad_norm": 41.716590881347656, + "learning_rate": 7.666856816885341e-06, + "loss": 5.4047, + "step": 1349 + }, + { + "epoch": 0.2310456957042615, + "grad_norm": 181.26748657226562, + "learning_rate": 7.672561323445523e-06, + "loss": 8.5903, + "step": 1350 + }, + { + "epoch": 0.23121684066404244, + "grad_norm": 41.98673629760742, + "learning_rate": 7.678265830005705e-06, + "loss": 5.2161, + "step": 1351 + }, + { + "epoch": 0.23138798562382337, + "grad_norm": 35.29446792602539, + "learning_rate": 7.683970336565888e-06, + "loss": 4.2135, + "step": 1352 + }, + { + "epoch": 0.2315591305836043, + "grad_norm": 164.35977172851562, + "learning_rate": 7.68967484312607e-06, + "loss": 7.3687, + "step": 1353 + }, + { + "epoch": 0.23173027554338524, + "grad_norm": 20.39377784729004, + "learning_rate": 7.695379349686253e-06, + "loss": 1.6669, + "step": 1354 + }, + { + "epoch": 0.23190142050316617, + "grad_norm": 33.71407699584961, + "learning_rate": 7.701083856246435e-06, + "loss": 4.5662, + "step": 1355 + }, + { + "epoch": 0.2320725654629471, + "grad_norm": 9.964597702026367, + "learning_rate": 7.706788362806616e-06, + "loss": 2.2199, + "step": 1356 + }, + { + "epoch": 0.23224371042272804, + "grad_norm": 41.83567810058594, + "learning_rate": 7.7124928693668e-06, + "loss": 5.3555, + "step": 1357 + }, + { + "epoch": 0.23241485538250897, + "grad_norm": 19.700429916381836, + "learning_rate": 7.718197375926981e-06, + "loss": 1.6864, + "step": 1358 + }, + { + "epoch": 0.2325860003422899, + "grad_norm": 32.94630432128906, + "learning_rate": 7.723901882487165e-06, + "loss": 3.5872, + "step": 1359 + }, + { + "epoch": 0.23275714530207087, + "grad_norm": 26.41133689880371, + "learning_rate": 7.729606389047348e-06, + "loss": 3.5806, + "step": 1360 + }, + { + "epoch": 0.2329282902618518, + "grad_norm": 17.184593200683594, + "learning_rate": 7.735310895607532e-06, + "loss": 1.6527, + "step": 1361 + }, + { + "epoch": 0.23309943522163273, + "grad_norm": 11.024751663208008, + "learning_rate": 7.741015402167713e-06, + "loss": 1.2203, + "step": 1362 + }, + { + "epoch": 0.23327058018141367, + "grad_norm": 35.2708625793457, + "learning_rate": 7.746719908727895e-06, + "loss": 4.5728, + "step": 1363 + }, + { + "epoch": 0.2334417251411946, + "grad_norm": 35.836387634277344, + "learning_rate": 7.752424415288078e-06, + "loss": 4.9165, + "step": 1364 + }, + { + "epoch": 0.23361287010097553, + "grad_norm": 24.741012573242188, + "learning_rate": 7.75812892184826e-06, + "loss": 2.2991, + "step": 1365 + }, + { + "epoch": 0.23378401506075647, + "grad_norm": 41.604007720947266, + "learning_rate": 7.763833428408443e-06, + "loss": 4.7384, + "step": 1366 + }, + { + "epoch": 0.2339551600205374, + "grad_norm": 37.068485260009766, + "learning_rate": 7.769537934968625e-06, + "loss": 4.1609, + "step": 1367 + }, + { + "epoch": 0.23412630498031833, + "grad_norm": 31.635995864868164, + "learning_rate": 7.775242441528808e-06, + "loss": 3.6394, + "step": 1368 + }, + { + "epoch": 0.23429744994009927, + "grad_norm": 36.181602478027344, + "learning_rate": 7.78094694808899e-06, + "loss": 3.9604, + "step": 1369 + }, + { + "epoch": 0.2344685948998802, + "grad_norm": 34.47708511352539, + "learning_rate": 7.786651454649172e-06, + "loss": 4.4621, + "step": 1370 + }, + { + "epoch": 0.23463973985966113, + "grad_norm": 36.583919525146484, + "learning_rate": 7.792355961209355e-06, + "loss": 5.4214, + "step": 1371 + }, + { + "epoch": 0.23481088481944207, + "grad_norm": 139.80113220214844, + "learning_rate": 7.798060467769539e-06, + "loss": 7.582, + "step": 1372 + }, + { + "epoch": 0.234982029779223, + "grad_norm": 10.627038955688477, + "learning_rate": 7.803764974329722e-06, + "loss": 1.1265, + "step": 1373 + }, + { + "epoch": 0.23515317473900393, + "grad_norm": 56.01224899291992, + "learning_rate": 7.809469480889904e-06, + "loss": 9.2401, + "step": 1374 + }, + { + "epoch": 0.23532431969878487, + "grad_norm": 13.42536449432373, + "learning_rate": 7.815173987450085e-06, + "loss": 1.3022, + "step": 1375 + }, + { + "epoch": 0.2354954646585658, + "grad_norm": 34.816341400146484, + "learning_rate": 7.820878494010269e-06, + "loss": 4.6249, + "step": 1376 + }, + { + "epoch": 0.23566660961834673, + "grad_norm": 13.037670135498047, + "learning_rate": 7.82658300057045e-06, + "loss": 1.5747, + "step": 1377 + }, + { + "epoch": 0.23583775457812767, + "grad_norm": 38.446537017822266, + "learning_rate": 7.832287507130634e-06, + "loss": 4.9983, + "step": 1378 + }, + { + "epoch": 0.2360088995379086, + "grad_norm": 32.81908416748047, + "learning_rate": 7.837992013690815e-06, + "loss": 3.4363, + "step": 1379 + }, + { + "epoch": 0.23618004449768953, + "grad_norm": 12.17697525024414, + "learning_rate": 7.843696520250999e-06, + "loss": 1.6211, + "step": 1380 + }, + { + "epoch": 0.23635118945747047, + "grad_norm": 35.46131896972656, + "learning_rate": 7.84940102681118e-06, + "loss": 4.8981, + "step": 1381 + }, + { + "epoch": 0.2365223344172514, + "grad_norm": 29.793787002563477, + "learning_rate": 7.855105533371362e-06, + "loss": 3.5648, + "step": 1382 + }, + { + "epoch": 0.23669347937703233, + "grad_norm": 14.550475120544434, + "learning_rate": 7.860810039931547e-06, + "loss": 1.6714, + "step": 1383 + }, + { + "epoch": 0.23686462433681327, + "grad_norm": 36.01753234863281, + "learning_rate": 7.866514546491729e-06, + "loss": 4.936, + "step": 1384 + }, + { + "epoch": 0.23703576929659423, + "grad_norm": 21.261749267578125, + "learning_rate": 7.872219053051912e-06, + "loss": 2.3239, + "step": 1385 + }, + { + "epoch": 0.23720691425637516, + "grad_norm": 160.96620178222656, + "learning_rate": 7.877923559612094e-06, + "loss": 7.9267, + "step": 1386 + }, + { + "epoch": 0.2373780592161561, + "grad_norm": 34.994293212890625, + "learning_rate": 7.883628066172276e-06, + "loss": 4.6021, + "step": 1387 + }, + { + "epoch": 0.23754920417593703, + "grad_norm": 32.08713912963867, + "learning_rate": 7.889332572732459e-06, + "loss": 4.0803, + "step": 1388 + }, + { + "epoch": 0.23772034913571796, + "grad_norm": 36.49545669555664, + "learning_rate": 7.89503707929264e-06, + "loss": 4.4858, + "step": 1389 + }, + { + "epoch": 0.2378914940954989, + "grad_norm": 146.2379608154297, + "learning_rate": 7.900741585852824e-06, + "loss": 8.1082, + "step": 1390 + }, + { + "epoch": 0.23806263905527983, + "grad_norm": 31.705169677734375, + "learning_rate": 7.906446092413006e-06, + "loss": 4.1572, + "step": 1391 + }, + { + "epoch": 0.23823378401506076, + "grad_norm": 13.439140319824219, + "learning_rate": 7.91215059897319e-06, + "loss": 1.1091, + "step": 1392 + }, + { + "epoch": 0.2384049289748417, + "grad_norm": 51.37181854248047, + "learning_rate": 7.91785510553337e-06, + "loss": 9.8544, + "step": 1393 + }, + { + "epoch": 0.23857607393462263, + "grad_norm": 16.763200759887695, + "learning_rate": 7.923559612093553e-06, + "loss": 1.4605, + "step": 1394 + }, + { + "epoch": 0.23874721889440356, + "grad_norm": 32.19613265991211, + "learning_rate": 7.929264118653738e-06, + "loss": 4.0605, + "step": 1395 + }, + { + "epoch": 0.2389183638541845, + "grad_norm": 36.1611442565918, + "learning_rate": 7.93496862521392e-06, + "loss": 4.1027, + "step": 1396 + }, + { + "epoch": 0.23908950881396543, + "grad_norm": 36.234344482421875, + "learning_rate": 7.940673131774103e-06, + "loss": 5.0933, + "step": 1397 + }, + { + "epoch": 0.23926065377374636, + "grad_norm": 39.589111328125, + "learning_rate": 7.946377638334284e-06, + "loss": 5.4176, + "step": 1398 + }, + { + "epoch": 0.2394317987335273, + "grad_norm": 13.162062644958496, + "learning_rate": 7.952082144894468e-06, + "loss": 1.3262, + "step": 1399 + }, + { + "epoch": 0.23960294369330823, + "grad_norm": 11.512036323547363, + "learning_rate": 7.95778665145465e-06, + "loss": 2.8916, + "step": 1400 + }, + { + "epoch": 0.23977408865308916, + "grad_norm": 30.82523536682129, + "learning_rate": 7.963491158014831e-06, + "loss": 3.7983, + "step": 1401 + }, + { + "epoch": 0.2399452336128701, + "grad_norm": 9.881488800048828, + "learning_rate": 7.969195664575014e-06, + "loss": 1.6009, + "step": 1402 + }, + { + "epoch": 0.24011637857265103, + "grad_norm": 26.221534729003906, + "learning_rate": 7.974900171135196e-06, + "loss": 3.2459, + "step": 1403 + }, + { + "epoch": 0.24028752353243196, + "grad_norm": 34.7869987487793, + "learning_rate": 7.98060467769538e-06, + "loss": 4.2736, + "step": 1404 + }, + { + "epoch": 0.2404586684922129, + "grad_norm": 42.81889343261719, + "learning_rate": 7.986309184255561e-06, + "loss": 6.0254, + "step": 1405 + }, + { + "epoch": 0.24062981345199383, + "grad_norm": 35.25808334350586, + "learning_rate": 7.992013690815745e-06, + "loss": 3.8331, + "step": 1406 + }, + { + "epoch": 0.24080095841177476, + "grad_norm": 29.81654167175293, + "learning_rate": 7.997718197375928e-06, + "loss": 3.3841, + "step": 1407 + }, + { + "epoch": 0.2409721033715557, + "grad_norm": 34.251243591308594, + "learning_rate": 8.00342270393611e-06, + "loss": 4.8157, + "step": 1408 + }, + { + "epoch": 0.24114324833133663, + "grad_norm": 31.04636573791504, + "learning_rate": 8.009127210496293e-06, + "loss": 3.4431, + "step": 1409 + }, + { + "epoch": 0.2413143932911176, + "grad_norm": 33.0612678527832, + "learning_rate": 8.014831717056475e-06, + "loss": 3.8054, + "step": 1410 + }, + { + "epoch": 0.24148553825089852, + "grad_norm": 25.215789794921875, + "learning_rate": 8.020536223616658e-06, + "loss": 3.2052, + "step": 1411 + }, + { + "epoch": 0.24165668321067946, + "grad_norm": 22.657257080078125, + "learning_rate": 8.02624073017684e-06, + "loss": 2.5621, + "step": 1412 + }, + { + "epoch": 0.2418278281704604, + "grad_norm": 32.54667282104492, + "learning_rate": 8.031945236737021e-06, + "loss": 4.1257, + "step": 1413 + }, + { + "epoch": 0.24199897313024132, + "grad_norm": 14.109042167663574, + "learning_rate": 8.037649743297205e-06, + "loss": 1.2616, + "step": 1414 + }, + { + "epoch": 0.24217011809002226, + "grad_norm": 35.718116760253906, + "learning_rate": 8.043354249857387e-06, + "loss": 5.263, + "step": 1415 + }, + { + "epoch": 0.2423412630498032, + "grad_norm": 10.830004692077637, + "learning_rate": 8.04905875641757e-06, + "loss": 1.6628, + "step": 1416 + }, + { + "epoch": 0.24251240800958412, + "grad_norm": 21.519893646240234, + "learning_rate": 8.054763262977753e-06, + "loss": 2.2681, + "step": 1417 + }, + { + "epoch": 0.24268355296936506, + "grad_norm": 16.527233123779297, + "learning_rate": 8.060467769537937e-06, + "loss": 1.7274, + "step": 1418 + }, + { + "epoch": 0.242854697929146, + "grad_norm": 17.97334098815918, + "learning_rate": 8.066172276098118e-06, + "loss": 1.4341, + "step": 1419 + }, + { + "epoch": 0.24302584288892692, + "grad_norm": 38.63325500488281, + "learning_rate": 8.0718767826583e-06, + "loss": 5.4521, + "step": 1420 + }, + { + "epoch": 0.24319698784870786, + "grad_norm": 37.572818756103516, + "learning_rate": 8.077581289218483e-06, + "loss": 4.057, + "step": 1421 + }, + { + "epoch": 0.2433681328084888, + "grad_norm": 36.495025634765625, + "learning_rate": 8.083285795778665e-06, + "loss": 5.3841, + "step": 1422 + }, + { + "epoch": 0.24353927776826972, + "grad_norm": 46.322486877441406, + "learning_rate": 8.088990302338848e-06, + "loss": 9.2447, + "step": 1423 + }, + { + "epoch": 0.24371042272805066, + "grad_norm": 32.26517868041992, + "learning_rate": 8.09469480889903e-06, + "loss": 3.4902, + "step": 1424 + }, + { + "epoch": 0.2438815676878316, + "grad_norm": 29.286020278930664, + "learning_rate": 8.100399315459212e-06, + "loss": 3.6562, + "step": 1425 + }, + { + "epoch": 0.24405271264761252, + "grad_norm": 9.768603324890137, + "learning_rate": 8.106103822019395e-06, + "loss": 1.0808, + "step": 1426 + }, + { + "epoch": 0.24422385760739346, + "grad_norm": 40.53557205200195, + "learning_rate": 8.111808328579577e-06, + "loss": 5.3038, + "step": 1427 + }, + { + "epoch": 0.2443950025671744, + "grad_norm": 36.29978561401367, + "learning_rate": 8.11751283513976e-06, + "loss": 4.9487, + "step": 1428 + }, + { + "epoch": 0.24456614752695532, + "grad_norm": 50.365440368652344, + "learning_rate": 8.123217341699944e-06, + "loss": 9.1753, + "step": 1429 + }, + { + "epoch": 0.24473729248673626, + "grad_norm": 25.204608917236328, + "learning_rate": 8.128921848260127e-06, + "loss": 3.06, + "step": 1430 + }, + { + "epoch": 0.2449084374465172, + "grad_norm": 36.821929931640625, + "learning_rate": 8.134626354820309e-06, + "loss": 4.2367, + "step": 1431 + }, + { + "epoch": 0.24507958240629812, + "grad_norm": 9.532563209533691, + "learning_rate": 8.14033086138049e-06, + "loss": 1.0511, + "step": 1432 + }, + { + "epoch": 0.24525072736607906, + "grad_norm": 31.35403060913086, + "learning_rate": 8.146035367940674e-06, + "loss": 4.1655, + "step": 1433 + }, + { + "epoch": 0.24542187232586, + "grad_norm": 29.057531356811523, + "learning_rate": 8.151739874500855e-06, + "loss": 3.6622, + "step": 1434 + }, + { + "epoch": 0.24559301728564092, + "grad_norm": 18.69387435913086, + "learning_rate": 8.157444381061039e-06, + "loss": 1.6006, + "step": 1435 + }, + { + "epoch": 0.24576416224542189, + "grad_norm": 27.337491989135742, + "learning_rate": 8.16314888762122e-06, + "loss": 2.1133, + "step": 1436 + }, + { + "epoch": 0.24593530720520282, + "grad_norm": 59.810035705566406, + "learning_rate": 8.168853394181404e-06, + "loss": 9.2893, + "step": 1437 + }, + { + "epoch": 0.24610645216498375, + "grad_norm": 34.85076141357422, + "learning_rate": 8.174557900741586e-06, + "loss": 4.76, + "step": 1438 + }, + { + "epoch": 0.24627759712476469, + "grad_norm": 16.229951858520508, + "learning_rate": 8.180262407301767e-06, + "loss": 1.111, + "step": 1439 + }, + { + "epoch": 0.24644874208454562, + "grad_norm": 191.14859008789062, + "learning_rate": 8.185966913861952e-06, + "loss": 8.6606, + "step": 1440 + }, + { + "epoch": 0.24661988704432655, + "grad_norm": 25.192026138305664, + "learning_rate": 8.191671420422134e-06, + "loss": 2.2213, + "step": 1441 + }, + { + "epoch": 0.24679103200410749, + "grad_norm": 16.577152252197266, + "learning_rate": 8.197375926982317e-06, + "loss": 1.4564, + "step": 1442 + }, + { + "epoch": 0.24696217696388842, + "grad_norm": 37.47216796875, + "learning_rate": 8.203080433542499e-06, + "loss": 4.9652, + "step": 1443 + }, + { + "epoch": 0.24713332192366935, + "grad_norm": 33.50614547729492, + "learning_rate": 8.20878494010268e-06, + "loss": 3.8217, + "step": 1444 + }, + { + "epoch": 0.24730446688345029, + "grad_norm": 35.54981994628906, + "learning_rate": 8.214489446662864e-06, + "loss": 5.0781, + "step": 1445 + }, + { + "epoch": 0.24747561184323122, + "grad_norm": 29.486570358276367, + "learning_rate": 8.220193953223046e-06, + "loss": 3.4324, + "step": 1446 + }, + { + "epoch": 0.24764675680301215, + "grad_norm": 23.952808380126953, + "learning_rate": 8.22589845978323e-06, + "loss": 2.9791, + "step": 1447 + }, + { + "epoch": 0.24781790176279309, + "grad_norm": 22.885963439941406, + "learning_rate": 8.231602966343411e-06, + "loss": 2.1029, + "step": 1448 + }, + { + "epoch": 0.24798904672257402, + "grad_norm": 38.23826217651367, + "learning_rate": 8.237307472903594e-06, + "loss": 5.1107, + "step": 1449 + }, + { + "epoch": 0.24816019168235495, + "grad_norm": 21.183773040771484, + "learning_rate": 8.243011979463776e-06, + "loss": 2.6462, + "step": 1450 + }, + { + "epoch": 0.24833133664213589, + "grad_norm": 11.436287879943848, + "learning_rate": 8.248716486023958e-06, + "loss": 1.139, + "step": 1451 + }, + { + "epoch": 0.24850248160191682, + "grad_norm": 21.1058349609375, + "learning_rate": 8.254420992584143e-06, + "loss": 2.6237, + "step": 1452 + }, + { + "epoch": 0.24867362656169775, + "grad_norm": 29.661510467529297, + "learning_rate": 8.260125499144324e-06, + "loss": 3.9416, + "step": 1453 + }, + { + "epoch": 0.24884477152147869, + "grad_norm": 25.654918670654297, + "learning_rate": 8.265830005704508e-06, + "loss": 2.9109, + "step": 1454 + }, + { + "epoch": 0.24901591648125962, + "grad_norm": 29.254196166992188, + "learning_rate": 8.27153451226469e-06, + "loss": 3.9703, + "step": 1455 + }, + { + "epoch": 0.24918706144104055, + "grad_norm": 15.34985065460205, + "learning_rate": 8.277239018824871e-06, + "loss": 1.277, + "step": 1456 + }, + { + "epoch": 0.24935820640082149, + "grad_norm": 20.940813064575195, + "learning_rate": 8.282943525385055e-06, + "loss": 2.8225, + "step": 1457 + }, + { + "epoch": 0.24952935136060242, + "grad_norm": 156.33163452148438, + "learning_rate": 8.288648031945236e-06, + "loss": 6.8667, + "step": 1458 + }, + { + "epoch": 0.24970049632038335, + "grad_norm": 142.04833984375, + "learning_rate": 8.29435253850542e-06, + "loss": 7.7845, + "step": 1459 + }, + { + "epoch": 0.24987164128016429, + "grad_norm": 52.80269241333008, + "learning_rate": 8.300057045065601e-06, + "loss": 9.2945, + "step": 1460 + }, + { + "epoch": 0.25004278623994525, + "grad_norm": 36.25229263305664, + "learning_rate": 8.305761551625785e-06, + "loss": 4.1385, + "step": 1461 + }, + { + "epoch": 0.2502139311997262, + "grad_norm": 32.63280487060547, + "learning_rate": 8.311466058185966e-06, + "loss": 4.9526, + "step": 1462 + }, + { + "epoch": 0.2503850761595071, + "grad_norm": 36.09181213378906, + "learning_rate": 8.31717056474615e-06, + "loss": 4.9655, + "step": 1463 + }, + { + "epoch": 0.25055622111928805, + "grad_norm": 13.666475296020508, + "learning_rate": 8.322875071306333e-06, + "loss": 1.2171, + "step": 1464 + }, + { + "epoch": 0.250727366079069, + "grad_norm": 21.431262969970703, + "learning_rate": 8.328579577866515e-06, + "loss": 2.0253, + "step": 1465 + }, + { + "epoch": 0.2508985110388499, + "grad_norm": 34.866493225097656, + "learning_rate": 8.334284084426698e-06, + "loss": 4.7963, + "step": 1466 + }, + { + "epoch": 0.25106965599863085, + "grad_norm": 28.299697875976562, + "learning_rate": 8.33998859098688e-06, + "loss": 3.2393, + "step": 1467 + }, + { + "epoch": 0.2512408009584118, + "grad_norm": 30.702220916748047, + "learning_rate": 8.345693097547063e-06, + "loss": 4.459, + "step": 1468 + }, + { + "epoch": 0.2514119459181927, + "grad_norm": 35.572662353515625, + "learning_rate": 8.351397604107245e-06, + "loss": 4.0362, + "step": 1469 + }, + { + "epoch": 0.25158309087797365, + "grad_norm": 31.228361129760742, + "learning_rate": 8.357102110667427e-06, + "loss": 3.7291, + "step": 1470 + }, + { + "epoch": 0.2517542358377546, + "grad_norm": 158.43309020996094, + "learning_rate": 8.36280661722761e-06, + "loss": 7.5395, + "step": 1471 + }, + { + "epoch": 0.2519253807975355, + "grad_norm": 26.111873626708984, + "learning_rate": 8.368511123787792e-06, + "loss": 3.2816, + "step": 1472 + }, + { + "epoch": 0.25209652575731645, + "grad_norm": 152.1773681640625, + "learning_rate": 8.374215630347975e-06, + "loss": 9.2757, + "step": 1473 + }, + { + "epoch": 0.2522676707170974, + "grad_norm": 28.91309928894043, + "learning_rate": 8.379920136908157e-06, + "loss": 3.8, + "step": 1474 + }, + { + "epoch": 0.2524388156768783, + "grad_norm": 138.71820068359375, + "learning_rate": 8.38562464346834e-06, + "loss": 8.3701, + "step": 1475 + }, + { + "epoch": 0.25260996063665925, + "grad_norm": 10.94738483428955, + "learning_rate": 8.391329150028524e-06, + "loss": 1.0987, + "step": 1476 + }, + { + "epoch": 0.2527811055964402, + "grad_norm": 33.45675277709961, + "learning_rate": 8.397033656588705e-06, + "loss": 3.8679, + "step": 1477 + }, + { + "epoch": 0.2529522505562211, + "grad_norm": 30.219728469848633, + "learning_rate": 8.402738163148889e-06, + "loss": 3.7668, + "step": 1478 + }, + { + "epoch": 0.25312339551600205, + "grad_norm": 153.4755859375, + "learning_rate": 8.40844266970907e-06, + "loss": 8.493, + "step": 1479 + }, + { + "epoch": 0.253294540475783, + "grad_norm": 27.030277252197266, + "learning_rate": 8.414147176269254e-06, + "loss": 3.6373, + "step": 1480 + }, + { + "epoch": 0.2534656854355639, + "grad_norm": 26.931581497192383, + "learning_rate": 8.419851682829435e-06, + "loss": 2.4114, + "step": 1481 + }, + { + "epoch": 0.25363683039534485, + "grad_norm": 33.86345672607422, + "learning_rate": 8.425556189389617e-06, + "loss": 4.18, + "step": 1482 + }, + { + "epoch": 0.2538079753551258, + "grad_norm": 40.67789840698242, + "learning_rate": 8.4312606959498e-06, + "loss": 5.2501, + "step": 1483 + }, + { + "epoch": 0.2539791203149067, + "grad_norm": 11.627734184265137, + "learning_rate": 8.436965202509982e-06, + "loss": 1.2352, + "step": 1484 + }, + { + "epoch": 0.25415026527468765, + "grad_norm": 27.1390438079834, + "learning_rate": 8.442669709070165e-06, + "loss": 2.4447, + "step": 1485 + }, + { + "epoch": 0.2543214102344686, + "grad_norm": 33.907615661621094, + "learning_rate": 8.448374215630349e-06, + "loss": 5.7028, + "step": 1486 + }, + { + "epoch": 0.2544925551942495, + "grad_norm": 34.770687103271484, + "learning_rate": 8.45407872219053e-06, + "loss": 5.4022, + "step": 1487 + }, + { + "epoch": 0.25466370015403045, + "grad_norm": 87.67970275878906, + "learning_rate": 8.459783228750714e-06, + "loss": 7.2429, + "step": 1488 + }, + { + "epoch": 0.2548348451138114, + "grad_norm": 36.1263313293457, + "learning_rate": 8.465487735310896e-06, + "loss": 4.7788, + "step": 1489 + }, + { + "epoch": 0.2550059900735923, + "grad_norm": 35.22165298461914, + "learning_rate": 8.471192241871079e-06, + "loss": 4.132, + "step": 1490 + }, + { + "epoch": 0.25517713503337325, + "grad_norm": 28.420682907104492, + "learning_rate": 8.47689674843126e-06, + "loss": 3.6288, + "step": 1491 + }, + { + "epoch": 0.2553482799931542, + "grad_norm": 36.37025451660156, + "learning_rate": 8.482601254991444e-06, + "loss": 5.1911, + "step": 1492 + }, + { + "epoch": 0.2555194249529351, + "grad_norm": 40.647789001464844, + "learning_rate": 8.488305761551626e-06, + "loss": 5.5946, + "step": 1493 + }, + { + "epoch": 0.25569056991271605, + "grad_norm": 19.504039764404297, + "learning_rate": 8.494010268111807e-06, + "loss": 1.7075, + "step": 1494 + }, + { + "epoch": 0.255861714872497, + "grad_norm": 32.866695404052734, + "learning_rate": 8.49971477467199e-06, + "loss": 4.4763, + "step": 1495 + }, + { + "epoch": 0.2560328598322779, + "grad_norm": 33.1104736328125, + "learning_rate": 8.505419281232172e-06, + "loss": 4.4053, + "step": 1496 + }, + { + "epoch": 0.25620400479205885, + "grad_norm": 22.860944747924805, + "learning_rate": 8.511123787792358e-06, + "loss": 2.5604, + "step": 1497 + }, + { + "epoch": 0.2563751497518398, + "grad_norm": 34.79046630859375, + "learning_rate": 8.51682829435254e-06, + "loss": 4.993, + "step": 1498 + }, + { + "epoch": 0.25654629471162077, + "grad_norm": 28.405912399291992, + "learning_rate": 8.522532800912723e-06, + "loss": 3.3138, + "step": 1499 + }, + { + "epoch": 0.2567174396714017, + "grad_norm": 32.89986038208008, + "learning_rate": 8.528237307472904e-06, + "loss": 3.1908, + "step": 1500 + }, + { + "epoch": 0.25688858463118264, + "grad_norm": 20.201610565185547, + "learning_rate": 8.533941814033086e-06, + "loss": 1.974, + "step": 1501 + }, + { + "epoch": 0.25705972959096357, + "grad_norm": 32.933231353759766, + "learning_rate": 8.53964632059327e-06, + "loss": 4.8342, + "step": 1502 + }, + { + "epoch": 0.2572308745507445, + "grad_norm": 25.67669105529785, + "learning_rate": 8.545350827153451e-06, + "loss": 2.8345, + "step": 1503 + }, + { + "epoch": 0.25740201951052544, + "grad_norm": 50.461097717285156, + "learning_rate": 8.551055333713634e-06, + "loss": 6.9385, + "step": 1504 + }, + { + "epoch": 0.25757316447030637, + "grad_norm": 32.42000198364258, + "learning_rate": 8.556759840273816e-06, + "loss": 3.4542, + "step": 1505 + }, + { + "epoch": 0.2577443094300873, + "grad_norm": 29.946523666381836, + "learning_rate": 8.562464346833998e-06, + "loss": 3.2486, + "step": 1506 + }, + { + "epoch": 0.25791545438986824, + "grad_norm": 17.451496124267578, + "learning_rate": 8.568168853394181e-06, + "loss": 1.4946, + "step": 1507 + }, + { + "epoch": 0.25808659934964917, + "grad_norm": 30.164350509643555, + "learning_rate": 8.573873359954363e-06, + "loss": 3.8272, + "step": 1508 + }, + { + "epoch": 0.2582577443094301, + "grad_norm": 26.747682571411133, + "learning_rate": 8.579577866514548e-06, + "loss": 3.0653, + "step": 1509 + }, + { + "epoch": 0.25842888926921104, + "grad_norm": 20.9317626953125, + "learning_rate": 8.58528237307473e-06, + "loss": 1.8431, + "step": 1510 + }, + { + "epoch": 0.25860003422899197, + "grad_norm": 36.90618896484375, + "learning_rate": 8.590986879634913e-06, + "loss": 3.7371, + "step": 1511 + }, + { + "epoch": 0.2587711791887729, + "grad_norm": 19.612281799316406, + "learning_rate": 8.596691386195095e-06, + "loss": 1.4799, + "step": 1512 + }, + { + "epoch": 0.25894232414855384, + "grad_norm": 35.63535690307617, + "learning_rate": 8.602395892755276e-06, + "loss": 4.2458, + "step": 1513 + }, + { + "epoch": 0.25911346910833477, + "grad_norm": 37.25559997558594, + "learning_rate": 8.60810039931546e-06, + "loss": 3.7735, + "step": 1514 + }, + { + "epoch": 0.2592846140681157, + "grad_norm": 26.81685447692871, + "learning_rate": 8.613804905875641e-06, + "loss": 2.621, + "step": 1515 + }, + { + "epoch": 0.25945575902789664, + "grad_norm": 22.918485641479492, + "learning_rate": 8.619509412435825e-06, + "loss": 1.6105, + "step": 1516 + }, + { + "epoch": 0.25962690398767757, + "grad_norm": 12.06033992767334, + "learning_rate": 8.625213918996006e-06, + "loss": 1.1731, + "step": 1517 + }, + { + "epoch": 0.2597980489474585, + "grad_norm": 35.15945053100586, + "learning_rate": 8.63091842555619e-06, + "loss": 3.8198, + "step": 1518 + }, + { + "epoch": 0.25996919390723944, + "grad_norm": 13.90102767944336, + "learning_rate": 8.636622932116372e-06, + "loss": 2.736, + "step": 1519 + }, + { + "epoch": 0.26014033886702037, + "grad_norm": 35.0964469909668, + "learning_rate": 8.642327438676555e-06, + "loss": 4.3737, + "step": 1520 + }, + { + "epoch": 0.2603114838268013, + "grad_norm": 33.16070556640625, + "learning_rate": 8.648031945236738e-06, + "loss": 3.8065, + "step": 1521 + }, + { + "epoch": 0.26048262878658224, + "grad_norm": 16.28618621826172, + "learning_rate": 8.65373645179692e-06, + "loss": 1.257, + "step": 1522 + }, + { + "epoch": 0.26065377374636317, + "grad_norm": 28.174516677856445, + "learning_rate": 8.659440958357103e-06, + "loss": 3.7114, + "step": 1523 + }, + { + "epoch": 0.2608249187061441, + "grad_norm": 26.44544792175293, + "learning_rate": 8.665145464917285e-06, + "loss": 2.829, + "step": 1524 + }, + { + "epoch": 0.26099606366592504, + "grad_norm": 38.186378479003906, + "learning_rate": 8.670849971477467e-06, + "loss": 4.3011, + "step": 1525 + }, + { + "epoch": 0.26116720862570597, + "grad_norm": 206.24801635742188, + "learning_rate": 8.67655447803765e-06, + "loss": 9.2851, + "step": 1526 + }, + { + "epoch": 0.2613383535854869, + "grad_norm": 33.12008285522461, + "learning_rate": 8.682258984597832e-06, + "loss": 4.3036, + "step": 1527 + }, + { + "epoch": 0.26150949854526784, + "grad_norm": 136.57029724121094, + "learning_rate": 8.687963491158015e-06, + "loss": 8.4189, + "step": 1528 + }, + { + "epoch": 0.26168064350504877, + "grad_norm": 40.36309051513672, + "learning_rate": 8.693667997718197e-06, + "loss": 5.4948, + "step": 1529 + }, + { + "epoch": 0.2618517884648297, + "grad_norm": 19.74286651611328, + "learning_rate": 8.69937250427838e-06, + "loss": 2.0893, + "step": 1530 + }, + { + "epoch": 0.26202293342461064, + "grad_norm": 33.62118148803711, + "learning_rate": 8.705077010838562e-06, + "loss": 3.796, + "step": 1531 + }, + { + "epoch": 0.26219407838439157, + "grad_norm": 36.64006805419922, + "learning_rate": 8.710781517398745e-06, + "loss": 3.9848, + "step": 1532 + }, + { + "epoch": 0.2623652233441725, + "grad_norm": 12.980084419250488, + "learning_rate": 8.716486023958929e-06, + "loss": 1.1166, + "step": 1533 + }, + { + "epoch": 0.26253636830395344, + "grad_norm": 35.808021545410156, + "learning_rate": 8.72219053051911e-06, + "loss": 4.3018, + "step": 1534 + }, + { + "epoch": 0.26270751326373437, + "grad_norm": 51.2911491394043, + "learning_rate": 8.727895037079294e-06, + "loss": 9.237, + "step": 1535 + }, + { + "epoch": 0.2628786582235153, + "grad_norm": 26.75223731994629, + "learning_rate": 8.733599543639475e-06, + "loss": 3.3625, + "step": 1536 + }, + { + "epoch": 0.26304980318329624, + "grad_norm": 81.07520294189453, + "learning_rate": 8.739304050199659e-06, + "loss": 7.5686, + "step": 1537 + }, + { + "epoch": 0.26322094814307717, + "grad_norm": 37.027191162109375, + "learning_rate": 8.74500855675984e-06, + "loss": 3.7701, + "step": 1538 + }, + { + "epoch": 0.2633920931028581, + "grad_norm": 47.393333435058594, + "learning_rate": 8.750713063320022e-06, + "loss": 9.0139, + "step": 1539 + }, + { + "epoch": 0.26356323806263904, + "grad_norm": 34.1210823059082, + "learning_rate": 8.756417569880206e-06, + "loss": 4.4995, + "step": 1540 + }, + { + "epoch": 0.26373438302241997, + "grad_norm": 14.312548637390137, + "learning_rate": 8.762122076440387e-06, + "loss": 2.1827, + "step": 1541 + }, + { + "epoch": 0.2639055279822009, + "grad_norm": 30.19961166381836, + "learning_rate": 8.76782658300057e-06, + "loss": 3.9737, + "step": 1542 + }, + { + "epoch": 0.26407667294198184, + "grad_norm": 10.720991134643555, + "learning_rate": 8.773531089560754e-06, + "loss": 1.1108, + "step": 1543 + }, + { + "epoch": 0.26424781790176277, + "grad_norm": 26.29660987854004, + "learning_rate": 8.779235596120936e-06, + "loss": 2.9509, + "step": 1544 + }, + { + "epoch": 0.2644189628615437, + "grad_norm": 7.651371479034424, + "learning_rate": 8.784940102681119e-06, + "loss": 0.8929, + "step": 1545 + }, + { + "epoch": 0.26459010782132464, + "grad_norm": 32.411407470703125, + "learning_rate": 8.7906446092413e-06, + "loss": 3.9279, + "step": 1546 + }, + { + "epoch": 0.26476125278110557, + "grad_norm": 43.62602233886719, + "learning_rate": 8.796349115801484e-06, + "loss": 8.7932, + "step": 1547 + }, + { + "epoch": 0.2649323977408865, + "grad_norm": 28.391075134277344, + "learning_rate": 8.802053622361666e-06, + "loss": 3.3049, + "step": 1548 + }, + { + "epoch": 0.26510354270066744, + "grad_norm": 35.11864471435547, + "learning_rate": 8.80775812892185e-06, + "loss": 4.0323, + "step": 1549 + }, + { + "epoch": 0.2652746876604484, + "grad_norm": 10.911874771118164, + "learning_rate": 8.813462635482031e-06, + "loss": 1.3744, + "step": 1550 + }, + { + "epoch": 0.26544583262022936, + "grad_norm": 22.232980728149414, + "learning_rate": 8.819167142042213e-06, + "loss": 1.972, + "step": 1551 + }, + { + "epoch": 0.2656169775800103, + "grad_norm": 171.640625, + "learning_rate": 8.824871648602396e-06, + "loss": 8.4712, + "step": 1552 + }, + { + "epoch": 0.2657881225397912, + "grad_norm": 30.831897735595703, + "learning_rate": 8.830576155162578e-06, + "loss": 3.5869, + "step": 1553 + }, + { + "epoch": 0.26595926749957216, + "grad_norm": 36.305782318115234, + "learning_rate": 8.836280661722761e-06, + "loss": 4.9009, + "step": 1554 + }, + { + "epoch": 0.2661304124593531, + "grad_norm": 44.463626861572266, + "learning_rate": 8.841985168282944e-06, + "loss": 4.6015, + "step": 1555 + }, + { + "epoch": 0.266301557419134, + "grad_norm": 22.66800308227539, + "learning_rate": 8.847689674843126e-06, + "loss": 2.1498, + "step": 1556 + }, + { + "epoch": 0.26647270237891496, + "grad_norm": 30.886274337768555, + "learning_rate": 8.85339418140331e-06, + "loss": 4.3322, + "step": 1557 + }, + { + "epoch": 0.2666438473386959, + "grad_norm": 34.30126190185547, + "learning_rate": 8.859098687963491e-06, + "loss": 4.5378, + "step": 1558 + }, + { + "epoch": 0.2668149922984768, + "grad_norm": 36.92926025390625, + "learning_rate": 8.864803194523674e-06, + "loss": 4.2903, + "step": 1559 + }, + { + "epoch": 0.26698613725825776, + "grad_norm": 34.588077545166016, + "learning_rate": 8.870507701083856e-06, + "loss": 4.9088, + "step": 1560 + }, + { + "epoch": 0.2671572822180387, + "grad_norm": 30.621044158935547, + "learning_rate": 8.87621220764404e-06, + "loss": 3.6051, + "step": 1561 + }, + { + "epoch": 0.2673284271778196, + "grad_norm": 30.107677459716797, + "learning_rate": 8.881916714204221e-06, + "loss": 3.4027, + "step": 1562 + }, + { + "epoch": 0.26749957213760056, + "grad_norm": 16.614532470703125, + "learning_rate": 8.887621220764403e-06, + "loss": 1.5846, + "step": 1563 + }, + { + "epoch": 0.2676707170973815, + "grad_norm": 35.577842712402344, + "learning_rate": 8.893325727324586e-06, + "loss": 4.2335, + "step": 1564 + }, + { + "epoch": 0.2678418620571624, + "grad_norm": 33.13545227050781, + "learning_rate": 8.899030233884768e-06, + "loss": 4.6539, + "step": 1565 + }, + { + "epoch": 0.26801300701694336, + "grad_norm": 170.64297485351562, + "learning_rate": 8.904734740444953e-06, + "loss": 9.3362, + "step": 1566 + }, + { + "epoch": 0.2681841519767243, + "grad_norm": 12.3065185546875, + "learning_rate": 8.910439247005135e-06, + "loss": 1.573, + "step": 1567 + }, + { + "epoch": 0.2683552969365052, + "grad_norm": 38.08529281616211, + "learning_rate": 8.916143753565318e-06, + "loss": 3.7314, + "step": 1568 + }, + { + "epoch": 0.26852644189628616, + "grad_norm": 169.76089477539062, + "learning_rate": 8.9218482601255e-06, + "loss": 9.6942, + "step": 1569 + }, + { + "epoch": 0.2686975868560671, + "grad_norm": 38.42169952392578, + "learning_rate": 8.927552766685681e-06, + "loss": 5.3158, + "step": 1570 + }, + { + "epoch": 0.268868731815848, + "grad_norm": 14.410723686218262, + "learning_rate": 8.933257273245865e-06, + "loss": 1.2377, + "step": 1571 + }, + { + "epoch": 0.26903987677562896, + "grad_norm": 52.682533264160156, + "learning_rate": 8.938961779806047e-06, + "loss": 6.516, + "step": 1572 + }, + { + "epoch": 0.2692110217354099, + "grad_norm": 34.07759094238281, + "learning_rate": 8.94466628636623e-06, + "loss": 4.013, + "step": 1573 + }, + { + "epoch": 0.26938216669519083, + "grad_norm": 29.74109649658203, + "learning_rate": 8.950370792926412e-06, + "loss": 3.4177, + "step": 1574 + }, + { + "epoch": 0.26955331165497176, + "grad_norm": 35.098876953125, + "learning_rate": 8.956075299486593e-06, + "loss": 4.1055, + "step": 1575 + }, + { + "epoch": 0.2697244566147527, + "grad_norm": 50.082366943359375, + "learning_rate": 8.961779806046777e-06, + "loss": 8.4876, + "step": 1576 + }, + { + "epoch": 0.26989560157453363, + "grad_norm": 116.58244323730469, + "learning_rate": 8.96748431260696e-06, + "loss": 7.8558, + "step": 1577 + }, + { + "epoch": 0.27006674653431456, + "grad_norm": 32.75837326049805, + "learning_rate": 8.973188819167143e-06, + "loss": 3.8977, + "step": 1578 + }, + { + "epoch": 0.2702378914940955, + "grad_norm": 13.686226844787598, + "learning_rate": 8.978893325727325e-06, + "loss": 1.5984, + "step": 1579 + }, + { + "epoch": 0.27040903645387643, + "grad_norm": 31.057418823242188, + "learning_rate": 8.984597832287508e-06, + "loss": 4.2033, + "step": 1580 + }, + { + "epoch": 0.27058018141365736, + "grad_norm": 31.405447006225586, + "learning_rate": 8.99030233884769e-06, + "loss": 3.2895, + "step": 1581 + }, + { + "epoch": 0.2707513263734383, + "grad_norm": 29.978918075561523, + "learning_rate": 8.996006845407872e-06, + "loss": 4.0648, + "step": 1582 + }, + { + "epoch": 0.27092247133321923, + "grad_norm": 11.317312240600586, + "learning_rate": 9.001711351968055e-06, + "loss": 0.9835, + "step": 1583 + }, + { + "epoch": 0.27109361629300016, + "grad_norm": 17.877771377563477, + "learning_rate": 9.007415858528237e-06, + "loss": 1.4293, + "step": 1584 + }, + { + "epoch": 0.2712647612527811, + "grad_norm": 26.353673934936523, + "learning_rate": 9.01312036508842e-06, + "loss": 2.6549, + "step": 1585 + }, + { + "epoch": 0.27143590621256203, + "grad_norm": 31.735876083374023, + "learning_rate": 9.018824871648602e-06, + "loss": 3.9997, + "step": 1586 + }, + { + "epoch": 0.27160705117234296, + "grad_norm": 35.91917037963867, + "learning_rate": 9.024529378208785e-06, + "loss": 4.2824, + "step": 1587 + }, + { + "epoch": 0.2717781961321239, + "grad_norm": 32.27674865722656, + "learning_rate": 9.030233884768967e-06, + "loss": 4.0964, + "step": 1588 + }, + { + "epoch": 0.27194934109190483, + "grad_norm": 37.242549896240234, + "learning_rate": 9.03593839132915e-06, + "loss": 4.4567, + "step": 1589 + }, + { + "epoch": 0.27212048605168576, + "grad_norm": 15.34211540222168, + "learning_rate": 9.041642897889334e-06, + "loss": 1.1567, + "step": 1590 + }, + { + "epoch": 0.2722916310114667, + "grad_norm": 35.38195037841797, + "learning_rate": 9.047347404449515e-06, + "loss": 4.7975, + "step": 1591 + }, + { + "epoch": 0.27246277597124763, + "grad_norm": 29.104900360107422, + "learning_rate": 9.053051911009699e-06, + "loss": 3.1354, + "step": 1592 + }, + { + "epoch": 0.27263392093102856, + "grad_norm": 15.004528999328613, + "learning_rate": 9.05875641756988e-06, + "loss": 1.1248, + "step": 1593 + }, + { + "epoch": 0.2728050658908095, + "grad_norm": 26.269655227661133, + "learning_rate": 9.064460924130062e-06, + "loss": 2.0743, + "step": 1594 + }, + { + "epoch": 0.27297621085059043, + "grad_norm": 19.79959487915039, + "learning_rate": 9.070165430690246e-06, + "loss": 1.3031, + "step": 1595 + }, + { + "epoch": 0.27314735581037136, + "grad_norm": 43.51731491088867, + "learning_rate": 9.075869937250427e-06, + "loss": 4.4293, + "step": 1596 + }, + { + "epoch": 0.2733185007701523, + "grad_norm": 7.138434410095215, + "learning_rate": 9.08157444381061e-06, + "loss": 0.8485, + "step": 1597 + }, + { + "epoch": 0.27348964572993323, + "grad_norm": 32.309593200683594, + "learning_rate": 9.087278950370792e-06, + "loss": 3.4497, + "step": 1598 + }, + { + "epoch": 0.27366079068971416, + "grad_norm": 24.805715560913086, + "learning_rate": 9.092983456930976e-06, + "loss": 2.9256, + "step": 1599 + }, + { + "epoch": 0.2738319356494951, + "grad_norm": 61.22898483276367, + "learning_rate": 9.098687963491159e-06, + "loss": 5.9283, + "step": 1600 + }, + { + "epoch": 0.2740030806092761, + "grad_norm": 29.417680740356445, + "learning_rate": 9.10439247005134e-06, + "loss": 3.8084, + "step": 1601 + }, + { + "epoch": 0.274174225569057, + "grad_norm": 34.00372314453125, + "learning_rate": 9.110096976611524e-06, + "loss": 3.4933, + "step": 1602 + }, + { + "epoch": 0.27434537052883795, + "grad_norm": 14.374422073364258, + "learning_rate": 9.115801483171706e-06, + "loss": 1.4626, + "step": 1603 + }, + { + "epoch": 0.2745165154886189, + "grad_norm": 12.729880332946777, + "learning_rate": 9.12150598973189e-06, + "loss": 1.1151, + "step": 1604 + }, + { + "epoch": 0.2746876604483998, + "grad_norm": 17.94257164001465, + "learning_rate": 9.127210496292071e-06, + "loss": 1.3846, + "step": 1605 + }, + { + "epoch": 0.27485880540818075, + "grad_norm": 38.29545974731445, + "learning_rate": 9.132915002852253e-06, + "loss": 4.5905, + "step": 1606 + }, + { + "epoch": 0.2750299503679617, + "grad_norm": 35.37318420410156, + "learning_rate": 9.138619509412436e-06, + "loss": 4.3784, + "step": 1607 + }, + { + "epoch": 0.2752010953277426, + "grad_norm": 35.77292251586914, + "learning_rate": 9.144324015972618e-06, + "loss": 3.315, + "step": 1608 + }, + { + "epoch": 0.27537224028752355, + "grad_norm": 38.70093536376953, + "learning_rate": 9.150028522532801e-06, + "loss": 5.4718, + "step": 1609 + }, + { + "epoch": 0.2755433852473045, + "grad_norm": 185.0310516357422, + "learning_rate": 9.155733029092983e-06, + "loss": 7.5009, + "step": 1610 + }, + { + "epoch": 0.2757145302070854, + "grad_norm": 28.145288467407227, + "learning_rate": 9.161437535653166e-06, + "loss": 2.8764, + "step": 1611 + }, + { + "epoch": 0.27588567516686635, + "grad_norm": 7.594282150268555, + "learning_rate": 9.16714204221335e-06, + "loss": 0.8713, + "step": 1612 + }, + { + "epoch": 0.2760568201266473, + "grad_norm": 32.899845123291016, + "learning_rate": 9.172846548773531e-06, + "loss": 4.6094, + "step": 1613 + }, + { + "epoch": 0.2762279650864282, + "grad_norm": 39.75630569458008, + "learning_rate": 9.178551055333715e-06, + "loss": 4.5632, + "step": 1614 + }, + { + "epoch": 0.27639911004620915, + "grad_norm": 29.607851028442383, + "learning_rate": 9.184255561893896e-06, + "loss": 2.9606, + "step": 1615 + }, + { + "epoch": 0.2765702550059901, + "grad_norm": 76.37677001953125, + "learning_rate": 9.18996006845408e-06, + "loss": 7.355, + "step": 1616 + }, + { + "epoch": 0.276741399965771, + "grad_norm": 22.215526580810547, + "learning_rate": 9.195664575014261e-06, + "loss": 2.8241, + "step": 1617 + }, + { + "epoch": 0.27691254492555195, + "grad_norm": 9.465276718139648, + "learning_rate": 9.201369081574445e-06, + "loss": 0.9882, + "step": 1618 + }, + { + "epoch": 0.2770836898853329, + "grad_norm": 27.726600646972656, + "learning_rate": 9.207073588134626e-06, + "loss": 3.238, + "step": 1619 + }, + { + "epoch": 0.2772548348451138, + "grad_norm": 35.69710922241211, + "learning_rate": 9.212778094694808e-06, + "loss": 4.4113, + "step": 1620 + }, + { + "epoch": 0.27742597980489475, + "grad_norm": 34.97329330444336, + "learning_rate": 9.218482601254991e-06, + "loss": 5.005, + "step": 1621 + }, + { + "epoch": 0.2775971247646757, + "grad_norm": 18.749282836914062, + "learning_rate": 9.224187107815173e-06, + "loss": 1.7009, + "step": 1622 + }, + { + "epoch": 0.2777682697244566, + "grad_norm": 130.61004638671875, + "learning_rate": 9.229891614375358e-06, + "loss": 7.8661, + "step": 1623 + }, + { + "epoch": 0.27793941468423755, + "grad_norm": 12.980770111083984, + "learning_rate": 9.23559612093554e-06, + "loss": 1.1125, + "step": 1624 + }, + { + "epoch": 0.2781105596440185, + "grad_norm": 46.32781219482422, + "learning_rate": 9.241300627495722e-06, + "loss": 8.7552, + "step": 1625 + }, + { + "epoch": 0.2782817046037994, + "grad_norm": 101.5696029663086, + "learning_rate": 9.247005134055905e-06, + "loss": 7.1054, + "step": 1626 + }, + { + "epoch": 0.27845284956358035, + "grad_norm": 22.125795364379883, + "learning_rate": 9.252709640616087e-06, + "loss": 1.7911, + "step": 1627 + }, + { + "epoch": 0.2786239945233613, + "grad_norm": 34.277095794677734, + "learning_rate": 9.25841414717627e-06, + "loss": 4.438, + "step": 1628 + }, + { + "epoch": 0.2787951394831422, + "grad_norm": 22.72269058227539, + "learning_rate": 9.264118653736452e-06, + "loss": 1.7455, + "step": 1629 + }, + { + "epoch": 0.27896628444292315, + "grad_norm": 30.11455726623535, + "learning_rate": 9.269823160296635e-06, + "loss": 3.3549, + "step": 1630 + }, + { + "epoch": 0.2791374294027041, + "grad_norm": 34.13120651245117, + "learning_rate": 9.275527666856817e-06, + "loss": 3.7081, + "step": 1631 + }, + { + "epoch": 0.279308574362485, + "grad_norm": 8.457001686096191, + "learning_rate": 9.281232173416998e-06, + "loss": 0.9564, + "step": 1632 + }, + { + "epoch": 0.27947971932226595, + "grad_norm": 38.574615478515625, + "learning_rate": 9.286936679977182e-06, + "loss": 4.3973, + "step": 1633 + }, + { + "epoch": 0.2796508642820469, + "grad_norm": 11.158347129821777, + "learning_rate": 9.292641186537364e-06, + "loss": 0.9696, + "step": 1634 + }, + { + "epoch": 0.2798220092418278, + "grad_norm": 11.847931861877441, + "learning_rate": 9.298345693097549e-06, + "loss": 1.5377, + "step": 1635 + }, + { + "epoch": 0.27999315420160875, + "grad_norm": 11.096319198608398, + "learning_rate": 9.30405019965773e-06, + "loss": 1.136, + "step": 1636 + }, + { + "epoch": 0.2801642991613897, + "grad_norm": 36.63529586791992, + "learning_rate": 9.309754706217914e-06, + "loss": 5.2998, + "step": 1637 + }, + { + "epoch": 0.2803354441211706, + "grad_norm": 30.421175003051758, + "learning_rate": 9.315459212778095e-06, + "loss": 3.5562, + "step": 1638 + }, + { + "epoch": 0.28050658908095155, + "grad_norm": 34.89402770996094, + "learning_rate": 9.321163719338277e-06, + "loss": 4.9255, + "step": 1639 + }, + { + "epoch": 0.2806777340407325, + "grad_norm": 28.486478805541992, + "learning_rate": 9.32686822589846e-06, + "loss": 3.4583, + "step": 1640 + }, + { + "epoch": 0.2808488790005134, + "grad_norm": 7.498641490936279, + "learning_rate": 9.332572732458642e-06, + "loss": 0.8123, + "step": 1641 + }, + { + "epoch": 0.28102002396029435, + "grad_norm": 47.50094223022461, + "learning_rate": 9.338277239018825e-06, + "loss": 7.894, + "step": 1642 + }, + { + "epoch": 0.2811911689200753, + "grad_norm": 62.95503616333008, + "learning_rate": 9.343981745579007e-06, + "loss": 6.7316, + "step": 1643 + }, + { + "epoch": 0.2813623138798562, + "grad_norm": 26.29498291015625, + "learning_rate": 9.349686252139189e-06, + "loss": 2.9299, + "step": 1644 + }, + { + "epoch": 0.28153345883963715, + "grad_norm": 13.663917541503906, + "learning_rate": 9.355390758699372e-06, + "loss": 1.6658, + "step": 1645 + }, + { + "epoch": 0.2817046037994181, + "grad_norm": 31.745132446289062, + "learning_rate": 9.361095265259556e-06, + "loss": 4.9097, + "step": 1646 + }, + { + "epoch": 0.281875748759199, + "grad_norm": 16.757953643798828, + "learning_rate": 9.366799771819739e-06, + "loss": 1.4769, + "step": 1647 + }, + { + "epoch": 0.28204689371897995, + "grad_norm": 21.601877212524414, + "learning_rate": 9.37250427837992e-06, + "loss": 1.7352, + "step": 1648 + }, + { + "epoch": 0.2822180386787609, + "grad_norm": 38.61962127685547, + "learning_rate": 9.378208784940104e-06, + "loss": 4.4803, + "step": 1649 + }, + { + "epoch": 0.2823891836385418, + "grad_norm": 31.342639923095703, + "learning_rate": 9.383913291500286e-06, + "loss": 3.8044, + "step": 1650 + }, + { + "epoch": 0.28256032859832275, + "grad_norm": 9.416754722595215, + "learning_rate": 9.389617798060467e-06, + "loss": 0.804, + "step": 1651 + }, + { + "epoch": 0.28273147355810374, + "grad_norm": 31.227413177490234, + "learning_rate": 9.39532230462065e-06, + "loss": 4.1229, + "step": 1652 + }, + { + "epoch": 0.2829026185178847, + "grad_norm": 13.257563591003418, + "learning_rate": 9.401026811180832e-06, + "loss": 1.1089, + "step": 1653 + }, + { + "epoch": 0.2830737634776656, + "grad_norm": 33.36773681640625, + "learning_rate": 9.406731317741016e-06, + "loss": 4.6453, + "step": 1654 + }, + { + "epoch": 0.28324490843744654, + "grad_norm": 30.116289138793945, + "learning_rate": 9.412435824301198e-06, + "loss": 3.3475, + "step": 1655 + }, + { + "epoch": 0.2834160533972275, + "grad_norm": 9.72807502746582, + "learning_rate": 9.41814033086138e-06, + "loss": 1.3987, + "step": 1656 + }, + { + "epoch": 0.2835871983570084, + "grad_norm": 35.53730392456055, + "learning_rate": 9.423844837421563e-06, + "loss": 4.4274, + "step": 1657 + }, + { + "epoch": 0.28375834331678934, + "grad_norm": 25.7310733795166, + "learning_rate": 9.429549343981746e-06, + "loss": 3.1681, + "step": 1658 + }, + { + "epoch": 0.2839294882765703, + "grad_norm": 41.159175872802734, + "learning_rate": 9.43525385054193e-06, + "loss": 5.0249, + "step": 1659 + }, + { + "epoch": 0.2841006332363512, + "grad_norm": 44.80512619018555, + "learning_rate": 9.440958357102111e-06, + "loss": 8.5485, + "step": 1660 + }, + { + "epoch": 0.28427177819613214, + "grad_norm": 30.980173110961914, + "learning_rate": 9.446662863662294e-06, + "loss": 3.4326, + "step": 1661 + }, + { + "epoch": 0.2844429231559131, + "grad_norm": 34.32295608520508, + "learning_rate": 9.452367370222476e-06, + "loss": 3.1846, + "step": 1662 + }, + { + "epoch": 0.284614068115694, + "grad_norm": 31.66938591003418, + "learning_rate": 9.458071876782658e-06, + "loss": 3.5118, + "step": 1663 + }, + { + "epoch": 0.28478521307547494, + "grad_norm": 32.4676513671875, + "learning_rate": 9.463776383342841e-06, + "loss": 4.7146, + "step": 1664 + }, + { + "epoch": 0.2849563580352559, + "grad_norm": 10.913191795349121, + "learning_rate": 9.469480889903023e-06, + "loss": 0.9731, + "step": 1665 + }, + { + "epoch": 0.2851275029950368, + "grad_norm": 35.5974006652832, + "learning_rate": 9.475185396463206e-06, + "loss": 4.3522, + "step": 1666 + }, + { + "epoch": 0.28529864795481774, + "grad_norm": 33.59803771972656, + "learning_rate": 9.480889903023388e-06, + "loss": 3.3641, + "step": 1667 + }, + { + "epoch": 0.2854697929145987, + "grad_norm": 35.429466247558594, + "learning_rate": 9.486594409583571e-06, + "loss": 3.9219, + "step": 1668 + }, + { + "epoch": 0.2856409378743796, + "grad_norm": 13.85142707824707, + "learning_rate": 9.492298916143755e-06, + "loss": 1.3341, + "step": 1669 + }, + { + "epoch": 0.28581208283416054, + "grad_norm": 9.107728004455566, + "learning_rate": 9.498003422703936e-06, + "loss": 0.8871, + "step": 1670 + }, + { + "epoch": 0.2859832277939415, + "grad_norm": 35.564979553222656, + "learning_rate": 9.50370792926412e-06, + "loss": 4.1451, + "step": 1671 + }, + { + "epoch": 0.2861543727537224, + "grad_norm": 27.561506271362305, + "learning_rate": 9.509412435824301e-06, + "loss": 3.3942, + "step": 1672 + }, + { + "epoch": 0.28632551771350334, + "grad_norm": 35.57343292236328, + "learning_rate": 9.515116942384485e-06, + "loss": 3.7111, + "step": 1673 + }, + { + "epoch": 0.2864966626732843, + "grad_norm": 39.25908279418945, + "learning_rate": 9.520821448944666e-06, + "loss": 4.3091, + "step": 1674 + }, + { + "epoch": 0.2866678076330652, + "grad_norm": 41.76926803588867, + "learning_rate": 9.526525955504848e-06, + "loss": 4.7086, + "step": 1675 + }, + { + "epoch": 0.28683895259284614, + "grad_norm": 30.626611709594727, + "learning_rate": 9.532230462065032e-06, + "loss": 3.1129, + "step": 1676 + }, + { + "epoch": 0.2870100975526271, + "grad_norm": 15.441875457763672, + "learning_rate": 9.537934968625213e-06, + "loss": 1.2888, + "step": 1677 + }, + { + "epoch": 0.287181242512408, + "grad_norm": 28.600982666015625, + "learning_rate": 9.543639475185397e-06, + "loss": 2.9279, + "step": 1678 + }, + { + "epoch": 0.28735238747218894, + "grad_norm": 31.3085994720459, + "learning_rate": 9.549343981745578e-06, + "loss": 3.8741, + "step": 1679 + }, + { + "epoch": 0.2875235324319699, + "grad_norm": 207.9216766357422, + "learning_rate": 9.555048488305763e-06, + "loss": 8.5201, + "step": 1680 + }, + { + "epoch": 0.2876946773917508, + "grad_norm": 38.76487731933594, + "learning_rate": 9.560752994865945e-06, + "loss": 4.8258, + "step": 1681 + }, + { + "epoch": 0.28786582235153174, + "grad_norm": 35.18633270263672, + "learning_rate": 9.566457501426127e-06, + "loss": 3.9555, + "step": 1682 + }, + { + "epoch": 0.2880369673113127, + "grad_norm": 153.19830322265625, + "learning_rate": 9.57216200798631e-06, + "loss": 8.1017, + "step": 1683 + }, + { + "epoch": 0.2882081122710936, + "grad_norm": 8.444355010986328, + "learning_rate": 9.577866514546492e-06, + "loss": 0.8761, + "step": 1684 + }, + { + "epoch": 0.28837925723087454, + "grad_norm": 44.78715515136719, + "learning_rate": 9.583571021106675e-06, + "loss": 8.4681, + "step": 1685 + }, + { + "epoch": 0.2885504021906555, + "grad_norm": 25.710901260375977, + "learning_rate": 9.589275527666857e-06, + "loss": 3.2682, + "step": 1686 + }, + { + "epoch": 0.2887215471504364, + "grad_norm": 161.20376586914062, + "learning_rate": 9.59498003422704e-06, + "loss": 8.3231, + "step": 1687 + }, + { + "epoch": 0.28889269211021734, + "grad_norm": 36.88936996459961, + "learning_rate": 9.600684540787222e-06, + "loss": 4.4629, + "step": 1688 + }, + { + "epoch": 0.2890638370699983, + "grad_norm": 33.05325698852539, + "learning_rate": 9.606389047347404e-06, + "loss": 4.2398, + "step": 1689 + }, + { + "epoch": 0.2892349820297792, + "grad_norm": 31.297021865844727, + "learning_rate": 9.612093553907587e-06, + "loss": 3.9676, + "step": 1690 + }, + { + "epoch": 0.28940612698956014, + "grad_norm": 33.626365661621094, + "learning_rate": 9.617798060467769e-06, + "loss": 4.2342, + "step": 1691 + }, + { + "epoch": 0.2895772719493411, + "grad_norm": 32.812740325927734, + "learning_rate": 9.623502567027954e-06, + "loss": 3.9633, + "step": 1692 + }, + { + "epoch": 0.289748416909122, + "grad_norm": 16.281417846679688, + "learning_rate": 9.629207073588135e-06, + "loss": 1.3504, + "step": 1693 + }, + { + "epoch": 0.28991956186890294, + "grad_norm": 36.80635070800781, + "learning_rate": 9.634911580148317e-06, + "loss": 4.8133, + "step": 1694 + }, + { + "epoch": 0.2900907068286839, + "grad_norm": 36.548397064208984, + "learning_rate": 9.6406160867085e-06, + "loss": 4.0484, + "step": 1695 + }, + { + "epoch": 0.2902618517884648, + "grad_norm": 35.513729095458984, + "learning_rate": 9.646320593268682e-06, + "loss": 4.3281, + "step": 1696 + }, + { + "epoch": 0.29043299674824574, + "grad_norm": 33.258995056152344, + "learning_rate": 9.652025099828866e-06, + "loss": 4.3749, + "step": 1697 + }, + { + "epoch": 0.2906041417080267, + "grad_norm": 30.854419708251953, + "learning_rate": 9.657729606389047e-06, + "loss": 3.4016, + "step": 1698 + }, + { + "epoch": 0.2907752866678076, + "grad_norm": 8.308602333068848, + "learning_rate": 9.66343411294923e-06, + "loss": 1.221, + "step": 1699 + }, + { + "epoch": 0.29094643162758854, + "grad_norm": 10.515448570251465, + "learning_rate": 9.669138619509412e-06, + "loss": 0.9434, + "step": 1700 + }, + { + "epoch": 0.2911175765873695, + "grad_norm": 20.784067153930664, + "learning_rate": 9.674843126069594e-06, + "loss": 2.2873, + "step": 1701 + }, + { + "epoch": 0.2912887215471504, + "grad_norm": 44.417884826660156, + "learning_rate": 9.680547632629777e-06, + "loss": 8.2452, + "step": 1702 + }, + { + "epoch": 0.2914598665069314, + "grad_norm": 28.057279586791992, + "learning_rate": 9.68625213918996e-06, + "loss": 2.7355, + "step": 1703 + }, + { + "epoch": 0.29163101146671233, + "grad_norm": 50.71089553833008, + "learning_rate": 9.691956645750144e-06, + "loss": 8.431, + "step": 1704 + }, + { + "epoch": 0.29180215642649326, + "grad_norm": 44.918087005615234, + "learning_rate": 9.697661152310326e-06, + "loss": 7.8944, + "step": 1705 + }, + { + "epoch": 0.2919733013862742, + "grad_norm": 143.09837341308594, + "learning_rate": 9.703365658870507e-06, + "loss": 7.8241, + "step": 1706 + }, + { + "epoch": 0.29214444634605513, + "grad_norm": 22.652225494384766, + "learning_rate": 9.709070165430691e-06, + "loss": 2.888, + "step": 1707 + }, + { + "epoch": 0.29231559130583606, + "grad_norm": 32.992774963378906, + "learning_rate": 9.714774671990873e-06, + "loss": 3.7048, + "step": 1708 + }, + { + "epoch": 0.292486736265617, + "grad_norm": 30.531761169433594, + "learning_rate": 9.720479178551056e-06, + "loss": 3.8219, + "step": 1709 + }, + { + "epoch": 0.29265788122539793, + "grad_norm": 39.57463073730469, + "learning_rate": 9.726183685111238e-06, + "loss": 4.5677, + "step": 1710 + }, + { + "epoch": 0.29282902618517886, + "grad_norm": 32.177650451660156, + "learning_rate": 9.731888191671421e-06, + "loss": 3.9562, + "step": 1711 + }, + { + "epoch": 0.2930001711449598, + "grad_norm": 11.071613311767578, + "learning_rate": 9.737592698231603e-06, + "loss": 1.3818, + "step": 1712 + }, + { + "epoch": 0.29317131610474073, + "grad_norm": 183.07089233398438, + "learning_rate": 9.743297204791784e-06, + "loss": 8.202, + "step": 1713 + }, + { + "epoch": 0.29334246106452166, + "grad_norm": 127.64228057861328, + "learning_rate": 9.749001711351968e-06, + "loss": 7.7497, + "step": 1714 + }, + { + "epoch": 0.2935136060243026, + "grad_norm": 30.094449996948242, + "learning_rate": 9.754706217912151e-06, + "loss": 3.133, + "step": 1715 + }, + { + "epoch": 0.29368475098408353, + "grad_norm": 33.56199264526367, + "learning_rate": 9.760410724472334e-06, + "loss": 3.9969, + "step": 1716 + }, + { + "epoch": 0.29385589594386446, + "grad_norm": 30.969953536987305, + "learning_rate": 9.766115231032516e-06, + "loss": 3.2142, + "step": 1717 + }, + { + "epoch": 0.2940270409036454, + "grad_norm": 26.745988845825195, + "learning_rate": 9.7718197375927e-06, + "loss": 2.5008, + "step": 1718 + }, + { + "epoch": 0.29419818586342633, + "grad_norm": 19.184772491455078, + "learning_rate": 9.777524244152881e-06, + "loss": 1.7379, + "step": 1719 + }, + { + "epoch": 0.29436933082320726, + "grad_norm": 30.3228759765625, + "learning_rate": 9.783228750713063e-06, + "loss": 3.6131, + "step": 1720 + }, + { + "epoch": 0.2945404757829882, + "grad_norm": 30.700254440307617, + "learning_rate": 9.788933257273246e-06, + "loss": 3.4613, + "step": 1721 + }, + { + "epoch": 0.29471162074276913, + "grad_norm": 35.11033248901367, + "learning_rate": 9.794637763833428e-06, + "loss": 4.2554, + "step": 1722 + }, + { + "epoch": 0.29488276570255006, + "grad_norm": 47.508277893066406, + "learning_rate": 9.800342270393611e-06, + "loss": 8.2195, + "step": 1723 + }, + { + "epoch": 0.295053910662331, + "grad_norm": 35.247528076171875, + "learning_rate": 9.806046776953793e-06, + "loss": 4.3316, + "step": 1724 + }, + { + "epoch": 0.29522505562211193, + "grad_norm": 27.610280990600586, + "learning_rate": 9.811751283513975e-06, + "loss": 3.7587, + "step": 1725 + }, + { + "epoch": 0.29539620058189286, + "grad_norm": 34.314918518066406, + "learning_rate": 9.81745579007416e-06, + "loss": 4.4546, + "step": 1726 + }, + { + "epoch": 0.2955673455416738, + "grad_norm": 31.43994140625, + "learning_rate": 9.823160296634341e-06, + "loss": 3.8653, + "step": 1727 + }, + { + "epoch": 0.29573849050145473, + "grad_norm": 15.655001640319824, + "learning_rate": 9.828864803194525e-06, + "loss": 1.2015, + "step": 1728 + }, + { + "epoch": 0.29590963546123566, + "grad_norm": 13.799985885620117, + "learning_rate": 9.834569309754707e-06, + "loss": 1.6485, + "step": 1729 + }, + { + "epoch": 0.2960807804210166, + "grad_norm": 35.408145904541016, + "learning_rate": 9.84027381631489e-06, + "loss": 4.6912, + "step": 1730 + }, + { + "epoch": 0.29625192538079753, + "grad_norm": 33.258941650390625, + "learning_rate": 9.845978322875072e-06, + "loss": 3.8243, + "step": 1731 + }, + { + "epoch": 0.29642307034057847, + "grad_norm": 34.537960052490234, + "learning_rate": 9.851682829435253e-06, + "loss": 3.6863, + "step": 1732 + }, + { + "epoch": 0.2965942153003594, + "grad_norm": 25.667997360229492, + "learning_rate": 9.857387335995437e-06, + "loss": 3.21, + "step": 1733 + }, + { + "epoch": 0.29676536026014033, + "grad_norm": 146.46380615234375, + "learning_rate": 9.863091842555618e-06, + "loss": 7.1136, + "step": 1734 + }, + { + "epoch": 0.29693650521992127, + "grad_norm": 20.732595443725586, + "learning_rate": 9.868796349115802e-06, + "loss": 2.1932, + "step": 1735 + }, + { + "epoch": 0.2971076501797022, + "grad_norm": 37.78299331665039, + "learning_rate": 9.874500855675983e-06, + "loss": 4.4365, + "step": 1736 + }, + { + "epoch": 0.29727879513948313, + "grad_norm": 30.049827575683594, + "learning_rate": 9.880205362236167e-06, + "loss": 3.2983, + "step": 1737 + }, + { + "epoch": 0.29744994009926407, + "grad_norm": 12.33377742767334, + "learning_rate": 9.88590986879635e-06, + "loss": 1.4362, + "step": 1738 + }, + { + "epoch": 0.297621085059045, + "grad_norm": 24.165996551513672, + "learning_rate": 9.891614375356532e-06, + "loss": 2.7512, + "step": 1739 + }, + { + "epoch": 0.29779223001882593, + "grad_norm": 34.980438232421875, + "learning_rate": 9.897318881916715e-06, + "loss": 3.4089, + "step": 1740 + }, + { + "epoch": 0.29796337497860687, + "grad_norm": 52.22333526611328, + "learning_rate": 9.903023388476897e-06, + "loss": 8.55, + "step": 1741 + }, + { + "epoch": 0.2981345199383878, + "grad_norm": 30.178720474243164, + "learning_rate": 9.90872789503708e-06, + "loss": 3.7629, + "step": 1742 + }, + { + "epoch": 0.29830566489816873, + "grad_norm": 12.83564281463623, + "learning_rate": 9.914432401597262e-06, + "loss": 1.5206, + "step": 1743 + }, + { + "epoch": 0.29847680985794967, + "grad_norm": 116.20635223388672, + "learning_rate": 9.920136908157444e-06, + "loss": 7.1701, + "step": 1744 + }, + { + "epoch": 0.2986479548177306, + "grad_norm": 28.332143783569336, + "learning_rate": 9.925841414717627e-06, + "loss": 4.0808, + "step": 1745 + }, + { + "epoch": 0.29881909977751153, + "grad_norm": 17.009302139282227, + "learning_rate": 9.931545921277809e-06, + "loss": 1.1237, + "step": 1746 + }, + { + "epoch": 0.29899024473729247, + "grad_norm": 22.102079391479492, + "learning_rate": 9.937250427837992e-06, + "loss": 2.1591, + "step": 1747 + }, + { + "epoch": 0.2991613896970734, + "grad_norm": 31.704936981201172, + "learning_rate": 9.942954934398174e-06, + "loss": 3.3555, + "step": 1748 + }, + { + "epoch": 0.29933253465685433, + "grad_norm": 7.139681816101074, + "learning_rate": 9.948659440958359e-06, + "loss": 0.8492, + "step": 1749 + }, + { + "epoch": 0.29950367961663527, + "grad_norm": 37.93485641479492, + "learning_rate": 9.95436394751854e-06, + "loss": 4.3445, + "step": 1750 + }, + { + "epoch": 0.2996748245764162, + "grad_norm": 23.79175567626953, + "learning_rate": 9.960068454078722e-06, + "loss": 2.6891, + "step": 1751 + }, + { + "epoch": 0.29984596953619713, + "grad_norm": 26.583223342895508, + "learning_rate": 9.965772960638906e-06, + "loss": 3.0936, + "step": 1752 + }, + { + "epoch": 0.3000171144959781, + "grad_norm": 16.86503791809082, + "learning_rate": 9.971477467199087e-06, + "loss": 1.5367, + "step": 1753 + }, + { + "epoch": 0.30018825945575905, + "grad_norm": 37.780025482177734, + "learning_rate": 9.97718197375927e-06, + "loss": 4.5862, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_nli-pairs_loss": 3.577563524246216, + "eval_nli-pairs_runtime": 4.5158, + "eval_nli-pairs_samples_per_second": 44.289, + "eval_nli-pairs_steps_per_second": 1.55, + "eval_sts-test_pearson_cosine": 0.7051574603634622, + "eval_sts-test_pearson_dot": 0.5937802816639131, + "eval_sts-test_pearson_euclidean": 0.7000060119936138, + "eval_sts-test_pearson_manhattan": 0.7079127065958083, + "eval_sts-test_pearson_max": 0.7079127065958083, + "eval_sts-test_spearman_cosine": 0.6765504113809614, + "eval_sts-test_spearman_dot": 0.5611218190113842, + "eval_sts-test_spearman_euclidean": 0.6793571635918119, + "eval_sts-test_spearman_manhattan": 0.6864576898108908, + "eval_sts-test_spearman_max": 0.6864576898108908, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_vitaminc-pairs_loss": 2.382566213607788, + "eval_vitaminc-pairs_runtime": 2.7572, + "eval_vitaminc-pairs_samples_per_second": 72.538, + "eval_vitaminc-pairs_steps_per_second": 2.539, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_qnli-contrastive_loss": 7.762363910675049, + "eval_qnli-contrastive_runtime": 0.6686, + "eval_qnli-contrastive_samples_per_second": 299.128, + "eval_qnli-contrastive_steps_per_second": 10.469, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_scitail-pairs-qa_loss": 0.7197363972663879, + "eval_scitail-pairs-qa_runtime": 1.7426, + "eval_scitail-pairs-qa_samples_per_second": 114.768, + "eval_scitail-pairs-qa_steps_per_second": 4.017, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_scitail-pairs-pos_loss": 2.2759039402008057, + "eval_scitail-pairs-pos_runtime": 2.8206, + "eval_scitail-pairs-pos_samples_per_second": 70.906, + "eval_scitail-pairs-pos_steps_per_second": 2.482, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_xsum-pairs_loss": 2.1139955520629883, + "eval_xsum-pairs_runtime": 2.6563, + "eval_xsum-pairs_samples_per_second": 65.88, + "eval_xsum-pairs_steps_per_second": 2.259, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_compression-pairs_loss": 1.1527378559112549, + "eval_compression-pairs_runtime": 0.5278, + "eval_compression-pairs_samples_per_second": 378.929, + "eval_compression-pairs_steps_per_second": 13.263, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_sciq_pairs_loss": 6.166472434997559, + "eval_sciq_pairs_runtime": 9.2821, + "eval_sciq_pairs_samples_per_second": 21.547, + "eval_sciq_pairs_steps_per_second": 0.754, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_qasc_pairs_loss": 8.247413635253906, + "eval_qasc_pairs_runtime": 2.7444, + "eval_qasc_pairs_samples_per_second": 72.876, + "eval_qasc_pairs_steps_per_second": 2.551, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_openbookqa_pairs_loss": 4.27993631362915, + "eval_openbookqa_pairs_runtime": 0.68, + "eval_openbookqa_pairs_samples_per_second": 101.475, + "eval_openbookqa_pairs_steps_per_second": 4.412, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_msmarco_pairs_loss": 3.4503884315490723, + "eval_msmarco_pairs_runtime": 4.1424, + "eval_msmarco_pairs_samples_per_second": 48.281, + "eval_msmarco_pairs_steps_per_second": 1.69, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_nq_pairs_loss": 4.303767204284668, + "eval_nq_pairs_runtime": 8.7194, + "eval_nq_pairs_samples_per_second": 22.937, + "eval_nq_pairs_steps_per_second": 0.803, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_trivia_pairs_loss": 3.893390417098999, + "eval_trivia_pairs_runtime": 13.177, + "eval_trivia_pairs_samples_per_second": 15.178, + "eval_trivia_pairs_steps_per_second": 0.531, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_quora_pairs_loss": 1.0257954597473145, + "eval_quora_pairs_runtime": 1.5896, + "eval_quora_pairs_samples_per_second": 125.821, + "eval_quora_pairs_steps_per_second": 4.404, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_gooaq_pairs_loss": 2.6827940940856934, + "eval_gooaq_pairs_runtime": 2.6669, + "eval_gooaq_pairs_samples_per_second": 74.993, + "eval_gooaq_pairs_steps_per_second": 2.625, + "step": 1754 + }, + { + "epoch": 0.30035940441554, + "grad_norm": 32.57681655883789, + "learning_rate": 9.982886480319452e-06, + "loss": 4.3391, + "step": 1755 + }, + { + "epoch": 0.3005305493753209, + "grad_norm": 26.65064811706543, + "learning_rate": 9.988590986879634e-06, + "loss": 2.7014, + "step": 1756 + }, + { + "epoch": 0.30070169433510185, + "grad_norm": 33.25247573852539, + "learning_rate": 9.994295493439817e-06, + "loss": 4.1446, + "step": 1757 + }, + { + "epoch": 0.3008728392948828, + "grad_norm": 25.792116165161133, + "learning_rate": 9.999999999999999e-06, + "loss": 2.7164, + "step": 1758 + }, + { + "epoch": 0.3010439842546637, + "grad_norm": 28.707399368286133, + "learning_rate": 1.0005704506560183e-05, + "loss": 3.1937, + "step": 1759 + }, + { + "epoch": 0.30121512921444465, + "grad_norm": 38.30696105957031, + "learning_rate": 1.0011409013120366e-05, + "loss": 4.2427, + "step": 1760 + }, + { + "epoch": 0.3013862741742256, + "grad_norm": 26.254148483276367, + "learning_rate": 1.001711351968055e-05, + "loss": 2.5525, + "step": 1761 + }, + { + "epoch": 0.3015574191340065, + "grad_norm": 7.5429487228393555, + "learning_rate": 1.0022818026240731e-05, + "loss": 0.8481, + "step": 1762 + }, + { + "epoch": 0.30172856409378745, + "grad_norm": 45.37841796875, + "learning_rate": 1.0028522532800913e-05, + "loss": 6.5584, + "step": 1763 + }, + { + "epoch": 0.3018997090535684, + "grad_norm": 17.617197036743164, + "learning_rate": 1.0034227039361096e-05, + "loss": 1.5689, + "step": 1764 + }, + { + "epoch": 0.3020708540133493, + "grad_norm": 8.921030044555664, + "learning_rate": 1.0039931545921278e-05, + "loss": 1.9049, + "step": 1765 + }, + { + "epoch": 0.30224199897313025, + "grad_norm": 11.456149101257324, + "learning_rate": 1.0045636052481461e-05, + "loss": 1.4351, + "step": 1766 + }, + { + "epoch": 0.3024131439329112, + "grad_norm": 36.827125549316406, + "learning_rate": 1.0051340559041643e-05, + "loss": 3.8073, + "step": 1767 + }, + { + "epoch": 0.3025842888926921, + "grad_norm": 31.50043296813965, + "learning_rate": 1.0057045065601826e-05, + "loss": 3.4761, + "step": 1768 + }, + { + "epoch": 0.30275543385247305, + "grad_norm": 212.15618896484375, + "learning_rate": 1.0062749572162008e-05, + "loss": 8.804, + "step": 1769 + }, + { + "epoch": 0.302926578812254, + "grad_norm": 11.170289039611816, + "learning_rate": 1.006845407872219e-05, + "loss": 1.5324, + "step": 1770 + }, + { + "epoch": 0.3030977237720349, + "grad_norm": 11.275130271911621, + "learning_rate": 1.0074158585282373e-05, + "loss": 1.0326, + "step": 1771 + }, + { + "epoch": 0.30326886873181585, + "grad_norm": 37.139068603515625, + "learning_rate": 1.0079863091842556e-05, + "loss": 4.5464, + "step": 1772 + }, + { + "epoch": 0.3034400136915968, + "grad_norm": 24.030378341674805, + "learning_rate": 1.008556759840274e-05, + "loss": 1.9306, + "step": 1773 + }, + { + "epoch": 0.3036111586513777, + "grad_norm": 23.25863265991211, + "learning_rate": 1.0091272104962921e-05, + "loss": 1.8897, + "step": 1774 + }, + { + "epoch": 0.30378230361115865, + "grad_norm": 33.125823974609375, + "learning_rate": 1.0096976611523103e-05, + "loss": 3.4839, + "step": 1775 + }, + { + "epoch": 0.3039534485709396, + "grad_norm": 21.4809627532959, + "learning_rate": 1.0102681118083286e-05, + "loss": 2.866, + "step": 1776 + }, + { + "epoch": 0.3041245935307205, + "grad_norm": 54.2559928894043, + "learning_rate": 1.0108385624643468e-05, + "loss": 8.802, + "step": 1777 + }, + { + "epoch": 0.30429573849050146, + "grad_norm": 39.62715148925781, + "learning_rate": 1.0114090131203651e-05, + "loss": 5.1068, + "step": 1778 + }, + { + "epoch": 0.3044668834502824, + "grad_norm": 14.615751266479492, + "learning_rate": 1.0119794637763833e-05, + "loss": 1.2298, + "step": 1779 + }, + { + "epoch": 0.3046380284100633, + "grad_norm": 36.6978874206543, + "learning_rate": 1.0125499144324017e-05, + "loss": 4.1995, + "step": 1780 + }, + { + "epoch": 0.30480917336984426, + "grad_norm": 14.718832015991211, + "learning_rate": 1.0131203650884198e-05, + "loss": 1.1796, + "step": 1781 + }, + { + "epoch": 0.3049803183296252, + "grad_norm": 36.830204010009766, + "learning_rate": 1.013690815744438e-05, + "loss": 4.1858, + "step": 1782 + }, + { + "epoch": 0.3051514632894061, + "grad_norm": 23.391765594482422, + "learning_rate": 1.0142612664004565e-05, + "loss": 2.4115, + "step": 1783 + }, + { + "epoch": 0.30532260824918706, + "grad_norm": 35.27947998046875, + "learning_rate": 1.0148317170564747e-05, + "loss": 4.8061, + "step": 1784 + }, + { + "epoch": 0.305493753208968, + "grad_norm": 10.68021297454834, + "learning_rate": 1.015402167712493e-05, + "loss": 2.1324, + "step": 1785 + }, + { + "epoch": 0.3056648981687489, + "grad_norm": 23.529436111450195, + "learning_rate": 1.0159726183685112e-05, + "loss": 2.7194, + "step": 1786 + }, + { + "epoch": 0.30583604312852986, + "grad_norm": 32.76841354370117, + "learning_rate": 1.0165430690245295e-05, + "loss": 3.9735, + "step": 1787 + }, + { + "epoch": 0.3060071880883108, + "grad_norm": 20.872732162475586, + "learning_rate": 1.0171135196805477e-05, + "loss": 2.3385, + "step": 1788 + }, + { + "epoch": 0.3061783330480917, + "grad_norm": 14.08251953125, + "learning_rate": 1.0176839703365658e-05, + "loss": 1.8159, + "step": 1789 + }, + { + "epoch": 0.30634947800787266, + "grad_norm": 39.58723831176758, + "learning_rate": 1.0182544209925842e-05, + "loss": 4.7749, + "step": 1790 + }, + { + "epoch": 0.3065206229676536, + "grad_norm": 65.20591735839844, + "learning_rate": 1.0188248716486024e-05, + "loss": 6.4724, + "step": 1791 + }, + { + "epoch": 0.3066917679274345, + "grad_norm": 44.97452926635742, + "learning_rate": 1.0193953223046207e-05, + "loss": 4.9313, + "step": 1792 + }, + { + "epoch": 0.30686291288721546, + "grad_norm": 35.091163635253906, + "learning_rate": 1.0199657729606389e-05, + "loss": 3.4266, + "step": 1793 + }, + { + "epoch": 0.3070340578469964, + "grad_norm": 17.238380432128906, + "learning_rate": 1.020536223616657e-05, + "loss": 1.4114, + "step": 1794 + }, + { + "epoch": 0.3072052028067773, + "grad_norm": 12.661242485046387, + "learning_rate": 1.0211066742726755e-05, + "loss": 2.2799, + "step": 1795 + }, + { + "epoch": 0.30737634776655826, + "grad_norm": 29.67556381225586, + "learning_rate": 1.0216771249286937e-05, + "loss": 2.9217, + "step": 1796 + }, + { + "epoch": 0.3075474927263392, + "grad_norm": 34.465126037597656, + "learning_rate": 1.022247575584712e-05, + "loss": 3.9674, + "step": 1797 + }, + { + "epoch": 0.3077186376861201, + "grad_norm": 66.6548080444336, + "learning_rate": 1.0228180262407302e-05, + "loss": 6.0514, + "step": 1798 + }, + { + "epoch": 0.30788978264590106, + "grad_norm": 36.210044860839844, + "learning_rate": 1.0233884768967485e-05, + "loss": 4.2555, + "step": 1799 + }, + { + "epoch": 0.308060927605682, + "grad_norm": 24.441967010498047, + "learning_rate": 1.0239589275527667e-05, + "loss": 2.5473, + "step": 1800 + }, + { + "epoch": 0.3082320725654629, + "grad_norm": 20.574525833129883, + "learning_rate": 1.0245293782087849e-05, + "loss": 1.6693, + "step": 1801 + }, + { + "epoch": 0.30840321752524386, + "grad_norm": 26.07015037536621, + "learning_rate": 1.0250998288648032e-05, + "loss": 2.7451, + "step": 1802 + }, + { + "epoch": 0.3085743624850248, + "grad_norm": 29.663963317871094, + "learning_rate": 1.0256702795208214e-05, + "loss": 4.0482, + "step": 1803 + }, + { + "epoch": 0.3087455074448058, + "grad_norm": 27.77281379699707, + "learning_rate": 1.0262407301768397e-05, + "loss": 3.0752, + "step": 1804 + }, + { + "epoch": 0.3089166524045867, + "grad_norm": 34.827430725097656, + "learning_rate": 1.0268111808328579e-05, + "loss": 3.7669, + "step": 1805 + }, + { + "epoch": 0.30908779736436764, + "grad_norm": 37.112361907958984, + "learning_rate": 1.0273816314888762e-05, + "loss": 4.7788, + "step": 1806 + }, + { + "epoch": 0.3092589423241486, + "grad_norm": 53.2462272644043, + "learning_rate": 1.0279520821448946e-05, + "loss": 8.0593, + "step": 1807 + }, + { + "epoch": 0.3094300872839295, + "grad_norm": 38.18441390991211, + "learning_rate": 1.0285225328009127e-05, + "loss": 4.2028, + "step": 1808 + }, + { + "epoch": 0.30960123224371044, + "grad_norm": 13.605740547180176, + "learning_rate": 1.029092983456931e-05, + "loss": 2.5679, + "step": 1809 + }, + { + "epoch": 0.3097723772034914, + "grad_norm": 37.292240142822266, + "learning_rate": 1.0296634341129492e-05, + "loss": 4.0864, + "step": 1810 + }, + { + "epoch": 0.3099435221632723, + "grad_norm": 10.673694610595703, + "learning_rate": 1.0302338847689676e-05, + "loss": 0.953, + "step": 1811 + }, + { + "epoch": 0.31011466712305324, + "grad_norm": 30.847604751586914, + "learning_rate": 1.0308043354249858e-05, + "loss": 4.4181, + "step": 1812 + }, + { + "epoch": 0.3102858120828342, + "grad_norm": 25.303640365600586, + "learning_rate": 1.031374786081004e-05, + "loss": 3.0808, + "step": 1813 + }, + { + "epoch": 0.3104569570426151, + "grad_norm": 31.284347534179688, + "learning_rate": 1.0319452367370223e-05, + "loss": 3.3148, + "step": 1814 + }, + { + "epoch": 0.31062810200239604, + "grad_norm": 18.292266845703125, + "learning_rate": 1.0325156873930404e-05, + "loss": 1.4786, + "step": 1815 + }, + { + "epoch": 0.310799246962177, + "grad_norm": 93.66471099853516, + "learning_rate": 1.0330861380490588e-05, + "loss": 6.8127, + "step": 1816 + }, + { + "epoch": 0.3109703919219579, + "grad_norm": 38.12440872192383, + "learning_rate": 1.033656588705077e-05, + "loss": 5.0019, + "step": 1817 + }, + { + "epoch": 0.31114153688173884, + "grad_norm": 32.61493682861328, + "learning_rate": 1.0342270393610954e-05, + "loss": 4.3171, + "step": 1818 + }, + { + "epoch": 0.3113126818415198, + "grad_norm": 38.087646484375, + "learning_rate": 1.0347974900171136e-05, + "loss": 7.6945, + "step": 1819 + }, + { + "epoch": 0.3114838268013007, + "grad_norm": 21.899497985839844, + "learning_rate": 1.0353679406731318e-05, + "loss": 1.7206, + "step": 1820 + }, + { + "epoch": 0.31165497176108165, + "grad_norm": 113.81354522705078, + "learning_rate": 1.0359383913291501e-05, + "loss": 7.2513, + "step": 1821 + }, + { + "epoch": 0.3118261167208626, + "grad_norm": 11.316397666931152, + "learning_rate": 1.0365088419851683e-05, + "loss": 2.1259, + "step": 1822 + }, + { + "epoch": 0.3119972616806435, + "grad_norm": 26.67529296875, + "learning_rate": 1.0370792926411866e-05, + "loss": 3.1664, + "step": 1823 + }, + { + "epoch": 0.31216840664042445, + "grad_norm": 25.253353118896484, + "learning_rate": 1.0376497432972048e-05, + "loss": 2.4222, + "step": 1824 + }, + { + "epoch": 0.3123395516002054, + "grad_norm": 8.143440246582031, + "learning_rate": 1.038220193953223e-05, + "loss": 0.7973, + "step": 1825 + }, + { + "epoch": 0.3125106965599863, + "grad_norm": 19.66392707824707, + "learning_rate": 1.0387906446092413e-05, + "loss": 1.5552, + "step": 1826 + }, + { + "epoch": 0.31268184151976725, + "grad_norm": 23.67314910888672, + "learning_rate": 1.0393610952652595e-05, + "loss": 3.07, + "step": 1827 + }, + { + "epoch": 0.3128529864795482, + "grad_norm": 26.236251831054688, + "learning_rate": 1.0399315459212778e-05, + "loss": 3.1091, + "step": 1828 + }, + { + "epoch": 0.3130241314393291, + "grad_norm": 28.10502815246582, + "learning_rate": 1.0405019965772961e-05, + "loss": 3.0707, + "step": 1829 + }, + { + "epoch": 0.31319527639911005, + "grad_norm": 34.508846282958984, + "learning_rate": 1.0410724472333145e-05, + "loss": 4.872, + "step": 1830 + }, + { + "epoch": 0.313366421358891, + "grad_norm": 34.22414016723633, + "learning_rate": 1.0416428978893326e-05, + "loss": 3.3169, + "step": 1831 + }, + { + "epoch": 0.3135375663186719, + "grad_norm": 46.06840515136719, + "learning_rate": 1.0422133485453508e-05, + "loss": 7.9438, + "step": 1832 + }, + { + "epoch": 0.31370871127845285, + "grad_norm": 18.041322708129883, + "learning_rate": 1.0427837992013692e-05, + "loss": 1.629, + "step": 1833 + }, + { + "epoch": 0.3138798562382338, + "grad_norm": 14.525741577148438, + "learning_rate": 1.0433542498573873e-05, + "loss": 1.1969, + "step": 1834 + }, + { + "epoch": 0.3140510011980147, + "grad_norm": 4.135936260223389, + "learning_rate": 1.0439247005134057e-05, + "loss": 0.7184, + "step": 1835 + }, + { + "epoch": 0.31422214615779565, + "grad_norm": 41.9599723815918, + "learning_rate": 1.0444951511694238e-05, + "loss": 4.2524, + "step": 1836 + }, + { + "epoch": 0.3143932911175766, + "grad_norm": 7.373823642730713, + "learning_rate": 1.0450656018254422e-05, + "loss": 1.8983, + "step": 1837 + }, + { + "epoch": 0.3145644360773575, + "grad_norm": 31.084392547607422, + "learning_rate": 1.0456360524814603e-05, + "loss": 4.0436, + "step": 1838 + }, + { + "epoch": 0.31473558103713845, + "grad_norm": 11.967267036437988, + "learning_rate": 1.0462065031374785e-05, + "loss": 1.0282, + "step": 1839 + }, + { + "epoch": 0.3149067259969194, + "grad_norm": 33.466121673583984, + "learning_rate": 1.046776953793497e-05, + "loss": 3.9262, + "step": 1840 + }, + { + "epoch": 0.3150778709567003, + "grad_norm": 39.21562576293945, + "learning_rate": 1.0473474044495152e-05, + "loss": 4.844, + "step": 1841 + }, + { + "epoch": 0.31524901591648125, + "grad_norm": 33.843055725097656, + "learning_rate": 1.0479178551055335e-05, + "loss": 3.5103, + "step": 1842 + }, + { + "epoch": 0.3154201608762622, + "grad_norm": 35.37272644042969, + "learning_rate": 1.0484883057615517e-05, + "loss": 3.584, + "step": 1843 + }, + { + "epoch": 0.3155913058360431, + "grad_norm": 17.376483917236328, + "learning_rate": 1.0490587564175699e-05, + "loss": 1.4993, + "step": 1844 + }, + { + "epoch": 0.31576245079582405, + "grad_norm": 45.614688873291016, + "learning_rate": 1.0496292070735882e-05, + "loss": 8.1587, + "step": 1845 + }, + { + "epoch": 0.315933595755605, + "grad_norm": 31.185443878173828, + "learning_rate": 1.0501996577296064e-05, + "loss": 4.1762, + "step": 1846 + }, + { + "epoch": 0.3161047407153859, + "grad_norm": 33.703514099121094, + "learning_rate": 1.0507701083856247e-05, + "loss": 4.1885, + "step": 1847 + }, + { + "epoch": 0.31627588567516685, + "grad_norm": 24.48247718811035, + "learning_rate": 1.0513405590416429e-05, + "loss": 2.7277, + "step": 1848 + }, + { + "epoch": 0.3164470306349478, + "grad_norm": 25.966876983642578, + "learning_rate": 1.0519110096976612e-05, + "loss": 2.8921, + "step": 1849 + }, + { + "epoch": 0.3166181755947287, + "grad_norm": 35.0124626159668, + "learning_rate": 1.0524814603536794e-05, + "loss": 4.3145, + "step": 1850 + }, + { + "epoch": 0.31678932055450965, + "grad_norm": 33.62586975097656, + "learning_rate": 1.0530519110096975e-05, + "loss": 3.8524, + "step": 1851 + }, + { + "epoch": 0.3169604655142906, + "grad_norm": 30.16233253479004, + "learning_rate": 1.053622361665716e-05, + "loss": 3.3166, + "step": 1852 + }, + { + "epoch": 0.3171316104740715, + "grad_norm": 31.811193466186523, + "learning_rate": 1.0541928123217342e-05, + "loss": 3.5965, + "step": 1853 + }, + { + "epoch": 0.31730275543385245, + "grad_norm": 35.756778717041016, + "learning_rate": 1.0547632629777526e-05, + "loss": 4.4027, + "step": 1854 + }, + { + "epoch": 0.31747390039363343, + "grad_norm": 17.929304122924805, + "learning_rate": 1.0553337136337707e-05, + "loss": 2.2128, + "step": 1855 + }, + { + "epoch": 0.31764504535341437, + "grad_norm": 29.329362869262695, + "learning_rate": 1.0559041642897889e-05, + "loss": 2.4503, + "step": 1856 + }, + { + "epoch": 0.3178161903131953, + "grad_norm": 38.31791305541992, + "learning_rate": 1.0564746149458072e-05, + "loss": 4.1596, + "step": 1857 + }, + { + "epoch": 0.31798733527297623, + "grad_norm": 26.978776931762695, + "learning_rate": 1.0570450656018254e-05, + "loss": 2.5148, + "step": 1858 + }, + { + "epoch": 0.31815848023275717, + "grad_norm": 183.96864318847656, + "learning_rate": 1.0576155162578437e-05, + "loss": 7.8451, + "step": 1859 + }, + { + "epoch": 0.3183296251925381, + "grad_norm": 34.898677825927734, + "learning_rate": 1.0581859669138619e-05, + "loss": 3.631, + "step": 1860 + }, + { + "epoch": 0.31850077015231903, + "grad_norm": 18.749799728393555, + "learning_rate": 1.0587564175698802e-05, + "loss": 1.5066, + "step": 1861 + }, + { + "epoch": 0.31867191511209997, + "grad_norm": 32.26422882080078, + "learning_rate": 1.0593268682258984e-05, + "loss": 4.0466, + "step": 1862 + }, + { + "epoch": 0.3188430600718809, + "grad_norm": 9.538769721984863, + "learning_rate": 1.0598973188819167e-05, + "loss": 1.2133, + "step": 1863 + }, + { + "epoch": 0.31901420503166183, + "grad_norm": 9.156614303588867, + "learning_rate": 1.0604677695379351e-05, + "loss": 0.9202, + "step": 1864 + }, + { + "epoch": 0.31918534999144277, + "grad_norm": 137.56471252441406, + "learning_rate": 1.0610382201939533e-05, + "loss": 7.6205, + "step": 1865 + }, + { + "epoch": 0.3193564949512237, + "grad_norm": 24.30291748046875, + "learning_rate": 1.0616086708499716e-05, + "loss": 2.5704, + "step": 1866 + }, + { + "epoch": 0.31952763991100464, + "grad_norm": 32.78607940673828, + "learning_rate": 1.0621791215059898e-05, + "loss": 3.4866, + "step": 1867 + }, + { + "epoch": 0.31969878487078557, + "grad_norm": 25.44717025756836, + "learning_rate": 1.0627495721620081e-05, + "loss": 2.8747, + "step": 1868 + }, + { + "epoch": 0.3198699298305665, + "grad_norm": 71.5486831665039, + "learning_rate": 1.0633200228180263e-05, + "loss": 6.3834, + "step": 1869 + }, + { + "epoch": 0.32004107479034744, + "grad_norm": 36.36513900756836, + "learning_rate": 1.0638904734740444e-05, + "loss": 3.8896, + "step": 1870 + }, + { + "epoch": 0.32021221975012837, + "grad_norm": 14.369461059570312, + "learning_rate": 1.0644609241300628e-05, + "loss": 1.2576, + "step": 1871 + }, + { + "epoch": 0.3203833647099093, + "grad_norm": 34.6867561340332, + "learning_rate": 1.065031374786081e-05, + "loss": 3.4093, + "step": 1872 + }, + { + "epoch": 0.32055450966969024, + "grad_norm": 21.84122657775879, + "learning_rate": 1.0656018254420993e-05, + "loss": 2.2791, + "step": 1873 + }, + { + "epoch": 0.32072565462947117, + "grad_norm": 21.254135131835938, + "learning_rate": 1.0661722760981174e-05, + "loss": 2.2054, + "step": 1874 + }, + { + "epoch": 0.3208967995892521, + "grad_norm": 33.362220764160156, + "learning_rate": 1.0667427267541358e-05, + "loss": 4.1888, + "step": 1875 + }, + { + "epoch": 0.32106794454903304, + "grad_norm": 63.412601470947266, + "learning_rate": 1.0673131774101541e-05, + "loss": 8.5606, + "step": 1876 + }, + { + "epoch": 0.32123908950881397, + "grad_norm": 14.283455848693848, + "learning_rate": 1.0678836280661723e-05, + "loss": 0.9998, + "step": 1877 + }, + { + "epoch": 0.3214102344685949, + "grad_norm": 35.16504669189453, + "learning_rate": 1.0684540787221906e-05, + "loss": 4.2321, + "step": 1878 + }, + { + "epoch": 0.32158137942837584, + "grad_norm": 12.61963939666748, + "learning_rate": 1.0690245293782088e-05, + "loss": 1.5004, + "step": 1879 + }, + { + "epoch": 0.32175252438815677, + "grad_norm": 32.174076080322266, + "learning_rate": 1.0695949800342271e-05, + "loss": 3.5576, + "step": 1880 + }, + { + "epoch": 0.3219236693479377, + "grad_norm": 30.472043991088867, + "learning_rate": 1.0701654306902453e-05, + "loss": 3.4048, + "step": 1881 + }, + { + "epoch": 0.32209481430771864, + "grad_norm": 84.8609848022461, + "learning_rate": 1.0707358813462635e-05, + "loss": 6.2658, + "step": 1882 + }, + { + "epoch": 0.32226595926749957, + "grad_norm": 25.621240615844727, + "learning_rate": 1.0713063320022818e-05, + "loss": 2.6459, + "step": 1883 + }, + { + "epoch": 0.3224371042272805, + "grad_norm": 79.82257080078125, + "learning_rate": 1.0718767826583e-05, + "loss": 6.3192, + "step": 1884 + }, + { + "epoch": 0.32260824918706144, + "grad_norm": 7.729169845581055, + "learning_rate": 1.0724472333143183e-05, + "loss": 0.825, + "step": 1885 + }, + { + "epoch": 0.32277939414684237, + "grad_norm": 29.313451766967773, + "learning_rate": 1.0730176839703367e-05, + "loss": 2.9915, + "step": 1886 + }, + { + "epoch": 0.3229505391066233, + "grad_norm": 6.555768013000488, + "learning_rate": 1.073588134626355e-05, + "loss": 0.7525, + "step": 1887 + }, + { + "epoch": 0.32312168406640424, + "grad_norm": 35.07060241699219, + "learning_rate": 1.0741585852823732e-05, + "loss": 4.2147, + "step": 1888 + }, + { + "epoch": 0.32329282902618517, + "grad_norm": 10.583313941955566, + "learning_rate": 1.0747290359383913e-05, + "loss": 0.8557, + "step": 1889 + }, + { + "epoch": 0.3234639739859661, + "grad_norm": 26.075578689575195, + "learning_rate": 1.0752994865944097e-05, + "loss": 2.9433, + "step": 1890 + }, + { + "epoch": 0.32363511894574704, + "grad_norm": 17.7381591796875, + "learning_rate": 1.0758699372504278e-05, + "loss": 1.4998, + "step": 1891 + }, + { + "epoch": 0.32380626390552797, + "grad_norm": 16.11162567138672, + "learning_rate": 1.0764403879064462e-05, + "loss": 1.2949, + "step": 1892 + }, + { + "epoch": 0.3239774088653089, + "grad_norm": 28.165752410888672, + "learning_rate": 1.0770108385624643e-05, + "loss": 3.4363, + "step": 1893 + }, + { + "epoch": 0.32414855382508984, + "grad_norm": 37.37394714355469, + "learning_rate": 1.0775812892184825e-05, + "loss": 4.7016, + "step": 1894 + }, + { + "epoch": 0.32431969878487077, + "grad_norm": 35.620826721191406, + "learning_rate": 1.0781517398745008e-05, + "loss": 4.4153, + "step": 1895 + }, + { + "epoch": 0.3244908437446517, + "grad_norm": 35.83405303955078, + "learning_rate": 1.078722190530519e-05, + "loss": 4.4295, + "step": 1896 + }, + { + "epoch": 0.32466198870443264, + "grad_norm": 12.846619606018066, + "learning_rate": 1.0792926411865374e-05, + "loss": 1.4411, + "step": 1897 + }, + { + "epoch": 0.32483313366421357, + "grad_norm": 11.455179214477539, + "learning_rate": 1.0798630918425557e-05, + "loss": 1.1335, + "step": 1898 + }, + { + "epoch": 0.3250042786239945, + "grad_norm": 36.278289794921875, + "learning_rate": 1.080433542498574e-05, + "loss": 3.6505, + "step": 1899 + }, + { + "epoch": 0.32517542358377544, + "grad_norm": 37.59969711303711, + "learning_rate": 1.0810039931545922e-05, + "loss": 5.1473, + "step": 1900 + }, + { + "epoch": 0.32534656854355637, + "grad_norm": 27.851537704467773, + "learning_rate": 1.0815744438106104e-05, + "loss": 2.792, + "step": 1901 + }, + { + "epoch": 0.3255177135033373, + "grad_norm": 20.874591827392578, + "learning_rate": 1.0821448944666287e-05, + "loss": 2.5421, + "step": 1902 + }, + { + "epoch": 0.32568885846311824, + "grad_norm": 12.82272720336914, + "learning_rate": 1.0827153451226469e-05, + "loss": 0.9663, + "step": 1903 + }, + { + "epoch": 0.32586000342289917, + "grad_norm": 27.367874145507812, + "learning_rate": 1.0832857957786652e-05, + "loss": 2.6934, + "step": 1904 + }, + { + "epoch": 0.3260311483826801, + "grad_norm": 31.575483322143555, + "learning_rate": 1.0838562464346834e-05, + "loss": 3.3276, + "step": 1905 + }, + { + "epoch": 0.3262022933424611, + "grad_norm": 36.26526641845703, + "learning_rate": 1.0844266970907017e-05, + "loss": 4.196, + "step": 1906 + }, + { + "epoch": 0.326373438302242, + "grad_norm": 20.60125160217285, + "learning_rate": 1.0849971477467199e-05, + "loss": 1.5247, + "step": 1907 + }, + { + "epoch": 0.32654458326202296, + "grad_norm": 19.104351043701172, + "learning_rate": 1.085567598402738e-05, + "loss": 1.9953, + "step": 1908 + }, + { + "epoch": 0.3267157282218039, + "grad_norm": 31.618993759155273, + "learning_rate": 1.0861380490587566e-05, + "loss": 3.2496, + "step": 1909 + }, + { + "epoch": 0.3268868731815848, + "grad_norm": 20.25756072998047, + "learning_rate": 1.0867084997147747e-05, + "loss": 1.4173, + "step": 1910 + }, + { + "epoch": 0.32705801814136576, + "grad_norm": 19.579376220703125, + "learning_rate": 1.087278950370793e-05, + "loss": 1.4559, + "step": 1911 + }, + { + "epoch": 0.3272291631011467, + "grad_norm": 33.51919174194336, + "learning_rate": 1.0878494010268112e-05, + "loss": 4.3546, + "step": 1912 + }, + { + "epoch": 0.3274003080609276, + "grad_norm": 34.54380416870117, + "learning_rate": 1.0884198516828294e-05, + "loss": 3.8532, + "step": 1913 + }, + { + "epoch": 0.32757145302070856, + "grad_norm": 43.39759063720703, + "learning_rate": 1.0889903023388477e-05, + "loss": 5.7, + "step": 1914 + }, + { + "epoch": 0.3277425979804895, + "grad_norm": 31.343278884887695, + "learning_rate": 1.0895607529948659e-05, + "loss": 3.6086, + "step": 1915 + }, + { + "epoch": 0.3279137429402704, + "grad_norm": 37.40540313720703, + "learning_rate": 1.0901312036508843e-05, + "loss": 3.6012, + "step": 1916 + }, + { + "epoch": 0.32808488790005136, + "grad_norm": 10.474573135375977, + "learning_rate": 1.0907016543069024e-05, + "loss": 0.9649, + "step": 1917 + }, + { + "epoch": 0.3282560328598323, + "grad_norm": 26.88408088684082, + "learning_rate": 1.0912721049629208e-05, + "loss": 2.6185, + "step": 1918 + }, + { + "epoch": 0.3284271778196132, + "grad_norm": 24.986539840698242, + "learning_rate": 1.091842555618939e-05, + "loss": 2.0861, + "step": 1919 + }, + { + "epoch": 0.32859832277939416, + "grad_norm": 36.754337310791016, + "learning_rate": 1.0924130062749573e-05, + "loss": 4.4734, + "step": 1920 + }, + { + "epoch": 0.3287694677391751, + "grad_norm": 36.0711555480957, + "learning_rate": 1.0929834569309756e-05, + "loss": 3.7612, + "step": 1921 + }, + { + "epoch": 0.328940612698956, + "grad_norm": 33.72808074951172, + "learning_rate": 1.0935539075869938e-05, + "loss": 3.6817, + "step": 1922 + }, + { + "epoch": 0.32911175765873696, + "grad_norm": 31.21643829345703, + "learning_rate": 1.0941243582430121e-05, + "loss": 3.1247, + "step": 1923 + }, + { + "epoch": 0.3292829026185179, + "grad_norm": 26.2045955657959, + "learning_rate": 1.0946948088990303e-05, + "loss": 3.1474, + "step": 1924 + }, + { + "epoch": 0.3294540475782988, + "grad_norm": 30.681350708007812, + "learning_rate": 1.0952652595550484e-05, + "loss": 3.1958, + "step": 1925 + }, + { + "epoch": 0.32962519253807976, + "grad_norm": 57.95525360107422, + "learning_rate": 1.0958357102110668e-05, + "loss": 8.8044, + "step": 1926 + }, + { + "epoch": 0.3297963374978607, + "grad_norm": 178.06443786621094, + "learning_rate": 1.096406160867085e-05, + "loss": 8.7701, + "step": 1927 + }, + { + "epoch": 0.3299674824576416, + "grad_norm": 35.5237922668457, + "learning_rate": 1.0969766115231033e-05, + "loss": 3.8513, + "step": 1928 + }, + { + "epoch": 0.33013862741742256, + "grad_norm": 39.186771392822266, + "learning_rate": 1.0975470621791215e-05, + "loss": 4.4358, + "step": 1929 + }, + { + "epoch": 0.3303097723772035, + "grad_norm": 25.387964248657227, + "learning_rate": 1.0981175128351398e-05, + "loss": 2.6496, + "step": 1930 + }, + { + "epoch": 0.3304809173369844, + "grad_norm": 41.67265319824219, + "learning_rate": 1.098687963491158e-05, + "loss": 4.5891, + "step": 1931 + }, + { + "epoch": 0.33065206229676536, + "grad_norm": 36.71438217163086, + "learning_rate": 1.0992584141471763e-05, + "loss": 4.1564, + "step": 1932 + }, + { + "epoch": 0.3308232072565463, + "grad_norm": 12.194602012634277, + "learning_rate": 1.0998288648031946e-05, + "loss": 1.3654, + "step": 1933 + }, + { + "epoch": 0.3309943522163272, + "grad_norm": 30.5019474029541, + "learning_rate": 1.1003993154592128e-05, + "loss": 2.9248, + "step": 1934 + }, + { + "epoch": 0.33116549717610816, + "grad_norm": 30.596206665039062, + "learning_rate": 1.1009697661152311e-05, + "loss": 3.6483, + "step": 1935 + }, + { + "epoch": 0.3313366421358891, + "grad_norm": 190.34573364257812, + "learning_rate": 1.1015402167712493e-05, + "loss": 9.976, + "step": 1936 + }, + { + "epoch": 0.33150778709567, + "grad_norm": 23.65143585205078, + "learning_rate": 1.1021106674272677e-05, + "loss": 2.6501, + "step": 1937 + }, + { + "epoch": 0.33167893205545096, + "grad_norm": 32.524288177490234, + "learning_rate": 1.1026811180832858e-05, + "loss": 3.6287, + "step": 1938 + }, + { + "epoch": 0.3318500770152319, + "grad_norm": 24.90087890625, + "learning_rate": 1.103251568739304e-05, + "loss": 2.8126, + "step": 1939 + }, + { + "epoch": 0.3320212219750128, + "grad_norm": 11.670059204101562, + "learning_rate": 1.1038220193953223e-05, + "loss": 0.9268, + "step": 1940 + }, + { + "epoch": 0.33219236693479376, + "grad_norm": 20.560199737548828, + "learning_rate": 1.1043924700513405e-05, + "loss": 2.0298, + "step": 1941 + }, + { + "epoch": 0.3323635118945747, + "grad_norm": 32.11676788330078, + "learning_rate": 1.1049629207073588e-05, + "loss": 3.379, + "step": 1942 + }, + { + "epoch": 0.3325346568543556, + "grad_norm": 31.273881912231445, + "learning_rate": 1.1055333713633772e-05, + "loss": 3.6115, + "step": 1943 + }, + { + "epoch": 0.33270580181413656, + "grad_norm": 76.62176513671875, + "learning_rate": 1.1061038220193953e-05, + "loss": 6.2689, + "step": 1944 + }, + { + "epoch": 0.3328769467739175, + "grad_norm": 29.79790496826172, + "learning_rate": 1.1066742726754137e-05, + "loss": 2.9922, + "step": 1945 + }, + { + "epoch": 0.3330480917336984, + "grad_norm": 28.528804779052734, + "learning_rate": 1.1072447233314318e-05, + "loss": 3.192, + "step": 1946 + }, + { + "epoch": 0.33321923669347936, + "grad_norm": 101.99966430664062, + "learning_rate": 1.1078151739874502e-05, + "loss": 6.9582, + "step": 1947 + }, + { + "epoch": 0.3333903816532603, + "grad_norm": 33.45838165283203, + "learning_rate": 1.1083856246434684e-05, + "loss": 4.3572, + "step": 1948 + }, + { + "epoch": 0.3335615266130412, + "grad_norm": 31.591665267944336, + "learning_rate": 1.1089560752994867e-05, + "loss": 3.7906, + "step": 1949 + }, + { + "epoch": 0.33373267157282216, + "grad_norm": 42.0833740234375, + "learning_rate": 1.1095265259555049e-05, + "loss": 4.95, + "step": 1950 + }, + { + "epoch": 0.3339038165326031, + "grad_norm": 94.96964263916016, + "learning_rate": 1.110096976611523e-05, + "loss": 6.5888, + "step": 1951 + }, + { + "epoch": 0.334074961492384, + "grad_norm": 35.450111389160156, + "learning_rate": 1.1106674272675414e-05, + "loss": 4.8891, + "step": 1952 + }, + { + "epoch": 0.33424610645216496, + "grad_norm": 32.57542037963867, + "learning_rate": 1.1112378779235595e-05, + "loss": 4.2762, + "step": 1953 + }, + { + "epoch": 0.3344172514119459, + "grad_norm": 24.635988235473633, + "learning_rate": 1.1118083285795779e-05, + "loss": 2.6646, + "step": 1954 + }, + { + "epoch": 0.3345883963717268, + "grad_norm": 22.50608253479004, + "learning_rate": 1.1123787792355962e-05, + "loss": 2.1994, + "step": 1955 + }, + { + "epoch": 0.33475954133150776, + "grad_norm": 35.915611267089844, + "learning_rate": 1.1129492298916144e-05, + "loss": 4.3539, + "step": 1956 + }, + { + "epoch": 0.33493068629128875, + "grad_norm": 39.85637283325195, + "learning_rate": 1.1135196805476327e-05, + "loss": 8.0766, + "step": 1957 + }, + { + "epoch": 0.3351018312510697, + "grad_norm": 31.60897445678711, + "learning_rate": 1.1140901312036509e-05, + "loss": 3.5052, + "step": 1958 + }, + { + "epoch": 0.3352729762108506, + "grad_norm": 10.988346099853516, + "learning_rate": 1.1146605818596692e-05, + "loss": 2.0192, + "step": 1959 + }, + { + "epoch": 0.33544412117063155, + "grad_norm": 77.31686401367188, + "learning_rate": 1.1152310325156874e-05, + "loss": 6.7873, + "step": 1960 + }, + { + "epoch": 0.3356152661304125, + "grad_norm": 37.3287239074707, + "learning_rate": 1.1158014831717057e-05, + "loss": 4.5134, + "step": 1961 + }, + { + "epoch": 0.3357864110901934, + "grad_norm": 28.940874099731445, + "learning_rate": 1.1163719338277239e-05, + "loss": 3.5488, + "step": 1962 + }, + { + "epoch": 0.33595755604997435, + "grad_norm": 27.005020141601562, + "learning_rate": 1.116942384483742e-05, + "loss": 3.4131, + "step": 1963 + }, + { + "epoch": 0.3361287010097553, + "grad_norm": 23.171354293823242, + "learning_rate": 1.1175128351397604e-05, + "loss": 3.0202, + "step": 1964 + }, + { + "epoch": 0.3362998459695362, + "grad_norm": 33.08194351196289, + "learning_rate": 1.1180832857957786e-05, + "loss": 3.5406, + "step": 1965 + }, + { + "epoch": 0.33647099092931715, + "grad_norm": 42.914058685302734, + "learning_rate": 1.118653736451797e-05, + "loss": 7.7143, + "step": 1966 + }, + { + "epoch": 0.3366421358890981, + "grad_norm": 6.044030666351318, + "learning_rate": 1.1192241871078152e-05, + "loss": 1.0934, + "step": 1967 + }, + { + "epoch": 0.336813280848879, + "grad_norm": 13.652383804321289, + "learning_rate": 1.1197946377638336e-05, + "loss": 1.2611, + "step": 1968 + }, + { + "epoch": 0.33698442580865995, + "grad_norm": 120.25743103027344, + "learning_rate": 1.1203650884198518e-05, + "loss": 6.9692, + "step": 1969 + }, + { + "epoch": 0.3371555707684409, + "grad_norm": 138.58935546875, + "learning_rate": 1.12093553907587e-05, + "loss": 6.7316, + "step": 1970 + }, + { + "epoch": 0.3373267157282218, + "grad_norm": 30.030006408691406, + "learning_rate": 1.1215059897318883e-05, + "loss": 4.1817, + "step": 1971 + }, + { + "epoch": 0.33749786068800275, + "grad_norm": 9.535407066345215, + "learning_rate": 1.1220764403879064e-05, + "loss": 0.9512, + "step": 1972 + }, + { + "epoch": 0.3376690056477837, + "grad_norm": 25.748254776000977, + "learning_rate": 1.1226468910439248e-05, + "loss": 3.1973, + "step": 1973 + }, + { + "epoch": 0.3378401506075646, + "grad_norm": 29.184724807739258, + "learning_rate": 1.123217341699943e-05, + "loss": 3.5403, + "step": 1974 + }, + { + "epoch": 0.33801129556734555, + "grad_norm": 36.09633255004883, + "learning_rate": 1.1237877923559611e-05, + "loss": 4.1013, + "step": 1975 + }, + { + "epoch": 0.3381824405271265, + "grad_norm": 31.967252731323242, + "learning_rate": 1.1243582430119794e-05, + "loss": 3.2354, + "step": 1976 + }, + { + "epoch": 0.3383535854869074, + "grad_norm": 38.74686813354492, + "learning_rate": 1.1249286936679976e-05, + "loss": 4.5663, + "step": 1977 + }, + { + "epoch": 0.33852473044668835, + "grad_norm": 30.3746395111084, + "learning_rate": 1.1254991443240161e-05, + "loss": 3.3973, + "step": 1978 + }, + { + "epoch": 0.3386958754064693, + "grad_norm": 11.366987228393555, + "learning_rate": 1.1260695949800343e-05, + "loss": 0.8323, + "step": 1979 + }, + { + "epoch": 0.3388670203662502, + "grad_norm": 20.15157699584961, + "learning_rate": 1.1266400456360526e-05, + "loss": 1.5111, + "step": 1980 + }, + { + "epoch": 0.33903816532603115, + "grad_norm": 25.638330459594727, + "learning_rate": 1.1272104962920708e-05, + "loss": 2.7039, + "step": 1981 + }, + { + "epoch": 0.3392093102858121, + "grad_norm": 30.38153839111328, + "learning_rate": 1.127780946948089e-05, + "loss": 3.6275, + "step": 1982 + }, + { + "epoch": 0.339380455245593, + "grad_norm": 31.235469818115234, + "learning_rate": 1.1283513976041073e-05, + "loss": 4.032, + "step": 1983 + }, + { + "epoch": 0.33955160020537395, + "grad_norm": 36.95757293701172, + "learning_rate": 1.1289218482601255e-05, + "loss": 4.052, + "step": 1984 + }, + { + "epoch": 0.3397227451651549, + "grad_norm": 5.83810567855835, + "learning_rate": 1.1294922989161438e-05, + "loss": 0.7531, + "step": 1985 + }, + { + "epoch": 0.3398938901249358, + "grad_norm": 187.32872009277344, + "learning_rate": 1.130062749572162e-05, + "loss": 8.1223, + "step": 1986 + }, + { + "epoch": 0.34006503508471675, + "grad_norm": 10.221015930175781, + "learning_rate": 1.1306332002281803e-05, + "loss": 1.3128, + "step": 1987 + }, + { + "epoch": 0.3402361800444977, + "grad_norm": 23.46990203857422, + "learning_rate": 1.1312036508841985e-05, + "loss": 2.2877, + "step": 1988 + }, + { + "epoch": 0.3404073250042786, + "grad_norm": 204.71218872070312, + "learning_rate": 1.1317741015402168e-05, + "loss": 9.0911, + "step": 1989 + }, + { + "epoch": 0.34057846996405955, + "grad_norm": 11.691418647766113, + "learning_rate": 1.1323445521962352e-05, + "loss": 2.0669, + "step": 1990 + }, + { + "epoch": 0.3407496149238405, + "grad_norm": 34.32474899291992, + "learning_rate": 1.1329150028522533e-05, + "loss": 3.8131, + "step": 1991 + }, + { + "epoch": 0.3409207598836214, + "grad_norm": 15.316189765930176, + "learning_rate": 1.1334854535082717e-05, + "loss": 1.4449, + "step": 1992 + }, + { + "epoch": 0.34109190484340235, + "grad_norm": 33.847110748291016, + "learning_rate": 1.1340559041642898e-05, + "loss": 3.6209, + "step": 1993 + }, + { + "epoch": 0.3412630498031833, + "grad_norm": 30.83047103881836, + "learning_rate": 1.134626354820308e-05, + "loss": 3.3044, + "step": 1994 + }, + { + "epoch": 0.3414341947629642, + "grad_norm": 23.169050216674805, + "learning_rate": 1.1351968054763263e-05, + "loss": 2.7778, + "step": 1995 + }, + { + "epoch": 0.34160533972274515, + "grad_norm": 28.009946823120117, + "learning_rate": 1.1357672561323445e-05, + "loss": 2.5658, + "step": 1996 + }, + { + "epoch": 0.3417764846825261, + "grad_norm": 24.620206832885742, + "learning_rate": 1.1363377067883628e-05, + "loss": 2.8611, + "step": 1997 + }, + { + "epoch": 0.341947629642307, + "grad_norm": 35.302894592285156, + "learning_rate": 1.136908157444381e-05, + "loss": 3.8368, + "step": 1998 + }, + { + "epoch": 0.34211877460208795, + "grad_norm": 48.49169921875, + "learning_rate": 1.1374786081003993e-05, + "loss": 8.3039, + "step": 1999 + }, + { + "epoch": 0.3422899195618689, + "grad_norm": 26.473003387451172, + "learning_rate": 1.1380490587564177e-05, + "loss": 2.6571, + "step": 2000 + }, + { + "epoch": 0.3424610645216498, + "grad_norm": 8.975080490112305, + "learning_rate": 1.1386195094124359e-05, + "loss": 0.8311, + "step": 2001 + }, + { + "epoch": 0.34263220948143075, + "grad_norm": 29.154399871826172, + "learning_rate": 1.1391899600684542e-05, + "loss": 3.3092, + "step": 2002 + }, + { + "epoch": 0.3428033544412117, + "grad_norm": 9.116958618164062, + "learning_rate": 1.1397604107244724e-05, + "loss": 1.109, + "step": 2003 + }, + { + "epoch": 0.3429744994009926, + "grad_norm": 150.9268341064453, + "learning_rate": 1.1403308613804907e-05, + "loss": 6.7063, + "step": 2004 + }, + { + "epoch": 0.34314564436077355, + "grad_norm": 28.97213363647461, + "learning_rate": 1.1409013120365089e-05, + "loss": 3.4316, + "step": 2005 + }, + { + "epoch": 0.3433167893205545, + "grad_norm": 35.343074798583984, + "learning_rate": 1.1414717626925272e-05, + "loss": 4.1921, + "step": 2006 + }, + { + "epoch": 0.34348793428033547, + "grad_norm": 26.21539306640625, + "learning_rate": 1.1420422133485454e-05, + "loss": 2.8775, + "step": 2007 + }, + { + "epoch": 0.3436590792401164, + "grad_norm": 24.8580322265625, + "learning_rate": 1.1426126640045635e-05, + "loss": 2.7428, + "step": 2008 + }, + { + "epoch": 0.34383022419989734, + "grad_norm": 18.229679107666016, + "learning_rate": 1.1431831146605819e-05, + "loss": 2.1508, + "step": 2009 + }, + { + "epoch": 0.34400136915967827, + "grad_norm": 12.01388168334961, + "learning_rate": 1.1437535653166e-05, + "loss": 1.002, + "step": 2010 + }, + { + "epoch": 0.3441725141194592, + "grad_norm": 101.5674819946289, + "learning_rate": 1.1443240159726184e-05, + "loss": 6.9708, + "step": 2011 + }, + { + "epoch": 0.34434365907924014, + "grad_norm": 135.65138244628906, + "learning_rate": 1.1448944666286367e-05, + "loss": 6.0953, + "step": 2012 + }, + { + "epoch": 0.34451480403902107, + "grad_norm": 28.10844612121582, + "learning_rate": 1.1454649172846549e-05, + "loss": 3.5016, + "step": 2013 + }, + { + "epoch": 0.344685948998802, + "grad_norm": 31.837894439697266, + "learning_rate": 1.1460353679406732e-05, + "loss": 3.2448, + "step": 2014 + }, + { + "epoch": 0.34485709395858294, + "grad_norm": 28.26076889038086, + "learning_rate": 1.1466058185966914e-05, + "loss": 3.1378, + "step": 2015 + }, + { + "epoch": 0.34502823891836387, + "grad_norm": 32.99501419067383, + "learning_rate": 1.1471762692527097e-05, + "loss": 3.4328, + "step": 2016 + }, + { + "epoch": 0.3451993838781448, + "grad_norm": 31.268230438232422, + "learning_rate": 1.1477467199087279e-05, + "loss": 4.0378, + "step": 2017 + }, + { + "epoch": 0.34537052883792574, + "grad_norm": 32.19254684448242, + "learning_rate": 1.1483171705647462e-05, + "loss": 4.356, + "step": 2018 + }, + { + "epoch": 0.34554167379770667, + "grad_norm": 28.953779220581055, + "learning_rate": 1.1488876212207644e-05, + "loss": 3.8967, + "step": 2019 + }, + { + "epoch": 0.3457128187574876, + "grad_norm": 26.264999389648438, + "learning_rate": 1.1494580718767826e-05, + "loss": 2.7881, + "step": 2020 + }, + { + "epoch": 0.34588396371726854, + "grad_norm": 21.80779457092285, + "learning_rate": 1.150028522532801e-05, + "loss": 2.0569, + "step": 2021 + }, + { + "epoch": 0.34605510867704947, + "grad_norm": 5.897726535797119, + "learning_rate": 1.1505989731888191e-05, + "loss": 0.6854, + "step": 2022 + }, + { + "epoch": 0.3462262536368304, + "grad_norm": 18.685945510864258, + "learning_rate": 1.1511694238448376e-05, + "loss": 1.7189, + "step": 2023 + }, + { + "epoch": 0.34639739859661134, + "grad_norm": 16.55164909362793, + "learning_rate": 1.1517398745008558e-05, + "loss": 1.6266, + "step": 2024 + }, + { + "epoch": 0.3465685435563923, + "grad_norm": 26.497346878051758, + "learning_rate": 1.152310325156874e-05, + "loss": 3.1355, + "step": 2025 + }, + { + "epoch": 0.3467396885161732, + "grad_norm": 36.22391128540039, + "learning_rate": 1.1528807758128923e-05, + "loss": 4.2871, + "step": 2026 + }, + { + "epoch": 0.34691083347595414, + "grad_norm": 25.69757080078125, + "learning_rate": 1.1534512264689104e-05, + "loss": 2.4604, + "step": 2027 + }, + { + "epoch": 0.3470819784357351, + "grad_norm": 34.47371292114258, + "learning_rate": 1.1540216771249288e-05, + "loss": 4.5727, + "step": 2028 + }, + { + "epoch": 0.347253123395516, + "grad_norm": 25.829330444335938, + "learning_rate": 1.154592127780947e-05, + "loss": 2.3708, + "step": 2029 + }, + { + "epoch": 0.34742426835529694, + "grad_norm": 23.152074813842773, + "learning_rate": 1.1551625784369653e-05, + "loss": 2.5885, + "step": 2030 + }, + { + "epoch": 0.3475954133150779, + "grad_norm": 33.27009582519531, + "learning_rate": 1.1557330290929834e-05, + "loss": 4.0326, + "step": 2031 + }, + { + "epoch": 0.3477665582748588, + "grad_norm": 11.642922401428223, + "learning_rate": 1.1563034797490016e-05, + "loss": 1.3036, + "step": 2032 + }, + { + "epoch": 0.34793770323463974, + "grad_norm": 16.035924911499023, + "learning_rate": 1.15687393040502e-05, + "loss": 1.3584, + "step": 2033 + }, + { + "epoch": 0.3481088481944207, + "grad_norm": 38.5884895324707, + "learning_rate": 1.1574443810610381e-05, + "loss": 5.2381, + "step": 2034 + }, + { + "epoch": 0.3482799931542016, + "grad_norm": 34.79248046875, + "learning_rate": 1.1580148317170566e-05, + "loss": 3.4977, + "step": 2035 + }, + { + "epoch": 0.34845113811398254, + "grad_norm": 24.086618423461914, + "learning_rate": 1.1585852823730748e-05, + "loss": 2.489, + "step": 2036 + }, + { + "epoch": 0.3486222830737635, + "grad_norm": 17.970691680908203, + "learning_rate": 1.1591557330290931e-05, + "loss": 1.2174, + "step": 2037 + }, + { + "epoch": 0.3487934280335444, + "grad_norm": 27.199962615966797, + "learning_rate": 1.1597261836851113e-05, + "loss": 2.4304, + "step": 2038 + }, + { + "epoch": 0.34896457299332534, + "grad_norm": 36.157230377197266, + "learning_rate": 1.1602966343411295e-05, + "loss": 4.5914, + "step": 2039 + }, + { + "epoch": 0.3491357179531063, + "grad_norm": 30.98073387145996, + "learning_rate": 1.1608670849971478e-05, + "loss": 3.1108, + "step": 2040 + }, + { + "epoch": 0.3493068629128872, + "grad_norm": 4.110781192779541, + "learning_rate": 1.161437535653166e-05, + "loss": 0.6784, + "step": 2041 + }, + { + "epoch": 0.34947800787266814, + "grad_norm": 7.259744644165039, + "learning_rate": 1.1620079863091843e-05, + "loss": 0.7546, + "step": 2042 + }, + { + "epoch": 0.3496491528324491, + "grad_norm": 9.056280136108398, + "learning_rate": 1.1625784369652025e-05, + "loss": 0.8102, + "step": 2043 + }, + { + "epoch": 0.34982029779223, + "grad_norm": 17.079927444458008, + "learning_rate": 1.1631488876212207e-05, + "loss": 1.8825, + "step": 2044 + }, + { + "epoch": 0.34999144275201094, + "grad_norm": 5.583414077758789, + "learning_rate": 1.163719338277239e-05, + "loss": 0.6958, + "step": 2045 + }, + { + "epoch": 0.3501625877117919, + "grad_norm": 32.52211380004883, + "learning_rate": 1.1642897889332573e-05, + "loss": 3.8308, + "step": 2046 + }, + { + "epoch": 0.3503337326715728, + "grad_norm": 8.453152656555176, + "learning_rate": 1.1648602395892757e-05, + "loss": 0.9997, + "step": 2047 + }, + { + "epoch": 0.35050487763135374, + "grad_norm": 17.828163146972656, + "learning_rate": 1.1654306902452938e-05, + "loss": 2.0197, + "step": 2048 + }, + { + "epoch": 0.3506760225911347, + "grad_norm": 33.86958312988281, + "learning_rate": 1.1660011409013122e-05, + "loss": 3.5889, + "step": 2049 + }, + { + "epoch": 0.3508471675509156, + "grad_norm": 39.53785705566406, + "learning_rate": 1.1665715915573303e-05, + "loss": 4.3322, + "step": 2050 + }, + { + "epoch": 0.35101831251069654, + "grad_norm": 119.68132019042969, + "learning_rate": 1.1671420422133485e-05, + "loss": 8.5534, + "step": 2051 + }, + { + "epoch": 0.3511894574704775, + "grad_norm": 20.703731536865234, + "learning_rate": 1.1677124928693669e-05, + "loss": 1.9145, + "step": 2052 + }, + { + "epoch": 0.3513606024302584, + "grad_norm": 32.62479019165039, + "learning_rate": 1.168282943525385e-05, + "loss": 3.4411, + "step": 2053 + }, + { + "epoch": 0.35153174739003934, + "grad_norm": 28.38721466064453, + "learning_rate": 1.1688533941814034e-05, + "loss": 2.913, + "step": 2054 + }, + { + "epoch": 0.3517028923498203, + "grad_norm": 11.139078140258789, + "learning_rate": 1.1694238448374215e-05, + "loss": 1.2331, + "step": 2055 + }, + { + "epoch": 0.3518740373096012, + "grad_norm": 36.095458984375, + "learning_rate": 1.1699942954934399e-05, + "loss": 4.4497, + "step": 2056 + }, + { + "epoch": 0.35204518226938214, + "grad_norm": 17.7105655670166, + "learning_rate": 1.170564746149458e-05, + "loss": 1.341, + "step": 2057 + }, + { + "epoch": 0.35221632722916313, + "grad_norm": 34.70029067993164, + "learning_rate": 1.1711351968054764e-05, + "loss": 3.8577, + "step": 2058 + }, + { + "epoch": 0.35238747218894406, + "grad_norm": 30.967939376831055, + "learning_rate": 1.1717056474614947e-05, + "loss": 3.5998, + "step": 2059 + }, + { + "epoch": 0.352558617148725, + "grad_norm": 175.67909240722656, + "learning_rate": 1.1722760981175129e-05, + "loss": 7.5725, + "step": 2060 + }, + { + "epoch": 0.35272976210850593, + "grad_norm": 14.09093189239502, + "learning_rate": 1.1728465487735312e-05, + "loss": 1.1863, + "step": 2061 + }, + { + "epoch": 0.35290090706828686, + "grad_norm": 16.4505672454834, + "learning_rate": 1.1734169994295494e-05, + "loss": 1.3923, + "step": 2062 + }, + { + "epoch": 0.3530720520280678, + "grad_norm": 30.69254493713379, + "learning_rate": 1.1739874500855676e-05, + "loss": 4.0609, + "step": 2063 + }, + { + "epoch": 0.35324319698784873, + "grad_norm": 35.82154846191406, + "learning_rate": 1.1745579007415859e-05, + "loss": 4.1915, + "step": 2064 + }, + { + "epoch": 0.35341434194762966, + "grad_norm": 34.619754791259766, + "learning_rate": 1.175128351397604e-05, + "loss": 4.8903, + "step": 2065 + }, + { + "epoch": 0.3535854869074106, + "grad_norm": 13.456661224365234, + "learning_rate": 1.1756988020536224e-05, + "loss": 1.4971, + "step": 2066 + }, + { + "epoch": 0.35375663186719153, + "grad_norm": 34.76420974731445, + "learning_rate": 1.1762692527096406e-05, + "loss": 3.9249, + "step": 2067 + }, + { + "epoch": 0.35392777682697246, + "grad_norm": 11.180761337280273, + "learning_rate": 1.1768397033656589e-05, + "loss": 0.7, + "step": 2068 + }, + { + "epoch": 0.3540989217867534, + "grad_norm": 195.03485107421875, + "learning_rate": 1.1774101540216772e-05, + "loss": 6.9708, + "step": 2069 + }, + { + "epoch": 0.35427006674653433, + "grad_norm": 34.15081787109375, + "learning_rate": 1.1779806046776954e-05, + "loss": 4.0197, + "step": 2070 + }, + { + "epoch": 0.35444121170631526, + "grad_norm": 44.15553283691406, + "learning_rate": 1.1785510553337137e-05, + "loss": 8.034, + "step": 2071 + }, + { + "epoch": 0.3546123566660962, + "grad_norm": 36.1580924987793, + "learning_rate": 1.1791215059897319e-05, + "loss": 4.3774, + "step": 2072 + }, + { + "epoch": 0.35478350162587713, + "grad_norm": 37.583351135253906, + "learning_rate": 1.1796919566457503e-05, + "loss": 5.0443, + "step": 2073 + }, + { + "epoch": 0.35495464658565806, + "grad_norm": 7.443456172943115, + "learning_rate": 1.1802624073017684e-05, + "loss": 0.7081, + "step": 2074 + }, + { + "epoch": 0.355125791545439, + "grad_norm": 27.195236206054688, + "learning_rate": 1.1808328579577866e-05, + "loss": 2.7896, + "step": 2075 + }, + { + "epoch": 0.35529693650521993, + "grad_norm": 10.81725788116455, + "learning_rate": 1.181403308613805e-05, + "loss": 2.1049, + "step": 2076 + }, + { + "epoch": 0.35546808146500086, + "grad_norm": 32.889869689941406, + "learning_rate": 1.1819737592698231e-05, + "loss": 3.9205, + "step": 2077 + }, + { + "epoch": 0.3556392264247818, + "grad_norm": 119.37525939941406, + "learning_rate": 1.1825442099258414e-05, + "loss": 7.0729, + "step": 2078 + }, + { + "epoch": 0.35581037138456273, + "grad_norm": 13.211540222167969, + "learning_rate": 1.1831146605818596e-05, + "loss": 1.5046, + "step": 2079 + }, + { + "epoch": 0.35598151634434366, + "grad_norm": 29.677011489868164, + "learning_rate": 1.183685111237878e-05, + "loss": 3.4441, + "step": 2080 + }, + { + "epoch": 0.3561526613041246, + "grad_norm": 116.09097290039062, + "learning_rate": 1.1842555618938963e-05, + "loss": 6.9657, + "step": 2081 + }, + { + "epoch": 0.35632380626390553, + "grad_norm": 36.9529914855957, + "learning_rate": 1.1848260125499144e-05, + "loss": 5.1966, + "step": 2082 + }, + { + "epoch": 0.35649495122368646, + "grad_norm": 32.45378112792969, + "learning_rate": 1.1853964632059328e-05, + "loss": 3.8259, + "step": 2083 + }, + { + "epoch": 0.3566660961834674, + "grad_norm": 28.279193878173828, + "learning_rate": 1.185966913861951e-05, + "loss": 3.0802, + "step": 2084 + }, + { + "epoch": 0.35683724114324833, + "grad_norm": 16.36111831665039, + "learning_rate": 1.1865373645179693e-05, + "loss": 1.6254, + "step": 2085 + }, + { + "epoch": 0.35700838610302926, + "grad_norm": 33.62881851196289, + "learning_rate": 1.1871078151739875e-05, + "loss": 3.482, + "step": 2086 + }, + { + "epoch": 0.3571795310628102, + "grad_norm": 22.785282135009766, + "learning_rate": 1.1876782658300058e-05, + "loss": 2.5492, + "step": 2087 + }, + { + "epoch": 0.35735067602259113, + "grad_norm": 18.783733367919922, + "learning_rate": 1.188248716486024e-05, + "loss": 2.1471, + "step": 2088 + }, + { + "epoch": 0.35752182098237206, + "grad_norm": 25.175399780273438, + "learning_rate": 1.1888191671420421e-05, + "loss": 2.901, + "step": 2089 + }, + { + "epoch": 0.357692965942153, + "grad_norm": 32.070228576660156, + "learning_rate": 1.1893896177980605e-05, + "loss": 4.0126, + "step": 2090 + }, + { + "epoch": 0.35786411090193393, + "grad_norm": 30.165206909179688, + "learning_rate": 1.1899600684540786e-05, + "loss": 3.1196, + "step": 2091 + }, + { + "epoch": 0.35803525586171486, + "grad_norm": 25.695375442504883, + "learning_rate": 1.1905305191100971e-05, + "loss": 2.5124, + "step": 2092 + }, + { + "epoch": 0.3582064008214958, + "grad_norm": 7.505849838256836, + "learning_rate": 1.1911009697661153e-05, + "loss": 1.0043, + "step": 2093 + }, + { + "epoch": 0.35837754578127673, + "grad_norm": 28.15729522705078, + "learning_rate": 1.1916714204221335e-05, + "loss": 3.8256, + "step": 2094 + }, + { + "epoch": 0.35854869074105766, + "grad_norm": 15.077316284179688, + "learning_rate": 1.1922418710781518e-05, + "loss": 0.9039, + "step": 2095 + }, + { + "epoch": 0.3587198357008386, + "grad_norm": 11.068819999694824, + "learning_rate": 1.19281232173417e-05, + "loss": 0.9256, + "step": 2096 + }, + { + "epoch": 0.35889098066061953, + "grad_norm": 30.34836769104004, + "learning_rate": 1.1933827723901883e-05, + "loss": 3.3198, + "step": 2097 + }, + { + "epoch": 0.35906212562040046, + "grad_norm": 92.60661315917969, + "learning_rate": 1.1939532230462065e-05, + "loss": 5.7395, + "step": 2098 + }, + { + "epoch": 0.3592332705801814, + "grad_norm": 26.518394470214844, + "learning_rate": 1.1945236737022248e-05, + "loss": 2.7506, + "step": 2099 + }, + { + "epoch": 0.35940441553996233, + "grad_norm": 4.0069780349731445, + "learning_rate": 1.195094124358243e-05, + "loss": 0.622, + "step": 2100 + }, + { + "epoch": 0.35957556049974326, + "grad_norm": 25.66058349609375, + "learning_rate": 1.1956645750142612e-05, + "loss": 2.4436, + "step": 2101 + }, + { + "epoch": 0.3597467054595242, + "grad_norm": 16.090246200561523, + "learning_rate": 1.1962350256702795e-05, + "loss": 1.4181, + "step": 2102 + }, + { + "epoch": 0.35991785041930513, + "grad_norm": 9.653539657592773, + "learning_rate": 1.1968054763262978e-05, + "loss": 1.1303, + "step": 2103 + }, + { + "epoch": 0.36008899537908606, + "grad_norm": 26.997007369995117, + "learning_rate": 1.1973759269823162e-05, + "loss": 2.8454, + "step": 2104 + }, + { + "epoch": 0.360260140338867, + "grad_norm": 35.292945861816406, + "learning_rate": 1.1979463776383344e-05, + "loss": 4.4265, + "step": 2105 + }, + { + "epoch": 0.36043128529864793, + "grad_norm": 9.962848663330078, + "learning_rate": 1.1985168282943527e-05, + "loss": 1.1083, + "step": 2106 + }, + { + "epoch": 0.36060243025842886, + "grad_norm": 21.34442138671875, + "learning_rate": 1.1990872789503709e-05, + "loss": 1.9815, + "step": 2107 + }, + { + "epoch": 0.3607735752182098, + "grad_norm": 63.102256774902344, + "learning_rate": 1.199657729606389e-05, + "loss": 8.2906, + "step": 2108 + }, + { + "epoch": 0.3609447201779908, + "grad_norm": 31.640159606933594, + "learning_rate": 1.2002281802624074e-05, + "loss": 3.9734, + "step": 2109 + }, + { + "epoch": 0.3611158651377717, + "grad_norm": 29.008909225463867, + "learning_rate": 1.2007986309184255e-05, + "loss": 2.8619, + "step": 2110 + }, + { + "epoch": 0.36128701009755265, + "grad_norm": 158.99563598632812, + "learning_rate": 1.2013690815744439e-05, + "loss": 8.8876, + "step": 2111 + }, + { + "epoch": 0.3614581550573336, + "grad_norm": 12.028635025024414, + "learning_rate": 1.201939532230462e-05, + "loss": 1.1747, + "step": 2112 + }, + { + "epoch": 0.3616293000171145, + "grad_norm": 49.29413986206055, + "learning_rate": 1.2025099828864802e-05, + "loss": 8.4677, + "step": 2113 + }, + { + "epoch": 0.36180044497689545, + "grad_norm": 35.586788177490234, + "learning_rate": 1.2030804335424985e-05, + "loss": 4.3141, + "step": 2114 + }, + { + "epoch": 0.3619715899366764, + "grad_norm": 15.967235565185547, + "learning_rate": 1.2036508841985169e-05, + "loss": 1.4648, + "step": 2115 + }, + { + "epoch": 0.3621427348964573, + "grad_norm": 116.31715393066406, + "learning_rate": 1.2042213348545352e-05, + "loss": 5.9115, + "step": 2116 + }, + { + "epoch": 0.36231387985623825, + "grad_norm": 39.9970703125, + "learning_rate": 1.2047917855105534e-05, + "loss": 5.2751, + "step": 2117 + }, + { + "epoch": 0.3624850248160192, + "grad_norm": 15.636171340942383, + "learning_rate": 1.2053622361665717e-05, + "loss": 1.1331, + "step": 2118 + }, + { + "epoch": 0.3626561697758001, + "grad_norm": 29.51291847229004, + "learning_rate": 1.2059326868225899e-05, + "loss": 3.0782, + "step": 2119 + }, + { + "epoch": 0.36282731473558105, + "grad_norm": 33.99169921875, + "learning_rate": 1.206503137478608e-05, + "loss": 3.4875, + "step": 2120 + }, + { + "epoch": 0.362998459695362, + "grad_norm": 8.469818115234375, + "learning_rate": 1.2070735881346264e-05, + "loss": 0.9351, + "step": 2121 + }, + { + "epoch": 0.3631696046551429, + "grad_norm": 87.96151733398438, + "learning_rate": 1.2076440387906446e-05, + "loss": 5.0553, + "step": 2122 + }, + { + "epoch": 0.36334074961492385, + "grad_norm": 11.59670352935791, + "learning_rate": 1.2082144894466629e-05, + "loss": 1.314, + "step": 2123 + }, + { + "epoch": 0.3635118945747048, + "grad_norm": 7.859058856964111, + "learning_rate": 1.208784940102681e-05, + "loss": 0.9692, + "step": 2124 + }, + { + "epoch": 0.3636830395344857, + "grad_norm": 48.24964904785156, + "learning_rate": 1.2093553907586992e-05, + "loss": 5.6168, + "step": 2125 + }, + { + "epoch": 0.36385418449426665, + "grad_norm": 35.264366149902344, + "learning_rate": 1.2099258414147178e-05, + "loss": 3.7475, + "step": 2126 + }, + { + "epoch": 0.3640253294540476, + "grad_norm": 30.4807071685791, + "learning_rate": 1.210496292070736e-05, + "loss": 3.6681, + "step": 2127 + }, + { + "epoch": 0.3641964744138285, + "grad_norm": 37.583274841308594, + "learning_rate": 1.2110667427267543e-05, + "loss": 7.6763, + "step": 2128 + }, + { + "epoch": 0.36436761937360945, + "grad_norm": 10.553574562072754, + "learning_rate": 1.2116371933827724e-05, + "loss": 0.7119, + "step": 2129 + }, + { + "epoch": 0.3645387643333904, + "grad_norm": 25.893739700317383, + "learning_rate": 1.2122076440387908e-05, + "loss": 2.7102, + "step": 2130 + }, + { + "epoch": 0.3647099092931713, + "grad_norm": 37.81182861328125, + "learning_rate": 1.212778094694809e-05, + "loss": 7.7056, + "step": 2131 + }, + { + "epoch": 0.36488105425295225, + "grad_norm": 24.436336517333984, + "learning_rate": 1.2133485453508271e-05, + "loss": 3.0385, + "step": 2132 + }, + { + "epoch": 0.3650521992127332, + "grad_norm": 33.72613525390625, + "learning_rate": 1.2139189960068454e-05, + "loss": 3.625, + "step": 2133 + }, + { + "epoch": 0.3652233441725141, + "grad_norm": 29.429370880126953, + "learning_rate": 1.2144894466628636e-05, + "loss": 3.2735, + "step": 2134 + }, + { + "epoch": 0.36539448913229505, + "grad_norm": 29.37833595275879, + "learning_rate": 1.215059897318882e-05, + "loss": 3.3102, + "step": 2135 + }, + { + "epoch": 0.365565634092076, + "grad_norm": 4.678672790527344, + "learning_rate": 1.2156303479749001e-05, + "loss": 0.6167, + "step": 2136 + }, + { + "epoch": 0.3657367790518569, + "grad_norm": 13.350298881530762, + "learning_rate": 1.2162007986309185e-05, + "loss": 0.9838, + "step": 2137 + }, + { + "epoch": 0.36590792401163785, + "grad_norm": 197.19981384277344, + "learning_rate": 1.2167712492869368e-05, + "loss": 10.1095, + "step": 2138 + }, + { + "epoch": 0.3660790689714188, + "grad_norm": 33.24477767944336, + "learning_rate": 1.217341699942955e-05, + "loss": 3.6562, + "step": 2139 + }, + { + "epoch": 0.3662502139311997, + "grad_norm": 31.698823928833008, + "learning_rate": 1.2179121505989733e-05, + "loss": 3.1984, + "step": 2140 + }, + { + "epoch": 0.36642135889098065, + "grad_norm": 28.302553176879883, + "learning_rate": 1.2184826012549915e-05, + "loss": 2.9794, + "step": 2141 + }, + { + "epoch": 0.3665925038507616, + "grad_norm": 26.840988159179688, + "learning_rate": 1.2190530519110098e-05, + "loss": 3.1451, + "step": 2142 + }, + { + "epoch": 0.3667636488105425, + "grad_norm": 10.02106761932373, + "learning_rate": 1.219623502567028e-05, + "loss": 1.7728, + "step": 2143 + }, + { + "epoch": 0.36693479377032345, + "grad_norm": 19.4163761138916, + "learning_rate": 1.2201939532230461e-05, + "loss": 1.4892, + "step": 2144 + }, + { + "epoch": 0.3671059387301044, + "grad_norm": 117.40380096435547, + "learning_rate": 1.2207644038790645e-05, + "loss": 6.037, + "step": 2145 + }, + { + "epoch": 0.3672770836898853, + "grad_norm": 36.802330017089844, + "learning_rate": 1.2213348545350826e-05, + "loss": 3.9472, + "step": 2146 + }, + { + "epoch": 0.36744822864966625, + "grad_norm": 26.534914016723633, + "learning_rate": 1.221905305191101e-05, + "loss": 2.9076, + "step": 2147 + }, + { + "epoch": 0.3676193736094472, + "grad_norm": 8.252175331115723, + "learning_rate": 1.2224757558471192e-05, + "loss": 1.7274, + "step": 2148 + }, + { + "epoch": 0.3677905185692281, + "grad_norm": 36.72080993652344, + "learning_rate": 1.2230462065031377e-05, + "loss": 3.9691, + "step": 2149 + }, + { + "epoch": 0.36796166352900905, + "grad_norm": 31.389694213867188, + "learning_rate": 1.2236166571591558e-05, + "loss": 3.662, + "step": 2150 + }, + { + "epoch": 0.36813280848879, + "grad_norm": 17.889827728271484, + "learning_rate": 1.224187107815174e-05, + "loss": 1.3585, + "step": 2151 + }, + { + "epoch": 0.3683039534485709, + "grad_norm": 37.55808639526367, + "learning_rate": 1.2247575584711923e-05, + "loss": 3.733, + "step": 2152 + }, + { + "epoch": 0.36847509840835185, + "grad_norm": 28.830768585205078, + "learning_rate": 1.2253280091272105e-05, + "loss": 2.596, + "step": 2153 + }, + { + "epoch": 0.3686462433681328, + "grad_norm": 11.456624031066895, + "learning_rate": 1.2258984597832288e-05, + "loss": 0.6827, + "step": 2154 + }, + { + "epoch": 0.3688173883279137, + "grad_norm": 29.137744903564453, + "learning_rate": 1.226468910439247e-05, + "loss": 3.4631, + "step": 2155 + }, + { + "epoch": 0.36898853328769465, + "grad_norm": 27.315082550048828, + "learning_rate": 1.2270393610952653e-05, + "loss": 2.4743, + "step": 2156 + }, + { + "epoch": 0.3691596782474756, + "grad_norm": 15.013253211975098, + "learning_rate": 1.2276098117512835e-05, + "loss": 1.2812, + "step": 2157 + }, + { + "epoch": 0.3693308232072565, + "grad_norm": 33.02097702026367, + "learning_rate": 1.2281802624073017e-05, + "loss": 3.1825, + "step": 2158 + }, + { + "epoch": 0.36950196816703745, + "grad_norm": 37.75695037841797, + "learning_rate": 1.22875071306332e-05, + "loss": 3.9585, + "step": 2159 + }, + { + "epoch": 0.36967311312681844, + "grad_norm": 33.56565475463867, + "learning_rate": 1.2293211637193382e-05, + "loss": 3.9576, + "step": 2160 + }, + { + "epoch": 0.3698442580865994, + "grad_norm": 8.82251262664795, + "learning_rate": 1.2298916143753567e-05, + "loss": 1.045, + "step": 2161 + }, + { + "epoch": 0.3700154030463803, + "grad_norm": 26.975778579711914, + "learning_rate": 1.2304620650313749e-05, + "loss": 2.5674, + "step": 2162 + }, + { + "epoch": 0.37018654800616124, + "grad_norm": 136.73780822753906, + "learning_rate": 1.231032515687393e-05, + "loss": 5.8467, + "step": 2163 + }, + { + "epoch": 0.3703576929659422, + "grad_norm": 29.269546508789062, + "learning_rate": 1.2316029663434114e-05, + "loss": 2.9019, + "step": 2164 + }, + { + "epoch": 0.3705288379257231, + "grad_norm": 31.114402770996094, + "learning_rate": 1.2321734169994295e-05, + "loss": 3.803, + "step": 2165 + }, + { + "epoch": 0.37069998288550404, + "grad_norm": 28.02252769470215, + "learning_rate": 1.2327438676554479e-05, + "loss": 3.334, + "step": 2166 + }, + { + "epoch": 0.370871127845285, + "grad_norm": 36.24296951293945, + "learning_rate": 1.233314318311466e-05, + "loss": 4.0221, + "step": 2167 + }, + { + "epoch": 0.3710422728050659, + "grad_norm": 42.49361801147461, + "learning_rate": 1.2338847689674844e-05, + "loss": 4.4893, + "step": 2168 + }, + { + "epoch": 0.37121341776484684, + "grad_norm": 31.110870361328125, + "learning_rate": 1.2344552196235026e-05, + "loss": 3.2998, + "step": 2169 + }, + { + "epoch": 0.3713845627246278, + "grad_norm": 38.54166030883789, + "learning_rate": 1.2350256702795207e-05, + "loss": 3.9307, + "step": 2170 + }, + { + "epoch": 0.3715557076844087, + "grad_norm": 31.027143478393555, + "learning_rate": 1.235596120935539e-05, + "loss": 3.3663, + "step": 2171 + }, + { + "epoch": 0.37172685264418964, + "grad_norm": 19.778564453125, + "learning_rate": 1.2361665715915574e-05, + "loss": 1.4132, + "step": 2172 + }, + { + "epoch": 0.3718979976039706, + "grad_norm": 6.935482025146484, + "learning_rate": 1.2367370222475757e-05, + "loss": 0.7138, + "step": 2173 + }, + { + "epoch": 0.3720691425637515, + "grad_norm": 17.002243041992188, + "learning_rate": 1.2373074729035939e-05, + "loss": 1.363, + "step": 2174 + }, + { + "epoch": 0.37224028752353244, + "grad_norm": 36.1330451965332, + "learning_rate": 1.237877923559612e-05, + "loss": 4.304, + "step": 2175 + }, + { + "epoch": 0.3724114324833134, + "grad_norm": 37.96760940551758, + "learning_rate": 1.2384483742156304e-05, + "loss": 4.1877, + "step": 2176 + }, + { + "epoch": 0.3725825774430943, + "grad_norm": 37.3785400390625, + "learning_rate": 1.2390188248716486e-05, + "loss": 4.2806, + "step": 2177 + }, + { + "epoch": 0.37275372240287524, + "grad_norm": 124.93565368652344, + "learning_rate": 1.239589275527667e-05, + "loss": 5.4911, + "step": 2178 + }, + { + "epoch": 0.3729248673626562, + "grad_norm": 28.42656707763672, + "learning_rate": 1.2401597261836851e-05, + "loss": 2.685, + "step": 2179 + }, + { + "epoch": 0.3730960123224371, + "grad_norm": 44.78040313720703, + "learning_rate": 1.2407301768397034e-05, + "loss": 8.0281, + "step": 2180 + }, + { + "epoch": 0.37326715728221804, + "grad_norm": 106.5615005493164, + "learning_rate": 1.2413006274957216e-05, + "loss": 9.7692, + "step": 2181 + }, + { + "epoch": 0.373438302241999, + "grad_norm": 32.70700454711914, + "learning_rate": 1.2418710781517398e-05, + "loss": 3.7167, + "step": 2182 + }, + { + "epoch": 0.3736094472017799, + "grad_norm": 27.95832633972168, + "learning_rate": 1.2424415288077583e-05, + "loss": 3.4558, + "step": 2183 + }, + { + "epoch": 0.37378059216156084, + "grad_norm": 51.62168502807617, + "learning_rate": 1.2430119794637764e-05, + "loss": 7.8843, + "step": 2184 + }, + { + "epoch": 0.3739517371213418, + "grad_norm": 22.549152374267578, + "learning_rate": 1.2435824301197948e-05, + "loss": 2.2902, + "step": 2185 + }, + { + "epoch": 0.3741228820811227, + "grad_norm": 49.26498031616211, + "learning_rate": 1.244152880775813e-05, + "loss": 7.9836, + "step": 2186 + }, + { + "epoch": 0.37429402704090364, + "grad_norm": 32.918434143066406, + "learning_rate": 1.2447233314318313e-05, + "loss": 3.7321, + "step": 2187 + }, + { + "epoch": 0.3744651720006846, + "grad_norm": 115.87164306640625, + "learning_rate": 1.2452937820878495e-05, + "loss": 6.1209, + "step": 2188 + }, + { + "epoch": 0.3746363169604655, + "grad_norm": 32.60509490966797, + "learning_rate": 1.2458642327438676e-05, + "loss": 4.3652, + "step": 2189 + }, + { + "epoch": 0.37480746192024644, + "grad_norm": 40.821720123291016, + "learning_rate": 1.246434683399886e-05, + "loss": 7.5982, + "step": 2190 + }, + { + "epoch": 0.3749786068800274, + "grad_norm": 30.804649353027344, + "learning_rate": 1.2470051340559041e-05, + "loss": 3.6332, + "step": 2191 + }, + { + "epoch": 0.3751497518398083, + "grad_norm": 28.10482406616211, + "learning_rate": 1.2475755847119225e-05, + "loss": 3.1805, + "step": 2192 + }, + { + "epoch": 0.37532089679958924, + "grad_norm": 5.394840240478516, + "learning_rate": 1.2481460353679406e-05, + "loss": 0.6131, + "step": 2193 + }, + { + "epoch": 0.3754920417593702, + "grad_norm": 22.42398452758789, + "learning_rate": 1.2487164860239588e-05, + "loss": 2.9095, + "step": 2194 + }, + { + "epoch": 0.3756631867191511, + "grad_norm": 31.861984252929688, + "learning_rate": 1.2492869366799773e-05, + "loss": 3.7872, + "step": 2195 + }, + { + "epoch": 0.37583433167893204, + "grad_norm": 30.0163631439209, + "learning_rate": 1.2498573873359955e-05, + "loss": 3.2556, + "step": 2196 + }, + { + "epoch": 0.376005476638713, + "grad_norm": 43.01797103881836, + "learning_rate": 1.2504278379920138e-05, + "loss": 7.4534, + "step": 2197 + }, + { + "epoch": 0.3761766215984939, + "grad_norm": 26.029483795166016, + "learning_rate": 1.250998288648032e-05, + "loss": 3.4138, + "step": 2198 + }, + { + "epoch": 0.37634776655827484, + "grad_norm": 31.733152389526367, + "learning_rate": 1.2515687393040503e-05, + "loss": 3.7965, + "step": 2199 + }, + { + "epoch": 0.3765189115180558, + "grad_norm": 29.86209487915039, + "learning_rate": 1.2521391899600685e-05, + "loss": 3.1073, + "step": 2200 + }, + { + "epoch": 0.3766900564778367, + "grad_norm": 73.94261932373047, + "learning_rate": 1.2527096406160867e-05, + "loss": 6.7022, + "step": 2201 + }, + { + "epoch": 0.37686120143761764, + "grad_norm": 33.266666412353516, + "learning_rate": 1.253280091272105e-05, + "loss": 3.467, + "step": 2202 + }, + { + "epoch": 0.3770323463973986, + "grad_norm": 9.25309944152832, + "learning_rate": 1.2538505419281232e-05, + "loss": 0.9735, + "step": 2203 + }, + { + "epoch": 0.3772034913571795, + "grad_norm": 32.7879753112793, + "learning_rate": 1.2544209925841415e-05, + "loss": 4.3873, + "step": 2204 + }, + { + "epoch": 0.37737463631696044, + "grad_norm": 38.24089813232422, + "learning_rate": 1.2549914432401597e-05, + "loss": 4.1272, + "step": 2205 + }, + { + "epoch": 0.3775457812767414, + "grad_norm": 11.10142707824707, + "learning_rate": 1.2555618938961782e-05, + "loss": 0.8028, + "step": 2206 + }, + { + "epoch": 0.3777169262365223, + "grad_norm": 37.619815826416016, + "learning_rate": 1.2561323445521963e-05, + "loss": 3.8663, + "step": 2207 + }, + { + "epoch": 0.37788807119630324, + "grad_norm": 43.338417053222656, + "learning_rate": 1.2567027952082145e-05, + "loss": 4.6084, + "step": 2208 + }, + { + "epoch": 0.3780592161560842, + "grad_norm": 29.597476959228516, + "learning_rate": 1.2572732458642329e-05, + "loss": 4.3275, + "step": 2209 + }, + { + "epoch": 0.3782303611158651, + "grad_norm": 111.00467681884766, + "learning_rate": 1.257843696520251e-05, + "loss": 6.2678, + "step": 2210 + }, + { + "epoch": 0.3784015060756461, + "grad_norm": 28.328218460083008, + "learning_rate": 1.2584141471762694e-05, + "loss": 3.7021, + "step": 2211 + }, + { + "epoch": 0.37857265103542703, + "grad_norm": 7.334059238433838, + "learning_rate": 1.2589845978322875e-05, + "loss": 1.0266, + "step": 2212 + }, + { + "epoch": 0.37874379599520797, + "grad_norm": 12.333498001098633, + "learning_rate": 1.2595550484883057e-05, + "loss": 1.2937, + "step": 2213 + }, + { + "epoch": 0.3789149409549889, + "grad_norm": 33.395259857177734, + "learning_rate": 1.260125499144324e-05, + "loss": 3.7564, + "step": 2214 + }, + { + "epoch": 0.37908608591476983, + "grad_norm": 12.443466186523438, + "learning_rate": 1.2606959498003422e-05, + "loss": 1.1804, + "step": 2215 + }, + { + "epoch": 0.37925723087455077, + "grad_norm": 29.2781982421875, + "learning_rate": 1.2612664004563605e-05, + "loss": 2.8089, + "step": 2216 + }, + { + "epoch": 0.3794283758343317, + "grad_norm": 30.066843032836914, + "learning_rate": 1.2618368511123787e-05, + "loss": 3.678, + "step": 2217 + }, + { + "epoch": 0.37959952079411263, + "grad_norm": 198.62889099121094, + "learning_rate": 1.2624073017683972e-05, + "loss": 10.5079, + "step": 2218 + }, + { + "epoch": 0.37977066575389357, + "grad_norm": 36.29426574707031, + "learning_rate": 1.2629777524244154e-05, + "loss": 3.7594, + "step": 2219 + }, + { + "epoch": 0.3799418107136745, + "grad_norm": 4.288938522338867, + "learning_rate": 1.2635482030804336e-05, + "loss": 0.6002, + "step": 2220 + }, + { + "epoch": 0.38011295567345543, + "grad_norm": 16.282394409179688, + "learning_rate": 1.2641186537364519e-05, + "loss": 1.2398, + "step": 2221 + }, + { + "epoch": 0.38028410063323637, + "grad_norm": 15.423003196716309, + "learning_rate": 1.26468910439247e-05, + "loss": 1.0447, + "step": 2222 + }, + { + "epoch": 0.3804552455930173, + "grad_norm": 8.580951690673828, + "learning_rate": 1.2652595550484884e-05, + "loss": 1.0967, + "step": 2223 + }, + { + "epoch": 0.38062639055279823, + "grad_norm": 23.481037139892578, + "learning_rate": 1.2658300057045066e-05, + "loss": 3.0215, + "step": 2224 + }, + { + "epoch": 0.38079753551257917, + "grad_norm": 31.463350296020508, + "learning_rate": 1.2664004563605247e-05, + "loss": 3.9185, + "step": 2225 + }, + { + "epoch": 0.3809686804723601, + "grad_norm": 33.95023727416992, + "learning_rate": 1.266970907016543e-05, + "loss": 4.4252, + "step": 2226 + }, + { + "epoch": 0.38113982543214103, + "grad_norm": 32.201377868652344, + "learning_rate": 1.2675413576725612e-05, + "loss": 3.1638, + "step": 2227 + }, + { + "epoch": 0.38131097039192197, + "grad_norm": 33.09391784667969, + "learning_rate": 1.2681118083285796e-05, + "loss": 4.5716, + "step": 2228 + }, + { + "epoch": 0.3814821153517029, + "grad_norm": 89.28120422363281, + "learning_rate": 1.2686822589845979e-05, + "loss": 5.4798, + "step": 2229 + }, + { + "epoch": 0.38165326031148383, + "grad_norm": 18.636362075805664, + "learning_rate": 1.2692527096406163e-05, + "loss": 1.3417, + "step": 2230 + }, + { + "epoch": 0.38182440527126477, + "grad_norm": 108.82768249511719, + "learning_rate": 1.2698231602966344e-05, + "loss": 5.2101, + "step": 2231 + }, + { + "epoch": 0.3819955502310457, + "grad_norm": 32.57135009765625, + "learning_rate": 1.2703936109526526e-05, + "loss": 4.3203, + "step": 2232 + }, + { + "epoch": 0.38216669519082663, + "grad_norm": 33.27009963989258, + "learning_rate": 1.270964061608671e-05, + "loss": 3.9393, + "step": 2233 + }, + { + "epoch": 0.38233784015060757, + "grad_norm": 16.50580406188965, + "learning_rate": 1.2715345122646891e-05, + "loss": 1.5263, + "step": 2234 + }, + { + "epoch": 0.3825089851103885, + "grad_norm": 18.65876579284668, + "learning_rate": 1.2721049629207074e-05, + "loss": 1.948, + "step": 2235 + }, + { + "epoch": 0.38268013007016943, + "grad_norm": 28.283248901367188, + "learning_rate": 1.2726754135767256e-05, + "loss": 2.8414, + "step": 2236 + }, + { + "epoch": 0.38285127502995037, + "grad_norm": 118.61890411376953, + "learning_rate": 1.273245864232744e-05, + "loss": 7.9885, + "step": 2237 + }, + { + "epoch": 0.3830224199897313, + "grad_norm": 16.00472640991211, + "learning_rate": 1.2738163148887621e-05, + "loss": 1.4454, + "step": 2238 + }, + { + "epoch": 0.38319356494951223, + "grad_norm": 18.229719161987305, + "learning_rate": 1.2743867655447803e-05, + "loss": 1.5843, + "step": 2239 + }, + { + "epoch": 0.38336470990929317, + "grad_norm": 26.571413040161133, + "learning_rate": 1.2749572162007986e-05, + "loss": 2.6752, + "step": 2240 + }, + { + "epoch": 0.3835358548690741, + "grad_norm": 66.64990234375, + "learning_rate": 1.275527666856817e-05, + "loss": 5.1676, + "step": 2241 + }, + { + "epoch": 0.38370699982885503, + "grad_norm": 19.84005355834961, + "learning_rate": 1.2760981175128353e-05, + "loss": 1.4974, + "step": 2242 + }, + { + "epoch": 0.38387814478863597, + "grad_norm": 18.671689987182617, + "learning_rate": 1.2766685681688535e-05, + "loss": 1.9852, + "step": 2243 + }, + { + "epoch": 0.3840492897484169, + "grad_norm": 98.68587493896484, + "learning_rate": 1.2772390188248716e-05, + "loss": 4.9745, + "step": 2244 + }, + { + "epoch": 0.38422043470819783, + "grad_norm": 6.933028221130371, + "learning_rate": 1.27780946948089e-05, + "loss": 0.6802, + "step": 2245 + }, + { + "epoch": 0.38439157966797877, + "grad_norm": 18.11700439453125, + "learning_rate": 1.2783799201369081e-05, + "loss": 1.5278, + "step": 2246 + }, + { + "epoch": 0.3845627246277597, + "grad_norm": 18.046253204345703, + "learning_rate": 1.2789503707929265e-05, + "loss": 2.1576, + "step": 2247 + }, + { + "epoch": 0.38473386958754063, + "grad_norm": 44.326602935791016, + "learning_rate": 1.2795208214489446e-05, + "loss": 7.2696, + "step": 2248 + }, + { + "epoch": 0.38490501454732157, + "grad_norm": 95.7645034790039, + "learning_rate": 1.280091272104963e-05, + "loss": 6.0354, + "step": 2249 + }, + { + "epoch": 0.3850761595071025, + "grad_norm": 7.336085796356201, + "learning_rate": 1.2806617227609811e-05, + "loss": 1.6021, + "step": 2250 + }, + { + "epoch": 0.38524730446688343, + "grad_norm": 21.488544464111328, + "learning_rate": 1.2812321734169993e-05, + "loss": 1.9826, + "step": 2251 + }, + { + "epoch": 0.38541844942666437, + "grad_norm": 34.97186279296875, + "learning_rate": 1.2818026240730178e-05, + "loss": 4.3046, + "step": 2252 + }, + { + "epoch": 0.3855895943864453, + "grad_norm": 4.4676008224487305, + "learning_rate": 1.282373074729036e-05, + "loss": 0.6622, + "step": 2253 + }, + { + "epoch": 0.38576073934622623, + "grad_norm": 6.151776313781738, + "learning_rate": 1.2829435253850543e-05, + "loss": 0.6381, + "step": 2254 + }, + { + "epoch": 0.38593188430600717, + "grad_norm": 6.36190938949585, + "learning_rate": 1.2835139760410725e-05, + "loss": 0.6743, + "step": 2255 + }, + { + "epoch": 0.3861030292657881, + "grad_norm": 24.97540283203125, + "learning_rate": 1.2840844266970908e-05, + "loss": 2.6396, + "step": 2256 + }, + { + "epoch": 0.38627417422556903, + "grad_norm": 141.4521026611328, + "learning_rate": 1.284654877353109e-05, + "loss": 4.7593, + "step": 2257 + }, + { + "epoch": 0.38644531918534997, + "grad_norm": 16.442359924316406, + "learning_rate": 1.2852253280091272e-05, + "loss": 1.3891, + "step": 2258 + }, + { + "epoch": 0.3866164641451309, + "grad_norm": 26.524892807006836, + "learning_rate": 1.2857957786651455e-05, + "loss": 2.8351, + "step": 2259 + }, + { + "epoch": 0.38678760910491183, + "grad_norm": 20.284482955932617, + "learning_rate": 1.2863662293211637e-05, + "loss": 2.2276, + "step": 2260 + }, + { + "epoch": 0.3869587540646928, + "grad_norm": 13.217884063720703, + "learning_rate": 1.286936679977182e-05, + "loss": 0.9694, + "step": 2261 + }, + { + "epoch": 0.38712989902447376, + "grad_norm": 33.80121612548828, + "learning_rate": 1.2875071306332002e-05, + "loss": 4.1736, + "step": 2262 + }, + { + "epoch": 0.3873010439842547, + "grad_norm": 33.30670928955078, + "learning_rate": 1.2880775812892185e-05, + "loss": 3.5895, + "step": 2263 + }, + { + "epoch": 0.3874721889440356, + "grad_norm": 24.27392578125, + "learning_rate": 1.2886480319452369e-05, + "loss": 2.6142, + "step": 2264 + }, + { + "epoch": 0.38764333390381656, + "grad_norm": 4.387927055358887, + "learning_rate": 1.289218482601255e-05, + "loss": 0.5611, + "step": 2265 + }, + { + "epoch": 0.3878144788635975, + "grad_norm": 11.723445892333984, + "learning_rate": 1.2897889332572734e-05, + "loss": 1.9691, + "step": 2266 + }, + { + "epoch": 0.3879856238233784, + "grad_norm": 31.290142059326172, + "learning_rate": 1.2903593839132915e-05, + "loss": 4.4918, + "step": 2267 + }, + { + "epoch": 0.38815676878315936, + "grad_norm": 29.301557540893555, + "learning_rate": 1.2909298345693099e-05, + "loss": 3.0265, + "step": 2268 + }, + { + "epoch": 0.3883279137429403, + "grad_norm": 102.96603393554688, + "learning_rate": 1.291500285225328e-05, + "loss": 5.4491, + "step": 2269 + }, + { + "epoch": 0.3884990587027212, + "grad_norm": 6.566992282867432, + "learning_rate": 1.2920707358813462e-05, + "loss": 0.6619, + "step": 2270 + }, + { + "epoch": 0.38867020366250216, + "grad_norm": 20.63521385192871, + "learning_rate": 1.2926411865373645e-05, + "loss": 1.4227, + "step": 2271 + }, + { + "epoch": 0.3888413486222831, + "grad_norm": 35.605445861816406, + "learning_rate": 1.2932116371933827e-05, + "loss": 3.0135, + "step": 2272 + }, + { + "epoch": 0.389012493582064, + "grad_norm": 26.535554885864258, + "learning_rate": 1.293782087849401e-05, + "loss": 2.8951, + "step": 2273 + }, + { + "epoch": 0.38918363854184496, + "grad_norm": 216.86865234375, + "learning_rate": 1.2943525385054192e-05, + "loss": 9.7157, + "step": 2274 + }, + { + "epoch": 0.3893547835016259, + "grad_norm": 30.289108276367188, + "learning_rate": 1.2949229891614376e-05, + "loss": 3.7199, + "step": 2275 + }, + { + "epoch": 0.3895259284614068, + "grad_norm": 70.54218292236328, + "learning_rate": 1.2954934398174559e-05, + "loss": 8.7961, + "step": 2276 + }, + { + "epoch": 0.38969707342118776, + "grad_norm": 37.42404556274414, + "learning_rate": 1.296063890473474e-05, + "loss": 4.6408, + "step": 2277 + }, + { + "epoch": 0.3898682183809687, + "grad_norm": 20.272388458251953, + "learning_rate": 1.2966343411294924e-05, + "loss": 2.2787, + "step": 2278 + }, + { + "epoch": 0.3900393633407496, + "grad_norm": 21.717552185058594, + "learning_rate": 1.2972047917855106e-05, + "loss": 2.6312, + "step": 2279 + }, + { + "epoch": 0.39021050830053056, + "grad_norm": 27.405563354492188, + "learning_rate": 1.2977752424415289e-05, + "loss": 2.922, + "step": 2280 + }, + { + "epoch": 0.3903816532603115, + "grad_norm": 9.014309883117676, + "learning_rate": 1.298345693097547e-05, + "loss": 0.7394, + "step": 2281 + }, + { + "epoch": 0.3905527982200924, + "grad_norm": 34.70540237426758, + "learning_rate": 1.2989161437535652e-05, + "loss": 4.5773, + "step": 2282 + }, + { + "epoch": 0.39072394317987336, + "grad_norm": 17.615568161010742, + "learning_rate": 1.2994865944095836e-05, + "loss": 1.3793, + "step": 2283 + }, + { + "epoch": 0.3908950881396543, + "grad_norm": 9.69536018371582, + "learning_rate": 1.3000570450656018e-05, + "loss": 1.4154, + "step": 2284 + }, + { + "epoch": 0.3910662330994352, + "grad_norm": 42.174076080322266, + "learning_rate": 1.3006274957216201e-05, + "loss": 3.7989, + "step": 2285 + }, + { + "epoch": 0.39123737805921616, + "grad_norm": 23.85903549194336, + "learning_rate": 1.3011979463776384e-05, + "loss": 2.7904, + "step": 2286 + }, + { + "epoch": 0.3914085230189971, + "grad_norm": 12.722695350646973, + "learning_rate": 1.3017683970336568e-05, + "loss": 0.9675, + "step": 2287 + }, + { + "epoch": 0.391579667978778, + "grad_norm": 29.10125160217285, + "learning_rate": 1.302338847689675e-05, + "loss": 3.3556, + "step": 2288 + }, + { + "epoch": 0.39175081293855896, + "grad_norm": 28.335847854614258, + "learning_rate": 1.3029092983456931e-05, + "loss": 3.6913, + "step": 2289 + }, + { + "epoch": 0.3919219578983399, + "grad_norm": 27.098371505737305, + "learning_rate": 1.3034797490017114e-05, + "loss": 3.4171, + "step": 2290 + }, + { + "epoch": 0.3920931028581208, + "grad_norm": 24.61624526977539, + "learning_rate": 1.3040501996577296e-05, + "loss": 2.5601, + "step": 2291 + }, + { + "epoch": 0.39226424781790176, + "grad_norm": 36.29865264892578, + "learning_rate": 1.304620650313748e-05, + "loss": 4.1205, + "step": 2292 + }, + { + "epoch": 0.3924353927776827, + "grad_norm": Infinity, + "learning_rate": 1.304620650313748e-05, + "loss": 10.2854, + "step": 2293 + }, + { + "epoch": 0.3926065377374636, + "grad_norm": 43.4719352722168, + "learning_rate": 1.3051911009697661e-05, + "loss": 5.7968, + "step": 2294 + }, + { + "epoch": 0.39277768269724456, + "grad_norm": 34.03304672241211, + "learning_rate": 1.3057615516257843e-05, + "loss": 3.2482, + "step": 2295 + }, + { + "epoch": 0.3929488276570255, + "grad_norm": 28.92998695373535, + "learning_rate": 1.3063320022818026e-05, + "loss": 3.1048, + "step": 2296 + }, + { + "epoch": 0.3931199726168064, + "grad_norm": 30.764570236206055, + "learning_rate": 1.3069024529378208e-05, + "loss": 3.4204, + "step": 2297 + }, + { + "epoch": 0.39329111757658736, + "grad_norm": 30.185405731201172, + "learning_rate": 1.3074729035938391e-05, + "loss": 3.6159, + "step": 2298 + }, + { + "epoch": 0.3934622625363683, + "grad_norm": 15.160475730895996, + "learning_rate": 1.3080433542498575e-05, + "loss": 1.0351, + "step": 2299 + }, + { + "epoch": 0.3936334074961492, + "grad_norm": 30.460662841796875, + "learning_rate": 1.3086138049058758e-05, + "loss": 3.0676, + "step": 2300 + }, + { + "epoch": 0.39380455245593016, + "grad_norm": 31.176111221313477, + "learning_rate": 1.309184255561894e-05, + "loss": 3.07, + "step": 2301 + }, + { + "epoch": 0.3939756974157111, + "grad_norm": 40.287208557128906, + "learning_rate": 1.3097547062179121e-05, + "loss": 4.0389, + "step": 2302 + }, + { + "epoch": 0.394146842375492, + "grad_norm": 31.603471755981445, + "learning_rate": 1.3103251568739305e-05, + "loss": 3.1424, + "step": 2303 + }, + { + "epoch": 0.39431798733527296, + "grad_norm": 27.959386825561523, + "learning_rate": 1.3108956075299486e-05, + "loss": 2.7499, + "step": 2304 + }, + { + "epoch": 0.3944891322950539, + "grad_norm": 9.44000244140625, + "learning_rate": 1.311466058185967e-05, + "loss": 0.6843, + "step": 2305 + }, + { + "epoch": 0.3946602772548348, + "grad_norm": 31.026531219482422, + "learning_rate": 1.3120365088419852e-05, + "loss": 3.0951, + "step": 2306 + }, + { + "epoch": 0.39483142221461576, + "grad_norm": 28.429651260375977, + "learning_rate": 1.3126069594980035e-05, + "loss": 3.1954, + "step": 2307 + }, + { + "epoch": 0.3950025671743967, + "grad_norm": 36.807884216308594, + "learning_rate": 1.3131774101540217e-05, + "loss": 3.5692, + "step": 2308 + }, + { + "epoch": 0.3951737121341776, + "grad_norm": 27.523998260498047, + "learning_rate": 1.3137478608100398e-05, + "loss": 3.04, + "step": 2309 + }, + { + "epoch": 0.39534485709395856, + "grad_norm": 22.569734573364258, + "learning_rate": 1.3143183114660583e-05, + "loss": 2.6063, + "step": 2310 + }, + { + "epoch": 0.3955160020537395, + "grad_norm": 30.23894691467285, + "learning_rate": 1.3148887621220765e-05, + "loss": 2.6877, + "step": 2311 + }, + { + "epoch": 0.3956871470135205, + "grad_norm": 32.485286712646484, + "learning_rate": 1.3154592127780948e-05, + "loss": 4.2341, + "step": 2312 + }, + { + "epoch": 0.3958582919733014, + "grad_norm": 9.512272834777832, + "learning_rate": 1.316029663434113e-05, + "loss": 0.7438, + "step": 2313 + }, + { + "epoch": 0.39602943693308235, + "grad_norm": 30.39967918395996, + "learning_rate": 1.3166001140901312e-05, + "loss": 2.678, + "step": 2314 + }, + { + "epoch": 0.3962005818928633, + "grad_norm": 26.347349166870117, + "learning_rate": 1.3171705647461495e-05, + "loss": 2.6173, + "step": 2315 + }, + { + "epoch": 0.3963717268526442, + "grad_norm": 11.27676010131836, + "learning_rate": 1.3177410154021677e-05, + "loss": 0.9184, + "step": 2316 + }, + { + "epoch": 0.39654287181242515, + "grad_norm": 28.942106246948242, + "learning_rate": 1.318311466058186e-05, + "loss": 2.7751, + "step": 2317 + }, + { + "epoch": 0.3967140167722061, + "grad_norm": 155.31259155273438, + "learning_rate": 1.3188819167142042e-05, + "loss": 7.0884, + "step": 2318 + }, + { + "epoch": 0.396885161731987, + "grad_norm": 15.048434257507324, + "learning_rate": 1.3194523673702225e-05, + "loss": 1.0894, + "step": 2319 + }, + { + "epoch": 0.39705630669176795, + "grad_norm": 29.555904388427734, + "learning_rate": 1.3200228180262407e-05, + "loss": 3.0407, + "step": 2320 + }, + { + "epoch": 0.3972274516515489, + "grad_norm": 22.9705810546875, + "learning_rate": 1.3205932686822589e-05, + "loss": 2.1862, + "step": 2321 + }, + { + "epoch": 0.3973985966113298, + "grad_norm": 31.04474449157715, + "learning_rate": 1.3211637193382774e-05, + "loss": 3.5704, + "step": 2322 + }, + { + "epoch": 0.39756974157111075, + "grad_norm": 38.25536346435547, + "learning_rate": 1.3217341699942955e-05, + "loss": 3.7726, + "step": 2323 + }, + { + "epoch": 0.3977408865308917, + "grad_norm": 24.22712516784668, + "learning_rate": 1.3223046206503139e-05, + "loss": 2.9952, + "step": 2324 + }, + { + "epoch": 0.3979120314906726, + "grad_norm": 32.82272720336914, + "learning_rate": 1.322875071306332e-05, + "loss": 3.313, + "step": 2325 + }, + { + "epoch": 0.39808317645045355, + "grad_norm": 29.25124168395996, + "learning_rate": 1.3234455219623502e-05, + "loss": 2.9707, + "step": 2326 + }, + { + "epoch": 0.3982543214102345, + "grad_norm": 42.494041442871094, + "learning_rate": 1.3240159726183686e-05, + "loss": 4.4698, + "step": 2327 + }, + { + "epoch": 0.3984254663700154, + "grad_norm": 32.5220947265625, + "learning_rate": 1.3245864232743867e-05, + "loss": 3.7016, + "step": 2328 + }, + { + "epoch": 0.39859661132979635, + "grad_norm": 4.652500629425049, + "learning_rate": 1.325156873930405e-05, + "loss": 0.5571, + "step": 2329 + }, + { + "epoch": 0.3987677562895773, + "grad_norm": 136.5018768310547, + "learning_rate": 1.3257273245864232e-05, + "loss": 5.8926, + "step": 2330 + }, + { + "epoch": 0.3989389012493582, + "grad_norm": 51.504051208496094, + "learning_rate": 1.3262977752424416e-05, + "loss": 5.6096, + "step": 2331 + }, + { + "epoch": 0.39911004620913915, + "grad_norm": 18.578340530395508, + "learning_rate": 1.3268682258984597e-05, + "loss": 2.2877, + "step": 2332 + }, + { + "epoch": 0.3992811911689201, + "grad_norm": 26.573881149291992, + "learning_rate": 1.327438676554478e-05, + "loss": 3.3675, + "step": 2333 + }, + { + "epoch": 0.399452336128701, + "grad_norm": 28.39176368713379, + "learning_rate": 1.3280091272104964e-05, + "loss": 3.4595, + "step": 2334 + }, + { + "epoch": 0.39962348108848195, + "grad_norm": 27.315298080444336, + "learning_rate": 1.3285795778665146e-05, + "loss": 3.7229, + "step": 2335 + }, + { + "epoch": 0.3997946260482629, + "grad_norm": 34.018280029296875, + "learning_rate": 1.329150028522533e-05, + "loss": 3.7833, + "step": 2336 + }, + { + "epoch": 0.3999657710080438, + "grad_norm": 35.161949157714844, + "learning_rate": 1.3297204791785511e-05, + "loss": 3.6256, + "step": 2337 + }, + { + "epoch": 0.40013691596782475, + "grad_norm": 54.34180450439453, + "learning_rate": 1.3302909298345694e-05, + "loss": 8.2972, + "step": 2338 + }, + { + "epoch": 0.4003080609276057, + "grad_norm": 37.41242980957031, + "learning_rate": 1.3308613804905876e-05, + "loss": 4.1554, + "step": 2339 + }, + { + "epoch": 0.4004792058873866, + "grad_norm": 24.117671966552734, + "learning_rate": 1.3314318311466058e-05, + "loss": 2.6779, + "step": 2340 + }, + { + "epoch": 0.40065035084716755, + "grad_norm": 75.865234375, + "learning_rate": 1.3320022818026241e-05, + "loss": 5.2769, + "step": 2341 + }, + { + "epoch": 0.4008214958069485, + "grad_norm": 10.710969924926758, + "learning_rate": 1.3325727324586423e-05, + "loss": 0.8464, + "step": 2342 + }, + { + "epoch": 0.4009926407667294, + "grad_norm": 32.42598342895508, + "learning_rate": 1.3331431831146606e-05, + "loss": 4.2176, + "step": 2343 + }, + { + "epoch": 0.40116378572651035, + "grad_norm": 31.665010452270508, + "learning_rate": 1.333713633770679e-05, + "loss": 3.6966, + "step": 2344 + }, + { + "epoch": 0.4013349306862913, + "grad_norm": 19.899494171142578, + "learning_rate": 1.3342840844266971e-05, + "loss": 1.5844, + "step": 2345 + }, + { + "epoch": 0.4015060756460722, + "grad_norm": 26.72218132019043, + "learning_rate": 1.3348545350827155e-05, + "loss": 2.6391, + "step": 2346 + }, + { + "epoch": 0.40167722060585315, + "grad_norm": 67.58808135986328, + "learning_rate": 1.3354249857387336e-05, + "loss": 4.8317, + "step": 2347 + }, + { + "epoch": 0.4018483655656341, + "grad_norm": 35.947166442871094, + "learning_rate": 1.335995436394752e-05, + "loss": 4.4359, + "step": 2348 + }, + { + "epoch": 0.402019510525415, + "grad_norm": 33.10310745239258, + "learning_rate": 1.3365658870507701e-05, + "loss": 4.2287, + "step": 2349 + }, + { + "epoch": 0.40219065548519595, + "grad_norm": 26.962339401245117, + "learning_rate": 1.3371363377067885e-05, + "loss": 2.8973, + "step": 2350 + }, + { + "epoch": 0.4023618004449769, + "grad_norm": 10.832432746887207, + "learning_rate": 1.3377067883628066e-05, + "loss": 1.1334, + "step": 2351 + }, + { + "epoch": 0.4025329454047578, + "grad_norm": 3.8930623531341553, + "learning_rate": 1.3382772390188248e-05, + "loss": 0.5691, + "step": 2352 + }, + { + "epoch": 0.40270409036453875, + "grad_norm": 30.617422103881836, + "learning_rate": 1.3388476896748431e-05, + "loss": 4.1985, + "step": 2353 + }, + { + "epoch": 0.4028752353243197, + "grad_norm": 29.522432327270508, + "learning_rate": 1.3394181403308613e-05, + "loss": 3.3929, + "step": 2354 + }, + { + "epoch": 0.4030463802841006, + "grad_norm": 29.46415901184082, + "learning_rate": 1.3399885909868796e-05, + "loss": 3.0491, + "step": 2355 + }, + { + "epoch": 0.40321752524388155, + "grad_norm": 28.462308883666992, + "learning_rate": 1.340559041642898e-05, + "loss": 3.4012, + "step": 2356 + }, + { + "epoch": 0.4033886702036625, + "grad_norm": 26.383548736572266, + "learning_rate": 1.3411294922989163e-05, + "loss": 2.8104, + "step": 2357 + }, + { + "epoch": 0.4035598151634434, + "grad_norm": 5.228971004486084, + "learning_rate": 1.3416999429549345e-05, + "loss": 0.6984, + "step": 2358 + }, + { + "epoch": 0.40373096012322435, + "grad_norm": 65.2656021118164, + "learning_rate": 1.3422703936109527e-05, + "loss": 4.842, + "step": 2359 + }, + { + "epoch": 0.4039021050830053, + "grad_norm": 9.332406044006348, + "learning_rate": 1.342840844266971e-05, + "loss": 1.1261, + "step": 2360 + }, + { + "epoch": 0.4040732500427862, + "grad_norm": 19.021203994750977, + "learning_rate": 1.3434112949229892e-05, + "loss": 2.1595, + "step": 2361 + }, + { + "epoch": 0.40424439500256715, + "grad_norm": 26.664148330688477, + "learning_rate": 1.3439817455790075e-05, + "loss": 3.1312, + "step": 2362 + }, + { + "epoch": 0.40441553996234814, + "grad_norm": 14.464133262634277, + "learning_rate": 1.3445521962350257e-05, + "loss": 1.3528, + "step": 2363 + }, + { + "epoch": 0.40458668492212907, + "grad_norm": 25.62092399597168, + "learning_rate": 1.3451226468910438e-05, + "loss": 2.6933, + "step": 2364 + }, + { + "epoch": 0.40475782988191, + "grad_norm": 30.344446182250977, + "learning_rate": 1.3456930975470622e-05, + "loss": 3.5816, + "step": 2365 + }, + { + "epoch": 0.40492897484169094, + "grad_norm": 34.0131950378418, + "learning_rate": 1.3462635482030803e-05, + "loss": 3.8169, + "step": 2366 + }, + { + "epoch": 0.40510011980147187, + "grad_norm": 25.75012969970703, + "learning_rate": 1.3468339988590989e-05, + "loss": 2.9735, + "step": 2367 + }, + { + "epoch": 0.4052712647612528, + "grad_norm": 31.590328216552734, + "learning_rate": 1.347404449515117e-05, + "loss": 3.4589, + "step": 2368 + }, + { + "epoch": 0.40544240972103374, + "grad_norm": 24.881752014160156, + "learning_rate": 1.3479749001711354e-05, + "loss": 2.5041, + "step": 2369 + }, + { + "epoch": 0.40561355468081467, + "grad_norm": 20.611392974853516, + "learning_rate": 1.3485453508271535e-05, + "loss": 2.0903, + "step": 2370 + }, + { + "epoch": 0.4057846996405956, + "grad_norm": 35.99172592163086, + "learning_rate": 1.3491158014831717e-05, + "loss": 4.3005, + "step": 2371 + }, + { + "epoch": 0.40595584460037654, + "grad_norm": 44.53636932373047, + "learning_rate": 1.34968625213919e-05, + "loss": 7.8911, + "step": 2372 + }, + { + "epoch": 0.40612698956015747, + "grad_norm": 41.1456298828125, + "learning_rate": 1.3502567027952082e-05, + "loss": 7.4041, + "step": 2373 + }, + { + "epoch": 0.4062981345199384, + "grad_norm": 24.72629737854004, + "learning_rate": 1.3508271534512265e-05, + "loss": 2.0511, + "step": 2374 + }, + { + "epoch": 0.40646927947971934, + "grad_norm": 9.164275169372559, + "learning_rate": 1.3513976041072447e-05, + "loss": 1.0127, + "step": 2375 + }, + { + "epoch": 0.40664042443950027, + "grad_norm": 55.97251892089844, + "learning_rate": 1.351968054763263e-05, + "loss": 7.7883, + "step": 2376 + }, + { + "epoch": 0.4068115693992812, + "grad_norm": 15.729819297790527, + "learning_rate": 1.3525385054192812e-05, + "loss": 1.3747, + "step": 2377 + }, + { + "epoch": 0.40698271435906214, + "grad_norm": 31.85474967956543, + "learning_rate": 1.3531089560752994e-05, + "loss": 3.7341, + "step": 2378 + }, + { + "epoch": 0.40715385931884307, + "grad_norm": 32.369163513183594, + "learning_rate": 1.3536794067313179e-05, + "loss": 3.4044, + "step": 2379 + }, + { + "epoch": 0.407325004278624, + "grad_norm": 26.481473922729492, + "learning_rate": 1.354249857387336e-05, + "loss": 2.7264, + "step": 2380 + }, + { + "epoch": 0.40749614923840494, + "grad_norm": 36.87574005126953, + "learning_rate": 1.3548203080433544e-05, + "loss": 7.1091, + "step": 2381 + }, + { + "epoch": 0.40766729419818587, + "grad_norm": 34.68164825439453, + "learning_rate": 1.3553907586993726e-05, + "loss": 3.5182, + "step": 2382 + }, + { + "epoch": 0.4078384391579668, + "grad_norm": 4.539041042327881, + "learning_rate": 1.3559612093553907e-05, + "loss": 0.5712, + "step": 2383 + }, + { + "epoch": 0.40800958411774774, + "grad_norm": 18.264692306518555, + "learning_rate": 1.356531660011409e-05, + "loss": 1.6827, + "step": 2384 + }, + { + "epoch": 0.40818072907752867, + "grad_norm": 58.49655532836914, + "learning_rate": 1.3571021106674272e-05, + "loss": 4.7984, + "step": 2385 + }, + { + "epoch": 0.4083518740373096, + "grad_norm": 38.31999969482422, + "learning_rate": 1.3576725613234456e-05, + "loss": 4.9844, + "step": 2386 + }, + { + "epoch": 0.40852301899709054, + "grad_norm": 31.779747009277344, + "learning_rate": 1.3582430119794637e-05, + "loss": 4.109, + "step": 2387 + }, + { + "epoch": 0.40869416395687147, + "grad_norm": 28.318117141723633, + "learning_rate": 1.358813462635482e-05, + "loss": 3.1768, + "step": 2388 + }, + { + "epoch": 0.4088653089166524, + "grad_norm": 109.76797485351562, + "learning_rate": 1.3593839132915003e-05, + "loss": 8.1629, + "step": 2389 + }, + { + "epoch": 0.40903645387643334, + "grad_norm": 29.490888595581055, + "learning_rate": 1.3599543639475186e-05, + "loss": 2.8556, + "step": 2390 + }, + { + "epoch": 0.40920759883621427, + "grad_norm": 59.6926383972168, + "learning_rate": 1.360524814603537e-05, + "loss": 7.2838, + "step": 2391 + }, + { + "epoch": 0.4093787437959952, + "grad_norm": 26.968727111816406, + "learning_rate": 1.3610952652595551e-05, + "loss": 3.1073, + "step": 2392 + }, + { + "epoch": 0.40954988875577614, + "grad_norm": 14.444951057434082, + "learning_rate": 1.3616657159155734e-05, + "loss": 1.1151, + "step": 2393 + }, + { + "epoch": 0.40972103371555707, + "grad_norm": 27.179691314697266, + "learning_rate": 1.3622361665715916e-05, + "loss": 3.4778, + "step": 2394 + }, + { + "epoch": 0.409892178675338, + "grad_norm": 28.209474563598633, + "learning_rate": 1.3628066172276098e-05, + "loss": 3.1061, + "step": 2395 + }, + { + "epoch": 0.41006332363511894, + "grad_norm": 28.115158081054688, + "learning_rate": 1.3633770678836281e-05, + "loss": 3.3303, + "step": 2396 + }, + { + "epoch": 0.41023446859489987, + "grad_norm": 33.9571418762207, + "learning_rate": 1.3639475185396463e-05, + "loss": 3.476, + "step": 2397 + }, + { + "epoch": 0.4104056135546808, + "grad_norm": 99.95455932617188, + "learning_rate": 1.3645179691956646e-05, + "loss": 4.891, + "step": 2398 + }, + { + "epoch": 0.41057675851446174, + "grad_norm": 32.09910583496094, + "learning_rate": 1.3650884198516828e-05, + "loss": 4.5344, + "step": 2399 + }, + { + "epoch": 0.41074790347424267, + "grad_norm": 22.752981185913086, + "learning_rate": 1.3656588705077011e-05, + "loss": 2.5455, + "step": 2400 + }, + { + "epoch": 0.4109190484340236, + "grad_norm": 31.16071128845215, + "learning_rate": 1.3662293211637193e-05, + "loss": 3.5245, + "step": 2401 + }, + { + "epoch": 0.41109019339380454, + "grad_norm": 16.054365158081055, + "learning_rate": 1.3667997718197376e-05, + "loss": 1.1714, + "step": 2402 + }, + { + "epoch": 0.41126133835358547, + "grad_norm": 61.82563018798828, + "learning_rate": 1.367370222475756e-05, + "loss": 4.4332, + "step": 2403 + }, + { + "epoch": 0.4114324833133664, + "grad_norm": 25.521482467651367, + "learning_rate": 1.3679406731317741e-05, + "loss": 3.1523, + "step": 2404 + }, + { + "epoch": 0.41160362827314734, + "grad_norm": 28.02633285522461, + "learning_rate": 1.3685111237877925e-05, + "loss": 3.3077, + "step": 2405 + }, + { + "epoch": 0.41177477323292827, + "grad_norm": 31.012575149536133, + "learning_rate": 1.3690815744438106e-05, + "loss": 3.2338, + "step": 2406 + }, + { + "epoch": 0.4119459181927092, + "grad_norm": 24.693798065185547, + "learning_rate": 1.369652025099829e-05, + "loss": 3.1159, + "step": 2407 + }, + { + "epoch": 0.41211706315249014, + "grad_norm": 28.928600311279297, + "learning_rate": 1.3702224757558471e-05, + "loss": 3.231, + "step": 2408 + }, + { + "epoch": 0.41228820811227107, + "grad_norm": 30.929235458374023, + "learning_rate": 1.3707929264118653e-05, + "loss": 3.7399, + "step": 2409 + }, + { + "epoch": 0.412459353072052, + "grad_norm": 29.809967041015625, + "learning_rate": 1.3713633770678837e-05, + "loss": 2.8299, + "step": 2410 + }, + { + "epoch": 0.41263049803183294, + "grad_norm": 34.67237091064453, + "learning_rate": 1.3719338277239018e-05, + "loss": 3.7692, + "step": 2411 + }, + { + "epoch": 0.41280164299161387, + "grad_norm": 29.03022575378418, + "learning_rate": 1.3725042783799202e-05, + "loss": 3.1456, + "step": 2412 + }, + { + "epoch": 0.4129727879513948, + "grad_norm": 27.838979721069336, + "learning_rate": 1.3730747290359385e-05, + "loss": 2.7944, + "step": 2413 + }, + { + "epoch": 0.4131439329111758, + "grad_norm": 27.87117576599121, + "learning_rate": 1.3736451796919567e-05, + "loss": 2.9217, + "step": 2414 + }, + { + "epoch": 0.4133150778709567, + "grad_norm": 34.504295349121094, + "learning_rate": 1.374215630347975e-05, + "loss": 3.9797, + "step": 2415 + }, + { + "epoch": 0.41348622283073766, + "grad_norm": 21.570331573486328, + "learning_rate": 1.3747860810039932e-05, + "loss": 2.3607, + "step": 2416 + }, + { + "epoch": 0.4136573677905186, + "grad_norm": 132.19834899902344, + "learning_rate": 1.3753565316600115e-05, + "loss": 7.9895, + "step": 2417 + }, + { + "epoch": 0.4138285127502995, + "grad_norm": 29.5281925201416, + "learning_rate": 1.3759269823160297e-05, + "loss": 3.3183, + "step": 2418 + }, + { + "epoch": 0.41399965771008046, + "grad_norm": 7.409353256225586, + "learning_rate": 1.376497432972048e-05, + "loss": 0.6978, + "step": 2419 + }, + { + "epoch": 0.4141708026698614, + "grad_norm": 23.6326847076416, + "learning_rate": 1.3770678836280662e-05, + "loss": 2.4085, + "step": 2420 + }, + { + "epoch": 0.4143419476296423, + "grad_norm": 6.584427356719971, + "learning_rate": 1.3776383342840844e-05, + "loss": 0.7725, + "step": 2421 + }, + { + "epoch": 0.41451309258942326, + "grad_norm": 5.124080181121826, + "learning_rate": 1.3782087849401027e-05, + "loss": 0.5998, + "step": 2422 + }, + { + "epoch": 0.4146842375492042, + "grad_norm": 129.28781127929688, + "learning_rate": 1.3787792355961209e-05, + "loss": 5.2033, + "step": 2423 + }, + { + "epoch": 0.4148553825089851, + "grad_norm": 30.348461151123047, + "learning_rate": 1.3793496862521394e-05, + "loss": 3.4905, + "step": 2424 + }, + { + "epoch": 0.41502652746876606, + "grad_norm": 25.107507705688477, + "learning_rate": 1.3799201369081575e-05, + "loss": 2.4536, + "step": 2425 + }, + { + "epoch": 0.415197672428547, + "grad_norm": 20.649410247802734, + "learning_rate": 1.3804905875641757e-05, + "loss": 2.11, + "step": 2426 + }, + { + "epoch": 0.4153688173883279, + "grad_norm": 31.82566261291504, + "learning_rate": 1.381061038220194e-05, + "loss": 3.4624, + "step": 2427 + }, + { + "epoch": 0.41553996234810886, + "grad_norm": 25.216468811035156, + "learning_rate": 1.3816314888762122e-05, + "loss": 2.7103, + "step": 2428 + }, + { + "epoch": 0.4157111073078898, + "grad_norm": 62.44169616699219, + "learning_rate": 1.3822019395322305e-05, + "loss": 7.9435, + "step": 2429 + }, + { + "epoch": 0.4158822522676707, + "grad_norm": 14.46311092376709, + "learning_rate": 1.3827723901882487e-05, + "loss": 1.3546, + "step": 2430 + }, + { + "epoch": 0.41605339722745166, + "grad_norm": 21.584251403808594, + "learning_rate": 1.383342840844267e-05, + "loss": 2.3481, + "step": 2431 + }, + { + "epoch": 0.4162245421872326, + "grad_norm": 28.41043472290039, + "learning_rate": 1.3839132915002852e-05, + "loss": 3.242, + "step": 2432 + }, + { + "epoch": 0.4163956871470135, + "grad_norm": 57.48540496826172, + "learning_rate": 1.3844837421563034e-05, + "loss": 8.205, + "step": 2433 + }, + { + "epoch": 0.41656683210679446, + "grad_norm": 20.560029983520508, + "learning_rate": 1.3850541928123217e-05, + "loss": 2.4152, + "step": 2434 + }, + { + "epoch": 0.4167379770665754, + "grad_norm": 29.860027313232422, + "learning_rate": 1.3856246434683399e-05, + "loss": 2.9946, + "step": 2435 + }, + { + "epoch": 0.4169091220263563, + "grad_norm": 34.29914855957031, + "learning_rate": 1.3861950941243584e-05, + "loss": 4.0154, + "step": 2436 + }, + { + "epoch": 0.41708026698613726, + "grad_norm": 31.778980255126953, + "learning_rate": 1.3867655447803766e-05, + "loss": 3.279, + "step": 2437 + }, + { + "epoch": 0.4172514119459182, + "grad_norm": 30.92992401123047, + "learning_rate": 1.3873359954363949e-05, + "loss": 3.7128, + "step": 2438 + }, + { + "epoch": 0.4174225569056991, + "grad_norm": 30.067113876342773, + "learning_rate": 1.387906446092413e-05, + "loss": 3.3294, + "step": 2439 + }, + { + "epoch": 0.41759370186548006, + "grad_norm": 78.94349670410156, + "learning_rate": 1.3884768967484312e-05, + "loss": 4.5151, + "step": 2440 + }, + { + "epoch": 0.417764846825261, + "grad_norm": 35.60622787475586, + "learning_rate": 1.3890473474044496e-05, + "loss": 3.3575, + "step": 2441 + }, + { + "epoch": 0.4179359917850419, + "grad_norm": 29.288429260253906, + "learning_rate": 1.3896177980604678e-05, + "loss": 3.0086, + "step": 2442 + }, + { + "epoch": 0.41810713674482286, + "grad_norm": 29.106294631958008, + "learning_rate": 1.3901882487164861e-05, + "loss": 3.228, + "step": 2443 + }, + { + "epoch": 0.4182782817046038, + "grad_norm": 20.533992767333984, + "learning_rate": 1.3907586993725043e-05, + "loss": 2.2425, + "step": 2444 + }, + { + "epoch": 0.4184494266643847, + "grad_norm": 26.775163650512695, + "learning_rate": 1.3913291500285224e-05, + "loss": 2.6486, + "step": 2445 + }, + { + "epoch": 0.41862057162416566, + "grad_norm": 23.887187957763672, + "learning_rate": 1.3918996006845408e-05, + "loss": 2.0254, + "step": 2446 + }, + { + "epoch": 0.4187917165839466, + "grad_norm": 32.54766082763672, + "learning_rate": 1.3924700513405591e-05, + "loss": 4.0827, + "step": 2447 + }, + { + "epoch": 0.4189628615437275, + "grad_norm": 24.691999435424805, + "learning_rate": 1.3930405019965774e-05, + "loss": 2.7637, + "step": 2448 + }, + { + "epoch": 0.41913400650350846, + "grad_norm": 36.446842193603516, + "learning_rate": 1.3936109526525956e-05, + "loss": 7.2209, + "step": 2449 + }, + { + "epoch": 0.4193051514632894, + "grad_norm": 24.245582580566406, + "learning_rate": 1.394181403308614e-05, + "loss": 2.5936, + "step": 2450 + }, + { + "epoch": 0.4194762964230703, + "grad_norm": 34.520198822021484, + "learning_rate": 1.3947518539646321e-05, + "loss": 3.7529, + "step": 2451 + }, + { + "epoch": 0.41964744138285126, + "grad_norm": 34.79539489746094, + "learning_rate": 1.3953223046206503e-05, + "loss": 4.0982, + "step": 2452 + }, + { + "epoch": 0.4198185863426322, + "grad_norm": 6.38947057723999, + "learning_rate": 1.3958927552766686e-05, + "loss": 0.5817, + "step": 2453 + }, + { + "epoch": 0.4199897313024131, + "grad_norm": 35.33879852294922, + "learning_rate": 1.3964632059326868e-05, + "loss": 3.3258, + "step": 2454 + }, + { + "epoch": 0.42016087626219406, + "grad_norm": 8.833622932434082, + "learning_rate": 1.3970336565887051e-05, + "loss": 1.1178, + "step": 2455 + }, + { + "epoch": 0.420332021221975, + "grad_norm": 25.07313346862793, + "learning_rate": 1.3976041072447233e-05, + "loss": 2.7209, + "step": 2456 + }, + { + "epoch": 0.4205031661817559, + "grad_norm": 30.224679946899414, + "learning_rate": 1.3981745579007416e-05, + "loss": 3.4578, + "step": 2457 + }, + { + "epoch": 0.42067431114153686, + "grad_norm": 26.421674728393555, + "learning_rate": 1.3987450085567598e-05, + "loss": 2.6023, + "step": 2458 + }, + { + "epoch": 0.4208454561013178, + "grad_norm": 33.97099685668945, + "learning_rate": 1.3993154592127781e-05, + "loss": 4.066, + "step": 2459 + }, + { + "epoch": 0.4210166010610987, + "grad_norm": 28.07750701904297, + "learning_rate": 1.3998859098687965e-05, + "loss": 3.1308, + "step": 2460 + }, + { + "epoch": 0.42118774602087966, + "grad_norm": 33.50989532470703, + "learning_rate": 1.4004563605248146e-05, + "loss": 3.461, + "step": 2461 + }, + { + "epoch": 0.4213588909806606, + "grad_norm": 16.63654136657715, + "learning_rate": 1.401026811180833e-05, + "loss": 1.3419, + "step": 2462 + }, + { + "epoch": 0.4215300359404415, + "grad_norm": 16.42656707763672, + "learning_rate": 1.4015972618368512e-05, + "loss": 1.8682, + "step": 2463 + }, + { + "epoch": 0.42170118090022246, + "grad_norm": 30.457616806030273, + "learning_rate": 1.4021677124928693e-05, + "loss": 3.1266, + "step": 2464 + }, + { + "epoch": 0.42187232586000345, + "grad_norm": 20.4791202545166, + "learning_rate": 1.4027381631488877e-05, + "loss": 2.2995, + "step": 2465 + }, + { + "epoch": 0.4220434708197844, + "grad_norm": 8.99075698852539, + "learning_rate": 1.4033086138049058e-05, + "loss": 1.6982, + "step": 2466 + }, + { + "epoch": 0.4222146157795653, + "grad_norm": 57.64451599121094, + "learning_rate": 1.4038790644609242e-05, + "loss": 8.0313, + "step": 2467 + }, + { + "epoch": 0.42238576073934625, + "grad_norm": 13.558103561401367, + "learning_rate": 1.4044495151169423e-05, + "loss": 1.2578, + "step": 2468 + }, + { + "epoch": 0.4225569056991272, + "grad_norm": 21.366905212402344, + "learning_rate": 1.4050199657729607e-05, + "loss": 2.2006, + "step": 2469 + }, + { + "epoch": 0.4227280506589081, + "grad_norm": 14.984084129333496, + "learning_rate": 1.405590416428979e-05, + "loss": 1.1421, + "step": 2470 + }, + { + "epoch": 0.42289919561868905, + "grad_norm": 37.106781005859375, + "learning_rate": 1.4061608670849972e-05, + "loss": 3.448, + "step": 2471 + }, + { + "epoch": 0.42307034057847, + "grad_norm": 103.56417083740234, + "learning_rate": 1.4067313177410155e-05, + "loss": 8.869, + "step": 2472 + }, + { + "epoch": 0.4232414855382509, + "grad_norm": 34.15910339355469, + "learning_rate": 1.4073017683970337e-05, + "loss": 3.99, + "step": 2473 + }, + { + "epoch": 0.42341263049803185, + "grad_norm": 9.371402740478516, + "learning_rate": 1.407872219053052e-05, + "loss": 1.7126, + "step": 2474 + }, + { + "epoch": 0.4235837754578128, + "grad_norm": 5.610677719116211, + "learning_rate": 1.4084426697090702e-05, + "loss": 0.6139, + "step": 2475 + }, + { + "epoch": 0.4237549204175937, + "grad_norm": 25.18387222290039, + "learning_rate": 1.4090131203650885e-05, + "loss": 3.0951, + "step": 2476 + }, + { + "epoch": 0.42392606537737465, + "grad_norm": 21.81611442565918, + "learning_rate": 1.4095835710211067e-05, + "loss": 2.1739, + "step": 2477 + }, + { + "epoch": 0.4240972103371556, + "grad_norm": 37.39387893676758, + "learning_rate": 1.4101540216771249e-05, + "loss": 4.5252, + "step": 2478 + }, + { + "epoch": 0.4242683552969365, + "grad_norm": 5.823449611663818, + "learning_rate": 1.4107244723331432e-05, + "loss": 0.6128, + "step": 2479 + }, + { + "epoch": 0.42443950025671745, + "grad_norm": 31.69689178466797, + "learning_rate": 1.4112949229891614e-05, + "loss": 2.9986, + "step": 2480 + }, + { + "epoch": 0.4246106452164984, + "grad_norm": 35.987152099609375, + "learning_rate": 1.4118653736451797e-05, + "loss": 3.4619, + "step": 2481 + }, + { + "epoch": 0.4247817901762793, + "grad_norm": 16.255069732666016, + "learning_rate": 1.412435824301198e-05, + "loss": 1.2687, + "step": 2482 + }, + { + "epoch": 0.42495293513606025, + "grad_norm": 38.878501892089844, + "learning_rate": 1.4130062749572162e-05, + "loss": 4.4326, + "step": 2483 + }, + { + "epoch": 0.4251240800958412, + "grad_norm": 33.7603759765625, + "learning_rate": 1.4135767256132346e-05, + "loss": 3.6103, + "step": 2484 + }, + { + "epoch": 0.4252952250556221, + "grad_norm": 30.415058135986328, + "learning_rate": 1.4141471762692527e-05, + "loss": 2.9553, + "step": 2485 + }, + { + "epoch": 0.42546637001540305, + "grad_norm": 18.42668914794922, + "learning_rate": 1.414717626925271e-05, + "loss": 1.6697, + "step": 2486 + }, + { + "epoch": 0.425637514975184, + "grad_norm": 28.910137176513672, + "learning_rate": 1.4152880775812892e-05, + "loss": 2.7775, + "step": 2487 + }, + { + "epoch": 0.4258086599349649, + "grad_norm": 24.642555236816406, + "learning_rate": 1.4158585282373076e-05, + "loss": 2.6428, + "step": 2488 + }, + { + "epoch": 0.42597980489474585, + "grad_norm": 6.298614501953125, + "learning_rate": 1.4164289788933257e-05, + "loss": 0.5965, + "step": 2489 + }, + { + "epoch": 0.4261509498545268, + "grad_norm": 134.2201690673828, + "learning_rate": 1.4169994295493439e-05, + "loss": 5.7153, + "step": 2490 + }, + { + "epoch": 0.4263220948143077, + "grad_norm": 29.22636604309082, + "learning_rate": 1.4175698802053622e-05, + "loss": 3.1392, + "step": 2491 + }, + { + "epoch": 0.42649323977408865, + "grad_norm": 29.945589065551758, + "learning_rate": 1.4181403308613804e-05, + "loss": 2.9141, + "step": 2492 + }, + { + "epoch": 0.4266643847338696, + "grad_norm": 122.40117645263672, + "learning_rate": 1.418710781517399e-05, + "loss": 9.1376, + "step": 2493 + }, + { + "epoch": 0.4268355296936505, + "grad_norm": 12.278093338012695, + "learning_rate": 1.4192812321734171e-05, + "loss": 0.8576, + "step": 2494 + }, + { + "epoch": 0.42700667465343145, + "grad_norm": 29.33226776123047, + "learning_rate": 1.4198516828294353e-05, + "loss": 2.9443, + "step": 2495 + }, + { + "epoch": 0.4271778196132124, + "grad_norm": 31.89412498474121, + "learning_rate": 1.4204221334854536e-05, + "loss": 3.7789, + "step": 2496 + }, + { + "epoch": 0.4273489645729933, + "grad_norm": 30.404138565063477, + "learning_rate": 1.4209925841414718e-05, + "loss": 4.0024, + "step": 2497 + }, + { + "epoch": 0.42752010953277425, + "grad_norm": 7.538527488708496, + "learning_rate": 1.4215630347974901e-05, + "loss": 0.6812, + "step": 2498 + }, + { + "epoch": 0.4276912544925552, + "grad_norm": 194.7794952392578, + "learning_rate": 1.4221334854535083e-05, + "loss": 10.7557, + "step": 2499 + }, + { + "epoch": 0.4278623994523361, + "grad_norm": 27.38447380065918, + "learning_rate": 1.4227039361095266e-05, + "loss": 3.0669, + "step": 2500 + }, + { + "epoch": 0.42803354441211705, + "grad_norm": 36.52588653564453, + "learning_rate": 1.4232743867655448e-05, + "loss": 3.7922, + "step": 2501 + }, + { + "epoch": 0.428204689371898, + "grad_norm": 14.776211738586426, + "learning_rate": 1.423844837421563e-05, + "loss": 1.0695, + "step": 2502 + }, + { + "epoch": 0.4283758343316789, + "grad_norm": 22.516334533691406, + "learning_rate": 1.4244152880775813e-05, + "loss": 2.175, + "step": 2503 + }, + { + "epoch": 0.42854697929145985, + "grad_norm": 31.414302825927734, + "learning_rate": 1.4249857387335995e-05, + "loss": 3.5488, + "step": 2504 + }, + { + "epoch": 0.4287181242512408, + "grad_norm": 20.823116302490234, + "learning_rate": 1.425556189389618e-05, + "loss": 2.0128, + "step": 2505 + }, + { + "epoch": 0.4288892692110217, + "grad_norm": 33.47979736328125, + "learning_rate": 1.4261266400456361e-05, + "loss": 3.6721, + "step": 2506 + }, + { + "epoch": 0.42906041417080265, + "grad_norm": 33.771358489990234, + "learning_rate": 1.4266970907016545e-05, + "loss": 4.2083, + "step": 2507 + }, + { + "epoch": 0.4292315591305836, + "grad_norm": 21.674623489379883, + "learning_rate": 1.4272675413576726e-05, + "loss": 1.8789, + "step": 2508 + }, + { + "epoch": 0.4294027040903645, + "grad_norm": 31.44987678527832, + "learning_rate": 1.4278379920136908e-05, + "loss": 3.6812, + "step": 2509 + }, + { + "epoch": 0.42957384905014545, + "grad_norm": 9.912192344665527, + "learning_rate": 1.4284084426697091e-05, + "loss": 1.2073, + "step": 2510 + }, + { + "epoch": 0.4297449940099264, + "grad_norm": 26.342119216918945, + "learning_rate": 1.4289788933257273e-05, + "loss": 2.4193, + "step": 2511 + }, + { + "epoch": 0.4299161389697073, + "grad_norm": 57.646331787109375, + "learning_rate": 1.4295493439817456e-05, + "loss": 8.0588, + "step": 2512 + }, + { + "epoch": 0.43008728392948825, + "grad_norm": 25.247426986694336, + "learning_rate": 1.4301197946377638e-05, + "loss": 2.5184, + "step": 2513 + }, + { + "epoch": 0.4302584288892692, + "grad_norm": 21.471519470214844, + "learning_rate": 1.430690245293782e-05, + "loss": 2.2334, + "step": 2514 + }, + { + "epoch": 0.4304295738490501, + "grad_norm": 25.605525970458984, + "learning_rate": 1.4312606959498003e-05, + "loss": 2.584, + "step": 2515 + }, + { + "epoch": 0.4306007188088311, + "grad_norm": 34.87372589111328, + "learning_rate": 1.4318311466058187e-05, + "loss": 3.7447, + "step": 2516 + }, + { + "epoch": 0.43077186376861204, + "grad_norm": 28.899642944335938, + "learning_rate": 1.432401597261837e-05, + "loss": 2.9852, + "step": 2517 + }, + { + "epoch": 0.430943008728393, + "grad_norm": 24.084014892578125, + "learning_rate": 1.4329720479178552e-05, + "loss": 2.5703, + "step": 2518 + }, + { + "epoch": 0.4311141536881739, + "grad_norm": 13.15533447265625, + "learning_rate": 1.4335424985738735e-05, + "loss": 0.8629, + "step": 2519 + }, + { + "epoch": 0.43128529864795484, + "grad_norm": 100.28350067138672, + "learning_rate": 1.4341129492298917e-05, + "loss": 5.5365, + "step": 2520 + }, + { + "epoch": 0.4314564436077358, + "grad_norm": 25.63288116455078, + "learning_rate": 1.4346833998859098e-05, + "loss": 2.2759, + "step": 2521 + }, + { + "epoch": 0.4316275885675167, + "grad_norm": 4.588881969451904, + "learning_rate": 1.4352538505419282e-05, + "loss": 0.5143, + "step": 2522 + }, + { + "epoch": 0.43179873352729764, + "grad_norm": 31.304664611816406, + "learning_rate": 1.4358243011979463e-05, + "loss": 4.1463, + "step": 2523 + }, + { + "epoch": 0.4319698784870786, + "grad_norm": 18.030874252319336, + "learning_rate": 1.4363947518539647e-05, + "loss": 1.8934, + "step": 2524 + }, + { + "epoch": 0.4321410234468595, + "grad_norm": 29.338178634643555, + "learning_rate": 1.4369652025099829e-05, + "loss": 3.3346, + "step": 2525 + }, + { + "epoch": 0.43231216840664044, + "grad_norm": 29.54951286315918, + "learning_rate": 1.4375356531660012e-05, + "loss": 2.695, + "step": 2526 + }, + { + "epoch": 0.4324833133664214, + "grad_norm": 17.5317325592041, + "learning_rate": 1.4381061038220195e-05, + "loss": 1.4575, + "step": 2527 + }, + { + "epoch": 0.4326544583262023, + "grad_norm": 32.96657943725586, + "learning_rate": 1.4386765544780377e-05, + "loss": 3.592, + "step": 2528 + }, + { + "epoch": 0.43282560328598324, + "grad_norm": 19.32137107849121, + "learning_rate": 1.439247005134056e-05, + "loss": 1.2429, + "step": 2529 + }, + { + "epoch": 0.4329967482457642, + "grad_norm": 8.846491813659668, + "learning_rate": 1.4398174557900742e-05, + "loss": 1.021, + "step": 2530 + }, + { + "epoch": 0.4331678932055451, + "grad_norm": 4.180466651916504, + "learning_rate": 1.4403879064460925e-05, + "loss": 0.5108, + "step": 2531 + }, + { + "epoch": 0.43333903816532604, + "grad_norm": 28.7572078704834, + "learning_rate": 1.4409583571021107e-05, + "loss": 2.694, + "step": 2532 + }, + { + "epoch": 0.433510183125107, + "grad_norm": 34.48351287841797, + "learning_rate": 1.4415288077581289e-05, + "loss": 3.7674, + "step": 2533 + }, + { + "epoch": 0.4336813280848879, + "grad_norm": 27.524559020996094, + "learning_rate": 1.4420992584141472e-05, + "loss": 2.768, + "step": 2534 + }, + { + "epoch": 0.43385247304466884, + "grad_norm": 33.70855712890625, + "learning_rate": 1.4426697090701654e-05, + "loss": 3.1902, + "step": 2535 + }, + { + "epoch": 0.4340236180044498, + "grad_norm": 30.53034210205078, + "learning_rate": 1.4432401597261837e-05, + "loss": 3.4593, + "step": 2536 + }, + { + "epoch": 0.4341947629642307, + "grad_norm": 30.834991455078125, + "learning_rate": 1.4438106103822019e-05, + "loss": 3.439, + "step": 2537 + }, + { + "epoch": 0.43436590792401164, + "grad_norm": 31.815725326538086, + "learning_rate": 1.4443810610382202e-05, + "loss": 3.683, + "step": 2538 + }, + { + "epoch": 0.4345370528837926, + "grad_norm": 29.159996032714844, + "learning_rate": 1.4449515116942386e-05, + "loss": 2.9478, + "step": 2539 + }, + { + "epoch": 0.4347081978435735, + "grad_norm": 81.45700073242188, + "learning_rate": 1.4455219623502567e-05, + "loss": 9.0416, + "step": 2540 + }, + { + "epoch": 0.43487934280335444, + "grad_norm": 87.70926666259766, + "learning_rate": 1.446092413006275e-05, + "loss": 4.7629, + "step": 2541 + }, + { + "epoch": 0.4350504877631354, + "grad_norm": 9.934538841247559, + "learning_rate": 1.4466628636622932e-05, + "loss": 1.8243, + "step": 2542 + }, + { + "epoch": 0.4352216327229163, + "grad_norm": 9.613969802856445, + "learning_rate": 1.4472333143183116e-05, + "loss": 1.031, + "step": 2543 + }, + { + "epoch": 0.43539277768269724, + "grad_norm": 45.231689453125, + "learning_rate": 1.4478037649743297e-05, + "loss": 7.5705, + "step": 2544 + }, + { + "epoch": 0.4355639226424782, + "grad_norm": 9.317858695983887, + "learning_rate": 1.4483742156303479e-05, + "loss": 0.6934, + "step": 2545 + }, + { + "epoch": 0.4357350676022591, + "grad_norm": 35.789794921875, + "learning_rate": 1.4489446662863663e-05, + "loss": 4.1345, + "step": 2546 + }, + { + "epoch": 0.43590621256204004, + "grad_norm": 11.596151351928711, + "learning_rate": 1.4495151169423844e-05, + "loss": 1.3557, + "step": 2547 + }, + { + "epoch": 0.436077357521821, + "grad_norm": 4.43747091293335, + "learning_rate": 1.4500855675984028e-05, + "loss": 0.4991, + "step": 2548 + }, + { + "epoch": 0.4362485024816019, + "grad_norm": 29.71784019470215, + "learning_rate": 1.450656018254421e-05, + "loss": 3.192, + "step": 2549 + }, + { + "epoch": 0.43641964744138284, + "grad_norm": 44.21783447265625, + "learning_rate": 1.4512264689104394e-05, + "loss": 3.6461, + "step": 2550 + }, + { + "epoch": 0.4365907924011638, + "grad_norm": 27.61203384399414, + "learning_rate": 1.4517969195664576e-05, + "loss": 3.3135, + "step": 2551 + }, + { + "epoch": 0.4367619373609447, + "grad_norm": 23.84665298461914, + "learning_rate": 1.4523673702224758e-05, + "loss": 2.5268, + "step": 2552 + }, + { + "epoch": 0.43693308232072564, + "grad_norm": 29.368938446044922, + "learning_rate": 1.4529378208784941e-05, + "loss": 2.7211, + "step": 2553 + }, + { + "epoch": 0.4371042272805066, + "grad_norm": 36.08073806762695, + "learning_rate": 1.4535082715345123e-05, + "loss": 3.7964, + "step": 2554 + }, + { + "epoch": 0.4372753722402875, + "grad_norm": 32.68186950683594, + "learning_rate": 1.4540787221905306e-05, + "loss": 3.6173, + "step": 2555 + }, + { + "epoch": 0.43744651720006844, + "grad_norm": 34.985904693603516, + "learning_rate": 1.4546491728465488e-05, + "loss": 4.2656, + "step": 2556 + }, + { + "epoch": 0.4376176621598494, + "grad_norm": 129.27252197265625, + "learning_rate": 1.4552196235025671e-05, + "loss": 8.7164, + "step": 2557 + }, + { + "epoch": 0.4377888071196303, + "grad_norm": 29.99295997619629, + "learning_rate": 1.4557900741585853e-05, + "loss": 3.6185, + "step": 2558 + }, + { + "epoch": 0.43795995207941124, + "grad_norm": 28.371896743774414, + "learning_rate": 1.4563605248146035e-05, + "loss": 2.9256, + "step": 2559 + }, + { + "epoch": 0.4381310970391922, + "grad_norm": 8.728231430053711, + "learning_rate": 1.4569309754706218e-05, + "loss": 0.9915, + "step": 2560 + }, + { + "epoch": 0.4383022419989731, + "grad_norm": 31.164567947387695, + "learning_rate": 1.45750142612664e-05, + "loss": 3.8704, + "step": 2561 + }, + { + "epoch": 0.43847338695875404, + "grad_norm": 32.18178176879883, + "learning_rate": 1.4580718767826585e-05, + "loss": 4.2259, + "step": 2562 + }, + { + "epoch": 0.438644531918535, + "grad_norm": 25.499011993408203, + "learning_rate": 1.4586423274386766e-05, + "loss": 2.6854, + "step": 2563 + }, + { + "epoch": 0.4388156768783159, + "grad_norm": 34.26057815551758, + "learning_rate": 1.4592127780946948e-05, + "loss": 3.4689, + "step": 2564 + }, + { + "epoch": 0.43898682183809684, + "grad_norm": 33.73667526245117, + "learning_rate": 1.4597832287507131e-05, + "loss": 4.3474, + "step": 2565 + }, + { + "epoch": 0.43915796679787783, + "grad_norm": 32.83565902709961, + "learning_rate": 1.4603536794067313e-05, + "loss": 3.475, + "step": 2566 + }, + { + "epoch": 0.43932911175765876, + "grad_norm": 3.187453269958496, + "learning_rate": 1.4609241300627497e-05, + "loss": 0.4736, + "step": 2567 + }, + { + "epoch": 0.4395002567174397, + "grad_norm": 19.98860740661621, + "learning_rate": 1.4614945807187678e-05, + "loss": 2.0086, + "step": 2568 + }, + { + "epoch": 0.43967140167722063, + "grad_norm": 27.594697952270508, + "learning_rate": 1.4620650313747862e-05, + "loss": 3.2803, + "step": 2569 + }, + { + "epoch": 0.43984254663700156, + "grad_norm": 3.9966156482696533, + "learning_rate": 1.4626354820308043e-05, + "loss": 0.568, + "step": 2570 + }, + { + "epoch": 0.4400136915967825, + "grad_norm": 5.779835224151611, + "learning_rate": 1.4632059326868225e-05, + "loss": 0.648, + "step": 2571 + }, + { + "epoch": 0.44018483655656343, + "grad_norm": 39.23750305175781, + "learning_rate": 1.4637763833428408e-05, + "loss": 7.0941, + "step": 2572 + }, + { + "epoch": 0.44035598151634436, + "grad_norm": 28.068208694458008, + "learning_rate": 1.4643468339988592e-05, + "loss": 3.0381, + "step": 2573 + }, + { + "epoch": 0.4405271264761253, + "grad_norm": 25.783096313476562, + "learning_rate": 1.4649172846548775e-05, + "loss": 3.0511, + "step": 2574 + }, + { + "epoch": 0.44069827143590623, + "grad_norm": 29.101238250732422, + "learning_rate": 1.4654877353108957e-05, + "loss": 2.9123, + "step": 2575 + }, + { + "epoch": 0.44086941639568716, + "grad_norm": 14.171677589416504, + "learning_rate": 1.466058185966914e-05, + "loss": 1.0138, + "step": 2576 + }, + { + "epoch": 0.4410405613554681, + "grad_norm": 27.117347717285156, + "learning_rate": 1.4666286366229322e-05, + "loss": 3.1994, + "step": 2577 + }, + { + "epoch": 0.44121170631524903, + "grad_norm": 29.480358123779297, + "learning_rate": 1.4671990872789504e-05, + "loss": 3.4766, + "step": 2578 + }, + { + "epoch": 0.44138285127502996, + "grad_norm": 4.977560997009277, + "learning_rate": 1.4677695379349687e-05, + "loss": 0.7032, + "step": 2579 + }, + { + "epoch": 0.4415539962348109, + "grad_norm": 31.941097259521484, + "learning_rate": 1.4683399885909869e-05, + "loss": 2.931, + "step": 2580 + }, + { + "epoch": 0.44172514119459183, + "grad_norm": 136.83563232421875, + "learning_rate": 1.4689104392470052e-05, + "loss": 5.0846, + "step": 2581 + }, + { + "epoch": 0.44189628615437276, + "grad_norm": 9.305535316467285, + "learning_rate": 1.4694808899030234e-05, + "loss": 0.911, + "step": 2582 + }, + { + "epoch": 0.4420674311141537, + "grad_norm": 18.890281677246094, + "learning_rate": 1.4700513405590415e-05, + "loss": 1.4747, + "step": 2583 + }, + { + "epoch": 0.44223857607393463, + "grad_norm": 46.04558563232422, + "learning_rate": 1.4706217912150599e-05, + "loss": 7.673, + "step": 2584 + }, + { + "epoch": 0.44240972103371556, + "grad_norm": 24.37186050415039, + "learning_rate": 1.4711922418710782e-05, + "loss": 2.3299, + "step": 2585 + }, + { + "epoch": 0.4425808659934965, + "grad_norm": 38.21072006225586, + "learning_rate": 1.4717626925270965e-05, + "loss": 7.1275, + "step": 2586 + }, + { + "epoch": 0.44275201095327743, + "grad_norm": 77.47330474853516, + "learning_rate": 1.4723331431831147e-05, + "loss": 4.7878, + "step": 2587 + }, + { + "epoch": 0.44292315591305836, + "grad_norm": 37.18149185180664, + "learning_rate": 1.472903593839133e-05, + "loss": 3.7571, + "step": 2588 + }, + { + "epoch": 0.4430943008728393, + "grad_norm": 3.5262255668640137, + "learning_rate": 1.4734740444951512e-05, + "loss": 0.5224, + "step": 2589 + }, + { + "epoch": 0.44326544583262023, + "grad_norm": 11.645423889160156, + "learning_rate": 1.4740444951511694e-05, + "loss": 1.0846, + "step": 2590 + }, + { + "epoch": 0.44343659079240116, + "grad_norm": 6.892613410949707, + "learning_rate": 1.4746149458071877e-05, + "loss": 0.54, + "step": 2591 + }, + { + "epoch": 0.4436077357521821, + "grad_norm": 8.752089500427246, + "learning_rate": 1.4751853964632059e-05, + "loss": 0.6267, + "step": 2592 + }, + { + "epoch": 0.44377888071196303, + "grad_norm": 23.974550247192383, + "learning_rate": 1.4757558471192242e-05, + "loss": 2.6998, + "step": 2593 + }, + { + "epoch": 0.44395002567174396, + "grad_norm": 7.374299049377441, + "learning_rate": 1.4763262977752424e-05, + "loss": 0.8349, + "step": 2594 + }, + { + "epoch": 0.4441211706315249, + "grad_norm": 23.21881103515625, + "learning_rate": 1.4768967484312606e-05, + "loss": 2.7586, + "step": 2595 + }, + { + "epoch": 0.44429231559130583, + "grad_norm": 170.6956024169922, + "learning_rate": 1.477467199087279e-05, + "loss": 9.1929, + "step": 2596 + }, + { + "epoch": 0.44446346055108676, + "grad_norm": 20.11836814880371, + "learning_rate": 1.4780376497432972e-05, + "loss": 2.3475, + "step": 2597 + }, + { + "epoch": 0.4446346055108677, + "grad_norm": 24.03493881225586, + "learning_rate": 1.4786081003993156e-05, + "loss": 2.9464, + "step": 2598 + }, + { + "epoch": 0.44480575047064863, + "grad_norm": 27.76041603088379, + "learning_rate": 1.4791785510553338e-05, + "loss": 2.7217, + "step": 2599 + }, + { + "epoch": 0.44497689543042956, + "grad_norm": 17.792516708374023, + "learning_rate": 1.4797490017113521e-05, + "loss": 1.6209, + "step": 2600 + }, + { + "epoch": 0.4451480403902105, + "grad_norm": 34.788169860839844, + "learning_rate": 1.4803194523673703e-05, + "loss": 2.8761, + "step": 2601 + }, + { + "epoch": 0.44531918534999143, + "grad_norm": 18.824007034301758, + "learning_rate": 1.4808899030233884e-05, + "loss": 2.5789, + "step": 2602 + }, + { + "epoch": 0.44549033030977236, + "grad_norm": 19.51264190673828, + "learning_rate": 1.4814603536794068e-05, + "loss": 2.2163, + "step": 2603 + }, + { + "epoch": 0.4456614752695533, + "grad_norm": 31.428625106811523, + "learning_rate": 1.482030804335425e-05, + "loss": 2.9496, + "step": 2604 + }, + { + "epoch": 0.44583262022933423, + "grad_norm": 13.012333869934082, + "learning_rate": 1.4826012549914433e-05, + "loss": 0.8159, + "step": 2605 + }, + { + "epoch": 0.44600376518911516, + "grad_norm": 23.477638244628906, + "learning_rate": 1.4831717056474614e-05, + "loss": 2.4937, + "step": 2606 + }, + { + "epoch": 0.4461749101488961, + "grad_norm": 35.8111572265625, + "learning_rate": 1.48374215630348e-05, + "loss": 3.2833, + "step": 2607 + }, + { + "epoch": 0.44634605510867703, + "grad_norm": 32.99673080444336, + "learning_rate": 1.4843126069594981e-05, + "loss": 3.5874, + "step": 2608 + }, + { + "epoch": 0.44651720006845796, + "grad_norm": 3.853698253631592, + "learning_rate": 1.4848830576155163e-05, + "loss": 0.4709, + "step": 2609 + }, + { + "epoch": 0.4466883450282389, + "grad_norm": 27.9306583404541, + "learning_rate": 1.4854535082715346e-05, + "loss": 2.6247, + "step": 2610 + }, + { + "epoch": 0.44685948998801983, + "grad_norm": 11.854992866516113, + "learning_rate": 1.4860239589275528e-05, + "loss": 0.9967, + "step": 2611 + }, + { + "epoch": 0.44703063494780076, + "grad_norm": 49.759117126464844, + "learning_rate": 1.4865944095835711e-05, + "loss": 7.1995, + "step": 2612 + }, + { + "epoch": 0.4472017799075817, + "grad_norm": 31.380281448364258, + "learning_rate": 1.4871648602395893e-05, + "loss": 2.7301, + "step": 2613 + }, + { + "epoch": 0.44737292486736263, + "grad_norm": 29.84979820251465, + "learning_rate": 1.4877353108956075e-05, + "loss": 3.1099, + "step": 2614 + }, + { + "epoch": 0.44754406982714356, + "grad_norm": 13.841278076171875, + "learning_rate": 1.4883057615516258e-05, + "loss": 1.0569, + "step": 2615 + }, + { + "epoch": 0.4477152147869245, + "grad_norm": 28.414051055908203, + "learning_rate": 1.488876212207644e-05, + "loss": 2.6359, + "step": 2616 + }, + { + "epoch": 0.4478863597467055, + "grad_norm": 29.42824363708496, + "learning_rate": 1.4894466628636623e-05, + "loss": 3.3203, + "step": 2617 + }, + { + "epoch": 0.4480575047064864, + "grad_norm": 33.065799713134766, + "learning_rate": 1.4900171135196805e-05, + "loss": 4.0124, + "step": 2618 + }, + { + "epoch": 0.44822864966626735, + "grad_norm": 9.898391723632812, + "learning_rate": 1.490587564175699e-05, + "loss": 0.9107, + "step": 2619 + }, + { + "epoch": 0.4483997946260483, + "grad_norm": 23.923398971557617, + "learning_rate": 1.4911580148317172e-05, + "loss": 2.5028, + "step": 2620 + }, + { + "epoch": 0.4485709395858292, + "grad_norm": 25.825178146362305, + "learning_rate": 1.4917284654877353e-05, + "loss": 2.3647, + "step": 2621 + }, + { + "epoch": 0.44874208454561015, + "grad_norm": 24.46117401123047, + "learning_rate": 1.4922989161437537e-05, + "loss": 2.66, + "step": 2622 + }, + { + "epoch": 0.4489132295053911, + "grad_norm": 19.926624298095703, + "learning_rate": 1.4928693667997718e-05, + "loss": 1.771, + "step": 2623 + }, + { + "epoch": 0.449084374465172, + "grad_norm": 107.68805694580078, + "learning_rate": 1.4934398174557902e-05, + "loss": 9.219, + "step": 2624 + }, + { + "epoch": 0.44925551942495295, + "grad_norm": 18.121204376220703, + "learning_rate": 1.4940102681118083e-05, + "loss": 1.3726, + "step": 2625 + }, + { + "epoch": 0.4494266643847339, + "grad_norm": 27.648178100585938, + "learning_rate": 1.4945807187678267e-05, + "loss": 2.6469, + "step": 2626 + }, + { + "epoch": 0.4495978093445148, + "grad_norm": 28.146556854248047, + "learning_rate": 1.4951511694238448e-05, + "loss": 2.8926, + "step": 2627 + }, + { + "epoch": 0.44976895430429575, + "grad_norm": 52.536190032958984, + "learning_rate": 1.495721620079863e-05, + "loss": 7.5002, + "step": 2628 + }, + { + "epoch": 0.4499400992640767, + "grad_norm": 24.027881622314453, + "learning_rate": 1.4962920707358814e-05, + "loss": 2.3452, + "step": 2629 + }, + { + "epoch": 0.4501112442238576, + "grad_norm": 34.977684020996094, + "learning_rate": 1.4968625213918997e-05, + "loss": 3.9508, + "step": 2630 + }, + { + "epoch": 0.45028238918363855, + "grad_norm": 30.991193771362305, + "learning_rate": 1.497432972047918e-05, + "loss": 3.6064, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_nli-pairs_loss": 2.871744394302368, + "eval_nli-pairs_runtime": 4.2947, + "eval_nli-pairs_samples_per_second": 46.569, + "eval_nli-pairs_steps_per_second": 1.63, + "eval_sts-test_pearson_cosine": 0.7195428557259504, + "eval_sts-test_pearson_dot": 0.6098064793689061, + "eval_sts-test_pearson_euclidean": 0.7205423612792191, + "eval_sts-test_pearson_manhattan": 0.7293110123887395, + "eval_sts-test_pearson_max": 0.7293110123887395, + "eval_sts-test_spearman_cosine": 0.6966954300008318, + "eval_sts-test_spearman_dot": 0.5822364450229315, + "eval_sts-test_spearman_euclidean": 0.7004689124572796, + "eval_sts-test_spearman_manhattan": 0.7099498051685355, + "eval_sts-test_spearman_max": 0.7099498051685355, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_vitaminc-pairs_loss": 1.8629615306854248, + "eval_vitaminc-pairs_runtime": 2.7342, + "eval_vitaminc-pairs_samples_per_second": 73.148, + "eval_vitaminc-pairs_steps_per_second": 2.56, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_qnli-contrastive_loss": 5.418925762176514, + "eval_qnli-contrastive_runtime": 0.6359, + "eval_qnli-contrastive_samples_per_second": 314.496, + "eval_qnli-contrastive_steps_per_second": 11.007, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_scitail-pairs-qa_loss": 0.4216327965259552, + "eval_scitail-pairs-qa_runtime": 1.6135, + "eval_scitail-pairs-qa_samples_per_second": 123.956, + "eval_scitail-pairs-qa_steps_per_second": 4.338, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_scitail-pairs-pos_loss": 1.3018670082092285, + "eval_scitail-pairs-pos_runtime": 2.6103, + "eval_scitail-pairs-pos_samples_per_second": 76.619, + "eval_scitail-pairs-pos_steps_per_second": 2.682, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_xsum-pairs_loss": 1.584064245223999, + "eval_xsum-pairs_runtime": 2.6388, + "eval_xsum-pairs_samples_per_second": 66.317, + "eval_xsum-pairs_steps_per_second": 2.274, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_compression-pairs_loss": 0.7760603427886963, + "eval_compression-pairs_runtime": 0.5146, + "eval_compression-pairs_samples_per_second": 388.623, + "eval_compression-pairs_steps_per_second": 13.602, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_sciq_pairs_loss": 5.851566314697266, + "eval_sciq_pairs_runtime": 9.2089, + "eval_sciq_pairs_samples_per_second": 21.718, + "eval_sciq_pairs_steps_per_second": 0.76, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_qasc_pairs_loss": 7.442629814147949, + "eval_qasc_pairs_runtime": 2.6477, + "eval_qasc_pairs_samples_per_second": 75.537, + "eval_qasc_pairs_steps_per_second": 2.644, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_openbookqa_pairs_loss": 4.049252033233643, + "eval_openbookqa_pairs_runtime": 0.6399, + "eval_openbookqa_pairs_samples_per_second": 107.834, + "eval_openbookqa_pairs_steps_per_second": 4.688, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_msmarco_pairs_loss": 2.6957242488861084, + "eval_msmarco_pairs_runtime": 3.9586, + "eval_msmarco_pairs_samples_per_second": 50.523, + "eval_msmarco_pairs_steps_per_second": 1.768, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_nq_pairs_loss": 3.332510471343994, + "eval_nq_pairs_runtime": 8.6125, + "eval_nq_pairs_samples_per_second": 23.222, + "eval_nq_pairs_steps_per_second": 0.813, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_trivia_pairs_loss": 3.298595905303955, + "eval_trivia_pairs_runtime": 12.8335, + "eval_trivia_pairs_samples_per_second": 15.584, + "eval_trivia_pairs_steps_per_second": 0.545, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_quora_pairs_loss": 0.6931056380271912, + "eval_quora_pairs_runtime": 1.5975, + "eval_quora_pairs_samples_per_second": 125.194, + "eval_quora_pairs_steps_per_second": 4.382, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_gooaq_pairs_loss": 2.1408634185791016, + "eval_gooaq_pairs_runtime": 2.6505, + "eval_gooaq_pairs_samples_per_second": 75.457, + "eval_gooaq_pairs_steps_per_second": 2.641, + "step": 2631 + }, + { + "epoch": 0.4504535341434195, + "grad_norm": 25.071407318115234, + "learning_rate": 1.4980034227039362e-05, + "loss": 2.3902, + "step": 2632 + }, + { + "epoch": 0.4506246791032004, + "grad_norm": 14.988327026367188, + "learning_rate": 1.4985738733599544e-05, + "loss": 1.1409, + "step": 2633 + }, + { + "epoch": 0.45079582406298135, + "grad_norm": 26.867197036743164, + "learning_rate": 1.4991443240159727e-05, + "loss": 2.932, + "step": 2634 + }, + { + "epoch": 0.4509669690227623, + "grad_norm": 36.01612091064453, + "learning_rate": 1.4997147746719909e-05, + "loss": 4.7158, + "step": 2635 + }, + { + "epoch": 0.4511381139825432, + "grad_norm": 16.741594314575195, + "learning_rate": 1.5002852253280092e-05, + "loss": 1.3786, + "step": 2636 + }, + { + "epoch": 0.45130925894232415, + "grad_norm": 27.737268447875977, + "learning_rate": 1.5008556759840274e-05, + "loss": 3.3069, + "step": 2637 + }, + { + "epoch": 0.4514804039021051, + "grad_norm": 12.152483940124512, + "learning_rate": 1.5014261266400457e-05, + "loss": 1.0332, + "step": 2638 + }, + { + "epoch": 0.451651548861886, + "grad_norm": 12.2247314453125, + "learning_rate": 1.5019965772960639e-05, + "loss": 0.825, + "step": 2639 + }, + { + "epoch": 0.45182269382166695, + "grad_norm": 10.578752517700195, + "learning_rate": 1.502567027952082e-05, + "loss": 1.097, + "step": 2640 + }, + { + "epoch": 0.4519938387814479, + "grad_norm": 27.834949493408203, + "learning_rate": 1.5031374786081004e-05, + "loss": 3.3057, + "step": 2641 + }, + { + "epoch": 0.4521649837412288, + "grad_norm": 31.40846824645996, + "learning_rate": 1.5037079292641186e-05, + "loss": 3.5691, + "step": 2642 + }, + { + "epoch": 0.45233612870100975, + "grad_norm": 37.22605514526367, + "learning_rate": 1.5042783799201369e-05, + "loss": 3.8636, + "step": 2643 + }, + { + "epoch": 0.4525072736607907, + "grad_norm": 10.362072944641113, + "learning_rate": 1.504848830576155e-05, + "loss": 0.9574, + "step": 2644 + }, + { + "epoch": 0.4526784186205716, + "grad_norm": 27.246967315673828, + "learning_rate": 1.5054192812321734e-05, + "loss": 2.6717, + "step": 2645 + }, + { + "epoch": 0.45284956358035255, + "grad_norm": 17.54155921936035, + "learning_rate": 1.5059897318881916e-05, + "loss": 1.1453, + "step": 2646 + }, + { + "epoch": 0.4530207085401335, + "grad_norm": 27.662446975708008, + "learning_rate": 1.50656018254421e-05, + "loss": 2.3804, + "step": 2647 + }, + { + "epoch": 0.4531918534999144, + "grad_norm": 94.5572738647461, + "learning_rate": 1.5071306332002284e-05, + "loss": 8.0449, + "step": 2648 + }, + { + "epoch": 0.45336299845969535, + "grad_norm": 30.91036605834961, + "learning_rate": 1.5077010838562466e-05, + "loss": 3.4507, + "step": 2649 + }, + { + "epoch": 0.4535341434194763, + "grad_norm": 24.88844108581543, + "learning_rate": 1.508271534512265e-05, + "loss": 2.4537, + "step": 2650 + }, + { + "epoch": 0.4537052883792572, + "grad_norm": 36.03679656982422, + "learning_rate": 1.5088419851682831e-05, + "loss": 7.241, + "step": 2651 + }, + { + "epoch": 0.45387643333903815, + "grad_norm": 25.070192337036133, + "learning_rate": 1.5094124358243013e-05, + "loss": 2.5557, + "step": 2652 + }, + { + "epoch": 0.4540475782988191, + "grad_norm": 7.677706718444824, + "learning_rate": 1.5099828864803196e-05, + "loss": 0.5579, + "step": 2653 + }, + { + "epoch": 0.4542187232586, + "grad_norm": 30.037338256835938, + "learning_rate": 1.5105533371363378e-05, + "loss": 3.2643, + "step": 2654 + }, + { + "epoch": 0.45438986821838095, + "grad_norm": 25.368310928344727, + "learning_rate": 1.5111237877923561e-05, + "loss": 2.722, + "step": 2655 + }, + { + "epoch": 0.4545610131781619, + "grad_norm": 36.92127990722656, + "learning_rate": 1.5116942384483743e-05, + "loss": 4.7207, + "step": 2656 + }, + { + "epoch": 0.4547321581379428, + "grad_norm": 22.686552047729492, + "learning_rate": 1.5122646891043926e-05, + "loss": 2.1942, + "step": 2657 + }, + { + "epoch": 0.45490330309772375, + "grad_norm": 53.640262603759766, + "learning_rate": 1.5128351397604108e-05, + "loss": 6.8632, + "step": 2658 + }, + { + "epoch": 0.4550744480575047, + "grad_norm": 24.542247772216797, + "learning_rate": 1.513405590416429e-05, + "loss": 2.4562, + "step": 2659 + }, + { + "epoch": 0.4552455930172856, + "grad_norm": 5.353951930999756, + "learning_rate": 1.5139760410724473e-05, + "loss": 0.5523, + "step": 2660 + }, + { + "epoch": 0.45541673797706655, + "grad_norm": 32.79592514038086, + "learning_rate": 1.5145464917284655e-05, + "loss": 3.4424, + "step": 2661 + }, + { + "epoch": 0.4555878829368475, + "grad_norm": 35.7240104675293, + "learning_rate": 1.5151169423844838e-05, + "loss": 3.5062, + "step": 2662 + }, + { + "epoch": 0.4557590278966284, + "grad_norm": 30.997047424316406, + "learning_rate": 1.515687393040502e-05, + "loss": 3.9807, + "step": 2663 + }, + { + "epoch": 0.45593017285640935, + "grad_norm": 41.52260208129883, + "learning_rate": 1.5162578436965201e-05, + "loss": 4.4682, + "step": 2664 + }, + { + "epoch": 0.4561013178161903, + "grad_norm": 33.410797119140625, + "learning_rate": 1.5168282943525385e-05, + "loss": 3.3602, + "step": 2665 + }, + { + "epoch": 0.4562724627759712, + "grad_norm": 22.308074951171875, + "learning_rate": 1.5173987450085566e-05, + "loss": 2.5111, + "step": 2666 + }, + { + "epoch": 0.45644360773575215, + "grad_norm": 21.073827743530273, + "learning_rate": 1.517969195664575e-05, + "loss": 2.1332, + "step": 2667 + }, + { + "epoch": 0.45661475269553314, + "grad_norm": 36.2976188659668, + "learning_rate": 1.5185396463205931e-05, + "loss": 4.8541, + "step": 2668 + }, + { + "epoch": 0.4567858976553141, + "grad_norm": 37.76522445678711, + "learning_rate": 1.5191100969766115e-05, + "loss": 6.96, + "step": 2669 + }, + { + "epoch": 0.456957042615095, + "grad_norm": 29.864612579345703, + "learning_rate": 1.51968054763263e-05, + "loss": 2.896, + "step": 2670 + }, + { + "epoch": 0.45712818757487594, + "grad_norm": 22.04704475402832, + "learning_rate": 1.5202509982886482e-05, + "loss": 2.6772, + "step": 2671 + }, + { + "epoch": 0.4572993325346569, + "grad_norm": 19.153793334960938, + "learning_rate": 1.5208214489446665e-05, + "loss": 1.7357, + "step": 2672 + }, + { + "epoch": 0.4574704774944378, + "grad_norm": 30.495540618896484, + "learning_rate": 1.5213918996006847e-05, + "loss": 3.1067, + "step": 2673 + }, + { + "epoch": 0.45764162245421874, + "grad_norm": 12.724396705627441, + "learning_rate": 1.521962350256703e-05, + "loss": 0.9931, + "step": 2674 + }, + { + "epoch": 0.4578127674139997, + "grad_norm": 6.2942399978637695, + "learning_rate": 1.5225328009127212e-05, + "loss": 0.5454, + "step": 2675 + }, + { + "epoch": 0.4579839123737806, + "grad_norm": 10.231136322021484, + "learning_rate": 1.5231032515687395e-05, + "loss": 1.696, + "step": 2676 + }, + { + "epoch": 0.45815505733356154, + "grad_norm": 152.32469177246094, + "learning_rate": 1.5236737022247577e-05, + "loss": 8.8958, + "step": 2677 + }, + { + "epoch": 0.4583262022933425, + "grad_norm": 38.06270980834961, + "learning_rate": 1.5242441528807758e-05, + "loss": 3.9409, + "step": 2678 + }, + { + "epoch": 0.4584973472531234, + "grad_norm": 25.77074432373047, + "learning_rate": 1.5248146035367942e-05, + "loss": 2.6594, + "step": 2679 + }, + { + "epoch": 0.45866849221290434, + "grad_norm": 29.309284210205078, + "learning_rate": 1.5253850541928123e-05, + "loss": 3.3099, + "step": 2680 + }, + { + "epoch": 0.4588396371726853, + "grad_norm": 25.866558074951172, + "learning_rate": 1.5259555048488305e-05, + "loss": 3.4843, + "step": 2681 + }, + { + "epoch": 0.4590107821324662, + "grad_norm": 23.736045837402344, + "learning_rate": 1.526525955504849e-05, + "loss": 2.5762, + "step": 2682 + }, + { + "epoch": 0.45918192709224714, + "grad_norm": 24.48654556274414, + "learning_rate": 1.5270964061608672e-05, + "loss": 2.4442, + "step": 2683 + }, + { + "epoch": 0.4593530720520281, + "grad_norm": 9.48880386352539, + "learning_rate": 1.5276668568168852e-05, + "loss": 1.0744, + "step": 2684 + }, + { + "epoch": 0.459524217011809, + "grad_norm": 5.050061225891113, + "learning_rate": 1.5282373074729035e-05, + "loss": 0.4942, + "step": 2685 + }, + { + "epoch": 0.45969536197158994, + "grad_norm": 22.153188705444336, + "learning_rate": 1.528807758128922e-05, + "loss": 2.3914, + "step": 2686 + }, + { + "epoch": 0.4598665069313709, + "grad_norm": 3.6951212882995605, + "learning_rate": 1.5293782087849402e-05, + "loss": 0.4808, + "step": 2687 + }, + { + "epoch": 0.4600376518911518, + "grad_norm": 11.074764251708984, + "learning_rate": 1.5299486594409582e-05, + "loss": 1.7231, + "step": 2688 + }, + { + "epoch": 0.46020879685093274, + "grad_norm": 19.63498306274414, + "learning_rate": 1.5305191100969765e-05, + "loss": 2.1931, + "step": 2689 + }, + { + "epoch": 0.4603799418107137, + "grad_norm": 30.376020431518555, + "learning_rate": 1.531089560752995e-05, + "loss": 3.2142, + "step": 2690 + }, + { + "epoch": 0.4605510867704946, + "grad_norm": 39.05623245239258, + "learning_rate": 1.531660011409013e-05, + "loss": 4.3445, + "step": 2691 + }, + { + "epoch": 0.46072223173027554, + "grad_norm": 34.95427703857422, + "learning_rate": 1.5322304620650312e-05, + "loss": 3.5087, + "step": 2692 + }, + { + "epoch": 0.4608933766900565, + "grad_norm": 24.339468002319336, + "learning_rate": 1.53280091272105e-05, + "loss": 2.139, + "step": 2693 + }, + { + "epoch": 0.4610645216498374, + "grad_norm": 36.85024642944336, + "learning_rate": 1.5333713633770682e-05, + "loss": 4.5667, + "step": 2694 + }, + { + "epoch": 0.46123566660961834, + "grad_norm": 121.48307037353516, + "learning_rate": 1.5339418140330862e-05, + "loss": 5.9012, + "step": 2695 + }, + { + "epoch": 0.4614068115693993, + "grad_norm": 7.473188877105713, + "learning_rate": 1.5345122646891046e-05, + "loss": 0.9301, + "step": 2696 + }, + { + "epoch": 0.4615779565291802, + "grad_norm": 27.48497200012207, + "learning_rate": 1.535082715345123e-05, + "loss": 3.0351, + "step": 2697 + }, + { + "epoch": 0.46174910148896114, + "grad_norm": 22.619394302368164, + "learning_rate": 1.535653166001141e-05, + "loss": 2.4788, + "step": 2698 + }, + { + "epoch": 0.4619202464487421, + "grad_norm": 25.0198974609375, + "learning_rate": 1.5362236166571592e-05, + "loss": 2.3989, + "step": 2699 + }, + { + "epoch": 0.462091391408523, + "grad_norm": 23.36564064025879, + "learning_rate": 1.5367940673131776e-05, + "loss": 2.4179, + "step": 2700 + }, + { + "epoch": 0.46226253636830394, + "grad_norm": 29.04068946838379, + "learning_rate": 1.537364517969196e-05, + "loss": 3.1229, + "step": 2701 + }, + { + "epoch": 0.4624336813280849, + "grad_norm": 27.629722595214844, + "learning_rate": 1.537934968625214e-05, + "loss": 2.7618, + "step": 2702 + }, + { + "epoch": 0.4626048262878658, + "grad_norm": 23.081079483032227, + "learning_rate": 1.5385054192812323e-05, + "loss": 2.8201, + "step": 2703 + }, + { + "epoch": 0.46277597124764674, + "grad_norm": 26.009172439575195, + "learning_rate": 1.5390758699372506e-05, + "loss": 3.1322, + "step": 2704 + }, + { + "epoch": 0.4629471162074277, + "grad_norm": 18.447147369384766, + "learning_rate": 1.5396463205932686e-05, + "loss": 1.2356, + "step": 2705 + }, + { + "epoch": 0.4631182611672086, + "grad_norm": 22.773012161254883, + "learning_rate": 1.540216771249287e-05, + "loss": 2.6551, + "step": 2706 + }, + { + "epoch": 0.46328940612698954, + "grad_norm": 32.899314880371094, + "learning_rate": 1.5407872219053053e-05, + "loss": 3.7763, + "step": 2707 + }, + { + "epoch": 0.4634605510867705, + "grad_norm": 97.4777603149414, + "learning_rate": 1.5413576725613233e-05, + "loss": 4.5767, + "step": 2708 + }, + { + "epoch": 0.4636316960465514, + "grad_norm": 41.41079330444336, + "learning_rate": 1.5419281232173416e-05, + "loss": 7.352, + "step": 2709 + }, + { + "epoch": 0.46380284100633234, + "grad_norm": 24.83094024658203, + "learning_rate": 1.54249857387336e-05, + "loss": 2.836, + "step": 2710 + }, + { + "epoch": 0.4639739859661133, + "grad_norm": 12.101001739501953, + "learning_rate": 1.5430690245293783e-05, + "loss": 0.9624, + "step": 2711 + }, + { + "epoch": 0.4641451309258942, + "grad_norm": 24.289182662963867, + "learning_rate": 1.5436394751853963e-05, + "loss": 2.3101, + "step": 2712 + }, + { + "epoch": 0.46431627588567514, + "grad_norm": 23.911334991455078, + "learning_rate": 1.5442099258414146e-05, + "loss": 2.4969, + "step": 2713 + }, + { + "epoch": 0.4644874208454561, + "grad_norm": 35.51081085205078, + "learning_rate": 1.544780376497433e-05, + "loss": 3.353, + "step": 2714 + }, + { + "epoch": 0.464658565805237, + "grad_norm": 21.24627113342285, + "learning_rate": 1.545350827153451e-05, + "loss": 2.5466, + "step": 2715 + }, + { + "epoch": 0.46482971076501794, + "grad_norm": 30.70880126953125, + "learning_rate": 1.5459212778094696e-05, + "loss": 3.8228, + "step": 2716 + }, + { + "epoch": 0.4650008557247989, + "grad_norm": 25.956119537353516, + "learning_rate": 1.546491728465488e-05, + "loss": 2.6475, + "step": 2717 + }, + { + "epoch": 0.4651720006845798, + "grad_norm": 37.32086944580078, + "learning_rate": 1.5470621791215063e-05, + "loss": 3.6192, + "step": 2718 + }, + { + "epoch": 0.4653431456443608, + "grad_norm": 25.61843490600586, + "learning_rate": 1.5476326297775243e-05, + "loss": 2.336, + "step": 2719 + }, + { + "epoch": 0.46551429060414173, + "grad_norm": 31.5511531829834, + "learning_rate": 1.5482030804335426e-05, + "loss": 3.1832, + "step": 2720 + }, + { + "epoch": 0.46568543556392267, + "grad_norm": 35.96617889404297, + "learning_rate": 1.548773531089561e-05, + "loss": 4.2684, + "step": 2721 + }, + { + "epoch": 0.4658565805237036, + "grad_norm": 12.214024543762207, + "learning_rate": 1.549343981745579e-05, + "loss": 0.8686, + "step": 2722 + }, + { + "epoch": 0.46602772548348453, + "grad_norm": 3.517146110534668, + "learning_rate": 1.5499144324015973e-05, + "loss": 0.4999, + "step": 2723 + }, + { + "epoch": 0.46619887044326547, + "grad_norm": 27.56136703491211, + "learning_rate": 1.5504848830576157e-05, + "loss": 2.7019, + "step": 2724 + }, + { + "epoch": 0.4663700154030464, + "grad_norm": 4.812444686889648, + "learning_rate": 1.551055333713634e-05, + "loss": 0.5107, + "step": 2725 + }, + { + "epoch": 0.46654116036282733, + "grad_norm": 30.523237228393555, + "learning_rate": 1.551625784369652e-05, + "loss": 3.2109, + "step": 2726 + }, + { + "epoch": 0.46671230532260827, + "grad_norm": 28.326934814453125, + "learning_rate": 1.5521962350256703e-05, + "loss": 3.2289, + "step": 2727 + }, + { + "epoch": 0.4668834502823892, + "grad_norm": 34.37868118286133, + "learning_rate": 1.5527666856816887e-05, + "loss": 3.6814, + "step": 2728 + }, + { + "epoch": 0.46705459524217013, + "grad_norm": 30.16160774230957, + "learning_rate": 1.5533371363377067e-05, + "loss": 3.4049, + "step": 2729 + }, + { + "epoch": 0.46722574020195107, + "grad_norm": 4.218698024749756, + "learning_rate": 1.553907586993725e-05, + "loss": 0.4987, + "step": 2730 + }, + { + "epoch": 0.467396885161732, + "grad_norm": 23.180875778198242, + "learning_rate": 1.5544780376497433e-05, + "loss": 2.2238, + "step": 2731 + }, + { + "epoch": 0.46756803012151293, + "grad_norm": 25.21503257751465, + "learning_rate": 1.5550484883057617e-05, + "loss": 2.4819, + "step": 2732 + }, + { + "epoch": 0.46773917508129387, + "grad_norm": 30.37474822998047, + "learning_rate": 1.5556189389617797e-05, + "loss": 3.2935, + "step": 2733 + }, + { + "epoch": 0.4679103200410748, + "grad_norm": 16.8712100982666, + "learning_rate": 1.556189389617798e-05, + "loss": 1.0892, + "step": 2734 + }, + { + "epoch": 0.46808146500085573, + "grad_norm": 23.52683448791504, + "learning_rate": 1.5567598402738164e-05, + "loss": 2.3256, + "step": 2735 + }, + { + "epoch": 0.46825260996063667, + "grad_norm": 37.76002502441406, + "learning_rate": 1.5573302909298344e-05, + "loss": 3.8535, + "step": 2736 + }, + { + "epoch": 0.4684237549204176, + "grad_norm": 31.672475814819336, + "learning_rate": 1.5579007415858527e-05, + "loss": 2.5348, + "step": 2737 + }, + { + "epoch": 0.46859489988019853, + "grad_norm": 59.173072814941406, + "learning_rate": 1.558471192241871e-05, + "loss": 7.7627, + "step": 2738 + }, + { + "epoch": 0.46876604483997947, + "grad_norm": 23.428421020507812, + "learning_rate": 1.5590416428978894e-05, + "loss": 2.3317, + "step": 2739 + }, + { + "epoch": 0.4689371897997604, + "grad_norm": 38.1778564453125, + "learning_rate": 1.5596120935539077e-05, + "loss": 7.6561, + "step": 2740 + }, + { + "epoch": 0.46910833475954133, + "grad_norm": 10.163063049316406, + "learning_rate": 1.560182544209926e-05, + "loss": 0.7524, + "step": 2741 + }, + { + "epoch": 0.46927947971932227, + "grad_norm": 3.395460367202759, + "learning_rate": 1.5607529948659444e-05, + "loss": 0.4881, + "step": 2742 + }, + { + "epoch": 0.4694506246791032, + "grad_norm": 28.233747482299805, + "learning_rate": 1.5613234455219624e-05, + "loss": 2.606, + "step": 2743 + }, + { + "epoch": 0.46962176963888413, + "grad_norm": 33.14704513549805, + "learning_rate": 1.5618938961779807e-05, + "loss": 3.4617, + "step": 2744 + }, + { + "epoch": 0.46979291459866507, + "grad_norm": 4.885557651519775, + "learning_rate": 1.562464346833999e-05, + "loss": 0.5159, + "step": 2745 + }, + { + "epoch": 0.469964059558446, + "grad_norm": 32.37671661376953, + "learning_rate": 1.563034797490017e-05, + "loss": 3.1744, + "step": 2746 + }, + { + "epoch": 0.47013520451822693, + "grad_norm": 26.680980682373047, + "learning_rate": 1.5636052481460354e-05, + "loss": 2.8685, + "step": 2747 + }, + { + "epoch": 0.47030634947800787, + "grad_norm": 27.004371643066406, + "learning_rate": 1.5641756988020537e-05, + "loss": 3.0092, + "step": 2748 + }, + { + "epoch": 0.4704774944377888, + "grad_norm": 21.964834213256836, + "learning_rate": 1.564746149458072e-05, + "loss": 2.2193, + "step": 2749 + }, + { + "epoch": 0.47064863939756973, + "grad_norm": 66.10285186767578, + "learning_rate": 1.56531660011409e-05, + "loss": 7.346, + "step": 2750 + }, + { + "epoch": 0.47081978435735067, + "grad_norm": 6.018518924713135, + "learning_rate": 1.5658870507701084e-05, + "loss": 0.5488, + "step": 2751 + }, + { + "epoch": 0.4709909293171316, + "grad_norm": 30.385318756103516, + "learning_rate": 1.5664575014261267e-05, + "loss": 3.0093, + "step": 2752 + }, + { + "epoch": 0.47116207427691253, + "grad_norm": 13.027030944824219, + "learning_rate": 1.5670279520821447e-05, + "loss": 1.9682, + "step": 2753 + }, + { + "epoch": 0.47133321923669347, + "grad_norm": 35.71416473388672, + "learning_rate": 1.567598402738163e-05, + "loss": 6.69, + "step": 2754 + }, + { + "epoch": 0.4715043641964744, + "grad_norm": 29.253435134887695, + "learning_rate": 1.5681688533941814e-05, + "loss": 2.932, + "step": 2755 + }, + { + "epoch": 0.47167550915625533, + "grad_norm": 30.07666778564453, + "learning_rate": 1.5687393040501998e-05, + "loss": 3.8444, + "step": 2756 + }, + { + "epoch": 0.47184665411603627, + "grad_norm": 35.976871490478516, + "learning_rate": 1.5693097547062178e-05, + "loss": 4.9907, + "step": 2757 + }, + { + "epoch": 0.4720177990758172, + "grad_norm": 87.46236419677734, + "learning_rate": 1.569880205362236e-05, + "loss": 4.691, + "step": 2758 + }, + { + "epoch": 0.47218894403559813, + "grad_norm": 22.59965705871582, + "learning_rate": 1.5704506560182544e-05, + "loss": 2.1086, + "step": 2759 + }, + { + "epoch": 0.47236008899537907, + "grad_norm": 8.256918907165527, + "learning_rate": 1.5710211066742724e-05, + "loss": 0.9678, + "step": 2760 + }, + { + "epoch": 0.47253123395516, + "grad_norm": 38.63548278808594, + "learning_rate": 1.5715915573302908e-05, + "loss": 3.4149, + "step": 2761 + }, + { + "epoch": 0.47270237891494093, + "grad_norm": 7.380117893218994, + "learning_rate": 1.5721620079863094e-05, + "loss": 0.6134, + "step": 2762 + }, + { + "epoch": 0.47287352387472187, + "grad_norm": 27.26441764831543, + "learning_rate": 1.5727324586423278e-05, + "loss": 2.8164, + "step": 2763 + }, + { + "epoch": 0.4730446688345028, + "grad_norm": 18.643917083740234, + "learning_rate": 1.5733029092983458e-05, + "loss": 1.9656, + "step": 2764 + }, + { + "epoch": 0.47321581379428374, + "grad_norm": 27.289445877075195, + "learning_rate": 1.573873359954364e-05, + "loss": 2.8402, + "step": 2765 + }, + { + "epoch": 0.47338695875406467, + "grad_norm": 6.67548942565918, + "learning_rate": 1.5744438106103825e-05, + "loss": 0.7842, + "step": 2766 + }, + { + "epoch": 0.4735581037138456, + "grad_norm": 33.30831527709961, + "learning_rate": 1.5750142612664005e-05, + "loss": 3.6011, + "step": 2767 + }, + { + "epoch": 0.47372924867362654, + "grad_norm": 104.88871765136719, + "learning_rate": 1.5755847119224188e-05, + "loss": 5.0141, + "step": 2768 + }, + { + "epoch": 0.47390039363340747, + "grad_norm": 5.102696418762207, + "learning_rate": 1.576155162578437e-05, + "loss": 0.5258, + "step": 2769 + }, + { + "epoch": 0.47407153859318846, + "grad_norm": 3.6947317123413086, + "learning_rate": 1.576725613234455e-05, + "loss": 0.4833, + "step": 2770 + }, + { + "epoch": 0.4742426835529694, + "grad_norm": 32.43489074707031, + "learning_rate": 1.5772960638904735e-05, + "loss": 3.7272, + "step": 2771 + }, + { + "epoch": 0.4744138285127503, + "grad_norm": 32.32176971435547, + "learning_rate": 1.5778665145464918e-05, + "loss": 3.4356, + "step": 2772 + }, + { + "epoch": 0.47458497347253126, + "grad_norm": 2.583381175994873, + "learning_rate": 1.57843696520251e-05, + "loss": 0.4212, + "step": 2773 + }, + { + "epoch": 0.4747561184323122, + "grad_norm": 38.029869079589844, + "learning_rate": 1.579007415858528e-05, + "loss": 7.2033, + "step": 2774 + }, + { + "epoch": 0.4749272633920931, + "grad_norm": 10.577736854553223, + "learning_rate": 1.5795778665145465e-05, + "loss": 1.2395, + "step": 2775 + }, + { + "epoch": 0.47509840835187406, + "grad_norm": 9.981147766113281, + "learning_rate": 1.5801483171705648e-05, + "loss": 1.4924, + "step": 2776 + }, + { + "epoch": 0.475269553311655, + "grad_norm": 28.36383819580078, + "learning_rate": 1.5807187678265828e-05, + "loss": 3.8155, + "step": 2777 + }, + { + "epoch": 0.4754406982714359, + "grad_norm": 6.329680442810059, + "learning_rate": 1.581289218482601e-05, + "loss": 0.4932, + "step": 2778 + }, + { + "epoch": 0.47561184323121686, + "grad_norm": 17.587629318237305, + "learning_rate": 1.5818596691386195e-05, + "loss": 1.8358, + "step": 2779 + }, + { + "epoch": 0.4757829881909978, + "grad_norm": 32.48772048950195, + "learning_rate": 1.582430119794638e-05, + "loss": 4.1859, + "step": 2780 + }, + { + "epoch": 0.4759541331507787, + "grad_norm": 41.349056243896484, + "learning_rate": 1.583000570450656e-05, + "loss": 7.0338, + "step": 2781 + }, + { + "epoch": 0.47612527811055966, + "grad_norm": 32.28718185424805, + "learning_rate": 1.583571021106674e-05, + "loss": 3.439, + "step": 2782 + }, + { + "epoch": 0.4762964230703406, + "grad_norm": 52.53911209106445, + "learning_rate": 1.5841414717626925e-05, + "loss": 6.9516, + "step": 2783 + }, + { + "epoch": 0.4764675680301215, + "grad_norm": 6.450766086578369, + "learning_rate": 1.5847119224187105e-05, + "loss": 0.4587, + "step": 2784 + }, + { + "epoch": 0.47663871298990246, + "grad_norm": 31.295753479003906, + "learning_rate": 1.5852823730747292e-05, + "loss": 3.6037, + "step": 2785 + }, + { + "epoch": 0.4768098579496834, + "grad_norm": 10.392585754394531, + "learning_rate": 1.5858528237307475e-05, + "loss": 0.7695, + "step": 2786 + }, + { + "epoch": 0.4769810029094643, + "grad_norm": 31.578166961669922, + "learning_rate": 1.586423274386766e-05, + "loss": 3.4914, + "step": 2787 + }, + { + "epoch": 0.47715214786924526, + "grad_norm": 35.540199279785156, + "learning_rate": 1.586993725042784e-05, + "loss": 4.0507, + "step": 2788 + }, + { + "epoch": 0.4773232928290262, + "grad_norm": 30.065216064453125, + "learning_rate": 1.5875641756988022e-05, + "loss": 3.4183, + "step": 2789 + }, + { + "epoch": 0.4774944377888071, + "grad_norm": 3.6649258136749268, + "learning_rate": 1.5881346263548205e-05, + "loss": 0.4127, + "step": 2790 + }, + { + "epoch": 0.47766558274858806, + "grad_norm": 88.72532653808594, + "learning_rate": 1.5887050770108385e-05, + "loss": 4.5608, + "step": 2791 + }, + { + "epoch": 0.477836727708369, + "grad_norm": 23.770221710205078, + "learning_rate": 1.589275527666857e-05, + "loss": 2.2223, + "step": 2792 + }, + { + "epoch": 0.4780078726681499, + "grad_norm": 5.284163951873779, + "learning_rate": 1.5898459783228752e-05, + "loss": 0.5186, + "step": 2793 + }, + { + "epoch": 0.47817901762793086, + "grad_norm": 29.41139793395996, + "learning_rate": 1.5904164289788935e-05, + "loss": 3.0647, + "step": 2794 + }, + { + "epoch": 0.4783501625877118, + "grad_norm": 26.757612228393555, + "learning_rate": 1.5909868796349115e-05, + "loss": 2.3827, + "step": 2795 + }, + { + "epoch": 0.4785213075474927, + "grad_norm": 12.758798599243164, + "learning_rate": 1.59155733029093e-05, + "loss": 0.818, + "step": 2796 + }, + { + "epoch": 0.47869245250727366, + "grad_norm": 29.093143463134766, + "learning_rate": 1.5921277809469482e-05, + "loss": 2.9151, + "step": 2797 + }, + { + "epoch": 0.4788635974670546, + "grad_norm": 126.96649932861328, + "learning_rate": 1.5926982316029662e-05, + "loss": 8.4343, + "step": 2798 + }, + { + "epoch": 0.4790347424268355, + "grad_norm": 31.195518493652344, + "learning_rate": 1.5932686822589846e-05, + "loss": 3.7256, + "step": 2799 + }, + { + "epoch": 0.47920588738661646, + "grad_norm": 28.148395538330078, + "learning_rate": 1.593839132915003e-05, + "loss": 2.8813, + "step": 2800 + }, + { + "epoch": 0.4793770323463974, + "grad_norm": 36.015403747558594, + "learning_rate": 1.5944095835710212e-05, + "loss": 4.6005, + "step": 2801 + }, + { + "epoch": 0.4795481773061783, + "grad_norm": 31.1592960357666, + "learning_rate": 1.5949800342270392e-05, + "loss": 3.1305, + "step": 2802 + }, + { + "epoch": 0.47971932226595926, + "grad_norm": 128.36007690429688, + "learning_rate": 1.5955504848830576e-05, + "loss": 4.3169, + "step": 2803 + }, + { + "epoch": 0.4798904672257402, + "grad_norm": 13.735505104064941, + "learning_rate": 1.596120935539076e-05, + "loss": 0.9245, + "step": 2804 + }, + { + "epoch": 0.4800616121855211, + "grad_norm": 42.414024353027344, + "learning_rate": 1.596691386195094e-05, + "loss": 6.975, + "step": 2805 + }, + { + "epoch": 0.48023275714530206, + "grad_norm": 31.763032913208008, + "learning_rate": 1.5972618368511122e-05, + "loss": 3.1284, + "step": 2806 + }, + { + "epoch": 0.480403902105083, + "grad_norm": 27.716442108154297, + "learning_rate": 1.597832287507131e-05, + "loss": 3.0007, + "step": 2807 + }, + { + "epoch": 0.4805750470648639, + "grad_norm": 32.059425354003906, + "learning_rate": 1.598402738163149e-05, + "loss": 4.0829, + "step": 2808 + }, + { + "epoch": 0.48074619202464486, + "grad_norm": 36.31050491333008, + "learning_rate": 1.5989731888191673e-05, + "loss": 4.8666, + "step": 2809 + }, + { + "epoch": 0.4809173369844258, + "grad_norm": 23.16267204284668, + "learning_rate": 1.5995436394751856e-05, + "loss": 2.0993, + "step": 2810 + }, + { + "epoch": 0.4810884819442067, + "grad_norm": 12.366683006286621, + "learning_rate": 1.600114090131204e-05, + "loss": 0.913, + "step": 2811 + }, + { + "epoch": 0.48125962690398766, + "grad_norm": 11.630936622619629, + "learning_rate": 1.600684540787222e-05, + "loss": 1.233, + "step": 2812 + }, + { + "epoch": 0.4814307718637686, + "grad_norm": 5.433574676513672, + "learning_rate": 1.6012549914432403e-05, + "loss": 0.5408, + "step": 2813 + }, + { + "epoch": 0.4816019168235495, + "grad_norm": 25.152584075927734, + "learning_rate": 1.6018254420992586e-05, + "loss": 2.7558, + "step": 2814 + }, + { + "epoch": 0.48177306178333046, + "grad_norm": 32.2104377746582, + "learning_rate": 1.6023958927552766e-05, + "loss": 3.56, + "step": 2815 + }, + { + "epoch": 0.4819442067431114, + "grad_norm": 21.62321662902832, + "learning_rate": 1.602966343411295e-05, + "loss": 2.626, + "step": 2816 + }, + { + "epoch": 0.4821153517028923, + "grad_norm": 27.26594352722168, + "learning_rate": 1.6035367940673133e-05, + "loss": 3.057, + "step": 2817 + }, + { + "epoch": 0.48228649666267326, + "grad_norm": 29.751848220825195, + "learning_rate": 1.6041072447233316e-05, + "loss": 3.0557, + "step": 2818 + }, + { + "epoch": 0.4824576416224542, + "grad_norm": 28.00129508972168, + "learning_rate": 1.6046776953793496e-05, + "loss": 2.9606, + "step": 2819 + }, + { + "epoch": 0.4826287865822352, + "grad_norm": 10.21130084991455, + "learning_rate": 1.605248146035368e-05, + "loss": 1.0526, + "step": 2820 + }, + { + "epoch": 0.4827999315420161, + "grad_norm": 194.53099060058594, + "learning_rate": 1.6058185966913863e-05, + "loss": 9.7692, + "step": 2821 + }, + { + "epoch": 0.48297107650179705, + "grad_norm": 20.116971969604492, + "learning_rate": 1.6063890473474043e-05, + "loss": 1.702, + "step": 2822 + }, + { + "epoch": 0.483142221461578, + "grad_norm": 25.585695266723633, + "learning_rate": 1.6069594980034226e-05, + "loss": 3.1031, + "step": 2823 + }, + { + "epoch": 0.4833133664213589, + "grad_norm": 10.690316200256348, + "learning_rate": 1.607529948659441e-05, + "loss": 1.112, + "step": 2824 + }, + { + "epoch": 0.48348451138113985, + "grad_norm": 46.31101989746094, + "learning_rate": 1.6081003993154593e-05, + "loss": 7.0695, + "step": 2825 + }, + { + "epoch": 0.4836556563409208, + "grad_norm": 2.7449769973754883, + "learning_rate": 1.6086708499714773e-05, + "loss": 0.412, + "step": 2826 + }, + { + "epoch": 0.4838268013007017, + "grad_norm": 31.60761260986328, + "learning_rate": 1.6092413006274956e-05, + "loss": 3.5248, + "step": 2827 + }, + { + "epoch": 0.48399794626048265, + "grad_norm": 30.04467010498047, + "learning_rate": 1.609811751283514e-05, + "loss": 3.5359, + "step": 2828 + }, + { + "epoch": 0.4841690912202636, + "grad_norm": 10.859264373779297, + "learning_rate": 1.610382201939532e-05, + "loss": 0.9806, + "step": 2829 + }, + { + "epoch": 0.4843402361800445, + "grad_norm": 24.42304229736328, + "learning_rate": 1.6109526525955507e-05, + "loss": 2.5163, + "step": 2830 + }, + { + "epoch": 0.48451138113982545, + "grad_norm": 32.02371597290039, + "learning_rate": 1.611523103251569e-05, + "loss": 4.1818, + "step": 2831 + }, + { + "epoch": 0.4846825260996064, + "grad_norm": 35.690147399902344, + "learning_rate": 1.6120935539075873e-05, + "loss": 3.3438, + "step": 2832 + }, + { + "epoch": 0.4848536710593873, + "grad_norm": 25.543243408203125, + "learning_rate": 1.6126640045636053e-05, + "loss": 2.5981, + "step": 2833 + }, + { + "epoch": 0.48502481601916825, + "grad_norm": 25.119115829467773, + "learning_rate": 1.6132344552196237e-05, + "loss": 2.1322, + "step": 2834 + }, + { + "epoch": 0.4851959609789492, + "grad_norm": 23.112409591674805, + "learning_rate": 1.613804905875642e-05, + "loss": 2.5395, + "step": 2835 + }, + { + "epoch": 0.4853671059387301, + "grad_norm": 91.41179656982422, + "learning_rate": 1.61437535653166e-05, + "loss": 4.2215, + "step": 2836 + }, + { + "epoch": 0.48553825089851105, + "grad_norm": 34.66135787963867, + "learning_rate": 1.6149458071876783e-05, + "loss": 3.1988, + "step": 2837 + }, + { + "epoch": 0.485709395858292, + "grad_norm": 28.888839721679688, + "learning_rate": 1.6155162578436967e-05, + "loss": 3.1345, + "step": 2838 + }, + { + "epoch": 0.4858805408180729, + "grad_norm": 63.08065414428711, + "learning_rate": 1.6160867084997147e-05, + "loss": 8.1288, + "step": 2839 + }, + { + "epoch": 0.48605168577785385, + "grad_norm": 148.98455810546875, + "learning_rate": 1.616657159155733e-05, + "loss": 4.9747, + "step": 2840 + }, + { + "epoch": 0.4862228307376348, + "grad_norm": 29.048202514648438, + "learning_rate": 1.6172276098117514e-05, + "loss": 2.9531, + "step": 2841 + }, + { + "epoch": 0.4863939756974157, + "grad_norm": 6.495917320251465, + "learning_rate": 1.6177980604677697e-05, + "loss": 0.5056, + "step": 2842 + }, + { + "epoch": 0.48656512065719665, + "grad_norm": 8.356714248657227, + "learning_rate": 1.6183685111237877e-05, + "loss": 0.9125, + "step": 2843 + }, + { + "epoch": 0.4867362656169776, + "grad_norm": 26.18461036682129, + "learning_rate": 1.618938961779806e-05, + "loss": 3.175, + "step": 2844 + }, + { + "epoch": 0.4869074105767585, + "grad_norm": 9.202829360961914, + "learning_rate": 1.6195094124358244e-05, + "loss": 1.0864, + "step": 2845 + }, + { + "epoch": 0.48707855553653945, + "grad_norm": 34.182373046875, + "learning_rate": 1.6200798630918424e-05, + "loss": 2.7523, + "step": 2846 + }, + { + "epoch": 0.4872497004963204, + "grad_norm": 68.9462890625, + "learning_rate": 1.6206503137478607e-05, + "loss": 3.7044, + "step": 2847 + }, + { + "epoch": 0.4874208454561013, + "grad_norm": 24.633121490478516, + "learning_rate": 1.621220764403879e-05, + "loss": 2.6342, + "step": 2848 + }, + { + "epoch": 0.48759199041588225, + "grad_norm": 32.68869400024414, + "learning_rate": 1.6217912150598974e-05, + "loss": 4.6795, + "step": 2849 + }, + { + "epoch": 0.4877631353756632, + "grad_norm": 28.001712799072266, + "learning_rate": 1.6223616657159154e-05, + "loss": 3.3885, + "step": 2850 + }, + { + "epoch": 0.4879342803354441, + "grad_norm": 4.1197099685668945, + "learning_rate": 1.6229321163719337e-05, + "loss": 0.4097, + "step": 2851 + }, + { + "epoch": 0.48810542529522505, + "grad_norm": 29.35110092163086, + "learning_rate": 1.623502567027952e-05, + "loss": 3.5865, + "step": 2852 + }, + { + "epoch": 0.488276570255006, + "grad_norm": 26.92041778564453, + "learning_rate": 1.6240730176839704e-05, + "loss": 2.7247, + "step": 2853 + }, + { + "epoch": 0.4884477152147869, + "grad_norm": 34.873775482177734, + "learning_rate": 1.6246434683399887e-05, + "loss": 7.1172, + "step": 2854 + }, + { + "epoch": 0.48861886017456785, + "grad_norm": 24.180212020874023, + "learning_rate": 1.625213918996007e-05, + "loss": 2.4944, + "step": 2855 + }, + { + "epoch": 0.4887900051343488, + "grad_norm": 28.294334411621094, + "learning_rate": 1.6257843696520254e-05, + "loss": 3.4049, + "step": 2856 + }, + { + "epoch": 0.4889611500941297, + "grad_norm": 20.231170654296875, + "learning_rate": 1.6263548203080434e-05, + "loss": 2.2117, + "step": 2857 + }, + { + "epoch": 0.48913229505391065, + "grad_norm": 21.00507164001465, + "learning_rate": 1.6269252709640617e-05, + "loss": 1.8153, + "step": 2858 + }, + { + "epoch": 0.4893034400136916, + "grad_norm": 26.58632469177246, + "learning_rate": 1.62749572162008e-05, + "loss": 2.7509, + "step": 2859 + }, + { + "epoch": 0.4894745849734725, + "grad_norm": 25.922264099121094, + "learning_rate": 1.628066172276098e-05, + "loss": 3.0767, + "step": 2860 + }, + { + "epoch": 0.48964572993325345, + "grad_norm": 36.93525695800781, + "learning_rate": 1.6286366229321164e-05, + "loss": 6.587, + "step": 2861 + }, + { + "epoch": 0.4898168748930344, + "grad_norm": 30.786312103271484, + "learning_rate": 1.6292070735881348e-05, + "loss": 4.5453, + "step": 2862 + }, + { + "epoch": 0.4899880198528153, + "grad_norm": 10.850686073303223, + "learning_rate": 1.629777524244153e-05, + "loss": 0.8675, + "step": 2863 + }, + { + "epoch": 0.49015916481259625, + "grad_norm": 22.04916763305664, + "learning_rate": 1.630347974900171e-05, + "loss": 2.1868, + "step": 2864 + }, + { + "epoch": 0.4903303097723772, + "grad_norm": 27.125104904174805, + "learning_rate": 1.6309184255561894e-05, + "loss": 2.7107, + "step": 2865 + }, + { + "epoch": 0.4905014547321581, + "grad_norm": 26.232017517089844, + "learning_rate": 1.6314888762122078e-05, + "loss": 3.0023, + "step": 2866 + }, + { + "epoch": 0.49067259969193905, + "grad_norm": 2.6513617038726807, + "learning_rate": 1.6320593268682258e-05, + "loss": 0.4064, + "step": 2867 + }, + { + "epoch": 0.49084374465172, + "grad_norm": 29.269208908081055, + "learning_rate": 1.632629777524244e-05, + "loss": 2.9074, + "step": 2868 + }, + { + "epoch": 0.4910148896115009, + "grad_norm": 28.653419494628906, + "learning_rate": 1.6332002281802624e-05, + "loss": 2.749, + "step": 2869 + }, + { + "epoch": 0.49118603457128185, + "grad_norm": 24.419513702392578, + "learning_rate": 1.6337706788362808e-05, + "loss": 2.4733, + "step": 2870 + }, + { + "epoch": 0.49135717953106284, + "grad_norm": 31.81149673461914, + "learning_rate": 1.6343411294922988e-05, + "loss": 3.0798, + "step": 2871 + }, + { + "epoch": 0.49152832449084377, + "grad_norm": 7.05307149887085, + "learning_rate": 1.634911580148317e-05, + "loss": 0.6362, + "step": 2872 + }, + { + "epoch": 0.4916994694506247, + "grad_norm": 22.482975006103516, + "learning_rate": 1.6354820308043355e-05, + "loss": 2.0351, + "step": 2873 + }, + { + "epoch": 0.49187061441040564, + "grad_norm": 9.290128707885742, + "learning_rate": 1.6360524814603535e-05, + "loss": 0.6272, + "step": 2874 + }, + { + "epoch": 0.49204175937018657, + "grad_norm": 27.201467514038086, + "learning_rate": 1.6366229321163718e-05, + "loss": 2.6431, + "step": 2875 + }, + { + "epoch": 0.4922129043299675, + "grad_norm": 44.08928298950195, + "learning_rate": 1.6371933827723905e-05, + "loss": 6.6881, + "step": 2876 + }, + { + "epoch": 0.49238404928974844, + "grad_norm": 14.08613109588623, + "learning_rate": 1.6377638334284085e-05, + "loss": 1.0184, + "step": 2877 + }, + { + "epoch": 0.49255519424952937, + "grad_norm": 19.89874839782715, + "learning_rate": 1.6383342840844268e-05, + "loss": 2.0983, + "step": 2878 + }, + { + "epoch": 0.4927263392093103, + "grad_norm": 31.281314849853516, + "learning_rate": 1.638904734740445e-05, + "loss": 4.3604, + "step": 2879 + }, + { + "epoch": 0.49289748416909124, + "grad_norm": 4.3934245109558105, + "learning_rate": 1.6394751853964635e-05, + "loss": 0.4535, + "step": 2880 + }, + { + "epoch": 0.49306862912887217, + "grad_norm": 16.13640785217285, + "learning_rate": 1.6400456360524815e-05, + "loss": 1.4628, + "step": 2881 + }, + { + "epoch": 0.4932397740886531, + "grad_norm": 2.4228832721710205, + "learning_rate": 1.6406160867084998e-05, + "loss": 0.3669, + "step": 2882 + }, + { + "epoch": 0.49341091904843404, + "grad_norm": 39.298160552978516, + "learning_rate": 1.641186537364518e-05, + "loss": 5.1978, + "step": 2883 + }, + { + "epoch": 0.49358206400821497, + "grad_norm": 7.103499889373779, + "learning_rate": 1.641756988020536e-05, + "loss": 0.7534, + "step": 2884 + }, + { + "epoch": 0.4937532089679959, + "grad_norm": 36.24224090576172, + "learning_rate": 1.6423274386765545e-05, + "loss": 5.1747, + "step": 2885 + }, + { + "epoch": 0.49392435392777684, + "grad_norm": 88.6714859008789, + "learning_rate": 1.642897889332573e-05, + "loss": 4.1515, + "step": 2886 + }, + { + "epoch": 0.49409549888755777, + "grad_norm": 102.38868713378906, + "learning_rate": 1.6434683399885912e-05, + "loss": 4.1397, + "step": 2887 + }, + { + "epoch": 0.4942666438473387, + "grad_norm": 32.09382247924805, + "learning_rate": 1.6440387906446092e-05, + "loss": 3.822, + "step": 2888 + }, + { + "epoch": 0.49443778880711964, + "grad_norm": 27.632850646972656, + "learning_rate": 1.6446092413006275e-05, + "loss": 3.0071, + "step": 2889 + }, + { + "epoch": 0.49460893376690057, + "grad_norm": 29.850147247314453, + "learning_rate": 1.645179691956646e-05, + "loss": 4.5876, + "step": 2890 + }, + { + "epoch": 0.4947800787266815, + "grad_norm": 20.323644638061523, + "learning_rate": 1.645750142612664e-05, + "loss": 2.0093, + "step": 2891 + }, + { + "epoch": 0.49495122368646244, + "grad_norm": 28.592273712158203, + "learning_rate": 1.6463205932686822e-05, + "loss": 2.6316, + "step": 2892 + }, + { + "epoch": 0.49512236864624337, + "grad_norm": 29.890256881713867, + "learning_rate": 1.6468910439247005e-05, + "loss": 2.7351, + "step": 2893 + }, + { + "epoch": 0.4952935136060243, + "grad_norm": 25.856136322021484, + "learning_rate": 1.647461494580719e-05, + "loss": 2.7318, + "step": 2894 + }, + { + "epoch": 0.49546465856580524, + "grad_norm": 28.1647891998291, + "learning_rate": 1.648031945236737e-05, + "loss": 2.7787, + "step": 2895 + }, + { + "epoch": 0.49563580352558617, + "grad_norm": 24.757694244384766, + "learning_rate": 1.6486023958927552e-05, + "loss": 2.7135, + "step": 2896 + }, + { + "epoch": 0.4958069484853671, + "grad_norm": 42.44664764404297, + "learning_rate": 1.6491728465487735e-05, + "loss": 3.6649, + "step": 2897 + }, + { + "epoch": 0.49597809344514804, + "grad_norm": 30.2053279876709, + "learning_rate": 1.6497432972047915e-05, + "loss": 4.0259, + "step": 2898 + }, + { + "epoch": 0.49614923840492897, + "grad_norm": 12.054943084716797, + "learning_rate": 1.6503137478608102e-05, + "loss": 1.0105, + "step": 2899 + }, + { + "epoch": 0.4963203833647099, + "grad_norm": 17.974079132080078, + "learning_rate": 1.6508841985168286e-05, + "loss": 2.0786, + "step": 2900 + }, + { + "epoch": 0.49649152832449084, + "grad_norm": 12.725552558898926, + "learning_rate": 1.651454649172847e-05, + "loss": 1.0647, + "step": 2901 + }, + { + "epoch": 0.49666267328427177, + "grad_norm": 22.831754684448242, + "learning_rate": 1.652025099828865e-05, + "loss": 2.2329, + "step": 2902 + }, + { + "epoch": 0.4968338182440527, + "grad_norm": 21.267478942871094, + "learning_rate": 1.6525955504848832e-05, + "loss": 2.5314, + "step": 2903 + }, + { + "epoch": 0.49700496320383364, + "grad_norm": 27.087793350219727, + "learning_rate": 1.6531660011409016e-05, + "loss": 2.8437, + "step": 2904 + }, + { + "epoch": 0.49717610816361457, + "grad_norm": 19.73915672302246, + "learning_rate": 1.6537364517969196e-05, + "loss": 1.8543, + "step": 2905 + }, + { + "epoch": 0.4973472531233955, + "grad_norm": 2.955650806427002, + "learning_rate": 1.654306902452938e-05, + "loss": 0.4054, + "step": 2906 + }, + { + "epoch": 0.49751839808317644, + "grad_norm": 25.305593490600586, + "learning_rate": 1.6548773531089562e-05, + "loss": 2.5412, + "step": 2907 + }, + { + "epoch": 0.49768954304295737, + "grad_norm": 29.378746032714844, + "learning_rate": 1.6554478037649742e-05, + "loss": 2.7018, + "step": 2908 + }, + { + "epoch": 0.4978606880027383, + "grad_norm": 14.516071319580078, + "learning_rate": 1.6560182544209926e-05, + "loss": 1.9194, + "step": 2909 + }, + { + "epoch": 0.49803183296251924, + "grad_norm": 25.602577209472656, + "learning_rate": 1.656588705077011e-05, + "loss": 2.1128, + "step": 2910 + }, + { + "epoch": 0.49820297792230017, + "grad_norm": 109.72111511230469, + "learning_rate": 1.6571591557330293e-05, + "loss": 4.1774, + "step": 2911 + }, + { + "epoch": 0.4983741228820811, + "grad_norm": 19.274553298950195, + "learning_rate": 1.6577296063890472e-05, + "loss": 1.5632, + "step": 2912 + }, + { + "epoch": 0.49854526784186204, + "grad_norm": 29.17140007019043, + "learning_rate": 1.6583000570450656e-05, + "loss": 3.7158, + "step": 2913 + }, + { + "epoch": 0.49871641280164297, + "grad_norm": 31.559934616088867, + "learning_rate": 1.658870507701084e-05, + "loss": 4.5437, + "step": 2914 + }, + { + "epoch": 0.4988875577614239, + "grad_norm": 18.08380699157715, + "learning_rate": 1.659440958357102e-05, + "loss": 1.1722, + "step": 2915 + }, + { + "epoch": 0.49905870272120484, + "grad_norm": 29.155492782592773, + "learning_rate": 1.6600114090131203e-05, + "loss": 3.2768, + "step": 2916 + }, + { + "epoch": 0.49922984768098577, + "grad_norm": 36.51355743408203, + "learning_rate": 1.6605818596691386e-05, + "loss": 4.8346, + "step": 2917 + }, + { + "epoch": 0.4994009926407667, + "grad_norm": 18.29048728942871, + "learning_rate": 1.661152310325157e-05, + "loss": 1.1614, + "step": 2918 + }, + { + "epoch": 0.49957213760054764, + "grad_norm": 29.851797103881836, + "learning_rate": 1.661722760981175e-05, + "loss": 2.9554, + "step": 2919 + }, + { + "epoch": 0.49974328256032857, + "grad_norm": 27.82573699951172, + "learning_rate": 1.6622932116371933e-05, + "loss": 3.4135, + "step": 2920 + }, + { + "epoch": 0.4999144275201095, + "grad_norm": 26.42146110534668, + "learning_rate": 1.6628636622932116e-05, + "loss": 2.5056, + "step": 2921 + }, + { + "epoch": 0.5000855724798905, + "grad_norm": 11.394399642944336, + "learning_rate": 1.66343411294923e-05, + "loss": 1.5378, + "step": 2922 + }, + { + "epoch": 0.5002567174396714, + "grad_norm": 76.39617156982422, + "learning_rate": 1.6640045636052483e-05, + "loss": 7.2706, + "step": 2923 + }, + { + "epoch": 0.5004278623994524, + "grad_norm": 30.514179229736328, + "learning_rate": 1.6645750142612666e-05, + "loss": 3.1234, + "step": 2924 + }, + { + "epoch": 0.5005990073592332, + "grad_norm": 25.776514053344727, + "learning_rate": 1.665145464917285e-05, + "loss": 2.852, + "step": 2925 + }, + { + "epoch": 0.5007701523190142, + "grad_norm": 33.94929122924805, + "learning_rate": 1.665715915573303e-05, + "loss": 4.5202, + "step": 2926 + }, + { + "epoch": 0.5009412972787951, + "grad_norm": 42.92927551269531, + "learning_rate": 1.6662863662293213e-05, + "loss": 7.0151, + "step": 2927 + }, + { + "epoch": 0.5011124422385761, + "grad_norm": 8.699772834777832, + "learning_rate": 1.6668568168853396e-05, + "loss": 0.8725, + "step": 2928 + }, + { + "epoch": 0.501283587198357, + "grad_norm": 27.853302001953125, + "learning_rate": 1.6674272675413576e-05, + "loss": 2.2825, + "step": 2929 + }, + { + "epoch": 0.501454732158138, + "grad_norm": 26.110185623168945, + "learning_rate": 1.667997718197376e-05, + "loss": 2.5107, + "step": 2930 + }, + { + "epoch": 0.5016258771179188, + "grad_norm": 4.521554946899414, + "learning_rate": 1.6685681688533943e-05, + "loss": 0.4957, + "step": 2931 + }, + { + "epoch": 0.5017970220776998, + "grad_norm": 42.245086669921875, + "learning_rate": 1.6691386195094127e-05, + "loss": 6.5318, + "step": 2932 + }, + { + "epoch": 0.5019681670374807, + "grad_norm": 25.86848258972168, + "learning_rate": 1.6697090701654307e-05, + "loss": 2.2382, + "step": 2933 + }, + { + "epoch": 0.5021393119972617, + "grad_norm": 48.50715637207031, + "learning_rate": 1.670279520821449e-05, + "loss": 7.148, + "step": 2934 + }, + { + "epoch": 0.5023104569570426, + "grad_norm": 32.559574127197266, + "learning_rate": 1.6708499714774673e-05, + "loss": 3.6438, + "step": 2935 + }, + { + "epoch": 0.5024816019168236, + "grad_norm": 24.84282112121582, + "learning_rate": 1.6714204221334853e-05, + "loss": 2.7729, + "step": 2936 + }, + { + "epoch": 0.5026527468766044, + "grad_norm": 14.403919219970703, + "learning_rate": 1.6719908727895037e-05, + "loss": 1.217, + "step": 2937 + }, + { + "epoch": 0.5028238918363854, + "grad_norm": 27.424219131469727, + "learning_rate": 1.672561323445522e-05, + "loss": 2.5197, + "step": 2938 + }, + { + "epoch": 0.5029950367961663, + "grad_norm": 9.789163589477539, + "learning_rate": 1.67313177410154e-05, + "loss": 1.7931, + "step": 2939 + }, + { + "epoch": 0.5031661817559473, + "grad_norm": 27.327239990234375, + "learning_rate": 1.6737022247575583e-05, + "loss": 3.3266, + "step": 2940 + }, + { + "epoch": 0.5033373267157282, + "grad_norm": 19.182161331176758, + "learning_rate": 1.6742726754135767e-05, + "loss": 1.9967, + "step": 2941 + }, + { + "epoch": 0.5035084716755092, + "grad_norm": 56.43001174926758, + "learning_rate": 1.674843126069595e-05, + "loss": 3.7189, + "step": 2942 + }, + { + "epoch": 0.50367961663529, + "grad_norm": 19.654386520385742, + "learning_rate": 1.675413576725613e-05, + "loss": 1.9999, + "step": 2943 + }, + { + "epoch": 0.503850761595071, + "grad_norm": 22.203187942504883, + "learning_rate": 1.6759840273816314e-05, + "loss": 2.1959, + "step": 2944 + }, + { + "epoch": 0.5040219065548519, + "grad_norm": 6.563319683074951, + "learning_rate": 1.67655447803765e-05, + "loss": 0.6367, + "step": 2945 + }, + { + "epoch": 0.5041930515146329, + "grad_norm": 10.192085266113281, + "learning_rate": 1.677124928693668e-05, + "loss": 0.7288, + "step": 2946 + }, + { + "epoch": 0.5043641964744139, + "grad_norm": 32.45716094970703, + "learning_rate": 1.6776953793496864e-05, + "loss": 3.9021, + "step": 2947 + }, + { + "epoch": 0.5045353414341948, + "grad_norm": 4.9417595863342285, + "learning_rate": 1.6782658300057047e-05, + "loss": 0.4681, + "step": 2948 + }, + { + "epoch": 0.5047064863939758, + "grad_norm": 27.206302642822266, + "learning_rate": 1.678836280661723e-05, + "loss": 3.3352, + "step": 2949 + }, + { + "epoch": 0.5048776313537566, + "grad_norm": 28.154144287109375, + "learning_rate": 1.679406731317741e-05, + "loss": 2.9377, + "step": 2950 + }, + { + "epoch": 0.5050487763135376, + "grad_norm": 21.303789138793945, + "learning_rate": 1.6799771819737594e-05, + "loss": 2.785, + "step": 2951 + }, + { + "epoch": 0.5052199212733185, + "grad_norm": 31.954051971435547, + "learning_rate": 1.6805476326297777e-05, + "loss": 3.1305, + "step": 2952 + }, + { + "epoch": 0.5053910662330995, + "grad_norm": 10.69640827178955, + "learning_rate": 1.6811180832857957e-05, + "loss": 1.6799, + "step": 2953 + }, + { + "epoch": 0.5055622111928804, + "grad_norm": 30.222347259521484, + "learning_rate": 1.681688533941814e-05, + "loss": 2.8247, + "step": 2954 + }, + { + "epoch": 0.5057333561526614, + "grad_norm": 96.27491760253906, + "learning_rate": 1.6822589845978324e-05, + "loss": 4.6357, + "step": 2955 + }, + { + "epoch": 0.5059045011124422, + "grad_norm": 28.582870483398438, + "learning_rate": 1.6828294352538507e-05, + "loss": 3.2733, + "step": 2956 + }, + { + "epoch": 0.5060756460722232, + "grad_norm": 41.087825775146484, + "learning_rate": 1.6833998859098687e-05, + "loss": 7.1278, + "step": 2957 + }, + { + "epoch": 0.5062467910320041, + "grad_norm": 7.500061511993408, + "learning_rate": 1.683970336565887e-05, + "loss": 0.8286, + "step": 2958 + }, + { + "epoch": 0.5064179359917851, + "grad_norm": 26.969345092773438, + "learning_rate": 1.6845407872219054e-05, + "loss": 2.235, + "step": 2959 + }, + { + "epoch": 0.506589080951566, + "grad_norm": 26.311525344848633, + "learning_rate": 1.6851112378779234e-05, + "loss": 3.0085, + "step": 2960 + }, + { + "epoch": 0.506760225911347, + "grad_norm": 31.306970596313477, + "learning_rate": 1.6856816885339417e-05, + "loss": 2.5939, + "step": 2961 + }, + { + "epoch": 0.5069313708711278, + "grad_norm": 24.608043670654297, + "learning_rate": 1.68625213918996e-05, + "loss": 2.3096, + "step": 2962 + }, + { + "epoch": 0.5071025158309088, + "grad_norm": 27.197254180908203, + "learning_rate": 1.6868225898459784e-05, + "loss": 2.9187, + "step": 2963 + }, + { + "epoch": 0.5072736607906897, + "grad_norm": 28.446548461914062, + "learning_rate": 1.6873930405019964e-05, + "loss": 3.2735, + "step": 2964 + }, + { + "epoch": 0.5074448057504707, + "grad_norm": 32.15707778930664, + "learning_rate": 1.6879634911580148e-05, + "loss": 6.7019, + "step": 2965 + }, + { + "epoch": 0.5076159507102516, + "grad_norm": 23.724163055419922, + "learning_rate": 1.688533941814033e-05, + "loss": 2.1627, + "step": 2966 + }, + { + "epoch": 0.5077870956700326, + "grad_norm": 28.04530143737793, + "learning_rate": 1.689104392470051e-05, + "loss": 2.6273, + "step": 2967 + }, + { + "epoch": 0.5079582406298134, + "grad_norm": 30.895709991455078, + "learning_rate": 1.6896748431260698e-05, + "loss": 3.8368, + "step": 2968 + }, + { + "epoch": 0.5081293855895944, + "grad_norm": 14.024374961853027, + "learning_rate": 1.690245293782088e-05, + "loss": 1.0194, + "step": 2969 + }, + { + "epoch": 0.5083005305493753, + "grad_norm": 29.09341049194336, + "learning_rate": 1.690815744438106e-05, + "loss": 3.5337, + "step": 2970 + }, + { + "epoch": 0.5084716755091563, + "grad_norm": 28.34062385559082, + "learning_rate": 1.6913861950941244e-05, + "loss": 3.1743, + "step": 2971 + }, + { + "epoch": 0.5086428204689372, + "grad_norm": 25.496129989624023, + "learning_rate": 1.6919566457501428e-05, + "loss": 2.898, + "step": 2972 + }, + { + "epoch": 0.5088139654287182, + "grad_norm": 38.798343658447266, + "learning_rate": 1.692527096406161e-05, + "loss": 3.5201, + "step": 2973 + }, + { + "epoch": 0.508985110388499, + "grad_norm": 10.149602890014648, + "learning_rate": 1.693097547062179e-05, + "loss": 0.7939, + "step": 2974 + }, + { + "epoch": 0.50915625534828, + "grad_norm": 6.670815944671631, + "learning_rate": 1.6936679977181975e-05, + "loss": 0.854, + "step": 2975 + }, + { + "epoch": 0.5093274003080609, + "grad_norm": 75.72901153564453, + "learning_rate": 1.6942384483742158e-05, + "loss": 2.9379, + "step": 2976 + }, + { + "epoch": 0.5094985452678419, + "grad_norm": 26.788955688476562, + "learning_rate": 1.6948088990302338e-05, + "loss": 2.4457, + "step": 2977 + }, + { + "epoch": 0.5096696902276228, + "grad_norm": 14.796418190002441, + "learning_rate": 1.695379349686252e-05, + "loss": 1.0122, + "step": 2978 + }, + { + "epoch": 0.5098408351874038, + "grad_norm": 4.948236465454102, + "learning_rate": 1.6959498003422705e-05, + "loss": 0.5853, + "step": 2979 + }, + { + "epoch": 0.5100119801471846, + "grad_norm": 182.9610595703125, + "learning_rate": 1.6965202509982888e-05, + "loss": 8.9776, + "step": 2980 + }, + { + "epoch": 0.5101831251069656, + "grad_norm": 29.51963996887207, + "learning_rate": 1.6970907016543068e-05, + "loss": 2.9543, + "step": 2981 + }, + { + "epoch": 0.5103542700667465, + "grad_norm": 28.639034271240234, + "learning_rate": 1.697661152310325e-05, + "loss": 3.2262, + "step": 2982 + }, + { + "epoch": 0.5105254150265275, + "grad_norm": 29.50834846496582, + "learning_rate": 1.6982316029663435e-05, + "loss": 3.0001, + "step": 2983 + }, + { + "epoch": 0.5106965599863084, + "grad_norm": 15.582537651062012, + "learning_rate": 1.6988020536223615e-05, + "loss": 1.1638, + "step": 2984 + }, + { + "epoch": 0.5108677049460894, + "grad_norm": 27.667177200317383, + "learning_rate": 1.6993725042783798e-05, + "loss": 2.9351, + "step": 2985 + }, + { + "epoch": 0.5110388499058702, + "grad_norm": 28.853923797607422, + "learning_rate": 1.699942954934398e-05, + "loss": 3.6286, + "step": 2986 + }, + { + "epoch": 0.5112099948656512, + "grad_norm": 26.117013931274414, + "learning_rate": 1.7005134055904165e-05, + "loss": 2.8584, + "step": 2987 + }, + { + "epoch": 0.5113811398254321, + "grad_norm": 34.81660842895508, + "learning_rate": 1.7010838562464345e-05, + "loss": 4.3968, + "step": 2988 + }, + { + "epoch": 0.5115522847852131, + "grad_norm": 35.10283279418945, + "learning_rate": 1.7016543069024528e-05, + "loss": 6.599, + "step": 2989 + }, + { + "epoch": 0.511723429744994, + "grad_norm": 19.16140365600586, + "learning_rate": 1.7022247575584715e-05, + "loss": 2.1204, + "step": 2990 + }, + { + "epoch": 0.511894574704775, + "grad_norm": 22.029394149780273, + "learning_rate": 1.7027952082144895e-05, + "loss": 1.8696, + "step": 2991 + }, + { + "epoch": 0.5120657196645558, + "grad_norm": 3.448702335357666, + "learning_rate": 1.703365658870508e-05, + "loss": 0.4607, + "step": 2992 + }, + { + "epoch": 0.5122368646243368, + "grad_norm": 22.506763458251953, + "learning_rate": 1.7039361095265262e-05, + "loss": 2.1106, + "step": 2993 + }, + { + "epoch": 0.5124080095841177, + "grad_norm": 31.842361450195312, + "learning_rate": 1.7045065601825445e-05, + "loss": 3.8676, + "step": 2994 + }, + { + "epoch": 0.5125791545438987, + "grad_norm": 141.6663818359375, + "learning_rate": 1.7050770108385625e-05, + "loss": 8.5208, + "step": 2995 + }, + { + "epoch": 0.5127502995036796, + "grad_norm": 62.276729583740234, + "learning_rate": 1.705647461494581e-05, + "loss": 3.2482, + "step": 2996 + }, + { + "epoch": 0.5129214444634606, + "grad_norm": 22.119609832763672, + "learning_rate": 1.7062179121505992e-05, + "loss": 1.9903, + "step": 2997 + }, + { + "epoch": 0.5130925894232415, + "grad_norm": 52.37403106689453, + "learning_rate": 1.7067883628066172e-05, + "loss": 6.8319, + "step": 2998 + }, + { + "epoch": 0.5132637343830224, + "grad_norm": 12.259587287902832, + "learning_rate": 1.7073588134626355e-05, + "loss": 1.162, + "step": 2999 + }, + { + "epoch": 0.5134348793428034, + "grad_norm": 8.290674209594727, + "learning_rate": 1.707929264118654e-05, + "loss": 0.9012, + "step": 3000 + }, + { + "epoch": 0.5136060243025843, + "grad_norm": 32.74642562866211, + "learning_rate": 1.7084997147746722e-05, + "loss": 3.4785, + "step": 3001 + }, + { + "epoch": 0.5137771692623653, + "grad_norm": 31.82801055908203, + "learning_rate": 1.7090701654306902e-05, + "loss": 4.2721, + "step": 3002 + }, + { + "epoch": 0.5139483142221462, + "grad_norm": 32.273136138916016, + "learning_rate": 1.7096406160867085e-05, + "loss": 3.2625, + "step": 3003 + }, + { + "epoch": 0.5141194591819271, + "grad_norm": 78.98668670654297, + "learning_rate": 1.710211066742727e-05, + "loss": 3.2698, + "step": 3004 + }, + { + "epoch": 0.514290604141708, + "grad_norm": 30.16362762451172, + "learning_rate": 1.710781517398745e-05, + "loss": 3.9137, + "step": 3005 + }, + { + "epoch": 0.514461749101489, + "grad_norm": 18.465227127075195, + "learning_rate": 1.7113519680547632e-05, + "loss": 1.8387, + "step": 3006 + }, + { + "epoch": 0.5146328940612699, + "grad_norm": 3.536219358444214, + "learning_rate": 1.7119224187107816e-05, + "loss": 0.446, + "step": 3007 + }, + { + "epoch": 0.5148040390210509, + "grad_norm": 17.390464782714844, + "learning_rate": 1.7124928693667996e-05, + "loss": 1.7668, + "step": 3008 + }, + { + "epoch": 0.5149751839808318, + "grad_norm": 18.47218894958496, + "learning_rate": 1.713063320022818e-05, + "loss": 2.1817, + "step": 3009 + }, + { + "epoch": 0.5151463289406127, + "grad_norm": 28.22992515563965, + "learning_rate": 1.7136337706788362e-05, + "loss": 2.9769, + "step": 3010 + }, + { + "epoch": 0.5153174739003936, + "grad_norm": 62.36894989013672, + "learning_rate": 1.7142042213348546e-05, + "loss": 7.6922, + "step": 3011 + }, + { + "epoch": 0.5154886188601746, + "grad_norm": 33.23900604248047, + "learning_rate": 1.7147746719908726e-05, + "loss": 3.3971, + "step": 3012 + }, + { + "epoch": 0.5156597638199555, + "grad_norm": 2.5457472801208496, + "learning_rate": 1.7153451226468912e-05, + "loss": 0.4122, + "step": 3013 + }, + { + "epoch": 0.5158309087797365, + "grad_norm": 26.533376693725586, + "learning_rate": 1.7159155733029096e-05, + "loss": 2.9528, + "step": 3014 + }, + { + "epoch": 0.5160020537395174, + "grad_norm": 33.18933868408203, + "learning_rate": 1.7164860239589276e-05, + "loss": 3.7197, + "step": 3015 + }, + { + "epoch": 0.5161731986992983, + "grad_norm": 25.48127555847168, + "learning_rate": 1.717056474614946e-05, + "loss": 2.8834, + "step": 3016 + }, + { + "epoch": 0.5163443436590792, + "grad_norm": 32.51988983154297, + "learning_rate": 1.7176269252709643e-05, + "loss": 3.6681, + "step": 3017 + }, + { + "epoch": 0.5165154886188602, + "grad_norm": 21.83390998840332, + "learning_rate": 1.7181973759269826e-05, + "loss": 2.3579, + "step": 3018 + }, + { + "epoch": 0.5166866335786411, + "grad_norm": 21.106168746948242, + "learning_rate": 1.7187678265830006e-05, + "loss": 2.1503, + "step": 3019 + }, + { + "epoch": 0.5168577785384221, + "grad_norm": 23.668697357177734, + "learning_rate": 1.719338277239019e-05, + "loss": 2.65, + "step": 3020 + }, + { + "epoch": 0.517028923498203, + "grad_norm": 56.29466247558594, + "learning_rate": 1.7199087278950373e-05, + "loss": 7.269, + "step": 3021 + }, + { + "epoch": 0.5172000684579839, + "grad_norm": 14.612650871276855, + "learning_rate": 1.7204791785510553e-05, + "loss": 1.5426, + "step": 3022 + }, + { + "epoch": 0.5173712134177648, + "grad_norm": 28.365121841430664, + "learning_rate": 1.7210496292070736e-05, + "loss": 3.636, + "step": 3023 + }, + { + "epoch": 0.5175423583775458, + "grad_norm": 25.329317092895508, + "learning_rate": 1.721620079863092e-05, + "loss": 2.3847, + "step": 3024 + }, + { + "epoch": 0.5177135033373267, + "grad_norm": 32.05517578125, + "learning_rate": 1.7221905305191103e-05, + "loss": 3.7742, + "step": 3025 + }, + { + "epoch": 0.5178846482971077, + "grad_norm": 11.009437561035156, + "learning_rate": 1.7227609811751283e-05, + "loss": 1.5541, + "step": 3026 + }, + { + "epoch": 0.5180557932568886, + "grad_norm": 4.6759490966796875, + "learning_rate": 1.7233314318311466e-05, + "loss": 0.448, + "step": 3027 + }, + { + "epoch": 0.5182269382166695, + "grad_norm": 23.18576431274414, + "learning_rate": 1.723901882487165e-05, + "loss": 2.3099, + "step": 3028 + }, + { + "epoch": 0.5183980831764504, + "grad_norm": 21.823318481445312, + "learning_rate": 1.724472333143183e-05, + "loss": 2.0502, + "step": 3029 + }, + { + "epoch": 0.5185692281362314, + "grad_norm": 33.11149215698242, + "learning_rate": 1.7250427837992013e-05, + "loss": 3.7448, + "step": 3030 + }, + { + "epoch": 0.5187403730960123, + "grad_norm": 32.03651809692383, + "learning_rate": 1.7256132344552196e-05, + "loss": 3.5141, + "step": 3031 + }, + { + "epoch": 0.5189115180557933, + "grad_norm": 29.257003784179688, + "learning_rate": 1.726183685111238e-05, + "loss": 3.5149, + "step": 3032 + }, + { + "epoch": 0.5190826630155742, + "grad_norm": 6.367782115936279, + "learning_rate": 1.726754135767256e-05, + "loss": 1.1882, + "step": 3033 + }, + { + "epoch": 0.5192538079753551, + "grad_norm": 21.6986083984375, + "learning_rate": 1.7273245864232743e-05, + "loss": 1.7948, + "step": 3034 + }, + { + "epoch": 0.519424952935136, + "grad_norm": 14.612825393676758, + "learning_rate": 1.7278950370792926e-05, + "loss": 1.1659, + "step": 3035 + }, + { + "epoch": 0.519596097894917, + "grad_norm": 28.725549697875977, + "learning_rate": 1.728465487735311e-05, + "loss": 2.7611, + "step": 3036 + }, + { + "epoch": 0.5197672428546979, + "grad_norm": 30.985149383544922, + "learning_rate": 1.7290359383913293e-05, + "loss": 2.9706, + "step": 3037 + }, + { + "epoch": 0.5199383878144789, + "grad_norm": 17.664464950561523, + "learning_rate": 1.7296063890473477e-05, + "loss": 1.6902, + "step": 3038 + }, + { + "epoch": 0.5201095327742598, + "grad_norm": 32.17440414428711, + "learning_rate": 1.7301768397033657e-05, + "loss": 3.8811, + "step": 3039 + }, + { + "epoch": 0.5202806777340407, + "grad_norm": 5.3300580978393555, + "learning_rate": 1.730747290359384e-05, + "loss": 0.4921, + "step": 3040 + }, + { + "epoch": 0.5204518226938216, + "grad_norm": 38.537044525146484, + "learning_rate": 1.7313177410154023e-05, + "loss": 3.2832, + "step": 3041 + }, + { + "epoch": 0.5206229676536026, + "grad_norm": 39.10978698730469, + "learning_rate": 1.7318881916714207e-05, + "loss": 3.8919, + "step": 3042 + }, + { + "epoch": 0.5207941126133835, + "grad_norm": 29.357208251953125, + "learning_rate": 1.7324586423274387e-05, + "loss": 3.115, + "step": 3043 + }, + { + "epoch": 0.5209652575731645, + "grad_norm": 15.655451774597168, + "learning_rate": 1.733029092983457e-05, + "loss": 1.4122, + "step": 3044 + }, + { + "epoch": 0.5211364025329454, + "grad_norm": 28.293025970458984, + "learning_rate": 1.7335995436394753e-05, + "loss": 2.9349, + "step": 3045 + }, + { + "epoch": 0.5213075474927263, + "grad_norm": 32.65211868286133, + "learning_rate": 1.7341699942954933e-05, + "loss": 3.2992, + "step": 3046 + }, + { + "epoch": 0.5214786924525072, + "grad_norm": 23.2037296295166, + "learning_rate": 1.7347404449515117e-05, + "loss": 2.3879, + "step": 3047 + }, + { + "epoch": 0.5216498374122882, + "grad_norm": 26.37859535217285, + "learning_rate": 1.73531089560753e-05, + "loss": 2.1987, + "step": 3048 + }, + { + "epoch": 0.5218209823720692, + "grad_norm": 18.490966796875, + "learning_rate": 1.7358813462635484e-05, + "loss": 1.461, + "step": 3049 + }, + { + "epoch": 0.5219921273318501, + "grad_norm": 31.97382354736328, + "learning_rate": 1.7364517969195664e-05, + "loss": 2.8849, + "step": 3050 + }, + { + "epoch": 0.5221632722916311, + "grad_norm": 49.7996711730957, + "learning_rate": 1.7370222475755847e-05, + "loss": 6.9056, + "step": 3051 + }, + { + "epoch": 0.5223344172514119, + "grad_norm": 28.981660842895508, + "learning_rate": 1.737592698231603e-05, + "loss": 3.0778, + "step": 3052 + }, + { + "epoch": 0.5225055622111929, + "grad_norm": 120.67489624023438, + "learning_rate": 1.738163148887621e-05, + "loss": 4.2964, + "step": 3053 + }, + { + "epoch": 0.5226767071709738, + "grad_norm": 158.1115264892578, + "learning_rate": 1.7387335995436394e-05, + "loss": 8.5312, + "step": 3054 + }, + { + "epoch": 0.5228478521307548, + "grad_norm": 28.185558319091797, + "learning_rate": 1.7393040501996577e-05, + "loss": 3.537, + "step": 3055 + }, + { + "epoch": 0.5230189970905357, + "grad_norm": 9.107454299926758, + "learning_rate": 1.739874500855676e-05, + "loss": 1.0684, + "step": 3056 + }, + { + "epoch": 0.5231901420503167, + "grad_norm": 36.81668472290039, + "learning_rate": 1.740444951511694e-05, + "loss": 3.8926, + "step": 3057 + }, + { + "epoch": 0.5233612870100975, + "grad_norm": 26.352327346801758, + "learning_rate": 1.7410154021677124e-05, + "loss": 2.664, + "step": 3058 + }, + { + "epoch": 0.5235324319698785, + "grad_norm": 21.38902473449707, + "learning_rate": 1.741585852823731e-05, + "loss": 2.2566, + "step": 3059 + }, + { + "epoch": 0.5237035769296594, + "grad_norm": 10.19254207611084, + "learning_rate": 1.742156303479749e-05, + "loss": 0.8717, + "step": 3060 + }, + { + "epoch": 0.5238747218894404, + "grad_norm": 19.25916862487793, + "learning_rate": 1.7427267541357674e-05, + "loss": 1.3792, + "step": 3061 + }, + { + "epoch": 0.5240458668492213, + "grad_norm": 21.88836669921875, + "learning_rate": 1.7432972047917857e-05, + "loss": 2.2543, + "step": 3062 + }, + { + "epoch": 0.5242170118090023, + "grad_norm": 30.14661979675293, + "learning_rate": 1.743867655447804e-05, + "loss": 3.4672, + "step": 3063 + }, + { + "epoch": 0.5243881567687831, + "grad_norm": 25.134571075439453, + "learning_rate": 1.744438106103822e-05, + "loss": 2.5836, + "step": 3064 + }, + { + "epoch": 0.5245593017285641, + "grad_norm": 21.906818389892578, + "learning_rate": 1.7450085567598404e-05, + "loss": 2.4618, + "step": 3065 + }, + { + "epoch": 0.524730446688345, + "grad_norm": 31.330976486206055, + "learning_rate": 1.7455790074158587e-05, + "loss": 3.1397, + "step": 3066 + }, + { + "epoch": 0.524901591648126, + "grad_norm": 6.355524063110352, + "learning_rate": 1.7461494580718767e-05, + "loss": 0.5091, + "step": 3067 + }, + { + "epoch": 0.5250727366079069, + "grad_norm": 110.98942565917969, + "learning_rate": 1.746719908727895e-05, + "loss": 7.9114, + "step": 3068 + }, + { + "epoch": 0.5252438815676879, + "grad_norm": 31.17119789123535, + "learning_rate": 1.7472903593839134e-05, + "loss": 3.2594, + "step": 3069 + }, + { + "epoch": 0.5254150265274687, + "grad_norm": 24.364032745361328, + "learning_rate": 1.7478608100399318e-05, + "loss": 1.9217, + "step": 3070 + }, + { + "epoch": 0.5255861714872497, + "grad_norm": 34.264041900634766, + "learning_rate": 1.7484312606959498e-05, + "loss": 3.9812, + "step": 3071 + }, + { + "epoch": 0.5257573164470306, + "grad_norm": 27.54375648498535, + "learning_rate": 1.749001711351968e-05, + "loss": 2.8373, + "step": 3072 + }, + { + "epoch": 0.5259284614068116, + "grad_norm": 58.27510452270508, + "learning_rate": 1.7495721620079864e-05, + "loss": 7.2686, + "step": 3073 + }, + { + "epoch": 0.5260996063665925, + "grad_norm": 27.861116409301758, + "learning_rate": 1.7501426126640044e-05, + "loss": 3.2877, + "step": 3074 + }, + { + "epoch": 0.5262707513263735, + "grad_norm": 28.097177505493164, + "learning_rate": 1.7507130633200228e-05, + "loss": 2.3413, + "step": 3075 + }, + { + "epoch": 0.5264418962861543, + "grad_norm": 30.74901008605957, + "learning_rate": 1.751283513976041e-05, + "loss": 3.2284, + "step": 3076 + }, + { + "epoch": 0.5266130412459353, + "grad_norm": 5.434010982513428, + "learning_rate": 1.751853964632059e-05, + "loss": 0.5515, + "step": 3077 + }, + { + "epoch": 0.5267841862057162, + "grad_norm": 19.591594696044922, + "learning_rate": 1.7524244152880774e-05, + "loss": 1.8104, + "step": 3078 + }, + { + "epoch": 0.5269553311654972, + "grad_norm": 27.989707946777344, + "learning_rate": 1.7529948659440958e-05, + "loss": 2.4876, + "step": 3079 + }, + { + "epoch": 0.5271264761252781, + "grad_norm": 45.50398635864258, + "learning_rate": 1.753565316600114e-05, + "loss": 6.9276, + "step": 3080 + }, + { + "epoch": 0.5272976210850591, + "grad_norm": 29.907915115356445, + "learning_rate": 1.754135767256132e-05, + "loss": 3.8381, + "step": 3081 + }, + { + "epoch": 0.5274687660448399, + "grad_norm": 22.03485679626465, + "learning_rate": 1.7547062179121508e-05, + "loss": 1.8432, + "step": 3082 + }, + { + "epoch": 0.5276399110046209, + "grad_norm": 41.72187042236328, + "learning_rate": 1.755276668568169e-05, + "loss": 6.81, + "step": 3083 + }, + { + "epoch": 0.5278110559644018, + "grad_norm": 15.85753345489502, + "learning_rate": 1.755847119224187e-05, + "loss": 1.1867, + "step": 3084 + }, + { + "epoch": 0.5279822009241828, + "grad_norm": 14.52872085571289, + "learning_rate": 1.7564175698802055e-05, + "loss": 1.02, + "step": 3085 + }, + { + "epoch": 0.5281533458839637, + "grad_norm": 47.226070404052734, + "learning_rate": 1.7569880205362238e-05, + "loss": 6.5701, + "step": 3086 + }, + { + "epoch": 0.5283244908437447, + "grad_norm": 26.31117820739746, + "learning_rate": 1.757558471192242e-05, + "loss": 2.8588, + "step": 3087 + }, + { + "epoch": 0.5284956358035255, + "grad_norm": 24.817096710205078, + "learning_rate": 1.75812892184826e-05, + "loss": 2.5557, + "step": 3088 + }, + { + "epoch": 0.5286667807633065, + "grad_norm": 3.8697149753570557, + "learning_rate": 1.7586993725042785e-05, + "loss": 0.447, + "step": 3089 + }, + { + "epoch": 0.5288379257230874, + "grad_norm": 27.01019287109375, + "learning_rate": 1.7592698231602968e-05, + "loss": 2.6394, + "step": 3090 + }, + { + "epoch": 0.5290090706828684, + "grad_norm": 3.1552348136901855, + "learning_rate": 1.7598402738163148e-05, + "loss": 0.4523, + "step": 3091 + }, + { + "epoch": 0.5291802156426493, + "grad_norm": 30.454021453857422, + "learning_rate": 1.760410724472333e-05, + "loss": 3.3332, + "step": 3092 + }, + { + "epoch": 0.5293513606024303, + "grad_norm": 2.6408188343048096, + "learning_rate": 1.7609811751283515e-05, + "loss": 0.4075, + "step": 3093 + }, + { + "epoch": 0.5295225055622111, + "grad_norm": 27.623132705688477, + "learning_rate": 1.76155162578437e-05, + "loss": 2.7079, + "step": 3094 + }, + { + "epoch": 0.5296936505219921, + "grad_norm": 22.717605590820312, + "learning_rate": 1.762122076440388e-05, + "loss": 2.2378, + "step": 3095 + }, + { + "epoch": 0.529864795481773, + "grad_norm": 50.63970184326172, + "learning_rate": 1.7626925270964062e-05, + "loss": 3.3046, + "step": 3096 + }, + { + "epoch": 0.530035940441554, + "grad_norm": 47.14366912841797, + "learning_rate": 1.7632629777524245e-05, + "loss": 6.9276, + "step": 3097 + }, + { + "epoch": 0.5302070854013349, + "grad_norm": 26.201753616333008, + "learning_rate": 1.7638334284084425e-05, + "loss": 2.3728, + "step": 3098 + }, + { + "epoch": 0.5303782303611159, + "grad_norm": 33.462398529052734, + "learning_rate": 1.764403879064461e-05, + "loss": 6.8023, + "step": 3099 + }, + { + "epoch": 0.5305493753208969, + "grad_norm": 32.51939010620117, + "learning_rate": 1.7649743297204792e-05, + "loss": 3.992, + "step": 3100 + }, + { + "epoch": 0.5307205202806777, + "grad_norm": 14.161356925964355, + "learning_rate": 1.7655447803764975e-05, + "loss": 1.0058, + "step": 3101 + }, + { + "epoch": 0.5308916652404587, + "grad_norm": 91.61168670654297, + "learning_rate": 1.7661152310325155e-05, + "loss": 6.6935, + "step": 3102 + }, + { + "epoch": 0.5310628102002396, + "grad_norm": 26.40794563293457, + "learning_rate": 1.766685681688534e-05, + "loss": 2.5008, + "step": 3103 + }, + { + "epoch": 0.5312339551600206, + "grad_norm": 21.793699264526367, + "learning_rate": 1.7672561323445522e-05, + "loss": 2.0443, + "step": 3104 + }, + { + "epoch": 0.5314051001198015, + "grad_norm": 27.75925636291504, + "learning_rate": 1.7678265830005705e-05, + "loss": 2.893, + "step": 3105 + }, + { + "epoch": 0.5315762450795825, + "grad_norm": 22.48872947692871, + "learning_rate": 1.768397033656589e-05, + "loss": 1.8274, + "step": 3106 + }, + { + "epoch": 0.5317473900393633, + "grad_norm": 21.972978591918945, + "learning_rate": 1.7689674843126072e-05, + "loss": 2.0134, + "step": 3107 + }, + { + "epoch": 0.5319185349991443, + "grad_norm": 4.393357753753662, + "learning_rate": 1.7695379349686252e-05, + "loss": 0.4006, + "step": 3108 + }, + { + "epoch": 0.5320896799589252, + "grad_norm": 15.986166000366211, + "learning_rate": 1.7701083856246435e-05, + "loss": 1.0921, + "step": 3109 + }, + { + "epoch": 0.5322608249187062, + "grad_norm": 23.317607879638672, + "learning_rate": 1.770678836280662e-05, + "loss": 2.5234, + "step": 3110 + }, + { + "epoch": 0.5324319698784871, + "grad_norm": 81.8624267578125, + "learning_rate": 1.7712492869366802e-05, + "loss": 3.4206, + "step": 3111 + }, + { + "epoch": 0.532603114838268, + "grad_norm": 46.01921844482422, + "learning_rate": 1.7718197375926982e-05, + "loss": 3.2694, + "step": 3112 + }, + { + "epoch": 0.5327742597980489, + "grad_norm": 14.079997062683105, + "learning_rate": 1.7723901882487166e-05, + "loss": 1.1213, + "step": 3113 + }, + { + "epoch": 0.5329454047578299, + "grad_norm": 27.70348358154297, + "learning_rate": 1.772960638904735e-05, + "loss": 2.9553, + "step": 3114 + }, + { + "epoch": 0.5331165497176108, + "grad_norm": 13.08663558959961, + "learning_rate": 1.773531089560753e-05, + "loss": 0.9058, + "step": 3115 + }, + { + "epoch": 0.5332876946773918, + "grad_norm": 5.895364761352539, + "learning_rate": 1.7741015402167712e-05, + "loss": 0.5572, + "step": 3116 + }, + { + "epoch": 0.5334588396371727, + "grad_norm": 14.521390914916992, + "learning_rate": 1.7746719908727896e-05, + "loss": 1.2956, + "step": 3117 + }, + { + "epoch": 0.5336299845969537, + "grad_norm": 5.561517238616943, + "learning_rate": 1.775242441528808e-05, + "loss": 0.7001, + "step": 3118 + }, + { + "epoch": 0.5338011295567345, + "grad_norm": 12.158028602600098, + "learning_rate": 1.775812892184826e-05, + "loss": 0.8123, + "step": 3119 + }, + { + "epoch": 0.5339722745165155, + "grad_norm": 32.72988510131836, + "learning_rate": 1.7763833428408442e-05, + "loss": 2.9845, + "step": 3120 + }, + { + "epoch": 0.5341434194762964, + "grad_norm": 31.350831985473633, + "learning_rate": 1.7769537934968626e-05, + "loss": 3.6956, + "step": 3121 + }, + { + "epoch": 0.5343145644360774, + "grad_norm": 19.63844871520996, + "learning_rate": 1.7775242441528806e-05, + "loss": 2.0279, + "step": 3122 + }, + { + "epoch": 0.5344857093958583, + "grad_norm": 10.440444946289062, + "learning_rate": 1.778094694808899e-05, + "loss": 0.8947, + "step": 3123 + }, + { + "epoch": 0.5346568543556393, + "grad_norm": 28.158235549926758, + "learning_rate": 1.7786651454649173e-05, + "loss": 2.9307, + "step": 3124 + }, + { + "epoch": 0.5348279993154201, + "grad_norm": 25.009632110595703, + "learning_rate": 1.7792355961209356e-05, + "loss": 2.3439, + "step": 3125 + }, + { + "epoch": 0.5349991442752011, + "grad_norm": 25.99068832397461, + "learning_rate": 1.7798060467769536e-05, + "loss": 2.681, + "step": 3126 + }, + { + "epoch": 0.535170289234982, + "grad_norm": 16.541526794433594, + "learning_rate": 1.780376497432972e-05, + "loss": 1.7645, + "step": 3127 + }, + { + "epoch": 0.535341434194763, + "grad_norm": 32.52701950073242, + "learning_rate": 1.7809469480889906e-05, + "loss": 3.4399, + "step": 3128 + }, + { + "epoch": 0.5355125791545439, + "grad_norm": 1.9595259428024292, + "learning_rate": 1.7815173987450086e-05, + "loss": 0.3411, + "step": 3129 + }, + { + "epoch": 0.5356837241143249, + "grad_norm": 22.871707916259766, + "learning_rate": 1.782087849401027e-05, + "loss": 2.7152, + "step": 3130 + }, + { + "epoch": 0.5358548690741057, + "grad_norm": 30.88572120666504, + "learning_rate": 1.7826583000570453e-05, + "loss": 3.0383, + "step": 3131 + }, + { + "epoch": 0.5360260140338867, + "grad_norm": 24.158727645874023, + "learning_rate": 1.7832287507130636e-05, + "loss": 2.7594, + "step": 3132 + }, + { + "epoch": 0.5361971589936676, + "grad_norm": 19.16653823852539, + "learning_rate": 1.7837992013690816e-05, + "loss": 1.7749, + "step": 3133 + }, + { + "epoch": 0.5363683039534486, + "grad_norm": 11.925354957580566, + "learning_rate": 1.7843696520251e-05, + "loss": 0.8569, + "step": 3134 + }, + { + "epoch": 0.5365394489132295, + "grad_norm": 20.42278289794922, + "learning_rate": 1.7849401026811183e-05, + "loss": 1.9146, + "step": 3135 + }, + { + "epoch": 0.5367105938730105, + "grad_norm": 36.13545227050781, + "learning_rate": 1.7855105533371363e-05, + "loss": 4.4798, + "step": 3136 + }, + { + "epoch": 0.5368817388327913, + "grad_norm": 4.70065975189209, + "learning_rate": 1.7860810039931546e-05, + "loss": 0.4304, + "step": 3137 + }, + { + "epoch": 0.5370528837925723, + "grad_norm": 24.28241539001465, + "learning_rate": 1.786651454649173e-05, + "loss": 2.4005, + "step": 3138 + }, + { + "epoch": 0.5372240287523532, + "grad_norm": 14.650952339172363, + "learning_rate": 1.787221905305191e-05, + "loss": 0.968, + "step": 3139 + }, + { + "epoch": 0.5373951737121342, + "grad_norm": 16.861696243286133, + "learning_rate": 1.7877923559612093e-05, + "loss": 1.7277, + "step": 3140 + }, + { + "epoch": 0.5375663186719151, + "grad_norm": 5.233786106109619, + "learning_rate": 1.7883628066172276e-05, + "loss": 0.7488, + "step": 3141 + }, + { + "epoch": 0.537737463631696, + "grad_norm": 32.38574981689453, + "learning_rate": 1.788933257273246e-05, + "loss": 3.4367, + "step": 3142 + }, + { + "epoch": 0.5379086085914769, + "grad_norm": 75.13265991210938, + "learning_rate": 1.789503707929264e-05, + "loss": 3.4272, + "step": 3143 + }, + { + "epoch": 0.5380797535512579, + "grad_norm": 23.56121063232422, + "learning_rate": 1.7900741585852823e-05, + "loss": 2.2267, + "step": 3144 + }, + { + "epoch": 0.5382508985110388, + "grad_norm": 6.575436592102051, + "learning_rate": 1.7906446092413007e-05, + "loss": 0.7715, + "step": 3145 + }, + { + "epoch": 0.5384220434708198, + "grad_norm": 30.233795166015625, + "learning_rate": 1.7912150598973187e-05, + "loss": 2.2247, + "step": 3146 + }, + { + "epoch": 0.5385931884306007, + "grad_norm": 18.158550262451172, + "learning_rate": 1.791785510553337e-05, + "loss": 1.4116, + "step": 3147 + }, + { + "epoch": 0.5387643333903817, + "grad_norm": 25.578800201416016, + "learning_rate": 1.7923559612093553e-05, + "loss": 2.237, + "step": 3148 + }, + { + "epoch": 0.5389354783501625, + "grad_norm": 4.3977460861206055, + "learning_rate": 1.7929264118653737e-05, + "loss": 0.4421, + "step": 3149 + }, + { + "epoch": 0.5391066233099435, + "grad_norm": 23.86539649963379, + "learning_rate": 1.793496862521392e-05, + "loss": 2.2867, + "step": 3150 + }, + { + "epoch": 0.5392777682697245, + "grad_norm": 2.900665283203125, + "learning_rate": 1.7940673131774103e-05, + "loss": 0.3778, + "step": 3151 + }, + { + "epoch": 0.5394489132295054, + "grad_norm": 28.02079200744629, + "learning_rate": 1.7946377638334287e-05, + "loss": 2.756, + "step": 3152 + }, + { + "epoch": 0.5396200581892864, + "grad_norm": 27.565895080566406, + "learning_rate": 1.7952082144894467e-05, + "loss": 2.3044, + "step": 3153 + }, + { + "epoch": 0.5397912031490673, + "grad_norm": 35.14018630981445, + "learning_rate": 1.795778665145465e-05, + "loss": 4.3437, + "step": 3154 + }, + { + "epoch": 0.5399623481088482, + "grad_norm": 24.932573318481445, + "learning_rate": 1.7963491158014834e-05, + "loss": 2.2505, + "step": 3155 + }, + { + "epoch": 0.5401334930686291, + "grad_norm": 26.866313934326172, + "learning_rate": 1.7969195664575017e-05, + "loss": 2.7324, + "step": 3156 + }, + { + "epoch": 0.5403046380284101, + "grad_norm": 22.461328506469727, + "learning_rate": 1.7974900171135197e-05, + "loss": 2.1863, + "step": 3157 + }, + { + "epoch": 0.540475782988191, + "grad_norm": 16.967121124267578, + "learning_rate": 1.798060467769538e-05, + "loss": 1.0429, + "step": 3158 + }, + { + "epoch": 0.540646927947972, + "grad_norm": 116.18841552734375, + "learning_rate": 1.7986309184255564e-05, + "loss": 3.4443, + "step": 3159 + }, + { + "epoch": 0.5408180729077529, + "grad_norm": 28.559480667114258, + "learning_rate": 1.7992013690815744e-05, + "loss": 2.4973, + "step": 3160 + }, + { + "epoch": 0.5409892178675338, + "grad_norm": 3.590916395187378, + "learning_rate": 1.7997718197375927e-05, + "loss": 0.4966, + "step": 3161 + }, + { + "epoch": 0.5411603628273147, + "grad_norm": 78.85108947753906, + "learning_rate": 1.800342270393611e-05, + "loss": 4.2312, + "step": 3162 + }, + { + "epoch": 0.5413315077870957, + "grad_norm": 25.83539390563965, + "learning_rate": 1.8009127210496294e-05, + "loss": 2.4909, + "step": 3163 + }, + { + "epoch": 0.5415026527468766, + "grad_norm": 4.292176246643066, + "learning_rate": 1.8014831717056474e-05, + "loss": 0.4314, + "step": 3164 + }, + { + "epoch": 0.5416737977066576, + "grad_norm": 6.629253387451172, + "learning_rate": 1.8020536223616657e-05, + "loss": 0.7743, + "step": 3165 + }, + { + "epoch": 0.5418449426664385, + "grad_norm": 22.770082473754883, + "learning_rate": 1.802624073017684e-05, + "loss": 2.2334, + "step": 3166 + }, + { + "epoch": 0.5420160876262194, + "grad_norm": 26.48427963256836, + "learning_rate": 1.803194523673702e-05, + "loss": 3.4341, + "step": 3167 + }, + { + "epoch": 0.5421872325860003, + "grad_norm": 9.429801940917969, + "learning_rate": 1.8037649743297204e-05, + "loss": 0.8092, + "step": 3168 + }, + { + "epoch": 0.5423583775457813, + "grad_norm": 56.79134750366211, + "learning_rate": 1.8043354249857387e-05, + "loss": 7.2408, + "step": 3169 + }, + { + "epoch": 0.5425295225055622, + "grad_norm": 26.484098434448242, + "learning_rate": 1.804905875641757e-05, + "loss": 2.4684, + "step": 3170 + }, + { + "epoch": 0.5427006674653432, + "grad_norm": 21.694990158081055, + "learning_rate": 1.805476326297775e-05, + "loss": 2.1088, + "step": 3171 + }, + { + "epoch": 0.5428718124251241, + "grad_norm": 23.824108123779297, + "learning_rate": 1.8060467769537934e-05, + "loss": 2.7028, + "step": 3172 + }, + { + "epoch": 0.543042957384905, + "grad_norm": 23.9963321685791, + "learning_rate": 1.806617227609812e-05, + "loss": 2.1508, + "step": 3173 + }, + { + "epoch": 0.5432141023446859, + "grad_norm": 23.810443878173828, + "learning_rate": 1.80718767826583e-05, + "loss": 3.0015, + "step": 3174 + }, + { + "epoch": 0.5433852473044669, + "grad_norm": 38.47050857543945, + "learning_rate": 1.8077581289218484e-05, + "loss": 7.0851, + "step": 3175 + }, + { + "epoch": 0.5435563922642478, + "grad_norm": 26.14175033569336, + "learning_rate": 1.8083285795778668e-05, + "loss": 2.8927, + "step": 3176 + }, + { + "epoch": 0.5437275372240288, + "grad_norm": 34.895294189453125, + "learning_rate": 1.8088990302338848e-05, + "loss": 6.4431, + "step": 3177 + }, + { + "epoch": 0.5438986821838097, + "grad_norm": 30.46366310119629, + "learning_rate": 1.809469480889903e-05, + "loss": 3.8774, + "step": 3178 + }, + { + "epoch": 0.5440698271435906, + "grad_norm": 2.045729637145996, + "learning_rate": 1.8100399315459214e-05, + "loss": 0.3887, + "step": 3179 + }, + { + "epoch": 0.5442409721033715, + "grad_norm": 23.526275634765625, + "learning_rate": 1.8106103822019398e-05, + "loss": 2.2568, + "step": 3180 + }, + { + "epoch": 0.5444121170631525, + "grad_norm": 37.37553024291992, + "learning_rate": 1.8111808328579578e-05, + "loss": 5.5636, + "step": 3181 + }, + { + "epoch": 0.5445832620229334, + "grad_norm": 2.853957176208496, + "learning_rate": 1.811751283513976e-05, + "loss": 0.3745, + "step": 3182 + }, + { + "epoch": 0.5447544069827144, + "grad_norm": 3.5641119480133057, + "learning_rate": 1.8123217341699944e-05, + "loss": 0.424, + "step": 3183 + }, + { + "epoch": 0.5449255519424953, + "grad_norm": 8.759005546569824, + "learning_rate": 1.8128921848260124e-05, + "loss": 0.7256, + "step": 3184 + }, + { + "epoch": 0.5450966969022762, + "grad_norm": 54.43397521972656, + "learning_rate": 1.8134626354820308e-05, + "loss": 7.7319, + "step": 3185 + }, + { + "epoch": 0.5452678418620571, + "grad_norm": 26.35443878173828, + "learning_rate": 1.814033086138049e-05, + "loss": 3.1477, + "step": 3186 + }, + { + "epoch": 0.5454389868218381, + "grad_norm": 18.872291564941406, + "learning_rate": 1.8146035367940675e-05, + "loss": 1.8724, + "step": 3187 + }, + { + "epoch": 0.545610131781619, + "grad_norm": 22.673784255981445, + "learning_rate": 1.8151739874500855e-05, + "loss": 1.9899, + "step": 3188 + }, + { + "epoch": 0.5457812767414, + "grad_norm": 9.217958450317383, + "learning_rate": 1.8157444381061038e-05, + "loss": 0.726, + "step": 3189 + }, + { + "epoch": 0.5459524217011809, + "grad_norm": 2.148630380630493, + "learning_rate": 1.816314888762122e-05, + "loss": 0.323, + "step": 3190 + }, + { + "epoch": 0.5461235666609618, + "grad_norm": 26.988340377807617, + "learning_rate": 1.81688533941814e-05, + "loss": 2.586, + "step": 3191 + }, + { + "epoch": 0.5462947116207427, + "grad_norm": 37.6932373046875, + "learning_rate": 1.8174557900741585e-05, + "loss": 2.9146, + "step": 3192 + }, + { + "epoch": 0.5464658565805237, + "grad_norm": 74.42720794677734, + "learning_rate": 1.8180262407301768e-05, + "loss": 3.2535, + "step": 3193 + }, + { + "epoch": 0.5466370015403046, + "grad_norm": 29.757360458374023, + "learning_rate": 1.818596691386195e-05, + "loss": 2.8882, + "step": 3194 + }, + { + "epoch": 0.5468081465000856, + "grad_norm": 15.420557975769043, + "learning_rate": 1.819167142042213e-05, + "loss": 1.6278, + "step": 3195 + }, + { + "epoch": 0.5469792914598665, + "grad_norm": 31.367387771606445, + "learning_rate": 1.8197375926982318e-05, + "loss": 2.9209, + "step": 3196 + }, + { + "epoch": 0.5471504364196474, + "grad_norm": 28.30303382873535, + "learning_rate": 1.82030804335425e-05, + "loss": 3.066, + "step": 3197 + }, + { + "epoch": 0.5473215813794283, + "grad_norm": 27.540369033813477, + "learning_rate": 1.820878494010268e-05, + "loss": 3.3267, + "step": 3198 + }, + { + "epoch": 0.5474927263392093, + "grad_norm": 4.438743591308594, + "learning_rate": 1.8214489446662865e-05, + "loss": 0.3722, + "step": 3199 + }, + { + "epoch": 0.5476638712989902, + "grad_norm": 29.85404396057129, + "learning_rate": 1.822019395322305e-05, + "loss": 4.0139, + "step": 3200 + }, + { + "epoch": 0.5478350162587712, + "grad_norm": 28.56346893310547, + "learning_rate": 1.8225898459783232e-05, + "loss": 3.6119, + "step": 3201 + }, + { + "epoch": 0.5480061612185522, + "grad_norm": 29.419742584228516, + "learning_rate": 1.8231602966343412e-05, + "loss": 2.9087, + "step": 3202 + }, + { + "epoch": 0.548177306178333, + "grad_norm": 44.72222900390625, + "learning_rate": 1.8237307472903595e-05, + "loss": 6.7043, + "step": 3203 + }, + { + "epoch": 0.548348451138114, + "grad_norm": 21.762168884277344, + "learning_rate": 1.824301197946378e-05, + "loss": 1.9849, + "step": 3204 + }, + { + "epoch": 0.5485195960978949, + "grad_norm": 26.09598731994629, + "learning_rate": 1.824871648602396e-05, + "loss": 2.6737, + "step": 3205 + }, + { + "epoch": 0.5486907410576759, + "grad_norm": 32.6449089050293, + "learning_rate": 1.8254420992584142e-05, + "loss": 2.7534, + "step": 3206 + }, + { + "epoch": 0.5488618860174568, + "grad_norm": 20.140134811401367, + "learning_rate": 1.8260125499144325e-05, + "loss": 2.0024, + "step": 3207 + }, + { + "epoch": 0.5490330309772378, + "grad_norm": 8.021845817565918, + "learning_rate": 1.8265830005704505e-05, + "loss": 0.8484, + "step": 3208 + }, + { + "epoch": 0.5492041759370186, + "grad_norm": 23.706680297851562, + "learning_rate": 1.827153451226469e-05, + "loss": 2.5228, + "step": 3209 + }, + { + "epoch": 0.5493753208967996, + "grad_norm": 25.105031967163086, + "learning_rate": 1.8277239018824872e-05, + "loss": 2.4342, + "step": 3210 + }, + { + "epoch": 0.5495464658565805, + "grad_norm": 16.53352165222168, + "learning_rate": 1.8282943525385055e-05, + "loss": 1.4315, + "step": 3211 + }, + { + "epoch": 0.5497176108163615, + "grad_norm": 24.55224609375, + "learning_rate": 1.8288648031945235e-05, + "loss": 2.469, + "step": 3212 + }, + { + "epoch": 0.5498887557761424, + "grad_norm": 3.4264721870422363, + "learning_rate": 1.829435253850542e-05, + "loss": 0.3677, + "step": 3213 + }, + { + "epoch": 0.5500599007359234, + "grad_norm": 20.305509567260742, + "learning_rate": 1.8300057045065602e-05, + "loss": 2.3214, + "step": 3214 + }, + { + "epoch": 0.5502310456957042, + "grad_norm": 27.69756507873535, + "learning_rate": 1.8305761551625782e-05, + "loss": 2.8746, + "step": 3215 + }, + { + "epoch": 0.5504021906554852, + "grad_norm": 100.86264038085938, + "learning_rate": 1.8311466058185965e-05, + "loss": 7.5686, + "step": 3216 + }, + { + "epoch": 0.5505733356152661, + "grad_norm": 26.603628158569336, + "learning_rate": 1.831717056474615e-05, + "loss": 2.5639, + "step": 3217 + }, + { + "epoch": 0.5507444805750471, + "grad_norm": 31.449655532836914, + "learning_rate": 1.8322875071306332e-05, + "loss": 3.9795, + "step": 3218 + }, + { + "epoch": 0.550915625534828, + "grad_norm": 25.562639236450195, + "learning_rate": 1.8328579577866516e-05, + "loss": 3.0795, + "step": 3219 + }, + { + "epoch": 0.551086770494609, + "grad_norm": 4.988560199737549, + "learning_rate": 1.83342840844267e-05, + "loss": 0.4445, + "step": 3220 + }, + { + "epoch": 0.5512579154543898, + "grad_norm": 31.045183181762695, + "learning_rate": 1.8339988590986882e-05, + "loss": 6.4413, + "step": 3221 + }, + { + "epoch": 0.5514290604141708, + "grad_norm": 32.938106536865234, + "learning_rate": 1.8345693097547062e-05, + "loss": 4.0821, + "step": 3222 + }, + { + "epoch": 0.5516002053739517, + "grad_norm": 23.498254776000977, + "learning_rate": 1.8351397604107246e-05, + "loss": 2.8752, + "step": 3223 + }, + { + "epoch": 0.5517713503337327, + "grad_norm": 27.559247970581055, + "learning_rate": 1.835710211066743e-05, + "loss": 3.3216, + "step": 3224 + }, + { + "epoch": 0.5519424952935136, + "grad_norm": 46.420135498046875, + "learning_rate": 1.8362806617227613e-05, + "loss": 6.9903, + "step": 3225 + }, + { + "epoch": 0.5521136402532946, + "grad_norm": 23.508155822753906, + "learning_rate": 1.8368511123787793e-05, + "loss": 2.1877, + "step": 3226 + }, + { + "epoch": 0.5522847852130754, + "grad_norm": 20.4776611328125, + "learning_rate": 1.8374215630347976e-05, + "loss": 1.8942, + "step": 3227 + }, + { + "epoch": 0.5524559301728564, + "grad_norm": 15.294054985046387, + "learning_rate": 1.837992013690816e-05, + "loss": 1.2082, + "step": 3228 + }, + { + "epoch": 0.5526270751326373, + "grad_norm": 22.51180076599121, + "learning_rate": 1.838562464346834e-05, + "loss": 2.2929, + "step": 3229 + }, + { + "epoch": 0.5527982200924183, + "grad_norm": 21.741634368896484, + "learning_rate": 1.8391329150028523e-05, + "loss": 1.884, + "step": 3230 + }, + { + "epoch": 0.5529693650521992, + "grad_norm": 4.330467224121094, + "learning_rate": 1.8397033656588706e-05, + "loss": 0.4163, + "step": 3231 + }, + { + "epoch": 0.5531405100119802, + "grad_norm": 26.344017028808594, + "learning_rate": 1.840273816314889e-05, + "loss": 2.5767, + "step": 3232 + }, + { + "epoch": 0.553311654971761, + "grad_norm": 53.116172790527344, + "learning_rate": 1.840844266970907e-05, + "loss": 2.7426, + "step": 3233 + }, + { + "epoch": 0.553482799931542, + "grad_norm": 15.442861557006836, + "learning_rate": 1.8414147176269253e-05, + "loss": 0.9586, + "step": 3234 + }, + { + "epoch": 0.5536539448913229, + "grad_norm": 28.15229606628418, + "learning_rate": 1.8419851682829436e-05, + "loss": 2.9423, + "step": 3235 + }, + { + "epoch": 0.5538250898511039, + "grad_norm": 21.91160011291504, + "learning_rate": 1.8425556189389616e-05, + "loss": 2.3152, + "step": 3236 + }, + { + "epoch": 0.5539962348108848, + "grad_norm": 21.20878028869629, + "learning_rate": 1.84312606959498e-05, + "loss": 2.117, + "step": 3237 + }, + { + "epoch": 0.5541673797706658, + "grad_norm": 1.7572641372680664, + "learning_rate": 1.8436965202509983e-05, + "loss": 0.3516, + "step": 3238 + }, + { + "epoch": 0.5543385247304466, + "grad_norm": 25.218217849731445, + "learning_rate": 1.8442669709070166e-05, + "loss": 2.5554, + "step": 3239 + }, + { + "epoch": 0.5545096696902276, + "grad_norm": 30.133291244506836, + "learning_rate": 1.8448374215630346e-05, + "loss": 3.078, + "step": 3240 + }, + { + "epoch": 0.5546808146500085, + "grad_norm": 30.298227310180664, + "learning_rate": 1.845407872219053e-05, + "loss": 3.2276, + "step": 3241 + }, + { + "epoch": 0.5548519596097895, + "grad_norm": 31.560077667236328, + "learning_rate": 1.8459783228750716e-05, + "loss": 3.713, + "step": 3242 + }, + { + "epoch": 0.5550231045695704, + "grad_norm": 7.287442207336426, + "learning_rate": 1.8465487735310896e-05, + "loss": 0.4505, + "step": 3243 + }, + { + "epoch": 0.5551942495293514, + "grad_norm": 12.331917762756348, + "learning_rate": 1.847119224187108e-05, + "loss": 0.8588, + "step": 3244 + }, + { + "epoch": 0.5553653944891322, + "grad_norm": 44.76494216918945, + "learning_rate": 1.8476896748431263e-05, + "loss": 7.2343, + "step": 3245 + }, + { + "epoch": 0.5555365394489132, + "grad_norm": 32.351600646972656, + "learning_rate": 1.8482601254991443e-05, + "loss": 4.0445, + "step": 3246 + }, + { + "epoch": 0.5557076844086941, + "grad_norm": 28.279098510742188, + "learning_rate": 1.8488305761551627e-05, + "loss": 3.0796, + "step": 3247 + }, + { + "epoch": 0.5558788293684751, + "grad_norm": 28.361543655395508, + "learning_rate": 1.849401026811181e-05, + "loss": 3.3123, + "step": 3248 + }, + { + "epoch": 0.556049974328256, + "grad_norm": 41.510597229003906, + "learning_rate": 1.8499714774671993e-05, + "loss": 6.904, + "step": 3249 + }, + { + "epoch": 0.556221119288037, + "grad_norm": 28.658105850219727, + "learning_rate": 1.8505419281232173e-05, + "loss": 3.0404, + "step": 3250 + }, + { + "epoch": 0.5563922642478178, + "grad_norm": 28.752214431762695, + "learning_rate": 1.8511123787792357e-05, + "loss": 3.7672, + "step": 3251 + }, + { + "epoch": 0.5565634092075988, + "grad_norm": 34.223060607910156, + "learning_rate": 1.851682829435254e-05, + "loss": 5.3153, + "step": 3252 + }, + { + "epoch": 0.5567345541673798, + "grad_norm": 12.981977462768555, + "learning_rate": 1.852253280091272e-05, + "loss": 0.9685, + "step": 3253 + }, + { + "epoch": 0.5569056991271607, + "grad_norm": 20.176815032958984, + "learning_rate": 1.8528237307472903e-05, + "loss": 1.9294, + "step": 3254 + }, + { + "epoch": 0.5570768440869417, + "grad_norm": 24.55870246887207, + "learning_rate": 1.8533941814033087e-05, + "loss": 2.8725, + "step": 3255 + }, + { + "epoch": 0.5572479890467226, + "grad_norm": 21.821077346801758, + "learning_rate": 1.853964632059327e-05, + "loss": 2.0436, + "step": 3256 + }, + { + "epoch": 0.5574191340065036, + "grad_norm": 23.027362823486328, + "learning_rate": 1.854535082715345e-05, + "loss": 2.1357, + "step": 3257 + }, + { + "epoch": 0.5575902789662844, + "grad_norm": 10.035751342773438, + "learning_rate": 1.8551055333713634e-05, + "loss": 0.6818, + "step": 3258 + }, + { + "epoch": 0.5577614239260654, + "grad_norm": 4.451612949371338, + "learning_rate": 1.8556759840273817e-05, + "loss": 0.4069, + "step": 3259 + }, + { + "epoch": 0.5579325688858463, + "grad_norm": 25.953588485717773, + "learning_rate": 1.8562464346833997e-05, + "loss": 2.8325, + "step": 3260 + }, + { + "epoch": 0.5581037138456273, + "grad_norm": 63.914024353027344, + "learning_rate": 1.856816885339418e-05, + "loss": 8.0674, + "step": 3261 + }, + { + "epoch": 0.5582748588054082, + "grad_norm": 11.405961990356445, + "learning_rate": 1.8573873359954364e-05, + "loss": 0.8008, + "step": 3262 + }, + { + "epoch": 0.5584460037651892, + "grad_norm": 21.461894989013672, + "learning_rate": 1.8579577866514547e-05, + "loss": 2.1028, + "step": 3263 + }, + { + "epoch": 0.55861714872497, + "grad_norm": 22.72207260131836, + "learning_rate": 1.8585282373074727e-05, + "loss": 2.0172, + "step": 3264 + }, + { + "epoch": 0.558788293684751, + "grad_norm": 27.586618423461914, + "learning_rate": 1.8590986879634914e-05, + "loss": 2.0431, + "step": 3265 + }, + { + "epoch": 0.5589594386445319, + "grad_norm": 26.892311096191406, + "learning_rate": 1.8596691386195097e-05, + "loss": 2.5198, + "step": 3266 + }, + { + "epoch": 0.5591305836043129, + "grad_norm": 22.420379638671875, + "learning_rate": 1.8602395892755277e-05, + "loss": 2.155, + "step": 3267 + }, + { + "epoch": 0.5593017285640938, + "grad_norm": 8.758207321166992, + "learning_rate": 1.860810039931546e-05, + "loss": 1.3864, + "step": 3268 + }, + { + "epoch": 0.5594728735238748, + "grad_norm": 8.072163581848145, + "learning_rate": 1.8613804905875644e-05, + "loss": 0.5867, + "step": 3269 + }, + { + "epoch": 0.5596440184836556, + "grad_norm": 6.457092761993408, + "learning_rate": 1.8619509412435827e-05, + "loss": 0.6464, + "step": 3270 + }, + { + "epoch": 0.5598151634434366, + "grad_norm": 7.5770392417907715, + "learning_rate": 1.8625213918996007e-05, + "loss": 0.8377, + "step": 3271 + }, + { + "epoch": 0.5599863084032175, + "grad_norm": 28.0118350982666, + "learning_rate": 1.863091842555619e-05, + "loss": 3.5048, + "step": 3272 + }, + { + "epoch": 0.5601574533629985, + "grad_norm": 75.3667984008789, + "learning_rate": 1.8636622932116374e-05, + "loss": 7.3634, + "step": 3273 + }, + { + "epoch": 0.5603285983227794, + "grad_norm": 6.486256122589111, + "learning_rate": 1.8642327438676554e-05, + "loss": 0.772, + "step": 3274 + }, + { + "epoch": 0.5604997432825604, + "grad_norm": 18.678125381469727, + "learning_rate": 1.8648031945236737e-05, + "loss": 1.9006, + "step": 3275 + }, + { + "epoch": 0.5606708882423412, + "grad_norm": 7.29653263092041, + "learning_rate": 1.865373645179692e-05, + "loss": 1.075, + "step": 3276 + }, + { + "epoch": 0.5608420332021222, + "grad_norm": 2.164841890335083, + "learning_rate": 1.86594409583571e-05, + "loss": 0.3693, + "step": 3277 + }, + { + "epoch": 0.5610131781619031, + "grad_norm": 21.217857360839844, + "learning_rate": 1.8665145464917284e-05, + "loss": 2.2186, + "step": 3278 + }, + { + "epoch": 0.5611843231216841, + "grad_norm": 8.882852554321289, + "learning_rate": 1.8670849971477468e-05, + "loss": 0.9134, + "step": 3279 + }, + { + "epoch": 0.561355468081465, + "grad_norm": 17.709449768066406, + "learning_rate": 1.867655447803765e-05, + "loss": 1.5424, + "step": 3280 + }, + { + "epoch": 0.561526613041246, + "grad_norm": 5.205548286437988, + "learning_rate": 1.868225898459783e-05, + "loss": 0.6676, + "step": 3281 + }, + { + "epoch": 0.5616977580010268, + "grad_norm": 52.959259033203125, + "learning_rate": 1.8687963491158014e-05, + "loss": 7.2033, + "step": 3282 + }, + { + "epoch": 0.5618689029608078, + "grad_norm": 29.030555725097656, + "learning_rate": 1.8693667997718198e-05, + "loss": 2.9477, + "step": 3283 + }, + { + "epoch": 0.5620400479205887, + "grad_norm": 27.745031356811523, + "learning_rate": 1.8699372504278378e-05, + "loss": 2.719, + "step": 3284 + }, + { + "epoch": 0.5622111928803697, + "grad_norm": 25.971677780151367, + "learning_rate": 1.870507701083856e-05, + "loss": 2.706, + "step": 3285 + }, + { + "epoch": 0.5623823378401506, + "grad_norm": 27.33722686767578, + "learning_rate": 1.8710781517398744e-05, + "loss": 2.568, + "step": 3286 + }, + { + "epoch": 0.5625534827999316, + "grad_norm": 22.52666473388672, + "learning_rate": 1.8716486023958928e-05, + "loss": 2.3127, + "step": 3287 + }, + { + "epoch": 0.5627246277597124, + "grad_norm": 10.016031265258789, + "learning_rate": 1.872219053051911e-05, + "loss": 1.4001, + "step": 3288 + }, + { + "epoch": 0.5628957727194934, + "grad_norm": 24.30003547668457, + "learning_rate": 1.8727895037079295e-05, + "loss": 2.4201, + "step": 3289 + }, + { + "epoch": 0.5630669176792743, + "grad_norm": 6.622725009918213, + "learning_rate": 1.8733599543639478e-05, + "loss": 0.7098, + "step": 3290 + }, + { + "epoch": 0.5632380626390553, + "grad_norm": 24.1121883392334, + "learning_rate": 1.8739304050199658e-05, + "loss": 1.887, + "step": 3291 + }, + { + "epoch": 0.5634092075988362, + "grad_norm": 31.559614181518555, + "learning_rate": 1.874500855675984e-05, + "loss": 3.5751, + "step": 3292 + }, + { + "epoch": 0.5635803525586172, + "grad_norm": 33.04099655151367, + "learning_rate": 1.8750713063320025e-05, + "loss": 3.2233, + "step": 3293 + }, + { + "epoch": 0.563751497518398, + "grad_norm": 81.57552337646484, + "learning_rate": 1.8756417569880208e-05, + "loss": 3.8443, + "step": 3294 + }, + { + "epoch": 0.563922642478179, + "grad_norm": 30.438037872314453, + "learning_rate": 1.8762122076440388e-05, + "loss": 3.7816, + "step": 3295 + }, + { + "epoch": 0.5640937874379599, + "grad_norm": 7.756313323974609, + "learning_rate": 1.876782658300057e-05, + "loss": 0.7225, + "step": 3296 + }, + { + "epoch": 0.5642649323977409, + "grad_norm": 28.59238624572754, + "learning_rate": 1.8773531089560755e-05, + "loss": 3.1516, + "step": 3297 + }, + { + "epoch": 0.5644360773575218, + "grad_norm": 25.167417526245117, + "learning_rate": 1.8779235596120935e-05, + "loss": 3.174, + "step": 3298 + }, + { + "epoch": 0.5646072223173028, + "grad_norm": 86.82372283935547, + "learning_rate": 1.8784940102681118e-05, + "loss": 4.5193, + "step": 3299 + }, + { + "epoch": 0.5647783672770836, + "grad_norm": 30.278440475463867, + "learning_rate": 1.87906446092413e-05, + "loss": 3.2045, + "step": 3300 + }, + { + "epoch": 0.5649495122368646, + "grad_norm": 34.26241683959961, + "learning_rate": 1.8796349115801485e-05, + "loss": 3.7586, + "step": 3301 + }, + { + "epoch": 0.5651206571966455, + "grad_norm": 20.874797821044922, + "learning_rate": 1.8802053622361665e-05, + "loss": 1.9123, + "step": 3302 + }, + { + "epoch": 0.5652918021564265, + "grad_norm": 26.034624099731445, + "learning_rate": 1.8807758128921848e-05, + "loss": 2.522, + "step": 3303 + }, + { + "epoch": 0.5654629471162075, + "grad_norm": 11.349614143371582, + "learning_rate": 1.881346263548203e-05, + "loss": 0.9236, + "step": 3304 + }, + { + "epoch": 0.5656340920759884, + "grad_norm": 10.266570091247559, + "learning_rate": 1.881916714204221e-05, + "loss": 0.6643, + "step": 3305 + }, + { + "epoch": 0.5658052370357693, + "grad_norm": 32.189842224121094, + "learning_rate": 1.8824871648602395e-05, + "loss": 4.5101, + "step": 3306 + }, + { + "epoch": 0.5659763819955502, + "grad_norm": 24.921152114868164, + "learning_rate": 1.883057615516258e-05, + "loss": 2.9263, + "step": 3307 + }, + { + "epoch": 0.5661475269553312, + "grad_norm": 35.14552307128906, + "learning_rate": 1.883628066172276e-05, + "loss": 4.0464, + "step": 3308 + }, + { + "epoch": 0.5663186719151121, + "grad_norm": 37.087039947509766, + "learning_rate": 1.8841985168282942e-05, + "loss": 4.0199, + "step": 3309 + }, + { + "epoch": 0.5664898168748931, + "grad_norm": 26.691438674926758, + "learning_rate": 1.8847689674843125e-05, + "loss": 3.2809, + "step": 3310 + }, + { + "epoch": 0.566660961834674, + "grad_norm": 31.133575439453125, + "learning_rate": 1.8853394181403312e-05, + "loss": 3.4234, + "step": 3311 + }, + { + "epoch": 0.566832106794455, + "grad_norm": 98.82320404052734, + "learning_rate": 1.8859098687963492e-05, + "loss": 7.426, + "step": 3312 + }, + { + "epoch": 0.5670032517542358, + "grad_norm": 26.13225746154785, + "learning_rate": 1.8864803194523675e-05, + "loss": 2.5794, + "step": 3313 + }, + { + "epoch": 0.5671743967140168, + "grad_norm": 28.947038650512695, + "learning_rate": 1.887050770108386e-05, + "loss": 2.7049, + "step": 3314 + }, + { + "epoch": 0.5673455416737977, + "grad_norm": 23.491085052490234, + "learning_rate": 1.887621220764404e-05, + "loss": 2.1517, + "step": 3315 + }, + { + "epoch": 0.5675166866335787, + "grad_norm": 17.27471351623535, + "learning_rate": 1.8881916714204222e-05, + "loss": 1.4919, + "step": 3316 + }, + { + "epoch": 0.5676878315933596, + "grad_norm": 34.11532974243164, + "learning_rate": 1.8887621220764405e-05, + "loss": 4.5321, + "step": 3317 + }, + { + "epoch": 0.5678589765531405, + "grad_norm": 19.040075302124023, + "learning_rate": 1.889332572732459e-05, + "loss": 1.6601, + "step": 3318 + }, + { + "epoch": 0.5680301215129214, + "grad_norm": 18.085039138793945, + "learning_rate": 1.889903023388477e-05, + "loss": 1.5302, + "step": 3319 + }, + { + "epoch": 0.5682012664727024, + "grad_norm": 27.968341827392578, + "learning_rate": 1.8904734740444952e-05, + "loss": 3.3977, + "step": 3320 + }, + { + "epoch": 0.5683724114324833, + "grad_norm": 2.1676626205444336, + "learning_rate": 1.8910439247005136e-05, + "loss": 0.3746, + "step": 3321 + }, + { + "epoch": 0.5685435563922643, + "grad_norm": 3.0772573947906494, + "learning_rate": 1.8916143753565316e-05, + "loss": 0.4364, + "step": 3322 + }, + { + "epoch": 0.5687147013520452, + "grad_norm": 25.465309143066406, + "learning_rate": 1.89218482601255e-05, + "loss": 2.5141, + "step": 3323 + }, + { + "epoch": 0.5688858463118261, + "grad_norm": 28.62706184387207, + "learning_rate": 1.8927552766685682e-05, + "loss": 2.9376, + "step": 3324 + }, + { + "epoch": 0.569056991271607, + "grad_norm": 10.36950969696045, + "learning_rate": 1.8933257273245866e-05, + "loss": 0.9048, + "step": 3325 + }, + { + "epoch": 0.569228136231388, + "grad_norm": 22.50096893310547, + "learning_rate": 1.8938961779806046e-05, + "loss": 2.3555, + "step": 3326 + }, + { + "epoch": 0.5693992811911689, + "grad_norm": 25.440292358398438, + "learning_rate": 1.894466628636623e-05, + "loss": 2.5734, + "step": 3327 + }, + { + "epoch": 0.5695704261509499, + "grad_norm": 5.363638401031494, + "learning_rate": 1.8950370792926412e-05, + "loss": 0.4315, + "step": 3328 + }, + { + "epoch": 0.5697415711107308, + "grad_norm": 29.033611297607422, + "learning_rate": 1.8956075299486592e-05, + "loss": 2.7605, + "step": 3329 + }, + { + "epoch": 0.5699127160705117, + "grad_norm": 6.961116790771484, + "learning_rate": 1.8961779806046776e-05, + "loss": 1.2524, + "step": 3330 + }, + { + "epoch": 0.5700838610302926, + "grad_norm": 29.2668399810791, + "learning_rate": 1.896748431260696e-05, + "loss": 3.3542, + "step": 3331 + }, + { + "epoch": 0.5702550059900736, + "grad_norm": 38.82827377319336, + "learning_rate": 1.8973188819167143e-05, + "loss": 4.5204, + "step": 3332 + }, + { + "epoch": 0.5704261509498545, + "grad_norm": 32.07524871826172, + "learning_rate": 1.8978893325727326e-05, + "loss": 4.3224, + "step": 3333 + }, + { + "epoch": 0.5705972959096355, + "grad_norm": 3.1426124572753906, + "learning_rate": 1.898459783228751e-05, + "loss": 0.4036, + "step": 3334 + }, + { + "epoch": 0.5707684408694164, + "grad_norm": 19.389469146728516, + "learning_rate": 1.8990302338847693e-05, + "loss": 2.1048, + "step": 3335 + }, + { + "epoch": 0.5709395858291973, + "grad_norm": 17.071313858032227, + "learning_rate": 1.8996006845407873e-05, + "loss": 1.6332, + "step": 3336 + }, + { + "epoch": 0.5711107307889782, + "grad_norm": 7.998443603515625, + "learning_rate": 1.9001711351968056e-05, + "loss": 0.6125, + "step": 3337 + }, + { + "epoch": 0.5712818757487592, + "grad_norm": 27.566017150878906, + "learning_rate": 1.900741585852824e-05, + "loss": 3.0635, + "step": 3338 + }, + { + "epoch": 0.5714530207085401, + "grad_norm": 6.867462158203125, + "learning_rate": 1.901312036508842e-05, + "loss": 0.5083, + "step": 3339 + }, + { + "epoch": 0.5716241656683211, + "grad_norm": 24.942699432373047, + "learning_rate": 1.9018824871648603e-05, + "loss": 2.4329, + "step": 3340 + }, + { + "epoch": 0.571795310628102, + "grad_norm": 17.44595718383789, + "learning_rate": 1.9024529378208786e-05, + "loss": 1.2566, + "step": 3341 + }, + { + "epoch": 0.571966455587883, + "grad_norm": 30.833187103271484, + "learning_rate": 1.903023388476897e-05, + "loss": 2.7353, + "step": 3342 + }, + { + "epoch": 0.5721376005476638, + "grad_norm": 31.722270965576172, + "learning_rate": 1.903593839132915e-05, + "loss": 3.8463, + "step": 3343 + }, + { + "epoch": 0.5723087455074448, + "grad_norm": 12.909158706665039, + "learning_rate": 1.9041642897889333e-05, + "loss": 1.0365, + "step": 3344 + }, + { + "epoch": 0.5724798904672257, + "grad_norm": 32.17844772338867, + "learning_rate": 1.9047347404449516e-05, + "loss": 3.2414, + "step": 3345 + }, + { + "epoch": 0.5726510354270067, + "grad_norm": 25.432022094726562, + "learning_rate": 1.9053051911009696e-05, + "loss": 2.4539, + "step": 3346 + }, + { + "epoch": 0.5728221803867876, + "grad_norm": 2.373732805252075, + "learning_rate": 1.905875641756988e-05, + "loss": 0.3677, + "step": 3347 + }, + { + "epoch": 0.5729933253465685, + "grad_norm": 40.49632263183594, + "learning_rate": 1.9064460924130063e-05, + "loss": 3.9831, + "step": 3348 + }, + { + "epoch": 0.5731644703063494, + "grad_norm": 1.9657033681869507, + "learning_rate": 1.9070165430690246e-05, + "loss": 0.3775, + "step": 3349 + }, + { + "epoch": 0.5733356152661304, + "grad_norm": 61.38923645019531, + "learning_rate": 1.9075869937250426e-05, + "loss": 7.6187, + "step": 3350 + }, + { + "epoch": 0.5735067602259113, + "grad_norm": 24.892297744750977, + "learning_rate": 1.908157444381061e-05, + "loss": 2.5702, + "step": 3351 + }, + { + "epoch": 0.5736779051856923, + "grad_norm": 27.634868621826172, + "learning_rate": 1.9087278950370793e-05, + "loss": 2.6693, + "step": 3352 + }, + { + "epoch": 0.5738490501454732, + "grad_norm": 30.543689727783203, + "learning_rate": 1.9092983456930973e-05, + "loss": 2.881, + "step": 3353 + }, + { + "epoch": 0.5740201951052541, + "grad_norm": 20.875457763671875, + "learning_rate": 1.9098687963491157e-05, + "loss": 2.0431, + "step": 3354 + }, + { + "epoch": 0.5741913400650351, + "grad_norm": 10.260396003723145, + "learning_rate": 1.910439247005134e-05, + "loss": 1.0317, + "step": 3355 + }, + { + "epoch": 0.574362485024816, + "grad_norm": 28.790538787841797, + "learning_rate": 1.9110096976611527e-05, + "loss": 2.8238, + "step": 3356 + }, + { + "epoch": 0.574533629984597, + "grad_norm": 25.868772506713867, + "learning_rate": 1.9115801483171707e-05, + "loss": 2.5919, + "step": 3357 + }, + { + "epoch": 0.5747047749443779, + "grad_norm": 25.83347511291504, + "learning_rate": 1.912150598973189e-05, + "loss": 3.1996, + "step": 3358 + }, + { + "epoch": 0.5748759199041589, + "grad_norm": 29.329633712768555, + "learning_rate": 1.9127210496292073e-05, + "loss": 3.1316, + "step": 3359 + }, + { + "epoch": 0.5750470648639397, + "grad_norm": 9.001529693603516, + "learning_rate": 1.9132915002852253e-05, + "loss": 1.2363, + "step": 3360 + }, + { + "epoch": 0.5752182098237207, + "grad_norm": 5.358071804046631, + "learning_rate": 1.9138619509412437e-05, + "loss": 0.3949, + "step": 3361 + }, + { + "epoch": 0.5753893547835016, + "grad_norm": 13.40963363647461, + "learning_rate": 1.914432401597262e-05, + "loss": 0.7941, + "step": 3362 + }, + { + "epoch": 0.5755604997432826, + "grad_norm": 37.820556640625, + "learning_rate": 1.9150028522532804e-05, + "loss": 6.717, + "step": 3363 + }, + { + "epoch": 0.5757316447030635, + "grad_norm": 181.16746520996094, + "learning_rate": 1.9155733029092984e-05, + "loss": 9.5897, + "step": 3364 + }, + { + "epoch": 0.5759027896628445, + "grad_norm": 29.568683624267578, + "learning_rate": 1.9161437535653167e-05, + "loss": 2.4104, + "step": 3365 + }, + { + "epoch": 0.5760739346226253, + "grad_norm": 10.582496643066406, + "learning_rate": 1.916714204221335e-05, + "loss": 0.5204, + "step": 3366 + }, + { + "epoch": 0.5762450795824063, + "grad_norm": 37.75896072387695, + "learning_rate": 1.917284654877353e-05, + "loss": 6.4064, + "step": 3367 + }, + { + "epoch": 0.5764162245421872, + "grad_norm": 23.44141960144043, + "learning_rate": 1.9178551055333714e-05, + "loss": 2.2333, + "step": 3368 + }, + { + "epoch": 0.5765873695019682, + "grad_norm": 23.17081642150879, + "learning_rate": 1.9184255561893897e-05, + "loss": 2.3916, + "step": 3369 + }, + { + "epoch": 0.5767585144617491, + "grad_norm": 22.356122970581055, + "learning_rate": 1.918996006845408e-05, + "loss": 2.259, + "step": 3370 + }, + { + "epoch": 0.5769296594215301, + "grad_norm": 25.988954544067383, + "learning_rate": 1.919566457501426e-05, + "loss": 2.6056, + "step": 3371 + }, + { + "epoch": 0.577100804381311, + "grad_norm": 17.81022071838379, + "learning_rate": 1.9201369081574444e-05, + "loss": 1.3797, + "step": 3372 + }, + { + "epoch": 0.5772719493410919, + "grad_norm": 28.269866943359375, + "learning_rate": 1.9207073588134627e-05, + "loss": 2.3896, + "step": 3373 + }, + { + "epoch": 0.5774430943008728, + "grad_norm": 24.576251983642578, + "learning_rate": 1.9212778094694807e-05, + "loss": 2.2518, + "step": 3374 + }, + { + "epoch": 0.5776142392606538, + "grad_norm": 5.2097649574279785, + "learning_rate": 1.921848260125499e-05, + "loss": 0.3953, + "step": 3375 + }, + { + "epoch": 0.5777853842204347, + "grad_norm": 3.1124250888824463, + "learning_rate": 1.9224187107815174e-05, + "loss": 0.3687, + "step": 3376 + }, + { + "epoch": 0.5779565291802157, + "grad_norm": 20.81354331970215, + "learning_rate": 1.9229891614375354e-05, + "loss": 2.0595, + "step": 3377 + }, + { + "epoch": 0.5781276741399966, + "grad_norm": 29.21316909790039, + "learning_rate": 1.9235596120935537e-05, + "loss": 3.7875, + "step": 3378 + }, + { + "epoch": 0.5782988190997775, + "grad_norm": 84.69393157958984, + "learning_rate": 1.9241300627495724e-05, + "loss": 3.0112, + "step": 3379 + }, + { + "epoch": 0.5784699640595584, + "grad_norm": 1.8985782861709595, + "learning_rate": 1.9247005134055907e-05, + "loss": 0.3148, + "step": 3380 + }, + { + "epoch": 0.5786411090193394, + "grad_norm": 10.058646202087402, + "learning_rate": 1.9252709640616087e-05, + "loss": 0.8459, + "step": 3381 + }, + { + "epoch": 0.5788122539791203, + "grad_norm": 27.1168270111084, + "learning_rate": 1.925841414717627e-05, + "loss": 2.5347, + "step": 3382 + }, + { + "epoch": 0.5789833989389013, + "grad_norm": 89.62450408935547, + "learning_rate": 1.9264118653736454e-05, + "loss": 3.7248, + "step": 3383 + }, + { + "epoch": 0.5791545438986822, + "grad_norm": 4.5566558837890625, + "learning_rate": 1.9269823160296634e-05, + "loss": 0.6092, + "step": 3384 + }, + { + "epoch": 0.5793256888584631, + "grad_norm": 30.642803192138672, + "learning_rate": 1.9275527666856818e-05, + "loss": 3.5006, + "step": 3385 + }, + { + "epoch": 0.579496833818244, + "grad_norm": 27.308584213256836, + "learning_rate": 1.9281232173417e-05, + "loss": 3.4485, + "step": 3386 + }, + { + "epoch": 0.579667978778025, + "grad_norm": 29.646587371826172, + "learning_rate": 1.9286936679977184e-05, + "loss": 3.0531, + "step": 3387 + }, + { + "epoch": 0.5798391237378059, + "grad_norm": 14.223383903503418, + "learning_rate": 1.9292641186537364e-05, + "loss": 1.5835, + "step": 3388 + }, + { + "epoch": 0.5800102686975869, + "grad_norm": 24.695066452026367, + "learning_rate": 1.9298345693097548e-05, + "loss": 2.368, + "step": 3389 + }, + { + "epoch": 0.5801814136573678, + "grad_norm": 26.341815948486328, + "learning_rate": 1.930405019965773e-05, + "loss": 2.3547, + "step": 3390 + }, + { + "epoch": 0.5803525586171487, + "grad_norm": 18.38678741455078, + "learning_rate": 1.930975470621791e-05, + "loss": 1.7797, + "step": 3391 + }, + { + "epoch": 0.5805237035769296, + "grad_norm": 25.61127471923828, + "learning_rate": 1.9315459212778094e-05, + "loss": 2.638, + "step": 3392 + }, + { + "epoch": 0.5806948485367106, + "grad_norm": 2.290560007095337, + "learning_rate": 1.9321163719338278e-05, + "loss": 0.3367, + "step": 3393 + }, + { + "epoch": 0.5808659934964915, + "grad_norm": 11.412469863891602, + "learning_rate": 1.932686822589846e-05, + "loss": 1.1406, + "step": 3394 + }, + { + "epoch": 0.5810371384562725, + "grad_norm": 8.998905181884766, + "learning_rate": 1.933257273245864e-05, + "loss": 0.6117, + "step": 3395 + }, + { + "epoch": 0.5812082834160534, + "grad_norm": 7.52636194229126, + "learning_rate": 1.9338277239018825e-05, + "loss": 0.653, + "step": 3396 + }, + { + "epoch": 0.5813794283758343, + "grad_norm": 27.57058334350586, + "learning_rate": 1.9343981745579008e-05, + "loss": 3.0084, + "step": 3397 + }, + { + "epoch": 0.5815505733356152, + "grad_norm": 26.775415420532227, + "learning_rate": 1.9349686252139188e-05, + "loss": 2.9132, + "step": 3398 + }, + { + "epoch": 0.5817217182953962, + "grad_norm": 3.1826353073120117, + "learning_rate": 1.935539075869937e-05, + "loss": 0.3669, + "step": 3399 + }, + { + "epoch": 0.5818928632551771, + "grad_norm": 6.152859210968018, + "learning_rate": 1.9361095265259555e-05, + "loss": 0.6359, + "step": 3400 + }, + { + "epoch": 0.5820640082149581, + "grad_norm": 1.7208553552627563, + "learning_rate": 1.9366799771819738e-05, + "loss": 0.3746, + "step": 3401 + }, + { + "epoch": 0.582235153174739, + "grad_norm": 7.883406162261963, + "learning_rate": 1.937250427837992e-05, + "loss": 1.0997, + "step": 3402 + }, + { + "epoch": 0.5824062981345199, + "grad_norm": 26.301164627075195, + "learning_rate": 1.9378208784940105e-05, + "loss": 2.4291, + "step": 3403 + }, + { + "epoch": 0.5825774430943008, + "grad_norm": 23.660444259643555, + "learning_rate": 1.9383913291500288e-05, + "loss": 2.6292, + "step": 3404 + }, + { + "epoch": 0.5827485880540818, + "grad_norm": 17.410369873046875, + "learning_rate": 1.9389617798060468e-05, + "loss": 1.4877, + "step": 3405 + }, + { + "epoch": 0.5829197330138628, + "grad_norm": 31.716928482055664, + "learning_rate": 1.939532230462065e-05, + "loss": 3.778, + "step": 3406 + }, + { + "epoch": 0.5830908779736437, + "grad_norm": 39.23788833618164, + "learning_rate": 1.9401026811180835e-05, + "loss": 3.4125, + "step": 3407 + }, + { + "epoch": 0.5832620229334247, + "grad_norm": 21.296669006347656, + "learning_rate": 1.9406731317741015e-05, + "loss": 1.9439, + "step": 3408 + }, + { + "epoch": 0.5834331678932055, + "grad_norm": 4.249104022979736, + "learning_rate": 1.94124358243012e-05, + "loss": 0.5641, + "step": 3409 + }, + { + "epoch": 0.5836043128529865, + "grad_norm": 25.32843780517578, + "learning_rate": 1.9418140330861382e-05, + "loss": 2.1923, + "step": 3410 + }, + { + "epoch": 0.5837754578127674, + "grad_norm": 31.81114387512207, + "learning_rate": 1.9423844837421565e-05, + "loss": 6.2289, + "step": 3411 + }, + { + "epoch": 0.5839466027725484, + "grad_norm": 34.15937423706055, + "learning_rate": 1.9429549343981745e-05, + "loss": 7.0035, + "step": 3412 + }, + { + "epoch": 0.5841177477323293, + "grad_norm": 27.947298049926758, + "learning_rate": 1.943525385054193e-05, + "loss": 3.2937, + "step": 3413 + }, + { + "epoch": 0.5842888926921103, + "grad_norm": 13.201940536499023, + "learning_rate": 1.9440958357102112e-05, + "loss": 0.9029, + "step": 3414 + }, + { + "epoch": 0.5844600376518911, + "grad_norm": 21.287315368652344, + "learning_rate": 1.9446662863662292e-05, + "loss": 2.1476, + "step": 3415 + }, + { + "epoch": 0.5846311826116721, + "grad_norm": 27.151569366455078, + "learning_rate": 1.9452367370222475e-05, + "loss": 2.8889, + "step": 3416 + }, + { + "epoch": 0.584802327571453, + "grad_norm": 25.92886734008789, + "learning_rate": 1.945807187678266e-05, + "loss": 2.43, + "step": 3417 + }, + { + "epoch": 0.584973472531234, + "grad_norm": 16.41077423095703, + "learning_rate": 1.9463776383342842e-05, + "loss": 1.1515, + "step": 3418 + }, + { + "epoch": 0.5851446174910149, + "grad_norm": 7.387080669403076, + "learning_rate": 1.9469480889903022e-05, + "loss": 1.0438, + "step": 3419 + }, + { + "epoch": 0.5853157624507959, + "grad_norm": 28.30823516845703, + "learning_rate": 1.9475185396463205e-05, + "loss": 2.4817, + "step": 3420 + }, + { + "epoch": 0.5854869074105767, + "grad_norm": 19.957653045654297, + "learning_rate": 1.948088990302339e-05, + "loss": 2.492, + "step": 3421 + }, + { + "epoch": 0.5856580523703577, + "grad_norm": 26.708097457885742, + "learning_rate": 1.948659440958357e-05, + "loss": 2.9457, + "step": 3422 + }, + { + "epoch": 0.5858291973301386, + "grad_norm": 6.408317565917969, + "learning_rate": 1.9492298916143752e-05, + "loss": 0.7295, + "step": 3423 + }, + { + "epoch": 0.5860003422899196, + "grad_norm": 30.148130416870117, + "learning_rate": 1.9498003422703935e-05, + "loss": 3.3614, + "step": 3424 + }, + { + "epoch": 0.5861714872497005, + "grad_norm": 22.77581787109375, + "learning_rate": 1.9503707929264122e-05, + "loss": 1.9865, + "step": 3425 + }, + { + "epoch": 0.5863426322094815, + "grad_norm": 27.753477096557617, + "learning_rate": 1.9509412435824302e-05, + "loss": 2.899, + "step": 3426 + }, + { + "epoch": 0.5865137771692623, + "grad_norm": 6.288846015930176, + "learning_rate": 1.9515116942384486e-05, + "loss": 0.646, + "step": 3427 + }, + { + "epoch": 0.5866849221290433, + "grad_norm": 24.92253303527832, + "learning_rate": 1.952082144894467e-05, + "loss": 2.5784, + "step": 3428 + }, + { + "epoch": 0.5868560670888242, + "grad_norm": 24.49477767944336, + "learning_rate": 1.952652595550485e-05, + "loss": 2.1704, + "step": 3429 + }, + { + "epoch": 0.5870272120486052, + "grad_norm": 24.100597381591797, + "learning_rate": 1.9532230462065032e-05, + "loss": 2.5277, + "step": 3430 + }, + { + "epoch": 0.5871983570083861, + "grad_norm": 21.0911922454834, + "learning_rate": 1.9537934968625216e-05, + "loss": 2.2242, + "step": 3431 + }, + { + "epoch": 0.5873695019681671, + "grad_norm": 22.534944534301758, + "learning_rate": 1.95436394751854e-05, + "loss": 2.3912, + "step": 3432 + }, + { + "epoch": 0.5875406469279479, + "grad_norm": 22.132417678833008, + "learning_rate": 1.954934398174558e-05, + "loss": 2.3557, + "step": 3433 + }, + { + "epoch": 0.5877117918877289, + "grad_norm": 21.22612953186035, + "learning_rate": 1.9555048488305762e-05, + "loss": 1.8585, + "step": 3434 + }, + { + "epoch": 0.5878829368475098, + "grad_norm": 87.90875244140625, + "learning_rate": 1.9560752994865946e-05, + "loss": 4.0792, + "step": 3435 + }, + { + "epoch": 0.5880540818072908, + "grad_norm": 89.19034576416016, + "learning_rate": 1.9566457501426126e-05, + "loss": 4.43, + "step": 3436 + }, + { + "epoch": 0.5882252267670717, + "grad_norm": 19.258451461791992, + "learning_rate": 1.957216200798631e-05, + "loss": 1.7874, + "step": 3437 + }, + { + "epoch": 0.5883963717268527, + "grad_norm": 1.7522574663162231, + "learning_rate": 1.9577866514546493e-05, + "loss": 0.3112, + "step": 3438 + }, + { + "epoch": 0.5885675166866335, + "grad_norm": 18.229957580566406, + "learning_rate": 1.9583571021106676e-05, + "loss": 2.766, + "step": 3439 + }, + { + "epoch": 0.5887386616464145, + "grad_norm": 36.58788299560547, + "learning_rate": 1.9589275527666856e-05, + "loss": 5.2847, + "step": 3440 + }, + { + "epoch": 0.5889098066061954, + "grad_norm": 23.946247100830078, + "learning_rate": 1.959498003422704e-05, + "loss": 2.3658, + "step": 3441 + }, + { + "epoch": 0.5890809515659764, + "grad_norm": 29.713180541992188, + "learning_rate": 1.9600684540787223e-05, + "loss": 3.6696, + "step": 3442 + }, + { + "epoch": 0.5892520965257573, + "grad_norm": 22.247447967529297, + "learning_rate": 1.9606389047347403e-05, + "loss": 2.6781, + "step": 3443 + }, + { + "epoch": 0.5894232414855383, + "grad_norm": 23.726993560791016, + "learning_rate": 1.9612093553907586e-05, + "loss": 2.765, + "step": 3444 + }, + { + "epoch": 0.5895943864453191, + "grad_norm": 39.94513702392578, + "learning_rate": 1.961779806046777e-05, + "loss": 7.0047, + "step": 3445 + }, + { + "epoch": 0.5897655314051001, + "grad_norm": 24.248090744018555, + "learning_rate": 1.962350256702795e-05, + "loss": 2.7753, + "step": 3446 + }, + { + "epoch": 0.589936676364881, + "grad_norm": 3.6631691455841064, + "learning_rate": 1.9629207073588133e-05, + "loss": 0.3529, + "step": 3447 + }, + { + "epoch": 0.590107821324662, + "grad_norm": 25.42365264892578, + "learning_rate": 1.963491158014832e-05, + "loss": 2.1961, + "step": 3448 + }, + { + "epoch": 0.5902789662844429, + "grad_norm": 25.308515548706055, + "learning_rate": 1.9640616086708503e-05, + "loss": 2.3387, + "step": 3449 + }, + { + "epoch": 0.5904501112442239, + "grad_norm": 19.806636810302734, + "learning_rate": 1.9646320593268683e-05, + "loss": 2.0071, + "step": 3450 + }, + { + "epoch": 0.5906212562040047, + "grad_norm": 17.552900314331055, + "learning_rate": 1.9652025099828866e-05, + "loss": 1.4304, + "step": 3451 + }, + { + "epoch": 0.5907924011637857, + "grad_norm": 23.210519790649414, + "learning_rate": 1.965772960638905e-05, + "loss": 2.1445, + "step": 3452 + }, + { + "epoch": 0.5909635461235666, + "grad_norm": 25.595361709594727, + "learning_rate": 1.966343411294923e-05, + "loss": 3.0135, + "step": 3453 + }, + { + "epoch": 0.5911346910833476, + "grad_norm": 3.9893271923065186, + "learning_rate": 1.9669138619509413e-05, + "loss": 0.4081, + "step": 3454 + }, + { + "epoch": 0.5913058360431286, + "grad_norm": 2.2912561893463135, + "learning_rate": 1.9674843126069596e-05, + "loss": 0.3066, + "step": 3455 + }, + { + "epoch": 0.5914769810029095, + "grad_norm": 23.45972442626953, + "learning_rate": 1.968054763262978e-05, + "loss": 2.4938, + "step": 3456 + }, + { + "epoch": 0.5916481259626905, + "grad_norm": 24.78557777404785, + "learning_rate": 1.968625213918996e-05, + "loss": 2.2888, + "step": 3457 + }, + { + "epoch": 0.5918192709224713, + "grad_norm": 56.51396560668945, + "learning_rate": 1.9691956645750143e-05, + "loss": 3.3839, + "step": 3458 + }, + { + "epoch": 0.5919904158822523, + "grad_norm": 15.350875854492188, + "learning_rate": 1.9697661152310327e-05, + "loss": 1.0531, + "step": 3459 + }, + { + "epoch": 0.5921615608420332, + "grad_norm": 73.21929931640625, + "learning_rate": 1.9703365658870507e-05, + "loss": 3.5199, + "step": 3460 + }, + { + "epoch": 0.5923327058018142, + "grad_norm": 29.828990936279297, + "learning_rate": 1.970907016543069e-05, + "loss": 3.8054, + "step": 3461 + }, + { + "epoch": 0.5925038507615951, + "grad_norm": 18.3194637298584, + "learning_rate": 1.9714774671990873e-05, + "loss": 1.7246, + "step": 3462 + }, + { + "epoch": 0.592674995721376, + "grad_norm": 29.311429977416992, + "learning_rate": 1.9720479178551057e-05, + "loss": 3.2428, + "step": 3463 + }, + { + "epoch": 0.5928461406811569, + "grad_norm": 1.9222893714904785, + "learning_rate": 1.9726183685111237e-05, + "loss": 0.3078, + "step": 3464 + }, + { + "epoch": 0.5930172856409379, + "grad_norm": 6.286295413970947, + "learning_rate": 1.973188819167142e-05, + "loss": 0.6446, + "step": 3465 + }, + { + "epoch": 0.5931884306007188, + "grad_norm": 29.647480010986328, + "learning_rate": 1.9737592698231603e-05, + "loss": 2.9621, + "step": 3466 + }, + { + "epoch": 0.5933595755604998, + "grad_norm": 26.92269515991211, + "learning_rate": 1.9743297204791783e-05, + "loss": 2.5933, + "step": 3467 + }, + { + "epoch": 0.5935307205202807, + "grad_norm": 50.6396484375, + "learning_rate": 1.9749001711351967e-05, + "loss": 6.8098, + "step": 3468 + }, + { + "epoch": 0.5937018654800617, + "grad_norm": 25.224733352661133, + "learning_rate": 1.975470621791215e-05, + "loss": 2.4431, + "step": 3469 + }, + { + "epoch": 0.5938730104398425, + "grad_norm": 17.845563888549805, + "learning_rate": 1.9760410724472334e-05, + "loss": 1.7165, + "step": 3470 + }, + { + "epoch": 0.5940441553996235, + "grad_norm": 5.634066104888916, + "learning_rate": 1.9766115231032517e-05, + "loss": 0.5882, + "step": 3471 + }, + { + "epoch": 0.5942153003594044, + "grad_norm": 34.622920989990234, + "learning_rate": 1.97718197375927e-05, + "loss": 3.5714, + "step": 3472 + }, + { + "epoch": 0.5943864453191854, + "grad_norm": 63.40961837768555, + "learning_rate": 1.9777524244152884e-05, + "loss": 2.734, + "step": 3473 + }, + { + "epoch": 0.5945575902789663, + "grad_norm": 29.88731575012207, + "learning_rate": 1.9783228750713064e-05, + "loss": 3.8436, + "step": 3474 + }, + { + "epoch": 0.5947287352387473, + "grad_norm": 27.8708553314209, + "learning_rate": 1.9788933257273247e-05, + "loss": 2.3388, + "step": 3475 + }, + { + "epoch": 0.5948998801985281, + "grad_norm": 25.777362823486328, + "learning_rate": 1.979463776383343e-05, + "loss": 2.3517, + "step": 3476 + }, + { + "epoch": 0.5950710251583091, + "grad_norm": 14.805953979492188, + "learning_rate": 1.980034227039361e-05, + "loss": 1.5038, + "step": 3477 + }, + { + "epoch": 0.59524217011809, + "grad_norm": 19.073440551757812, + "learning_rate": 1.9806046776953794e-05, + "loss": 1.8955, + "step": 3478 + }, + { + "epoch": 0.595413315077871, + "grad_norm": 21.738014221191406, + "learning_rate": 1.9811751283513977e-05, + "loss": 2.3869, + "step": 3479 + }, + { + "epoch": 0.5955844600376519, + "grad_norm": 2.9714324474334717, + "learning_rate": 1.981745579007416e-05, + "loss": 0.3378, + "step": 3480 + }, + { + "epoch": 0.5957556049974329, + "grad_norm": 8.826178550720215, + "learning_rate": 1.982316029663434e-05, + "loss": 0.9817, + "step": 3481 + }, + { + "epoch": 0.5959267499572137, + "grad_norm": 16.54644012451172, + "learning_rate": 1.9828864803194524e-05, + "loss": 1.2827, + "step": 3482 + }, + { + "epoch": 0.5960978949169947, + "grad_norm": 9.384221076965332, + "learning_rate": 1.9834569309754707e-05, + "loss": 1.3316, + "step": 3483 + }, + { + "epoch": 0.5962690398767756, + "grad_norm": 25.255199432373047, + "learning_rate": 1.9840273816314887e-05, + "loss": 2.1236, + "step": 3484 + }, + { + "epoch": 0.5964401848365566, + "grad_norm": 27.23832893371582, + "learning_rate": 1.984597832287507e-05, + "loss": 2.8921, + "step": 3485 + }, + { + "epoch": 0.5966113297963375, + "grad_norm": 31.743816375732422, + "learning_rate": 1.9851682829435254e-05, + "loss": 4.1041, + "step": 3486 + }, + { + "epoch": 0.5967824747561185, + "grad_norm": 23.10817527770996, + "learning_rate": 1.9857387335995437e-05, + "loss": 1.973, + "step": 3487 + }, + { + "epoch": 0.5969536197158993, + "grad_norm": 40.163639068603516, + "learning_rate": 1.9863091842555617e-05, + "loss": 6.3457, + "step": 3488 + }, + { + "epoch": 0.5971247646756803, + "grad_norm": 29.302976608276367, + "learning_rate": 1.98687963491158e-05, + "loss": 2.8273, + "step": 3489 + }, + { + "epoch": 0.5972959096354612, + "grad_norm": 29.635021209716797, + "learning_rate": 1.9874500855675984e-05, + "loss": 3.671, + "step": 3490 + }, + { + "epoch": 0.5974670545952422, + "grad_norm": 21.227108001708984, + "learning_rate": 1.9880205362236164e-05, + "loss": 2.1672, + "step": 3491 + }, + { + "epoch": 0.5976381995550231, + "grad_norm": 30.448522567749023, + "learning_rate": 1.9885909868796348e-05, + "loss": 3.0936, + "step": 3492 + }, + { + "epoch": 0.597809344514804, + "grad_norm": 27.133663177490234, + "learning_rate": 1.9891614375356534e-05, + "loss": 2.4887, + "step": 3493 + }, + { + "epoch": 0.5979804894745849, + "grad_norm": 39.466121673583984, + "learning_rate": 1.9897318881916718e-05, + "loss": 4.8888, + "step": 3494 + }, + { + "epoch": 0.5981516344343659, + "grad_norm": 39.85908889770508, + "learning_rate": 1.9903023388476898e-05, + "loss": 6.6469, + "step": 3495 + }, + { + "epoch": 0.5983227793941468, + "grad_norm": 19.293907165527344, + "learning_rate": 1.990872789503708e-05, + "loss": 2.0085, + "step": 3496 + }, + { + "epoch": 0.5984939243539278, + "grad_norm": 30.540531158447266, + "learning_rate": 1.9914432401597265e-05, + "loss": 2.8524, + "step": 3497 + }, + { + "epoch": 0.5986650693137087, + "grad_norm": 2.173297882080078, + "learning_rate": 1.9920136908157444e-05, + "loss": 0.4671, + "step": 3498 + }, + { + "epoch": 0.5988362142734897, + "grad_norm": 23.616220474243164, + "learning_rate": 1.9925841414717628e-05, + "loss": 2.1941, + "step": 3499 + }, + { + "epoch": 0.5990073592332705, + "grad_norm": 10.88476276397705, + "learning_rate": 1.993154592127781e-05, + "loss": 0.9849, + "step": 3500 + }, + { + "epoch": 0.5991785041930515, + "grad_norm": 35.73077392578125, + "learning_rate": 1.9937250427837995e-05, + "loss": 3.3526, + "step": 3501 + }, + { + "epoch": 0.5993496491528324, + "grad_norm": 16.617977142333984, + "learning_rate": 1.9942954934398175e-05, + "loss": 1.305, + "step": 3502 + }, + { + "epoch": 0.5995207941126134, + "grad_norm": 18.637554168701172, + "learning_rate": 1.9948659440958358e-05, + "loss": 1.7833, + "step": 3503 + }, + { + "epoch": 0.5996919390723943, + "grad_norm": 22.126482009887695, + "learning_rate": 1.995436394751854e-05, + "loss": 1.8701, + "step": 3504 + }, + { + "epoch": 0.5998630840321753, + "grad_norm": 19.62862777709961, + "learning_rate": 1.996006845407872e-05, + "loss": 1.9236, + "step": 3505 + }, + { + "epoch": 0.6000342289919562, + "grad_norm": 27.936777114868164, + "learning_rate": 1.9965772960638905e-05, + "loss": 2.4178, + "step": 3506 + } + ], + "logging_steps": 1, + "max_steps": 17529, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1753, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}