{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999194392975107, "eval_steps": 500, "global_step": 6206, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 122.5660216815097, "learning_rate": 5.3475935828877005e-08, "loss": 1.4579, "step": 1 }, { "epoch": 0.0, "grad_norm": 96.20238517935834, "learning_rate": 1.0695187165775401e-07, "loss": 1.524, "step": 2 }, { "epoch": 0.0, "grad_norm": 849.0350587509935, "learning_rate": 1.6042780748663104e-07, "loss": 1.4637, "step": 3 }, { "epoch": 0.0, "grad_norm": 182.20744858900673, "learning_rate": 2.1390374331550802e-07, "loss": 1.5894, "step": 4 }, { "epoch": 0.0, "grad_norm": 231.05838823335316, "learning_rate": 2.6737967914438503e-07, "loss": 1.5317, "step": 5 }, { "epoch": 0.0, "grad_norm": 202.13167007470753, "learning_rate": 3.208556149732621e-07, "loss": 1.503, "step": 6 }, { "epoch": 0.0, "grad_norm": 118.1413889257339, "learning_rate": 3.7433155080213904e-07, "loss": 1.4591, "step": 7 }, { "epoch": 0.0, "grad_norm": 9.38572531457369, "learning_rate": 4.2780748663101604e-07, "loss": 0.9505, "step": 8 }, { "epoch": 0.0, "grad_norm": 82.05875043350517, "learning_rate": 4.812834224598931e-07, "loss": 1.4157, "step": 9 }, { "epoch": 0.0, "grad_norm": 70.05842179949386, "learning_rate": 5.347593582887701e-07, "loss": 1.2957, "step": 10 }, { "epoch": 0.0, "grad_norm": 56.513707608857935, "learning_rate": 5.882352941176471e-07, "loss": 1.2736, "step": 11 }, { "epoch": 0.0, "grad_norm": 8.69388740050222, "learning_rate": 6.417112299465242e-07, "loss": 0.8761, "step": 12 }, { "epoch": 0.0, "grad_norm": 49.36372707295016, "learning_rate": 6.951871657754011e-07, "loss": 1.2075, "step": 13 }, { "epoch": 0.0, "grad_norm": 54.19809693130065, "learning_rate": 7.486631016042781e-07, "loss": 1.1013, "step": 14 }, { "epoch": 0.0, "grad_norm": 27.337284203855997, "learning_rate": 8.021390374331551e-07, "loss": 1.0114, "step": 15 }, { "epoch": 0.0, "grad_norm": 50.81692104494523, "learning_rate": 8.556149732620321e-07, "loss": 1.0453, "step": 16 }, { "epoch": 0.0, "grad_norm": 29.485661099962932, "learning_rate": 9.090909090909091e-07, "loss": 0.8666, "step": 17 }, { "epoch": 0.0, "grad_norm": 22.564586972051583, "learning_rate": 9.625668449197862e-07, "loss": 0.8693, "step": 18 }, { "epoch": 0.0, "grad_norm": 24.12789244295064, "learning_rate": 1.0160427807486633e-06, "loss": 0.8764, "step": 19 }, { "epoch": 0.0, "grad_norm": 57.57569204864652, "learning_rate": 1.0695187165775401e-06, "loss": 0.891, "step": 20 }, { "epoch": 0.0, "grad_norm": 22.184258097094585, "learning_rate": 1.1229946524064172e-06, "loss": 0.8579, "step": 21 }, { "epoch": 0.0, "grad_norm": 28.40191743303267, "learning_rate": 1.1764705882352942e-06, "loss": 0.9162, "step": 22 }, { "epoch": 0.0, "grad_norm": 7.798274748644812, "learning_rate": 1.2299465240641713e-06, "loss": 0.7764, "step": 23 }, { "epoch": 0.0, "grad_norm": 34.09266117254502, "learning_rate": 1.2834224598930483e-06, "loss": 0.9927, "step": 24 }, { "epoch": 0.0, "grad_norm": 22.695569160117596, "learning_rate": 1.3368983957219254e-06, "loss": 0.911, "step": 25 }, { "epoch": 0.0, "grad_norm": 8.139679518356331, "learning_rate": 1.3903743315508022e-06, "loss": 0.7543, "step": 26 }, { "epoch": 0.0, "grad_norm": 22.45203333256374, "learning_rate": 1.4438502673796793e-06, "loss": 0.7663, "step": 27 }, { "epoch": 0.0, "grad_norm": 29.435173416448855, "learning_rate": 1.4973262032085562e-06, "loss": 0.9151, "step": 28 }, { "epoch": 0.0, "grad_norm": 48.716233763851676, "learning_rate": 1.5508021390374334e-06, "loss": 0.898, "step": 29 }, { "epoch": 0.0, "grad_norm": 34.64254333954248, "learning_rate": 1.6042780748663103e-06, "loss": 0.854, "step": 30 }, { "epoch": 0.0, "grad_norm": 17.356025431171073, "learning_rate": 1.6577540106951873e-06, "loss": 0.7167, "step": 31 }, { "epoch": 0.01, "grad_norm": 23.525924335048796, "learning_rate": 1.7112299465240642e-06, "loss": 0.8054, "step": 32 }, { "epoch": 0.01, "grad_norm": 20.999975435460883, "learning_rate": 1.7647058823529414e-06, "loss": 0.8317, "step": 33 }, { "epoch": 0.01, "grad_norm": 116.68779213216546, "learning_rate": 1.8181818181818183e-06, "loss": 0.7837, "step": 34 }, { "epoch": 0.01, "grad_norm": 26.071062158998576, "learning_rate": 1.8716577540106954e-06, "loss": 0.6666, "step": 35 }, { "epoch": 0.01, "grad_norm": 38.38066714403462, "learning_rate": 1.9251336898395724e-06, "loss": 0.7705, "step": 36 }, { "epoch": 0.01, "grad_norm": 22.450538903347848, "learning_rate": 1.9786096256684497e-06, "loss": 0.7418, "step": 37 }, { "epoch": 0.01, "grad_norm": 30.30609757613985, "learning_rate": 2.0320855614973265e-06, "loss": 0.7849, "step": 38 }, { "epoch": 0.01, "grad_norm": 33.965207939999864, "learning_rate": 2.0855614973262034e-06, "loss": 0.801, "step": 39 }, { "epoch": 0.01, "grad_norm": 27.47207670640874, "learning_rate": 2.1390374331550802e-06, "loss": 0.7227, "step": 40 }, { "epoch": 0.01, "grad_norm": 79.25148574388751, "learning_rate": 2.1925133689839575e-06, "loss": 0.7811, "step": 41 }, { "epoch": 0.01, "grad_norm": 29.200339339053983, "learning_rate": 2.2459893048128343e-06, "loss": 0.7301, "step": 42 }, { "epoch": 0.01, "grad_norm": 31.11611622077372, "learning_rate": 2.2994652406417116e-06, "loss": 0.6627, "step": 43 }, { "epoch": 0.01, "grad_norm": 31.134151427915988, "learning_rate": 2.3529411764705885e-06, "loss": 0.7633, "step": 44 }, { "epoch": 0.01, "grad_norm": 53.41608383517845, "learning_rate": 2.4064171122994653e-06, "loss": 0.8341, "step": 45 }, { "epoch": 0.01, "grad_norm": 44.86140085240313, "learning_rate": 2.4598930481283426e-06, "loss": 0.7442, "step": 46 }, { "epoch": 0.01, "grad_norm": 22.250899081589598, "learning_rate": 2.5133689839572194e-06, "loss": 0.7348, "step": 47 }, { "epoch": 0.01, "grad_norm": 22.921556463732713, "learning_rate": 2.5668449197860967e-06, "loss": 0.6637, "step": 48 }, { "epoch": 0.01, "grad_norm": 24.09751015733259, "learning_rate": 2.6203208556149735e-06, "loss": 0.695, "step": 49 }, { "epoch": 0.01, "grad_norm": 135.24250227202447, "learning_rate": 2.673796791443851e-06, "loss": 0.626, "step": 50 }, { "epoch": 0.01, "grad_norm": 24.999289116754852, "learning_rate": 2.7272727272727272e-06, "loss": 0.759, "step": 51 }, { "epoch": 0.01, "grad_norm": 12.29332324334594, "learning_rate": 2.7807486631016045e-06, "loss": 0.6928, "step": 52 }, { "epoch": 0.01, "grad_norm": 19.952618467661395, "learning_rate": 2.8342245989304818e-06, "loss": 0.762, "step": 53 }, { "epoch": 0.01, "grad_norm": 47.66698831670102, "learning_rate": 2.8877005347593586e-06, "loss": 0.6553, "step": 54 }, { "epoch": 0.01, "grad_norm": 19.87890828382766, "learning_rate": 2.9411764705882355e-06, "loss": 0.677, "step": 55 }, { "epoch": 0.01, "grad_norm": 22.021060921393875, "learning_rate": 2.9946524064171123e-06, "loss": 0.6556, "step": 56 }, { "epoch": 0.01, "grad_norm": 51.313613097860795, "learning_rate": 3.0481283422459896e-06, "loss": 0.6911, "step": 57 }, { "epoch": 0.01, "grad_norm": 15.232961668505869, "learning_rate": 3.101604278074867e-06, "loss": 0.6446, "step": 58 }, { "epoch": 0.01, "grad_norm": 17.793694324941825, "learning_rate": 3.1550802139037433e-06, "loss": 0.7448, "step": 59 }, { "epoch": 0.01, "grad_norm": 17.99288737861374, "learning_rate": 3.2085561497326205e-06, "loss": 0.5912, "step": 60 }, { "epoch": 0.01, "grad_norm": 14.122313155923214, "learning_rate": 3.262032085561498e-06, "loss": 0.7101, "step": 61 }, { "epoch": 0.01, "grad_norm": 17.17160324873029, "learning_rate": 3.3155080213903747e-06, "loss": 0.7451, "step": 62 }, { "epoch": 0.01, "grad_norm": 29.498395656556355, "learning_rate": 3.368983957219252e-06, "loss": 0.7817, "step": 63 }, { "epoch": 0.01, "grad_norm": 44.811745296161426, "learning_rate": 3.4224598930481284e-06, "loss": 0.6402, "step": 64 }, { "epoch": 0.01, "grad_norm": 16.25018370181595, "learning_rate": 3.4759358288770056e-06, "loss": 0.5297, "step": 65 }, { "epoch": 0.01, "grad_norm": 85.16994505724112, "learning_rate": 3.529411764705883e-06, "loss": 0.7363, "step": 66 }, { "epoch": 0.01, "grad_norm": 20.3349676620617, "learning_rate": 3.5828877005347597e-06, "loss": 0.6979, "step": 67 }, { "epoch": 0.01, "grad_norm": 20.41634192789583, "learning_rate": 3.6363636363636366e-06, "loss": 0.639, "step": 68 }, { "epoch": 0.01, "grad_norm": 16.61051722348918, "learning_rate": 3.6898395721925134e-06, "loss": 0.6495, "step": 69 }, { "epoch": 0.01, "grad_norm": 23.737252354348907, "learning_rate": 3.7433155080213907e-06, "loss": 0.6276, "step": 70 }, { "epoch": 0.01, "grad_norm": 30.61292490845703, "learning_rate": 3.796791443850268e-06, "loss": 0.6799, "step": 71 }, { "epoch": 0.01, "grad_norm": 18.93007032813426, "learning_rate": 3.850267379679145e-06, "loss": 0.6498, "step": 72 }, { "epoch": 0.01, "grad_norm": 28.244661419400103, "learning_rate": 3.903743315508022e-06, "loss": 0.6814, "step": 73 }, { "epoch": 0.01, "grad_norm": 14.863467310016713, "learning_rate": 3.957219251336899e-06, "loss": 0.6161, "step": 74 }, { "epoch": 0.01, "grad_norm": 42.089666891351996, "learning_rate": 4.010695187165775e-06, "loss": 0.6478, "step": 75 }, { "epoch": 0.01, "grad_norm": 23.754336196286815, "learning_rate": 4.064171122994653e-06, "loss": 0.6172, "step": 76 }, { "epoch": 0.01, "grad_norm": 18.767807519935523, "learning_rate": 4.11764705882353e-06, "loss": 0.6644, "step": 77 }, { "epoch": 0.01, "grad_norm": 16.954630485400536, "learning_rate": 4.171122994652407e-06, "loss": 0.6248, "step": 78 }, { "epoch": 0.01, "grad_norm": 4.921577083561282, "learning_rate": 4.224598930481284e-06, "loss": 0.5517, "step": 79 }, { "epoch": 0.01, "grad_norm": 26.780388977711148, "learning_rate": 4.2780748663101604e-06, "loss": 0.7048, "step": 80 }, { "epoch": 0.01, "grad_norm": 22.546568332881034, "learning_rate": 4.331550802139038e-06, "loss": 0.6402, "step": 81 }, { "epoch": 0.01, "grad_norm": 24.135059031162864, "learning_rate": 4.385026737967915e-06, "loss": 0.5676, "step": 82 }, { "epoch": 0.01, "grad_norm": 18.366818375625567, "learning_rate": 4.438502673796792e-06, "loss": 0.5909, "step": 83 }, { "epoch": 0.01, "grad_norm": 34.29237119670637, "learning_rate": 4.491978609625669e-06, "loss": 0.6756, "step": 84 }, { "epoch": 0.01, "grad_norm": 38.921358940231755, "learning_rate": 4.5454545454545455e-06, "loss": 0.6855, "step": 85 }, { "epoch": 0.01, "grad_norm": 21.573474684677027, "learning_rate": 4.598930481283423e-06, "loss": 0.5491, "step": 86 }, { "epoch": 0.01, "grad_norm": 16.27110683432754, "learning_rate": 4.6524064171123e-06, "loss": 0.6556, "step": 87 }, { "epoch": 0.01, "grad_norm": 14.517023376309627, "learning_rate": 4.705882352941177e-06, "loss": 0.7152, "step": 88 }, { "epoch": 0.01, "grad_norm": 14.485806438586652, "learning_rate": 4.759358288770054e-06, "loss": 0.5804, "step": 89 }, { "epoch": 0.01, "grad_norm": 12.101339416206482, "learning_rate": 4.812834224598931e-06, "loss": 0.6072, "step": 90 }, { "epoch": 0.01, "grad_norm": 18.75766830436097, "learning_rate": 4.866310160427808e-06, "loss": 0.5972, "step": 91 }, { "epoch": 0.01, "grad_norm": 42.48051171926769, "learning_rate": 4.919786096256685e-06, "loss": 0.6329, "step": 92 }, { "epoch": 0.01, "grad_norm": 58.99865852419644, "learning_rate": 4.973262032085562e-06, "loss": 0.659, "step": 93 }, { "epoch": 0.02, "grad_norm": 20.583671652291816, "learning_rate": 5.026737967914439e-06, "loss": 0.5816, "step": 94 }, { "epoch": 0.02, "grad_norm": 15.163742040869481, "learning_rate": 5.0802139037433165e-06, "loss": 0.6186, "step": 95 }, { "epoch": 0.02, "grad_norm": 14.42481259047122, "learning_rate": 5.133689839572193e-06, "loss": 0.6531, "step": 96 }, { "epoch": 0.02, "grad_norm": 68.4171386846051, "learning_rate": 5.187165775401069e-06, "loss": 0.6845, "step": 97 }, { "epoch": 0.02, "grad_norm": 32.42262668051302, "learning_rate": 5.240641711229947e-06, "loss": 0.6458, "step": 98 }, { "epoch": 0.02, "grad_norm": 2.6575772208807753, "learning_rate": 5.294117647058824e-06, "loss": 0.5741, "step": 99 }, { "epoch": 0.02, "grad_norm": 26.97516445099372, "learning_rate": 5.347593582887702e-06, "loss": 0.6071, "step": 100 }, { "epoch": 0.02, "grad_norm": 19.268147547208816, "learning_rate": 5.4010695187165785e-06, "loss": 0.6456, "step": 101 }, { "epoch": 0.02, "grad_norm": 22.009344652004238, "learning_rate": 5.4545454545454545e-06, "loss": 0.6481, "step": 102 }, { "epoch": 0.02, "grad_norm": 37.86598942391542, "learning_rate": 5.508021390374332e-06, "loss": 0.6428, "step": 103 }, { "epoch": 0.02, "grad_norm": 2.220507916056364, "learning_rate": 5.561497326203209e-06, "loss": 0.5524, "step": 104 }, { "epoch": 0.02, "grad_norm": 31.882822163137917, "learning_rate": 5.614973262032086e-06, "loss": 0.6773, "step": 105 }, { "epoch": 0.02, "grad_norm": 21.066689453711753, "learning_rate": 5.6684491978609635e-06, "loss": 0.5308, "step": 106 }, { "epoch": 0.02, "grad_norm": 12.810074093176306, "learning_rate": 5.7219251336898395e-06, "loss": 0.5844, "step": 107 }, { "epoch": 0.02, "grad_norm": 37.64071299743144, "learning_rate": 5.775401069518717e-06, "loss": 0.645, "step": 108 }, { "epoch": 0.02, "grad_norm": 42.61317343790438, "learning_rate": 5.828877005347594e-06, "loss": 0.6186, "step": 109 }, { "epoch": 0.02, "grad_norm": 20.320797351695944, "learning_rate": 5.882352941176471e-06, "loss": 0.6261, "step": 110 }, { "epoch": 0.02, "grad_norm": 41.82428402340237, "learning_rate": 5.935828877005349e-06, "loss": 0.6811, "step": 111 }, { "epoch": 0.02, "grad_norm": 18.05125983518669, "learning_rate": 5.989304812834225e-06, "loss": 0.5138, "step": 112 }, { "epoch": 0.02, "grad_norm": 30.54162507624327, "learning_rate": 6.0427807486631015e-06, "loss": 0.6124, "step": 113 }, { "epoch": 0.02, "grad_norm": 27.222009084195935, "learning_rate": 6.096256684491979e-06, "loss": 0.5959, "step": 114 }, { "epoch": 0.02, "grad_norm": 19.57620562665309, "learning_rate": 6.149732620320856e-06, "loss": 0.6031, "step": 115 }, { "epoch": 0.02, "grad_norm": 15.855726387451963, "learning_rate": 6.203208556149734e-06, "loss": 0.5891, "step": 116 }, { "epoch": 0.02, "grad_norm": 24.941928283806412, "learning_rate": 6.25668449197861e-06, "loss": 0.7241, "step": 117 }, { "epoch": 0.02, "grad_norm": 12.537936109313009, "learning_rate": 6.3101604278074865e-06, "loss": 0.5994, "step": 118 }, { "epoch": 0.02, "grad_norm": 19.81397782910211, "learning_rate": 6.363636363636364e-06, "loss": 0.6061, "step": 119 }, { "epoch": 0.02, "grad_norm": 26.976056572828575, "learning_rate": 6.417112299465241e-06, "loss": 0.5641, "step": 120 }, { "epoch": 0.02, "grad_norm": 20.92909696757465, "learning_rate": 6.470588235294119e-06, "loss": 0.5742, "step": 121 }, { "epoch": 0.02, "grad_norm": 59.960917768114115, "learning_rate": 6.524064171122996e-06, "loss": 0.5995, "step": 122 }, { "epoch": 0.02, "grad_norm": 21.486053491694033, "learning_rate": 6.577540106951872e-06, "loss": 0.6093, "step": 123 }, { "epoch": 0.02, "grad_norm": 11.968826538969584, "learning_rate": 6.631016042780749e-06, "loss": 0.596, "step": 124 }, { "epoch": 0.02, "grad_norm": 25.91913743513616, "learning_rate": 6.684491978609626e-06, "loss": 0.6512, "step": 125 }, { "epoch": 0.02, "grad_norm": 20.615735313978416, "learning_rate": 6.737967914438504e-06, "loss": 0.5516, "step": 126 }, { "epoch": 0.02, "grad_norm": 20.31173641758594, "learning_rate": 6.791443850267381e-06, "loss": 0.5893, "step": 127 }, { "epoch": 0.02, "grad_norm": 18.18932198046168, "learning_rate": 6.844919786096257e-06, "loss": 0.6299, "step": 128 }, { "epoch": 0.02, "grad_norm": 18.325853887283998, "learning_rate": 6.898395721925134e-06, "loss": 0.6586, "step": 129 }, { "epoch": 0.02, "grad_norm": 22.484889889241355, "learning_rate": 6.951871657754011e-06, "loss": 0.574, "step": 130 }, { "epoch": 0.02, "grad_norm": 22.084320699756343, "learning_rate": 7.005347593582889e-06, "loss": 0.6587, "step": 131 }, { "epoch": 0.02, "grad_norm": 1.2480209551400834, "learning_rate": 7.058823529411766e-06, "loss": 0.4398, "step": 132 }, { "epoch": 0.02, "grad_norm": 18.00151064391816, "learning_rate": 7.112299465240642e-06, "loss": 0.6409, "step": 133 }, { "epoch": 0.02, "grad_norm": 21.052998309067092, "learning_rate": 7.1657754010695195e-06, "loss": 0.5988, "step": 134 }, { "epoch": 0.02, "grad_norm": 17.284082776274925, "learning_rate": 7.219251336898396e-06, "loss": 0.5568, "step": 135 }, { "epoch": 0.02, "grad_norm": 13.783320304280428, "learning_rate": 7.272727272727273e-06, "loss": 0.6072, "step": 136 }, { "epoch": 0.02, "grad_norm": 22.41423729347594, "learning_rate": 7.326203208556151e-06, "loss": 0.5824, "step": 137 }, { "epoch": 0.02, "grad_norm": 20.34357850447982, "learning_rate": 7.379679144385027e-06, "loss": 0.5785, "step": 138 }, { "epoch": 0.02, "grad_norm": 91.10070873339994, "learning_rate": 7.433155080213904e-06, "loss": 0.6492, "step": 139 }, { "epoch": 0.02, "grad_norm": 28.06425496642058, "learning_rate": 7.486631016042781e-06, "loss": 0.6717, "step": 140 }, { "epoch": 0.02, "grad_norm": 19.17170518135643, "learning_rate": 7.540106951871658e-06, "loss": 0.6571, "step": 141 }, { "epoch": 0.02, "grad_norm": 24.877663676776276, "learning_rate": 7.593582887700536e-06, "loss": 0.5846, "step": 142 }, { "epoch": 0.02, "grad_norm": 22.753318407414667, "learning_rate": 7.647058823529411e-06, "loss": 0.5865, "step": 143 }, { "epoch": 0.02, "grad_norm": 69.6797782080603, "learning_rate": 7.70053475935829e-06, "loss": 0.6192, "step": 144 }, { "epoch": 0.02, "grad_norm": 15.339715015322948, "learning_rate": 7.754010695187166e-06, "loss": 0.5302, "step": 145 }, { "epoch": 0.02, "grad_norm": 25.125502174512487, "learning_rate": 7.807486631016043e-06, "loss": 0.5945, "step": 146 }, { "epoch": 0.02, "grad_norm": 31.807966557250012, "learning_rate": 7.86096256684492e-06, "loss": 0.5701, "step": 147 }, { "epoch": 0.02, "grad_norm": 16.756136730927324, "learning_rate": 7.914438502673799e-06, "loss": 0.6455, "step": 148 }, { "epoch": 0.02, "grad_norm": 21.694097248619133, "learning_rate": 7.967914438502674e-06, "loss": 0.6354, "step": 149 }, { "epoch": 0.02, "grad_norm": 24.988217091215123, "learning_rate": 8.02139037433155e-06, "loss": 0.5974, "step": 150 }, { "epoch": 0.02, "grad_norm": 48.94818189142812, "learning_rate": 8.07486631016043e-06, "loss": 0.5621, "step": 151 }, { "epoch": 0.02, "grad_norm": 16.786693796458923, "learning_rate": 8.128342245989306e-06, "loss": 0.5029, "step": 152 }, { "epoch": 0.02, "grad_norm": 1.2132368709705952, "learning_rate": 8.181818181818183e-06, "loss": 0.4582, "step": 153 }, { "epoch": 0.02, "grad_norm": 17.838014549382752, "learning_rate": 8.23529411764706e-06, "loss": 0.6081, "step": 154 }, { "epoch": 0.02, "grad_norm": 23.174949309393295, "learning_rate": 8.288770053475937e-06, "loss": 0.5913, "step": 155 }, { "epoch": 0.03, "grad_norm": 58.10277685435587, "learning_rate": 8.342245989304813e-06, "loss": 0.62, "step": 156 }, { "epoch": 0.03, "grad_norm": 35.11148524301774, "learning_rate": 8.39572192513369e-06, "loss": 0.6108, "step": 157 }, { "epoch": 0.03, "grad_norm": 26.345809275562033, "learning_rate": 8.449197860962567e-06, "loss": 0.6028, "step": 158 }, { "epoch": 0.03, "grad_norm": 28.757447702452716, "learning_rate": 8.502673796791444e-06, "loss": 0.6476, "step": 159 }, { "epoch": 0.03, "grad_norm": 1.2057898552029815, "learning_rate": 8.556149732620321e-06, "loss": 0.4502, "step": 160 }, { "epoch": 0.03, "grad_norm": 28.7196085975342, "learning_rate": 8.609625668449198e-06, "loss": 0.5792, "step": 161 }, { "epoch": 0.03, "grad_norm": 51.55763623930404, "learning_rate": 8.663101604278076e-06, "loss": 0.5271, "step": 162 }, { "epoch": 0.03, "grad_norm": 29.457758212672534, "learning_rate": 8.716577540106953e-06, "loss": 0.6488, "step": 163 }, { "epoch": 0.03, "grad_norm": 64.51326898401457, "learning_rate": 8.77005347593583e-06, "loss": 0.5934, "step": 164 }, { "epoch": 0.03, "grad_norm": 27.709961183521948, "learning_rate": 8.823529411764707e-06, "loss": 0.5677, "step": 165 }, { "epoch": 0.03, "grad_norm": 32.89692578423412, "learning_rate": 8.877005347593584e-06, "loss": 0.5662, "step": 166 }, { "epoch": 0.03, "grad_norm": 30.36290733478553, "learning_rate": 8.93048128342246e-06, "loss": 0.6224, "step": 167 }, { "epoch": 0.03, "grad_norm": 23.382179698659886, "learning_rate": 8.983957219251337e-06, "loss": 0.5562, "step": 168 }, { "epoch": 0.03, "grad_norm": 42.860233528227255, "learning_rate": 9.037433155080214e-06, "loss": 0.5669, "step": 169 }, { "epoch": 0.03, "grad_norm": 49.17982754033979, "learning_rate": 9.090909090909091e-06, "loss": 0.4837, "step": 170 }, { "epoch": 0.03, "grad_norm": 29.229854648173912, "learning_rate": 9.144385026737968e-06, "loss": 0.5601, "step": 171 }, { "epoch": 0.03, "grad_norm": 30.51681488275903, "learning_rate": 9.197860962566846e-06, "loss": 0.5652, "step": 172 }, { "epoch": 0.03, "grad_norm": 40.12952587187656, "learning_rate": 9.251336898395723e-06, "loss": 0.566, "step": 173 }, { "epoch": 0.03, "grad_norm": 32.46229786172875, "learning_rate": 9.3048128342246e-06, "loss": 0.5716, "step": 174 }, { "epoch": 0.03, "grad_norm": 44.679071142403664, "learning_rate": 9.358288770053477e-06, "loss": 0.5181, "step": 175 }, { "epoch": 0.03, "grad_norm": 40.71036783457782, "learning_rate": 9.411764705882354e-06, "loss": 0.6452, "step": 176 }, { "epoch": 0.03, "grad_norm": 78.8545624812088, "learning_rate": 9.46524064171123e-06, "loss": 0.6516, "step": 177 }, { "epoch": 0.03, "grad_norm": 21.921839800170993, "learning_rate": 9.518716577540108e-06, "loss": 0.5642, "step": 178 }, { "epoch": 0.03, "grad_norm": 67.10835639297564, "learning_rate": 9.572192513368986e-06, "loss": 0.5199, "step": 179 }, { "epoch": 0.03, "grad_norm": 35.91729067083796, "learning_rate": 9.625668449197861e-06, "loss": 0.6715, "step": 180 }, { "epoch": 0.03, "grad_norm": 30.53398989521768, "learning_rate": 9.679144385026738e-06, "loss": 0.5827, "step": 181 }, { "epoch": 0.03, "grad_norm": 1.225844987409763, "learning_rate": 9.732620320855617e-06, "loss": 0.4322, "step": 182 }, { "epoch": 0.03, "grad_norm": 42.32698934436769, "learning_rate": 9.786096256684493e-06, "loss": 0.5656, "step": 183 }, { "epoch": 0.03, "grad_norm": 25.03027998026712, "learning_rate": 9.83957219251337e-06, "loss": 0.5824, "step": 184 }, { "epoch": 0.03, "grad_norm": 1.317887942116737, "learning_rate": 9.893048128342247e-06, "loss": 0.4833, "step": 185 }, { "epoch": 0.03, "grad_norm": 31.93535496217499, "learning_rate": 9.946524064171124e-06, "loss": 0.5946, "step": 186 }, { "epoch": 0.03, "grad_norm": 21.01641343569306, "learning_rate": 1e-05, "loss": 0.5958, "step": 187 }, { "epoch": 0.03, "grad_norm": 84.57102601338673, "learning_rate": 9.999999318931088e-06, "loss": 0.576, "step": 188 }, { "epoch": 0.03, "grad_norm": 25.58380639550186, "learning_rate": 9.999997275724535e-06, "loss": 0.5934, "step": 189 }, { "epoch": 0.03, "grad_norm": 43.93180382709066, "learning_rate": 9.999993870380897e-06, "loss": 0.5593, "step": 190 }, { "epoch": 0.03, "grad_norm": 23.349849368866, "learning_rate": 9.999989102901105e-06, "loss": 0.5626, "step": 191 }, { "epoch": 0.03, "grad_norm": 20.888299437307587, "learning_rate": 9.999982973286455e-06, "loss": 0.5749, "step": 192 }, { "epoch": 0.03, "grad_norm": 20.0977512771481, "learning_rate": 9.999975481538618e-06, "loss": 0.5075, "step": 193 }, { "epoch": 0.03, "grad_norm": 38.11821401436929, "learning_rate": 9.999966627659635e-06, "loss": 0.5046, "step": 194 }, { "epoch": 0.03, "grad_norm": 22.473927278749628, "learning_rate": 9.999956411651916e-06, "loss": 0.5435, "step": 195 }, { "epoch": 0.03, "grad_norm": 18.120374263358624, "learning_rate": 9.999944833518248e-06, "loss": 0.5539, "step": 196 }, { "epoch": 0.03, "grad_norm": 18.818380286558053, "learning_rate": 9.999931893261783e-06, "loss": 0.6751, "step": 197 }, { "epoch": 0.03, "grad_norm": 22.25554843026403, "learning_rate": 9.999917590886046e-06, "loss": 0.5946, "step": 198 }, { "epoch": 0.03, "grad_norm": 22.97019807220915, "learning_rate": 9.999901926394932e-06, "loss": 0.5891, "step": 199 }, { "epoch": 0.03, "grad_norm": 14.706462387191223, "learning_rate": 9.99988489979271e-06, "loss": 0.4632, "step": 200 }, { "epoch": 0.03, "grad_norm": 24.061854873862444, "learning_rate": 9.999866511084021e-06, "loss": 0.5995, "step": 201 }, { "epoch": 0.03, "grad_norm": 30.882464693094764, "learning_rate": 9.999846760273873e-06, "loss": 0.5464, "step": 202 }, { "epoch": 0.03, "grad_norm": 26.51172217095793, "learning_rate": 9.999825647367643e-06, "loss": 0.5838, "step": 203 }, { "epoch": 0.03, "grad_norm": 37.093160416079336, "learning_rate": 9.999803172371088e-06, "loss": 0.5999, "step": 204 }, { "epoch": 0.03, "grad_norm": 25.646550754862893, "learning_rate": 9.999779335290328e-06, "loss": 0.5762, "step": 205 }, { "epoch": 0.03, "grad_norm": 1.564564908728737, "learning_rate": 9.999754136131855e-06, "loss": 0.4914, "step": 206 }, { "epoch": 0.03, "grad_norm": 23.616305190856394, "learning_rate": 9.999727574902538e-06, "loss": 0.5531, "step": 207 }, { "epoch": 0.03, "grad_norm": 19.747613211173068, "learning_rate": 9.999699651609611e-06, "loss": 0.5821, "step": 208 }, { "epoch": 0.03, "grad_norm": 27.09812594656937, "learning_rate": 9.999670366260682e-06, "loss": 0.5854, "step": 209 }, { "epoch": 0.03, "grad_norm": 29.071729394812532, "learning_rate": 9.999639718863728e-06, "loss": 0.5739, "step": 210 }, { "epoch": 0.03, "grad_norm": 16.119811914476216, "learning_rate": 9.9996077094271e-06, "loss": 0.5815, "step": 211 }, { "epoch": 0.03, "grad_norm": 34.84882929176075, "learning_rate": 9.999574337959514e-06, "loss": 0.5799, "step": 212 }, { "epoch": 0.03, "grad_norm": 27.849279396445613, "learning_rate": 9.999539604470068e-06, "loss": 0.5737, "step": 213 }, { "epoch": 0.03, "grad_norm": 24.952346926553595, "learning_rate": 9.999503508968218e-06, "loss": 0.5676, "step": 214 }, { "epoch": 0.03, "grad_norm": 26.946784765670863, "learning_rate": 9.9994660514638e-06, "loss": 0.5265, "step": 215 }, { "epoch": 0.03, "grad_norm": 39.515364664534594, "learning_rate": 9.99942723196702e-06, "loss": 0.5673, "step": 216 }, { "epoch": 0.03, "grad_norm": 21.597549155378786, "learning_rate": 9.999387050488451e-06, "loss": 0.4892, "step": 217 }, { "epoch": 0.04, "grad_norm": 36.36299545430016, "learning_rate": 9.99934550703904e-06, "loss": 0.5446, "step": 218 }, { "epoch": 0.04, "grad_norm": 21.538372120559256, "learning_rate": 9.999302601630106e-06, "loss": 0.5604, "step": 219 }, { "epoch": 0.04, "grad_norm": 27.13417033785625, "learning_rate": 9.999258334273338e-06, "loss": 0.5454, "step": 220 }, { "epoch": 0.04, "grad_norm": 1.306466726987151, "learning_rate": 9.999212704980792e-06, "loss": 0.4934, "step": 221 }, { "epoch": 0.04, "grad_norm": 52.223827376076414, "learning_rate": 9.999165713764902e-06, "loss": 0.5634, "step": 222 }, { "epoch": 0.04, "grad_norm": 23.92417627579469, "learning_rate": 9.999117360638469e-06, "loss": 0.5653, "step": 223 }, { "epoch": 0.04, "grad_norm": 26.25244952786487, "learning_rate": 9.999067645614666e-06, "loss": 0.6201, "step": 224 }, { "epoch": 0.04, "grad_norm": 52.99497059344627, "learning_rate": 9.999016568707036e-06, "loss": 0.649, "step": 225 }, { "epoch": 0.04, "grad_norm": 1.2354855596202785, "learning_rate": 9.998964129929493e-06, "loss": 0.4768, "step": 226 }, { "epoch": 0.04, "grad_norm": 22.988882270251633, "learning_rate": 9.998910329296322e-06, "loss": 0.5786, "step": 227 }, { "epoch": 0.04, "grad_norm": 24.13374025934291, "learning_rate": 9.998855166822186e-06, "loss": 0.5448, "step": 228 }, { "epoch": 0.04, "grad_norm": 19.478786990599023, "learning_rate": 9.998798642522105e-06, "loss": 0.5741, "step": 229 }, { "epoch": 0.04, "grad_norm": 26.826941026382045, "learning_rate": 9.998740756411483e-06, "loss": 0.5658, "step": 230 }, { "epoch": 0.04, "grad_norm": 14.563016385572427, "learning_rate": 9.998681508506087e-06, "loss": 0.4788, "step": 231 }, { "epoch": 0.04, "grad_norm": 15.526395978081707, "learning_rate": 9.998620898822059e-06, "loss": 0.5559, "step": 232 }, { "epoch": 0.04, "grad_norm": 22.194535950164592, "learning_rate": 9.998558927375909e-06, "loss": 0.5206, "step": 233 }, { "epoch": 0.04, "grad_norm": 33.49532012075608, "learning_rate": 9.998495594184523e-06, "loss": 0.5189, "step": 234 }, { "epoch": 0.04, "grad_norm": 57.31725056280748, "learning_rate": 9.998430899265152e-06, "loss": 0.534, "step": 235 }, { "epoch": 0.04, "grad_norm": 44.49557379578555, "learning_rate": 9.998364842635422e-06, "loss": 0.5228, "step": 236 }, { "epoch": 0.04, "grad_norm": 26.558282803659548, "learning_rate": 9.998297424313327e-06, "loss": 0.4949, "step": 237 }, { "epoch": 0.04, "grad_norm": 21.077766343461054, "learning_rate": 9.998228644317235e-06, "loss": 0.6122, "step": 238 }, { "epoch": 0.04, "grad_norm": 27.299226322814544, "learning_rate": 9.998158502665884e-06, "loss": 0.56, "step": 239 }, { "epoch": 0.04, "grad_norm": 17.952229427066612, "learning_rate": 9.99808699937838e-06, "loss": 0.5543, "step": 240 }, { "epoch": 0.04, "grad_norm": 49.61385356334192, "learning_rate": 9.998014134474207e-06, "loss": 0.5375, "step": 241 }, { "epoch": 0.04, "grad_norm": 18.227289147125834, "learning_rate": 9.997939907973212e-06, "loss": 0.5632, "step": 242 }, { "epoch": 0.04, "grad_norm": 22.92871168209305, "learning_rate": 9.997864319895616e-06, "loss": 0.5781, "step": 243 }, { "epoch": 0.04, "grad_norm": 1.2574590512697594, "learning_rate": 9.997787370262012e-06, "loss": 0.4581, "step": 244 }, { "epoch": 0.04, "grad_norm": 1.2005917968713267, "learning_rate": 9.997709059093364e-06, "loss": 0.4531, "step": 245 }, { "epoch": 0.04, "grad_norm": 23.441321088762436, "learning_rate": 9.997629386411006e-06, "loss": 0.5519, "step": 246 }, { "epoch": 0.04, "grad_norm": 34.888071097660706, "learning_rate": 9.997548352236644e-06, "loss": 0.5391, "step": 247 }, { "epoch": 0.04, "grad_norm": 14.904388776270206, "learning_rate": 9.99746595659235e-06, "loss": 0.5634, "step": 248 }, { "epoch": 0.04, "grad_norm": 29.944998619314298, "learning_rate": 9.997382199500577e-06, "loss": 0.611, "step": 249 }, { "epoch": 0.04, "grad_norm": 29.277373234281534, "learning_rate": 9.997297080984136e-06, "loss": 0.5593, "step": 250 }, { "epoch": 0.04, "grad_norm": 29.870702777628296, "learning_rate": 9.997210601066218e-06, "loss": 0.4689, "step": 251 }, { "epoch": 0.04, "grad_norm": 18.661590600953268, "learning_rate": 9.997122759770386e-06, "loss": 0.5765, "step": 252 }, { "epoch": 0.04, "grad_norm": 18.172031386490158, "learning_rate": 9.997033557120567e-06, "loss": 0.5655, "step": 253 }, { "epoch": 0.04, "grad_norm": 14.495604024380304, "learning_rate": 9.996942993141063e-06, "loss": 0.5272, "step": 254 }, { "epoch": 0.04, "grad_norm": 1.4875376276342165, "learning_rate": 9.996851067856546e-06, "loss": 0.4763, "step": 255 }, { "epoch": 0.04, "grad_norm": 18.066879056490265, "learning_rate": 9.996757781292058e-06, "loss": 0.535, "step": 256 }, { "epoch": 0.04, "grad_norm": 23.907625456763284, "learning_rate": 9.996663133473017e-06, "loss": 0.5069, "step": 257 }, { "epoch": 0.04, "grad_norm": 22.68612313214401, "learning_rate": 9.996567124425201e-06, "loss": 0.5409, "step": 258 }, { "epoch": 0.04, "grad_norm": 28.580822847531774, "learning_rate": 9.996469754174772e-06, "loss": 0.5293, "step": 259 }, { "epoch": 0.04, "grad_norm": 17.87811428048285, "learning_rate": 9.996371022748251e-06, "loss": 0.5215, "step": 260 }, { "epoch": 0.04, "grad_norm": 20.25414039382917, "learning_rate": 9.99627093017254e-06, "loss": 0.527, "step": 261 }, { "epoch": 0.04, "grad_norm": 24.734183136640993, "learning_rate": 9.996169476474902e-06, "loss": 0.5709, "step": 262 }, { "epoch": 0.04, "grad_norm": 18.01084560508422, "learning_rate": 9.996066661682981e-06, "loss": 0.5359, "step": 263 }, { "epoch": 0.04, "grad_norm": 24.46686164993537, "learning_rate": 9.995962485824783e-06, "loss": 0.5931, "step": 264 }, { "epoch": 0.04, "grad_norm": 18.35433198798991, "learning_rate": 9.995856948928688e-06, "loss": 0.5721, "step": 265 }, { "epoch": 0.04, "grad_norm": 15.213726242595348, "learning_rate": 9.99575005102345e-06, "loss": 0.4902, "step": 266 }, { "epoch": 0.04, "grad_norm": 18.392656248707723, "learning_rate": 9.99564179213819e-06, "loss": 0.5915, "step": 267 }, { "epoch": 0.04, "grad_norm": 15.04115075164745, "learning_rate": 9.995532172302399e-06, "loss": 0.5435, "step": 268 }, { "epoch": 0.04, "grad_norm": 24.847149451890505, "learning_rate": 9.995421191545942e-06, "loss": 0.5364, "step": 269 }, { "epoch": 0.04, "grad_norm": 18.766567633854013, "learning_rate": 9.995308849899052e-06, "loss": 0.6014, "step": 270 }, { "epoch": 0.04, "grad_norm": 18.934027444938955, "learning_rate": 9.995195147392335e-06, "loss": 0.5001, "step": 271 }, { "epoch": 0.04, "grad_norm": 23.29224964131107, "learning_rate": 9.995080084056767e-06, "loss": 0.5458, "step": 272 }, { "epoch": 0.04, "grad_norm": 116.0395013873673, "learning_rate": 9.994963659923695e-06, "loss": 0.4876, "step": 273 }, { "epoch": 0.04, "grad_norm": 20.550561641908725, "learning_rate": 9.994845875024834e-06, "loss": 0.5308, "step": 274 }, { "epoch": 0.04, "grad_norm": 17.723653697155687, "learning_rate": 9.994726729392272e-06, "loss": 0.5316, "step": 275 }, { "epoch": 0.04, "grad_norm": 1.2152913079936658, "learning_rate": 9.99460622305847e-06, "loss": 0.4777, "step": 276 }, { "epoch": 0.04, "grad_norm": 35.263337400817974, "learning_rate": 9.994484356056256e-06, "loss": 0.5959, "step": 277 }, { "epoch": 0.04, "grad_norm": 1.1001898209957774, "learning_rate": 9.994361128418828e-06, "loss": 0.4649, "step": 278 }, { "epoch": 0.04, "grad_norm": 24.12068182949776, "learning_rate": 9.99423654017976e-06, "loss": 0.5232, "step": 279 }, { "epoch": 0.05, "grad_norm": 50.66715646834188, "learning_rate": 9.99411059137299e-06, "loss": 0.5588, "step": 280 }, { "epoch": 0.05, "grad_norm": 13.650007219125833, "learning_rate": 9.993983282032831e-06, "loss": 0.5284, "step": 281 }, { "epoch": 0.05, "grad_norm": 34.57803254152516, "learning_rate": 9.993854612193967e-06, "loss": 0.5828, "step": 282 }, { "epoch": 0.05, "grad_norm": 40.86973153855904, "learning_rate": 9.993724581891451e-06, "loss": 0.5499, "step": 283 }, { "epoch": 0.05, "grad_norm": 17.63516792646248, "learning_rate": 9.993593191160704e-06, "loss": 0.5242, "step": 284 }, { "epoch": 0.05, "grad_norm": 22.431439466588927, "learning_rate": 9.993460440037525e-06, "loss": 0.5672, "step": 285 }, { "epoch": 0.05, "grad_norm": 28.27244449728826, "learning_rate": 9.993326328558076e-06, "loss": 0.5573, "step": 286 }, { "epoch": 0.05, "grad_norm": 69.61736822464846, "learning_rate": 9.993190856758892e-06, "loss": 0.525, "step": 287 }, { "epoch": 0.05, "grad_norm": 19.913915733585295, "learning_rate": 9.99305402467688e-06, "loss": 0.5368, "step": 288 }, { "epoch": 0.05, "grad_norm": 43.833892898165445, "learning_rate": 9.99291583234932e-06, "loss": 0.5604, "step": 289 }, { "epoch": 0.05, "grad_norm": 15.592956349232821, "learning_rate": 9.992776279813854e-06, "loss": 0.5595, "step": 290 }, { "epoch": 0.05, "grad_norm": 14.496479439748734, "learning_rate": 9.992635367108505e-06, "loss": 0.54, "step": 291 }, { "epoch": 0.05, "grad_norm": 21.801186293712377, "learning_rate": 9.992493094271657e-06, "loss": 0.5456, "step": 292 }, { "epoch": 0.05, "grad_norm": 15.521213994153893, "learning_rate": 9.992349461342073e-06, "loss": 0.508, "step": 293 }, { "epoch": 0.05, "grad_norm": 61.73552245107981, "learning_rate": 9.992204468358879e-06, "loss": 0.5279, "step": 294 }, { "epoch": 0.05, "grad_norm": 21.689513247406182, "learning_rate": 9.992058115361578e-06, "loss": 0.5112, "step": 295 }, { "epoch": 0.05, "grad_norm": 25.61535919933569, "learning_rate": 9.991910402390041e-06, "loss": 0.515, "step": 296 }, { "epoch": 0.05, "grad_norm": 118.51700796085427, "learning_rate": 9.991761329484505e-06, "loss": 0.4884, "step": 297 }, { "epoch": 0.05, "grad_norm": 18.761074282300584, "learning_rate": 9.991610896685587e-06, "loss": 0.5407, "step": 298 }, { "epoch": 0.05, "grad_norm": 1.4482620210295873, "learning_rate": 9.991459104034262e-06, "loss": 0.5206, "step": 299 }, { "epoch": 0.05, "grad_norm": 23.295496932705312, "learning_rate": 9.991305951571891e-06, "loss": 0.5356, "step": 300 }, { "epoch": 0.05, "grad_norm": 12.217357741647287, "learning_rate": 9.99115143934019e-06, "loss": 0.5142, "step": 301 }, { "epoch": 0.05, "grad_norm": 39.894290936272284, "learning_rate": 9.990995567381255e-06, "loss": 0.5593, "step": 302 }, { "epoch": 0.05, "grad_norm": 22.34063424998089, "learning_rate": 9.990838335737551e-06, "loss": 0.5281, "step": 303 }, { "epoch": 0.05, "grad_norm": 29.1087149830401, "learning_rate": 9.990679744451909e-06, "loss": 0.5024, "step": 304 }, { "epoch": 0.05, "grad_norm": 39.04360417713495, "learning_rate": 9.990519793567539e-06, "loss": 0.4991, "step": 305 }, { "epoch": 0.05, "grad_norm": 37.967141610223415, "learning_rate": 9.990358483128012e-06, "loss": 0.6056, "step": 306 }, { "epoch": 0.05, "grad_norm": 17.3353241881632, "learning_rate": 9.990195813177272e-06, "loss": 0.5398, "step": 307 }, { "epoch": 0.05, "grad_norm": 20.07522917913113, "learning_rate": 9.99003178375964e-06, "loss": 0.5157, "step": 308 }, { "epoch": 0.05, "grad_norm": 21.32991615089147, "learning_rate": 9.989866394919795e-06, "loss": 0.5543, "step": 309 }, { "epoch": 0.05, "grad_norm": 72.44284433249531, "learning_rate": 9.9896996467028e-06, "loss": 0.4953, "step": 310 }, { "epoch": 0.05, "grad_norm": 21.319769035482324, "learning_rate": 9.98953153915408e-06, "loss": 0.4953, "step": 311 }, { "epoch": 0.05, "grad_norm": 80.55786410346872, "learning_rate": 9.989362072319431e-06, "loss": 0.4916, "step": 312 }, { "epoch": 0.05, "grad_norm": 23.81783195210229, "learning_rate": 9.98919124624502e-06, "loss": 0.5682, "step": 313 }, { "epoch": 0.05, "grad_norm": 33.330639064743856, "learning_rate": 9.989019060977388e-06, "loss": 0.5809, "step": 314 }, { "epoch": 0.05, "grad_norm": 35.23341737570225, "learning_rate": 9.988845516563437e-06, "loss": 0.5576, "step": 315 }, { "epoch": 0.05, "grad_norm": 31.59904410655793, "learning_rate": 9.988670613050452e-06, "loss": 0.5942, "step": 316 }, { "epoch": 0.05, "grad_norm": 19.303374005522244, "learning_rate": 9.988494350486077e-06, "loss": 0.5701, "step": 317 }, { "epoch": 0.05, "grad_norm": 33.51499512469324, "learning_rate": 9.988316728918332e-06, "loss": 0.5293, "step": 318 }, { "epoch": 0.05, "grad_norm": 43.43297427047658, "learning_rate": 9.988137748395608e-06, "loss": 0.4729, "step": 319 }, { "epoch": 0.05, "grad_norm": 39.685120874705795, "learning_rate": 9.987957408966662e-06, "loss": 0.5238, "step": 320 }, { "epoch": 0.05, "grad_norm": 50.81876616983587, "learning_rate": 9.987775710680622e-06, "loss": 0.5382, "step": 321 }, { "epoch": 0.05, "grad_norm": 23.443865897767214, "learning_rate": 9.987592653586992e-06, "loss": 0.5383, "step": 322 }, { "epoch": 0.05, "grad_norm": 26.676038890179647, "learning_rate": 9.987408237735638e-06, "loss": 0.5777, "step": 323 }, { "epoch": 0.05, "grad_norm": 35.95779328482772, "learning_rate": 9.987222463176803e-06, "loss": 0.5146, "step": 324 }, { "epoch": 0.05, "grad_norm": 49.413121870741655, "learning_rate": 9.987035329961092e-06, "loss": 0.5714, "step": 325 }, { "epoch": 0.05, "grad_norm": 1.2661824546367435, "learning_rate": 9.986846838139492e-06, "loss": 0.4628, "step": 326 }, { "epoch": 0.05, "grad_norm": 22.909845685667744, "learning_rate": 9.986656987763348e-06, "loss": 0.5652, "step": 327 }, { "epoch": 0.05, "grad_norm": 44.75717727170816, "learning_rate": 9.986465778884384e-06, "loss": 0.6211, "step": 328 }, { "epoch": 0.05, "grad_norm": 26.470796711705937, "learning_rate": 9.986273211554689e-06, "loss": 0.5085, "step": 329 }, { "epoch": 0.05, "grad_norm": 70.9588388675342, "learning_rate": 9.986079285826721e-06, "loss": 0.5153, "step": 330 }, { "epoch": 0.05, "grad_norm": 29.68828398855501, "learning_rate": 9.985884001753317e-06, "loss": 0.4738, "step": 331 }, { "epoch": 0.05, "grad_norm": 30.295885616262016, "learning_rate": 9.985687359387673e-06, "loss": 0.52, "step": 332 }, { "epoch": 0.05, "grad_norm": 30.940617235406467, "learning_rate": 9.98548935878336e-06, "loss": 0.5988, "step": 333 }, { "epoch": 0.05, "grad_norm": 27.67229063242171, "learning_rate": 9.985289999994322e-06, "loss": 0.4904, "step": 334 }, { "epoch": 0.05, "grad_norm": 20.829003469840004, "learning_rate": 9.985089283074867e-06, "loss": 0.5243, "step": 335 }, { "epoch": 0.05, "grad_norm": 38.44824909771365, "learning_rate": 9.984887208079675e-06, "loss": 0.5253, "step": 336 }, { "epoch": 0.05, "grad_norm": 49.4598365701943, "learning_rate": 9.9846837750638e-06, "loss": 0.5281, "step": 337 }, { "epoch": 0.05, "grad_norm": 72.11861428752515, "learning_rate": 9.98447898408266e-06, "loss": 0.4996, "step": 338 }, { "epoch": 0.05, "grad_norm": 22.212622132797467, "learning_rate": 9.984272835192047e-06, "loss": 0.5199, "step": 339 }, { "epoch": 0.05, "grad_norm": 37.45339726520185, "learning_rate": 9.984065328448122e-06, "loss": 0.5341, "step": 340 }, { "epoch": 0.05, "grad_norm": 91.32610103206306, "learning_rate": 9.983856463907415e-06, "loss": 0.5574, "step": 341 }, { "epoch": 0.06, "grad_norm": 65.31059173141998, "learning_rate": 9.983646241626825e-06, "loss": 0.4818, "step": 342 }, { "epoch": 0.06, "grad_norm": 40.552655348204645, "learning_rate": 9.983434661663625e-06, "loss": 0.4886, "step": 343 }, { "epoch": 0.06, "grad_norm": 58.39408005261295, "learning_rate": 9.983221724075453e-06, "loss": 0.5584, "step": 344 }, { "epoch": 0.06, "grad_norm": 144.2902028408597, "learning_rate": 9.983007428920322e-06, "loss": 0.5377, "step": 345 }, { "epoch": 0.06, "grad_norm": 26.933321668542007, "learning_rate": 9.982791776256608e-06, "loss": 0.4838, "step": 346 }, { "epoch": 0.06, "grad_norm": 23.027458811876798, "learning_rate": 9.982574766143063e-06, "loss": 0.5939, "step": 347 }, { "epoch": 0.06, "grad_norm": 24.863361562216706, "learning_rate": 9.982356398638807e-06, "loss": 0.5593, "step": 348 }, { "epoch": 0.06, "grad_norm": 33.83998134688836, "learning_rate": 9.982136673803328e-06, "loss": 0.5112, "step": 349 }, { "epoch": 0.06, "grad_norm": 15.899535320294381, "learning_rate": 9.981915591696484e-06, "loss": 0.5451, "step": 350 }, { "epoch": 0.06, "grad_norm": 40.47621905200634, "learning_rate": 9.981693152378509e-06, "loss": 0.5137, "step": 351 }, { "epoch": 0.06, "grad_norm": 22.347207755338726, "learning_rate": 9.981469355909996e-06, "loss": 0.5799, "step": 352 }, { "epoch": 0.06, "grad_norm": 21.8418243578854, "learning_rate": 9.981244202351916e-06, "loss": 0.5997, "step": 353 }, { "epoch": 0.06, "grad_norm": 26.85124037083119, "learning_rate": 9.981017691765606e-06, "loss": 0.5601, "step": 354 }, { "epoch": 0.06, "grad_norm": 18.2564746566136, "learning_rate": 9.980789824212776e-06, "loss": 0.5069, "step": 355 }, { "epoch": 0.06, "grad_norm": 27.07945100773563, "learning_rate": 9.980560599755498e-06, "loss": 0.4989, "step": 356 }, { "epoch": 0.06, "grad_norm": 21.05545669655485, "learning_rate": 9.980330018456227e-06, "loss": 0.5536, "step": 357 }, { "epoch": 0.06, "grad_norm": 30.021464652348506, "learning_rate": 9.980098080377771e-06, "loss": 0.4864, "step": 358 }, { "epoch": 0.06, "grad_norm": 21.133571780039833, "learning_rate": 9.979864785583325e-06, "loss": 0.4633, "step": 359 }, { "epoch": 0.06, "grad_norm": 22.096160915361523, "learning_rate": 9.979630134136438e-06, "loss": 0.5231, "step": 360 }, { "epoch": 0.06, "grad_norm": 22.209467276723977, "learning_rate": 9.979394126101039e-06, "loss": 0.5541, "step": 361 }, { "epoch": 0.06, "grad_norm": 51.07370030596758, "learning_rate": 9.979156761541421e-06, "loss": 0.5364, "step": 362 }, { "epoch": 0.06, "grad_norm": 23.764647011805877, "learning_rate": 9.978918040522249e-06, "loss": 0.5244, "step": 363 }, { "epoch": 0.06, "grad_norm": 26.210163478876417, "learning_rate": 9.97867796310856e-06, "loss": 0.4787, "step": 364 }, { "epoch": 0.06, "grad_norm": 28.770515093485404, "learning_rate": 9.978436529365757e-06, "loss": 0.593, "step": 365 }, { "epoch": 0.06, "grad_norm": 34.681586124336654, "learning_rate": 9.978193739359611e-06, "loss": 0.4687, "step": 366 }, { "epoch": 0.06, "grad_norm": 23.057910576495072, "learning_rate": 9.977949593156264e-06, "loss": 0.4882, "step": 367 }, { "epoch": 0.06, "grad_norm": 55.649207961095534, "learning_rate": 9.977704090822232e-06, "loss": 0.5808, "step": 368 }, { "epoch": 0.06, "grad_norm": 16.620917117475674, "learning_rate": 9.977457232424394e-06, "loss": 0.4904, "step": 369 }, { "epoch": 0.06, "grad_norm": 24.19347034531309, "learning_rate": 9.97720901803e-06, "loss": 0.4757, "step": 370 }, { "epoch": 0.06, "grad_norm": 32.05019191077451, "learning_rate": 9.976959447706673e-06, "loss": 0.4838, "step": 371 }, { "epoch": 0.06, "grad_norm": 56.0939360157241, "learning_rate": 9.976708521522403e-06, "loss": 0.5237, "step": 372 }, { "epoch": 0.06, "grad_norm": 26.584647344869083, "learning_rate": 9.976456239545547e-06, "loss": 0.4829, "step": 373 }, { "epoch": 0.06, "grad_norm": 27.571158085912383, "learning_rate": 9.976202601844834e-06, "loss": 0.4916, "step": 374 }, { "epoch": 0.06, "grad_norm": 49.97319046051048, "learning_rate": 9.975947608489363e-06, "loss": 0.4987, "step": 375 }, { "epoch": 0.06, "grad_norm": 67.18675397070747, "learning_rate": 9.975691259548598e-06, "loss": 0.6057, "step": 376 }, { "epoch": 0.06, "grad_norm": 28.003749912193964, "learning_rate": 9.975433555092383e-06, "loss": 0.5284, "step": 377 }, { "epoch": 0.06, "grad_norm": 49.46711209658895, "learning_rate": 9.975174495190915e-06, "loss": 0.4835, "step": 378 }, { "epoch": 0.06, "grad_norm": 24.435425595843665, "learning_rate": 9.974914079914775e-06, "loss": 0.4766, "step": 379 }, { "epoch": 0.06, "grad_norm": 23.408063889696827, "learning_rate": 9.974652309334904e-06, "loss": 0.5403, "step": 380 }, { "epoch": 0.06, "grad_norm": 32.54455940449194, "learning_rate": 9.974389183522618e-06, "loss": 0.5166, "step": 381 }, { "epoch": 0.06, "grad_norm": 35.884292004176885, "learning_rate": 9.9741247025496e-06, "loss": 0.5509, "step": 382 }, { "epoch": 0.06, "grad_norm": 36.60357293360651, "learning_rate": 9.973858866487898e-06, "loss": 0.514, "step": 383 }, { "epoch": 0.06, "grad_norm": 48.61984072630844, "learning_rate": 9.973591675409934e-06, "loss": 0.542, "step": 384 }, { "epoch": 0.06, "grad_norm": 27.110096291137786, "learning_rate": 9.973323129388504e-06, "loss": 0.5647, "step": 385 }, { "epoch": 0.06, "grad_norm": 93.34157823078688, "learning_rate": 9.97305322849676e-06, "loss": 0.5082, "step": 386 }, { "epoch": 0.06, "grad_norm": 27.100647256232424, "learning_rate": 9.972781972808234e-06, "loss": 0.5348, "step": 387 }, { "epoch": 0.06, "grad_norm": 34.74180137614563, "learning_rate": 9.972509362396825e-06, "loss": 0.47, "step": 388 }, { "epoch": 0.06, "grad_norm": 32.81135544139614, "learning_rate": 9.972235397336796e-06, "loss": 0.4373, "step": 389 }, { "epoch": 0.06, "grad_norm": 50.96121732700845, "learning_rate": 9.971960077702785e-06, "loss": 0.5756, "step": 390 }, { "epoch": 0.06, "grad_norm": 24.153800490986022, "learning_rate": 9.971683403569795e-06, "loss": 0.5628, "step": 391 }, { "epoch": 0.06, "grad_norm": 22.339641623232918, "learning_rate": 9.971405375013201e-06, "loss": 0.4971, "step": 392 }, { "epoch": 0.06, "grad_norm": 38.07131761187898, "learning_rate": 9.971125992108746e-06, "loss": 0.4576, "step": 393 }, { "epoch": 0.06, "grad_norm": 29.705080223259554, "learning_rate": 9.97084525493254e-06, "loss": 0.4428, "step": 394 }, { "epoch": 0.06, "grad_norm": 32.76539459169689, "learning_rate": 9.970563163561064e-06, "loss": 0.5592, "step": 395 }, { "epoch": 0.06, "grad_norm": 32.92321553073833, "learning_rate": 9.970279718071169e-06, "loss": 0.5312, "step": 396 }, { "epoch": 0.06, "grad_norm": 25.99961677338992, "learning_rate": 9.969994918540071e-06, "loss": 0.5416, "step": 397 }, { "epoch": 0.06, "grad_norm": 23.912075970658957, "learning_rate": 9.969708765045361e-06, "loss": 0.4807, "step": 398 }, { "epoch": 0.06, "grad_norm": 32.496952791203256, "learning_rate": 9.96942125766499e-06, "loss": 0.5429, "step": 399 }, { "epoch": 0.06, "grad_norm": 34.37941225536387, "learning_rate": 9.969132396477286e-06, "loss": 0.5443, "step": 400 }, { "epoch": 0.06, "grad_norm": 21.76009615829801, "learning_rate": 9.968842181560943e-06, "loss": 0.4996, "step": 401 }, { "epoch": 0.06, "grad_norm": 30.417509061826994, "learning_rate": 9.968550612995023e-06, "loss": 0.4605, "step": 402 }, { "epoch": 0.06, "grad_norm": 25.11449780546829, "learning_rate": 9.968257690858955e-06, "loss": 0.517, "step": 403 }, { "epoch": 0.07, "grad_norm": 28.43934247717701, "learning_rate": 9.967963415232544e-06, "loss": 0.591, "step": 404 }, { "epoch": 0.07, "grad_norm": 33.299709043872284, "learning_rate": 9.967667786195955e-06, "loss": 0.4226, "step": 405 }, { "epoch": 0.07, "grad_norm": 30.647604254977566, "learning_rate": 9.967370803829725e-06, "loss": 0.4932, "step": 406 }, { "epoch": 0.07, "grad_norm": 29.23689961238255, "learning_rate": 9.967072468214763e-06, "loss": 0.5449, "step": 407 }, { "epoch": 0.07, "grad_norm": 1.152256864473235, "learning_rate": 9.966772779432343e-06, "loss": 0.4798, "step": 408 }, { "epoch": 0.07, "grad_norm": 25.93754312059422, "learning_rate": 9.966471737564107e-06, "loss": 0.4526, "step": 409 }, { "epoch": 0.07, "grad_norm": 31.5058520624137, "learning_rate": 9.96616934269207e-06, "loss": 0.5186, "step": 410 }, { "epoch": 0.07, "grad_norm": 33.40071467600732, "learning_rate": 9.965865594898608e-06, "loss": 0.5213, "step": 411 }, { "epoch": 0.07, "grad_norm": 45.87345368097057, "learning_rate": 9.965560494266475e-06, "loss": 0.6058, "step": 412 }, { "epoch": 0.07, "grad_norm": 17.9022278175797, "learning_rate": 9.965254040878786e-06, "loss": 0.5054, "step": 413 }, { "epoch": 0.07, "grad_norm": 19.032318318820096, "learning_rate": 9.964946234819028e-06, "loss": 0.5522, "step": 414 }, { "epoch": 0.07, "grad_norm": 32.094765870079655, "learning_rate": 9.964637076171056e-06, "loss": 0.5628, "step": 415 }, { "epoch": 0.07, "grad_norm": 26.350475094482974, "learning_rate": 9.964326565019094e-06, "loss": 0.5878, "step": 416 }, { "epoch": 0.07, "grad_norm": 39.22819349584317, "learning_rate": 9.964014701447733e-06, "loss": 0.4956, "step": 417 }, { "epoch": 0.07, "grad_norm": 1.8518828840626311, "learning_rate": 9.963701485541935e-06, "loss": 0.5035, "step": 418 }, { "epoch": 0.07, "grad_norm": 24.044781875480382, "learning_rate": 9.963386917387025e-06, "loss": 0.5318, "step": 419 }, { "epoch": 0.07, "grad_norm": 37.146033781379785, "learning_rate": 9.963070997068702e-06, "loss": 0.5467, "step": 420 }, { "epoch": 0.07, "grad_norm": 31.711666229701752, "learning_rate": 9.962753724673033e-06, "loss": 0.4822, "step": 421 }, { "epoch": 0.07, "grad_norm": 51.637175864837644, "learning_rate": 9.96243510028645e-06, "loss": 0.5468, "step": 422 }, { "epoch": 0.07, "grad_norm": 32.61322001761153, "learning_rate": 9.962115123995754e-06, "loss": 0.4857, "step": 423 }, { "epoch": 0.07, "grad_norm": 23.183350056129182, "learning_rate": 9.961793795888118e-06, "loss": 0.5071, "step": 424 }, { "epoch": 0.07, "grad_norm": 17.49179420014295, "learning_rate": 9.96147111605108e-06, "loss": 0.506, "step": 425 }, { "epoch": 0.07, "grad_norm": 25.104310791925805, "learning_rate": 9.961147084572544e-06, "loss": 0.5302, "step": 426 }, { "epoch": 0.07, "grad_norm": 22.80102658283887, "learning_rate": 9.96082170154079e-06, "loss": 0.5345, "step": 427 }, { "epoch": 0.07, "grad_norm": 25.702194501266998, "learning_rate": 9.960494967044457e-06, "loss": 0.5415, "step": 428 }, { "epoch": 0.07, "grad_norm": 32.649951165165845, "learning_rate": 9.960166881172558e-06, "loss": 0.473, "step": 429 }, { "epoch": 0.07, "grad_norm": 21.075870770267354, "learning_rate": 9.959837444014473e-06, "loss": 0.4941, "step": 430 }, { "epoch": 0.07, "grad_norm": 24.642503203066365, "learning_rate": 9.95950665565995e-06, "loss": 0.5225, "step": 431 }, { "epoch": 0.07, "grad_norm": 20.451268840433812, "learning_rate": 9.959174516199105e-06, "loss": 0.4903, "step": 432 }, { "epoch": 0.07, "grad_norm": 41.82872156073963, "learning_rate": 9.95884102572242e-06, "loss": 0.5129, "step": 433 }, { "epoch": 0.07, "grad_norm": 15.76760743144387, "learning_rate": 9.958506184320749e-06, "loss": 0.4902, "step": 434 }, { "epoch": 0.07, "grad_norm": 35.96038019174613, "learning_rate": 9.95816999208531e-06, "loss": 0.5312, "step": 435 }, { "epoch": 0.07, "grad_norm": 1.2998482128820028, "learning_rate": 9.957832449107694e-06, "loss": 0.4963, "step": 436 }, { "epoch": 0.07, "grad_norm": 24.39134148388489, "learning_rate": 9.957493555479856e-06, "loss": 0.4961, "step": 437 }, { "epoch": 0.07, "grad_norm": 10.886915339037717, "learning_rate": 9.957153311294119e-06, "loss": 0.5326, "step": 438 }, { "epoch": 0.07, "grad_norm": 17.96802745745171, "learning_rate": 9.956811716643173e-06, "loss": 0.4957, "step": 439 }, { "epoch": 0.07, "grad_norm": 0.993686561358084, "learning_rate": 9.956468771620082e-06, "loss": 0.4385, "step": 440 }, { "epoch": 0.07, "grad_norm": 18.14101545828282, "learning_rate": 9.956124476318271e-06, "loss": 0.4956, "step": 441 }, { "epoch": 0.07, "grad_norm": 11.773021169052909, "learning_rate": 9.955778830831537e-06, "loss": 0.4576, "step": 442 }, { "epoch": 0.07, "grad_norm": 16.97778938081952, "learning_rate": 9.955431835254044e-06, "loss": 0.5322, "step": 443 }, { "epoch": 0.07, "grad_norm": 17.945285245376137, "learning_rate": 9.95508348968032e-06, "loss": 0.5379, "step": 444 }, { "epoch": 0.07, "grad_norm": 38.47737829536334, "learning_rate": 9.954733794205264e-06, "loss": 0.5317, "step": 445 }, { "epoch": 0.07, "grad_norm": 24.438213751992887, "learning_rate": 9.954382748924148e-06, "loss": 0.499, "step": 446 }, { "epoch": 0.07, "grad_norm": 15.170834029802196, "learning_rate": 9.9540303539326e-06, "loss": 0.5723, "step": 447 }, { "epoch": 0.07, "grad_norm": 33.862426250772906, "learning_rate": 9.953676609326627e-06, "loss": 0.4964, "step": 448 }, { "epoch": 0.07, "grad_norm": 12.112890994802129, "learning_rate": 9.953321515202597e-06, "loss": 0.4915, "step": 449 }, { "epoch": 0.07, "grad_norm": 32.85664456727147, "learning_rate": 9.952965071657244e-06, "loss": 0.5366, "step": 450 }, { "epoch": 0.07, "grad_norm": 21.9750949204955, "learning_rate": 9.952607278787679e-06, "loss": 0.4266, "step": 451 }, { "epoch": 0.07, "grad_norm": 74.81021995694343, "learning_rate": 9.95224813669137e-06, "loss": 0.4706, "step": 452 }, { "epoch": 0.07, "grad_norm": 45.53066585967331, "learning_rate": 9.95188764546616e-06, "loss": 0.5144, "step": 453 }, { "epoch": 0.07, "grad_norm": 36.2506933342848, "learning_rate": 9.951525805210256e-06, "loss": 0.5315, "step": 454 }, { "epoch": 0.07, "grad_norm": 34.822025071845225, "learning_rate": 9.951162616022234e-06, "loss": 0.5531, "step": 455 }, { "epoch": 0.07, "grad_norm": 28.245875605121736, "learning_rate": 9.950798078001034e-06, "loss": 0.5798, "step": 456 }, { "epoch": 0.07, "grad_norm": 16.650132236007526, "learning_rate": 9.950432191245968e-06, "loss": 0.5383, "step": 457 }, { "epoch": 0.07, "grad_norm": 17.354218528978308, "learning_rate": 9.950064955856716e-06, "loss": 0.5129, "step": 458 }, { "epoch": 0.07, "grad_norm": 15.983261598181162, "learning_rate": 9.949696371933319e-06, "loss": 0.4969, "step": 459 }, { "epoch": 0.07, "grad_norm": 19.455593519089483, "learning_rate": 9.94932643957619e-06, "loss": 0.4953, "step": 460 }, { "epoch": 0.07, "grad_norm": 20.147914724490157, "learning_rate": 9.948955158886113e-06, "loss": 0.5159, "step": 461 }, { "epoch": 0.07, "grad_norm": 26.59561655932231, "learning_rate": 9.94858252996423e-06, "loss": 0.4919, "step": 462 }, { "epoch": 0.07, "grad_norm": 18.080481206147777, "learning_rate": 9.948208552912057e-06, "loss": 0.4196, "step": 463 }, { "epoch": 0.07, "grad_norm": 12.151717867852996, "learning_rate": 9.947833227831477e-06, "loss": 0.5, "step": 464 }, { "epoch": 0.07, "grad_norm": 81.61403177628088, "learning_rate": 9.947456554824736e-06, "loss": 0.5375, "step": 465 }, { "epoch": 0.08, "grad_norm": 16.824867867931683, "learning_rate": 9.947078533994454e-06, "loss": 0.4652, "step": 466 }, { "epoch": 0.08, "grad_norm": 15.050256485325868, "learning_rate": 9.94669916544361e-06, "loss": 0.5241, "step": 467 }, { "epoch": 0.08, "grad_norm": 63.02157767426345, "learning_rate": 9.94631844927556e-06, "loss": 0.514, "step": 468 }, { "epoch": 0.08, "grad_norm": 68.19499595231524, "learning_rate": 9.945936385594017e-06, "loss": 0.4761, "step": 469 }, { "epoch": 0.08, "grad_norm": 27.273817847327035, "learning_rate": 9.945552974503065e-06, "loss": 0.5537, "step": 470 }, { "epoch": 0.08, "grad_norm": 23.9123170228459, "learning_rate": 9.94516821610716e-06, "loss": 0.5655, "step": 471 }, { "epoch": 0.08, "grad_norm": 21.401173626886173, "learning_rate": 9.944782110511119e-06, "loss": 0.5059, "step": 472 }, { "epoch": 0.08, "grad_norm": 61.47661594360003, "learning_rate": 9.944394657820127e-06, "loss": 0.4985, "step": 473 }, { "epoch": 0.08, "grad_norm": 37.26648989157921, "learning_rate": 9.944005858139737e-06, "loss": 0.4723, "step": 474 }, { "epoch": 0.08, "grad_norm": 25.466351867268152, "learning_rate": 9.943615711575867e-06, "loss": 0.528, "step": 475 }, { "epoch": 0.08, "grad_norm": 21.341293850335937, "learning_rate": 9.943224218234809e-06, "loss": 0.5433, "step": 476 }, { "epoch": 0.08, "grad_norm": 1.202135558227354, "learning_rate": 9.94283137822321e-06, "loss": 0.4733, "step": 477 }, { "epoch": 0.08, "grad_norm": 24.972381702990607, "learning_rate": 9.942437191648095e-06, "loss": 0.5491, "step": 478 }, { "epoch": 0.08, "grad_norm": 22.390358102787957, "learning_rate": 9.94204165861685e-06, "loss": 0.496, "step": 479 }, { "epoch": 0.08, "grad_norm": 11.404522545439457, "learning_rate": 9.94164477923723e-06, "loss": 0.5141, "step": 480 }, { "epoch": 0.08, "grad_norm": 15.025411452689683, "learning_rate": 9.941246553617353e-06, "loss": 0.4987, "step": 481 }, { "epoch": 0.08, "grad_norm": 69.67307670682064, "learning_rate": 9.940846981865709e-06, "loss": 0.5341, "step": 482 }, { "epoch": 0.08, "grad_norm": 20.15731549368307, "learning_rate": 9.940446064091151e-06, "loss": 0.589, "step": 483 }, { "epoch": 0.08, "grad_norm": 21.460104371931862, "learning_rate": 9.940043800402903e-06, "loss": 0.5487, "step": 484 }, { "epoch": 0.08, "grad_norm": 14.176050738192115, "learning_rate": 9.939640190910552e-06, "loss": 0.5719, "step": 485 }, { "epoch": 0.08, "grad_norm": 21.164347680110982, "learning_rate": 9.939235235724047e-06, "loss": 0.5964, "step": 486 }, { "epoch": 0.08, "grad_norm": 12.430828246057462, "learning_rate": 9.938828934953716e-06, "loss": 0.4923, "step": 487 }, { "epoch": 0.08, "grad_norm": 18.61747113368015, "learning_rate": 9.938421288710242e-06, "loss": 0.5749, "step": 488 }, { "epoch": 0.08, "grad_norm": 22.535523152962813, "learning_rate": 9.93801229710468e-06, "loss": 0.5214, "step": 489 }, { "epoch": 0.08, "grad_norm": 1.1597154503489981, "learning_rate": 9.937601960248452e-06, "loss": 0.4547, "step": 490 }, { "epoch": 0.08, "grad_norm": 57.693816484905334, "learning_rate": 9.937190278253345e-06, "loss": 0.5041, "step": 491 }, { "epoch": 0.08, "grad_norm": 13.484892748239167, "learning_rate": 9.93677725123151e-06, "loss": 0.5483, "step": 492 }, { "epoch": 0.08, "grad_norm": 13.583313174375064, "learning_rate": 9.936362879295471e-06, "loss": 0.5042, "step": 493 }, { "epoch": 0.08, "grad_norm": 22.288730700915963, "learning_rate": 9.93594716255811e-06, "loss": 0.5434, "step": 494 }, { "epoch": 0.08, "grad_norm": 35.04838780673886, "learning_rate": 9.935530101132683e-06, "loss": 0.507, "step": 495 }, { "epoch": 0.08, "grad_norm": 20.625097478047156, "learning_rate": 9.935111695132807e-06, "loss": 0.5119, "step": 496 }, { "epoch": 0.08, "grad_norm": 86.06254025668136, "learning_rate": 9.934691944672468e-06, "loss": 0.4172, "step": 497 }, { "epoch": 0.08, "grad_norm": 25.48809362354762, "learning_rate": 9.934270849866019e-06, "loss": 0.4966, "step": 498 }, { "epoch": 0.08, "grad_norm": 23.410966011072528, "learning_rate": 9.933848410828175e-06, "loss": 0.4952, "step": 499 }, { "epoch": 0.08, "grad_norm": 14.350562452813703, "learning_rate": 9.933424627674022e-06, "loss": 0.554, "step": 500 }, { "epoch": 0.08, "grad_norm": 13.027639221252468, "learning_rate": 9.932999500519011e-06, "loss": 0.4475, "step": 501 }, { "epoch": 0.08, "grad_norm": 19.041522097836694, "learning_rate": 9.932573029478956e-06, "loss": 0.4994, "step": 502 }, { "epoch": 0.08, "grad_norm": 16.027027086172566, "learning_rate": 9.932145214670042e-06, "loss": 0.5044, "step": 503 }, { "epoch": 0.08, "grad_norm": 64.66387108030902, "learning_rate": 9.931716056208814e-06, "loss": 0.5528, "step": 504 }, { "epoch": 0.08, "grad_norm": 24.000625605576143, "learning_rate": 9.931285554212189e-06, "loss": 0.4812, "step": 505 }, { "epoch": 0.08, "grad_norm": 12.960465458234653, "learning_rate": 9.930853708797448e-06, "loss": 0.546, "step": 506 }, { "epoch": 0.08, "grad_norm": 14.035419882416011, "learning_rate": 9.930420520082238e-06, "loss": 0.4314, "step": 507 }, { "epoch": 0.08, "grad_norm": 32.5275889505669, "learning_rate": 9.929985988184568e-06, "loss": 0.5419, "step": 508 }, { "epoch": 0.08, "grad_norm": 11.00641954233929, "learning_rate": 9.929550113222822e-06, "loss": 0.5335, "step": 509 }, { "epoch": 0.08, "grad_norm": 14.146765223423792, "learning_rate": 9.929112895315739e-06, "loss": 0.4833, "step": 510 }, { "epoch": 0.08, "grad_norm": 17.155008550667898, "learning_rate": 9.928674334582431e-06, "loss": 0.5453, "step": 511 }, { "epoch": 0.08, "grad_norm": 13.954164077546192, "learning_rate": 9.928234431142376e-06, "loss": 0.4691, "step": 512 }, { "epoch": 0.08, "grad_norm": 13.152318314094025, "learning_rate": 9.927793185115416e-06, "loss": 0.5193, "step": 513 }, { "epoch": 0.08, "grad_norm": 1.2858546313823787, "learning_rate": 9.927350596621753e-06, "loss": 0.4962, "step": 514 }, { "epoch": 0.08, "grad_norm": 19.581461298857505, "learning_rate": 9.926906665781967e-06, "loss": 0.5338, "step": 515 }, { "epoch": 0.08, "grad_norm": 12.46761132147807, "learning_rate": 9.926461392716993e-06, "loss": 0.5111, "step": 516 }, { "epoch": 0.08, "grad_norm": 23.159004436177707, "learning_rate": 9.926014777548139e-06, "loss": 0.5462, "step": 517 }, { "epoch": 0.08, "grad_norm": 13.498808288714008, "learning_rate": 9.92556682039707e-06, "loss": 0.4943, "step": 518 }, { "epoch": 0.08, "grad_norm": 15.320359291926477, "learning_rate": 9.925117521385827e-06, "loss": 0.5232, "step": 519 }, { "epoch": 0.08, "grad_norm": 12.714600184919549, "learning_rate": 9.92466688063681e-06, "loss": 0.4788, "step": 520 }, { "epoch": 0.08, "grad_norm": 1.2266657596263884, "learning_rate": 9.924214898272783e-06, "loss": 0.4692, "step": 521 }, { "epoch": 0.08, "grad_norm": 14.280608901628383, "learning_rate": 9.923761574416884e-06, "loss": 0.5105, "step": 522 }, { "epoch": 0.08, "grad_norm": 13.36639863504587, "learning_rate": 9.923306909192608e-06, "loss": 0.4934, "step": 523 }, { "epoch": 0.08, "grad_norm": 1.2740002298563655, "learning_rate": 9.922850902723815e-06, "loss": 0.4731, "step": 524 }, { "epoch": 0.08, "grad_norm": 15.338048540651087, "learning_rate": 9.922393555134739e-06, "loss": 0.5108, "step": 525 }, { "epoch": 0.08, "grad_norm": 15.656241275884797, "learning_rate": 9.92193486654997e-06, "loss": 0.4961, "step": 526 }, { "epoch": 0.08, "grad_norm": 16.13458761927888, "learning_rate": 9.921474837094473e-06, "loss": 0.5769, "step": 527 }, { "epoch": 0.09, "grad_norm": 21.537727390519827, "learning_rate": 9.921013466893567e-06, "loss": 0.4841, "step": 528 }, { "epoch": 0.09, "grad_norm": 13.895990491642683, "learning_rate": 9.920550756072945e-06, "loss": 0.5524, "step": 529 }, { "epoch": 0.09, "grad_norm": 11.921325160411415, "learning_rate": 9.92008670475866e-06, "loss": 0.497, "step": 530 }, { "epoch": 0.09, "grad_norm": 15.56181054602899, "learning_rate": 9.919621313077135e-06, "loss": 0.5261, "step": 531 }, { "epoch": 0.09, "grad_norm": 20.49411978735034, "learning_rate": 9.919154581155152e-06, "loss": 0.4794, "step": 532 }, { "epoch": 0.09, "grad_norm": 18.480975845296932, "learning_rate": 9.918686509119867e-06, "loss": 0.5356, "step": 533 }, { "epoch": 0.09, "grad_norm": 13.073752984602349, "learning_rate": 9.91821709709879e-06, "loss": 0.552, "step": 534 }, { "epoch": 0.09, "grad_norm": 10.350033040569631, "learning_rate": 9.917746345219805e-06, "loss": 0.5218, "step": 535 }, { "epoch": 0.09, "grad_norm": 19.471724327339587, "learning_rate": 9.917274253611156e-06, "loss": 0.4993, "step": 536 }, { "epoch": 0.09, "grad_norm": 18.51573328978897, "learning_rate": 9.916800822401457e-06, "loss": 0.4766, "step": 537 }, { "epoch": 0.09, "grad_norm": 15.081392424003008, "learning_rate": 9.91632605171968e-06, "loss": 0.5776, "step": 538 }, { "epoch": 0.09, "grad_norm": 15.624456193197261, "learning_rate": 9.915849941695167e-06, "loss": 0.4756, "step": 539 }, { "epoch": 0.09, "grad_norm": 25.571778920649678, "learning_rate": 9.915372492457624e-06, "loss": 0.5568, "step": 540 }, { "epoch": 0.09, "grad_norm": 28.210014449808007, "learning_rate": 9.914893704137123e-06, "loss": 0.4193, "step": 541 }, { "epoch": 0.09, "grad_norm": 21.436324578551755, "learning_rate": 9.914413576864094e-06, "loss": 0.5066, "step": 542 }, { "epoch": 0.09, "grad_norm": 62.02139390483134, "learning_rate": 9.913932110769342e-06, "loss": 0.4802, "step": 543 }, { "epoch": 0.09, "grad_norm": 21.891837004744193, "learning_rate": 9.91344930598403e-06, "loss": 0.4749, "step": 544 }, { "epoch": 0.09, "grad_norm": 16.37938273771956, "learning_rate": 9.912965162639686e-06, "loss": 0.5342, "step": 545 }, { "epoch": 0.09, "grad_norm": 13.084183962994475, "learning_rate": 9.912479680868205e-06, "loss": 0.527, "step": 546 }, { "epoch": 0.09, "grad_norm": 19.76839280081624, "learning_rate": 9.911992860801846e-06, "loss": 0.5004, "step": 547 }, { "epoch": 0.09, "grad_norm": 13.805090440603099, "learning_rate": 9.911504702573232e-06, "loss": 0.5424, "step": 548 }, { "epoch": 0.09, "grad_norm": 13.46879663480542, "learning_rate": 9.91101520631535e-06, "loss": 0.4667, "step": 549 }, { "epoch": 0.09, "grad_norm": 24.261619553669544, "learning_rate": 9.910524372161554e-06, "loss": 0.5283, "step": 550 }, { "epoch": 0.09, "grad_norm": 17.686624891732816, "learning_rate": 9.910032200245559e-06, "loss": 0.4712, "step": 551 }, { "epoch": 0.09, "grad_norm": 24.0586317736002, "learning_rate": 9.909538690701448e-06, "loss": 0.5809, "step": 552 }, { "epoch": 0.09, "grad_norm": 15.201467138321378, "learning_rate": 9.909043843663664e-06, "loss": 0.4624, "step": 553 }, { "epoch": 0.09, "grad_norm": 21.430824461404985, "learning_rate": 9.908547659267019e-06, "loss": 0.5516, "step": 554 }, { "epoch": 0.09, "grad_norm": 21.342766524121007, "learning_rate": 9.908050137646685e-06, "loss": 0.4976, "step": 555 }, { "epoch": 0.09, "grad_norm": 11.304421935314497, "learning_rate": 9.907551278938203e-06, "loss": 0.5097, "step": 556 }, { "epoch": 0.09, "grad_norm": 25.06287670697131, "learning_rate": 9.907051083277477e-06, "loss": 0.532, "step": 557 }, { "epoch": 0.09, "grad_norm": 19.133356596383926, "learning_rate": 9.90654955080077e-06, "loss": 0.4935, "step": 558 }, { "epoch": 0.09, "grad_norm": 15.586515916242897, "learning_rate": 9.906046681644717e-06, "loss": 0.551, "step": 559 }, { "epoch": 0.09, "grad_norm": 18.851700225091328, "learning_rate": 9.905542475946312e-06, "loss": 0.4755, "step": 560 }, { "epoch": 0.09, "grad_norm": 25.686780346664317, "learning_rate": 9.905036933842914e-06, "loss": 0.4676, "step": 561 }, { "epoch": 0.09, "grad_norm": 11.444961722378883, "learning_rate": 9.904530055472246e-06, "loss": 0.4954, "step": 562 }, { "epoch": 0.09, "grad_norm": 16.291664276457745, "learning_rate": 9.904021840972397e-06, "loss": 0.5171, "step": 563 }, { "epoch": 0.09, "grad_norm": 17.460842707843682, "learning_rate": 9.90351229048182e-06, "loss": 0.556, "step": 564 }, { "epoch": 0.09, "grad_norm": 51.26147783962857, "learning_rate": 9.903001404139329e-06, "loss": 0.5271, "step": 565 }, { "epoch": 0.09, "grad_norm": 20.873330973764407, "learning_rate": 9.902489182084101e-06, "loss": 0.519, "step": 566 }, { "epoch": 0.09, "grad_norm": 15.087833043249425, "learning_rate": 9.901975624455685e-06, "loss": 0.5394, "step": 567 }, { "epoch": 0.09, "grad_norm": 17.038618637013016, "learning_rate": 9.901460731393984e-06, "loss": 0.469, "step": 568 }, { "epoch": 0.09, "grad_norm": 23.25601760477705, "learning_rate": 9.90094450303927e-06, "loss": 0.4479, "step": 569 }, { "epoch": 0.09, "grad_norm": 24.280666091088996, "learning_rate": 9.900426939532178e-06, "loss": 0.4584, "step": 570 }, { "epoch": 0.09, "grad_norm": 38.76345732975971, "learning_rate": 9.899908041013706e-06, "loss": 0.56, "step": 571 }, { "epoch": 0.09, "grad_norm": 15.615505893819986, "learning_rate": 9.899387807625217e-06, "loss": 0.4932, "step": 572 }, { "epoch": 0.09, "grad_norm": 18.937921332598645, "learning_rate": 9.898866239508437e-06, "loss": 0.5296, "step": 573 }, { "epoch": 0.09, "grad_norm": 19.36456621559494, "learning_rate": 9.898343336805456e-06, "loss": 0.5151, "step": 574 }, { "epoch": 0.09, "grad_norm": 25.640873109264188, "learning_rate": 9.897819099658726e-06, "loss": 0.5045, "step": 575 }, { "epoch": 0.09, "grad_norm": 17.443202463664413, "learning_rate": 9.897293528211062e-06, "loss": 0.4804, "step": 576 }, { "epoch": 0.09, "grad_norm": 19.67824539218894, "learning_rate": 9.896766622605649e-06, "loss": 0.5653, "step": 577 }, { "epoch": 0.09, "grad_norm": 19.553427435820534, "learning_rate": 9.896238382986028e-06, "loss": 0.564, "step": 578 }, { "epoch": 0.09, "grad_norm": 13.010337660887183, "learning_rate": 9.895708809496106e-06, "loss": 0.5158, "step": 579 }, { "epoch": 0.09, "grad_norm": 1.3807963752760515, "learning_rate": 9.895177902280151e-06, "loss": 0.4787, "step": 580 }, { "epoch": 0.09, "grad_norm": 17.235160096487327, "learning_rate": 9.8946456614828e-06, "loss": 0.5379, "step": 581 }, { "epoch": 0.09, "grad_norm": 15.332992515366278, "learning_rate": 9.894112087249049e-06, "loss": 0.508, "step": 582 }, { "epoch": 0.09, "grad_norm": 28.486584242033622, "learning_rate": 9.893577179724259e-06, "loss": 0.47, "step": 583 }, { "epoch": 0.09, "grad_norm": 38.77331446611641, "learning_rate": 9.893040939054152e-06, "loss": 0.5041, "step": 584 }, { "epoch": 0.09, "grad_norm": 21.207054655645923, "learning_rate": 9.892503365384818e-06, "loss": 0.4968, "step": 585 }, { "epoch": 0.09, "grad_norm": 22.202255991886688, "learning_rate": 9.891964458862702e-06, "loss": 0.5485, "step": 586 }, { "epoch": 0.09, "grad_norm": 13.730282123072067, "learning_rate": 9.891424219634622e-06, "loss": 0.497, "step": 587 }, { "epoch": 0.09, "grad_norm": 14.780413534325842, "learning_rate": 9.89088264784775e-06, "loss": 0.413, "step": 588 }, { "epoch": 0.09, "grad_norm": 1.560090180842033, "learning_rate": 9.890339743649628e-06, "loss": 0.5148, "step": 589 }, { "epoch": 0.1, "grad_norm": 57.951380138492446, "learning_rate": 9.889795507188156e-06, "loss": 0.513, "step": 590 }, { "epoch": 0.1, "grad_norm": 23.839207559460178, "learning_rate": 9.889249938611599e-06, "loss": 0.542, "step": 591 }, { "epoch": 0.1, "grad_norm": 1.3100884104163806, "learning_rate": 9.888703038068587e-06, "loss": 0.4517, "step": 592 }, { "epoch": 0.1, "grad_norm": 22.611797131759342, "learning_rate": 9.88815480570811e-06, "loss": 0.5044, "step": 593 }, { "epoch": 0.1, "grad_norm": 17.975168232540277, "learning_rate": 9.887605241679518e-06, "loss": 0.5426, "step": 594 }, { "epoch": 0.1, "grad_norm": 21.6299249713566, "learning_rate": 9.887054346132534e-06, "loss": 0.606, "step": 595 }, { "epoch": 0.1, "grad_norm": 21.433210431466645, "learning_rate": 9.886502119217232e-06, "loss": 0.4522, "step": 596 }, { "epoch": 0.1, "grad_norm": 16.8744986725075, "learning_rate": 9.885948561084056e-06, "loss": 0.5388, "step": 597 }, { "epoch": 0.1, "grad_norm": 17.774643117427285, "learning_rate": 9.885393671883811e-06, "loss": 0.4213, "step": 598 }, { "epoch": 0.1, "grad_norm": 27.177227386358386, "learning_rate": 9.884837451767662e-06, "loss": 0.4277, "step": 599 }, { "epoch": 0.1, "grad_norm": 51.94739877270996, "learning_rate": 9.88427990088714e-06, "loss": 0.5516, "step": 600 }, { "epoch": 0.1, "grad_norm": 1.48021643779853, "learning_rate": 9.883721019394136e-06, "loss": 0.468, "step": 601 }, { "epoch": 0.1, "grad_norm": 16.97300760191231, "learning_rate": 9.883160807440907e-06, "loss": 0.4785, "step": 602 }, { "epoch": 0.1, "grad_norm": 19.219778185051, "learning_rate": 9.88259926518007e-06, "loss": 0.4626, "step": 603 }, { "epoch": 0.1, "grad_norm": 19.837493866912144, "learning_rate": 9.882036392764602e-06, "loss": 0.538, "step": 604 }, { "epoch": 0.1, "grad_norm": 13.111250667867075, "learning_rate": 9.881472190347846e-06, "loss": 0.5101, "step": 605 }, { "epoch": 0.1, "grad_norm": 12.154732540806226, "learning_rate": 9.880906658083507e-06, "loss": 0.4959, "step": 606 }, { "epoch": 0.1, "grad_norm": 18.974950073827124, "learning_rate": 9.880339796125653e-06, "loss": 0.4927, "step": 607 }, { "epoch": 0.1, "grad_norm": 16.008869701656987, "learning_rate": 9.87977160462871e-06, "loss": 0.3937, "step": 608 }, { "epoch": 0.1, "grad_norm": 11.973939111385997, "learning_rate": 9.87920208374747e-06, "loss": 0.4829, "step": 609 }, { "epoch": 0.1, "grad_norm": 19.07527390670295, "learning_rate": 9.878631233637086e-06, "loss": 0.4573, "step": 610 }, { "epoch": 0.1, "grad_norm": 37.3900957125809, "learning_rate": 9.878059054453076e-06, "loss": 0.4768, "step": 611 }, { "epoch": 0.1, "grad_norm": 26.026006237698343, "learning_rate": 9.877485546351312e-06, "loss": 0.479, "step": 612 }, { "epoch": 0.1, "grad_norm": 18.584637638478732, "learning_rate": 9.876910709488038e-06, "loss": 0.5385, "step": 613 }, { "epoch": 0.1, "grad_norm": 19.707979074227556, "learning_rate": 9.876334544019855e-06, "loss": 0.4924, "step": 614 }, { "epoch": 0.1, "grad_norm": 18.95593368007674, "learning_rate": 9.875757050103722e-06, "loss": 0.4882, "step": 615 }, { "epoch": 0.1, "grad_norm": 21.66833157168632, "learning_rate": 9.87517822789697e-06, "loss": 0.4594, "step": 616 }, { "epoch": 0.1, "grad_norm": 17.8796696852202, "learning_rate": 9.874598077557284e-06, "loss": 0.4469, "step": 617 }, { "epoch": 0.1, "grad_norm": 24.646881349237603, "learning_rate": 9.874016599242711e-06, "loss": 0.4439, "step": 618 }, { "epoch": 0.1, "grad_norm": 15.796450595747965, "learning_rate": 9.873433793111665e-06, "loss": 0.4118, "step": 619 }, { "epoch": 0.1, "grad_norm": 24.956213659004316, "learning_rate": 9.872849659322916e-06, "loss": 0.4529, "step": 620 }, { "epoch": 0.1, "grad_norm": 26.042736685819456, "learning_rate": 9.872264198035599e-06, "loss": 0.5503, "step": 621 }, { "epoch": 0.1, "grad_norm": 21.845539964652424, "learning_rate": 9.871677409409209e-06, "loss": 0.5688, "step": 622 }, { "epoch": 0.1, "grad_norm": 21.452662831951695, "learning_rate": 9.871089293603604e-06, "loss": 0.5397, "step": 623 }, { "epoch": 0.1, "grad_norm": 43.34961729679202, "learning_rate": 9.870499850779003e-06, "loss": 0.4833, "step": 624 }, { "epoch": 0.1, "grad_norm": 20.337429337999552, "learning_rate": 9.869909081095988e-06, "loss": 0.4573, "step": 625 }, { "epoch": 0.1, "grad_norm": 26.97196410088741, "learning_rate": 9.8693169847155e-06, "loss": 0.4408, "step": 626 }, { "epoch": 0.1, "grad_norm": 36.297996472117006, "learning_rate": 9.86872356179884e-06, "loss": 0.4758, "step": 627 }, { "epoch": 0.1, "grad_norm": 40.79420099801528, "learning_rate": 9.868128812507675e-06, "loss": 0.4362, "step": 628 }, { "epoch": 0.1, "grad_norm": 36.893820811217815, "learning_rate": 9.86753273700403e-06, "loss": 0.452, "step": 629 }, { "epoch": 0.1, "grad_norm": 42.23198273548744, "learning_rate": 9.866935335450293e-06, "loss": 0.5667, "step": 630 }, { "epoch": 0.1, "grad_norm": 35.32524614457651, "learning_rate": 9.866336608009215e-06, "loss": 0.4725, "step": 631 }, { "epoch": 0.1, "grad_norm": 22.064314699962953, "learning_rate": 9.865736554843904e-06, "loss": 0.4945, "step": 632 }, { "epoch": 0.1, "grad_norm": 22.90459776756543, "learning_rate": 9.865135176117829e-06, "loss": 0.5529, "step": 633 }, { "epoch": 0.1, "grad_norm": 25.79532954683492, "learning_rate": 9.864532471994823e-06, "loss": 0.4612, "step": 634 }, { "epoch": 0.1, "grad_norm": 58.04159874681187, "learning_rate": 9.86392844263908e-06, "loss": 0.489, "step": 635 }, { "epoch": 0.1, "grad_norm": 132.27078165539208, "learning_rate": 9.863323088215155e-06, "loss": 0.5218, "step": 636 }, { "epoch": 0.1, "grad_norm": 28.432883357996204, "learning_rate": 9.862716408887962e-06, "loss": 0.4941, "step": 637 }, { "epoch": 0.1, "grad_norm": 31.952343496665208, "learning_rate": 9.862108404822779e-06, "loss": 0.4648, "step": 638 }, { "epoch": 0.1, "grad_norm": 27.63591118434334, "learning_rate": 9.861499076185241e-06, "loss": 0.5342, "step": 639 }, { "epoch": 0.1, "grad_norm": 18.844582797634676, "learning_rate": 9.860888423141346e-06, "loss": 0.5314, "step": 640 }, { "epoch": 0.1, "grad_norm": 51.4965595435129, "learning_rate": 9.860276445857456e-06, "loss": 0.5417, "step": 641 }, { "epoch": 0.1, "grad_norm": 45.4082197802291, "learning_rate": 9.859663144500284e-06, "loss": 0.5674, "step": 642 }, { "epoch": 0.1, "grad_norm": 134.49260996660362, "learning_rate": 9.859048519236915e-06, "loss": 0.4253, "step": 643 }, { "epoch": 0.1, "grad_norm": 32.943868056721065, "learning_rate": 9.85843257023479e-06, "loss": 0.4867, "step": 644 }, { "epoch": 0.1, "grad_norm": 246.0130793456897, "learning_rate": 9.85781529766171e-06, "loss": 0.5137, "step": 645 }, { "epoch": 0.1, "grad_norm": 35.67741187273675, "learning_rate": 9.857196701685836e-06, "loss": 0.5042, "step": 646 }, { "epoch": 0.1, "grad_norm": 30.118258654122457, "learning_rate": 9.85657678247569e-06, "loss": 0.4454, "step": 647 }, { "epoch": 0.1, "grad_norm": 41.90626237438905, "learning_rate": 9.855955540200157e-06, "loss": 0.5067, "step": 648 }, { "epoch": 0.1, "grad_norm": 36.34444784455322, "learning_rate": 9.85533297502848e-06, "loss": 0.4962, "step": 649 }, { "epoch": 0.1, "grad_norm": 34.78305457061313, "learning_rate": 9.854709087130261e-06, "loss": 0.5105, "step": 650 }, { "epoch": 0.1, "grad_norm": 38.952759762501486, "learning_rate": 9.854083876675466e-06, "loss": 0.4625, "step": 651 }, { "epoch": 0.11, "grad_norm": 22.529771829953194, "learning_rate": 9.853457343834421e-06, "loss": 0.5859, "step": 652 }, { "epoch": 0.11, "grad_norm": 1.2709564754567324, "learning_rate": 9.852829488777808e-06, "loss": 0.4506, "step": 653 }, { "epoch": 0.11, "grad_norm": 21.212993863384746, "learning_rate": 9.852200311676675e-06, "loss": 0.451, "step": 654 }, { "epoch": 0.11, "grad_norm": 25.56247381096788, "learning_rate": 9.851569812702423e-06, "loss": 0.5056, "step": 655 }, { "epoch": 0.11, "grad_norm": 21.084801633821954, "learning_rate": 9.850937992026821e-06, "loss": 0.5108, "step": 656 }, { "epoch": 0.11, "grad_norm": 25.00414206163789, "learning_rate": 9.850304849821992e-06, "loss": 0.4934, "step": 657 }, { "epoch": 0.11, "grad_norm": 42.71270858955537, "learning_rate": 9.849670386260424e-06, "loss": 0.5476, "step": 658 }, { "epoch": 0.11, "grad_norm": 31.07559909689798, "learning_rate": 9.84903460151496e-06, "loss": 0.4741, "step": 659 }, { "epoch": 0.11, "grad_norm": 33.013418555197035, "learning_rate": 9.848397495758806e-06, "loss": 0.4722, "step": 660 }, { "epoch": 0.11, "grad_norm": 27.5653662709689, "learning_rate": 9.847759069165528e-06, "loss": 0.5184, "step": 661 }, { "epoch": 0.11, "grad_norm": 70.36658088058428, "learning_rate": 9.847119321909049e-06, "loss": 0.5245, "step": 662 }, { "epoch": 0.11, "grad_norm": 27.000916080366956, "learning_rate": 9.846478254163657e-06, "loss": 0.5482, "step": 663 }, { "epoch": 0.11, "grad_norm": 25.355126591060326, "learning_rate": 9.845835866103993e-06, "loss": 0.5133, "step": 664 }, { "epoch": 0.11, "grad_norm": 18.154023877963184, "learning_rate": 9.845192157905063e-06, "loss": 0.5096, "step": 665 }, { "epoch": 0.11, "grad_norm": 24.44797918382641, "learning_rate": 9.844547129742231e-06, "loss": 0.5231, "step": 666 }, { "epoch": 0.11, "grad_norm": 18.450134762094933, "learning_rate": 9.84390078179122e-06, "loss": 0.5063, "step": 667 }, { "epoch": 0.11, "grad_norm": 15.528500908678737, "learning_rate": 9.843253114228113e-06, "loss": 0.5097, "step": 668 }, { "epoch": 0.11, "grad_norm": 20.689411881792772, "learning_rate": 9.842604127229353e-06, "loss": 0.4861, "step": 669 }, { "epoch": 0.11, "grad_norm": 31.949361452678566, "learning_rate": 9.841953820971742e-06, "loss": 0.5113, "step": 670 }, { "epoch": 0.11, "grad_norm": 16.50430020339359, "learning_rate": 9.841302195632441e-06, "loss": 0.4773, "step": 671 }, { "epoch": 0.11, "grad_norm": 19.874510449106946, "learning_rate": 9.84064925138897e-06, "loss": 0.4891, "step": 672 }, { "epoch": 0.11, "grad_norm": 17.795942870847803, "learning_rate": 9.839994988419208e-06, "loss": 0.4424, "step": 673 }, { "epoch": 0.11, "grad_norm": 18.373444821743597, "learning_rate": 9.839339406901398e-06, "loss": 0.505, "step": 674 }, { "epoch": 0.11, "grad_norm": 47.6997226648378, "learning_rate": 9.838682507014138e-06, "loss": 0.6177, "step": 675 }, { "epoch": 0.11, "grad_norm": 1.629724275034955, "learning_rate": 9.838024288936381e-06, "loss": 0.4528, "step": 676 }, { "epoch": 0.11, "grad_norm": 97.13753108723606, "learning_rate": 9.837364752847447e-06, "loss": 0.4399, "step": 677 }, { "epoch": 0.11, "grad_norm": 22.615554189219147, "learning_rate": 9.836703898927012e-06, "loss": 0.5192, "step": 678 }, { "epoch": 0.11, "grad_norm": 17.612337043071484, "learning_rate": 9.83604172735511e-06, "loss": 0.468, "step": 679 }, { "epoch": 0.11, "grad_norm": 15.20704373579663, "learning_rate": 9.835378238312136e-06, "loss": 0.47, "step": 680 }, { "epoch": 0.11, "grad_norm": 11.752093832860716, "learning_rate": 9.834713431978842e-06, "loss": 0.4284, "step": 681 }, { "epoch": 0.11, "grad_norm": 1.4054445478548272, "learning_rate": 9.834047308536342e-06, "loss": 0.4502, "step": 682 }, { "epoch": 0.11, "grad_norm": 13.562943089696912, "learning_rate": 9.8333798681661e-06, "loss": 0.4852, "step": 683 }, { "epoch": 0.11, "grad_norm": 24.384289530626003, "learning_rate": 9.83271111104995e-06, "loss": 0.5033, "step": 684 }, { "epoch": 0.11, "grad_norm": 18.980872074508046, "learning_rate": 9.83204103737008e-06, "loss": 0.4998, "step": 685 }, { "epoch": 0.11, "grad_norm": 15.838423874686844, "learning_rate": 9.831369647309033e-06, "loss": 0.5644, "step": 686 }, { "epoch": 0.11, "grad_norm": 21.18435459921517, "learning_rate": 9.830696941049718e-06, "loss": 0.5675, "step": 687 }, { "epoch": 0.11, "grad_norm": 23.130773855942554, "learning_rate": 9.830022918775399e-06, "loss": 0.4905, "step": 688 }, { "epoch": 0.11, "grad_norm": 15.76408715108176, "learning_rate": 9.829347580669694e-06, "loss": 0.4969, "step": 689 }, { "epoch": 0.11, "grad_norm": 27.70651280220986, "learning_rate": 9.828670926916588e-06, "loss": 0.5273, "step": 690 }, { "epoch": 0.11, "grad_norm": 44.96128793236422, "learning_rate": 9.827992957700417e-06, "loss": 0.4954, "step": 691 }, { "epoch": 0.11, "grad_norm": 28.94951983207798, "learning_rate": 9.82731367320588e-06, "loss": 0.4722, "step": 692 }, { "epoch": 0.11, "grad_norm": 24.60375973401854, "learning_rate": 9.826633073618034e-06, "loss": 0.5158, "step": 693 }, { "epoch": 0.11, "grad_norm": 15.61526604575308, "learning_rate": 9.825951159122291e-06, "loss": 0.4808, "step": 694 }, { "epoch": 0.11, "grad_norm": 20.029970377581847, "learning_rate": 9.825267929904423e-06, "loss": 0.479, "step": 695 }, { "epoch": 0.11, "grad_norm": 18.80831135800364, "learning_rate": 9.824583386150563e-06, "loss": 0.4689, "step": 696 }, { "epoch": 0.11, "grad_norm": 25.130011668809697, "learning_rate": 9.8238975280472e-06, "loss": 0.4587, "step": 697 }, { "epoch": 0.11, "grad_norm": 19.66733424673074, "learning_rate": 9.823210355781177e-06, "loss": 0.4644, "step": 698 }, { "epoch": 0.11, "grad_norm": 12.520007016067943, "learning_rate": 9.822521869539699e-06, "loss": 0.5742, "step": 699 }, { "epoch": 0.11, "grad_norm": 28.13264344536298, "learning_rate": 9.821832069510333e-06, "loss": 0.546, "step": 700 }, { "epoch": 0.11, "grad_norm": 16.692464746770355, "learning_rate": 9.821140955880995e-06, "loss": 0.4861, "step": 701 }, { "epoch": 0.11, "grad_norm": 19.615348003479365, "learning_rate": 9.820448528839965e-06, "loss": 0.517, "step": 702 }, { "epoch": 0.11, "grad_norm": 24.007174766752577, "learning_rate": 9.81975478857588e-06, "loss": 0.4822, "step": 703 }, { "epoch": 0.11, "grad_norm": 20.007041817194377, "learning_rate": 9.819059735277731e-06, "loss": 0.4692, "step": 704 }, { "epoch": 0.11, "grad_norm": 16.558918496167855, "learning_rate": 9.818363369134876e-06, "loss": 0.44, "step": 705 }, { "epoch": 0.11, "grad_norm": 19.801961439717147, "learning_rate": 9.817665690337017e-06, "loss": 0.5109, "step": 706 }, { "epoch": 0.11, "grad_norm": 27.136501245619463, "learning_rate": 9.816966699074226e-06, "loss": 0.571, "step": 707 }, { "epoch": 0.11, "grad_norm": 18.310853629813945, "learning_rate": 9.816266395536924e-06, "loss": 0.4353, "step": 708 }, { "epoch": 0.11, "grad_norm": 16.678283092310608, "learning_rate": 9.815564779915896e-06, "loss": 0.4831, "step": 709 }, { "epoch": 0.11, "grad_norm": 19.480562073306537, "learning_rate": 9.81486185240228e-06, "loss": 0.4973, "step": 710 }, { "epoch": 0.11, "grad_norm": 14.852937165804487, "learning_rate": 9.814157613187573e-06, "loss": 0.4689, "step": 711 }, { "epoch": 0.11, "grad_norm": 18.060259669713776, "learning_rate": 9.81345206246363e-06, "loss": 0.5666, "step": 712 }, { "epoch": 0.11, "grad_norm": 16.267854879660476, "learning_rate": 9.812745200422659e-06, "loss": 0.5042, "step": 713 }, { "epoch": 0.12, "grad_norm": 15.279511206182923, "learning_rate": 9.812037027257234e-06, "loss": 0.4792, "step": 714 }, { "epoch": 0.12, "grad_norm": 18.077932595783246, "learning_rate": 9.811327543160276e-06, "loss": 0.5004, "step": 715 }, { "epoch": 0.12, "grad_norm": 11.85758983948475, "learning_rate": 9.810616748325072e-06, "loss": 0.4722, "step": 716 }, { "epoch": 0.12, "grad_norm": 19.08561775926478, "learning_rate": 9.809904642945261e-06, "loss": 0.3992, "step": 717 }, { "epoch": 0.12, "grad_norm": 18.989014440011633, "learning_rate": 9.809191227214838e-06, "loss": 0.5258, "step": 718 }, { "epoch": 0.12, "grad_norm": 21.899839398838346, "learning_rate": 9.808476501328158e-06, "loss": 0.5198, "step": 719 }, { "epoch": 0.12, "grad_norm": 1.2860451595562876, "learning_rate": 9.807760465479934e-06, "loss": 0.4561, "step": 720 }, { "epoch": 0.12, "grad_norm": 22.612184072437053, "learning_rate": 9.807043119865232e-06, "loss": 0.4657, "step": 721 }, { "epoch": 0.12, "grad_norm": 39.008201714932646, "learning_rate": 9.806324464679477e-06, "loss": 0.488, "step": 722 }, { "epoch": 0.12, "grad_norm": 70.3485836861302, "learning_rate": 9.80560450011845e-06, "loss": 0.5922, "step": 723 }, { "epoch": 0.12, "grad_norm": 68.34010661053567, "learning_rate": 9.80488322637829e-06, "loss": 0.4226, "step": 724 }, { "epoch": 0.12, "grad_norm": 23.69579046044365, "learning_rate": 9.804160643655494e-06, "loss": 0.5183, "step": 725 }, { "epoch": 0.12, "grad_norm": 66.22987903247811, "learning_rate": 9.803436752146909e-06, "loss": 0.4681, "step": 726 }, { "epoch": 0.12, "grad_norm": 22.13576625594009, "learning_rate": 9.802711552049746e-06, "loss": 0.4655, "step": 727 }, { "epoch": 0.12, "grad_norm": 29.5295051982629, "learning_rate": 9.801985043561569e-06, "loss": 0.4876, "step": 728 }, { "epoch": 0.12, "grad_norm": 23.514806650665207, "learning_rate": 9.801257226880297e-06, "loss": 0.5135, "step": 729 }, { "epoch": 0.12, "grad_norm": 30.136030534841247, "learning_rate": 9.80052810220421e-06, "loss": 0.5332, "step": 730 }, { "epoch": 0.12, "grad_norm": 21.830147411374625, "learning_rate": 9.79979766973194e-06, "loss": 0.538, "step": 731 }, { "epoch": 0.12, "grad_norm": 49.81259418507051, "learning_rate": 9.799065929662478e-06, "loss": 0.5068, "step": 732 }, { "epoch": 0.12, "grad_norm": 30.120344600958013, "learning_rate": 9.798332882195172e-06, "loss": 0.4602, "step": 733 }, { "epoch": 0.12, "grad_norm": 24.56373477936228, "learning_rate": 9.79759852752972e-06, "loss": 0.5407, "step": 734 }, { "epoch": 0.12, "grad_norm": 40.67656506510299, "learning_rate": 9.796862865866183e-06, "loss": 0.5123, "step": 735 }, { "epoch": 0.12, "grad_norm": 28.08456392008881, "learning_rate": 9.796125897404973e-06, "loss": 0.4395, "step": 736 }, { "epoch": 0.12, "grad_norm": 24.581896318986935, "learning_rate": 9.795387622346866e-06, "loss": 0.4684, "step": 737 }, { "epoch": 0.12, "grad_norm": 42.86869425333663, "learning_rate": 9.794648040892983e-06, "loss": 0.4985, "step": 738 }, { "epoch": 0.12, "grad_norm": 35.17206995762081, "learning_rate": 9.793907153244808e-06, "loss": 0.57, "step": 739 }, { "epoch": 0.12, "grad_norm": 31.35202313496837, "learning_rate": 9.79316495960418e-06, "loss": 0.4461, "step": 740 }, { "epoch": 0.12, "grad_norm": 22.032846135926587, "learning_rate": 9.792421460173294e-06, "loss": 0.5234, "step": 741 }, { "epoch": 0.12, "grad_norm": 21.94914471309661, "learning_rate": 9.791676655154696e-06, "loss": 0.5382, "step": 742 }, { "epoch": 0.12, "grad_norm": 1.235393642987467, "learning_rate": 9.790930544751297e-06, "loss": 0.4793, "step": 743 }, { "epoch": 0.12, "grad_norm": 15.009076321870372, "learning_rate": 9.790183129166351e-06, "loss": 0.4514, "step": 744 }, { "epoch": 0.12, "grad_norm": 22.857535630894642, "learning_rate": 9.78943440860348e-06, "loss": 0.5513, "step": 745 }, { "epoch": 0.12, "grad_norm": 28.103565851655105, "learning_rate": 9.788684383266655e-06, "loss": 0.4908, "step": 746 }, { "epoch": 0.12, "grad_norm": 29.43437633491117, "learning_rate": 9.787933053360203e-06, "loss": 0.5203, "step": 747 }, { "epoch": 0.12, "grad_norm": 23.694193741663607, "learning_rate": 9.78718041908881e-06, "loss": 0.455, "step": 748 }, { "epoch": 0.12, "grad_norm": 20.296911835573596, "learning_rate": 9.786426480657507e-06, "loss": 0.5169, "step": 749 }, { "epoch": 0.12, "grad_norm": 27.717028874014833, "learning_rate": 9.785671238271694e-06, "loss": 0.4094, "step": 750 }, { "epoch": 0.12, "grad_norm": 26.48782492758789, "learning_rate": 9.784914692137118e-06, "loss": 0.524, "step": 751 }, { "epoch": 0.12, "grad_norm": 34.02215545449661, "learning_rate": 9.784156842459882e-06, "loss": 0.5327, "step": 752 }, { "epoch": 0.12, "grad_norm": 51.791240360548045, "learning_rate": 9.783397689446447e-06, "loss": 0.5072, "step": 753 }, { "epoch": 0.12, "grad_norm": 19.318952483530822, "learning_rate": 9.782637233303624e-06, "loss": 0.4843, "step": 754 }, { "epoch": 0.12, "grad_norm": 30.38603084942431, "learning_rate": 9.781875474238587e-06, "loss": 0.4982, "step": 755 }, { "epoch": 0.12, "grad_norm": 21.212825663607752, "learning_rate": 9.781112412458858e-06, "loss": 0.5192, "step": 756 }, { "epoch": 0.12, "grad_norm": 21.236279975655908, "learning_rate": 9.780348048172315e-06, "loss": 0.467, "step": 757 }, { "epoch": 0.12, "grad_norm": 25.763818650187154, "learning_rate": 9.779582381587192e-06, "loss": 0.5267, "step": 758 }, { "epoch": 0.12, "grad_norm": 42.01728460728236, "learning_rate": 9.778815412912078e-06, "loss": 0.4894, "step": 759 }, { "epoch": 0.12, "grad_norm": 22.665619984522813, "learning_rate": 9.778047142355917e-06, "loss": 0.5251, "step": 760 }, { "epoch": 0.12, "grad_norm": 46.90950551132639, "learning_rate": 9.777277570128008e-06, "loss": 0.4437, "step": 761 }, { "epoch": 0.12, "grad_norm": 24.245146390369378, "learning_rate": 9.776506696438002e-06, "loss": 0.5017, "step": 762 }, { "epoch": 0.12, "grad_norm": 33.75557381511454, "learning_rate": 9.775734521495905e-06, "loss": 0.5034, "step": 763 }, { "epoch": 0.12, "grad_norm": 249.8105860568305, "learning_rate": 9.774961045512082e-06, "loss": 0.5436, "step": 764 }, { "epoch": 0.12, "grad_norm": 25.11208467122166, "learning_rate": 9.774186268697247e-06, "loss": 0.4478, "step": 765 }, { "epoch": 0.12, "grad_norm": 43.728047738023335, "learning_rate": 9.773410191262471e-06, "loss": 0.5351, "step": 766 }, { "epoch": 0.12, "grad_norm": 36.982489729286506, "learning_rate": 9.772632813419181e-06, "loss": 0.4478, "step": 767 }, { "epoch": 0.12, "grad_norm": 28.577056074073887, "learning_rate": 9.771854135379153e-06, "loss": 0.5379, "step": 768 }, { "epoch": 0.12, "grad_norm": 21.66953740046426, "learning_rate": 9.771074157354521e-06, "loss": 0.5311, "step": 769 }, { "epoch": 0.12, "grad_norm": 22.018434431381095, "learning_rate": 9.770292879557774e-06, "loss": 0.4516, "step": 770 }, { "epoch": 0.12, "grad_norm": 28.9316001223727, "learning_rate": 9.769510302201751e-06, "loss": 0.4782, "step": 771 }, { "epoch": 0.12, "grad_norm": 38.465798868528, "learning_rate": 9.76872642549965e-06, "loss": 0.5506, "step": 772 }, { "epoch": 0.12, "grad_norm": 42.98895755490222, "learning_rate": 9.767941249665022e-06, "loss": 0.5215, "step": 773 }, { "epoch": 0.12, "grad_norm": 37.5542346787441, "learning_rate": 9.767154774911767e-06, "loss": 0.4245, "step": 774 }, { "epoch": 0.12, "grad_norm": 126.0337025925342, "learning_rate": 9.766367001454144e-06, "loss": 0.4668, "step": 775 }, { "epoch": 0.13, "grad_norm": 26.965340966727034, "learning_rate": 9.765577929506764e-06, "loss": 0.5684, "step": 776 }, { "epoch": 0.13, "grad_norm": 39.08373874742864, "learning_rate": 9.764787559284592e-06, "loss": 0.5215, "step": 777 }, { "epoch": 0.13, "grad_norm": 38.35500361272542, "learning_rate": 9.763995891002946e-06, "loss": 0.4699, "step": 778 }, { "epoch": 0.13, "grad_norm": 13.772844282819213, "learning_rate": 9.763202924877502e-06, "loss": 0.4488, "step": 779 }, { "epoch": 0.13, "grad_norm": 24.341551103401898, "learning_rate": 9.762408661124279e-06, "loss": 0.5404, "step": 780 }, { "epoch": 0.13, "grad_norm": 38.58160278798945, "learning_rate": 9.761613099959663e-06, "loss": 0.5149, "step": 781 }, { "epoch": 0.13, "grad_norm": 15.453459024065733, "learning_rate": 9.760816241600383e-06, "loss": 0.4269, "step": 782 }, { "epoch": 0.13, "grad_norm": 16.782477172057998, "learning_rate": 9.760018086263525e-06, "loss": 0.5336, "step": 783 }, { "epoch": 0.13, "grad_norm": 15.057640537762172, "learning_rate": 9.759218634166531e-06, "loss": 0.5252, "step": 784 }, { "epoch": 0.13, "grad_norm": 15.444438544851318, "learning_rate": 9.75841788552719e-06, "loss": 0.4455, "step": 785 }, { "epoch": 0.13, "grad_norm": 17.696856495068964, "learning_rate": 9.757615840563654e-06, "loss": 0.5463, "step": 786 }, { "epoch": 0.13, "grad_norm": 29.698541802400126, "learning_rate": 9.756812499494417e-06, "loss": 0.4737, "step": 787 }, { "epoch": 0.13, "grad_norm": 14.152192660611997, "learning_rate": 9.756007862538333e-06, "loss": 0.4847, "step": 788 }, { "epoch": 0.13, "grad_norm": 15.374085135518834, "learning_rate": 9.755201929914607e-06, "loss": 0.5425, "step": 789 }, { "epoch": 0.13, "grad_norm": 15.601690712889747, "learning_rate": 9.754394701842797e-06, "loss": 0.5351, "step": 790 }, { "epoch": 0.13, "grad_norm": 12.39724747079049, "learning_rate": 9.753586178542815e-06, "loss": 0.4753, "step": 791 }, { "epoch": 0.13, "grad_norm": 88.4459986252391, "learning_rate": 9.752776360234925e-06, "loss": 0.6004, "step": 792 }, { "epoch": 0.13, "grad_norm": 11.642484366322941, "learning_rate": 9.751965247139743e-06, "loss": 0.5001, "step": 793 }, { "epoch": 0.13, "grad_norm": 20.049962860527497, "learning_rate": 9.751152839478238e-06, "loss": 0.487, "step": 794 }, { "epoch": 0.13, "grad_norm": 34.15693630886806, "learning_rate": 9.750339137471733e-06, "loss": 0.4758, "step": 795 }, { "epoch": 0.13, "grad_norm": 12.007307300973832, "learning_rate": 9.749524141341905e-06, "loss": 0.4805, "step": 796 }, { "epoch": 0.13, "grad_norm": 21.15144517678615, "learning_rate": 9.74870785131078e-06, "loss": 0.4894, "step": 797 }, { "epoch": 0.13, "grad_norm": 20.318874521302774, "learning_rate": 9.747890267600735e-06, "loss": 0.5165, "step": 798 }, { "epoch": 0.13, "grad_norm": 45.477751430851235, "learning_rate": 9.747071390434507e-06, "loss": 0.5599, "step": 799 }, { "epoch": 0.13, "grad_norm": 30.523846900686554, "learning_rate": 9.746251220035176e-06, "loss": 0.5623, "step": 800 }, { "epoch": 0.13, "grad_norm": 17.029920017968752, "learning_rate": 9.745429756626184e-06, "loss": 0.5061, "step": 801 }, { "epoch": 0.13, "grad_norm": 12.798832651167722, "learning_rate": 9.744607000431317e-06, "loss": 0.4786, "step": 802 }, { "epoch": 0.13, "grad_norm": 42.98569617285503, "learning_rate": 9.743782951674715e-06, "loss": 0.4658, "step": 803 }, { "epoch": 0.13, "grad_norm": 11.936594426370887, "learning_rate": 9.742957610580876e-06, "loss": 0.4848, "step": 804 }, { "epoch": 0.13, "grad_norm": 41.80080208942185, "learning_rate": 9.742130977374643e-06, "loss": 0.5086, "step": 805 }, { "epoch": 0.13, "grad_norm": 13.581925144344769, "learning_rate": 9.741303052281214e-06, "loss": 0.434, "step": 806 }, { "epoch": 0.13, "grad_norm": 125.50288228823116, "learning_rate": 9.740473835526139e-06, "loss": 0.5607, "step": 807 }, { "epoch": 0.13, "grad_norm": 62.456442248613335, "learning_rate": 9.73964332733532e-06, "loss": 0.517, "step": 808 }, { "epoch": 0.13, "grad_norm": 12.800522872105265, "learning_rate": 9.738811527935008e-06, "loss": 0.4212, "step": 809 }, { "epoch": 0.13, "grad_norm": 14.914885095457565, "learning_rate": 9.737978437551812e-06, "loss": 0.4883, "step": 810 }, { "epoch": 0.13, "grad_norm": 14.822141050807216, "learning_rate": 9.737144056412685e-06, "loss": 0.4288, "step": 811 }, { "epoch": 0.13, "grad_norm": 29.969505488786965, "learning_rate": 9.736308384744935e-06, "loss": 0.4473, "step": 812 }, { "epoch": 0.13, "grad_norm": 18.497496614624033, "learning_rate": 9.735471422776225e-06, "loss": 0.474, "step": 813 }, { "epoch": 0.13, "grad_norm": 16.927159822840608, "learning_rate": 9.734633170734568e-06, "loss": 0.5147, "step": 814 }, { "epoch": 0.13, "grad_norm": 36.87017319293176, "learning_rate": 9.733793628848323e-06, "loss": 0.4876, "step": 815 }, { "epoch": 0.13, "grad_norm": 31.494967767495705, "learning_rate": 9.732952797346205e-06, "loss": 0.4122, "step": 816 }, { "epoch": 0.13, "grad_norm": 10.752764823995623, "learning_rate": 9.73211067645728e-06, "loss": 0.4765, "step": 817 }, { "epoch": 0.13, "grad_norm": 10.528607287800185, "learning_rate": 9.731267266410967e-06, "loss": 0.5333, "step": 818 }, { "epoch": 0.13, "grad_norm": 22.194691483947587, "learning_rate": 9.73042256743703e-06, "loss": 0.4357, "step": 819 }, { "epoch": 0.13, "grad_norm": 9.869727929302632, "learning_rate": 9.729576579765594e-06, "loss": 0.4917, "step": 820 }, { "epoch": 0.13, "grad_norm": 9.926837035753058, "learning_rate": 9.728729303627124e-06, "loss": 0.4927, "step": 821 }, { "epoch": 0.13, "grad_norm": 12.706235684641722, "learning_rate": 9.727880739252444e-06, "loss": 0.5219, "step": 822 }, { "epoch": 0.13, "grad_norm": 21.21492251551645, "learning_rate": 9.727030886872724e-06, "loss": 0.4769, "step": 823 }, { "epoch": 0.13, "grad_norm": 38.43607689859831, "learning_rate": 9.726179746719492e-06, "loss": 0.4821, "step": 824 }, { "epoch": 0.13, "grad_norm": 15.941592316429077, "learning_rate": 9.725327319024618e-06, "loss": 0.4896, "step": 825 }, { "epoch": 0.13, "grad_norm": 11.47631918971038, "learning_rate": 9.724473604020327e-06, "loss": 0.4711, "step": 826 }, { "epoch": 0.13, "grad_norm": 10.842969169683593, "learning_rate": 9.723618601939197e-06, "loss": 0.4248, "step": 827 }, { "epoch": 0.13, "grad_norm": 47.55235027940581, "learning_rate": 9.722762313014152e-06, "loss": 0.4227, "step": 828 }, { "epoch": 0.13, "grad_norm": 13.836324713863256, "learning_rate": 9.721904737478468e-06, "loss": 0.4579, "step": 829 }, { "epoch": 0.13, "grad_norm": 19.415280801615253, "learning_rate": 9.721045875565774e-06, "loss": 0.5543, "step": 830 }, { "epoch": 0.13, "grad_norm": 23.7578698124494, "learning_rate": 9.720185727510047e-06, "loss": 0.5065, "step": 831 }, { "epoch": 0.13, "grad_norm": 18.216402001014256, "learning_rate": 9.719324293545615e-06, "loss": 0.5267, "step": 832 }, { "epoch": 0.13, "grad_norm": 13.880159011854113, "learning_rate": 9.718461573907158e-06, "loss": 0.4669, "step": 833 }, { "epoch": 0.13, "grad_norm": 19.51078919073798, "learning_rate": 9.717597568829702e-06, "loss": 0.4535, "step": 834 }, { "epoch": 0.13, "grad_norm": 16.18987952909316, "learning_rate": 9.716732278548628e-06, "loss": 0.4872, "step": 835 }, { "epoch": 0.13, "grad_norm": 23.482734741641927, "learning_rate": 9.715865703299663e-06, "loss": 0.4869, "step": 836 }, { "epoch": 0.13, "grad_norm": 22.22853214736185, "learning_rate": 9.714997843318887e-06, "loss": 0.5543, "step": 837 }, { "epoch": 0.14, "grad_norm": 29.02386420869639, "learning_rate": 9.714128698842728e-06, "loss": 0.4664, "step": 838 }, { "epoch": 0.14, "grad_norm": 26.823224206670098, "learning_rate": 9.713258270107966e-06, "loss": 0.5409, "step": 839 }, { "epoch": 0.14, "grad_norm": 22.070761335419267, "learning_rate": 9.71238655735173e-06, "loss": 0.4323, "step": 840 }, { "epoch": 0.14, "grad_norm": 28.72337850581609, "learning_rate": 9.711513560811498e-06, "loss": 0.5368, "step": 841 }, { "epoch": 0.14, "grad_norm": 26.71920995935433, "learning_rate": 9.710639280725097e-06, "loss": 0.4952, "step": 842 }, { "epoch": 0.14, "grad_norm": 22.411715576414686, "learning_rate": 9.709763717330709e-06, "loss": 0.4597, "step": 843 }, { "epoch": 0.14, "grad_norm": 33.22750454262311, "learning_rate": 9.708886870866856e-06, "loss": 0.4676, "step": 844 }, { "epoch": 0.14, "grad_norm": 33.57076272100097, "learning_rate": 9.708008741572422e-06, "loss": 0.4582, "step": 845 }, { "epoch": 0.14, "grad_norm": 35.80275606572998, "learning_rate": 9.707129329686626e-06, "loss": 0.498, "step": 846 }, { "epoch": 0.14, "grad_norm": 167.16560738947774, "learning_rate": 9.706248635449048e-06, "loss": 0.504, "step": 847 }, { "epoch": 0.14, "grad_norm": 16.265516129201238, "learning_rate": 9.705366659099617e-06, "loss": 0.4637, "step": 848 }, { "epoch": 0.14, "grad_norm": 71.6112277441031, "learning_rate": 9.704483400878602e-06, "loss": 0.483, "step": 849 }, { "epoch": 0.14, "grad_norm": 80.8414976295329, "learning_rate": 9.703598861026627e-06, "loss": 0.5556, "step": 850 }, { "epoch": 0.14, "grad_norm": 26.965311598824623, "learning_rate": 9.702713039784668e-06, "loss": 0.5404, "step": 851 }, { "epoch": 0.14, "grad_norm": 58.389752719763734, "learning_rate": 9.701825937394045e-06, "loss": 0.5265, "step": 852 }, { "epoch": 0.14, "grad_norm": 1.279052830232325, "learning_rate": 9.700937554096432e-06, "loss": 0.4486, "step": 853 }, { "epoch": 0.14, "grad_norm": 1.4043328176802556, "learning_rate": 9.700047890133845e-06, "loss": 0.4749, "step": 854 }, { "epoch": 0.14, "grad_norm": 19.533761976762747, "learning_rate": 9.699156945748657e-06, "loss": 0.5307, "step": 855 }, { "epoch": 0.14, "grad_norm": 59.65067546295453, "learning_rate": 9.698264721183584e-06, "loss": 0.4929, "step": 856 }, { "epoch": 0.14, "grad_norm": 1.2623776914107365, "learning_rate": 9.69737121668169e-06, "loss": 0.4804, "step": 857 }, { "epoch": 0.14, "grad_norm": 16.671407692878628, "learning_rate": 9.696476432486395e-06, "loss": 0.4658, "step": 858 }, { "epoch": 0.14, "grad_norm": 32.75790084169422, "learning_rate": 9.695580368841462e-06, "loss": 0.5437, "step": 859 }, { "epoch": 0.14, "grad_norm": 31.097498922921837, "learning_rate": 9.694683025991e-06, "loss": 0.4413, "step": 860 }, { "epoch": 0.14, "grad_norm": 28.834648231069195, "learning_rate": 9.693784404179472e-06, "loss": 0.5846, "step": 861 }, { "epoch": 0.14, "grad_norm": 38.580221165450446, "learning_rate": 9.692884503651687e-06, "loss": 0.4923, "step": 862 }, { "epoch": 0.14, "grad_norm": 24.460690915658258, "learning_rate": 9.691983324652804e-06, "loss": 0.3778, "step": 863 }, { "epoch": 0.14, "grad_norm": 16.431107726245582, "learning_rate": 9.691080867428328e-06, "loss": 0.4853, "step": 864 }, { "epoch": 0.14, "grad_norm": 86.35788132544359, "learning_rate": 9.690177132224113e-06, "loss": 0.5914, "step": 865 }, { "epoch": 0.14, "grad_norm": 64.46183182063074, "learning_rate": 9.689272119286361e-06, "loss": 0.4576, "step": 866 }, { "epoch": 0.14, "grad_norm": 17.320974530922435, "learning_rate": 9.688365828861625e-06, "loss": 0.509, "step": 867 }, { "epoch": 0.14, "grad_norm": 126.20985491765774, "learning_rate": 9.6874582611968e-06, "loss": 0.4724, "step": 868 }, { "epoch": 0.14, "grad_norm": 24.26624464865846, "learning_rate": 9.686549416539135e-06, "loss": 0.5037, "step": 869 }, { "epoch": 0.14, "grad_norm": 1.4698098921507048, "learning_rate": 9.685639295136224e-06, "loss": 0.453, "step": 870 }, { "epoch": 0.14, "grad_norm": 22.00530610603084, "learning_rate": 9.684727897236008e-06, "loss": 0.4667, "step": 871 }, { "epoch": 0.14, "grad_norm": 40.4987937675432, "learning_rate": 9.683815223086777e-06, "loss": 0.5267, "step": 872 }, { "epoch": 0.14, "grad_norm": 1.4037830578460488, "learning_rate": 9.68290127293717e-06, "loss": 0.5089, "step": 873 }, { "epoch": 0.14, "grad_norm": 24.533577589405628, "learning_rate": 9.68198604703617e-06, "loss": 0.5373, "step": 874 }, { "epoch": 0.14, "grad_norm": 1.1957333630601856, "learning_rate": 9.681069545633113e-06, "loss": 0.4467, "step": 875 }, { "epoch": 0.14, "grad_norm": 44.51402900276449, "learning_rate": 9.680151768977676e-06, "loss": 0.4955, "step": 876 }, { "epoch": 0.14, "grad_norm": 13.810054162013374, "learning_rate": 9.679232717319887e-06, "loss": 0.4937, "step": 877 }, { "epoch": 0.14, "grad_norm": 23.56182578277013, "learning_rate": 9.678312390910123e-06, "loss": 0.4777, "step": 878 }, { "epoch": 0.14, "grad_norm": 13.25269393349784, "learning_rate": 9.677390789999106e-06, "loss": 0.5698, "step": 879 }, { "epoch": 0.14, "grad_norm": 17.385739310511497, "learning_rate": 9.676467914837904e-06, "loss": 0.4292, "step": 880 }, { "epoch": 0.14, "grad_norm": 1.418399895330307, "learning_rate": 9.675543765677935e-06, "loss": 0.482, "step": 881 }, { "epoch": 0.14, "grad_norm": 51.4925584104897, "learning_rate": 9.674618342770962e-06, "loss": 0.5084, "step": 882 }, { "epoch": 0.14, "grad_norm": 18.287028909763716, "learning_rate": 9.673691646369094e-06, "loss": 0.4899, "step": 883 }, { "epoch": 0.14, "grad_norm": 17.300724558715963, "learning_rate": 9.672763676724792e-06, "loss": 0.4877, "step": 884 }, { "epoch": 0.14, "grad_norm": 12.894477117895924, "learning_rate": 9.671834434090861e-06, "loss": 0.5086, "step": 885 }, { "epoch": 0.14, "grad_norm": 16.74662240257578, "learning_rate": 9.670903918720446e-06, "loss": 0.4973, "step": 886 }, { "epoch": 0.14, "grad_norm": 17.663540427089483, "learning_rate": 9.669972130867053e-06, "loss": 0.5045, "step": 887 }, { "epoch": 0.14, "grad_norm": 12.807825064701182, "learning_rate": 9.66903907078452e-06, "loss": 0.4121, "step": 888 }, { "epoch": 0.14, "grad_norm": 13.209317194839818, "learning_rate": 9.668104738727045e-06, "loss": 0.5457, "step": 889 }, { "epoch": 0.14, "grad_norm": 12.572594206483387, "learning_rate": 9.667169134949158e-06, "loss": 0.4775, "step": 890 }, { "epoch": 0.14, "grad_norm": 12.933546620391244, "learning_rate": 9.666232259705751e-06, "loss": 0.4305, "step": 891 }, { "epoch": 0.14, "grad_norm": 12.733915139145058, "learning_rate": 9.66529411325205e-06, "loss": 0.5353, "step": 892 }, { "epoch": 0.14, "grad_norm": 53.315614659048926, "learning_rate": 9.664354695843632e-06, "loss": 0.4776, "step": 893 }, { "epoch": 0.14, "grad_norm": 21.38241119271368, "learning_rate": 9.66341400773642e-06, "loss": 0.4456, "step": 894 }, { "epoch": 0.14, "grad_norm": 18.981549837730675, "learning_rate": 9.662472049186688e-06, "loss": 0.4922, "step": 895 }, { "epoch": 0.14, "grad_norm": 15.365771384488077, "learning_rate": 9.661528820451045e-06, "loss": 0.5116, "step": 896 }, { "epoch": 0.14, "grad_norm": 17.38066872462953, "learning_rate": 9.660584321786456e-06, "loss": 0.4082, "step": 897 }, { "epoch": 0.14, "grad_norm": 24.258314947389227, "learning_rate": 9.65963855345023e-06, "loss": 0.5222, "step": 898 }, { "epoch": 0.14, "grad_norm": 18.11395784802861, "learning_rate": 9.658691515700016e-06, "loss": 0.512, "step": 899 }, { "epoch": 0.15, "grad_norm": 34.84165914930186, "learning_rate": 9.657743208793818e-06, "loss": 0.4277, "step": 900 }, { "epoch": 0.15, "grad_norm": 26.304413274962798, "learning_rate": 9.656793632989976e-06, "loss": 0.5686, "step": 901 }, { "epoch": 0.15, "grad_norm": 21.38646161739341, "learning_rate": 9.655842788547184e-06, "loss": 0.4312, "step": 902 }, { "epoch": 0.15, "grad_norm": 14.560823648173352, "learning_rate": 9.654890675724478e-06, "loss": 0.4912, "step": 903 }, { "epoch": 0.15, "grad_norm": 19.80098513475386, "learning_rate": 9.653937294781237e-06, "loss": 0.4724, "step": 904 }, { "epoch": 0.15, "grad_norm": 22.69963368981203, "learning_rate": 9.652982645977193e-06, "loss": 0.4892, "step": 905 }, { "epoch": 0.15, "grad_norm": 24.81523226142492, "learning_rate": 9.652026729572415e-06, "loss": 0.4879, "step": 906 }, { "epoch": 0.15, "grad_norm": 25.74256499384213, "learning_rate": 9.651069545827321e-06, "loss": 0.5019, "step": 907 }, { "epoch": 0.15, "grad_norm": 19.296978907218705, "learning_rate": 9.650111095002676e-06, "loss": 0.4618, "step": 908 }, { "epoch": 0.15, "grad_norm": 26.53211183276057, "learning_rate": 9.649151377359588e-06, "loss": 0.5478, "step": 909 }, { "epoch": 0.15, "grad_norm": 18.359990850346854, "learning_rate": 9.64819039315951e-06, "loss": 0.5059, "step": 910 }, { "epoch": 0.15, "grad_norm": 27.146350026207703, "learning_rate": 9.647228142664241e-06, "loss": 0.5308, "step": 911 }, { "epoch": 0.15, "grad_norm": 14.601144986459667, "learning_rate": 9.646264626135924e-06, "loss": 0.4795, "step": 912 }, { "epoch": 0.15, "grad_norm": 19.379563200214534, "learning_rate": 9.645299843837047e-06, "loss": 0.5059, "step": 913 }, { "epoch": 0.15, "grad_norm": 22.483035712335138, "learning_rate": 9.644333796030444e-06, "loss": 0.4789, "step": 914 }, { "epoch": 0.15, "grad_norm": 18.399389340651457, "learning_rate": 9.643366482979296e-06, "loss": 0.5264, "step": 915 }, { "epoch": 0.15, "grad_norm": 33.64923947907195, "learning_rate": 9.64239790494712e-06, "loss": 0.4322, "step": 916 }, { "epoch": 0.15, "grad_norm": 16.24641618243107, "learning_rate": 9.641428062197789e-06, "loss": 0.5115, "step": 917 }, { "epoch": 0.15, "grad_norm": 21.46289283536512, "learning_rate": 9.640456954995509e-06, "loss": 0.4851, "step": 918 }, { "epoch": 0.15, "grad_norm": 21.980764490788626, "learning_rate": 9.639484583604841e-06, "loss": 0.55, "step": 919 }, { "epoch": 0.15, "grad_norm": 29.88506289570659, "learning_rate": 9.638510948290684e-06, "loss": 0.4992, "step": 920 }, { "epoch": 0.15, "grad_norm": 15.293853172724683, "learning_rate": 9.637536049318284e-06, "loss": 0.5399, "step": 921 }, { "epoch": 0.15, "grad_norm": 26.935886223016613, "learning_rate": 9.63655988695323e-06, "loss": 0.5603, "step": 922 }, { "epoch": 0.15, "grad_norm": 57.60764415721829, "learning_rate": 9.635582461461455e-06, "loss": 0.5477, "step": 923 }, { "epoch": 0.15, "grad_norm": 13.905764808920038, "learning_rate": 9.634603773109235e-06, "loss": 0.4264, "step": 924 }, { "epoch": 0.15, "grad_norm": 12.149995219723257, "learning_rate": 9.633623822163196e-06, "loss": 0.5126, "step": 925 }, { "epoch": 0.15, "grad_norm": 24.346304474653277, "learning_rate": 9.6326426088903e-06, "loss": 0.5666, "step": 926 }, { "epoch": 0.15, "grad_norm": 63.50948535867169, "learning_rate": 9.631660133557858e-06, "loss": 0.4929, "step": 927 }, { "epoch": 0.15, "grad_norm": 234.82827664074006, "learning_rate": 9.630676396433524e-06, "loss": 0.4471, "step": 928 }, { "epoch": 0.15, "grad_norm": 17.407425843058338, "learning_rate": 9.629691397785294e-06, "loss": 0.5085, "step": 929 }, { "epoch": 0.15, "grad_norm": 41.441168088491075, "learning_rate": 9.628705137881509e-06, "loss": 0.445, "step": 930 }, { "epoch": 0.15, "grad_norm": 67.10488979067183, "learning_rate": 9.627717616990853e-06, "loss": 0.451, "step": 931 }, { "epoch": 0.15, "grad_norm": 18.811327948317675, "learning_rate": 9.626728835382353e-06, "loss": 0.5455, "step": 932 }, { "epoch": 0.15, "grad_norm": 16.04187581422627, "learning_rate": 9.625738793325384e-06, "loss": 0.481, "step": 933 }, { "epoch": 0.15, "grad_norm": 31.82019051690305, "learning_rate": 9.624747491089657e-06, "loss": 0.557, "step": 934 }, { "epoch": 0.15, "grad_norm": 13.427645062972102, "learning_rate": 9.623754928945233e-06, "loss": 0.4604, "step": 935 }, { "epoch": 0.15, "grad_norm": 12.145064333624996, "learning_rate": 9.622761107162511e-06, "loss": 0.3854, "step": 936 }, { "epoch": 0.15, "grad_norm": 24.39856168953827, "learning_rate": 9.621766026012236e-06, "loss": 0.4084, "step": 937 }, { "epoch": 0.15, "grad_norm": 17.287502710482805, "learning_rate": 9.620769685765497e-06, "loss": 0.5109, "step": 938 }, { "epoch": 0.15, "grad_norm": 15.869683750374959, "learning_rate": 9.619772086693721e-06, "loss": 0.4883, "step": 939 }, { "epoch": 0.15, "grad_norm": 11.026502508984617, "learning_rate": 9.618773229068685e-06, "loss": 0.5207, "step": 940 }, { "epoch": 0.15, "grad_norm": 19.94769006365636, "learning_rate": 9.617773113162505e-06, "loss": 0.4478, "step": 941 }, { "epoch": 0.15, "grad_norm": 1.6611989853004538, "learning_rate": 9.616771739247639e-06, "loss": 0.4926, "step": 942 }, { "epoch": 0.15, "grad_norm": 29.964661697397037, "learning_rate": 9.615769107596888e-06, "loss": 0.4905, "step": 943 }, { "epoch": 0.15, "grad_norm": 16.413503397210974, "learning_rate": 9.614765218483398e-06, "loss": 0.4928, "step": 944 }, { "epoch": 0.15, "grad_norm": 20.424786699059403, "learning_rate": 9.613760072180656e-06, "loss": 0.5097, "step": 945 }, { "epoch": 0.15, "grad_norm": 14.706280523892099, "learning_rate": 9.612753668962492e-06, "loss": 0.4844, "step": 946 }, { "epoch": 0.15, "grad_norm": 11.79609289935516, "learning_rate": 9.611746009103077e-06, "loss": 0.4802, "step": 947 }, { "epoch": 0.15, "grad_norm": 11.701576662868932, "learning_rate": 9.610737092876924e-06, "loss": 0.5289, "step": 948 }, { "epoch": 0.15, "grad_norm": 11.092264536235662, "learning_rate": 9.609726920558893e-06, "loss": 0.4706, "step": 949 }, { "epoch": 0.15, "grad_norm": 29.435131489183174, "learning_rate": 9.60871549242418e-06, "loss": 0.4624, "step": 950 }, { "epoch": 0.15, "grad_norm": 10.296228534653492, "learning_rate": 9.607702808748329e-06, "loss": 0.4705, "step": 951 }, { "epoch": 0.15, "grad_norm": 29.626415975793552, "learning_rate": 9.60668886980722e-06, "loss": 0.4618, "step": 952 }, { "epoch": 0.15, "grad_norm": 10.609088950874494, "learning_rate": 9.605673675877077e-06, "loss": 0.4183, "step": 953 }, { "epoch": 0.15, "grad_norm": 23.558458313023788, "learning_rate": 9.604657227234468e-06, "loss": 0.5323, "step": 954 }, { "epoch": 0.15, "grad_norm": 11.798142775792458, "learning_rate": 9.603639524156306e-06, "loss": 0.5021, "step": 955 }, { "epoch": 0.15, "grad_norm": 12.267768452020524, "learning_rate": 9.602620566919834e-06, "loss": 0.4778, "step": 956 }, { "epoch": 0.15, "grad_norm": 44.57079061569597, "learning_rate": 9.601600355802649e-06, "loss": 0.4635, "step": 957 }, { "epoch": 0.15, "grad_norm": 16.502880927289095, "learning_rate": 9.600578891082683e-06, "loss": 0.5217, "step": 958 }, { "epoch": 0.15, "grad_norm": 21.689018152019912, "learning_rate": 9.599556173038211e-06, "loss": 0.4269, "step": 959 }, { "epoch": 0.15, "grad_norm": 14.531986554194695, "learning_rate": 9.59853220194785e-06, "loss": 0.4162, "step": 960 }, { "epoch": 0.15, "grad_norm": 32.57856162932613, "learning_rate": 9.597506978090558e-06, "loss": 0.4277, "step": 961 }, { "epoch": 0.15, "grad_norm": 19.578718641830992, "learning_rate": 9.596480501745635e-06, "loss": 0.5532, "step": 962 }, { "epoch": 0.16, "grad_norm": 20.304897535333694, "learning_rate": 9.59545277319272e-06, "loss": 0.4184, "step": 963 }, { "epoch": 0.16, "grad_norm": 16.85257954647598, "learning_rate": 9.594423792711796e-06, "loss": 0.5085, "step": 964 }, { "epoch": 0.16, "grad_norm": 13.092244342434318, "learning_rate": 9.593393560583184e-06, "loss": 0.5103, "step": 965 }, { "epoch": 0.16, "grad_norm": 40.08298882340636, "learning_rate": 9.592362077087548e-06, "loss": 0.5485, "step": 966 }, { "epoch": 0.16, "grad_norm": 20.483878434898017, "learning_rate": 9.591329342505894e-06, "loss": 0.4658, "step": 967 }, { "epoch": 0.16, "grad_norm": 17.234616929066743, "learning_rate": 9.590295357119565e-06, "loss": 0.4832, "step": 968 }, { "epoch": 0.16, "grad_norm": 11.947411677599721, "learning_rate": 9.589260121210249e-06, "loss": 0.5231, "step": 969 }, { "epoch": 0.16, "grad_norm": 17.75116681043414, "learning_rate": 9.588223635059973e-06, "loss": 0.455, "step": 970 }, { "epoch": 0.16, "grad_norm": 16.700109917417805, "learning_rate": 9.587185898951104e-06, "loss": 0.4448, "step": 971 }, { "epoch": 0.16, "grad_norm": 14.172756837957563, "learning_rate": 9.58614691316635e-06, "loss": 0.4755, "step": 972 }, { "epoch": 0.16, "grad_norm": 13.130934611983747, "learning_rate": 9.585106677988756e-06, "loss": 0.4213, "step": 973 }, { "epoch": 0.16, "grad_norm": 13.93130557338124, "learning_rate": 9.584065193701717e-06, "loss": 0.3945, "step": 974 }, { "epoch": 0.16, "grad_norm": 14.306927039039563, "learning_rate": 9.583022460588956e-06, "loss": 0.484, "step": 975 }, { "epoch": 0.16, "grad_norm": 19.085808779093785, "learning_rate": 9.581978478934547e-06, "loss": 0.4268, "step": 976 }, { "epoch": 0.16, "grad_norm": 12.889267451609474, "learning_rate": 9.580933249022896e-06, "loss": 0.4535, "step": 977 }, { "epoch": 0.16, "grad_norm": 20.593346115144758, "learning_rate": 9.579886771138754e-06, "loss": 0.5139, "step": 978 }, { "epoch": 0.16, "grad_norm": 13.172950097501262, "learning_rate": 9.578839045567212e-06, "loss": 0.5372, "step": 979 }, { "epoch": 0.16, "grad_norm": 11.895476707079597, "learning_rate": 9.577790072593696e-06, "loss": 0.4928, "step": 980 }, { "epoch": 0.16, "grad_norm": 12.14817049309049, "learning_rate": 9.576739852503976e-06, "loss": 0.4348, "step": 981 }, { "epoch": 0.16, "grad_norm": 18.441425954475015, "learning_rate": 9.575688385584162e-06, "loss": 0.4806, "step": 982 }, { "epoch": 0.16, "grad_norm": 9.128038132817784, "learning_rate": 9.574635672120705e-06, "loss": 0.4751, "step": 983 }, { "epoch": 0.16, "grad_norm": 18.973098666003366, "learning_rate": 9.573581712400386e-06, "loss": 0.4333, "step": 984 }, { "epoch": 0.16, "grad_norm": 16.796792041474703, "learning_rate": 9.57252650671034e-06, "loss": 0.4671, "step": 985 }, { "epoch": 0.16, "grad_norm": 22.827199612453057, "learning_rate": 9.57147005533803e-06, "loss": 0.4823, "step": 986 }, { "epoch": 0.16, "grad_norm": 17.57179750769814, "learning_rate": 9.570412358571264e-06, "loss": 0.5187, "step": 987 }, { "epoch": 0.16, "grad_norm": 13.410651668499781, "learning_rate": 9.569353416698187e-06, "loss": 0.5501, "step": 988 }, { "epoch": 0.16, "grad_norm": 19.774738320693395, "learning_rate": 9.568293230007284e-06, "loss": 0.563, "step": 989 }, { "epoch": 0.16, "grad_norm": 12.193087345047092, "learning_rate": 9.56723179878738e-06, "loss": 0.5148, "step": 990 }, { "epoch": 0.16, "grad_norm": 13.06236893677764, "learning_rate": 9.566169123327638e-06, "loss": 0.504, "step": 991 }, { "epoch": 0.16, "grad_norm": 15.382162286616634, "learning_rate": 9.565105203917559e-06, "loss": 0.4874, "step": 992 }, { "epoch": 0.16, "grad_norm": 12.493745017555534, "learning_rate": 9.564040040846984e-06, "loss": 0.4602, "step": 993 }, { "epoch": 0.16, "grad_norm": 13.95916096139023, "learning_rate": 9.562973634406096e-06, "loss": 0.4502, "step": 994 }, { "epoch": 0.16, "grad_norm": 14.877131856974902, "learning_rate": 9.561905984885407e-06, "loss": 0.4852, "step": 995 }, { "epoch": 0.16, "grad_norm": 17.394463385448358, "learning_rate": 9.560837092575781e-06, "loss": 0.4095, "step": 996 }, { "epoch": 0.16, "grad_norm": 1.687329726858136, "learning_rate": 9.55976695776841e-06, "loss": 0.5273, "step": 997 }, { "epoch": 0.16, "grad_norm": 14.113261260705103, "learning_rate": 9.558695580754828e-06, "loss": 0.5204, "step": 998 }, { "epoch": 0.16, "grad_norm": 12.484344485846112, "learning_rate": 9.55762296182691e-06, "loss": 0.4668, "step": 999 }, { "epoch": 0.16, "grad_norm": 35.16102900153671, "learning_rate": 9.556549101276865e-06, "loss": 0.49, "step": 1000 }, { "epoch": 0.16, "grad_norm": 13.601122733500624, "learning_rate": 9.555473999397242e-06, "loss": 0.4518, "step": 1001 }, { "epoch": 0.16, "grad_norm": 19.973026517191816, "learning_rate": 9.55439765648093e-06, "loss": 0.5142, "step": 1002 }, { "epoch": 0.16, "grad_norm": 12.74323779918366, "learning_rate": 9.553320072821154e-06, "loss": 0.5706, "step": 1003 }, { "epoch": 0.16, "grad_norm": 10.528235325119805, "learning_rate": 9.552241248711478e-06, "loss": 0.4327, "step": 1004 }, { "epoch": 0.16, "grad_norm": 13.804643031667975, "learning_rate": 9.551161184445801e-06, "loss": 0.4451, "step": 1005 }, { "epoch": 0.16, "grad_norm": 21.665667807762887, "learning_rate": 9.550079880318363e-06, "loss": 0.4484, "step": 1006 }, { "epoch": 0.16, "grad_norm": 1.3581474010418773, "learning_rate": 9.548997336623744e-06, "loss": 0.4666, "step": 1007 }, { "epoch": 0.16, "grad_norm": 15.79477809188708, "learning_rate": 9.547913553656856e-06, "loss": 0.4617, "step": 1008 }, { "epoch": 0.16, "grad_norm": 14.946613656419078, "learning_rate": 9.54682853171295e-06, "loss": 0.459, "step": 1009 }, { "epoch": 0.16, "grad_norm": 31.964391597437995, "learning_rate": 9.545742271087622e-06, "loss": 0.562, "step": 1010 }, { "epoch": 0.16, "grad_norm": 15.483170294856006, "learning_rate": 9.54465477207679e-06, "loss": 0.5087, "step": 1011 }, { "epoch": 0.16, "grad_norm": 22.400086590843667, "learning_rate": 9.543566034976728e-06, "loss": 0.4621, "step": 1012 }, { "epoch": 0.16, "grad_norm": 14.139788394314213, "learning_rate": 9.542476060084035e-06, "loss": 0.498, "step": 1013 }, { "epoch": 0.16, "grad_norm": 12.555769415941954, "learning_rate": 9.541384847695645e-06, "loss": 0.4352, "step": 1014 }, { "epoch": 0.16, "grad_norm": 16.57216535857035, "learning_rate": 9.540292398108842e-06, "loss": 0.4626, "step": 1015 }, { "epoch": 0.16, "grad_norm": 13.499571704566081, "learning_rate": 9.539198711621234e-06, "loss": 0.4518, "step": 1016 }, { "epoch": 0.16, "grad_norm": 13.234664693544344, "learning_rate": 9.538103788530773e-06, "loss": 0.4402, "step": 1017 }, { "epoch": 0.16, "grad_norm": 8.545299194691657, "learning_rate": 9.537007629135745e-06, "loss": 0.4387, "step": 1018 }, { "epoch": 0.16, "grad_norm": 14.18574041740857, "learning_rate": 9.53591023373478e-06, "loss": 0.4308, "step": 1019 }, { "epoch": 0.16, "grad_norm": 14.5217921791035, "learning_rate": 9.53481160262683e-06, "loss": 0.4181, "step": 1020 }, { "epoch": 0.16, "grad_norm": 79.58655211806712, "learning_rate": 9.5337117361112e-06, "loss": 0.4888, "step": 1021 }, { "epoch": 0.16, "grad_norm": 11.888995146115528, "learning_rate": 9.532610634487519e-06, "loss": 0.4874, "step": 1022 }, { "epoch": 0.16, "grad_norm": 19.08221064076107, "learning_rate": 9.531508298055758e-06, "loss": 0.5789, "step": 1023 }, { "epoch": 0.16, "grad_norm": 7.97861608552254, "learning_rate": 9.530404727116225e-06, "loss": 0.454, "step": 1024 }, { "epoch": 0.17, "grad_norm": 1.2010297306300344, "learning_rate": 9.529299921969563e-06, "loss": 0.4417, "step": 1025 }, { "epoch": 0.17, "grad_norm": 12.80708567197285, "learning_rate": 9.528193882916753e-06, "loss": 0.4841, "step": 1026 }, { "epoch": 0.17, "grad_norm": 22.150938767211656, "learning_rate": 9.52708661025911e-06, "loss": 0.464, "step": 1027 }, { "epoch": 0.17, "grad_norm": 13.515352821430874, "learning_rate": 9.525978104298282e-06, "loss": 0.4845, "step": 1028 }, { "epoch": 0.17, "grad_norm": 7.1434025319412555, "learning_rate": 9.524868365336259e-06, "loss": 0.5679, "step": 1029 }, { "epoch": 0.17, "grad_norm": 10.126513833511396, "learning_rate": 9.523757393675365e-06, "loss": 0.4565, "step": 1030 }, { "epoch": 0.17, "grad_norm": 18.142325378882944, "learning_rate": 9.522645189618258e-06, "loss": 0.4757, "step": 1031 }, { "epoch": 0.17, "grad_norm": 17.91740615647938, "learning_rate": 9.521531753467935e-06, "loss": 0.4313, "step": 1032 }, { "epoch": 0.17, "grad_norm": 17.608969009291357, "learning_rate": 9.520417085527725e-06, "loss": 0.5528, "step": 1033 }, { "epoch": 0.17, "grad_norm": 36.056968469451256, "learning_rate": 9.519301186101297e-06, "loss": 0.4697, "step": 1034 }, { "epoch": 0.17, "grad_norm": 11.163717110117974, "learning_rate": 9.518184055492648e-06, "loss": 0.5005, "step": 1035 }, { "epoch": 0.17, "grad_norm": 20.275226388453046, "learning_rate": 9.517065694006118e-06, "loss": 0.4848, "step": 1036 }, { "epoch": 0.17, "grad_norm": 13.672904576323003, "learning_rate": 9.51594610194638e-06, "loss": 0.4894, "step": 1037 }, { "epoch": 0.17, "grad_norm": 1.4247903404095903, "learning_rate": 9.514825279618443e-06, "loss": 0.4839, "step": 1038 }, { "epoch": 0.17, "grad_norm": 21.880846935059704, "learning_rate": 9.513703227327646e-06, "loss": 0.4823, "step": 1039 }, { "epoch": 0.17, "grad_norm": 16.33271826037092, "learning_rate": 9.512579945379668e-06, "loss": 0.4896, "step": 1040 }, { "epoch": 0.17, "grad_norm": 34.12762389932343, "learning_rate": 9.511455434080525e-06, "loss": 0.4468, "step": 1041 }, { "epoch": 0.17, "grad_norm": 47.95397372075502, "learning_rate": 9.51032969373656e-06, "loss": 0.5101, "step": 1042 }, { "epoch": 0.17, "grad_norm": 13.865484661601567, "learning_rate": 9.50920272465446e-06, "loss": 0.5096, "step": 1043 }, { "epoch": 0.17, "grad_norm": 19.991221702462603, "learning_rate": 9.508074527141243e-06, "loss": 0.4924, "step": 1044 }, { "epoch": 0.17, "grad_norm": 9.593405773468348, "learning_rate": 9.506945101504256e-06, "loss": 0.4791, "step": 1045 }, { "epoch": 0.17, "grad_norm": 12.228471517393855, "learning_rate": 9.50581444805119e-06, "loss": 0.5499, "step": 1046 }, { "epoch": 0.17, "grad_norm": 1.2120997499365314, "learning_rate": 9.504682567090064e-06, "loss": 0.4672, "step": 1047 }, { "epoch": 0.17, "grad_norm": 12.897666136524794, "learning_rate": 9.503549458929236e-06, "loss": 0.4759, "step": 1048 }, { "epoch": 0.17, "grad_norm": 7.044085415950523, "learning_rate": 9.502415123877393e-06, "loss": 0.4373, "step": 1049 }, { "epoch": 0.17, "grad_norm": 16.48255701514367, "learning_rate": 9.501279562243561e-06, "loss": 0.5126, "step": 1050 }, { "epoch": 0.17, "grad_norm": 10.895304992621552, "learning_rate": 9.500142774337097e-06, "loss": 0.5163, "step": 1051 }, { "epoch": 0.17, "grad_norm": 9.148211940230544, "learning_rate": 9.499004760467694e-06, "loss": 0.4558, "step": 1052 }, { "epoch": 0.17, "grad_norm": 9.859836359554315, "learning_rate": 9.49786552094538e-06, "loss": 0.4317, "step": 1053 }, { "epoch": 0.17, "grad_norm": 9.776650772075477, "learning_rate": 9.496725056080512e-06, "loss": 0.5136, "step": 1054 }, { "epoch": 0.17, "grad_norm": 20.957341788516025, "learning_rate": 9.495583366183788e-06, "loss": 0.4864, "step": 1055 }, { "epoch": 0.17, "grad_norm": 13.940883647140035, "learning_rate": 9.494440451566232e-06, "loss": 0.4566, "step": 1056 }, { "epoch": 0.17, "grad_norm": 18.690551730173826, "learning_rate": 9.493296312539206e-06, "loss": 0.5571, "step": 1057 }, { "epoch": 0.17, "grad_norm": 9.209947090213658, "learning_rate": 9.492150949414408e-06, "loss": 0.5108, "step": 1058 }, { "epoch": 0.17, "grad_norm": 11.507618148830549, "learning_rate": 9.491004362503862e-06, "loss": 0.4612, "step": 1059 }, { "epoch": 0.17, "grad_norm": 42.213896385302455, "learning_rate": 9.489856552119934e-06, "loss": 0.5285, "step": 1060 }, { "epoch": 0.17, "grad_norm": 9.12317444130324, "learning_rate": 9.488707518575319e-06, "loss": 0.401, "step": 1061 }, { "epoch": 0.17, "grad_norm": 21.878887963752344, "learning_rate": 9.487557262183043e-06, "loss": 0.4288, "step": 1062 }, { "epoch": 0.17, "grad_norm": 17.748352108962035, "learning_rate": 9.486405783256467e-06, "loss": 0.4927, "step": 1063 }, { "epoch": 0.17, "grad_norm": 27.55719417486496, "learning_rate": 9.485253082109289e-06, "loss": 0.4222, "step": 1064 }, { "epoch": 0.17, "grad_norm": 11.435198637025392, "learning_rate": 9.484099159055534e-06, "loss": 0.5437, "step": 1065 }, { "epoch": 0.17, "grad_norm": 17.12416456475259, "learning_rate": 9.482944014409563e-06, "loss": 0.4986, "step": 1066 }, { "epoch": 0.17, "grad_norm": 16.95971556895982, "learning_rate": 9.481787648486069e-06, "loss": 0.4986, "step": 1067 }, { "epoch": 0.17, "grad_norm": 10.982524039969313, "learning_rate": 9.480630061600079e-06, "loss": 0.4951, "step": 1068 }, { "epoch": 0.17, "grad_norm": 35.488620954847754, "learning_rate": 9.47947125406695e-06, "loss": 0.4603, "step": 1069 }, { "epoch": 0.17, "grad_norm": 11.756908885412711, "learning_rate": 9.478311226202375e-06, "loss": 0.5257, "step": 1070 }, { "epoch": 0.17, "grad_norm": 15.251827708371087, "learning_rate": 9.477149978322378e-06, "loss": 0.4619, "step": 1071 }, { "epoch": 0.17, "grad_norm": 13.01838895936322, "learning_rate": 9.475987510743311e-06, "loss": 0.5302, "step": 1072 }, { "epoch": 0.17, "grad_norm": 17.798763378924722, "learning_rate": 9.474823823781866e-06, "loss": 0.5385, "step": 1073 }, { "epoch": 0.17, "grad_norm": 14.464951985135798, "learning_rate": 9.473658917755063e-06, "loss": 0.4508, "step": 1074 }, { "epoch": 0.17, "grad_norm": 9.965477806289659, "learning_rate": 9.472492792980252e-06, "loss": 0.4697, "step": 1075 }, { "epoch": 0.17, "grad_norm": 11.487399091539281, "learning_rate": 9.471325449775119e-06, "loss": 0.4747, "step": 1076 }, { "epoch": 0.17, "grad_norm": 11.878887877521795, "learning_rate": 9.470156888457682e-06, "loss": 0.4411, "step": 1077 }, { "epoch": 0.17, "grad_norm": 11.2156365414979, "learning_rate": 9.468987109346288e-06, "loss": 0.4737, "step": 1078 }, { "epoch": 0.17, "grad_norm": 10.988792573984448, "learning_rate": 9.467816112759616e-06, "loss": 0.5397, "step": 1079 }, { "epoch": 0.17, "grad_norm": 16.641094947310126, "learning_rate": 9.46664389901668e-06, "loss": 0.4609, "step": 1080 }, { "epoch": 0.17, "grad_norm": 23.815410443281895, "learning_rate": 9.465470468436822e-06, "loss": 0.4861, "step": 1081 }, { "epoch": 0.17, "grad_norm": 15.637081850511155, "learning_rate": 9.464295821339715e-06, "loss": 0.5018, "step": 1082 }, { "epoch": 0.17, "grad_norm": 12.22993165834608, "learning_rate": 9.46311995804537e-06, "loss": 0.4241, "step": 1083 }, { "epoch": 0.17, "grad_norm": 15.578914863482261, "learning_rate": 9.46194287887412e-06, "loss": 0.4695, "step": 1084 }, { "epoch": 0.17, "grad_norm": 7.672083217835114, "learning_rate": 9.460764584146635e-06, "loss": 0.4722, "step": 1085 }, { "epoch": 0.17, "grad_norm": 15.475546803637759, "learning_rate": 9.459585074183919e-06, "loss": 0.4923, "step": 1086 }, { "epoch": 0.18, "grad_norm": 23.204796868705444, "learning_rate": 9.458404349307295e-06, "loss": 0.491, "step": 1087 }, { "epoch": 0.18, "grad_norm": 1.4269469763334337, "learning_rate": 9.457222409838433e-06, "loss": 0.499, "step": 1088 }, { "epoch": 0.18, "grad_norm": 11.623904209983921, "learning_rate": 9.456039256099321e-06, "loss": 0.4336, "step": 1089 }, { "epoch": 0.18, "grad_norm": 11.876087409419991, "learning_rate": 9.454854888412285e-06, "loss": 0.507, "step": 1090 }, { "epoch": 0.18, "grad_norm": 9.398020604301625, "learning_rate": 9.453669307099978e-06, "loss": 0.5097, "step": 1091 }, { "epoch": 0.18, "grad_norm": 11.654028564640374, "learning_rate": 9.452482512485386e-06, "loss": 0.4948, "step": 1092 }, { "epoch": 0.18, "grad_norm": 16.63963642095185, "learning_rate": 9.451294504891824e-06, "loss": 0.4649, "step": 1093 }, { "epoch": 0.18, "grad_norm": 16.693549034747875, "learning_rate": 9.450105284642938e-06, "loss": 0.5117, "step": 1094 }, { "epoch": 0.18, "grad_norm": 1.1640578879525318, "learning_rate": 9.448914852062705e-06, "loss": 0.4771, "step": 1095 }, { "epoch": 0.18, "grad_norm": 12.351754912346847, "learning_rate": 9.447723207475432e-06, "loss": 0.4351, "step": 1096 }, { "epoch": 0.18, "grad_norm": 18.223535066038824, "learning_rate": 9.446530351205754e-06, "loss": 0.5054, "step": 1097 }, { "epoch": 0.18, "grad_norm": 18.638419168632442, "learning_rate": 9.445336283578639e-06, "loss": 0.5412, "step": 1098 }, { "epoch": 0.18, "grad_norm": 23.48201034905293, "learning_rate": 9.444141004919385e-06, "loss": 0.5002, "step": 1099 }, { "epoch": 0.18, "grad_norm": 24.948653576937204, "learning_rate": 9.442944515553616e-06, "loss": 0.4789, "step": 1100 }, { "epoch": 0.18, "grad_norm": 13.252203421307753, "learning_rate": 9.441746815807292e-06, "loss": 0.4985, "step": 1101 }, { "epoch": 0.18, "grad_norm": 22.332822954586746, "learning_rate": 9.440547906006697e-06, "loss": 0.5795, "step": 1102 }, { "epoch": 0.18, "grad_norm": 12.880891534138579, "learning_rate": 9.439347786478448e-06, "loss": 0.5138, "step": 1103 }, { "epoch": 0.18, "grad_norm": 84.51759881771268, "learning_rate": 9.438146457549491e-06, "loss": 0.4963, "step": 1104 }, { "epoch": 0.18, "grad_norm": 11.341451261441499, "learning_rate": 9.436943919547101e-06, "loss": 0.4957, "step": 1105 }, { "epoch": 0.18, "grad_norm": 12.041994111552437, "learning_rate": 9.435740172798881e-06, "loss": 0.4649, "step": 1106 }, { "epoch": 0.18, "grad_norm": 17.791394882417716, "learning_rate": 9.434535217632768e-06, "loss": 0.4981, "step": 1107 }, { "epoch": 0.18, "grad_norm": 11.120870935680138, "learning_rate": 9.433329054377023e-06, "loss": 0.4958, "step": 1108 }, { "epoch": 0.18, "grad_norm": 15.132556810973604, "learning_rate": 9.432121683360235e-06, "loss": 0.5198, "step": 1109 }, { "epoch": 0.18, "grad_norm": 8.80646202797608, "learning_rate": 9.430913104911331e-06, "loss": 0.4607, "step": 1110 }, { "epoch": 0.18, "grad_norm": 11.030925059144677, "learning_rate": 9.429703319359558e-06, "loss": 0.4939, "step": 1111 }, { "epoch": 0.18, "grad_norm": 50.65573650694543, "learning_rate": 9.428492327034496e-06, "loss": 0.4355, "step": 1112 }, { "epoch": 0.18, "grad_norm": 12.188372492125053, "learning_rate": 9.427280128266049e-06, "loss": 0.4479, "step": 1113 }, { "epoch": 0.18, "grad_norm": 18.90073627757158, "learning_rate": 9.42606672338446e-06, "loss": 0.5341, "step": 1114 }, { "epoch": 0.18, "grad_norm": 18.311445664301328, "learning_rate": 9.424852112720289e-06, "loss": 0.4275, "step": 1115 }, { "epoch": 0.18, "grad_norm": 23.94973155526467, "learning_rate": 9.42363629660443e-06, "loss": 0.4815, "step": 1116 }, { "epoch": 0.18, "grad_norm": 45.12101300161726, "learning_rate": 9.422419275368107e-06, "loss": 0.4646, "step": 1117 }, { "epoch": 0.18, "grad_norm": 33.981706193205305, "learning_rate": 9.421201049342867e-06, "loss": 0.4381, "step": 1118 }, { "epoch": 0.18, "grad_norm": 29.862699745094982, "learning_rate": 9.41998161886059e-06, "loss": 0.4864, "step": 1119 }, { "epoch": 0.18, "grad_norm": 31.87468476084896, "learning_rate": 9.418760984253485e-06, "loss": 0.4884, "step": 1120 }, { "epoch": 0.18, "grad_norm": 18.145408793170006, "learning_rate": 9.417539145854082e-06, "loss": 0.497, "step": 1121 }, { "epoch": 0.18, "grad_norm": 33.32486798994945, "learning_rate": 9.416316103995246e-06, "loss": 0.4717, "step": 1122 }, { "epoch": 0.18, "grad_norm": 26.18179504580407, "learning_rate": 9.415091859010165e-06, "loss": 0.4646, "step": 1123 }, { "epoch": 0.18, "grad_norm": 21.537429592767765, "learning_rate": 9.413866411232361e-06, "loss": 0.4961, "step": 1124 }, { "epoch": 0.18, "grad_norm": 14.387111792037299, "learning_rate": 9.41263976099568e-06, "loss": 0.4726, "step": 1125 }, { "epoch": 0.18, "grad_norm": 22.69719255173724, "learning_rate": 9.411411908634289e-06, "loss": 0.4469, "step": 1126 }, { "epoch": 0.18, "grad_norm": 17.25897190677966, "learning_rate": 9.410182854482693e-06, "loss": 0.4663, "step": 1127 }, { "epoch": 0.18, "grad_norm": 1.3087366906080624, "learning_rate": 9.40895259887572e-06, "loss": 0.4837, "step": 1128 }, { "epoch": 0.18, "grad_norm": 32.081016827106815, "learning_rate": 9.407721142148528e-06, "loss": 0.4668, "step": 1129 }, { "epoch": 0.18, "grad_norm": 14.396009357773716, "learning_rate": 9.406488484636595e-06, "loss": 0.5448, "step": 1130 }, { "epoch": 0.18, "grad_norm": 30.392799335214516, "learning_rate": 9.405254626675736e-06, "loss": 0.5459, "step": 1131 }, { "epoch": 0.18, "grad_norm": 11.52677066210452, "learning_rate": 9.404019568602081e-06, "loss": 0.4165, "step": 1132 }, { "epoch": 0.18, "grad_norm": 17.485077966435156, "learning_rate": 9.402783310752102e-06, "loss": 0.4647, "step": 1133 }, { "epoch": 0.18, "grad_norm": 26.14712993730686, "learning_rate": 9.401545853462584e-06, "loss": 0.4324, "step": 1134 }, { "epoch": 0.18, "grad_norm": 206.99213813279073, "learning_rate": 9.400307197070646e-06, "loss": 0.4845, "step": 1135 }, { "epoch": 0.18, "grad_norm": 16.172639217316522, "learning_rate": 9.399067341913732e-06, "loss": 0.4469, "step": 1136 }, { "epoch": 0.18, "grad_norm": 20.885640090923513, "learning_rate": 9.397826288329615e-06, "loss": 0.4737, "step": 1137 }, { "epoch": 0.18, "grad_norm": 1.240214926222737, "learning_rate": 9.396584036656388e-06, "loss": 0.4828, "step": 1138 }, { "epoch": 0.18, "grad_norm": 19.212690911137276, "learning_rate": 9.395340587232478e-06, "loss": 0.5325, "step": 1139 }, { "epoch": 0.18, "grad_norm": 19.765604811065216, "learning_rate": 9.394095940396632e-06, "loss": 0.5173, "step": 1140 }, { "epoch": 0.18, "grad_norm": 88.99170683412197, "learning_rate": 9.39285009648793e-06, "loss": 0.3996, "step": 1141 }, { "epoch": 0.18, "grad_norm": 17.9497957789715, "learning_rate": 9.391603055845771e-06, "loss": 0.5323, "step": 1142 }, { "epoch": 0.18, "grad_norm": 10.257355218001061, "learning_rate": 9.390354818809886e-06, "loss": 0.5556, "step": 1143 }, { "epoch": 0.18, "grad_norm": 12.07665114170986, "learning_rate": 9.389105385720325e-06, "loss": 0.4289, "step": 1144 }, { "epoch": 0.18, "grad_norm": 21.817882344400445, "learning_rate": 9.387854756917472e-06, "loss": 0.4549, "step": 1145 }, { "epoch": 0.18, "grad_norm": 16.62139903255941, "learning_rate": 9.386602932742032e-06, "loss": 0.4868, "step": 1146 }, { "epoch": 0.18, "grad_norm": 10.06137492986953, "learning_rate": 9.385349913535034e-06, "loss": 0.5043, "step": 1147 }, { "epoch": 0.18, "grad_norm": 13.864672919800334, "learning_rate": 9.384095699637839e-06, "loss": 0.4445, "step": 1148 }, { "epoch": 0.19, "grad_norm": 1.253327053047788, "learning_rate": 9.382840291392124e-06, "loss": 0.4224, "step": 1149 }, { "epoch": 0.19, "grad_norm": 9.046173474935681, "learning_rate": 9.3815836891399e-06, "loss": 0.4754, "step": 1150 }, { "epoch": 0.19, "grad_norm": 10.489242303268826, "learning_rate": 9.380325893223503e-06, "loss": 0.5442, "step": 1151 }, { "epoch": 0.19, "grad_norm": 1.0869260441775574, "learning_rate": 9.379066903985588e-06, "loss": 0.4446, "step": 1152 }, { "epoch": 0.19, "grad_norm": 19.592258237974765, "learning_rate": 9.377806721769138e-06, "loss": 0.4478, "step": 1153 }, { "epoch": 0.19, "grad_norm": 9.748546834920383, "learning_rate": 9.37654534691746e-06, "loss": 0.4895, "step": 1154 }, { "epoch": 0.19, "grad_norm": 11.62312580828369, "learning_rate": 9.37528277977419e-06, "loss": 0.4243, "step": 1155 }, { "epoch": 0.19, "grad_norm": 35.12877145727759, "learning_rate": 9.374019020683287e-06, "loss": 0.4845, "step": 1156 }, { "epoch": 0.19, "grad_norm": 22.879201605154623, "learning_rate": 9.372754069989032e-06, "loss": 0.4797, "step": 1157 }, { "epoch": 0.19, "grad_norm": 10.476809555230036, "learning_rate": 9.371487928036032e-06, "loss": 0.5033, "step": 1158 }, { "epoch": 0.19, "grad_norm": 10.057681349307448, "learning_rate": 9.37022059516922e-06, "loss": 0.4588, "step": 1159 }, { "epoch": 0.19, "grad_norm": 10.16209405167654, "learning_rate": 9.368952071733851e-06, "loss": 0.4869, "step": 1160 }, { "epoch": 0.19, "grad_norm": 8.408254828463715, "learning_rate": 9.367682358075509e-06, "loss": 0.5148, "step": 1161 }, { "epoch": 0.19, "grad_norm": 7.635243574782355, "learning_rate": 9.366411454540095e-06, "loss": 0.5032, "step": 1162 }, { "epoch": 0.19, "grad_norm": 11.406930155882574, "learning_rate": 9.36513936147384e-06, "loss": 0.4921, "step": 1163 }, { "epoch": 0.19, "grad_norm": 17.416469116732234, "learning_rate": 9.363866079223299e-06, "loss": 0.4587, "step": 1164 }, { "epoch": 0.19, "grad_norm": 10.425267658806005, "learning_rate": 9.362591608135346e-06, "loss": 0.5475, "step": 1165 }, { "epoch": 0.19, "grad_norm": 10.431501851374636, "learning_rate": 9.361315948557182e-06, "loss": 0.4846, "step": 1166 }, { "epoch": 0.19, "grad_norm": 17.832089371763782, "learning_rate": 9.360039100836335e-06, "loss": 0.508, "step": 1167 }, { "epoch": 0.19, "grad_norm": 13.133379324501101, "learning_rate": 9.358761065320653e-06, "loss": 0.4656, "step": 1168 }, { "epoch": 0.19, "grad_norm": 9.798702732647197, "learning_rate": 9.357481842358302e-06, "loss": 0.566, "step": 1169 }, { "epoch": 0.19, "grad_norm": 14.117209200933333, "learning_rate": 9.356201432297788e-06, "loss": 0.4778, "step": 1170 }, { "epoch": 0.19, "grad_norm": 1.311854056583549, "learning_rate": 9.354919835487922e-06, "loss": 0.4823, "step": 1171 }, { "epoch": 0.19, "grad_norm": 10.52169299827595, "learning_rate": 9.353637052277848e-06, "loss": 0.4855, "step": 1172 }, { "epoch": 0.19, "grad_norm": 14.440455087182773, "learning_rate": 9.352353083017034e-06, "loss": 0.4369, "step": 1173 }, { "epoch": 0.19, "grad_norm": 17.625658108640863, "learning_rate": 9.351067928055265e-06, "loss": 0.5355, "step": 1174 }, { "epoch": 0.19, "grad_norm": 19.054361595410825, "learning_rate": 9.349781587742655e-06, "loss": 0.4444, "step": 1175 }, { "epoch": 0.19, "grad_norm": 50.298468256397996, "learning_rate": 9.348494062429639e-06, "loss": 0.493, "step": 1176 }, { "epoch": 0.19, "grad_norm": 22.228128249307108, "learning_rate": 9.347205352466972e-06, "loss": 0.4421, "step": 1177 }, { "epoch": 0.19, "grad_norm": 23.71583495647995, "learning_rate": 9.345915458205736e-06, "loss": 0.4444, "step": 1178 }, { "epoch": 0.19, "grad_norm": 19.1439750637546, "learning_rate": 9.344624379997335e-06, "loss": 0.4557, "step": 1179 }, { "epoch": 0.19, "grad_norm": 13.819056776149889, "learning_rate": 9.343332118193492e-06, "loss": 0.4726, "step": 1180 }, { "epoch": 0.19, "grad_norm": 17.508459236707278, "learning_rate": 9.342038673146253e-06, "loss": 0.5048, "step": 1181 }, { "epoch": 0.19, "grad_norm": 119.09385352197333, "learning_rate": 9.340744045207992e-06, "loss": 0.4849, "step": 1182 }, { "epoch": 0.19, "grad_norm": 16.957858104714944, "learning_rate": 9.3394482347314e-06, "loss": 0.4315, "step": 1183 }, { "epoch": 0.19, "grad_norm": 9.342867913210528, "learning_rate": 9.338151242069491e-06, "loss": 0.4139, "step": 1184 }, { "epoch": 0.19, "grad_norm": 16.215044926373327, "learning_rate": 9.336853067575603e-06, "loss": 0.4756, "step": 1185 }, { "epoch": 0.19, "grad_norm": 53.467968004359186, "learning_rate": 9.335553711603393e-06, "loss": 0.4518, "step": 1186 }, { "epoch": 0.19, "grad_norm": 18.85378554402728, "learning_rate": 9.33425317450684e-06, "loss": 0.4421, "step": 1187 }, { "epoch": 0.19, "grad_norm": 35.407259253720966, "learning_rate": 9.332951456640249e-06, "loss": 0.4914, "step": 1188 }, { "epoch": 0.19, "grad_norm": 19.981261755815293, "learning_rate": 9.331648558358244e-06, "loss": 0.4497, "step": 1189 }, { "epoch": 0.19, "grad_norm": 20.609169845081027, "learning_rate": 9.330344480015767e-06, "loss": 0.4358, "step": 1190 }, { "epoch": 0.19, "grad_norm": 99.9583634238747, "learning_rate": 9.329039221968087e-06, "loss": 0.4899, "step": 1191 }, { "epoch": 0.19, "grad_norm": 1.451622322687537, "learning_rate": 9.327732784570794e-06, "loss": 0.4941, "step": 1192 }, { "epoch": 0.19, "grad_norm": 21.35559268182892, "learning_rate": 9.326425168179795e-06, "loss": 0.4657, "step": 1193 }, { "epoch": 0.19, "grad_norm": 13.504441820715579, "learning_rate": 9.325116373151322e-06, "loss": 0.5195, "step": 1194 }, { "epoch": 0.19, "grad_norm": 20.2715934180167, "learning_rate": 9.323806399841927e-06, "loss": 0.4898, "step": 1195 }, { "epoch": 0.19, "grad_norm": 15.877236035520705, "learning_rate": 9.32249524860848e-06, "loss": 0.4756, "step": 1196 }, { "epoch": 0.19, "grad_norm": 13.665288639285857, "learning_rate": 9.321182919808179e-06, "loss": 0.4715, "step": 1197 }, { "epoch": 0.19, "grad_norm": 22.36898029781077, "learning_rate": 9.319869413798535e-06, "loss": 0.4858, "step": 1198 }, { "epoch": 0.19, "grad_norm": 22.996504095711675, "learning_rate": 9.318554730937385e-06, "loss": 0.4565, "step": 1199 }, { "epoch": 0.19, "grad_norm": 1.3377875501110854, "learning_rate": 9.317238871582886e-06, "loss": 0.4319, "step": 1200 }, { "epoch": 0.19, "grad_norm": 9.255664931655975, "learning_rate": 9.31592183609351e-06, "loss": 0.41, "step": 1201 }, { "epoch": 0.19, "grad_norm": 13.201775552399635, "learning_rate": 9.31460362482806e-06, "loss": 0.4497, "step": 1202 }, { "epoch": 0.19, "grad_norm": 23.992040097790134, "learning_rate": 9.313284238145648e-06, "loss": 0.4768, "step": 1203 }, { "epoch": 0.19, "grad_norm": 38.7723169133457, "learning_rate": 9.311963676405716e-06, "loss": 0.4836, "step": 1204 }, { "epoch": 0.19, "grad_norm": 19.343054276542055, "learning_rate": 9.310641939968016e-06, "loss": 0.5546, "step": 1205 }, { "epoch": 0.19, "grad_norm": 10.365010498680524, "learning_rate": 9.309319029192627e-06, "loss": 0.5018, "step": 1206 }, { "epoch": 0.19, "grad_norm": 13.967270033279258, "learning_rate": 9.307994944439949e-06, "loss": 0.5239, "step": 1207 }, { "epoch": 0.19, "grad_norm": 11.218295828753742, "learning_rate": 9.306669686070697e-06, "loss": 0.4324, "step": 1208 }, { "epoch": 0.19, "grad_norm": 13.54751828727378, "learning_rate": 9.30534325444591e-06, "loss": 0.5218, "step": 1209 }, { "epoch": 0.19, "grad_norm": 1.1739378918121595, "learning_rate": 9.304015649926941e-06, "loss": 0.4509, "step": 1210 }, { "epoch": 0.2, "grad_norm": 8.13142482689222, "learning_rate": 9.30268687287547e-06, "loss": 0.467, "step": 1211 }, { "epoch": 0.2, "grad_norm": 11.397876469421153, "learning_rate": 9.30135692365349e-06, "loss": 0.4784, "step": 1212 }, { "epoch": 0.2, "grad_norm": 9.158325384787432, "learning_rate": 9.300025802623316e-06, "loss": 0.481, "step": 1213 }, { "epoch": 0.2, "grad_norm": 29.734777233129346, "learning_rate": 9.29869351014758e-06, "loss": 0.4222, "step": 1214 }, { "epoch": 0.2, "grad_norm": 1.2468332697212356, "learning_rate": 9.29736004658924e-06, "loss": 0.4808, "step": 1215 }, { "epoch": 0.2, "grad_norm": 15.88352860449723, "learning_rate": 9.296025412311567e-06, "loss": 0.4934, "step": 1216 }, { "epoch": 0.2, "grad_norm": 16.768370728471233, "learning_rate": 9.294689607678151e-06, "loss": 0.5011, "step": 1217 }, { "epoch": 0.2, "grad_norm": 12.063144961541461, "learning_rate": 9.293352633052901e-06, "loss": 0.4005, "step": 1218 }, { "epoch": 0.2, "grad_norm": 10.141293802230301, "learning_rate": 9.292014488800046e-06, "loss": 0.4002, "step": 1219 }, { "epoch": 0.2, "grad_norm": 1.3431301998195286, "learning_rate": 9.290675175284135e-06, "loss": 0.4932, "step": 1220 }, { "epoch": 0.2, "grad_norm": 20.47786188148704, "learning_rate": 9.289334692870033e-06, "loss": 0.5413, "step": 1221 }, { "epoch": 0.2, "grad_norm": 14.956801794450936, "learning_rate": 9.287993041922924e-06, "loss": 0.5173, "step": 1222 }, { "epoch": 0.2, "grad_norm": 8.611344499766316, "learning_rate": 9.286650222808314e-06, "loss": 0.5498, "step": 1223 }, { "epoch": 0.2, "grad_norm": 21.48874694353932, "learning_rate": 9.285306235892017e-06, "loss": 0.4429, "step": 1224 }, { "epoch": 0.2, "grad_norm": 14.468222040691122, "learning_rate": 9.283961081540178e-06, "loss": 0.4165, "step": 1225 }, { "epoch": 0.2, "grad_norm": 11.28054113136257, "learning_rate": 9.282614760119252e-06, "loss": 0.4845, "step": 1226 }, { "epoch": 0.2, "grad_norm": 25.150239852820814, "learning_rate": 9.281267271996015e-06, "loss": 0.553, "step": 1227 }, { "epoch": 0.2, "grad_norm": 13.20899492909649, "learning_rate": 9.27991861753756e-06, "loss": 0.4772, "step": 1228 }, { "epoch": 0.2, "grad_norm": 8.79608666825319, "learning_rate": 9.278568797111294e-06, "loss": 0.472, "step": 1229 }, { "epoch": 0.2, "grad_norm": 11.215769122133224, "learning_rate": 9.277217811084952e-06, "loss": 0.4343, "step": 1230 }, { "epoch": 0.2, "grad_norm": 20.16144853859444, "learning_rate": 9.275865659826573e-06, "loss": 0.5042, "step": 1231 }, { "epoch": 0.2, "grad_norm": 7.989537269658579, "learning_rate": 9.274512343704525e-06, "loss": 0.4415, "step": 1232 }, { "epoch": 0.2, "grad_norm": 15.480275734066542, "learning_rate": 9.273157863087486e-06, "loss": 0.4042, "step": 1233 }, { "epoch": 0.2, "grad_norm": 8.801990898470137, "learning_rate": 9.271802218344455e-06, "loss": 0.4928, "step": 1234 }, { "epoch": 0.2, "grad_norm": 8.223933352370969, "learning_rate": 9.270445409844749e-06, "loss": 0.4635, "step": 1235 }, { "epoch": 0.2, "grad_norm": 11.055781175774458, "learning_rate": 9.269087437957996e-06, "loss": 0.4356, "step": 1236 }, { "epoch": 0.2, "grad_norm": 26.43649069801985, "learning_rate": 9.267728303054146e-06, "loss": 0.462, "step": 1237 }, { "epoch": 0.2, "grad_norm": 20.609353355665974, "learning_rate": 9.266368005503465e-06, "loss": 0.4623, "step": 1238 }, { "epoch": 0.2, "grad_norm": 12.13612454486031, "learning_rate": 9.265006545676538e-06, "loss": 0.4657, "step": 1239 }, { "epoch": 0.2, "grad_norm": 10.603635138028705, "learning_rate": 9.263643923944262e-06, "loss": 0.4725, "step": 1240 }, { "epoch": 0.2, "grad_norm": 13.04866956416687, "learning_rate": 9.262280140677852e-06, "loss": 0.548, "step": 1241 }, { "epoch": 0.2, "grad_norm": 7.216422309171557, "learning_rate": 9.260915196248842e-06, "loss": 0.4126, "step": 1242 }, { "epoch": 0.2, "grad_norm": 1.2481489264021828, "learning_rate": 9.259549091029082e-06, "loss": 0.4892, "step": 1243 }, { "epoch": 0.2, "grad_norm": 52.20123113474192, "learning_rate": 9.258181825390732e-06, "loss": 0.4764, "step": 1244 }, { "epoch": 0.2, "grad_norm": 11.05519660226427, "learning_rate": 9.256813399706276e-06, "loss": 0.4485, "step": 1245 }, { "epoch": 0.2, "grad_norm": 29.124364956214794, "learning_rate": 9.25544381434851e-06, "loss": 0.4824, "step": 1246 }, { "epoch": 0.2, "grad_norm": 11.517095297925277, "learning_rate": 9.254073069690545e-06, "loss": 0.4474, "step": 1247 }, { "epoch": 0.2, "grad_norm": 10.852439500211396, "learning_rate": 9.252701166105815e-06, "loss": 0.4288, "step": 1248 }, { "epoch": 0.2, "grad_norm": 8.538429558245012, "learning_rate": 9.251328103968059e-06, "loss": 0.5233, "step": 1249 }, { "epoch": 0.2, "grad_norm": 7.702156735349676, "learning_rate": 9.24995388365134e-06, "loss": 0.4236, "step": 1250 }, { "epoch": 0.2, "grad_norm": 11.099613402994994, "learning_rate": 9.24857850553003e-06, "loss": 0.497, "step": 1251 }, { "epoch": 0.2, "grad_norm": 13.707701686477801, "learning_rate": 9.247201969978825e-06, "loss": 0.4985, "step": 1252 }, { "epoch": 0.2, "grad_norm": 8.403183216867559, "learning_rate": 9.245824277372729e-06, "loss": 0.4521, "step": 1253 }, { "epoch": 0.2, "grad_norm": 11.367205305396112, "learning_rate": 9.24444542808706e-06, "loss": 0.5147, "step": 1254 }, { "epoch": 0.2, "grad_norm": 10.27569694009789, "learning_rate": 9.24306542249746e-06, "loss": 0.5031, "step": 1255 }, { "epoch": 0.2, "grad_norm": 16.95109154151481, "learning_rate": 9.241684260979877e-06, "loss": 0.5319, "step": 1256 }, { "epoch": 0.2, "grad_norm": 8.806869077294007, "learning_rate": 9.240301943910578e-06, "loss": 0.4816, "step": 1257 }, { "epoch": 0.2, "grad_norm": 79.63590440279026, "learning_rate": 9.238918471666148e-06, "loss": 0.4712, "step": 1258 }, { "epoch": 0.2, "grad_norm": 14.741811734696071, "learning_rate": 9.237533844623477e-06, "loss": 0.4267, "step": 1259 }, { "epoch": 0.2, "grad_norm": 20.51911950479914, "learning_rate": 9.236148063159778e-06, "loss": 0.5143, "step": 1260 }, { "epoch": 0.2, "grad_norm": 18.09495880257626, "learning_rate": 9.234761127652578e-06, "loss": 0.3864, "step": 1261 }, { "epoch": 0.2, "grad_norm": 18.274083317285942, "learning_rate": 9.233373038479716e-06, "loss": 0.5526, "step": 1262 }, { "epoch": 0.2, "grad_norm": 17.36187993715572, "learning_rate": 9.231983796019342e-06, "loss": 0.4587, "step": 1263 }, { "epoch": 0.2, "grad_norm": 14.530279273544366, "learning_rate": 9.230593400649928e-06, "loss": 0.463, "step": 1264 }, { "epoch": 0.2, "grad_norm": 8.589281296572754, "learning_rate": 9.229201852750254e-06, "loss": 0.4684, "step": 1265 }, { "epoch": 0.2, "grad_norm": 15.277295313947509, "learning_rate": 9.227809152699418e-06, "loss": 0.5006, "step": 1266 }, { "epoch": 0.2, "grad_norm": 23.395107243578348, "learning_rate": 9.226415300876828e-06, "loss": 0.5185, "step": 1267 }, { "epoch": 0.2, "grad_norm": 12.391051482791196, "learning_rate": 9.225020297662208e-06, "loss": 0.5072, "step": 1268 }, { "epoch": 0.2, "grad_norm": 1.149735535469054, "learning_rate": 9.223624143435595e-06, "loss": 0.4678, "step": 1269 }, { "epoch": 0.2, "grad_norm": 22.73384019925979, "learning_rate": 9.222226838577342e-06, "loss": 0.4558, "step": 1270 }, { "epoch": 0.2, "grad_norm": 11.62899862665488, "learning_rate": 9.22082838346811e-06, "loss": 0.5048, "step": 1271 }, { "epoch": 0.2, "grad_norm": 57.15693829281563, "learning_rate": 9.21942877848888e-06, "loss": 0.46, "step": 1272 }, { "epoch": 0.21, "grad_norm": 8.628456781608687, "learning_rate": 9.21802802402094e-06, "loss": 0.4475, "step": 1273 }, { "epoch": 0.21, "grad_norm": 9.087844092420713, "learning_rate": 9.216626120445897e-06, "loss": 0.4458, "step": 1274 }, { "epoch": 0.21, "grad_norm": 8.491669651180336, "learning_rate": 9.215223068145666e-06, "loss": 0.5246, "step": 1275 }, { "epoch": 0.21, "grad_norm": 6.977727515705651, "learning_rate": 9.213818867502478e-06, "loss": 0.4452, "step": 1276 }, { "epoch": 0.21, "grad_norm": 14.24986174431144, "learning_rate": 9.212413518898878e-06, "loss": 0.4236, "step": 1277 }, { "epoch": 0.21, "grad_norm": 11.413776483419992, "learning_rate": 9.211007022717716e-06, "loss": 0.4034, "step": 1278 }, { "epoch": 0.21, "grad_norm": 8.385466089490842, "learning_rate": 9.209599379342165e-06, "loss": 0.4427, "step": 1279 }, { "epoch": 0.21, "grad_norm": 10.940816814118238, "learning_rate": 9.208190589155705e-06, "loss": 0.5313, "step": 1280 }, { "epoch": 0.21, "grad_norm": 11.958495319276546, "learning_rate": 9.206780652542131e-06, "loss": 0.4437, "step": 1281 }, { "epoch": 0.21, "grad_norm": 26.791168871675765, "learning_rate": 9.205369569885544e-06, "loss": 0.5029, "step": 1282 }, { "epoch": 0.21, "grad_norm": 14.324589467082395, "learning_rate": 9.203957341570367e-06, "loss": 0.4548, "step": 1283 }, { "epoch": 0.21, "grad_norm": 1.410091724695577, "learning_rate": 9.202543967981325e-06, "loss": 0.4778, "step": 1284 }, { "epoch": 0.21, "grad_norm": 8.702845859851132, "learning_rate": 9.201129449503463e-06, "loss": 0.4368, "step": 1285 }, { "epoch": 0.21, "grad_norm": 15.623418610245421, "learning_rate": 9.199713786522135e-06, "loss": 0.4786, "step": 1286 }, { "epoch": 0.21, "grad_norm": 11.056804684065105, "learning_rate": 9.198296979423006e-06, "loss": 0.4497, "step": 1287 }, { "epoch": 0.21, "grad_norm": 24.75961125509385, "learning_rate": 9.196879028592052e-06, "loss": 0.3898, "step": 1288 }, { "epoch": 0.21, "grad_norm": 26.85212124355255, "learning_rate": 9.195459934415563e-06, "loss": 0.4554, "step": 1289 }, { "epoch": 0.21, "grad_norm": 17.310065012772057, "learning_rate": 9.19403969728014e-06, "loss": 0.4364, "step": 1290 }, { "epoch": 0.21, "grad_norm": 18.515392506319202, "learning_rate": 9.192618317572693e-06, "loss": 0.4527, "step": 1291 }, { "epoch": 0.21, "grad_norm": 17.17954595614451, "learning_rate": 9.191195795680447e-06, "loss": 0.4027, "step": 1292 }, { "epoch": 0.21, "grad_norm": 10.205084116468981, "learning_rate": 9.189772131990935e-06, "loss": 0.4488, "step": 1293 }, { "epoch": 0.21, "grad_norm": 16.050933430684896, "learning_rate": 9.188347326892002e-06, "loss": 0.4471, "step": 1294 }, { "epoch": 0.21, "grad_norm": 10.003518400123445, "learning_rate": 9.186921380771806e-06, "loss": 0.5134, "step": 1295 }, { "epoch": 0.21, "grad_norm": 27.695627595422607, "learning_rate": 9.185494294018811e-06, "loss": 0.4264, "step": 1296 }, { "epoch": 0.21, "grad_norm": 20.00949196301132, "learning_rate": 9.184066067021798e-06, "loss": 0.434, "step": 1297 }, { "epoch": 0.21, "grad_norm": 11.777361036554433, "learning_rate": 9.182636700169854e-06, "loss": 0.4422, "step": 1298 }, { "epoch": 0.21, "grad_norm": 1.3876563356128973, "learning_rate": 9.181206193852376e-06, "loss": 0.4462, "step": 1299 }, { "epoch": 0.21, "grad_norm": 13.688711584071397, "learning_rate": 9.179774548459077e-06, "loss": 0.4714, "step": 1300 }, { "epoch": 0.21, "grad_norm": 18.579101840787082, "learning_rate": 9.178341764379974e-06, "loss": 0.4762, "step": 1301 }, { "epoch": 0.21, "grad_norm": 23.57544635182901, "learning_rate": 9.176907842005398e-06, "loss": 0.5258, "step": 1302 }, { "epoch": 0.21, "grad_norm": 17.226077759206042, "learning_rate": 9.17547278172599e-06, "loss": 0.4234, "step": 1303 }, { "epoch": 0.21, "grad_norm": 10.54545814839091, "learning_rate": 9.174036583932695e-06, "loss": 0.5277, "step": 1304 }, { "epoch": 0.21, "grad_norm": 1.4144205359549633, "learning_rate": 9.17259924901678e-06, "loss": 0.5127, "step": 1305 }, { "epoch": 0.21, "grad_norm": 11.255516150321295, "learning_rate": 9.17116077736981e-06, "loss": 0.5251, "step": 1306 }, { "epoch": 0.21, "grad_norm": 13.807324052286138, "learning_rate": 9.169721169383666e-06, "loss": 0.4744, "step": 1307 }, { "epoch": 0.21, "grad_norm": 21.28383155767525, "learning_rate": 9.168280425450535e-06, "loss": 0.4663, "step": 1308 }, { "epoch": 0.21, "grad_norm": 19.963916602666423, "learning_rate": 9.166838545962917e-06, "loss": 0.4622, "step": 1309 }, { "epoch": 0.21, "grad_norm": 55.23616177983737, "learning_rate": 9.165395531313622e-06, "loss": 0.4964, "step": 1310 }, { "epoch": 0.21, "grad_norm": 16.042122077655637, "learning_rate": 9.163951381895761e-06, "loss": 0.5034, "step": 1311 }, { "epoch": 0.21, "grad_norm": 11.422662953418456, "learning_rate": 9.162506098102765e-06, "loss": 0.5072, "step": 1312 }, { "epoch": 0.21, "grad_norm": 16.476629498388053, "learning_rate": 9.161059680328368e-06, "loss": 0.4352, "step": 1313 }, { "epoch": 0.21, "grad_norm": 12.834599456728844, "learning_rate": 9.159612128966615e-06, "loss": 0.4169, "step": 1314 }, { "epoch": 0.21, "grad_norm": 15.989437922598462, "learning_rate": 9.158163444411857e-06, "loss": 0.5346, "step": 1315 }, { "epoch": 0.21, "grad_norm": 13.174938125846978, "learning_rate": 9.156713627058754e-06, "loss": 0.4222, "step": 1316 }, { "epoch": 0.21, "grad_norm": 18.44157670851661, "learning_rate": 9.155262677302281e-06, "loss": 0.4736, "step": 1317 }, { "epoch": 0.21, "grad_norm": 14.525230537507207, "learning_rate": 9.153810595537713e-06, "loss": 0.3699, "step": 1318 }, { "epoch": 0.21, "grad_norm": 16.931319891885362, "learning_rate": 9.152357382160639e-06, "loss": 0.4742, "step": 1319 }, { "epoch": 0.21, "grad_norm": 1.4073910594513346, "learning_rate": 9.150903037566954e-06, "loss": 0.4362, "step": 1320 }, { "epoch": 0.21, "grad_norm": 16.41880097822788, "learning_rate": 9.14944756215286e-06, "loss": 0.4477, "step": 1321 }, { "epoch": 0.21, "grad_norm": 31.637023932907365, "learning_rate": 9.14799095631487e-06, "loss": 0.4892, "step": 1322 }, { "epoch": 0.21, "grad_norm": 14.99614971919869, "learning_rate": 9.146533220449804e-06, "loss": 0.4099, "step": 1323 }, { "epoch": 0.21, "grad_norm": 23.106372062139346, "learning_rate": 9.14507435495479e-06, "loss": 0.489, "step": 1324 }, { "epoch": 0.21, "grad_norm": 25.684235961142882, "learning_rate": 9.143614360227261e-06, "loss": 0.4874, "step": 1325 }, { "epoch": 0.21, "grad_norm": 11.872847415985758, "learning_rate": 9.142153236664961e-06, "loss": 0.4499, "step": 1326 }, { "epoch": 0.21, "grad_norm": 19.62887129872119, "learning_rate": 9.14069098466594e-06, "loss": 0.418, "step": 1327 }, { "epoch": 0.21, "grad_norm": 1.4187584199052041, "learning_rate": 9.139227604628556e-06, "loss": 0.518, "step": 1328 }, { "epoch": 0.21, "grad_norm": 18.748798834806667, "learning_rate": 9.137763096951474e-06, "loss": 0.4456, "step": 1329 }, { "epoch": 0.21, "grad_norm": 17.390382067510025, "learning_rate": 9.136297462033667e-06, "loss": 0.4928, "step": 1330 }, { "epoch": 0.21, "grad_norm": 22.52121710577523, "learning_rate": 9.134830700274413e-06, "loss": 0.4352, "step": 1331 }, { "epoch": 0.21, "grad_norm": 30.130918738808376, "learning_rate": 9.133362812073298e-06, "loss": 0.4262, "step": 1332 }, { "epoch": 0.21, "grad_norm": 51.20820463640828, "learning_rate": 9.131893797830219e-06, "loss": 0.51, "step": 1333 }, { "epoch": 0.21, "grad_norm": 51.896539210940006, "learning_rate": 9.130423657945369e-06, "loss": 0.3873, "step": 1334 }, { "epoch": 0.22, "grad_norm": 38.01746440850158, "learning_rate": 9.128952392819262e-06, "loss": 0.488, "step": 1335 }, { "epoch": 0.22, "grad_norm": 29.387651645730315, "learning_rate": 9.127480002852706e-06, "loss": 0.5168, "step": 1336 }, { "epoch": 0.22, "grad_norm": 28.395164882192013, "learning_rate": 9.126006488446824e-06, "loss": 0.5219, "step": 1337 }, { "epoch": 0.22, "grad_norm": 46.6306114392875, "learning_rate": 9.12453185000304e-06, "loss": 0.5417, "step": 1338 }, { "epoch": 0.22, "grad_norm": 23.70115531328106, "learning_rate": 9.123056087923088e-06, "loss": 0.5601, "step": 1339 }, { "epoch": 0.22, "grad_norm": 14.480551692948808, "learning_rate": 9.121579202609004e-06, "loss": 0.5108, "step": 1340 }, { "epoch": 0.22, "grad_norm": 1.4914957166525074, "learning_rate": 9.12010119446313e-06, "loss": 0.4777, "step": 1341 }, { "epoch": 0.22, "grad_norm": 1.4780775033050049, "learning_rate": 9.118622063888124e-06, "loss": 0.4867, "step": 1342 }, { "epoch": 0.22, "grad_norm": 24.729274280863823, "learning_rate": 9.117141811286935e-06, "loss": 0.4694, "step": 1343 }, { "epoch": 0.22, "grad_norm": 17.06349807428754, "learning_rate": 9.115660437062828e-06, "loss": 0.4714, "step": 1344 }, { "epoch": 0.22, "grad_norm": 26.024820950268502, "learning_rate": 9.114177941619369e-06, "loss": 0.5125, "step": 1345 }, { "epoch": 0.22, "grad_norm": 15.231664802558168, "learning_rate": 9.11269432536043e-06, "loss": 0.4924, "step": 1346 }, { "epoch": 0.22, "grad_norm": 20.323088561043797, "learning_rate": 9.11120958869019e-06, "loss": 0.4952, "step": 1347 }, { "epoch": 0.22, "grad_norm": 1.3646948365850502, "learning_rate": 9.109723732013132e-06, "loss": 0.4997, "step": 1348 }, { "epoch": 0.22, "grad_norm": 22.515671785607097, "learning_rate": 9.108236755734045e-06, "loss": 0.4593, "step": 1349 }, { "epoch": 0.22, "grad_norm": 14.87568346975854, "learning_rate": 9.10674866025802e-06, "loss": 0.4786, "step": 1350 }, { "epoch": 0.22, "grad_norm": 32.87193528521304, "learning_rate": 9.105259445990457e-06, "loss": 0.5309, "step": 1351 }, { "epoch": 0.22, "grad_norm": 52.92662729038108, "learning_rate": 9.10376911333706e-06, "loss": 0.4188, "step": 1352 }, { "epoch": 0.22, "grad_norm": 27.640186962170592, "learning_rate": 9.102277662703834e-06, "loss": 0.4655, "step": 1353 }, { "epoch": 0.22, "grad_norm": 13.667125251100636, "learning_rate": 9.100785094497093e-06, "loss": 0.4639, "step": 1354 }, { "epoch": 0.22, "grad_norm": 14.083472155379368, "learning_rate": 9.099291409123454e-06, "loss": 0.5523, "step": 1355 }, { "epoch": 0.22, "grad_norm": 20.55259754733359, "learning_rate": 9.097796606989838e-06, "loss": 0.4389, "step": 1356 }, { "epoch": 0.22, "grad_norm": 20.704468162449896, "learning_rate": 9.09630068850347e-06, "loss": 0.4698, "step": 1357 }, { "epoch": 0.22, "grad_norm": 30.26170179842706, "learning_rate": 9.094803654071877e-06, "loss": 0.4954, "step": 1358 }, { "epoch": 0.22, "grad_norm": 15.73055675354421, "learning_rate": 9.093305504102897e-06, "loss": 0.358, "step": 1359 }, { "epoch": 0.22, "grad_norm": 21.843516125015547, "learning_rate": 9.091806239004664e-06, "loss": 0.4993, "step": 1360 }, { "epoch": 0.22, "grad_norm": 1.426705648488882, "learning_rate": 9.090305859185619e-06, "loss": 0.476, "step": 1361 }, { "epoch": 0.22, "grad_norm": 32.36476620726132, "learning_rate": 9.088804365054511e-06, "loss": 0.4966, "step": 1362 }, { "epoch": 0.22, "grad_norm": 16.53031496507569, "learning_rate": 9.087301757020384e-06, "loss": 0.4351, "step": 1363 }, { "epoch": 0.22, "grad_norm": 19.424222770714906, "learning_rate": 9.08579803549259e-06, "loss": 0.4574, "step": 1364 }, { "epoch": 0.22, "grad_norm": 22.71946720673779, "learning_rate": 9.084293200880787e-06, "loss": 0.4796, "step": 1365 }, { "epoch": 0.22, "grad_norm": 15.686661171616713, "learning_rate": 9.082787253594932e-06, "loss": 0.4305, "step": 1366 }, { "epoch": 0.22, "grad_norm": 18.501625328224105, "learning_rate": 9.081280194045286e-06, "loss": 0.3989, "step": 1367 }, { "epoch": 0.22, "grad_norm": 19.83478580526505, "learning_rate": 9.079772022642413e-06, "loss": 0.5081, "step": 1368 }, { "epoch": 0.22, "grad_norm": 24.332286338091723, "learning_rate": 9.078262739797184e-06, "loss": 0.4863, "step": 1369 }, { "epoch": 0.22, "grad_norm": 13.818230009541045, "learning_rate": 9.076752345920764e-06, "loss": 0.4919, "step": 1370 }, { "epoch": 0.22, "grad_norm": 19.801251483646638, "learning_rate": 9.075240841424629e-06, "loss": 0.4621, "step": 1371 }, { "epoch": 0.22, "grad_norm": 18.71701743471966, "learning_rate": 9.073728226720555e-06, "loss": 0.4796, "step": 1372 }, { "epoch": 0.22, "grad_norm": 13.81863544332554, "learning_rate": 9.07221450222062e-06, "loss": 0.5006, "step": 1373 }, { "epoch": 0.22, "grad_norm": 11.449025877618004, "learning_rate": 9.070699668337202e-06, "loss": 0.4857, "step": 1374 }, { "epoch": 0.22, "grad_norm": 21.725905979424546, "learning_rate": 9.069183725482984e-06, "loss": 0.4309, "step": 1375 }, { "epoch": 0.22, "grad_norm": 14.0013196207162, "learning_rate": 9.067666674070952e-06, "loss": 0.4357, "step": 1376 }, { "epoch": 0.22, "grad_norm": 15.803010596255133, "learning_rate": 9.066148514514395e-06, "loss": 0.4687, "step": 1377 }, { "epoch": 0.22, "grad_norm": 18.275906721830868, "learning_rate": 9.064629247226895e-06, "loss": 0.4528, "step": 1378 }, { "epoch": 0.22, "grad_norm": 13.973481170144813, "learning_rate": 9.063108872622348e-06, "loss": 0.4766, "step": 1379 }, { "epoch": 0.22, "grad_norm": 21.71026300091794, "learning_rate": 9.061587391114942e-06, "loss": 0.5199, "step": 1380 }, { "epoch": 0.22, "grad_norm": 44.02811839197106, "learning_rate": 9.060064803119173e-06, "loss": 0.5202, "step": 1381 }, { "epoch": 0.22, "grad_norm": 74.85422997274796, "learning_rate": 9.058541109049836e-06, "loss": 0.4556, "step": 1382 }, { "epoch": 0.22, "grad_norm": 18.579628251511966, "learning_rate": 9.057016309322026e-06, "loss": 0.5966, "step": 1383 }, { "epoch": 0.22, "grad_norm": 26.67411198452988, "learning_rate": 9.055490404351141e-06, "loss": 0.4774, "step": 1384 }, { "epoch": 0.22, "grad_norm": 16.633871110327483, "learning_rate": 9.05396339455288e-06, "loss": 0.5119, "step": 1385 }, { "epoch": 0.22, "grad_norm": 15.382117263458648, "learning_rate": 9.05243528034324e-06, "loss": 0.5104, "step": 1386 }, { "epoch": 0.22, "grad_norm": 18.573334075367782, "learning_rate": 9.050906062138527e-06, "loss": 0.5515, "step": 1387 }, { "epoch": 0.22, "grad_norm": 18.358890556324596, "learning_rate": 9.049375740355338e-06, "loss": 0.5047, "step": 1388 }, { "epoch": 0.22, "grad_norm": 25.633611702870787, "learning_rate": 9.047844315410574e-06, "loss": 0.4835, "step": 1389 }, { "epoch": 0.22, "grad_norm": 63.715255472140306, "learning_rate": 9.04631178772144e-06, "loss": 0.4322, "step": 1390 }, { "epoch": 0.22, "grad_norm": 22.65127175358151, "learning_rate": 9.044778157705437e-06, "loss": 0.4812, "step": 1391 }, { "epoch": 0.22, "grad_norm": 28.245544505170365, "learning_rate": 9.04324342578037e-06, "loss": 0.4849, "step": 1392 }, { "epoch": 0.22, "grad_norm": 21.291562515955235, "learning_rate": 9.04170759236434e-06, "loss": 0.4698, "step": 1393 }, { "epoch": 0.22, "grad_norm": 11.663448905980461, "learning_rate": 9.040170657875753e-06, "loss": 0.433, "step": 1394 }, { "epoch": 0.22, "grad_norm": 16.078475537493784, "learning_rate": 9.038632622733311e-06, "loss": 0.4937, "step": 1395 }, { "epoch": 0.22, "grad_norm": 19.058760656396302, "learning_rate": 9.037093487356016e-06, "loss": 0.5158, "step": 1396 }, { "epoch": 0.23, "grad_norm": 18.42271472261091, "learning_rate": 9.035553252163172e-06, "loss": 0.5191, "step": 1397 }, { "epoch": 0.23, "grad_norm": 11.883669861386426, "learning_rate": 9.034011917574382e-06, "loss": 0.5046, "step": 1398 }, { "epoch": 0.23, "grad_norm": 20.729675986626503, "learning_rate": 9.032469484009548e-06, "loss": 0.4744, "step": 1399 }, { "epoch": 0.23, "grad_norm": 17.518435951467687, "learning_rate": 9.030925951888869e-06, "loss": 0.5217, "step": 1400 }, { "epoch": 0.23, "grad_norm": 11.350922828878298, "learning_rate": 9.029381321632849e-06, "loss": 0.4789, "step": 1401 }, { "epoch": 0.23, "grad_norm": 13.321753440462835, "learning_rate": 9.027835593662285e-06, "loss": 0.4575, "step": 1402 }, { "epoch": 0.23, "grad_norm": 15.102292166377094, "learning_rate": 9.026288768398278e-06, "loss": 0.4668, "step": 1403 }, { "epoch": 0.23, "grad_norm": 15.530500443242154, "learning_rate": 9.024740846262225e-06, "loss": 0.468, "step": 1404 }, { "epoch": 0.23, "grad_norm": 18.54755125323169, "learning_rate": 9.023191827675824e-06, "loss": 0.4875, "step": 1405 }, { "epoch": 0.23, "grad_norm": 13.903256952424416, "learning_rate": 9.021641713061069e-06, "loss": 0.4899, "step": 1406 }, { "epoch": 0.23, "grad_norm": 12.819147567488159, "learning_rate": 9.020090502840253e-06, "loss": 0.4806, "step": 1407 }, { "epoch": 0.23, "grad_norm": 17.95248407686516, "learning_rate": 9.01853819743597e-06, "loss": 0.5195, "step": 1408 }, { "epoch": 0.23, "grad_norm": 22.108535873567885, "learning_rate": 9.016984797271112e-06, "loss": 0.4763, "step": 1409 }, { "epoch": 0.23, "grad_norm": 19.011571501621837, "learning_rate": 9.015430302768865e-06, "loss": 0.4444, "step": 1410 }, { "epoch": 0.23, "grad_norm": 15.251096900371449, "learning_rate": 9.013874714352716e-06, "loss": 0.5195, "step": 1411 }, { "epoch": 0.23, "grad_norm": 12.918221136442906, "learning_rate": 9.012318032446454e-06, "loss": 0.533, "step": 1412 }, { "epoch": 0.23, "grad_norm": 9.912184588390328, "learning_rate": 9.010760257474158e-06, "loss": 0.474, "step": 1413 }, { "epoch": 0.23, "grad_norm": 13.116186421493087, "learning_rate": 9.009201389860212e-06, "loss": 0.5148, "step": 1414 }, { "epoch": 0.23, "grad_norm": 24.100969375026544, "learning_rate": 9.007641430029292e-06, "loss": 0.4445, "step": 1415 }, { "epoch": 0.23, "grad_norm": 9.298245587728536, "learning_rate": 9.006080378406377e-06, "loss": 0.4616, "step": 1416 }, { "epoch": 0.23, "grad_norm": 44.95262205780494, "learning_rate": 9.004518235416737e-06, "loss": 0.5426, "step": 1417 }, { "epoch": 0.23, "grad_norm": 16.77950724248937, "learning_rate": 9.002955001485945e-06, "loss": 0.4738, "step": 1418 }, { "epoch": 0.23, "grad_norm": 11.743634809295369, "learning_rate": 9.001390677039868e-06, "loss": 0.4751, "step": 1419 }, { "epoch": 0.23, "grad_norm": 40.90380610995379, "learning_rate": 8.999825262504672e-06, "loss": 0.4761, "step": 1420 }, { "epoch": 0.23, "grad_norm": 12.379649063995942, "learning_rate": 8.998258758306819e-06, "loss": 0.4371, "step": 1421 }, { "epoch": 0.23, "grad_norm": 12.479741586427682, "learning_rate": 8.996691164873068e-06, "loss": 0.4947, "step": 1422 }, { "epoch": 0.23, "grad_norm": 32.0125924555458, "learning_rate": 8.995122482630473e-06, "loss": 0.4498, "step": 1423 }, { "epoch": 0.23, "grad_norm": 16.72696221970109, "learning_rate": 8.993552712006388e-06, "loss": 0.4878, "step": 1424 }, { "epoch": 0.23, "grad_norm": 8.738150180837083, "learning_rate": 8.99198185342846e-06, "loss": 0.4538, "step": 1425 }, { "epoch": 0.23, "grad_norm": 14.171858095111103, "learning_rate": 8.990409907324638e-06, "loss": 0.4423, "step": 1426 }, { "epoch": 0.23, "grad_norm": 37.14674702156252, "learning_rate": 8.98883687412316e-06, "loss": 0.5038, "step": 1427 }, { "epoch": 0.23, "grad_norm": 12.514200655491752, "learning_rate": 8.987262754252565e-06, "loss": 0.4195, "step": 1428 }, { "epoch": 0.23, "grad_norm": 18.73595518963675, "learning_rate": 8.985687548141685e-06, "loss": 0.4918, "step": 1429 }, { "epoch": 0.23, "grad_norm": 13.393744028649486, "learning_rate": 8.984111256219651e-06, "loss": 0.4598, "step": 1430 }, { "epoch": 0.23, "grad_norm": 33.74959136417218, "learning_rate": 8.982533878915889e-06, "loss": 0.4762, "step": 1431 }, { "epoch": 0.23, "grad_norm": 14.250598375787344, "learning_rate": 8.98095541666012e-06, "loss": 0.4775, "step": 1432 }, { "epoch": 0.23, "grad_norm": 12.825538490570228, "learning_rate": 8.979375869882358e-06, "loss": 0.4703, "step": 1433 }, { "epoch": 0.23, "grad_norm": 10.506107271922906, "learning_rate": 8.977795239012916e-06, "loss": 0.4326, "step": 1434 }, { "epoch": 0.23, "grad_norm": 14.52408901744981, "learning_rate": 8.976213524482404e-06, "loss": 0.4851, "step": 1435 }, { "epoch": 0.23, "grad_norm": 16.00035703593011, "learning_rate": 8.974630726721723e-06, "loss": 0.5184, "step": 1436 }, { "epoch": 0.23, "grad_norm": 33.75190294547055, "learning_rate": 8.97304684616207e-06, "loss": 0.5033, "step": 1437 }, { "epoch": 0.23, "grad_norm": 23.38672065857748, "learning_rate": 8.97146188323494e-06, "loss": 0.4523, "step": 1438 }, { "epoch": 0.23, "grad_norm": 28.17702616900368, "learning_rate": 8.969875838372117e-06, "loss": 0.4683, "step": 1439 }, { "epoch": 0.23, "grad_norm": 8.73695502523755, "learning_rate": 8.968288712005688e-06, "loss": 0.4534, "step": 1440 }, { "epoch": 0.23, "grad_norm": 9.039933951501318, "learning_rate": 8.966700504568025e-06, "loss": 0.4471, "step": 1441 }, { "epoch": 0.23, "grad_norm": 15.24768091596694, "learning_rate": 8.965111216491803e-06, "loss": 0.4524, "step": 1442 }, { "epoch": 0.23, "grad_norm": 11.855969482707387, "learning_rate": 8.963520848209985e-06, "loss": 0.4514, "step": 1443 }, { "epoch": 0.23, "grad_norm": 12.785279525959147, "learning_rate": 8.961929400155833e-06, "loss": 0.497, "step": 1444 }, { "epoch": 0.23, "grad_norm": 11.922075292698853, "learning_rate": 8.960336872762903e-06, "loss": 0.4884, "step": 1445 }, { "epoch": 0.23, "grad_norm": 12.026701372383078, "learning_rate": 8.958743266465041e-06, "loss": 0.4873, "step": 1446 }, { "epoch": 0.23, "grad_norm": 24.539774701349426, "learning_rate": 8.957148581696389e-06, "loss": 0.4496, "step": 1447 }, { "epoch": 0.23, "grad_norm": 12.641384795833185, "learning_rate": 8.955552818891384e-06, "loss": 0.4594, "step": 1448 }, { "epoch": 0.23, "grad_norm": 14.37102050702069, "learning_rate": 8.953955978484756e-06, "loss": 0.4741, "step": 1449 }, { "epoch": 0.23, "grad_norm": 9.790732735897041, "learning_rate": 8.95235806091153e-06, "loss": 0.4133, "step": 1450 }, { "epoch": 0.23, "grad_norm": 8.507999678566856, "learning_rate": 8.950759066607017e-06, "loss": 0.4779, "step": 1451 }, { "epoch": 0.23, "grad_norm": 13.959000156122002, "learning_rate": 8.949158996006834e-06, "loss": 0.5271, "step": 1452 }, { "epoch": 0.23, "grad_norm": 18.363854419623745, "learning_rate": 8.94755784954688e-06, "loss": 0.4959, "step": 1453 }, { "epoch": 0.23, "grad_norm": 10.21605100554187, "learning_rate": 8.945955627663353e-06, "loss": 0.4159, "step": 1454 }, { "epoch": 0.23, "grad_norm": 37.323527678676356, "learning_rate": 8.944352330792741e-06, "loss": 0.4545, "step": 1455 }, { "epoch": 0.23, "grad_norm": 10.789811674804827, "learning_rate": 8.942747959371829e-06, "loss": 0.4231, "step": 1456 }, { "epoch": 0.23, "grad_norm": 14.194037810394756, "learning_rate": 8.941142513837689e-06, "loss": 0.4463, "step": 1457 }, { "epoch": 0.23, "grad_norm": 8.884912495630356, "learning_rate": 8.939535994627692e-06, "loss": 0.4535, "step": 1458 }, { "epoch": 0.24, "grad_norm": 11.895915153642727, "learning_rate": 8.937928402179495e-06, "loss": 0.4292, "step": 1459 }, { "epoch": 0.24, "grad_norm": 12.67063274596574, "learning_rate": 8.936319736931051e-06, "loss": 0.443, "step": 1460 }, { "epoch": 0.24, "grad_norm": 10.584497654189247, "learning_rate": 8.934709999320605e-06, "loss": 0.5142, "step": 1461 }, { "epoch": 0.24, "grad_norm": 16.904185061229505, "learning_rate": 8.933099189786697e-06, "loss": 0.5184, "step": 1462 }, { "epoch": 0.24, "grad_norm": 18.79326128728486, "learning_rate": 8.93148730876815e-06, "loss": 0.4394, "step": 1463 }, { "epoch": 0.24, "grad_norm": 25.77048213505705, "learning_rate": 8.92987435670409e-06, "loss": 0.4672, "step": 1464 }, { "epoch": 0.24, "grad_norm": 14.731230767942538, "learning_rate": 8.928260334033927e-06, "loss": 0.4991, "step": 1465 }, { "epoch": 0.24, "grad_norm": 18.317646303564654, "learning_rate": 8.926645241197365e-06, "loss": 0.4601, "step": 1466 }, { "epoch": 0.24, "grad_norm": 10.913102317586345, "learning_rate": 8.925029078634401e-06, "loss": 0.4413, "step": 1467 }, { "epoch": 0.24, "grad_norm": 13.502409598859076, "learning_rate": 8.923411846785322e-06, "loss": 0.424, "step": 1468 }, { "epoch": 0.24, "grad_norm": 13.153480065630106, "learning_rate": 8.921793546090709e-06, "loss": 0.4972, "step": 1469 }, { "epoch": 0.24, "grad_norm": 17.658353229158543, "learning_rate": 8.920174176991426e-06, "loss": 0.3933, "step": 1470 }, { "epoch": 0.24, "grad_norm": 11.598314892462291, "learning_rate": 8.918553739928637e-06, "loss": 0.4715, "step": 1471 }, { "epoch": 0.24, "grad_norm": 8.894333455457994, "learning_rate": 8.916932235343797e-06, "loss": 0.4046, "step": 1472 }, { "epoch": 0.24, "grad_norm": 11.202730447168385, "learning_rate": 8.915309663678641e-06, "loss": 0.3356, "step": 1473 }, { "epoch": 0.24, "grad_norm": 1.222870010651823, "learning_rate": 8.913686025375207e-06, "loss": 0.4715, "step": 1474 }, { "epoch": 0.24, "grad_norm": 10.921090167960903, "learning_rate": 8.91206132087582e-06, "loss": 0.4393, "step": 1475 }, { "epoch": 0.24, "grad_norm": 8.665442244238676, "learning_rate": 8.910435550623093e-06, "loss": 0.5106, "step": 1476 }, { "epoch": 0.24, "grad_norm": 10.385219651649345, "learning_rate": 8.908808715059929e-06, "loss": 0.4484, "step": 1477 }, { "epoch": 0.24, "grad_norm": 8.989471358964002, "learning_rate": 8.907180814629526e-06, "loss": 0.3998, "step": 1478 }, { "epoch": 0.24, "grad_norm": 9.953810607363012, "learning_rate": 8.905551849775365e-06, "loss": 0.4225, "step": 1479 }, { "epoch": 0.24, "grad_norm": 19.387315516413985, "learning_rate": 8.903921820941224e-06, "loss": 0.4943, "step": 1480 }, { "epoch": 0.24, "grad_norm": 9.360662032342251, "learning_rate": 8.902290728571165e-06, "loss": 0.4566, "step": 1481 }, { "epoch": 0.24, "grad_norm": 9.828819831140725, "learning_rate": 8.900658573109546e-06, "loss": 0.4469, "step": 1482 }, { "epoch": 0.24, "grad_norm": 16.486301244107388, "learning_rate": 8.89902535500101e-06, "loss": 0.4308, "step": 1483 }, { "epoch": 0.24, "grad_norm": 10.946320977649018, "learning_rate": 8.897391074690489e-06, "loss": 0.4986, "step": 1484 }, { "epoch": 0.24, "grad_norm": 24.17010586858406, "learning_rate": 8.895755732623207e-06, "loss": 0.4612, "step": 1485 }, { "epoch": 0.24, "grad_norm": 11.499776398578296, "learning_rate": 8.894119329244675e-06, "loss": 0.5218, "step": 1486 }, { "epoch": 0.24, "grad_norm": 16.767031555023646, "learning_rate": 8.892481865000698e-06, "loss": 0.48, "step": 1487 }, { "epoch": 0.24, "grad_norm": 10.26755103261537, "learning_rate": 8.890843340337363e-06, "loss": 0.4486, "step": 1488 }, { "epoch": 0.24, "grad_norm": 19.02391252980494, "learning_rate": 8.88920375570105e-06, "loss": 0.3607, "step": 1489 }, { "epoch": 0.24, "grad_norm": 18.768405413748543, "learning_rate": 8.887563111538428e-06, "loss": 0.4413, "step": 1490 }, { "epoch": 0.24, "grad_norm": 10.266156968094913, "learning_rate": 8.885921408296454e-06, "loss": 0.4467, "step": 1491 }, { "epoch": 0.24, "grad_norm": 12.162884144522227, "learning_rate": 8.88427864642237e-06, "loss": 0.4993, "step": 1492 }, { "epoch": 0.24, "grad_norm": 8.316698072328538, "learning_rate": 8.882634826363714e-06, "loss": 0.3914, "step": 1493 }, { "epoch": 0.24, "grad_norm": 21.78965111471558, "learning_rate": 8.880989948568303e-06, "loss": 0.4835, "step": 1494 }, { "epoch": 0.24, "grad_norm": 12.386533619233864, "learning_rate": 8.879344013484253e-06, "loss": 0.456, "step": 1495 }, { "epoch": 0.24, "grad_norm": 18.561662849828668, "learning_rate": 8.877697021559958e-06, "loss": 0.5363, "step": 1496 }, { "epoch": 0.24, "grad_norm": 15.611816621633134, "learning_rate": 8.876048973244105e-06, "loss": 0.4692, "step": 1497 }, { "epoch": 0.24, "grad_norm": 15.247695530444396, "learning_rate": 8.874399868985668e-06, "loss": 0.4311, "step": 1498 }, { "epoch": 0.24, "grad_norm": 13.373173976261493, "learning_rate": 8.872749709233907e-06, "loss": 0.5271, "step": 1499 }, { "epoch": 0.24, "grad_norm": 9.322046954447156, "learning_rate": 8.871098494438375e-06, "loss": 0.4505, "step": 1500 }, { "epoch": 0.24, "grad_norm": 19.172725477371777, "learning_rate": 8.869446225048903e-06, "loss": 0.4634, "step": 1501 }, { "epoch": 0.24, "grad_norm": 28.065674841026837, "learning_rate": 8.867792901515617e-06, "loss": 0.4358, "step": 1502 }, { "epoch": 0.24, "grad_norm": 10.915751787905425, "learning_rate": 8.866138524288929e-06, "loss": 0.4824, "step": 1503 }, { "epoch": 0.24, "grad_norm": 21.636668840373233, "learning_rate": 8.864483093819537e-06, "loss": 0.5179, "step": 1504 }, { "epoch": 0.24, "grad_norm": 11.284744019902902, "learning_rate": 8.862826610558427e-06, "loss": 0.4411, "step": 1505 }, { "epoch": 0.24, "grad_norm": 10.870551271729145, "learning_rate": 8.861169074956865e-06, "loss": 0.3877, "step": 1506 }, { "epoch": 0.24, "grad_norm": 9.510066653139628, "learning_rate": 8.859510487466415e-06, "loss": 0.5143, "step": 1507 }, { "epoch": 0.24, "grad_norm": 9.854886437852786, "learning_rate": 8.85785084853892e-06, "loss": 0.4592, "step": 1508 }, { "epoch": 0.24, "grad_norm": 10.567428624446746, "learning_rate": 8.856190158626512e-06, "loss": 0.5008, "step": 1509 }, { "epoch": 0.24, "grad_norm": 11.22432118493056, "learning_rate": 8.854528418181609e-06, "loss": 0.4681, "step": 1510 }, { "epoch": 0.24, "grad_norm": 10.210935296363807, "learning_rate": 8.852865627656912e-06, "loss": 0.4547, "step": 1511 }, { "epoch": 0.24, "grad_norm": 17.77219464157935, "learning_rate": 8.851201787505415e-06, "loss": 0.4548, "step": 1512 }, { "epoch": 0.24, "grad_norm": 12.259867303667486, "learning_rate": 8.84953689818039e-06, "loss": 0.4927, "step": 1513 }, { "epoch": 0.24, "grad_norm": 13.915689227371422, "learning_rate": 8.847870960135403e-06, "loss": 0.379, "step": 1514 }, { "epoch": 0.24, "grad_norm": 15.157403765320169, "learning_rate": 8.8462039738243e-06, "loss": 0.4818, "step": 1515 }, { "epoch": 0.24, "grad_norm": 13.476337575029335, "learning_rate": 8.84453593970121e-06, "loss": 0.4334, "step": 1516 }, { "epoch": 0.24, "grad_norm": 12.843487372142736, "learning_rate": 8.842866858220558e-06, "loss": 0.4789, "step": 1517 }, { "epoch": 0.24, "grad_norm": 12.881006581618307, "learning_rate": 8.841196729837044e-06, "loss": 0.4581, "step": 1518 }, { "epoch": 0.24, "grad_norm": 10.77858180743534, "learning_rate": 8.839525555005656e-06, "loss": 0.4953, "step": 1519 }, { "epoch": 0.24, "grad_norm": 12.042844525487867, "learning_rate": 8.837853334181669e-06, "loss": 0.5429, "step": 1520 }, { "epoch": 0.25, "grad_norm": 12.585240439976088, "learning_rate": 8.836180067820646e-06, "loss": 0.4218, "step": 1521 }, { "epoch": 0.25, "grad_norm": 15.120633185594732, "learning_rate": 8.834505756378425e-06, "loss": 0.4639, "step": 1522 }, { "epoch": 0.25, "grad_norm": 9.74554076952203, "learning_rate": 8.832830400311137e-06, "loss": 0.4783, "step": 1523 }, { "epoch": 0.25, "grad_norm": 8.379328626539223, "learning_rate": 8.831154000075196e-06, "loss": 0.4685, "step": 1524 }, { "epoch": 0.25, "grad_norm": 13.914355972277242, "learning_rate": 8.829476556127301e-06, "loss": 0.4383, "step": 1525 }, { "epoch": 0.25, "grad_norm": 24.298526008086267, "learning_rate": 8.82779806892443e-06, "loss": 0.4679, "step": 1526 }, { "epoch": 0.25, "grad_norm": 13.703473101307404, "learning_rate": 8.826118538923851e-06, "loss": 0.4968, "step": 1527 }, { "epoch": 0.25, "grad_norm": 14.10033117811136, "learning_rate": 8.824437966583114e-06, "loss": 0.512, "step": 1528 }, { "epoch": 0.25, "grad_norm": 9.869979959375344, "learning_rate": 8.822756352360056e-06, "loss": 0.4619, "step": 1529 }, { "epoch": 0.25, "grad_norm": 18.959301298861334, "learning_rate": 8.82107369671279e-06, "loss": 0.4479, "step": 1530 }, { "epoch": 0.25, "grad_norm": 24.512693186427196, "learning_rate": 8.819390000099723e-06, "loss": 0.4853, "step": 1531 }, { "epoch": 0.25, "grad_norm": 11.015412494470718, "learning_rate": 8.817705262979536e-06, "loss": 0.4275, "step": 1532 }, { "epoch": 0.25, "grad_norm": 19.078467763508947, "learning_rate": 8.8160194858112e-06, "loss": 0.5253, "step": 1533 }, { "epoch": 0.25, "grad_norm": 40.32751858306284, "learning_rate": 8.814332669053968e-06, "loss": 0.4961, "step": 1534 }, { "epoch": 0.25, "grad_norm": 12.806281402721412, "learning_rate": 8.812644813167372e-06, "loss": 0.4689, "step": 1535 }, { "epoch": 0.25, "grad_norm": 10.506546895101797, "learning_rate": 8.810955918611235e-06, "loss": 0.3931, "step": 1536 }, { "epoch": 0.25, "grad_norm": 12.33836872639905, "learning_rate": 8.809265985845655e-06, "loss": 0.461, "step": 1537 }, { "epoch": 0.25, "grad_norm": 1.458692726322872, "learning_rate": 8.807575015331019e-06, "loss": 0.4868, "step": 1538 }, { "epoch": 0.25, "grad_norm": 27.8547530549637, "learning_rate": 8.805883007527992e-06, "loss": 0.471, "step": 1539 }, { "epoch": 0.25, "grad_norm": 54.47938342993424, "learning_rate": 8.804189962897521e-06, "loss": 0.5103, "step": 1540 }, { "epoch": 0.25, "grad_norm": 11.672482910770182, "learning_rate": 8.802495881900844e-06, "loss": 0.4434, "step": 1541 }, { "epoch": 0.25, "grad_norm": 14.586492126303982, "learning_rate": 8.80080076499947e-06, "loss": 0.504, "step": 1542 }, { "epoch": 0.25, "grad_norm": 16.949378974758428, "learning_rate": 8.7991046126552e-06, "loss": 0.5287, "step": 1543 }, { "epoch": 0.25, "grad_norm": 18.159470018985314, "learning_rate": 8.79740742533011e-06, "loss": 0.4861, "step": 1544 }, { "epoch": 0.25, "grad_norm": 17.607804835672827, "learning_rate": 8.795709203486563e-06, "loss": 0.4885, "step": 1545 }, { "epoch": 0.25, "grad_norm": 12.159775804109794, "learning_rate": 8.794009947587197e-06, "loss": 0.4585, "step": 1546 }, { "epoch": 0.25, "grad_norm": 26.62923218034464, "learning_rate": 8.79230965809494e-06, "loss": 0.4859, "step": 1547 }, { "epoch": 0.25, "grad_norm": 14.250902243362471, "learning_rate": 8.790608335472995e-06, "loss": 0.5175, "step": 1548 }, { "epoch": 0.25, "grad_norm": 11.675925681964848, "learning_rate": 8.788905980184851e-06, "loss": 0.4865, "step": 1549 }, { "epoch": 0.25, "grad_norm": 14.087372519844426, "learning_rate": 8.787202592694278e-06, "loss": 0.4431, "step": 1550 }, { "epoch": 0.25, "grad_norm": 1.5630420020553932, "learning_rate": 8.785498173465323e-06, "loss": 0.4954, "step": 1551 }, { "epoch": 0.25, "grad_norm": 14.049407890795996, "learning_rate": 8.783792722962316e-06, "loss": 0.5069, "step": 1552 }, { "epoch": 0.25, "grad_norm": 14.50248217894901, "learning_rate": 8.782086241649874e-06, "loss": 0.4288, "step": 1553 }, { "epoch": 0.25, "grad_norm": 85.24993064665662, "learning_rate": 8.780378729992884e-06, "loss": 0.4363, "step": 1554 }, { "epoch": 0.25, "grad_norm": 20.94501475124581, "learning_rate": 8.778670188456519e-06, "loss": 0.489, "step": 1555 }, { "epoch": 0.25, "grad_norm": 1.2893407157129522, "learning_rate": 8.776960617506237e-06, "loss": 0.4891, "step": 1556 }, { "epoch": 0.25, "grad_norm": 11.287533596726686, "learning_rate": 8.77525001760777e-06, "loss": 0.4055, "step": 1557 }, { "epoch": 0.25, "grad_norm": 13.671952396695993, "learning_rate": 8.773538389227134e-06, "loss": 0.5449, "step": 1558 }, { "epoch": 0.25, "grad_norm": 13.004745757255726, "learning_rate": 8.771825732830622e-06, "loss": 0.4723, "step": 1559 }, { "epoch": 0.25, "grad_norm": 25.78654896117246, "learning_rate": 8.77011204888481e-06, "loss": 0.4569, "step": 1560 }, { "epoch": 0.25, "grad_norm": 13.476302103720993, "learning_rate": 8.76839733785655e-06, "loss": 0.4803, "step": 1561 }, { "epoch": 0.25, "grad_norm": 15.924910595720602, "learning_rate": 8.766681600212981e-06, "loss": 0.4544, "step": 1562 }, { "epoch": 0.25, "grad_norm": 11.899806530334011, "learning_rate": 8.764964836421515e-06, "loss": 0.4417, "step": 1563 }, { "epoch": 0.25, "grad_norm": 11.213178257009929, "learning_rate": 8.763247046949843e-06, "loss": 0.4845, "step": 1564 }, { "epoch": 0.25, "grad_norm": 15.640810050489614, "learning_rate": 8.761528232265944e-06, "loss": 0.4844, "step": 1565 }, { "epoch": 0.25, "grad_norm": 14.732590348529142, "learning_rate": 8.759808392838066e-06, "loss": 0.4117, "step": 1566 }, { "epoch": 0.25, "grad_norm": 11.777350749626775, "learning_rate": 8.75808752913474e-06, "loss": 0.4687, "step": 1567 }, { "epoch": 0.25, "grad_norm": 10.857837687193514, "learning_rate": 8.756365641624782e-06, "loss": 0.4967, "step": 1568 }, { "epoch": 0.25, "grad_norm": 14.225323037480715, "learning_rate": 8.754642730777276e-06, "loss": 0.5513, "step": 1569 }, { "epoch": 0.25, "grad_norm": 19.721041422185902, "learning_rate": 8.752918797061593e-06, "loss": 0.4795, "step": 1570 }, { "epoch": 0.25, "grad_norm": 20.624679107638986, "learning_rate": 8.751193840947382e-06, "loss": 0.4806, "step": 1571 }, { "epoch": 0.25, "grad_norm": 12.15213570097301, "learning_rate": 8.749467862904565e-06, "loss": 0.4434, "step": 1572 }, { "epoch": 0.25, "grad_norm": 12.669716573387246, "learning_rate": 8.747740863403348e-06, "loss": 0.4534, "step": 1573 }, { "epoch": 0.25, "grad_norm": 10.42812835328904, "learning_rate": 8.746012842914214e-06, "loss": 0.3991, "step": 1574 }, { "epoch": 0.25, "grad_norm": 10.628703469906624, "learning_rate": 8.74428380190792e-06, "loss": 0.4875, "step": 1575 }, { "epoch": 0.25, "grad_norm": 11.53381864997919, "learning_rate": 8.742553740855507e-06, "loss": 0.4444, "step": 1576 }, { "epoch": 0.25, "grad_norm": 11.361858392222972, "learning_rate": 8.74082266022829e-06, "loss": 0.5031, "step": 1577 }, { "epoch": 0.25, "grad_norm": 19.741825356892356, "learning_rate": 8.739090560497864e-06, "loss": 0.4969, "step": 1578 }, { "epoch": 0.25, "grad_norm": 8.369046679916586, "learning_rate": 8.737357442136104e-06, "loss": 0.5179, "step": 1579 }, { "epoch": 0.25, "grad_norm": 9.38618474385405, "learning_rate": 8.735623305615153e-06, "loss": 0.4716, "step": 1580 }, { "epoch": 0.25, "grad_norm": 14.374576774025085, "learning_rate": 8.733888151407441e-06, "loss": 0.4917, "step": 1581 }, { "epoch": 0.25, "grad_norm": 11.434308699435563, "learning_rate": 8.732151979985671e-06, "loss": 0.5607, "step": 1582 }, { "epoch": 0.26, "grad_norm": 10.999567354293504, "learning_rate": 8.730414791822825e-06, "loss": 0.4855, "step": 1583 }, { "epoch": 0.26, "grad_norm": 16.43151531480138, "learning_rate": 8.72867658739216e-06, "loss": 0.3989, "step": 1584 }, { "epoch": 0.26, "grad_norm": 8.507633457444578, "learning_rate": 8.726937367167211e-06, "loss": 0.5235, "step": 1585 }, { "epoch": 0.26, "grad_norm": 7.395906798739117, "learning_rate": 8.72519713162179e-06, "loss": 0.4209, "step": 1586 }, { "epoch": 0.26, "grad_norm": 10.075748424826264, "learning_rate": 8.723455881229984e-06, "loss": 0.4558, "step": 1587 }, { "epoch": 0.26, "grad_norm": 74.68050799151031, "learning_rate": 8.721713616466158e-06, "loss": 0.4714, "step": 1588 }, { "epoch": 0.26, "grad_norm": 11.17493845346469, "learning_rate": 8.719970337804953e-06, "loss": 0.4418, "step": 1589 }, { "epoch": 0.26, "grad_norm": 15.108421406802913, "learning_rate": 8.718226045721287e-06, "loss": 0.499, "step": 1590 }, { "epoch": 0.26, "grad_norm": 6.386312616900841, "learning_rate": 8.716480740690353e-06, "loss": 0.5241, "step": 1591 }, { "epoch": 0.26, "grad_norm": 11.152886129264175, "learning_rate": 8.71473442318762e-06, "loss": 0.4956, "step": 1592 }, { "epoch": 0.26, "grad_norm": 9.824383606819817, "learning_rate": 8.712987093688833e-06, "loss": 0.5248, "step": 1593 }, { "epoch": 0.26, "grad_norm": 10.840339097815546, "learning_rate": 8.711238752670012e-06, "loss": 0.4019, "step": 1594 }, { "epoch": 0.26, "grad_norm": 14.305540928187915, "learning_rate": 8.709489400607453e-06, "loss": 0.5416, "step": 1595 }, { "epoch": 0.26, "grad_norm": 14.227153851722683, "learning_rate": 8.70773903797773e-06, "loss": 0.5591, "step": 1596 }, { "epoch": 0.26, "grad_norm": 11.108728143104305, "learning_rate": 8.705987665257688e-06, "loss": 0.5017, "step": 1597 }, { "epoch": 0.26, "grad_norm": 8.939312849928044, "learning_rate": 8.704235282924449e-06, "loss": 0.4614, "step": 1598 }, { "epoch": 0.26, "grad_norm": 6.875892599537317, "learning_rate": 8.702481891455414e-06, "loss": 0.4704, "step": 1599 }, { "epoch": 0.26, "grad_norm": 8.207126504701245, "learning_rate": 8.70072749132825e-06, "loss": 0.4264, "step": 1600 }, { "epoch": 0.26, "grad_norm": 10.784673634393716, "learning_rate": 8.698972083020905e-06, "loss": 0.4546, "step": 1601 }, { "epoch": 0.26, "grad_norm": 1.3230511787201216, "learning_rate": 8.697215667011605e-06, "loss": 0.4705, "step": 1602 }, { "epoch": 0.26, "grad_norm": 11.98221144030863, "learning_rate": 8.69545824377884e-06, "loss": 0.4449, "step": 1603 }, { "epoch": 0.26, "grad_norm": 23.821188858311075, "learning_rate": 8.693699813801387e-06, "loss": 0.5079, "step": 1604 }, { "epoch": 0.26, "grad_norm": 15.651753885235879, "learning_rate": 8.691940377558284e-06, "loss": 0.4985, "step": 1605 }, { "epoch": 0.26, "grad_norm": 10.218542923354468, "learning_rate": 8.690179935528853e-06, "loss": 0.4156, "step": 1606 }, { "epoch": 0.26, "grad_norm": 14.92000654913018, "learning_rate": 8.68841848819269e-06, "loss": 0.4425, "step": 1607 }, { "epoch": 0.26, "grad_norm": 11.039339432108823, "learning_rate": 8.686656036029657e-06, "loss": 0.4664, "step": 1608 }, { "epoch": 0.26, "grad_norm": 6.023226576557596, "learning_rate": 8.684892579519897e-06, "loss": 0.447, "step": 1609 }, { "epoch": 0.26, "grad_norm": 7.386391500978715, "learning_rate": 8.683128119143824e-06, "loss": 0.4835, "step": 1610 }, { "epoch": 0.26, "grad_norm": 7.272351642503492, "learning_rate": 8.681362655382125e-06, "loss": 0.4901, "step": 1611 }, { "epoch": 0.26, "grad_norm": 11.562683062850953, "learning_rate": 8.67959618871576e-06, "loss": 0.5142, "step": 1612 }, { "epoch": 0.26, "grad_norm": 6.337599537261909, "learning_rate": 8.677828719625967e-06, "loss": 0.4586, "step": 1613 }, { "epoch": 0.26, "grad_norm": 12.549774493591219, "learning_rate": 8.676060248594248e-06, "loss": 0.4516, "step": 1614 }, { "epoch": 0.26, "grad_norm": 8.91220969728968, "learning_rate": 8.674290776102388e-06, "loss": 0.48, "step": 1615 }, { "epoch": 0.26, "grad_norm": 31.60787626908987, "learning_rate": 8.67252030263244e-06, "loss": 0.4157, "step": 1616 }, { "epoch": 0.26, "grad_norm": 7.012919896435109, "learning_rate": 8.670748828666725e-06, "loss": 0.501, "step": 1617 }, { "epoch": 0.26, "grad_norm": 9.562450922303592, "learning_rate": 8.668976354687844e-06, "loss": 0.4023, "step": 1618 }, { "epoch": 0.26, "grad_norm": 9.126666057795351, "learning_rate": 8.66720288117867e-06, "loss": 0.5263, "step": 1619 }, { "epoch": 0.26, "grad_norm": 19.048680437263343, "learning_rate": 8.665428408622343e-06, "loss": 0.4949, "step": 1620 }, { "epoch": 0.26, "grad_norm": 1.304286820862748, "learning_rate": 8.66365293750228e-06, "loss": 0.4747, "step": 1621 }, { "epoch": 0.26, "grad_norm": 9.330854118739088, "learning_rate": 8.661876468302167e-06, "loss": 0.3907, "step": 1622 }, { "epoch": 0.26, "grad_norm": 7.773192018598685, "learning_rate": 8.660099001505965e-06, "loss": 0.462, "step": 1623 }, { "epoch": 0.26, "grad_norm": 7.423545288887948, "learning_rate": 8.658320537597901e-06, "loss": 0.4659, "step": 1624 }, { "epoch": 0.26, "grad_norm": 13.619672130449494, "learning_rate": 8.656541077062483e-06, "loss": 0.4232, "step": 1625 }, { "epoch": 0.26, "grad_norm": 12.155801981142217, "learning_rate": 8.654760620384482e-06, "loss": 0.4828, "step": 1626 }, { "epoch": 0.26, "grad_norm": 8.853544841782938, "learning_rate": 8.652979168048944e-06, "loss": 0.4484, "step": 1627 }, { "epoch": 0.26, "grad_norm": 14.509172060832219, "learning_rate": 8.651196720541186e-06, "loss": 0.5376, "step": 1628 }, { "epoch": 0.26, "grad_norm": 14.780221815046163, "learning_rate": 8.649413278346795e-06, "loss": 0.5588, "step": 1629 }, { "epoch": 0.26, "grad_norm": 10.350896913841805, "learning_rate": 8.64762884195163e-06, "loss": 0.4948, "step": 1630 }, { "epoch": 0.26, "grad_norm": 10.515572713753683, "learning_rate": 8.64584341184182e-06, "loss": 0.4799, "step": 1631 }, { "epoch": 0.26, "grad_norm": 11.242723253405977, "learning_rate": 8.644056988503769e-06, "loss": 0.4776, "step": 1632 }, { "epoch": 0.26, "grad_norm": 7.682261032499714, "learning_rate": 8.642269572424143e-06, "loss": 0.5032, "step": 1633 }, { "epoch": 0.26, "grad_norm": 13.137230347762788, "learning_rate": 8.640481164089887e-06, "loss": 0.4574, "step": 1634 }, { "epoch": 0.26, "grad_norm": 13.13361040266308, "learning_rate": 8.63869176398821e-06, "loss": 0.4564, "step": 1635 }, { "epoch": 0.26, "grad_norm": 32.3337013144351, "learning_rate": 8.636901372606596e-06, "loss": 0.4893, "step": 1636 }, { "epoch": 0.26, "grad_norm": 10.334518888758286, "learning_rate": 8.635109990432797e-06, "loss": 0.4794, "step": 1637 }, { "epoch": 0.26, "grad_norm": 16.42994113303752, "learning_rate": 8.633317617954832e-06, "loss": 0.4015, "step": 1638 }, { "epoch": 0.26, "grad_norm": 15.055502561901461, "learning_rate": 8.631524255660997e-06, "loss": 0.4392, "step": 1639 }, { "epoch": 0.26, "grad_norm": 7.92981169540569, "learning_rate": 8.629729904039853e-06, "loss": 0.441, "step": 1640 }, { "epoch": 0.26, "grad_norm": 1.3895174555023133, "learning_rate": 8.627934563580226e-06, "loss": 0.4982, "step": 1641 }, { "epoch": 0.26, "grad_norm": 15.557621464092596, "learning_rate": 8.62613823477122e-06, "loss": 0.4587, "step": 1642 }, { "epoch": 0.26, "grad_norm": 10.518765854296685, "learning_rate": 8.624340918102206e-06, "loss": 0.4543, "step": 1643 }, { "epoch": 0.26, "grad_norm": 12.869296176655247, "learning_rate": 8.622542614062816e-06, "loss": 0.4249, "step": 1644 }, { "epoch": 0.27, "grad_norm": 17.10485778520921, "learning_rate": 8.620743323142966e-06, "loss": 0.4686, "step": 1645 }, { "epoch": 0.27, "grad_norm": 1.3853653315378351, "learning_rate": 8.618943045832826e-06, "loss": 0.4945, "step": 1646 }, { "epoch": 0.27, "grad_norm": 9.63686716824368, "learning_rate": 8.617141782622844e-06, "loss": 0.5502, "step": 1647 }, { "epoch": 0.27, "grad_norm": 11.779907949552356, "learning_rate": 8.615339534003735e-06, "loss": 0.4504, "step": 1648 }, { "epoch": 0.27, "grad_norm": 8.028652664341822, "learning_rate": 8.613536300466476e-06, "loss": 0.4973, "step": 1649 }, { "epoch": 0.27, "grad_norm": 10.808902802279686, "learning_rate": 8.611732082502324e-06, "loss": 0.4591, "step": 1650 }, { "epoch": 0.27, "grad_norm": 10.749152932092182, "learning_rate": 8.609926880602794e-06, "loss": 0.4115, "step": 1651 }, { "epoch": 0.27, "grad_norm": 10.026370896564668, "learning_rate": 8.608120695259674e-06, "loss": 0.4868, "step": 1652 }, { "epoch": 0.27, "grad_norm": 11.50421339016215, "learning_rate": 8.606313526965017e-06, "loss": 0.5115, "step": 1653 }, { "epoch": 0.27, "grad_norm": 18.41372557705101, "learning_rate": 8.604505376211148e-06, "loss": 0.5554, "step": 1654 }, { "epoch": 0.27, "grad_norm": 10.236731374543647, "learning_rate": 8.602696243490653e-06, "loss": 0.4143, "step": 1655 }, { "epoch": 0.27, "grad_norm": 10.620595490367121, "learning_rate": 8.600886129296396e-06, "loss": 0.4151, "step": 1656 }, { "epoch": 0.27, "grad_norm": 33.25172395196379, "learning_rate": 8.599075034121496e-06, "loss": 0.4621, "step": 1657 }, { "epoch": 0.27, "grad_norm": 12.978323308186885, "learning_rate": 8.59726295845935e-06, "loss": 0.5235, "step": 1658 }, { "epoch": 0.27, "grad_norm": 13.602390675921882, "learning_rate": 8.595449902803612e-06, "loss": 0.4089, "step": 1659 }, { "epoch": 0.27, "grad_norm": 13.696820801308398, "learning_rate": 8.593635867648214e-06, "loss": 0.4428, "step": 1660 }, { "epoch": 0.27, "grad_norm": 11.41039944464447, "learning_rate": 8.591820853487344e-06, "loss": 0.5256, "step": 1661 }, { "epoch": 0.27, "grad_norm": 9.054526887437678, "learning_rate": 8.590004860815466e-06, "loss": 0.4178, "step": 1662 }, { "epoch": 0.27, "grad_norm": 12.486451819130309, "learning_rate": 8.588187890127305e-06, "loss": 0.4175, "step": 1663 }, { "epoch": 0.27, "grad_norm": 8.706921743690426, "learning_rate": 8.586369941917852e-06, "loss": 0.5075, "step": 1664 }, { "epoch": 0.27, "grad_norm": 7.05660143987441, "learning_rate": 8.58455101668237e-06, "loss": 0.4665, "step": 1665 }, { "epoch": 0.27, "grad_norm": 14.011998156857956, "learning_rate": 8.58273111491638e-06, "loss": 0.4425, "step": 1666 }, { "epoch": 0.27, "grad_norm": 9.781817735925221, "learning_rate": 8.580910237115678e-06, "loss": 0.5101, "step": 1667 }, { "epoch": 0.27, "grad_norm": 20.66535014417799, "learning_rate": 8.579088383776318e-06, "loss": 0.5304, "step": 1668 }, { "epoch": 0.27, "grad_norm": 16.90363131393331, "learning_rate": 8.577265555394626e-06, "loss": 0.4364, "step": 1669 }, { "epoch": 0.27, "grad_norm": 11.00326759488563, "learning_rate": 8.575441752467185e-06, "loss": 0.4369, "step": 1670 }, { "epoch": 0.27, "grad_norm": 11.101639684614764, "learning_rate": 8.573616975490855e-06, "loss": 0.4911, "step": 1671 }, { "epoch": 0.27, "grad_norm": 14.278542819965034, "learning_rate": 8.571791224962754e-06, "loss": 0.4537, "step": 1672 }, { "epoch": 0.27, "grad_norm": 9.27675994511827, "learning_rate": 8.569964501380266e-06, "loss": 0.4728, "step": 1673 }, { "epoch": 0.27, "grad_norm": 34.81570469805829, "learning_rate": 8.56813680524104e-06, "loss": 0.4279, "step": 1674 }, { "epoch": 0.27, "grad_norm": 18.298980035789068, "learning_rate": 8.566308137042995e-06, "loss": 0.4922, "step": 1675 }, { "epoch": 0.27, "grad_norm": 13.033749923288475, "learning_rate": 8.564478497284306e-06, "loss": 0.4888, "step": 1676 }, { "epoch": 0.27, "grad_norm": 12.503427217368314, "learning_rate": 8.562647886463417e-06, "loss": 0.4862, "step": 1677 }, { "epoch": 0.27, "grad_norm": 31.240571004033875, "learning_rate": 8.560816305079041e-06, "loss": 0.3986, "step": 1678 }, { "epoch": 0.27, "grad_norm": 12.130460752457049, "learning_rate": 8.558983753630149e-06, "loss": 0.4324, "step": 1679 }, { "epoch": 0.27, "grad_norm": 1.4086478235800388, "learning_rate": 8.557150232615977e-06, "loss": 0.4886, "step": 1680 }, { "epoch": 0.27, "grad_norm": 14.851887094705658, "learning_rate": 8.55531574253603e-06, "loss": 0.4839, "step": 1681 }, { "epoch": 0.27, "grad_norm": 13.72788461712924, "learning_rate": 8.55348028389007e-06, "loss": 0.4828, "step": 1682 }, { "epoch": 0.27, "grad_norm": 16.252121533670486, "learning_rate": 8.55164385717813e-06, "loss": 0.4857, "step": 1683 }, { "epoch": 0.27, "grad_norm": 11.574199275771631, "learning_rate": 8.549806462900503e-06, "loss": 0.5071, "step": 1684 }, { "epoch": 0.27, "grad_norm": 17.054549576769286, "learning_rate": 8.547968101557742e-06, "loss": 0.4587, "step": 1685 }, { "epoch": 0.27, "grad_norm": 10.51320614334958, "learning_rate": 8.54612877365067e-06, "loss": 0.4483, "step": 1686 }, { "epoch": 0.27, "grad_norm": 12.978725054756705, "learning_rate": 8.544288479680371e-06, "loss": 0.3968, "step": 1687 }, { "epoch": 0.27, "grad_norm": 12.153282940935025, "learning_rate": 8.542447220148191e-06, "loss": 0.5234, "step": 1688 }, { "epoch": 0.27, "grad_norm": 19.47603342477601, "learning_rate": 8.540604995555741e-06, "loss": 0.4826, "step": 1689 }, { "epoch": 0.27, "grad_norm": 14.18731041762881, "learning_rate": 8.538761806404892e-06, "loss": 0.4605, "step": 1690 }, { "epoch": 0.27, "grad_norm": 1.3946656817674892, "learning_rate": 8.53691765319778e-06, "loss": 0.4517, "step": 1691 }, { "epoch": 0.27, "grad_norm": 15.467965220831184, "learning_rate": 8.535072536436805e-06, "loss": 0.4843, "step": 1692 }, { "epoch": 0.27, "grad_norm": 15.594412590202573, "learning_rate": 8.533226456624624e-06, "loss": 0.4398, "step": 1693 }, { "epoch": 0.27, "grad_norm": 14.116100439626473, "learning_rate": 8.531379414264165e-06, "loss": 0.4495, "step": 1694 }, { "epoch": 0.27, "grad_norm": 21.007705304321693, "learning_rate": 8.52953140985861e-06, "loss": 0.5625, "step": 1695 }, { "epoch": 0.27, "grad_norm": 19.74921427902309, "learning_rate": 8.527682443911405e-06, "loss": 0.4496, "step": 1696 }, { "epoch": 0.27, "grad_norm": 15.638990948192355, "learning_rate": 8.525832516926262e-06, "loss": 0.5645, "step": 1697 }, { "epoch": 0.27, "grad_norm": 20.959114363780845, "learning_rate": 8.52398162940715e-06, "loss": 0.3857, "step": 1698 }, { "epoch": 0.27, "grad_norm": 16.137100407316332, "learning_rate": 8.522129781858306e-06, "loss": 0.5022, "step": 1699 }, { "epoch": 0.27, "grad_norm": 20.951808536602996, "learning_rate": 8.52027697478422e-06, "loss": 0.3644, "step": 1700 }, { "epoch": 0.27, "grad_norm": 26.158019756074676, "learning_rate": 8.518423208689647e-06, "loss": 0.4882, "step": 1701 }, { "epoch": 0.27, "grad_norm": 24.297157246455917, "learning_rate": 8.516568484079609e-06, "loss": 0.493, "step": 1702 }, { "epoch": 0.27, "grad_norm": 31.579394701798257, "learning_rate": 8.514712801459379e-06, "loss": 0.5093, "step": 1703 }, { "epoch": 0.27, "grad_norm": 25.97274536984953, "learning_rate": 8.5128561613345e-06, "loss": 0.4537, "step": 1704 }, { "epoch": 0.27, "grad_norm": 66.11590215744103, "learning_rate": 8.510998564210769e-06, "loss": 0.4128, "step": 1705 }, { "epoch": 0.27, "grad_norm": 29.710324056001934, "learning_rate": 8.509140010594248e-06, "loss": 0.4696, "step": 1706 }, { "epoch": 0.28, "grad_norm": 26.61163396301926, "learning_rate": 8.50728050099126e-06, "loss": 0.4059, "step": 1707 }, { "epoch": 0.28, "grad_norm": 15.268046430898593, "learning_rate": 8.505420035908383e-06, "loss": 0.4423, "step": 1708 }, { "epoch": 0.28, "grad_norm": 17.46164155677649, "learning_rate": 8.503558615852461e-06, "loss": 0.5137, "step": 1709 }, { "epoch": 0.28, "grad_norm": 19.987528095451438, "learning_rate": 8.501696241330594e-06, "loss": 0.4872, "step": 1710 }, { "epoch": 0.28, "grad_norm": 17.94169289381315, "learning_rate": 8.49983291285015e-06, "loss": 0.477, "step": 1711 }, { "epoch": 0.28, "grad_norm": 12.380447024361388, "learning_rate": 8.497968630918743e-06, "loss": 0.4179, "step": 1712 }, { "epoch": 0.28, "grad_norm": 18.153599009873076, "learning_rate": 8.496103396044262e-06, "loss": 0.4607, "step": 1713 }, { "epoch": 0.28, "grad_norm": 10.624039119459969, "learning_rate": 8.494237208734843e-06, "loss": 0.4405, "step": 1714 }, { "epoch": 0.28, "grad_norm": 13.64180077056077, "learning_rate": 8.492370069498892e-06, "loss": 0.4563, "step": 1715 }, { "epoch": 0.28, "grad_norm": 11.591772801802806, "learning_rate": 8.490501978845064e-06, "loss": 0.4074, "step": 1716 }, { "epoch": 0.28, "grad_norm": 13.188904265649606, "learning_rate": 8.488632937282281e-06, "loss": 0.428, "step": 1717 }, { "epoch": 0.28, "grad_norm": 14.284839185824236, "learning_rate": 8.486762945319722e-06, "loss": 0.4865, "step": 1718 }, { "epoch": 0.28, "grad_norm": 12.929936700489185, "learning_rate": 8.484892003466823e-06, "loss": 0.4191, "step": 1719 }, { "epoch": 0.28, "grad_norm": 10.59284587057886, "learning_rate": 8.48302011223328e-06, "loss": 0.3476, "step": 1720 }, { "epoch": 0.28, "grad_norm": 11.791662393615871, "learning_rate": 8.48114727212905e-06, "loss": 0.4482, "step": 1721 }, { "epoch": 0.28, "grad_norm": 15.446081038534517, "learning_rate": 8.479273483664344e-06, "loss": 0.4697, "step": 1722 }, { "epoch": 0.28, "grad_norm": 12.360144621178796, "learning_rate": 8.477398747349632e-06, "loss": 0.4608, "step": 1723 }, { "epoch": 0.28, "grad_norm": 10.344554653061945, "learning_rate": 8.47552306369565e-06, "loss": 0.4937, "step": 1724 }, { "epoch": 0.28, "grad_norm": 13.93541684719766, "learning_rate": 8.473646433213378e-06, "loss": 0.4346, "step": 1725 }, { "epoch": 0.28, "grad_norm": 14.310477905520795, "learning_rate": 8.471768856414069e-06, "loss": 0.4748, "step": 1726 }, { "epoch": 0.28, "grad_norm": 17.036615066349903, "learning_rate": 8.469890333809223e-06, "loss": 0.4512, "step": 1727 }, { "epoch": 0.28, "grad_norm": 20.868647610127773, "learning_rate": 8.468010865910601e-06, "loss": 0.4477, "step": 1728 }, { "epoch": 0.28, "grad_norm": 15.253249853719144, "learning_rate": 8.466130453230224e-06, "loss": 0.4814, "step": 1729 }, { "epoch": 0.28, "grad_norm": 14.94229099565896, "learning_rate": 8.464249096280368e-06, "loss": 0.4414, "step": 1730 }, { "epoch": 0.28, "grad_norm": 14.263295107453342, "learning_rate": 8.462366795573564e-06, "loss": 0.4329, "step": 1731 }, { "epoch": 0.28, "grad_norm": 10.049319993606705, "learning_rate": 8.460483551622606e-06, "loss": 0.4954, "step": 1732 }, { "epoch": 0.28, "grad_norm": 22.40481352155851, "learning_rate": 8.458599364940537e-06, "loss": 0.498, "step": 1733 }, { "epoch": 0.28, "grad_norm": 18.061119651418057, "learning_rate": 8.456714236040664e-06, "loss": 0.4554, "step": 1734 }, { "epoch": 0.28, "grad_norm": 11.911538626267598, "learning_rate": 8.45482816543655e-06, "loss": 0.5636, "step": 1735 }, { "epoch": 0.28, "grad_norm": 10.988129127948227, "learning_rate": 8.45294115364201e-06, "loss": 0.4689, "step": 1736 }, { "epoch": 0.28, "grad_norm": 73.01351768083636, "learning_rate": 8.45105320117112e-06, "loss": 0.5221, "step": 1737 }, { "epoch": 0.28, "grad_norm": 21.848488524356906, "learning_rate": 8.449164308538209e-06, "loss": 0.5006, "step": 1738 }, { "epoch": 0.28, "grad_norm": 13.244241721761913, "learning_rate": 8.447274476257863e-06, "loss": 0.5025, "step": 1739 }, { "epoch": 0.28, "grad_norm": 1.3658830912486526, "learning_rate": 8.445383704844925e-06, "loss": 0.4787, "step": 1740 }, { "epoch": 0.28, "grad_norm": 14.451635095346411, "learning_rate": 8.443491994814493e-06, "loss": 0.5581, "step": 1741 }, { "epoch": 0.28, "grad_norm": 13.183600117805668, "learning_rate": 8.441599346681921e-06, "loss": 0.5061, "step": 1742 }, { "epoch": 0.28, "grad_norm": 13.941269381075315, "learning_rate": 8.43970576096282e-06, "loss": 0.399, "step": 1743 }, { "epoch": 0.28, "grad_norm": 16.947461660431472, "learning_rate": 8.437811238173053e-06, "loss": 0.5306, "step": 1744 }, { "epoch": 0.28, "grad_norm": 15.532524262980653, "learning_rate": 8.43591577882874e-06, "loss": 0.3515, "step": 1745 }, { "epoch": 0.28, "grad_norm": 20.360542633207107, "learning_rate": 8.43401938344626e-06, "loss": 0.426, "step": 1746 }, { "epoch": 0.28, "grad_norm": 28.687001438847798, "learning_rate": 8.432122052542238e-06, "loss": 0.3623, "step": 1747 }, { "epoch": 0.28, "grad_norm": 18.50405036340982, "learning_rate": 8.430223786633563e-06, "loss": 0.4165, "step": 1748 }, { "epoch": 0.28, "grad_norm": 16.497973916622897, "learning_rate": 8.428324586237374e-06, "loss": 0.4901, "step": 1749 }, { "epoch": 0.28, "grad_norm": 27.731133264989, "learning_rate": 8.426424451871063e-06, "loss": 0.5145, "step": 1750 }, { "epoch": 0.28, "grad_norm": 17.873825700678175, "learning_rate": 8.424523384052284e-06, "loss": 0.4882, "step": 1751 }, { "epoch": 0.28, "grad_norm": 16.363451631667527, "learning_rate": 8.422621383298936e-06, "loss": 0.488, "step": 1752 }, { "epoch": 0.28, "grad_norm": 14.406654371583244, "learning_rate": 8.42071845012918e-06, "loss": 0.4437, "step": 1753 }, { "epoch": 0.28, "grad_norm": 17.08646492843234, "learning_rate": 8.418814585061423e-06, "loss": 0.3818, "step": 1754 }, { "epoch": 0.28, "grad_norm": 21.484578175918124, "learning_rate": 8.416909788614335e-06, "loss": 0.4347, "step": 1755 }, { "epoch": 0.28, "grad_norm": 24.31309885712686, "learning_rate": 8.415004061306833e-06, "loss": 0.4899, "step": 1756 }, { "epoch": 0.28, "grad_norm": 21.308017888189397, "learning_rate": 8.413097403658089e-06, "loss": 0.4511, "step": 1757 }, { "epoch": 0.28, "grad_norm": 16.101152504316246, "learning_rate": 8.411189816187528e-06, "loss": 0.4726, "step": 1758 }, { "epoch": 0.28, "grad_norm": 12.517548247178718, "learning_rate": 8.409281299414833e-06, "loss": 0.3902, "step": 1759 }, { "epoch": 0.28, "grad_norm": 11.62411927986583, "learning_rate": 8.407371853859935e-06, "loss": 0.4779, "step": 1760 }, { "epoch": 0.28, "grad_norm": 11.733966305725422, "learning_rate": 8.405461480043019e-06, "loss": 0.4881, "step": 1761 }, { "epoch": 0.28, "grad_norm": 25.606392439355545, "learning_rate": 8.403550178484521e-06, "loss": 0.4397, "step": 1762 }, { "epoch": 0.28, "grad_norm": 1.2400329454028334, "learning_rate": 8.401637949705138e-06, "loss": 0.5138, "step": 1763 }, { "epoch": 0.28, "grad_norm": 10.53117447597358, "learning_rate": 8.399724794225809e-06, "loss": 0.4055, "step": 1764 }, { "epoch": 0.28, "grad_norm": 12.439454360388517, "learning_rate": 8.397810712567732e-06, "loss": 0.4623, "step": 1765 }, { "epoch": 0.28, "grad_norm": 14.644494712122718, "learning_rate": 8.39589570525236e-06, "loss": 0.4962, "step": 1766 }, { "epoch": 0.28, "grad_norm": 8.883141468823347, "learning_rate": 8.393979772801386e-06, "loss": 0.491, "step": 1767 }, { "epoch": 0.28, "grad_norm": 12.825058122805691, "learning_rate": 8.392062915736765e-06, "loss": 0.4009, "step": 1768 }, { "epoch": 0.29, "grad_norm": 28.902302990397818, "learning_rate": 8.390145134580705e-06, "loss": 0.4433, "step": 1769 }, { "epoch": 0.29, "grad_norm": 17.084891083493, "learning_rate": 8.38822642985566e-06, "loss": 0.443, "step": 1770 }, { "epoch": 0.29, "grad_norm": 11.494499528812158, "learning_rate": 8.386306802084339e-06, "loss": 0.462, "step": 1771 }, { "epoch": 0.29, "grad_norm": 7.853799513547169, "learning_rate": 8.3843862517897e-06, "loss": 0.4567, "step": 1772 }, { "epoch": 0.29, "grad_norm": 15.655371380456858, "learning_rate": 8.382464779494954e-06, "loss": 0.3956, "step": 1773 }, { "epoch": 0.29, "grad_norm": 7.221529865290183, "learning_rate": 8.380542385723566e-06, "loss": 0.4512, "step": 1774 }, { "epoch": 0.29, "grad_norm": 8.463946808499726, "learning_rate": 8.378619070999245e-06, "loss": 0.4624, "step": 1775 }, { "epoch": 0.29, "grad_norm": 1.43068870456697, "learning_rate": 8.37669483584596e-06, "loss": 0.4958, "step": 1776 }, { "epoch": 0.29, "grad_norm": 8.864957123568374, "learning_rate": 8.37476968078792e-06, "loss": 0.5371, "step": 1777 }, { "epoch": 0.29, "grad_norm": 11.297187493578013, "learning_rate": 8.372843606349594e-06, "loss": 0.4424, "step": 1778 }, { "epoch": 0.29, "grad_norm": 9.646721072413927, "learning_rate": 8.370916613055695e-06, "loss": 0.468, "step": 1779 }, { "epoch": 0.29, "grad_norm": 8.959320936924168, "learning_rate": 8.368988701431192e-06, "loss": 0.4723, "step": 1780 }, { "epoch": 0.29, "grad_norm": 8.986117639114127, "learning_rate": 8.3670598720013e-06, "loss": 0.4734, "step": 1781 }, { "epoch": 0.29, "grad_norm": 11.176154087809401, "learning_rate": 8.365130125291485e-06, "loss": 0.5121, "step": 1782 }, { "epoch": 0.29, "grad_norm": 14.449742613048508, "learning_rate": 8.363199461827464e-06, "loss": 0.5307, "step": 1783 }, { "epoch": 0.29, "grad_norm": 13.136152815654695, "learning_rate": 8.361267882135203e-06, "loss": 0.4419, "step": 1784 }, { "epoch": 0.29, "grad_norm": 8.453501766924287, "learning_rate": 8.359335386740916e-06, "loss": 0.4166, "step": 1785 }, { "epoch": 0.29, "grad_norm": 15.46461678716022, "learning_rate": 8.357401976171069e-06, "loss": 0.4481, "step": 1786 }, { "epoch": 0.29, "grad_norm": 17.689025478266142, "learning_rate": 8.355467650952375e-06, "loss": 0.4319, "step": 1787 }, { "epoch": 0.29, "grad_norm": 32.92352334013537, "learning_rate": 8.353532411611801e-06, "loss": 0.5377, "step": 1788 }, { "epoch": 0.29, "grad_norm": 10.656649854795825, "learning_rate": 8.351596258676558e-06, "loss": 0.4499, "step": 1789 }, { "epoch": 0.29, "grad_norm": 14.819779590057792, "learning_rate": 8.349659192674104e-06, "loss": 0.4946, "step": 1790 }, { "epoch": 0.29, "grad_norm": 16.66328151920925, "learning_rate": 8.347721214132154e-06, "loss": 0.4765, "step": 1791 }, { "epoch": 0.29, "grad_norm": 10.049763762552724, "learning_rate": 8.345782323578664e-06, "loss": 0.4039, "step": 1792 }, { "epoch": 0.29, "grad_norm": 8.481263059860153, "learning_rate": 8.343842521541844e-06, "loss": 0.5103, "step": 1793 }, { "epoch": 0.29, "grad_norm": 10.685545764508802, "learning_rate": 8.341901808550147e-06, "loss": 0.4901, "step": 1794 }, { "epoch": 0.29, "grad_norm": 14.482133335822734, "learning_rate": 8.339960185132275e-06, "loss": 0.5208, "step": 1795 }, { "epoch": 0.29, "grad_norm": 14.870071432656541, "learning_rate": 8.338017651817183e-06, "loss": 0.5241, "step": 1796 }, { "epoch": 0.29, "grad_norm": 1.2869410834011459, "learning_rate": 8.336074209134071e-06, "loss": 0.446, "step": 1797 }, { "epoch": 0.29, "grad_norm": 8.778060979023511, "learning_rate": 8.334129857612383e-06, "loss": 0.5232, "step": 1798 }, { "epoch": 0.29, "grad_norm": 13.569415755455735, "learning_rate": 8.332184597781818e-06, "loss": 0.4619, "step": 1799 }, { "epoch": 0.29, "grad_norm": 15.430439255771944, "learning_rate": 8.330238430172315e-06, "loss": 0.434, "step": 1800 }, { "epoch": 0.29, "grad_norm": 16.751167726668026, "learning_rate": 8.328291355314067e-06, "loss": 0.4808, "step": 1801 }, { "epoch": 0.29, "grad_norm": 32.192879792975624, "learning_rate": 8.326343373737506e-06, "loss": 0.4244, "step": 1802 }, { "epoch": 0.29, "grad_norm": 13.470468212317533, "learning_rate": 8.32439448597332e-06, "loss": 0.4402, "step": 1803 }, { "epoch": 0.29, "grad_norm": 13.624004760927283, "learning_rate": 8.322444692552437e-06, "loss": 0.466, "step": 1804 }, { "epoch": 0.29, "grad_norm": 9.130367188054828, "learning_rate": 8.320493994006039e-06, "loss": 0.5162, "step": 1805 }, { "epoch": 0.29, "grad_norm": 11.621331870317807, "learning_rate": 8.318542390865546e-06, "loss": 0.4334, "step": 1806 }, { "epoch": 0.29, "grad_norm": 13.091968878269356, "learning_rate": 8.316589883662629e-06, "loss": 0.4858, "step": 1807 }, { "epoch": 0.29, "grad_norm": 6.098732968845795, "learning_rate": 8.314636472929206e-06, "loss": 0.3972, "step": 1808 }, { "epoch": 0.29, "grad_norm": 10.672514691461252, "learning_rate": 8.31268215919744e-06, "loss": 0.4977, "step": 1809 }, { "epoch": 0.29, "grad_norm": 8.796987146402977, "learning_rate": 8.310726942999736e-06, "loss": 0.5081, "step": 1810 }, { "epoch": 0.29, "grad_norm": 1.209953574810269, "learning_rate": 8.308770824868757e-06, "loss": 0.4743, "step": 1811 }, { "epoch": 0.29, "grad_norm": 8.60332827446013, "learning_rate": 8.306813805337395e-06, "loss": 0.5323, "step": 1812 }, { "epoch": 0.29, "grad_norm": 7.5702925008449204, "learning_rate": 8.3048558849388e-06, "loss": 0.4797, "step": 1813 }, { "epoch": 0.29, "grad_norm": 8.547047920633775, "learning_rate": 8.302897064206363e-06, "loss": 0.4777, "step": 1814 }, { "epoch": 0.29, "grad_norm": 29.593462606059088, "learning_rate": 8.300937343673722e-06, "loss": 0.5231, "step": 1815 }, { "epoch": 0.29, "grad_norm": 15.393494232963628, "learning_rate": 8.298976723874757e-06, "loss": 0.4042, "step": 1816 }, { "epoch": 0.29, "grad_norm": 8.934206236025013, "learning_rate": 8.297015205343595e-06, "loss": 0.396, "step": 1817 }, { "epoch": 0.29, "grad_norm": 10.092173847840467, "learning_rate": 8.295052788614608e-06, "loss": 0.4053, "step": 1818 }, { "epoch": 0.29, "grad_norm": 7.395020247515282, "learning_rate": 8.293089474222414e-06, "loss": 0.4308, "step": 1819 }, { "epoch": 0.29, "grad_norm": 15.60235545023061, "learning_rate": 8.291125262701874e-06, "loss": 0.5248, "step": 1820 }, { "epoch": 0.29, "grad_norm": 10.22186120958288, "learning_rate": 8.289160154588088e-06, "loss": 0.5464, "step": 1821 }, { "epoch": 0.29, "grad_norm": 7.876645115159355, "learning_rate": 8.287194150416413e-06, "loss": 0.4784, "step": 1822 }, { "epoch": 0.29, "grad_norm": 10.272816749331893, "learning_rate": 8.285227250722439e-06, "loss": 0.4482, "step": 1823 }, { "epoch": 0.29, "grad_norm": 5.691659178315458, "learning_rate": 8.283259456042e-06, "loss": 0.4232, "step": 1824 }, { "epoch": 0.29, "grad_norm": 9.952105268912957, "learning_rate": 8.281290766911187e-06, "loss": 0.3881, "step": 1825 }, { "epoch": 0.29, "grad_norm": 12.686149505205082, "learning_rate": 8.279321183866317e-06, "loss": 0.4323, "step": 1826 }, { "epoch": 0.29, "grad_norm": 9.753069243138839, "learning_rate": 8.277350707443964e-06, "loss": 0.4607, "step": 1827 }, { "epoch": 0.29, "grad_norm": 9.998994473107198, "learning_rate": 8.275379338180936e-06, "loss": 0.4328, "step": 1828 }, { "epoch": 0.29, "grad_norm": 9.150496834566596, "learning_rate": 8.273407076614288e-06, "loss": 0.4904, "step": 1829 }, { "epoch": 0.29, "grad_norm": 8.63007979752757, "learning_rate": 8.271433923281322e-06, "loss": 0.5007, "step": 1830 }, { "epoch": 0.3, "grad_norm": 9.24104133253299, "learning_rate": 8.26945987871958e-06, "loss": 0.4586, "step": 1831 }, { "epoch": 0.3, "grad_norm": 7.338792035956157, "learning_rate": 8.26748494346684e-06, "loss": 0.4354, "step": 1832 }, { "epoch": 0.3, "grad_norm": 24.69294378934424, "learning_rate": 8.265509118061135e-06, "loss": 0.4355, "step": 1833 }, { "epoch": 0.3, "grad_norm": 8.42627842149161, "learning_rate": 8.26353240304073e-06, "loss": 0.4569, "step": 1834 }, { "epoch": 0.3, "grad_norm": 8.299007469167014, "learning_rate": 8.26155479894414e-06, "loss": 0.4631, "step": 1835 }, { "epoch": 0.3, "grad_norm": 10.704611672321414, "learning_rate": 8.259576306310118e-06, "loss": 0.4947, "step": 1836 }, { "epoch": 0.3, "grad_norm": 11.265032240080957, "learning_rate": 8.257596925677657e-06, "loss": 0.4217, "step": 1837 }, { "epoch": 0.3, "grad_norm": 92.58870948430163, "learning_rate": 8.255616657586e-06, "loss": 0.4556, "step": 1838 }, { "epoch": 0.3, "grad_norm": 7.5883870427664615, "learning_rate": 8.253635502574623e-06, "loss": 0.4548, "step": 1839 }, { "epoch": 0.3, "grad_norm": 7.650623163245917, "learning_rate": 8.251653461183249e-06, "loss": 0.4217, "step": 1840 }, { "epoch": 0.3, "grad_norm": 12.186534988639481, "learning_rate": 8.249670533951838e-06, "loss": 0.506, "step": 1841 }, { "epoch": 0.3, "grad_norm": 12.829229322790615, "learning_rate": 8.247686721420596e-06, "loss": 0.5017, "step": 1842 }, { "epoch": 0.3, "grad_norm": 15.44682078639622, "learning_rate": 8.24570202412997e-06, "loss": 0.4305, "step": 1843 }, { "epoch": 0.3, "grad_norm": 1.2939011149731274, "learning_rate": 8.243716442620644e-06, "loss": 0.5034, "step": 1844 }, { "epoch": 0.3, "grad_norm": 8.00205765835229, "learning_rate": 8.241729977433543e-06, "loss": 0.4363, "step": 1845 }, { "epoch": 0.3, "grad_norm": 13.265499855379051, "learning_rate": 8.239742629109839e-06, "loss": 0.4812, "step": 1846 }, { "epoch": 0.3, "grad_norm": 8.657523160013294, "learning_rate": 8.237754398190937e-06, "loss": 0.4435, "step": 1847 }, { "epoch": 0.3, "grad_norm": 11.544997580430103, "learning_rate": 8.235765285218491e-06, "loss": 0.4829, "step": 1848 }, { "epoch": 0.3, "grad_norm": 10.082268117093488, "learning_rate": 8.233775290734385e-06, "loss": 0.4429, "step": 1849 }, { "epoch": 0.3, "grad_norm": 1.050997447869134, "learning_rate": 8.23178441528075e-06, "loss": 0.4625, "step": 1850 }, { "epoch": 0.3, "grad_norm": 9.01415487882834, "learning_rate": 8.229792659399957e-06, "loss": 0.4355, "step": 1851 }, { "epoch": 0.3, "grad_norm": 12.185236088347908, "learning_rate": 8.227800023634614e-06, "loss": 0.4719, "step": 1852 }, { "epoch": 0.3, "grad_norm": 1.1899598950708967, "learning_rate": 8.225806508527568e-06, "loss": 0.5037, "step": 1853 }, { "epoch": 0.3, "grad_norm": 11.544543319070447, "learning_rate": 8.22381211462191e-06, "loss": 0.538, "step": 1854 }, { "epoch": 0.3, "grad_norm": 1.2869544339427652, "learning_rate": 8.221816842460967e-06, "loss": 0.4602, "step": 1855 }, { "epoch": 0.3, "grad_norm": 18.29256238011236, "learning_rate": 8.219820692588306e-06, "loss": 0.5011, "step": 1856 }, { "epoch": 0.3, "grad_norm": 8.201743014993786, "learning_rate": 8.217823665547733e-06, "loss": 0.448, "step": 1857 }, { "epoch": 0.3, "grad_norm": 9.624443087298454, "learning_rate": 8.215825761883295e-06, "loss": 0.487, "step": 1858 }, { "epoch": 0.3, "grad_norm": 8.31640888031695, "learning_rate": 8.213826982139273e-06, "loss": 0.4834, "step": 1859 }, { "epoch": 0.3, "grad_norm": 15.376094189156644, "learning_rate": 8.211827326860193e-06, "loss": 0.4967, "step": 1860 }, { "epoch": 0.3, "grad_norm": 31.569835376328527, "learning_rate": 8.209826796590814e-06, "loss": 0.3853, "step": 1861 }, { "epoch": 0.3, "grad_norm": 17.178171678263944, "learning_rate": 8.207825391876137e-06, "loss": 0.5247, "step": 1862 }, { "epoch": 0.3, "grad_norm": 8.806153492820137, "learning_rate": 8.205823113261397e-06, "loss": 0.5024, "step": 1863 }, { "epoch": 0.3, "grad_norm": 8.56398242710549, "learning_rate": 8.203819961292074e-06, "loss": 0.4615, "step": 1864 }, { "epoch": 0.3, "grad_norm": 11.222232088655792, "learning_rate": 8.20181593651388e-06, "loss": 0.4978, "step": 1865 }, { "epoch": 0.3, "grad_norm": 12.826614289597524, "learning_rate": 8.199811039472764e-06, "loss": 0.5161, "step": 1866 }, { "epoch": 0.3, "grad_norm": 10.938278612428531, "learning_rate": 8.197805270714918e-06, "loss": 0.4294, "step": 1867 }, { "epoch": 0.3, "grad_norm": 9.912293474116812, "learning_rate": 8.19579863078677e-06, "loss": 0.3732, "step": 1868 }, { "epoch": 0.3, "grad_norm": 10.496731848682673, "learning_rate": 8.193791120234982e-06, "loss": 0.4378, "step": 1869 }, { "epoch": 0.3, "grad_norm": 34.9320832228751, "learning_rate": 8.191782739606455e-06, "loss": 0.4813, "step": 1870 }, { "epoch": 0.3, "grad_norm": 1.4325991702648226, "learning_rate": 8.189773489448328e-06, "loss": 0.4877, "step": 1871 }, { "epoch": 0.3, "grad_norm": 13.798553487796895, "learning_rate": 8.187763370307975e-06, "loss": 0.4346, "step": 1872 }, { "epoch": 0.3, "grad_norm": 1.3825097704921212, "learning_rate": 8.185752382733007e-06, "loss": 0.4209, "step": 1873 }, { "epoch": 0.3, "grad_norm": 21.821756747862477, "learning_rate": 8.183740527271278e-06, "loss": 0.4819, "step": 1874 }, { "epoch": 0.3, "grad_norm": 9.180679109090125, "learning_rate": 8.181727804470867e-06, "loss": 0.481, "step": 1875 }, { "epoch": 0.3, "grad_norm": 19.508129795862565, "learning_rate": 8.179714214880096e-06, "loss": 0.4727, "step": 1876 }, { "epoch": 0.3, "grad_norm": 25.065243455455967, "learning_rate": 8.177699759047525e-06, "loss": 0.5055, "step": 1877 }, { "epoch": 0.3, "grad_norm": 10.185311759250396, "learning_rate": 8.175684437521946e-06, "loss": 0.4373, "step": 1878 }, { "epoch": 0.3, "grad_norm": 22.61750829268189, "learning_rate": 8.173668250852389e-06, "loss": 0.4964, "step": 1879 }, { "epoch": 0.3, "grad_norm": 11.757430876743639, "learning_rate": 8.171651199588118e-06, "loss": 0.4633, "step": 1880 }, { "epoch": 0.3, "grad_norm": 1.2860555364682593, "learning_rate": 8.16963328427863e-06, "loss": 0.4721, "step": 1881 }, { "epoch": 0.3, "grad_norm": 32.43435771204616, "learning_rate": 8.167614505473667e-06, "loss": 0.5167, "step": 1882 }, { "epoch": 0.3, "grad_norm": 8.451686489945402, "learning_rate": 8.165594863723197e-06, "loss": 0.4788, "step": 1883 }, { "epoch": 0.3, "grad_norm": 11.28991473592401, "learning_rate": 8.163574359577422e-06, "loss": 0.5085, "step": 1884 }, { "epoch": 0.3, "grad_norm": 13.531241944417877, "learning_rate": 8.16155299358679e-06, "loss": 0.4545, "step": 1885 }, { "epoch": 0.3, "grad_norm": 6.499327205424262, "learning_rate": 8.159530766301974e-06, "loss": 0.3623, "step": 1886 }, { "epoch": 0.3, "grad_norm": 6.023961630403385, "learning_rate": 8.157507678273884e-06, "loss": 0.4487, "step": 1887 }, { "epoch": 0.3, "grad_norm": 13.223444847237513, "learning_rate": 8.155483730053664e-06, "loss": 0.5283, "step": 1888 }, { "epoch": 0.3, "grad_norm": 15.480474154831146, "learning_rate": 8.153458922192696e-06, "loss": 0.4348, "step": 1889 }, { "epoch": 0.3, "grad_norm": 1.396814694578618, "learning_rate": 8.15143325524259e-06, "loss": 0.4423, "step": 1890 }, { "epoch": 0.3, "grad_norm": 13.468279093750061, "learning_rate": 8.149406729755198e-06, "loss": 0.5127, "step": 1891 }, { "epoch": 0.3, "grad_norm": 68.76089004738085, "learning_rate": 8.147379346282599e-06, "loss": 0.5119, "step": 1892 }, { "epoch": 0.31, "grad_norm": 8.262141138936306, "learning_rate": 8.145351105377107e-06, "loss": 0.4351, "step": 1893 }, { "epoch": 0.31, "grad_norm": 8.069347919688143, "learning_rate": 8.14332200759127e-06, "loss": 0.4801, "step": 1894 }, { "epoch": 0.31, "grad_norm": 14.72688403141868, "learning_rate": 8.141292053477873e-06, "loss": 0.4662, "step": 1895 }, { "epoch": 0.31, "grad_norm": 6.901064449722384, "learning_rate": 8.139261243589933e-06, "loss": 0.499, "step": 1896 }, { "epoch": 0.31, "grad_norm": 7.230140264100207, "learning_rate": 8.137229578480694e-06, "loss": 0.4091, "step": 1897 }, { "epoch": 0.31, "grad_norm": 16.255200287087952, "learning_rate": 8.135197058703638e-06, "loss": 0.4332, "step": 1898 }, { "epoch": 0.31, "grad_norm": 7.125695915327382, "learning_rate": 8.133163684812484e-06, "loss": 0.4844, "step": 1899 }, { "epoch": 0.31, "grad_norm": 6.838622263720925, "learning_rate": 8.131129457361176e-06, "loss": 0.4612, "step": 1900 }, { "epoch": 0.31, "grad_norm": 7.1326989493283195, "learning_rate": 8.129094376903891e-06, "loss": 0.4366, "step": 1901 }, { "epoch": 0.31, "grad_norm": 7.168084706186646, "learning_rate": 8.127058443995046e-06, "loss": 0.4836, "step": 1902 }, { "epoch": 0.31, "grad_norm": 10.455059345979217, "learning_rate": 8.125021659189281e-06, "loss": 0.4367, "step": 1903 }, { "epoch": 0.31, "grad_norm": 13.884392601375222, "learning_rate": 8.122984023041476e-06, "loss": 0.4665, "step": 1904 }, { "epoch": 0.31, "grad_norm": 1.3049386307029434, "learning_rate": 8.120945536106738e-06, "loss": 0.4846, "step": 1905 }, { "epoch": 0.31, "grad_norm": 6.702064323236561, "learning_rate": 8.118906198940403e-06, "loss": 0.4946, "step": 1906 }, { "epoch": 0.31, "grad_norm": 32.61278915327996, "learning_rate": 8.11686601209805e-06, "loss": 0.3984, "step": 1907 }, { "epoch": 0.31, "grad_norm": 1.2554731141994322, "learning_rate": 8.114824976135478e-06, "loss": 0.5, "step": 1908 }, { "epoch": 0.31, "grad_norm": 10.745495582843695, "learning_rate": 8.11278309160872e-06, "loss": 0.4234, "step": 1909 }, { "epoch": 0.31, "grad_norm": 10.66981560973259, "learning_rate": 8.110740359074046e-06, "loss": 0.5033, "step": 1910 }, { "epoch": 0.31, "grad_norm": 10.578922072691991, "learning_rate": 8.108696779087949e-06, "loss": 0.5003, "step": 1911 }, { "epoch": 0.31, "grad_norm": 9.596061895270065, "learning_rate": 8.106652352207157e-06, "loss": 0.4477, "step": 1912 }, { "epoch": 0.31, "grad_norm": 9.589909903234725, "learning_rate": 8.10460707898863e-06, "loss": 0.4094, "step": 1913 }, { "epoch": 0.31, "grad_norm": 8.899410763774922, "learning_rate": 8.102560959989554e-06, "loss": 0.4883, "step": 1914 }, { "epoch": 0.31, "grad_norm": 13.645961532350412, "learning_rate": 8.100513995767352e-06, "loss": 0.4235, "step": 1915 }, { "epoch": 0.31, "grad_norm": 8.938508570814165, "learning_rate": 8.098466186879669e-06, "loss": 0.5199, "step": 1916 }, { "epoch": 0.31, "grad_norm": 6.950029226032058, "learning_rate": 8.096417533884388e-06, "loss": 0.4538, "step": 1917 }, { "epoch": 0.31, "grad_norm": 8.646892522456374, "learning_rate": 8.094368037339619e-06, "loss": 0.4831, "step": 1918 }, { "epoch": 0.31, "grad_norm": 1.3334363217075338, "learning_rate": 8.092317697803698e-06, "loss": 0.4679, "step": 1919 }, { "epoch": 0.31, "grad_norm": 11.715286427949287, "learning_rate": 8.090266515835197e-06, "loss": 0.4546, "step": 1920 }, { "epoch": 0.31, "grad_norm": 6.314716226954758, "learning_rate": 8.088214491992912e-06, "loss": 0.4988, "step": 1921 }, { "epoch": 0.31, "grad_norm": 10.860102370839819, "learning_rate": 8.086161626835873e-06, "loss": 0.4248, "step": 1922 }, { "epoch": 0.31, "grad_norm": 7.83879659651721, "learning_rate": 8.084107920923336e-06, "loss": 0.4431, "step": 1923 }, { "epoch": 0.31, "grad_norm": 6.975574715341837, "learning_rate": 8.08205337481479e-06, "loss": 0.4992, "step": 1924 }, { "epoch": 0.31, "grad_norm": 8.29652862993854, "learning_rate": 8.079997989069945e-06, "loss": 0.535, "step": 1925 }, { "epoch": 0.31, "grad_norm": 9.782181581164688, "learning_rate": 8.077941764248746e-06, "loss": 0.4299, "step": 1926 }, { "epoch": 0.31, "grad_norm": 10.712994439521564, "learning_rate": 8.075884700911368e-06, "loss": 0.436, "step": 1927 }, { "epoch": 0.31, "grad_norm": 70.16961276790295, "learning_rate": 8.07382679961821e-06, "loss": 0.4369, "step": 1928 }, { "epoch": 0.31, "grad_norm": 7.9019188456405205, "learning_rate": 8.071768060929903e-06, "loss": 0.4507, "step": 1929 }, { "epoch": 0.31, "grad_norm": 5.738944432790222, "learning_rate": 8.0697084854073e-06, "loss": 0.428, "step": 1930 }, { "epoch": 0.31, "grad_norm": 6.755314278216263, "learning_rate": 8.06764807361149e-06, "loss": 0.4552, "step": 1931 }, { "epoch": 0.31, "grad_norm": 7.552347306803121, "learning_rate": 8.065586826103782e-06, "loss": 0.4201, "step": 1932 }, { "epoch": 0.31, "grad_norm": 12.204452645409837, "learning_rate": 8.063524743445721e-06, "loss": 0.4231, "step": 1933 }, { "epoch": 0.31, "grad_norm": 7.463657697420989, "learning_rate": 8.061461826199075e-06, "loss": 0.4653, "step": 1934 }, { "epoch": 0.31, "grad_norm": 15.841048325652578, "learning_rate": 8.059398074925835e-06, "loss": 0.4855, "step": 1935 }, { "epoch": 0.31, "grad_norm": 6.851660165817965, "learning_rate": 8.057333490188228e-06, "loss": 0.4298, "step": 1936 }, { "epoch": 0.31, "grad_norm": 6.213266969864146, "learning_rate": 8.055268072548704e-06, "loss": 0.4506, "step": 1937 }, { "epoch": 0.31, "grad_norm": 9.568082338514536, "learning_rate": 8.053201822569933e-06, "loss": 0.4869, "step": 1938 }, { "epoch": 0.31, "grad_norm": 8.310918595384004, "learning_rate": 8.051134740814827e-06, "loss": 0.4371, "step": 1939 }, { "epoch": 0.31, "grad_norm": 7.870477648185713, "learning_rate": 8.049066827846513e-06, "loss": 0.419, "step": 1940 }, { "epoch": 0.31, "grad_norm": 12.027804923363858, "learning_rate": 8.046998084228347e-06, "loss": 0.4988, "step": 1941 }, { "epoch": 0.31, "grad_norm": 6.648922814481721, "learning_rate": 8.044928510523911e-06, "loss": 0.444, "step": 1942 }, { "epoch": 0.31, "grad_norm": 5.50971594506674, "learning_rate": 8.042858107297015e-06, "loss": 0.4588, "step": 1943 }, { "epoch": 0.31, "grad_norm": 12.976089748059135, "learning_rate": 8.040786875111694e-06, "loss": 0.4834, "step": 1944 }, { "epoch": 0.31, "grad_norm": 5.845809531793806, "learning_rate": 8.03871481453221e-06, "loss": 0.4694, "step": 1945 }, { "epoch": 0.31, "grad_norm": 8.41022219356413, "learning_rate": 8.036641926123043e-06, "loss": 0.502, "step": 1946 }, { "epoch": 0.31, "grad_norm": 6.180883378926618, "learning_rate": 8.034568210448914e-06, "loss": 0.4382, "step": 1947 }, { "epoch": 0.31, "grad_norm": 10.401262917198043, "learning_rate": 8.032493668074756e-06, "loss": 0.5562, "step": 1948 }, { "epoch": 0.31, "grad_norm": 9.167436909072268, "learning_rate": 8.03041829956573e-06, "loss": 0.5043, "step": 1949 }, { "epoch": 0.31, "grad_norm": 20.834765823767345, "learning_rate": 8.028342105487226e-06, "loss": 0.5405, "step": 1950 }, { "epoch": 0.31, "grad_norm": 5.377730462124356, "learning_rate": 8.026265086404856e-06, "loss": 0.4395, "step": 1951 }, { "epoch": 0.31, "grad_norm": 7.364589313361443, "learning_rate": 8.024187242884456e-06, "loss": 0.4614, "step": 1952 }, { "epoch": 0.31, "grad_norm": 10.056923211944085, "learning_rate": 8.02210857549209e-06, "loss": 0.427, "step": 1953 }, { "epoch": 0.31, "grad_norm": 13.082564912931845, "learning_rate": 8.020029084794043e-06, "loss": 0.5227, "step": 1954 }, { "epoch": 0.31, "grad_norm": 13.815038797792258, "learning_rate": 8.017948771356824e-06, "loss": 0.4767, "step": 1955 }, { "epoch": 0.32, "grad_norm": 8.602087374786478, "learning_rate": 8.01586763574717e-06, "loss": 0.4167, "step": 1956 }, { "epoch": 0.32, "grad_norm": 11.522124543890541, "learning_rate": 8.01378567853204e-06, "loss": 0.4911, "step": 1957 }, { "epoch": 0.32, "grad_norm": 4.672087614491452, "learning_rate": 8.011702900278614e-06, "loss": 0.4614, "step": 1958 }, { "epoch": 0.32, "grad_norm": 7.544293939827445, "learning_rate": 8.009619301554303e-06, "loss": 0.3818, "step": 1959 }, { "epoch": 0.32, "grad_norm": 7.22851174037217, "learning_rate": 8.007534882926731e-06, "loss": 0.3736, "step": 1960 }, { "epoch": 0.32, "grad_norm": 9.551389094659921, "learning_rate": 8.005449644963756e-06, "loss": 0.4239, "step": 1961 }, { "epoch": 0.32, "grad_norm": 8.327251297403608, "learning_rate": 8.00336358823345e-06, "loss": 0.4564, "step": 1962 }, { "epoch": 0.32, "grad_norm": 7.488387428536163, "learning_rate": 8.001276713304116e-06, "loss": 0.4512, "step": 1963 }, { "epoch": 0.32, "grad_norm": 7.095152155382439, "learning_rate": 7.999189020744273e-06, "loss": 0.4354, "step": 1964 }, { "epoch": 0.32, "grad_norm": 6.848915129494917, "learning_rate": 7.997100511122669e-06, "loss": 0.4663, "step": 1965 }, { "epoch": 0.32, "grad_norm": 5.688257044754639, "learning_rate": 7.99501118500827e-06, "loss": 0.4697, "step": 1966 }, { "epoch": 0.32, "grad_norm": 5.9624149026775415, "learning_rate": 7.992921042970264e-06, "loss": 0.4553, "step": 1967 }, { "epoch": 0.32, "grad_norm": 5.079864951095308, "learning_rate": 7.990830085578068e-06, "loss": 0.501, "step": 1968 }, { "epoch": 0.32, "grad_norm": 10.962010976951062, "learning_rate": 7.988738313401312e-06, "loss": 0.4525, "step": 1969 }, { "epoch": 0.32, "grad_norm": 17.514958168458968, "learning_rate": 7.986645727009856e-06, "loss": 0.4788, "step": 1970 }, { "epoch": 0.32, "grad_norm": 7.327362671635237, "learning_rate": 7.984552326973776e-06, "loss": 0.4974, "step": 1971 }, { "epoch": 0.32, "grad_norm": 6.209274732033065, "learning_rate": 7.982458113863373e-06, "loss": 0.4522, "step": 1972 }, { "epoch": 0.32, "grad_norm": 4.846222876316418, "learning_rate": 7.980363088249167e-06, "loss": 0.4566, "step": 1973 }, { "epoch": 0.32, "grad_norm": 6.629175978444325, "learning_rate": 7.978267250701904e-06, "loss": 0.4105, "step": 1974 }, { "epoch": 0.32, "grad_norm": 6.878768115665074, "learning_rate": 7.976170601792543e-06, "loss": 0.5323, "step": 1975 }, { "epoch": 0.32, "grad_norm": 8.855274719217585, "learning_rate": 7.97407314209227e-06, "loss": 0.4617, "step": 1976 }, { "epoch": 0.32, "grad_norm": 6.564497052781214, "learning_rate": 7.971974872172497e-06, "loss": 0.3646, "step": 1977 }, { "epoch": 0.32, "grad_norm": 11.833836147658264, "learning_rate": 7.969875792604842e-06, "loss": 0.4901, "step": 1978 }, { "epoch": 0.32, "grad_norm": 6.56754924312359, "learning_rate": 7.967775903961158e-06, "loss": 0.4463, "step": 1979 }, { "epoch": 0.32, "grad_norm": 8.47823879417704, "learning_rate": 7.96567520681351e-06, "loss": 0.4937, "step": 1980 }, { "epoch": 0.32, "grad_norm": 8.731319187584853, "learning_rate": 7.963573701734185e-06, "loss": 0.4191, "step": 1981 }, { "epoch": 0.32, "grad_norm": 13.508514477577917, "learning_rate": 7.961471389295694e-06, "loss": 0.4543, "step": 1982 }, { "epoch": 0.32, "grad_norm": 9.257723933190785, "learning_rate": 7.959368270070763e-06, "loss": 0.4433, "step": 1983 }, { "epoch": 0.32, "grad_norm": 6.145727241741314, "learning_rate": 7.957264344632338e-06, "loss": 0.4946, "step": 1984 }, { "epoch": 0.32, "grad_norm": 7.868389453337813, "learning_rate": 7.95515961355359e-06, "loss": 0.5473, "step": 1985 }, { "epoch": 0.32, "grad_norm": 13.454982620634105, "learning_rate": 7.953054077407903e-06, "loss": 0.4962, "step": 1986 }, { "epoch": 0.32, "grad_norm": 7.891587913483195, "learning_rate": 7.950947736768884e-06, "loss": 0.4443, "step": 1987 }, { "epoch": 0.32, "grad_norm": 38.44425384819595, "learning_rate": 7.948840592210358e-06, "loss": 0.5078, "step": 1988 }, { "epoch": 0.32, "grad_norm": 12.573910690161226, "learning_rate": 7.94673264430637e-06, "loss": 0.4534, "step": 1989 }, { "epoch": 0.32, "grad_norm": 35.55337855559341, "learning_rate": 7.94462389363118e-06, "loss": 0.4442, "step": 1990 }, { "epoch": 0.32, "grad_norm": 10.219659486498314, "learning_rate": 7.942514340759275e-06, "loss": 0.5177, "step": 1991 }, { "epoch": 0.32, "grad_norm": 1.255376693527574, "learning_rate": 7.940403986265353e-06, "loss": 0.4671, "step": 1992 }, { "epoch": 0.32, "grad_norm": 7.222703127820901, "learning_rate": 7.93829283072433e-06, "loss": 0.4613, "step": 1993 }, { "epoch": 0.32, "grad_norm": 9.09425685687342, "learning_rate": 7.936180874711347e-06, "loss": 0.3954, "step": 1994 }, { "epoch": 0.32, "grad_norm": 54.9231081497017, "learning_rate": 7.934068118801758e-06, "loss": 0.432, "step": 1995 }, { "epoch": 0.32, "grad_norm": 9.428750671611828, "learning_rate": 7.931954563571134e-06, "loss": 0.4382, "step": 1996 }, { "epoch": 0.32, "grad_norm": 5.2183006470045346, "learning_rate": 7.92984020959527e-06, "loss": 0.3958, "step": 1997 }, { "epoch": 0.32, "grad_norm": 8.335164577925307, "learning_rate": 7.92772505745017e-06, "loss": 0.4372, "step": 1998 }, { "epoch": 0.32, "grad_norm": 24.078572772560086, "learning_rate": 7.925609107712057e-06, "loss": 0.4183, "step": 1999 }, { "epoch": 0.32, "grad_norm": 6.2906227673759405, "learning_rate": 7.923492360957383e-06, "loss": 0.4798, "step": 2000 }, { "epoch": 0.32, "grad_norm": 7.142958731667773, "learning_rate": 7.921374817762801e-06, "loss": 0.5039, "step": 2001 }, { "epoch": 0.32, "grad_norm": 19.0528308052335, "learning_rate": 7.919256478705192e-06, "loss": 0.5041, "step": 2002 }, { "epoch": 0.32, "grad_norm": 8.837959980382905, "learning_rate": 7.917137344361647e-06, "loss": 0.3984, "step": 2003 }, { "epoch": 0.32, "grad_norm": 7.670517563732714, "learning_rate": 7.91501741530948e-06, "loss": 0.5087, "step": 2004 }, { "epoch": 0.32, "grad_norm": 10.43309152337332, "learning_rate": 7.912896692126216e-06, "loss": 0.476, "step": 2005 }, { "epoch": 0.32, "grad_norm": 7.4988024680866046, "learning_rate": 7.910775175389595e-06, "loss": 0.4517, "step": 2006 }, { "epoch": 0.32, "grad_norm": 8.35127470440275, "learning_rate": 7.908652865677584e-06, "loss": 0.472, "step": 2007 }, { "epoch": 0.32, "grad_norm": 12.317547405148845, "learning_rate": 7.906529763568354e-06, "loss": 0.4157, "step": 2008 }, { "epoch": 0.32, "grad_norm": 8.015749196698476, "learning_rate": 7.904405869640296e-06, "loss": 0.4391, "step": 2009 }, { "epoch": 0.32, "grad_norm": 9.343033581989047, "learning_rate": 7.902281184472021e-06, "loss": 0.4766, "step": 2010 }, { "epoch": 0.32, "grad_norm": 7.933565716348321, "learning_rate": 7.900155708642347e-06, "loss": 0.465, "step": 2011 }, { "epoch": 0.32, "grad_norm": 11.706468200632765, "learning_rate": 7.898029442730316e-06, "loss": 0.4902, "step": 2012 }, { "epoch": 0.32, "grad_norm": 10.987783285993412, "learning_rate": 7.89590238731518e-06, "loss": 0.4418, "step": 2013 }, { "epoch": 0.32, "grad_norm": 7.162176854245928, "learning_rate": 7.893774542976408e-06, "loss": 0.4849, "step": 2014 }, { "epoch": 0.32, "grad_norm": 9.223371427753035, "learning_rate": 7.891645910293683e-06, "loss": 0.5018, "step": 2015 }, { "epoch": 0.32, "grad_norm": 7.09631578016545, "learning_rate": 7.889516489846904e-06, "loss": 0.4901, "step": 2016 }, { "epoch": 0.32, "grad_norm": 8.316351170743074, "learning_rate": 7.887386282216182e-06, "loss": 0.4034, "step": 2017 }, { "epoch": 0.33, "grad_norm": 16.097680569091715, "learning_rate": 7.885255287981845e-06, "loss": 0.3753, "step": 2018 }, { "epoch": 0.33, "grad_norm": 5.612760669605551, "learning_rate": 7.883123507724437e-06, "loss": 0.5049, "step": 2019 }, { "epoch": 0.33, "grad_norm": 1.5947704432095369, "learning_rate": 7.88099094202471e-06, "loss": 0.5394, "step": 2020 }, { "epoch": 0.33, "grad_norm": 7.808357102121382, "learning_rate": 7.878857591463636e-06, "loss": 0.5089, "step": 2021 }, { "epoch": 0.33, "grad_norm": 8.28224412938231, "learning_rate": 7.876723456622399e-06, "loss": 0.4979, "step": 2022 }, { "epoch": 0.33, "grad_norm": 31.551334357942554, "learning_rate": 7.874588538082394e-06, "loss": 0.455, "step": 2023 }, { "epoch": 0.33, "grad_norm": 6.498760988145656, "learning_rate": 7.872452836425233e-06, "loss": 0.4674, "step": 2024 }, { "epoch": 0.33, "grad_norm": 7.094283612058878, "learning_rate": 7.870316352232739e-06, "loss": 0.4831, "step": 2025 }, { "epoch": 0.33, "grad_norm": 16.183512331735344, "learning_rate": 7.868179086086951e-06, "loss": 0.5141, "step": 2026 }, { "epoch": 0.33, "grad_norm": 9.35041804261931, "learning_rate": 7.866041038570117e-06, "loss": 0.4515, "step": 2027 }, { "epoch": 0.33, "grad_norm": 1.301796465197682, "learning_rate": 7.863902210264702e-06, "loss": 0.4529, "step": 2028 }, { "epoch": 0.33, "grad_norm": 8.120195296631943, "learning_rate": 7.861762601753379e-06, "loss": 0.4221, "step": 2029 }, { "epoch": 0.33, "grad_norm": 9.395519338347155, "learning_rate": 7.859622213619043e-06, "loss": 0.4484, "step": 2030 }, { "epoch": 0.33, "grad_norm": 9.729944583169548, "learning_rate": 7.857481046444785e-06, "loss": 0.4282, "step": 2031 }, { "epoch": 0.33, "grad_norm": 9.234628197285998, "learning_rate": 7.855339100813925e-06, "loss": 0.4497, "step": 2032 }, { "epoch": 0.33, "grad_norm": 7.165959591914228, "learning_rate": 7.853196377309986e-06, "loss": 0.4526, "step": 2033 }, { "epoch": 0.33, "grad_norm": 7.66118104021425, "learning_rate": 7.851052876516708e-06, "loss": 0.3996, "step": 2034 }, { "epoch": 0.33, "grad_norm": 11.064428195705318, "learning_rate": 7.848908599018033e-06, "loss": 0.4387, "step": 2035 }, { "epoch": 0.33, "grad_norm": 21.674799254722483, "learning_rate": 7.846763545398124e-06, "loss": 0.4562, "step": 2036 }, { "epoch": 0.33, "grad_norm": 7.665159723126349, "learning_rate": 7.844617716241358e-06, "loss": 0.4895, "step": 2037 }, { "epoch": 0.33, "grad_norm": 7.828176669011881, "learning_rate": 7.842471112132311e-06, "loss": 0.4237, "step": 2038 }, { "epoch": 0.33, "grad_norm": 8.771015524828297, "learning_rate": 7.84032373365578e-06, "loss": 0.4572, "step": 2039 }, { "epoch": 0.33, "grad_norm": 7.470157843075591, "learning_rate": 7.83817558139677e-06, "loss": 0.4257, "step": 2040 }, { "epoch": 0.33, "grad_norm": 6.816088864620952, "learning_rate": 7.836026655940497e-06, "loss": 0.4593, "step": 2041 }, { "epoch": 0.33, "grad_norm": 7.926826724743756, "learning_rate": 7.833876957872388e-06, "loss": 0.4447, "step": 2042 }, { "epoch": 0.33, "grad_norm": 7.733389651568566, "learning_rate": 7.83172648777808e-06, "loss": 0.3852, "step": 2043 }, { "epoch": 0.33, "grad_norm": 7.178703399655718, "learning_rate": 7.829575246243417e-06, "loss": 0.4546, "step": 2044 }, { "epoch": 0.33, "grad_norm": 12.673246330340408, "learning_rate": 7.82742323385446e-06, "loss": 0.3604, "step": 2045 }, { "epoch": 0.33, "grad_norm": 6.202751873354109, "learning_rate": 7.825270451197477e-06, "loss": 0.5041, "step": 2046 }, { "epoch": 0.33, "grad_norm": 6.536499853476276, "learning_rate": 7.823116898858945e-06, "loss": 0.4271, "step": 2047 }, { "epoch": 0.33, "grad_norm": 6.7787794304338105, "learning_rate": 7.820962577425548e-06, "loss": 0.3711, "step": 2048 }, { "epoch": 0.33, "grad_norm": 10.228048575577825, "learning_rate": 7.818807487484186e-06, "loss": 0.5705, "step": 2049 }, { "epoch": 0.33, "grad_norm": 8.546504991684403, "learning_rate": 7.816651629621963e-06, "loss": 0.4827, "step": 2050 }, { "epoch": 0.33, "grad_norm": 5.813017879852273, "learning_rate": 7.814495004426195e-06, "loss": 0.4707, "step": 2051 }, { "epoch": 0.33, "grad_norm": 15.28523074975952, "learning_rate": 7.812337612484404e-06, "loss": 0.4166, "step": 2052 }, { "epoch": 0.33, "grad_norm": 6.386043289355734, "learning_rate": 7.810179454384326e-06, "loss": 0.4567, "step": 2053 }, { "epoch": 0.33, "grad_norm": 7.4206538396143165, "learning_rate": 7.808020530713902e-06, "loss": 0.4662, "step": 2054 }, { "epoch": 0.33, "grad_norm": 4.491156846605655, "learning_rate": 7.805860842061282e-06, "loss": 0.4322, "step": 2055 }, { "epoch": 0.33, "grad_norm": 8.654295994570372, "learning_rate": 7.803700389014824e-06, "loss": 0.4602, "step": 2056 }, { "epoch": 0.33, "grad_norm": 8.005011798801162, "learning_rate": 7.801539172163097e-06, "loss": 0.5186, "step": 2057 }, { "epoch": 0.33, "grad_norm": 8.305515339484451, "learning_rate": 7.799377192094872e-06, "loss": 0.4235, "step": 2058 }, { "epoch": 0.33, "grad_norm": 12.36815159657513, "learning_rate": 7.797214449399136e-06, "loss": 0.5129, "step": 2059 }, { "epoch": 0.33, "grad_norm": 9.659070385983556, "learning_rate": 7.79505094466508e-06, "loss": 0.4414, "step": 2060 }, { "epoch": 0.33, "grad_norm": 11.807310655809136, "learning_rate": 7.792886678482096e-06, "loss": 0.4407, "step": 2061 }, { "epoch": 0.33, "grad_norm": 6.892352247191939, "learning_rate": 7.7907216514398e-06, "loss": 0.4644, "step": 2062 }, { "epoch": 0.33, "grad_norm": 4.633186861359742, "learning_rate": 7.788555864127995e-06, "loss": 0.4407, "step": 2063 }, { "epoch": 0.33, "grad_norm": 14.73566911462073, "learning_rate": 7.786389317136708e-06, "loss": 0.4191, "step": 2064 }, { "epoch": 0.33, "grad_norm": 7.0006853378797445, "learning_rate": 7.784222011056164e-06, "loss": 0.4985, "step": 2065 }, { "epoch": 0.33, "grad_norm": 5.547553933894957, "learning_rate": 7.782053946476795e-06, "loss": 0.4996, "step": 2066 }, { "epoch": 0.33, "grad_norm": 6.568642721501863, "learning_rate": 7.779885123989244e-06, "loss": 0.4591, "step": 2067 }, { "epoch": 0.33, "grad_norm": 8.148444163935931, "learning_rate": 7.777715544184358e-06, "loss": 0.4206, "step": 2068 }, { "epoch": 0.33, "grad_norm": 8.522686804883667, "learning_rate": 7.775545207653188e-06, "loss": 0.4912, "step": 2069 }, { "epoch": 0.33, "grad_norm": 9.12409953213317, "learning_rate": 7.773374114986996e-06, "loss": 0.4576, "step": 2070 }, { "epoch": 0.33, "grad_norm": 7.5868985531589, "learning_rate": 7.771202266777247e-06, "loss": 0.4994, "step": 2071 }, { "epoch": 0.33, "grad_norm": 6.250759787321442, "learning_rate": 7.76902966361561e-06, "loss": 0.4187, "step": 2072 }, { "epoch": 0.33, "grad_norm": 11.71473578028989, "learning_rate": 7.766856306093966e-06, "loss": 0.5041, "step": 2073 }, { "epoch": 0.33, "grad_norm": 8.351637843124184, "learning_rate": 7.764682194804394e-06, "loss": 0.4178, "step": 2074 }, { "epoch": 0.33, "grad_norm": 13.798845304222317, "learning_rate": 7.762507330339185e-06, "loss": 0.4457, "step": 2075 }, { "epoch": 0.33, "grad_norm": 5.495819809272955, "learning_rate": 7.76033171329083e-06, "loss": 0.4479, "step": 2076 }, { "epoch": 0.33, "grad_norm": 12.345739655749375, "learning_rate": 7.758155344252025e-06, "loss": 0.4024, "step": 2077 }, { "epoch": 0.33, "grad_norm": 16.992975632417618, "learning_rate": 7.755978223815678e-06, "loss": 0.4583, "step": 2078 }, { "epoch": 0.33, "grad_norm": 22.440057369445576, "learning_rate": 7.753800352574891e-06, "loss": 0.4172, "step": 2079 }, { "epoch": 0.34, "grad_norm": 7.775185137519819, "learning_rate": 7.751621731122981e-06, "loss": 0.4132, "step": 2080 }, { "epoch": 0.34, "grad_norm": 10.170048435338325, "learning_rate": 7.749442360053462e-06, "loss": 0.4111, "step": 2081 }, { "epoch": 0.34, "grad_norm": 8.793958641544817, "learning_rate": 7.747262239960055e-06, "loss": 0.5114, "step": 2082 }, { "epoch": 0.34, "grad_norm": 7.693722456524318, "learning_rate": 7.745081371436686e-06, "loss": 0.4481, "step": 2083 }, { "epoch": 0.34, "grad_norm": 8.121932971643975, "learning_rate": 7.742899755077482e-06, "loss": 0.4307, "step": 2084 }, { "epoch": 0.34, "grad_norm": 7.818245063688625, "learning_rate": 7.740717391476778e-06, "loss": 0.3956, "step": 2085 }, { "epoch": 0.34, "grad_norm": 1.1355429817902725, "learning_rate": 7.738534281229106e-06, "loss": 0.4366, "step": 2086 }, { "epoch": 0.34, "grad_norm": 39.04255247917219, "learning_rate": 7.736350424929209e-06, "loss": 0.4057, "step": 2087 }, { "epoch": 0.34, "grad_norm": 17.076772033874832, "learning_rate": 7.734165823172028e-06, "loss": 0.5252, "step": 2088 }, { "epoch": 0.34, "grad_norm": 6.699921254458682, "learning_rate": 7.731980476552708e-06, "loss": 0.4246, "step": 2089 }, { "epoch": 0.34, "grad_norm": 7.962129819741002, "learning_rate": 7.7297943856666e-06, "loss": 0.4588, "step": 2090 }, { "epoch": 0.34, "grad_norm": 1.1635822861437144, "learning_rate": 7.727607551109252e-06, "loss": 0.4531, "step": 2091 }, { "epoch": 0.34, "grad_norm": 12.569732998456995, "learning_rate": 7.725419973476422e-06, "loss": 0.5591, "step": 2092 }, { "epoch": 0.34, "grad_norm": 5.502310798658302, "learning_rate": 7.723231653364065e-06, "loss": 0.4124, "step": 2093 }, { "epoch": 0.34, "grad_norm": 9.27403344407039, "learning_rate": 7.72104259136834e-06, "loss": 0.462, "step": 2094 }, { "epoch": 0.34, "grad_norm": 19.366640599819426, "learning_rate": 7.718852788085604e-06, "loss": 0.5064, "step": 2095 }, { "epoch": 0.34, "grad_norm": 8.891207342638202, "learning_rate": 7.716662244112425e-06, "loss": 0.4654, "step": 2096 }, { "epoch": 0.34, "grad_norm": 8.626515686155651, "learning_rate": 7.714470960045565e-06, "loss": 0.4983, "step": 2097 }, { "epoch": 0.34, "grad_norm": 1.2476040495688359, "learning_rate": 7.712278936481992e-06, "loss": 0.4259, "step": 2098 }, { "epoch": 0.34, "grad_norm": 10.962365975774091, "learning_rate": 7.710086174018871e-06, "loss": 0.4554, "step": 2099 }, { "epoch": 0.34, "grad_norm": 9.099182736273677, "learning_rate": 7.707892673253572e-06, "loss": 0.4748, "step": 2100 }, { "epoch": 0.34, "grad_norm": 7.8360204500473705, "learning_rate": 7.705698434783666e-06, "loss": 0.4618, "step": 2101 }, { "epoch": 0.34, "grad_norm": 5.4648276335413675, "learning_rate": 7.703503459206922e-06, "loss": 0.5301, "step": 2102 }, { "epoch": 0.34, "grad_norm": 21.087349952832422, "learning_rate": 7.701307747121314e-06, "loss": 0.4925, "step": 2103 }, { "epoch": 0.34, "grad_norm": 12.295310339288026, "learning_rate": 7.699111299125015e-06, "loss": 0.4571, "step": 2104 }, { "epoch": 0.34, "grad_norm": 40.4547538075996, "learning_rate": 7.696914115816395e-06, "loss": 0.5021, "step": 2105 }, { "epoch": 0.34, "grad_norm": 11.517189106915135, "learning_rate": 7.69471619779403e-06, "loss": 0.4373, "step": 2106 }, { "epoch": 0.34, "grad_norm": 24.37179142332863, "learning_rate": 7.692517545656691e-06, "loss": 0.4854, "step": 2107 }, { "epoch": 0.34, "grad_norm": 8.491746271381572, "learning_rate": 7.690318160003356e-06, "loss": 0.4795, "step": 2108 }, { "epoch": 0.34, "grad_norm": 15.018576914890293, "learning_rate": 7.688118041433192e-06, "loss": 0.4134, "step": 2109 }, { "epoch": 0.34, "grad_norm": 12.373447919772515, "learning_rate": 7.685917190545576e-06, "loss": 0.3929, "step": 2110 }, { "epoch": 0.34, "grad_norm": 7.635106381013662, "learning_rate": 7.683715607940078e-06, "loss": 0.4987, "step": 2111 }, { "epoch": 0.34, "grad_norm": 7.420356528520459, "learning_rate": 7.681513294216476e-06, "loss": 0.4002, "step": 2112 }, { "epoch": 0.34, "grad_norm": 12.314805447930347, "learning_rate": 7.679310249974732e-06, "loss": 0.4496, "step": 2113 }, { "epoch": 0.34, "grad_norm": 8.152977500534956, "learning_rate": 7.677106475815021e-06, "loss": 0.4743, "step": 2114 }, { "epoch": 0.34, "grad_norm": 10.265360543514397, "learning_rate": 7.674901972337712e-06, "loss": 0.479, "step": 2115 }, { "epoch": 0.34, "grad_norm": 8.892626919665304, "learning_rate": 7.672696740143372e-06, "loss": 0.486, "step": 2116 }, { "epoch": 0.34, "grad_norm": 16.447314171336238, "learning_rate": 7.670490779832767e-06, "loss": 0.4447, "step": 2117 }, { "epoch": 0.34, "grad_norm": 9.010660934520317, "learning_rate": 7.668284092006859e-06, "loss": 0.4467, "step": 2118 }, { "epoch": 0.34, "grad_norm": 6.582599163752811, "learning_rate": 7.666076677266813e-06, "loss": 0.4534, "step": 2119 }, { "epoch": 0.34, "grad_norm": 7.439480988458671, "learning_rate": 7.66386853621399e-06, "loss": 0.468, "step": 2120 }, { "epoch": 0.34, "grad_norm": 9.709460865730035, "learning_rate": 7.661659669449948e-06, "loss": 0.5387, "step": 2121 }, { "epoch": 0.34, "grad_norm": 6.938826734532013, "learning_rate": 7.659450077576444e-06, "loss": 0.4394, "step": 2122 }, { "epoch": 0.34, "grad_norm": 8.444920566375533, "learning_rate": 7.657239761195428e-06, "loss": 0.4769, "step": 2123 }, { "epoch": 0.34, "grad_norm": 18.326600827972936, "learning_rate": 7.655028720909057e-06, "loss": 0.4388, "step": 2124 }, { "epoch": 0.34, "grad_norm": 13.71086697772542, "learning_rate": 7.652816957319674e-06, "loss": 0.4927, "step": 2125 }, { "epoch": 0.34, "grad_norm": 10.03794445672883, "learning_rate": 7.650604471029825e-06, "loss": 0.4382, "step": 2126 }, { "epoch": 0.34, "grad_norm": 9.903624924246099, "learning_rate": 7.648391262642257e-06, "loss": 0.4745, "step": 2127 }, { "epoch": 0.34, "grad_norm": 5.909242764401975, "learning_rate": 7.646177332759906e-06, "loss": 0.4062, "step": 2128 }, { "epoch": 0.34, "grad_norm": 8.002591919166264, "learning_rate": 7.643962681985904e-06, "loss": 0.4751, "step": 2129 }, { "epoch": 0.34, "grad_norm": 38.99043966300326, "learning_rate": 7.641747310923588e-06, "loss": 0.4337, "step": 2130 }, { "epoch": 0.34, "grad_norm": 53.35011615674261, "learning_rate": 7.639531220176484e-06, "loss": 0.484, "step": 2131 }, { "epoch": 0.34, "grad_norm": 12.135960179233045, "learning_rate": 7.637314410348315e-06, "loss": 0.4266, "step": 2132 }, { "epoch": 0.34, "grad_norm": 18.839165241310706, "learning_rate": 7.635096882043006e-06, "loss": 0.482, "step": 2133 }, { "epoch": 0.34, "grad_norm": 11.104523626688536, "learning_rate": 7.632878635864666e-06, "loss": 0.4261, "step": 2134 }, { "epoch": 0.34, "grad_norm": 16.020175023305185, "learning_rate": 7.630659672417613e-06, "loss": 0.4656, "step": 2135 }, { "epoch": 0.34, "grad_norm": 10.038651041019715, "learning_rate": 7.628439992306349e-06, "loss": 0.4451, "step": 2136 }, { "epoch": 0.34, "grad_norm": 14.149303386908928, "learning_rate": 7.626219596135578e-06, "loss": 0.3957, "step": 2137 }, { "epoch": 0.34, "grad_norm": 13.526555384856678, "learning_rate": 7.623998484510197e-06, "loss": 0.4776, "step": 2138 }, { "epoch": 0.34, "grad_norm": 9.55388190204747, "learning_rate": 7.621776658035298e-06, "loss": 0.4811, "step": 2139 }, { "epoch": 0.34, "grad_norm": 9.65707369913588, "learning_rate": 7.619554117316165e-06, "loss": 0.5148, "step": 2140 }, { "epoch": 0.34, "grad_norm": 13.143875365853082, "learning_rate": 7.617330862958287e-06, "loss": 0.4574, "step": 2141 }, { "epoch": 0.35, "grad_norm": 103.27202474706992, "learning_rate": 7.615106895567331e-06, "loss": 0.4774, "step": 2142 }, { "epoch": 0.35, "grad_norm": 32.31772357178092, "learning_rate": 7.612882215749172e-06, "loss": 0.444, "step": 2143 }, { "epoch": 0.35, "grad_norm": 33.44351750884095, "learning_rate": 7.6106568241098745e-06, "loss": 0.4923, "step": 2144 }, { "epoch": 0.35, "grad_norm": 22.014947822189317, "learning_rate": 7.608430721255691e-06, "loss": 0.435, "step": 2145 }, { "epoch": 0.35, "grad_norm": 12.548360259447163, "learning_rate": 7.606203907793081e-06, "loss": 0.4872, "step": 2146 }, { "epoch": 0.35, "grad_norm": 16.240883693894045, "learning_rate": 7.603976384328684e-06, "loss": 0.4828, "step": 2147 }, { "epoch": 0.35, "grad_norm": 1.36210250058226, "learning_rate": 7.601748151469341e-06, "loss": 0.4951, "step": 2148 }, { "epoch": 0.35, "grad_norm": 11.30723751481526, "learning_rate": 7.599519209822085e-06, "loss": 0.4352, "step": 2149 }, { "epoch": 0.35, "grad_norm": 12.951984227901336, "learning_rate": 7.59728955999414e-06, "loss": 0.4375, "step": 2150 }, { "epoch": 0.35, "grad_norm": 27.459722761193934, "learning_rate": 7.595059202592923e-06, "loss": 0.4875, "step": 2151 }, { "epoch": 0.35, "grad_norm": 7.207672555675573, "learning_rate": 7.5928281382260474e-06, "loss": 0.4358, "step": 2152 }, { "epoch": 0.35, "grad_norm": 22.450925334354615, "learning_rate": 7.590596367501314e-06, "loss": 0.4765, "step": 2153 }, { "epoch": 0.35, "grad_norm": 13.963648249263192, "learning_rate": 7.58836389102672e-06, "loss": 0.4238, "step": 2154 }, { "epoch": 0.35, "grad_norm": 10.411454674732726, "learning_rate": 7.586130709410454e-06, "loss": 0.5141, "step": 2155 }, { "epoch": 0.35, "grad_norm": 13.266203565270903, "learning_rate": 7.583896823260894e-06, "loss": 0.3885, "step": 2156 }, { "epoch": 0.35, "grad_norm": 12.371355393039966, "learning_rate": 7.581662233186618e-06, "loss": 0.4413, "step": 2157 }, { "epoch": 0.35, "grad_norm": 16.207917705661327, "learning_rate": 7.5794269397963814e-06, "loss": 0.4993, "step": 2158 }, { "epoch": 0.35, "grad_norm": 17.414883714143716, "learning_rate": 7.577190943699145e-06, "loss": 0.4873, "step": 2159 }, { "epoch": 0.35, "grad_norm": 14.78337604780799, "learning_rate": 7.574954245504056e-06, "loss": 0.4246, "step": 2160 }, { "epoch": 0.35, "grad_norm": 19.97413900319821, "learning_rate": 7.572716845820452e-06, "loss": 0.4673, "step": 2161 }, { "epoch": 0.35, "grad_norm": 16.428325361306584, "learning_rate": 7.57047874525786e-06, "loss": 0.4394, "step": 2162 }, { "epoch": 0.35, "grad_norm": 13.397722783432853, "learning_rate": 7.568239944426003e-06, "loss": 0.4537, "step": 2163 }, { "epoch": 0.35, "grad_norm": 7.452390771139909, "learning_rate": 7.5660004439347916e-06, "loss": 0.433, "step": 2164 }, { "epoch": 0.35, "grad_norm": 19.412005650509236, "learning_rate": 7.563760244394325e-06, "loss": 0.4888, "step": 2165 }, { "epoch": 0.35, "grad_norm": 13.566416846853913, "learning_rate": 7.5615193464149005e-06, "loss": 0.4329, "step": 2166 }, { "epoch": 0.35, "grad_norm": 8.602081209949365, "learning_rate": 7.5592777506069946e-06, "loss": 0.4011, "step": 2167 }, { "epoch": 0.35, "grad_norm": 8.390458987787747, "learning_rate": 7.557035457581284e-06, "loss": 0.5084, "step": 2168 }, { "epoch": 0.35, "grad_norm": 11.851761248183015, "learning_rate": 7.5547924679486294e-06, "loss": 0.4371, "step": 2169 }, { "epoch": 0.35, "grad_norm": 20.248081265506137, "learning_rate": 7.552548782320084e-06, "loss": 0.4674, "step": 2170 }, { "epoch": 0.35, "grad_norm": 15.864396244782142, "learning_rate": 7.550304401306887e-06, "loss": 0.4746, "step": 2171 }, { "epoch": 0.35, "grad_norm": 7.837406519165391, "learning_rate": 7.5480593255204725e-06, "loss": 0.4208, "step": 2172 }, { "epoch": 0.35, "grad_norm": 37.15316655511652, "learning_rate": 7.545813555572461e-06, "loss": 0.4565, "step": 2173 }, { "epoch": 0.35, "grad_norm": 11.960090103980175, "learning_rate": 7.54356709207466e-06, "loss": 0.4533, "step": 2174 }, { "epoch": 0.35, "grad_norm": 11.78600158204266, "learning_rate": 7.5413199356390695e-06, "loss": 0.4274, "step": 2175 }, { "epoch": 0.35, "grad_norm": 18.535708229303637, "learning_rate": 7.539072086877877e-06, "loss": 0.405, "step": 2176 }, { "epoch": 0.35, "grad_norm": 12.412936408471017, "learning_rate": 7.536823546403458e-06, "loss": 0.4504, "step": 2177 }, { "epoch": 0.35, "grad_norm": 10.598808594918669, "learning_rate": 7.534574314828376e-06, "loss": 0.452, "step": 2178 }, { "epoch": 0.35, "grad_norm": 10.089986698972194, "learning_rate": 7.532324392765387e-06, "loss": 0.4743, "step": 2179 }, { "epoch": 0.35, "grad_norm": 18.410059972717555, "learning_rate": 7.530073780827427e-06, "loss": 0.4482, "step": 2180 }, { "epoch": 0.35, "grad_norm": 11.594075646979574, "learning_rate": 7.527822479627629e-06, "loss": 0.4127, "step": 2181 }, { "epoch": 0.35, "grad_norm": 10.23008352433601, "learning_rate": 7.525570489779307e-06, "loss": 0.4261, "step": 2182 }, { "epoch": 0.35, "grad_norm": 14.269110040307998, "learning_rate": 7.523317811895965e-06, "loss": 0.4565, "step": 2183 }, { "epoch": 0.35, "grad_norm": 7.724414686476505, "learning_rate": 7.5210644465912975e-06, "loss": 0.466, "step": 2184 }, { "epoch": 0.35, "grad_norm": 6.093711851655809, "learning_rate": 7.518810394479179e-06, "loss": 0.4684, "step": 2185 }, { "epoch": 0.35, "grad_norm": 7.692682829884761, "learning_rate": 7.516555656173678e-06, "loss": 0.4543, "step": 2186 }, { "epoch": 0.35, "grad_norm": 1.5719634520222734, "learning_rate": 7.514300232289049e-06, "loss": 0.523, "step": 2187 }, { "epoch": 0.35, "grad_norm": 12.966202937254597, "learning_rate": 7.512044123439728e-06, "loss": 0.4497, "step": 2188 }, { "epoch": 0.35, "grad_norm": 10.014092401338424, "learning_rate": 7.509787330240342e-06, "loss": 0.4055, "step": 2189 }, { "epoch": 0.35, "grad_norm": 17.747031475174325, "learning_rate": 7.507529853305706e-06, "loss": 0.389, "step": 2190 }, { "epoch": 0.35, "grad_norm": 9.510371366013034, "learning_rate": 7.505271693250817e-06, "loss": 0.4492, "step": 2191 }, { "epoch": 0.35, "grad_norm": 7.601907829154497, "learning_rate": 7.50301285069086e-06, "loss": 0.5155, "step": 2192 }, { "epoch": 0.35, "grad_norm": 10.2548322800129, "learning_rate": 7.500753326241208e-06, "loss": 0.4372, "step": 2193 }, { "epoch": 0.35, "grad_norm": 17.409018910625846, "learning_rate": 7.498493120517415e-06, "loss": 0.4328, "step": 2194 }, { "epoch": 0.35, "grad_norm": 10.098137247245152, "learning_rate": 7.4962322341352256e-06, "loss": 0.4674, "step": 2195 }, { "epoch": 0.35, "grad_norm": 12.858562459473326, "learning_rate": 7.493970667710566e-06, "loss": 0.4751, "step": 2196 }, { "epoch": 0.35, "grad_norm": 14.568628082616911, "learning_rate": 7.491708421859549e-06, "loss": 0.4678, "step": 2197 }, { "epoch": 0.35, "grad_norm": 11.636427350868132, "learning_rate": 7.489445497198475e-06, "loss": 0.4706, "step": 2198 }, { "epoch": 0.35, "grad_norm": 12.952326217702499, "learning_rate": 7.487181894343826e-06, "loss": 0.4606, "step": 2199 }, { "epoch": 0.35, "grad_norm": 11.995260582823821, "learning_rate": 7.484917613912267e-06, "loss": 0.4686, "step": 2200 }, { "epoch": 0.35, "grad_norm": 6.623753843407551, "learning_rate": 7.482652656520655e-06, "loss": 0.5072, "step": 2201 }, { "epoch": 0.35, "grad_norm": 17.111091450134992, "learning_rate": 7.480387022786023e-06, "loss": 0.4933, "step": 2202 }, { "epoch": 0.35, "grad_norm": 10.941808314099825, "learning_rate": 7.478120713325595e-06, "loss": 0.4933, "step": 2203 }, { "epoch": 0.36, "grad_norm": 10.55069129000571, "learning_rate": 7.4758537287567745e-06, "loss": 0.4529, "step": 2204 }, { "epoch": 0.36, "grad_norm": 11.07230794015543, "learning_rate": 7.4735860696971505e-06, "loss": 0.4514, "step": 2205 }, { "epoch": 0.36, "grad_norm": 8.06690721760916, "learning_rate": 7.471317736764497e-06, "loss": 0.4681, "step": 2206 }, { "epoch": 0.36, "grad_norm": 14.06831495086204, "learning_rate": 7.46904873057677e-06, "loss": 0.458, "step": 2207 }, { "epoch": 0.36, "grad_norm": 12.915016507904822, "learning_rate": 7.466779051752107e-06, "loss": 0.4153, "step": 2208 }, { "epoch": 0.36, "grad_norm": 637.8867150776215, "learning_rate": 7.464508700908836e-06, "loss": 0.3987, "step": 2209 }, { "epoch": 0.36, "grad_norm": 9.982455642267267, "learning_rate": 7.4622376786654596e-06, "loss": 0.4262, "step": 2210 }, { "epoch": 0.36, "grad_norm": 9.845330031960573, "learning_rate": 7.459965985640665e-06, "loss": 0.4387, "step": 2211 }, { "epoch": 0.36, "grad_norm": 30.448791623191244, "learning_rate": 7.457693622453329e-06, "loss": 0.4366, "step": 2212 }, { "epoch": 0.36, "grad_norm": 8.235333536107866, "learning_rate": 7.455420589722504e-06, "loss": 0.4419, "step": 2213 }, { "epoch": 0.36, "grad_norm": 1.2981043499163316, "learning_rate": 7.453146888067424e-06, "loss": 0.4386, "step": 2214 }, { "epoch": 0.36, "grad_norm": 9.072360465820033, "learning_rate": 7.450872518107511e-06, "loss": 0.4866, "step": 2215 }, { "epoch": 0.36, "grad_norm": 8.288121813588003, "learning_rate": 7.448597480462366e-06, "loss": 0.4251, "step": 2216 }, { "epoch": 0.36, "grad_norm": 12.120802218964878, "learning_rate": 7.446321775751772e-06, "loss": 0.4685, "step": 2217 }, { "epoch": 0.36, "grad_norm": 8.596918772388062, "learning_rate": 7.444045404595692e-06, "loss": 0.4356, "step": 2218 }, { "epoch": 0.36, "grad_norm": 7.5552650636579175, "learning_rate": 7.441768367614274e-06, "loss": 0.4194, "step": 2219 }, { "epoch": 0.36, "grad_norm": 10.001765814701153, "learning_rate": 7.439490665427844e-06, "loss": 0.4019, "step": 2220 }, { "epoch": 0.36, "grad_norm": 15.02221793958069, "learning_rate": 7.437212298656914e-06, "loss": 0.4609, "step": 2221 }, { "epoch": 0.36, "grad_norm": 7.439664391213184, "learning_rate": 7.434933267922168e-06, "loss": 0.4554, "step": 2222 }, { "epoch": 0.36, "grad_norm": 16.4021217996922, "learning_rate": 7.432653573844483e-06, "loss": 0.4885, "step": 2223 }, { "epoch": 0.36, "grad_norm": 8.205231614628408, "learning_rate": 7.430373217044909e-06, "loss": 0.4373, "step": 2224 }, { "epoch": 0.36, "grad_norm": 11.680814244296599, "learning_rate": 7.428092198144674e-06, "loss": 0.4013, "step": 2225 }, { "epoch": 0.36, "grad_norm": 1.1737927882822101, "learning_rate": 7.425810517765196e-06, "loss": 0.4812, "step": 2226 }, { "epoch": 0.36, "grad_norm": 38.753182680092394, "learning_rate": 7.423528176528063e-06, "loss": 0.497, "step": 2227 }, { "epoch": 0.36, "grad_norm": 13.986620768775001, "learning_rate": 7.4212451750550515e-06, "loss": 0.4887, "step": 2228 }, { "epoch": 0.36, "grad_norm": 37.92779065413966, "learning_rate": 7.418961513968111e-06, "loss": 0.5363, "step": 2229 }, { "epoch": 0.36, "grad_norm": 12.144955078335013, "learning_rate": 7.416677193889376e-06, "loss": 0.4719, "step": 2230 }, { "epoch": 0.36, "grad_norm": 12.797129593581774, "learning_rate": 7.4143922154411576e-06, "loss": 0.4709, "step": 2231 }, { "epoch": 0.36, "grad_norm": 12.313986572615347, "learning_rate": 7.412106579245945e-06, "loss": 0.4663, "step": 2232 }, { "epoch": 0.36, "grad_norm": 8.426212812312611, "learning_rate": 7.409820285926411e-06, "loss": 0.5158, "step": 2233 }, { "epoch": 0.36, "grad_norm": 15.979716443754684, "learning_rate": 7.407533336105404e-06, "loss": 0.4804, "step": 2234 }, { "epoch": 0.36, "grad_norm": 9.706177401035058, "learning_rate": 7.405245730405954e-06, "loss": 0.4257, "step": 2235 }, { "epoch": 0.36, "grad_norm": 16.37471380908726, "learning_rate": 7.402957469451263e-06, "loss": 0.4244, "step": 2236 }, { "epoch": 0.36, "grad_norm": 9.157896888151061, "learning_rate": 7.4006685538647214e-06, "loss": 0.473, "step": 2237 }, { "epoch": 0.36, "grad_norm": 11.80268924173756, "learning_rate": 7.3983789842698894e-06, "loss": 0.4003, "step": 2238 }, { "epoch": 0.36, "grad_norm": 8.955791515873411, "learning_rate": 7.3960887612905116e-06, "loss": 0.528, "step": 2239 }, { "epoch": 0.36, "grad_norm": 12.133625113152975, "learning_rate": 7.393797885550506e-06, "loss": 0.4865, "step": 2240 }, { "epoch": 0.36, "grad_norm": 8.25714468987419, "learning_rate": 7.391506357673972e-06, "loss": 0.465, "step": 2241 }, { "epoch": 0.36, "grad_norm": 20.153509807388133, "learning_rate": 7.389214178285182e-06, "loss": 0.4241, "step": 2242 }, { "epoch": 0.36, "grad_norm": 11.188337004989473, "learning_rate": 7.386921348008592e-06, "loss": 0.4042, "step": 2243 }, { "epoch": 0.36, "grad_norm": 9.77823647067846, "learning_rate": 7.384627867468831e-06, "loss": 0.4905, "step": 2244 }, { "epoch": 0.36, "grad_norm": 7.724889248888449, "learning_rate": 7.382333737290708e-06, "loss": 0.4393, "step": 2245 }, { "epoch": 0.36, "grad_norm": 17.867966553701493, "learning_rate": 7.3800389580992025e-06, "loss": 0.4639, "step": 2246 }, { "epoch": 0.36, "grad_norm": 7.434624281875605, "learning_rate": 7.37774353051948e-06, "loss": 0.4374, "step": 2247 }, { "epoch": 0.36, "grad_norm": 7.950368977540833, "learning_rate": 7.375447455176877e-06, "loss": 0.4682, "step": 2248 }, { "epoch": 0.36, "grad_norm": 11.332165281344027, "learning_rate": 7.373150732696907e-06, "loss": 0.4589, "step": 2249 }, { "epoch": 0.36, "grad_norm": 10.247659934183176, "learning_rate": 7.370853363705261e-06, "loss": 0.4261, "step": 2250 }, { "epoch": 0.36, "grad_norm": 12.276095542889951, "learning_rate": 7.3685553488278064e-06, "loss": 0.4515, "step": 2251 }, { "epoch": 0.36, "grad_norm": 21.151116564101603, "learning_rate": 7.366256688690585e-06, "loss": 0.4158, "step": 2252 }, { "epoch": 0.36, "grad_norm": 10.02125046531735, "learning_rate": 7.363957383919815e-06, "loss": 0.4276, "step": 2253 }, { "epoch": 0.36, "grad_norm": 10.714814572295273, "learning_rate": 7.361657435141892e-06, "loss": 0.4743, "step": 2254 }, { "epoch": 0.36, "grad_norm": 14.540178462722155, "learning_rate": 7.3593568429833825e-06, "loss": 0.4944, "step": 2255 }, { "epoch": 0.36, "grad_norm": 6.3959780615148425, "learning_rate": 7.357055608071034e-06, "loss": 0.4467, "step": 2256 }, { "epoch": 0.36, "grad_norm": 9.326958353084601, "learning_rate": 7.354753731031765e-06, "loss": 0.4786, "step": 2257 }, { "epoch": 0.36, "grad_norm": 9.1770041229199, "learning_rate": 7.352451212492671e-06, "loss": 0.5197, "step": 2258 }, { "epoch": 0.36, "grad_norm": 9.30273349036512, "learning_rate": 7.350148053081021e-06, "loss": 0.4991, "step": 2259 }, { "epoch": 0.36, "grad_norm": 7.378671762319642, "learning_rate": 7.3478442534242565e-06, "loss": 0.4633, "step": 2260 }, { "epoch": 0.36, "grad_norm": 5.851291484288036, "learning_rate": 7.345539814150002e-06, "loss": 0.5002, "step": 2261 }, { "epoch": 0.36, "grad_norm": 6.907208181633826, "learning_rate": 7.3432347358860445e-06, "loss": 0.5443, "step": 2262 }, { "epoch": 0.36, "grad_norm": 18.8556952246286, "learning_rate": 7.340929019260356e-06, "loss": 0.4805, "step": 2263 }, { "epoch": 0.36, "grad_norm": 8.095614189610417, "learning_rate": 7.338622664901073e-06, "loss": 0.4518, "step": 2264 }, { "epoch": 0.36, "grad_norm": 10.839558533596918, "learning_rate": 7.336315673436512e-06, "loss": 0.4984, "step": 2265 }, { "epoch": 0.37, "grad_norm": 6.086179670561378, "learning_rate": 7.33400804549516e-06, "loss": 0.4215, "step": 2266 }, { "epoch": 0.37, "grad_norm": 9.581925745922735, "learning_rate": 7.331699781705679e-06, "loss": 0.5005, "step": 2267 }, { "epoch": 0.37, "grad_norm": 10.225794330510054, "learning_rate": 7.329390882696904e-06, "loss": 0.5105, "step": 2268 }, { "epoch": 0.37, "grad_norm": 8.997626559692268, "learning_rate": 7.327081349097842e-06, "loss": 0.4261, "step": 2269 }, { "epoch": 0.37, "grad_norm": 7.620642326796976, "learning_rate": 7.324771181537676e-06, "loss": 0.4002, "step": 2270 }, { "epoch": 0.37, "grad_norm": 10.499137496831187, "learning_rate": 7.322460380645755e-06, "loss": 0.4664, "step": 2271 }, { "epoch": 0.37, "grad_norm": 11.33877314538186, "learning_rate": 7.32014894705161e-06, "loss": 0.3852, "step": 2272 }, { "epoch": 0.37, "grad_norm": 1.3855615616887829, "learning_rate": 7.317836881384934e-06, "loss": 0.5257, "step": 2273 }, { "epoch": 0.37, "grad_norm": 7.439921804124257, "learning_rate": 7.315524184275602e-06, "loss": 0.4502, "step": 2274 }, { "epoch": 0.37, "grad_norm": 7.8423278727698715, "learning_rate": 7.313210856353653e-06, "loss": 0.4849, "step": 2275 }, { "epoch": 0.37, "grad_norm": 8.377224333850489, "learning_rate": 7.310896898249303e-06, "loss": 0.4851, "step": 2276 }, { "epoch": 0.37, "grad_norm": 11.695300727880074, "learning_rate": 7.3085823105929385e-06, "loss": 0.4897, "step": 2277 }, { "epoch": 0.37, "grad_norm": 7.355416852444542, "learning_rate": 7.306267094015116e-06, "loss": 0.4391, "step": 2278 }, { "epoch": 0.37, "grad_norm": 6.269960654369742, "learning_rate": 7.303951249146563e-06, "loss": 0.3958, "step": 2279 }, { "epoch": 0.37, "grad_norm": 10.942989723426711, "learning_rate": 7.301634776618183e-06, "loss": 0.5127, "step": 2280 }, { "epoch": 0.37, "grad_norm": 6.990299487583623, "learning_rate": 7.299317677061043e-06, "loss": 0.3809, "step": 2281 }, { "epoch": 0.37, "grad_norm": 7.428374292441154, "learning_rate": 7.2969999511063875e-06, "loss": 0.4492, "step": 2282 }, { "epoch": 0.37, "grad_norm": 9.68269509485545, "learning_rate": 7.294681599385629e-06, "loss": 0.4949, "step": 2283 }, { "epoch": 0.37, "grad_norm": 10.726385317223865, "learning_rate": 7.292362622530347e-06, "loss": 0.4387, "step": 2284 }, { "epoch": 0.37, "grad_norm": 5.703692197288127, "learning_rate": 7.2900430211723e-06, "loss": 0.5122, "step": 2285 }, { "epoch": 0.37, "grad_norm": 11.41923267502462, "learning_rate": 7.287722795943407e-06, "loss": 0.477, "step": 2286 }, { "epoch": 0.37, "grad_norm": 7.277230325607926, "learning_rate": 7.285401947475764e-06, "loss": 0.4471, "step": 2287 }, { "epoch": 0.37, "grad_norm": 10.295091473012938, "learning_rate": 7.283080476401634e-06, "loss": 0.4238, "step": 2288 }, { "epoch": 0.37, "grad_norm": 7.223241963594739, "learning_rate": 7.280758383353447e-06, "loss": 0.5175, "step": 2289 }, { "epoch": 0.37, "grad_norm": 9.56715934664918, "learning_rate": 7.278435668963807e-06, "loss": 0.4083, "step": 2290 }, { "epoch": 0.37, "grad_norm": 60.505905171125264, "learning_rate": 7.276112333865485e-06, "loss": 0.5082, "step": 2291 }, { "epoch": 0.37, "grad_norm": 8.0654693024042, "learning_rate": 7.273788378691422e-06, "loss": 0.4844, "step": 2292 }, { "epoch": 0.37, "grad_norm": 1.3780112708343384, "learning_rate": 7.271463804074728e-06, "loss": 0.466, "step": 2293 }, { "epoch": 0.37, "grad_norm": 8.717702348084831, "learning_rate": 7.26913861064868e-06, "loss": 0.4415, "step": 2294 }, { "epoch": 0.37, "grad_norm": 8.101783535247044, "learning_rate": 7.2668127990467266e-06, "loss": 0.4591, "step": 2295 }, { "epoch": 0.37, "grad_norm": 9.028238701024941, "learning_rate": 7.26448636990248e-06, "loss": 0.5111, "step": 2296 }, { "epoch": 0.37, "grad_norm": 7.297986563283944, "learning_rate": 7.262159323849725e-06, "loss": 0.4944, "step": 2297 }, { "epoch": 0.37, "grad_norm": 8.970588226967344, "learning_rate": 7.259831661522415e-06, "loss": 0.4636, "step": 2298 }, { "epoch": 0.37, "grad_norm": 1.3283109488606364, "learning_rate": 7.257503383554668e-06, "loss": 0.4609, "step": 2299 }, { "epoch": 0.37, "grad_norm": 6.281383438785661, "learning_rate": 7.25517449058077e-06, "loss": 0.4399, "step": 2300 }, { "epoch": 0.37, "grad_norm": 6.580084235730238, "learning_rate": 7.252844983235177e-06, "loss": 0.4196, "step": 2301 }, { "epoch": 0.37, "grad_norm": 20.15197987021541, "learning_rate": 7.250514862152509e-06, "loss": 0.4162, "step": 2302 }, { "epoch": 0.37, "grad_norm": 10.963679212689494, "learning_rate": 7.248184127967557e-06, "loss": 0.4434, "step": 2303 }, { "epoch": 0.37, "grad_norm": 12.34447337626002, "learning_rate": 7.245852781315278e-06, "loss": 0.4996, "step": 2304 }, { "epoch": 0.37, "grad_norm": 6.822386625745935, "learning_rate": 7.243520822830794e-06, "loss": 0.4463, "step": 2305 }, { "epoch": 0.37, "grad_norm": 7.376647823834363, "learning_rate": 7.241188253149395e-06, "loss": 0.412, "step": 2306 }, { "epoch": 0.37, "grad_norm": 6.82513575055368, "learning_rate": 7.238855072906537e-06, "loss": 0.5113, "step": 2307 }, { "epoch": 0.37, "grad_norm": 1.388090847297473, "learning_rate": 7.236521282737842e-06, "loss": 0.4658, "step": 2308 }, { "epoch": 0.37, "grad_norm": 5.217588585050403, "learning_rate": 7.2341868832791e-06, "loss": 0.438, "step": 2309 }, { "epoch": 0.37, "grad_norm": 12.68103867624612, "learning_rate": 7.231851875166266e-06, "loss": 0.4049, "step": 2310 }, { "epoch": 0.37, "grad_norm": 9.662099721902655, "learning_rate": 7.229516259035458e-06, "loss": 0.3642, "step": 2311 }, { "epoch": 0.37, "grad_norm": 10.455954810168295, "learning_rate": 7.227180035522966e-06, "loss": 0.491, "step": 2312 }, { "epoch": 0.37, "grad_norm": 8.941646924722033, "learning_rate": 7.224843205265239e-06, "loss": 0.4429, "step": 2313 }, { "epoch": 0.37, "grad_norm": 6.373739081692871, "learning_rate": 7.222505768898894e-06, "loss": 0.4596, "step": 2314 }, { "epoch": 0.37, "grad_norm": 1.3125910203014157, "learning_rate": 7.220167727060714e-06, "loss": 0.5171, "step": 2315 }, { "epoch": 0.37, "grad_norm": 7.691027964746766, "learning_rate": 7.217829080387648e-06, "loss": 0.461, "step": 2316 }, { "epoch": 0.37, "grad_norm": 6.236353262272981, "learning_rate": 7.2154898295168045e-06, "loss": 0.5031, "step": 2317 }, { "epoch": 0.37, "grad_norm": 7.06257014097761, "learning_rate": 7.21314997508546e-06, "loss": 0.4318, "step": 2318 }, { "epoch": 0.37, "grad_norm": 13.222896431949701, "learning_rate": 7.210809517731057e-06, "loss": 0.4274, "step": 2319 }, { "epoch": 0.37, "grad_norm": 7.275052758516195, "learning_rate": 7.2084684580912e-06, "loss": 0.4484, "step": 2320 }, { "epoch": 0.37, "grad_norm": 7.047955159935836, "learning_rate": 7.206126796803659e-06, "loss": 0.4387, "step": 2321 }, { "epoch": 0.37, "grad_norm": 7.040850616521069, "learning_rate": 7.203784534506364e-06, "loss": 0.4846, "step": 2322 }, { "epoch": 0.37, "grad_norm": 11.99305418972947, "learning_rate": 7.201441671837417e-06, "loss": 0.4626, "step": 2323 }, { "epoch": 0.37, "grad_norm": 7.570588079221245, "learning_rate": 7.199098209435073e-06, "loss": 0.4708, "step": 2324 }, { "epoch": 0.37, "grad_norm": 1.232881339789681, "learning_rate": 7.19675414793776e-06, "loss": 0.4992, "step": 2325 }, { "epoch": 0.37, "grad_norm": 8.622926516400964, "learning_rate": 7.194409487984063e-06, "loss": 0.4003, "step": 2326 }, { "epoch": 0.37, "grad_norm": 10.095281070298373, "learning_rate": 7.192064230212733e-06, "loss": 0.4063, "step": 2327 }, { "epoch": 0.38, "grad_norm": 9.798873220757585, "learning_rate": 7.189718375262681e-06, "loss": 0.5014, "step": 2328 }, { "epoch": 0.38, "grad_norm": 7.798353927363444, "learning_rate": 7.1873719237729835e-06, "loss": 0.494, "step": 2329 }, { "epoch": 0.38, "grad_norm": 32.21359090778231, "learning_rate": 7.18502487638288e-06, "loss": 0.4877, "step": 2330 }, { "epoch": 0.38, "grad_norm": 76.69863828111654, "learning_rate": 7.182677233731768e-06, "loss": 0.4908, "step": 2331 }, { "epoch": 0.38, "grad_norm": 8.306496828211518, "learning_rate": 7.1803289964592115e-06, "loss": 0.4528, "step": 2332 }, { "epoch": 0.38, "grad_norm": 12.959558298737406, "learning_rate": 7.177980165204935e-06, "loss": 0.4659, "step": 2333 }, { "epoch": 0.38, "grad_norm": 5.383401295302071, "learning_rate": 7.1756307406088275e-06, "loss": 0.4712, "step": 2334 }, { "epoch": 0.38, "grad_norm": 7.820948031824189, "learning_rate": 7.173280723310932e-06, "loss": 0.4685, "step": 2335 }, { "epoch": 0.38, "grad_norm": 9.370859669820504, "learning_rate": 7.170930113951462e-06, "loss": 0.4742, "step": 2336 }, { "epoch": 0.38, "grad_norm": 10.201344388532467, "learning_rate": 7.168578913170788e-06, "loss": 0.5038, "step": 2337 }, { "epoch": 0.38, "grad_norm": 6.148699503788613, "learning_rate": 7.166227121609439e-06, "loss": 0.4438, "step": 2338 }, { "epoch": 0.38, "grad_norm": 7.3972833657333625, "learning_rate": 7.16387473990811e-06, "loss": 0.3986, "step": 2339 }, { "epoch": 0.38, "grad_norm": 7.6801814010577, "learning_rate": 7.1615217687076555e-06, "loss": 0.43, "step": 2340 }, { "epoch": 0.38, "grad_norm": 7.677543195091026, "learning_rate": 7.159168208649086e-06, "loss": 0.4779, "step": 2341 }, { "epoch": 0.38, "grad_norm": 7.197696416554334, "learning_rate": 7.15681406037358e-06, "loss": 0.389, "step": 2342 }, { "epoch": 0.38, "grad_norm": 13.278102046453421, "learning_rate": 7.154459324522474e-06, "loss": 0.452, "step": 2343 }, { "epoch": 0.38, "grad_norm": 9.484114423721536, "learning_rate": 7.152104001737254e-06, "loss": 0.453, "step": 2344 }, { "epoch": 0.38, "grad_norm": 6.494850371531936, "learning_rate": 7.149748092659585e-06, "loss": 0.5003, "step": 2345 }, { "epoch": 0.38, "grad_norm": 5.951922827002078, "learning_rate": 7.147391597931277e-06, "loss": 0.3957, "step": 2346 }, { "epoch": 0.38, "grad_norm": 1.2073194239313514, "learning_rate": 7.145034518194304e-06, "loss": 0.4873, "step": 2347 }, { "epoch": 0.38, "grad_norm": 1.2781448741845711, "learning_rate": 7.1426768540908e-06, "loss": 0.4385, "step": 2348 }, { "epoch": 0.38, "grad_norm": 7.792667243618058, "learning_rate": 7.140318606263058e-06, "loss": 0.4329, "step": 2349 }, { "epoch": 0.38, "grad_norm": 7.126932308540526, "learning_rate": 7.137959775353529e-06, "loss": 0.4599, "step": 2350 }, { "epoch": 0.38, "grad_norm": 5.254358907380975, "learning_rate": 7.135600362004824e-06, "loss": 0.5906, "step": 2351 }, { "epoch": 0.38, "grad_norm": 7.571658807909942, "learning_rate": 7.133240366859713e-06, "loss": 0.4699, "step": 2352 }, { "epoch": 0.38, "grad_norm": 11.56911111335395, "learning_rate": 7.130879790561122e-06, "loss": 0.488, "step": 2353 }, { "epoch": 0.38, "grad_norm": 1.3955096313395376, "learning_rate": 7.128518633752139e-06, "loss": 0.4683, "step": 2354 }, { "epoch": 0.38, "grad_norm": 7.039838244323425, "learning_rate": 7.126156897076005e-06, "loss": 0.4212, "step": 2355 }, { "epoch": 0.38, "grad_norm": 6.335872449497525, "learning_rate": 7.123794581176127e-06, "loss": 0.4778, "step": 2356 }, { "epoch": 0.38, "grad_norm": 1.4370740332278222, "learning_rate": 7.121431686696061e-06, "loss": 0.4848, "step": 2357 }, { "epoch": 0.38, "grad_norm": 7.535473843128977, "learning_rate": 7.119068214279525e-06, "loss": 0.4218, "step": 2358 }, { "epoch": 0.38, "grad_norm": 10.693924844396516, "learning_rate": 7.116704164570398e-06, "loss": 0.4731, "step": 2359 }, { "epoch": 0.38, "grad_norm": 5.212206817490371, "learning_rate": 7.114339538212707e-06, "loss": 0.4985, "step": 2360 }, { "epoch": 0.38, "grad_norm": 22.40544599207518, "learning_rate": 7.111974335850644e-06, "loss": 0.3914, "step": 2361 }, { "epoch": 0.38, "grad_norm": 1.2077120511604114, "learning_rate": 7.1096085581285555e-06, "loss": 0.4901, "step": 2362 }, { "epoch": 0.38, "grad_norm": 13.08913830297425, "learning_rate": 7.1072422056909426e-06, "loss": 0.4581, "step": 2363 }, { "epoch": 0.38, "grad_norm": 14.535403628425986, "learning_rate": 7.104875279182468e-06, "loss": 0.5037, "step": 2364 }, { "epoch": 0.38, "grad_norm": 11.29889237359968, "learning_rate": 7.102507779247947e-06, "loss": 0.4171, "step": 2365 }, { "epoch": 0.38, "grad_norm": 11.247829408752288, "learning_rate": 7.100139706532347e-06, "loss": 0.5091, "step": 2366 }, { "epoch": 0.38, "grad_norm": 10.342420806821993, "learning_rate": 7.097771061680805e-06, "loss": 0.4864, "step": 2367 }, { "epoch": 0.38, "grad_norm": 20.43916592740891, "learning_rate": 7.095401845338598e-06, "loss": 0.4294, "step": 2368 }, { "epoch": 0.38, "grad_norm": 7.346033549745908, "learning_rate": 7.093032058151168e-06, "loss": 0.4017, "step": 2369 }, { "epoch": 0.38, "grad_norm": 1.4126253185001287, "learning_rate": 7.090661700764112e-06, "loss": 0.4767, "step": 2370 }, { "epoch": 0.38, "grad_norm": 9.442100200897753, "learning_rate": 7.088290773823177e-06, "loss": 0.4492, "step": 2371 }, { "epoch": 0.38, "grad_norm": 10.093730759580277, "learning_rate": 7.085919277974274e-06, "loss": 0.3833, "step": 2372 }, { "epoch": 0.38, "grad_norm": 14.209336948606353, "learning_rate": 7.083547213863458e-06, "loss": 0.4717, "step": 2373 }, { "epoch": 0.38, "grad_norm": 11.700448577681772, "learning_rate": 7.0811745821369495e-06, "loss": 0.4499, "step": 2374 }, { "epoch": 0.38, "grad_norm": 16.525786521393062, "learning_rate": 7.0788013834411165e-06, "loss": 0.3774, "step": 2375 }, { "epoch": 0.38, "grad_norm": 6.971559601576116, "learning_rate": 7.0764276184224845e-06, "loss": 0.4552, "step": 2376 }, { "epoch": 0.38, "grad_norm": 10.053155738690688, "learning_rate": 7.07405328772773e-06, "loss": 0.4599, "step": 2377 }, { "epoch": 0.38, "grad_norm": 10.54922162754398, "learning_rate": 7.071678392003691e-06, "loss": 0.4537, "step": 2378 }, { "epoch": 0.38, "grad_norm": 9.390768274107499, "learning_rate": 7.069302931897352e-06, "loss": 0.4318, "step": 2379 }, { "epoch": 0.38, "grad_norm": 16.389897126320463, "learning_rate": 7.0669269080558515e-06, "loss": 0.4357, "step": 2380 }, { "epoch": 0.38, "grad_norm": 6.359967822353355, "learning_rate": 7.064550321126489e-06, "loss": 0.4508, "step": 2381 }, { "epoch": 0.38, "grad_norm": 1.4346415714824337, "learning_rate": 7.0621731717567055e-06, "loss": 0.4711, "step": 2382 }, { "epoch": 0.38, "grad_norm": 10.367588822697945, "learning_rate": 7.059795460594109e-06, "loss": 0.4941, "step": 2383 }, { "epoch": 0.38, "grad_norm": 5.644600413776572, "learning_rate": 7.057417188286449e-06, "loss": 0.4505, "step": 2384 }, { "epoch": 0.38, "grad_norm": 6.441600520334161, "learning_rate": 7.0550383554816345e-06, "loss": 0.4506, "step": 2385 }, { "epoch": 0.38, "grad_norm": 5.380972897998028, "learning_rate": 7.052658962827724e-06, "loss": 0.4344, "step": 2386 }, { "epoch": 0.38, "grad_norm": 7.398920871556851, "learning_rate": 7.050279010972932e-06, "loss": 0.4521, "step": 2387 }, { "epoch": 0.38, "grad_norm": 8.274252363518743, "learning_rate": 7.047898500565619e-06, "loss": 0.4474, "step": 2388 }, { "epoch": 0.38, "grad_norm": 1.0866906532737934, "learning_rate": 7.045517432254304e-06, "loss": 0.4404, "step": 2389 }, { "epoch": 0.39, "grad_norm": 19.589335321162775, "learning_rate": 7.043135806687655e-06, "loss": 0.5244, "step": 2390 }, { "epoch": 0.39, "grad_norm": 7.120472291837094, "learning_rate": 7.040753624514494e-06, "loss": 0.4877, "step": 2391 }, { "epoch": 0.39, "grad_norm": 7.983480443303751, "learning_rate": 7.038370886383793e-06, "loss": 0.4287, "step": 2392 }, { "epoch": 0.39, "grad_norm": 10.234894789703503, "learning_rate": 7.035987592944672e-06, "loss": 0.4438, "step": 2393 }, { "epoch": 0.39, "grad_norm": 5.676336474260915, "learning_rate": 7.03360374484641e-06, "loss": 0.4813, "step": 2394 }, { "epoch": 0.39, "grad_norm": 8.041342141168878, "learning_rate": 7.031219342738431e-06, "loss": 0.4503, "step": 2395 }, { "epoch": 0.39, "grad_norm": 6.482946764121926, "learning_rate": 7.028834387270311e-06, "loss": 0.418, "step": 2396 }, { "epoch": 0.39, "grad_norm": 8.444080281908883, "learning_rate": 7.02644887909178e-06, "loss": 0.442, "step": 2397 }, { "epoch": 0.39, "grad_norm": 5.512584356362216, "learning_rate": 7.024062818852716e-06, "loss": 0.3888, "step": 2398 }, { "epoch": 0.39, "grad_norm": 9.405662255519774, "learning_rate": 7.021676207203145e-06, "loss": 0.4353, "step": 2399 }, { "epoch": 0.39, "grad_norm": 10.75244419428642, "learning_rate": 7.019289044793247e-06, "loss": 0.3938, "step": 2400 }, { "epoch": 0.39, "grad_norm": 11.346925884032942, "learning_rate": 7.016901332273352e-06, "loss": 0.5427, "step": 2401 }, { "epoch": 0.39, "grad_norm": 15.204562533791732, "learning_rate": 7.014513070293938e-06, "loss": 0.4767, "step": 2402 }, { "epoch": 0.39, "grad_norm": 7.040572170452628, "learning_rate": 7.012124259505633e-06, "loss": 0.399, "step": 2403 }, { "epoch": 0.39, "grad_norm": 7.442254116693553, "learning_rate": 7.0097349005592145e-06, "loss": 0.4117, "step": 2404 }, { "epoch": 0.39, "grad_norm": 5.985072505865638, "learning_rate": 7.007344994105612e-06, "loss": 0.4158, "step": 2405 }, { "epoch": 0.39, "grad_norm": 12.478739903039797, "learning_rate": 7.004954540795899e-06, "loss": 0.518, "step": 2406 }, { "epoch": 0.39, "grad_norm": 6.704643709428703, "learning_rate": 7.002563541281302e-06, "loss": 0.4486, "step": 2407 }, { "epoch": 0.39, "grad_norm": 7.3844073556806995, "learning_rate": 7.000171996213196e-06, "loss": 0.3981, "step": 2408 }, { "epoch": 0.39, "grad_norm": 13.199684753424732, "learning_rate": 6.997779906243103e-06, "loss": 0.3728, "step": 2409 }, { "epoch": 0.39, "grad_norm": 16.461602843019307, "learning_rate": 6.995387272022695e-06, "loss": 0.3962, "step": 2410 }, { "epoch": 0.39, "grad_norm": 8.678208730393507, "learning_rate": 6.99299409420379e-06, "loss": 0.5136, "step": 2411 }, { "epoch": 0.39, "grad_norm": 11.584951031330831, "learning_rate": 6.9906003734383565e-06, "loss": 0.4088, "step": 2412 }, { "epoch": 0.39, "grad_norm": 6.206057432935938, "learning_rate": 6.98820611037851e-06, "loss": 0.4919, "step": 2413 }, { "epoch": 0.39, "grad_norm": 25.48820974719153, "learning_rate": 6.985811305676515e-06, "loss": 0.4919, "step": 2414 }, { "epoch": 0.39, "grad_norm": 23.947717235865277, "learning_rate": 6.98341595998478e-06, "loss": 0.4232, "step": 2415 }, { "epoch": 0.39, "grad_norm": 8.898446328823951, "learning_rate": 6.981020073955866e-06, "loss": 0.3991, "step": 2416 }, { "epoch": 0.39, "grad_norm": 8.457360816881248, "learning_rate": 6.978623648242474e-06, "loss": 0.4558, "step": 2417 }, { "epoch": 0.39, "grad_norm": 11.97962498058865, "learning_rate": 6.9762266834974605e-06, "loss": 0.4969, "step": 2418 }, { "epoch": 0.39, "grad_norm": 15.40563419821297, "learning_rate": 6.973829180373823e-06, "loss": 0.4339, "step": 2419 }, { "epoch": 0.39, "grad_norm": 19.828764356090712, "learning_rate": 6.971431139524709e-06, "loss": 0.4901, "step": 2420 }, { "epoch": 0.39, "grad_norm": 9.778713667308615, "learning_rate": 6.96903256160341e-06, "loss": 0.4309, "step": 2421 }, { "epoch": 0.39, "grad_norm": 9.36401245946902, "learning_rate": 6.966633447263362e-06, "loss": 0.4377, "step": 2422 }, { "epoch": 0.39, "grad_norm": 9.788069862285154, "learning_rate": 6.964233797158155e-06, "loss": 0.469, "step": 2423 }, { "epoch": 0.39, "grad_norm": 12.102995897781907, "learning_rate": 6.961833611941515e-06, "loss": 0.4568, "step": 2424 }, { "epoch": 0.39, "grad_norm": 8.4561923233948, "learning_rate": 6.959432892267324e-06, "loss": 0.4617, "step": 2425 }, { "epoch": 0.39, "grad_norm": 9.187151635133098, "learning_rate": 6.957031638789598e-06, "loss": 0.4169, "step": 2426 }, { "epoch": 0.39, "grad_norm": 1.3874037784658793, "learning_rate": 6.954629852162509e-06, "loss": 0.4984, "step": 2427 }, { "epoch": 0.39, "grad_norm": 13.449698897659353, "learning_rate": 6.952227533040369e-06, "loss": 0.4636, "step": 2428 }, { "epoch": 0.39, "grad_norm": 22.568536652964223, "learning_rate": 6.949824682077635e-06, "loss": 0.4902, "step": 2429 }, { "epoch": 0.39, "grad_norm": 11.351741678071608, "learning_rate": 6.947421299928909e-06, "loss": 0.4346, "step": 2430 }, { "epoch": 0.39, "grad_norm": 7.790244987952567, "learning_rate": 6.945017387248942e-06, "loss": 0.4104, "step": 2431 }, { "epoch": 0.39, "grad_norm": 13.487103643818642, "learning_rate": 6.942612944692624e-06, "loss": 0.3904, "step": 2432 }, { "epoch": 0.39, "grad_norm": 16.244672362302826, "learning_rate": 6.940207972914989e-06, "loss": 0.3976, "step": 2433 }, { "epoch": 0.39, "grad_norm": 10.504723629669721, "learning_rate": 6.9378024725712225e-06, "loss": 0.408, "step": 2434 }, { "epoch": 0.39, "grad_norm": 10.508221371748512, "learning_rate": 6.935396444316646e-06, "loss": 0.4445, "step": 2435 }, { "epoch": 0.39, "grad_norm": 1.0637657091295818, "learning_rate": 6.9329898888067295e-06, "loss": 0.4681, "step": 2436 }, { "epoch": 0.39, "grad_norm": 1.746609200866317, "learning_rate": 6.930582806697082e-06, "loss": 0.4757, "step": 2437 }, { "epoch": 0.39, "grad_norm": 11.244148747738713, "learning_rate": 6.928175198643463e-06, "loss": 0.4374, "step": 2438 }, { "epoch": 0.39, "grad_norm": 6.587036160693334, "learning_rate": 6.9257670653017674e-06, "loss": 0.4679, "step": 2439 }, { "epoch": 0.39, "grad_norm": 8.316161717954362, "learning_rate": 6.92335840732804e-06, "loss": 0.387, "step": 2440 }, { "epoch": 0.39, "grad_norm": 10.624828080509273, "learning_rate": 6.9209492253784664e-06, "loss": 0.441, "step": 2441 }, { "epoch": 0.39, "grad_norm": 9.752619066860827, "learning_rate": 6.91853952010937e-06, "loss": 0.4813, "step": 2442 }, { "epoch": 0.39, "grad_norm": 7.472394904483936, "learning_rate": 6.916129292177225e-06, "loss": 0.5247, "step": 2443 }, { "epoch": 0.39, "grad_norm": 5.532384304401379, "learning_rate": 6.913718542238642e-06, "loss": 0.497, "step": 2444 }, { "epoch": 0.39, "grad_norm": 6.684806398803388, "learning_rate": 6.911307270950376e-06, "loss": 0.4275, "step": 2445 }, { "epoch": 0.39, "grad_norm": 5.8886421716911395, "learning_rate": 6.908895478969324e-06, "loss": 0.4319, "step": 2446 }, { "epoch": 0.39, "grad_norm": 6.638392199346289, "learning_rate": 6.906483166952526e-06, "loss": 0.4775, "step": 2447 }, { "epoch": 0.39, "grad_norm": 7.564972130318737, "learning_rate": 6.904070335557158e-06, "loss": 0.4477, "step": 2448 }, { "epoch": 0.39, "grad_norm": 8.183747032671569, "learning_rate": 6.9016569854405466e-06, "loss": 0.4179, "step": 2449 }, { "epoch": 0.39, "grad_norm": 13.931457895049578, "learning_rate": 6.899243117260153e-06, "loss": 0.4345, "step": 2450 }, { "epoch": 0.39, "grad_norm": 8.071694154845241, "learning_rate": 6.896828731673579e-06, "loss": 0.4728, "step": 2451 }, { "epoch": 0.4, "grad_norm": 9.848153462459301, "learning_rate": 6.894413829338576e-06, "loss": 0.4849, "step": 2452 }, { "epoch": 0.4, "grad_norm": 9.617163645216445, "learning_rate": 6.891998410913021e-06, "loss": 0.4632, "step": 2453 }, { "epoch": 0.4, "grad_norm": 12.02608942490304, "learning_rate": 6.88958247705495e-06, "loss": 0.5041, "step": 2454 }, { "epoch": 0.4, "grad_norm": 43.31490199784347, "learning_rate": 6.887166028422524e-06, "loss": 0.427, "step": 2455 }, { "epoch": 0.4, "grad_norm": 17.880979555375145, "learning_rate": 6.884749065674051e-06, "loss": 0.455, "step": 2456 }, { "epoch": 0.4, "grad_norm": 8.553133422448308, "learning_rate": 6.88233158946798e-06, "loss": 0.4146, "step": 2457 }, { "epoch": 0.4, "grad_norm": 6.792022571716306, "learning_rate": 6.879913600462898e-06, "loss": 0.4784, "step": 2458 }, { "epoch": 0.4, "grad_norm": 8.120768171119673, "learning_rate": 6.87749509931753e-06, "loss": 0.4019, "step": 2459 }, { "epoch": 0.4, "grad_norm": 8.628429299028117, "learning_rate": 6.875076086690744e-06, "loss": 0.4932, "step": 2460 }, { "epoch": 0.4, "grad_norm": 7.923010388243224, "learning_rate": 6.8726565632415445e-06, "loss": 0.4385, "step": 2461 }, { "epoch": 0.4, "grad_norm": 9.313389831013437, "learning_rate": 6.8702365296290775e-06, "loss": 0.4669, "step": 2462 }, { "epoch": 0.4, "grad_norm": 6.2075318171180305, "learning_rate": 6.867815986512627e-06, "loss": 0.3429, "step": 2463 }, { "epoch": 0.4, "grad_norm": 8.542475948699439, "learning_rate": 6.865394934551613e-06, "loss": 0.4565, "step": 2464 }, { "epoch": 0.4, "grad_norm": 24.349009204481835, "learning_rate": 6.862973374405601e-06, "loss": 0.401, "step": 2465 }, { "epoch": 0.4, "grad_norm": 17.61356147439899, "learning_rate": 6.860551306734289e-06, "loss": 0.5061, "step": 2466 }, { "epoch": 0.4, "grad_norm": 10.353981452950123, "learning_rate": 6.858128732197513e-06, "loss": 0.5167, "step": 2467 }, { "epoch": 0.4, "grad_norm": 11.959934202270215, "learning_rate": 6.855705651455252e-06, "loss": 0.4518, "step": 2468 }, { "epoch": 0.4, "grad_norm": 12.390369952358583, "learning_rate": 6.853282065167618e-06, "loss": 0.4665, "step": 2469 }, { "epoch": 0.4, "grad_norm": 15.66352112030414, "learning_rate": 6.850857973994865e-06, "loss": 0.5094, "step": 2470 }, { "epoch": 0.4, "grad_norm": 7.637520938717194, "learning_rate": 6.8484333785973786e-06, "loss": 0.4668, "step": 2471 }, { "epoch": 0.4, "grad_norm": 11.120490352544106, "learning_rate": 6.846008279635688e-06, "loss": 0.439, "step": 2472 }, { "epoch": 0.4, "grad_norm": 7.498246428285075, "learning_rate": 6.8435826777704575e-06, "loss": 0.4388, "step": 2473 }, { "epoch": 0.4, "grad_norm": 9.319473845578853, "learning_rate": 6.841156573662486e-06, "loss": 0.4539, "step": 2474 }, { "epoch": 0.4, "grad_norm": 13.067272245181947, "learning_rate": 6.8387299679727125e-06, "loss": 0.4351, "step": 2475 }, { "epoch": 0.4, "grad_norm": 6.951437447637335, "learning_rate": 6.836302861362211e-06, "loss": 0.5113, "step": 2476 }, { "epoch": 0.4, "grad_norm": 13.348753098698928, "learning_rate": 6.8338752544921915e-06, "loss": 0.4185, "step": 2477 }, { "epoch": 0.4, "grad_norm": 5.626255046470796, "learning_rate": 6.831447148024002e-06, "loss": 0.4693, "step": 2478 }, { "epoch": 0.4, "grad_norm": 9.314454162590877, "learning_rate": 6.829018542619125e-06, "loss": 0.5144, "step": 2479 }, { "epoch": 0.4, "grad_norm": 25.22795601174396, "learning_rate": 6.82658943893918e-06, "loss": 0.3851, "step": 2480 }, { "epoch": 0.4, "grad_norm": 10.805834671419454, "learning_rate": 6.824159837645921e-06, "loss": 0.4493, "step": 2481 }, { "epoch": 0.4, "grad_norm": 8.71174134334264, "learning_rate": 6.821729739401239e-06, "loss": 0.4594, "step": 2482 }, { "epoch": 0.4, "grad_norm": 7.946191081683506, "learning_rate": 6.8192991448671605e-06, "loss": 0.3986, "step": 2483 }, { "epoch": 0.4, "grad_norm": 8.371589467223759, "learning_rate": 6.8168680547058455e-06, "loss": 0.357, "step": 2484 }, { "epoch": 0.4, "grad_norm": 1.3604249155764994, "learning_rate": 6.81443646957959e-06, "loss": 0.4532, "step": 2485 }, { "epoch": 0.4, "grad_norm": 7.487567277588702, "learning_rate": 6.812004390150825e-06, "loss": 0.469, "step": 2486 }, { "epoch": 0.4, "grad_norm": 8.885175407638135, "learning_rate": 6.809571817082117e-06, "loss": 0.4407, "step": 2487 }, { "epoch": 0.4, "grad_norm": 6.80959018378842, "learning_rate": 6.807138751036163e-06, "loss": 0.4614, "step": 2488 }, { "epoch": 0.4, "grad_norm": 7.911564898686454, "learning_rate": 6.804705192675799e-06, "loss": 0.4443, "step": 2489 }, { "epoch": 0.4, "grad_norm": 8.605052674841815, "learning_rate": 6.802271142663994e-06, "loss": 0.3692, "step": 2490 }, { "epoch": 0.4, "grad_norm": 10.279131357547781, "learning_rate": 6.799836601663851e-06, "loss": 0.4285, "step": 2491 }, { "epoch": 0.4, "grad_norm": 6.78402630822119, "learning_rate": 6.797401570338604e-06, "loss": 0.4253, "step": 2492 }, { "epoch": 0.4, "grad_norm": 8.243260681573895, "learning_rate": 6.794966049351625e-06, "loss": 0.4135, "step": 2493 }, { "epoch": 0.4, "grad_norm": 11.689243656869555, "learning_rate": 6.792530039366414e-06, "loss": 0.5574, "step": 2494 }, { "epoch": 0.4, "grad_norm": 5.224508072563628, "learning_rate": 6.790093541046609e-06, "loss": 0.4303, "step": 2495 }, { "epoch": 0.4, "grad_norm": 8.94453921064131, "learning_rate": 6.787656555055979e-06, "loss": 0.4364, "step": 2496 }, { "epoch": 0.4, "grad_norm": 7.028222904594061, "learning_rate": 6.785219082058426e-06, "loss": 0.4769, "step": 2497 }, { "epoch": 0.4, "grad_norm": 7.602281927823516, "learning_rate": 6.782781122717987e-06, "loss": 0.4231, "step": 2498 }, { "epoch": 0.4, "grad_norm": 8.65427888621789, "learning_rate": 6.780342677698826e-06, "loss": 0.47, "step": 2499 }, { "epoch": 0.4, "grad_norm": 7.401731655806332, "learning_rate": 6.777903747665245e-06, "loss": 0.4269, "step": 2500 }, { "epoch": 0.4, "grad_norm": 6.54475510786984, "learning_rate": 6.775464333281674e-06, "loss": 0.5074, "step": 2501 }, { "epoch": 0.4, "grad_norm": 6.795421086707475, "learning_rate": 6.773024435212678e-06, "loss": 0.5656, "step": 2502 }, { "epoch": 0.4, "grad_norm": 9.8236463999906, "learning_rate": 6.770584054122954e-06, "loss": 0.4163, "step": 2503 }, { "epoch": 0.4, "grad_norm": 19.534862380627295, "learning_rate": 6.7681431906773255e-06, "loss": 0.5112, "step": 2504 }, { "epoch": 0.4, "grad_norm": 5.415365863433265, "learning_rate": 6.765701845540753e-06, "loss": 0.4402, "step": 2505 }, { "epoch": 0.4, "grad_norm": 5.003525598342758, "learning_rate": 6.763260019378325e-06, "loss": 0.4953, "step": 2506 }, { "epoch": 0.4, "grad_norm": 5.871693100856605, "learning_rate": 6.760817712855266e-06, "loss": 0.4365, "step": 2507 }, { "epoch": 0.4, "grad_norm": 4.905281859691696, "learning_rate": 6.75837492663692e-06, "loss": 0.4187, "step": 2508 }, { "epoch": 0.4, "grad_norm": 8.032936131322165, "learning_rate": 6.755931661388778e-06, "loss": 0.3569, "step": 2509 }, { "epoch": 0.4, "grad_norm": 6.708119835835211, "learning_rate": 6.753487917776447e-06, "loss": 0.4506, "step": 2510 }, { "epoch": 0.4, "grad_norm": 5.7092951775892935, "learning_rate": 6.751043696465674e-06, "loss": 0.4922, "step": 2511 }, { "epoch": 0.4, "grad_norm": 14.129649127754325, "learning_rate": 6.748598998122328e-06, "loss": 0.4639, "step": 2512 }, { "epoch": 0.4, "grad_norm": 1.153527092736446, "learning_rate": 6.746153823412416e-06, "loss": 0.4482, "step": 2513 }, { "epoch": 0.41, "grad_norm": 5.62264251203995, "learning_rate": 6.7437081730020695e-06, "loss": 0.4269, "step": 2514 }, { "epoch": 0.41, "grad_norm": 7.32422041928711, "learning_rate": 6.7412620475575495e-06, "loss": 0.4972, "step": 2515 }, { "epoch": 0.41, "grad_norm": 8.801432238383798, "learning_rate": 6.7388154477452505e-06, "loss": 0.4625, "step": 2516 }, { "epoch": 0.41, "grad_norm": 40.776797507388686, "learning_rate": 6.736368374231693e-06, "loss": 0.4438, "step": 2517 }, { "epoch": 0.41, "grad_norm": 6.170686519146614, "learning_rate": 6.733920827683529e-06, "loss": 0.461, "step": 2518 }, { "epoch": 0.41, "grad_norm": 6.751376694019045, "learning_rate": 6.731472808767532e-06, "loss": 0.4547, "step": 2519 }, { "epoch": 0.41, "grad_norm": 9.71397785458948, "learning_rate": 6.729024318150617e-06, "loss": 0.5061, "step": 2520 }, { "epoch": 0.41, "grad_norm": 4.594982642960922, "learning_rate": 6.726575356499814e-06, "loss": 0.4371, "step": 2521 }, { "epoch": 0.41, "grad_norm": 9.250113129443198, "learning_rate": 6.724125924482292e-06, "loss": 0.3829, "step": 2522 }, { "epoch": 0.41, "grad_norm": 11.853882875881725, "learning_rate": 6.7216760227653426e-06, "loss": 0.3786, "step": 2523 }, { "epoch": 0.41, "grad_norm": 7.861008928616773, "learning_rate": 6.7192256520163844e-06, "loss": 0.4565, "step": 2524 }, { "epoch": 0.41, "grad_norm": 6.457622200387418, "learning_rate": 6.7167748129029705e-06, "loss": 0.3949, "step": 2525 }, { "epoch": 0.41, "grad_norm": 4.6602605638169035, "learning_rate": 6.714323506092773e-06, "loss": 0.4781, "step": 2526 }, { "epoch": 0.41, "grad_norm": 7.436911792902247, "learning_rate": 6.711871732253596e-06, "loss": 0.411, "step": 2527 }, { "epoch": 0.41, "grad_norm": 4.101439601441904, "learning_rate": 6.709419492053373e-06, "loss": 0.4377, "step": 2528 }, { "epoch": 0.41, "grad_norm": 8.153150686188736, "learning_rate": 6.706966786160159e-06, "loss": 0.464, "step": 2529 }, { "epoch": 0.41, "grad_norm": 7.995667865135097, "learning_rate": 6.7045136152421395e-06, "loss": 0.4357, "step": 2530 }, { "epoch": 0.41, "grad_norm": 6.872361788978383, "learning_rate": 6.702059979967627e-06, "loss": 0.413, "step": 2531 }, { "epoch": 0.41, "grad_norm": 5.254810523532101, "learning_rate": 6.699605881005058e-06, "loss": 0.4111, "step": 2532 }, { "epoch": 0.41, "grad_norm": 6.489493616878587, "learning_rate": 6.697151319022996e-06, "loss": 0.472, "step": 2533 }, { "epoch": 0.41, "grad_norm": 150.0070041924996, "learning_rate": 6.694696294690133e-06, "loss": 0.4326, "step": 2534 }, { "epoch": 0.41, "grad_norm": 7.198379398120597, "learning_rate": 6.692240808675286e-06, "loss": 0.496, "step": 2535 }, { "epoch": 0.41, "grad_norm": 5.548660764451137, "learning_rate": 6.689784861647395e-06, "loss": 0.4637, "step": 2536 }, { "epoch": 0.41, "grad_norm": 1.23958418346717, "learning_rate": 6.6873284542755275e-06, "loss": 0.4693, "step": 2537 }, { "epoch": 0.41, "grad_norm": 35.062916405547824, "learning_rate": 6.684871587228878e-06, "loss": 0.4018, "step": 2538 }, { "epoch": 0.41, "grad_norm": 10.641131105627792, "learning_rate": 6.682414261176765e-06, "loss": 0.4659, "step": 2539 }, { "epoch": 0.41, "grad_norm": 4.948874295505058, "learning_rate": 6.6799564767886305e-06, "loss": 0.4367, "step": 2540 }, { "epoch": 0.41, "grad_norm": 5.4784267684022625, "learning_rate": 6.677498234734045e-06, "loss": 0.4525, "step": 2541 }, { "epoch": 0.41, "grad_norm": 5.2501831605458245, "learning_rate": 6.675039535682699e-06, "loss": 0.4128, "step": 2542 }, { "epoch": 0.41, "grad_norm": 5.473418512295367, "learning_rate": 6.67258038030441e-06, "loss": 0.4735, "step": 2543 }, { "epoch": 0.41, "grad_norm": 5.2851383732486426, "learning_rate": 6.67012076926912e-06, "loss": 0.4395, "step": 2544 }, { "epoch": 0.41, "grad_norm": 6.604074788966195, "learning_rate": 6.667660703246897e-06, "loss": 0.4685, "step": 2545 }, { "epoch": 0.41, "grad_norm": 6.313606849498942, "learning_rate": 6.665200182907928e-06, "loss": 0.4202, "step": 2546 }, { "epoch": 0.41, "grad_norm": 6.956636370713147, "learning_rate": 6.662739208922529e-06, "loss": 0.5373, "step": 2547 }, { "epoch": 0.41, "grad_norm": 12.055223231997422, "learning_rate": 6.660277781961135e-06, "loss": 0.4713, "step": 2548 }, { "epoch": 0.41, "grad_norm": 7.286407851874011, "learning_rate": 6.6578159026943064e-06, "loss": 0.4367, "step": 2549 }, { "epoch": 0.41, "grad_norm": 10.644769568841399, "learning_rate": 6.655353571792729e-06, "loss": 0.4711, "step": 2550 }, { "epoch": 0.41, "grad_norm": 10.229302329082488, "learning_rate": 6.652890789927209e-06, "loss": 0.4848, "step": 2551 }, { "epoch": 0.41, "grad_norm": 6.984718733276937, "learning_rate": 6.650427557768674e-06, "loss": 0.4504, "step": 2552 }, { "epoch": 0.41, "grad_norm": 7.613730766824289, "learning_rate": 6.647963875988179e-06, "loss": 0.5137, "step": 2553 }, { "epoch": 0.41, "grad_norm": 74.0701663191425, "learning_rate": 6.645499745256898e-06, "loss": 0.5075, "step": 2554 }, { "epoch": 0.41, "grad_norm": 16.06784590189765, "learning_rate": 6.643035166246128e-06, "loss": 0.5358, "step": 2555 }, { "epoch": 0.41, "grad_norm": 7.690869147987697, "learning_rate": 6.640570139627288e-06, "loss": 0.4751, "step": 2556 }, { "epoch": 0.41, "grad_norm": 9.133148844282418, "learning_rate": 6.638104666071918e-06, "loss": 0.405, "step": 2557 }, { "epoch": 0.41, "grad_norm": 5.986444923080993, "learning_rate": 6.635638746251685e-06, "loss": 0.4346, "step": 2558 }, { "epoch": 0.41, "grad_norm": 8.479772442887024, "learning_rate": 6.6331723808383674e-06, "loss": 0.4042, "step": 2559 }, { "epoch": 0.41, "grad_norm": 7.2267677263907695, "learning_rate": 6.630705570503878e-06, "loss": 0.4256, "step": 2560 }, { "epoch": 0.41, "grad_norm": 10.699193198341884, "learning_rate": 6.628238315920239e-06, "loss": 0.4184, "step": 2561 }, { "epoch": 0.41, "grad_norm": 6.317524374935241, "learning_rate": 6.6257706177595994e-06, "loss": 0.4855, "step": 2562 }, { "epoch": 0.41, "grad_norm": 1.2382407577660823, "learning_rate": 6.62330247669423e-06, "loss": 0.4169, "step": 2563 }, { "epoch": 0.41, "grad_norm": 9.899954610216126, "learning_rate": 6.62083389339652e-06, "loss": 0.459, "step": 2564 }, { "epoch": 0.41, "grad_norm": 10.494747103918552, "learning_rate": 6.618364868538978e-06, "loss": 0.4149, "step": 2565 }, { "epoch": 0.41, "grad_norm": 8.027055467196949, "learning_rate": 6.6158954027942345e-06, "loss": 0.481, "step": 2566 }, { "epoch": 0.41, "grad_norm": 4.969160671372257, "learning_rate": 6.6134254968350434e-06, "loss": 0.4484, "step": 2567 }, { "epoch": 0.41, "grad_norm": 13.324056038032303, "learning_rate": 6.610955151334269e-06, "loss": 0.4732, "step": 2568 }, { "epoch": 0.41, "grad_norm": 6.95180886494875, "learning_rate": 6.608484366964908e-06, "loss": 0.3573, "step": 2569 }, { "epoch": 0.41, "grad_norm": 7.450754776449508, "learning_rate": 6.606013144400065e-06, "loss": 0.3995, "step": 2570 }, { "epoch": 0.41, "grad_norm": 18.15265210573935, "learning_rate": 6.603541484312974e-06, "loss": 0.4307, "step": 2571 }, { "epoch": 0.41, "grad_norm": 8.088011485070242, "learning_rate": 6.601069387376979e-06, "loss": 0.4299, "step": 2572 }, { "epoch": 0.41, "grad_norm": 8.431935444216913, "learning_rate": 6.5985968542655495e-06, "loss": 0.3813, "step": 2573 }, { "epoch": 0.41, "grad_norm": 6.621195191444515, "learning_rate": 6.596123885652272e-06, "loss": 0.4423, "step": 2574 }, { "epoch": 0.41, "grad_norm": 7.477239791415972, "learning_rate": 6.593650482210851e-06, "loss": 0.4538, "step": 2575 }, { "epoch": 0.42, "grad_norm": 6.6289922172591105, "learning_rate": 6.591176644615108e-06, "loss": 0.4273, "step": 2576 }, { "epoch": 0.42, "grad_norm": 10.908990019562408, "learning_rate": 6.588702373538987e-06, "loss": 0.3923, "step": 2577 }, { "epoch": 0.42, "grad_norm": 4.5750552656757035, "learning_rate": 6.5862276696565454e-06, "loss": 0.4998, "step": 2578 }, { "epoch": 0.42, "grad_norm": 5.361338125006084, "learning_rate": 6.583752533641963e-06, "loss": 0.507, "step": 2579 }, { "epoch": 0.42, "grad_norm": 7.056490038779889, "learning_rate": 6.581276966169534e-06, "loss": 0.4807, "step": 2580 }, { "epoch": 0.42, "grad_norm": 6.70429634791011, "learning_rate": 6.57880096791367e-06, "loss": 0.4081, "step": 2581 }, { "epoch": 0.42, "grad_norm": 7.715377542307558, "learning_rate": 6.576324539548904e-06, "loss": 0.4719, "step": 2582 }, { "epoch": 0.42, "grad_norm": 6.05825707480304, "learning_rate": 6.573847681749881e-06, "loss": 0.4021, "step": 2583 }, { "epoch": 0.42, "grad_norm": 5.243919977210553, "learning_rate": 6.5713703951913665e-06, "loss": 0.4298, "step": 2584 }, { "epoch": 0.42, "grad_norm": 5.998699956224827, "learning_rate": 6.5688926805482425e-06, "loss": 0.424, "step": 2585 }, { "epoch": 0.42, "grad_norm": 5.238615815933871, "learning_rate": 6.566414538495504e-06, "loss": 0.4624, "step": 2586 }, { "epoch": 0.42, "grad_norm": 1.2656532034853407, "learning_rate": 6.563935969708266e-06, "loss": 0.4627, "step": 2587 }, { "epoch": 0.42, "grad_norm": 5.543358463385838, "learning_rate": 6.561456974861761e-06, "loss": 0.5258, "step": 2588 }, { "epoch": 0.42, "grad_norm": 6.347047652295883, "learning_rate": 6.558977554631334e-06, "loss": 0.4739, "step": 2589 }, { "epoch": 0.42, "grad_norm": 6.814995499421517, "learning_rate": 6.556497709692447e-06, "loss": 0.4696, "step": 2590 }, { "epoch": 0.42, "grad_norm": 5.558616012773064, "learning_rate": 6.554017440720679e-06, "loss": 0.4985, "step": 2591 }, { "epoch": 0.42, "grad_norm": 5.622321412075058, "learning_rate": 6.551536748391724e-06, "loss": 0.421, "step": 2592 }, { "epoch": 0.42, "grad_norm": 12.876306529399512, "learning_rate": 6.54905563338139e-06, "loss": 0.4552, "step": 2593 }, { "epoch": 0.42, "grad_norm": 6.392104080108402, "learning_rate": 6.546574096365601e-06, "loss": 0.4348, "step": 2594 }, { "epoch": 0.42, "grad_norm": 13.977814803803094, "learning_rate": 6.544092138020397e-06, "loss": 0.4589, "step": 2595 }, { "epoch": 0.42, "grad_norm": 5.935803424687954, "learning_rate": 6.541609759021933e-06, "loss": 0.465, "step": 2596 }, { "epoch": 0.42, "grad_norm": 6.633575173057532, "learning_rate": 6.539126960046474e-06, "loss": 0.4601, "step": 2597 }, { "epoch": 0.42, "grad_norm": 11.056524507265701, "learning_rate": 6.536643741770406e-06, "loss": 0.4567, "step": 2598 }, { "epoch": 0.42, "grad_norm": 8.182785309469038, "learning_rate": 6.534160104870224e-06, "loss": 0.5201, "step": 2599 }, { "epoch": 0.42, "grad_norm": 8.28855665371542, "learning_rate": 6.531676050022539e-06, "loss": 0.4843, "step": 2600 }, { "epoch": 0.42, "grad_norm": 9.904639065937364, "learning_rate": 6.529191577904079e-06, "loss": 0.5118, "step": 2601 }, { "epoch": 0.42, "grad_norm": 14.728811126547683, "learning_rate": 6.526706689191681e-06, "loss": 0.4585, "step": 2602 }, { "epoch": 0.42, "grad_norm": 5.324117688241901, "learning_rate": 6.524221384562295e-06, "loss": 0.4603, "step": 2603 }, { "epoch": 0.42, "grad_norm": 1.433413795561798, "learning_rate": 6.521735664692989e-06, "loss": 0.4852, "step": 2604 }, { "epoch": 0.42, "grad_norm": 5.113528607261589, "learning_rate": 6.519249530260943e-06, "loss": 0.4166, "step": 2605 }, { "epoch": 0.42, "grad_norm": 6.971934424921738, "learning_rate": 6.516762981943444e-06, "loss": 0.5678, "step": 2606 }, { "epoch": 0.42, "grad_norm": 6.035636553942168, "learning_rate": 6.514276020417901e-06, "loss": 0.4264, "step": 2607 }, { "epoch": 0.42, "grad_norm": 10.311031102408492, "learning_rate": 6.511788646361828e-06, "loss": 0.4414, "step": 2608 }, { "epoch": 0.42, "grad_norm": 8.555967568890328, "learning_rate": 6.509300860452854e-06, "loss": 0.5288, "step": 2609 }, { "epoch": 0.42, "grad_norm": 6.213857548757916, "learning_rate": 6.506812663368722e-06, "loss": 0.4375, "step": 2610 }, { "epoch": 0.42, "grad_norm": 9.852319543974456, "learning_rate": 6.504324055787285e-06, "loss": 0.4732, "step": 2611 }, { "epoch": 0.42, "grad_norm": 9.401242355672261, "learning_rate": 6.501835038386509e-06, "loss": 0.4724, "step": 2612 }, { "epoch": 0.42, "grad_norm": 9.15327723603227, "learning_rate": 6.499345611844471e-06, "loss": 0.4765, "step": 2613 }, { "epoch": 0.42, "grad_norm": 4.6293050332811, "learning_rate": 6.496855776839357e-06, "loss": 0.4075, "step": 2614 }, { "epoch": 0.42, "grad_norm": 6.008175140096545, "learning_rate": 6.494365534049469e-06, "loss": 0.3909, "step": 2615 }, { "epoch": 0.42, "grad_norm": 7.31794495599635, "learning_rate": 6.491874884153217e-06, "loss": 0.4094, "step": 2616 }, { "epoch": 0.42, "grad_norm": 5.698969580833742, "learning_rate": 6.489383827829122e-06, "loss": 0.586, "step": 2617 }, { "epoch": 0.42, "grad_norm": 14.487316960761474, "learning_rate": 6.486892365755819e-06, "loss": 0.4841, "step": 2618 }, { "epoch": 0.42, "grad_norm": 10.588295451567266, "learning_rate": 6.4844004986120465e-06, "loss": 0.3707, "step": 2619 }, { "epoch": 0.42, "grad_norm": 8.839548428583583, "learning_rate": 6.481908227076663e-06, "loss": 0.4402, "step": 2620 }, { "epoch": 0.42, "grad_norm": 5.651427795130737, "learning_rate": 6.479415551828627e-06, "loss": 0.4055, "step": 2621 }, { "epoch": 0.42, "grad_norm": 12.094997417962459, "learning_rate": 6.476922473547016e-06, "loss": 0.3929, "step": 2622 }, { "epoch": 0.42, "grad_norm": 14.177669561568806, "learning_rate": 6.474428992911011e-06, "loss": 0.4072, "step": 2623 }, { "epoch": 0.42, "grad_norm": 6.818888995850015, "learning_rate": 6.471935110599907e-06, "loss": 0.4229, "step": 2624 }, { "epoch": 0.42, "grad_norm": 13.74534096866166, "learning_rate": 6.469440827293103e-06, "loss": 0.4461, "step": 2625 }, { "epoch": 0.42, "grad_norm": 5.590711105562193, "learning_rate": 6.466946143670113e-06, "loss": 0.4612, "step": 2626 }, { "epoch": 0.42, "grad_norm": 6.866538942602566, "learning_rate": 6.464451060410556e-06, "loss": 0.4579, "step": 2627 }, { "epoch": 0.42, "grad_norm": 6.839636044363688, "learning_rate": 6.461955578194163e-06, "loss": 0.4974, "step": 2628 }, { "epoch": 0.42, "grad_norm": 9.309756196488948, "learning_rate": 6.459459697700772e-06, "loss": 0.4103, "step": 2629 }, { "epoch": 0.42, "grad_norm": 6.976798986316532, "learning_rate": 6.456963419610327e-06, "loss": 0.4768, "step": 2630 }, { "epoch": 0.42, "grad_norm": 8.980065588092765, "learning_rate": 6.454466744602888e-06, "loss": 0.3736, "step": 2631 }, { "epoch": 0.42, "grad_norm": 14.256426159789708, "learning_rate": 6.451969673358613e-06, "loss": 0.4476, "step": 2632 }, { "epoch": 0.42, "grad_norm": 10.714377376956033, "learning_rate": 6.449472206557776e-06, "loss": 0.4111, "step": 2633 }, { "epoch": 0.42, "grad_norm": 7.6693899040068505, "learning_rate": 6.4469743448807546e-06, "loss": 0.4755, "step": 2634 }, { "epoch": 0.42, "grad_norm": 7.19485490173629, "learning_rate": 6.444476089008037e-06, "loss": 0.4269, "step": 2635 }, { "epoch": 0.42, "grad_norm": 7.142633944501165, "learning_rate": 6.441977439620214e-06, "loss": 0.4413, "step": 2636 }, { "epoch": 0.42, "grad_norm": 5.030040109818081, "learning_rate": 6.439478397397989e-06, "loss": 0.4921, "step": 2637 }, { "epoch": 0.43, "grad_norm": 6.5924940255425195, "learning_rate": 6.436978963022168e-06, "loss": 0.4717, "step": 2638 }, { "epoch": 0.43, "grad_norm": 1.234421506393416, "learning_rate": 6.434479137173667e-06, "loss": 0.5025, "step": 2639 }, { "epoch": 0.43, "grad_norm": 6.166069220646565, "learning_rate": 6.431978920533509e-06, "loss": 0.4273, "step": 2640 }, { "epoch": 0.43, "grad_norm": 6.235043799236979, "learning_rate": 6.429478313782817e-06, "loss": 0.5036, "step": 2641 }, { "epoch": 0.43, "grad_norm": 5.492752842724464, "learning_rate": 6.426977317602833e-06, "loss": 0.3965, "step": 2642 }, { "epoch": 0.43, "grad_norm": 4.576258178111879, "learning_rate": 6.424475932674889e-06, "loss": 0.4069, "step": 2643 }, { "epoch": 0.43, "grad_norm": 7.655019012153899, "learning_rate": 6.421974159680437e-06, "loss": 0.4723, "step": 2644 }, { "epoch": 0.43, "grad_norm": 6.068719422172653, "learning_rate": 6.419471999301027e-06, "loss": 0.4335, "step": 2645 }, { "epoch": 0.43, "grad_norm": 8.869529164790956, "learning_rate": 6.416969452218317e-06, "loss": 0.4725, "step": 2646 }, { "epoch": 0.43, "grad_norm": 6.730561542807674, "learning_rate": 6.414466519114068e-06, "loss": 0.4603, "step": 2647 }, { "epoch": 0.43, "grad_norm": 7.507983135991724, "learning_rate": 6.411963200670148e-06, "loss": 0.4575, "step": 2648 }, { "epoch": 0.43, "grad_norm": 8.51326306108928, "learning_rate": 6.409459497568533e-06, "loss": 0.5274, "step": 2649 }, { "epoch": 0.43, "grad_norm": 7.064366392998285, "learning_rate": 6.406955410491298e-06, "loss": 0.4103, "step": 2650 }, { "epoch": 0.43, "grad_norm": 5.377608754759293, "learning_rate": 6.404450940120628e-06, "loss": 0.2971, "step": 2651 }, { "epoch": 0.43, "grad_norm": 7.057232338254683, "learning_rate": 6.4019460871388055e-06, "loss": 0.4087, "step": 2652 }, { "epoch": 0.43, "grad_norm": 6.180230786056468, "learning_rate": 6.399440852228226e-06, "loss": 0.4278, "step": 2653 }, { "epoch": 0.43, "grad_norm": 4.53148459455616, "learning_rate": 6.396935236071381e-06, "loss": 0.4284, "step": 2654 }, { "epoch": 0.43, "grad_norm": 12.063471195723691, "learning_rate": 6.394429239350872e-06, "loss": 0.4528, "step": 2655 }, { "epoch": 0.43, "grad_norm": 5.682416310124413, "learning_rate": 6.3919228627494e-06, "loss": 0.3967, "step": 2656 }, { "epoch": 0.43, "grad_norm": 8.271346342014903, "learning_rate": 6.38941610694977e-06, "loss": 0.4389, "step": 2657 }, { "epoch": 0.43, "grad_norm": 6.10739246140022, "learning_rate": 6.386908972634897e-06, "loss": 0.4173, "step": 2658 }, { "epoch": 0.43, "grad_norm": 6.187665782777974, "learning_rate": 6.384401460487787e-06, "loss": 0.4579, "step": 2659 }, { "epoch": 0.43, "grad_norm": 32.623136563088636, "learning_rate": 6.381893571191558e-06, "loss": 0.4661, "step": 2660 }, { "epoch": 0.43, "grad_norm": 10.435900444826837, "learning_rate": 6.379385305429428e-06, "loss": 0.4249, "step": 2661 }, { "epoch": 0.43, "grad_norm": 8.100213755462196, "learning_rate": 6.376876663884719e-06, "loss": 0.5064, "step": 2662 }, { "epoch": 0.43, "grad_norm": 13.144890622794215, "learning_rate": 6.374367647240851e-06, "loss": 0.4276, "step": 2663 }, { "epoch": 0.43, "grad_norm": 27.716935599979614, "learning_rate": 6.371858256181352e-06, "loss": 0.534, "step": 2664 }, { "epoch": 0.43, "grad_norm": 7.638379644538953, "learning_rate": 6.3693484913898494e-06, "loss": 0.4457, "step": 2665 }, { "epoch": 0.43, "grad_norm": 12.346828948137913, "learning_rate": 6.36683835355007e-06, "loss": 0.4207, "step": 2666 }, { "epoch": 0.43, "grad_norm": 7.227501003418883, "learning_rate": 6.364327843345847e-06, "loss": 0.4385, "step": 2667 }, { "epoch": 0.43, "grad_norm": 7.846696581638323, "learning_rate": 6.361816961461111e-06, "loss": 0.4229, "step": 2668 }, { "epoch": 0.43, "grad_norm": 6.291758538528957, "learning_rate": 6.359305708579897e-06, "loss": 0.3588, "step": 2669 }, { "epoch": 0.43, "grad_norm": 6.165142426568481, "learning_rate": 6.356794085386337e-06, "loss": 0.5033, "step": 2670 }, { "epoch": 0.43, "grad_norm": 7.900133208792257, "learning_rate": 6.3542820925646696e-06, "loss": 0.3729, "step": 2671 }, { "epoch": 0.43, "grad_norm": 9.002292295083949, "learning_rate": 6.351769730799227e-06, "loss": 0.4564, "step": 2672 }, { "epoch": 0.43, "grad_norm": 11.306790240773466, "learning_rate": 6.349257000774452e-06, "loss": 0.4124, "step": 2673 }, { "epoch": 0.43, "grad_norm": 12.838228191538686, "learning_rate": 6.346743903174872e-06, "loss": 0.4436, "step": 2674 }, { "epoch": 0.43, "grad_norm": 8.086842403856586, "learning_rate": 6.344230438685134e-06, "loss": 0.4187, "step": 2675 }, { "epoch": 0.43, "grad_norm": 33.80261339147973, "learning_rate": 6.3417166079899685e-06, "loss": 0.5015, "step": 2676 }, { "epoch": 0.43, "grad_norm": 9.243025034241391, "learning_rate": 6.339202411774215e-06, "loss": 0.4635, "step": 2677 }, { "epoch": 0.43, "grad_norm": 12.520925800838363, "learning_rate": 6.336687850722809e-06, "loss": 0.4702, "step": 2678 }, { "epoch": 0.43, "grad_norm": 8.64152819211246, "learning_rate": 6.334172925520785e-06, "loss": 0.5261, "step": 2679 }, { "epoch": 0.43, "grad_norm": 15.415014487141626, "learning_rate": 6.3316576368532814e-06, "loss": 0.3912, "step": 2680 }, { "epoch": 0.43, "grad_norm": 8.443083752000213, "learning_rate": 6.329141985405529e-06, "loss": 0.4338, "step": 2681 }, { "epoch": 0.43, "grad_norm": 5.554378337447598, "learning_rate": 6.326625971862863e-06, "loss": 0.496, "step": 2682 }, { "epoch": 0.43, "grad_norm": 9.524150112189412, "learning_rate": 6.324109596910713e-06, "loss": 0.4937, "step": 2683 }, { "epoch": 0.43, "grad_norm": 5.382812946871318, "learning_rate": 6.32159286123461e-06, "loss": 0.4313, "step": 2684 }, { "epoch": 0.43, "grad_norm": 6.300908206372447, "learning_rate": 6.319075765520179e-06, "loss": 0.5239, "step": 2685 }, { "epoch": 0.43, "grad_norm": 11.120138264827535, "learning_rate": 6.316558310453153e-06, "loss": 0.4447, "step": 2686 }, { "epoch": 0.43, "grad_norm": 6.554352031545959, "learning_rate": 6.314040496719349e-06, "loss": 0.3973, "step": 2687 }, { "epoch": 0.43, "grad_norm": 4.3209089390675555, "learning_rate": 6.3115223250046934e-06, "loss": 0.4189, "step": 2688 }, { "epoch": 0.43, "grad_norm": 7.634814001048279, "learning_rate": 6.309003795995205e-06, "loss": 0.4132, "step": 2689 }, { "epoch": 0.43, "grad_norm": 23.540451242394347, "learning_rate": 6.306484910376998e-06, "loss": 0.4192, "step": 2690 }, { "epoch": 0.43, "grad_norm": 75.74703327065372, "learning_rate": 6.303965668836288e-06, "loss": 0.3974, "step": 2691 }, { "epoch": 0.43, "grad_norm": 13.160979515850597, "learning_rate": 6.301446072059386e-06, "loss": 0.4044, "step": 2692 }, { "epoch": 0.43, "grad_norm": 7.465364088756202, "learning_rate": 6.2989261207327e-06, "loss": 0.4867, "step": 2693 }, { "epoch": 0.43, "grad_norm": 8.125097700183161, "learning_rate": 6.296405815542732e-06, "loss": 0.5052, "step": 2694 }, { "epoch": 0.43, "grad_norm": 8.200506300280933, "learning_rate": 6.293885157176087e-06, "loss": 0.4071, "step": 2695 }, { "epoch": 0.43, "grad_norm": 7.073838043357562, "learning_rate": 6.2913641463194566e-06, "loss": 0.456, "step": 2696 }, { "epoch": 0.43, "grad_norm": 9.10277824690059, "learning_rate": 6.288842783659638e-06, "loss": 0.5271, "step": 2697 }, { "epoch": 0.43, "grad_norm": 7.737442402847915, "learning_rate": 6.286321069883517e-06, "loss": 0.488, "step": 2698 }, { "epoch": 0.43, "grad_norm": 9.246804857140315, "learning_rate": 6.283799005678077e-06, "loss": 0.4591, "step": 2699 }, { "epoch": 0.44, "grad_norm": 5.925226873328308, "learning_rate": 6.281276591730403e-06, "loss": 0.5257, "step": 2700 }, { "epoch": 0.44, "grad_norm": 8.64727114832731, "learning_rate": 6.278753828727664e-06, "loss": 0.4179, "step": 2701 }, { "epoch": 0.44, "grad_norm": 5.680886340056413, "learning_rate": 6.276230717357136e-06, "loss": 0.4628, "step": 2702 }, { "epoch": 0.44, "grad_norm": 8.50793460908045, "learning_rate": 6.273707258306178e-06, "loss": 0.4515, "step": 2703 }, { "epoch": 0.44, "grad_norm": 7.796332425892704, "learning_rate": 6.271183452262255e-06, "loss": 0.5469, "step": 2704 }, { "epoch": 0.44, "grad_norm": 6.328862873918428, "learning_rate": 6.268659299912918e-06, "loss": 0.439, "step": 2705 }, { "epoch": 0.44, "grad_norm": 6.438906563087814, "learning_rate": 6.266134801945819e-06, "loss": 0.4875, "step": 2706 }, { "epoch": 0.44, "grad_norm": 20.978818076807062, "learning_rate": 6.263609959048696e-06, "loss": 0.4287, "step": 2707 }, { "epoch": 0.44, "grad_norm": 5.6353419296450085, "learning_rate": 6.261084771909391e-06, "loss": 0.4785, "step": 2708 }, { "epoch": 0.44, "grad_norm": 7.182548597994957, "learning_rate": 6.25855924121583e-06, "loss": 0.4581, "step": 2709 }, { "epoch": 0.44, "grad_norm": 8.588111544800425, "learning_rate": 6.25603336765604e-06, "loss": 0.4956, "step": 2710 }, { "epoch": 0.44, "grad_norm": 6.397898810042977, "learning_rate": 6.2535071519181385e-06, "loss": 0.4208, "step": 2711 }, { "epoch": 0.44, "grad_norm": 6.482741719503859, "learning_rate": 6.250980594690335e-06, "loss": 0.4457, "step": 2712 }, { "epoch": 0.44, "grad_norm": 8.50662115500216, "learning_rate": 6.248453696660934e-06, "loss": 0.4734, "step": 2713 }, { "epoch": 0.44, "grad_norm": 8.04578343209705, "learning_rate": 6.245926458518333e-06, "loss": 0.5028, "step": 2714 }, { "epoch": 0.44, "grad_norm": 6.830974507809927, "learning_rate": 6.243398880951019e-06, "loss": 0.4, "step": 2715 }, { "epoch": 0.44, "grad_norm": 10.052135634108433, "learning_rate": 6.240870964647577e-06, "loss": 0.4671, "step": 2716 }, { "epoch": 0.44, "grad_norm": 8.808063612110583, "learning_rate": 6.2383427102966786e-06, "loss": 0.4894, "step": 2717 }, { "epoch": 0.44, "grad_norm": 15.91788038970479, "learning_rate": 6.2358141185870915e-06, "loss": 0.4024, "step": 2718 }, { "epoch": 0.44, "grad_norm": 13.861177998389012, "learning_rate": 6.233285190207673e-06, "loss": 0.3856, "step": 2719 }, { "epoch": 0.44, "grad_norm": 6.701407921028038, "learning_rate": 6.2307559258473716e-06, "loss": 0.4649, "step": 2720 }, { "epoch": 0.44, "grad_norm": 10.36373193330169, "learning_rate": 6.228226326195232e-06, "loss": 0.51, "step": 2721 }, { "epoch": 0.44, "grad_norm": 9.405732683359846, "learning_rate": 6.225696391940383e-06, "loss": 0.4732, "step": 2722 }, { "epoch": 0.44, "grad_norm": 10.19915027692201, "learning_rate": 6.223166123772051e-06, "loss": 0.4777, "step": 2723 }, { "epoch": 0.44, "grad_norm": 20.878948203950827, "learning_rate": 6.220635522379551e-06, "loss": 0.5122, "step": 2724 }, { "epoch": 0.44, "grad_norm": 1.2779286199929991, "learning_rate": 6.2181045884522876e-06, "loss": 0.4779, "step": 2725 }, { "epoch": 0.44, "grad_norm": 5.99472682464385, "learning_rate": 6.215573322679756e-06, "loss": 0.4154, "step": 2726 }, { "epoch": 0.44, "grad_norm": 7.570595288035781, "learning_rate": 6.213041725751543e-06, "loss": 0.4424, "step": 2727 }, { "epoch": 0.44, "grad_norm": 6.810825507530346, "learning_rate": 6.210509798357328e-06, "loss": 0.4294, "step": 2728 }, { "epoch": 0.44, "grad_norm": 7.226577038235522, "learning_rate": 6.207977541186876e-06, "loss": 0.4348, "step": 2729 }, { "epoch": 0.44, "grad_norm": 21.599783684133605, "learning_rate": 6.205444954930043e-06, "loss": 0.4388, "step": 2730 }, { "epoch": 0.44, "grad_norm": 7.0821169438375176, "learning_rate": 6.2029120402767765e-06, "loss": 0.3968, "step": 2731 }, { "epoch": 0.44, "grad_norm": 10.354288564491162, "learning_rate": 6.2003787979171105e-06, "loss": 0.4482, "step": 2732 }, { "epoch": 0.44, "grad_norm": 18.65268544986505, "learning_rate": 6.197845228541174e-06, "loss": 0.4291, "step": 2733 }, { "epoch": 0.44, "grad_norm": 8.293287355011215, "learning_rate": 6.195311332839175e-06, "loss": 0.3855, "step": 2734 }, { "epoch": 0.44, "grad_norm": 9.126346477637044, "learning_rate": 6.192777111501422e-06, "loss": 0.464, "step": 2735 }, { "epoch": 0.44, "grad_norm": 7.755672787511041, "learning_rate": 6.190242565218305e-06, "loss": 0.4329, "step": 2736 }, { "epoch": 0.44, "grad_norm": 8.576648308887396, "learning_rate": 6.187707694680302e-06, "loss": 0.4338, "step": 2737 }, { "epoch": 0.44, "grad_norm": 9.046079700546041, "learning_rate": 6.185172500577986e-06, "loss": 0.4862, "step": 2738 }, { "epoch": 0.44, "grad_norm": 7.162362047865712, "learning_rate": 6.182636983602009e-06, "loss": 0.4116, "step": 2739 }, { "epoch": 0.44, "grad_norm": 6.153470463651396, "learning_rate": 6.180101144443121e-06, "loss": 0.4052, "step": 2740 }, { "epoch": 0.44, "grad_norm": 8.072678562721228, "learning_rate": 6.17756498379215e-06, "loss": 0.4315, "step": 2741 }, { "epoch": 0.44, "grad_norm": 10.0204922821827, "learning_rate": 6.175028502340018e-06, "loss": 0.4718, "step": 2742 }, { "epoch": 0.44, "grad_norm": 6.293669494645629, "learning_rate": 6.172491700777732e-06, "loss": 0.4177, "step": 2743 }, { "epoch": 0.44, "grad_norm": 12.653351949106673, "learning_rate": 6.169954579796387e-06, "loss": 0.453, "step": 2744 }, { "epoch": 0.44, "grad_norm": 4.513040664021082, "learning_rate": 6.167417140087163e-06, "loss": 0.4009, "step": 2745 }, { "epoch": 0.44, "grad_norm": 11.099125822071976, "learning_rate": 6.164879382341331e-06, "loss": 0.4766, "step": 2746 }, { "epoch": 0.44, "grad_norm": 8.429167475528589, "learning_rate": 6.162341307250246e-06, "loss": 0.4388, "step": 2747 }, { "epoch": 0.44, "grad_norm": 5.987854435430918, "learning_rate": 6.159802915505347e-06, "loss": 0.4226, "step": 2748 }, { "epoch": 0.44, "grad_norm": 6.607371448649292, "learning_rate": 6.157264207798165e-06, "loss": 0.4565, "step": 2749 }, { "epoch": 0.44, "grad_norm": 13.125636759146166, "learning_rate": 6.154725184820311e-06, "loss": 0.4975, "step": 2750 }, { "epoch": 0.44, "grad_norm": 17.391819986906896, "learning_rate": 6.15218584726349e-06, "loss": 0.5107, "step": 2751 }, { "epoch": 0.44, "grad_norm": 6.271319401004447, "learning_rate": 6.149646195819481e-06, "loss": 0.3739, "step": 2752 }, { "epoch": 0.44, "grad_norm": 7.362505493060931, "learning_rate": 6.147106231180159e-06, "loss": 0.4359, "step": 2753 }, { "epoch": 0.44, "grad_norm": 6.547927449821129, "learning_rate": 6.144565954037479e-06, "loss": 0.4066, "step": 2754 }, { "epoch": 0.44, "grad_norm": 9.648426213645745, "learning_rate": 6.142025365083484e-06, "loss": 0.4349, "step": 2755 }, { "epoch": 0.44, "grad_norm": 6.589219827220292, "learning_rate": 6.139484465010298e-06, "loss": 0.4896, "step": 2756 }, { "epoch": 0.44, "grad_norm": 9.825206933124118, "learning_rate": 6.136943254510135e-06, "loss": 0.4565, "step": 2757 }, { "epoch": 0.44, "grad_norm": 8.640455220267073, "learning_rate": 6.134401734275288e-06, "loss": 0.4367, "step": 2758 }, { "epoch": 0.44, "grad_norm": 53.51769463223771, "learning_rate": 6.13185990499814e-06, "loss": 0.4198, "step": 2759 }, { "epoch": 0.44, "grad_norm": 6.662277016620053, "learning_rate": 6.129317767371153e-06, "loss": 0.4572, "step": 2760 }, { "epoch": 0.44, "grad_norm": 13.15460624286685, "learning_rate": 6.126775322086876e-06, "loss": 0.4426, "step": 2761 }, { "epoch": 0.45, "grad_norm": 25.7354467811583, "learning_rate": 6.124232569837943e-06, "loss": 0.4108, "step": 2762 }, { "epoch": 0.45, "grad_norm": 6.474621270538083, "learning_rate": 6.121689511317068e-06, "loss": 0.4513, "step": 2763 }, { "epoch": 0.45, "grad_norm": 6.605291597863904, "learning_rate": 6.11914614721705e-06, "loss": 0.3929, "step": 2764 }, { "epoch": 0.45, "grad_norm": 6.8261324061656605, "learning_rate": 6.116602478230772e-06, "loss": 0.372, "step": 2765 }, { "epoch": 0.45, "grad_norm": 8.179267370371003, "learning_rate": 6.1140585050512e-06, "loss": 0.4222, "step": 2766 }, { "epoch": 0.45, "grad_norm": 5.541338685622392, "learning_rate": 6.111514228371381e-06, "loss": 0.4386, "step": 2767 }, { "epoch": 0.45, "grad_norm": 1.2127581313492073, "learning_rate": 6.108969648884449e-06, "loss": 0.4427, "step": 2768 }, { "epoch": 0.45, "grad_norm": 7.337868861712894, "learning_rate": 6.106424767283615e-06, "loss": 0.4009, "step": 2769 }, { "epoch": 0.45, "grad_norm": 1.2194106711466417, "learning_rate": 6.103879584262176e-06, "loss": 0.5144, "step": 2770 }, { "epoch": 0.45, "grad_norm": 6.565767684952268, "learning_rate": 6.101334100513508e-06, "loss": 0.4337, "step": 2771 }, { "epoch": 0.45, "grad_norm": 6.542000193867979, "learning_rate": 6.098788316731074e-06, "loss": 0.446, "step": 2772 }, { "epoch": 0.45, "grad_norm": 7.130358310296823, "learning_rate": 6.096242233608414e-06, "loss": 0.4843, "step": 2773 }, { "epoch": 0.45, "grad_norm": 6.2442525529707815, "learning_rate": 6.0936958518391505e-06, "loss": 0.4588, "step": 2774 }, { "epoch": 0.45, "grad_norm": 1.0900471694512461, "learning_rate": 6.0911491721169906e-06, "loss": 0.4631, "step": 2775 }, { "epoch": 0.45, "grad_norm": 13.842974242516187, "learning_rate": 6.088602195135717e-06, "loss": 0.3954, "step": 2776 }, { "epoch": 0.45, "grad_norm": 6.958940559790359, "learning_rate": 6.086054921589198e-06, "loss": 0.435, "step": 2777 }, { "epoch": 0.45, "grad_norm": 10.32351654385405, "learning_rate": 6.083507352171382e-06, "loss": 0.4234, "step": 2778 }, { "epoch": 0.45, "grad_norm": 5.094676318776658, "learning_rate": 6.080959487576297e-06, "loss": 0.3613, "step": 2779 }, { "epoch": 0.45, "grad_norm": 5.954262361918845, "learning_rate": 6.078411328498049e-06, "loss": 0.4213, "step": 2780 }, { "epoch": 0.45, "grad_norm": 9.253297250451642, "learning_rate": 6.075862875630829e-06, "loss": 0.3958, "step": 2781 }, { "epoch": 0.45, "grad_norm": 16.426067194496017, "learning_rate": 6.073314129668907e-06, "loss": 0.4401, "step": 2782 }, { "epoch": 0.45, "grad_norm": 10.143026139236158, "learning_rate": 6.070765091306628e-06, "loss": 0.348, "step": 2783 }, { "epoch": 0.45, "grad_norm": 6.466927515378565, "learning_rate": 6.068215761238423e-06, "loss": 0.4589, "step": 2784 }, { "epoch": 0.45, "grad_norm": 6.634866082306, "learning_rate": 6.065666140158799e-06, "loss": 0.4574, "step": 2785 }, { "epoch": 0.45, "grad_norm": 5.14251291095844, "learning_rate": 6.063116228762343e-06, "loss": 0.4704, "step": 2786 }, { "epoch": 0.45, "grad_norm": 1.2131468260960865, "learning_rate": 6.060566027743721e-06, "loss": 0.4954, "step": 2787 }, { "epoch": 0.45, "grad_norm": 8.061025459156463, "learning_rate": 6.058015537797678e-06, "loss": 0.4638, "step": 2788 }, { "epoch": 0.45, "grad_norm": 8.719036647159259, "learning_rate": 6.055464759619038e-06, "loss": 0.4568, "step": 2789 }, { "epoch": 0.45, "grad_norm": 28.169414343126586, "learning_rate": 6.052913693902706e-06, "loss": 0.4388, "step": 2790 }, { "epoch": 0.45, "grad_norm": 1.163086036479778, "learning_rate": 6.050362341343658e-06, "loss": 0.4874, "step": 2791 }, { "epoch": 0.45, "grad_norm": 5.635998107907762, "learning_rate": 6.047810702636956e-06, "loss": 0.4034, "step": 2792 }, { "epoch": 0.45, "grad_norm": 8.52263108529722, "learning_rate": 6.045258778477735e-06, "loss": 0.4485, "step": 2793 }, { "epoch": 0.45, "grad_norm": 10.815233012690973, "learning_rate": 6.042706569561209e-06, "loss": 0.4747, "step": 2794 }, { "epoch": 0.45, "grad_norm": 12.806976379810907, "learning_rate": 6.040154076582672e-06, "loss": 0.4517, "step": 2795 }, { "epoch": 0.45, "grad_norm": 6.8556525925025324, "learning_rate": 6.0376013002374924e-06, "loss": 0.4327, "step": 2796 }, { "epoch": 0.45, "grad_norm": 5.872191256594274, "learning_rate": 6.035048241221118e-06, "loss": 0.4623, "step": 2797 }, { "epoch": 0.45, "grad_norm": 5.3028402658989595, "learning_rate": 6.032494900229069e-06, "loss": 0.4775, "step": 2798 }, { "epoch": 0.45, "grad_norm": 8.25367019782884, "learning_rate": 6.02994127795695e-06, "loss": 0.4862, "step": 2799 }, { "epoch": 0.45, "grad_norm": 1.2613711018501945, "learning_rate": 6.027387375100435e-06, "loss": 0.4681, "step": 2800 }, { "epoch": 0.45, "grad_norm": 6.200438816117388, "learning_rate": 6.024833192355282e-06, "loss": 0.4748, "step": 2801 }, { "epoch": 0.45, "grad_norm": 8.743007219615023, "learning_rate": 6.022278730417315e-06, "loss": 0.4841, "step": 2802 }, { "epoch": 0.45, "grad_norm": 8.580469415826265, "learning_rate": 6.019723989982444e-06, "loss": 0.4707, "step": 2803 }, { "epoch": 0.45, "grad_norm": 12.027823925669162, "learning_rate": 6.0171689717466485e-06, "loss": 0.3815, "step": 2804 }, { "epoch": 0.45, "grad_norm": 23.82916723001742, "learning_rate": 6.014613676405985e-06, "loss": 0.46, "step": 2805 }, { "epoch": 0.45, "grad_norm": 17.540163612248104, "learning_rate": 6.01205810465659e-06, "loss": 0.4498, "step": 2806 }, { "epoch": 0.45, "grad_norm": 5.800024694125482, "learning_rate": 6.009502257194669e-06, "loss": 0.536, "step": 2807 }, { "epoch": 0.45, "grad_norm": 13.892549418889722, "learning_rate": 6.0069461347165084e-06, "loss": 0.4169, "step": 2808 }, { "epoch": 0.45, "grad_norm": 8.167414890031099, "learning_rate": 6.0043897379184615e-06, "loss": 0.4655, "step": 2809 }, { "epoch": 0.45, "grad_norm": 1.2945591617177967, "learning_rate": 6.001833067496964e-06, "loss": 0.5077, "step": 2810 }, { "epoch": 0.45, "grad_norm": 10.678785135980714, "learning_rate": 5.999276124148525e-06, "loss": 0.4045, "step": 2811 }, { "epoch": 0.45, "grad_norm": 5.987631285032045, "learning_rate": 5.996718908569723e-06, "loss": 0.449, "step": 2812 }, { "epoch": 0.45, "grad_norm": 1.2610627119641213, "learning_rate": 5.9941614214572155e-06, "loss": 0.4673, "step": 2813 }, { "epoch": 0.45, "grad_norm": 6.231435960396342, "learning_rate": 5.9916036635077325e-06, "loss": 0.4143, "step": 2814 }, { "epoch": 0.45, "grad_norm": 1.0977516409968608, "learning_rate": 5.98904563541808e-06, "loss": 0.4611, "step": 2815 }, { "epoch": 0.45, "grad_norm": 6.81995309989696, "learning_rate": 5.986487337885129e-06, "loss": 0.4216, "step": 2816 }, { "epoch": 0.45, "grad_norm": 8.894054853774277, "learning_rate": 5.983928771605839e-06, "loss": 0.4138, "step": 2817 }, { "epoch": 0.45, "grad_norm": 7.9660314031712085, "learning_rate": 5.981369937277226e-06, "loss": 0.4342, "step": 2818 }, { "epoch": 0.45, "grad_norm": 6.189830888470938, "learning_rate": 5.978810835596392e-06, "loss": 0.3284, "step": 2819 }, { "epoch": 0.45, "grad_norm": 9.087744432325385, "learning_rate": 5.976251467260505e-06, "loss": 0.4443, "step": 2820 }, { "epoch": 0.45, "grad_norm": 11.22671710824284, "learning_rate": 5.9736918329668094e-06, "loss": 0.4531, "step": 2821 }, { "epoch": 0.45, "grad_norm": 5.402020343149909, "learning_rate": 5.971131933412617e-06, "loss": 0.4035, "step": 2822 }, { "epoch": 0.45, "grad_norm": 13.782679246204324, "learning_rate": 5.9685717692953175e-06, "loss": 0.5087, "step": 2823 }, { "epoch": 0.46, "grad_norm": 6.430976506867861, "learning_rate": 5.966011341312368e-06, "loss": 0.3814, "step": 2824 }, { "epoch": 0.46, "grad_norm": 9.313739175264386, "learning_rate": 5.9634506501613006e-06, "loss": 0.4834, "step": 2825 }, { "epoch": 0.46, "grad_norm": 6.9398412071528055, "learning_rate": 5.960889696539719e-06, "loss": 0.4025, "step": 2826 }, { "epoch": 0.46, "grad_norm": 12.281238599008825, "learning_rate": 5.958328481145297e-06, "loss": 0.4679, "step": 2827 }, { "epoch": 0.46, "grad_norm": 12.82645596281723, "learning_rate": 5.955767004675781e-06, "loss": 0.4318, "step": 2828 }, { "epoch": 0.46, "grad_norm": 10.184481941672383, "learning_rate": 5.953205267828985e-06, "loss": 0.4374, "step": 2829 }, { "epoch": 0.46, "grad_norm": 8.812200149574807, "learning_rate": 5.950643271302802e-06, "loss": 0.4843, "step": 2830 }, { "epoch": 0.46, "grad_norm": 14.671128623119284, "learning_rate": 5.948081015795184e-06, "loss": 0.4648, "step": 2831 }, { "epoch": 0.46, "grad_norm": 10.950178348593365, "learning_rate": 5.945518502004165e-06, "loss": 0.4395, "step": 2832 }, { "epoch": 0.46, "grad_norm": 32.68427806120232, "learning_rate": 5.942955730627843e-06, "loss": 0.4276, "step": 2833 }, { "epoch": 0.46, "grad_norm": 1.4308996498046926, "learning_rate": 5.940392702364387e-06, "loss": 0.4826, "step": 2834 }, { "epoch": 0.46, "grad_norm": 10.46250085150172, "learning_rate": 5.9378294179120355e-06, "loss": 0.4748, "step": 2835 }, { "epoch": 0.46, "grad_norm": 10.569268625495289, "learning_rate": 5.935265877969101e-06, "loss": 0.4563, "step": 2836 }, { "epoch": 0.46, "grad_norm": 1.2777380071835278, "learning_rate": 5.932702083233959e-06, "loss": 0.4499, "step": 2837 }, { "epoch": 0.46, "grad_norm": 9.16965836799274, "learning_rate": 5.9301380344050595e-06, "loss": 0.392, "step": 2838 }, { "epoch": 0.46, "grad_norm": 12.919826187130859, "learning_rate": 5.927573732180923e-06, "loss": 0.4558, "step": 2839 }, { "epoch": 0.46, "grad_norm": 6.805789016615026, "learning_rate": 5.925009177260128e-06, "loss": 0.4045, "step": 2840 }, { "epoch": 0.46, "grad_norm": 8.169642680146085, "learning_rate": 5.922444370341339e-06, "loss": 0.466, "step": 2841 }, { "epoch": 0.46, "grad_norm": 6.236952089752237, "learning_rate": 5.919879312123276e-06, "loss": 0.5086, "step": 2842 }, { "epoch": 0.46, "grad_norm": 15.8182504017186, "learning_rate": 5.91731400330473e-06, "loss": 0.4862, "step": 2843 }, { "epoch": 0.46, "grad_norm": 1.729008949170497, "learning_rate": 5.914748444584565e-06, "loss": 0.5057, "step": 2844 }, { "epoch": 0.46, "grad_norm": 8.149287078719682, "learning_rate": 5.912182636661707e-06, "loss": 0.49, "step": 2845 }, { "epoch": 0.46, "grad_norm": 8.07467186332408, "learning_rate": 5.9096165802351555e-06, "loss": 0.5099, "step": 2846 }, { "epoch": 0.46, "grad_norm": 8.040016582505954, "learning_rate": 5.907050276003974e-06, "loss": 0.4662, "step": 2847 }, { "epoch": 0.46, "grad_norm": 7.104709325564337, "learning_rate": 5.904483724667294e-06, "loss": 0.4592, "step": 2848 }, { "epoch": 0.46, "grad_norm": 8.122872684250046, "learning_rate": 5.901916926924314e-06, "loss": 0.4565, "step": 2849 }, { "epoch": 0.46, "grad_norm": 11.744291334932427, "learning_rate": 5.899349883474303e-06, "loss": 0.4251, "step": 2850 }, { "epoch": 0.46, "grad_norm": 7.642401732751702, "learning_rate": 5.896782595016594e-06, "loss": 0.4696, "step": 2851 }, { "epoch": 0.46, "grad_norm": 6.170065178320583, "learning_rate": 5.894215062250584e-06, "loss": 0.4063, "step": 2852 }, { "epoch": 0.46, "grad_norm": 9.699551418636885, "learning_rate": 5.891647285875742e-06, "loss": 0.4424, "step": 2853 }, { "epoch": 0.46, "grad_norm": 6.752464275413466, "learning_rate": 5.889079266591602e-06, "loss": 0.4652, "step": 2854 }, { "epoch": 0.46, "grad_norm": 1.291017194480416, "learning_rate": 5.886511005097762e-06, "loss": 0.5129, "step": 2855 }, { "epoch": 0.46, "grad_norm": 7.123614591877015, "learning_rate": 5.883942502093887e-06, "loss": 0.5278, "step": 2856 }, { "epoch": 0.46, "grad_norm": 7.322261647140844, "learning_rate": 5.881373758279709e-06, "loss": 0.4241, "step": 2857 }, { "epoch": 0.46, "grad_norm": 20.788485005371616, "learning_rate": 5.878804774355024e-06, "loss": 0.4148, "step": 2858 }, { "epoch": 0.46, "grad_norm": 6.268956665265226, "learning_rate": 5.876235551019692e-06, "loss": 0.4414, "step": 2859 }, { "epoch": 0.46, "grad_norm": 8.765342971635018, "learning_rate": 5.873666088973644e-06, "loss": 0.4584, "step": 2860 }, { "epoch": 0.46, "grad_norm": 8.420105936739947, "learning_rate": 5.871096388916873e-06, "loss": 0.3963, "step": 2861 }, { "epoch": 0.46, "grad_norm": 8.761903389367417, "learning_rate": 5.86852645154943e-06, "loss": 0.4693, "step": 2862 }, { "epoch": 0.46, "grad_norm": 12.312109800182375, "learning_rate": 5.865956277571441e-06, "loss": 0.4683, "step": 2863 }, { "epoch": 0.46, "grad_norm": 1.4145967125193315, "learning_rate": 5.863385867683093e-06, "loss": 0.5107, "step": 2864 }, { "epoch": 0.46, "grad_norm": 6.776676672370537, "learning_rate": 5.860815222584635e-06, "loss": 0.4135, "step": 2865 }, { "epoch": 0.46, "grad_norm": 5.701685739464139, "learning_rate": 5.858244342976383e-06, "loss": 0.3583, "step": 2866 }, { "epoch": 0.46, "grad_norm": 15.644562450203866, "learning_rate": 5.855673229558711e-06, "loss": 0.4483, "step": 2867 }, { "epoch": 0.46, "grad_norm": 6.9802490518256715, "learning_rate": 5.853101883032069e-06, "loss": 0.416, "step": 2868 }, { "epoch": 0.46, "grad_norm": 14.950343144506773, "learning_rate": 5.850530304096956e-06, "loss": 0.386, "step": 2869 }, { "epoch": 0.46, "grad_norm": 31.015490967594783, "learning_rate": 5.847958493453941e-06, "loss": 0.4989, "step": 2870 }, { "epoch": 0.46, "grad_norm": 10.549421739557433, "learning_rate": 5.845386451803661e-06, "loss": 0.4447, "step": 2871 }, { "epoch": 0.46, "grad_norm": 5.092159702157703, "learning_rate": 5.8428141798468086e-06, "loss": 0.4635, "step": 2872 }, { "epoch": 0.46, "grad_norm": 7.091907129987086, "learning_rate": 5.84024167828414e-06, "loss": 0.3979, "step": 2873 }, { "epoch": 0.46, "grad_norm": 11.033593001540805, "learning_rate": 5.837668947816476e-06, "loss": 0.4813, "step": 2874 }, { "epoch": 0.46, "grad_norm": 5.914908888124439, "learning_rate": 5.835095989144701e-06, "loss": 0.3649, "step": 2875 }, { "epoch": 0.46, "grad_norm": 12.274884053073727, "learning_rate": 5.83252280296976e-06, "loss": 0.4233, "step": 2876 }, { "epoch": 0.46, "grad_norm": 5.681302157000443, "learning_rate": 5.8299493899926584e-06, "loss": 0.475, "step": 2877 }, { "epoch": 0.46, "grad_norm": 26.132700690007344, "learning_rate": 5.827375750914463e-06, "loss": 0.4379, "step": 2878 }, { "epoch": 0.46, "grad_norm": 6.36417889758148, "learning_rate": 5.82480188643631e-06, "loss": 0.3889, "step": 2879 }, { "epoch": 0.46, "grad_norm": 8.466491623862069, "learning_rate": 5.822227797259385e-06, "loss": 0.411, "step": 2880 }, { "epoch": 0.46, "grad_norm": 6.4873387330737335, "learning_rate": 5.819653484084944e-06, "loss": 0.4027, "step": 2881 }, { "epoch": 0.46, "grad_norm": 6.181501333812945, "learning_rate": 5.817078947614299e-06, "loss": 0.4879, "step": 2882 }, { "epoch": 0.46, "grad_norm": 10.290536697678343, "learning_rate": 5.814504188548828e-06, "loss": 0.4249, "step": 2883 }, { "epoch": 0.46, "grad_norm": 6.627272995061737, "learning_rate": 5.811929207589963e-06, "loss": 0.4942, "step": 2884 }, { "epoch": 0.46, "grad_norm": 7.988461582905527, "learning_rate": 5.8093540054392e-06, "loss": 0.447, "step": 2885 }, { "epoch": 0.46, "grad_norm": 10.953579135640085, "learning_rate": 5.806778582798096e-06, "loss": 0.5234, "step": 2886 }, { "epoch": 0.47, "grad_norm": 5.88206805539415, "learning_rate": 5.804202940368267e-06, "loss": 0.4478, "step": 2887 }, { "epoch": 0.47, "grad_norm": 7.429469932863361, "learning_rate": 5.801627078851391e-06, "loss": 0.3615, "step": 2888 }, { "epoch": 0.47, "grad_norm": 1.2323457238322424, "learning_rate": 5.799050998949199e-06, "loss": 0.478, "step": 2889 }, { "epoch": 0.47, "grad_norm": 14.648519009943133, "learning_rate": 5.796474701363491e-06, "loss": 0.466, "step": 2890 }, { "epoch": 0.47, "grad_norm": 6.445335338769459, "learning_rate": 5.793898186796118e-06, "loss": 0.4188, "step": 2891 }, { "epoch": 0.47, "grad_norm": 8.236850876240105, "learning_rate": 5.791321455948996e-06, "loss": 0.4102, "step": 2892 }, { "epoch": 0.47, "grad_norm": 8.832761301808484, "learning_rate": 5.7887445095240954e-06, "loss": 0.4922, "step": 2893 }, { "epoch": 0.47, "grad_norm": 10.172347927307735, "learning_rate": 5.78616734822345e-06, "loss": 0.3856, "step": 2894 }, { "epoch": 0.47, "grad_norm": 6.431278726392184, "learning_rate": 5.783589972749149e-06, "loss": 0.4382, "step": 2895 }, { "epoch": 0.47, "grad_norm": 1.116597379648586, "learning_rate": 5.781012383803337e-06, "loss": 0.4563, "step": 2896 }, { "epoch": 0.47, "grad_norm": 6.3252607913528935, "learning_rate": 5.7784345820882235e-06, "loss": 0.4752, "step": 2897 }, { "epoch": 0.47, "grad_norm": 8.270174263282255, "learning_rate": 5.775856568306073e-06, "loss": 0.4731, "step": 2898 }, { "epoch": 0.47, "grad_norm": 8.81204897196919, "learning_rate": 5.773278343159207e-06, "loss": 0.4886, "step": 2899 }, { "epoch": 0.47, "grad_norm": 5.930999153154023, "learning_rate": 5.770699907350003e-06, "loss": 0.3932, "step": 2900 }, { "epoch": 0.47, "grad_norm": 6.10985791126475, "learning_rate": 5.768121261580901e-06, "loss": 0.4893, "step": 2901 }, { "epoch": 0.47, "grad_norm": 6.709401416530688, "learning_rate": 5.765542406554394e-06, "loss": 0.4504, "step": 2902 }, { "epoch": 0.47, "grad_norm": 1.3077089412384026, "learning_rate": 5.762963342973033e-06, "loss": 0.4347, "step": 2903 }, { "epoch": 0.47, "grad_norm": 8.711515445818277, "learning_rate": 5.7603840715394264e-06, "loss": 0.4567, "step": 2904 }, { "epoch": 0.47, "grad_norm": 6.151365844661109, "learning_rate": 5.757804592956237e-06, "loss": 0.404, "step": 2905 }, { "epoch": 0.47, "grad_norm": 12.131252411719633, "learning_rate": 5.75522490792619e-06, "loss": 0.4141, "step": 2906 }, { "epoch": 0.47, "grad_norm": 7.658191696170597, "learning_rate": 5.752645017152058e-06, "loss": 0.4419, "step": 2907 }, { "epoch": 0.47, "grad_norm": 17.68514828944344, "learning_rate": 5.750064921336679e-06, "loss": 0.4387, "step": 2908 }, { "epoch": 0.47, "grad_norm": 8.394743276584741, "learning_rate": 5.7474846211829376e-06, "loss": 0.4258, "step": 2909 }, { "epoch": 0.47, "grad_norm": 9.617387018194139, "learning_rate": 5.7449041173937825e-06, "loss": 0.405, "step": 2910 }, { "epoch": 0.47, "grad_norm": 8.621199161538355, "learning_rate": 5.7423234106722105e-06, "loss": 0.4324, "step": 2911 }, { "epoch": 0.47, "grad_norm": 7.34297665671962, "learning_rate": 5.739742501721282e-06, "loss": 0.4806, "step": 2912 }, { "epoch": 0.47, "grad_norm": 9.068450920415767, "learning_rate": 5.7371613912441015e-06, "loss": 0.5048, "step": 2913 }, { "epoch": 0.47, "grad_norm": 16.90882216362115, "learning_rate": 5.73458007994384e-06, "loss": 0.3789, "step": 2914 }, { "epoch": 0.47, "grad_norm": 9.358641242146204, "learning_rate": 5.731998568523716e-06, "loss": 0.3909, "step": 2915 }, { "epoch": 0.47, "grad_norm": 16.704391680023594, "learning_rate": 5.729416857687002e-06, "loss": 0.422, "step": 2916 }, { "epoch": 0.47, "grad_norm": 6.963909238366637, "learning_rate": 5.726834948137033e-06, "loss": 0.4857, "step": 2917 }, { "epoch": 0.47, "grad_norm": 1.5659095219892367, "learning_rate": 5.7242528405771865e-06, "loss": 0.5525, "step": 2918 }, { "epoch": 0.47, "grad_norm": 14.18755182042403, "learning_rate": 5.721670535710902e-06, "loss": 0.4452, "step": 2919 }, { "epoch": 0.47, "grad_norm": 9.48614156534668, "learning_rate": 5.7190880342416725e-06, "loss": 0.3858, "step": 2920 }, { "epoch": 0.47, "grad_norm": 8.49574028786523, "learning_rate": 5.7165053368730395e-06, "loss": 0.4062, "step": 2921 }, { "epoch": 0.47, "grad_norm": 8.321588271493018, "learning_rate": 5.713922444308601e-06, "loss": 0.4526, "step": 2922 }, { "epoch": 0.47, "grad_norm": 9.10499448458488, "learning_rate": 5.711339357252011e-06, "loss": 0.4447, "step": 2923 }, { "epoch": 0.47, "grad_norm": 8.363324139045963, "learning_rate": 5.708756076406972e-06, "loss": 0.4746, "step": 2924 }, { "epoch": 0.47, "grad_norm": 10.449096613407512, "learning_rate": 5.706172602477239e-06, "loss": 0.4404, "step": 2925 }, { "epoch": 0.47, "grad_norm": 13.987779361774265, "learning_rate": 5.703588936166625e-06, "loss": 0.3972, "step": 2926 }, { "epoch": 0.47, "grad_norm": 7.740607792649446, "learning_rate": 5.701005078178989e-06, "loss": 0.4737, "step": 2927 }, { "epoch": 0.47, "grad_norm": 11.688351859744449, "learning_rate": 5.6984210292182485e-06, "loss": 0.4796, "step": 2928 }, { "epoch": 0.47, "grad_norm": 1.2708897545039788, "learning_rate": 5.695836789988365e-06, "loss": 0.4463, "step": 2929 }, { "epoch": 0.47, "grad_norm": 16.99989661488079, "learning_rate": 5.6932523611933595e-06, "loss": 0.3786, "step": 2930 }, { "epoch": 0.47, "grad_norm": 16.356109425026787, "learning_rate": 5.690667743537303e-06, "loss": 0.4863, "step": 2931 }, { "epoch": 0.47, "grad_norm": 6.828661865708355, "learning_rate": 5.688082937724314e-06, "loss": 0.3906, "step": 2932 }, { "epoch": 0.47, "grad_norm": 11.780843911172074, "learning_rate": 5.685497944458564e-06, "loss": 0.4064, "step": 2933 }, { "epoch": 0.47, "grad_norm": 8.758196365513205, "learning_rate": 5.68291276444428e-06, "loss": 0.4923, "step": 2934 }, { "epoch": 0.47, "grad_norm": 8.375931704056862, "learning_rate": 5.680327398385735e-06, "loss": 0.4395, "step": 2935 }, { "epoch": 0.47, "grad_norm": 6.5601526619090285, "learning_rate": 5.677741846987251e-06, "loss": 0.3962, "step": 2936 }, { "epoch": 0.47, "grad_norm": 8.676956289824686, "learning_rate": 5.675156110953208e-06, "loss": 0.4603, "step": 2937 }, { "epoch": 0.47, "grad_norm": 17.553858492834614, "learning_rate": 5.672570190988027e-06, "loss": 0.4169, "step": 2938 }, { "epoch": 0.47, "grad_norm": 9.359757256831715, "learning_rate": 5.6699840877961884e-06, "loss": 0.4424, "step": 2939 }, { "epoch": 0.47, "grad_norm": 16.198840893683663, "learning_rate": 5.667397802082216e-06, "loss": 0.4074, "step": 2940 }, { "epoch": 0.47, "grad_norm": 1.1697897420996894, "learning_rate": 5.664811334550685e-06, "loss": 0.4499, "step": 2941 }, { "epoch": 0.47, "grad_norm": 8.06793548040542, "learning_rate": 5.6622246859062205e-06, "loss": 0.4581, "step": 2942 }, { "epoch": 0.47, "grad_norm": 8.485420283804375, "learning_rate": 5.6596378568534984e-06, "loss": 0.419, "step": 2943 }, { "epoch": 0.47, "grad_norm": 9.178239157599464, "learning_rate": 5.657050848097239e-06, "loss": 0.507, "step": 2944 }, { "epoch": 0.47, "grad_norm": 7.32537508477043, "learning_rate": 5.654463660342219e-06, "loss": 0.3751, "step": 2945 }, { "epoch": 0.47, "grad_norm": 7.248006033912738, "learning_rate": 5.651876294293256e-06, "loss": 0.4202, "step": 2946 }, { "epoch": 0.47, "grad_norm": 5.670341710839232, "learning_rate": 5.649288750655222e-06, "loss": 0.4305, "step": 2947 }, { "epoch": 0.47, "grad_norm": 12.987442142332512, "learning_rate": 5.646701030133036e-06, "loss": 0.5025, "step": 2948 }, { "epoch": 0.48, "grad_norm": 9.008117523130684, "learning_rate": 5.64411313343166e-06, "loss": 0.4673, "step": 2949 }, { "epoch": 0.48, "grad_norm": 8.79781854836096, "learning_rate": 5.641525061256113e-06, "loss": 0.4763, "step": 2950 }, { "epoch": 0.48, "grad_norm": 15.434043102403942, "learning_rate": 5.638936814311455e-06, "loss": 0.4265, "step": 2951 }, { "epoch": 0.48, "grad_norm": 7.089252023063847, "learning_rate": 5.636348393302798e-06, "loss": 0.4643, "step": 2952 }, { "epoch": 0.48, "grad_norm": 12.361343945671232, "learning_rate": 5.633759798935296e-06, "loss": 0.466, "step": 2953 }, { "epoch": 0.48, "grad_norm": 8.433692321283031, "learning_rate": 5.631171031914155e-06, "loss": 0.4898, "step": 2954 }, { "epoch": 0.48, "grad_norm": 8.032368745184664, "learning_rate": 5.628582092944628e-06, "loss": 0.4258, "step": 2955 }, { "epoch": 0.48, "grad_norm": 10.37540584069632, "learning_rate": 5.625992982732011e-06, "loss": 0.4598, "step": 2956 }, { "epoch": 0.48, "grad_norm": 14.697101069490158, "learning_rate": 5.62340370198165e-06, "loss": 0.4282, "step": 2957 }, { "epoch": 0.48, "grad_norm": 17.498334520330957, "learning_rate": 5.620814251398936e-06, "loss": 0.4655, "step": 2958 }, { "epoch": 0.48, "grad_norm": 5.65715098955851, "learning_rate": 5.6182246316893086e-06, "loss": 0.4558, "step": 2959 }, { "epoch": 0.48, "grad_norm": 11.129480613435163, "learning_rate": 5.615634843558248e-06, "loss": 0.4371, "step": 2960 }, { "epoch": 0.48, "grad_norm": 9.920528352568887, "learning_rate": 5.613044887711288e-06, "loss": 0.512, "step": 2961 }, { "epoch": 0.48, "grad_norm": 5.168696657055204, "learning_rate": 5.6104547648539995e-06, "loss": 0.4567, "step": 2962 }, { "epoch": 0.48, "grad_norm": 10.557198106467714, "learning_rate": 5.6078644756920066e-06, "loss": 0.4361, "step": 2963 }, { "epoch": 0.48, "grad_norm": 9.018610417012438, "learning_rate": 5.605274020930975e-06, "loss": 0.4895, "step": 2964 }, { "epoch": 0.48, "grad_norm": 8.826050775154176, "learning_rate": 5.6026834012766155e-06, "loss": 0.4424, "step": 2965 }, { "epoch": 0.48, "grad_norm": 6.632067543121904, "learning_rate": 5.600092617434686e-06, "loss": 0.4667, "step": 2966 }, { "epoch": 0.48, "grad_norm": 8.021073553700118, "learning_rate": 5.597501670110984e-06, "loss": 0.4327, "step": 2967 }, { "epoch": 0.48, "grad_norm": 5.961601368770387, "learning_rate": 5.594910560011356e-06, "loss": 0.4471, "step": 2968 }, { "epoch": 0.48, "grad_norm": 8.931766059110284, "learning_rate": 5.592319287841694e-06, "loss": 0.4603, "step": 2969 }, { "epoch": 0.48, "grad_norm": 35.18790916719359, "learning_rate": 5.589727854307932e-06, "loss": 0.3948, "step": 2970 }, { "epoch": 0.48, "grad_norm": 9.87055063415602, "learning_rate": 5.587136260116044e-06, "loss": 0.452, "step": 2971 }, { "epoch": 0.48, "grad_norm": 8.274843826008723, "learning_rate": 5.584544505972056e-06, "loss": 0.4703, "step": 2972 }, { "epoch": 0.48, "grad_norm": 6.381430190343625, "learning_rate": 5.58195259258203e-06, "loss": 0.4381, "step": 2973 }, { "epoch": 0.48, "grad_norm": 5.8722634607789574, "learning_rate": 5.5793605206520776e-06, "loss": 0.45, "step": 2974 }, { "epoch": 0.48, "grad_norm": 15.477269450620478, "learning_rate": 5.576768290888348e-06, "loss": 0.4874, "step": 2975 }, { "epoch": 0.48, "grad_norm": 6.040556301846228, "learning_rate": 5.574175903997038e-06, "loss": 0.427, "step": 2976 }, { "epoch": 0.48, "grad_norm": 9.656252760195777, "learning_rate": 5.571583360684385e-06, "loss": 0.4457, "step": 2977 }, { "epoch": 0.48, "grad_norm": 7.557056097627112, "learning_rate": 5.568990661656668e-06, "loss": 0.4641, "step": 2978 }, { "epoch": 0.48, "grad_norm": 7.912199265099999, "learning_rate": 5.566397807620209e-06, "loss": 0.4103, "step": 2979 }, { "epoch": 0.48, "grad_norm": 9.184608369367421, "learning_rate": 5.563804799281377e-06, "loss": 0.427, "step": 2980 }, { "epoch": 0.48, "grad_norm": 10.747081923284373, "learning_rate": 5.561211637346576e-06, "loss": 0.4327, "step": 2981 }, { "epoch": 0.48, "grad_norm": 6.023265003576396, "learning_rate": 5.558618322522253e-06, "loss": 0.4619, "step": 2982 }, { "epoch": 0.48, "grad_norm": 7.174136054754001, "learning_rate": 5.556024855514904e-06, "loss": 0.466, "step": 2983 }, { "epoch": 0.48, "grad_norm": 7.940541998218045, "learning_rate": 5.553431237031054e-06, "loss": 0.4957, "step": 2984 }, { "epoch": 0.48, "grad_norm": 1.1117140702800752, "learning_rate": 5.5508374677772806e-06, "loss": 0.4374, "step": 2985 }, { "epoch": 0.48, "grad_norm": 7.582501620033825, "learning_rate": 5.548243548460197e-06, "loss": 0.3632, "step": 2986 }, { "epoch": 0.48, "grad_norm": 9.799756014417358, "learning_rate": 5.545649479786459e-06, "loss": 0.3867, "step": 2987 }, { "epoch": 0.48, "grad_norm": 8.731656731676633, "learning_rate": 5.543055262462762e-06, "loss": 0.4448, "step": 2988 }, { "epoch": 0.48, "grad_norm": 16.02364390092112, "learning_rate": 5.540460897195842e-06, "loss": 0.3909, "step": 2989 }, { "epoch": 0.48, "grad_norm": 16.91295147146881, "learning_rate": 5.537866384692474e-06, "loss": 0.4956, "step": 2990 }, { "epoch": 0.48, "grad_norm": 12.81289120047385, "learning_rate": 5.535271725659478e-06, "loss": 0.4124, "step": 2991 }, { "epoch": 0.48, "grad_norm": 23.592945826172226, "learning_rate": 5.53267692080371e-06, "loss": 0.3873, "step": 2992 }, { "epoch": 0.48, "grad_norm": 11.572809079223754, "learning_rate": 5.530081970832063e-06, "loss": 0.4434, "step": 2993 }, { "epoch": 0.48, "grad_norm": 1.0955485569805417, "learning_rate": 5.527486876451478e-06, "loss": 0.4443, "step": 2994 }, { "epoch": 0.48, "grad_norm": 6.502408749778991, "learning_rate": 5.524891638368926e-06, "loss": 0.3865, "step": 2995 }, { "epoch": 0.48, "grad_norm": 12.115212538984887, "learning_rate": 5.522296257291422e-06, "loss": 0.4154, "step": 2996 }, { "epoch": 0.48, "grad_norm": 8.34387453399514, "learning_rate": 5.519700733926022e-06, "loss": 0.4124, "step": 2997 }, { "epoch": 0.48, "grad_norm": 1.1951200354123026, "learning_rate": 5.517105068979816e-06, "loss": 0.4475, "step": 2998 }, { "epoch": 0.48, "grad_norm": 8.482576751341709, "learning_rate": 5.514509263159935e-06, "loss": 0.4882, "step": 2999 }, { "epoch": 0.48, "grad_norm": 13.784224124370374, "learning_rate": 5.511913317173548e-06, "loss": 0.4766, "step": 3000 }, { "epoch": 0.48, "grad_norm": 14.866964138657936, "learning_rate": 5.509317231727863e-06, "loss": 0.3741, "step": 3001 }, { "epoch": 0.48, "grad_norm": 13.241596807622788, "learning_rate": 5.506721007530125e-06, "loss": 0.4146, "step": 3002 }, { "epoch": 0.48, "grad_norm": 9.675312569336509, "learning_rate": 5.504124645287616e-06, "loss": 0.4436, "step": 3003 }, { "epoch": 0.48, "grad_norm": 10.597914457616575, "learning_rate": 5.501528145707657e-06, "loss": 0.4038, "step": 3004 }, { "epoch": 0.48, "grad_norm": 13.941102971490304, "learning_rate": 5.498931509497607e-06, "loss": 0.4448, "step": 3005 }, { "epoch": 0.48, "grad_norm": 11.385167178869475, "learning_rate": 5.4963347373648615e-06, "loss": 0.3787, "step": 3006 }, { "epoch": 0.48, "grad_norm": 7.840377027988868, "learning_rate": 5.49373783001685e-06, "loss": 0.4158, "step": 3007 }, { "epoch": 0.48, "grad_norm": 11.529640099088152, "learning_rate": 5.4911407881610455e-06, "loss": 0.4644, "step": 3008 }, { "epoch": 0.48, "grad_norm": 13.437315859413028, "learning_rate": 5.48854361250495e-06, "loss": 0.4401, "step": 3009 }, { "epoch": 0.48, "grad_norm": 11.128950454580467, "learning_rate": 5.48594630375611e-06, "loss": 0.4439, "step": 3010 }, { "epoch": 0.49, "grad_norm": 8.270697033204776, "learning_rate": 5.4833488626221e-06, "loss": 0.3898, "step": 3011 }, { "epoch": 0.49, "grad_norm": 1.3686987977185772, "learning_rate": 5.480751289810537e-06, "loss": 0.5186, "step": 3012 }, { "epoch": 0.49, "grad_norm": 23.322942602698525, "learning_rate": 5.478153586029069e-06, "loss": 0.4117, "step": 3013 }, { "epoch": 0.49, "grad_norm": 14.058488251684079, "learning_rate": 5.4755557519853854e-06, "loss": 0.4722, "step": 3014 }, { "epoch": 0.49, "grad_norm": 17.811941792675054, "learning_rate": 5.472957788387206e-06, "loss": 0.5039, "step": 3015 }, { "epoch": 0.49, "grad_norm": 10.13398004252201, "learning_rate": 5.470359695942289e-06, "loss": 0.4678, "step": 3016 }, { "epoch": 0.49, "grad_norm": 23.56169653992629, "learning_rate": 5.467761475358423e-06, "loss": 0.5041, "step": 3017 }, { "epoch": 0.49, "grad_norm": 7.61304770512132, "learning_rate": 5.465163127343438e-06, "loss": 0.4243, "step": 3018 }, { "epoch": 0.49, "grad_norm": 15.369763040543411, "learning_rate": 5.462564652605197e-06, "loss": 0.4737, "step": 3019 }, { "epoch": 0.49, "grad_norm": 20.573736604923365, "learning_rate": 5.459966051851589e-06, "loss": 0.4131, "step": 3020 }, { "epoch": 0.49, "grad_norm": 8.053407611857278, "learning_rate": 5.457367325790555e-06, "loss": 0.4773, "step": 3021 }, { "epoch": 0.49, "grad_norm": 28.38364267826306, "learning_rate": 5.454768475130051e-06, "loss": 0.4635, "step": 3022 }, { "epoch": 0.49, "grad_norm": 12.223334601524416, "learning_rate": 5.452169500578079e-06, "loss": 0.5018, "step": 3023 }, { "epoch": 0.49, "grad_norm": 13.375112871644935, "learning_rate": 5.449570402842671e-06, "loss": 0.4219, "step": 3024 }, { "epoch": 0.49, "grad_norm": 10.124627722668103, "learning_rate": 5.446971182631893e-06, "loss": 0.4771, "step": 3025 }, { "epoch": 0.49, "grad_norm": 10.9666111989232, "learning_rate": 5.444371840653843e-06, "loss": 0.4355, "step": 3026 }, { "epoch": 0.49, "grad_norm": 6.370208222662076, "learning_rate": 5.441772377616656e-06, "loss": 0.3884, "step": 3027 }, { "epoch": 0.49, "grad_norm": 12.281123527674065, "learning_rate": 5.439172794228495e-06, "loss": 0.4449, "step": 3028 }, { "epoch": 0.49, "grad_norm": 21.200330023634223, "learning_rate": 5.436573091197559e-06, "loss": 0.4655, "step": 3029 }, { "epoch": 0.49, "grad_norm": 9.250873829510336, "learning_rate": 5.43397326923208e-06, "loss": 0.3907, "step": 3030 }, { "epoch": 0.49, "grad_norm": 8.509768267684336, "learning_rate": 5.431373329040317e-06, "loss": 0.3898, "step": 3031 }, { "epoch": 0.49, "grad_norm": 17.878157133365526, "learning_rate": 5.42877327133057e-06, "loss": 0.368, "step": 3032 }, { "epoch": 0.49, "grad_norm": 10.406487336045982, "learning_rate": 5.426173096811166e-06, "loss": 0.4079, "step": 3033 }, { "epoch": 0.49, "grad_norm": 19.36970195794106, "learning_rate": 5.423572806190461e-06, "loss": 0.4702, "step": 3034 }, { "epoch": 0.49, "grad_norm": 10.87141146465951, "learning_rate": 5.420972400176848e-06, "loss": 0.4941, "step": 3035 }, { "epoch": 0.49, "grad_norm": 13.345501179078067, "learning_rate": 5.41837187947875e-06, "loss": 0.3892, "step": 3036 }, { "epoch": 0.49, "grad_norm": 1.2426044466837947, "learning_rate": 5.41577124480462e-06, "loss": 0.4935, "step": 3037 }, { "epoch": 0.49, "grad_norm": 19.217449550179943, "learning_rate": 5.413170496862941e-06, "loss": 0.4399, "step": 3038 }, { "epoch": 0.49, "grad_norm": 11.832878542802371, "learning_rate": 5.4105696363622284e-06, "loss": 0.4419, "step": 3039 }, { "epoch": 0.49, "grad_norm": 11.958381639233135, "learning_rate": 5.4079686640110305e-06, "loss": 0.5091, "step": 3040 }, { "epoch": 0.49, "grad_norm": 8.560510296147061, "learning_rate": 5.405367580517924e-06, "loss": 0.4742, "step": 3041 }, { "epoch": 0.49, "grad_norm": 9.395271131743407, "learning_rate": 5.402766386591511e-06, "loss": 0.4024, "step": 3042 }, { "epoch": 0.49, "grad_norm": 12.1148959973674, "learning_rate": 5.400165082940436e-06, "loss": 0.4839, "step": 3043 }, { "epoch": 0.49, "grad_norm": 10.142652226675239, "learning_rate": 5.397563670273359e-06, "loss": 0.4505, "step": 3044 }, { "epoch": 0.49, "grad_norm": 10.318194045151992, "learning_rate": 5.394962149298981e-06, "loss": 0.391, "step": 3045 }, { "epoch": 0.49, "grad_norm": 10.723830422267888, "learning_rate": 5.392360520726027e-06, "loss": 0.4882, "step": 3046 }, { "epoch": 0.49, "grad_norm": 6.861229940220509, "learning_rate": 5.38975878526325e-06, "loss": 0.4333, "step": 3047 }, { "epoch": 0.49, "grad_norm": 6.537563545070602, "learning_rate": 5.387156943619437e-06, "loss": 0.4207, "step": 3048 }, { "epoch": 0.49, "grad_norm": 21.353567527013375, "learning_rate": 5.384554996503401e-06, "loss": 0.3535, "step": 3049 }, { "epoch": 0.49, "grad_norm": 10.014262155330695, "learning_rate": 5.381952944623982e-06, "loss": 0.463, "step": 3050 }, { "epoch": 0.49, "grad_norm": 6.7390009527428445, "learning_rate": 5.3793507886900535e-06, "loss": 0.4599, "step": 3051 }, { "epoch": 0.49, "grad_norm": 7.64359595059921, "learning_rate": 5.3767485294105135e-06, "loss": 0.4664, "step": 3052 }, { "epoch": 0.49, "grad_norm": 9.1799534673305, "learning_rate": 5.374146167494286e-06, "loss": 0.4351, "step": 3053 }, { "epoch": 0.49, "grad_norm": 10.454901262697621, "learning_rate": 5.371543703650332e-06, "loss": 0.4987, "step": 3054 }, { "epoch": 0.49, "grad_norm": 7.55000203393371, "learning_rate": 5.368941138587631e-06, "loss": 0.403, "step": 3055 }, { "epoch": 0.49, "grad_norm": 13.627411002834808, "learning_rate": 5.366338473015195e-06, "loss": 0.4829, "step": 3056 }, { "epoch": 0.49, "grad_norm": 18.606728169076742, "learning_rate": 5.3637357076420595e-06, "loss": 0.4886, "step": 3057 }, { "epoch": 0.49, "grad_norm": 38.03049332390407, "learning_rate": 5.36113284317729e-06, "loss": 0.4336, "step": 3058 }, { "epoch": 0.49, "grad_norm": 8.497077389874061, "learning_rate": 5.358529880329982e-06, "loss": 0.4337, "step": 3059 }, { "epoch": 0.49, "grad_norm": 8.918706290947075, "learning_rate": 5.355926819809248e-06, "loss": 0.4792, "step": 3060 }, { "epoch": 0.49, "grad_norm": 11.572269335513441, "learning_rate": 5.3533236623242405e-06, "loss": 0.4102, "step": 3061 }, { "epoch": 0.49, "grad_norm": 11.66655156277395, "learning_rate": 5.3507204085841256e-06, "loss": 0.4247, "step": 3062 }, { "epoch": 0.49, "grad_norm": 37.46023585109764, "learning_rate": 5.348117059298105e-06, "loss": 0.4745, "step": 3063 }, { "epoch": 0.49, "grad_norm": 8.796431070246502, "learning_rate": 5.345513615175401e-06, "loss": 0.4963, "step": 3064 }, { "epoch": 0.49, "grad_norm": 6.654833553240898, "learning_rate": 5.342910076925264e-06, "loss": 0.4775, "step": 3065 }, { "epoch": 0.49, "grad_norm": 9.49501314515276, "learning_rate": 5.3403064452569685e-06, "loss": 0.4459, "step": 3066 }, { "epoch": 0.49, "grad_norm": 13.945065133961046, "learning_rate": 5.337702720879819e-06, "loss": 0.4763, "step": 3067 }, { "epoch": 0.49, "grad_norm": 1.2143253464473334, "learning_rate": 5.335098904503139e-06, "loss": 0.4355, "step": 3068 }, { "epoch": 0.49, "grad_norm": 23.330296694736557, "learning_rate": 5.332494996836279e-06, "loss": 0.4484, "step": 3069 }, { "epoch": 0.49, "grad_norm": 8.034016080741656, "learning_rate": 5.3298909985886195e-06, "loss": 0.4151, "step": 3070 }, { "epoch": 0.49, "grad_norm": 6.3388509752263325, "learning_rate": 5.327286910469556e-06, "loss": 0.4361, "step": 3071 }, { "epoch": 0.49, "grad_norm": 6.718726783764271, "learning_rate": 5.3246827331885165e-06, "loss": 0.4702, "step": 3072 }, { "epoch": 0.5, "grad_norm": 12.712503879988855, "learning_rate": 5.322078467454951e-06, "loss": 0.4829, "step": 3073 }, { "epoch": 0.5, "grad_norm": 7.396557953699374, "learning_rate": 5.3194741139783324e-06, "loss": 0.4828, "step": 3074 }, { "epoch": 0.5, "grad_norm": 13.203373318257308, "learning_rate": 5.316869673468159e-06, "loss": 0.4, "step": 3075 }, { "epoch": 0.5, "grad_norm": 6.396371056975073, "learning_rate": 5.314265146633954e-06, "loss": 0.4386, "step": 3076 }, { "epoch": 0.5, "grad_norm": 16.062307824542277, "learning_rate": 5.311660534185258e-06, "loss": 0.4475, "step": 3077 }, { "epoch": 0.5, "grad_norm": 11.43292352997585, "learning_rate": 5.3090558368316415e-06, "loss": 0.4478, "step": 3078 }, { "epoch": 0.5, "grad_norm": 7.203531688523813, "learning_rate": 5.306451055282696e-06, "loss": 0.4492, "step": 3079 }, { "epoch": 0.5, "grad_norm": 8.434731630693129, "learning_rate": 5.303846190248035e-06, "loss": 0.4465, "step": 3080 }, { "epoch": 0.5, "grad_norm": 6.947973468452653, "learning_rate": 5.301241242437299e-06, "loss": 0.4533, "step": 3081 }, { "epoch": 0.5, "grad_norm": 9.648690446391614, "learning_rate": 5.298636212560143e-06, "loss": 0.4875, "step": 3082 }, { "epoch": 0.5, "grad_norm": 16.854401199781517, "learning_rate": 5.29603110132625e-06, "loss": 0.5023, "step": 3083 }, { "epoch": 0.5, "grad_norm": 10.596059872730613, "learning_rate": 5.293425909445326e-06, "loss": 0.4721, "step": 3084 }, { "epoch": 0.5, "grad_norm": 10.37634818342419, "learning_rate": 5.290820637627095e-06, "loss": 0.4238, "step": 3085 }, { "epoch": 0.5, "grad_norm": 9.161830551808453, "learning_rate": 5.288215286581306e-06, "loss": 0.4947, "step": 3086 }, { "epoch": 0.5, "grad_norm": 7.0823576865966595, "learning_rate": 5.28560985701773e-06, "loss": 0.4233, "step": 3087 }, { "epoch": 0.5, "grad_norm": 6.035264724788157, "learning_rate": 5.283004349646154e-06, "loss": 0.5188, "step": 3088 }, { "epoch": 0.5, "grad_norm": 10.55183737701892, "learning_rate": 5.280398765176392e-06, "loss": 0.4296, "step": 3089 }, { "epoch": 0.5, "grad_norm": 25.923769655534414, "learning_rate": 5.277793104318279e-06, "loss": 0.4785, "step": 3090 }, { "epoch": 0.5, "grad_norm": 8.188738216200228, "learning_rate": 5.275187367781664e-06, "loss": 0.3882, "step": 3091 }, { "epoch": 0.5, "grad_norm": 13.504779817597388, "learning_rate": 5.272581556276428e-06, "loss": 0.4568, "step": 3092 }, { "epoch": 0.5, "grad_norm": 9.599343150321767, "learning_rate": 5.26997567051246e-06, "loss": 0.3954, "step": 3093 }, { "epoch": 0.5, "grad_norm": 7.049054956329651, "learning_rate": 5.267369711199678e-06, "loss": 0.4691, "step": 3094 }, { "epoch": 0.5, "grad_norm": 12.996168132368977, "learning_rate": 5.2647636790480175e-06, "loss": 0.388, "step": 3095 }, { "epoch": 0.5, "grad_norm": 7.758409096157251, "learning_rate": 5.26215757476743e-06, "loss": 0.5424, "step": 3096 }, { "epoch": 0.5, "grad_norm": 7.7150779883685106, "learning_rate": 5.259551399067895e-06, "loss": 0.4333, "step": 3097 }, { "epoch": 0.5, "grad_norm": 6.93152449971319, "learning_rate": 5.256945152659404e-06, "loss": 0.4084, "step": 3098 }, { "epoch": 0.5, "grad_norm": 9.568517183805675, "learning_rate": 5.254338836251971e-06, "loss": 0.4379, "step": 3099 }, { "epoch": 0.5, "grad_norm": 15.160476729858633, "learning_rate": 5.251732450555626e-06, "loss": 0.4607, "step": 3100 }, { "epoch": 0.5, "grad_norm": 10.439811695894882, "learning_rate": 5.249125996280426e-06, "loss": 0.3981, "step": 3101 }, { "epoch": 0.5, "grad_norm": 10.999900837084816, "learning_rate": 5.246519474136433e-06, "loss": 0.4628, "step": 3102 }, { "epoch": 0.5, "grad_norm": 5.908807735109394, "learning_rate": 5.243912884833744e-06, "loss": 0.4227, "step": 3103 }, { "epoch": 0.5, "grad_norm": 8.385455930999868, "learning_rate": 5.241306229082459e-06, "loss": 0.4072, "step": 3104 }, { "epoch": 0.5, "grad_norm": 11.539621899923734, "learning_rate": 5.2386995075927074e-06, "loss": 0.4327, "step": 3105 }, { "epoch": 0.5, "grad_norm": 1.3781149802070196, "learning_rate": 5.236092721074629e-06, "loss": 0.4656, "step": 3106 }, { "epoch": 0.5, "grad_norm": 16.24410489516164, "learning_rate": 5.233485870238385e-06, "loss": 0.4578, "step": 3107 }, { "epoch": 0.5, "grad_norm": 8.272384842875574, "learning_rate": 5.2308789557941546e-06, "loss": 0.4584, "step": 3108 }, { "epoch": 0.5, "grad_norm": 9.662367351895382, "learning_rate": 5.228271978452133e-06, "loss": 0.4544, "step": 3109 }, { "epoch": 0.5, "grad_norm": 5.956113864429475, "learning_rate": 5.225664938922531e-06, "loss": 0.372, "step": 3110 }, { "epoch": 0.5, "grad_norm": 28.737245347872076, "learning_rate": 5.223057837915579e-06, "loss": 0.3723, "step": 3111 }, { "epoch": 0.5, "grad_norm": 9.27407958667095, "learning_rate": 5.220450676141525e-06, "loss": 0.3873, "step": 3112 }, { "epoch": 0.5, "grad_norm": 12.691293350587973, "learning_rate": 5.217843454310628e-06, "loss": 0.4636, "step": 3113 }, { "epoch": 0.5, "grad_norm": 14.328866310325898, "learning_rate": 5.215236173133172e-06, "loss": 0.4295, "step": 3114 }, { "epoch": 0.5, "grad_norm": 7.358182658239808, "learning_rate": 5.2126288333194455e-06, "loss": 0.3988, "step": 3115 }, { "epoch": 0.5, "grad_norm": 5.642091199174931, "learning_rate": 5.210021435579766e-06, "loss": 0.3678, "step": 3116 }, { "epoch": 0.5, "grad_norm": 9.63133591111443, "learning_rate": 5.207413980624459e-06, "loss": 0.4575, "step": 3117 }, { "epoch": 0.5, "grad_norm": 9.137888328029778, "learning_rate": 5.204806469163865e-06, "loss": 0.4647, "step": 3118 }, { "epoch": 0.5, "grad_norm": 9.4078111362129, "learning_rate": 5.202198901908343e-06, "loss": 0.4801, "step": 3119 }, { "epoch": 0.5, "grad_norm": 8.27187302544425, "learning_rate": 5.199591279568268e-06, "loss": 0.4316, "step": 3120 }, { "epoch": 0.5, "grad_norm": 9.509805384648743, "learning_rate": 5.196983602854024e-06, "loss": 0.3958, "step": 3121 }, { "epoch": 0.5, "grad_norm": 7.5720357602583395, "learning_rate": 5.1943758724760185e-06, "loss": 0.434, "step": 3122 }, { "epoch": 0.5, "grad_norm": 7.90536422673787, "learning_rate": 5.191768089144668e-06, "loss": 0.4171, "step": 3123 }, { "epoch": 0.5, "grad_norm": 12.997410464897989, "learning_rate": 5.189160253570402e-06, "loss": 0.3666, "step": 3124 }, { "epoch": 0.5, "grad_norm": 1.246359227639362, "learning_rate": 5.186552366463671e-06, "loss": 0.4337, "step": 3125 }, { "epoch": 0.5, "grad_norm": 1.1508179337381892, "learning_rate": 5.18394442853493e-06, "loss": 0.4209, "step": 3126 }, { "epoch": 0.5, "grad_norm": 8.517050473227762, "learning_rate": 5.181336440494659e-06, "loss": 0.3948, "step": 3127 }, { "epoch": 0.5, "grad_norm": 10.528817289920582, "learning_rate": 5.178728403053342e-06, "loss": 0.5046, "step": 3128 }, { "epoch": 0.5, "grad_norm": 13.429959161940932, "learning_rate": 5.176120316921481e-06, "loss": 0.4848, "step": 3129 }, { "epoch": 0.5, "grad_norm": 9.854073549522074, "learning_rate": 5.173512182809592e-06, "loss": 0.46, "step": 3130 }, { "epoch": 0.5, "grad_norm": 19.92011495366333, "learning_rate": 5.170904001428201e-06, "loss": 0.4434, "step": 3131 }, { "epoch": 0.5, "grad_norm": 10.055177815685655, "learning_rate": 5.168295773487848e-06, "loss": 0.398, "step": 3132 }, { "epoch": 0.5, "grad_norm": 16.203570404030096, "learning_rate": 5.165687499699088e-06, "loss": 0.4296, "step": 3133 }, { "epoch": 0.5, "grad_norm": 12.984099710108099, "learning_rate": 5.163079180772486e-06, "loss": 0.4771, "step": 3134 }, { "epoch": 0.51, "grad_norm": 7.192463042597208, "learning_rate": 5.160470817418619e-06, "loss": 0.4163, "step": 3135 }, { "epoch": 0.51, "grad_norm": 8.565442429016356, "learning_rate": 5.15786241034808e-06, "loss": 0.5303, "step": 3136 }, { "epoch": 0.51, "grad_norm": 9.283318236375548, "learning_rate": 5.155253960271466e-06, "loss": 0.5087, "step": 3137 }, { "epoch": 0.51, "grad_norm": 6.8695097196008525, "learning_rate": 5.152645467899397e-06, "loss": 0.4028, "step": 3138 }, { "epoch": 0.51, "grad_norm": 74.1354057670772, "learning_rate": 5.150036933942492e-06, "loss": 0.5656, "step": 3139 }, { "epoch": 0.51, "grad_norm": 21.05956954268088, "learning_rate": 5.147428359111391e-06, "loss": 0.4414, "step": 3140 }, { "epoch": 0.51, "grad_norm": 9.431954873443466, "learning_rate": 5.144819744116742e-06, "loss": 0.4265, "step": 3141 }, { "epoch": 0.51, "grad_norm": 8.134236783618586, "learning_rate": 5.142211089669202e-06, "loss": 0.4002, "step": 3142 }, { "epoch": 0.51, "grad_norm": 11.640266005607186, "learning_rate": 5.1396023964794415e-06, "loss": 0.4356, "step": 3143 }, { "epoch": 0.51, "grad_norm": 9.804853887556776, "learning_rate": 5.1369936652581395e-06, "loss": 0.372, "step": 3144 }, { "epoch": 0.51, "grad_norm": 1.88973125690465, "learning_rate": 5.134384896715987e-06, "loss": 0.5036, "step": 3145 }, { "epoch": 0.51, "grad_norm": 7.043407790242097, "learning_rate": 5.131776091563685e-06, "loss": 0.3968, "step": 3146 }, { "epoch": 0.51, "grad_norm": 8.726585578535852, "learning_rate": 5.129167250511943e-06, "loss": 0.4159, "step": 3147 }, { "epoch": 0.51, "grad_norm": 11.748898531703654, "learning_rate": 5.12655837427148e-06, "loss": 0.4366, "step": 3148 }, { "epoch": 0.51, "grad_norm": 8.845501547923442, "learning_rate": 5.123949463553029e-06, "loss": 0.421, "step": 3149 }, { "epoch": 0.51, "grad_norm": 8.116760495223623, "learning_rate": 5.121340519067327e-06, "loss": 0.4696, "step": 3150 }, { "epoch": 0.51, "grad_norm": 13.160256279405793, "learning_rate": 5.118731541525123e-06, "loss": 0.3845, "step": 3151 }, { "epoch": 0.51, "grad_norm": 6.445196954418788, "learning_rate": 5.116122531637174e-06, "loss": 0.47, "step": 3152 }, { "epoch": 0.51, "grad_norm": 20.01810408514814, "learning_rate": 5.113513490114246e-06, "loss": 0.5091, "step": 3153 }, { "epoch": 0.51, "grad_norm": 7.551508561900697, "learning_rate": 5.110904417667115e-06, "loss": 0.4925, "step": 3154 }, { "epoch": 0.51, "grad_norm": 9.167580798866489, "learning_rate": 5.108295315006563e-06, "loss": 0.4855, "step": 3155 }, { "epoch": 0.51, "grad_norm": 6.875021697997089, "learning_rate": 5.1056861828433815e-06, "loss": 0.4352, "step": 3156 }, { "epoch": 0.51, "grad_norm": 7.938787625207812, "learning_rate": 5.1030770218883706e-06, "loss": 0.457, "step": 3157 }, { "epoch": 0.51, "grad_norm": 9.064501761238272, "learning_rate": 5.1004678328523395e-06, "loss": 0.43, "step": 3158 }, { "epoch": 0.51, "grad_norm": 8.12411076819494, "learning_rate": 5.097858616446099e-06, "loss": 0.4569, "step": 3159 }, { "epoch": 0.51, "grad_norm": 8.906620388322995, "learning_rate": 5.0952493733804755e-06, "loss": 0.4269, "step": 3160 }, { "epoch": 0.51, "grad_norm": 12.391253158649452, "learning_rate": 5.092640104366297e-06, "loss": 0.4277, "step": 3161 }, { "epoch": 0.51, "grad_norm": 9.32471092918946, "learning_rate": 5.0900308101143996e-06, "loss": 0.4397, "step": 3162 }, { "epoch": 0.51, "grad_norm": 8.951285899296328, "learning_rate": 5.087421491335629e-06, "loss": 0.4734, "step": 3163 }, { "epoch": 0.51, "grad_norm": 7.255599789091882, "learning_rate": 5.0848121487408316e-06, "loss": 0.4302, "step": 3164 }, { "epoch": 0.51, "grad_norm": 15.442434553740744, "learning_rate": 5.08220278304087e-06, "loss": 0.428, "step": 3165 }, { "epoch": 0.51, "grad_norm": 18.82601737410301, "learning_rate": 5.079593394946603e-06, "loss": 0.4098, "step": 3166 }, { "epoch": 0.51, "grad_norm": 9.84379319270009, "learning_rate": 5.076983985168901e-06, "loss": 0.4539, "step": 3167 }, { "epoch": 0.51, "grad_norm": 9.90659556642468, "learning_rate": 5.074374554418641e-06, "loss": 0.4554, "step": 3168 }, { "epoch": 0.51, "grad_norm": 11.397066710868136, "learning_rate": 5.071765103406702e-06, "loss": 0.4496, "step": 3169 }, { "epoch": 0.51, "grad_norm": 11.263879305209539, "learning_rate": 5.06915563284397e-06, "loss": 0.4647, "step": 3170 }, { "epoch": 0.51, "grad_norm": 7.528724922727628, "learning_rate": 5.066546143441336e-06, "loss": 0.4351, "step": 3171 }, { "epoch": 0.51, "grad_norm": 37.54356147082925, "learning_rate": 5.0639366359097e-06, "loss": 0.4124, "step": 3172 }, { "epoch": 0.51, "grad_norm": 8.665071544218575, "learning_rate": 5.061327110959961e-06, "loss": 0.4289, "step": 3173 }, { "epoch": 0.51, "grad_norm": 12.385387920719054, "learning_rate": 5.058717569303027e-06, "loss": 0.4503, "step": 3174 }, { "epoch": 0.51, "grad_norm": 19.15063352982195, "learning_rate": 5.056108011649807e-06, "loss": 0.4769, "step": 3175 }, { "epoch": 0.51, "grad_norm": 5.640173531036795, "learning_rate": 5.053498438711221e-06, "loss": 0.4878, "step": 3176 }, { "epoch": 0.51, "grad_norm": 8.199516133663327, "learning_rate": 5.050888851198183e-06, "loss": 0.4238, "step": 3177 }, { "epoch": 0.51, "grad_norm": 7.582901980695095, "learning_rate": 5.0482792498216194e-06, "loss": 0.4724, "step": 3178 }, { "epoch": 0.51, "grad_norm": 7.17863000147381, "learning_rate": 5.045669635292458e-06, "loss": 0.4744, "step": 3179 }, { "epoch": 0.51, "grad_norm": 5.154005527197196, "learning_rate": 5.04306000832163e-06, "loss": 0.4127, "step": 3180 }, { "epoch": 0.51, "grad_norm": 8.614642539802718, "learning_rate": 5.040450369620068e-06, "loss": 0.4076, "step": 3181 }, { "epoch": 0.51, "grad_norm": 7.974342715537309, "learning_rate": 5.03784071989871e-06, "loss": 0.4516, "step": 3182 }, { "epoch": 0.51, "grad_norm": 7.174299976122674, "learning_rate": 5.0352310598684965e-06, "loss": 0.3798, "step": 3183 }, { "epoch": 0.51, "grad_norm": 7.3646213021939735, "learning_rate": 5.032621390240371e-06, "loss": 0.4956, "step": 3184 }, { "epoch": 0.51, "grad_norm": 6.55489457133509, "learning_rate": 5.030011711725281e-06, "loss": 0.4494, "step": 3185 }, { "epoch": 0.51, "grad_norm": 6.31283624235548, "learning_rate": 5.027402025034171e-06, "loss": 0.4488, "step": 3186 }, { "epoch": 0.51, "grad_norm": 8.56269849761301, "learning_rate": 5.024792330877997e-06, "loss": 0.4759, "step": 3187 }, { "epoch": 0.51, "grad_norm": 5.464980892181738, "learning_rate": 5.022182629967707e-06, "loss": 0.4537, "step": 3188 }, { "epoch": 0.51, "grad_norm": 5.208495684185179, "learning_rate": 5.019572923014258e-06, "loss": 0.4574, "step": 3189 }, { "epoch": 0.51, "grad_norm": 7.479168826865888, "learning_rate": 5.016963210728604e-06, "loss": 0.386, "step": 3190 }, { "epoch": 0.51, "grad_norm": 5.699488101399249, "learning_rate": 5.014353493821705e-06, "loss": 0.354, "step": 3191 }, { "epoch": 0.51, "grad_norm": 8.722540149369177, "learning_rate": 5.011743773004518e-06, "loss": 0.413, "step": 3192 }, { "epoch": 0.51, "grad_norm": 7.01603451813438, "learning_rate": 5.009134048988004e-06, "loss": 0.4014, "step": 3193 }, { "epoch": 0.51, "grad_norm": 7.4619289800786675, "learning_rate": 5.006524322483123e-06, "loss": 0.4164, "step": 3194 }, { "epoch": 0.51, "grad_norm": 6.421089848127978, "learning_rate": 5.0039145942008364e-06, "loss": 0.4155, "step": 3195 }, { "epoch": 0.51, "grad_norm": 7.74082298964966, "learning_rate": 5.001304864852106e-06, "loss": 0.5006, "step": 3196 }, { "epoch": 0.52, "grad_norm": 9.56813355396322, "learning_rate": 4.998695135147896e-06, "loss": 0.4417, "step": 3197 }, { "epoch": 0.52, "grad_norm": 5.218033955418096, "learning_rate": 4.996085405799166e-06, "loss": 0.3978, "step": 3198 }, { "epoch": 0.52, "grad_norm": 10.238680888843067, "learning_rate": 4.993475677516878e-06, "loss": 0.4642, "step": 3199 }, { "epoch": 0.52, "grad_norm": 8.769383476423492, "learning_rate": 4.990865951011999e-06, "loss": 0.4639, "step": 3200 }, { "epoch": 0.52, "grad_norm": 6.7344999244144175, "learning_rate": 4.9882562269954835e-06, "loss": 0.4304, "step": 3201 }, { "epoch": 0.52, "grad_norm": 7.818351026349519, "learning_rate": 4.985646506178296e-06, "loss": 0.448, "step": 3202 }, { "epoch": 0.52, "grad_norm": 5.396901218275492, "learning_rate": 4.983036789271398e-06, "loss": 0.4126, "step": 3203 }, { "epoch": 0.52, "grad_norm": 10.332381905235522, "learning_rate": 4.980427076985744e-06, "loss": 0.4669, "step": 3204 }, { "epoch": 0.52, "grad_norm": 6.552738535529104, "learning_rate": 4.977817370032294e-06, "loss": 0.3936, "step": 3205 }, { "epoch": 0.52, "grad_norm": 7.903559338216413, "learning_rate": 4.975207669122005e-06, "loss": 0.484, "step": 3206 }, { "epoch": 0.52, "grad_norm": 8.578625600194476, "learning_rate": 4.97259797496583e-06, "loss": 0.4146, "step": 3207 }, { "epoch": 0.52, "grad_norm": 8.693772792008131, "learning_rate": 4.9699882882747205e-06, "loss": 0.4651, "step": 3208 }, { "epoch": 0.52, "grad_norm": 7.5363770574200775, "learning_rate": 4.967378609759629e-06, "loss": 0.4936, "step": 3209 }, { "epoch": 0.52, "grad_norm": 11.037016400219448, "learning_rate": 4.964768940131505e-06, "loss": 0.4497, "step": 3210 }, { "epoch": 0.52, "grad_norm": 1.4669779357922395, "learning_rate": 4.962159280101292e-06, "loss": 0.4945, "step": 3211 }, { "epoch": 0.52, "grad_norm": 4.912619470396417, "learning_rate": 4.959549630379934e-06, "loss": 0.4214, "step": 3212 }, { "epoch": 0.52, "grad_norm": 8.19765756419675, "learning_rate": 4.9569399916783724e-06, "loss": 0.4388, "step": 3213 }, { "epoch": 0.52, "grad_norm": 1.304027798950356, "learning_rate": 4.954330364707543e-06, "loss": 0.4678, "step": 3214 }, { "epoch": 0.52, "grad_norm": 6.225864610431844, "learning_rate": 4.9517207501783805e-06, "loss": 0.4518, "step": 3215 }, { "epoch": 0.52, "grad_norm": 4.962754802835571, "learning_rate": 4.949111148801819e-06, "loss": 0.4363, "step": 3216 }, { "epoch": 0.52, "grad_norm": 6.640241543859871, "learning_rate": 4.9465015612887815e-06, "loss": 0.4657, "step": 3217 }, { "epoch": 0.52, "grad_norm": 9.589996735112647, "learning_rate": 4.9438919883501934e-06, "loss": 0.5225, "step": 3218 }, { "epoch": 0.52, "grad_norm": 6.680593210614576, "learning_rate": 4.941282430696976e-06, "loss": 0.4799, "step": 3219 }, { "epoch": 0.52, "grad_norm": 10.615049125574775, "learning_rate": 4.938672889040041e-06, "loss": 0.425, "step": 3220 }, { "epoch": 0.52, "grad_norm": 6.338346451566445, "learning_rate": 4.936063364090302e-06, "loss": 0.4185, "step": 3221 }, { "epoch": 0.52, "grad_norm": 9.947471769384006, "learning_rate": 4.933453856558666e-06, "loss": 0.4888, "step": 3222 }, { "epoch": 0.52, "grad_norm": 9.263995730780012, "learning_rate": 4.930844367156032e-06, "loss": 0.4518, "step": 3223 }, { "epoch": 0.52, "grad_norm": 1.5495191993647182, "learning_rate": 4.9282348965933e-06, "loss": 0.5129, "step": 3224 }, { "epoch": 0.52, "grad_norm": 1.2757601222250934, "learning_rate": 4.925625445581361e-06, "loss": 0.4801, "step": 3225 }, { "epoch": 0.52, "grad_norm": 6.251884362999813, "learning_rate": 4.923016014831099e-06, "loss": 0.3793, "step": 3226 }, { "epoch": 0.52, "grad_norm": 6.7464754682644035, "learning_rate": 4.9204066050533975e-06, "loss": 0.3799, "step": 3227 }, { "epoch": 0.52, "grad_norm": 7.4799308269106435, "learning_rate": 4.91779721695913e-06, "loss": 0.3796, "step": 3228 }, { "epoch": 0.52, "grad_norm": 9.847177443266865, "learning_rate": 4.915187851259169e-06, "loss": 0.3261, "step": 3229 }, { "epoch": 0.52, "grad_norm": 16.55318391205063, "learning_rate": 4.9125785086643726e-06, "loss": 0.3903, "step": 3230 }, { "epoch": 0.52, "grad_norm": 10.6058240412937, "learning_rate": 4.9099691898856e-06, "loss": 0.4462, "step": 3231 }, { "epoch": 0.52, "grad_norm": 1.0516805345376652, "learning_rate": 4.907359895633705e-06, "loss": 0.4591, "step": 3232 }, { "epoch": 0.52, "grad_norm": 8.922762037333175, "learning_rate": 4.904750626619525e-06, "loss": 0.4699, "step": 3233 }, { "epoch": 0.52, "grad_norm": 6.736964785025565, "learning_rate": 4.902141383553901e-06, "loss": 0.4723, "step": 3234 }, { "epoch": 0.52, "grad_norm": 6.374462322562105, "learning_rate": 4.899532167147662e-06, "loss": 0.3979, "step": 3235 }, { "epoch": 0.52, "grad_norm": 8.349469302762756, "learning_rate": 4.89692297811163e-06, "loss": 0.4259, "step": 3236 }, { "epoch": 0.52, "grad_norm": 8.790210302078131, "learning_rate": 4.8943138171566184e-06, "loss": 0.4178, "step": 3237 }, { "epoch": 0.52, "grad_norm": 7.993080409280082, "learning_rate": 4.89170468499344e-06, "loss": 0.4443, "step": 3238 }, { "epoch": 0.52, "grad_norm": 9.843376587122243, "learning_rate": 4.889095582332887e-06, "loss": 0.4894, "step": 3239 }, { "epoch": 0.52, "grad_norm": 7.682398306055466, "learning_rate": 4.886486509885755e-06, "loss": 0.3884, "step": 3240 }, { "epoch": 0.52, "grad_norm": 10.548234451592345, "learning_rate": 4.883877468362828e-06, "loss": 0.4769, "step": 3241 }, { "epoch": 0.52, "grad_norm": 6.027812752610894, "learning_rate": 4.88126845847488e-06, "loss": 0.4136, "step": 3242 }, { "epoch": 0.52, "grad_norm": 7.656706476275129, "learning_rate": 4.878659480932674e-06, "loss": 0.3864, "step": 3243 }, { "epoch": 0.52, "grad_norm": 6.234740228312786, "learning_rate": 4.876050536446973e-06, "loss": 0.4032, "step": 3244 }, { "epoch": 0.52, "grad_norm": 13.771710670820658, "learning_rate": 4.8734416257285215e-06, "loss": 0.4426, "step": 3245 }, { "epoch": 0.52, "grad_norm": 10.76068678183386, "learning_rate": 4.870832749488058e-06, "loss": 0.4358, "step": 3246 }, { "epoch": 0.52, "grad_norm": 7.831224507993771, "learning_rate": 4.868223908436316e-06, "loss": 0.4289, "step": 3247 }, { "epoch": 0.52, "grad_norm": 10.620516785995491, "learning_rate": 4.865615103284014e-06, "loss": 0.4556, "step": 3248 }, { "epoch": 0.52, "grad_norm": 8.642129847768723, "learning_rate": 4.863006334741861e-06, "loss": 0.4121, "step": 3249 }, { "epoch": 0.52, "grad_norm": 6.997940337448587, "learning_rate": 4.860397603520559e-06, "loss": 0.5232, "step": 3250 }, { "epoch": 0.52, "grad_norm": 8.6888717423334, "learning_rate": 4.8577889103308e-06, "loss": 0.4285, "step": 3251 }, { "epoch": 0.52, "grad_norm": 8.064574250932244, "learning_rate": 4.855180255883259e-06, "loss": 0.4322, "step": 3252 }, { "epoch": 0.52, "grad_norm": 15.809160657905979, "learning_rate": 4.852571640888609e-06, "loss": 0.4233, "step": 3253 }, { "epoch": 0.52, "grad_norm": 10.771742827935897, "learning_rate": 4.84996306605751e-06, "loss": 0.5075, "step": 3254 }, { "epoch": 0.52, "grad_norm": 15.963208413064082, "learning_rate": 4.847354532100606e-06, "loss": 0.4981, "step": 3255 }, { "epoch": 0.52, "grad_norm": 12.312511769415389, "learning_rate": 4.844746039728535e-06, "loss": 0.4629, "step": 3256 }, { "epoch": 0.52, "grad_norm": 7.7846026897490574, "learning_rate": 4.842137589651923e-06, "loss": 0.4286, "step": 3257 }, { "epoch": 0.52, "grad_norm": 8.716225263554918, "learning_rate": 4.8395291825813824e-06, "loss": 0.36, "step": 3258 }, { "epoch": 0.53, "grad_norm": 6.775068069989327, "learning_rate": 4.836920819227514e-06, "loss": 0.4722, "step": 3259 }, { "epoch": 0.53, "grad_norm": 16.483548625918782, "learning_rate": 4.834312500300915e-06, "loss": 0.4417, "step": 3260 }, { "epoch": 0.53, "grad_norm": 6.066213906225504, "learning_rate": 4.831704226512153e-06, "loss": 0.4317, "step": 3261 }, { "epoch": 0.53, "grad_norm": 7.026019939585913, "learning_rate": 4.829095998571801e-06, "loss": 0.453, "step": 3262 }, { "epoch": 0.53, "grad_norm": 7.668837373565955, "learning_rate": 4.826487817190411e-06, "loss": 0.4663, "step": 3263 }, { "epoch": 0.53, "grad_norm": 8.357696914469692, "learning_rate": 4.8238796830785206e-06, "loss": 0.4167, "step": 3264 }, { "epoch": 0.53, "grad_norm": 10.61704735251367, "learning_rate": 4.821271596946659e-06, "loss": 0.4522, "step": 3265 }, { "epoch": 0.53, "grad_norm": 7.234798358369616, "learning_rate": 4.818663559505341e-06, "loss": 0.4076, "step": 3266 }, { "epoch": 0.53, "grad_norm": 8.313799338989178, "learning_rate": 4.816055571465071e-06, "loss": 0.4501, "step": 3267 }, { "epoch": 0.53, "grad_norm": 22.07417178178273, "learning_rate": 4.813447633536331e-06, "loss": 0.4276, "step": 3268 }, { "epoch": 0.53, "grad_norm": 1.2356274572792492, "learning_rate": 4.810839746429598e-06, "loss": 0.4565, "step": 3269 }, { "epoch": 0.53, "grad_norm": 8.413133051696658, "learning_rate": 4.808231910855335e-06, "loss": 0.4599, "step": 3270 }, { "epoch": 0.53, "grad_norm": 5.870987924254158, "learning_rate": 4.805624127523982e-06, "loss": 0.4528, "step": 3271 }, { "epoch": 0.53, "grad_norm": 6.387744826890694, "learning_rate": 4.8030163971459765e-06, "loss": 0.3826, "step": 3272 }, { "epoch": 0.53, "grad_norm": 5.4389605678258945, "learning_rate": 4.800408720431735e-06, "loss": 0.3771, "step": 3273 }, { "epoch": 0.53, "grad_norm": 12.146302108638302, "learning_rate": 4.797801098091659e-06, "loss": 0.5332, "step": 3274 }, { "epoch": 0.53, "grad_norm": 14.623552266455311, "learning_rate": 4.795193530836136e-06, "loss": 0.5391, "step": 3275 }, { "epoch": 0.53, "grad_norm": 6.646356063716102, "learning_rate": 4.792586019375543e-06, "loss": 0.4149, "step": 3276 }, { "epoch": 0.53, "grad_norm": 8.043754738826692, "learning_rate": 4.789978564420235e-06, "loss": 0.4427, "step": 3277 }, { "epoch": 0.53, "grad_norm": 7.173615627124765, "learning_rate": 4.787371166680555e-06, "loss": 0.3861, "step": 3278 }, { "epoch": 0.53, "grad_norm": 11.39499931391525, "learning_rate": 4.784763826866831e-06, "loss": 0.4053, "step": 3279 }, { "epoch": 0.53, "grad_norm": 6.286508499635275, "learning_rate": 4.782156545689373e-06, "loss": 0.3881, "step": 3280 }, { "epoch": 0.53, "grad_norm": 9.275607912510944, "learning_rate": 4.779549323858476e-06, "loss": 0.4994, "step": 3281 }, { "epoch": 0.53, "grad_norm": 5.138043520917561, "learning_rate": 4.776942162084423e-06, "loss": 0.3791, "step": 3282 }, { "epoch": 0.53, "grad_norm": 7.301236075288075, "learning_rate": 4.77433506107747e-06, "loss": 0.4117, "step": 3283 }, { "epoch": 0.53, "grad_norm": 5.71698639630238, "learning_rate": 4.771728021547868e-06, "loss": 0.3792, "step": 3284 }, { "epoch": 0.53, "grad_norm": 5.168082779960844, "learning_rate": 4.769121044205847e-06, "loss": 0.4505, "step": 3285 }, { "epoch": 0.53, "grad_norm": 8.017376726043553, "learning_rate": 4.766514129761616e-06, "loss": 0.4278, "step": 3286 }, { "epoch": 0.53, "grad_norm": 8.521431120661022, "learning_rate": 4.763907278925372e-06, "loss": 0.4617, "step": 3287 }, { "epoch": 0.53, "grad_norm": 6.124769879776053, "learning_rate": 4.761300492407293e-06, "loss": 0.4533, "step": 3288 }, { "epoch": 0.53, "grad_norm": 6.606607693241983, "learning_rate": 4.758693770917543e-06, "loss": 0.4703, "step": 3289 }, { "epoch": 0.53, "grad_norm": 8.516505479091492, "learning_rate": 4.756087115166257e-06, "loss": 0.4228, "step": 3290 }, { "epoch": 0.53, "grad_norm": 7.388999064185464, "learning_rate": 4.753480525863566e-06, "loss": 0.3979, "step": 3291 }, { "epoch": 0.53, "grad_norm": 5.437539219725819, "learning_rate": 4.750874003719577e-06, "loss": 0.481, "step": 3292 }, { "epoch": 0.53, "grad_norm": 14.92193121847193, "learning_rate": 4.7482675494443745e-06, "loss": 0.4296, "step": 3293 }, { "epoch": 0.53, "grad_norm": 5.531106221388708, "learning_rate": 4.745661163748031e-06, "loss": 0.4271, "step": 3294 }, { "epoch": 0.53, "grad_norm": 7.435712005507817, "learning_rate": 4.743054847340598e-06, "loss": 0.4044, "step": 3295 }, { "epoch": 0.53, "grad_norm": 4.978765435992806, "learning_rate": 4.740448600932107e-06, "loss": 0.3826, "step": 3296 }, { "epoch": 0.53, "grad_norm": 20.565732379267395, "learning_rate": 4.73784242523257e-06, "loss": 0.4897, "step": 3297 }, { "epoch": 0.53, "grad_norm": 5.300838360840096, "learning_rate": 4.735236320951986e-06, "loss": 0.4637, "step": 3298 }, { "epoch": 0.53, "grad_norm": 15.743621227201395, "learning_rate": 4.7326302888003224e-06, "loss": 0.5051, "step": 3299 }, { "epoch": 0.53, "grad_norm": 7.099566468745147, "learning_rate": 4.730024329487541e-06, "loss": 0.4893, "step": 3300 }, { "epoch": 0.53, "grad_norm": 6.294060126093305, "learning_rate": 4.7274184437235744e-06, "loss": 0.4727, "step": 3301 }, { "epoch": 0.53, "grad_norm": 11.08107344013705, "learning_rate": 4.7248126322183364e-06, "loss": 0.4077, "step": 3302 }, { "epoch": 0.53, "grad_norm": 9.235681587080398, "learning_rate": 4.722206895681723e-06, "loss": 0.3926, "step": 3303 }, { "epoch": 0.53, "grad_norm": 4.5395705284714865, "learning_rate": 4.71960123482361e-06, "loss": 0.481, "step": 3304 }, { "epoch": 0.53, "grad_norm": 8.045546721619823, "learning_rate": 4.716995650353847e-06, "loss": 0.4398, "step": 3305 }, { "epoch": 0.53, "grad_norm": 6.780349409781917, "learning_rate": 4.714390142982272e-06, "loss": 0.4515, "step": 3306 }, { "epoch": 0.53, "grad_norm": 9.988003419463865, "learning_rate": 4.711784713418694e-06, "loss": 0.5617, "step": 3307 }, { "epoch": 0.53, "grad_norm": 10.121728223924425, "learning_rate": 4.709179362372906e-06, "loss": 0.4211, "step": 3308 }, { "epoch": 0.53, "grad_norm": 11.549337597089705, "learning_rate": 4.706574090554676e-06, "loss": 0.4652, "step": 3309 }, { "epoch": 0.53, "grad_norm": 9.490114941211655, "learning_rate": 4.70396889867375e-06, "loss": 0.4872, "step": 3310 }, { "epoch": 0.53, "grad_norm": 10.893928551943358, "learning_rate": 4.70136378743986e-06, "loss": 0.4701, "step": 3311 }, { "epoch": 0.53, "grad_norm": 7.535358906081407, "learning_rate": 4.698758757562703e-06, "loss": 0.3832, "step": 3312 }, { "epoch": 0.53, "grad_norm": 4.946944298110637, "learning_rate": 4.696153809751964e-06, "loss": 0.3951, "step": 3313 }, { "epoch": 0.53, "grad_norm": 1.2181003488789302, "learning_rate": 4.693548944717305e-06, "loss": 0.4613, "step": 3314 }, { "epoch": 0.53, "grad_norm": 5.844560734263103, "learning_rate": 4.69094416316836e-06, "loss": 0.4224, "step": 3315 }, { "epoch": 0.53, "grad_norm": 6.921751409937989, "learning_rate": 4.688339465814744e-06, "loss": 0.423, "step": 3316 }, { "epoch": 0.53, "grad_norm": 7.631048747557401, "learning_rate": 4.685734853366049e-06, "loss": 0.4565, "step": 3317 }, { "epoch": 0.53, "grad_norm": 9.519475638284781, "learning_rate": 4.683130326531842e-06, "loss": 0.4706, "step": 3318 }, { "epoch": 0.53, "grad_norm": 7.428074653653947, "learning_rate": 4.6805258860216675e-06, "loss": 0.4374, "step": 3319 }, { "epoch": 0.53, "grad_norm": 5.313694257207128, "learning_rate": 4.677921532545052e-06, "loss": 0.456, "step": 3320 }, { "epoch": 0.54, "grad_norm": 8.849316593749629, "learning_rate": 4.675317266811485e-06, "loss": 0.4398, "step": 3321 }, { "epoch": 0.54, "grad_norm": 8.025838354173722, "learning_rate": 4.672713089530445e-06, "loss": 0.3967, "step": 3322 }, { "epoch": 0.54, "grad_norm": 7.310205119422886, "learning_rate": 4.670109001411383e-06, "loss": 0.4866, "step": 3323 }, { "epoch": 0.54, "grad_norm": 6.444862323027851, "learning_rate": 4.6675050031637216e-06, "loss": 0.4797, "step": 3324 }, { "epoch": 0.54, "grad_norm": 6.9917143365153684, "learning_rate": 4.664901095496863e-06, "loss": 0.4421, "step": 3325 }, { "epoch": 0.54, "grad_norm": 6.68857145607652, "learning_rate": 4.662297279120181e-06, "loss": 0.467, "step": 3326 }, { "epoch": 0.54, "grad_norm": 6.450331140259409, "learning_rate": 4.659693554743032e-06, "loss": 0.4523, "step": 3327 }, { "epoch": 0.54, "grad_norm": 6.138056485232844, "learning_rate": 4.657089923074737e-06, "loss": 0.4127, "step": 3328 }, { "epoch": 0.54, "grad_norm": 5.353099062623064, "learning_rate": 4.6544863848246e-06, "loss": 0.3113, "step": 3329 }, { "epoch": 0.54, "grad_norm": 6.573266462098007, "learning_rate": 4.651882940701897e-06, "loss": 0.4299, "step": 3330 }, { "epoch": 0.54, "grad_norm": 5.986807043582329, "learning_rate": 4.649279591415876e-06, "loss": 0.4611, "step": 3331 }, { "epoch": 0.54, "grad_norm": 7.456177625235982, "learning_rate": 4.646676337675762e-06, "loss": 0.488, "step": 3332 }, { "epoch": 0.54, "grad_norm": 1.3643161604762688, "learning_rate": 4.644073180190753e-06, "loss": 0.4455, "step": 3333 }, { "epoch": 0.54, "grad_norm": 6.343416304766303, "learning_rate": 4.64147011967002e-06, "loss": 0.4093, "step": 3334 }, { "epoch": 0.54, "grad_norm": 9.36574093433074, "learning_rate": 4.638867156822711e-06, "loss": 0.3843, "step": 3335 }, { "epoch": 0.54, "grad_norm": 7.067264187857168, "learning_rate": 4.636264292357943e-06, "loss": 0.4281, "step": 3336 }, { "epoch": 0.54, "grad_norm": 7.982734015017322, "learning_rate": 4.633661526984808e-06, "loss": 0.468, "step": 3337 }, { "epoch": 0.54, "grad_norm": 8.214565995373139, "learning_rate": 4.63105886141237e-06, "loss": 0.4038, "step": 3338 }, { "epoch": 0.54, "grad_norm": 16.32759485837536, "learning_rate": 4.628456296349669e-06, "loss": 0.498, "step": 3339 }, { "epoch": 0.54, "grad_norm": 9.408589100677231, "learning_rate": 4.6258538325057145e-06, "loss": 0.448, "step": 3340 }, { "epoch": 0.54, "grad_norm": 10.100266373879446, "learning_rate": 4.623251470589488e-06, "loss": 0.4797, "step": 3341 }, { "epoch": 0.54, "grad_norm": 6.934670092737477, "learning_rate": 4.62064921130995e-06, "loss": 0.4749, "step": 3342 }, { "epoch": 0.54, "grad_norm": 6.080513029104854, "learning_rate": 4.618047055376019e-06, "loss": 0.426, "step": 3343 }, { "epoch": 0.54, "grad_norm": 7.996952381641377, "learning_rate": 4.615445003496602e-06, "loss": 0.3745, "step": 3344 }, { "epoch": 0.54, "grad_norm": 6.581790745784809, "learning_rate": 4.612843056380564e-06, "loss": 0.4046, "step": 3345 }, { "epoch": 0.54, "grad_norm": 14.960138212332955, "learning_rate": 4.610241214736751e-06, "loss": 0.3596, "step": 3346 }, { "epoch": 0.54, "grad_norm": 7.305584947863702, "learning_rate": 4.607639479273976e-06, "loss": 0.4126, "step": 3347 }, { "epoch": 0.54, "grad_norm": 8.981537767765301, "learning_rate": 4.605037850701019e-06, "loss": 0.4994, "step": 3348 }, { "epoch": 0.54, "grad_norm": 12.499253997744956, "learning_rate": 4.602436329726643e-06, "loss": 0.4995, "step": 3349 }, { "epoch": 0.54, "grad_norm": 6.02415974888144, "learning_rate": 4.599834917059565e-06, "loss": 0.5045, "step": 3350 }, { "epoch": 0.54, "grad_norm": 8.64220047727567, "learning_rate": 4.597233613408488e-06, "loss": 0.4166, "step": 3351 }, { "epoch": 0.54, "grad_norm": 8.884072875711894, "learning_rate": 4.5946324194820795e-06, "loss": 0.4831, "step": 3352 }, { "epoch": 0.54, "grad_norm": 7.279347768775072, "learning_rate": 4.592031335988971e-06, "loss": 0.4899, "step": 3353 }, { "epoch": 0.54, "grad_norm": 7.185293329294756, "learning_rate": 4.589430363637773e-06, "loss": 0.4395, "step": 3354 }, { "epoch": 0.54, "grad_norm": 6.638959561914513, "learning_rate": 4.5868295031370625e-06, "loss": 0.4438, "step": 3355 }, { "epoch": 0.54, "grad_norm": 11.493372569380933, "learning_rate": 4.584228755195382e-06, "loss": 0.4356, "step": 3356 }, { "epoch": 0.54, "grad_norm": 11.552985044818614, "learning_rate": 4.581628120521251e-06, "loss": 0.4326, "step": 3357 }, { "epoch": 0.54, "grad_norm": 6.160875125254548, "learning_rate": 4.579027599823153e-06, "loss": 0.4143, "step": 3358 }, { "epoch": 0.54, "grad_norm": 6.011579050794526, "learning_rate": 4.5764271938095405e-06, "loss": 0.39, "step": 3359 }, { "epoch": 0.54, "grad_norm": 6.23525210595496, "learning_rate": 4.573826903188836e-06, "loss": 0.5036, "step": 3360 }, { "epoch": 0.54, "grad_norm": 1.1281370541847753, "learning_rate": 4.571226728669432e-06, "loss": 0.4627, "step": 3361 }, { "epoch": 0.54, "grad_norm": 15.609299410102988, "learning_rate": 4.568626670959684e-06, "loss": 0.4318, "step": 3362 }, { "epoch": 0.54, "grad_norm": 6.581426143431007, "learning_rate": 4.566026730767922e-06, "loss": 0.4761, "step": 3363 }, { "epoch": 0.54, "grad_norm": 7.624384769865726, "learning_rate": 4.5634269088024434e-06, "loss": 0.4591, "step": 3364 }, { "epoch": 0.54, "grad_norm": 10.770952137295978, "learning_rate": 4.560827205771506e-06, "loss": 0.4322, "step": 3365 }, { "epoch": 0.54, "grad_norm": 11.361493142898524, "learning_rate": 4.558227622383345e-06, "loss": 0.4367, "step": 3366 }, { "epoch": 0.54, "grad_norm": 5.583502725660896, "learning_rate": 4.555628159346158e-06, "loss": 0.4636, "step": 3367 }, { "epoch": 0.54, "grad_norm": 10.275416916262637, "learning_rate": 4.55302881736811e-06, "loss": 0.4436, "step": 3368 }, { "epoch": 0.54, "grad_norm": 8.94653162161526, "learning_rate": 4.550429597157331e-06, "loss": 0.4219, "step": 3369 }, { "epoch": 0.54, "grad_norm": 7.707972573054583, "learning_rate": 4.547830499421921e-06, "loss": 0.3876, "step": 3370 }, { "epoch": 0.54, "grad_norm": 8.056059756574957, "learning_rate": 4.545231524869952e-06, "loss": 0.3804, "step": 3371 }, { "epoch": 0.54, "grad_norm": 9.768783473244461, "learning_rate": 4.542632674209447e-06, "loss": 0.3962, "step": 3372 }, { "epoch": 0.54, "grad_norm": 9.453477678226667, "learning_rate": 4.54003394814841e-06, "loss": 0.3495, "step": 3373 }, { "epoch": 0.54, "grad_norm": 9.438729644808358, "learning_rate": 4.537435347394807e-06, "loss": 0.4212, "step": 3374 }, { "epoch": 0.54, "grad_norm": 8.837001294294776, "learning_rate": 4.534836872656563e-06, "loss": 0.4371, "step": 3375 }, { "epoch": 0.54, "grad_norm": 5.415510685959797, "learning_rate": 4.532238524641578e-06, "loss": 0.4048, "step": 3376 }, { "epoch": 0.54, "grad_norm": 7.753633627335367, "learning_rate": 4.529640304057714e-06, "loss": 0.4105, "step": 3377 }, { "epoch": 0.54, "grad_norm": 8.635286562765643, "learning_rate": 4.527042211612796e-06, "loss": 0.4637, "step": 3378 }, { "epoch": 0.54, "grad_norm": 7.027102779401307, "learning_rate": 4.524444248014615e-06, "loss": 0.4621, "step": 3379 }, { "epoch": 0.54, "grad_norm": 37.19150062910144, "learning_rate": 4.521846413970932e-06, "loss": 0.4799, "step": 3380 }, { "epoch": 0.54, "grad_norm": 7.501783972594417, "learning_rate": 4.519248710189465e-06, "loss": 0.4457, "step": 3381 }, { "epoch": 0.54, "grad_norm": 1.2070566048443694, "learning_rate": 4.516651137377902e-06, "loss": 0.5095, "step": 3382 }, { "epoch": 0.55, "grad_norm": 48.83425910903912, "learning_rate": 4.514053696243893e-06, "loss": 0.4635, "step": 3383 }, { "epoch": 0.55, "grad_norm": 5.784641510771142, "learning_rate": 4.511456387495052e-06, "loss": 0.4801, "step": 3384 }, { "epoch": 0.55, "grad_norm": 9.417376346266119, "learning_rate": 4.508859211838955e-06, "loss": 0.4022, "step": 3385 }, { "epoch": 0.55, "grad_norm": 6.5602398297054405, "learning_rate": 4.50626216998315e-06, "loss": 0.4091, "step": 3386 }, { "epoch": 0.55, "grad_norm": 7.46714993901338, "learning_rate": 4.503665262635141e-06, "loss": 0.3541, "step": 3387 }, { "epoch": 0.55, "grad_norm": 7.049172970818485, "learning_rate": 4.501068490502394e-06, "loss": 0.4323, "step": 3388 }, { "epoch": 0.55, "grad_norm": 11.120056323171253, "learning_rate": 4.498471854292344e-06, "loss": 0.4153, "step": 3389 }, { "epoch": 0.55, "grad_norm": 13.08604013280653, "learning_rate": 4.495875354712386e-06, "loss": 0.4173, "step": 3390 }, { "epoch": 0.55, "grad_norm": 8.692906690975171, "learning_rate": 4.493278992469877e-06, "loss": 0.5367, "step": 3391 }, { "epoch": 0.55, "grad_norm": 7.723231631289586, "learning_rate": 4.4906827682721375e-06, "loss": 0.4141, "step": 3392 }, { "epoch": 0.55, "grad_norm": 40.89502386868, "learning_rate": 4.488086682826454e-06, "loss": 0.4422, "step": 3393 }, { "epoch": 0.55, "grad_norm": 5.459800567126894, "learning_rate": 4.485490736840066e-06, "loss": 0.4705, "step": 3394 }, { "epoch": 0.55, "grad_norm": 1.3608876906379102, "learning_rate": 4.482894931020185e-06, "loss": 0.4635, "step": 3395 }, { "epoch": 0.55, "grad_norm": 13.571433855326525, "learning_rate": 4.48029926607398e-06, "loss": 0.4585, "step": 3396 }, { "epoch": 0.55, "grad_norm": 5.06698804186974, "learning_rate": 4.477703742708579e-06, "loss": 0.4198, "step": 3397 }, { "epoch": 0.55, "grad_norm": 7.014277717239853, "learning_rate": 4.475108361631076e-06, "loss": 0.4256, "step": 3398 }, { "epoch": 0.55, "grad_norm": 4.443084231751913, "learning_rate": 4.472513123548525e-06, "loss": 0.4285, "step": 3399 }, { "epoch": 0.55, "grad_norm": 4.027008386119307, "learning_rate": 4.469918029167939e-06, "loss": 0.4071, "step": 3400 }, { "epoch": 0.55, "grad_norm": 9.905486381752056, "learning_rate": 4.467323079196292e-06, "loss": 0.4614, "step": 3401 }, { "epoch": 0.55, "grad_norm": 6.515826442824798, "learning_rate": 4.464728274340525e-06, "loss": 0.4052, "step": 3402 }, { "epoch": 0.55, "grad_norm": 7.666701458144494, "learning_rate": 4.462133615307527e-06, "loss": 0.3932, "step": 3403 }, { "epoch": 0.55, "grad_norm": 15.644816529029521, "learning_rate": 4.45953910280416e-06, "loss": 0.457, "step": 3404 }, { "epoch": 0.55, "grad_norm": 32.281226202294235, "learning_rate": 4.4569447375372396e-06, "loss": 0.4086, "step": 3405 }, { "epoch": 0.55, "grad_norm": 6.6559834994105085, "learning_rate": 4.454350520213543e-06, "loss": 0.4498, "step": 3406 }, { "epoch": 0.55, "grad_norm": 1.0882312619900265, "learning_rate": 4.451756451539804e-06, "loss": 0.4791, "step": 3407 }, { "epoch": 0.55, "grad_norm": 8.794203687745213, "learning_rate": 4.44916253222272e-06, "loss": 0.397, "step": 3408 }, { "epoch": 0.55, "grad_norm": 10.373612177381668, "learning_rate": 4.446568762968947e-06, "loss": 0.4609, "step": 3409 }, { "epoch": 0.55, "grad_norm": 5.846940997076772, "learning_rate": 4.443975144485099e-06, "loss": 0.4563, "step": 3410 }, { "epoch": 0.55, "grad_norm": 9.351003833760304, "learning_rate": 4.441381677477748e-06, "loss": 0.4521, "step": 3411 }, { "epoch": 0.55, "grad_norm": 12.57071861357142, "learning_rate": 4.438788362653426e-06, "loss": 0.4169, "step": 3412 }, { "epoch": 0.55, "grad_norm": 4.93254865414392, "learning_rate": 4.436195200718625e-06, "loss": 0.4434, "step": 3413 }, { "epoch": 0.55, "grad_norm": 8.282090971033764, "learning_rate": 4.43360219237979e-06, "loss": 0.3893, "step": 3414 }, { "epoch": 0.55, "grad_norm": 4.5814903779140925, "learning_rate": 4.431009338343335e-06, "loss": 0.448, "step": 3415 }, { "epoch": 0.55, "grad_norm": 7.187124031880494, "learning_rate": 4.428416639315616e-06, "loss": 0.4619, "step": 3416 }, { "epoch": 0.55, "grad_norm": 6.120725231807672, "learning_rate": 4.425824096002962e-06, "loss": 0.4654, "step": 3417 }, { "epoch": 0.55, "grad_norm": 3.8285222512380135, "learning_rate": 4.423231709111653e-06, "loss": 0.4827, "step": 3418 }, { "epoch": 0.55, "grad_norm": 4.878570546858172, "learning_rate": 4.420639479347924e-06, "loss": 0.4019, "step": 3419 }, { "epoch": 0.55, "grad_norm": 7.514084877046083, "learning_rate": 4.4180474074179705e-06, "loss": 0.4901, "step": 3420 }, { "epoch": 0.55, "grad_norm": 7.038843681951046, "learning_rate": 4.415455494027946e-06, "loss": 0.4234, "step": 3421 }, { "epoch": 0.55, "grad_norm": 6.117266795578791, "learning_rate": 4.412863739883958e-06, "loss": 0.4576, "step": 3422 }, { "epoch": 0.55, "grad_norm": 5.061870633799726, "learning_rate": 4.410272145692069e-06, "loss": 0.4392, "step": 3423 }, { "epoch": 0.55, "grad_norm": 3.8862794940787184, "learning_rate": 4.407680712158308e-06, "loss": 0.4954, "step": 3424 }, { "epoch": 0.55, "grad_norm": 1.3002227617701827, "learning_rate": 4.405089439988645e-06, "loss": 0.5078, "step": 3425 }, { "epoch": 0.55, "grad_norm": 10.696716583327925, "learning_rate": 4.402498329889018e-06, "loss": 0.4282, "step": 3426 }, { "epoch": 0.55, "grad_norm": 5.5085646349168575, "learning_rate": 4.399907382565316e-06, "loss": 0.4796, "step": 3427 }, { "epoch": 0.55, "grad_norm": 4.293745563617675, "learning_rate": 4.397316598723385e-06, "loss": 0.3997, "step": 3428 }, { "epoch": 0.55, "grad_norm": 10.471068981386365, "learning_rate": 4.394725979069026e-06, "loss": 0.4171, "step": 3429 }, { "epoch": 0.55, "grad_norm": 6.180680695010223, "learning_rate": 4.392135524307993e-06, "loss": 0.4627, "step": 3430 }, { "epoch": 0.55, "grad_norm": 5.9907302300562355, "learning_rate": 4.389545235146003e-06, "loss": 0.4042, "step": 3431 }, { "epoch": 0.55, "grad_norm": 10.59490501434163, "learning_rate": 4.386955112288714e-06, "loss": 0.4856, "step": 3432 }, { "epoch": 0.55, "grad_norm": 5.423085616935056, "learning_rate": 4.384365156441752e-06, "loss": 0.4092, "step": 3433 }, { "epoch": 0.55, "grad_norm": 14.756201447757219, "learning_rate": 4.381775368310694e-06, "loss": 0.3825, "step": 3434 }, { "epoch": 0.55, "grad_norm": 11.118937096646885, "learning_rate": 4.3791857486010655e-06, "loss": 0.439, "step": 3435 }, { "epoch": 0.55, "grad_norm": 5.128043504056167, "learning_rate": 4.3765962980183515e-06, "loss": 0.4997, "step": 3436 }, { "epoch": 0.55, "grad_norm": 7.1122657749917595, "learning_rate": 4.374007017267991e-06, "loss": 0.3894, "step": 3437 }, { "epoch": 0.55, "grad_norm": 5.612449101145293, "learning_rate": 4.371417907055373e-06, "loss": 0.4304, "step": 3438 }, { "epoch": 0.55, "grad_norm": 8.458107820241239, "learning_rate": 4.3688289680858456e-06, "loss": 0.437, "step": 3439 }, { "epoch": 0.55, "grad_norm": 5.067725231643833, "learning_rate": 4.366240201064705e-06, "loss": 0.462, "step": 3440 }, { "epoch": 0.55, "grad_norm": 13.330828103307006, "learning_rate": 4.363651606697204e-06, "loss": 0.4231, "step": 3441 }, { "epoch": 0.55, "grad_norm": 8.38924920255093, "learning_rate": 4.361063185688546e-06, "loss": 0.3892, "step": 3442 }, { "epoch": 0.55, "grad_norm": 6.27568819112588, "learning_rate": 4.358474938743889e-06, "loss": 0.4501, "step": 3443 }, { "epoch": 0.55, "grad_norm": 6.729684600793394, "learning_rate": 4.355886866568342e-06, "loss": 0.4815, "step": 3444 }, { "epoch": 0.56, "grad_norm": 7.51861301174476, "learning_rate": 4.353298969866966e-06, "loss": 0.4226, "step": 3445 }, { "epoch": 0.56, "grad_norm": 10.03624715246827, "learning_rate": 4.350711249344778e-06, "loss": 0.4262, "step": 3446 }, { "epoch": 0.56, "grad_norm": 5.839333530457556, "learning_rate": 4.348123705706745e-06, "loss": 0.4446, "step": 3447 }, { "epoch": 0.56, "grad_norm": 4.812619094320759, "learning_rate": 4.345536339657783e-06, "loss": 0.4303, "step": 3448 }, { "epoch": 0.56, "grad_norm": 4.796860758313819, "learning_rate": 4.342949151902761e-06, "loss": 0.4355, "step": 3449 }, { "epoch": 0.56, "grad_norm": 14.59777829572145, "learning_rate": 4.340362143146504e-06, "loss": 0.3922, "step": 3450 }, { "epoch": 0.56, "grad_norm": 7.5269446099193065, "learning_rate": 4.337775314093781e-06, "loss": 0.4762, "step": 3451 }, { "epoch": 0.56, "grad_norm": 6.315530577146882, "learning_rate": 4.335188665449316e-06, "loss": 0.4256, "step": 3452 }, { "epoch": 0.56, "grad_norm": 10.739329546124404, "learning_rate": 4.3326021979177865e-06, "loss": 0.4646, "step": 3453 }, { "epoch": 0.56, "grad_norm": 6.0886515069476905, "learning_rate": 4.330015912203812e-06, "loss": 0.4297, "step": 3454 }, { "epoch": 0.56, "grad_norm": 15.019102731368733, "learning_rate": 4.327429809011973e-06, "loss": 0.4657, "step": 3455 }, { "epoch": 0.56, "grad_norm": 10.034397946411353, "learning_rate": 4.324843889046795e-06, "loss": 0.4372, "step": 3456 }, { "epoch": 0.56, "grad_norm": 8.529004925936166, "learning_rate": 4.322258153012751e-06, "loss": 0.419, "step": 3457 }, { "epoch": 0.56, "grad_norm": 8.218066916873486, "learning_rate": 4.319672601614268e-06, "loss": 0.4342, "step": 3458 }, { "epoch": 0.56, "grad_norm": 19.71388694159525, "learning_rate": 4.317087235555722e-06, "loss": 0.4315, "step": 3459 }, { "epoch": 0.56, "grad_norm": 13.3807502071221, "learning_rate": 4.314502055541437e-06, "loss": 0.4532, "step": 3460 }, { "epoch": 0.56, "grad_norm": 19.345401133326384, "learning_rate": 4.311917062275688e-06, "loss": 0.439, "step": 3461 }, { "epoch": 0.56, "grad_norm": 7.458860808757126, "learning_rate": 4.309332256462699e-06, "loss": 0.4399, "step": 3462 }, { "epoch": 0.56, "grad_norm": 6.206606286746489, "learning_rate": 4.306747638806641e-06, "loss": 0.4857, "step": 3463 }, { "epoch": 0.56, "grad_norm": 7.266032133966359, "learning_rate": 4.304163210011636e-06, "loss": 0.4637, "step": 3464 }, { "epoch": 0.56, "grad_norm": 7.976606062540379, "learning_rate": 4.301578970781753e-06, "loss": 0.4199, "step": 3465 }, { "epoch": 0.56, "grad_norm": 8.488918401229766, "learning_rate": 4.2989949218210125e-06, "loss": 0.4068, "step": 3466 }, { "epoch": 0.56, "grad_norm": 8.253583678162672, "learning_rate": 4.2964110638333755e-06, "loss": 0.4572, "step": 3467 }, { "epoch": 0.56, "grad_norm": 5.703991693692433, "learning_rate": 4.29382739752276e-06, "loss": 0.5107, "step": 3468 }, { "epoch": 0.56, "grad_norm": 6.672809763223897, "learning_rate": 4.2912439235930305e-06, "loss": 0.38, "step": 3469 }, { "epoch": 0.56, "grad_norm": 9.510942203346927, "learning_rate": 4.2886606427479905e-06, "loss": 0.4241, "step": 3470 }, { "epoch": 0.56, "grad_norm": 16.088609560307717, "learning_rate": 4.2860775556913995e-06, "loss": 0.447, "step": 3471 }, { "epoch": 0.56, "grad_norm": 7.313219202335723, "learning_rate": 4.283494663126962e-06, "loss": 0.4282, "step": 3472 }, { "epoch": 0.56, "grad_norm": 20.741036901517163, "learning_rate": 4.28091196575833e-06, "loss": 0.482, "step": 3473 }, { "epoch": 0.56, "grad_norm": 10.307669045433087, "learning_rate": 4.278329464289098e-06, "loss": 0.4429, "step": 3474 }, { "epoch": 0.56, "grad_norm": 12.812807370177131, "learning_rate": 4.275747159422815e-06, "loss": 0.413, "step": 3475 }, { "epoch": 0.56, "grad_norm": 8.630313500494156, "learning_rate": 4.273165051862969e-06, "loss": 0.38, "step": 3476 }, { "epoch": 0.56, "grad_norm": 5.678087343871618, "learning_rate": 4.270583142312998e-06, "loss": 0.478, "step": 3477 }, { "epoch": 0.56, "grad_norm": 6.224841752898653, "learning_rate": 4.268001431476286e-06, "loss": 0.4302, "step": 3478 }, { "epoch": 0.56, "grad_norm": 7.6365634153621205, "learning_rate": 4.265419920056162e-06, "loss": 0.4574, "step": 3479 }, { "epoch": 0.56, "grad_norm": 5.086752174009546, "learning_rate": 4.262838608755899e-06, "loss": 0.4332, "step": 3480 }, { "epoch": 0.56, "grad_norm": 7.107151771630165, "learning_rate": 4.2602574982787216e-06, "loss": 0.429, "step": 3481 }, { "epoch": 0.56, "grad_norm": 26.058421604216008, "learning_rate": 4.257676589327791e-06, "loss": 0.3935, "step": 3482 }, { "epoch": 0.56, "grad_norm": 7.247612679805015, "learning_rate": 4.255095882606219e-06, "loss": 0.4171, "step": 3483 }, { "epoch": 0.56, "grad_norm": 6.225492598127672, "learning_rate": 4.252515378817062e-06, "loss": 0.4745, "step": 3484 }, { "epoch": 0.56, "grad_norm": 7.13817567781495, "learning_rate": 4.2499350786633235e-06, "loss": 0.4459, "step": 3485 }, { "epoch": 0.56, "grad_norm": 10.965874592245191, "learning_rate": 4.247354982847942e-06, "loss": 0.4498, "step": 3486 }, { "epoch": 0.56, "grad_norm": 8.278921015524402, "learning_rate": 4.244775092073811e-06, "loss": 0.3276, "step": 3487 }, { "epoch": 0.56, "grad_norm": 7.49106739202001, "learning_rate": 4.242195407043763e-06, "loss": 0.4687, "step": 3488 }, { "epoch": 0.56, "grad_norm": 15.68109419366356, "learning_rate": 4.239615928460574e-06, "loss": 0.4427, "step": 3489 }, { "epoch": 0.56, "grad_norm": 1.1467590083741845, "learning_rate": 4.2370366570269676e-06, "loss": 0.4577, "step": 3490 }, { "epoch": 0.56, "grad_norm": 7.521048383500448, "learning_rate": 4.234457593445608e-06, "loss": 0.3743, "step": 3491 }, { "epoch": 0.56, "grad_norm": 8.212378853894835, "learning_rate": 4.2318787384191e-06, "loss": 0.4271, "step": 3492 }, { "epoch": 0.56, "grad_norm": 10.819709265294971, "learning_rate": 4.229300092649997e-06, "loss": 0.4334, "step": 3493 }, { "epoch": 0.56, "grad_norm": 8.686945184776075, "learning_rate": 4.226721656840796e-06, "loss": 0.4402, "step": 3494 }, { "epoch": 0.56, "grad_norm": 1.0970547333872147, "learning_rate": 4.224143431693929e-06, "loss": 0.469, "step": 3495 }, { "epoch": 0.56, "grad_norm": 9.165600042630059, "learning_rate": 4.2215654179117765e-06, "loss": 0.4681, "step": 3496 }, { "epoch": 0.56, "grad_norm": 7.2856202376375405, "learning_rate": 4.2189876161966655e-06, "loss": 0.4358, "step": 3497 }, { "epoch": 0.56, "grad_norm": 11.53513932988826, "learning_rate": 4.216410027250853e-06, "loss": 0.4265, "step": 3498 }, { "epoch": 0.56, "grad_norm": 6.63913120081179, "learning_rate": 4.213832651776551e-06, "loss": 0.4702, "step": 3499 }, { "epoch": 0.56, "grad_norm": 13.709006707528753, "learning_rate": 4.211255490475905e-06, "loss": 0.3962, "step": 3500 }, { "epoch": 0.56, "grad_norm": 7.545058952118208, "learning_rate": 4.208678544051005e-06, "loss": 0.5057, "step": 3501 }, { "epoch": 0.56, "grad_norm": 8.427371715233713, "learning_rate": 4.2061018132038825e-06, "loss": 0.49, "step": 3502 }, { "epoch": 0.56, "grad_norm": 6.511280198755118, "learning_rate": 4.203525298636512e-06, "loss": 0.493, "step": 3503 }, { "epoch": 0.56, "grad_norm": 22.72715837920541, "learning_rate": 4.2009490010508026e-06, "loss": 0.4759, "step": 3504 }, { "epoch": 0.56, "grad_norm": 7.860979531284946, "learning_rate": 4.19837292114861e-06, "loss": 0.3932, "step": 3505 }, { "epoch": 0.56, "grad_norm": 10.76229700344235, "learning_rate": 4.195797059631733e-06, "loss": 0.3596, "step": 3506 }, { "epoch": 0.57, "grad_norm": 14.02573135336524, "learning_rate": 4.1932214172019056e-06, "loss": 0.4198, "step": 3507 }, { "epoch": 0.57, "grad_norm": 8.302398856684944, "learning_rate": 4.190645994560802e-06, "loss": 0.4939, "step": 3508 }, { "epoch": 0.57, "grad_norm": 6.776301037506045, "learning_rate": 4.188070792410039e-06, "loss": 0.4322, "step": 3509 }, { "epoch": 0.57, "grad_norm": 10.515926041735389, "learning_rate": 4.185495811451175e-06, "loss": 0.4693, "step": 3510 }, { "epoch": 0.57, "grad_norm": 14.517266946668926, "learning_rate": 4.182921052385702e-06, "loss": 0.3717, "step": 3511 }, { "epoch": 0.57, "grad_norm": 27.524687356786455, "learning_rate": 4.180346515915057e-06, "loss": 0.4285, "step": 3512 }, { "epoch": 0.57, "grad_norm": 10.271454648682413, "learning_rate": 4.177772202740617e-06, "loss": 0.417, "step": 3513 }, { "epoch": 0.57, "grad_norm": 17.271864560305875, "learning_rate": 4.175198113563692e-06, "loss": 0.4566, "step": 3514 }, { "epoch": 0.57, "grad_norm": 9.41885435842443, "learning_rate": 4.172624249085537e-06, "loss": 0.3763, "step": 3515 }, { "epoch": 0.57, "grad_norm": 7.5452418473603355, "learning_rate": 4.170050610007344e-06, "loss": 0.3519, "step": 3516 }, { "epoch": 0.57, "grad_norm": 6.9420470448266345, "learning_rate": 4.167477197030242e-06, "loss": 0.4104, "step": 3517 }, { "epoch": 0.57, "grad_norm": 26.840609168874533, "learning_rate": 4.164904010855299e-06, "loss": 0.4371, "step": 3518 }, { "epoch": 0.57, "grad_norm": 7.912170424630667, "learning_rate": 4.162331052183526e-06, "loss": 0.438, "step": 3519 }, { "epoch": 0.57, "grad_norm": 9.059705334613804, "learning_rate": 4.159758321715862e-06, "loss": 0.4381, "step": 3520 }, { "epoch": 0.57, "grad_norm": 75.06967341796643, "learning_rate": 4.157185820153193e-06, "loss": 0.4531, "step": 3521 }, { "epoch": 0.57, "grad_norm": 9.899915320127883, "learning_rate": 4.154613548196341e-06, "loss": 0.4134, "step": 3522 }, { "epoch": 0.57, "grad_norm": 6.0126014489231965, "learning_rate": 4.15204150654606e-06, "loss": 0.401, "step": 3523 }, { "epoch": 0.57, "grad_norm": 8.309410053753508, "learning_rate": 4.149469695903047e-06, "loss": 0.4295, "step": 3524 }, { "epoch": 0.57, "grad_norm": 10.555110408120894, "learning_rate": 4.146898116967932e-06, "loss": 0.4364, "step": 3525 }, { "epoch": 0.57, "grad_norm": 11.826402512941181, "learning_rate": 4.1443267704412895e-06, "loss": 0.4023, "step": 3526 }, { "epoch": 0.57, "grad_norm": 7.578437344884836, "learning_rate": 4.141755657023618e-06, "loss": 0.3779, "step": 3527 }, { "epoch": 0.57, "grad_norm": 14.557961660842366, "learning_rate": 4.139184777415365e-06, "loss": 0.3378, "step": 3528 }, { "epoch": 0.57, "grad_norm": 9.733268122144805, "learning_rate": 4.1366141323169076e-06, "loss": 0.4716, "step": 3529 }, { "epoch": 0.57, "grad_norm": 12.596412647396354, "learning_rate": 4.1340437224285594e-06, "loss": 0.4076, "step": 3530 }, { "epoch": 0.57, "grad_norm": 8.0968260606884, "learning_rate": 4.131473548450571e-06, "loss": 0.4456, "step": 3531 }, { "epoch": 0.57, "grad_norm": 13.551312628942533, "learning_rate": 4.12890361108313e-06, "loss": 0.4205, "step": 3532 }, { "epoch": 0.57, "grad_norm": 9.738172538955002, "learning_rate": 4.126333911026357e-06, "loss": 0.5139, "step": 3533 }, { "epoch": 0.57, "grad_norm": 14.431413856628142, "learning_rate": 4.123764448980308e-06, "loss": 0.4078, "step": 3534 }, { "epoch": 0.57, "grad_norm": 13.331344611886424, "learning_rate": 4.1211952256449796e-06, "loss": 0.4638, "step": 3535 }, { "epoch": 0.57, "grad_norm": 8.328944083678772, "learning_rate": 4.118626241720293e-06, "loss": 0.3746, "step": 3536 }, { "epoch": 0.57, "grad_norm": 11.60294032970051, "learning_rate": 4.116057497906114e-06, "loss": 0.4362, "step": 3537 }, { "epoch": 0.57, "grad_norm": 69.1378934821045, "learning_rate": 4.11348899490224e-06, "loss": 0.4514, "step": 3538 }, { "epoch": 0.57, "grad_norm": 12.145953436160097, "learning_rate": 4.1109207334084e-06, "loss": 0.4029, "step": 3539 }, { "epoch": 0.57, "grad_norm": 8.98389370525785, "learning_rate": 4.1083527141242594e-06, "loss": 0.3766, "step": 3540 }, { "epoch": 0.57, "grad_norm": 7.446197390375823, "learning_rate": 4.105784937749419e-06, "loss": 0.3926, "step": 3541 }, { "epoch": 0.57, "grad_norm": 1.324769313439205, "learning_rate": 4.103217404983409e-06, "loss": 0.4735, "step": 3542 }, { "epoch": 0.57, "grad_norm": 6.32542526181782, "learning_rate": 4.100650116525698e-06, "loss": 0.4214, "step": 3543 }, { "epoch": 0.57, "grad_norm": 9.257213289099845, "learning_rate": 4.098083073075686e-06, "loss": 0.4767, "step": 3544 }, { "epoch": 0.57, "grad_norm": 27.627490570146293, "learning_rate": 4.0955162753327085e-06, "loss": 0.3947, "step": 3545 }, { "epoch": 0.57, "grad_norm": 10.05256553881891, "learning_rate": 4.092949723996028e-06, "loss": 0.4547, "step": 3546 }, { "epoch": 0.57, "grad_norm": 13.535300601371363, "learning_rate": 4.0903834197648444e-06, "loss": 0.4685, "step": 3547 }, { "epoch": 0.57, "grad_norm": 12.990737528339785, "learning_rate": 4.087817363338294e-06, "loss": 0.4393, "step": 3548 }, { "epoch": 0.57, "grad_norm": 7.0860485279130945, "learning_rate": 4.085251555415437e-06, "loss": 0.4167, "step": 3549 }, { "epoch": 0.57, "grad_norm": 111.97826905788796, "learning_rate": 4.08268599669527e-06, "loss": 0.4819, "step": 3550 }, { "epoch": 0.57, "grad_norm": 7.503513543788104, "learning_rate": 4.080120687876726e-06, "loss": 0.424, "step": 3551 }, { "epoch": 0.57, "grad_norm": 8.378768617660748, "learning_rate": 4.077555629658662e-06, "loss": 0.4493, "step": 3552 }, { "epoch": 0.57, "grad_norm": 6.263087740304019, "learning_rate": 4.074990822739871e-06, "loss": 0.3973, "step": 3553 }, { "epoch": 0.57, "grad_norm": 7.388060489075584, "learning_rate": 4.072426267819081e-06, "loss": 0.3926, "step": 3554 }, { "epoch": 0.57, "grad_norm": 6.834880253448762, "learning_rate": 4.069861965594941e-06, "loss": 0.4343, "step": 3555 }, { "epoch": 0.57, "grad_norm": 13.445803755529155, "learning_rate": 4.067297916766042e-06, "loss": 0.4413, "step": 3556 }, { "epoch": 0.57, "grad_norm": 6.0794359054678475, "learning_rate": 4.0647341220309024e-06, "loss": 0.4638, "step": 3557 }, { "epoch": 0.57, "grad_norm": 8.242683554494942, "learning_rate": 4.062170582087965e-06, "loss": 0.4664, "step": 3558 }, { "epoch": 0.57, "grad_norm": 7.2844353857897, "learning_rate": 4.059607297635615e-06, "loss": 0.4416, "step": 3559 }, { "epoch": 0.57, "grad_norm": 5.282764265667226, "learning_rate": 4.057044269372159e-06, "loss": 0.4805, "step": 3560 }, { "epoch": 0.57, "grad_norm": 9.170877191508653, "learning_rate": 4.054481497995836e-06, "loss": 0.5217, "step": 3561 }, { "epoch": 0.57, "grad_norm": 5.756305433392412, "learning_rate": 4.051918984204817e-06, "loss": 0.42, "step": 3562 }, { "epoch": 0.57, "grad_norm": 5.251699506887296, "learning_rate": 4.049356728697199e-06, "loss": 0.5017, "step": 3563 }, { "epoch": 0.57, "grad_norm": 1.1735564664472435, "learning_rate": 4.046794732171016e-06, "loss": 0.4323, "step": 3564 }, { "epoch": 0.57, "grad_norm": 8.352716009411997, "learning_rate": 4.04423299532422e-06, "loss": 0.4437, "step": 3565 }, { "epoch": 0.57, "grad_norm": 6.829489997019559, "learning_rate": 4.041671518854704e-06, "loss": 0.4353, "step": 3566 }, { "epoch": 0.57, "grad_norm": 5.626290683031584, "learning_rate": 4.0391103034602826e-06, "loss": 0.3558, "step": 3567 }, { "epoch": 0.57, "grad_norm": 4.6146929760792785, "learning_rate": 4.036549349838701e-06, "loss": 0.4686, "step": 3568 }, { "epoch": 0.58, "grad_norm": 6.953461616613704, "learning_rate": 4.033988658687634e-06, "loss": 0.3941, "step": 3569 }, { "epoch": 0.58, "grad_norm": 6.033827666490349, "learning_rate": 4.031428230704686e-06, "loss": 0.4012, "step": 3570 }, { "epoch": 0.58, "grad_norm": 8.90893041001692, "learning_rate": 4.028868066587384e-06, "loss": 0.4304, "step": 3571 }, { "epoch": 0.58, "grad_norm": 6.065832794765413, "learning_rate": 4.026308167033191e-06, "loss": 0.4751, "step": 3572 }, { "epoch": 0.58, "grad_norm": 1.2474893456415863, "learning_rate": 4.023748532739496e-06, "loss": 0.4998, "step": 3573 }, { "epoch": 0.58, "grad_norm": 6.562580237692714, "learning_rate": 4.0211891644036085e-06, "loss": 0.4726, "step": 3574 }, { "epoch": 0.58, "grad_norm": 1.1031163096751515, "learning_rate": 4.018630062722774e-06, "loss": 0.5107, "step": 3575 }, { "epoch": 0.58, "grad_norm": 6.596053069325565, "learning_rate": 4.016071228394164e-06, "loss": 0.4099, "step": 3576 }, { "epoch": 0.58, "grad_norm": 1.1788368610921687, "learning_rate": 4.013512662114871e-06, "loss": 0.5138, "step": 3577 }, { "epoch": 0.58, "grad_norm": 8.172498121023418, "learning_rate": 4.010954364581921e-06, "loss": 0.4949, "step": 3578 }, { "epoch": 0.58, "grad_norm": 5.132733494317439, "learning_rate": 4.008396336492269e-06, "loss": 0.5088, "step": 3579 }, { "epoch": 0.58, "grad_norm": 11.229850964266946, "learning_rate": 4.005838578542785e-06, "loss": 0.4317, "step": 3580 }, { "epoch": 0.58, "grad_norm": 15.63283983738339, "learning_rate": 4.003281091430279e-06, "loss": 0.4385, "step": 3581 }, { "epoch": 0.58, "grad_norm": 8.50555505850098, "learning_rate": 4.000723875851477e-06, "loss": 0.4804, "step": 3582 }, { "epoch": 0.58, "grad_norm": 10.886756622461682, "learning_rate": 3.998166932503037e-06, "loss": 0.4978, "step": 3583 }, { "epoch": 0.58, "grad_norm": 6.2214590196898065, "learning_rate": 3.99561026208154e-06, "loss": 0.4153, "step": 3584 }, { "epoch": 0.58, "grad_norm": 6.531539260526917, "learning_rate": 3.993053865283492e-06, "loss": 0.4438, "step": 3585 }, { "epoch": 0.58, "grad_norm": 10.939185047921445, "learning_rate": 3.9904977428053325e-06, "loss": 0.3712, "step": 3586 }, { "epoch": 0.58, "grad_norm": 7.366524620525083, "learning_rate": 3.9879418953434105e-06, "loss": 0.3978, "step": 3587 }, { "epoch": 0.58, "grad_norm": 5.759678829013252, "learning_rate": 3.9853863235940145e-06, "loss": 0.414, "step": 3588 }, { "epoch": 0.58, "grad_norm": 5.846520661650958, "learning_rate": 3.982831028253354e-06, "loss": 0.4103, "step": 3589 }, { "epoch": 0.58, "grad_norm": 9.782937194919242, "learning_rate": 3.980276010017559e-06, "loss": 0.3835, "step": 3590 }, { "epoch": 0.58, "grad_norm": 5.823158158254266, "learning_rate": 3.977721269582686e-06, "loss": 0.3856, "step": 3591 }, { "epoch": 0.58, "grad_norm": 9.378426233049174, "learning_rate": 3.975166807644721e-06, "loss": 0.4506, "step": 3592 }, { "epoch": 0.58, "grad_norm": 6.613035828092862, "learning_rate": 3.972612624899566e-06, "loss": 0.4306, "step": 3593 }, { "epoch": 0.58, "grad_norm": 1.138319673242159, "learning_rate": 3.97005872204305e-06, "loss": 0.4452, "step": 3594 }, { "epoch": 0.58, "grad_norm": 6.9077022756240645, "learning_rate": 3.967505099770932e-06, "loss": 0.4414, "step": 3595 }, { "epoch": 0.58, "grad_norm": 7.288992368732283, "learning_rate": 3.964951758778885e-06, "loss": 0.4859, "step": 3596 }, { "epoch": 0.58, "grad_norm": 5.59060967051074, "learning_rate": 3.962398699762508e-06, "loss": 0.4628, "step": 3597 }, { "epoch": 0.58, "grad_norm": 1.474342405635066, "learning_rate": 3.95984592341733e-06, "loss": 0.4654, "step": 3598 }, { "epoch": 0.58, "grad_norm": 9.972404209306013, "learning_rate": 3.957293430438792e-06, "loss": 0.3722, "step": 3599 }, { "epoch": 0.58, "grad_norm": 6.3179493197170125, "learning_rate": 3.954741221522266e-06, "loss": 0.4535, "step": 3600 }, { "epoch": 0.58, "grad_norm": 5.716858181155409, "learning_rate": 3.952189297363047e-06, "loss": 0.4634, "step": 3601 }, { "epoch": 0.58, "grad_norm": 12.653989121672188, "learning_rate": 3.949637658656343e-06, "loss": 0.4516, "step": 3602 }, { "epoch": 0.58, "grad_norm": 6.553174554910584, "learning_rate": 3.947086306097295e-06, "loss": 0.4967, "step": 3603 }, { "epoch": 0.58, "grad_norm": 4.925315165983608, "learning_rate": 3.944535240380961e-06, "loss": 0.42, "step": 3604 }, { "epoch": 0.58, "grad_norm": 15.240486803150265, "learning_rate": 3.941984462202323e-06, "loss": 0.4267, "step": 3605 }, { "epoch": 0.58, "grad_norm": 10.080074671543342, "learning_rate": 3.939433972256281e-06, "loss": 0.47, "step": 3606 }, { "epoch": 0.58, "grad_norm": 6.687731477483454, "learning_rate": 3.936883771237658e-06, "loss": 0.4054, "step": 3607 }, { "epoch": 0.58, "grad_norm": 8.141404927612301, "learning_rate": 3.934333859841204e-06, "loss": 0.4495, "step": 3608 }, { "epoch": 0.58, "grad_norm": 5.513827117719276, "learning_rate": 3.931784238761579e-06, "loss": 0.4799, "step": 3609 }, { "epoch": 0.58, "grad_norm": 7.947672218301246, "learning_rate": 3.929234908693373e-06, "loss": 0.4671, "step": 3610 }, { "epoch": 0.58, "grad_norm": 8.710804898855605, "learning_rate": 3.9266858703310965e-06, "loss": 0.4179, "step": 3611 }, { "epoch": 0.58, "grad_norm": 9.28626211905646, "learning_rate": 3.924137124369172e-06, "loss": 0.4372, "step": 3612 }, { "epoch": 0.58, "grad_norm": 8.629412425029125, "learning_rate": 3.9215886715019525e-06, "loss": 0.4221, "step": 3613 }, { "epoch": 0.58, "grad_norm": 7.857351383153173, "learning_rate": 3.919040512423706e-06, "loss": 0.5064, "step": 3614 }, { "epoch": 0.58, "grad_norm": 7.093414447538241, "learning_rate": 3.91649264782862e-06, "loss": 0.3801, "step": 3615 }, { "epoch": 0.58, "grad_norm": 7.681772215490231, "learning_rate": 3.913945078410802e-06, "loss": 0.4956, "step": 3616 }, { "epoch": 0.58, "grad_norm": 7.062223572954884, "learning_rate": 3.911397804864285e-06, "loss": 0.4402, "step": 3617 }, { "epoch": 0.58, "grad_norm": 6.60473819491873, "learning_rate": 3.908850827883012e-06, "loss": 0.4225, "step": 3618 }, { "epoch": 0.58, "grad_norm": 7.671705785598382, "learning_rate": 3.90630414816085e-06, "loss": 0.4841, "step": 3619 }, { "epoch": 0.58, "grad_norm": 1.3120089614250947, "learning_rate": 3.9037577663915885e-06, "loss": 0.4581, "step": 3620 }, { "epoch": 0.58, "grad_norm": 9.600982976314652, "learning_rate": 3.901211683268928e-06, "loss": 0.509, "step": 3621 }, { "epoch": 0.58, "grad_norm": 6.7183008735063, "learning_rate": 3.898665899486493e-06, "loss": 0.3722, "step": 3622 }, { "epoch": 0.58, "grad_norm": 9.976191138077947, "learning_rate": 3.896120415737825e-06, "loss": 0.4613, "step": 3623 }, { "epoch": 0.58, "grad_norm": 5.266893597870025, "learning_rate": 3.893575232716387e-06, "loss": 0.4501, "step": 3624 }, { "epoch": 0.58, "grad_norm": 5.762090639064801, "learning_rate": 3.891030351115552e-06, "loss": 0.4326, "step": 3625 }, { "epoch": 0.58, "grad_norm": 5.506032424269474, "learning_rate": 3.888485771628618e-06, "loss": 0.4586, "step": 3626 }, { "epoch": 0.58, "grad_norm": 7.912987618128784, "learning_rate": 3.885941494948802e-06, "loss": 0.426, "step": 3627 }, { "epoch": 0.58, "grad_norm": 5.437431704510847, "learning_rate": 3.8833975217692285e-06, "loss": 0.4345, "step": 3628 }, { "epoch": 0.58, "grad_norm": 7.550368720017152, "learning_rate": 3.88085385278295e-06, "loss": 0.4994, "step": 3629 }, { "epoch": 0.58, "grad_norm": 4.332374809278879, "learning_rate": 3.878310488682934e-06, "loss": 0.4562, "step": 3630 }, { "epoch": 0.59, "grad_norm": 8.230840107510328, "learning_rate": 3.875767430162058e-06, "loss": 0.4732, "step": 3631 }, { "epoch": 0.59, "grad_norm": 7.83231383853078, "learning_rate": 3.873224677913124e-06, "loss": 0.4721, "step": 3632 }, { "epoch": 0.59, "grad_norm": 9.178892473458497, "learning_rate": 3.870682232628848e-06, "loss": 0.4718, "step": 3633 }, { "epoch": 0.59, "grad_norm": 6.496459612358705, "learning_rate": 3.8681400950018615e-06, "loss": 0.4771, "step": 3634 }, { "epoch": 0.59, "grad_norm": 11.923952159695997, "learning_rate": 3.865598265724713e-06, "loss": 0.4633, "step": 3635 }, { "epoch": 0.59, "grad_norm": 7.026335488207349, "learning_rate": 3.8630567454898676e-06, "loss": 0.4344, "step": 3636 }, { "epoch": 0.59, "grad_norm": 7.504905440483758, "learning_rate": 3.860515534989704e-06, "loss": 0.4192, "step": 3637 }, { "epoch": 0.59, "grad_norm": 7.014858828256301, "learning_rate": 3.857974634916517e-06, "loss": 0.4509, "step": 3638 }, { "epoch": 0.59, "grad_norm": 5.754091087243154, "learning_rate": 3.8554340459625235e-06, "loss": 0.4549, "step": 3639 }, { "epoch": 0.59, "grad_norm": 5.859093904586176, "learning_rate": 3.852893768819843e-06, "loss": 0.4449, "step": 3640 }, { "epoch": 0.59, "grad_norm": 29.64419447681958, "learning_rate": 3.85035380418052e-06, "loss": 0.4291, "step": 3641 }, { "epoch": 0.59, "grad_norm": 4.210152238570601, "learning_rate": 3.847814152736512e-06, "loss": 0.3387, "step": 3642 }, { "epoch": 0.59, "grad_norm": 6.035947572784318, "learning_rate": 3.84527481517969e-06, "loss": 0.4258, "step": 3643 }, { "epoch": 0.59, "grad_norm": 19.787646272183938, "learning_rate": 3.842735792201837e-06, "loss": 0.4243, "step": 3644 }, { "epoch": 0.59, "grad_norm": 9.731788367437117, "learning_rate": 3.840197084494653e-06, "loss": 0.4366, "step": 3645 }, { "epoch": 0.59, "grad_norm": 6.918817686068713, "learning_rate": 3.8376586927497565e-06, "loss": 0.4773, "step": 3646 }, { "epoch": 0.59, "grad_norm": 8.885571734136057, "learning_rate": 3.835120617658669e-06, "loss": 0.4162, "step": 3647 }, { "epoch": 0.59, "grad_norm": 1.1830208672898663, "learning_rate": 3.832582859912838e-06, "loss": 0.475, "step": 3648 }, { "epoch": 0.59, "grad_norm": 9.498436191571576, "learning_rate": 3.8300454202036155e-06, "loss": 0.3808, "step": 3649 }, { "epoch": 0.59, "grad_norm": 17.98603487343265, "learning_rate": 3.82750829922227e-06, "loss": 0.4493, "step": 3650 }, { "epoch": 0.59, "grad_norm": 6.909395338043835, "learning_rate": 3.824971497659983e-06, "loss": 0.4034, "step": 3651 }, { "epoch": 0.59, "grad_norm": 9.22223668236618, "learning_rate": 3.8224350162078526e-06, "loss": 0.4316, "step": 3652 }, { "epoch": 0.59, "grad_norm": 5.01308389590162, "learning_rate": 3.81989885555688e-06, "loss": 0.4191, "step": 3653 }, { "epoch": 0.59, "grad_norm": 8.056124664503239, "learning_rate": 3.81736301639799e-06, "loss": 0.4459, "step": 3654 }, { "epoch": 0.59, "grad_norm": 5.641203411304238, "learning_rate": 3.814827499422016e-06, "loss": 0.4235, "step": 3655 }, { "epoch": 0.59, "grad_norm": 6.3396006412407315, "learning_rate": 3.8122923053196984e-06, "loss": 0.4509, "step": 3656 }, { "epoch": 0.59, "grad_norm": 6.164843058392669, "learning_rate": 3.809757434781697e-06, "loss": 0.4646, "step": 3657 }, { "epoch": 0.59, "grad_norm": 9.3734534295823, "learning_rate": 3.8072228884985803e-06, "loss": 0.4449, "step": 3658 }, { "epoch": 0.59, "grad_norm": 9.976982389694024, "learning_rate": 3.8046886671608264e-06, "loss": 0.4129, "step": 3659 }, { "epoch": 0.59, "grad_norm": 6.270447451515192, "learning_rate": 3.802154771458828e-06, "loss": 0.4785, "step": 3660 }, { "epoch": 0.59, "grad_norm": 9.101905286580596, "learning_rate": 3.7996212020828915e-06, "loss": 0.4864, "step": 3661 }, { "epoch": 0.59, "grad_norm": 10.55494672430918, "learning_rate": 3.797087959723225e-06, "loss": 0.3858, "step": 3662 }, { "epoch": 0.59, "grad_norm": 9.505594070812139, "learning_rate": 3.7945550450699585e-06, "loss": 0.4776, "step": 3663 }, { "epoch": 0.59, "grad_norm": 1.1903808073395685, "learning_rate": 3.7920224588131256e-06, "loss": 0.4844, "step": 3664 }, { "epoch": 0.59, "grad_norm": 8.78739447726393, "learning_rate": 3.7894902016426738e-06, "loss": 0.4583, "step": 3665 }, { "epoch": 0.59, "grad_norm": 10.233824708198869, "learning_rate": 3.786958274248458e-06, "loss": 0.3974, "step": 3666 }, { "epoch": 0.59, "grad_norm": 4.480623375921674, "learning_rate": 3.7844266773202448e-06, "loss": 0.3707, "step": 3667 }, { "epoch": 0.59, "grad_norm": 12.381672442360758, "learning_rate": 3.7818954115477158e-06, "loss": 0.4492, "step": 3668 }, { "epoch": 0.59, "grad_norm": 9.055208882728673, "learning_rate": 3.7793644776204503e-06, "loss": 0.431, "step": 3669 }, { "epoch": 0.59, "grad_norm": 8.815205364987651, "learning_rate": 3.7768338762279493e-06, "loss": 0.4042, "step": 3670 }, { "epoch": 0.59, "grad_norm": 4.029064235347674, "learning_rate": 3.7743036080596184e-06, "loss": 0.4121, "step": 3671 }, { "epoch": 0.59, "grad_norm": 7.087642521099984, "learning_rate": 3.77177367380477e-06, "loss": 0.493, "step": 3672 }, { "epoch": 0.59, "grad_norm": 8.63607252462849, "learning_rate": 3.7692440741526293e-06, "loss": 0.4406, "step": 3673 }, { "epoch": 0.59, "grad_norm": 7.567058702018806, "learning_rate": 3.76671480979233e-06, "loss": 0.4102, "step": 3674 }, { "epoch": 0.59, "grad_norm": 15.337530033380453, "learning_rate": 3.7641858814129093e-06, "loss": 0.4147, "step": 3675 }, { "epoch": 0.59, "grad_norm": 4.03743627688545, "learning_rate": 3.7616572897033223e-06, "loss": 0.3592, "step": 3676 }, { "epoch": 0.59, "grad_norm": 9.525252334033553, "learning_rate": 3.7591290353524247e-06, "loss": 0.3393, "step": 3677 }, { "epoch": 0.59, "grad_norm": 13.131548968249211, "learning_rate": 3.7566011190489815e-06, "loss": 0.3776, "step": 3678 }, { "epoch": 0.59, "grad_norm": 22.71700553053672, "learning_rate": 3.7540735414816685e-06, "loss": 0.422, "step": 3679 }, { "epoch": 0.59, "grad_norm": 6.265438864383878, "learning_rate": 3.7515463033390676e-06, "loss": 0.3941, "step": 3680 }, { "epoch": 0.59, "grad_norm": 5.237391676604251, "learning_rate": 3.7490194053096668e-06, "loss": 0.3593, "step": 3681 }, { "epoch": 0.59, "grad_norm": 6.86745225627778, "learning_rate": 3.7464928480818623e-06, "loss": 0.377, "step": 3682 }, { "epoch": 0.59, "grad_norm": 7.455762457490691, "learning_rate": 3.7439666323439603e-06, "loss": 0.4798, "step": 3683 }, { "epoch": 0.59, "grad_norm": 16.159513827959582, "learning_rate": 3.741440758784172e-06, "loss": 0.4854, "step": 3684 }, { "epoch": 0.59, "grad_norm": 7.562778439397837, "learning_rate": 3.738915228090611e-06, "loss": 0.4538, "step": 3685 }, { "epoch": 0.59, "grad_norm": 9.474535527182887, "learning_rate": 3.736390040951304e-06, "loss": 0.4426, "step": 3686 }, { "epoch": 0.59, "grad_norm": 7.779301552270303, "learning_rate": 3.733865198054184e-06, "loss": 0.4613, "step": 3687 }, { "epoch": 0.59, "grad_norm": 209.31357563799193, "learning_rate": 3.7313407000870826e-06, "loss": 0.409, "step": 3688 }, { "epoch": 0.59, "grad_norm": 22.335873580769764, "learning_rate": 3.728816547737745e-06, "loss": 0.4536, "step": 3689 }, { "epoch": 0.59, "grad_norm": 5.755604488359913, "learning_rate": 3.7262927416938234e-06, "loss": 0.4706, "step": 3690 }, { "epoch": 0.59, "grad_norm": 9.885297154441947, "learning_rate": 3.7237692826428662e-06, "loss": 0.4022, "step": 3691 }, { "epoch": 0.59, "grad_norm": 7.27689791820019, "learning_rate": 3.721246171272336e-06, "loss": 0.4637, "step": 3692 }, { "epoch": 0.6, "grad_norm": 8.930415162452213, "learning_rate": 3.718723408269599e-06, "loss": 0.4238, "step": 3693 }, { "epoch": 0.6, "grad_norm": 8.1200510782945, "learning_rate": 3.7162009943219234e-06, "loss": 0.3634, "step": 3694 }, { "epoch": 0.6, "grad_norm": 9.665935309207324, "learning_rate": 3.7136789301164854e-06, "loss": 0.3866, "step": 3695 }, { "epoch": 0.6, "grad_norm": 6.9151545218963095, "learning_rate": 3.7111572163403653e-06, "loss": 0.3851, "step": 3696 }, { "epoch": 0.6, "grad_norm": 9.083363945087907, "learning_rate": 3.7086358536805455e-06, "loss": 0.4504, "step": 3697 }, { "epoch": 0.6, "grad_norm": 10.244944004402264, "learning_rate": 3.7061148428239147e-06, "loss": 0.4392, "step": 3698 }, { "epoch": 0.6, "grad_norm": 5.887578774537082, "learning_rate": 3.7035941844572687e-06, "loss": 0.3745, "step": 3699 }, { "epoch": 0.6, "grad_norm": 8.858388654261098, "learning_rate": 3.701073879267302e-06, "loss": 0.4248, "step": 3700 }, { "epoch": 0.6, "grad_norm": 7.518290630923201, "learning_rate": 3.698553927940615e-06, "loss": 0.3484, "step": 3701 }, { "epoch": 0.6, "grad_norm": 10.056516156579661, "learning_rate": 3.6960343311637132e-06, "loss": 0.4709, "step": 3702 }, { "epoch": 0.6, "grad_norm": 6.847072226433738, "learning_rate": 3.6935150896230045e-06, "loss": 0.4833, "step": 3703 }, { "epoch": 0.6, "grad_norm": 5.722526327995164, "learning_rate": 3.6909962040047964e-06, "loss": 0.4651, "step": 3704 }, { "epoch": 0.6, "grad_norm": 6.928061030520333, "learning_rate": 3.6884776749953065e-06, "loss": 0.4839, "step": 3705 }, { "epoch": 0.6, "grad_norm": 9.031987960935293, "learning_rate": 3.6859595032806518e-06, "loss": 0.4651, "step": 3706 }, { "epoch": 0.6, "grad_norm": 12.69415506332248, "learning_rate": 3.683441689546849e-06, "loss": 0.4707, "step": 3707 }, { "epoch": 0.6, "grad_norm": 10.81833911102138, "learning_rate": 3.6809242344798207e-06, "loss": 0.4088, "step": 3708 }, { "epoch": 0.6, "grad_norm": 5.8777602129088375, "learning_rate": 3.6784071387653926e-06, "loss": 0.4653, "step": 3709 }, { "epoch": 0.6, "grad_norm": 8.51814568631765, "learning_rate": 3.675890403089289e-06, "loss": 0.3622, "step": 3710 }, { "epoch": 0.6, "grad_norm": 9.133428919448342, "learning_rate": 3.6733740281371377e-06, "loss": 0.4229, "step": 3711 }, { "epoch": 0.6, "grad_norm": 7.249489178248336, "learning_rate": 3.670858014594473e-06, "loss": 0.4353, "step": 3712 }, { "epoch": 0.6, "grad_norm": 7.052782825718367, "learning_rate": 3.66834236314672e-06, "loss": 0.3568, "step": 3713 }, { "epoch": 0.6, "grad_norm": 5.437251849986943, "learning_rate": 3.665827074479215e-06, "loss": 0.4409, "step": 3714 }, { "epoch": 0.6, "grad_norm": 8.262738580210845, "learning_rate": 3.6633121492771933e-06, "loss": 0.4165, "step": 3715 }, { "epoch": 0.6, "grad_norm": 1.2157340975349153, "learning_rate": 3.6607975882257875e-06, "loss": 0.4564, "step": 3716 }, { "epoch": 0.6, "grad_norm": 5.504310033689151, "learning_rate": 3.6582833920100336e-06, "loss": 0.41, "step": 3717 }, { "epoch": 0.6, "grad_norm": 16.238393245347265, "learning_rate": 3.655769561314869e-06, "loss": 0.3976, "step": 3718 }, { "epoch": 0.6, "grad_norm": 14.699000444193242, "learning_rate": 3.653256096825129e-06, "loss": 0.4226, "step": 3719 }, { "epoch": 0.6, "grad_norm": 7.032966775559035, "learning_rate": 3.6507429992255503e-06, "loss": 0.4245, "step": 3720 }, { "epoch": 0.6, "grad_norm": 9.009929203715936, "learning_rate": 3.648230269200775e-06, "loss": 0.4338, "step": 3721 }, { "epoch": 0.6, "grad_norm": 8.506451985134628, "learning_rate": 3.645717907435332e-06, "loss": 0.4466, "step": 3722 }, { "epoch": 0.6, "grad_norm": 6.828082705762483, "learning_rate": 3.6432059146136633e-06, "loss": 0.4234, "step": 3723 }, { "epoch": 0.6, "grad_norm": 6.307577630534607, "learning_rate": 3.6406942914201045e-06, "loss": 0.4295, "step": 3724 }, { "epoch": 0.6, "grad_norm": 14.160162269904504, "learning_rate": 3.6381830385388907e-06, "loss": 0.3971, "step": 3725 }, { "epoch": 0.6, "grad_norm": 15.336906778733216, "learning_rate": 3.635672156654154e-06, "loss": 0.4407, "step": 3726 }, { "epoch": 0.6, "grad_norm": 4.45320870741854, "learning_rate": 3.6331616464499297e-06, "loss": 0.3912, "step": 3727 }, { "epoch": 0.6, "grad_norm": 7.706710924609895, "learning_rate": 3.6306515086101522e-06, "loss": 0.4802, "step": 3728 }, { "epoch": 0.6, "grad_norm": 7.196630551758835, "learning_rate": 3.6281417438186484e-06, "loss": 0.3593, "step": 3729 }, { "epoch": 0.6, "grad_norm": 10.700947146404365, "learning_rate": 3.6256323527591496e-06, "loss": 0.3572, "step": 3730 }, { "epoch": 0.6, "grad_norm": 8.769946956062393, "learning_rate": 3.623123336115284e-06, "loss": 0.521, "step": 3731 }, { "epoch": 0.6, "grad_norm": 13.089161791675759, "learning_rate": 3.6206146945705735e-06, "loss": 0.4652, "step": 3732 }, { "epoch": 0.6, "grad_norm": 6.575411642801652, "learning_rate": 3.6181064288084423e-06, "loss": 0.4438, "step": 3733 }, { "epoch": 0.6, "grad_norm": 6.746309761261565, "learning_rate": 3.6155985395122157e-06, "loss": 0.3801, "step": 3734 }, { "epoch": 0.6, "grad_norm": 6.448202337907825, "learning_rate": 3.613091027365104e-06, "loss": 0.4199, "step": 3735 }, { "epoch": 0.6, "grad_norm": 10.070241965256509, "learning_rate": 3.610583893050229e-06, "loss": 0.4652, "step": 3736 }, { "epoch": 0.6, "grad_norm": 7.887941479824022, "learning_rate": 3.6080771372506017e-06, "loss": 0.368, "step": 3737 }, { "epoch": 0.6, "grad_norm": 8.91163129134379, "learning_rate": 3.6055707606491297e-06, "loss": 0.4361, "step": 3738 }, { "epoch": 0.6, "grad_norm": 8.090419071065636, "learning_rate": 3.6030647639286196e-06, "loss": 0.3958, "step": 3739 }, { "epoch": 0.6, "grad_norm": 8.230466721776446, "learning_rate": 3.6005591477717766e-06, "loss": 0.3909, "step": 3740 }, { "epoch": 0.6, "grad_norm": 13.87991254856921, "learning_rate": 3.598053912861196e-06, "loss": 0.4375, "step": 3741 }, { "epoch": 0.6, "grad_norm": 10.747180010743921, "learning_rate": 3.5955490598793734e-06, "loss": 0.3797, "step": 3742 }, { "epoch": 0.6, "grad_norm": 4.6880895423569475, "learning_rate": 3.5930445895087017e-06, "loss": 0.4134, "step": 3743 }, { "epoch": 0.6, "grad_norm": 1.093335667846363, "learning_rate": 3.5905405024314683e-06, "loss": 0.4798, "step": 3744 }, { "epoch": 0.6, "grad_norm": 7.294738453695441, "learning_rate": 3.588036799329853e-06, "loss": 0.4043, "step": 3745 }, { "epoch": 0.6, "grad_norm": 6.054470459013915, "learning_rate": 3.585533480885934e-06, "loss": 0.3901, "step": 3746 }, { "epoch": 0.6, "grad_norm": 7.195631491838864, "learning_rate": 3.5830305477816863e-06, "loss": 0.4064, "step": 3747 }, { "epoch": 0.6, "grad_norm": 6.745841675948784, "learning_rate": 3.580528000698975e-06, "loss": 0.3911, "step": 3748 }, { "epoch": 0.6, "grad_norm": 1.3528458823370044, "learning_rate": 3.5780258403195635e-06, "loss": 0.4904, "step": 3749 }, { "epoch": 0.6, "grad_norm": 5.981756348983047, "learning_rate": 3.5755240673251125e-06, "loss": 0.3938, "step": 3750 }, { "epoch": 0.6, "grad_norm": 7.195900763060852, "learning_rate": 3.5730226823971693e-06, "loss": 0.4209, "step": 3751 }, { "epoch": 0.6, "grad_norm": 23.919364099506623, "learning_rate": 3.5705216862171823e-06, "loss": 0.4268, "step": 3752 }, { "epoch": 0.6, "grad_norm": 7.82379589992891, "learning_rate": 3.568021079466494e-06, "loss": 0.4276, "step": 3753 }, { "epoch": 0.6, "grad_norm": 6.239583233597651, "learning_rate": 3.5655208628263345e-06, "loss": 0.4136, "step": 3754 }, { "epoch": 0.61, "grad_norm": 9.361350699378562, "learning_rate": 3.563021036977834e-06, "loss": 0.3942, "step": 3755 }, { "epoch": 0.61, "grad_norm": 9.241565549986962, "learning_rate": 3.560521602602014e-06, "loss": 0.4162, "step": 3756 }, { "epoch": 0.61, "grad_norm": 6.338414108567371, "learning_rate": 3.5580225603797873e-06, "loss": 0.4303, "step": 3757 }, { "epoch": 0.61, "grad_norm": 7.900660440844053, "learning_rate": 3.5555239109919647e-06, "loss": 0.4236, "step": 3758 }, { "epoch": 0.61, "grad_norm": 8.103109907220519, "learning_rate": 3.5530256551192467e-06, "loss": 0.4397, "step": 3759 }, { "epoch": 0.61, "grad_norm": 5.665997272125521, "learning_rate": 3.5505277934422254e-06, "loss": 0.4133, "step": 3760 }, { "epoch": 0.61, "grad_norm": 8.054224710856591, "learning_rate": 3.5480303266413884e-06, "loss": 0.3622, "step": 3761 }, { "epoch": 0.61, "grad_norm": 5.155547702029373, "learning_rate": 3.545533255397112e-06, "loss": 0.3229, "step": 3762 }, { "epoch": 0.61, "grad_norm": 7.610373875386152, "learning_rate": 3.5430365803896736e-06, "loss": 0.4054, "step": 3763 }, { "epoch": 0.61, "grad_norm": 7.375255654294126, "learning_rate": 3.540540302299229e-06, "loss": 0.4455, "step": 3764 }, { "epoch": 0.61, "grad_norm": 8.10028533181914, "learning_rate": 3.5380444218058374e-06, "loss": 0.5043, "step": 3765 }, { "epoch": 0.61, "grad_norm": 10.052960716124757, "learning_rate": 3.5355489395894448e-06, "loss": 0.425, "step": 3766 }, { "epoch": 0.61, "grad_norm": 6.516449082340731, "learning_rate": 3.533053856329889e-06, "loss": 0.409, "step": 3767 }, { "epoch": 0.61, "grad_norm": 5.942930605904436, "learning_rate": 3.5305591727068984e-06, "loss": 0.4446, "step": 3768 }, { "epoch": 0.61, "grad_norm": 8.297245342851765, "learning_rate": 3.5280648894000957e-06, "loss": 0.3697, "step": 3769 }, { "epoch": 0.61, "grad_norm": 7.916684787761617, "learning_rate": 3.5255710070889903e-06, "loss": 0.4373, "step": 3770 }, { "epoch": 0.61, "grad_norm": 1.292164430953637, "learning_rate": 3.5230775264529837e-06, "loss": 0.5054, "step": 3771 }, { "epoch": 0.61, "grad_norm": 10.339643447197412, "learning_rate": 3.520584448171375e-06, "loss": 0.4004, "step": 3772 }, { "epoch": 0.61, "grad_norm": 1.186865972558311, "learning_rate": 3.518091772923339e-06, "loss": 0.4523, "step": 3773 }, { "epoch": 0.61, "grad_norm": 13.36042797616664, "learning_rate": 3.515599501387954e-06, "loss": 0.5128, "step": 3774 }, { "epoch": 0.61, "grad_norm": 7.551796063271116, "learning_rate": 3.5131076342441838e-06, "loss": 0.4716, "step": 3775 }, { "epoch": 0.61, "grad_norm": 1.0742615000135156, "learning_rate": 3.5106161721708797e-06, "loss": 0.4477, "step": 3776 }, { "epoch": 0.61, "grad_norm": 1.2253269186874773, "learning_rate": 3.508125115846785e-06, "loss": 0.4614, "step": 3777 }, { "epoch": 0.61, "grad_norm": 6.183087179518292, "learning_rate": 3.5056344659505335e-06, "loss": 0.4108, "step": 3778 }, { "epoch": 0.61, "grad_norm": 10.248108339690626, "learning_rate": 3.503144223160644e-06, "loss": 0.4677, "step": 3779 }, { "epoch": 0.61, "grad_norm": 14.770455236119721, "learning_rate": 3.5006543881555304e-06, "loss": 0.3876, "step": 3780 }, { "epoch": 0.61, "grad_norm": 9.143019175180838, "learning_rate": 3.4981649616134912e-06, "loss": 0.4941, "step": 3781 }, { "epoch": 0.61, "grad_norm": 7.69436401486207, "learning_rate": 3.495675944212715e-06, "loss": 0.3964, "step": 3782 }, { "epoch": 0.61, "grad_norm": 6.754006586841706, "learning_rate": 3.4931873366312785e-06, "loss": 0.4148, "step": 3783 }, { "epoch": 0.61, "grad_norm": 6.979897179747272, "learning_rate": 3.490699139547146e-06, "loss": 0.3954, "step": 3784 }, { "epoch": 0.61, "grad_norm": 6.676939567287066, "learning_rate": 3.4882113536381744e-06, "loss": 0.4091, "step": 3785 }, { "epoch": 0.61, "grad_norm": 14.382115901387778, "learning_rate": 3.4857239795821003e-06, "loss": 0.4739, "step": 3786 }, { "epoch": 0.61, "grad_norm": 7.6734561881557335, "learning_rate": 3.483237018056556e-06, "loss": 0.3917, "step": 3787 }, { "epoch": 0.61, "grad_norm": 6.35804523611172, "learning_rate": 3.480750469739059e-06, "loss": 0.4823, "step": 3788 }, { "epoch": 0.61, "grad_norm": 428.8467476679283, "learning_rate": 3.478264335307011e-06, "loss": 0.4766, "step": 3789 }, { "epoch": 0.61, "grad_norm": 13.674578244281337, "learning_rate": 3.475778615437706e-06, "loss": 0.4235, "step": 3790 }, { "epoch": 0.61, "grad_norm": 5.99967296003406, "learning_rate": 3.4732933108083218e-06, "loss": 0.4681, "step": 3791 }, { "epoch": 0.61, "grad_norm": 12.5668404229954, "learning_rate": 3.470808422095923e-06, "loss": 0.5383, "step": 3792 }, { "epoch": 0.61, "grad_norm": 14.801378649572548, "learning_rate": 3.4683239499774606e-06, "loss": 0.4073, "step": 3793 }, { "epoch": 0.61, "grad_norm": 12.397697395657023, "learning_rate": 3.465839895129779e-06, "loss": 0.3943, "step": 3794 }, { "epoch": 0.61, "grad_norm": 4.652280886539598, "learning_rate": 3.463356258229596e-06, "loss": 0.4095, "step": 3795 }, { "epoch": 0.61, "grad_norm": 24.702249208867922, "learning_rate": 3.4608730399535273e-06, "loss": 0.3952, "step": 3796 }, { "epoch": 0.61, "grad_norm": 7.632785379963838, "learning_rate": 3.4583902409780693e-06, "loss": 0.445, "step": 3797 }, { "epoch": 0.61, "grad_norm": 6.171934297374592, "learning_rate": 3.4559078619796036e-06, "loss": 0.4495, "step": 3798 }, { "epoch": 0.61, "grad_norm": 1.2998072634038784, "learning_rate": 3.4534259036343996e-06, "loss": 0.5017, "step": 3799 }, { "epoch": 0.61, "grad_norm": 18.305652232070777, "learning_rate": 3.450944366618613e-06, "loss": 0.3674, "step": 3800 }, { "epoch": 0.61, "grad_norm": 7.686535922367812, "learning_rate": 3.4484632516082784e-06, "loss": 0.4571, "step": 3801 }, { "epoch": 0.61, "grad_norm": 5.006503850121893, "learning_rate": 3.445982559279322e-06, "loss": 0.4502, "step": 3802 }, { "epoch": 0.61, "grad_norm": 17.829982394009967, "learning_rate": 3.4435022903075536e-06, "loss": 0.4442, "step": 3803 }, { "epoch": 0.61, "grad_norm": 11.590574273880994, "learning_rate": 3.441022445368668e-06, "loss": 0.4958, "step": 3804 }, { "epoch": 0.61, "grad_norm": 11.262082922828691, "learning_rate": 3.4385430251382407e-06, "loss": 0.3915, "step": 3805 }, { "epoch": 0.61, "grad_norm": 8.018203507209071, "learning_rate": 3.4360640302917353e-06, "loss": 0.3981, "step": 3806 }, { "epoch": 0.61, "grad_norm": 8.202660890425134, "learning_rate": 3.433585461504499e-06, "loss": 0.3819, "step": 3807 }, { "epoch": 0.61, "grad_norm": 5.93954200337496, "learning_rate": 3.4311073194517596e-06, "loss": 0.4655, "step": 3808 }, { "epoch": 0.61, "grad_norm": 9.129071776646366, "learning_rate": 3.4286296048086343e-06, "loss": 0.3867, "step": 3809 }, { "epoch": 0.61, "grad_norm": 13.246350404321706, "learning_rate": 3.42615231825012e-06, "loss": 0.4572, "step": 3810 }, { "epoch": 0.61, "grad_norm": 6.240479734912556, "learning_rate": 3.423675460451097e-06, "loss": 0.4101, "step": 3811 }, { "epoch": 0.61, "grad_norm": 6.29477836530849, "learning_rate": 3.4211990320863307e-06, "loss": 0.437, "step": 3812 }, { "epoch": 0.61, "grad_norm": 7.405405629330422, "learning_rate": 3.4187230338304684e-06, "loss": 0.388, "step": 3813 }, { "epoch": 0.61, "grad_norm": 6.236224966620971, "learning_rate": 3.416247466358039e-06, "loss": 0.5006, "step": 3814 }, { "epoch": 0.61, "grad_norm": 17.041243045413403, "learning_rate": 3.413772330343455e-06, "loss": 0.4239, "step": 3815 }, { "epoch": 0.61, "grad_norm": 12.480223488697884, "learning_rate": 3.4112976264610167e-06, "loss": 0.3808, "step": 3816 }, { "epoch": 0.62, "grad_norm": 6.5494025940608145, "learning_rate": 3.408823355384894e-06, "loss": 0.4783, "step": 3817 }, { "epoch": 0.62, "grad_norm": 7.707765716198661, "learning_rate": 3.406349517789151e-06, "loss": 0.4165, "step": 3818 }, { "epoch": 0.62, "grad_norm": 10.420873159377349, "learning_rate": 3.4038761143477296e-06, "loss": 0.489, "step": 3819 }, { "epoch": 0.62, "grad_norm": 6.120507430164611, "learning_rate": 3.4014031457344517e-06, "loss": 0.4016, "step": 3820 }, { "epoch": 0.62, "grad_norm": 13.344251288601486, "learning_rate": 3.3989306126230226e-06, "loss": 0.4624, "step": 3821 }, { "epoch": 0.62, "grad_norm": 5.859040899404327, "learning_rate": 3.3964585156870267e-06, "loss": 0.462, "step": 3822 }, { "epoch": 0.62, "grad_norm": 21.486218302671194, "learning_rate": 3.393986855599936e-06, "loss": 0.4103, "step": 3823 }, { "epoch": 0.62, "grad_norm": 8.991755064636807, "learning_rate": 3.391515633035093e-06, "loss": 0.4447, "step": 3824 }, { "epoch": 0.62, "grad_norm": 8.176807277648642, "learning_rate": 3.389044848665731e-06, "loss": 0.4207, "step": 3825 }, { "epoch": 0.62, "grad_norm": 6.282797818435638, "learning_rate": 3.3865745031649595e-06, "loss": 0.4313, "step": 3826 }, { "epoch": 0.62, "grad_norm": 11.243219379621953, "learning_rate": 3.3841045972057663e-06, "loss": 0.4216, "step": 3827 }, { "epoch": 0.62, "grad_norm": 1.3200055250596865, "learning_rate": 3.3816351314610235e-06, "loss": 0.5029, "step": 3828 }, { "epoch": 0.62, "grad_norm": 14.619968733439679, "learning_rate": 3.379166106603482e-06, "loss": 0.508, "step": 3829 }, { "epoch": 0.62, "grad_norm": 18.874955412737965, "learning_rate": 3.3766975233057715e-06, "loss": 0.4382, "step": 3830 }, { "epoch": 0.62, "grad_norm": 7.676890388709013, "learning_rate": 3.3742293822404005e-06, "loss": 0.4436, "step": 3831 }, { "epoch": 0.62, "grad_norm": 7.789412793873831, "learning_rate": 3.371761684079763e-06, "loss": 0.4341, "step": 3832 }, { "epoch": 0.62, "grad_norm": 7.423450605314177, "learning_rate": 3.369294429496124e-06, "loss": 0.4295, "step": 3833 }, { "epoch": 0.62, "grad_norm": 7.961196154707833, "learning_rate": 3.366827619161632e-06, "loss": 0.3611, "step": 3834 }, { "epoch": 0.62, "grad_norm": 10.36499251201153, "learning_rate": 3.364361253748318e-06, "loss": 0.459, "step": 3835 }, { "epoch": 0.62, "grad_norm": 9.380048467270422, "learning_rate": 3.361895333928083e-06, "loss": 0.4167, "step": 3836 }, { "epoch": 0.62, "grad_norm": 6.59706560788016, "learning_rate": 3.3594298603727126e-06, "loss": 0.4643, "step": 3837 }, { "epoch": 0.62, "grad_norm": 10.863650471995008, "learning_rate": 3.356964833753875e-06, "loss": 0.4243, "step": 3838 }, { "epoch": 0.62, "grad_norm": 9.107274109106763, "learning_rate": 3.3545002547431034e-06, "loss": 0.4839, "step": 3839 }, { "epoch": 0.62, "grad_norm": 7.431758593392631, "learning_rate": 3.3520361240118216e-06, "loss": 0.409, "step": 3840 }, { "epoch": 0.62, "grad_norm": 11.011473926008383, "learning_rate": 3.3495724422313262e-06, "loss": 0.3431, "step": 3841 }, { "epoch": 0.62, "grad_norm": 10.191946366407205, "learning_rate": 3.347109210072793e-06, "loss": 0.404, "step": 3842 }, { "epoch": 0.62, "grad_norm": 11.026464907581602, "learning_rate": 3.3446464282072723e-06, "loss": 0.4647, "step": 3843 }, { "epoch": 0.62, "grad_norm": 6.913846886798677, "learning_rate": 3.3421840973056935e-06, "loss": 0.4091, "step": 3844 }, { "epoch": 0.62, "grad_norm": 5.506033097245198, "learning_rate": 3.3397222180388677e-06, "loss": 0.4427, "step": 3845 }, { "epoch": 0.62, "grad_norm": 7.6779302935728735, "learning_rate": 3.3372607910774726e-06, "loss": 0.4282, "step": 3846 }, { "epoch": 0.62, "grad_norm": 6.323013515910669, "learning_rate": 3.3347998170920724e-06, "loss": 0.4865, "step": 3847 }, { "epoch": 0.62, "grad_norm": 6.391155649929585, "learning_rate": 3.3323392967531043e-06, "loss": 0.4322, "step": 3848 }, { "epoch": 0.62, "grad_norm": 12.864544592814104, "learning_rate": 3.32987923073088e-06, "loss": 0.4345, "step": 3849 }, { "epoch": 0.62, "grad_norm": 8.392522288107367, "learning_rate": 3.327419619695591e-06, "loss": 0.4896, "step": 3850 }, { "epoch": 0.62, "grad_norm": 6.465387259429603, "learning_rate": 3.3249604643173037e-06, "loss": 0.487, "step": 3851 }, { "epoch": 0.62, "grad_norm": 5.074298185066563, "learning_rate": 3.3225017652659577e-06, "loss": 0.3377, "step": 3852 }, { "epoch": 0.62, "grad_norm": 9.399331281596357, "learning_rate": 3.3200435232113694e-06, "loss": 0.432, "step": 3853 }, { "epoch": 0.62, "grad_norm": 6.315498840291, "learning_rate": 3.3175857388232376e-06, "loss": 0.3759, "step": 3854 }, { "epoch": 0.62, "grad_norm": 7.302053744739064, "learning_rate": 3.3151284127711227e-06, "loss": 0.3961, "step": 3855 }, { "epoch": 0.62, "grad_norm": 7.77417220119939, "learning_rate": 3.312671545724474e-06, "loss": 0.3881, "step": 3856 }, { "epoch": 0.62, "grad_norm": 11.077963968298171, "learning_rate": 3.3102151383526077e-06, "loss": 0.4117, "step": 3857 }, { "epoch": 0.62, "grad_norm": 7.02025396257588, "learning_rate": 3.3077591913247166e-06, "loss": 0.4116, "step": 3858 }, { "epoch": 0.62, "grad_norm": 8.035588478737553, "learning_rate": 3.305303705309868e-06, "loss": 0.4428, "step": 3859 }, { "epoch": 0.62, "grad_norm": 8.161759837477927, "learning_rate": 3.3028486809770046e-06, "loss": 0.4482, "step": 3860 }, { "epoch": 0.62, "grad_norm": 7.542874776982548, "learning_rate": 3.300394118994944e-06, "loss": 0.4085, "step": 3861 }, { "epoch": 0.62, "grad_norm": 10.103733324244878, "learning_rate": 3.297940020032374e-06, "loss": 0.4382, "step": 3862 }, { "epoch": 0.62, "grad_norm": 1.204240749145806, "learning_rate": 3.295486384757861e-06, "loss": 0.4924, "step": 3863 }, { "epoch": 0.62, "grad_norm": 24.438732780196176, "learning_rate": 3.2930332138398422e-06, "loss": 0.3813, "step": 3864 }, { "epoch": 0.62, "grad_norm": 7.095960342518633, "learning_rate": 3.2905805079466284e-06, "loss": 0.4399, "step": 3865 }, { "epoch": 0.62, "grad_norm": 13.231829863412326, "learning_rate": 3.2881282677464034e-06, "loss": 0.4536, "step": 3866 }, { "epoch": 0.62, "grad_norm": 5.483480643881775, "learning_rate": 3.2856764939072294e-06, "loss": 0.3819, "step": 3867 }, { "epoch": 0.62, "grad_norm": 1.1071807986244537, "learning_rate": 3.283225187097031e-06, "loss": 0.4808, "step": 3868 }, { "epoch": 0.62, "grad_norm": 7.464704668664216, "learning_rate": 3.2807743479836155e-06, "loss": 0.3978, "step": 3869 }, { "epoch": 0.62, "grad_norm": 9.297451467508541, "learning_rate": 3.27832397723466e-06, "loss": 0.4092, "step": 3870 }, { "epoch": 0.62, "grad_norm": 10.227927326257817, "learning_rate": 3.27587407551771e-06, "loss": 0.4667, "step": 3871 }, { "epoch": 0.62, "grad_norm": 7.645441072236689, "learning_rate": 3.273424643500187e-06, "loss": 0.4388, "step": 3872 }, { "epoch": 0.62, "grad_norm": 5.832650695736625, "learning_rate": 3.2709756818493867e-06, "loss": 0.4105, "step": 3873 }, { "epoch": 0.62, "grad_norm": 10.263621282172071, "learning_rate": 3.26852719123247e-06, "loss": 0.434, "step": 3874 }, { "epoch": 0.62, "grad_norm": 55.43658253287473, "learning_rate": 3.266079172316473e-06, "loss": 0.411, "step": 3875 }, { "epoch": 0.62, "grad_norm": 6.620225794016527, "learning_rate": 3.263631625768309e-06, "loss": 0.4445, "step": 3876 }, { "epoch": 0.62, "grad_norm": 8.40532565964153, "learning_rate": 3.2611845522547503e-06, "loss": 0.425, "step": 3877 }, { "epoch": 0.62, "grad_norm": 6.454488073777527, "learning_rate": 3.2587379524424513e-06, "loss": 0.4532, "step": 3878 }, { "epoch": 0.62, "grad_norm": 6.984955103541048, "learning_rate": 3.2562918269979334e-06, "loss": 0.4105, "step": 3879 }, { "epoch": 0.63, "grad_norm": 5.221844218672381, "learning_rate": 3.253846176587586e-06, "loss": 0.458, "step": 3880 }, { "epoch": 0.63, "grad_norm": 6.837717736597488, "learning_rate": 3.251401001877673e-06, "loss": 0.5099, "step": 3881 }, { "epoch": 0.63, "grad_norm": 1.4254505434014106, "learning_rate": 3.2489563035343276e-06, "loss": 0.4804, "step": 3882 }, { "epoch": 0.63, "grad_norm": 9.259000450344796, "learning_rate": 3.246512082223555e-06, "loss": 0.3999, "step": 3883 }, { "epoch": 0.63, "grad_norm": 7.7410027747933725, "learning_rate": 3.2440683386112238e-06, "loss": 0.4594, "step": 3884 }, { "epoch": 0.63, "grad_norm": 11.116035129808385, "learning_rate": 3.24162507336308e-06, "loss": 0.3986, "step": 3885 }, { "epoch": 0.63, "grad_norm": 6.969200842735214, "learning_rate": 3.2391822871447377e-06, "loss": 0.3892, "step": 3886 }, { "epoch": 0.63, "grad_norm": 9.349340535556742, "learning_rate": 3.2367399806216765e-06, "loss": 0.4179, "step": 3887 }, { "epoch": 0.63, "grad_norm": 1.2729133500196799, "learning_rate": 3.234298154459249e-06, "loss": 0.4769, "step": 3888 }, { "epoch": 0.63, "grad_norm": 5.140624857035874, "learning_rate": 3.231856809322677e-06, "loss": 0.3546, "step": 3889 }, { "epoch": 0.63, "grad_norm": 5.9891059664066155, "learning_rate": 3.229415945877048e-06, "loss": 0.349, "step": 3890 }, { "epoch": 0.63, "grad_norm": 5.826321831394866, "learning_rate": 3.226975564787322e-06, "loss": 0.4171, "step": 3891 }, { "epoch": 0.63, "grad_norm": 8.16171566729037, "learning_rate": 3.224535666718327e-06, "loss": 0.3866, "step": 3892 }, { "epoch": 0.63, "grad_norm": 9.359787585511752, "learning_rate": 3.2220962523347567e-06, "loss": 0.4196, "step": 3893 }, { "epoch": 0.63, "grad_norm": 10.413002510828044, "learning_rate": 3.219657322301175e-06, "loss": 0.4043, "step": 3894 }, { "epoch": 0.63, "grad_norm": 9.185744630369937, "learning_rate": 3.2172188772820154e-06, "loss": 0.4139, "step": 3895 }, { "epoch": 0.63, "grad_norm": 6.649074456561787, "learning_rate": 3.214780917941575e-06, "loss": 0.3456, "step": 3896 }, { "epoch": 0.63, "grad_norm": 7.444724143890425, "learning_rate": 3.212343444944022e-06, "loss": 0.4098, "step": 3897 }, { "epoch": 0.63, "grad_norm": 5.08327011452638, "learning_rate": 3.209906458953394e-06, "loss": 0.4812, "step": 3898 }, { "epoch": 0.63, "grad_norm": 6.938854776164191, "learning_rate": 3.207469960633588e-06, "loss": 0.4074, "step": 3899 }, { "epoch": 0.63, "grad_norm": 6.325535071971464, "learning_rate": 3.2050339506483774e-06, "loss": 0.398, "step": 3900 }, { "epoch": 0.63, "grad_norm": 7.2926373815058145, "learning_rate": 3.2025984296613965e-06, "loss": 0.4667, "step": 3901 }, { "epoch": 0.63, "grad_norm": 4.915290186431194, "learning_rate": 3.200163398336151e-06, "loss": 0.4143, "step": 3902 }, { "epoch": 0.63, "grad_norm": 14.470093075954667, "learning_rate": 3.1977288573360064e-06, "loss": 0.3875, "step": 3903 }, { "epoch": 0.63, "grad_norm": 7.0085829274895435, "learning_rate": 3.1952948073242006e-06, "loss": 0.4297, "step": 3904 }, { "epoch": 0.63, "grad_norm": 11.077590310644805, "learning_rate": 3.19286124896384e-06, "loss": 0.4757, "step": 3905 }, { "epoch": 0.63, "grad_norm": 6.812140673266212, "learning_rate": 3.190428182917885e-06, "loss": 0.4312, "step": 3906 }, { "epoch": 0.63, "grad_norm": 10.613660991900307, "learning_rate": 3.187995609849176e-06, "loss": 0.4225, "step": 3907 }, { "epoch": 0.63, "grad_norm": 5.314089279811991, "learning_rate": 3.1855635304204113e-06, "loss": 0.3917, "step": 3908 }, { "epoch": 0.63, "grad_norm": 6.001666234912583, "learning_rate": 3.1831319452941557e-06, "loss": 0.4231, "step": 3909 }, { "epoch": 0.63, "grad_norm": 5.595272503287338, "learning_rate": 3.1807008551328407e-06, "loss": 0.4178, "step": 3910 }, { "epoch": 0.63, "grad_norm": 16.558727964983014, "learning_rate": 3.1782702605987623e-06, "loss": 0.3813, "step": 3911 }, { "epoch": 0.63, "grad_norm": 7.94579978481947, "learning_rate": 3.175840162354081e-06, "loss": 0.5287, "step": 3912 }, { "epoch": 0.63, "grad_norm": 12.6798435583847, "learning_rate": 3.1734105610608213e-06, "loss": 0.4233, "step": 3913 }, { "epoch": 0.63, "grad_norm": 14.5699831367811, "learning_rate": 3.1709814573808766e-06, "loss": 0.3999, "step": 3914 }, { "epoch": 0.63, "grad_norm": 8.218165217062538, "learning_rate": 3.168552851976e-06, "loss": 0.382, "step": 3915 }, { "epoch": 0.63, "grad_norm": 11.811377247396477, "learning_rate": 3.1661247455078097e-06, "loss": 0.4672, "step": 3916 }, { "epoch": 0.63, "grad_norm": 8.592689211836243, "learning_rate": 3.163697138637791e-06, "loss": 0.4739, "step": 3917 }, { "epoch": 0.63, "grad_norm": 10.725786428568863, "learning_rate": 3.161270032027289e-06, "loss": 0.503, "step": 3918 }, { "epoch": 0.63, "grad_norm": 8.493780431419871, "learning_rate": 3.1588434263375146e-06, "loss": 0.4439, "step": 3919 }, { "epoch": 0.63, "grad_norm": 1.2410384633187128, "learning_rate": 3.156417322229543e-06, "loss": 0.4685, "step": 3920 }, { "epoch": 0.63, "grad_norm": 7.98844777471595, "learning_rate": 3.153991720364313e-06, "loss": 0.4306, "step": 3921 }, { "epoch": 0.63, "grad_norm": 11.036505276721325, "learning_rate": 3.151566621402622e-06, "loss": 0.5004, "step": 3922 }, { "epoch": 0.63, "grad_norm": 11.231747102755259, "learning_rate": 3.1491420260051362e-06, "loss": 0.4135, "step": 3923 }, { "epoch": 0.63, "grad_norm": 8.11558368618105, "learning_rate": 3.146717934832383e-06, "loss": 0.4053, "step": 3924 }, { "epoch": 0.63, "grad_norm": 4.633279402436404, "learning_rate": 3.1442943485447493e-06, "loss": 0.416, "step": 3925 }, { "epoch": 0.63, "grad_norm": 10.104233276438467, "learning_rate": 3.1418712678024866e-06, "loss": 0.4342, "step": 3926 }, { "epoch": 0.63, "grad_norm": 8.910542264331065, "learning_rate": 3.1394486932657133e-06, "loss": 0.3737, "step": 3927 }, { "epoch": 0.63, "grad_norm": 1.2150810229222804, "learning_rate": 3.137026625594399e-06, "loss": 0.491, "step": 3928 }, { "epoch": 0.63, "grad_norm": 8.00445592365204, "learning_rate": 3.1346050654483867e-06, "loss": 0.3584, "step": 3929 }, { "epoch": 0.63, "grad_norm": 13.498654365312982, "learning_rate": 3.132184013487375e-06, "loss": 0.4198, "step": 3930 }, { "epoch": 0.63, "grad_norm": 10.36515321305887, "learning_rate": 3.129763470370924e-06, "loss": 0.3742, "step": 3931 }, { "epoch": 0.63, "grad_norm": 16.03945945439608, "learning_rate": 3.1273434367584567e-06, "loss": 0.4737, "step": 3932 }, { "epoch": 0.63, "grad_norm": 4.92103714372606, "learning_rate": 3.124923913309259e-06, "loss": 0.4227, "step": 3933 }, { "epoch": 0.63, "grad_norm": 9.31115060656964, "learning_rate": 3.1225049006824724e-06, "loss": 0.4625, "step": 3934 }, { "epoch": 0.63, "grad_norm": 6.868615355852885, "learning_rate": 3.1200863995371035e-06, "loss": 0.4283, "step": 3935 }, { "epoch": 0.63, "grad_norm": 5.044001908186342, "learning_rate": 3.1176684105320208e-06, "loss": 0.405, "step": 3936 }, { "epoch": 0.63, "grad_norm": 5.718611906311166, "learning_rate": 3.1152509343259494e-06, "loss": 0.3942, "step": 3937 }, { "epoch": 0.63, "grad_norm": 6.90730884931657, "learning_rate": 3.112833971577478e-06, "loss": 0.4438, "step": 3938 }, { "epoch": 0.63, "grad_norm": 7.528634175574992, "learning_rate": 3.1104175229450517e-06, "loss": 0.4525, "step": 3939 }, { "epoch": 0.63, "grad_norm": 5.242032131667083, "learning_rate": 3.1080015890869796e-06, "loss": 0.4349, "step": 3940 }, { "epoch": 0.63, "grad_norm": 6.186320444867246, "learning_rate": 3.1055861706614264e-06, "loss": 0.3913, "step": 3941 }, { "epoch": 0.64, "grad_norm": 5.684996804129319, "learning_rate": 3.1031712683264204e-06, "loss": 0.3745, "step": 3942 }, { "epoch": 0.64, "grad_norm": 7.057233039378118, "learning_rate": 3.1007568827398495e-06, "loss": 0.3881, "step": 3943 }, { "epoch": 0.64, "grad_norm": 7.267704559789786, "learning_rate": 3.0983430145594547e-06, "loss": 0.3886, "step": 3944 }, { "epoch": 0.64, "grad_norm": 5.302044610716438, "learning_rate": 3.0959296644428427e-06, "loss": 0.3984, "step": 3945 }, { "epoch": 0.64, "grad_norm": 8.549173992354037, "learning_rate": 3.0935168330474763e-06, "loss": 0.4953, "step": 3946 }, { "epoch": 0.64, "grad_norm": 5.603657268953786, "learning_rate": 3.0911045210306767e-06, "loss": 0.3674, "step": 3947 }, { "epoch": 0.64, "grad_norm": 7.1604961072220465, "learning_rate": 3.088692729049624e-06, "loss": 0.4192, "step": 3948 }, { "epoch": 0.64, "grad_norm": 1.08077714788108, "learning_rate": 3.0862814577613598e-06, "loss": 0.4532, "step": 3949 }, { "epoch": 0.64, "grad_norm": 1.3097702487268492, "learning_rate": 3.083870707822776e-06, "loss": 0.4987, "step": 3950 }, { "epoch": 0.64, "grad_norm": 11.437046106916698, "learning_rate": 3.08146047989063e-06, "loss": 0.4311, "step": 3951 }, { "epoch": 0.64, "grad_norm": 8.133323429468513, "learning_rate": 3.079050774621536e-06, "loss": 0.503, "step": 3952 }, { "epoch": 0.64, "grad_norm": 8.50629168000756, "learning_rate": 3.0766415926719606e-06, "loss": 0.4408, "step": 3953 }, { "epoch": 0.64, "grad_norm": 18.933211029018103, "learning_rate": 3.074232934698234e-06, "loss": 0.4829, "step": 3954 }, { "epoch": 0.64, "grad_norm": 7.972536101586685, "learning_rate": 3.07182480135654e-06, "loss": 0.4056, "step": 3955 }, { "epoch": 0.64, "grad_norm": 5.2784843992299715, "learning_rate": 3.06941719330292e-06, "loss": 0.4231, "step": 3956 }, { "epoch": 0.64, "grad_norm": 6.038697659989545, "learning_rate": 3.067010111193272e-06, "loss": 0.4833, "step": 3957 }, { "epoch": 0.64, "grad_norm": 10.645402990183074, "learning_rate": 3.0646035556833563e-06, "loss": 0.4302, "step": 3958 }, { "epoch": 0.64, "grad_norm": 10.634149823846363, "learning_rate": 3.0621975274287784e-06, "loss": 0.4277, "step": 3959 }, { "epoch": 0.64, "grad_norm": 6.424690088210346, "learning_rate": 3.059792027085011e-06, "loss": 0.4327, "step": 3960 }, { "epoch": 0.64, "grad_norm": 5.553280557506933, "learning_rate": 3.0573870553073776e-06, "loss": 0.4139, "step": 3961 }, { "epoch": 0.64, "grad_norm": 10.681732186006151, "learning_rate": 3.0549826127510595e-06, "loss": 0.4596, "step": 3962 }, { "epoch": 0.64, "grad_norm": 6.385413164021655, "learning_rate": 3.0525787000710915e-06, "loss": 0.347, "step": 3963 }, { "epoch": 0.64, "grad_norm": 6.152752065287386, "learning_rate": 3.0501753179223657e-06, "loss": 0.3941, "step": 3964 }, { "epoch": 0.64, "grad_norm": 8.953582916236869, "learning_rate": 3.0477724669596326e-06, "loss": 0.4578, "step": 3965 }, { "epoch": 0.64, "grad_norm": 7.498910858175644, "learning_rate": 3.045370147837492e-06, "loss": 0.4675, "step": 3966 }, { "epoch": 0.64, "grad_norm": 10.060906483355321, "learning_rate": 3.042968361210403e-06, "loss": 0.4214, "step": 3967 }, { "epoch": 0.64, "grad_norm": 10.71558569204994, "learning_rate": 3.040567107732679e-06, "loss": 0.475, "step": 3968 }, { "epoch": 0.64, "grad_norm": 5.157128913000169, "learning_rate": 3.0381663880584855e-06, "loss": 0.3907, "step": 3969 }, { "epoch": 0.64, "grad_norm": 6.5998373731786595, "learning_rate": 3.0357662028418455e-06, "loss": 0.4336, "step": 3970 }, { "epoch": 0.64, "grad_norm": 5.418040781831431, "learning_rate": 3.0333665527366394e-06, "loss": 0.4271, "step": 3971 }, { "epoch": 0.64, "grad_norm": 12.156264782261951, "learning_rate": 3.0309674383965915e-06, "loss": 0.431, "step": 3972 }, { "epoch": 0.64, "grad_norm": 8.216031073810308, "learning_rate": 3.0285688604752916e-06, "loss": 0.4591, "step": 3973 }, { "epoch": 0.64, "grad_norm": 1.262753958624208, "learning_rate": 3.026170819626178e-06, "loss": 0.5165, "step": 3974 }, { "epoch": 0.64, "grad_norm": 8.654005532859886, "learning_rate": 3.0237733165025408e-06, "loss": 0.4638, "step": 3975 }, { "epoch": 0.64, "grad_norm": 7.048303571177509, "learning_rate": 3.021376351757527e-06, "loss": 0.4399, "step": 3976 }, { "epoch": 0.64, "grad_norm": 10.179051111259673, "learning_rate": 3.0189799260441377e-06, "loss": 0.4978, "step": 3977 }, { "epoch": 0.64, "grad_norm": 11.254755996971404, "learning_rate": 3.0165840400152218e-06, "loss": 0.4282, "step": 3978 }, { "epoch": 0.64, "grad_norm": 8.758616489528508, "learning_rate": 3.014188694323486e-06, "loss": 0.4649, "step": 3979 }, { "epoch": 0.64, "grad_norm": 8.131247225133208, "learning_rate": 3.0117938896214904e-06, "loss": 0.4255, "step": 3980 }, { "epoch": 0.64, "grad_norm": 13.186642966683491, "learning_rate": 3.0093996265616447e-06, "loss": 0.4065, "step": 3981 }, { "epoch": 0.64, "grad_norm": 10.123510012404244, "learning_rate": 3.007005905796212e-06, "loss": 0.4442, "step": 3982 }, { "epoch": 0.64, "grad_norm": 6.047122857490186, "learning_rate": 3.0046127279773067e-06, "loss": 0.4492, "step": 3983 }, { "epoch": 0.64, "grad_norm": 6.592404601568773, "learning_rate": 3.002220093756899e-06, "loss": 0.5083, "step": 3984 }, { "epoch": 0.64, "grad_norm": 8.218807573454606, "learning_rate": 2.999828003786806e-06, "loss": 0.4177, "step": 3985 }, { "epoch": 0.64, "grad_norm": 10.038340043835284, "learning_rate": 2.9974364587186988e-06, "loss": 0.4441, "step": 3986 }, { "epoch": 0.64, "grad_norm": 11.697258013265095, "learning_rate": 2.995045459204104e-06, "loss": 0.4243, "step": 3987 }, { "epoch": 0.64, "grad_norm": 6.5521696703458385, "learning_rate": 2.9926550058943905e-06, "loss": 0.4101, "step": 3988 }, { "epoch": 0.64, "grad_norm": 12.55235533345434, "learning_rate": 2.9902650994407867e-06, "loss": 0.483, "step": 3989 }, { "epoch": 0.64, "grad_norm": 15.623612132289999, "learning_rate": 2.9878757404943694e-06, "loss": 0.4687, "step": 3990 }, { "epoch": 0.64, "grad_norm": 11.771551649035427, "learning_rate": 2.985486929706064e-06, "loss": 0.4644, "step": 3991 }, { "epoch": 0.64, "grad_norm": 10.948094345839378, "learning_rate": 2.9830986677266495e-06, "loss": 0.3678, "step": 3992 }, { "epoch": 0.64, "grad_norm": 13.72985317510009, "learning_rate": 2.980710955206755e-06, "loss": 0.4085, "step": 3993 }, { "epoch": 0.64, "grad_norm": 9.261450208768267, "learning_rate": 2.9783237927968567e-06, "loss": 0.3972, "step": 3994 }, { "epoch": 0.64, "grad_norm": 10.76624855982844, "learning_rate": 2.9759371811472857e-06, "loss": 0.4219, "step": 3995 }, { "epoch": 0.64, "grad_norm": 12.306752379919638, "learning_rate": 2.9735511209082213e-06, "loss": 0.4408, "step": 3996 }, { "epoch": 0.64, "grad_norm": 10.812893461065988, "learning_rate": 2.9711656127296895e-06, "loss": 0.4182, "step": 3997 }, { "epoch": 0.64, "grad_norm": 12.090564397952443, "learning_rate": 2.968780657261571e-06, "loss": 0.4029, "step": 3998 }, { "epoch": 0.64, "grad_norm": 8.88600991784268, "learning_rate": 2.96639625515359e-06, "loss": 0.4332, "step": 3999 }, { "epoch": 0.64, "grad_norm": 14.26690256779112, "learning_rate": 2.9640124070553296e-06, "loss": 0.4561, "step": 4000 }, { "epoch": 0.64, "grad_norm": 12.92138213722854, "learning_rate": 2.961629113616209e-06, "loss": 0.4174, "step": 4001 }, { "epoch": 0.64, "grad_norm": 11.860403783133458, "learning_rate": 2.959246375485506e-06, "loss": 0.3936, "step": 4002 }, { "epoch": 0.64, "grad_norm": 12.89227905917057, "learning_rate": 2.9568641933123456e-06, "loss": 0.4895, "step": 4003 }, { "epoch": 0.65, "grad_norm": 10.61796083498606, "learning_rate": 2.954482567745697e-06, "loss": 0.4174, "step": 4004 }, { "epoch": 0.65, "grad_norm": 9.500493743748446, "learning_rate": 2.9521014994343823e-06, "loss": 0.4561, "step": 4005 }, { "epoch": 0.65, "grad_norm": 11.933944810350585, "learning_rate": 2.9497209890270704e-06, "loss": 0.3906, "step": 4006 }, { "epoch": 0.65, "grad_norm": 33.31742745070077, "learning_rate": 2.947341037172277e-06, "loss": 0.424, "step": 4007 }, { "epoch": 0.65, "grad_norm": 10.277650679488188, "learning_rate": 2.944961644518366e-06, "loss": 0.4421, "step": 4008 }, { "epoch": 0.65, "grad_norm": 10.494944992363877, "learning_rate": 2.942582811713553e-06, "loss": 0.4046, "step": 4009 }, { "epoch": 0.65, "grad_norm": 24.258999854553792, "learning_rate": 2.940204539405892e-06, "loss": 0.4198, "step": 4010 }, { "epoch": 0.65, "grad_norm": 17.093710182987405, "learning_rate": 2.937826828243294e-06, "loss": 0.39, "step": 4011 }, { "epoch": 0.65, "grad_norm": 13.89987432976777, "learning_rate": 2.9354496788735145e-06, "loss": 0.4146, "step": 4012 }, { "epoch": 0.65, "grad_norm": 11.784108394930481, "learning_rate": 2.9330730919441498e-06, "loss": 0.4099, "step": 4013 }, { "epoch": 0.65, "grad_norm": 8.166655027292675, "learning_rate": 2.9306970681026503e-06, "loss": 0.437, "step": 4014 }, { "epoch": 0.65, "grad_norm": 6.730180386052663, "learning_rate": 2.9283216079963108e-06, "loss": 0.4202, "step": 4015 }, { "epoch": 0.65, "grad_norm": 10.081434382911809, "learning_rate": 2.9259467122722705e-06, "loss": 0.383, "step": 4016 }, { "epoch": 0.65, "grad_norm": 73.12482241273136, "learning_rate": 2.9235723815775167e-06, "loss": 0.4303, "step": 4017 }, { "epoch": 0.65, "grad_norm": 21.97171023440878, "learning_rate": 2.9211986165588856e-06, "loss": 0.4342, "step": 4018 }, { "epoch": 0.65, "grad_norm": 8.246003768055832, "learning_rate": 2.9188254178630526e-06, "loss": 0.4349, "step": 4019 }, { "epoch": 0.65, "grad_norm": 6.053694361853395, "learning_rate": 2.916452786136542e-06, "loss": 0.4068, "step": 4020 }, { "epoch": 0.65, "grad_norm": 8.257174461020268, "learning_rate": 2.914080722025728e-06, "loss": 0.4788, "step": 4021 }, { "epoch": 0.65, "grad_norm": 24.18150139444251, "learning_rate": 2.9117092261768247e-06, "loss": 0.4285, "step": 4022 }, { "epoch": 0.65, "grad_norm": 6.720649052335874, "learning_rate": 2.9093382992358897e-06, "loss": 0.3947, "step": 4023 }, { "epoch": 0.65, "grad_norm": 26.162742092696202, "learning_rate": 2.906967941848834e-06, "loss": 0.3844, "step": 4024 }, { "epoch": 0.65, "grad_norm": 11.461626079351024, "learning_rate": 2.9045981546614057e-06, "loss": 0.4242, "step": 4025 }, { "epoch": 0.65, "grad_norm": 8.475126822089058, "learning_rate": 2.9022289383191972e-06, "loss": 0.4235, "step": 4026 }, { "epoch": 0.65, "grad_norm": 8.044076016056755, "learning_rate": 2.899860293467652e-06, "loss": 0.3326, "step": 4027 }, { "epoch": 0.65, "grad_norm": 8.861613646683157, "learning_rate": 2.897492220752057e-06, "loss": 0.4364, "step": 4028 }, { "epoch": 0.65, "grad_norm": 1.2582715157697517, "learning_rate": 2.8951247208175337e-06, "loss": 0.4621, "step": 4029 }, { "epoch": 0.65, "grad_norm": 5.210951359886305, "learning_rate": 2.8927577943090574e-06, "loss": 0.4227, "step": 4030 }, { "epoch": 0.65, "grad_norm": 10.49034146872277, "learning_rate": 2.890391441871446e-06, "loss": 0.4019, "step": 4031 }, { "epoch": 0.65, "grad_norm": 19.47035804206972, "learning_rate": 2.8880256641493582e-06, "loss": 0.381, "step": 4032 }, { "epoch": 0.65, "grad_norm": 10.213800843593058, "learning_rate": 2.885660461787294e-06, "loss": 0.4149, "step": 4033 }, { "epoch": 0.65, "grad_norm": 10.920550682368328, "learning_rate": 2.8832958354296048e-06, "loss": 0.4335, "step": 4034 }, { "epoch": 0.65, "grad_norm": 6.728176713723922, "learning_rate": 2.880931785720474e-06, "loss": 0.3801, "step": 4035 }, { "epoch": 0.65, "grad_norm": 8.380266568796461, "learning_rate": 2.8785683133039404e-06, "loss": 0.4631, "step": 4036 }, { "epoch": 0.65, "grad_norm": 12.318999960644637, "learning_rate": 2.876205418823875e-06, "loss": 0.3883, "step": 4037 }, { "epoch": 0.65, "grad_norm": 4.542671164814548, "learning_rate": 2.8738431029239954e-06, "loss": 0.4403, "step": 4038 }, { "epoch": 0.65, "grad_norm": 11.752702610731474, "learning_rate": 2.8714813662478633e-06, "loss": 0.4053, "step": 4039 }, { "epoch": 0.65, "grad_norm": 6.301703294975158, "learning_rate": 2.869120209438879e-06, "loss": 0.4629, "step": 4040 }, { "epoch": 0.65, "grad_norm": 7.562987473029538, "learning_rate": 2.8667596331402892e-06, "loss": 0.4491, "step": 4041 }, { "epoch": 0.65, "grad_norm": 7.962089496456033, "learning_rate": 2.864399637995176e-06, "loss": 0.3567, "step": 4042 }, { "epoch": 0.65, "grad_norm": 1.0472967021520163, "learning_rate": 2.8620402246464717e-06, "loss": 0.442, "step": 4043 }, { "epoch": 0.65, "grad_norm": 16.701923541017162, "learning_rate": 2.8596813937369437e-06, "loss": 0.3999, "step": 4044 }, { "epoch": 0.65, "grad_norm": 7.337274947531368, "learning_rate": 2.8573231459092007e-06, "loss": 0.4207, "step": 4045 }, { "epoch": 0.65, "grad_norm": 1.1493757880078077, "learning_rate": 2.854965481805697e-06, "loss": 0.4299, "step": 4046 }, { "epoch": 0.65, "grad_norm": 8.302567736593238, "learning_rate": 2.852608402068725e-06, "loss": 0.4168, "step": 4047 }, { "epoch": 0.65, "grad_norm": 5.885929386480481, "learning_rate": 2.8502519073404155e-06, "loss": 0.3532, "step": 4048 }, { "epoch": 0.65, "grad_norm": 6.912332766439919, "learning_rate": 2.847895998262744e-06, "loss": 0.4568, "step": 4049 }, { "epoch": 0.65, "grad_norm": 6.190630551580049, "learning_rate": 2.8455406754775305e-06, "loss": 0.4863, "step": 4050 }, { "epoch": 0.65, "grad_norm": 12.775629984632445, "learning_rate": 2.8431859396264203e-06, "loss": 0.4041, "step": 4051 }, { "epoch": 0.65, "grad_norm": 5.701422672429989, "learning_rate": 2.8408317913509137e-06, "loss": 0.37, "step": 4052 }, { "epoch": 0.65, "grad_norm": 10.62754919406745, "learning_rate": 2.8384782312923466e-06, "loss": 0.3765, "step": 4053 }, { "epoch": 0.65, "grad_norm": 25.284005747120446, "learning_rate": 2.836125260091892e-06, "loss": 0.4581, "step": 4054 }, { "epoch": 0.65, "grad_norm": 8.238369254751749, "learning_rate": 2.8337728783905618e-06, "loss": 0.4433, "step": 4055 }, { "epoch": 0.65, "grad_norm": 13.346183507852489, "learning_rate": 2.8314210868292145e-06, "loss": 0.4889, "step": 4056 }, { "epoch": 0.65, "grad_norm": 6.968675860088378, "learning_rate": 2.82906988604854e-06, "loss": 0.38, "step": 4057 }, { "epoch": 0.65, "grad_norm": 10.293968643609517, "learning_rate": 2.8267192766890685e-06, "loss": 0.4259, "step": 4058 }, { "epoch": 0.65, "grad_norm": 7.523391278492276, "learning_rate": 2.824369259391173e-06, "loss": 0.4074, "step": 4059 }, { "epoch": 0.65, "grad_norm": 1.2710245453089868, "learning_rate": 2.822019834795065e-06, "loss": 0.4766, "step": 4060 }, { "epoch": 0.65, "grad_norm": 6.873122286942166, "learning_rate": 2.8196710035407905e-06, "loss": 0.4019, "step": 4061 }, { "epoch": 0.65, "grad_norm": 5.6436522698014455, "learning_rate": 2.8173227662682332e-06, "loss": 0.3839, "step": 4062 }, { "epoch": 0.65, "grad_norm": 11.158823722136578, "learning_rate": 2.814975123617123e-06, "loss": 0.4136, "step": 4063 }, { "epoch": 0.65, "grad_norm": 4.983087691082732, "learning_rate": 2.8126280762270177e-06, "loss": 0.4207, "step": 4064 }, { "epoch": 0.65, "grad_norm": 9.023024845041666, "learning_rate": 2.810281624737321e-06, "loss": 0.394, "step": 4065 }, { "epoch": 0.66, "grad_norm": 7.848112526262543, "learning_rate": 2.8079357697872696e-06, "loss": 0.4071, "step": 4066 }, { "epoch": 0.66, "grad_norm": 6.063030811260548, "learning_rate": 2.8055905120159375e-06, "loss": 0.4352, "step": 4067 }, { "epoch": 0.66, "grad_norm": 7.661607244408412, "learning_rate": 2.803245852062241e-06, "loss": 0.382, "step": 4068 }, { "epoch": 0.66, "grad_norm": 6.797194907618332, "learning_rate": 2.8009017905649283e-06, "loss": 0.441, "step": 4069 }, { "epoch": 0.66, "grad_norm": 6.380711409934143, "learning_rate": 2.7985583281625844e-06, "loss": 0.4258, "step": 4070 }, { "epoch": 0.66, "grad_norm": 7.609573554014029, "learning_rate": 2.7962154654936352e-06, "loss": 0.4529, "step": 4071 }, { "epoch": 0.66, "grad_norm": 8.397561185306046, "learning_rate": 2.7938732031963443e-06, "loss": 0.4127, "step": 4072 }, { "epoch": 0.66, "grad_norm": 5.8024437283177654, "learning_rate": 2.791531541908802e-06, "loss": 0.4632, "step": 4073 }, { "epoch": 0.66, "grad_norm": 6.980770748406464, "learning_rate": 2.7891904822689443e-06, "loss": 0.489, "step": 4074 }, { "epoch": 0.66, "grad_norm": 8.727460521582941, "learning_rate": 2.7868500249145414e-06, "loss": 0.3897, "step": 4075 }, { "epoch": 0.66, "grad_norm": 6.7826803324300835, "learning_rate": 2.784510170483199e-06, "loss": 0.3998, "step": 4076 }, { "epoch": 0.66, "grad_norm": 27.228951585944433, "learning_rate": 2.7821709196123535e-06, "loss": 0.4231, "step": 4077 }, { "epoch": 0.66, "grad_norm": 7.155809327604749, "learning_rate": 2.779832272939285e-06, "loss": 0.4571, "step": 4078 }, { "epoch": 0.66, "grad_norm": 9.903641862402326, "learning_rate": 2.7774942311011082e-06, "loss": 0.4137, "step": 4079 }, { "epoch": 0.66, "grad_norm": 15.644873383883255, "learning_rate": 2.7751567947347624e-06, "loss": 0.4626, "step": 4080 }, { "epoch": 0.66, "grad_norm": 8.029165368558605, "learning_rate": 2.772819964477035e-06, "loss": 0.4429, "step": 4081 }, { "epoch": 0.66, "grad_norm": 6.962690628802151, "learning_rate": 2.7704837409645425e-06, "loss": 0.3702, "step": 4082 }, { "epoch": 0.66, "grad_norm": 8.490858912493868, "learning_rate": 2.768148124833736e-06, "loss": 0.4769, "step": 4083 }, { "epoch": 0.66, "grad_norm": 7.019911488950765, "learning_rate": 2.765813116720901e-06, "loss": 0.5053, "step": 4084 }, { "epoch": 0.66, "grad_norm": 5.376564849420182, "learning_rate": 2.7634787172621593e-06, "loss": 0.4235, "step": 4085 }, { "epoch": 0.66, "grad_norm": 6.838580686097576, "learning_rate": 2.761144927093464e-06, "loss": 0.4535, "step": 4086 }, { "epoch": 0.66, "grad_norm": 13.232388174686443, "learning_rate": 2.7588117468506064e-06, "loss": 0.4325, "step": 4087 }, { "epoch": 0.66, "grad_norm": 11.874232325541847, "learning_rate": 2.756479177169208e-06, "loss": 0.3909, "step": 4088 }, { "epoch": 0.66, "grad_norm": 7.954044799090756, "learning_rate": 2.7541472186847224e-06, "loss": 0.4425, "step": 4089 }, { "epoch": 0.66, "grad_norm": 17.299557497791668, "learning_rate": 2.751815872032444e-06, "loss": 0.3656, "step": 4090 }, { "epoch": 0.66, "grad_norm": 5.251069936834028, "learning_rate": 2.7494851378474936e-06, "loss": 0.477, "step": 4091 }, { "epoch": 0.66, "grad_norm": 5.450700149587626, "learning_rate": 2.7471550167648255e-06, "loss": 0.3715, "step": 4092 }, { "epoch": 0.66, "grad_norm": 10.96852987781002, "learning_rate": 2.744825509419231e-06, "loss": 0.403, "step": 4093 }, { "epoch": 0.66, "grad_norm": 5.258096251162194, "learning_rate": 2.742496616445335e-06, "loss": 0.4288, "step": 4094 }, { "epoch": 0.66, "grad_norm": 6.670768883870506, "learning_rate": 2.740168338477587e-06, "loss": 0.377, "step": 4095 }, { "epoch": 0.66, "grad_norm": 6.096608459555834, "learning_rate": 2.7378406761502747e-06, "loss": 0.3798, "step": 4096 }, { "epoch": 0.66, "grad_norm": 6.823029148873207, "learning_rate": 2.7355136300975214e-06, "loss": 0.392, "step": 4097 }, { "epoch": 0.66, "grad_norm": 7.442626769692936, "learning_rate": 2.733187200953276e-06, "loss": 0.5114, "step": 4098 }, { "epoch": 0.66, "grad_norm": 5.937613130670315, "learning_rate": 2.7308613893513205e-06, "loss": 0.464, "step": 4099 }, { "epoch": 0.66, "grad_norm": 1.2974110953311102, "learning_rate": 2.7285361959252716e-06, "loss": 0.4829, "step": 4100 }, { "epoch": 0.66, "grad_norm": 7.017500787809507, "learning_rate": 2.72621162130858e-06, "loss": 0.3992, "step": 4101 }, { "epoch": 0.66, "grad_norm": 20.578963576315946, "learning_rate": 2.723887666134516e-06, "loss": 0.3943, "step": 4102 }, { "epoch": 0.66, "grad_norm": 10.076796391928223, "learning_rate": 2.721564331036194e-06, "loss": 0.3603, "step": 4103 }, { "epoch": 0.66, "grad_norm": 8.7410652564437, "learning_rate": 2.719241616646555e-06, "loss": 0.4251, "step": 4104 }, { "epoch": 0.66, "grad_norm": 15.370693187680478, "learning_rate": 2.716919523598369e-06, "loss": 0.3538, "step": 4105 }, { "epoch": 0.66, "grad_norm": 7.458244242672271, "learning_rate": 2.7145980525242367e-06, "loss": 0.4043, "step": 4106 }, { "epoch": 0.66, "grad_norm": 12.695412633924612, "learning_rate": 2.712277204056594e-06, "loss": 0.4154, "step": 4107 }, { "epoch": 0.66, "grad_norm": 7.944815976082589, "learning_rate": 2.7099569788277023e-06, "loss": 0.4163, "step": 4108 }, { "epoch": 0.66, "grad_norm": 1.2742473074045384, "learning_rate": 2.7076373774696527e-06, "loss": 0.469, "step": 4109 }, { "epoch": 0.66, "grad_norm": 10.480603220247222, "learning_rate": 2.705318400614374e-06, "loss": 0.5074, "step": 4110 }, { "epoch": 0.66, "grad_norm": 6.542335896165337, "learning_rate": 2.703000048893613e-06, "loss": 0.4193, "step": 4111 }, { "epoch": 0.66, "grad_norm": 5.216860934977393, "learning_rate": 2.7006823229389588e-06, "loss": 0.3678, "step": 4112 }, { "epoch": 0.66, "grad_norm": 11.700580206054259, "learning_rate": 2.6983652233818205e-06, "loss": 0.4373, "step": 4113 }, { "epoch": 0.66, "grad_norm": 8.132194919353882, "learning_rate": 2.6960487508534383e-06, "loss": 0.4861, "step": 4114 }, { "epoch": 0.66, "grad_norm": 5.44334421058753, "learning_rate": 2.693732905984885e-06, "loss": 0.4153, "step": 4115 }, { "epoch": 0.66, "grad_norm": 6.792853524770847, "learning_rate": 2.6914176894070644e-06, "loss": 0.414, "step": 4116 }, { "epoch": 0.66, "grad_norm": 13.170838629033128, "learning_rate": 2.6891031017506986e-06, "loss": 0.4608, "step": 4117 }, { "epoch": 0.66, "grad_norm": 5.219435461805495, "learning_rate": 2.686789143646347e-06, "loss": 0.4224, "step": 4118 }, { "epoch": 0.66, "grad_norm": 5.995582395523539, "learning_rate": 2.6844758157244e-06, "loss": 0.3665, "step": 4119 }, { "epoch": 0.66, "grad_norm": 9.581149161719905, "learning_rate": 2.6821631186150676e-06, "loss": 0.4009, "step": 4120 }, { "epoch": 0.66, "grad_norm": 6.263796594158633, "learning_rate": 2.6798510529483913e-06, "loss": 0.381, "step": 4121 }, { "epoch": 0.66, "grad_norm": 11.857085782382196, "learning_rate": 2.6775396193542436e-06, "loss": 0.4209, "step": 4122 }, { "epoch": 0.66, "grad_norm": 7.336189884862522, "learning_rate": 2.675228818462327e-06, "loss": 0.4072, "step": 4123 }, { "epoch": 0.66, "grad_norm": 6.768104492487895, "learning_rate": 2.6729186509021587e-06, "loss": 0.4567, "step": 4124 }, { "epoch": 0.66, "grad_norm": 9.614551832363667, "learning_rate": 2.670609117303096e-06, "loss": 0.5031, "step": 4125 }, { "epoch": 0.66, "grad_norm": 5.668503350971288, "learning_rate": 2.668300218294322e-06, "loss": 0.3761, "step": 4126 }, { "epoch": 0.66, "grad_norm": 6.431839544530715, "learning_rate": 2.6659919545048424e-06, "loss": 0.4532, "step": 4127 }, { "epoch": 0.67, "grad_norm": 5.121424885586337, "learning_rate": 2.663684326563489e-06, "loss": 0.461, "step": 4128 }, { "epoch": 0.67, "grad_norm": 8.361358820383945, "learning_rate": 2.6613773350989293e-06, "loss": 0.4436, "step": 4129 }, { "epoch": 0.67, "grad_norm": 6.402754343318197, "learning_rate": 2.6590709807396464e-06, "loss": 0.4158, "step": 4130 }, { "epoch": 0.67, "grad_norm": 10.995060547517761, "learning_rate": 2.656765264113955e-06, "loss": 0.443, "step": 4131 }, { "epoch": 0.67, "grad_norm": 6.632179340798159, "learning_rate": 2.65446018585e-06, "loss": 0.4339, "step": 4132 }, { "epoch": 0.67, "grad_norm": 11.316089583514849, "learning_rate": 2.652155746575743e-06, "loss": 0.4186, "step": 4133 }, { "epoch": 0.67, "grad_norm": 6.550851383671837, "learning_rate": 2.6498519469189814e-06, "loss": 0.388, "step": 4134 }, { "epoch": 0.67, "grad_norm": 12.196477300352107, "learning_rate": 2.6475487875073318e-06, "loss": 0.4677, "step": 4135 }, { "epoch": 0.67, "grad_norm": 4.904051962711692, "learning_rate": 2.6452462689682364e-06, "loss": 0.4143, "step": 4136 }, { "epoch": 0.67, "grad_norm": 8.285040720370983, "learning_rate": 2.6429443919289676e-06, "loss": 0.4171, "step": 4137 }, { "epoch": 0.67, "grad_norm": 6.425512741652333, "learning_rate": 2.640643157016618e-06, "loss": 0.3893, "step": 4138 }, { "epoch": 0.67, "grad_norm": 8.364313201013164, "learning_rate": 2.63834256485811e-06, "loss": 0.4263, "step": 4139 }, { "epoch": 0.67, "grad_norm": 7.290199349108534, "learning_rate": 2.636042616080185e-06, "loss": 0.4161, "step": 4140 }, { "epoch": 0.67, "grad_norm": 7.829513383505944, "learning_rate": 2.6337433113094164e-06, "loss": 0.4198, "step": 4141 }, { "epoch": 0.67, "grad_norm": 6.603742934833703, "learning_rate": 2.6314446511721957e-06, "loss": 0.4185, "step": 4142 }, { "epoch": 0.67, "grad_norm": 18.506489664743384, "learning_rate": 2.6291466362947392e-06, "loss": 0.3287, "step": 4143 }, { "epoch": 0.67, "grad_norm": 10.00200850338193, "learning_rate": 2.626849267303093e-06, "loss": 0.4395, "step": 4144 }, { "epoch": 0.67, "grad_norm": 10.055796381446534, "learning_rate": 2.624552544823126e-06, "loss": 0.3724, "step": 4145 }, { "epoch": 0.67, "grad_norm": 16.61425303145286, "learning_rate": 2.622256469480522e-06, "loss": 0.4729, "step": 4146 }, { "epoch": 0.67, "grad_norm": 10.841218608861091, "learning_rate": 2.6199610419007983e-06, "loss": 0.4358, "step": 4147 }, { "epoch": 0.67, "grad_norm": 8.704822476419505, "learning_rate": 2.617666262709294e-06, "loss": 0.435, "step": 4148 }, { "epoch": 0.67, "grad_norm": 7.107318093381502, "learning_rate": 2.61537213253117e-06, "loss": 0.3948, "step": 4149 }, { "epoch": 0.67, "grad_norm": 11.090248887518076, "learning_rate": 2.6130786519914074e-06, "loss": 0.4849, "step": 4150 }, { "epoch": 0.67, "grad_norm": 13.79445673411347, "learning_rate": 2.6107858217148185e-06, "loss": 0.4053, "step": 4151 }, { "epoch": 0.67, "grad_norm": 7.989762365147147, "learning_rate": 2.6084936423260303e-06, "loss": 0.4142, "step": 4152 }, { "epoch": 0.67, "grad_norm": 7.510794379134416, "learning_rate": 2.606202114449495e-06, "loss": 0.4327, "step": 4153 }, { "epoch": 0.67, "grad_norm": 9.272543144518421, "learning_rate": 2.60391123870949e-06, "loss": 0.4053, "step": 4154 }, { "epoch": 0.67, "grad_norm": 6.3434844864508495, "learning_rate": 2.601621015730111e-06, "loss": 0.4287, "step": 4155 }, { "epoch": 0.67, "grad_norm": 7.148290235405018, "learning_rate": 2.5993314461352802e-06, "loss": 0.4021, "step": 4156 }, { "epoch": 0.67, "grad_norm": 16.498149077975754, "learning_rate": 2.597042530548737e-06, "loss": 0.4485, "step": 4157 }, { "epoch": 0.67, "grad_norm": 8.350037216800475, "learning_rate": 2.5947542695940485e-06, "loss": 0.3529, "step": 4158 }, { "epoch": 0.67, "grad_norm": 9.697050795135292, "learning_rate": 2.5924666638945973e-06, "loss": 0.435, "step": 4159 }, { "epoch": 0.67, "grad_norm": 19.11128432461532, "learning_rate": 2.5901797140735895e-06, "loss": 0.4305, "step": 4160 }, { "epoch": 0.67, "grad_norm": 10.487376953269806, "learning_rate": 2.5878934207540564e-06, "loss": 0.4047, "step": 4161 }, { "epoch": 0.67, "grad_norm": 7.570922357007311, "learning_rate": 2.5856077845588433e-06, "loss": 0.4987, "step": 4162 }, { "epoch": 0.67, "grad_norm": 9.676241594060867, "learning_rate": 2.5833228061106253e-06, "loss": 0.4801, "step": 4163 }, { "epoch": 0.67, "grad_norm": 9.628210521812516, "learning_rate": 2.5810384860318904e-06, "loss": 0.4144, "step": 4164 }, { "epoch": 0.67, "grad_norm": 6.466535377421348, "learning_rate": 2.57875482494495e-06, "loss": 0.4123, "step": 4165 }, { "epoch": 0.67, "grad_norm": 6.752183075233021, "learning_rate": 2.576471823471938e-06, "loss": 0.3831, "step": 4166 }, { "epoch": 0.67, "grad_norm": 6.491606804773458, "learning_rate": 2.5741894822348073e-06, "loss": 0.416, "step": 4167 }, { "epoch": 0.67, "grad_norm": 5.814097043970005, "learning_rate": 2.571907801855327e-06, "loss": 0.4349, "step": 4168 }, { "epoch": 0.67, "grad_norm": 9.078920456171158, "learning_rate": 2.5696267829550926e-06, "loss": 0.3544, "step": 4169 }, { "epoch": 0.67, "grad_norm": 8.513759767746317, "learning_rate": 2.5673464261555174e-06, "loss": 0.3955, "step": 4170 }, { "epoch": 0.67, "grad_norm": 7.481594369983413, "learning_rate": 2.5650667320778333e-06, "loss": 0.3898, "step": 4171 }, { "epoch": 0.67, "grad_norm": 7.707875209016076, "learning_rate": 2.562787701343088e-06, "loss": 0.4974, "step": 4172 }, { "epoch": 0.67, "grad_norm": 10.053459390609397, "learning_rate": 2.5605093345721564e-06, "loss": 0.4082, "step": 4173 }, { "epoch": 0.67, "grad_norm": 6.93480955505699, "learning_rate": 2.558231632385728e-06, "loss": 0.3907, "step": 4174 }, { "epoch": 0.67, "grad_norm": 5.520104317005605, "learning_rate": 2.5559545954043086e-06, "loss": 0.3437, "step": 4175 }, { "epoch": 0.67, "grad_norm": 6.470803186495222, "learning_rate": 2.5536782242482294e-06, "loss": 0.3647, "step": 4176 }, { "epoch": 0.67, "grad_norm": 6.835280092168652, "learning_rate": 2.5514025195376336e-06, "loss": 0.3836, "step": 4177 }, { "epoch": 0.67, "grad_norm": 8.929742443683036, "learning_rate": 2.5491274818924893e-06, "loss": 0.4718, "step": 4178 }, { "epoch": 0.67, "grad_norm": 12.267339532483499, "learning_rate": 2.5468531119325756e-06, "loss": 0.4525, "step": 4179 }, { "epoch": 0.67, "grad_norm": 6.687945762334257, "learning_rate": 2.544579410277498e-06, "loss": 0.3754, "step": 4180 }, { "epoch": 0.67, "grad_norm": 1.2681804608139207, "learning_rate": 2.542306377546673e-06, "loss": 0.4831, "step": 4181 }, { "epoch": 0.67, "grad_norm": 12.806330170006689, "learning_rate": 2.540034014359335e-06, "loss": 0.3569, "step": 4182 }, { "epoch": 0.67, "grad_norm": 12.223113619149723, "learning_rate": 2.537762321334543e-06, "loss": 0.3795, "step": 4183 }, { "epoch": 0.67, "grad_norm": 1.2712539655738475, "learning_rate": 2.535491299091165e-06, "loss": 0.4972, "step": 4184 }, { "epoch": 0.67, "grad_norm": 9.46751671481261, "learning_rate": 2.533220948247893e-06, "loss": 0.3998, "step": 4185 }, { "epoch": 0.67, "grad_norm": 13.974185073279031, "learning_rate": 2.530951269423233e-06, "loss": 0.4172, "step": 4186 }, { "epoch": 0.67, "grad_norm": 15.806406716413644, "learning_rate": 2.528682263235504e-06, "loss": 0.4536, "step": 4187 }, { "epoch": 0.67, "grad_norm": 9.2876175864752, "learning_rate": 2.526413930302851e-06, "loss": 0.4463, "step": 4188 }, { "epoch": 0.67, "grad_norm": 21.148403140033174, "learning_rate": 2.5241462712432284e-06, "loss": 0.3989, "step": 4189 }, { "epoch": 0.68, "grad_norm": 8.583779532711976, "learning_rate": 2.5218792866744068e-06, "loss": 0.4018, "step": 4190 }, { "epoch": 0.68, "grad_norm": 7.576099076048648, "learning_rate": 2.5196129772139766e-06, "loss": 0.4609, "step": 4191 }, { "epoch": 0.68, "grad_norm": 12.629992210999397, "learning_rate": 2.517347343479346e-06, "loss": 0.4352, "step": 4192 }, { "epoch": 0.68, "grad_norm": 10.50223287720631, "learning_rate": 2.5150823860877345e-06, "loss": 0.4289, "step": 4193 }, { "epoch": 0.68, "grad_norm": 6.675515643351629, "learning_rate": 2.5128181056561755e-06, "loss": 0.4864, "step": 4194 }, { "epoch": 0.68, "grad_norm": 7.932977894506335, "learning_rate": 2.5105545028015266e-06, "loss": 0.4698, "step": 4195 }, { "epoch": 0.68, "grad_norm": 10.419095394884001, "learning_rate": 2.508291578140453e-06, "loss": 0.4309, "step": 4196 }, { "epoch": 0.68, "grad_norm": 7.956752727037793, "learning_rate": 2.5060293322894353e-06, "loss": 0.3392, "step": 4197 }, { "epoch": 0.68, "grad_norm": 9.946544990209416, "learning_rate": 2.503767765864774e-06, "loss": 0.4457, "step": 4198 }, { "epoch": 0.68, "grad_norm": 7.547597349860176, "learning_rate": 2.5015068794825853e-06, "loss": 0.4196, "step": 4199 }, { "epoch": 0.68, "grad_norm": 22.39995093987481, "learning_rate": 2.4992466737587932e-06, "loss": 0.3684, "step": 4200 }, { "epoch": 0.68, "grad_norm": 6.566526287942026, "learning_rate": 2.4969871493091398e-06, "loss": 0.4152, "step": 4201 }, { "epoch": 0.68, "grad_norm": 7.177585170492367, "learning_rate": 2.494728306749184e-06, "loss": 0.3991, "step": 4202 }, { "epoch": 0.68, "grad_norm": 10.969985854549728, "learning_rate": 2.4924701466942957e-06, "loss": 0.4845, "step": 4203 }, { "epoch": 0.68, "grad_norm": 7.5601311135489535, "learning_rate": 2.490212669759658e-06, "loss": 0.3901, "step": 4204 }, { "epoch": 0.68, "grad_norm": 10.958832086517287, "learning_rate": 2.487955876560274e-06, "loss": 0.4489, "step": 4205 }, { "epoch": 0.68, "grad_norm": 45.57462788417418, "learning_rate": 2.485699767710952e-06, "loss": 0.4994, "step": 4206 }, { "epoch": 0.68, "grad_norm": 19.15410495400765, "learning_rate": 2.4834443438263224e-06, "loss": 0.4083, "step": 4207 }, { "epoch": 0.68, "grad_norm": 5.714075635194001, "learning_rate": 2.4811896055208227e-06, "loss": 0.3931, "step": 4208 }, { "epoch": 0.68, "grad_norm": 8.44363030723576, "learning_rate": 2.4789355534087038e-06, "loss": 0.4449, "step": 4209 }, { "epoch": 0.68, "grad_norm": 6.566071689595343, "learning_rate": 2.476682188104036e-06, "loss": 0.3679, "step": 4210 }, { "epoch": 0.68, "grad_norm": 7.6310567158629805, "learning_rate": 2.4744295102206954e-06, "loss": 0.3644, "step": 4211 }, { "epoch": 0.68, "grad_norm": 7.102317223454744, "learning_rate": 2.4721775203723726e-06, "loss": 0.4196, "step": 4212 }, { "epoch": 0.68, "grad_norm": 12.864610115518843, "learning_rate": 2.4699262191725726e-06, "loss": 0.44, "step": 4213 }, { "epoch": 0.68, "grad_norm": 1.208630654219187, "learning_rate": 2.467675607234615e-06, "loss": 0.4774, "step": 4214 }, { "epoch": 0.68, "grad_norm": 13.03107045254372, "learning_rate": 2.465425685171625e-06, "loss": 0.4378, "step": 4215 }, { "epoch": 0.68, "grad_norm": 6.167309614176898, "learning_rate": 2.463176453596543e-06, "loss": 0.4377, "step": 4216 }, { "epoch": 0.68, "grad_norm": 18.203050138435987, "learning_rate": 2.4609279131221243e-06, "loss": 0.4813, "step": 4217 }, { "epoch": 0.68, "grad_norm": 11.147093600544459, "learning_rate": 2.4586800643609326e-06, "loss": 0.5236, "step": 4218 }, { "epoch": 0.68, "grad_norm": 12.872399660138699, "learning_rate": 2.456432907925341e-06, "loss": 0.4374, "step": 4219 }, { "epoch": 0.68, "grad_norm": 10.272396814350538, "learning_rate": 2.454186444427539e-06, "loss": 0.3242, "step": 4220 }, { "epoch": 0.68, "grad_norm": 9.546138198627961, "learning_rate": 2.4519406744795275e-06, "loss": 0.444, "step": 4221 }, { "epoch": 0.68, "grad_norm": 18.384124383050438, "learning_rate": 2.4496955986931143e-06, "loss": 0.4414, "step": 4222 }, { "epoch": 0.68, "grad_norm": 1.4359102126699177, "learning_rate": 2.447451217679917e-06, "loss": 0.4951, "step": 4223 }, { "epoch": 0.68, "grad_norm": 1.1472710225447738, "learning_rate": 2.445207532051372e-06, "loss": 0.4334, "step": 4224 }, { "epoch": 0.68, "grad_norm": 8.969045245651673, "learning_rate": 2.442964542418718e-06, "loss": 0.4177, "step": 4225 }, { "epoch": 0.68, "grad_norm": 5.716336055746022, "learning_rate": 2.4407222493930063e-06, "loss": 0.4607, "step": 4226 }, { "epoch": 0.68, "grad_norm": 13.721660088307331, "learning_rate": 2.438480653585102e-06, "loss": 0.4887, "step": 4227 }, { "epoch": 0.68, "grad_norm": 15.918198999025421, "learning_rate": 2.436239755605675e-06, "loss": 0.4972, "step": 4228 }, { "epoch": 0.68, "grad_norm": 12.033352755178402, "learning_rate": 2.433999556065211e-06, "loss": 0.3561, "step": 4229 }, { "epoch": 0.68, "grad_norm": 6.940825603100634, "learning_rate": 2.4317600555739997e-06, "loss": 0.4122, "step": 4230 }, { "epoch": 0.68, "grad_norm": 9.881922386110519, "learning_rate": 2.4295212547421415e-06, "loss": 0.4111, "step": 4231 }, { "epoch": 0.68, "grad_norm": 7.011183680846465, "learning_rate": 2.427283154179551e-06, "loss": 0.3911, "step": 4232 }, { "epoch": 0.68, "grad_norm": 8.490130899541349, "learning_rate": 2.4250457544959465e-06, "loss": 0.4313, "step": 4233 }, { "epoch": 0.68, "grad_norm": 7.888261885601563, "learning_rate": 2.422809056300856e-06, "loss": 0.4195, "step": 4234 }, { "epoch": 0.68, "grad_norm": 6.944924068723493, "learning_rate": 2.4205730602036194e-06, "loss": 0.3277, "step": 4235 }, { "epoch": 0.68, "grad_norm": 10.000713296361035, "learning_rate": 2.4183377668133843e-06, "loss": 0.4979, "step": 4236 }, { "epoch": 0.68, "grad_norm": 8.390518916921966, "learning_rate": 2.4161031767391067e-06, "loss": 0.4311, "step": 4237 }, { "epoch": 0.68, "grad_norm": 11.74555507583681, "learning_rate": 2.4138692905895473e-06, "loss": 0.4027, "step": 4238 }, { "epoch": 0.68, "grad_norm": 10.818241007379687, "learning_rate": 2.4116361089732815e-06, "loss": 0.4332, "step": 4239 }, { "epoch": 0.68, "grad_norm": 7.85162674830752, "learning_rate": 2.4094036324986888e-06, "loss": 0.4321, "step": 4240 }, { "epoch": 0.68, "grad_norm": 6.69905985392428, "learning_rate": 2.4071718617739542e-06, "loss": 0.3611, "step": 4241 }, { "epoch": 0.68, "grad_norm": 9.429108670270677, "learning_rate": 2.404940797407077e-06, "loss": 0.4433, "step": 4242 }, { "epoch": 0.68, "grad_norm": 10.07399175230551, "learning_rate": 2.4027104400058615e-06, "loss": 0.4623, "step": 4243 }, { "epoch": 0.68, "grad_norm": 12.336888452965798, "learning_rate": 2.4004807901779164e-06, "loss": 0.4372, "step": 4244 }, { "epoch": 0.68, "grad_norm": 17.669999404381336, "learning_rate": 2.3982518485306587e-06, "loss": 0.4197, "step": 4245 }, { "epoch": 0.68, "grad_norm": 5.945442763952645, "learning_rate": 2.3960236156713175e-06, "loss": 0.4582, "step": 4246 }, { "epoch": 0.68, "grad_norm": 9.985244862608376, "learning_rate": 2.3937960922069213e-06, "loss": 0.4334, "step": 4247 }, { "epoch": 0.68, "grad_norm": 8.291570835060414, "learning_rate": 2.391569278744309e-06, "loss": 0.3636, "step": 4248 }, { "epoch": 0.68, "grad_norm": 22.862092734354505, "learning_rate": 2.3893431758901285e-06, "loss": 0.4398, "step": 4249 }, { "epoch": 0.68, "grad_norm": 7.942843579207113, "learning_rate": 2.387117784250828e-06, "loss": 0.4308, "step": 4250 }, { "epoch": 0.68, "grad_norm": 7.29159159174315, "learning_rate": 2.38489310443267e-06, "loss": 0.4219, "step": 4251 }, { "epoch": 0.69, "grad_norm": 7.557277868188023, "learning_rate": 2.382669137041716e-06, "loss": 0.4848, "step": 4252 }, { "epoch": 0.69, "grad_norm": 23.652767009991326, "learning_rate": 2.3804458826838337e-06, "loss": 0.4173, "step": 4253 }, { "epoch": 0.69, "grad_norm": 7.665393625173859, "learning_rate": 2.3782233419647043e-06, "loss": 0.4268, "step": 4254 }, { "epoch": 0.69, "grad_norm": 12.162902201870216, "learning_rate": 2.376001515489806e-06, "loss": 0.3976, "step": 4255 }, { "epoch": 0.69, "grad_norm": 6.588250893036699, "learning_rate": 2.3737804038644236e-06, "loss": 0.4202, "step": 4256 }, { "epoch": 0.69, "grad_norm": 1.2522265499211311, "learning_rate": 2.371560007693651e-06, "loss": 0.482, "step": 4257 }, { "epoch": 0.69, "grad_norm": 11.498976358712579, "learning_rate": 2.3693403275823883e-06, "loss": 0.3906, "step": 4258 }, { "epoch": 0.69, "grad_norm": 8.683671503424353, "learning_rate": 2.367121364135335e-06, "loss": 0.5123, "step": 4259 }, { "epoch": 0.69, "grad_norm": 7.081374392957123, "learning_rate": 2.3649031179569954e-06, "loss": 0.4233, "step": 4260 }, { "epoch": 0.69, "grad_norm": 5.622890565700507, "learning_rate": 2.3626855896516855e-06, "loss": 0.4297, "step": 4261 }, { "epoch": 0.69, "grad_norm": 6.52548499686504, "learning_rate": 2.360468779823519e-06, "loss": 0.4318, "step": 4262 }, { "epoch": 0.69, "grad_norm": 10.034246698944754, "learning_rate": 2.3582526890764135e-06, "loss": 0.4315, "step": 4263 }, { "epoch": 0.69, "grad_norm": 4.866283895757253, "learning_rate": 2.356037318014096e-06, "loss": 0.3836, "step": 4264 }, { "epoch": 0.69, "grad_norm": 19.185538034135106, "learning_rate": 2.3538226672400982e-06, "loss": 0.401, "step": 4265 }, { "epoch": 0.69, "grad_norm": 8.10584373606494, "learning_rate": 2.3516087373577447e-06, "loss": 0.4181, "step": 4266 }, { "epoch": 0.69, "grad_norm": 6.3430698032377135, "learning_rate": 2.3493955289701744e-06, "loss": 0.3782, "step": 4267 }, { "epoch": 0.69, "grad_norm": 6.916309275386448, "learning_rate": 2.347183042680328e-06, "loss": 0.3547, "step": 4268 }, { "epoch": 0.69, "grad_norm": 9.728209659123351, "learning_rate": 2.3449712790909465e-06, "loss": 0.3828, "step": 4269 }, { "epoch": 0.69, "grad_norm": 7.5629660848165114, "learning_rate": 2.3427602388045723e-06, "loss": 0.4831, "step": 4270 }, { "epoch": 0.69, "grad_norm": 11.368515832158197, "learning_rate": 2.3405499224235583e-06, "loss": 0.3903, "step": 4271 }, { "epoch": 0.69, "grad_norm": 1.0545311408577567, "learning_rate": 2.3383403305500523e-06, "loss": 0.4192, "step": 4272 }, { "epoch": 0.69, "grad_norm": 6.2148256341141925, "learning_rate": 2.3361314637860104e-06, "loss": 0.4566, "step": 4273 }, { "epoch": 0.69, "grad_norm": 6.747737630643661, "learning_rate": 2.3339233227331887e-06, "loss": 0.4221, "step": 4274 }, { "epoch": 0.69, "grad_norm": 5.779715322704246, "learning_rate": 2.331715907993142e-06, "loss": 0.4068, "step": 4275 }, { "epoch": 0.69, "grad_norm": 9.289331868300978, "learning_rate": 2.329509220167236e-06, "loss": 0.3873, "step": 4276 }, { "epoch": 0.69, "grad_norm": 14.460206163385553, "learning_rate": 2.327303259856629e-06, "loss": 0.377, "step": 4277 }, { "epoch": 0.69, "grad_norm": 5.07713686507014, "learning_rate": 2.3250980276622893e-06, "loss": 0.4238, "step": 4278 }, { "epoch": 0.69, "grad_norm": 6.259662583051834, "learning_rate": 2.322893524184979e-06, "loss": 0.4151, "step": 4279 }, { "epoch": 0.69, "grad_norm": 7.049557259364586, "learning_rate": 2.320689750025269e-06, "loss": 0.3609, "step": 4280 }, { "epoch": 0.69, "grad_norm": 8.715552592894973, "learning_rate": 2.3184867057835274e-06, "loss": 0.4026, "step": 4281 }, { "epoch": 0.69, "grad_norm": 6.715970778066933, "learning_rate": 2.316284392059921e-06, "loss": 0.4176, "step": 4282 }, { "epoch": 0.69, "grad_norm": 20.689659325880644, "learning_rate": 2.3140828094544253e-06, "loss": 0.4429, "step": 4283 }, { "epoch": 0.69, "grad_norm": 6.475431675691197, "learning_rate": 2.3118819585668106e-06, "loss": 0.3778, "step": 4284 }, { "epoch": 0.69, "grad_norm": 5.243658626819774, "learning_rate": 2.3096818399966465e-06, "loss": 0.4704, "step": 4285 }, { "epoch": 0.69, "grad_norm": 5.819868414277102, "learning_rate": 2.3074824543433084e-06, "loss": 0.4149, "step": 4286 }, { "epoch": 0.69, "grad_norm": 8.110554820545692, "learning_rate": 2.305283802205973e-06, "loss": 0.4344, "step": 4287 }, { "epoch": 0.69, "grad_norm": 15.914223475516382, "learning_rate": 2.3030858841836063e-06, "loss": 0.4246, "step": 4288 }, { "epoch": 0.69, "grad_norm": 5.791008571397075, "learning_rate": 2.300888700874986e-06, "loss": 0.4, "step": 4289 }, { "epoch": 0.69, "grad_norm": 10.946221525894051, "learning_rate": 2.2986922528786865e-06, "loss": 0.4186, "step": 4290 }, { "epoch": 0.69, "grad_norm": 6.546902497052178, "learning_rate": 2.2964965407930796e-06, "loss": 0.4845, "step": 4291 }, { "epoch": 0.69, "grad_norm": 5.9330668102116535, "learning_rate": 2.2943015652163355e-06, "loss": 0.3907, "step": 4292 }, { "epoch": 0.69, "grad_norm": 8.737545100421132, "learning_rate": 2.29210732674643e-06, "loss": 0.4769, "step": 4293 }, { "epoch": 0.69, "grad_norm": 7.08812055315629, "learning_rate": 2.289913825981132e-06, "loss": 0.3932, "step": 4294 }, { "epoch": 0.69, "grad_norm": 9.491288418838039, "learning_rate": 2.2877210635180098e-06, "loss": 0.4429, "step": 4295 }, { "epoch": 0.69, "grad_norm": 8.21891713053102, "learning_rate": 2.2855290399544346e-06, "loss": 0.4669, "step": 4296 }, { "epoch": 0.69, "grad_norm": 24.380215040880355, "learning_rate": 2.2833377558875754e-06, "loss": 0.4421, "step": 4297 }, { "epoch": 0.69, "grad_norm": 5.706997440132937, "learning_rate": 2.281147211914397e-06, "loss": 0.424, "step": 4298 }, { "epoch": 0.69, "grad_norm": 7.107377579565903, "learning_rate": 2.278957408631662e-06, "loss": 0.3966, "step": 4299 }, { "epoch": 0.69, "grad_norm": 8.686099994283008, "learning_rate": 2.276768346635936e-06, "loss": 0.3957, "step": 4300 }, { "epoch": 0.69, "grad_norm": 7.382977801372351, "learning_rate": 2.2745800265235773e-06, "loss": 0.4269, "step": 4301 }, { "epoch": 0.69, "grad_norm": 5.375887189397306, "learning_rate": 2.2723924488907477e-06, "loss": 0.3965, "step": 4302 }, { "epoch": 0.69, "grad_norm": 7.116808655408611, "learning_rate": 2.2702056143334025e-06, "loss": 0.4561, "step": 4303 }, { "epoch": 0.69, "grad_norm": 9.51378427638696, "learning_rate": 2.268019523447292e-06, "loss": 0.4539, "step": 4304 }, { "epoch": 0.69, "grad_norm": 7.102955431802788, "learning_rate": 2.265834176827974e-06, "loss": 0.384, "step": 4305 }, { "epoch": 0.69, "grad_norm": 6.878846197043499, "learning_rate": 2.263649575070794e-06, "loss": 0.3734, "step": 4306 }, { "epoch": 0.69, "grad_norm": 10.466505311472522, "learning_rate": 2.261465718770895e-06, "loss": 0.4128, "step": 4307 }, { "epoch": 0.69, "grad_norm": 7.208216704117919, "learning_rate": 2.259282608523223e-06, "loss": 0.4142, "step": 4308 }, { "epoch": 0.69, "grad_norm": 17.332765945447687, "learning_rate": 2.25710024492252e-06, "loss": 0.4113, "step": 4309 }, { "epoch": 0.69, "grad_norm": 10.062647172743207, "learning_rate": 2.254918628563315e-06, "loss": 0.4189, "step": 4310 }, { "epoch": 0.69, "grad_norm": 1.2589073642177175, "learning_rate": 2.2527377600399446e-06, "loss": 0.4748, "step": 4311 }, { "epoch": 0.69, "grad_norm": 5.323459328808105, "learning_rate": 2.2505576399465384e-06, "loss": 0.4489, "step": 4312 }, { "epoch": 0.69, "grad_norm": 7.27444860343837, "learning_rate": 2.2483782688770208e-06, "loss": 0.4468, "step": 4313 }, { "epoch": 0.7, "grad_norm": 6.084049623859299, "learning_rate": 2.246199647425109e-06, "loss": 0.4168, "step": 4314 }, { "epoch": 0.7, "grad_norm": 4.057735738937002, "learning_rate": 2.2440217761843244e-06, "loss": 0.4081, "step": 4315 }, { "epoch": 0.7, "grad_norm": 13.58330657614081, "learning_rate": 2.241844655747977e-06, "loss": 0.3706, "step": 4316 }, { "epoch": 0.7, "grad_norm": 7.237682929272237, "learning_rate": 2.2396682867091723e-06, "loss": 0.4705, "step": 4317 }, { "epoch": 0.7, "grad_norm": 5.480521774762631, "learning_rate": 2.237492669660815e-06, "loss": 0.3738, "step": 4318 }, { "epoch": 0.7, "grad_norm": 6.49088626287101, "learning_rate": 2.235317805195606e-06, "loss": 0.3698, "step": 4319 }, { "epoch": 0.7, "grad_norm": 13.123959096964384, "learning_rate": 2.2331436939060354e-06, "loss": 0.3924, "step": 4320 }, { "epoch": 0.7, "grad_norm": 8.392184629746076, "learning_rate": 2.2309703363843893e-06, "loss": 0.4228, "step": 4321 }, { "epoch": 0.7, "grad_norm": 4.891337291051181, "learning_rate": 2.2287977332227543e-06, "loss": 0.4519, "step": 4322 }, { "epoch": 0.7, "grad_norm": 16.824704923606525, "learning_rate": 2.2266258850130055e-06, "loss": 0.4186, "step": 4323 }, { "epoch": 0.7, "grad_norm": 9.037476434418672, "learning_rate": 2.224454792346813e-06, "loss": 0.4054, "step": 4324 }, { "epoch": 0.7, "grad_norm": 4.7766535240179255, "learning_rate": 2.2222844558156444e-06, "loss": 0.4313, "step": 4325 }, { "epoch": 0.7, "grad_norm": 7.468536075371333, "learning_rate": 2.2201148760107568e-06, "loss": 0.4252, "step": 4326 }, { "epoch": 0.7, "grad_norm": 9.609264793868432, "learning_rate": 2.217946053523207e-06, "loss": 0.3615, "step": 4327 }, { "epoch": 0.7, "grad_norm": 5.617137266075924, "learning_rate": 2.2157779889438393e-06, "loss": 0.4466, "step": 4328 }, { "epoch": 0.7, "grad_norm": 5.578073814287037, "learning_rate": 2.213610682863293e-06, "loss": 0.3968, "step": 4329 }, { "epoch": 0.7, "grad_norm": 25.05186132050751, "learning_rate": 2.2114441358720046e-06, "loss": 0.4156, "step": 4330 }, { "epoch": 0.7, "grad_norm": 0.9951237829930111, "learning_rate": 2.209278348560204e-06, "loss": 0.4194, "step": 4331 }, { "epoch": 0.7, "grad_norm": 5.829220574271433, "learning_rate": 2.2071133215179043e-06, "loss": 0.4362, "step": 4332 }, { "epoch": 0.7, "grad_norm": 18.61106585225155, "learning_rate": 2.204949055334922e-06, "loss": 0.4415, "step": 4333 }, { "epoch": 0.7, "grad_norm": 11.266827887612756, "learning_rate": 2.202785550600865e-06, "loss": 0.4267, "step": 4334 }, { "epoch": 0.7, "grad_norm": 12.566342604335452, "learning_rate": 2.20062280790513e-06, "loss": 0.4889, "step": 4335 }, { "epoch": 0.7, "grad_norm": 7.444505810988872, "learning_rate": 2.198460827836905e-06, "loss": 0.4501, "step": 4336 }, { "epoch": 0.7, "grad_norm": 8.602908245085976, "learning_rate": 2.1962996109851757e-06, "loss": 0.4335, "step": 4337 }, { "epoch": 0.7, "grad_norm": 7.893448333339696, "learning_rate": 2.1941391579387204e-06, "loss": 0.4398, "step": 4338 }, { "epoch": 0.7, "grad_norm": 6.340960631113362, "learning_rate": 2.1919794692860992e-06, "loss": 0.4122, "step": 4339 }, { "epoch": 0.7, "grad_norm": 6.70550507972833, "learning_rate": 2.189820545615674e-06, "loss": 0.4215, "step": 4340 }, { "epoch": 0.7, "grad_norm": 12.935221105651625, "learning_rate": 2.187662387515597e-06, "loss": 0.3941, "step": 4341 }, { "epoch": 0.7, "grad_norm": 10.681118190790285, "learning_rate": 2.1855049955738077e-06, "loss": 0.3887, "step": 4342 }, { "epoch": 0.7, "grad_norm": 8.052530977979147, "learning_rate": 2.1833483703780383e-06, "loss": 0.3858, "step": 4343 }, { "epoch": 0.7, "grad_norm": 17.1225656366262, "learning_rate": 2.1811925125158154e-06, "loss": 0.4049, "step": 4344 }, { "epoch": 0.7, "grad_norm": 7.361990125940465, "learning_rate": 2.1790374225744537e-06, "loss": 0.421, "step": 4345 }, { "epoch": 0.7, "grad_norm": 26.97258920318783, "learning_rate": 2.1768831011410567e-06, "loss": 0.4283, "step": 4346 }, { "epoch": 0.7, "grad_norm": 28.408495272774765, "learning_rate": 2.174729548802524e-06, "loss": 0.3844, "step": 4347 }, { "epoch": 0.7, "grad_norm": 6.232498793267007, "learning_rate": 2.1725767661455393e-06, "loss": 0.4354, "step": 4348 }, { "epoch": 0.7, "grad_norm": 0.9692155791690763, "learning_rate": 2.170424753756584e-06, "loss": 0.4064, "step": 4349 }, { "epoch": 0.7, "grad_norm": 8.832123278917905, "learning_rate": 2.1682735122219236e-06, "loss": 0.4232, "step": 4350 }, { "epoch": 0.7, "grad_norm": 11.013858088253174, "learning_rate": 2.1661230421276134e-06, "loss": 0.4566, "step": 4351 }, { "epoch": 0.7, "grad_norm": 7.120908347879851, "learning_rate": 2.163973344059503e-06, "loss": 0.36, "step": 4352 }, { "epoch": 0.7, "grad_norm": 9.82044553928464, "learning_rate": 2.1618244186032328e-06, "loss": 0.474, "step": 4353 }, { "epoch": 0.7, "grad_norm": 5.786781018903208, "learning_rate": 2.159676266344222e-06, "loss": 0.4267, "step": 4354 }, { "epoch": 0.7, "grad_norm": 4.799309770591843, "learning_rate": 2.15752888786769e-06, "loss": 0.4363, "step": 4355 }, { "epoch": 0.7, "grad_norm": 5.947235692468703, "learning_rate": 2.155382283758644e-06, "loss": 0.4084, "step": 4356 }, { "epoch": 0.7, "grad_norm": 1.0861305413983438, "learning_rate": 2.153236454601877e-06, "loss": 0.4142, "step": 4357 }, { "epoch": 0.7, "grad_norm": 10.7635693601465, "learning_rate": 2.1510914009819684e-06, "loss": 0.4156, "step": 4358 }, { "epoch": 0.7, "grad_norm": 9.668108685312125, "learning_rate": 2.148947123483293e-06, "loss": 0.4009, "step": 4359 }, { "epoch": 0.7, "grad_norm": 5.197390316900548, "learning_rate": 2.146803622690015e-06, "loss": 0.4384, "step": 4360 }, { "epoch": 0.7, "grad_norm": 20.949508982337, "learning_rate": 2.144660899186075e-06, "loss": 0.3825, "step": 4361 }, { "epoch": 0.7, "grad_norm": 8.312990175828494, "learning_rate": 2.1425189535552143e-06, "loss": 0.4103, "step": 4362 }, { "epoch": 0.7, "grad_norm": 9.78545262823547, "learning_rate": 2.1403777863809594e-06, "loss": 0.475, "step": 4363 }, { "epoch": 0.7, "grad_norm": 8.777424862452708, "learning_rate": 2.1382373982466213e-06, "loss": 0.4163, "step": 4364 }, { "epoch": 0.7, "grad_norm": 29.540356303456907, "learning_rate": 2.1360977897352992e-06, "loss": 0.5227, "step": 4365 }, { "epoch": 0.7, "grad_norm": 13.879500482463184, "learning_rate": 2.1339589614298844e-06, "loss": 0.3996, "step": 4366 }, { "epoch": 0.7, "grad_norm": 7.281780099592318, "learning_rate": 2.131820913913052e-06, "loss": 0.485, "step": 4367 }, { "epoch": 0.7, "grad_norm": 6.858714216975375, "learning_rate": 2.129683647767262e-06, "loss": 0.4537, "step": 4368 }, { "epoch": 0.7, "grad_norm": 12.852052781324238, "learning_rate": 2.1275471635747695e-06, "loss": 0.3989, "step": 4369 }, { "epoch": 0.7, "grad_norm": 6.835472331949467, "learning_rate": 2.125411461917607e-06, "loss": 0.3965, "step": 4370 }, { "epoch": 0.7, "grad_norm": 10.2985632096115, "learning_rate": 2.1232765433776033e-06, "loss": 0.3937, "step": 4371 }, { "epoch": 0.7, "grad_norm": 6.9953277142207035, "learning_rate": 2.121142408536366e-06, "loss": 0.4512, "step": 4372 }, { "epoch": 0.7, "grad_norm": 15.010428276545154, "learning_rate": 2.1190090579752908e-06, "loss": 0.4713, "step": 4373 }, { "epoch": 0.7, "grad_norm": 4.799952640573696, "learning_rate": 2.116876492275565e-06, "loss": 0.3791, "step": 4374 }, { "epoch": 0.7, "grad_norm": 9.731776170892486, "learning_rate": 2.114744712018155e-06, "loss": 0.4667, "step": 4375 }, { "epoch": 0.71, "grad_norm": 8.29983685170601, "learning_rate": 2.1126137177838197e-06, "loss": 0.4766, "step": 4376 }, { "epoch": 0.71, "grad_norm": 10.954109319055911, "learning_rate": 2.110483510153097e-06, "loss": 0.408, "step": 4377 }, { "epoch": 0.71, "grad_norm": 9.903235534310632, "learning_rate": 2.108354089706318e-06, "loss": 0.4359, "step": 4378 }, { "epoch": 0.71, "grad_norm": 9.591783186171359, "learning_rate": 2.1062254570235934e-06, "loss": 0.4451, "step": 4379 }, { "epoch": 0.71, "grad_norm": 6.298285921846783, "learning_rate": 2.1040976126848208e-06, "loss": 0.4084, "step": 4380 }, { "epoch": 0.71, "grad_norm": 8.358330758524463, "learning_rate": 2.1019705572696836e-06, "loss": 0.423, "step": 4381 }, { "epoch": 0.71, "grad_norm": 8.43978143728766, "learning_rate": 2.099844291357655e-06, "loss": 0.4333, "step": 4382 }, { "epoch": 0.71, "grad_norm": 5.60983741110548, "learning_rate": 2.0977188155279815e-06, "loss": 0.4287, "step": 4383 }, { "epoch": 0.71, "grad_norm": 19.48883112623078, "learning_rate": 2.095594130359704e-06, "loss": 0.3816, "step": 4384 }, { "epoch": 0.71, "grad_norm": 8.539522298499108, "learning_rate": 2.093470236431648e-06, "loss": 0.4432, "step": 4385 }, { "epoch": 0.71, "grad_norm": 9.683954062576332, "learning_rate": 2.0913471343224183e-06, "loss": 0.4199, "step": 4386 }, { "epoch": 0.71, "grad_norm": 8.279885964523098, "learning_rate": 2.0892248246104047e-06, "loss": 0.4267, "step": 4387 }, { "epoch": 0.71, "grad_norm": 7.223336448097532, "learning_rate": 2.0871033078737875e-06, "loss": 0.3935, "step": 4388 }, { "epoch": 0.71, "grad_norm": 11.759195162361502, "learning_rate": 2.0849825846905223e-06, "loss": 0.4184, "step": 4389 }, { "epoch": 0.71, "grad_norm": 10.933543550089196, "learning_rate": 2.0828626556383534e-06, "loss": 0.3982, "step": 4390 }, { "epoch": 0.71, "grad_norm": 17.80300884195896, "learning_rate": 2.0807435212948094e-06, "loss": 0.3717, "step": 4391 }, { "epoch": 0.71, "grad_norm": 19.334584862210274, "learning_rate": 2.078625182237199e-06, "loss": 0.4278, "step": 4392 }, { "epoch": 0.71, "grad_norm": 5.96169224266698, "learning_rate": 2.0765076390426187e-06, "loss": 0.3852, "step": 4393 }, { "epoch": 0.71, "grad_norm": 10.090401119719939, "learning_rate": 2.074390892287944e-06, "loss": 0.3973, "step": 4394 }, { "epoch": 0.71, "grad_norm": 8.789922024115263, "learning_rate": 2.0722749425498332e-06, "loss": 0.4515, "step": 4395 }, { "epoch": 0.71, "grad_norm": 7.485738012982616, "learning_rate": 2.0701597904047332e-06, "loss": 0.4441, "step": 4396 }, { "epoch": 0.71, "grad_norm": 6.904543206912847, "learning_rate": 2.068045436428866e-06, "loss": 0.4806, "step": 4397 }, { "epoch": 0.71, "grad_norm": 22.833858964293672, "learning_rate": 2.0659318811982433e-06, "loss": 0.397, "step": 4398 }, { "epoch": 0.71, "grad_norm": 6.470022395882339, "learning_rate": 2.0638191252886526e-06, "loss": 0.4333, "step": 4399 }, { "epoch": 0.71, "grad_norm": 39.47916050038513, "learning_rate": 2.06170716927567e-06, "loss": 0.4291, "step": 4400 }, { "epoch": 0.71, "grad_norm": 10.27179095436662, "learning_rate": 2.0595960137346494e-06, "loss": 0.428, "step": 4401 }, { "epoch": 0.71, "grad_norm": 11.484540351984919, "learning_rate": 2.0574856592407255e-06, "loss": 0.4026, "step": 4402 }, { "epoch": 0.71, "grad_norm": 8.351660530253886, "learning_rate": 2.0553761063688204e-06, "loss": 0.4211, "step": 4403 }, { "epoch": 0.71, "grad_norm": 8.819512721404266, "learning_rate": 2.0532673556936333e-06, "loss": 0.423, "step": 4404 }, { "epoch": 0.71, "grad_norm": 9.449572983887284, "learning_rate": 2.051159407789644e-06, "loss": 0.4245, "step": 4405 }, { "epoch": 0.71, "grad_norm": 8.505960540720057, "learning_rate": 2.0490522632311173e-06, "loss": 0.3922, "step": 4406 }, { "epoch": 0.71, "grad_norm": 7.9630812562857, "learning_rate": 2.0469459225920987e-06, "loss": 0.3867, "step": 4407 }, { "epoch": 0.71, "grad_norm": 5.901076205271426, "learning_rate": 2.0448403864464123e-06, "loss": 0.4438, "step": 4408 }, { "epoch": 0.71, "grad_norm": 9.848871198251322, "learning_rate": 2.0427356553676625e-06, "loss": 0.451, "step": 4409 }, { "epoch": 0.71, "grad_norm": 6.545526471702682, "learning_rate": 2.0406317299292394e-06, "loss": 0.4027, "step": 4410 }, { "epoch": 0.71, "grad_norm": 6.359238054637132, "learning_rate": 2.0385286107043086e-06, "loss": 0.4747, "step": 4411 }, { "epoch": 0.71, "grad_norm": 12.146205661525215, "learning_rate": 2.0364262982658155e-06, "loss": 0.3972, "step": 4412 }, { "epoch": 0.71, "grad_norm": 10.466800750346675, "learning_rate": 2.0343247931864927e-06, "loss": 0.4294, "step": 4413 }, { "epoch": 0.71, "grad_norm": 12.994073279549283, "learning_rate": 2.0322240960388436e-06, "loss": 0.4178, "step": 4414 }, { "epoch": 0.71, "grad_norm": 10.059384276326798, "learning_rate": 2.030124207395159e-06, "loss": 0.4575, "step": 4415 }, { "epoch": 0.71, "grad_norm": 45.525884299980575, "learning_rate": 2.0280251278275043e-06, "loss": 0.4917, "step": 4416 }, { "epoch": 0.71, "grad_norm": 6.153790147113465, "learning_rate": 2.0259268579077295e-06, "loss": 0.3704, "step": 4417 }, { "epoch": 0.71, "grad_norm": 12.183936839186744, "learning_rate": 2.0238293982074593e-06, "loss": 0.3563, "step": 4418 }, { "epoch": 0.71, "grad_norm": 5.9656865714321485, "learning_rate": 2.0217327492980977e-06, "loss": 0.4435, "step": 4419 }, { "epoch": 0.71, "grad_norm": 5.34617681880783, "learning_rate": 2.0196369117508336e-06, "loss": 0.419, "step": 4420 }, { "epoch": 0.71, "grad_norm": 10.48706800501729, "learning_rate": 2.017541886136627e-06, "loss": 0.4419, "step": 4421 }, { "epoch": 0.71, "grad_norm": 8.881699617762127, "learning_rate": 2.0154476730262244e-06, "loss": 0.4754, "step": 4422 }, { "epoch": 0.71, "grad_norm": 7.649901093244667, "learning_rate": 2.013354272990145e-06, "loss": 0.4315, "step": 4423 }, { "epoch": 0.71, "grad_norm": 14.790581769198985, "learning_rate": 2.011261686598688e-06, "loss": 0.4143, "step": 4424 }, { "epoch": 0.71, "grad_norm": 7.82473973200117, "learning_rate": 2.009169914421934e-06, "loss": 0.534, "step": 4425 }, { "epoch": 0.71, "grad_norm": 9.922066664136056, "learning_rate": 2.0070789570297377e-06, "loss": 0.3887, "step": 4426 }, { "epoch": 0.71, "grad_norm": 5.22300780706498, "learning_rate": 2.004988814991732e-06, "loss": 0.5028, "step": 4427 }, { "epoch": 0.71, "grad_norm": 6.736755804491099, "learning_rate": 2.002899488877332e-06, "loss": 0.4131, "step": 4428 }, { "epoch": 0.71, "grad_norm": 6.824948514582528, "learning_rate": 2.000810979255728e-06, "loss": 0.4051, "step": 4429 }, { "epoch": 0.71, "grad_norm": 8.173186971412607, "learning_rate": 1.9987232866958863e-06, "loss": 0.4336, "step": 4430 }, { "epoch": 0.71, "grad_norm": 7.905065773763113, "learning_rate": 1.9966364117665505e-06, "loss": 0.3984, "step": 4431 }, { "epoch": 0.71, "grad_norm": 6.643772538989016, "learning_rate": 1.9945503550362453e-06, "loss": 0.4667, "step": 4432 }, { "epoch": 0.71, "grad_norm": 13.382679799362768, "learning_rate": 1.99246511707327e-06, "loss": 0.4113, "step": 4433 }, { "epoch": 0.71, "grad_norm": 4.383399680160455, "learning_rate": 1.990380698445698e-06, "loss": 0.3519, "step": 4434 }, { "epoch": 0.71, "grad_norm": 11.37449709402352, "learning_rate": 1.988297099721384e-06, "loss": 0.4117, "step": 4435 }, { "epoch": 0.71, "grad_norm": 6.956839813475648, "learning_rate": 1.9862143214679606e-06, "loss": 0.3807, "step": 4436 }, { "epoch": 0.71, "grad_norm": 31.986037238045043, "learning_rate": 1.984132364252831e-06, "loss": 0.4028, "step": 4437 }, { "epoch": 0.72, "grad_norm": 1.2946333337660927, "learning_rate": 1.9820512286431764e-06, "loss": 0.4806, "step": 4438 }, { "epoch": 0.72, "grad_norm": 10.355038409587957, "learning_rate": 1.979970915205959e-06, "loss": 0.4028, "step": 4439 }, { "epoch": 0.72, "grad_norm": 7.498570633005586, "learning_rate": 1.9778914245079127e-06, "loss": 0.429, "step": 4440 }, { "epoch": 0.72, "grad_norm": 26.988962682738865, "learning_rate": 1.975812757115545e-06, "loss": 0.4141, "step": 4441 }, { "epoch": 0.72, "grad_norm": 11.195857178876848, "learning_rate": 1.9737349135951463e-06, "loss": 0.4787, "step": 4442 }, { "epoch": 0.72, "grad_norm": 16.36707806183333, "learning_rate": 1.971657894512775e-06, "loss": 0.4428, "step": 4443 }, { "epoch": 0.72, "grad_norm": 10.947922100241565, "learning_rate": 1.9695817004342715e-06, "loss": 0.4366, "step": 4444 }, { "epoch": 0.72, "grad_norm": 8.415022766398078, "learning_rate": 1.9675063319252467e-06, "loss": 0.4384, "step": 4445 }, { "epoch": 0.72, "grad_norm": 7.658875228761434, "learning_rate": 1.9654317895510867e-06, "loss": 0.3381, "step": 4446 }, { "epoch": 0.72, "grad_norm": 5.35630093613522, "learning_rate": 1.9633580738769577e-06, "loss": 0.331, "step": 4447 }, { "epoch": 0.72, "grad_norm": 8.232887352381377, "learning_rate": 1.961285185467794e-06, "loss": 0.4452, "step": 4448 }, { "epoch": 0.72, "grad_norm": 6.218960516738631, "learning_rate": 1.959213124888307e-06, "loss": 0.3543, "step": 4449 }, { "epoch": 0.72, "grad_norm": 7.093506284498239, "learning_rate": 1.957141892702985e-06, "loss": 0.381, "step": 4450 }, { "epoch": 0.72, "grad_norm": 5.817138287826958, "learning_rate": 1.9550714894760913e-06, "loss": 0.5023, "step": 4451 }, { "epoch": 0.72, "grad_norm": 5.965315079851673, "learning_rate": 1.9530019157716547e-06, "loss": 0.4398, "step": 4452 }, { "epoch": 0.72, "grad_norm": 1.1823351385561875, "learning_rate": 1.9509331721534873e-06, "loss": 0.4668, "step": 4453 }, { "epoch": 0.72, "grad_norm": 6.683122395410382, "learning_rate": 1.9488652591851732e-06, "loss": 0.4181, "step": 4454 }, { "epoch": 0.72, "grad_norm": 8.041355545559082, "learning_rate": 1.9467981774300676e-06, "loss": 0.3437, "step": 4455 }, { "epoch": 0.72, "grad_norm": 7.677519039490286, "learning_rate": 1.944731927451299e-06, "loss": 0.4202, "step": 4456 }, { "epoch": 0.72, "grad_norm": 11.205096180352168, "learning_rate": 1.942666509811772e-06, "loss": 0.4245, "step": 4457 }, { "epoch": 0.72, "grad_norm": 8.782078116243902, "learning_rate": 1.940601925074165e-06, "loss": 0.477, "step": 4458 }, { "epoch": 0.72, "grad_norm": 10.345226605845056, "learning_rate": 1.9385381738009273e-06, "loss": 0.4779, "step": 4459 }, { "epoch": 0.72, "grad_norm": 13.330755324419297, "learning_rate": 1.936475256554279e-06, "loss": 0.4002, "step": 4460 }, { "epoch": 0.72, "grad_norm": 7.0847911725588215, "learning_rate": 1.9344131738962183e-06, "loss": 0.359, "step": 4461 }, { "epoch": 0.72, "grad_norm": 7.581007533990149, "learning_rate": 1.932351926388513e-06, "loss": 0.4669, "step": 4462 }, { "epoch": 0.72, "grad_norm": 16.170267287444368, "learning_rate": 1.9302915145927015e-06, "loss": 0.4115, "step": 4463 }, { "epoch": 0.72, "grad_norm": 9.384153108782398, "learning_rate": 1.9282319390701e-06, "loss": 0.4721, "step": 4464 }, { "epoch": 0.72, "grad_norm": 5.340382604716182, "learning_rate": 1.92617320038179e-06, "loss": 0.4244, "step": 4465 }, { "epoch": 0.72, "grad_norm": 13.073009077945114, "learning_rate": 1.924115299088633e-06, "loss": 0.4686, "step": 4466 }, { "epoch": 0.72, "grad_norm": 7.202668765068819, "learning_rate": 1.9220582357512555e-06, "loss": 0.493, "step": 4467 }, { "epoch": 0.72, "grad_norm": 6.011181770403301, "learning_rate": 1.9200020109300567e-06, "loss": 0.4265, "step": 4468 }, { "epoch": 0.72, "grad_norm": 16.99522640329488, "learning_rate": 1.917946625185213e-06, "loss": 0.3617, "step": 4469 }, { "epoch": 0.72, "grad_norm": 7.165239241897893, "learning_rate": 1.9158920790766657e-06, "loss": 0.4047, "step": 4470 }, { "epoch": 0.72, "grad_norm": 7.131780283687418, "learning_rate": 1.913838373164128e-06, "loss": 0.3522, "step": 4471 }, { "epoch": 0.72, "grad_norm": 9.758083127974773, "learning_rate": 1.911785508007088e-06, "loss": 0.3787, "step": 4472 }, { "epoch": 0.72, "grad_norm": 6.582123736196778, "learning_rate": 1.9097334841648064e-06, "loss": 0.4153, "step": 4473 }, { "epoch": 0.72, "grad_norm": 5.437957406193051, "learning_rate": 1.9076823021963036e-06, "loss": 0.4083, "step": 4474 }, { "epoch": 0.72, "grad_norm": 5.6840378327340435, "learning_rate": 1.9056319626603826e-06, "loss": 0.356, "step": 4475 }, { "epoch": 0.72, "grad_norm": 7.145750917524555, "learning_rate": 1.9035824661156128e-06, "loss": 0.4746, "step": 4476 }, { "epoch": 0.72, "grad_norm": 6.799775287498655, "learning_rate": 1.9015338131203325e-06, "loss": 0.3714, "step": 4477 }, { "epoch": 0.72, "grad_norm": 4.773571653964158, "learning_rate": 1.8994860042326501e-06, "loss": 0.4117, "step": 4478 }, { "epoch": 0.72, "grad_norm": 20.62146403561568, "learning_rate": 1.8974390400104458e-06, "loss": 0.3832, "step": 4479 }, { "epoch": 0.72, "grad_norm": 5.721476026797257, "learning_rate": 1.895392921011373e-06, "loss": 0.3824, "step": 4480 }, { "epoch": 0.72, "grad_norm": 9.683811683500084, "learning_rate": 1.8933476477928447e-06, "loss": 0.4467, "step": 4481 }, { "epoch": 0.72, "grad_norm": 12.6321430043805, "learning_rate": 1.8913032209120519e-06, "loss": 0.4748, "step": 4482 }, { "epoch": 0.72, "grad_norm": 10.034303712241616, "learning_rate": 1.8892596409259556e-06, "loss": 0.4836, "step": 4483 }, { "epoch": 0.72, "grad_norm": 6.595006005249415, "learning_rate": 1.8872169083912806e-06, "loss": 0.406, "step": 4484 }, { "epoch": 0.72, "grad_norm": 5.728369207340053, "learning_rate": 1.8851750238645228e-06, "loss": 0.4063, "step": 4485 }, { "epoch": 0.72, "grad_norm": 9.358594417108787, "learning_rate": 1.8831339879019511e-06, "loss": 0.4951, "step": 4486 }, { "epoch": 0.72, "grad_norm": 9.22701153043282, "learning_rate": 1.881093801059596e-06, "loss": 0.4751, "step": 4487 }, { "epoch": 0.72, "grad_norm": 5.52387648903579, "learning_rate": 1.879054463893264e-06, "loss": 0.4961, "step": 4488 }, { "epoch": 0.72, "grad_norm": 5.114330291485143, "learning_rate": 1.8770159769585261e-06, "loss": 0.4012, "step": 4489 }, { "epoch": 0.72, "grad_norm": 7.002696353087496, "learning_rate": 1.8749783408107192e-06, "loss": 0.4164, "step": 4490 }, { "epoch": 0.72, "grad_norm": 5.806856826435281, "learning_rate": 1.8729415560049563e-06, "loss": 0.4295, "step": 4491 }, { "epoch": 0.72, "grad_norm": 6.47002588598355, "learning_rate": 1.8709056230961115e-06, "loss": 0.5099, "step": 4492 }, { "epoch": 0.72, "grad_norm": 4.6005650858730185, "learning_rate": 1.8688705426388264e-06, "loss": 0.4637, "step": 4493 }, { "epoch": 0.72, "grad_norm": 4.3836870835541095, "learning_rate": 1.8668363151875163e-06, "loss": 0.4292, "step": 4494 }, { "epoch": 0.72, "grad_norm": 6.39085257807561, "learning_rate": 1.864802941296362e-06, "loss": 0.3879, "step": 4495 }, { "epoch": 0.72, "grad_norm": 8.139721817600888, "learning_rate": 1.8627704215193082e-06, "loss": 0.442, "step": 4496 }, { "epoch": 0.72, "grad_norm": 9.419559157773179, "learning_rate": 1.8607387564100681e-06, "loss": 0.434, "step": 4497 }, { "epoch": 0.72, "grad_norm": 1.147661855859265, "learning_rate": 1.858707946522127e-06, "loss": 0.4354, "step": 4498 }, { "epoch": 0.72, "grad_norm": 6.939034758512586, "learning_rate": 1.8566779924087315e-06, "loss": 0.4035, "step": 4499 }, { "epoch": 0.73, "grad_norm": 10.078699321026132, "learning_rate": 1.8546488946228952e-06, "loss": 0.3746, "step": 4500 }, { "epoch": 0.73, "grad_norm": 5.43603794232887, "learning_rate": 1.8526206537174019e-06, "loss": 0.4804, "step": 4501 }, { "epoch": 0.73, "grad_norm": 9.917698533050066, "learning_rate": 1.8505932702448042e-06, "loss": 0.4768, "step": 4502 }, { "epoch": 0.73, "grad_norm": 1.185670272098717, "learning_rate": 1.8485667447574101e-06, "loss": 0.4751, "step": 4503 }, { "epoch": 0.73, "grad_norm": 7.465843839930071, "learning_rate": 1.8465410778073046e-06, "loss": 0.4428, "step": 4504 }, { "epoch": 0.73, "grad_norm": 7.798474733860775, "learning_rate": 1.8445162699463365e-06, "loss": 0.397, "step": 4505 }, { "epoch": 0.73, "grad_norm": 5.614205703576717, "learning_rate": 1.842492321726118e-06, "loss": 0.39, "step": 4506 }, { "epoch": 0.73, "grad_norm": 1.1790116772121924, "learning_rate": 1.8404692336980272e-06, "loss": 0.4923, "step": 4507 }, { "epoch": 0.73, "grad_norm": 5.198082523076538, "learning_rate": 1.838447006413211e-06, "loss": 0.4902, "step": 4508 }, { "epoch": 0.73, "grad_norm": 5.663941045909598, "learning_rate": 1.8364256404225777e-06, "loss": 0.4443, "step": 4509 }, { "epoch": 0.73, "grad_norm": 15.554781014486947, "learning_rate": 1.834405136276806e-06, "loss": 0.4409, "step": 4510 }, { "epoch": 0.73, "grad_norm": 6.3112079446470934, "learning_rate": 1.8323854945263354e-06, "loss": 0.4226, "step": 4511 }, { "epoch": 0.73, "grad_norm": 14.151324814611046, "learning_rate": 1.83036671572137e-06, "loss": 0.4842, "step": 4512 }, { "epoch": 0.73, "grad_norm": 13.693167290152273, "learning_rate": 1.828348800411885e-06, "loss": 0.4029, "step": 4513 }, { "epoch": 0.73, "grad_norm": 7.224978524836273, "learning_rate": 1.8263317491476112e-06, "loss": 0.416, "step": 4514 }, { "epoch": 0.73, "grad_norm": 8.352885364369113, "learning_rate": 1.824315562478054e-06, "loss": 0.3888, "step": 4515 }, { "epoch": 0.73, "grad_norm": 20.383188767531394, "learning_rate": 1.8223002409524736e-06, "loss": 0.4296, "step": 4516 }, { "epoch": 0.73, "grad_norm": 4.359239147858997, "learning_rate": 1.8202857851199034e-06, "loss": 0.3238, "step": 4517 }, { "epoch": 0.73, "grad_norm": 5.379114089786967, "learning_rate": 1.818272195529135e-06, "loss": 0.3931, "step": 4518 }, { "epoch": 0.73, "grad_norm": 5.778534179894923, "learning_rate": 1.816259472728723e-06, "loss": 0.4643, "step": 4519 }, { "epoch": 0.73, "grad_norm": 7.000699179034113, "learning_rate": 1.8142476172669932e-06, "loss": 0.4028, "step": 4520 }, { "epoch": 0.73, "grad_norm": 8.440199071135973, "learning_rate": 1.812236629692028e-06, "loss": 0.395, "step": 4521 }, { "epoch": 0.73, "grad_norm": 5.815255094572453, "learning_rate": 1.8102265105516741e-06, "loss": 0.3779, "step": 4522 }, { "epoch": 0.73, "grad_norm": 7.690480123903951, "learning_rate": 1.8082172603935455e-06, "loss": 0.4091, "step": 4523 }, { "epoch": 0.73, "grad_norm": 11.253687091057483, "learning_rate": 1.8062088797650208e-06, "loss": 0.4604, "step": 4524 }, { "epoch": 0.73, "grad_norm": 6.272150845229022, "learning_rate": 1.804201369213231e-06, "loss": 0.3761, "step": 4525 }, { "epoch": 0.73, "grad_norm": 7.090587509423968, "learning_rate": 1.8021947292850811e-06, "loss": 0.3645, "step": 4526 }, { "epoch": 0.73, "grad_norm": 6.474965390855698, "learning_rate": 1.8001889605272366e-06, "loss": 0.3573, "step": 4527 }, { "epoch": 0.73, "grad_norm": 1.159616526586958, "learning_rate": 1.7981840634861225e-06, "loss": 0.4322, "step": 4528 }, { "epoch": 0.73, "grad_norm": 9.51375722735744, "learning_rate": 1.7961800387079265e-06, "loss": 0.3999, "step": 4529 }, { "epoch": 0.73, "grad_norm": 7.217687486172273, "learning_rate": 1.794176886738604e-06, "loss": 0.3783, "step": 4530 }, { "epoch": 0.73, "grad_norm": 10.646457711630864, "learning_rate": 1.7921746081238656e-06, "loss": 0.3591, "step": 4531 }, { "epoch": 0.73, "grad_norm": 5.018200423395607, "learning_rate": 1.7901732034091867e-06, "loss": 0.4616, "step": 4532 }, { "epoch": 0.73, "grad_norm": 6.02769591009989, "learning_rate": 1.788172673139807e-06, "loss": 0.4683, "step": 4533 }, { "epoch": 0.73, "grad_norm": 8.833755381411487, "learning_rate": 1.7861730178607274e-06, "loss": 0.453, "step": 4534 }, { "epoch": 0.73, "grad_norm": 6.253192148843246, "learning_rate": 1.784174238116707e-06, "loss": 0.4201, "step": 4535 }, { "epoch": 0.73, "grad_norm": 1.1414014713680958, "learning_rate": 1.7821763344522675e-06, "loss": 0.4773, "step": 4536 }, { "epoch": 0.73, "grad_norm": 6.659071949856766, "learning_rate": 1.7801793074116958e-06, "loss": 0.4204, "step": 4537 }, { "epoch": 0.73, "grad_norm": 5.608664291690792, "learning_rate": 1.778183157539034e-06, "loss": 0.4432, "step": 4538 }, { "epoch": 0.73, "grad_norm": 5.268885163844621, "learning_rate": 1.7761878853780918e-06, "loss": 0.3616, "step": 4539 }, { "epoch": 0.73, "grad_norm": 5.7821856706517964, "learning_rate": 1.774193491472434e-06, "loss": 0.48, "step": 4540 }, { "epoch": 0.73, "grad_norm": 5.765691912077277, "learning_rate": 1.772199976365388e-06, "loss": 0.4096, "step": 4541 }, { "epoch": 0.73, "grad_norm": 6.9870936730716675, "learning_rate": 1.7702073406000447e-06, "loss": 0.4464, "step": 4542 }, { "epoch": 0.73, "grad_norm": 6.378926483478102, "learning_rate": 1.768215584719251e-06, "loss": 0.4041, "step": 4543 }, { "epoch": 0.73, "grad_norm": 8.365345549858345, "learning_rate": 1.7662247092656159e-06, "loss": 0.4149, "step": 4544 }, { "epoch": 0.73, "grad_norm": 7.595828027065605, "learning_rate": 1.7642347147815091e-06, "loss": 0.407, "step": 4545 }, { "epoch": 0.73, "grad_norm": 5.671693620156142, "learning_rate": 1.7622456018090638e-06, "loss": 0.4202, "step": 4546 }, { "epoch": 0.73, "grad_norm": 6.099633766858797, "learning_rate": 1.7602573708901627e-06, "loss": 0.3776, "step": 4547 }, { "epoch": 0.73, "grad_norm": 5.890540105613058, "learning_rate": 1.7582700225664574e-06, "loss": 0.3792, "step": 4548 }, { "epoch": 0.73, "grad_norm": 5.787750814843247, "learning_rate": 1.7562835573793585e-06, "loss": 0.4217, "step": 4549 }, { "epoch": 0.73, "grad_norm": 6.80485350028781, "learning_rate": 1.7542979758700323e-06, "loss": 0.4209, "step": 4550 }, { "epoch": 0.73, "grad_norm": 6.308267665365734, "learning_rate": 1.752313278579404e-06, "loss": 0.4253, "step": 4551 }, { "epoch": 0.73, "grad_norm": 5.744100388949374, "learning_rate": 1.7503294660481635e-06, "loss": 0.4005, "step": 4552 }, { "epoch": 0.73, "grad_norm": 14.917344706595149, "learning_rate": 1.7483465388167542e-06, "loss": 0.3972, "step": 4553 }, { "epoch": 0.73, "grad_norm": 6.036256385290395, "learning_rate": 1.746364497425378e-06, "loss": 0.417, "step": 4554 }, { "epoch": 0.73, "grad_norm": 5.48479127490291, "learning_rate": 1.744383342414e-06, "loss": 0.3427, "step": 4555 }, { "epoch": 0.73, "grad_norm": 8.665985720515634, "learning_rate": 1.7424030743223424e-06, "loss": 0.4963, "step": 4556 }, { "epoch": 0.73, "grad_norm": 5.934116805528895, "learning_rate": 1.7404236936898837e-06, "loss": 0.394, "step": 4557 }, { "epoch": 0.73, "grad_norm": 7.130312903469824, "learning_rate": 1.7384452010558605e-06, "loss": 0.4399, "step": 4558 }, { "epoch": 0.73, "grad_norm": 17.145545285700436, "learning_rate": 1.736467596959271e-06, "loss": 0.4122, "step": 4559 }, { "epoch": 0.73, "grad_norm": 6.8221860966117145, "learning_rate": 1.7344908819388678e-06, "loss": 0.4287, "step": 4560 }, { "epoch": 0.73, "grad_norm": 5.527892503959112, "learning_rate": 1.7325150565331612e-06, "loss": 0.3714, "step": 4561 }, { "epoch": 0.74, "grad_norm": 6.3962441730490545, "learning_rate": 1.7305401212804235e-06, "loss": 0.3884, "step": 4562 }, { "epoch": 0.74, "grad_norm": 7.915484726784404, "learning_rate": 1.7285660767186778e-06, "loss": 0.4514, "step": 4563 }, { "epoch": 0.74, "grad_norm": 14.954831485611054, "learning_rate": 1.7265929233857125e-06, "loss": 0.4523, "step": 4564 }, { "epoch": 0.74, "grad_norm": 8.688720378880985, "learning_rate": 1.7246206618190676e-06, "loss": 0.4659, "step": 4565 }, { "epoch": 0.74, "grad_norm": 1.179100415248463, "learning_rate": 1.7226492925560385e-06, "loss": 0.4128, "step": 4566 }, { "epoch": 0.74, "grad_norm": 6.304398041990792, "learning_rate": 1.7206788161336825e-06, "loss": 0.4277, "step": 4567 }, { "epoch": 0.74, "grad_norm": 6.301338457177896, "learning_rate": 1.7187092330888155e-06, "loss": 0.3907, "step": 4568 }, { "epoch": 0.74, "grad_norm": 1.310549089673422, "learning_rate": 1.7167405439579994e-06, "loss": 0.4849, "step": 4569 }, { "epoch": 0.74, "grad_norm": 11.533623461543193, "learning_rate": 1.714772749277563e-06, "loss": 0.4775, "step": 4570 }, { "epoch": 0.74, "grad_norm": 1.0781386720030601, "learning_rate": 1.7128058495835886e-06, "loss": 0.4635, "step": 4571 }, { "epoch": 0.74, "grad_norm": 14.06992913712214, "learning_rate": 1.7108398454119134e-06, "loss": 0.4523, "step": 4572 }, { "epoch": 0.74, "grad_norm": 6.339645809984674, "learning_rate": 1.7088747372981285e-06, "loss": 0.4367, "step": 4573 }, { "epoch": 0.74, "grad_norm": 15.20227216952294, "learning_rate": 1.7069105257775859e-06, "loss": 0.4487, "step": 4574 }, { "epoch": 0.74, "grad_norm": 6.46212923264766, "learning_rate": 1.7049472113853937e-06, "loss": 0.425, "step": 4575 }, { "epoch": 0.74, "grad_norm": 7.501462220732155, "learning_rate": 1.7029847946564066e-06, "loss": 0.3756, "step": 4576 }, { "epoch": 0.74, "grad_norm": 14.066646895580233, "learning_rate": 1.7010232761252438e-06, "loss": 0.4154, "step": 4577 }, { "epoch": 0.74, "grad_norm": 6.787512291620692, "learning_rate": 1.6990626563262797e-06, "loss": 0.3682, "step": 4578 }, { "epoch": 0.74, "grad_norm": 17.15749661157707, "learning_rate": 1.6971029357936385e-06, "loss": 0.5007, "step": 4579 }, { "epoch": 0.74, "grad_norm": 5.156608984134489, "learning_rate": 1.6951441150612008e-06, "loss": 0.3622, "step": 4580 }, { "epoch": 0.74, "grad_norm": 8.790379115869285, "learning_rate": 1.6931861946626066e-06, "loss": 0.3853, "step": 4581 }, { "epoch": 0.74, "grad_norm": 7.259066103804772, "learning_rate": 1.6912291751312465e-06, "loss": 0.4324, "step": 4582 }, { "epoch": 0.74, "grad_norm": 5.230729953696585, "learning_rate": 1.6892730570002635e-06, "loss": 0.3687, "step": 4583 }, { "epoch": 0.74, "grad_norm": 9.990300105168071, "learning_rate": 1.6873178408025625e-06, "loss": 0.3828, "step": 4584 }, { "epoch": 0.74, "grad_norm": 7.468122764424567, "learning_rate": 1.6853635270707947e-06, "loss": 0.4229, "step": 4585 }, { "epoch": 0.74, "grad_norm": 5.922696195350371, "learning_rate": 1.6834101163373716e-06, "loss": 0.3407, "step": 4586 }, { "epoch": 0.74, "grad_norm": 8.373762495046375, "learning_rate": 1.6814576091344559e-06, "loss": 0.412, "step": 4587 }, { "epoch": 0.74, "grad_norm": 7.180575190525728, "learning_rate": 1.6795060059939617e-06, "loss": 0.4528, "step": 4588 }, { "epoch": 0.74, "grad_norm": 6.992521934988659, "learning_rate": 1.6775553074475631e-06, "loss": 0.4216, "step": 4589 }, { "epoch": 0.74, "grad_norm": 8.396207231397725, "learning_rate": 1.6756055140266826e-06, "loss": 0.4631, "step": 4590 }, { "epoch": 0.74, "grad_norm": 6.890771369152579, "learning_rate": 1.6736566262624959e-06, "loss": 0.4589, "step": 4591 }, { "epoch": 0.74, "grad_norm": 7.366268060889483, "learning_rate": 1.6717086446859354e-06, "loss": 0.3915, "step": 4592 }, { "epoch": 0.74, "grad_norm": 25.601613250263814, "learning_rate": 1.6697615698276864e-06, "loss": 0.3934, "step": 4593 }, { "epoch": 0.74, "grad_norm": 4.484650623798676, "learning_rate": 1.6678154022181842e-06, "loss": 0.3976, "step": 4594 }, { "epoch": 0.74, "grad_norm": 6.173629134663297, "learning_rate": 1.6658701423876172e-06, "loss": 0.4425, "step": 4595 }, { "epoch": 0.74, "grad_norm": 12.479155403553335, "learning_rate": 1.6639257908659294e-06, "loss": 0.4609, "step": 4596 }, { "epoch": 0.74, "grad_norm": 9.955595612285999, "learning_rate": 1.661982348182819e-06, "loss": 0.502, "step": 4597 }, { "epoch": 0.74, "grad_norm": 5.1711472517913455, "learning_rate": 1.660039814867726e-06, "loss": 0.4114, "step": 4598 }, { "epoch": 0.74, "grad_norm": 5.650007136229758, "learning_rate": 1.6580981914498545e-06, "loss": 0.4686, "step": 4599 }, { "epoch": 0.74, "grad_norm": 6.086713512739285, "learning_rate": 1.6561574784581574e-06, "loss": 0.3929, "step": 4600 }, { "epoch": 0.74, "grad_norm": 5.773929173316987, "learning_rate": 1.6542176764213364e-06, "loss": 0.4218, "step": 4601 }, { "epoch": 0.74, "grad_norm": 12.665841262605005, "learning_rate": 1.6522787858678463e-06, "loss": 0.4952, "step": 4602 }, { "epoch": 0.74, "grad_norm": 5.245531199524228, "learning_rate": 1.6503408073258964e-06, "loss": 0.3787, "step": 4603 }, { "epoch": 0.74, "grad_norm": 7.202749093500418, "learning_rate": 1.648403741323445e-06, "loss": 0.4217, "step": 4604 }, { "epoch": 0.74, "grad_norm": 5.916055203248557, "learning_rate": 1.6464675883881998e-06, "loss": 0.3377, "step": 4605 }, { "epoch": 0.74, "grad_norm": 21.10247271389902, "learning_rate": 1.6445323490476255e-06, "loss": 0.476, "step": 4606 }, { "epoch": 0.74, "grad_norm": 8.241044742479016, "learning_rate": 1.6425980238289324e-06, "loss": 0.4573, "step": 4607 }, { "epoch": 0.74, "grad_norm": 7.327067079670294, "learning_rate": 1.6406646132590865e-06, "loss": 0.397, "step": 4608 }, { "epoch": 0.74, "grad_norm": 10.60781185986753, "learning_rate": 1.6387321178648003e-06, "loss": 0.4876, "step": 4609 }, { "epoch": 0.74, "grad_norm": 4.903810281500996, "learning_rate": 1.6368005381725372e-06, "loss": 0.4214, "step": 4610 }, { "epoch": 0.74, "grad_norm": 8.571707061212956, "learning_rate": 1.6348698747085168e-06, "loss": 0.4597, "step": 4611 }, { "epoch": 0.74, "grad_norm": 6.106116272818694, "learning_rate": 1.6329401279987023e-06, "loss": 0.4273, "step": 4612 }, { "epoch": 0.74, "grad_norm": 12.613522793385254, "learning_rate": 1.6310112985688093e-06, "loss": 0.3722, "step": 4613 }, { "epoch": 0.74, "grad_norm": 9.457216530778723, "learning_rate": 1.629083386944305e-06, "loss": 0.4171, "step": 4614 }, { "epoch": 0.74, "grad_norm": 9.407565057138966, "learning_rate": 1.6271563936504082e-06, "loss": 0.4286, "step": 4615 }, { "epoch": 0.74, "grad_norm": 8.592314554626736, "learning_rate": 1.6252303192120821e-06, "loss": 0.4036, "step": 4616 }, { "epoch": 0.74, "grad_norm": 5.577679933583275, "learning_rate": 1.6233051641540415e-06, "loss": 0.4331, "step": 4617 }, { "epoch": 0.74, "grad_norm": 6.5187452043022205, "learning_rate": 1.6213809290007554e-06, "loss": 0.4153, "step": 4618 }, { "epoch": 0.74, "grad_norm": 7.8223210786395025, "learning_rate": 1.6194576142764363e-06, "loss": 0.4471, "step": 4619 }, { "epoch": 0.74, "grad_norm": 7.259815969873206, "learning_rate": 1.617535220505046e-06, "loss": 0.3662, "step": 4620 }, { "epoch": 0.74, "grad_norm": 10.13696228355579, "learning_rate": 1.6156137482103006e-06, "loss": 0.4518, "step": 4621 }, { "epoch": 0.74, "grad_norm": 57.16343160844678, "learning_rate": 1.6136931979156628e-06, "loss": 0.3889, "step": 4622 }, { "epoch": 0.74, "grad_norm": 5.21886681893156, "learning_rate": 1.6117735701443416e-06, "loss": 0.3747, "step": 4623 }, { "epoch": 0.75, "grad_norm": 13.636813767408224, "learning_rate": 1.6098548654192958e-06, "loss": 0.3645, "step": 4624 }, { "epoch": 0.75, "grad_norm": 18.248806230269267, "learning_rate": 1.6079370842632358e-06, "loss": 0.4661, "step": 4625 }, { "epoch": 0.75, "grad_norm": 6.001766067642663, "learning_rate": 1.606020227198617e-06, "loss": 0.3831, "step": 4626 }, { "epoch": 0.75, "grad_norm": 45.56941218779736, "learning_rate": 1.604104294747642e-06, "loss": 0.4539, "step": 4627 }, { "epoch": 0.75, "grad_norm": 1.1998878029432454, "learning_rate": 1.6021892874322675e-06, "loss": 0.4419, "step": 4628 }, { "epoch": 0.75, "grad_norm": 6.874921802741356, "learning_rate": 1.6002752057741906e-06, "loss": 0.4457, "step": 4629 }, { "epoch": 0.75, "grad_norm": 4.765032231765931, "learning_rate": 1.5983620502948632e-06, "loss": 0.3657, "step": 4630 }, { "epoch": 0.75, "grad_norm": 11.345891485526218, "learning_rate": 1.59644982151548e-06, "loss": 0.392, "step": 4631 }, { "epoch": 0.75, "grad_norm": 12.252487669741566, "learning_rate": 1.5945385199569836e-06, "loss": 0.384, "step": 4632 }, { "epoch": 0.75, "grad_norm": 5.377527005330427, "learning_rate": 1.5926281461400673e-06, "loss": 0.4566, "step": 4633 }, { "epoch": 0.75, "grad_norm": 10.542549984466994, "learning_rate": 1.5907187005851676e-06, "loss": 0.4099, "step": 4634 }, { "epoch": 0.75, "grad_norm": 1.2084658991101749, "learning_rate": 1.588810183812473e-06, "loss": 0.4608, "step": 4635 }, { "epoch": 0.75, "grad_norm": 10.288202998465433, "learning_rate": 1.5869025963419122e-06, "loss": 0.4617, "step": 4636 }, { "epoch": 0.75, "grad_norm": 6.244407638174307, "learning_rate": 1.584995938693169e-06, "loss": 0.4433, "step": 4637 }, { "epoch": 0.75, "grad_norm": 8.125490995155578, "learning_rate": 1.5830902113856667e-06, "loss": 0.3993, "step": 4638 }, { "epoch": 0.75, "grad_norm": 6.094428339158442, "learning_rate": 1.581185414938577e-06, "loss": 0.5109, "step": 4639 }, { "epoch": 0.75, "grad_norm": 8.932827421215514, "learning_rate": 1.579281549870822e-06, "loss": 0.4661, "step": 4640 }, { "epoch": 0.75, "grad_norm": 6.1413025803030195, "learning_rate": 1.5773786167010657e-06, "loss": 0.4754, "step": 4641 }, { "epoch": 0.75, "grad_norm": 5.075568966356319, "learning_rate": 1.5754766159477174e-06, "loss": 0.4084, "step": 4642 }, { "epoch": 0.75, "grad_norm": 5.09319808041101, "learning_rate": 1.5735755481289371e-06, "loss": 0.345, "step": 4643 }, { "epoch": 0.75, "grad_norm": 1.2118788375193417, "learning_rate": 1.5716754137626284e-06, "loss": 0.4573, "step": 4644 }, { "epoch": 0.75, "grad_norm": 6.130568093279297, "learning_rate": 1.5697762133664396e-06, "loss": 0.4089, "step": 4645 }, { "epoch": 0.75, "grad_norm": 10.077398697389578, "learning_rate": 1.5678779474577633e-06, "loss": 0.4617, "step": 4646 }, { "epoch": 0.75, "grad_norm": 6.503558335218649, "learning_rate": 1.5659806165537428e-06, "loss": 0.4145, "step": 4647 }, { "epoch": 0.75, "grad_norm": 1.2038104881067744, "learning_rate": 1.5640842211712614e-06, "loss": 0.4575, "step": 4648 }, { "epoch": 0.75, "grad_norm": 10.987019251152837, "learning_rate": 1.5621887618269481e-06, "loss": 0.4637, "step": 4649 }, { "epoch": 0.75, "grad_norm": 14.278256622617576, "learning_rate": 1.5602942390371817e-06, "loss": 0.4053, "step": 4650 }, { "epoch": 0.75, "grad_norm": 5.8201013658936915, "learning_rate": 1.558400653318079e-06, "loss": 0.3921, "step": 4651 }, { "epoch": 0.75, "grad_norm": 5.053688884360372, "learning_rate": 1.556508005185508e-06, "loss": 0.4357, "step": 4652 }, { "epoch": 0.75, "grad_norm": 13.942405752874604, "learning_rate": 1.5546162951550759e-06, "loss": 0.4588, "step": 4653 }, { "epoch": 0.75, "grad_norm": 8.193451270663555, "learning_rate": 1.5527255237421384e-06, "loss": 0.385, "step": 4654 }, { "epoch": 0.75, "grad_norm": 5.981863504654614, "learning_rate": 1.5508356914617933e-06, "loss": 0.37, "step": 4655 }, { "epoch": 0.75, "grad_norm": 1.0994779119821592, "learning_rate": 1.5489467988288809e-06, "loss": 0.4224, "step": 4656 }, { "epoch": 0.75, "grad_norm": 12.958297403644622, "learning_rate": 1.5470588463579906e-06, "loss": 0.4947, "step": 4657 }, { "epoch": 0.75, "grad_norm": 5.752223097478931, "learning_rate": 1.5451718345634503e-06, "loss": 0.4114, "step": 4658 }, { "epoch": 0.75, "grad_norm": 6.1607250437165, "learning_rate": 1.5432857639593362e-06, "loss": 0.3766, "step": 4659 }, { "epoch": 0.75, "grad_norm": 5.430045775903346, "learning_rate": 1.5414006350594658e-06, "loss": 0.3773, "step": 4660 }, { "epoch": 0.75, "grad_norm": 7.098351910611733, "learning_rate": 1.5395164483773966e-06, "loss": 0.4551, "step": 4661 }, { "epoch": 0.75, "grad_norm": 11.405661443815728, "learning_rate": 1.537633204426438e-06, "loss": 0.3602, "step": 4662 }, { "epoch": 0.75, "grad_norm": 7.37381676410755, "learning_rate": 1.535750903719635e-06, "loss": 0.4099, "step": 4663 }, { "epoch": 0.75, "grad_norm": 15.793761840575337, "learning_rate": 1.5338695467697767e-06, "loss": 0.3894, "step": 4664 }, { "epoch": 0.75, "grad_norm": 9.325148263663642, "learning_rate": 1.531989134089399e-06, "loss": 0.4038, "step": 4665 }, { "epoch": 0.75, "grad_norm": 5.796158938602956, "learning_rate": 1.5301096661907782e-06, "loss": 0.3829, "step": 4666 }, { "epoch": 0.75, "grad_norm": 7.0753382101938875, "learning_rate": 1.5282311435859325e-06, "loss": 0.4524, "step": 4667 }, { "epoch": 0.75, "grad_norm": 18.313548427033474, "learning_rate": 1.526353566786622e-06, "loss": 0.42, "step": 4668 }, { "epoch": 0.75, "grad_norm": 9.4989438793356, "learning_rate": 1.5244769363043527e-06, "loss": 0.3953, "step": 4669 }, { "epoch": 0.75, "grad_norm": 7.5374863936432295, "learning_rate": 1.5226012526503698e-06, "loss": 0.4192, "step": 4670 }, { "epoch": 0.75, "grad_norm": 7.079716538407558, "learning_rate": 1.5207265163356588e-06, "loss": 0.3841, "step": 4671 }, { "epoch": 0.75, "grad_norm": 6.6692461619279335, "learning_rate": 1.5188527278709514e-06, "loss": 0.4285, "step": 4672 }, { "epoch": 0.75, "grad_norm": 1.3422580863913138, "learning_rate": 1.5169798877667207e-06, "loss": 0.4582, "step": 4673 }, { "epoch": 0.75, "grad_norm": 10.199541682369334, "learning_rate": 1.5151079965331788e-06, "loss": 0.4424, "step": 4674 }, { "epoch": 0.75, "grad_norm": 10.975200998632868, "learning_rate": 1.5132370546802794e-06, "loss": 0.4698, "step": 4675 }, { "epoch": 0.75, "grad_norm": 10.150392570091952, "learning_rate": 1.5113670627177202e-06, "loss": 0.4784, "step": 4676 }, { "epoch": 0.75, "grad_norm": 7.791925786554734, "learning_rate": 1.5094980211549382e-06, "loss": 0.4379, "step": 4677 }, { "epoch": 0.75, "grad_norm": 9.781364996027921, "learning_rate": 1.5076299305011095e-06, "loss": 0.4149, "step": 4678 }, { "epoch": 0.75, "grad_norm": 5.758523227175734, "learning_rate": 1.5057627912651574e-06, "loss": 0.3857, "step": 4679 }, { "epoch": 0.75, "grad_norm": 5.54993798202104, "learning_rate": 1.5038966039557384e-06, "loss": 0.4054, "step": 4680 }, { "epoch": 0.75, "grad_norm": 6.095553493776552, "learning_rate": 1.502031369081257e-06, "loss": 0.3795, "step": 4681 }, { "epoch": 0.75, "grad_norm": 7.819217148493869, "learning_rate": 1.5001670871498524e-06, "loss": 0.3972, "step": 4682 }, { "epoch": 0.75, "grad_norm": 6.63342726126234, "learning_rate": 1.4983037586694055e-06, "loss": 0.4214, "step": 4683 }, { "epoch": 0.75, "grad_norm": 6.701235856709712, "learning_rate": 1.4964413841475412e-06, "loss": 0.4362, "step": 4684 }, { "epoch": 0.75, "grad_norm": 5.341167161807946, "learning_rate": 1.4945799640916198e-06, "loss": 0.3852, "step": 4685 }, { "epoch": 0.76, "grad_norm": 5.648424099530535, "learning_rate": 1.4927194990087418e-06, "loss": 0.364, "step": 4686 }, { "epoch": 0.76, "grad_norm": 16.097428509509562, "learning_rate": 1.4908599894057512e-06, "loss": 0.4178, "step": 4687 }, { "epoch": 0.76, "grad_norm": 9.255350769204757, "learning_rate": 1.489001435789233e-06, "loss": 0.4116, "step": 4688 }, { "epoch": 0.76, "grad_norm": 7.06905511979481, "learning_rate": 1.4871438386655018e-06, "loss": 0.4294, "step": 4689 }, { "epoch": 0.76, "grad_norm": 6.220965583094371, "learning_rate": 1.4852871985406208e-06, "loss": 0.4179, "step": 4690 }, { "epoch": 0.76, "grad_norm": 6.017250768930354, "learning_rate": 1.4834315159203927e-06, "loss": 0.4308, "step": 4691 }, { "epoch": 0.76, "grad_norm": 9.09828115301685, "learning_rate": 1.481576791310354e-06, "loss": 0.4503, "step": 4692 }, { "epoch": 0.76, "grad_norm": 5.980873503916682, "learning_rate": 1.4797230252157818e-06, "loss": 0.414, "step": 4693 }, { "epoch": 0.76, "grad_norm": 6.953017388628192, "learning_rate": 1.4778702181416949e-06, "loss": 0.3822, "step": 4694 }, { "epoch": 0.76, "grad_norm": 6.656739404093136, "learning_rate": 1.4760183705928499e-06, "loss": 0.3832, "step": 4695 }, { "epoch": 0.76, "grad_norm": 5.867230419798012, "learning_rate": 1.4741674830737396e-06, "loss": 0.4147, "step": 4696 }, { "epoch": 0.76, "grad_norm": 10.037414286459864, "learning_rate": 1.472317556088596e-06, "loss": 0.4386, "step": 4697 }, { "epoch": 0.76, "grad_norm": 14.169953079567211, "learning_rate": 1.4704685901413928e-06, "loss": 0.4598, "step": 4698 }, { "epoch": 0.76, "grad_norm": 7.604438551997105, "learning_rate": 1.468620585735837e-06, "loss": 0.4007, "step": 4699 }, { "epoch": 0.76, "grad_norm": 7.338202814470881, "learning_rate": 1.466773543375376e-06, "loss": 0.3017, "step": 4700 }, { "epoch": 0.76, "grad_norm": 7.0901971823117504, "learning_rate": 1.4649274635631972e-06, "loss": 0.4974, "step": 4701 }, { "epoch": 0.76, "grad_norm": 5.784070095862422, "learning_rate": 1.4630823468022204e-06, "loss": 0.3955, "step": 4702 }, { "epoch": 0.76, "grad_norm": 8.265811291867935, "learning_rate": 1.4612381935951093e-06, "loss": 0.3747, "step": 4703 }, { "epoch": 0.76, "grad_norm": 6.5309677228438945, "learning_rate": 1.4593950044442612e-06, "loss": 0.453, "step": 4704 }, { "epoch": 0.76, "grad_norm": 7.109056826410635, "learning_rate": 1.4575527798518097e-06, "loss": 0.4385, "step": 4705 }, { "epoch": 0.76, "grad_norm": 7.384068069497013, "learning_rate": 1.4557115203196304e-06, "loss": 0.4287, "step": 4706 }, { "epoch": 0.76, "grad_norm": 7.047456726143926, "learning_rate": 1.453871226349332e-06, "loss": 0.4402, "step": 4707 }, { "epoch": 0.76, "grad_norm": 1.250867589098619, "learning_rate": 1.45203189844226e-06, "loss": 0.4889, "step": 4708 }, { "epoch": 0.76, "grad_norm": 10.581721365328761, "learning_rate": 1.4501935370994985e-06, "loss": 0.3741, "step": 4709 }, { "epoch": 0.76, "grad_norm": 6.988287206988314, "learning_rate": 1.4483561428218717e-06, "loss": 0.4316, "step": 4710 }, { "epoch": 0.76, "grad_norm": 9.098809711178268, "learning_rate": 1.4465197161099305e-06, "loss": 0.4455, "step": 4711 }, { "epoch": 0.76, "grad_norm": 5.335500236476351, "learning_rate": 1.4446842574639708e-06, "loss": 0.4337, "step": 4712 }, { "epoch": 0.76, "grad_norm": 4.7634684848111855, "learning_rate": 1.4428497673840235e-06, "loss": 0.4347, "step": 4713 }, { "epoch": 0.76, "grad_norm": 7.34945801270178, "learning_rate": 1.441016246369853e-06, "loss": 0.4275, "step": 4714 }, { "epoch": 0.76, "grad_norm": 11.212341327535952, "learning_rate": 1.4391836949209597e-06, "loss": 0.3857, "step": 4715 }, { "epoch": 0.76, "grad_norm": 8.596761707332899, "learning_rate": 1.437352113536582e-06, "loss": 0.429, "step": 4716 }, { "epoch": 0.76, "grad_norm": 9.958990627516943, "learning_rate": 1.435521502715697e-06, "loss": 0.4115, "step": 4717 }, { "epoch": 0.76, "grad_norm": 8.940055691048675, "learning_rate": 1.4336918629570069e-06, "loss": 0.4426, "step": 4718 }, { "epoch": 0.76, "grad_norm": 8.061893793042824, "learning_rate": 1.431863194758959e-06, "loss": 0.4412, "step": 4719 }, { "epoch": 0.76, "grad_norm": 18.28261646655348, "learning_rate": 1.4300354986197345e-06, "loss": 0.4034, "step": 4720 }, { "epoch": 0.76, "grad_norm": 1.260865529445302, "learning_rate": 1.4282087750372475e-06, "loss": 0.4316, "step": 4721 }, { "epoch": 0.76, "grad_norm": 8.258700967367357, "learning_rate": 1.4263830245091454e-06, "loss": 0.4195, "step": 4722 }, { "epoch": 0.76, "grad_norm": 5.597806041817395, "learning_rate": 1.4245582475328156e-06, "loss": 0.4429, "step": 4723 }, { "epoch": 0.76, "grad_norm": 10.99264609316521, "learning_rate": 1.4227344446053759e-06, "loss": 0.4597, "step": 4724 }, { "epoch": 0.76, "grad_norm": 6.7895136763495865, "learning_rate": 1.420911616223683e-06, "loss": 0.4535, "step": 4725 }, { "epoch": 0.76, "grad_norm": 7.04661784543088, "learning_rate": 1.4190897628843242e-06, "loss": 0.4754, "step": 4726 }, { "epoch": 0.76, "grad_norm": 5.959644756315961, "learning_rate": 1.4172688850836202e-06, "loss": 0.4291, "step": 4727 }, { "epoch": 0.76, "grad_norm": 7.185760476766221, "learning_rate": 1.4154489833176322e-06, "loss": 0.3277, "step": 4728 }, { "epoch": 0.76, "grad_norm": 7.563011587331211, "learning_rate": 1.41363005808215e-06, "loss": 0.4263, "step": 4729 }, { "epoch": 0.76, "grad_norm": 1.3517934854711944, "learning_rate": 1.4118121098726972e-06, "loss": 0.4419, "step": 4730 }, { "epoch": 0.76, "grad_norm": 10.96599313418365, "learning_rate": 1.4099951391845345e-06, "loss": 0.3903, "step": 4731 }, { "epoch": 0.76, "grad_norm": 5.847731651684053, "learning_rate": 1.408179146512657e-06, "loss": 0.3482, "step": 4732 }, { "epoch": 0.76, "grad_norm": 15.370965521557725, "learning_rate": 1.4063641323517886e-06, "loss": 0.4742, "step": 4733 }, { "epoch": 0.76, "grad_norm": 7.825540222288062, "learning_rate": 1.4045500971963882e-06, "loss": 0.4253, "step": 4734 }, { "epoch": 0.76, "grad_norm": 7.531820966558123, "learning_rate": 1.4027370415406528e-06, "loss": 0.4135, "step": 4735 }, { "epoch": 0.76, "grad_norm": 11.505491077657233, "learning_rate": 1.4009249658785058e-06, "loss": 0.4026, "step": 4736 }, { "epoch": 0.76, "grad_norm": 7.39851432561194, "learning_rate": 1.399113870703605e-06, "loss": 0.3939, "step": 4737 }, { "epoch": 0.76, "grad_norm": 6.1147202540359205, "learning_rate": 1.3973037565093455e-06, "loss": 0.4186, "step": 4738 }, { "epoch": 0.76, "grad_norm": 25.473576998314215, "learning_rate": 1.395494623788855e-06, "loss": 0.3992, "step": 4739 }, { "epoch": 0.76, "grad_norm": 6.988597560782217, "learning_rate": 1.3936864730349842e-06, "loss": 0.4547, "step": 4740 }, { "epoch": 0.76, "grad_norm": 11.842046519064832, "learning_rate": 1.3918793047403268e-06, "loss": 0.4367, "step": 4741 }, { "epoch": 0.76, "grad_norm": 5.544780679844971, "learning_rate": 1.3900731193972073e-06, "loss": 0.4236, "step": 4742 }, { "epoch": 0.76, "grad_norm": 9.482266476532116, "learning_rate": 1.3882679174976777e-06, "loss": 0.3975, "step": 4743 }, { "epoch": 0.76, "grad_norm": 6.047525965063067, "learning_rate": 1.386463699533524e-06, "loss": 0.4418, "step": 4744 }, { "epoch": 0.76, "grad_norm": 7.250636395846284, "learning_rate": 1.3846604659962676e-06, "loss": 0.4321, "step": 4745 }, { "epoch": 0.76, "grad_norm": 5.241619148739895, "learning_rate": 1.3828582173771576e-06, "loss": 0.4084, "step": 4746 }, { "epoch": 0.76, "grad_norm": 8.177054175648749, "learning_rate": 1.3810569541671754e-06, "loss": 0.3787, "step": 4747 }, { "epoch": 0.77, "grad_norm": 4.439514156092396, "learning_rate": 1.3792566768570364e-06, "loss": 0.3511, "step": 4748 }, { "epoch": 0.77, "grad_norm": 7.09834036145748, "learning_rate": 1.3774573859371842e-06, "loss": 0.4139, "step": 4749 }, { "epoch": 0.77, "grad_norm": 9.762571078952499, "learning_rate": 1.3756590818977972e-06, "loss": 0.427, "step": 4750 }, { "epoch": 0.77, "grad_norm": 1.0794038150859964, "learning_rate": 1.37386176522878e-06, "loss": 0.439, "step": 4751 }, { "epoch": 0.77, "grad_norm": 8.647485064078477, "learning_rate": 1.372065436419775e-06, "loss": 0.4598, "step": 4752 }, { "epoch": 0.77, "grad_norm": 10.02934074901242, "learning_rate": 1.3702700959601483e-06, "loss": 0.3638, "step": 4753 }, { "epoch": 0.77, "grad_norm": 5.362510397145931, "learning_rate": 1.368475744339003e-06, "loss": 0.4542, "step": 4754 }, { "epoch": 0.77, "grad_norm": 5.389912186454759, "learning_rate": 1.366682382045168e-06, "loss": 0.4376, "step": 4755 }, { "epoch": 0.77, "grad_norm": 6.170886566018786, "learning_rate": 1.364890009567204e-06, "loss": 0.4524, "step": 4756 }, { "epoch": 0.77, "grad_norm": 6.3128007962098565, "learning_rate": 1.3630986273934054e-06, "loss": 0.4304, "step": 4757 }, { "epoch": 0.77, "grad_norm": 9.08799522117687, "learning_rate": 1.3613082360117924e-06, "loss": 0.4185, "step": 4758 }, { "epoch": 0.77, "grad_norm": 17.866289081018913, "learning_rate": 1.3595188359101152e-06, "loss": 0.4351, "step": 4759 }, { "epoch": 0.77, "grad_norm": 5.457582559212692, "learning_rate": 1.357730427575858e-06, "loss": 0.3738, "step": 4760 }, { "epoch": 0.77, "grad_norm": 8.9616537972812, "learning_rate": 1.3559430114962345e-06, "loss": 0.4459, "step": 4761 }, { "epoch": 0.77, "grad_norm": 4.52768059100778, "learning_rate": 1.3541565881581815e-06, "loss": 0.4266, "step": 4762 }, { "epoch": 0.77, "grad_norm": 1.2110562917915075, "learning_rate": 1.3523711580483717e-06, "loss": 0.45, "step": 4763 }, { "epoch": 0.77, "grad_norm": 14.546734886259859, "learning_rate": 1.350586721653207e-06, "loss": 0.392, "step": 4764 }, { "epoch": 0.77, "grad_norm": 6.7114285612021725, "learning_rate": 1.3488032794588168e-06, "loss": 0.4357, "step": 4765 }, { "epoch": 0.77, "grad_norm": 11.25814414495884, "learning_rate": 1.347020831951057e-06, "loss": 0.4871, "step": 4766 }, { "epoch": 0.77, "grad_norm": 6.19639877710852, "learning_rate": 1.3452393796155194e-06, "loss": 0.4209, "step": 4767 }, { "epoch": 0.77, "grad_norm": 5.07613534542183, "learning_rate": 1.343458922937519e-06, "loss": 0.354, "step": 4768 }, { "epoch": 0.77, "grad_norm": 8.652272743659418, "learning_rate": 1.341679462402099e-06, "loss": 0.4296, "step": 4769 }, { "epoch": 0.77, "grad_norm": 8.967775619002554, "learning_rate": 1.3399009984940376e-06, "loss": 0.4239, "step": 4770 }, { "epoch": 0.77, "grad_norm": 8.129126482678892, "learning_rate": 1.338123531697834e-06, "loss": 0.435, "step": 4771 }, { "epoch": 0.77, "grad_norm": 5.649837485305931, "learning_rate": 1.3363470624977221e-06, "loss": 0.4006, "step": 4772 }, { "epoch": 0.77, "grad_norm": 14.366720829431518, "learning_rate": 1.3345715913776575e-06, "loss": 0.392, "step": 4773 }, { "epoch": 0.77, "grad_norm": 6.119457539215147, "learning_rate": 1.3327971188213317e-06, "loss": 0.3707, "step": 4774 }, { "epoch": 0.77, "grad_norm": 5.5648236873714, "learning_rate": 1.3310236453121562e-06, "loss": 0.385, "step": 4775 }, { "epoch": 0.77, "grad_norm": 6.8181693412224975, "learning_rate": 1.3292511713332767e-06, "loss": 0.4953, "step": 4776 }, { "epoch": 0.77, "grad_norm": 6.6061573522521755, "learning_rate": 1.3274796973675629e-06, "loss": 0.4324, "step": 4777 }, { "epoch": 0.77, "grad_norm": 13.825480751326745, "learning_rate": 1.3257092238976122e-06, "loss": 0.3591, "step": 4778 }, { "epoch": 0.77, "grad_norm": 19.25773862154062, "learning_rate": 1.3239397514057523e-06, "loss": 0.4209, "step": 4779 }, { "epoch": 0.77, "grad_norm": 6.060051817070309, "learning_rate": 1.3221712803740356e-06, "loss": 0.4133, "step": 4780 }, { "epoch": 0.77, "grad_norm": 7.073113500616108, "learning_rate": 1.3204038112842404e-06, "loss": 0.4025, "step": 4781 }, { "epoch": 0.77, "grad_norm": 6.614316324453301, "learning_rate": 1.3186373446178757e-06, "loss": 0.4339, "step": 4782 }, { "epoch": 0.77, "grad_norm": 6.815605939948523, "learning_rate": 1.3168718808561793e-06, "loss": 0.3755, "step": 4783 }, { "epoch": 0.77, "grad_norm": 5.3773254226286555, "learning_rate": 1.3151074204801046e-06, "loss": 0.3894, "step": 4784 }, { "epoch": 0.77, "grad_norm": 6.579724402021042, "learning_rate": 1.313343963970344e-06, "loss": 0.363, "step": 4785 }, { "epoch": 0.77, "grad_norm": 6.069779306259264, "learning_rate": 1.3115815118073116e-06, "loss": 0.371, "step": 4786 }, { "epoch": 0.77, "grad_norm": 8.751869174939817, "learning_rate": 1.3098200644711478e-06, "loss": 0.4794, "step": 4787 }, { "epoch": 0.77, "grad_norm": 5.845005176100715, "learning_rate": 1.3080596224417174e-06, "loss": 0.3874, "step": 4788 }, { "epoch": 0.77, "grad_norm": 5.796120520823414, "learning_rate": 1.3063001861986162e-06, "loss": 0.3719, "step": 4789 }, { "epoch": 0.77, "grad_norm": 5.795757597625095, "learning_rate": 1.3045417562211616e-06, "loss": 0.4322, "step": 4790 }, { "epoch": 0.77, "grad_norm": 5.42140988415866, "learning_rate": 1.3027843329883972e-06, "loss": 0.3743, "step": 4791 }, { "epoch": 0.77, "grad_norm": 9.967291438561222, "learning_rate": 1.3010279169790947e-06, "loss": 0.4008, "step": 4792 }, { "epoch": 0.77, "grad_norm": 7.453848656225191, "learning_rate": 1.2992725086717518e-06, "loss": 0.4656, "step": 4793 }, { "epoch": 0.77, "grad_norm": 6.257711428115967, "learning_rate": 1.2975181085445887e-06, "loss": 0.4076, "step": 4794 }, { "epoch": 0.77, "grad_norm": 7.044788233291754, "learning_rate": 1.2957647170755504e-06, "loss": 0.4564, "step": 4795 }, { "epoch": 0.77, "grad_norm": 7.613923480801501, "learning_rate": 1.2940123347423133e-06, "loss": 0.4283, "step": 4796 }, { "epoch": 0.77, "grad_norm": 9.054242363437876, "learning_rate": 1.292260962022272e-06, "loss": 0.4662, "step": 4797 }, { "epoch": 0.77, "grad_norm": 9.741412496945943, "learning_rate": 1.2905105993925477e-06, "loss": 0.4206, "step": 4798 }, { "epoch": 0.77, "grad_norm": 5.763334296659333, "learning_rate": 1.2887612473299905e-06, "loss": 0.4709, "step": 4799 }, { "epoch": 0.77, "grad_norm": 6.457734338729853, "learning_rate": 1.2870129063111685e-06, "loss": 0.4196, "step": 4800 }, { "epoch": 0.77, "grad_norm": 6.5119562863606175, "learning_rate": 1.2852655768123811e-06, "loss": 0.3866, "step": 4801 }, { "epoch": 0.77, "grad_norm": 12.733457095218865, "learning_rate": 1.2835192593096485e-06, "loss": 0.4194, "step": 4802 }, { "epoch": 0.77, "grad_norm": 8.946065069622087, "learning_rate": 1.2817739542787134e-06, "loss": 0.4093, "step": 4803 }, { "epoch": 0.77, "grad_norm": 10.880314887596017, "learning_rate": 1.2800296621950463e-06, "loss": 0.3792, "step": 4804 }, { "epoch": 0.77, "grad_norm": 6.511249416942179, "learning_rate": 1.2782863835338444e-06, "loss": 0.5258, "step": 4805 }, { "epoch": 0.77, "grad_norm": 8.86039024105612, "learning_rate": 1.2765441187700179e-06, "loss": 0.4578, "step": 4806 }, { "epoch": 0.77, "grad_norm": 4.851119683417882, "learning_rate": 1.2748028683782115e-06, "loss": 0.3771, "step": 4807 }, { "epoch": 0.77, "grad_norm": 4.794361005822668, "learning_rate": 1.2730626328327906e-06, "loss": 0.3474, "step": 4808 }, { "epoch": 0.77, "grad_norm": 6.2513150652853176, "learning_rate": 1.2713234126078423e-06, "loss": 0.4734, "step": 4809 }, { "epoch": 0.77, "grad_norm": 8.663512757648, "learning_rate": 1.2695852081771758e-06, "loss": 0.4235, "step": 4810 }, { "epoch": 0.78, "grad_norm": 6.661108650185705, "learning_rate": 1.267848020014329e-06, "loss": 0.3803, "step": 4811 }, { "epoch": 0.78, "grad_norm": 5.145690485357668, "learning_rate": 1.266111848592561e-06, "loss": 0.3994, "step": 4812 }, { "epoch": 0.78, "grad_norm": 4.661461533529053, "learning_rate": 1.2643766943848484e-06, "loss": 0.4569, "step": 4813 }, { "epoch": 0.78, "grad_norm": 6.814108637481711, "learning_rate": 1.2626425578638973e-06, "loss": 0.4524, "step": 4814 }, { "epoch": 0.78, "grad_norm": 4.430298968976859, "learning_rate": 1.2609094395021354e-06, "loss": 0.366, "step": 4815 }, { "epoch": 0.78, "grad_norm": 7.375367103054462, "learning_rate": 1.259177339771711e-06, "loss": 0.3775, "step": 4816 }, { "epoch": 0.78, "grad_norm": 1.2287926248963557, "learning_rate": 1.257446259144494e-06, "loss": 0.4245, "step": 4817 }, { "epoch": 0.78, "grad_norm": 7.350105195055138, "learning_rate": 1.2557161980920824e-06, "loss": 0.4589, "step": 4818 }, { "epoch": 0.78, "grad_norm": 9.270269980420554, "learning_rate": 1.2539871570857893e-06, "loss": 0.4408, "step": 4819 }, { "epoch": 0.78, "grad_norm": 5.273158463216462, "learning_rate": 1.252259136596653e-06, "loss": 0.4001, "step": 4820 }, { "epoch": 0.78, "grad_norm": 9.43957974473868, "learning_rate": 1.250532137095436e-06, "loss": 0.3991, "step": 4821 }, { "epoch": 0.78, "grad_norm": 9.569199359459839, "learning_rate": 1.2488061590526185e-06, "loss": 0.4416, "step": 4822 }, { "epoch": 0.78, "grad_norm": 6.716206736622448, "learning_rate": 1.2470812029384071e-06, "loss": 0.3839, "step": 4823 }, { "epoch": 0.78, "grad_norm": 7.599728007522551, "learning_rate": 1.2453572692227257e-06, "loss": 0.4449, "step": 4824 }, { "epoch": 0.78, "grad_norm": 8.634701682554734, "learning_rate": 1.2436343583752197e-06, "loss": 0.4027, "step": 4825 }, { "epoch": 0.78, "grad_norm": 8.532351873203318, "learning_rate": 1.2419124708652607e-06, "loss": 0.417, "step": 4826 }, { "epoch": 0.78, "grad_norm": 6.043082239316731, "learning_rate": 1.2401916071619374e-06, "loss": 0.4196, "step": 4827 }, { "epoch": 0.78, "grad_norm": 1.390813896748056, "learning_rate": 1.2384717677340585e-06, "loss": 0.4674, "step": 4828 }, { "epoch": 0.78, "grad_norm": 8.168855612728432, "learning_rate": 1.2367529530501571e-06, "loss": 0.4384, "step": 4829 }, { "epoch": 0.78, "grad_norm": 9.087948860162664, "learning_rate": 1.2350351635784875e-06, "loss": 0.4515, "step": 4830 }, { "epoch": 0.78, "grad_norm": 9.49881774896016, "learning_rate": 1.2333183997870207e-06, "loss": 0.4665, "step": 4831 }, { "epoch": 0.78, "grad_norm": 8.606645952295898, "learning_rate": 1.2316026621434502e-06, "loss": 0.3427, "step": 4832 }, { "epoch": 0.78, "grad_norm": 6.296504266617278, "learning_rate": 1.2298879511151906e-06, "loss": 0.4123, "step": 4833 }, { "epoch": 0.78, "grad_norm": 5.954155165172583, "learning_rate": 1.2281742671693798e-06, "loss": 0.412, "step": 4834 }, { "epoch": 0.78, "grad_norm": 5.030820647092895, "learning_rate": 1.2264616107728666e-06, "loss": 0.4213, "step": 4835 }, { "epoch": 0.78, "grad_norm": 6.226509670338716, "learning_rate": 1.2247499823922287e-06, "loss": 0.4184, "step": 4836 }, { "epoch": 0.78, "grad_norm": 7.055801030254379, "learning_rate": 1.2230393824937631e-06, "loss": 0.4271, "step": 4837 }, { "epoch": 0.78, "grad_norm": 14.778853863372987, "learning_rate": 1.2213298115434812e-06, "loss": 0.4056, "step": 4838 }, { "epoch": 0.78, "grad_norm": 10.875500736448283, "learning_rate": 1.219621270007118e-06, "loss": 0.4807, "step": 4839 }, { "epoch": 0.78, "grad_norm": 7.812803412095989, "learning_rate": 1.2179137583501282e-06, "loss": 0.463, "step": 4840 }, { "epoch": 0.78, "grad_norm": 6.0145360970099695, "learning_rate": 1.2162072770376848e-06, "loss": 0.3725, "step": 4841 }, { "epoch": 0.78, "grad_norm": 5.82177435095611, "learning_rate": 1.2145018265346786e-06, "loss": 0.4113, "step": 4842 }, { "epoch": 0.78, "grad_norm": 12.324405086185829, "learning_rate": 1.2127974073057241e-06, "loss": 0.4018, "step": 4843 }, { "epoch": 0.78, "grad_norm": 9.224705225362557, "learning_rate": 1.2110940198151489e-06, "loss": 0.4489, "step": 4844 }, { "epoch": 0.78, "grad_norm": 8.532376202257181, "learning_rate": 1.2093916645270066e-06, "loss": 0.3946, "step": 4845 }, { "epoch": 0.78, "grad_norm": 6.534333913007494, "learning_rate": 1.2076903419050629e-06, "loss": 0.4825, "step": 4846 }, { "epoch": 0.78, "grad_norm": 12.747471279876997, "learning_rate": 1.2059900524128048e-06, "loss": 0.3488, "step": 4847 }, { "epoch": 0.78, "grad_norm": 5.369645366493282, "learning_rate": 1.2042907965134404e-06, "loss": 0.4659, "step": 4848 }, { "epoch": 0.78, "grad_norm": 5.836203920138631, "learning_rate": 1.2025925746698918e-06, "loss": 0.4487, "step": 4849 }, { "epoch": 0.78, "grad_norm": 8.162206563773072, "learning_rate": 1.200895387344801e-06, "loss": 0.3704, "step": 4850 }, { "epoch": 0.78, "grad_norm": 6.877802731151928, "learning_rate": 1.1991992350005294e-06, "loss": 0.4521, "step": 4851 }, { "epoch": 0.78, "grad_norm": 4.813674888979266, "learning_rate": 1.1975041180991576e-06, "loss": 0.4337, "step": 4852 }, { "epoch": 0.78, "grad_norm": 8.597605002150404, "learning_rate": 1.19581003710248e-06, "loss": 0.4217, "step": 4853 }, { "epoch": 0.78, "grad_norm": 24.20871815898599, "learning_rate": 1.1941169924720103e-06, "loss": 0.4004, "step": 4854 }, { "epoch": 0.78, "grad_norm": 9.861936868830862, "learning_rate": 1.1924249846689835e-06, "loss": 0.3839, "step": 4855 }, { "epoch": 0.78, "grad_norm": 16.127468078524796, "learning_rate": 1.1907340141543466e-06, "loss": 0.4094, "step": 4856 }, { "epoch": 0.78, "grad_norm": 6.604542676373059, "learning_rate": 1.189044081388766e-06, "loss": 0.4602, "step": 4857 }, { "epoch": 0.78, "grad_norm": 1.0427336029682848, "learning_rate": 1.1873551868326272e-06, "loss": 0.4342, "step": 4858 }, { "epoch": 0.78, "grad_norm": 10.791213059539333, "learning_rate": 1.185667330946033e-06, "loss": 0.4099, "step": 4859 }, { "epoch": 0.78, "grad_norm": 34.67555047690918, "learning_rate": 1.1839805141888012e-06, "loss": 0.3813, "step": 4860 }, { "epoch": 0.78, "grad_norm": 16.213908656291117, "learning_rate": 1.1822947370204647e-06, "loss": 0.36, "step": 4861 }, { "epoch": 0.78, "grad_norm": 12.967771159934538, "learning_rate": 1.180609999900279e-06, "loss": 0.3874, "step": 4862 }, { "epoch": 0.78, "grad_norm": 8.493203190032089, "learning_rate": 1.1789263032872112e-06, "loss": 0.4315, "step": 4863 }, { "epoch": 0.78, "grad_norm": 8.478590041519976, "learning_rate": 1.1772436476399456e-06, "loss": 0.3658, "step": 4864 }, { "epoch": 0.78, "grad_norm": 9.337306400351714, "learning_rate": 1.1755620334168866e-06, "loss": 0.5091, "step": 4865 }, { "epoch": 0.78, "grad_norm": 10.8222973441737, "learning_rate": 1.1738814610761496e-06, "loss": 0.4442, "step": 4866 }, { "epoch": 0.78, "grad_norm": 16.8350004410709, "learning_rate": 1.1722019310755717e-06, "loss": 0.4507, "step": 4867 }, { "epoch": 0.78, "grad_norm": 5.2404060652488225, "learning_rate": 1.1705234438727015e-06, "loss": 0.3829, "step": 4868 }, { "epoch": 0.78, "grad_norm": 11.951481349415708, "learning_rate": 1.1688459999248042e-06, "loss": 0.4379, "step": 4869 }, { "epoch": 0.78, "grad_norm": 7.542107477687765, "learning_rate": 1.1671695996888637e-06, "loss": 0.423, "step": 4870 }, { "epoch": 0.78, "grad_norm": 5.173045720650641, "learning_rate": 1.165494243621576e-06, "loss": 0.3366, "step": 4871 }, { "epoch": 0.78, "grad_norm": 5.817230880108993, "learning_rate": 1.1638199321793563e-06, "loss": 0.387, "step": 4872 }, { "epoch": 0.79, "grad_norm": 1.1257645849375972, "learning_rate": 1.1621466658183306e-06, "loss": 0.4826, "step": 4873 }, { "epoch": 0.79, "grad_norm": 12.143604211494988, "learning_rate": 1.1604744449943455e-06, "loss": 0.4169, "step": 4874 }, { "epoch": 0.79, "grad_norm": 9.033032469971031, "learning_rate": 1.1588032701629592e-06, "loss": 0.3437, "step": 4875 }, { "epoch": 0.79, "grad_norm": 6.933810233315558, "learning_rate": 1.1571331417794435e-06, "loss": 0.4569, "step": 4876 }, { "epoch": 0.79, "grad_norm": 7.270906845343537, "learning_rate": 1.1554640602987905e-06, "loss": 0.417, "step": 4877 }, { "epoch": 0.79, "grad_norm": 4.107057029200767, "learning_rate": 1.153796026175703e-06, "loss": 0.408, "step": 4878 }, { "epoch": 0.79, "grad_norm": 13.046430969569801, "learning_rate": 1.1521290398645978e-06, "loss": 0.392, "step": 4879 }, { "epoch": 0.79, "grad_norm": 8.039660356518825, "learning_rate": 1.150463101819609e-06, "loss": 0.4277, "step": 4880 }, { "epoch": 0.79, "grad_norm": 5.057185128188726, "learning_rate": 1.1487982124945861e-06, "loss": 0.3533, "step": 4881 }, { "epoch": 0.79, "grad_norm": 4.721321919283583, "learning_rate": 1.147134372343089e-06, "loss": 0.3886, "step": 4882 }, { "epoch": 0.79, "grad_norm": 7.565969246659852, "learning_rate": 1.1454715818183927e-06, "loss": 0.4732, "step": 4883 }, { "epoch": 0.79, "grad_norm": 5.19155752636563, "learning_rate": 1.1438098413734888e-06, "loss": 0.3264, "step": 4884 }, { "epoch": 0.79, "grad_norm": 16.138667175847573, "learning_rate": 1.142149151461081e-06, "loss": 0.4115, "step": 4885 }, { "epoch": 0.79, "grad_norm": 13.811759906429646, "learning_rate": 1.1404895125335859e-06, "loss": 0.4213, "step": 4886 }, { "epoch": 0.79, "grad_norm": 6.23119607835735, "learning_rate": 1.1388309250431363e-06, "loss": 0.4176, "step": 4887 }, { "epoch": 0.79, "grad_norm": 18.30188683578474, "learning_rate": 1.1371733894415748e-06, "loss": 0.3778, "step": 4888 }, { "epoch": 0.79, "grad_norm": 8.695343968996843, "learning_rate": 1.1355169061804632e-06, "loss": 0.4512, "step": 4889 }, { "epoch": 0.79, "grad_norm": 12.477559586149049, "learning_rate": 1.1338614757110706e-06, "loss": 0.4541, "step": 4890 }, { "epoch": 0.79, "grad_norm": 15.872409182259652, "learning_rate": 1.1322070984843837e-06, "loss": 0.4145, "step": 4891 }, { "epoch": 0.79, "grad_norm": 9.598142688364083, "learning_rate": 1.1305537749510993e-06, "loss": 0.4578, "step": 4892 }, { "epoch": 0.79, "grad_norm": 6.174761032103409, "learning_rate": 1.128901505561627e-06, "loss": 0.4353, "step": 4893 }, { "epoch": 0.79, "grad_norm": 8.258146760132579, "learning_rate": 1.1272502907660937e-06, "loss": 0.3567, "step": 4894 }, { "epoch": 0.79, "grad_norm": 6.113047766642349, "learning_rate": 1.1256001310143327e-06, "loss": 0.4309, "step": 4895 }, { "epoch": 0.79, "grad_norm": 1.1136941523228963, "learning_rate": 1.1239510267558962e-06, "loss": 0.4268, "step": 4896 }, { "epoch": 0.79, "grad_norm": 8.169937547317659, "learning_rate": 1.1223029784400436e-06, "loss": 0.432, "step": 4897 }, { "epoch": 0.79, "grad_norm": 8.219536882294408, "learning_rate": 1.120655986515748e-06, "loss": 0.3921, "step": 4898 }, { "epoch": 0.79, "grad_norm": 8.828703023916113, "learning_rate": 1.1190100514316977e-06, "loss": 0.4172, "step": 4899 }, { "epoch": 0.79, "grad_norm": 6.69754409550244, "learning_rate": 1.1173651736362889e-06, "loss": 0.3658, "step": 4900 }, { "epoch": 0.79, "grad_norm": 13.589375433301951, "learning_rate": 1.1157213535776312e-06, "loss": 0.3768, "step": 4901 }, { "epoch": 0.79, "grad_norm": 6.138009918242657, "learning_rate": 1.114078591703548e-06, "loss": 0.4149, "step": 4902 }, { "epoch": 0.79, "grad_norm": 12.98675754904074, "learning_rate": 1.1124368884615748e-06, "loss": 0.3853, "step": 4903 }, { "epoch": 0.79, "grad_norm": 6.215564516546042, "learning_rate": 1.1107962442989518e-06, "loss": 0.4116, "step": 4904 }, { "epoch": 0.79, "grad_norm": 17.307138561534714, "learning_rate": 1.1091566596626384e-06, "loss": 0.3917, "step": 4905 }, { "epoch": 0.79, "grad_norm": 9.83477257736823, "learning_rate": 1.1075181349993042e-06, "loss": 0.4296, "step": 4906 }, { "epoch": 0.79, "grad_norm": 5.867720886453453, "learning_rate": 1.1058806707553266e-06, "loss": 0.4034, "step": 4907 }, { "epoch": 0.79, "grad_norm": 5.0926177512602955, "learning_rate": 1.104244267376795e-06, "loss": 0.4437, "step": 4908 }, { "epoch": 0.79, "grad_norm": 13.626745599928466, "learning_rate": 1.1026089253095134e-06, "loss": 0.4291, "step": 4909 }, { "epoch": 0.79, "grad_norm": 7.2030915909672855, "learning_rate": 1.1009746449989916e-06, "loss": 0.4646, "step": 4910 }, { "epoch": 0.79, "grad_norm": 10.15358081378589, "learning_rate": 1.0993414268904552e-06, "loss": 0.3852, "step": 4911 }, { "epoch": 0.79, "grad_norm": 4.900883586711025, "learning_rate": 1.0977092714288345e-06, "loss": 0.4327, "step": 4912 }, { "epoch": 0.79, "grad_norm": 72.6580849504174, "learning_rate": 1.0960781790587776e-06, "loss": 0.4626, "step": 4913 }, { "epoch": 0.79, "grad_norm": 6.72585593304791, "learning_rate": 1.0944481502246368e-06, "loss": 0.3884, "step": 4914 }, { "epoch": 0.79, "grad_norm": 30.363817516976347, "learning_rate": 1.0928191853704757e-06, "loss": 0.4103, "step": 4915 }, { "epoch": 0.79, "grad_norm": 5.780525590379177, "learning_rate": 1.0911912849400712e-06, "loss": 0.452, "step": 4916 }, { "epoch": 0.79, "grad_norm": 15.30028644570836, "learning_rate": 1.089564449376907e-06, "loss": 0.3628, "step": 4917 }, { "epoch": 0.79, "grad_norm": 12.452722242899878, "learning_rate": 1.0879386791241797e-06, "loss": 0.4274, "step": 4918 }, { "epoch": 0.79, "grad_norm": 4.7821004256668616, "learning_rate": 1.086313974624793e-06, "loss": 0.4, "step": 4919 }, { "epoch": 0.79, "grad_norm": 5.786521716995293, "learning_rate": 1.0846903363213595e-06, "loss": 0.3846, "step": 4920 }, { "epoch": 0.79, "grad_norm": 6.547791064923995, "learning_rate": 1.083067764656206e-06, "loss": 0.3836, "step": 4921 }, { "epoch": 0.79, "grad_norm": 1.3809455618444775, "learning_rate": 1.0814462600713642e-06, "loss": 0.4703, "step": 4922 }, { "epoch": 0.79, "grad_norm": 7.379113198955013, "learning_rate": 1.0798258230085756e-06, "loss": 0.4598, "step": 4923 }, { "epoch": 0.79, "grad_norm": 4.964939935856226, "learning_rate": 1.078206453909293e-06, "loss": 0.4321, "step": 4924 }, { "epoch": 0.79, "grad_norm": 7.940795913640375, "learning_rate": 1.0765881532146793e-06, "loss": 0.3879, "step": 4925 }, { "epoch": 0.79, "grad_norm": 4.74925577751493, "learning_rate": 1.0749709213656001e-06, "loss": 0.4159, "step": 4926 }, { "epoch": 0.79, "grad_norm": 5.313000682212276, "learning_rate": 1.0733547588026355e-06, "loss": 0.4708, "step": 4927 }, { "epoch": 0.79, "grad_norm": 6.199337850968073, "learning_rate": 1.071739665966075e-06, "loss": 0.4357, "step": 4928 }, { "epoch": 0.79, "grad_norm": 9.858182101830659, "learning_rate": 1.0701256432959123e-06, "loss": 0.3574, "step": 4929 }, { "epoch": 0.79, "grad_norm": 6.826240907848948, "learning_rate": 1.0685126912318511e-06, "loss": 0.4226, "step": 4930 }, { "epoch": 0.79, "grad_norm": 6.013713067614893, "learning_rate": 1.0669008102133044e-06, "loss": 0.4422, "step": 4931 }, { "epoch": 0.79, "grad_norm": 5.081971722930161, "learning_rate": 1.0652900006793953e-06, "loss": 0.3924, "step": 4932 }, { "epoch": 0.79, "grad_norm": 7.943272492713196, "learning_rate": 1.0636802630689508e-06, "loss": 0.4615, "step": 4933 }, { "epoch": 0.79, "grad_norm": 4.5140519934195265, "learning_rate": 1.0620715978205066e-06, "loss": 0.3525, "step": 4934 }, { "epoch": 0.8, "grad_norm": 6.067346893747588, "learning_rate": 1.0604640053723098e-06, "loss": 0.4371, "step": 4935 }, { "epoch": 0.8, "grad_norm": 6.667967905546646, "learning_rate": 1.058857486162312e-06, "loss": 0.3647, "step": 4936 }, { "epoch": 0.8, "grad_norm": 6.2920299974096565, "learning_rate": 1.0572520406281716e-06, "loss": 0.3445, "step": 4937 }, { "epoch": 0.8, "grad_norm": 5.956711707048635, "learning_rate": 1.0556476692072598e-06, "loss": 0.3427, "step": 4938 }, { "epoch": 0.8, "grad_norm": 12.600431628391135, "learning_rate": 1.0540443723366478e-06, "loss": 0.4162, "step": 4939 }, { "epoch": 0.8, "grad_norm": 5.489760134334394, "learning_rate": 1.0524421504531212e-06, "loss": 0.4186, "step": 4940 }, { "epoch": 0.8, "grad_norm": 8.139154625046737, "learning_rate": 1.0508410039931683e-06, "loss": 0.4075, "step": 4941 }, { "epoch": 0.8, "grad_norm": 6.192614250626416, "learning_rate": 1.0492409333929833e-06, "loss": 0.3863, "step": 4942 }, { "epoch": 0.8, "grad_norm": 7.8984561597926985, "learning_rate": 1.0476419390884723e-06, "loss": 0.3916, "step": 4943 }, { "epoch": 0.8, "grad_norm": 5.2364041240399715, "learning_rate": 1.046044021515245e-06, "loss": 0.363, "step": 4944 }, { "epoch": 0.8, "grad_norm": 15.079074923930056, "learning_rate": 1.0444471811086166e-06, "loss": 0.3714, "step": 4945 }, { "epoch": 0.8, "grad_norm": 8.656039669805464, "learning_rate": 1.0428514183036109e-06, "loss": 0.3978, "step": 4946 }, { "epoch": 0.8, "grad_norm": 1.2374127938961095, "learning_rate": 1.0412567335349616e-06, "loss": 0.4888, "step": 4947 }, { "epoch": 0.8, "grad_norm": 8.830543773477803, "learning_rate": 1.0396631272370982e-06, "loss": 0.3774, "step": 4948 }, { "epoch": 0.8, "grad_norm": 12.974909796855309, "learning_rate": 1.0380705998441664e-06, "loss": 0.3599, "step": 4949 }, { "epoch": 0.8, "grad_norm": 5.90625739868267, "learning_rate": 1.0364791517900164e-06, "loss": 0.4047, "step": 4950 }, { "epoch": 0.8, "grad_norm": 5.4773383034245455, "learning_rate": 1.0348887835082e-06, "loss": 0.3973, "step": 4951 }, { "epoch": 0.8, "grad_norm": 9.863954438828529, "learning_rate": 1.0332994954319763e-06, "loss": 0.4799, "step": 4952 }, { "epoch": 0.8, "grad_norm": 6.146777214949624, "learning_rate": 1.031711287994313e-06, "loss": 0.4244, "step": 4953 }, { "epoch": 0.8, "grad_norm": 5.369826997485884, "learning_rate": 1.0301241616278845e-06, "loss": 0.4147, "step": 4954 }, { "epoch": 0.8, "grad_norm": 4.849554090027531, "learning_rate": 1.0285381167650615e-06, "loss": 0.3987, "step": 4955 }, { "epoch": 0.8, "grad_norm": 5.095824091148038, "learning_rate": 1.0269531538379295e-06, "loss": 0.4238, "step": 4956 }, { "epoch": 0.8, "grad_norm": 20.55153736765723, "learning_rate": 1.0253692732782778e-06, "loss": 0.3951, "step": 4957 }, { "epoch": 0.8, "grad_norm": 9.279900706244042, "learning_rate": 1.0237864755175969e-06, "loss": 0.4272, "step": 4958 }, { "epoch": 0.8, "grad_norm": 5.260168225815382, "learning_rate": 1.022204760987084e-06, "loss": 0.401, "step": 4959 }, { "epoch": 0.8, "grad_norm": 1.032808876788408, "learning_rate": 1.0206241301176439e-06, "loss": 0.4949, "step": 4960 }, { "epoch": 0.8, "grad_norm": 8.836601342135245, "learning_rate": 1.0190445833398814e-06, "loss": 0.4721, "step": 4961 }, { "epoch": 0.8, "grad_norm": 7.477879073364938, "learning_rate": 1.0174661210841119e-06, "loss": 0.4417, "step": 4962 }, { "epoch": 0.8, "grad_norm": 6.353823630239038, "learning_rate": 1.0158887437803499e-06, "loss": 0.438, "step": 4963 }, { "epoch": 0.8, "grad_norm": 7.861081314770807, "learning_rate": 1.0143124518583158e-06, "loss": 0.4406, "step": 4964 }, { "epoch": 0.8, "grad_norm": 9.334349305032633, "learning_rate": 1.012737245747437e-06, "loss": 0.4935, "step": 4965 }, { "epoch": 0.8, "grad_norm": 8.167322926874983, "learning_rate": 1.0111631258768416e-06, "loss": 0.4175, "step": 4966 }, { "epoch": 0.8, "grad_norm": 4.953860508967381, "learning_rate": 1.0095900926753632e-06, "loss": 0.3807, "step": 4967 }, { "epoch": 0.8, "grad_norm": 7.197595117450276, "learning_rate": 1.0080181465715394e-06, "loss": 0.3851, "step": 4968 }, { "epoch": 0.8, "grad_norm": 10.924122292778346, "learning_rate": 1.0064472879936132e-06, "loss": 0.4282, "step": 4969 }, { "epoch": 0.8, "grad_norm": 6.531869034942238, "learning_rate": 1.0048775173695285e-06, "loss": 0.3676, "step": 4970 }, { "epoch": 0.8, "grad_norm": 6.307874892284786, "learning_rate": 1.0033088351269338e-06, "loss": 0.4513, "step": 4971 }, { "epoch": 0.8, "grad_norm": 7.287532284042153, "learning_rate": 1.0017412416931826e-06, "loss": 0.3795, "step": 4972 }, { "epoch": 0.8, "grad_norm": 15.654009692165715, "learning_rate": 1.0001747374953297e-06, "loss": 0.3887, "step": 4973 }, { "epoch": 0.8, "grad_norm": 10.775942144550207, "learning_rate": 9.986093229601328e-07, "loss": 0.3329, "step": 4974 }, { "epoch": 0.8, "grad_norm": 7.487499343942186, "learning_rate": 9.970449985140557e-07, "loss": 0.3832, "step": 4975 }, { "epoch": 0.8, "grad_norm": 10.42239110313061, "learning_rate": 9.954817645832654e-07, "loss": 0.4377, "step": 4976 }, { "epoch": 0.8, "grad_norm": 5.963052878324259, "learning_rate": 9.939196215936253e-07, "loss": 0.4652, "step": 4977 }, { "epoch": 0.8, "grad_norm": 6.882870542554774, "learning_rate": 9.923585699707084e-07, "loss": 0.4309, "step": 4978 }, { "epoch": 0.8, "grad_norm": 17.696632571582906, "learning_rate": 9.907986101397898e-07, "loss": 0.456, "step": 4979 }, { "epoch": 0.8, "grad_norm": 10.098344269277957, "learning_rate": 9.892397425258437e-07, "loss": 0.4707, "step": 4980 }, { "epoch": 0.8, "grad_norm": 16.544295198139, "learning_rate": 9.876819675535477e-07, "loss": 0.4684, "step": 4981 }, { "epoch": 0.8, "grad_norm": 5.965667646841321, "learning_rate": 9.861252856472857e-07, "loss": 0.3916, "step": 4982 }, { "epoch": 0.8, "grad_norm": 1.193253063214609, "learning_rate": 9.845696972311385e-07, "loss": 0.4676, "step": 4983 }, { "epoch": 0.8, "grad_norm": 6.036223267656382, "learning_rate": 9.830152027288907e-07, "loss": 0.4002, "step": 4984 }, { "epoch": 0.8, "grad_norm": 6.938859508741169, "learning_rate": 9.81461802564032e-07, "loss": 0.3744, "step": 4985 }, { "epoch": 0.8, "grad_norm": 5.290840342282463, "learning_rate": 9.799094971597483e-07, "loss": 0.4288, "step": 4986 }, { "epoch": 0.8, "grad_norm": 25.701346448766785, "learning_rate": 9.783582869389336e-07, "loss": 0.4139, "step": 4987 }, { "epoch": 0.8, "grad_norm": 11.838085518267414, "learning_rate": 9.768081723241785e-07, "loss": 0.393, "step": 4988 }, { "epoch": 0.8, "grad_norm": 4.870090570475528, "learning_rate": 9.752591537377758e-07, "loss": 0.4114, "step": 4989 }, { "epoch": 0.8, "grad_norm": 7.032666404941196, "learning_rate": 9.737112316017221e-07, "loss": 0.3744, "step": 4990 }, { "epoch": 0.8, "grad_norm": 6.9707821947421165, "learning_rate": 9.72164406337716e-07, "loss": 0.4239, "step": 4991 }, { "epoch": 0.8, "grad_norm": 9.060884011168355, "learning_rate": 9.706186783671535e-07, "loss": 0.4425, "step": 4992 }, { "epoch": 0.8, "grad_norm": 6.996254019001918, "learning_rate": 9.69074048111132e-07, "loss": 0.4072, "step": 4993 }, { "epoch": 0.8, "grad_norm": 8.58636588938138, "learning_rate": 9.675305159904546e-07, "loss": 0.3688, "step": 4994 }, { "epoch": 0.8, "grad_norm": 8.58696816776118, "learning_rate": 9.659880824256202e-07, "loss": 0.4528, "step": 4995 }, { "epoch": 0.8, "grad_norm": 7.1081586607191145, "learning_rate": 9.644467478368286e-07, "loss": 0.4519, "step": 4996 }, { "epoch": 0.81, "grad_norm": 5.281218821644457, "learning_rate": 9.629065126439842e-07, "loss": 0.4273, "step": 4997 }, { "epoch": 0.81, "grad_norm": 5.869212687187768, "learning_rate": 9.61367377266691e-07, "loss": 0.4843, "step": 4998 }, { "epoch": 0.81, "grad_norm": 9.437692765761465, "learning_rate": 9.59829342124247e-07, "loss": 0.4287, "step": 4999 }, { "epoch": 0.81, "grad_norm": 5.8777397953456045, "learning_rate": 9.582924076356587e-07, "loss": 0.4853, "step": 5000 }, { "epoch": 0.81, "grad_norm": 7.337724926224135, "learning_rate": 9.5675657421963e-07, "loss": 0.4358, "step": 5001 }, { "epoch": 0.81, "grad_norm": 10.23514949433835, "learning_rate": 9.552218422945636e-07, "loss": 0.4233, "step": 5002 }, { "epoch": 0.81, "grad_norm": 12.399285657233223, "learning_rate": 9.536882122785602e-07, "loss": 0.3559, "step": 5003 }, { "epoch": 0.81, "grad_norm": 103.07473343182457, "learning_rate": 9.52155684589427e-07, "loss": 0.3986, "step": 5004 }, { "epoch": 0.81, "grad_norm": 6.24667607287334, "learning_rate": 9.506242596446641e-07, "loss": 0.3972, "step": 5005 }, { "epoch": 0.81, "grad_norm": 7.704459868219285, "learning_rate": 9.490939378614739e-07, "loss": 0.4332, "step": 5006 }, { "epoch": 0.81, "grad_norm": 4.909994027098494, "learning_rate": 9.4756471965676e-07, "loss": 0.4195, "step": 5007 }, { "epoch": 0.81, "grad_norm": 1.1908633225652752, "learning_rate": 9.46036605447121e-07, "loss": 0.4562, "step": 5008 }, { "epoch": 0.81, "grad_norm": 7.898239638696525, "learning_rate": 9.445095956488604e-07, "loss": 0.4203, "step": 5009 }, { "epoch": 0.81, "grad_norm": 5.931213696842268, "learning_rate": 9.42983690677975e-07, "loss": 0.459, "step": 5010 }, { "epoch": 0.81, "grad_norm": 11.746113045100664, "learning_rate": 9.414588909501654e-07, "loss": 0.4417, "step": 5011 }, { "epoch": 0.81, "grad_norm": 7.110872701715413, "learning_rate": 9.399351968808285e-07, "loss": 0.4614, "step": 5012 }, { "epoch": 0.81, "grad_norm": 8.282669694218209, "learning_rate": 9.384126088850592e-07, "loss": 0.5, "step": 5013 }, { "epoch": 0.81, "grad_norm": 10.611356955787391, "learning_rate": 9.368911273776543e-07, "loss": 0.3931, "step": 5014 }, { "epoch": 0.81, "grad_norm": 9.486513387758569, "learning_rate": 9.35370752773106e-07, "loss": 0.4794, "step": 5015 }, { "epoch": 0.81, "grad_norm": 8.956353836422622, "learning_rate": 9.338514854856073e-07, "loss": 0.3784, "step": 5016 }, { "epoch": 0.81, "grad_norm": 11.38250462575223, "learning_rate": 9.323333259290484e-07, "loss": 0.4483, "step": 5017 }, { "epoch": 0.81, "grad_norm": 5.924234592942446, "learning_rate": 9.308162745170163e-07, "loss": 0.3826, "step": 5018 }, { "epoch": 0.81, "grad_norm": 6.271223702875999, "learning_rate": 9.293003316627985e-07, "loss": 0.3633, "step": 5019 }, { "epoch": 0.81, "grad_norm": 6.939777435546889, "learning_rate": 9.277854977793827e-07, "loss": 0.4074, "step": 5020 }, { "epoch": 0.81, "grad_norm": 6.543684184766702, "learning_rate": 9.262717732794457e-07, "loss": 0.438, "step": 5021 }, { "epoch": 0.81, "grad_norm": 6.478895814218908, "learning_rate": 9.247591585753707e-07, "loss": 0.4307, "step": 5022 }, { "epoch": 0.81, "grad_norm": 8.114989081342317, "learning_rate": 9.232476540792367e-07, "loss": 0.4465, "step": 5023 }, { "epoch": 0.81, "grad_norm": 4.845936716756069, "learning_rate": 9.217372602028185e-07, "loss": 0.4219, "step": 5024 }, { "epoch": 0.81, "grad_norm": 5.77658401705603, "learning_rate": 9.202279773575873e-07, "loss": 0.3496, "step": 5025 }, { "epoch": 0.81, "grad_norm": 6.26252364972767, "learning_rate": 9.187198059547153e-07, "loss": 0.4356, "step": 5026 }, { "epoch": 0.81, "grad_norm": 7.138575439222402, "learning_rate": 9.172127464050701e-07, "loss": 0.4114, "step": 5027 }, { "epoch": 0.81, "grad_norm": 7.469923901436861, "learning_rate": 9.157067991192137e-07, "loss": 0.472, "step": 5028 }, { "epoch": 0.81, "grad_norm": 10.216767559556537, "learning_rate": 9.14201964507409e-07, "loss": 0.4154, "step": 5029 }, { "epoch": 0.81, "grad_norm": 4.9130901693400455, "learning_rate": 9.126982429796172e-07, "loss": 0.4365, "step": 5030 }, { "epoch": 0.81, "grad_norm": 5.246565335193918, "learning_rate": 9.111956349454904e-07, "loss": 0.3882, "step": 5031 }, { "epoch": 0.81, "grad_norm": 6.706807199624973, "learning_rate": 9.0969414081438e-07, "loss": 0.4016, "step": 5032 }, { "epoch": 0.81, "grad_norm": 7.071067235671073, "learning_rate": 9.081937609953367e-07, "loss": 0.3895, "step": 5033 }, { "epoch": 0.81, "grad_norm": 8.314847151289053, "learning_rate": 9.066944958971046e-07, "loss": 0.4585, "step": 5034 }, { "epoch": 0.81, "grad_norm": 4.808177914660189, "learning_rate": 9.051963459281232e-07, "loss": 0.394, "step": 5035 }, { "epoch": 0.81, "grad_norm": 4.361053525014949, "learning_rate": 9.03699311496532e-07, "loss": 0.449, "step": 5036 }, { "epoch": 0.81, "grad_norm": 4.768545414900244, "learning_rate": 9.022033930101625e-07, "loss": 0.4506, "step": 5037 }, { "epoch": 0.81, "grad_norm": 17.874350752070075, "learning_rate": 9.007085908765467e-07, "loss": 0.4147, "step": 5038 }, { "epoch": 0.81, "grad_norm": 6.2758915111608795, "learning_rate": 8.992149055029081e-07, "loss": 0.3686, "step": 5039 }, { "epoch": 0.81, "grad_norm": 41.76492998447651, "learning_rate": 8.977223372961663e-07, "loss": 0.3433, "step": 5040 }, { "epoch": 0.81, "grad_norm": 7.987419195651066, "learning_rate": 8.962308866629416e-07, "loss": 0.4251, "step": 5041 }, { "epoch": 0.81, "grad_norm": 8.777584928992443, "learning_rate": 8.947405540095444e-07, "loss": 0.4039, "step": 5042 }, { "epoch": 0.81, "grad_norm": 4.187751934916821, "learning_rate": 8.932513397419812e-07, "loss": 0.4226, "step": 5043 }, { "epoch": 0.81, "grad_norm": 8.730972702504362, "learning_rate": 8.917632442659563e-07, "loss": 0.3469, "step": 5044 }, { "epoch": 0.81, "grad_norm": 9.530501604764343, "learning_rate": 8.902762679868687e-07, "loss": 0.4028, "step": 5045 }, { "epoch": 0.81, "grad_norm": 1.25284876830902, "learning_rate": 8.887904113098111e-07, "loss": 0.5188, "step": 5046 }, { "epoch": 0.81, "grad_norm": 7.38209678098137, "learning_rate": 8.873056746395703e-07, "loss": 0.4911, "step": 5047 }, { "epoch": 0.81, "grad_norm": 6.289130904746449, "learning_rate": 8.858220583806309e-07, "loss": 0.5184, "step": 5048 }, { "epoch": 0.81, "grad_norm": 6.307090914903794, "learning_rate": 8.843395629371738e-07, "loss": 0.4796, "step": 5049 }, { "epoch": 0.81, "grad_norm": 5.4389697513109665, "learning_rate": 8.828581887130655e-07, "loss": 0.3963, "step": 5050 }, { "epoch": 0.81, "grad_norm": 7.864259839253838, "learning_rate": 8.813779361118763e-07, "loss": 0.4268, "step": 5051 }, { "epoch": 0.81, "grad_norm": 6.783937020930577, "learning_rate": 8.79898805536869e-07, "loss": 0.3773, "step": 5052 }, { "epoch": 0.81, "grad_norm": 9.811138935871762, "learning_rate": 8.784207973909986e-07, "loss": 0.4421, "step": 5053 }, { "epoch": 0.81, "grad_norm": 5.266262088179931, "learning_rate": 8.769439120769135e-07, "loss": 0.4213, "step": 5054 }, { "epoch": 0.81, "grad_norm": 33.33006529592601, "learning_rate": 8.754681499969608e-07, "loss": 0.4191, "step": 5055 }, { "epoch": 0.81, "grad_norm": 5.2625330347314, "learning_rate": 8.739935115531772e-07, "loss": 0.478, "step": 5056 }, { "epoch": 0.81, "grad_norm": 5.600008122911344, "learning_rate": 8.725199971472942e-07, "loss": 0.3498, "step": 5057 }, { "epoch": 0.81, "grad_norm": 13.994978585331703, "learning_rate": 8.7104760718074e-07, "loss": 0.3388, "step": 5058 }, { "epoch": 0.82, "grad_norm": 7.982346453941434, "learning_rate": 8.69576342054631e-07, "loss": 0.396, "step": 5059 }, { "epoch": 0.82, "grad_norm": 11.120155084141487, "learning_rate": 8.681062021697839e-07, "loss": 0.4331, "step": 5060 }, { "epoch": 0.82, "grad_norm": 11.85996426607785, "learning_rate": 8.666371879267038e-07, "loss": 0.43, "step": 5061 }, { "epoch": 0.82, "grad_norm": 13.23469206285628, "learning_rate": 8.65169299725589e-07, "loss": 0.4509, "step": 5062 }, { "epoch": 0.82, "grad_norm": 8.889834146422762, "learning_rate": 8.637025379663355e-07, "loss": 0.4901, "step": 5063 }, { "epoch": 0.82, "grad_norm": 9.136873429072761, "learning_rate": 8.622369030485283e-07, "loss": 0.3732, "step": 5064 }, { "epoch": 0.82, "grad_norm": 4.858097165859271, "learning_rate": 8.607723953714453e-07, "loss": 0.445, "step": 5065 }, { "epoch": 0.82, "grad_norm": 7.123284154519073, "learning_rate": 8.593090153340611e-07, "loss": 0.3695, "step": 5066 }, { "epoch": 0.82, "grad_norm": 9.005781728346276, "learning_rate": 8.578467633350407e-07, "loss": 0.4059, "step": 5067 }, { "epoch": 0.82, "grad_norm": 5.880776543816584, "learning_rate": 8.56385639772741e-07, "loss": 0.3374, "step": 5068 }, { "epoch": 0.82, "grad_norm": 6.990039657052862, "learning_rate": 8.549256450452109e-07, "loss": 0.433, "step": 5069 }, { "epoch": 0.82, "grad_norm": 58.3536616355321, "learning_rate": 8.534667795501955e-07, "loss": 0.3737, "step": 5070 }, { "epoch": 0.82, "grad_norm": 8.283607640655086, "learning_rate": 8.520090436851314e-07, "loss": 0.3981, "step": 5071 }, { "epoch": 0.82, "grad_norm": 4.622247180882688, "learning_rate": 8.505524378471408e-07, "loss": 0.426, "step": 5072 }, { "epoch": 0.82, "grad_norm": 9.219660306114157, "learning_rate": 8.490969624330469e-07, "loss": 0.4115, "step": 5073 }, { "epoch": 0.82, "grad_norm": 6.284870532914952, "learning_rate": 8.476426178393621e-07, "loss": 0.3513, "step": 5074 }, { "epoch": 0.82, "grad_norm": 12.855142735261378, "learning_rate": 8.461894044622882e-07, "loss": 0.4067, "step": 5075 }, { "epoch": 0.82, "grad_norm": 1.357532557638631, "learning_rate": 8.447373226977201e-07, "loss": 0.4784, "step": 5076 }, { "epoch": 0.82, "grad_norm": 9.555023108575012, "learning_rate": 8.432863729412466e-07, "loss": 0.4062, "step": 5077 }, { "epoch": 0.82, "grad_norm": 4.916831346099286, "learning_rate": 8.418365555881458e-07, "loss": 0.3929, "step": 5078 }, { "epoch": 0.82, "grad_norm": 6.419111076422727, "learning_rate": 8.403878710333868e-07, "loss": 0.4166, "step": 5079 }, { "epoch": 0.82, "grad_norm": 8.644087443411669, "learning_rate": 8.389403196716328e-07, "loss": 0.4285, "step": 5080 }, { "epoch": 0.82, "grad_norm": 14.911990514687616, "learning_rate": 8.37493901897235e-07, "loss": 0.427, "step": 5081 }, { "epoch": 0.82, "grad_norm": 9.121109002864998, "learning_rate": 8.360486181042399e-07, "loss": 0.4209, "step": 5082 }, { "epoch": 0.82, "grad_norm": 8.789623457700772, "learning_rate": 8.346044686863808e-07, "loss": 0.4171, "step": 5083 }, { "epoch": 0.82, "grad_norm": 1.1576800855573193, "learning_rate": 8.331614540370836e-07, "loss": 0.4864, "step": 5084 }, { "epoch": 0.82, "grad_norm": 6.590676409719685, "learning_rate": 8.317195745494666e-07, "loss": 0.4181, "step": 5085 }, { "epoch": 0.82, "grad_norm": 4.7462881976101965, "learning_rate": 8.302788306163373e-07, "loss": 0.4647, "step": 5086 }, { "epoch": 0.82, "grad_norm": 5.749243857141998, "learning_rate": 8.288392226301917e-07, "loss": 0.431, "step": 5087 }, { "epoch": 0.82, "grad_norm": 7.646232890597468, "learning_rate": 8.27400750983221e-07, "loss": 0.3512, "step": 5088 }, { "epoch": 0.82, "grad_norm": 4.454838500654198, "learning_rate": 8.259634160673052e-07, "loss": 0.3768, "step": 5089 }, { "epoch": 0.82, "grad_norm": 6.378151531920107, "learning_rate": 8.245272182740133e-07, "loss": 0.5171, "step": 5090 }, { "epoch": 0.82, "grad_norm": 20.47400090970667, "learning_rate": 8.230921579946027e-07, "loss": 0.3883, "step": 5091 }, { "epoch": 0.82, "grad_norm": 7.033912900830468, "learning_rate": 8.21658235620027e-07, "loss": 0.3485, "step": 5092 }, { "epoch": 0.82, "grad_norm": 6.148421388748061, "learning_rate": 8.202254515409247e-07, "loss": 0.4077, "step": 5093 }, { "epoch": 0.82, "grad_norm": 7.079231207233739, "learning_rate": 8.187938061476242e-07, "loss": 0.4479, "step": 5094 }, { "epoch": 0.82, "grad_norm": 5.634770125244585, "learning_rate": 8.173632998301468e-07, "loss": 0.4354, "step": 5095 }, { "epoch": 0.82, "grad_norm": 6.236355670378655, "learning_rate": 8.159339329782023e-07, "loss": 0.4242, "step": 5096 }, { "epoch": 0.82, "grad_norm": 6.270762262276684, "learning_rate": 8.145057059811895e-07, "loss": 0.4398, "step": 5097 }, { "epoch": 0.82, "grad_norm": 9.077991442575138, "learning_rate": 8.130786192281947e-07, "loss": 0.4107, "step": 5098 }, { "epoch": 0.82, "grad_norm": 5.37305112777276, "learning_rate": 8.116526731079982e-07, "loss": 0.4508, "step": 5099 }, { "epoch": 0.82, "grad_norm": 19.233811337769104, "learning_rate": 8.102278680090664e-07, "loss": 0.3905, "step": 5100 }, { "epoch": 0.82, "grad_norm": 1.3087221307641297, "learning_rate": 8.088042043195538e-07, "loss": 0.4759, "step": 5101 }, { "epoch": 0.82, "grad_norm": 5.541691314593341, "learning_rate": 8.073816824273078e-07, "loss": 0.3774, "step": 5102 }, { "epoch": 0.82, "grad_norm": 14.917923897365943, "learning_rate": 8.059603027198609e-07, "loss": 0.3637, "step": 5103 }, { "epoch": 0.82, "grad_norm": 5.587423764554688, "learning_rate": 8.045400655844382e-07, "loss": 0.3576, "step": 5104 }, { "epoch": 0.82, "grad_norm": 5.7038863566358975, "learning_rate": 8.031209714079496e-07, "loss": 0.3861, "step": 5105 }, { "epoch": 0.82, "grad_norm": 11.545903182540822, "learning_rate": 8.017030205769955e-07, "loss": 0.4432, "step": 5106 }, { "epoch": 0.82, "grad_norm": 13.324631043222343, "learning_rate": 8.002862134778661e-07, "loss": 0.4032, "step": 5107 }, { "epoch": 0.82, "grad_norm": 5.142056457711467, "learning_rate": 7.988705504965372e-07, "loss": 0.4519, "step": 5108 }, { "epoch": 0.82, "grad_norm": 7.953079242233721, "learning_rate": 7.974560320186759e-07, "loss": 0.4332, "step": 5109 }, { "epoch": 0.82, "grad_norm": 8.05747456108488, "learning_rate": 7.960426584296338e-07, "loss": 0.3811, "step": 5110 }, { "epoch": 0.82, "grad_norm": 6.718273671096098, "learning_rate": 7.946304301144564e-07, "loss": 0.3672, "step": 5111 }, { "epoch": 0.82, "grad_norm": 9.70772345914396, "learning_rate": 7.932193474578709e-07, "loss": 0.4407, "step": 5112 }, { "epoch": 0.82, "grad_norm": 9.558099997689615, "learning_rate": 7.918094108442947e-07, "loss": 0.498, "step": 5113 }, { "epoch": 0.82, "grad_norm": 12.489949043992755, "learning_rate": 7.904006206578358e-07, "loss": 0.4309, "step": 5114 }, { "epoch": 0.82, "grad_norm": 6.260661081557242, "learning_rate": 7.889929772822857e-07, "loss": 0.3713, "step": 5115 }, { "epoch": 0.82, "grad_norm": 5.52438405730543, "learning_rate": 7.875864811011247e-07, "loss": 0.4329, "step": 5116 }, { "epoch": 0.82, "grad_norm": 29.19908978996887, "learning_rate": 7.861811324975221e-07, "loss": 0.4499, "step": 5117 }, { "epoch": 0.82, "grad_norm": 6.007335192045254, "learning_rate": 7.847769318543346e-07, "loss": 0.4334, "step": 5118 }, { "epoch": 0.82, "grad_norm": 8.066842574742502, "learning_rate": 7.833738795541046e-07, "loss": 0.3913, "step": 5119 }, { "epoch": 0.82, "grad_norm": 11.800773451843927, "learning_rate": 7.819719759790606e-07, "loss": 0.4498, "step": 5120 }, { "epoch": 0.83, "grad_norm": 5.36539528834278, "learning_rate": 7.805712215111216e-07, "loss": 0.425, "step": 5121 }, { "epoch": 0.83, "grad_norm": 4.861032416313649, "learning_rate": 7.791716165318913e-07, "loss": 0.4139, "step": 5122 }, { "epoch": 0.83, "grad_norm": 6.30968744525048, "learning_rate": 7.777731614226596e-07, "loss": 0.4057, "step": 5123 }, { "epoch": 0.83, "grad_norm": 6.139607003145905, "learning_rate": 7.763758565644064e-07, "loss": 0.3865, "step": 5124 }, { "epoch": 0.83, "grad_norm": 12.307439492851042, "learning_rate": 7.749797023377931e-07, "loss": 0.4457, "step": 5125 }, { "epoch": 0.83, "grad_norm": 4.708985016392934, "learning_rate": 7.735846991231738e-07, "loss": 0.4103, "step": 5126 }, { "epoch": 0.83, "grad_norm": 5.370661339049936, "learning_rate": 7.721908473005829e-07, "loss": 0.3914, "step": 5127 }, { "epoch": 0.83, "grad_norm": 7.286683666830271, "learning_rate": 7.707981472497467e-07, "loss": 0.3795, "step": 5128 }, { "epoch": 0.83, "grad_norm": 5.59325971539908, "learning_rate": 7.694065993500732e-07, "loss": 0.4373, "step": 5129 }, { "epoch": 0.83, "grad_norm": 6.043648530045134, "learning_rate": 7.680162039806588e-07, "loss": 0.3604, "step": 5130 }, { "epoch": 0.83, "grad_norm": 10.627896574575761, "learning_rate": 7.666269615202865e-07, "loss": 0.4544, "step": 5131 }, { "epoch": 0.83, "grad_norm": 5.8887630781292755, "learning_rate": 7.652388723474224e-07, "loss": 0.4397, "step": 5132 }, { "epoch": 0.83, "grad_norm": 6.048414509645428, "learning_rate": 7.638519368402225e-07, "loss": 0.3538, "step": 5133 }, { "epoch": 0.83, "grad_norm": 10.304033147877178, "learning_rate": 7.62466155376525e-07, "loss": 0.4329, "step": 5134 }, { "epoch": 0.83, "grad_norm": 7.30889483830578, "learning_rate": 7.61081528333854e-07, "loss": 0.4773, "step": 5135 }, { "epoch": 0.83, "grad_norm": 3.900192158664378, "learning_rate": 7.596980560894224e-07, "loss": 0.4574, "step": 5136 }, { "epoch": 0.83, "grad_norm": 6.117761613069653, "learning_rate": 7.583157390201246e-07, "loss": 0.3957, "step": 5137 }, { "epoch": 0.83, "grad_norm": 17.90609966401349, "learning_rate": 7.56934577502541e-07, "loss": 0.3195, "step": 5138 }, { "epoch": 0.83, "grad_norm": 8.19709239219677, "learning_rate": 7.555545719129398e-07, "loss": 0.3886, "step": 5139 }, { "epoch": 0.83, "grad_norm": 8.339998278799756, "learning_rate": 7.541757226272744e-07, "loss": 0.3891, "step": 5140 }, { "epoch": 0.83, "grad_norm": 5.427456854020218, "learning_rate": 7.527980300211762e-07, "loss": 0.364, "step": 5141 }, { "epoch": 0.83, "grad_norm": 6.213020379867123, "learning_rate": 7.514214944699694e-07, "loss": 0.366, "step": 5142 }, { "epoch": 0.83, "grad_norm": 6.037970929440187, "learning_rate": 7.500461163486616e-07, "loss": 0.4126, "step": 5143 }, { "epoch": 0.83, "grad_norm": 5.096322028069207, "learning_rate": 7.486718960319428e-07, "loss": 0.4405, "step": 5144 }, { "epoch": 0.83, "grad_norm": 6.453391240070106, "learning_rate": 7.472988338941861e-07, "loss": 0.4334, "step": 5145 }, { "epoch": 0.83, "grad_norm": 1.1601826509736752, "learning_rate": 7.459269303094552e-07, "loss": 0.4366, "step": 5146 }, { "epoch": 0.83, "grad_norm": 8.390840621782635, "learning_rate": 7.445561856514916e-07, "loss": 0.4249, "step": 5147 }, { "epoch": 0.83, "grad_norm": 4.691907359287466, "learning_rate": 7.431866002937254e-07, "loss": 0.3628, "step": 5148 }, { "epoch": 0.83, "grad_norm": 21.710205529708354, "learning_rate": 7.41818174609269e-07, "loss": 0.4319, "step": 5149 }, { "epoch": 0.83, "grad_norm": 6.898975346621165, "learning_rate": 7.404509089709194e-07, "loss": 0.401, "step": 5150 }, { "epoch": 0.83, "grad_norm": 20.070586310102055, "learning_rate": 7.390848037511578e-07, "loss": 0.3985, "step": 5151 }, { "epoch": 0.83, "grad_norm": 7.655898911910306, "learning_rate": 7.377198593221474e-07, "loss": 0.4698, "step": 5152 }, { "epoch": 0.83, "grad_norm": 9.951372286771235, "learning_rate": 7.363560760557392e-07, "loss": 0.4483, "step": 5153 }, { "epoch": 0.83, "grad_norm": 5.298966659960592, "learning_rate": 7.349934543234621e-07, "loss": 0.4103, "step": 5154 }, { "epoch": 0.83, "grad_norm": 5.386646055671122, "learning_rate": 7.336319944965353e-07, "loss": 0.4714, "step": 5155 }, { "epoch": 0.83, "grad_norm": 6.873003923169134, "learning_rate": 7.32271696945856e-07, "loss": 0.4885, "step": 5156 }, { "epoch": 0.83, "grad_norm": 4.411095983910647, "learning_rate": 7.30912562042006e-07, "loss": 0.4166, "step": 5157 }, { "epoch": 0.83, "grad_norm": 4.944645019856956, "learning_rate": 7.295545901552536e-07, "loss": 0.4621, "step": 5158 }, { "epoch": 0.83, "grad_norm": 5.405397586304533, "learning_rate": 7.281977816555463e-07, "loss": 0.4244, "step": 5159 }, { "epoch": 0.83, "grad_norm": 5.284280832255569, "learning_rate": 7.268421369125145e-07, "loss": 0.4396, "step": 5160 }, { "epoch": 0.83, "grad_norm": 8.43388570706495, "learning_rate": 7.254876562954755e-07, "loss": 0.4096, "step": 5161 }, { "epoch": 0.83, "grad_norm": 8.492021713608688, "learning_rate": 7.241343401734285e-07, "loss": 0.426, "step": 5162 }, { "epoch": 0.83, "grad_norm": 1.010980641146892, "learning_rate": 7.2278218891505e-07, "loss": 0.4339, "step": 5163 }, { "epoch": 0.83, "grad_norm": 6.135272865110586, "learning_rate": 7.214312028887055e-07, "loss": 0.4223, "step": 5164 }, { "epoch": 0.83, "grad_norm": 1.3780039300140874, "learning_rate": 7.20081382462442e-07, "loss": 0.4925, "step": 5165 }, { "epoch": 0.83, "grad_norm": 7.115186084121902, "learning_rate": 7.187327280039863e-07, "loss": 0.3585, "step": 5166 }, { "epoch": 0.83, "grad_norm": 4.462073860132539, "learning_rate": 7.173852398807485e-07, "loss": 0.3935, "step": 5167 }, { "epoch": 0.83, "grad_norm": 7.170810687069655, "learning_rate": 7.160389184598221e-07, "loss": 0.446, "step": 5168 }, { "epoch": 0.83, "grad_norm": 9.40846636279818, "learning_rate": 7.146937641079849e-07, "loss": 0.436, "step": 5169 }, { "epoch": 0.83, "grad_norm": 1.1604065529927903, "learning_rate": 7.133497771916886e-07, "loss": 0.4607, "step": 5170 }, { "epoch": 0.83, "grad_norm": 7.439016556959326, "learning_rate": 7.120069580770755e-07, "loss": 0.4725, "step": 5171 }, { "epoch": 0.83, "grad_norm": 5.566335676374294, "learning_rate": 7.10665307129968e-07, "loss": 0.4469, "step": 5172 }, { "epoch": 0.83, "grad_norm": 10.483125336935396, "learning_rate": 7.093248247158663e-07, "loss": 0.3794, "step": 5173 }, { "epoch": 0.83, "grad_norm": 6.777562037756328, "learning_rate": 7.079855111999545e-07, "loss": 0.3974, "step": 5174 }, { "epoch": 0.83, "grad_norm": 4.094239412490918, "learning_rate": 7.066473669471008e-07, "loss": 0.3842, "step": 5175 }, { "epoch": 0.83, "grad_norm": 5.913868418828845, "learning_rate": 7.053103923218501e-07, "loss": 0.4481, "step": 5176 }, { "epoch": 0.83, "grad_norm": 5.694489210691064, "learning_rate": 7.039745876884335e-07, "loss": 0.4061, "step": 5177 }, { "epoch": 0.83, "grad_norm": 9.869748455618549, "learning_rate": 7.026399534107603e-07, "loss": 0.3978, "step": 5178 }, { "epoch": 0.83, "grad_norm": 13.88981284597105, "learning_rate": 7.013064898524196e-07, "loss": 0.4508, "step": 5179 }, { "epoch": 0.83, "grad_norm": 7.198039646114673, "learning_rate": 6.99974197376686e-07, "loss": 0.3819, "step": 5180 }, { "epoch": 0.83, "grad_norm": 6.544678771266348, "learning_rate": 6.986430763465124e-07, "loss": 0.4694, "step": 5181 }, { "epoch": 0.83, "grad_norm": 5.058448332166017, "learning_rate": 6.973131271245315e-07, "loss": 0.3997, "step": 5182 }, { "epoch": 0.84, "grad_norm": 8.839688502457097, "learning_rate": 6.95984350073059e-07, "loss": 0.4086, "step": 5183 }, { "epoch": 0.84, "grad_norm": 5.586093286667702, "learning_rate": 6.94656745554092e-07, "loss": 0.3761, "step": 5184 }, { "epoch": 0.84, "grad_norm": 5.781163485599814, "learning_rate": 6.933303139293035e-07, "loss": 0.3608, "step": 5185 }, { "epoch": 0.84, "grad_norm": 6.431462725603388, "learning_rate": 6.92005055560051e-07, "loss": 0.4342, "step": 5186 }, { "epoch": 0.84, "grad_norm": 5.075606158420179, "learning_rate": 6.906809708073736e-07, "loss": 0.4243, "step": 5187 }, { "epoch": 0.84, "grad_norm": 4.0951292199482445, "learning_rate": 6.893580600319865e-07, "loss": 0.4082, "step": 5188 }, { "epoch": 0.84, "grad_norm": 10.932343625235042, "learning_rate": 6.880363235942861e-07, "loss": 0.4141, "step": 5189 }, { "epoch": 0.84, "grad_norm": 12.730457082527625, "learning_rate": 6.867157618543513e-07, "loss": 0.474, "step": 5190 }, { "epoch": 0.84, "grad_norm": 6.543192585505034, "learning_rate": 6.853963751719417e-07, "loss": 0.4583, "step": 5191 }, { "epoch": 0.84, "grad_norm": 9.588366046957356, "learning_rate": 6.840781639064897e-07, "loss": 0.4319, "step": 5192 }, { "epoch": 0.84, "grad_norm": 6.682578310155655, "learning_rate": 6.827611284171154e-07, "loss": 0.4709, "step": 5193 }, { "epoch": 0.84, "grad_norm": 5.385630011329006, "learning_rate": 6.814452690626161e-07, "loss": 0.4122, "step": 5194 }, { "epoch": 0.84, "grad_norm": 6.2765735964760685, "learning_rate": 6.801305862014667e-07, "loss": 0.4618, "step": 5195 }, { "epoch": 0.84, "grad_norm": 8.996336179152513, "learning_rate": 6.788170801918231e-07, "loss": 0.4121, "step": 5196 }, { "epoch": 0.84, "grad_norm": 1.0938053712995783, "learning_rate": 6.775047513915218e-07, "loss": 0.4822, "step": 5197 }, { "epoch": 0.84, "grad_norm": 5.993674601956297, "learning_rate": 6.76193600158076e-07, "loss": 0.4328, "step": 5198 }, { "epoch": 0.84, "grad_norm": 4.749437704899738, "learning_rate": 6.748836268486797e-07, "loss": 0.4052, "step": 5199 }, { "epoch": 0.84, "grad_norm": 5.434455388949122, "learning_rate": 6.735748318202062e-07, "loss": 0.3911, "step": 5200 }, { "epoch": 0.84, "grad_norm": 8.000680899721354, "learning_rate": 6.722672154292065e-07, "loss": 0.3688, "step": 5201 }, { "epoch": 0.84, "grad_norm": 1.1828288561246265, "learning_rate": 6.709607780319133e-07, "loss": 0.4394, "step": 5202 }, { "epoch": 0.84, "grad_norm": 6.647281907407361, "learning_rate": 6.696555199842347e-07, "loss": 0.4158, "step": 5203 }, { "epoch": 0.84, "grad_norm": 4.537618367529139, "learning_rate": 6.683514416417574e-07, "loss": 0.4311, "step": 5204 }, { "epoch": 0.84, "grad_norm": 20.126615214102898, "learning_rate": 6.670485433597507e-07, "loss": 0.3932, "step": 5205 }, { "epoch": 0.84, "grad_norm": 5.330010541672859, "learning_rate": 6.657468254931615e-07, "loss": 0.4206, "step": 5206 }, { "epoch": 0.84, "grad_norm": 6.504147572911003, "learning_rate": 6.644462883966085e-07, "loss": 0.3979, "step": 5207 }, { "epoch": 0.84, "grad_norm": 4.532142914990343, "learning_rate": 6.631469324243978e-07, "loss": 0.399, "step": 5208 }, { "epoch": 0.84, "grad_norm": 4.566983490126192, "learning_rate": 6.618487579305089e-07, "loss": 0.4334, "step": 5209 }, { "epoch": 0.84, "grad_norm": 5.97641712068898, "learning_rate": 6.605517652686005e-07, "loss": 0.3557, "step": 5210 }, { "epoch": 0.84, "grad_norm": 6.940301658325301, "learning_rate": 6.59255954792008e-07, "loss": 0.4309, "step": 5211 }, { "epoch": 0.84, "grad_norm": 8.649855082632047, "learning_rate": 6.579613268537466e-07, "loss": 0.3905, "step": 5212 }, { "epoch": 0.84, "grad_norm": 6.5507375327668145, "learning_rate": 6.566678818065108e-07, "loss": 0.4645, "step": 5213 }, { "epoch": 0.84, "grad_norm": 5.967252404757335, "learning_rate": 6.553756200026668e-07, "loss": 0.4751, "step": 5214 }, { "epoch": 0.84, "grad_norm": 6.898138413738658, "learning_rate": 6.540845417942637e-07, "loss": 0.3904, "step": 5215 }, { "epoch": 0.84, "grad_norm": 9.562978990480373, "learning_rate": 6.527946475330288e-07, "loss": 0.3843, "step": 5216 }, { "epoch": 0.84, "grad_norm": 5.199447743700096, "learning_rate": 6.51505937570363e-07, "loss": 0.3851, "step": 5217 }, { "epoch": 0.84, "grad_norm": 12.900808448496539, "learning_rate": 6.502184122573457e-07, "loss": 0.4213, "step": 5218 }, { "epoch": 0.84, "grad_norm": 7.055098217115098, "learning_rate": 6.489320719447367e-07, "loss": 0.4371, "step": 5219 }, { "epoch": 0.84, "grad_norm": 1.072598732348102, "learning_rate": 6.476469169829691e-07, "loss": 0.4402, "step": 5220 }, { "epoch": 0.84, "grad_norm": 0.925196170821637, "learning_rate": 6.463629477221533e-07, "loss": 0.4399, "step": 5221 }, { "epoch": 0.84, "grad_norm": 6.314636921905483, "learning_rate": 6.450801645120808e-07, "loss": 0.3896, "step": 5222 }, { "epoch": 0.84, "grad_norm": 5.786662543475865, "learning_rate": 6.43798567702214e-07, "loss": 0.4861, "step": 5223 }, { "epoch": 0.84, "grad_norm": 5.715299785367704, "learning_rate": 6.425181576416978e-07, "loss": 0.3958, "step": 5224 }, { "epoch": 0.84, "grad_norm": 6.311246426968363, "learning_rate": 6.412389346793507e-07, "loss": 0.4088, "step": 5225 }, { "epoch": 0.84, "grad_norm": 11.117707614321164, "learning_rate": 6.399608991636663e-07, "loss": 0.4347, "step": 5226 }, { "epoch": 0.84, "grad_norm": 7.64619438614458, "learning_rate": 6.386840514428183e-07, "loss": 0.4163, "step": 5227 }, { "epoch": 0.84, "grad_norm": 5.987053579448218, "learning_rate": 6.374083918646557e-07, "loss": 0.376, "step": 5228 }, { "epoch": 0.84, "grad_norm": 8.153410802065691, "learning_rate": 6.361339207767031e-07, "loss": 0.4637, "step": 5229 }, { "epoch": 0.84, "grad_norm": 6.002782237204669, "learning_rate": 6.348606385261602e-07, "loss": 0.3521, "step": 5230 }, { "epoch": 0.84, "grad_norm": 5.460430654341795, "learning_rate": 6.335885454599061e-07, "loss": 0.4933, "step": 5231 }, { "epoch": 0.84, "grad_norm": 6.318259755349841, "learning_rate": 6.323176419244925e-07, "loss": 0.435, "step": 5232 }, { "epoch": 0.84, "grad_norm": 7.396689682388432, "learning_rate": 6.310479282661485e-07, "loss": 0.4105, "step": 5233 }, { "epoch": 0.84, "grad_norm": 7.633405190230257, "learning_rate": 6.297794048307798e-07, "loss": 0.4491, "step": 5234 }, { "epoch": 0.84, "grad_norm": 8.587733443088494, "learning_rate": 6.285120719639693e-07, "loss": 0.4547, "step": 5235 }, { "epoch": 0.84, "grad_norm": 26.494946319429015, "learning_rate": 6.27245930010969e-07, "loss": 0.453, "step": 5236 }, { "epoch": 0.84, "grad_norm": 8.969518436555928, "learning_rate": 6.259809793167127e-07, "loss": 0.4134, "step": 5237 }, { "epoch": 0.84, "grad_norm": 10.684680983005558, "learning_rate": 6.247172202258095e-07, "loss": 0.4037, "step": 5238 }, { "epoch": 0.84, "grad_norm": 5.2207700234931105, "learning_rate": 6.234546530825408e-07, "loss": 0.4589, "step": 5239 }, { "epoch": 0.84, "grad_norm": 17.254429545347406, "learning_rate": 6.221932782308637e-07, "loss": 0.339, "step": 5240 }, { "epoch": 0.84, "grad_norm": 5.640155389898209, "learning_rate": 6.209330960144139e-07, "loss": 0.4573, "step": 5241 }, { "epoch": 0.84, "grad_norm": 5.563214768221484, "learning_rate": 6.196741067764977e-07, "loss": 0.4201, "step": 5242 }, { "epoch": 0.84, "grad_norm": 7.586669227857651, "learning_rate": 6.184163108600988e-07, "loss": 0.3807, "step": 5243 }, { "epoch": 0.84, "grad_norm": 6.161131667208156, "learning_rate": 6.171597086078767e-07, "loss": 0.3856, "step": 5244 }, { "epoch": 0.85, "grad_norm": 8.270947477998746, "learning_rate": 6.159043003621628e-07, "loss": 0.4205, "step": 5245 }, { "epoch": 0.85, "grad_norm": 9.022912117577617, "learning_rate": 6.146500864649668e-07, "loss": 0.4445, "step": 5246 }, { "epoch": 0.85, "grad_norm": 6.534561715993588, "learning_rate": 6.133970672579693e-07, "loss": 0.4621, "step": 5247 }, { "epoch": 0.85, "grad_norm": 4.370824609374356, "learning_rate": 6.121452430825287e-07, "loss": 0.4063, "step": 5248 }, { "epoch": 0.85, "grad_norm": 4.865896705986213, "learning_rate": 6.108946142796757e-07, "loss": 0.418, "step": 5249 }, { "epoch": 0.85, "grad_norm": 7.559630510588873, "learning_rate": 6.096451811901155e-07, "loss": 0.4283, "step": 5250 }, { "epoch": 0.85, "grad_norm": 5.9704121019619345, "learning_rate": 6.083969441542298e-07, "loss": 0.4418, "step": 5251 }, { "epoch": 0.85, "grad_norm": 6.097492463838492, "learning_rate": 6.071499035120703e-07, "loss": 0.3435, "step": 5252 }, { "epoch": 0.85, "grad_norm": 9.55162384945751, "learning_rate": 6.059040596033682e-07, "loss": 0.4491, "step": 5253 }, { "epoch": 0.85, "grad_norm": 5.9052188533036984, "learning_rate": 6.046594127675242e-07, "loss": 0.4449, "step": 5254 }, { "epoch": 0.85, "grad_norm": 5.286873179102048, "learning_rate": 6.034159633436132e-07, "loss": 0.367, "step": 5255 }, { "epoch": 0.85, "grad_norm": 10.066195808253005, "learning_rate": 6.021737116703868e-07, "loss": 0.4235, "step": 5256 }, { "epoch": 0.85, "grad_norm": 8.00092700847031, "learning_rate": 6.009326580862696e-07, "loss": 0.4327, "step": 5257 }, { "epoch": 0.85, "grad_norm": 5.214885977049775, "learning_rate": 5.996928029293559e-07, "loss": 0.379, "step": 5258 }, { "epoch": 0.85, "grad_norm": 7.6701913084493345, "learning_rate": 5.984541465374172e-07, "loss": 0.4222, "step": 5259 }, { "epoch": 0.85, "grad_norm": 5.730498862416246, "learning_rate": 5.972166892478998e-07, "loss": 0.4175, "step": 5260 }, { "epoch": 0.85, "grad_norm": 10.033996435688959, "learning_rate": 5.959804313979195e-07, "loss": 0.3398, "step": 5261 }, { "epoch": 0.85, "grad_norm": 7.894439461189259, "learning_rate": 5.947453733242659e-07, "loss": 0.397, "step": 5262 }, { "epoch": 0.85, "grad_norm": 5.543333018152931, "learning_rate": 5.935115153634058e-07, "loss": 0.3796, "step": 5263 }, { "epoch": 0.85, "grad_norm": 5.521751642396397, "learning_rate": 5.922788578514737e-07, "loss": 0.3892, "step": 5264 }, { "epoch": 0.85, "grad_norm": 6.201298706929204, "learning_rate": 5.910474011242801e-07, "loss": 0.4028, "step": 5265 }, { "epoch": 0.85, "grad_norm": 5.769529830326897, "learning_rate": 5.898171455173074e-07, "loss": 0.3664, "step": 5266 }, { "epoch": 0.85, "grad_norm": 6.78138727972116, "learning_rate": 5.885880913657127e-07, "loss": 0.3678, "step": 5267 }, { "epoch": 0.85, "grad_norm": 7.006605747945957, "learning_rate": 5.873602390043231e-07, "loss": 0.4777, "step": 5268 }, { "epoch": 0.85, "grad_norm": 6.979367209414301, "learning_rate": 5.861335887676389e-07, "loss": 0.3887, "step": 5269 }, { "epoch": 0.85, "grad_norm": 19.90359480776006, "learning_rate": 5.84908140989835e-07, "loss": 0.404, "step": 5270 }, { "epoch": 0.85, "grad_norm": 10.954653367935228, "learning_rate": 5.836838960047558e-07, "loss": 0.4152, "step": 5271 }, { "epoch": 0.85, "grad_norm": 5.183101663882465, "learning_rate": 5.824608541459192e-07, "loss": 0.4385, "step": 5272 }, { "epoch": 0.85, "grad_norm": 6.861058308923739, "learning_rate": 5.812390157465169e-07, "loss": 0.3988, "step": 5273 }, { "epoch": 0.85, "grad_norm": 6.214290779519238, "learning_rate": 5.8001838113941e-07, "loss": 0.4121, "step": 5274 }, { "epoch": 0.85, "grad_norm": 4.587558386231922, "learning_rate": 5.78798950657134e-07, "loss": 0.3866, "step": 5275 }, { "epoch": 0.85, "grad_norm": 7.64735138255537, "learning_rate": 5.775807246318954e-07, "loss": 0.3829, "step": 5276 }, { "epoch": 0.85, "grad_norm": 7.901030518619207, "learning_rate": 5.76363703395571e-07, "loss": 0.4128, "step": 5277 }, { "epoch": 0.85, "grad_norm": 6.377395250542658, "learning_rate": 5.751478872797128e-07, "loss": 0.4074, "step": 5278 }, { "epoch": 0.85, "grad_norm": 15.839005762580202, "learning_rate": 5.739332766155419e-07, "loss": 0.421, "step": 5279 }, { "epoch": 0.85, "grad_norm": 7.784420757730078, "learning_rate": 5.727198717339511e-07, "loss": 0.4436, "step": 5280 }, { "epoch": 0.85, "grad_norm": 5.901020614611662, "learning_rate": 5.715076729655056e-07, "loss": 0.4619, "step": 5281 }, { "epoch": 0.85, "grad_norm": 15.186678943229952, "learning_rate": 5.702966806404431e-07, "loss": 0.4333, "step": 5282 }, { "epoch": 0.85, "grad_norm": 6.6226845446600535, "learning_rate": 5.690868950886702e-07, "loss": 0.3916, "step": 5283 }, { "epoch": 0.85, "grad_norm": 1.127540306725197, "learning_rate": 5.67878316639765e-07, "loss": 0.4483, "step": 5284 }, { "epoch": 0.85, "grad_norm": 29.1759069348595, "learning_rate": 5.666709456229797e-07, "loss": 0.4417, "step": 5285 }, { "epoch": 0.85, "grad_norm": 4.6249210971283485, "learning_rate": 5.654647823672337e-07, "loss": 0.4262, "step": 5286 }, { "epoch": 0.85, "grad_norm": 6.030530317640041, "learning_rate": 5.642598272011196e-07, "loss": 0.4895, "step": 5287 }, { "epoch": 0.85, "grad_norm": 8.775071842926382, "learning_rate": 5.630560804528995e-07, "loss": 0.4445, "step": 5288 }, { "epoch": 0.85, "grad_norm": 5.810718507449153, "learning_rate": 5.6185354245051e-07, "loss": 0.4329, "step": 5289 }, { "epoch": 0.85, "grad_norm": 5.815670606153476, "learning_rate": 5.606522135215531e-07, "loss": 0.4077, "step": 5290 }, { "epoch": 0.85, "grad_norm": 6.92539098905313, "learning_rate": 5.594520939933041e-07, "loss": 0.4639, "step": 5291 }, { "epoch": 0.85, "grad_norm": 8.408856401600202, "learning_rate": 5.582531841927097e-07, "loss": 0.4187, "step": 5292 }, { "epoch": 0.85, "grad_norm": 6.4576385446173195, "learning_rate": 5.570554844463854e-07, "loss": 0.3915, "step": 5293 }, { "epoch": 0.85, "grad_norm": 27.394392984767407, "learning_rate": 5.558589950806164e-07, "loss": 0.3822, "step": 5294 }, { "epoch": 0.85, "grad_norm": 7.24877921988408, "learning_rate": 5.546637164213625e-07, "loss": 0.452, "step": 5295 }, { "epoch": 0.85, "grad_norm": 5.165858864977825, "learning_rate": 5.53469648794247e-07, "loss": 0.4082, "step": 5296 }, { "epoch": 0.85, "grad_norm": 5.516831691363996, "learning_rate": 5.522767925245698e-07, "loss": 0.4061, "step": 5297 }, { "epoch": 0.85, "grad_norm": 6.800234108625724, "learning_rate": 5.510851479372959e-07, "loss": 0.3538, "step": 5298 }, { "epoch": 0.85, "grad_norm": 9.581339603356755, "learning_rate": 5.498947153570622e-07, "loss": 0.3284, "step": 5299 }, { "epoch": 0.85, "grad_norm": 7.58131831360925, "learning_rate": 5.487054951081772e-07, "loss": 0.4561, "step": 5300 }, { "epoch": 0.85, "grad_norm": 9.163680944832233, "learning_rate": 5.475174875146156e-07, "loss": 0.3986, "step": 5301 }, { "epoch": 0.85, "grad_norm": 7.859099376781523, "learning_rate": 5.463306929000228e-07, "loss": 0.4695, "step": 5302 }, { "epoch": 0.85, "grad_norm": 5.401345663289105, "learning_rate": 5.451451115877154e-07, "loss": 0.4027, "step": 5303 }, { "epoch": 0.85, "grad_norm": 13.239172666430148, "learning_rate": 5.439607439006795e-07, "loss": 0.3828, "step": 5304 }, { "epoch": 0.85, "grad_norm": 8.956289879751637, "learning_rate": 5.427775901615684e-07, "loss": 0.4772, "step": 5305 }, { "epoch": 0.85, "grad_norm": 5.595138014754916, "learning_rate": 5.415956506927051e-07, "loss": 0.3764, "step": 5306 }, { "epoch": 0.86, "grad_norm": 5.3624894142642585, "learning_rate": 5.404149258160835e-07, "loss": 0.4764, "step": 5307 }, { "epoch": 0.86, "grad_norm": 4.66728557266445, "learning_rate": 5.392354158533658e-07, "loss": 0.4314, "step": 5308 }, { "epoch": 0.86, "grad_norm": 6.053256277874953, "learning_rate": 5.380571211258811e-07, "loss": 0.4122, "step": 5309 }, { "epoch": 0.86, "grad_norm": 1.034848403125363, "learning_rate": 5.36880041954631e-07, "loss": 0.4867, "step": 5310 }, { "epoch": 0.86, "grad_norm": 6.917186721206175, "learning_rate": 5.357041786602851e-07, "loss": 0.4684, "step": 5311 }, { "epoch": 0.86, "grad_norm": 15.800665618559174, "learning_rate": 5.345295315631805e-07, "loss": 0.3881, "step": 5312 }, { "epoch": 0.86, "grad_norm": 15.165099391855884, "learning_rate": 5.33356100983321e-07, "loss": 0.4082, "step": 5313 }, { "epoch": 0.86, "grad_norm": 7.823819267359406, "learning_rate": 5.32183887240385e-07, "loss": 0.4442, "step": 5314 }, { "epoch": 0.86, "grad_norm": 10.197485973182356, "learning_rate": 5.310128906537137e-07, "loss": 0.4084, "step": 5315 }, { "epoch": 0.86, "grad_norm": 24.47032414679811, "learning_rate": 5.298431115423186e-07, "loss": 0.4071, "step": 5316 }, { "epoch": 0.86, "grad_norm": 5.920259189690462, "learning_rate": 5.286745502248819e-07, "loss": 0.4655, "step": 5317 }, { "epoch": 0.86, "grad_norm": 6.544392882982514, "learning_rate": 5.27507207019749e-07, "loss": 0.4046, "step": 5318 }, { "epoch": 0.86, "grad_norm": 5.578917239637161, "learning_rate": 5.263410822449388e-07, "loss": 0.4035, "step": 5319 }, { "epoch": 0.86, "grad_norm": 11.022203071199328, "learning_rate": 5.251761762181351e-07, "loss": 0.4126, "step": 5320 }, { "epoch": 0.86, "grad_norm": 19.64847858635474, "learning_rate": 5.240124892566895e-07, "loss": 0.3749, "step": 5321 }, { "epoch": 0.86, "grad_norm": 7.244255691782633, "learning_rate": 5.228500216776239e-07, "loss": 0.3499, "step": 5322 }, { "epoch": 0.86, "grad_norm": 8.972797812331269, "learning_rate": 5.216887737976256e-07, "loss": 0.428, "step": 5323 }, { "epoch": 0.86, "grad_norm": 6.428749845320174, "learning_rate": 5.205287459330499e-07, "loss": 0.4505, "step": 5324 }, { "epoch": 0.86, "grad_norm": 5.399787409591956, "learning_rate": 5.193699383999213e-07, "loss": 0.4377, "step": 5325 }, { "epoch": 0.86, "grad_norm": 6.789812068845749, "learning_rate": 5.182123515139315e-07, "loss": 0.3444, "step": 5326 }, { "epoch": 0.86, "grad_norm": 5.2983560493318205, "learning_rate": 5.170559855904389e-07, "loss": 0.4078, "step": 5327 }, { "epoch": 0.86, "grad_norm": 6.852153629855909, "learning_rate": 5.159008409444671e-07, "loss": 0.4321, "step": 5328 }, { "epoch": 0.86, "grad_norm": 6.503261888489776, "learning_rate": 5.147469178907127e-07, "loss": 0.4311, "step": 5329 }, { "epoch": 0.86, "grad_norm": 74.52382112484217, "learning_rate": 5.135942167435342e-07, "loss": 0.4343, "step": 5330 }, { "epoch": 0.86, "grad_norm": 18.96210599058679, "learning_rate": 5.124427378169588e-07, "loss": 0.4209, "step": 5331 }, { "epoch": 0.86, "grad_norm": 6.942747298502926, "learning_rate": 5.112924814246817e-07, "loss": 0.385, "step": 5332 }, { "epoch": 0.86, "grad_norm": 13.667452891848173, "learning_rate": 5.10143447880066e-07, "loss": 0.3988, "step": 5333 }, { "epoch": 0.86, "grad_norm": 8.825978089537543, "learning_rate": 5.089956374961386e-07, "loss": 0.4455, "step": 5334 }, { "epoch": 0.86, "grad_norm": 44.98588646909186, "learning_rate": 5.078490505855938e-07, "loss": 0.3822, "step": 5335 }, { "epoch": 0.86, "grad_norm": 6.437944680339552, "learning_rate": 5.06703687460795e-07, "loss": 0.4725, "step": 5336 }, { "epoch": 0.86, "grad_norm": 16.626036690571258, "learning_rate": 5.055595484337705e-07, "loss": 0.4235, "step": 5337 }, { "epoch": 0.86, "grad_norm": 12.301427380854832, "learning_rate": 5.044166338162143e-07, "loss": 0.3771, "step": 5338 }, { "epoch": 0.86, "grad_norm": 9.17133501780956, "learning_rate": 5.03274943919489e-07, "loss": 0.4756, "step": 5339 }, { "epoch": 0.86, "grad_norm": 8.558840809623556, "learning_rate": 5.021344790546212e-07, "loss": 0.4177, "step": 5340 }, { "epoch": 0.86, "grad_norm": 12.89312055490241, "learning_rate": 5.009952395323065e-07, "loss": 0.4309, "step": 5341 }, { "epoch": 0.86, "grad_norm": 6.360323063915149, "learning_rate": 4.998572256629047e-07, "loss": 0.484, "step": 5342 }, { "epoch": 0.86, "grad_norm": 5.886018953131113, "learning_rate": 4.987204377564409e-07, "loss": 0.4222, "step": 5343 }, { "epoch": 0.86, "grad_norm": 9.026664722867963, "learning_rate": 4.975848761226088e-07, "loss": 0.4412, "step": 5344 }, { "epoch": 0.86, "grad_norm": 13.9126777291004, "learning_rate": 4.964505410707655e-07, "loss": 0.3429, "step": 5345 }, { "epoch": 0.86, "grad_norm": 7.437184313701611, "learning_rate": 4.95317432909937e-07, "loss": 0.3848, "step": 5346 }, { "epoch": 0.86, "grad_norm": 4.392627823546229, "learning_rate": 4.941855519488109e-07, "loss": 0.3559, "step": 5347 }, { "epoch": 0.86, "grad_norm": 5.757502503976423, "learning_rate": 4.930548984957451e-07, "loss": 0.4044, "step": 5348 }, { "epoch": 0.86, "grad_norm": 4.4593410509549125, "learning_rate": 4.919254728587591e-07, "loss": 0.4543, "step": 5349 }, { "epoch": 0.86, "grad_norm": 17.786174257561143, "learning_rate": 4.907972753455398e-07, "loss": 0.3413, "step": 5350 }, { "epoch": 0.86, "grad_norm": 7.146808361338149, "learning_rate": 4.896703062634401e-07, "loss": 0.4458, "step": 5351 }, { "epoch": 0.86, "grad_norm": 7.582402037250631, "learning_rate": 4.885445659194771e-07, "loss": 0.525, "step": 5352 }, { "epoch": 0.86, "grad_norm": 16.279801321200193, "learning_rate": 4.874200546203328e-07, "loss": 0.4178, "step": 5353 }, { "epoch": 0.86, "grad_norm": 8.872327605324376, "learning_rate": 4.86296772672355e-07, "loss": 0.5229, "step": 5354 }, { "epoch": 0.86, "grad_norm": 7.4949892461922065, "learning_rate": 4.851747203815588e-07, "loss": 0.4124, "step": 5355 }, { "epoch": 0.86, "grad_norm": 5.7456333789718, "learning_rate": 4.840538980536203e-07, "loss": 0.4055, "step": 5356 }, { "epoch": 0.86, "grad_norm": 7.762640573618225, "learning_rate": 4.829343059938818e-07, "loss": 0.4165, "step": 5357 }, { "epoch": 0.86, "grad_norm": 6.309000193762146, "learning_rate": 4.818159445073528e-07, "loss": 0.3481, "step": 5358 }, { "epoch": 0.86, "grad_norm": 6.1651002833474635, "learning_rate": 4.806988138987051e-07, "loss": 0.3931, "step": 5359 }, { "epoch": 0.86, "grad_norm": 5.766064247213585, "learning_rate": 4.795829144722752e-07, "loss": 0.3656, "step": 5360 }, { "epoch": 0.86, "grad_norm": 8.741023677040962, "learning_rate": 4.784682465320661e-07, "loss": 0.4296, "step": 5361 }, { "epoch": 0.86, "grad_norm": 1.112649773777639, "learning_rate": 4.773548103817421e-07, "loss": 0.487, "step": 5362 }, { "epoch": 0.86, "grad_norm": 6.938801122561541, "learning_rate": 4.762426063246367e-07, "loss": 0.4503, "step": 5363 }, { "epoch": 0.86, "grad_norm": 8.179647425183878, "learning_rate": 4.75131634663743e-07, "loss": 0.4095, "step": 5364 }, { "epoch": 0.86, "grad_norm": 6.639280224955156, "learning_rate": 4.740218957017201e-07, "loss": 0.4691, "step": 5365 }, { "epoch": 0.86, "grad_norm": 5.630583585791296, "learning_rate": 4.729133897408933e-07, "loss": 0.4169, "step": 5366 }, { "epoch": 0.86, "grad_norm": 6.445101292974257, "learning_rate": 4.7180611708324776e-07, "loss": 0.4012, "step": 5367 }, { "epoch": 0.86, "grad_norm": 9.196392236634734, "learning_rate": 4.7070007803043714e-07, "loss": 0.4485, "step": 5368 }, { "epoch": 0.87, "grad_norm": 9.372825385616434, "learning_rate": 4.6959527288377493e-07, "loss": 0.4514, "step": 5369 }, { "epoch": 0.87, "grad_norm": 7.199406521664843, "learning_rate": 4.684917019442431e-07, "loss": 0.4341, "step": 5370 }, { "epoch": 0.87, "grad_norm": 5.9222169528871405, "learning_rate": 4.673893655124834e-07, "loss": 0.386, "step": 5371 }, { "epoch": 0.87, "grad_norm": 6.726690736618884, "learning_rate": 4.6628826388880165e-07, "loss": 0.3834, "step": 5372 }, { "epoch": 0.87, "grad_norm": 4.678756829787878, "learning_rate": 4.651883973731708e-07, "loss": 0.4837, "step": 5373 }, { "epoch": 0.87, "grad_norm": 9.666783151337423, "learning_rate": 4.640897662652227e-07, "loss": 0.3567, "step": 5374 }, { "epoch": 0.87, "grad_norm": 7.35807639431972, "learning_rate": 4.629923708642542e-07, "loss": 0.4215, "step": 5375 }, { "epoch": 0.87, "grad_norm": 8.441488333315057, "learning_rate": 4.618962114692277e-07, "loss": 0.3784, "step": 5376 }, { "epoch": 0.87, "grad_norm": 5.4704178387913025, "learning_rate": 4.608012883787688e-07, "loss": 0.3651, "step": 5377 }, { "epoch": 0.87, "grad_norm": 4.218297444413399, "learning_rate": 4.5970760189116057e-07, "loss": 0.4418, "step": 5378 }, { "epoch": 0.87, "grad_norm": 6.794336943082552, "learning_rate": 4.5861515230435525e-07, "loss": 0.395, "step": 5379 }, { "epoch": 0.87, "grad_norm": 6.161564818104756, "learning_rate": 4.5752393991596754e-07, "loss": 0.4512, "step": 5380 }, { "epoch": 0.87, "grad_norm": 5.332589886035079, "learning_rate": 4.5643396502327297e-07, "loss": 0.3552, "step": 5381 }, { "epoch": 0.87, "grad_norm": 8.256058574849284, "learning_rate": 4.55345227923209e-07, "loss": 0.4382, "step": 5382 }, { "epoch": 0.87, "grad_norm": 8.772645002317685, "learning_rate": 4.5425772891238065e-07, "loss": 0.442, "step": 5383 }, { "epoch": 0.87, "grad_norm": 7.706827015891457, "learning_rate": 4.5317146828704973e-07, "loss": 0.3903, "step": 5384 }, { "epoch": 0.87, "grad_norm": 8.937386203510606, "learning_rate": 4.520864463431457e-07, "loss": 0.4243, "step": 5385 }, { "epoch": 0.87, "grad_norm": 0.9986632707986416, "learning_rate": 4.510026633762571e-07, "loss": 0.4653, "step": 5386 }, { "epoch": 0.87, "grad_norm": 6.482633208710864, "learning_rate": 4.4992011968163775e-07, "loss": 0.4533, "step": 5387 }, { "epoch": 0.87, "grad_norm": 5.593283144380061, "learning_rate": 4.488388155542012e-07, "loss": 0.4704, "step": 5388 }, { "epoch": 0.87, "grad_norm": 5.352235569042049, "learning_rate": 4.47758751288524e-07, "loss": 0.3601, "step": 5389 }, { "epoch": 0.87, "grad_norm": 13.868612191346697, "learning_rate": 4.466799271788469e-07, "loss": 0.4203, "step": 5390 }, { "epoch": 0.87, "grad_norm": 6.4114001384022865, "learning_rate": 4.4560234351906983e-07, "loss": 0.4019, "step": 5391 }, { "epoch": 0.87, "grad_norm": 7.042674560771361, "learning_rate": 4.445260006027585e-07, "loss": 0.42, "step": 5392 }, { "epoch": 0.87, "grad_norm": 7.729938661253006, "learning_rate": 4.4345089872313674e-07, "loss": 0.4238, "step": 5393 }, { "epoch": 0.87, "grad_norm": 6.301088847798538, "learning_rate": 4.4237703817309073e-07, "loss": 0.4345, "step": 5394 }, { "epoch": 0.87, "grad_norm": 9.920558186486286, "learning_rate": 4.4130441924517263e-07, "loss": 0.3359, "step": 5395 }, { "epoch": 0.87, "grad_norm": 1.141335568071695, "learning_rate": 4.4023304223159203e-07, "loss": 0.4834, "step": 5396 }, { "epoch": 0.87, "grad_norm": 7.637868410280652, "learning_rate": 4.3916290742421986e-07, "loss": 0.4171, "step": 5397 }, { "epoch": 0.87, "grad_norm": 9.576498835788243, "learning_rate": 4.3809401511459237e-07, "loss": 0.4454, "step": 5398 }, { "epoch": 0.87, "grad_norm": 17.64865830466037, "learning_rate": 4.3702636559390667e-07, "loss": 0.4285, "step": 5399 }, { "epoch": 0.87, "grad_norm": 10.487336657771833, "learning_rate": 4.3595995915301614e-07, "loss": 0.434, "step": 5400 }, { "epoch": 0.87, "grad_norm": 6.155011582092008, "learning_rate": 4.348947960824412e-07, "loss": 0.3712, "step": 5401 }, { "epoch": 0.87, "grad_norm": 5.88295286378949, "learning_rate": 4.3383087667236254e-07, "loss": 0.4152, "step": 5402 }, { "epoch": 0.87, "grad_norm": 7.0087228199859055, "learning_rate": 4.3276820121262053e-07, "loss": 0.3748, "step": 5403 }, { "epoch": 0.87, "grad_norm": 6.933289816181155, "learning_rate": 4.3170676999271576e-07, "loss": 0.4071, "step": 5404 }, { "epoch": 0.87, "grad_norm": 5.984943717751519, "learning_rate": 4.306465833018131e-07, "loss": 0.3457, "step": 5405 }, { "epoch": 0.87, "grad_norm": 6.510480112242898, "learning_rate": 4.295876414287375e-07, "loss": 0.4196, "step": 5406 }, { "epoch": 0.87, "grad_norm": 10.677208689865212, "learning_rate": 4.2852994466197064e-07, "loss": 0.4234, "step": 5407 }, { "epoch": 0.87, "grad_norm": 5.531107443400987, "learning_rate": 4.274734932896607e-07, "loss": 0.3452, "step": 5408 }, { "epoch": 0.87, "grad_norm": 8.937036726114478, "learning_rate": 4.264182875996142e-07, "loss": 0.4162, "step": 5409 }, { "epoch": 0.87, "grad_norm": 6.2579553057063, "learning_rate": 4.253643278792974e-07, "loss": 0.3792, "step": 5410 }, { "epoch": 0.87, "grad_norm": 8.929406159334912, "learning_rate": 4.243116144158377e-07, "loss": 0.3967, "step": 5411 }, { "epoch": 0.87, "grad_norm": 7.023669195069621, "learning_rate": 4.232601474960246e-07, "loss": 0.4112, "step": 5412 }, { "epoch": 0.87, "grad_norm": 39.19346853717111, "learning_rate": 4.2220992740630515e-07, "loss": 0.4079, "step": 5413 }, { "epoch": 0.87, "grad_norm": 11.521817440314724, "learning_rate": 4.211609544327894e-07, "loss": 0.4534, "step": 5414 }, { "epoch": 0.87, "grad_norm": 9.732753568406329, "learning_rate": 4.201132288612464e-07, "loss": 0.3942, "step": 5415 }, { "epoch": 0.87, "grad_norm": 13.014022336909237, "learning_rate": 4.1906675097710457e-07, "loss": 0.4312, "step": 5416 }, { "epoch": 0.87, "grad_norm": 11.198380202319221, "learning_rate": 4.180215210654548e-07, "loss": 0.4242, "step": 5417 }, { "epoch": 0.87, "grad_norm": 9.458794552423361, "learning_rate": 4.1697753941104533e-07, "loss": 0.4069, "step": 5418 }, { "epoch": 0.87, "grad_norm": 7.869111662004919, "learning_rate": 4.159348062982849e-07, "loss": 0.3506, "step": 5419 }, { "epoch": 0.87, "grad_norm": 6.2723683968509105, "learning_rate": 4.1489332201124443e-07, "loss": 0.4066, "step": 5420 }, { "epoch": 0.87, "grad_norm": 9.681952815930353, "learning_rate": 4.138530868336532e-07, "loss": 0.3979, "step": 5421 }, { "epoch": 0.87, "grad_norm": 5.6253102245126305, "learning_rate": 4.1281410104889784e-07, "loss": 0.354, "step": 5422 }, { "epoch": 0.87, "grad_norm": 5.578186864546129, "learning_rate": 4.1177636494002747e-07, "loss": 0.4411, "step": 5423 }, { "epoch": 0.87, "grad_norm": 10.040839160189126, "learning_rate": 4.107398787897515e-07, "loss": 0.4173, "step": 5424 }, { "epoch": 0.87, "grad_norm": 4.980739229553199, "learning_rate": 4.097046428804363e-07, "loss": 0.4132, "step": 5425 }, { "epoch": 0.87, "grad_norm": 5.1360496394444, "learning_rate": 4.0867065749410737e-07, "loss": 0.4636, "step": 5426 }, { "epoch": 0.87, "grad_norm": 5.872259311922395, "learning_rate": 4.0763792291245274e-07, "loss": 0.3806, "step": 5427 }, { "epoch": 0.87, "grad_norm": 6.535233556104301, "learning_rate": 4.066064394168184e-07, "loss": 0.4057, "step": 5428 }, { "epoch": 0.87, "grad_norm": 8.671308518902356, "learning_rate": 4.055762072882058e-07, "loss": 0.3552, "step": 5429 }, { "epoch": 0.87, "grad_norm": 5.48708421757117, "learning_rate": 4.045472268072803e-07, "loss": 0.4629, "step": 5430 }, { "epoch": 0.88, "grad_norm": 32.987893833921135, "learning_rate": 4.0351949825436556e-07, "loss": 0.3902, "step": 5431 }, { "epoch": 0.88, "grad_norm": 5.677609296824972, "learning_rate": 4.024930219094425e-07, "loss": 0.4101, "step": 5432 }, { "epoch": 0.88, "grad_norm": 7.275223320415885, "learning_rate": 4.014677980521503e-07, "loss": 0.3787, "step": 5433 }, { "epoch": 0.88, "grad_norm": 5.371291488025167, "learning_rate": 4.0044382696178994e-07, "loss": 0.3905, "step": 5434 }, { "epoch": 0.88, "grad_norm": 9.111739418832654, "learning_rate": 3.994211089173189e-07, "loss": 0.3932, "step": 5435 }, { "epoch": 0.88, "grad_norm": 5.9055530480413685, "learning_rate": 3.9839964419735254e-07, "loss": 0.3884, "step": 5436 }, { "epoch": 0.88, "grad_norm": 5.719362407858911, "learning_rate": 3.9737943308016726e-07, "loss": 0.4897, "step": 5437 }, { "epoch": 0.88, "grad_norm": 5.723881868188386, "learning_rate": 3.963604758436962e-07, "loss": 0.4203, "step": 5438 }, { "epoch": 0.88, "grad_norm": 4.323508223978595, "learning_rate": 3.953427727655323e-07, "loss": 0.3791, "step": 5439 }, { "epoch": 0.88, "grad_norm": 19.172766690550894, "learning_rate": 3.9432632412292504e-07, "loss": 0.4125, "step": 5440 }, { "epoch": 0.88, "grad_norm": 6.253885545073005, "learning_rate": 3.933111301927828e-07, "loss": 0.3783, "step": 5441 }, { "epoch": 0.88, "grad_norm": 5.224874946813128, "learning_rate": 3.922971912516721e-07, "loss": 0.413, "step": 5442 }, { "epoch": 0.88, "grad_norm": 14.85101369030795, "learning_rate": 3.912845075758209e-07, "loss": 0.3945, "step": 5443 }, { "epoch": 0.88, "grad_norm": 5.573305520812423, "learning_rate": 3.9027307944110804e-07, "loss": 0.4342, "step": 5444 }, { "epoch": 0.88, "grad_norm": 5.203101724932555, "learning_rate": 3.892629071230758e-07, "loss": 0.4151, "step": 5445 }, { "epoch": 0.88, "grad_norm": 7.9868183058627915, "learning_rate": 3.8825399089692464e-07, "loss": 0.4192, "step": 5446 }, { "epoch": 0.88, "grad_norm": 5.988996521107887, "learning_rate": 3.8724633103750964e-07, "loss": 0.4417, "step": 5447 }, { "epoch": 0.88, "grad_norm": 5.908448389885545, "learning_rate": 3.862399278193446e-07, "loss": 0.4577, "step": 5448 }, { "epoch": 0.88, "grad_norm": 5.847796516310014, "learning_rate": 3.852347815166024e-07, "loss": 0.416, "step": 5449 }, { "epoch": 0.88, "grad_norm": 37.32424538631784, "learning_rate": 3.8423089240311404e-07, "loss": 0.3622, "step": 5450 }, { "epoch": 0.88, "grad_norm": 5.608093205080137, "learning_rate": 3.8322826075236295e-07, "loss": 0.4007, "step": 5451 }, { "epoch": 0.88, "grad_norm": 1.008094350394244, "learning_rate": 3.822268868374962e-07, "loss": 0.4737, "step": 5452 }, { "epoch": 0.88, "grad_norm": 5.985591573542547, "learning_rate": 3.8122677093131613e-07, "loss": 0.4111, "step": 5453 }, { "epoch": 0.88, "grad_norm": 7.448830073636295, "learning_rate": 3.802279133062803e-07, "loss": 0.4319, "step": 5454 }, { "epoch": 0.88, "grad_norm": 5.9140768824130205, "learning_rate": 3.7923031423450495e-07, "loss": 0.3814, "step": 5455 }, { "epoch": 0.88, "grad_norm": 1.2512554812928376, "learning_rate": 3.782339739877655e-07, "loss": 0.4572, "step": 5456 }, { "epoch": 0.88, "grad_norm": 6.695170774842016, "learning_rate": 3.772388928374909e-07, "loss": 0.3633, "step": 5457 }, { "epoch": 0.88, "grad_norm": 6.523170057426924, "learning_rate": 3.7624507105476826e-07, "loss": 0.4101, "step": 5458 }, { "epoch": 0.88, "grad_norm": 9.28645655818329, "learning_rate": 3.752525089103437e-07, "loss": 0.3745, "step": 5459 }, { "epoch": 0.88, "grad_norm": 6.326758245641892, "learning_rate": 3.7426120667461706e-07, "loss": 0.3768, "step": 5460 }, { "epoch": 0.88, "grad_norm": 21.9191188755056, "learning_rate": 3.7327116461764737e-07, "loss": 0.4689, "step": 5461 }, { "epoch": 0.88, "grad_norm": 1.1328787617246863, "learning_rate": 3.7228238300914934e-07, "loss": 0.4346, "step": 5462 }, { "epoch": 0.88, "grad_norm": 6.987198451785732, "learning_rate": 3.7129486211849307e-07, "loss": 0.4094, "step": 5463 }, { "epoch": 0.88, "grad_norm": 5.9746696206509275, "learning_rate": 3.7030860221470787e-07, "loss": 0.4778, "step": 5464 }, { "epoch": 0.88, "grad_norm": 7.554537967931254, "learning_rate": 3.6932360356647713e-07, "loss": 0.3647, "step": 5465 }, { "epoch": 0.88, "grad_norm": 6.729118126587965, "learning_rate": 3.683398664421428e-07, "loss": 0.4044, "step": 5466 }, { "epoch": 0.88, "grad_norm": 4.311153718031421, "learning_rate": 3.673573911097006e-07, "loss": 0.3506, "step": 5467 }, { "epoch": 0.88, "grad_norm": 7.110406989310331, "learning_rate": 3.663761778368052e-07, "loss": 0.397, "step": 5468 }, { "epoch": 0.88, "grad_norm": 1.1286695981980963, "learning_rate": 3.653962268907657e-07, "loss": 0.4829, "step": 5469 }, { "epoch": 0.88, "grad_norm": 8.7757287791021, "learning_rate": 3.6441753853854677e-07, "loss": 0.3001, "step": 5470 }, { "epoch": 0.88, "grad_norm": 10.257507230874316, "learning_rate": 3.6344011304677074e-07, "loss": 0.4251, "step": 5471 }, { "epoch": 0.88, "grad_norm": 5.614687462690748, "learning_rate": 3.624639506817174e-07, "loss": 0.4134, "step": 5472 }, { "epoch": 0.88, "grad_norm": 8.574919545999684, "learning_rate": 3.614890517093161e-07, "loss": 0.4507, "step": 5473 }, { "epoch": 0.88, "grad_norm": 6.517129145556188, "learning_rate": 3.605154163951591e-07, "loss": 0.3863, "step": 5474 }, { "epoch": 0.88, "grad_norm": 5.727385147495447, "learning_rate": 3.595430450044912e-07, "loss": 0.41, "step": 5475 }, { "epoch": 0.88, "grad_norm": 5.21948050752002, "learning_rate": 3.585719378022129e-07, "loss": 0.3917, "step": 5476 }, { "epoch": 0.88, "grad_norm": 7.981980428739117, "learning_rate": 3.576020950528797e-07, "loss": 0.4166, "step": 5477 }, { "epoch": 0.88, "grad_norm": 5.361554366645686, "learning_rate": 3.566335170207053e-07, "loss": 0.3966, "step": 5478 }, { "epoch": 0.88, "grad_norm": 9.545832683699158, "learning_rate": 3.5566620396955573e-07, "loss": 0.3764, "step": 5479 }, { "epoch": 0.88, "grad_norm": 16.67168108490487, "learning_rate": 3.5470015616295315e-07, "loss": 0.3558, "step": 5480 }, { "epoch": 0.88, "grad_norm": 1.1356589802631945, "learning_rate": 3.537353738640775e-07, "loss": 0.4741, "step": 5481 }, { "epoch": 0.88, "grad_norm": 1.0998738254986102, "learning_rate": 3.527718573357597e-07, "loss": 0.4516, "step": 5482 }, { "epoch": 0.88, "grad_norm": 5.639718197924721, "learning_rate": 3.518096068404908e-07, "loss": 0.4138, "step": 5483 }, { "epoch": 0.88, "grad_norm": 6.50562551885508, "learning_rate": 3.508486226404123e-07, "loss": 0.4558, "step": 5484 }, { "epoch": 0.88, "grad_norm": 6.989789786635936, "learning_rate": 3.4988890499732477e-07, "loss": 0.4399, "step": 5485 }, { "epoch": 0.88, "grad_norm": 6.618643865636363, "learning_rate": 3.489304541726801e-07, "loss": 0.4128, "step": 5486 }, { "epoch": 0.88, "grad_norm": 6.300461121221082, "learning_rate": 3.479732704275862e-07, "loss": 0.4088, "step": 5487 }, { "epoch": 0.88, "grad_norm": 8.715432052846534, "learning_rate": 3.470173540228089e-07, "loss": 0.3819, "step": 5488 }, { "epoch": 0.88, "grad_norm": 6.09393786445219, "learning_rate": 3.460627052187632e-07, "loss": 0.4202, "step": 5489 }, { "epoch": 0.88, "grad_norm": 1.2962622066295508, "learning_rate": 3.451093242755238e-07, "loss": 0.479, "step": 5490 }, { "epoch": 0.88, "grad_norm": 8.380188669171899, "learning_rate": 3.4415721145281743e-07, "loss": 0.3514, "step": 5491 }, { "epoch": 0.88, "grad_norm": 5.139686229850509, "learning_rate": 3.4320636701002497e-07, "loss": 0.4091, "step": 5492 }, { "epoch": 0.89, "grad_norm": 7.7016491228908075, "learning_rate": 3.422567912061836e-07, "loss": 0.4037, "step": 5493 }, { "epoch": 0.89, "grad_norm": 7.3058106533760725, "learning_rate": 3.4130848429998533e-07, "loss": 0.4441, "step": 5494 }, { "epoch": 0.89, "grad_norm": 4.479731379543604, "learning_rate": 3.403614465497718e-07, "loss": 0.3756, "step": 5495 }, { "epoch": 0.89, "grad_norm": 4.3533919790831614, "learning_rate": 3.3941567821354383e-07, "loss": 0.371, "step": 5496 }, { "epoch": 0.89, "grad_norm": 6.236337118108129, "learning_rate": 3.3847117954895536e-07, "loss": 0.3874, "step": 5497 }, { "epoch": 0.89, "grad_norm": 7.639016294539808, "learning_rate": 3.375279508133139e-07, "loss": 0.4702, "step": 5498 }, { "epoch": 0.89, "grad_norm": 4.790078032133164, "learning_rate": 3.3658599226357944e-07, "loss": 0.467, "step": 5499 }, { "epoch": 0.89, "grad_norm": 1.4275112690455682, "learning_rate": 3.356453041563695e-07, "loss": 0.482, "step": 5500 }, { "epoch": 0.89, "grad_norm": 5.192629278894666, "learning_rate": 3.347058867479519e-07, "loss": 0.3984, "step": 5501 }, { "epoch": 0.89, "grad_norm": 20.325624888121677, "learning_rate": 3.337677402942502e-07, "loss": 0.3643, "step": 5502 }, { "epoch": 0.89, "grad_norm": 14.783158208307313, "learning_rate": 3.3283086505084163e-07, "loss": 0.4456, "step": 5503 }, { "epoch": 0.89, "grad_norm": 8.540932304429171, "learning_rate": 3.3189526127295644e-07, "loss": 0.4594, "step": 5504 }, { "epoch": 0.89, "grad_norm": 6.714769276604025, "learning_rate": 3.3096092921547976e-07, "loss": 0.4267, "step": 5505 }, { "epoch": 0.89, "grad_norm": 7.09451181999298, "learning_rate": 3.300278691329478e-07, "loss": 0.483, "step": 5506 }, { "epoch": 0.89, "grad_norm": 6.420973659347821, "learning_rate": 3.290960812795535e-07, "loss": 0.4309, "step": 5507 }, { "epoch": 0.89, "grad_norm": 8.622779209624776, "learning_rate": 3.2816556590914096e-07, "loss": 0.4088, "step": 5508 }, { "epoch": 0.89, "grad_norm": 7.279418142128179, "learning_rate": 3.2723632327520795e-07, "loss": 0.4328, "step": 5509 }, { "epoch": 0.89, "grad_norm": 7.999990854919739, "learning_rate": 3.2630835363090584e-07, "loss": 0.4532, "step": 5510 }, { "epoch": 0.89, "grad_norm": 5.301159061123506, "learning_rate": 3.2538165722903913e-07, "loss": 0.3911, "step": 5511 }, { "epoch": 0.89, "grad_norm": 10.456957245504995, "learning_rate": 3.244562343220664e-07, "loss": 0.4198, "step": 5512 }, { "epoch": 0.89, "grad_norm": 8.71878381693972, "learning_rate": 3.235320851620971e-07, "loss": 0.4129, "step": 5513 }, { "epoch": 0.89, "grad_norm": 5.376724966639073, "learning_rate": 3.2260921000089483e-07, "loss": 0.3544, "step": 5514 }, { "epoch": 0.89, "grad_norm": 11.003656558271219, "learning_rate": 3.216876090898774e-07, "loss": 0.3624, "step": 5515 }, { "epoch": 0.89, "grad_norm": 11.086396283150378, "learning_rate": 3.207672826801139e-07, "loss": 0.479, "step": 5516 }, { "epoch": 0.89, "grad_norm": 7.854438146062388, "learning_rate": 3.198482310223261e-07, "loss": 0.4268, "step": 5517 }, { "epoch": 0.89, "grad_norm": 8.3969725068891, "learning_rate": 3.1893045436688863e-07, "loss": 0.4368, "step": 5518 }, { "epoch": 0.89, "grad_norm": 8.670361452543856, "learning_rate": 3.1801395296383097e-07, "loss": 0.418, "step": 5519 }, { "epoch": 0.89, "grad_norm": 5.372577489338434, "learning_rate": 3.1709872706283173e-07, "loss": 0.4046, "step": 5520 }, { "epoch": 0.89, "grad_norm": 10.359289475262425, "learning_rate": 3.161847769132237e-07, "loss": 0.3736, "step": 5521 }, { "epoch": 0.89, "grad_norm": 8.042191481285021, "learning_rate": 3.152721027639932e-07, "loss": 0.3996, "step": 5522 }, { "epoch": 0.89, "grad_norm": 1.2575994245920827, "learning_rate": 3.1436070486377755e-07, "loss": 0.4433, "step": 5523 }, { "epoch": 0.89, "grad_norm": 15.277968502494566, "learning_rate": 3.134505834608653e-07, "loss": 0.4058, "step": 5524 }, { "epoch": 0.89, "grad_norm": 10.25046436677384, "learning_rate": 3.125417388031998e-07, "loss": 0.4633, "step": 5525 }, { "epoch": 0.89, "grad_norm": 11.497466934966727, "learning_rate": 3.116341711383758e-07, "loss": 0.4032, "step": 5526 }, { "epoch": 0.89, "grad_norm": 13.811747609706352, "learning_rate": 3.1072788071363936e-07, "loss": 0.3957, "step": 5527 }, { "epoch": 0.89, "grad_norm": 6.9855802342414535, "learning_rate": 3.0982286777588745e-07, "loss": 0.4121, "step": 5528 }, { "epoch": 0.89, "grad_norm": 6.58354653721065, "learning_rate": 3.089191325716728e-07, "loss": 0.4573, "step": 5529 }, { "epoch": 0.89, "grad_norm": 4.899375440808218, "learning_rate": 3.0801667534719694e-07, "loss": 0.3894, "step": 5530 }, { "epoch": 0.89, "grad_norm": 9.363768946579198, "learning_rate": 3.0711549634831364e-07, "loss": 0.434, "step": 5531 }, { "epoch": 0.89, "grad_norm": 7.588855976871044, "learning_rate": 3.062155958205293e-07, "loss": 0.44, "step": 5532 }, { "epoch": 0.89, "grad_norm": 1.444901107272617, "learning_rate": 3.0531697400900117e-07, "loss": 0.5128, "step": 5533 }, { "epoch": 0.89, "grad_norm": 1.1376965762023412, "learning_rate": 3.044196311585401e-07, "loss": 0.4087, "step": 5534 }, { "epoch": 0.89, "grad_norm": 6.552745013895243, "learning_rate": 3.0352356751360555e-07, "loss": 0.4275, "step": 5535 }, { "epoch": 0.89, "grad_norm": 7.199416513287486, "learning_rate": 3.0262878331830946e-07, "loss": 0.3817, "step": 5536 }, { "epoch": 0.89, "grad_norm": 5.610464811265526, "learning_rate": 3.017352788164174e-07, "loss": 0.3682, "step": 5537 }, { "epoch": 0.89, "grad_norm": 5.781370166372906, "learning_rate": 3.0084305425134406e-07, "loss": 0.4601, "step": 5538 }, { "epoch": 0.89, "grad_norm": 9.457750131401145, "learning_rate": 2.999521098661551e-07, "loss": 0.3565, "step": 5539 }, { "epoch": 0.89, "grad_norm": 4.748196291926938, "learning_rate": 2.990624459035685e-07, "loss": 0.4121, "step": 5540 }, { "epoch": 0.89, "grad_norm": 6.338798291916017, "learning_rate": 2.981740626059548e-07, "loss": 0.3687, "step": 5541 }, { "epoch": 0.89, "grad_norm": 5.839697376547755, "learning_rate": 2.972869602153333e-07, "loss": 0.4176, "step": 5542 }, { "epoch": 0.89, "grad_norm": 9.97373679300075, "learning_rate": 2.964011389733734e-07, "loss": 0.4286, "step": 5543 }, { "epoch": 0.89, "grad_norm": 7.616848003760706, "learning_rate": 2.955165991214004e-07, "loss": 0.3551, "step": 5544 }, { "epoch": 0.89, "grad_norm": 4.671203347313586, "learning_rate": 2.946333409003849e-07, "loss": 0.4266, "step": 5545 }, { "epoch": 0.89, "grad_norm": 9.588708477434107, "learning_rate": 2.93751364550951e-07, "loss": 0.4152, "step": 5546 }, { "epoch": 0.89, "grad_norm": 5.116126028897421, "learning_rate": 2.928706703133738e-07, "loss": 0.3893, "step": 5547 }, { "epoch": 0.89, "grad_norm": 5.349947313867396, "learning_rate": 2.919912584275797e-07, "loss": 0.4055, "step": 5548 }, { "epoch": 0.89, "grad_norm": 1.078093493744602, "learning_rate": 2.911131291331437e-07, "loss": 0.4929, "step": 5549 }, { "epoch": 0.89, "grad_norm": 7.081465285392764, "learning_rate": 2.902362826692923e-07, "loss": 0.465, "step": 5550 }, { "epoch": 0.89, "grad_norm": 7.5943412451397, "learning_rate": 2.8936071927490317e-07, "loss": 0.3752, "step": 5551 }, { "epoch": 0.89, "grad_norm": 5.974395079048967, "learning_rate": 2.884864391885034e-07, "loss": 0.4092, "step": 5552 }, { "epoch": 0.89, "grad_norm": 12.242571725196171, "learning_rate": 2.876134426482713e-07, "loss": 0.4513, "step": 5553 }, { "epoch": 0.89, "grad_norm": 6.902770771407512, "learning_rate": 2.8674172989203497e-07, "loss": 0.4094, "step": 5554 }, { "epoch": 0.9, "grad_norm": 6.56945427745902, "learning_rate": 2.858713011572728e-07, "loss": 0.4515, "step": 5555 }, { "epoch": 0.9, "grad_norm": 5.094578927839349, "learning_rate": 2.850021566811145e-07, "loss": 0.3975, "step": 5556 }, { "epoch": 0.9, "grad_norm": 1.3269984400099661, "learning_rate": 2.8413429670033855e-07, "loss": 0.4496, "step": 5557 }, { "epoch": 0.9, "grad_norm": 4.811283036584181, "learning_rate": 2.832677214513735e-07, "loss": 0.4666, "step": 5558 }, { "epoch": 0.9, "grad_norm": 6.650471158491238, "learning_rate": 2.8240243117029885e-07, "loss": 0.3902, "step": 5559 }, { "epoch": 0.9, "grad_norm": 6.415788916708207, "learning_rate": 2.815384260928433e-07, "loss": 0.3991, "step": 5560 }, { "epoch": 0.9, "grad_norm": 7.053182638840294, "learning_rate": 2.8067570645438467e-07, "loss": 0.396, "step": 5561 }, { "epoch": 0.9, "grad_norm": 5.757937813948898, "learning_rate": 2.7981427248995276e-07, "loss": 0.4013, "step": 5562 }, { "epoch": 0.9, "grad_norm": 13.535798367845265, "learning_rate": 2.7895412443422644e-07, "loss": 0.424, "step": 5563 }, { "epoch": 0.9, "grad_norm": 6.573899018411119, "learning_rate": 2.780952625215333e-07, "loss": 0.4325, "step": 5564 }, { "epoch": 0.9, "grad_norm": 9.394106107996087, "learning_rate": 2.772376869858495e-07, "loss": 0.4199, "step": 5565 }, { "epoch": 0.9, "grad_norm": 6.26811469756973, "learning_rate": 2.7638139806080435e-07, "loss": 0.4142, "step": 5566 }, { "epoch": 0.9, "grad_norm": 5.8975711230956875, "learning_rate": 2.755263959796739e-07, "loss": 0.4528, "step": 5567 }, { "epoch": 0.9, "grad_norm": 6.881632462255824, "learning_rate": 2.746726809753836e-07, "loss": 0.4744, "step": 5568 }, { "epoch": 0.9, "grad_norm": 13.622670057496439, "learning_rate": 2.73820253280509e-07, "loss": 0.3588, "step": 5569 }, { "epoch": 0.9, "grad_norm": 7.3511006473182485, "learning_rate": 2.72969113127276e-07, "loss": 0.3243, "step": 5570 }, { "epoch": 0.9, "grad_norm": 6.5413959043765715, "learning_rate": 2.7211926074755755e-07, "loss": 0.4739, "step": 5571 }, { "epoch": 0.9, "grad_norm": 8.392800389466684, "learning_rate": 2.712706963728773e-07, "loss": 0.4265, "step": 5572 }, { "epoch": 0.9, "grad_norm": 5.763077431925545, "learning_rate": 2.7042342023440803e-07, "loss": 0.3538, "step": 5573 }, { "epoch": 0.9, "grad_norm": 3.968205827455832, "learning_rate": 2.6957743256297077e-07, "loss": 0.3907, "step": 5574 }, { "epoch": 0.9, "grad_norm": 7.577269121955915, "learning_rate": 2.687327335890344e-07, "loss": 0.3777, "step": 5575 }, { "epoch": 0.9, "grad_norm": 1.1624599261141724, "learning_rate": 2.6788932354272104e-07, "loss": 0.4349, "step": 5576 }, { "epoch": 0.9, "grad_norm": 14.048962673973865, "learning_rate": 2.6704720265379626e-07, "loss": 0.3769, "step": 5577 }, { "epoch": 0.9, "grad_norm": 6.474048595629797, "learning_rate": 2.662063711516788e-07, "loss": 0.4248, "step": 5578 }, { "epoch": 0.9, "grad_norm": 4.50572960953007, "learning_rate": 2.653668292654338e-07, "loss": 0.3589, "step": 5579 }, { "epoch": 0.9, "grad_norm": 6.593189811920075, "learning_rate": 2.645285772237743e-07, "loss": 0.4064, "step": 5580 }, { "epoch": 0.9, "grad_norm": 6.179147139053955, "learning_rate": 2.636916152550656e-07, "loss": 0.3752, "step": 5581 }, { "epoch": 0.9, "grad_norm": 8.634652067822145, "learning_rate": 2.6285594358731736e-07, "loss": 0.3983, "step": 5582 }, { "epoch": 0.9, "grad_norm": 28.290569569023457, "learning_rate": 2.6202156244819035e-07, "loss": 0.3731, "step": 5583 }, { "epoch": 0.9, "grad_norm": 14.711132958500876, "learning_rate": 2.611884720649921e-07, "loss": 0.4772, "step": 5584 }, { "epoch": 0.9, "grad_norm": 4.100438697344083, "learning_rate": 2.6035667266468166e-07, "loss": 0.4486, "step": 5585 }, { "epoch": 0.9, "grad_norm": 5.558185883331002, "learning_rate": 2.595261644738617e-07, "loss": 0.3871, "step": 5586 }, { "epoch": 0.9, "grad_norm": 11.439006615067088, "learning_rate": 2.586969477187867e-07, "loss": 0.2952, "step": 5587 }, { "epoch": 0.9, "grad_norm": 5.760567874843918, "learning_rate": 2.578690226253583e-07, "loss": 0.4017, "step": 5588 }, { "epoch": 0.9, "grad_norm": 6.6509826784426185, "learning_rate": 2.570423894191254e-07, "loss": 0.3776, "step": 5589 }, { "epoch": 0.9, "grad_norm": 7.419998145151117, "learning_rate": 2.5621704832528573e-07, "loss": 0.4632, "step": 5590 }, { "epoch": 0.9, "grad_norm": 8.641532690832065, "learning_rate": 2.553929995686849e-07, "loss": 0.3916, "step": 5591 }, { "epoch": 0.9, "grad_norm": 10.279192141379214, "learning_rate": 2.545702433738184e-07, "loss": 0.4417, "step": 5592 }, { "epoch": 0.9, "grad_norm": 10.920774811421737, "learning_rate": 2.537487799648247e-07, "loss": 0.3988, "step": 5593 }, { "epoch": 0.9, "grad_norm": 8.320572481407005, "learning_rate": 2.529286095654948e-07, "loss": 0.3717, "step": 5594 }, { "epoch": 0.9, "grad_norm": 5.4727012963216595, "learning_rate": 2.5210973239926553e-07, "loss": 0.4002, "step": 5595 }, { "epoch": 0.9, "grad_norm": 1.4631861246146862, "learning_rate": 2.5129214868922227e-07, "loss": 0.482, "step": 5596 }, { "epoch": 0.9, "grad_norm": 7.658722684089519, "learning_rate": 2.504758586580952e-07, "loss": 0.4273, "step": 5597 }, { "epoch": 0.9, "grad_norm": 7.1104689381259725, "learning_rate": 2.4966086252826706e-07, "loss": 0.3916, "step": 5598 }, { "epoch": 0.9, "grad_norm": 5.556154212533265, "learning_rate": 2.4884716052176237e-07, "loss": 0.5336, "step": 5599 }, { "epoch": 0.9, "grad_norm": 10.22787171734883, "learning_rate": 2.480347528602589e-07, "loss": 0.4152, "step": 5600 }, { "epoch": 0.9, "grad_norm": 8.332750596889245, "learning_rate": 2.4722363976507677e-07, "loss": 0.4157, "step": 5601 }, { "epoch": 0.9, "grad_norm": 5.2104000268671555, "learning_rate": 2.4641382145718597e-07, "loss": 0.4159, "step": 5602 }, { "epoch": 0.9, "grad_norm": 7.0060276755620965, "learning_rate": 2.456052981572038e-07, "loss": 0.4491, "step": 5603 }, { "epoch": 0.9, "grad_norm": 5.590096276191707, "learning_rate": 2.447980700853941e-07, "loss": 0.4281, "step": 5604 }, { "epoch": 0.9, "grad_norm": 10.624694494160746, "learning_rate": 2.439921374616683e-07, "loss": 0.4455, "step": 5605 }, { "epoch": 0.9, "grad_norm": 7.952641157708498, "learning_rate": 2.4318750050558336e-07, "loss": 0.4763, "step": 5606 }, { "epoch": 0.9, "grad_norm": 6.050077799728524, "learning_rate": 2.423841594363469e-07, "loss": 0.4337, "step": 5607 }, { "epoch": 0.9, "grad_norm": 7.364899562404551, "learning_rate": 2.415821144728092e-07, "loss": 0.3772, "step": 5608 }, { "epoch": 0.9, "grad_norm": 8.968789736601055, "learning_rate": 2.4078136583347014e-07, "loss": 0.3917, "step": 5609 }, { "epoch": 0.9, "grad_norm": 10.671810856381203, "learning_rate": 2.399819137364756e-07, "loss": 0.3443, "step": 5610 }, { "epoch": 0.9, "grad_norm": 7.155547603754944, "learning_rate": 2.3918375839961927e-07, "loss": 0.38, "step": 5611 }, { "epoch": 0.9, "grad_norm": 9.918086025206291, "learning_rate": 2.3838690004033826e-07, "loss": 0.3911, "step": 5612 }, { "epoch": 0.9, "grad_norm": 9.83088684192959, "learning_rate": 2.3759133887572117e-07, "loss": 0.4189, "step": 5613 }, { "epoch": 0.9, "grad_norm": 11.84462985183511, "learning_rate": 2.3679707512250072e-07, "loss": 0.3947, "step": 5614 }, { "epoch": 0.9, "grad_norm": 14.446089985678176, "learning_rate": 2.3600410899705383e-07, "loss": 0.3836, "step": 5615 }, { "epoch": 0.9, "grad_norm": 5.413138430978946, "learning_rate": 2.3521244071540883e-07, "loss": 0.418, "step": 5616 }, { "epoch": 0.91, "grad_norm": 15.603307346664275, "learning_rate": 2.3442207049323707e-07, "loss": 0.4165, "step": 5617 }, { "epoch": 0.91, "grad_norm": 6.8440266939596635, "learning_rate": 2.3363299854585742e-07, "loss": 0.3487, "step": 5618 }, { "epoch": 0.91, "grad_norm": 8.87770243045142, "learning_rate": 2.3284522508823403e-07, "loss": 0.4284, "step": 5619 }, { "epoch": 0.91, "grad_norm": 13.075798787105164, "learning_rate": 2.320587503349797e-07, "loss": 0.3929, "step": 5620 }, { "epoch": 0.91, "grad_norm": 8.63692720012179, "learning_rate": 2.3127357450035025e-07, "loss": 0.3885, "step": 5621 }, { "epoch": 0.91, "grad_norm": 8.028215367215198, "learning_rate": 2.304896977982496e-07, "loss": 0.4093, "step": 5622 }, { "epoch": 0.91, "grad_norm": 0.9939273095844205, "learning_rate": 2.2970712044222742e-07, "loss": 0.4121, "step": 5623 }, { "epoch": 0.91, "grad_norm": 7.391065898999475, "learning_rate": 2.2892584264548046e-07, "loss": 0.4315, "step": 5624 }, { "epoch": 0.91, "grad_norm": 6.5600712801803915, "learning_rate": 2.2814586462084897e-07, "loss": 0.4582, "step": 5625 }, { "epoch": 0.91, "grad_norm": 1.0951164069892068, "learning_rate": 2.2736718658082025e-07, "loss": 0.4513, "step": 5626 }, { "epoch": 0.91, "grad_norm": 5.012808385793947, "learning_rate": 2.2658980873752902e-07, "loss": 0.3531, "step": 5627 }, { "epoch": 0.91, "grad_norm": 5.8665867819509545, "learning_rate": 2.2581373130275307e-07, "loss": 0.4361, "step": 5628 }, { "epoch": 0.91, "grad_norm": 6.1238007118596975, "learning_rate": 2.250389544879189e-07, "loss": 0.3674, "step": 5629 }, { "epoch": 0.91, "grad_norm": 5.844578529625602, "learning_rate": 2.2426547850409598e-07, "loss": 0.3629, "step": 5630 }, { "epoch": 0.91, "grad_norm": 5.820412043293247, "learning_rate": 2.234933035619996e-07, "loss": 0.444, "step": 5631 }, { "epoch": 0.91, "grad_norm": 10.994468992140094, "learning_rate": 2.2272242987199379e-07, "loss": 0.4165, "step": 5632 }, { "epoch": 0.91, "grad_norm": 6.119450590532243, "learning_rate": 2.219528576440838e-07, "loss": 0.4005, "step": 5633 }, { "epoch": 0.91, "grad_norm": 8.390439593073815, "learning_rate": 2.2118458708792256e-07, "loss": 0.4145, "step": 5634 }, { "epoch": 0.91, "grad_norm": 5.183710218944606, "learning_rate": 2.2041761841280873e-07, "loss": 0.3539, "step": 5635 }, { "epoch": 0.91, "grad_norm": 1.4930307966498106, "learning_rate": 2.196519518276874e-07, "loss": 0.5118, "step": 5636 }, { "epoch": 0.91, "grad_norm": 10.589080595934432, "learning_rate": 2.1888758754114336e-07, "loss": 0.3842, "step": 5637 }, { "epoch": 0.91, "grad_norm": 7.891383939730442, "learning_rate": 2.1812452576141285e-07, "loss": 0.4002, "step": 5638 }, { "epoch": 0.91, "grad_norm": 6.613806238676052, "learning_rate": 2.1736276669637568e-07, "loss": 0.4374, "step": 5639 }, { "epoch": 0.91, "grad_norm": 8.401610757722734, "learning_rate": 2.1660231055355473e-07, "loss": 0.3685, "step": 5640 }, { "epoch": 0.91, "grad_norm": 5.537566396517372, "learning_rate": 2.1584315754011876e-07, "loss": 0.3972, "step": 5641 }, { "epoch": 0.91, "grad_norm": 6.0473674999937455, "learning_rate": 2.1508530786288284e-07, "loss": 0.3839, "step": 5642 }, { "epoch": 0.91, "grad_norm": 11.07020816468202, "learning_rate": 2.143287617283074e-07, "loss": 0.4165, "step": 5643 }, { "epoch": 0.91, "grad_norm": 10.168385323138658, "learning_rate": 2.135735193424937e-07, "loss": 0.3552, "step": 5644 }, { "epoch": 0.91, "grad_norm": 8.24471549688324, "learning_rate": 2.1281958091119215e-07, "loss": 0.477, "step": 5645 }, { "epoch": 0.91, "grad_norm": 5.607939437820756, "learning_rate": 2.1206694663979676e-07, "loss": 0.4038, "step": 5646 }, { "epoch": 0.91, "grad_norm": 16.831427855179, "learning_rate": 2.113156167333452e-07, "loss": 0.4615, "step": 5647 }, { "epoch": 0.91, "grad_norm": 6.4601499721893925, "learning_rate": 2.105655913965199e-07, "loss": 0.3956, "step": 5648 }, { "epoch": 0.91, "grad_norm": 12.267416887269848, "learning_rate": 2.0981687083364955e-07, "loss": 0.3783, "step": 5649 }, { "epoch": 0.91, "grad_norm": 9.701809368497797, "learning_rate": 2.0906945524870493e-07, "loss": 0.398, "step": 5650 }, { "epoch": 0.91, "grad_norm": 5.065051426680281, "learning_rate": 2.0832334484530424e-07, "loss": 0.4109, "step": 5651 }, { "epoch": 0.91, "grad_norm": 16.708565409964823, "learning_rate": 2.0757853982670773e-07, "loss": 0.481, "step": 5652 }, { "epoch": 0.91, "grad_norm": 9.664951354496948, "learning_rate": 2.0683504039582024e-07, "loss": 0.4597, "step": 5653 }, { "epoch": 0.91, "grad_norm": 1.251462773099307, "learning_rate": 2.0609284675519315e-07, "loss": 0.4957, "step": 5654 }, { "epoch": 0.91, "grad_norm": 4.621581606496009, "learning_rate": 2.053519591070191e-07, "loss": 0.32, "step": 5655 }, { "epoch": 0.91, "grad_norm": 7.4525676078137, "learning_rate": 2.0461237765313612e-07, "loss": 0.3974, "step": 5656 }, { "epoch": 0.91, "grad_norm": 6.599071881910322, "learning_rate": 2.0387410259502692e-07, "loss": 0.4761, "step": 5657 }, { "epoch": 0.91, "grad_norm": 1.03062391755534, "learning_rate": 2.0313713413381952e-07, "loss": 0.4338, "step": 5658 }, { "epoch": 0.91, "grad_norm": 5.4269472895707445, "learning_rate": 2.0240147247028164e-07, "loss": 0.3931, "step": 5659 }, { "epoch": 0.91, "grad_norm": 11.74083948579148, "learning_rate": 2.016671178048296e-07, "loss": 0.4064, "step": 5660 }, { "epoch": 0.91, "grad_norm": 6.611220546341827, "learning_rate": 2.0093407033752178e-07, "loss": 0.3975, "step": 5661 }, { "epoch": 0.91, "grad_norm": 15.17178481068634, "learning_rate": 2.0020233026806003e-07, "loss": 0.3659, "step": 5662 }, { "epoch": 0.91, "grad_norm": 17.86489123164075, "learning_rate": 1.9947189779579047e-07, "loss": 0.3982, "step": 5663 }, { "epoch": 0.91, "grad_norm": 10.903728722391923, "learning_rate": 1.9874277311970335e-07, "loss": 0.4124, "step": 5664 }, { "epoch": 0.91, "grad_norm": 11.292687402526482, "learning_rate": 1.9801495643843316e-07, "loss": 0.4633, "step": 5665 }, { "epoch": 0.91, "grad_norm": 9.153829829979024, "learning_rate": 1.972884479502557e-07, "loss": 0.3548, "step": 5666 }, { "epoch": 0.91, "grad_norm": 4.193277159394358, "learning_rate": 1.965632478530921e-07, "loss": 0.364, "step": 5667 }, { "epoch": 0.91, "grad_norm": 7.080483668032681, "learning_rate": 1.9583935634450768e-07, "loss": 0.4152, "step": 5668 }, { "epoch": 0.91, "grad_norm": 7.315143910518079, "learning_rate": 1.9511677362171078e-07, "loss": 0.4389, "step": 5669 }, { "epoch": 0.91, "grad_norm": 7.682125110626832, "learning_rate": 1.943954998815506e-07, "loss": 0.4371, "step": 5670 }, { "epoch": 0.91, "grad_norm": 7.5242238784207585, "learning_rate": 1.9367553532052497e-07, "loss": 0.3862, "step": 5671 }, { "epoch": 0.91, "grad_norm": 7.66512380749184, "learning_rate": 1.9295688013477031e-07, "loss": 0.3822, "step": 5672 }, { "epoch": 0.91, "grad_norm": 8.677533531133708, "learning_rate": 1.9223953452006784e-07, "loss": 0.4665, "step": 5673 }, { "epoch": 0.91, "grad_norm": 5.945319722257524, "learning_rate": 1.915234986718434e-07, "loss": 0.4004, "step": 5674 }, { "epoch": 0.91, "grad_norm": 4.74825767323441, "learning_rate": 1.908087727851632e-07, "loss": 0.3902, "step": 5675 }, { "epoch": 0.91, "grad_norm": 8.930941945445651, "learning_rate": 1.9009535705474091e-07, "loss": 0.466, "step": 5676 }, { "epoch": 0.91, "grad_norm": 10.536868757526305, "learning_rate": 1.8938325167492888e-07, "loss": 0.4374, "step": 5677 }, { "epoch": 0.91, "grad_norm": 6.84400853281233, "learning_rate": 1.8867245683972358e-07, "loss": 0.3935, "step": 5678 }, { "epoch": 0.92, "grad_norm": 5.219624391224151, "learning_rate": 1.8796297274276677e-07, "loss": 0.4726, "step": 5679 }, { "epoch": 0.92, "grad_norm": 4.225957038186819, "learning_rate": 1.8725479957734105e-07, "loss": 0.3698, "step": 5680 }, { "epoch": 0.92, "grad_norm": 5.318578838339036, "learning_rate": 1.8654793753637156e-07, "loss": 0.4374, "step": 5681 }, { "epoch": 0.92, "grad_norm": 9.61787225320892, "learning_rate": 1.8584238681242705e-07, "loss": 0.4509, "step": 5682 }, { "epoch": 0.92, "grad_norm": 7.194949426355194, "learning_rate": 1.8513814759772042e-07, "loss": 0.3756, "step": 5683 }, { "epoch": 0.92, "grad_norm": 6.89789197224502, "learning_rate": 1.844352200841043e-07, "loss": 0.3682, "step": 5684 }, { "epoch": 0.92, "grad_norm": 10.599436218436441, "learning_rate": 1.8373360446307608e-07, "loss": 0.3818, "step": 5685 }, { "epoch": 0.92, "grad_norm": 20.47946827886676, "learning_rate": 1.8303330092577453e-07, "loss": 0.4035, "step": 5686 }, { "epoch": 0.92, "grad_norm": 12.848645435005375, "learning_rate": 1.823343096629837e-07, "loss": 0.4902, "step": 5687 }, { "epoch": 0.92, "grad_norm": 7.11234515784128, "learning_rate": 1.8163663086512574e-07, "loss": 0.3471, "step": 5688 }, { "epoch": 0.92, "grad_norm": 6.190382808486084, "learning_rate": 1.80940264722268e-07, "loss": 0.4705, "step": 5689 }, { "epoch": 0.92, "grad_norm": 8.24348449545593, "learning_rate": 1.8024521142412155e-07, "loss": 0.4235, "step": 5690 }, { "epoch": 0.92, "grad_norm": 1.322727403083543, "learning_rate": 1.7955147116003603e-07, "loss": 0.481, "step": 5691 }, { "epoch": 0.92, "grad_norm": 5.047430117082018, "learning_rate": 1.7885904411900633e-07, "loss": 0.4241, "step": 5692 }, { "epoch": 0.92, "grad_norm": 11.59899323838557, "learning_rate": 1.7816793048966884e-07, "loss": 0.4988, "step": 5693 }, { "epoch": 0.92, "grad_norm": 5.8043922269912915, "learning_rate": 1.7747813046030184e-07, "loss": 0.425, "step": 5694 }, { "epoch": 0.92, "grad_norm": 5.594260174402231, "learning_rate": 1.7678964421882494e-07, "loss": 0.394, "step": 5695 }, { "epoch": 0.92, "grad_norm": 5.238407288765113, "learning_rate": 1.7610247195280207e-07, "loss": 0.3868, "step": 5696 }, { "epoch": 0.92, "grad_norm": 8.625979468931048, "learning_rate": 1.7541661384943676e-07, "loss": 0.4117, "step": 5697 }, { "epoch": 0.92, "grad_norm": 4.6694017607711595, "learning_rate": 1.747320700955768e-07, "loss": 0.5149, "step": 5698 }, { "epoch": 0.92, "grad_norm": 5.74752640227918, "learning_rate": 1.7404884087771023e-07, "loss": 0.3877, "step": 5699 }, { "epoch": 0.92, "grad_norm": 21.10747722192997, "learning_rate": 1.7336692638196706e-07, "loss": 0.4115, "step": 5700 }, { "epoch": 0.92, "grad_norm": 8.666266737314029, "learning_rate": 1.726863267941209e-07, "loss": 0.4011, "step": 5701 }, { "epoch": 0.92, "grad_norm": 16.39343525880441, "learning_rate": 1.72007042299584e-07, "loss": 0.379, "step": 5702 }, { "epoch": 0.92, "grad_norm": 11.82139612384758, "learning_rate": 1.7132907308341383e-07, "loss": 0.4631, "step": 5703 }, { "epoch": 0.92, "grad_norm": 6.481875989140034, "learning_rate": 1.7065241933030664e-07, "loss": 0.4138, "step": 5704 }, { "epoch": 0.92, "grad_norm": 5.0392192920757095, "learning_rate": 1.6997708122460322e-07, "loss": 0.4292, "step": 5705 }, { "epoch": 0.92, "grad_norm": 5.974780873464883, "learning_rate": 1.6930305895028255e-07, "loss": 0.4026, "step": 5706 }, { "epoch": 0.92, "grad_norm": 4.763039421033171, "learning_rate": 1.6863035269096774e-07, "loss": 0.347, "step": 5707 }, { "epoch": 0.92, "grad_norm": 9.906384094260954, "learning_rate": 1.6795896262992162e-07, "loss": 0.3506, "step": 5708 }, { "epoch": 0.92, "grad_norm": 4.002980847134111, "learning_rate": 1.672888889500518e-07, "loss": 0.3767, "step": 5709 }, { "epoch": 0.92, "grad_norm": 5.940667328769693, "learning_rate": 1.6662013183390112e-07, "loss": 0.4258, "step": 5710 }, { "epoch": 0.92, "grad_norm": 18.654288103818917, "learning_rate": 1.6595269146366e-07, "loss": 0.4177, "step": 5711 }, { "epoch": 0.92, "grad_norm": 13.345879362058952, "learning_rate": 1.6528656802115794e-07, "loss": 0.4495, "step": 5712 }, { "epoch": 0.92, "grad_norm": 6.763094233466857, "learning_rate": 1.6462176168786371e-07, "loss": 0.4351, "step": 5713 }, { "epoch": 0.92, "grad_norm": 12.300561476181244, "learning_rate": 1.6395827264488962e-07, "loss": 0.361, "step": 5714 }, { "epoch": 0.92, "grad_norm": 14.312788323776475, "learning_rate": 1.6329610107298833e-07, "loss": 0.3512, "step": 5715 }, { "epoch": 0.92, "grad_norm": 9.290958103442085, "learning_rate": 1.6263524715255442e-07, "loss": 0.4266, "step": 5716 }, { "epoch": 0.92, "grad_norm": 5.692028736494481, "learning_rate": 1.6197571106362054e-07, "loss": 0.385, "step": 5717 }, { "epoch": 0.92, "grad_norm": 5.973696118269885, "learning_rate": 1.6131749298586464e-07, "loss": 0.4409, "step": 5718 }, { "epoch": 0.92, "grad_norm": 11.989204208784603, "learning_rate": 1.6066059309860273e-07, "loss": 0.4621, "step": 5719 }, { "epoch": 0.92, "grad_norm": 10.87619009861215, "learning_rate": 1.6000501158079217e-07, "loss": 0.3863, "step": 5720 }, { "epoch": 0.92, "grad_norm": 4.949451011883772, "learning_rate": 1.5935074861103183e-07, "loss": 0.3791, "step": 5721 }, { "epoch": 0.92, "grad_norm": 6.216317036441988, "learning_rate": 1.5869780436756078e-07, "loss": 0.426, "step": 5722 }, { "epoch": 0.92, "grad_norm": 6.149134998403504, "learning_rate": 1.5804617902825948e-07, "loss": 0.375, "step": 5723 }, { "epoch": 0.92, "grad_norm": 1.221450749403776, "learning_rate": 1.573958727706476e-07, "loss": 0.4528, "step": 5724 }, { "epoch": 0.92, "grad_norm": 8.105379535857221, "learning_rate": 1.567468857718879e-07, "loss": 0.3988, "step": 5725 }, { "epoch": 0.92, "grad_norm": 6.074944905391083, "learning_rate": 1.560992182087806e-07, "loss": 0.4348, "step": 5726 }, { "epoch": 0.92, "grad_norm": 4.979908624463976, "learning_rate": 1.554528702577701e-07, "loss": 0.4136, "step": 5727 }, { "epoch": 0.92, "grad_norm": 9.618657257238828, "learning_rate": 1.5480784209493837e-07, "loss": 0.4205, "step": 5728 }, { "epoch": 0.92, "grad_norm": 5.251143012145175, "learning_rate": 1.5416413389600816e-07, "loss": 0.4069, "step": 5729 }, { "epoch": 0.92, "grad_norm": 23.746872150690024, "learning_rate": 1.5352174583634526e-07, "loss": 0.4506, "step": 5730 }, { "epoch": 0.92, "grad_norm": 7.422963005313953, "learning_rate": 1.5288067809095196e-07, "loss": 0.4487, "step": 5731 }, { "epoch": 0.92, "grad_norm": 6.232360640315047, "learning_rate": 1.5224093083447354e-07, "loss": 0.4942, "step": 5732 }, { "epoch": 0.92, "grad_norm": 6.944194595825505, "learning_rate": 1.5160250424119505e-07, "loss": 0.3519, "step": 5733 }, { "epoch": 0.92, "grad_norm": 8.282486211522995, "learning_rate": 1.509653984850412e-07, "loss": 0.3591, "step": 5734 }, { "epoch": 0.92, "grad_norm": 8.743740952490462, "learning_rate": 1.5032961373957766e-07, "loss": 0.4343, "step": 5735 }, { "epoch": 0.92, "grad_norm": 0.99238269707156, "learning_rate": 1.496951501780086e-07, "loss": 0.4543, "step": 5736 }, { "epoch": 0.92, "grad_norm": 9.001438292428004, "learning_rate": 1.490620079731808e-07, "loss": 0.4172, "step": 5737 }, { "epoch": 0.92, "grad_norm": 5.906954103434556, "learning_rate": 1.4843018729757853e-07, "loss": 0.3922, "step": 5738 }, { "epoch": 0.92, "grad_norm": 6.47791874659914, "learning_rate": 1.4779968832332737e-07, "loss": 0.4132, "step": 5739 }, { "epoch": 0.92, "grad_norm": 5.025946833685009, "learning_rate": 1.4717051122219272e-07, "loss": 0.3783, "step": 5740 }, { "epoch": 0.92, "grad_norm": 7.484472352686724, "learning_rate": 1.4654265616557973e-07, "loss": 0.3814, "step": 5741 }, { "epoch": 0.93, "grad_norm": 7.450014671174384, "learning_rate": 1.459161233245343e-07, "loss": 0.3845, "step": 5742 }, { "epoch": 0.93, "grad_norm": 4.856514252857716, "learning_rate": 1.4529091286973994e-07, "loss": 0.4347, "step": 5743 }, { "epoch": 0.93, "grad_norm": 10.236074033375482, "learning_rate": 1.4466702497152208e-07, "loss": 0.3948, "step": 5744 }, { "epoch": 0.93, "grad_norm": 20.737271150611626, "learning_rate": 1.4404445979984473e-07, "loss": 0.4728, "step": 5745 }, { "epoch": 0.93, "grad_norm": 6.907427466254237, "learning_rate": 1.434232175243111e-07, "loss": 0.4026, "step": 5746 }, { "epoch": 0.93, "grad_norm": 9.218725467008014, "learning_rate": 1.4280329831416585e-07, "loss": 0.4277, "step": 5747 }, { "epoch": 0.93, "grad_norm": 6.544943070837199, "learning_rate": 1.4218470233829107e-07, "loss": 0.4163, "step": 5748 }, { "epoch": 0.93, "grad_norm": 7.391966666981644, "learning_rate": 1.415674297652103e-07, "loss": 0.4179, "step": 5749 }, { "epoch": 0.93, "grad_norm": 4.716714766436801, "learning_rate": 1.4095148076308518e-07, "loss": 0.4572, "step": 5750 }, { "epoch": 0.93, "grad_norm": 10.138736937389433, "learning_rate": 1.4033685549971643e-07, "loss": 0.3902, "step": 5751 }, { "epoch": 0.93, "grad_norm": 6.13777654887846, "learning_rate": 1.397235541425468e-07, "loss": 0.4258, "step": 5752 }, { "epoch": 0.93, "grad_norm": 7.806130950352292, "learning_rate": 1.3911157685865483e-07, "loss": 0.4422, "step": 5753 }, { "epoch": 0.93, "grad_norm": 56.00443958804668, "learning_rate": 1.3850092381476e-07, "loss": 0.4338, "step": 5754 }, { "epoch": 0.93, "grad_norm": 5.2342355818159865, "learning_rate": 1.3789159517722138e-07, "loss": 0.4427, "step": 5755 }, { "epoch": 0.93, "grad_norm": 6.283376878949043, "learning_rate": 1.3728359111203792e-07, "loss": 0.3929, "step": 5756 }, { "epoch": 0.93, "grad_norm": 6.4032650526334365, "learning_rate": 1.3667691178484598e-07, "loss": 0.4017, "step": 5757 }, { "epoch": 0.93, "grad_norm": 6.480814629253254, "learning_rate": 1.360715573609206e-07, "loss": 0.357, "step": 5758 }, { "epoch": 0.93, "grad_norm": 9.146528631849833, "learning_rate": 1.354675280051787e-07, "loss": 0.3866, "step": 5759 }, { "epoch": 0.93, "grad_norm": 7.164474112845113, "learning_rate": 1.348648238821737e-07, "loss": 0.3855, "step": 5760 }, { "epoch": 0.93, "grad_norm": 1.0382655116272952, "learning_rate": 1.3426344515609813e-07, "loss": 0.4254, "step": 5761 }, { "epoch": 0.93, "grad_norm": 1.2970577267304515, "learning_rate": 1.3366339199078538e-07, "loss": 0.4246, "step": 5762 }, { "epoch": 0.93, "grad_norm": 5.75442760296835, "learning_rate": 1.3306466454970634e-07, "loss": 0.4346, "step": 5763 }, { "epoch": 0.93, "grad_norm": 5.537737038293063, "learning_rate": 1.3246726299597058e-07, "loss": 0.475, "step": 5764 }, { "epoch": 0.93, "grad_norm": 7.237304254787766, "learning_rate": 1.318711874923262e-07, "loss": 0.4411, "step": 5765 }, { "epoch": 0.93, "grad_norm": 15.920798167218466, "learning_rate": 1.312764382011611e-07, "loss": 0.4365, "step": 5766 }, { "epoch": 0.93, "grad_norm": 7.756997851574939, "learning_rate": 1.3068301528450178e-07, "loss": 0.4368, "step": 5767 }, { "epoch": 0.93, "grad_norm": 6.868328241856368, "learning_rate": 1.3009091890401226e-07, "loss": 0.3537, "step": 5768 }, { "epoch": 0.93, "grad_norm": 9.354781934227097, "learning_rate": 1.2950014922099685e-07, "loss": 0.4488, "step": 5769 }, { "epoch": 0.93, "grad_norm": 9.335241831777957, "learning_rate": 1.2891070639639625e-07, "loss": 0.3471, "step": 5770 }, { "epoch": 0.93, "grad_norm": 18.338255987382198, "learning_rate": 1.28322590590792e-07, "loss": 0.4087, "step": 5771 }, { "epoch": 0.93, "grad_norm": 8.949105461969896, "learning_rate": 1.2773580196440262e-07, "loss": 0.4252, "step": 5772 }, { "epoch": 0.93, "grad_norm": 9.518901631882311, "learning_rate": 1.271503406770852e-07, "loss": 0.3942, "step": 5773 }, { "epoch": 0.93, "grad_norm": 15.921139372478414, "learning_rate": 1.265662068883361e-07, "loss": 0.397, "step": 5774 }, { "epoch": 0.93, "grad_norm": 1.0867527144085891, "learning_rate": 1.2598340075728967e-07, "loss": 0.4128, "step": 5775 }, { "epoch": 0.93, "grad_norm": 5.279062951916112, "learning_rate": 1.254019224427172e-07, "loss": 0.3784, "step": 5776 }, { "epoch": 0.93, "grad_norm": 5.409989086022639, "learning_rate": 1.2482177210303036e-07, "loss": 0.4499, "step": 5777 }, { "epoch": 0.93, "grad_norm": 7.967686677265175, "learning_rate": 1.242429498962783e-07, "loss": 0.4072, "step": 5778 }, { "epoch": 0.93, "grad_norm": 7.269043266987582, "learning_rate": 1.23665455980147e-07, "loss": 0.375, "step": 5779 }, { "epoch": 0.93, "grad_norm": 7.646761940051288, "learning_rate": 1.2308929051196296e-07, "loss": 0.4049, "step": 5780 }, { "epoch": 0.93, "grad_norm": 6.888657230322561, "learning_rate": 1.2251445364868886e-07, "loss": 0.4077, "step": 5781 }, { "epoch": 0.93, "grad_norm": 1.1415596147112, "learning_rate": 1.2194094554692614e-07, "loss": 0.4514, "step": 5782 }, { "epoch": 0.93, "grad_norm": 6.735622630760204, "learning_rate": 1.213687663629143e-07, "loss": 0.4083, "step": 5783 }, { "epoch": 0.93, "grad_norm": 10.085999038222868, "learning_rate": 1.2079791625253078e-07, "loss": 0.4123, "step": 5784 }, { "epoch": 0.93, "grad_norm": 11.992922934707742, "learning_rate": 1.2022839537129127e-07, "loss": 0.3911, "step": 5785 }, { "epoch": 0.93, "grad_norm": 8.934844099779076, "learning_rate": 1.196602038743483e-07, "loss": 0.4428, "step": 5786 }, { "epoch": 0.93, "grad_norm": 5.073061098502415, "learning_rate": 1.1909334191649302e-07, "loss": 0.3574, "step": 5787 }, { "epoch": 0.93, "grad_norm": 6.763205650607409, "learning_rate": 1.1852780965215471e-07, "loss": 0.4088, "step": 5788 }, { "epoch": 0.93, "grad_norm": 4.540506573593914, "learning_rate": 1.1796360723539956e-07, "loss": 0.4065, "step": 5789 }, { "epoch": 0.93, "grad_norm": 5.781266709194573, "learning_rate": 1.174007348199313e-07, "loss": 0.3503, "step": 5790 }, { "epoch": 0.93, "grad_norm": 14.571636224652636, "learning_rate": 1.1683919255909337e-07, "loss": 0.3965, "step": 5791 }, { "epoch": 0.93, "grad_norm": 6.199851786524772, "learning_rate": 1.1627898060586395e-07, "loss": 0.4313, "step": 5792 }, { "epoch": 0.93, "grad_norm": 9.210104138994689, "learning_rate": 1.1572009911286097e-07, "loss": 0.4181, "step": 5793 }, { "epoch": 0.93, "grad_norm": 9.622908778663875, "learning_rate": 1.151625482323393e-07, "loss": 0.4297, "step": 5794 }, { "epoch": 0.93, "grad_norm": 9.27573984580691, "learning_rate": 1.1460632811619021e-07, "loss": 0.4822, "step": 5795 }, { "epoch": 0.93, "grad_norm": 12.533227596168032, "learning_rate": 1.1405143891594417e-07, "loss": 0.4847, "step": 5796 }, { "epoch": 0.93, "grad_norm": 8.576708741714535, "learning_rate": 1.1349788078276857e-07, "loss": 0.422, "step": 5797 }, { "epoch": 0.93, "grad_norm": 6.206835263603047, "learning_rate": 1.129456538674667e-07, "loss": 0.3712, "step": 5798 }, { "epoch": 0.93, "grad_norm": 4.961290519001119, "learning_rate": 1.1239475832048152e-07, "loss": 0.442, "step": 5799 }, { "epoch": 0.93, "grad_norm": 7.0001864632818, "learning_rate": 1.1184519429189245e-07, "loss": 0.4672, "step": 5800 }, { "epoch": 0.93, "grad_norm": 8.141258510606905, "learning_rate": 1.1129696193141414e-07, "loss": 0.3983, "step": 5801 }, { "epoch": 0.93, "grad_norm": 6.6674470744303855, "learning_rate": 1.107500613884016e-07, "loss": 0.4538, "step": 5802 }, { "epoch": 0.93, "grad_norm": 7.114301779444167, "learning_rate": 1.1020449281184565e-07, "loss": 0.3953, "step": 5803 }, { "epoch": 0.94, "grad_norm": 5.486918232967572, "learning_rate": 1.0966025635037403e-07, "loss": 0.3877, "step": 5804 }, { "epoch": 0.94, "grad_norm": 9.006151838538086, "learning_rate": 1.0911735215225095e-07, "loss": 0.3975, "step": 5805 }, { "epoch": 0.94, "grad_norm": 6.485855712234616, "learning_rate": 1.0857578036537919e-07, "loss": 0.397, "step": 5806 }, { "epoch": 0.94, "grad_norm": 6.333085687940403, "learning_rate": 1.0803554113729797e-07, "loss": 0.3897, "step": 5807 }, { "epoch": 0.94, "grad_norm": 9.402562581558769, "learning_rate": 1.0749663461518344e-07, "loss": 0.3487, "step": 5808 }, { "epoch": 0.94, "grad_norm": 7.200716967468235, "learning_rate": 1.0695906094584763e-07, "loss": 0.385, "step": 5809 }, { "epoch": 0.94, "grad_norm": 7.0716164350194015, "learning_rate": 1.0642282027574168e-07, "loss": 0.4614, "step": 5810 }, { "epoch": 0.94, "grad_norm": 4.562872904319401, "learning_rate": 1.0588791275095156e-07, "loss": 0.4152, "step": 5811 }, { "epoch": 0.94, "grad_norm": 6.382719579352955, "learning_rate": 1.0535433851720067e-07, "loss": 0.4263, "step": 5812 }, { "epoch": 0.94, "grad_norm": 6.135548221215424, "learning_rate": 1.0482209771985052e-07, "loss": 0.4131, "step": 5813 }, { "epoch": 0.94, "grad_norm": 8.925986057512551, "learning_rate": 1.0429119050389624e-07, "loss": 0.4009, "step": 5814 }, { "epoch": 0.94, "grad_norm": 8.521372580819769, "learning_rate": 1.0376161701397325e-07, "loss": 0.3809, "step": 5815 }, { "epoch": 0.94, "grad_norm": 7.258351875059061, "learning_rate": 1.0323337739435169e-07, "loss": 0.4214, "step": 5816 }, { "epoch": 0.94, "grad_norm": 7.1623260769387125, "learning_rate": 1.0270647178893756e-07, "loss": 0.3965, "step": 5817 }, { "epoch": 0.94, "grad_norm": 6.767774577862997, "learning_rate": 1.0218090034127603e-07, "loss": 0.3732, "step": 5818 }, { "epoch": 0.94, "grad_norm": 1.209114100699132, "learning_rate": 1.0165666319454592e-07, "loss": 0.4681, "step": 5819 }, { "epoch": 0.94, "grad_norm": 26.165265312192133, "learning_rate": 1.011337604915641e-07, "loss": 0.3606, "step": 5820 }, { "epoch": 0.94, "grad_norm": 12.685863718973767, "learning_rate": 1.0061219237478382e-07, "loss": 0.3566, "step": 5821 }, { "epoch": 0.94, "grad_norm": 7.286592878355289, "learning_rate": 1.0009195898629532e-07, "loss": 0.4823, "step": 5822 }, { "epoch": 0.94, "grad_norm": 9.583253894335648, "learning_rate": 9.957306046782411e-08, "loss": 0.3735, "step": 5823 }, { "epoch": 0.94, "grad_norm": 8.836992136466474, "learning_rate": 9.905549696073213e-08, "loss": 0.3891, "step": 5824 }, { "epoch": 0.94, "grad_norm": 5.8262920469017, "learning_rate": 9.85392686060177e-08, "loss": 0.4681, "step": 5825 }, { "epoch": 0.94, "grad_norm": 4.65573669329764, "learning_rate": 9.802437554431665e-08, "loss": 0.3815, "step": 5826 }, { "epoch": 0.94, "grad_norm": 7.335559481630403, "learning_rate": 9.7510817915899e-08, "loss": 0.4572, "step": 5827 }, { "epoch": 0.94, "grad_norm": 6.226576757282827, "learning_rate": 9.699859586067229e-08, "loss": 0.426, "step": 5828 }, { "epoch": 0.94, "grad_norm": 7.2284310102423746, "learning_rate": 9.648770951818098e-08, "loss": 0.4938, "step": 5829 }, { "epoch": 0.94, "grad_norm": 16.295223570527472, "learning_rate": 9.597815902760321e-08, "loss": 0.444, "step": 5830 }, { "epoch": 0.94, "grad_norm": 7.891958484061828, "learning_rate": 9.54699445277546e-08, "loss": 0.4212, "step": 5831 }, { "epoch": 0.94, "grad_norm": 16.51412270214544, "learning_rate": 9.496306615708773e-08, "loss": 0.442, "step": 5832 }, { "epoch": 0.94, "grad_norm": 6.652512388286109, "learning_rate": 9.44575240536899e-08, "loss": 0.3582, "step": 5833 }, { "epoch": 0.94, "grad_norm": 6.67244200555198, "learning_rate": 9.395331835528431e-08, "loss": 0.4237, "step": 5834 }, { "epoch": 0.94, "grad_norm": 8.067059554441094, "learning_rate": 9.345044919923108e-08, "loss": 0.4106, "step": 5835 }, { "epoch": 0.94, "grad_norm": 7.6210532084480445, "learning_rate": 9.294891672252448e-08, "loss": 0.4206, "step": 5836 }, { "epoch": 0.94, "grad_norm": 7.678608786024188, "learning_rate": 9.244872106179748e-08, "loss": 0.3936, "step": 5837 }, { "epoch": 0.94, "grad_norm": 4.0062816596510284, "learning_rate": 9.194986235331604e-08, "loss": 0.3958, "step": 5838 }, { "epoch": 0.94, "grad_norm": 4.972339549688348, "learning_rate": 9.145234073298314e-08, "loss": 0.3936, "step": 5839 }, { "epoch": 0.94, "grad_norm": 8.548191144077434, "learning_rate": 9.095615633633814e-08, "loss": 0.3626, "step": 5840 }, { "epoch": 0.94, "grad_norm": 12.053233552367903, "learning_rate": 9.046130929855401e-08, "loss": 0.4655, "step": 5841 }, { "epoch": 0.94, "grad_norm": 5.514570018198193, "learning_rate": 8.99677997544418e-08, "loss": 0.4338, "step": 5842 }, { "epoch": 0.94, "grad_norm": 5.01981552612846, "learning_rate": 8.947562783844677e-08, "loss": 0.4357, "step": 5843 }, { "epoch": 0.94, "grad_norm": 6.077244655508158, "learning_rate": 8.898479368464996e-08, "loss": 0.4475, "step": 5844 }, { "epoch": 0.94, "grad_norm": 9.963062713542467, "learning_rate": 8.849529742676887e-08, "loss": 0.4027, "step": 5845 }, { "epoch": 0.94, "grad_norm": 5.142137641847229, "learning_rate": 8.800713919815407e-08, "loss": 0.4383, "step": 5846 }, { "epoch": 0.94, "grad_norm": 6.180726440621798, "learning_rate": 8.752031913179527e-08, "loss": 0.3988, "step": 5847 }, { "epoch": 0.94, "grad_norm": 8.331591741468932, "learning_rate": 8.70348373603147e-08, "loss": 0.4633, "step": 5848 }, { "epoch": 0.94, "grad_norm": 10.770742271464655, "learning_rate": 8.655069401597105e-08, "loss": 0.4182, "step": 5849 }, { "epoch": 0.94, "grad_norm": 11.587774022867231, "learning_rate": 8.606788923065824e-08, "loss": 0.3881, "step": 5850 }, { "epoch": 0.94, "grad_norm": 7.058609342425633, "learning_rate": 8.558642313590603e-08, "loss": 0.3761, "step": 5851 }, { "epoch": 0.94, "grad_norm": 6.825711552891778, "learning_rate": 8.510629586287844e-08, "loss": 0.3624, "step": 5852 }, { "epoch": 0.94, "grad_norm": 7.264032076776157, "learning_rate": 8.462750754237581e-08, "loss": 0.4062, "step": 5853 }, { "epoch": 0.94, "grad_norm": 6.5244532020392185, "learning_rate": 8.415005830483324e-08, "loss": 0.3587, "step": 5854 }, { "epoch": 0.94, "grad_norm": 11.225856190189385, "learning_rate": 8.367394828032116e-08, "loss": 0.4615, "step": 5855 }, { "epoch": 0.94, "grad_norm": 4.573829437438821, "learning_rate": 8.319917759854413e-08, "loss": 0.3417, "step": 5856 }, { "epoch": 0.94, "grad_norm": 16.02296492438904, "learning_rate": 8.272574638884423e-08, "loss": 0.466, "step": 5857 }, { "epoch": 0.94, "grad_norm": 5.79364953230151, "learning_rate": 8.225365478019664e-08, "loss": 0.4119, "step": 5858 }, { "epoch": 0.94, "grad_norm": 6.846320201797515, "learning_rate": 8.178290290121127e-08, "loss": 0.3758, "step": 5859 }, { "epoch": 0.94, "grad_norm": 4.725858630667987, "learning_rate": 8.131349088013495e-08, "loss": 0.5032, "step": 5860 }, { "epoch": 0.94, "grad_norm": 6.234053807948427, "learning_rate": 8.08454188448482e-08, "loss": 0.4746, "step": 5861 }, { "epoch": 0.94, "grad_norm": 10.048455527480266, "learning_rate": 8.037868692286677e-08, "loss": 0.409, "step": 5862 }, { "epoch": 0.94, "grad_norm": 10.65406336088548, "learning_rate": 7.991329524134062e-08, "loss": 0.4167, "step": 5863 }, { "epoch": 0.94, "grad_norm": 7.684963797557379, "learning_rate": 7.944924392705666e-08, "loss": 0.4379, "step": 5864 }, { "epoch": 0.94, "grad_norm": 11.321740926565184, "learning_rate": 7.898653310643378e-08, "loss": 0.4127, "step": 5865 }, { "epoch": 0.95, "grad_norm": 5.667911571802254, "learning_rate": 7.852516290552781e-08, "loss": 0.4098, "step": 5866 }, { "epoch": 0.95, "grad_norm": 6.479981074795613, "learning_rate": 7.80651334500293e-08, "loss": 0.4091, "step": 5867 }, { "epoch": 0.95, "grad_norm": 9.813719685254274, "learning_rate": 7.760644486526137e-08, "loss": 0.3635, "step": 5868 }, { "epoch": 0.95, "grad_norm": 14.118240563554707, "learning_rate": 7.714909727618569e-08, "loss": 0.4583, "step": 5869 }, { "epoch": 0.95, "grad_norm": 4.4876422558385105, "learning_rate": 7.669309080739429e-08, "loss": 0.4245, "step": 5870 }, { "epoch": 0.95, "grad_norm": 5.506215435477915, "learning_rate": 7.623842558311668e-08, "loss": 0.4258, "step": 5871 }, { "epoch": 0.95, "grad_norm": 4.858537272584325, "learning_rate": 7.578510172721598e-08, "loss": 0.409, "step": 5872 }, { "epoch": 0.95, "grad_norm": 7.141470147091458, "learning_rate": 7.533311936319121e-08, "loss": 0.3829, "step": 5873 }, { "epoch": 0.95, "grad_norm": 6.657257389962846, "learning_rate": 7.48824786141733e-08, "loss": 0.4403, "step": 5874 }, { "epoch": 0.95, "grad_norm": 6.344094551086219, "learning_rate": 7.443317960293018e-08, "loss": 0.4, "step": 5875 }, { "epoch": 0.95, "grad_norm": 8.33474184075586, "learning_rate": 7.39852224518628e-08, "loss": 0.404, "step": 5876 }, { "epoch": 0.95, "grad_norm": 10.407355835805284, "learning_rate": 7.353860728300743e-08, "loss": 0.4065, "step": 5877 }, { "epoch": 0.95, "grad_norm": 20.59488469179399, "learning_rate": 7.3093334218034e-08, "loss": 0.4034, "step": 5878 }, { "epoch": 0.95, "grad_norm": 8.22075818680981, "learning_rate": 7.264940337824767e-08, "loss": 0.4411, "step": 5879 }, { "epoch": 0.95, "grad_norm": 6.6086331895969606, "learning_rate": 7.22068148845867e-08, "loss": 0.4129, "step": 5880 }, { "epoch": 0.95, "grad_norm": 7.725880053857819, "learning_rate": 7.176556885762465e-08, "loss": 0.4562, "step": 5881 }, { "epoch": 0.95, "grad_norm": 8.86585837054657, "learning_rate": 7.132566541756925e-08, "loss": 0.4635, "step": 5882 }, { "epoch": 0.95, "grad_norm": 6.384710363284801, "learning_rate": 7.088710468426241e-08, "loss": 0.4583, "step": 5883 }, { "epoch": 0.95, "grad_norm": 12.045637904444138, "learning_rate": 7.044988677718023e-08, "loss": 0.3897, "step": 5884 }, { "epoch": 0.95, "grad_norm": 7.2931893617965144, "learning_rate": 7.001401181543243e-08, "loss": 0.4356, "step": 5885 }, { "epoch": 0.95, "grad_norm": 23.042045191042856, "learning_rate": 6.957947991776403e-08, "loss": 0.4389, "step": 5886 }, { "epoch": 0.95, "grad_norm": 9.900109059428903, "learning_rate": 6.914629120255312e-08, "loss": 0.4223, "step": 5887 }, { "epoch": 0.95, "grad_norm": 5.835897248992323, "learning_rate": 6.87144457878114e-08, "loss": 0.4284, "step": 5888 }, { "epoch": 0.95, "grad_norm": 7.708066293394715, "learning_rate": 6.828394379118752e-08, "loss": 0.4358, "step": 5889 }, { "epoch": 0.95, "grad_norm": 6.776882452938125, "learning_rate": 6.785478532995993e-08, "loss": 0.3985, "step": 5890 }, { "epoch": 0.95, "grad_norm": 4.700469630526049, "learning_rate": 6.742697052104507e-08, "loss": 0.3593, "step": 5891 }, { "epoch": 0.95, "grad_norm": 9.260048895223225, "learning_rate": 6.70004994809903e-08, "loss": 0.4361, "step": 5892 }, { "epoch": 0.95, "grad_norm": 7.654670722886092, "learning_rate": 6.657537232597766e-08, "loss": 0.4132, "step": 5893 }, { "epoch": 0.95, "grad_norm": 8.49725715984542, "learning_rate": 6.615158917182507e-08, "loss": 0.389, "step": 5894 }, { "epoch": 0.95, "grad_norm": 5.746975826596797, "learning_rate": 6.572915013398184e-08, "loss": 0.4404, "step": 5895 }, { "epoch": 0.95, "grad_norm": 4.791142248871385, "learning_rate": 6.530805532753204e-08, "loss": 0.3724, "step": 5896 }, { "epoch": 0.95, "grad_norm": 5.71437291283, "learning_rate": 6.488830486719333e-08, "loss": 0.4229, "step": 5897 }, { "epoch": 0.95, "grad_norm": 5.822357403467221, "learning_rate": 6.446989886731758e-08, "loss": 0.4168, "step": 5898 }, { "epoch": 0.95, "grad_norm": 6.262449623790251, "learning_rate": 6.405283744189027e-08, "loss": 0.4336, "step": 5899 }, { "epoch": 0.95, "grad_norm": 5.701487737129059, "learning_rate": 6.363712070452999e-08, "loss": 0.4514, "step": 5900 }, { "epoch": 0.95, "grad_norm": 6.290442548069577, "learning_rate": 6.322274876848944e-08, "loss": 0.4164, "step": 5901 }, { "epoch": 0.95, "grad_norm": 11.965618092778563, "learning_rate": 6.280972174665611e-08, "loss": 0.4178, "step": 5902 }, { "epoch": 0.95, "grad_norm": 5.827712682324868, "learning_rate": 6.239803975154835e-08, "loss": 0.3925, "step": 5903 }, { "epoch": 0.95, "grad_norm": 10.917158579931378, "learning_rate": 6.19877028953203e-08, "loss": 0.373, "step": 5904 }, { "epoch": 0.95, "grad_norm": 13.140587831327757, "learning_rate": 6.157871128975923e-08, "loss": 0.4259, "step": 5905 }, { "epoch": 0.95, "grad_norm": 10.124391970048807, "learning_rate": 6.1171065046286e-08, "loss": 0.4616, "step": 5906 }, { "epoch": 0.95, "grad_norm": 6.013872641561472, "learning_rate": 6.076476427595345e-08, "loss": 0.4711, "step": 5907 }, { "epoch": 0.95, "grad_norm": 7.1520028372818825, "learning_rate": 6.035980908945027e-08, "loss": 0.4525, "step": 5908 }, { "epoch": 0.95, "grad_norm": 5.280795576400204, "learning_rate": 5.995619959709764e-08, "loss": 0.4068, "step": 5909 }, { "epoch": 0.95, "grad_norm": 6.071374327806699, "learning_rate": 5.9553935908848724e-08, "loss": 0.4113, "step": 5910 }, { "epoch": 0.95, "grad_norm": 15.592313663552922, "learning_rate": 5.915301813429197e-08, "loss": 0.3669, "step": 5911 }, { "epoch": 0.95, "grad_norm": 7.7435192998579385, "learning_rate": 5.875344638264835e-08, "loss": 0.4452, "step": 5912 }, { "epoch": 0.95, "grad_norm": 23.5835348039031, "learning_rate": 5.835522076277189e-08, "loss": 0.3979, "step": 5913 }, { "epoch": 0.95, "grad_norm": 7.786179809279504, "learning_rate": 5.795834138315137e-08, "loss": 0.4429, "step": 5914 }, { "epoch": 0.95, "grad_norm": 4.909885190016353, "learning_rate": 5.756280835190586e-08, "loss": 0.4484, "step": 5915 }, { "epoch": 0.95, "grad_norm": 12.073537111284496, "learning_rate": 5.7168621776790836e-08, "loss": 0.4176, "step": 5916 }, { "epoch": 0.95, "grad_norm": 1.3233130574130547, "learning_rate": 5.6775781765193714e-08, "loss": 0.4256, "step": 5917 }, { "epoch": 0.95, "grad_norm": 5.081024513234629, "learning_rate": 5.638428842413335e-08, "loss": 0.3803, "step": 5918 }, { "epoch": 0.95, "grad_norm": 5.20612359915436, "learning_rate": 5.599414186026497e-08, "loss": 0.4613, "step": 5919 }, { "epoch": 0.95, "grad_norm": 9.012360383680965, "learning_rate": 5.5605342179874676e-08, "loss": 0.4371, "step": 5920 }, { "epoch": 0.95, "grad_norm": 11.770986569614706, "learning_rate": 5.5217889488882734e-08, "loss": 0.4802, "step": 5921 }, { "epoch": 0.95, "grad_norm": 8.028635249280748, "learning_rate": 5.4831783892840275e-08, "loss": 0.403, "step": 5922 }, { "epoch": 0.95, "grad_norm": 5.043275926565666, "learning_rate": 5.444702549693481e-08, "loss": 0.3939, "step": 5923 }, { "epoch": 0.95, "grad_norm": 5.5037434684148785, "learning_rate": 5.406361440598529e-08, "loss": 0.3936, "step": 5924 }, { "epoch": 0.95, "grad_norm": 8.1938582052948, "learning_rate": 5.368155072444148e-08, "loss": 0.3659, "step": 5925 }, { "epoch": 0.95, "grad_norm": 28.90732576803218, "learning_rate": 5.330083455638957e-08, "loss": 0.4069, "step": 5926 }, { "epoch": 0.95, "grad_norm": 4.359009605500593, "learning_rate": 5.2921466005547706e-08, "loss": 0.3961, "step": 5927 }, { "epoch": 0.96, "grad_norm": 8.13153873348958, "learning_rate": 5.2543445175264883e-08, "loss": 0.4379, "step": 5928 }, { "epoch": 0.96, "grad_norm": 4.782696909653592, "learning_rate": 5.216677216852539e-08, "loss": 0.3796, "step": 5929 }, { "epoch": 0.96, "grad_norm": 13.692619169191119, "learning_rate": 5.179144708794437e-08, "loss": 0.4051, "step": 5930 }, { "epoch": 0.96, "grad_norm": 8.08143424869347, "learning_rate": 5.141747003577224e-08, "loss": 0.3922, "step": 5931 }, { "epoch": 0.96, "grad_norm": 7.4843549854584195, "learning_rate": 5.104484111388919e-08, "loss": 0.4962, "step": 5932 }, { "epoch": 0.96, "grad_norm": 7.245580664891651, "learning_rate": 5.067356042381011e-08, "loss": 0.3699, "step": 5933 }, { "epoch": 0.96, "grad_norm": 12.121654555619664, "learning_rate": 5.0303628066681874e-08, "loss": 0.396, "step": 5934 }, { "epoch": 0.96, "grad_norm": 4.493532646825219, "learning_rate": 4.9935044143284984e-08, "loss": 0.4032, "step": 5935 }, { "epoch": 0.96, "grad_norm": 6.136654045646493, "learning_rate": 4.956780875403189e-08, "loss": 0.3645, "step": 5936 }, { "epoch": 0.96, "grad_norm": 16.803513505680726, "learning_rate": 4.920192199896645e-08, "loss": 0.3717, "step": 5937 }, { "epoch": 0.96, "grad_norm": 4.894621661522535, "learning_rate": 4.883738397776727e-08, "loss": 0.4214, "step": 5938 }, { "epoch": 0.96, "grad_norm": 5.902692149765108, "learning_rate": 4.84741947897438e-08, "loss": 0.3576, "step": 5939 }, { "epoch": 0.96, "grad_norm": 6.696671894687354, "learning_rate": 4.8112354533839664e-08, "loss": 0.4199, "step": 5940 }, { "epoch": 0.96, "grad_norm": 8.935088507076216, "learning_rate": 4.775186330862991e-08, "loss": 0.4368, "step": 5941 }, { "epoch": 0.96, "grad_norm": 11.359034999257965, "learning_rate": 4.7392721212322076e-08, "loss": 0.4076, "step": 5942 }, { "epoch": 0.96, "grad_norm": 5.156012424957383, "learning_rate": 4.703492834275625e-08, "loss": 0.3896, "step": 5943 }, { "epoch": 0.96, "grad_norm": 6.342539611611853, "learning_rate": 4.6678484797405e-08, "loss": 0.411, "step": 5944 }, { "epoch": 0.96, "grad_norm": 7.0700500862174644, "learning_rate": 4.6323390673373434e-08, "loss": 0.4819, "step": 5945 }, { "epoch": 0.96, "grad_norm": 7.407487929296645, "learning_rate": 4.5969646067400285e-08, "loss": 0.4312, "step": 5946 }, { "epoch": 0.96, "grad_norm": 8.847861297694688, "learning_rate": 4.561725107585346e-08, "loss": 0.4124, "step": 5947 }, { "epoch": 0.96, "grad_norm": 7.557253604952598, "learning_rate": 4.5266205794735617e-08, "loss": 0.4415, "step": 5948 }, { "epoch": 0.96, "grad_norm": 5.622285848101991, "learning_rate": 4.491651031968136e-08, "loss": 0.4634, "step": 5949 }, { "epoch": 0.96, "grad_norm": 1.1098755126128153, "learning_rate": 4.456816474595782e-08, "loss": 0.4185, "step": 5950 }, { "epoch": 0.96, "grad_norm": 10.545771803147769, "learning_rate": 4.4221169168462975e-08, "loss": 0.3651, "step": 5951 }, { "epoch": 0.96, "grad_norm": 6.722286213623457, "learning_rate": 4.387552368172898e-08, "loss": 0.4034, "step": 5952 }, { "epoch": 0.96, "grad_norm": 5.306308663369756, "learning_rate": 4.35312283799183e-08, "loss": 0.4065, "step": 5953 }, { "epoch": 0.96, "grad_norm": 4.959896793442997, "learning_rate": 4.318828335682701e-08, "loss": 0.4284, "step": 5954 }, { "epoch": 0.96, "grad_norm": 5.700360230701288, "learning_rate": 4.28466887058826e-08, "loss": 0.4077, "step": 5955 }, { "epoch": 0.96, "grad_norm": 5.503813471126561, "learning_rate": 4.250644452014507e-08, "loss": 0.4456, "step": 5956 }, { "epoch": 0.96, "grad_norm": 18.6928897347029, "learning_rate": 4.216755089230584e-08, "loss": 0.3787, "step": 5957 }, { "epoch": 0.96, "grad_norm": 4.928056008004031, "learning_rate": 4.183000791468994e-08, "loss": 0.4804, "step": 5958 }, { "epoch": 0.96, "grad_norm": 12.208786307475588, "learning_rate": 4.149381567925215e-08, "loss": 0.4481, "step": 5959 }, { "epoch": 0.96, "grad_norm": 7.361227644459601, "learning_rate": 4.1158974277580866e-08, "loss": 0.4201, "step": 5960 }, { "epoch": 0.96, "grad_norm": 6.394638556460001, "learning_rate": 4.0825483800895905e-08, "loss": 0.4183, "step": 5961 }, { "epoch": 0.96, "grad_norm": 5.6884245863734, "learning_rate": 4.04933443400507e-08, "loss": 0.354, "step": 5962 }, { "epoch": 0.96, "grad_norm": 5.721707770805464, "learning_rate": 4.0162555985526766e-08, "loss": 0.3642, "step": 5963 }, { "epoch": 0.96, "grad_norm": 6.619694059761135, "learning_rate": 3.9833118827442565e-08, "loss": 0.4028, "step": 5964 }, { "epoch": 0.96, "grad_norm": 6.30243908453077, "learning_rate": 3.950503295554409e-08, "loss": 0.4487, "step": 5965 }, { "epoch": 0.96, "grad_norm": 1.0902467334362889, "learning_rate": 3.917829845921095e-08, "loss": 0.4376, "step": 5966 }, { "epoch": 0.96, "grad_norm": 6.443638228152987, "learning_rate": 3.885291542745584e-08, "loss": 0.3356, "step": 5967 }, { "epoch": 0.96, "grad_norm": 4.076784529531071, "learning_rate": 3.8528883948921183e-08, "loss": 0.3764, "step": 5968 }, { "epoch": 0.96, "grad_norm": 8.819100128845204, "learning_rate": 3.8206204111882475e-08, "loss": 0.4556, "step": 5969 }, { "epoch": 0.96, "grad_norm": 1.0060458522836582, "learning_rate": 3.788487600424606e-08, "loss": 0.4349, "step": 5970 }, { "epoch": 0.96, "grad_norm": 5.855538328597058, "learning_rate": 3.7564899713550815e-08, "loss": 0.4344, "step": 5971 }, { "epoch": 0.96, "grad_norm": 4.305390947429676, "learning_rate": 3.724627532696812e-08, "loss": 0.3409, "step": 5972 }, { "epoch": 0.96, "grad_norm": 9.311060838478724, "learning_rate": 3.6929002931297975e-08, "loss": 0.4236, "step": 5973 }, { "epoch": 0.96, "grad_norm": 4.741499464320747, "learning_rate": 3.661308261297625e-08, "loss": 0.348, "step": 5974 }, { "epoch": 0.96, "grad_norm": 7.928163664086753, "learning_rate": 3.629851445806687e-08, "loss": 0.4815, "step": 5975 }, { "epoch": 0.96, "grad_norm": 6.632046612815195, "learning_rate": 3.59852985522674e-08, "loss": 0.4048, "step": 5976 }, { "epoch": 0.96, "grad_norm": 7.010620011046495, "learning_rate": 3.5673434980906806e-08, "loss": 0.4678, "step": 5977 }, { "epoch": 0.96, "grad_norm": 7.382062508792096, "learning_rate": 3.536292382894435e-08, "loss": 0.4098, "step": 5978 }, { "epoch": 0.96, "grad_norm": 1.3178493772666569, "learning_rate": 3.505376518097292e-08, "loss": 0.4851, "step": 5979 }, { "epoch": 0.96, "grad_norm": 10.621455062820207, "learning_rate": 3.474595912121514e-08, "loss": 0.4085, "step": 5980 }, { "epoch": 0.96, "grad_norm": 1.280743721388792, "learning_rate": 3.443950573352672e-08, "loss": 0.4556, "step": 5981 }, { "epoch": 0.96, "grad_norm": 5.774075484445219, "learning_rate": 3.41344051013931e-08, "loss": 0.3951, "step": 5982 }, { "epoch": 0.96, "grad_norm": 13.129593829413892, "learning_rate": 3.3830657307932224e-08, "loss": 0.3486, "step": 5983 }, { "epoch": 0.96, "grad_norm": 7.145772291670653, "learning_rate": 3.352826243589346e-08, "loss": 0.4234, "step": 5984 }, { "epoch": 0.96, "grad_norm": 6.821949491111831, "learning_rate": 3.322722056765759e-08, "loss": 0.416, "step": 5985 }, { "epoch": 0.96, "grad_norm": 12.153632784286645, "learning_rate": 3.292753178523733e-08, "loss": 0.3008, "step": 5986 }, { "epoch": 0.96, "grad_norm": 7.116915657369894, "learning_rate": 3.262919617027516e-08, "loss": 0.3465, "step": 5987 }, { "epoch": 0.96, "grad_norm": 8.121280067440487, "learning_rate": 3.233221380404605e-08, "loss": 0.472, "step": 5988 }, { "epoch": 0.96, "grad_norm": 12.925447474057027, "learning_rate": 3.2036584767456965e-08, "loss": 0.4176, "step": 5989 }, { "epoch": 0.97, "grad_norm": 5.974661526885103, "learning_rate": 3.1742309141044594e-08, "loss": 0.4365, "step": 5990 }, { "epoch": 0.97, "grad_norm": 6.200548717387913, "learning_rate": 3.1449387004978125e-08, "loss": 0.4252, "step": 5991 }, { "epoch": 0.97, "grad_norm": 5.323820782915598, "learning_rate": 3.115781843905763e-08, "loss": 0.44, "step": 5992 }, { "epoch": 0.97, "grad_norm": 11.325014056647124, "learning_rate": 3.086760352271401e-08, "loss": 0.3958, "step": 5993 }, { "epoch": 0.97, "grad_norm": 5.1471808269529475, "learning_rate": 3.057874233501068e-08, "loss": 0.3872, "step": 5994 }, { "epoch": 0.97, "grad_norm": 7.138364169444989, "learning_rate": 3.0291234954640256e-08, "loss": 0.4126, "step": 5995 }, { "epoch": 0.97, "grad_norm": 6.070428328212516, "learning_rate": 3.0005081459928976e-08, "loss": 0.4044, "step": 5996 }, { "epoch": 0.97, "grad_norm": 6.027306126888241, "learning_rate": 2.9720281928831694e-08, "loss": 0.3495, "step": 5997 }, { "epoch": 0.97, "grad_norm": 15.484251643676506, "learning_rate": 2.9436836438936356e-08, "loss": 0.3925, "step": 5998 }, { "epoch": 0.97, "grad_norm": 4.981061043060834, "learning_rate": 2.9154745067460632e-08, "loss": 0.387, "step": 5999 }, { "epoch": 0.97, "grad_norm": 8.013327818517764, "learning_rate": 2.8874007891255273e-08, "loss": 0.4435, "step": 6000 }, { "epoch": 0.97, "grad_norm": 7.845035399263454, "learning_rate": 2.859462498679966e-08, "loss": 0.4426, "step": 6001 }, { "epoch": 0.97, "grad_norm": 5.067345512731572, "learning_rate": 2.831659643020568e-08, "loss": 0.4305, "step": 6002 }, { "epoch": 0.97, "grad_norm": 4.050833092650646, "learning_rate": 2.8039922297216638e-08, "loss": 0.4216, "step": 6003 }, { "epoch": 0.97, "grad_norm": 7.716401291648597, "learning_rate": 2.776460266320502e-08, "loss": 0.4236, "step": 6004 }, { "epoch": 0.97, "grad_norm": 6.350410826299381, "learning_rate": 2.7490637603176385e-08, "loss": 0.3445, "step": 6005 }, { "epoch": 0.97, "grad_norm": 8.35827400590957, "learning_rate": 2.7218027191766027e-08, "loss": 0.3516, "step": 6006 }, { "epoch": 0.97, "grad_norm": 9.334357280179175, "learning_rate": 2.6946771503240653e-08, "loss": 0.41, "step": 6007 }, { "epoch": 0.97, "grad_norm": 4.768189884383511, "learning_rate": 2.6676870611497817e-08, "loss": 0.3524, "step": 6008 }, { "epoch": 0.97, "grad_norm": 4.548425267408917, "learning_rate": 2.6408324590065926e-08, "loss": 0.4556, "step": 6009 }, { "epoch": 0.97, "grad_norm": 6.4917488053342565, "learning_rate": 2.6141133512103677e-08, "loss": 0.3823, "step": 6010 }, { "epoch": 0.97, "grad_norm": 7.272435594183172, "learning_rate": 2.5875297450402848e-08, "loss": 0.462, "step": 6011 }, { "epoch": 0.97, "grad_norm": 35.1506432110317, "learning_rate": 2.5610816477382728e-08, "loss": 0.4226, "step": 6012 }, { "epoch": 0.97, "grad_norm": 7.088717262821666, "learning_rate": 2.5347690665096236e-08, "loss": 0.428, "step": 6013 }, { "epoch": 0.97, "grad_norm": 6.822472235741214, "learning_rate": 2.5085920085226035e-08, "loss": 0.3644, "step": 6014 }, { "epoch": 0.97, "grad_norm": 4.80356632167269, "learning_rate": 2.482550480908563e-08, "loss": 0.4111, "step": 6015 }, { "epoch": 0.97, "grad_norm": 10.576413389135011, "learning_rate": 2.456644490761939e-08, "loss": 0.4035, "step": 6016 }, { "epoch": 0.97, "grad_norm": 6.073589034326763, "learning_rate": 2.4308740451401413e-08, "loss": 0.4074, "step": 6017 }, { "epoch": 0.97, "grad_norm": 6.101722832469393, "learning_rate": 2.4052391510638873e-08, "loss": 0.456, "step": 6018 }, { "epoch": 0.97, "grad_norm": 7.188207502027041, "learning_rate": 2.379739815516757e-08, "loss": 0.4172, "step": 6019 }, { "epoch": 0.97, "grad_norm": 9.640262788199468, "learning_rate": 2.354376045445472e-08, "loss": 0.3838, "step": 6020 }, { "epoch": 0.97, "grad_norm": 7.859405950750676, "learning_rate": 2.3291478477598383e-08, "loss": 0.4327, "step": 6021 }, { "epoch": 0.97, "grad_norm": 5.160002489421595, "learning_rate": 2.304055229332691e-08, "loss": 0.3565, "step": 6022 }, { "epoch": 0.97, "grad_norm": 7.910831820358226, "learning_rate": 2.279098197000007e-08, "loss": 0.4103, "step": 6023 }, { "epoch": 0.97, "grad_norm": 7.3546984441467425, "learning_rate": 2.254276757560736e-08, "loss": 0.4485, "step": 6024 }, { "epoch": 0.97, "grad_norm": 7.163151368607638, "learning_rate": 2.2295909177769138e-08, "loss": 0.4853, "step": 6025 }, { "epoch": 0.97, "grad_norm": 6.7349808329194625, "learning_rate": 2.205040684373605e-08, "loss": 0.3533, "step": 6026 }, { "epoch": 0.97, "grad_norm": 7.222925335470593, "learning_rate": 2.1806260640390155e-08, "loss": 0.432, "step": 6027 }, { "epoch": 0.97, "grad_norm": 5.4717750983998545, "learning_rate": 2.15634706342438e-08, "loss": 0.353, "step": 6028 }, { "epoch": 0.97, "grad_norm": 5.799953132698664, "learning_rate": 2.1322036891439634e-08, "loss": 0.4373, "step": 6029 }, { "epoch": 0.97, "grad_norm": 10.125779045981059, "learning_rate": 2.1081959477750604e-08, "loss": 0.5125, "step": 6030 }, { "epoch": 0.97, "grad_norm": 8.440219872737547, "learning_rate": 2.0843238458580494e-08, "loss": 0.4668, "step": 6031 }, { "epoch": 0.97, "grad_norm": 5.5206951620778995, "learning_rate": 2.060587389896285e-08, "loss": 0.4048, "step": 6032 }, { "epoch": 0.97, "grad_norm": 9.641644872522267, "learning_rate": 2.0369865863563708e-08, "loss": 0.4048, "step": 6033 }, { "epoch": 0.97, "grad_norm": 17.07710476526659, "learning_rate": 2.0135214416677205e-08, "loss": 0.3758, "step": 6034 }, { "epoch": 0.97, "grad_norm": 8.072103970624342, "learning_rate": 1.990191962222887e-08, "loss": 0.4563, "step": 6035 }, { "epoch": 0.97, "grad_norm": 9.036322343789793, "learning_rate": 1.9669981543775085e-08, "loss": 0.4108, "step": 6036 }, { "epoch": 0.97, "grad_norm": 7.468016271365962, "learning_rate": 1.943940024450197e-08, "loss": 0.4287, "step": 6037 }, { "epoch": 0.97, "grad_norm": 8.982370095216224, "learning_rate": 1.9210175787226503e-08, "loss": 0.4124, "step": 6038 }, { "epoch": 0.97, "grad_norm": 5.207761329786003, "learning_rate": 1.898230823439484e-08, "loss": 0.3655, "step": 6039 }, { "epoch": 0.97, "grad_norm": 8.36000357932267, "learning_rate": 1.8755797648085105e-08, "loss": 0.414, "step": 6040 }, { "epoch": 0.97, "grad_norm": 7.971061568619134, "learning_rate": 1.8530644090005156e-08, "loss": 0.4566, "step": 6041 }, { "epoch": 0.97, "grad_norm": 5.227771808491149, "learning_rate": 1.8306847621492042e-08, "loss": 0.3761, "step": 6042 }, { "epoch": 0.97, "grad_norm": 6.503577117007659, "learning_rate": 1.808440830351532e-08, "loss": 0.3914, "step": 6043 }, { "epoch": 0.97, "grad_norm": 5.608909397934487, "learning_rate": 1.7863326196673193e-08, "loss": 0.3955, "step": 6044 }, { "epoch": 0.97, "grad_norm": 5.921338206461803, "learning_rate": 1.764360136119414e-08, "loss": 0.3871, "step": 6045 }, { "epoch": 0.97, "grad_norm": 6.3376490927286895, "learning_rate": 1.742523385693806e-08, "loss": 0.4085, "step": 6046 }, { "epoch": 0.97, "grad_norm": 4.271484616459547, "learning_rate": 1.7208223743392927e-08, "loss": 0.3339, "step": 6047 }, { "epoch": 0.97, "grad_norm": 8.089954834932664, "learning_rate": 1.6992571079679775e-08, "loss": 0.3662, "step": 6048 }, { "epoch": 0.97, "grad_norm": 7.419170742417445, "learning_rate": 1.6778275924547726e-08, "loss": 0.3948, "step": 6049 }, { "epoch": 0.97, "grad_norm": 5.533755278309194, "learning_rate": 1.656533833637619e-08, "loss": 0.4893, "step": 6050 }, { "epoch": 0.97, "grad_norm": 5.1627924485816115, "learning_rate": 1.6353758373175986e-08, "loss": 0.455, "step": 6051 }, { "epoch": 0.98, "grad_norm": 5.453473858991174, "learning_rate": 1.6143536092586564e-08, "loss": 0.4363, "step": 6052 }, { "epoch": 0.98, "grad_norm": 5.438991704220839, "learning_rate": 1.593467155187933e-08, "loss": 0.449, "step": 6053 }, { "epoch": 0.98, "grad_norm": 6.113592727555225, "learning_rate": 1.5727164807953777e-08, "loss": 0.3305, "step": 6054 }, { "epoch": 0.98, "grad_norm": 6.532545517923866, "learning_rate": 1.5521015917340787e-08, "loss": 0.3631, "step": 6055 }, { "epoch": 0.98, "grad_norm": 6.815106777587831, "learning_rate": 1.5316224936200997e-08, "loss": 0.4427, "step": 6056 }, { "epoch": 0.98, "grad_norm": 15.847134939799764, "learning_rate": 1.5112791920325332e-08, "loss": 0.3917, "step": 6057 }, { "epoch": 0.98, "grad_norm": 16.843670174152184, "learning_rate": 1.491071692513446e-08, "loss": 0.4279, "step": 6058 }, { "epoch": 0.98, "grad_norm": 33.4052993289339, "learning_rate": 1.4710000005678792e-08, "loss": 0.3912, "step": 6059 }, { "epoch": 0.98, "grad_norm": 8.31791918515353, "learning_rate": 1.4510641216639587e-08, "loss": 0.3296, "step": 6060 }, { "epoch": 0.98, "grad_norm": 7.7520733528678925, "learning_rate": 1.4312640612327289e-08, "loss": 0.4864, "step": 6061 }, { "epoch": 0.98, "grad_norm": 5.861304287285569, "learning_rate": 1.4115998246683193e-08, "loss": 0.4014, "step": 6062 }, { "epoch": 0.98, "grad_norm": 9.461470870694528, "learning_rate": 1.3920714173278338e-08, "loss": 0.4587, "step": 6063 }, { "epoch": 0.98, "grad_norm": 11.489053227249872, "learning_rate": 1.372678844531239e-08, "loss": 0.3613, "step": 6064 }, { "epoch": 0.98, "grad_norm": 14.099669619567528, "learning_rate": 1.3534221115616975e-08, "loss": 0.3677, "step": 6065 }, { "epoch": 0.98, "grad_norm": 10.391123156960205, "learning_rate": 1.3343012236652353e-08, "loss": 0.3675, "step": 6066 }, { "epoch": 0.98, "grad_norm": 9.674963939351567, "learning_rate": 1.3153161860509078e-08, "loss": 0.4745, "step": 6067 }, { "epoch": 0.98, "grad_norm": 6.065294697914851, "learning_rate": 1.2964670038908e-08, "loss": 0.3566, "step": 6068 }, { "epoch": 0.98, "grad_norm": 27.473325332228793, "learning_rate": 1.2777536823199155e-08, "loss": 0.3834, "step": 6069 }, { "epoch": 0.98, "grad_norm": 5.6655308473195625, "learning_rate": 1.2591762264362872e-08, "loss": 0.3946, "step": 6070 }, { "epoch": 0.98, "grad_norm": 6.020211802786527, "learning_rate": 1.2407346413009224e-08, "loss": 0.4304, "step": 6071 }, { "epoch": 0.98, "grad_norm": 7.808388710844722, "learning_rate": 1.2224289319378025e-08, "loss": 0.3905, "step": 6072 }, { "epoch": 0.98, "grad_norm": 12.23554763369316, "learning_rate": 1.204259103333938e-08, "loss": 0.4683, "step": 6073 }, { "epoch": 0.98, "grad_norm": 8.370507947373715, "learning_rate": 1.1862251604393138e-08, "loss": 0.3781, "step": 6074 }, { "epoch": 0.98, "grad_norm": 6.93501585259362, "learning_rate": 1.1683271081668334e-08, "loss": 0.3382, "step": 6075 }, { "epoch": 0.98, "grad_norm": 4.830234682021126, "learning_rate": 1.1505649513923744e-08, "loss": 0.3947, "step": 6076 }, { "epoch": 0.98, "grad_norm": 16.45824783868109, "learning_rate": 1.1329386949548993e-08, "loss": 0.418, "step": 6077 }, { "epoch": 0.98, "grad_norm": 8.570556827407191, "learning_rate": 1.115448343656289e-08, "loss": 0.4123, "step": 6078 }, { "epoch": 0.98, "grad_norm": 9.89208313669047, "learning_rate": 1.098093902261399e-08, "loss": 0.3959, "step": 6079 }, { "epoch": 0.98, "grad_norm": 9.190697344643453, "learning_rate": 1.0808753754980029e-08, "loss": 0.3755, "step": 6080 }, { "epoch": 0.98, "grad_norm": 5.440818441358894, "learning_rate": 1.0637927680570149e-08, "loss": 0.3819, "step": 6081 }, { "epoch": 0.98, "grad_norm": 7.07350933645867, "learning_rate": 1.0468460845921014e-08, "loss": 0.3578, "step": 6082 }, { "epoch": 0.98, "grad_norm": 7.766170133810189, "learning_rate": 1.0300353297200139e-08, "loss": 0.4433, "step": 6083 }, { "epoch": 0.98, "grad_norm": 8.717861764776151, "learning_rate": 1.0133605080204779e-08, "loss": 0.4559, "step": 6084 }, { "epoch": 0.98, "grad_norm": 5.357904689740886, "learning_rate": 9.968216240361927e-09, "loss": 0.4218, "step": 6085 }, { "epoch": 0.98, "grad_norm": 8.304135888341806, "learning_rate": 9.804186822728324e-09, "loss": 0.4177, "step": 6086 }, { "epoch": 0.98, "grad_norm": 7.5765103713169974, "learning_rate": 9.641516871989331e-09, "loss": 0.3729, "step": 6087 }, { "epoch": 0.98, "grad_norm": 6.899938727792872, "learning_rate": 9.480206432461725e-09, "loss": 0.424, "step": 6088 }, { "epoch": 0.98, "grad_norm": 7.651215525487839, "learning_rate": 9.32025554809035e-09, "loss": 0.3798, "step": 6089 }, { "epoch": 0.98, "grad_norm": 10.4178826421065, "learning_rate": 9.161664262449799e-09, "loss": 0.4189, "step": 6090 }, { "epoch": 0.98, "grad_norm": 6.350177918918874, "learning_rate": 9.004432618745507e-09, "loss": 0.4377, "step": 6091 }, { "epoch": 0.98, "grad_norm": 10.52618921866084, "learning_rate": 8.848560659810989e-09, "loss": 0.4496, "step": 6092 }, { "epoch": 0.98, "grad_norm": 6.796982675187409, "learning_rate": 8.694048428110614e-09, "loss": 0.4422, "step": 6093 }, { "epoch": 0.98, "grad_norm": 5.264471589571443, "learning_rate": 8.540895965737928e-09, "loss": 0.3813, "step": 6094 }, { "epoch": 0.98, "grad_norm": 18.847214823380735, "learning_rate": 8.38910331441567e-09, "loss": 0.4607, "step": 6095 }, { "epoch": 0.98, "grad_norm": 29.551052843500198, "learning_rate": 8.23867051549576e-09, "loss": 0.4111, "step": 6096 }, { "epoch": 0.98, "grad_norm": 8.473369761896919, "learning_rate": 8.089597609960976e-09, "loss": 0.493, "step": 6097 }, { "epoch": 0.98, "grad_norm": 11.709032964177176, "learning_rate": 7.94188463842216e-09, "loss": 0.3716, "step": 6098 }, { "epoch": 0.98, "grad_norm": 6.8048252363823805, "learning_rate": 7.795531641121013e-09, "loss": 0.442, "step": 6099 }, { "epoch": 0.98, "grad_norm": 7.4448797375536495, "learning_rate": 7.65053865792842e-09, "loss": 0.3508, "step": 6100 }, { "epoch": 0.98, "grad_norm": 5.985813284660222, "learning_rate": 7.506905728343893e-09, "loss": 0.37, "step": 6101 }, { "epoch": 0.98, "grad_norm": 6.069218528610057, "learning_rate": 7.364632891496692e-09, "loss": 0.3951, "step": 6102 }, { "epoch": 0.98, "grad_norm": 9.04308757565744, "learning_rate": 7.223720186146366e-09, "loss": 0.4016, "step": 6103 }, { "epoch": 0.98, "grad_norm": 10.737650633776012, "learning_rate": 7.08416765068165e-09, "loss": 0.4, "step": 6104 }, { "epoch": 0.98, "grad_norm": 5.138620122231401, "learning_rate": 6.945975323119913e-09, "loss": 0.3915, "step": 6105 }, { "epoch": 0.98, "grad_norm": 6.593341527781504, "learning_rate": 6.809143241109373e-09, "loss": 0.4294, "step": 6106 }, { "epoch": 0.98, "grad_norm": 5.669509005373945, "learning_rate": 6.673671441925766e-09, "loss": 0.4769, "step": 6107 }, { "epoch": 0.98, "grad_norm": 9.09312722026993, "learning_rate": 6.539559962476238e-09, "loss": 0.4178, "step": 6108 }, { "epoch": 0.98, "grad_norm": 5.5198948654101425, "learning_rate": 6.4068088392960084e-09, "loss": 0.3464, "step": 6109 }, { "epoch": 0.98, "grad_norm": 6.073676465949116, "learning_rate": 6.275418108550591e-09, "loss": 0.3973, "step": 6110 }, { "epoch": 0.98, "grad_norm": 9.220166490288445, "learning_rate": 6.1453878060335755e-09, "loss": 0.3546, "step": 6111 }, { "epoch": 0.98, "grad_norm": 6.142059881878264, "learning_rate": 6.0167179671694055e-09, "loss": 0.414, "step": 6112 }, { "epoch": 0.98, "grad_norm": 1.0801394384599652, "learning_rate": 5.8894086270111505e-09, "loss": 0.449, "step": 6113 }, { "epoch": 0.99, "grad_norm": 5.202552803901914, "learning_rate": 5.7634598202416235e-09, "loss": 0.4114, "step": 6114 }, { "epoch": 0.99, "grad_norm": 1.1227019272606347, "learning_rate": 5.638871581172822e-09, "loss": 0.4401, "step": 6115 }, { "epoch": 0.99, "grad_norm": 5.907913478093451, "learning_rate": 5.515643943745375e-09, "loss": 0.4063, "step": 6116 }, { "epoch": 0.99, "grad_norm": 14.254601387068977, "learning_rate": 5.393776941530759e-09, "loss": 0.4105, "step": 6117 }, { "epoch": 0.99, "grad_norm": 4.081722083976935, "learning_rate": 5.273270607727976e-09, "loss": 0.388, "step": 6118 }, { "epoch": 0.99, "grad_norm": 7.368456224003473, "learning_rate": 5.1541249751668745e-09, "loss": 0.3865, "step": 6119 }, { "epoch": 0.99, "grad_norm": 8.331727097624388, "learning_rate": 5.0363400763059346e-09, "loss": 0.4022, "step": 6120 }, { "epoch": 0.99, "grad_norm": 1.3116252920895939, "learning_rate": 4.919915943232822e-09, "loss": 0.4794, "step": 6121 }, { "epoch": 0.99, "grad_norm": 21.66870043846836, "learning_rate": 4.80485260766439e-09, "loss": 0.4469, "step": 6122 }, { "epoch": 0.99, "grad_norm": 5.859805987432393, "learning_rate": 4.691150100948338e-09, "loss": 0.4298, "step": 6123 }, { "epoch": 0.99, "grad_norm": 1.216466742032423, "learning_rate": 4.578808454058781e-09, "loss": 0.5016, "step": 6124 }, { "epoch": 0.99, "grad_norm": 23.178312008392336, "learning_rate": 4.4678276976017895e-09, "loss": 0.4078, "step": 6125 }, { "epoch": 0.99, "grad_norm": 10.946148994979515, "learning_rate": 4.358207861810959e-09, "loss": 0.4062, "step": 6126 }, { "epoch": 0.99, "grad_norm": 6.917774853260408, "learning_rate": 4.249948976550178e-09, "loss": 0.355, "step": 6127 }, { "epoch": 0.99, "grad_norm": 12.102880410817235, "learning_rate": 4.143051071311965e-09, "loss": 0.4048, "step": 6128 }, { "epoch": 0.99, "grad_norm": 5.663957136127433, "learning_rate": 4.037514175218027e-09, "loss": 0.4194, "step": 6129 }, { "epoch": 0.99, "grad_norm": 5.394378401234621, "learning_rate": 3.933338317019808e-09, "loss": 0.4383, "step": 6130 }, { "epoch": 0.99, "grad_norm": 4.892768247197947, "learning_rate": 3.83052352509794e-09, "loss": 0.4829, "step": 6131 }, { "epoch": 0.99, "grad_norm": 9.647023320495624, "learning_rate": 3.729069827461685e-09, "loss": 0.3949, "step": 6132 }, { "epoch": 0.99, "grad_norm": 13.554356542917054, "learning_rate": 3.628977251749488e-09, "loss": 0.4906, "step": 6133 }, { "epoch": 0.99, "grad_norm": 1.0527465350347402, "learning_rate": 3.530245825229539e-09, "loss": 0.4369, "step": 6134 }, { "epoch": 0.99, "grad_norm": 5.878133802501732, "learning_rate": 3.4328755747992103e-09, "loss": 0.4155, "step": 6135 }, { "epoch": 0.99, "grad_norm": 8.015421641386913, "learning_rate": 3.336866526985061e-09, "loss": 0.4232, "step": 6136 }, { "epoch": 0.99, "grad_norm": 10.898180899022222, "learning_rate": 3.2422187079417242e-09, "loss": 0.4286, "step": 6137 }, { "epoch": 0.99, "grad_norm": 6.110808241441318, "learning_rate": 3.148932143455241e-09, "loss": 0.4444, "step": 6138 }, { "epoch": 0.99, "grad_norm": 6.346760391923627, "learning_rate": 3.0570068589380606e-09, "loss": 0.3917, "step": 6139 }, { "epoch": 0.99, "grad_norm": 5.020984681275212, "learning_rate": 2.9664428794340393e-09, "loss": 0.4735, "step": 6140 }, { "epoch": 0.99, "grad_norm": 9.43866629901102, "learning_rate": 2.877240229614553e-09, "loss": 0.358, "step": 6141 }, { "epoch": 0.99, "grad_norm": 7.72577302897655, "learning_rate": 2.7893989337818283e-09, "loss": 0.379, "step": 6142 }, { "epoch": 0.99, "grad_norm": 4.987038193873231, "learning_rate": 2.7029190158656125e-09, "loss": 0.417, "step": 6143 }, { "epoch": 0.99, "grad_norm": 6.232591282472939, "learning_rate": 2.6178004994253936e-09, "loss": 0.4136, "step": 6144 }, { "epoch": 0.99, "grad_norm": 10.179370395692516, "learning_rate": 2.5340434076503994e-09, "loss": 0.4398, "step": 6145 }, { "epoch": 0.99, "grad_norm": 1.1745821739528415, "learning_rate": 2.4516477633579338e-09, "loss": 0.4839, "step": 6146 }, { "epoch": 0.99, "grad_norm": 8.313312570973585, "learning_rate": 2.370613588994486e-09, "loss": 0.3689, "step": 6147 }, { "epoch": 0.99, "grad_norm": 7.362322241481521, "learning_rate": 2.2909409066362854e-09, "loss": 0.4822, "step": 6148 }, { "epoch": 0.99, "grad_norm": 6.397634502037918, "learning_rate": 2.2126297379887473e-09, "loss": 0.4366, "step": 6149 }, { "epoch": 0.99, "grad_norm": 10.84490569463106, "learning_rate": 2.1356801043853624e-09, "loss": 0.3666, "step": 6150 }, { "epoch": 0.99, "grad_norm": 9.728637687901125, "learning_rate": 2.060092026789917e-09, "loss": 0.4045, "step": 6151 }, { "epoch": 0.99, "grad_norm": 7.316861827030633, "learning_rate": 1.9858655257942726e-09, "loss": 0.4193, "step": 6152 }, { "epoch": 0.99, "grad_norm": 10.141829091733241, "learning_rate": 1.9130006216200314e-09, "loss": 0.467, "step": 6153 }, { "epoch": 0.99, "grad_norm": 4.727186055319543, "learning_rate": 1.841497334117426e-09, "loss": 0.4014, "step": 6154 }, { "epoch": 0.99, "grad_norm": 8.021105975067629, "learning_rate": 1.771355682765874e-09, "loss": 0.4793, "step": 6155 }, { "epoch": 0.99, "grad_norm": 6.364216295047285, "learning_rate": 1.7025756866739795e-09, "loss": 0.3746, "step": 6156 }, { "epoch": 0.99, "grad_norm": 23.229018878342714, "learning_rate": 1.6351573645795316e-09, "loss": 0.4345, "step": 6157 }, { "epoch": 0.99, "grad_norm": 6.1957081400054985, "learning_rate": 1.5691007348489495e-09, "loss": 0.3539, "step": 6158 }, { "epoch": 0.99, "grad_norm": 26.040534784527363, "learning_rate": 1.5044058154778385e-09, "loss": 0.4139, "step": 6159 }, { "epoch": 0.99, "grad_norm": 6.224006707459801, "learning_rate": 1.441072624090989e-09, "loss": 0.3572, "step": 6160 }, { "epoch": 0.99, "grad_norm": 14.733831569741248, "learning_rate": 1.3791011779423769e-09, "loss": 0.4434, "step": 6161 }, { "epoch": 0.99, "grad_norm": 10.213729027054061, "learning_rate": 1.3184914939140537e-09, "loss": 0.4228, "step": 6162 }, { "epoch": 0.99, "grad_norm": 5.48150459004017, "learning_rate": 1.2592435885178112e-09, "loss": 0.3246, "step": 6163 }, { "epoch": 0.99, "grad_norm": 12.120620765998419, "learning_rate": 1.2013574778951819e-09, "loss": 0.4133, "step": 6164 }, { "epoch": 0.99, "grad_norm": 7.467612450847542, "learning_rate": 1.1448331778152189e-09, "loss": 0.3639, "step": 6165 }, { "epoch": 0.99, "grad_norm": 9.181595434787408, "learning_rate": 1.0896707036772703e-09, "loss": 0.4424, "step": 6166 }, { "epoch": 0.99, "grad_norm": 14.609234451513899, "learning_rate": 1.0358700705082048e-09, "loss": 0.4195, "step": 6167 }, { "epoch": 0.99, "grad_norm": 12.651273337820202, "learning_rate": 9.834312929657419e-10, "loss": 0.4055, "step": 6168 }, { "epoch": 0.99, "grad_norm": 7.954822724360298, "learning_rate": 9.323543853351212e-10, "loss": 0.4844, "step": 6169 }, { "epoch": 0.99, "grad_norm": 10.223454056212427, "learning_rate": 8.826393615318784e-10, "loss": 0.4072, "step": 6170 }, { "epoch": 0.99, "grad_norm": 8.021186471007278, "learning_rate": 8.342862350985137e-10, "loss": 0.3348, "step": 6171 }, { "epoch": 0.99, "grad_norm": 8.915500727027704, "learning_rate": 7.872950192083783e-10, "loss": 0.3985, "step": 6172 }, { "epoch": 0.99, "grad_norm": 7.061696961393825, "learning_rate": 7.416657266634542e-10, "loss": 0.4145, "step": 6173 }, { "epoch": 0.99, "grad_norm": 5.597530538370256, "learning_rate": 6.973983698943532e-10, "loss": 0.3481, "step": 6174 }, { "epoch": 0.99, "grad_norm": 8.335415548569067, "learning_rate": 6.544929609597628e-10, "loss": 0.4649, "step": 6175 }, { "epoch": 1.0, "grad_norm": 5.227930452211167, "learning_rate": 6.129495115497764e-10, "loss": 0.4048, "step": 6176 }, { "epoch": 1.0, "grad_norm": 5.029770965236446, "learning_rate": 5.727680329808971e-10, "loss": 0.4794, "step": 6177 }, { "epoch": 1.0, "grad_norm": 9.520516198579397, "learning_rate": 5.339485362004793e-10, "loss": 0.3492, "step": 6178 }, { "epoch": 1.0, "grad_norm": 9.116180764939346, "learning_rate": 4.96491031782842e-10, "loss": 0.4271, "step": 6179 }, { "epoch": 1.0, "grad_norm": 8.921383798167614, "learning_rate": 4.603955299337104e-10, "loss": 0.4318, "step": 6180 }, { "epoch": 1.0, "grad_norm": 1.0746654954229136, "learning_rate": 4.2566204048577473e-10, "loss": 0.4271, "step": 6181 }, { "epoch": 1.0, "grad_norm": 6.969422497493513, "learning_rate": 3.9229057290146587e-10, "loss": 0.3772, "step": 6182 }, { "epoch": 1.0, "grad_norm": 1.1897269848044565, "learning_rate": 3.6028113627240015e-10, "loss": 0.4551, "step": 6183 }, { "epoch": 1.0, "grad_norm": 7.554933802975019, "learning_rate": 3.2963373931882425e-10, "loss": 0.4483, "step": 6184 }, { "epoch": 1.0, "grad_norm": 7.019045594479564, "learning_rate": 3.003483903890603e-10, "loss": 0.3606, "step": 6185 }, { "epoch": 1.0, "grad_norm": 10.078260501888897, "learning_rate": 2.724250974628362e-10, "loss": 0.3736, "step": 6186 }, { "epoch": 1.0, "grad_norm": 7.738962420198265, "learning_rate": 2.458638681457348e-10, "loss": 0.4026, "step": 6187 }, { "epoch": 1.0, "grad_norm": 10.910467437794013, "learning_rate": 2.2066470967418985e-10, "loss": 0.338, "step": 6188 }, { "epoch": 1.0, "grad_norm": 7.522931697443439, "learning_rate": 1.9682762891382045e-10, "loss": 0.4746, "step": 6189 }, { "epoch": 1.0, "grad_norm": 6.459417452162901, "learning_rate": 1.74352632357766e-10, "loss": 0.3763, "step": 6190 }, { "epoch": 1.0, "grad_norm": 11.202578332491813, "learning_rate": 1.5323972612890647e-10, "loss": 0.4162, "step": 6191 }, { "epoch": 1.0, "grad_norm": 5.634305084844753, "learning_rate": 1.3348891597930734e-10, "loss": 0.4169, "step": 6192 }, { "epoch": 1.0, "grad_norm": 6.422162267430779, "learning_rate": 1.1510020728910943e-10, "loss": 0.4444, "step": 6193 }, { "epoch": 1.0, "grad_norm": 4.87311337589163, "learning_rate": 9.807360506874919e-11, "loss": 0.3867, "step": 6194 }, { "epoch": 1.0, "grad_norm": 9.567841007067475, "learning_rate": 8.240911395562823e-11, "loss": 0.4014, "step": 6195 }, { "epoch": 1.0, "grad_norm": 6.59661168114939, "learning_rate": 6.810673821855407e-11, "loss": 0.409, "step": 6196 }, { "epoch": 1.0, "grad_norm": 7.727390032355771, "learning_rate": 5.516648175274419e-11, "loss": 0.3805, "step": 6197 }, { "epoch": 1.0, "grad_norm": 5.456222955044605, "learning_rate": 4.358834808371182e-11, "loss": 0.3949, "step": 6198 }, { "epoch": 1.0, "grad_norm": 6.997693936588838, "learning_rate": 3.3372340366155663e-11, "loss": 0.3969, "step": 6199 }, { "epoch": 1.0, "grad_norm": 9.391928347366203, "learning_rate": 2.4518461382849745e-11, "loss": 0.4083, "step": 6200 }, { "epoch": 1.0, "grad_norm": 5.8280251099775136, "learning_rate": 1.702671354575358e-11, "loss": 0.4475, "step": 6201 }, { "epoch": 1.0, "grad_norm": 18.325389450510528, "learning_rate": 1.0897098895457092e-11, "loss": 0.4361, "step": 6202 }, { "epoch": 1.0, "grad_norm": 7.00372671048808, "learning_rate": 6.129619102845929e-12, "loss": 0.4431, "step": 6203 }, { "epoch": 1.0, "grad_norm": 6.627549124373033, "learning_rate": 2.7242754663259207e-12, "loss": 0.4502, "step": 6204 }, { "epoch": 1.0, "grad_norm": 13.06124087765242, "learning_rate": 6.810689129332915e-13, "loss": 0.4299, "step": 6205 }, { "epoch": 1.0, "grad_norm": 19.593519222619403, "learning_rate": 0.0, "loss": 0.4042, "step": 6206 }, { "epoch": 1.0, "step": 6206, "total_flos": 5571263301926912.0, "train_loss": 0.45520276639429247, "train_runtime": 233098.4711, "train_samples_per_second": 3.408, "train_steps_per_second": 0.027 } ], "logging_steps": 1.0, "max_steps": 6206, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3000, "total_flos": 5571263301926912.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }