{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999612538261846, "eval_steps": 500, "global_step": 3226, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00030996939052268586, "grad_norm": 1.137789723923757, "learning_rate": 1.2121212121212122e-06, "loss": 1.1651, "step": 1 }, { "epoch": 0.0006199387810453717, "grad_norm": 1.1018865930964166, "learning_rate": 2.4242424242424244e-06, "loss": 1.0987, "step": 2 }, { "epoch": 0.0009299081715680576, "grad_norm": 1.1597150434486017, "learning_rate": 3.6363636363636366e-06, "loss": 1.1688, "step": 3 }, { "epoch": 0.0012398775620907434, "grad_norm": 1.0174661710952033, "learning_rate": 4.848484848484849e-06, "loss": 1.1541, "step": 4 }, { "epoch": 0.0015498469526134295, "grad_norm": 0.916348093890311, "learning_rate": 6.060606060606061e-06, "loss": 1.2, "step": 5 }, { "epoch": 0.0018598163431361153, "grad_norm": 0.92856916119964, "learning_rate": 7.272727272727273e-06, "loss": 1.108, "step": 6 }, { "epoch": 0.0021697857336588013, "grad_norm": 1.157008021173396, "learning_rate": 8.484848484848486e-06, "loss": 1.209, "step": 7 }, { "epoch": 0.002479755124181487, "grad_norm": 1.0743918029107917, "learning_rate": 9.696969696969698e-06, "loss": 1.1652, "step": 8 }, { "epoch": 0.002789724514704173, "grad_norm": 0.8904137505727883, "learning_rate": 1.0909090909090909e-05, "loss": 1.1749, "step": 9 }, { "epoch": 0.003099693905226859, "grad_norm": 1.0678210850483507, "learning_rate": 1.2121212121212122e-05, "loss": 1.1856, "step": 10 }, { "epoch": 0.003409663295749545, "grad_norm": 0.673789771092121, "learning_rate": 1.3333333333333333e-05, "loss": 1.1459, "step": 11 }, { "epoch": 0.0037196326862722305, "grad_norm": 0.9566669365403035, "learning_rate": 1.4545454545454546e-05, "loss": 1.0883, "step": 12 }, { "epoch": 0.004029602076794916, "grad_norm": 0.8253040466806454, "learning_rate": 1.575757575757576e-05, "loss": 1.08, "step": 13 }, { "epoch": 0.0043395714673176026, "grad_norm": 0.903007738673878, "learning_rate": 1.6969696969696972e-05, "loss": 1.1393, "step": 14 }, { "epoch": 0.004649540857840288, "grad_norm": 0.7603211853667786, "learning_rate": 1.8181818181818182e-05, "loss": 1.2096, "step": 15 }, { "epoch": 0.004959510248362974, "grad_norm": 0.7929308078243145, "learning_rate": 1.9393939393939395e-05, "loss": 1.0372, "step": 16 }, { "epoch": 0.00526947963888566, "grad_norm": 0.8214607559876134, "learning_rate": 2.0606060606060608e-05, "loss": 1.1009, "step": 17 }, { "epoch": 0.005579449029408346, "grad_norm": 0.8064902101573628, "learning_rate": 2.1818181818181818e-05, "loss": 1.0466, "step": 18 }, { "epoch": 0.005889418419931032, "grad_norm": 0.7638444409087849, "learning_rate": 2.3030303030303034e-05, "loss": 1.0956, "step": 19 }, { "epoch": 0.006199387810453718, "grad_norm": 0.6973831355272175, "learning_rate": 2.4242424242424244e-05, "loss": 1.0743, "step": 20 }, { "epoch": 0.006509357200976403, "grad_norm": 0.6907951027965982, "learning_rate": 2.5454545454545457e-05, "loss": 1.0257, "step": 21 }, { "epoch": 0.00681932659149909, "grad_norm": 0.7202520271591938, "learning_rate": 2.6666666666666667e-05, "loss": 1.09, "step": 22 }, { "epoch": 0.007129295982021775, "grad_norm": 0.7636967018254577, "learning_rate": 2.7878787878787883e-05, "loss": 1.0342, "step": 23 }, { "epoch": 0.007439265372544461, "grad_norm": 0.9979793209713308, "learning_rate": 2.9090909090909093e-05, "loss": 0.9938, "step": 24 }, { "epoch": 0.0077492347630671475, "grad_norm": 1.530098477271715, "learning_rate": 3.0303030303030306e-05, "loss": 1.049, "step": 25 }, { "epoch": 0.008059204153589832, "grad_norm": 0.7537938173464634, "learning_rate": 3.151515151515152e-05, "loss": 1.0837, "step": 26 }, { "epoch": 0.00836917354411252, "grad_norm": 1.047294471788112, "learning_rate": 3.272727272727273e-05, "loss": 1.0541, "step": 27 }, { "epoch": 0.008679142934635205, "grad_norm": 1.479801478841223, "learning_rate": 3.3939393939393945e-05, "loss": 1.0168, "step": 28 }, { "epoch": 0.00898911232515789, "grad_norm": 0.7365701352023788, "learning_rate": 3.515151515151515e-05, "loss": 1.0234, "step": 29 }, { "epoch": 0.009299081715680576, "grad_norm": 1.3342978728882533, "learning_rate": 3.6363636363636364e-05, "loss": 1.0091, "step": 30 }, { "epoch": 0.009609051106203262, "grad_norm": 0.8749851970412246, "learning_rate": 3.7575757575757584e-05, "loss": 0.9993, "step": 31 }, { "epoch": 0.009919020496725947, "grad_norm": 1.2582682892806059, "learning_rate": 3.878787878787879e-05, "loss": 1.0328, "step": 32 }, { "epoch": 0.010228989887248635, "grad_norm": 0.9339851441460317, "learning_rate": 4e-05, "loss": 1.0022, "step": 33 }, { "epoch": 0.01053895927777132, "grad_norm": 0.8833103830902043, "learning_rate": 3.999999031940897e-05, "loss": 1.0059, "step": 34 }, { "epoch": 0.010848928668294006, "grad_norm": 1.0100421871086487, "learning_rate": 3.999996127764524e-05, "loss": 1.0444, "step": 35 }, { "epoch": 0.011158898058816692, "grad_norm": 1.2884004043263202, "learning_rate": 3.999991287473694e-05, "loss": 1.0349, "step": 36 }, { "epoch": 0.011468867449339377, "grad_norm": 0.934301168666736, "learning_rate": 3.99998451107309e-05, "loss": 1.0452, "step": 37 }, { "epoch": 0.011778836839862064, "grad_norm": 1.0115126598068578, "learning_rate": 3.999975798569275e-05, "loss": 0.9364, "step": 38 }, { "epoch": 0.01208880623038475, "grad_norm": 1.0033024041496346, "learning_rate": 3.999965149970681e-05, "loss": 0.9567, "step": 39 }, { "epoch": 0.012398775620907436, "grad_norm": 0.9708164309155881, "learning_rate": 3.9999525652876174e-05, "loss": 1.0018, "step": 40 }, { "epoch": 0.012708745011430121, "grad_norm": 0.792528646535, "learning_rate": 3.9999380445322667e-05, "loss": 0.9922, "step": 41 }, { "epoch": 0.013018714401952807, "grad_norm": 0.8151458683359586, "learning_rate": 3.999921587718686e-05, "loss": 0.9933, "step": 42 }, { "epoch": 0.013328683792475492, "grad_norm": 0.9421113577928464, "learning_rate": 3.999903194862807e-05, "loss": 1.0013, "step": 43 }, { "epoch": 0.01363865318299818, "grad_norm": 0.8313977464431337, "learning_rate": 3.999882865982434e-05, "loss": 0.9423, "step": 44 }, { "epoch": 0.013948622573520865, "grad_norm": 0.8564306988079825, "learning_rate": 3.999860601097247e-05, "loss": 0.9932, "step": 45 }, { "epoch": 0.01425859196404355, "grad_norm": 0.6720448583594837, "learning_rate": 3.9998364002288e-05, "loss": 0.9829, "step": 46 }, { "epoch": 0.014568561354566236, "grad_norm": 0.748424870509473, "learning_rate": 3.99981026340052e-05, "loss": 0.937, "step": 47 }, { "epoch": 0.014878530745088922, "grad_norm": 0.7489850081978585, "learning_rate": 3.99978219063771e-05, "loss": 0.9756, "step": 48 }, { "epoch": 0.015188500135611608, "grad_norm": 0.6767829522268337, "learning_rate": 3.9997521819675466e-05, "loss": 0.9501, "step": 49 }, { "epoch": 0.015498469526134295, "grad_norm": 0.5314010940571126, "learning_rate": 3.999720237419078e-05, "loss": 0.9648, "step": 50 }, { "epoch": 0.01580843891665698, "grad_norm": 0.5309431077501623, "learning_rate": 3.99968635702323e-05, "loss": 0.9336, "step": 51 }, { "epoch": 0.016118408307179664, "grad_norm": 0.6438437407488871, "learning_rate": 3.9996505408128e-05, "loss": 1.0127, "step": 52 }, { "epoch": 0.01642837769770235, "grad_norm": 0.6012953339070396, "learning_rate": 3.999612788822461e-05, "loss": 0.9295, "step": 53 }, { "epoch": 0.01673834708822504, "grad_norm": 0.7137111409786041, "learning_rate": 3.9995731010887584e-05, "loss": 0.9167, "step": 54 }, { "epoch": 0.017048316478747723, "grad_norm": 0.46054093475195995, "learning_rate": 3.999531477650113e-05, "loss": 0.963, "step": 55 }, { "epoch": 0.01735828586927041, "grad_norm": 0.4899694337689944, "learning_rate": 3.999487918546817e-05, "loss": 0.9726, "step": 56 }, { "epoch": 0.017668255259793094, "grad_norm": 0.5012099217587257, "learning_rate": 3.999442423821041e-05, "loss": 0.9481, "step": 57 }, { "epoch": 0.01797822465031578, "grad_norm": 0.45094226879721305, "learning_rate": 3.999394993516824e-05, "loss": 0.961, "step": 58 }, { "epoch": 0.01828819404083847, "grad_norm": 0.4560295989158257, "learning_rate": 3.9993456276800835e-05, "loss": 0.9478, "step": 59 }, { "epoch": 0.018598163431361153, "grad_norm": 0.491273368024734, "learning_rate": 3.999294326358607e-05, "loss": 1.0054, "step": 60 }, { "epoch": 0.01890813282188384, "grad_norm": 0.4628848428335045, "learning_rate": 3.999241089602057e-05, "loss": 0.9684, "step": 61 }, { "epoch": 0.019218102212406524, "grad_norm": 0.46276790641398996, "learning_rate": 3.999185917461971e-05, "loss": 0.9414, "step": 62 }, { "epoch": 0.01952807160292921, "grad_norm": 0.4994426020871583, "learning_rate": 3.999128809991759e-05, "loss": 0.9512, "step": 63 }, { "epoch": 0.019838040993451895, "grad_norm": 0.4801592023606227, "learning_rate": 3.9990697672467034e-05, "loss": 0.9445, "step": 64 }, { "epoch": 0.020148010383974582, "grad_norm": 0.505713141369313, "learning_rate": 3.9990087892839615e-05, "loss": 0.9106, "step": 65 }, { "epoch": 0.02045797977449727, "grad_norm": 0.5429357412226704, "learning_rate": 3.998945876162563e-05, "loss": 0.9539, "step": 66 }, { "epoch": 0.020767949165019953, "grad_norm": 0.4927749608802053, "learning_rate": 3.998881027943413e-05, "loss": 0.9236, "step": 67 }, { "epoch": 0.02107791855554264, "grad_norm": 0.47247438705703965, "learning_rate": 3.9988142446892874e-05, "loss": 0.9283, "step": 68 }, { "epoch": 0.021387887946065325, "grad_norm": 0.5172070390032725, "learning_rate": 3.9987455264648356e-05, "loss": 0.9345, "step": 69 }, { "epoch": 0.021697857336588012, "grad_norm": 0.5113428628466911, "learning_rate": 3.998674873336582e-05, "loss": 0.9089, "step": 70 }, { "epoch": 0.0220078267271107, "grad_norm": 0.432393042375737, "learning_rate": 3.998602285372923e-05, "loss": 0.8948, "step": 71 }, { "epoch": 0.022317796117633383, "grad_norm": 0.39394374547046124, "learning_rate": 3.998527762644128e-05, "loss": 0.9196, "step": 72 }, { "epoch": 0.02262776550815607, "grad_norm": 0.41696112674265406, "learning_rate": 3.998451305222339e-05, "loss": 0.9323, "step": 73 }, { "epoch": 0.022937734898678754, "grad_norm": 0.45373968475589543, "learning_rate": 3.99837291318157e-05, "loss": 0.8988, "step": 74 }, { "epoch": 0.02324770428920144, "grad_norm": 0.49243571389859, "learning_rate": 3.9982925865977125e-05, "loss": 0.9358, "step": 75 }, { "epoch": 0.02355767367972413, "grad_norm": 0.4345821951122476, "learning_rate": 3.998210325548525e-05, "loss": 0.9366, "step": 76 }, { "epoch": 0.023867643070246813, "grad_norm": 0.4725208725574258, "learning_rate": 3.998126130113641e-05, "loss": 0.9154, "step": 77 }, { "epoch": 0.0241776124607695, "grad_norm": 0.520045747427895, "learning_rate": 3.9980400003745676e-05, "loss": 0.8997, "step": 78 }, { "epoch": 0.024487581851292184, "grad_norm": 0.5966444788162458, "learning_rate": 3.9979519364146834e-05, "loss": 0.8695, "step": 79 }, { "epoch": 0.02479755124181487, "grad_norm": 0.7027789793358356, "learning_rate": 3.997861938319239e-05, "loss": 0.9033, "step": 80 }, { "epoch": 0.025107520632337555, "grad_norm": 0.7551061002011615, "learning_rate": 3.997770006175358e-05, "loss": 0.9071, "step": 81 }, { "epoch": 0.025417490022860242, "grad_norm": 0.831923767517788, "learning_rate": 3.9976761400720364e-05, "loss": 0.9068, "step": 82 }, { "epoch": 0.02572745941338293, "grad_norm": 0.8475731116018044, "learning_rate": 3.9975803401001416e-05, "loss": 0.9265, "step": 83 }, { "epoch": 0.026037428803905614, "grad_norm": 0.7276987213522423, "learning_rate": 3.997482606352414e-05, "loss": 0.8843, "step": 84 }, { "epoch": 0.0263473981944283, "grad_norm": 0.7004099333590288, "learning_rate": 3.997382938923466e-05, "loss": 0.8824, "step": 85 }, { "epoch": 0.026657367584950985, "grad_norm": 0.9061629965864144, "learning_rate": 3.997281337909781e-05, "loss": 0.895, "step": 86 }, { "epoch": 0.026967336975473672, "grad_norm": 1.002774230192351, "learning_rate": 3.9971778034097144e-05, "loss": 0.9184, "step": 87 }, { "epoch": 0.02727730636599636, "grad_norm": 0.971506708725424, "learning_rate": 3.997072335523495e-05, "loss": 0.8533, "step": 88 }, { "epoch": 0.027587275756519043, "grad_norm": 0.8320666372303335, "learning_rate": 3.996964934353221e-05, "loss": 0.9089, "step": 89 }, { "epoch": 0.02789724514704173, "grad_norm": 0.675845839717719, "learning_rate": 3.996855600002863e-05, "loss": 0.8861, "step": 90 }, { "epoch": 0.028207214537564414, "grad_norm": 0.5568459256356857, "learning_rate": 3.996744332578264e-05, "loss": 0.8848, "step": 91 }, { "epoch": 0.0285171839280871, "grad_norm": 0.586040508536657, "learning_rate": 3.996631132187136e-05, "loss": 0.8696, "step": 92 }, { "epoch": 0.028827153318609786, "grad_norm": 0.6558535894404462, "learning_rate": 3.9965159989390654e-05, "loss": 0.9148, "step": 93 }, { "epoch": 0.029137122709132473, "grad_norm": 0.6103230788841535, "learning_rate": 3.996398932945507e-05, "loss": 0.8961, "step": 94 }, { "epoch": 0.02944709209965516, "grad_norm": 0.5144438166461939, "learning_rate": 3.996279934319788e-05, "loss": 0.8865, "step": 95 }, { "epoch": 0.029757061490177844, "grad_norm": 0.6300708850644913, "learning_rate": 3.9961590031771054e-05, "loss": 0.9033, "step": 96 }, { "epoch": 0.03006703088070053, "grad_norm": 0.6359827561501791, "learning_rate": 3.996036139634528e-05, "loss": 0.8813, "step": 97 }, { "epoch": 0.030377000271223215, "grad_norm": 0.5599014125602371, "learning_rate": 3.9959113438109954e-05, "loss": 0.9038, "step": 98 }, { "epoch": 0.030686969661745903, "grad_norm": 0.7798144643566122, "learning_rate": 3.995784615827317e-05, "loss": 0.9393, "step": 99 }, { "epoch": 0.03099693905226859, "grad_norm": 0.49201783677236666, "learning_rate": 3.995655955806173e-05, "loss": 0.8567, "step": 100 }, { "epoch": 0.03130690844279128, "grad_norm": 0.48709255116781697, "learning_rate": 3.9955253638721143e-05, "loss": 0.8811, "step": 101 }, { "epoch": 0.03161687783331396, "grad_norm": 0.5838311179287525, "learning_rate": 3.9953928401515606e-05, "loss": 0.85, "step": 102 }, { "epoch": 0.031926847223836645, "grad_norm": 0.5114118090663093, "learning_rate": 3.995258384772804e-05, "loss": 0.8448, "step": 103 }, { "epoch": 0.03223681661435933, "grad_norm": 0.47566214301746973, "learning_rate": 3.995121997866004e-05, "loss": 0.8593, "step": 104 }, { "epoch": 0.03254678600488202, "grad_norm": 0.45754064102601544, "learning_rate": 3.9949836795631925e-05, "loss": 0.8765, "step": 105 }, { "epoch": 0.0328567553954047, "grad_norm": 0.42502802621278973, "learning_rate": 3.994843429998268e-05, "loss": 0.8845, "step": 106 }, { "epoch": 0.03316672478592739, "grad_norm": 0.4559171955051951, "learning_rate": 3.9947012493070023e-05, "loss": 0.88, "step": 107 }, { "epoch": 0.03347669417645008, "grad_norm": 0.4362561057142038, "learning_rate": 3.994557137627033e-05, "loss": 0.8796, "step": 108 }, { "epoch": 0.03378666356697276, "grad_norm": 0.37850909720840786, "learning_rate": 3.99441109509787e-05, "loss": 0.8997, "step": 109 }, { "epoch": 0.034096632957495446, "grad_norm": 0.38984258247343234, "learning_rate": 3.994263121860891e-05, "loss": 0.9111, "step": 110 }, { "epoch": 0.03440660234801814, "grad_norm": 0.39954070047292034, "learning_rate": 3.994113218059342e-05, "loss": 0.871, "step": 111 }, { "epoch": 0.03471657173854082, "grad_norm": 0.37914717881100773, "learning_rate": 3.9939613838383394e-05, "loss": 0.8768, "step": 112 }, { "epoch": 0.035026541129063504, "grad_norm": 0.38100979078722225, "learning_rate": 3.9938076193448675e-05, "loss": 0.8927, "step": 113 }, { "epoch": 0.03533651051958619, "grad_norm": 0.3930661811672519, "learning_rate": 3.9936519247277796e-05, "loss": 0.8684, "step": 114 }, { "epoch": 0.03564647991010888, "grad_norm": 0.40293007751101606, "learning_rate": 3.993494300137797e-05, "loss": 0.9398, "step": 115 }, { "epoch": 0.03595644930063156, "grad_norm": 0.3999977850861087, "learning_rate": 3.9933347457275095e-05, "loss": 0.8947, "step": 116 }, { "epoch": 0.03626641869115425, "grad_norm": 0.476576744846758, "learning_rate": 3.993173261651376e-05, "loss": 0.8249, "step": 117 }, { "epoch": 0.03657638808167694, "grad_norm": 0.39762946004472105, "learning_rate": 3.9930098480657215e-05, "loss": 0.9094, "step": 118 }, { "epoch": 0.03688635747219962, "grad_norm": 0.3839493390924663, "learning_rate": 3.9928445051287417e-05, "loss": 0.8722, "step": 119 }, { "epoch": 0.037196326862722305, "grad_norm": 0.37275567241096463, "learning_rate": 3.992677233000496e-05, "loss": 0.8749, "step": 120 }, { "epoch": 0.03750629625324499, "grad_norm": 0.3857660733114723, "learning_rate": 3.992508031842916e-05, "loss": 0.8615, "step": 121 }, { "epoch": 0.03781626564376768, "grad_norm": 0.38348507543355453, "learning_rate": 3.992336901819797e-05, "loss": 0.9196, "step": 122 }, { "epoch": 0.038126235034290364, "grad_norm": 0.3599601591304575, "learning_rate": 3.992163843096803e-05, "loss": 0.85, "step": 123 }, { "epoch": 0.03843620442481305, "grad_norm": 0.3725765079950775, "learning_rate": 3.9919888558414654e-05, "loss": 0.8318, "step": 124 }, { "epoch": 0.03874617381533574, "grad_norm": 0.4197624460655737, "learning_rate": 3.991811940223183e-05, "loss": 0.8956, "step": 125 }, { "epoch": 0.03905614320585842, "grad_norm": 0.40170773729660336, "learning_rate": 3.991633096413219e-05, "loss": 0.9049, "step": 126 }, { "epoch": 0.039366112596381106, "grad_norm": 0.3839490912652062, "learning_rate": 3.991452324584706e-05, "loss": 0.8417, "step": 127 }, { "epoch": 0.03967608198690379, "grad_norm": 0.40149142402936944, "learning_rate": 3.991269624912641e-05, "loss": 0.8933, "step": 128 }, { "epoch": 0.03998605137742648, "grad_norm": 0.40365190575875254, "learning_rate": 3.9910849975738883e-05, "loss": 0.8631, "step": 129 }, { "epoch": 0.040296020767949164, "grad_norm": 0.4256704376731805, "learning_rate": 3.9908984427471795e-05, "loss": 0.8936, "step": 130 }, { "epoch": 0.04060599015847185, "grad_norm": 0.476236749156045, "learning_rate": 3.9907099606131085e-05, "loss": 0.8686, "step": 131 }, { "epoch": 0.04091595954899454, "grad_norm": 0.4524983952413524, "learning_rate": 3.9905195513541377e-05, "loss": 0.8811, "step": 132 }, { "epoch": 0.04122592893951722, "grad_norm": 0.4957343063646191, "learning_rate": 3.9903272151545956e-05, "loss": 0.8418, "step": 133 }, { "epoch": 0.04153589833003991, "grad_norm": 0.5649586751535758, "learning_rate": 3.9901329522006744e-05, "loss": 0.8672, "step": 134 }, { "epoch": 0.0418458677205626, "grad_norm": 0.5521927318361664, "learning_rate": 3.9899367626804316e-05, "loss": 0.8456, "step": 135 }, { "epoch": 0.04215583711108528, "grad_norm": 0.5353118817622022, "learning_rate": 3.98973864678379e-05, "loss": 0.8791, "step": 136 }, { "epoch": 0.042465806501607965, "grad_norm": 0.4629340111634766, "learning_rate": 3.98953860470254e-05, "loss": 0.8789, "step": 137 }, { "epoch": 0.04277577589213065, "grad_norm": 0.3705730436004438, "learning_rate": 3.989336636630331e-05, "loss": 0.8734, "step": 138 }, { "epoch": 0.04308574528265334, "grad_norm": 0.3599531762818033, "learning_rate": 3.9891327427626815e-05, "loss": 0.9045, "step": 139 }, { "epoch": 0.043395714673176024, "grad_norm": 0.38047605163602755, "learning_rate": 3.988926923296973e-05, "loss": 0.8476, "step": 140 }, { "epoch": 0.04370568406369871, "grad_norm": 0.4232037083877876, "learning_rate": 3.988719178432451e-05, "loss": 0.8761, "step": 141 }, { "epoch": 0.0440156534542214, "grad_norm": 0.46803250516379286, "learning_rate": 3.988509508370224e-05, "loss": 0.8761, "step": 142 }, { "epoch": 0.04432562284474408, "grad_norm": 0.40823954056946227, "learning_rate": 3.9882979133132654e-05, "loss": 0.8542, "step": 143 }, { "epoch": 0.044635592235266766, "grad_norm": 0.41590910734179626, "learning_rate": 3.988084393466411e-05, "loss": 0.8582, "step": 144 }, { "epoch": 0.04494556162578945, "grad_norm": 0.4332955018074458, "learning_rate": 3.987868949036363e-05, "loss": 0.8375, "step": 145 }, { "epoch": 0.04525553101631214, "grad_norm": 0.3813329334459608, "learning_rate": 3.9876515802316815e-05, "loss": 0.8486, "step": 146 }, { "epoch": 0.045565500406834825, "grad_norm": 0.4838921692496399, "learning_rate": 3.987432287262794e-05, "loss": 0.8519, "step": 147 }, { "epoch": 0.04587546979735751, "grad_norm": 0.3534423405699255, "learning_rate": 3.987211070341988e-05, "loss": 0.8583, "step": 148 }, { "epoch": 0.0461854391878802, "grad_norm": 0.40425864042716586, "learning_rate": 3.986987929683416e-05, "loss": 0.8855, "step": 149 }, { "epoch": 0.04649540857840288, "grad_norm": 0.4073670212713839, "learning_rate": 3.9867628655030894e-05, "loss": 0.8362, "step": 150 }, { "epoch": 0.04680537796892557, "grad_norm": 0.37665089152090453, "learning_rate": 3.986535878018886e-05, "loss": 0.8483, "step": 151 }, { "epoch": 0.04711534735944826, "grad_norm": 0.35466860055687843, "learning_rate": 3.986306967450541e-05, "loss": 0.842, "step": 152 }, { "epoch": 0.04742531674997094, "grad_norm": 0.44046002284907254, "learning_rate": 3.986076134019655e-05, "loss": 0.8786, "step": 153 }, { "epoch": 0.047735286140493625, "grad_norm": 0.3515697672405208, "learning_rate": 3.985843377949687e-05, "loss": 0.848, "step": 154 }, { "epoch": 0.04804525553101631, "grad_norm": 0.41911637061446305, "learning_rate": 3.985608699465959e-05, "loss": 0.8516, "step": 155 }, { "epoch": 0.048355224921539, "grad_norm": 0.4796371577320052, "learning_rate": 3.9853720987956545e-05, "loss": 0.8152, "step": 156 }, { "epoch": 0.048665194312061684, "grad_norm": 0.49883758125334976, "learning_rate": 3.985133576167816e-05, "loss": 0.8447, "step": 157 }, { "epoch": 0.04897516370258437, "grad_norm": 0.45773142049719884, "learning_rate": 3.984893131813348e-05, "loss": 0.8449, "step": 158 }, { "epoch": 0.04928513309310706, "grad_norm": 0.41585979898799236, "learning_rate": 3.984650765965015e-05, "loss": 0.8786, "step": 159 }, { "epoch": 0.04959510248362974, "grad_norm": 0.4034512127871603, "learning_rate": 3.98440647885744e-05, "loss": 0.8303, "step": 160 }, { "epoch": 0.049905071874152426, "grad_norm": 0.3497696899239929, "learning_rate": 3.9841602707271095e-05, "loss": 0.8389, "step": 161 }, { "epoch": 0.05021504126467511, "grad_norm": 0.3348003142633905, "learning_rate": 3.9839121418123666e-05, "loss": 0.8736, "step": 162 }, { "epoch": 0.0505250106551978, "grad_norm": 0.38416793490788237, "learning_rate": 3.9836620923534136e-05, "loss": 0.8899, "step": 163 }, { "epoch": 0.050834980045720485, "grad_norm": 0.45178052904790883, "learning_rate": 3.983410122592316e-05, "loss": 0.838, "step": 164 }, { "epoch": 0.05114494943624317, "grad_norm": 0.43156651563854537, "learning_rate": 3.983156232772992e-05, "loss": 0.8436, "step": 165 }, { "epoch": 0.05145491882676586, "grad_norm": 0.44875757388586124, "learning_rate": 3.982900423141224e-05, "loss": 0.8518, "step": 166 }, { "epoch": 0.05176488821728854, "grad_norm": 0.4882256849356562, "learning_rate": 3.9826426939446505e-05, "loss": 0.8668, "step": 167 }, { "epoch": 0.05207485760781123, "grad_norm": 0.500421546665886, "learning_rate": 3.982383045432769e-05, "loss": 0.8616, "step": 168 }, { "epoch": 0.05238482699833392, "grad_norm": 0.4237864386235901, "learning_rate": 3.982121477856933e-05, "loss": 0.8144, "step": 169 }, { "epoch": 0.0526947963888566, "grad_norm": 0.3749459366720854, "learning_rate": 3.9818579914703576e-05, "loss": 0.8624, "step": 170 }, { "epoch": 0.053004765779379286, "grad_norm": 0.3593817162123949, "learning_rate": 3.9815925865281115e-05, "loss": 0.8438, "step": 171 }, { "epoch": 0.05331473516990197, "grad_norm": 0.38631310463615576, "learning_rate": 3.981325263287123e-05, "loss": 0.8541, "step": 172 }, { "epoch": 0.05362470456042466, "grad_norm": 0.4611445664345047, "learning_rate": 3.9810560220061774e-05, "loss": 0.8635, "step": 173 }, { "epoch": 0.053934673950947344, "grad_norm": 0.6074077437077018, "learning_rate": 3.980784862945915e-05, "loss": 0.8265, "step": 174 }, { "epoch": 0.05424464334147003, "grad_norm": 0.6414773202102408, "learning_rate": 3.980511786368834e-05, "loss": 0.8524, "step": 175 }, { "epoch": 0.05455461273199272, "grad_norm": 0.5399625823887626, "learning_rate": 3.98023679253929e-05, "loss": 0.8598, "step": 176 }, { "epoch": 0.0548645821225154, "grad_norm": 0.4204662461994773, "learning_rate": 3.9799598817234916e-05, "loss": 0.847, "step": 177 }, { "epoch": 0.055174551513038086, "grad_norm": 0.3563442506988686, "learning_rate": 3.979681054189506e-05, "loss": 0.8236, "step": 178 }, { "epoch": 0.05548452090356077, "grad_norm": 0.4804436280516237, "learning_rate": 3.979400310207254e-05, "loss": 0.8889, "step": 179 }, { "epoch": 0.05579449029408346, "grad_norm": 0.5495376130887005, "learning_rate": 3.979117650048512e-05, "loss": 0.8651, "step": 180 }, { "epoch": 0.056104459684606145, "grad_norm": 0.5080484973254404, "learning_rate": 3.9788330739869126e-05, "loss": 0.865, "step": 181 }, { "epoch": 0.05641442907512883, "grad_norm": 0.3866665033089606, "learning_rate": 3.9785465822979425e-05, "loss": 0.845, "step": 182 }, { "epoch": 0.05672439846565152, "grad_norm": 0.37287594336263713, "learning_rate": 3.978258175258942e-05, "loss": 0.8319, "step": 183 }, { "epoch": 0.0570343678561742, "grad_norm": 0.46398630075382347, "learning_rate": 3.9779678531491065e-05, "loss": 0.844, "step": 184 }, { "epoch": 0.05734433724669689, "grad_norm": 0.4413829672230403, "learning_rate": 3.977675616249484e-05, "loss": 0.8487, "step": 185 }, { "epoch": 0.05765430663721957, "grad_norm": 0.36003296311227684, "learning_rate": 3.977381464842978e-05, "loss": 0.8814, "step": 186 }, { "epoch": 0.05796427602774226, "grad_norm": 0.32723304983403834, "learning_rate": 3.977085399214345e-05, "loss": 0.8855, "step": 187 }, { "epoch": 0.058274245418264946, "grad_norm": 0.43273391701291225, "learning_rate": 3.9767874196501925e-05, "loss": 0.8267, "step": 188 }, { "epoch": 0.05858421480878763, "grad_norm": 0.43156508420987205, "learning_rate": 3.9764875264389836e-05, "loss": 0.8456, "step": 189 }, { "epoch": 0.05889418419931032, "grad_norm": 0.3921485367129451, "learning_rate": 3.9761857198710316e-05, "loss": 0.8752, "step": 190 }, { "epoch": 0.059204153589833004, "grad_norm": 0.31455864850808873, "learning_rate": 3.975882000238504e-05, "loss": 0.8722, "step": 191 }, { "epoch": 0.05951412298035569, "grad_norm": 0.3198027893307386, "learning_rate": 3.975576367835419e-05, "loss": 0.8047, "step": 192 }, { "epoch": 0.05982409237087838, "grad_norm": 0.4134366696897723, "learning_rate": 3.975268822957647e-05, "loss": 0.8523, "step": 193 }, { "epoch": 0.06013406176140106, "grad_norm": 0.46912424615552356, "learning_rate": 3.9749593659029096e-05, "loss": 0.8642, "step": 194 }, { "epoch": 0.06044403115192375, "grad_norm": 0.39243517062629535, "learning_rate": 3.9746479969707786e-05, "loss": 0.8759, "step": 195 }, { "epoch": 0.06075400054244643, "grad_norm": 0.33349457710756236, "learning_rate": 3.974334716462679e-05, "loss": 0.8364, "step": 196 }, { "epoch": 0.06106396993296912, "grad_norm": 0.33308553219709414, "learning_rate": 3.9740195246818844e-05, "loss": 0.8128, "step": 197 }, { "epoch": 0.061373939323491805, "grad_norm": 0.4093289411665088, "learning_rate": 3.973702421933518e-05, "loss": 0.8256, "step": 198 }, { "epoch": 0.06168390871401449, "grad_norm": 0.42196790159653785, "learning_rate": 3.9733834085245554e-05, "loss": 0.8372, "step": 199 }, { "epoch": 0.06199387810453718, "grad_norm": 0.4240412451465289, "learning_rate": 3.973062484763819e-05, "loss": 0.8419, "step": 200 }, { "epoch": 0.062303847495059864, "grad_norm": 0.36327092598147415, "learning_rate": 3.972739650961984e-05, "loss": 0.8054, "step": 201 }, { "epoch": 0.06261381688558255, "grad_norm": 0.415763303397204, "learning_rate": 3.9724149074315705e-05, "loss": 0.8096, "step": 202 }, { "epoch": 0.06292378627610523, "grad_norm": 0.47726956728801245, "learning_rate": 3.9720882544869514e-05, "loss": 0.8471, "step": 203 }, { "epoch": 0.06323375566662792, "grad_norm": 0.4601860404661211, "learning_rate": 3.971759692444345e-05, "loss": 0.8266, "step": 204 }, { "epoch": 0.06354372505715061, "grad_norm": 0.45872705383165807, "learning_rate": 3.9714292216218175e-05, "loss": 0.8331, "step": 205 }, { "epoch": 0.06385369444767329, "grad_norm": 0.35464444405071266, "learning_rate": 3.971096842339287e-05, "loss": 0.8438, "step": 206 }, { "epoch": 0.06416366383819598, "grad_norm": 0.37457445058733324, "learning_rate": 3.9707625549185135e-05, "loss": 0.8268, "step": 207 }, { "epoch": 0.06447363322871866, "grad_norm": 0.41856453790501325, "learning_rate": 3.970426359683109e-05, "loss": 0.8109, "step": 208 }, { "epoch": 0.06478360261924135, "grad_norm": 0.3798193609200399, "learning_rate": 3.970088256958529e-05, "loss": 0.8615, "step": 209 }, { "epoch": 0.06509357200976404, "grad_norm": 0.37037934846072873, "learning_rate": 3.969748247072078e-05, "loss": 0.8446, "step": 210 }, { "epoch": 0.06540354140028672, "grad_norm": 0.35143833850063905, "learning_rate": 3.9694063303529055e-05, "loss": 0.8536, "step": 211 }, { "epoch": 0.0657135107908094, "grad_norm": 0.38547840343090495, "learning_rate": 3.969062507132006e-05, "loss": 0.8542, "step": 212 }, { "epoch": 0.0660234801813321, "grad_norm": 0.4333159189646541, "learning_rate": 3.9687167777422226e-05, "loss": 0.8218, "step": 213 }, { "epoch": 0.06633344957185477, "grad_norm": 0.42916262885738543, "learning_rate": 3.96836914251824e-05, "loss": 0.8361, "step": 214 }, { "epoch": 0.06664341896237747, "grad_norm": 0.36533651028271663, "learning_rate": 3.96801960179659e-05, "loss": 0.8219, "step": 215 }, { "epoch": 0.06695338835290016, "grad_norm": 0.34433161239171234, "learning_rate": 3.9676681559156495e-05, "loss": 0.8262, "step": 216 }, { "epoch": 0.06726335774342283, "grad_norm": 0.42246591407753986, "learning_rate": 3.967314805215638e-05, "loss": 0.7822, "step": 217 }, { "epoch": 0.06757332713394552, "grad_norm": 0.440877048508625, "learning_rate": 3.966959550038621e-05, "loss": 0.864, "step": 218 }, { "epoch": 0.06788329652446821, "grad_norm": 0.36898754741874734, "learning_rate": 3.966602390728505e-05, "loss": 0.818, "step": 219 }, { "epoch": 0.06819326591499089, "grad_norm": 0.3744045716394384, "learning_rate": 3.9662433276310427e-05, "loss": 0.8388, "step": 220 }, { "epoch": 0.06850323530551358, "grad_norm": 0.38145976555830197, "learning_rate": 3.965882361093827e-05, "loss": 0.8448, "step": 221 }, { "epoch": 0.06881320469603627, "grad_norm": 0.3905663643402046, "learning_rate": 3.965519491466296e-05, "loss": 0.838, "step": 222 }, { "epoch": 0.06912317408655895, "grad_norm": 0.4359452985403786, "learning_rate": 3.965154719099729e-05, "loss": 0.8639, "step": 223 }, { "epoch": 0.06943314347708164, "grad_norm": 0.42921409442006764, "learning_rate": 3.9647880443472464e-05, "loss": 0.8839, "step": 224 }, { "epoch": 0.06974311286760432, "grad_norm": 0.4101075800272988, "learning_rate": 3.964419467563811e-05, "loss": 0.8291, "step": 225 }, { "epoch": 0.07005308225812701, "grad_norm": 0.4328578349246405, "learning_rate": 3.964048989106229e-05, "loss": 0.836, "step": 226 }, { "epoch": 0.0703630516486497, "grad_norm": 0.479500094801105, "learning_rate": 3.9636766093331416e-05, "loss": 0.8217, "step": 227 }, { "epoch": 0.07067302103917238, "grad_norm": 0.4588798653544195, "learning_rate": 3.963302328605037e-05, "loss": 0.8129, "step": 228 }, { "epoch": 0.07098299042969507, "grad_norm": 0.4756840260719557, "learning_rate": 3.962926147284242e-05, "loss": 0.8442, "step": 229 }, { "epoch": 0.07129295982021776, "grad_norm": 0.39207558337339227, "learning_rate": 3.9625480657349196e-05, "loss": 0.8301, "step": 230 }, { "epoch": 0.07160292921074043, "grad_norm": 0.3721572182635624, "learning_rate": 3.962168084323077e-05, "loss": 0.801, "step": 231 }, { "epoch": 0.07191289860126313, "grad_norm": 0.376361043333028, "learning_rate": 3.9617862034165584e-05, "loss": 0.8324, "step": 232 }, { "epoch": 0.07222286799178582, "grad_norm": 0.3684518336466041, "learning_rate": 3.9614024233850465e-05, "loss": 0.8347, "step": 233 }, { "epoch": 0.0725328373823085, "grad_norm": 0.42363042348470564, "learning_rate": 3.961016744600064e-05, "loss": 0.8437, "step": 234 }, { "epoch": 0.07284280677283118, "grad_norm": 0.41037515385576345, "learning_rate": 3.960629167434969e-05, "loss": 0.828, "step": 235 }, { "epoch": 0.07315277616335387, "grad_norm": 0.4117224551176362, "learning_rate": 3.960239692264961e-05, "loss": 0.8407, "step": 236 }, { "epoch": 0.07346274555387655, "grad_norm": 0.3451149945020082, "learning_rate": 3.959848319467075e-05, "loss": 0.8293, "step": 237 }, { "epoch": 0.07377271494439924, "grad_norm": 0.30416522364008625, "learning_rate": 3.959455049420181e-05, "loss": 0.7972, "step": 238 }, { "epoch": 0.07408268433492193, "grad_norm": 0.38410037406568764, "learning_rate": 3.9590598825049896e-05, "loss": 0.8528, "step": 239 }, { "epoch": 0.07439265372544461, "grad_norm": 0.35194676031747457, "learning_rate": 3.958662819104045e-05, "loss": 0.8244, "step": 240 }, { "epoch": 0.0747026231159673, "grad_norm": 0.39974551262615, "learning_rate": 3.9582638596017275e-05, "loss": 0.8614, "step": 241 }, { "epoch": 0.07501259250648998, "grad_norm": 0.4969381693842004, "learning_rate": 3.9578630043842543e-05, "loss": 0.7953, "step": 242 }, { "epoch": 0.07532256189701267, "grad_norm": 0.5047696746624432, "learning_rate": 3.957460253839677e-05, "loss": 0.8262, "step": 243 }, { "epoch": 0.07563253128753536, "grad_norm": 0.48233014936073026, "learning_rate": 3.957055608357881e-05, "loss": 0.8471, "step": 244 }, { "epoch": 0.07594250067805804, "grad_norm": 0.3718813653838518, "learning_rate": 3.9566490683305884e-05, "loss": 0.8395, "step": 245 }, { "epoch": 0.07625247006858073, "grad_norm": 0.3048292504211336, "learning_rate": 3.956240634151352e-05, "loss": 0.8105, "step": 246 }, { "epoch": 0.07656243945910342, "grad_norm": 0.4220466950123056, "learning_rate": 3.955830306215563e-05, "loss": 0.8039, "step": 247 }, { "epoch": 0.0768724088496261, "grad_norm": 0.5113493779921835, "learning_rate": 3.955418084920441e-05, "loss": 0.8194, "step": 248 }, { "epoch": 0.07718237824014879, "grad_norm": 0.4847599262138346, "learning_rate": 3.955003970665041e-05, "loss": 0.8458, "step": 249 }, { "epoch": 0.07749234763067148, "grad_norm": 0.37215998337358813, "learning_rate": 3.9545879638502495e-05, "loss": 0.8398, "step": 250 }, { "epoch": 0.07780231702119415, "grad_norm": 0.35338728154406246, "learning_rate": 3.9541700648787876e-05, "loss": 0.8241, "step": 251 }, { "epoch": 0.07811228641171684, "grad_norm": 0.5723708921940028, "learning_rate": 3.953750274155205e-05, "loss": 0.8389, "step": 252 }, { "epoch": 0.07842225580223953, "grad_norm": 0.47277446271128515, "learning_rate": 3.953328592085883e-05, "loss": 0.8456, "step": 253 }, { "epoch": 0.07873222519276221, "grad_norm": 0.44198953448546885, "learning_rate": 3.952905019079036e-05, "loss": 0.8228, "step": 254 }, { "epoch": 0.0790421945832849, "grad_norm": 0.4152930937434394, "learning_rate": 3.952479555544708e-05, "loss": 0.821, "step": 255 }, { "epoch": 0.07935216397380758, "grad_norm": 0.4126038363426425, "learning_rate": 3.9520522018947716e-05, "loss": 0.8731, "step": 256 }, { "epoch": 0.07966213336433027, "grad_norm": 0.3135632515346253, "learning_rate": 3.9516229585429314e-05, "loss": 0.8038, "step": 257 }, { "epoch": 0.07997210275485296, "grad_norm": 0.39123093675937815, "learning_rate": 3.95119182590472e-05, "loss": 0.8371, "step": 258 }, { "epoch": 0.08028207214537564, "grad_norm": 0.4137040986320039, "learning_rate": 3.9507588043974984e-05, "loss": 0.8167, "step": 259 }, { "epoch": 0.08059204153589833, "grad_norm": 0.3740886967094645, "learning_rate": 3.9503238944404584e-05, "loss": 0.7994, "step": 260 }, { "epoch": 0.08090201092642102, "grad_norm": 0.288256579156756, "learning_rate": 3.9498870964546185e-05, "loss": 0.7948, "step": 261 }, { "epoch": 0.0812119803169437, "grad_norm": 0.308042871656003, "learning_rate": 3.949448410862824e-05, "loss": 0.8093, "step": 262 }, { "epoch": 0.08152194970746639, "grad_norm": 0.3394635846264843, "learning_rate": 3.9490078380897485e-05, "loss": 0.8167, "step": 263 }, { "epoch": 0.08183191909798908, "grad_norm": 0.3449046710897922, "learning_rate": 3.948565378561894e-05, "loss": 0.8138, "step": 264 }, { "epoch": 0.08214188848851176, "grad_norm": 0.33564453052475696, "learning_rate": 3.948121032707585e-05, "loss": 0.8224, "step": 265 }, { "epoch": 0.08245185787903445, "grad_norm": 0.31550152714320445, "learning_rate": 3.947674800956977e-05, "loss": 0.8179, "step": 266 }, { "epoch": 0.08276182726955714, "grad_norm": 0.3384431574597758, "learning_rate": 3.947226683742048e-05, "loss": 0.8118, "step": 267 }, { "epoch": 0.08307179666007981, "grad_norm": 0.36628757265660533, "learning_rate": 3.9467766814966e-05, "loss": 0.8247, "step": 268 }, { "epoch": 0.0833817660506025, "grad_norm": 0.36611928338366795, "learning_rate": 3.9463247946562646e-05, "loss": 0.8303, "step": 269 }, { "epoch": 0.0836917354411252, "grad_norm": 0.3096568872243093, "learning_rate": 3.945871023658494e-05, "loss": 0.7999, "step": 270 }, { "epoch": 0.08400170483164787, "grad_norm": 0.3023450141656687, "learning_rate": 3.9454153689425646e-05, "loss": 0.8513, "step": 271 }, { "epoch": 0.08431167422217056, "grad_norm": 0.322914843094919, "learning_rate": 3.944957830949577e-05, "loss": 0.8188, "step": 272 }, { "epoch": 0.08462164361269324, "grad_norm": 0.3628124589007999, "learning_rate": 3.9444984101224566e-05, "loss": 0.8561, "step": 273 }, { "epoch": 0.08493161300321593, "grad_norm": 0.34688561902295556, "learning_rate": 3.944037106905948e-05, "loss": 0.7911, "step": 274 }, { "epoch": 0.08524158239373862, "grad_norm": 0.33691937807388156, "learning_rate": 3.9435739217466215e-05, "loss": 0.8523, "step": 275 }, { "epoch": 0.0855515517842613, "grad_norm": 0.3451637398425159, "learning_rate": 3.943108855092868e-05, "loss": 0.8141, "step": 276 }, { "epoch": 0.08586152117478399, "grad_norm": 0.3380506443506499, "learning_rate": 3.9426419073948976e-05, "loss": 0.819, "step": 277 }, { "epoch": 0.08617149056530668, "grad_norm": 0.40303686858228366, "learning_rate": 3.942173079104744e-05, "loss": 0.8556, "step": 278 }, { "epoch": 0.08648145995582936, "grad_norm": 0.3721194018033922, "learning_rate": 3.941702370676261e-05, "loss": 0.8076, "step": 279 }, { "epoch": 0.08679142934635205, "grad_norm": 0.3665427755790044, "learning_rate": 3.941229782565123e-05, "loss": 0.8452, "step": 280 }, { "epoch": 0.08710139873687474, "grad_norm": 0.3561828252993337, "learning_rate": 3.940755315228821e-05, "loss": 0.8456, "step": 281 }, { "epoch": 0.08741136812739742, "grad_norm": 0.35809222900046134, "learning_rate": 3.9402789691266703e-05, "loss": 0.8374, "step": 282 }, { "epoch": 0.0877213375179201, "grad_norm": 0.35517538672143156, "learning_rate": 3.9398007447198e-05, "loss": 0.8002, "step": 283 }, { "epoch": 0.0880313069084428, "grad_norm": 0.36148254021238113, "learning_rate": 3.93932064247116e-05, "loss": 0.7886, "step": 284 }, { "epoch": 0.08834127629896547, "grad_norm": 0.3319573641996667, "learning_rate": 3.938838662845518e-05, "loss": 0.8054, "step": 285 }, { "epoch": 0.08865124568948816, "grad_norm": 0.30390510952886224, "learning_rate": 3.938354806309459e-05, "loss": 0.8227, "step": 286 }, { "epoch": 0.08896121508001086, "grad_norm": 0.34174698639247314, "learning_rate": 3.937869073331384e-05, "loss": 0.8099, "step": 287 }, { "epoch": 0.08927118447053353, "grad_norm": 0.31664797256574495, "learning_rate": 3.937381464381511e-05, "loss": 0.8094, "step": 288 }, { "epoch": 0.08958115386105622, "grad_norm": 0.3017155679803345, "learning_rate": 3.9368919799318755e-05, "loss": 0.8106, "step": 289 }, { "epoch": 0.0898911232515789, "grad_norm": 0.33199339885822565, "learning_rate": 3.936400620456326e-05, "loss": 0.8389, "step": 290 }, { "epoch": 0.09020109264210159, "grad_norm": 0.6145127568166471, "learning_rate": 3.935907386430529e-05, "loss": 0.8339, "step": 291 }, { "epoch": 0.09051106203262428, "grad_norm": 0.4019054079985019, "learning_rate": 3.9354122783319634e-05, "loss": 0.8397, "step": 292 }, { "epoch": 0.09082103142314696, "grad_norm": 0.361053882333466, "learning_rate": 3.934915296639923e-05, "loss": 0.7694, "step": 293 }, { "epoch": 0.09113100081366965, "grad_norm": 0.3245138492875535, "learning_rate": 3.934416441835515e-05, "loss": 0.8409, "step": 294 }, { "epoch": 0.09144097020419234, "grad_norm": 0.31454628468311946, "learning_rate": 3.9339157144016614e-05, "loss": 0.7989, "step": 295 }, { "epoch": 0.09175093959471502, "grad_norm": 0.5430302402295328, "learning_rate": 3.9334131148230954e-05, "loss": 0.8857, "step": 296 }, { "epoch": 0.09206090898523771, "grad_norm": 0.5930830812977453, "learning_rate": 3.9329086435863637e-05, "loss": 0.8188, "step": 297 }, { "epoch": 0.0923708783757604, "grad_norm": 0.33256004320995924, "learning_rate": 3.932402301179823e-05, "loss": 0.8198, "step": 298 }, { "epoch": 0.09268084776628308, "grad_norm": 0.3428517617662639, "learning_rate": 3.9318940880936434e-05, "loss": 0.7842, "step": 299 }, { "epoch": 0.09299081715680577, "grad_norm": 0.3425347804007228, "learning_rate": 3.931384004819805e-05, "loss": 0.7936, "step": 300 }, { "epoch": 0.09330078654732846, "grad_norm": 0.40021595328947496, "learning_rate": 3.9308720518521e-05, "loss": 0.8459, "step": 301 }, { "epoch": 0.09361075593785113, "grad_norm": 0.3992011710299031, "learning_rate": 3.930358229686126e-05, "loss": 0.8382, "step": 302 }, { "epoch": 0.09392072532837382, "grad_norm": 0.3715097769606661, "learning_rate": 3.929842538819297e-05, "loss": 0.7758, "step": 303 }, { "epoch": 0.09423069471889652, "grad_norm": 0.34253919057660137, "learning_rate": 3.929324979750829e-05, "loss": 0.824, "step": 304 }, { "epoch": 0.09454066410941919, "grad_norm": 0.36190892321409573, "learning_rate": 3.928805552981752e-05, "loss": 0.8283, "step": 305 }, { "epoch": 0.09485063349994188, "grad_norm": 0.37250185909875827, "learning_rate": 3.9282842590149e-05, "loss": 0.799, "step": 306 }, { "epoch": 0.09516060289046456, "grad_norm": 0.3253796939024287, "learning_rate": 3.927761098354918e-05, "loss": 0.8013, "step": 307 }, { "epoch": 0.09547057228098725, "grad_norm": 0.3067453388038983, "learning_rate": 3.927236071508256e-05, "loss": 0.7978, "step": 308 }, { "epoch": 0.09578054167150994, "grad_norm": 0.36258162512526093, "learning_rate": 3.92670917898317e-05, "loss": 0.8008, "step": 309 }, { "epoch": 0.09609051106203262, "grad_norm": 0.37097831725394625, "learning_rate": 3.926180421289724e-05, "loss": 0.8247, "step": 310 }, { "epoch": 0.09640048045255531, "grad_norm": 0.35159644520913824, "learning_rate": 3.925649798939787e-05, "loss": 0.7917, "step": 311 }, { "epoch": 0.096710449843078, "grad_norm": 0.33757339398037617, "learning_rate": 3.925117312447032e-05, "loss": 0.8256, "step": 312 }, { "epoch": 0.09702041923360068, "grad_norm": 0.3381259484905048, "learning_rate": 3.924582962326938e-05, "loss": 0.8136, "step": 313 }, { "epoch": 0.09733038862412337, "grad_norm": 0.3428784885239197, "learning_rate": 3.924046749096787e-05, "loss": 0.834, "step": 314 }, { "epoch": 0.09764035801464606, "grad_norm": 0.289338460471925, "learning_rate": 3.923508673275666e-05, "loss": 0.792, "step": 315 }, { "epoch": 0.09795032740516874, "grad_norm": 0.2751201355788077, "learning_rate": 3.922968735384463e-05, "loss": 0.8391, "step": 316 }, { "epoch": 0.09826029679569143, "grad_norm": 0.331451295251653, "learning_rate": 3.9224269359458704e-05, "loss": 0.8114, "step": 317 }, { "epoch": 0.09857026618621412, "grad_norm": 0.33404602382146853, "learning_rate": 3.921883275484382e-05, "loss": 0.8015, "step": 318 }, { "epoch": 0.0988802355767368, "grad_norm": 0.28574901954262094, "learning_rate": 3.9213377545262935e-05, "loss": 0.8348, "step": 319 }, { "epoch": 0.09919020496725948, "grad_norm": 0.4479822599056564, "learning_rate": 3.9207903735997014e-05, "loss": 0.8115, "step": 320 }, { "epoch": 0.09950017435778218, "grad_norm": 0.32246453024637073, "learning_rate": 3.920241133234502e-05, "loss": 0.7964, "step": 321 }, { "epoch": 0.09981014374830485, "grad_norm": 0.3073373545966875, "learning_rate": 3.919690033962393e-05, "loss": 0.7669, "step": 322 }, { "epoch": 0.10012011313882754, "grad_norm": 0.36835212725017336, "learning_rate": 3.919137076316871e-05, "loss": 0.7923, "step": 323 }, { "epoch": 0.10043008252935022, "grad_norm": 0.3531777912656876, "learning_rate": 3.918582260833233e-05, "loss": 0.7908, "step": 324 }, { "epoch": 0.10074005191987291, "grad_norm": 0.4692546791045469, "learning_rate": 3.9180255880485706e-05, "loss": 0.7944, "step": 325 }, { "epoch": 0.1010500213103956, "grad_norm": 0.3573876814703193, "learning_rate": 3.917467058501778e-05, "loss": 0.8081, "step": 326 }, { "epoch": 0.10135999070091828, "grad_norm": 0.348582428458398, "learning_rate": 3.916906672733544e-05, "loss": 0.7728, "step": 327 }, { "epoch": 0.10166996009144097, "grad_norm": 0.3352864616350136, "learning_rate": 3.916344431286355e-05, "loss": 0.7852, "step": 328 }, { "epoch": 0.10197992948196366, "grad_norm": 0.322461055192433, "learning_rate": 3.9157803347044945e-05, "loss": 0.8208, "step": 329 }, { "epoch": 0.10228989887248634, "grad_norm": 0.2627681391660773, "learning_rate": 3.915214383534041e-05, "loss": 0.7991, "step": 330 }, { "epoch": 0.10259986826300903, "grad_norm": 0.30061566430430525, "learning_rate": 3.9146465783228686e-05, "loss": 0.8156, "step": 331 }, { "epoch": 0.10290983765353172, "grad_norm": 0.31300523523902696, "learning_rate": 3.914076919620647e-05, "loss": 0.7869, "step": 332 }, { "epoch": 0.1032198070440544, "grad_norm": 0.3022412317306906, "learning_rate": 3.913505407978838e-05, "loss": 0.7973, "step": 333 }, { "epoch": 0.10352977643457709, "grad_norm": 0.3481369686431684, "learning_rate": 3.9129320439507005e-05, "loss": 0.8022, "step": 334 }, { "epoch": 0.10383974582509978, "grad_norm": 0.35121785335306877, "learning_rate": 3.912356828091284e-05, "loss": 0.7814, "step": 335 }, { "epoch": 0.10414971521562245, "grad_norm": 0.3322762608243955, "learning_rate": 3.9117797609574306e-05, "loss": 0.8158, "step": 336 }, { "epoch": 0.10445968460614515, "grad_norm": 0.29414746160872685, "learning_rate": 3.911200843107776e-05, "loss": 0.8303, "step": 337 }, { "epoch": 0.10476965399666784, "grad_norm": 0.31497828084820084, "learning_rate": 3.910620075102747e-05, "loss": 0.8045, "step": 338 }, { "epoch": 0.10507962338719051, "grad_norm": 0.3340020762810428, "learning_rate": 3.910037457504562e-05, "loss": 0.8066, "step": 339 }, { "epoch": 0.1053895927777132, "grad_norm": 0.34848879258729915, "learning_rate": 3.909452990877229e-05, "loss": 0.7934, "step": 340 }, { "epoch": 0.10569956216823588, "grad_norm": 0.37861459088350863, "learning_rate": 3.908866675786544e-05, "loss": 0.7923, "step": 341 }, { "epoch": 0.10600953155875857, "grad_norm": 0.36622763310918793, "learning_rate": 3.908278512800098e-05, "loss": 0.8174, "step": 342 }, { "epoch": 0.10631950094928126, "grad_norm": 0.36448088235966714, "learning_rate": 3.907688502487266e-05, "loss": 0.7931, "step": 343 }, { "epoch": 0.10662947033980394, "grad_norm": 0.3608902447657813, "learning_rate": 3.9070966454192124e-05, "loss": 0.8086, "step": 344 }, { "epoch": 0.10693943973032663, "grad_norm": 0.3024378352348953, "learning_rate": 3.906502942168891e-05, "loss": 0.8181, "step": 345 }, { "epoch": 0.10724940912084932, "grad_norm": 0.32882339001289335, "learning_rate": 3.90590739331104e-05, "loss": 0.8195, "step": 346 }, { "epoch": 0.107559378511372, "grad_norm": 0.33608193287703, "learning_rate": 3.905309999422187e-05, "loss": 0.8196, "step": 347 }, { "epoch": 0.10786934790189469, "grad_norm": 0.32692036502064814, "learning_rate": 3.904710761080645e-05, "loss": 0.7934, "step": 348 }, { "epoch": 0.10817931729241738, "grad_norm": 0.3158989756391777, "learning_rate": 3.904109678866511e-05, "loss": 0.7775, "step": 349 }, { "epoch": 0.10848928668294006, "grad_norm": 0.3194695464235128, "learning_rate": 3.903506753361669e-05, "loss": 0.8201, "step": 350 }, { "epoch": 0.10879925607346275, "grad_norm": 0.2953812489450616, "learning_rate": 3.902901985149786e-05, "loss": 0.7892, "step": 351 }, { "epoch": 0.10910922546398544, "grad_norm": 0.31751516650548717, "learning_rate": 3.902295374816314e-05, "loss": 0.7739, "step": 352 }, { "epoch": 0.10941919485450811, "grad_norm": 0.3752944694038208, "learning_rate": 3.901686922948487e-05, "loss": 0.7875, "step": 353 }, { "epoch": 0.1097291642450308, "grad_norm": 0.3690036525480242, "learning_rate": 3.9010766301353224e-05, "loss": 0.8173, "step": 354 }, { "epoch": 0.1100391336355535, "grad_norm": 0.32683193232247276, "learning_rate": 3.9004644969676206e-05, "loss": 0.7763, "step": 355 }, { "epoch": 0.11034910302607617, "grad_norm": 0.3535902254742443, "learning_rate": 3.899850524037962e-05, "loss": 0.7852, "step": 356 }, { "epoch": 0.11065907241659886, "grad_norm": 0.35827198769523827, "learning_rate": 3.899234711940707e-05, "loss": 0.8052, "step": 357 }, { "epoch": 0.11096904180712154, "grad_norm": 0.5922539253296351, "learning_rate": 3.898617061272002e-05, "loss": 0.7984, "step": 358 }, { "epoch": 0.11127901119764423, "grad_norm": 0.4136473529324024, "learning_rate": 3.8979975726297665e-05, "loss": 0.8096, "step": 359 }, { "epoch": 0.11158898058816692, "grad_norm": 0.36156195473225594, "learning_rate": 3.8973762466137015e-05, "loss": 0.7734, "step": 360 }, { "epoch": 0.1118989499786896, "grad_norm": 0.3230998370395452, "learning_rate": 3.89675308382529e-05, "loss": 0.8062, "step": 361 }, { "epoch": 0.11220891936921229, "grad_norm": 0.3243934882613732, "learning_rate": 3.8961280848677886e-05, "loss": 0.7937, "step": 362 }, { "epoch": 0.11251888875973498, "grad_norm": 0.3531111157244753, "learning_rate": 3.895501250346233e-05, "loss": 0.7734, "step": 363 }, { "epoch": 0.11282885815025766, "grad_norm": 0.40290444249949187, "learning_rate": 3.894872580867437e-05, "loss": 0.7938, "step": 364 }, { "epoch": 0.11313882754078035, "grad_norm": 0.3971442540229998, "learning_rate": 3.894242077039991e-05, "loss": 0.8113, "step": 365 }, { "epoch": 0.11344879693130304, "grad_norm": 0.3322370455825724, "learning_rate": 3.893609739474257e-05, "loss": 0.799, "step": 366 }, { "epoch": 0.11375876632182572, "grad_norm": 0.3358897044388768, "learning_rate": 3.8929755687823765e-05, "loss": 0.7777, "step": 367 }, { "epoch": 0.1140687357123484, "grad_norm": 0.34385937086335766, "learning_rate": 3.8923395655782646e-05, "loss": 0.8298, "step": 368 }, { "epoch": 0.1143787051028711, "grad_norm": 0.36402700356206213, "learning_rate": 3.8917017304776095e-05, "loss": 0.7976, "step": 369 }, { "epoch": 0.11468867449339377, "grad_norm": 0.3847139642229183, "learning_rate": 3.891062064097874e-05, "loss": 0.8001, "step": 370 }, { "epoch": 0.11499864388391647, "grad_norm": 0.34340449692134745, "learning_rate": 3.890420567058292e-05, "loss": 0.7848, "step": 371 }, { "epoch": 0.11530861327443914, "grad_norm": 0.3750712117328107, "learning_rate": 3.889777239979871e-05, "loss": 0.7772, "step": 372 }, { "epoch": 0.11561858266496183, "grad_norm": 0.37723047588279923, "learning_rate": 3.8891320834853894e-05, "loss": 0.8464, "step": 373 }, { "epoch": 0.11592855205548452, "grad_norm": 0.36897223460083184, "learning_rate": 3.8884850981993974e-05, "loss": 0.8004, "step": 374 }, { "epoch": 0.1162385214460072, "grad_norm": 0.28354726419094856, "learning_rate": 3.8878362847482145e-05, "loss": 0.7961, "step": 375 }, { "epoch": 0.11654849083652989, "grad_norm": 0.34398838273051885, "learning_rate": 3.887185643759931e-05, "loss": 0.8289, "step": 376 }, { "epoch": 0.11685846022705258, "grad_norm": 0.32823802676869496, "learning_rate": 3.886533175864405e-05, "loss": 0.7687, "step": 377 }, { "epoch": 0.11716842961757526, "grad_norm": 0.615864874192243, "learning_rate": 3.8858788816932644e-05, "loss": 0.8028, "step": 378 }, { "epoch": 0.11747839900809795, "grad_norm": 0.33940474793629927, "learning_rate": 3.885222761879905e-05, "loss": 0.7954, "step": 379 }, { "epoch": 0.11778836839862064, "grad_norm": 0.30747427427392093, "learning_rate": 3.884564817059489e-05, "loss": 0.8224, "step": 380 }, { "epoch": 0.11809833778914332, "grad_norm": 0.2717345657270449, "learning_rate": 3.883905047868946e-05, "loss": 0.7895, "step": 381 }, { "epoch": 0.11840830717966601, "grad_norm": 0.3430486539762534, "learning_rate": 3.8832434549469715e-05, "loss": 0.8088, "step": 382 }, { "epoch": 0.1187182765701887, "grad_norm": 0.44122989570609084, "learning_rate": 3.882580038934027e-05, "loss": 0.8273, "step": 383 }, { "epoch": 0.11902824596071138, "grad_norm": 0.29257360025637374, "learning_rate": 3.881914800472339e-05, "loss": 0.8229, "step": 384 }, { "epoch": 0.11933821535123407, "grad_norm": 0.3128647765123276, "learning_rate": 3.881247740205895e-05, "loss": 0.7987, "step": 385 }, { "epoch": 0.11964818474175676, "grad_norm": 0.3454966571542469, "learning_rate": 3.880578858780451e-05, "loss": 0.831, "step": 386 }, { "epoch": 0.11995815413227943, "grad_norm": 0.39680282752751533, "learning_rate": 3.879908156843524e-05, "loss": 0.7796, "step": 387 }, { "epoch": 0.12026812352280213, "grad_norm": 0.9501438267576771, "learning_rate": 3.879235635044392e-05, "loss": 0.7835, "step": 388 }, { "epoch": 0.1205780929133248, "grad_norm": 0.3522289002753077, "learning_rate": 3.878561294034096e-05, "loss": 0.8171, "step": 389 }, { "epoch": 0.1208880623038475, "grad_norm": 0.30707055644219183, "learning_rate": 3.877885134465439e-05, "loss": 0.7753, "step": 390 }, { "epoch": 0.12119803169437018, "grad_norm": 0.39507607108323134, "learning_rate": 3.877207156992982e-05, "loss": 0.792, "step": 391 }, { "epoch": 0.12150800108489286, "grad_norm": 0.4571958652755779, "learning_rate": 3.876527362273048e-05, "loss": 0.8178, "step": 392 }, { "epoch": 0.12181797047541555, "grad_norm": 0.4021966006131574, "learning_rate": 3.875845750963718e-05, "loss": 0.8227, "step": 393 }, { "epoch": 0.12212793986593824, "grad_norm": 0.4190273619658639, "learning_rate": 3.8751623237248324e-05, "loss": 0.8358, "step": 394 }, { "epoch": 0.12243790925646092, "grad_norm": 0.3393591074779922, "learning_rate": 3.87447708121799e-05, "loss": 0.767, "step": 395 }, { "epoch": 0.12274787864698361, "grad_norm": 0.4426875109679626, "learning_rate": 3.873790024106544e-05, "loss": 0.759, "step": 396 }, { "epoch": 0.1230578480375063, "grad_norm": 0.4447852606641024, "learning_rate": 3.8731011530556084e-05, "loss": 0.7749, "step": 397 }, { "epoch": 0.12336781742802898, "grad_norm": 0.3422600246951877, "learning_rate": 3.87241046873205e-05, "loss": 0.7841, "step": 398 }, { "epoch": 0.12367778681855167, "grad_norm": 0.2972370548064978, "learning_rate": 3.8717179718044925e-05, "loss": 0.8134, "step": 399 }, { "epoch": 0.12398775620907436, "grad_norm": 0.3392199467688266, "learning_rate": 3.871023662943313e-05, "loss": 0.8129, "step": 400 }, { "epoch": 0.12429772559959704, "grad_norm": 0.3122405862355022, "learning_rate": 3.8703275428206445e-05, "loss": 0.8196, "step": 401 }, { "epoch": 0.12460769499011973, "grad_norm": 0.3144251527194966, "learning_rate": 3.8696296121103716e-05, "loss": 0.7868, "step": 402 }, { "epoch": 0.12491766438064242, "grad_norm": 0.31133343627024523, "learning_rate": 3.868929871488133e-05, "loss": 0.7947, "step": 403 }, { "epoch": 0.1252276337711651, "grad_norm": 0.34219946820128194, "learning_rate": 3.868228321631319e-05, "loss": 0.7938, "step": 404 }, { "epoch": 0.12553760316168777, "grad_norm": 0.3559844144746437, "learning_rate": 3.867524963219072e-05, "loss": 0.7974, "step": 405 }, { "epoch": 0.12584757255221046, "grad_norm": 0.3396630859231934, "learning_rate": 3.8668197969322816e-05, "loss": 0.809, "step": 406 }, { "epoch": 0.12615754194273315, "grad_norm": 0.28367335740921773, "learning_rate": 3.866112823453594e-05, "loss": 0.7976, "step": 407 }, { "epoch": 0.12646751133325584, "grad_norm": 0.3098784828921308, "learning_rate": 3.8654040434674e-05, "loss": 0.7963, "step": 408 }, { "epoch": 0.12677748072377854, "grad_norm": 0.3521295543246585, "learning_rate": 3.86469345765984e-05, "loss": 0.7952, "step": 409 }, { "epoch": 0.12708745011430123, "grad_norm": 0.30061256240230483, "learning_rate": 3.8639810667188035e-05, "loss": 0.7986, "step": 410 }, { "epoch": 0.1273974195048239, "grad_norm": 0.2977947222541728, "learning_rate": 3.863266871333927e-05, "loss": 0.8051, "step": 411 }, { "epoch": 0.12770738889534658, "grad_norm": 0.37139281142871644, "learning_rate": 3.8625508721965944e-05, "loss": 0.8127, "step": 412 }, { "epoch": 0.12801735828586927, "grad_norm": 0.3755739486648549, "learning_rate": 3.8618330699999335e-05, "loss": 0.8144, "step": 413 }, { "epoch": 0.12832732767639196, "grad_norm": 0.29910218207454947, "learning_rate": 3.8611134654388205e-05, "loss": 0.7853, "step": 414 }, { "epoch": 0.12863729706691465, "grad_norm": 0.2763764996059984, "learning_rate": 3.860392059209876e-05, "loss": 0.8117, "step": 415 }, { "epoch": 0.12894726645743732, "grad_norm": 0.5470451971649547, "learning_rate": 3.859668852011462e-05, "loss": 0.8282, "step": 416 }, { "epoch": 0.12925723584796, "grad_norm": 0.4273472014667062, "learning_rate": 3.858943844543687e-05, "loss": 0.8008, "step": 417 }, { "epoch": 0.1295672052384827, "grad_norm": 0.38694135383141276, "learning_rate": 3.8582170375084006e-05, "loss": 0.8109, "step": 418 }, { "epoch": 0.1298771746290054, "grad_norm": 0.31084621278982616, "learning_rate": 3.857488431609195e-05, "loss": 0.7894, "step": 419 }, { "epoch": 0.13018714401952808, "grad_norm": 0.5473732488621978, "learning_rate": 3.8567580275514046e-05, "loss": 0.7738, "step": 420 }, { "epoch": 0.13049711341005077, "grad_norm": 0.3478835008128649, "learning_rate": 3.856025826042102e-05, "loss": 0.8163, "step": 421 }, { "epoch": 0.13080708280057343, "grad_norm": 0.27911071126556497, "learning_rate": 3.855291827790104e-05, "loss": 0.7669, "step": 422 }, { "epoch": 0.13111705219109612, "grad_norm": 0.2905959307919688, "learning_rate": 3.854556033505961e-05, "loss": 0.7687, "step": 423 }, { "epoch": 0.1314270215816188, "grad_norm": 0.3395528592486997, "learning_rate": 3.853818443901968e-05, "loss": 0.7849, "step": 424 }, { "epoch": 0.1317369909721415, "grad_norm": 0.31974012152414005, "learning_rate": 3.8530790596921546e-05, "loss": 0.7747, "step": 425 }, { "epoch": 0.1320469603626642, "grad_norm": 0.32214383752012915, "learning_rate": 3.852337881592289e-05, "loss": 0.8065, "step": 426 }, { "epoch": 0.13235692975318689, "grad_norm": 0.2717032424230823, "learning_rate": 3.8515949103198736e-05, "loss": 0.7579, "step": 427 }, { "epoch": 0.13266689914370955, "grad_norm": 0.5380885649772502, "learning_rate": 3.8508501465941496e-05, "loss": 0.8002, "step": 428 }, { "epoch": 0.13297686853423224, "grad_norm": 0.30375003064750355, "learning_rate": 3.850103591136093e-05, "loss": 0.8095, "step": 429 }, { "epoch": 0.13328683792475493, "grad_norm": 0.3259668941871077, "learning_rate": 3.849355244668413e-05, "loss": 0.829, "step": 430 }, { "epoch": 0.13359680731527762, "grad_norm": 0.32962715665674647, "learning_rate": 3.848605107915553e-05, "loss": 0.8097, "step": 431 }, { "epoch": 0.1339067767058003, "grad_norm": 0.3303112272405804, "learning_rate": 3.8478531816036894e-05, "loss": 0.7861, "step": 432 }, { "epoch": 0.13421674609632298, "grad_norm": 0.28884100956368297, "learning_rate": 3.847099466460733e-05, "loss": 0.7756, "step": 433 }, { "epoch": 0.13452671548684567, "grad_norm": 0.3010689011064647, "learning_rate": 3.846343963216322e-05, "loss": 0.8045, "step": 434 }, { "epoch": 0.13483668487736836, "grad_norm": 0.30118556293059057, "learning_rate": 3.84558667260183e-05, "loss": 0.7759, "step": 435 }, { "epoch": 0.13514665426789105, "grad_norm": 0.3183420418408163, "learning_rate": 3.8448275953503594e-05, "loss": 0.8457, "step": 436 }, { "epoch": 0.13545662365841374, "grad_norm": 0.2913873855540769, "learning_rate": 3.84406673219674e-05, "loss": 0.7983, "step": 437 }, { "epoch": 0.13576659304893643, "grad_norm": 0.32459077570953493, "learning_rate": 3.843304083877534e-05, "loss": 0.8089, "step": 438 }, { "epoch": 0.1360765624394591, "grad_norm": 0.29297299784816105, "learning_rate": 3.842539651131029e-05, "loss": 0.8014, "step": 439 }, { "epoch": 0.13638653182998178, "grad_norm": 0.3031056949473124, "learning_rate": 3.841773434697242e-05, "loss": 0.7994, "step": 440 }, { "epoch": 0.13669650122050447, "grad_norm": 0.46624347152724605, "learning_rate": 3.841005435317916e-05, "loss": 0.7428, "step": 441 }, { "epoch": 0.13700647061102716, "grad_norm": 0.3649153328118437, "learning_rate": 3.840235653736517e-05, "loss": 0.7939, "step": 442 }, { "epoch": 0.13731644000154986, "grad_norm": 0.3693282341166441, "learning_rate": 3.8394640906982425e-05, "loss": 0.7979, "step": 443 }, { "epoch": 0.13762640939207255, "grad_norm": 0.69101612915966, "learning_rate": 3.8386907469500106e-05, "loss": 0.7887, "step": 444 }, { "epoch": 0.1379363787825952, "grad_norm": 0.2995076413287196, "learning_rate": 3.837915623240462e-05, "loss": 0.7558, "step": 445 }, { "epoch": 0.1382463481731179, "grad_norm": 0.7451815109745559, "learning_rate": 3.837138720319963e-05, "loss": 0.7802, "step": 446 }, { "epoch": 0.1385563175636406, "grad_norm": 0.2885988057964658, "learning_rate": 3.836360038940602e-05, "loss": 0.7639, "step": 447 }, { "epoch": 0.13886628695416328, "grad_norm": 0.2784859311232094, "learning_rate": 3.835579579856188e-05, "loss": 0.83, "step": 448 }, { "epoch": 0.13917625634468597, "grad_norm": 0.2702845411491866, "learning_rate": 3.834797343822253e-05, "loss": 0.7964, "step": 449 }, { "epoch": 0.13948622573520864, "grad_norm": 0.30321540802593855, "learning_rate": 3.834013331596046e-05, "loss": 0.7763, "step": 450 }, { "epoch": 0.13979619512573133, "grad_norm": 0.2997047860509391, "learning_rate": 3.833227543936537e-05, "loss": 0.7923, "step": 451 }, { "epoch": 0.14010616451625402, "grad_norm": 0.3091424096192159, "learning_rate": 3.8324399816044165e-05, "loss": 0.7737, "step": 452 }, { "epoch": 0.1404161339067767, "grad_norm": 0.3241574833092194, "learning_rate": 3.831650645362091e-05, "loss": 0.7949, "step": 453 }, { "epoch": 0.1407261032972994, "grad_norm": 0.3105026136009369, "learning_rate": 3.830859535973683e-05, "loss": 0.7564, "step": 454 }, { "epoch": 0.1410360726878221, "grad_norm": 0.30307459380188023, "learning_rate": 3.830066654205035e-05, "loss": 0.8163, "step": 455 }, { "epoch": 0.14134604207834475, "grad_norm": 0.33300754291026124, "learning_rate": 3.8292720008237024e-05, "loss": 0.7884, "step": 456 }, { "epoch": 0.14165601146886744, "grad_norm": 0.3350911615878647, "learning_rate": 3.8284755765989573e-05, "loss": 0.7703, "step": 457 }, { "epoch": 0.14196598085939013, "grad_norm": 0.3095084004251958, "learning_rate": 3.8276773823017854e-05, "loss": 0.7883, "step": 458 }, { "epoch": 0.14227595024991282, "grad_norm": 0.3224989362979955, "learning_rate": 3.826877418704885e-05, "loss": 0.7529, "step": 459 }, { "epoch": 0.14258591964043552, "grad_norm": 0.35160133488079365, "learning_rate": 3.8260756865826695e-05, "loss": 0.7653, "step": 460 }, { "epoch": 0.1428958890309582, "grad_norm": 0.3238561562316431, "learning_rate": 3.825272186711262e-05, "loss": 0.7925, "step": 461 }, { "epoch": 0.14320585842148087, "grad_norm": 0.31932451435747744, "learning_rate": 3.824466919868498e-05, "loss": 0.7852, "step": 462 }, { "epoch": 0.14351582781200356, "grad_norm": 0.31424996961635265, "learning_rate": 3.8236598868339235e-05, "loss": 0.7733, "step": 463 }, { "epoch": 0.14382579720252625, "grad_norm": 0.2586902101312506, "learning_rate": 3.822851088388794e-05, "loss": 0.8002, "step": 464 }, { "epoch": 0.14413576659304894, "grad_norm": 0.3370900329218615, "learning_rate": 3.822040525316075e-05, "loss": 0.8056, "step": 465 }, { "epoch": 0.14444573598357163, "grad_norm": 0.2898526506562313, "learning_rate": 3.821228198400439e-05, "loss": 0.7772, "step": 466 }, { "epoch": 0.1447557053740943, "grad_norm": 0.3116972872482152, "learning_rate": 3.820414108428266e-05, "loss": 0.7671, "step": 467 }, { "epoch": 0.145065674764617, "grad_norm": 0.4360554453490022, "learning_rate": 3.819598256187643e-05, "loss": 0.8558, "step": 468 }, { "epoch": 0.14537564415513968, "grad_norm": 0.29824271958550763, "learning_rate": 3.818780642468364e-05, "loss": 0.7865, "step": 469 }, { "epoch": 0.14568561354566237, "grad_norm": 0.29102171199767357, "learning_rate": 3.817961268061928e-05, "loss": 0.8036, "step": 470 }, { "epoch": 0.14599558293618506, "grad_norm": 0.29981291587283543, "learning_rate": 3.817140133761537e-05, "loss": 0.7938, "step": 471 }, { "epoch": 0.14630555232670775, "grad_norm": 0.2877785273036168, "learning_rate": 3.816317240362097e-05, "loss": 0.8127, "step": 472 }, { "epoch": 0.1466155217172304, "grad_norm": 0.32036695685478644, "learning_rate": 3.815492588660217e-05, "loss": 0.7901, "step": 473 }, { "epoch": 0.1469254911077531, "grad_norm": 0.2820471250413034, "learning_rate": 3.814666179454211e-05, "loss": 0.8172, "step": 474 }, { "epoch": 0.1472354604982758, "grad_norm": 0.31440512913729357, "learning_rate": 3.81383801354409e-05, "loss": 0.7714, "step": 475 }, { "epoch": 0.14754542988879848, "grad_norm": 0.3378434043642509, "learning_rate": 3.813008091731568e-05, "loss": 0.7838, "step": 476 }, { "epoch": 0.14785539927932118, "grad_norm": 0.27257324272054684, "learning_rate": 3.8121764148200584e-05, "loss": 0.7603, "step": 477 }, { "epoch": 0.14816536866984387, "grad_norm": 0.3000941898185358, "learning_rate": 3.811342983614674e-05, "loss": 0.7881, "step": 478 }, { "epoch": 0.14847533806036653, "grad_norm": 0.30286162493784125, "learning_rate": 3.8105077989222246e-05, "loss": 0.7709, "step": 479 }, { "epoch": 0.14878530745088922, "grad_norm": 0.30059963626746505, "learning_rate": 3.809670861551219e-05, "loss": 0.7786, "step": 480 }, { "epoch": 0.1490952768414119, "grad_norm": 1.0270114554964265, "learning_rate": 3.8088321723118624e-05, "loss": 0.7974, "step": 481 }, { "epoch": 0.1494052462319346, "grad_norm": 0.9325643277295712, "learning_rate": 3.8079917320160545e-05, "loss": 0.7644, "step": 482 }, { "epoch": 0.1497152156224573, "grad_norm": 0.6211270392253934, "learning_rate": 3.807149541477392e-05, "loss": 0.8024, "step": 483 }, { "epoch": 0.15002518501297996, "grad_norm": 0.6866400058377645, "learning_rate": 3.806305601511165e-05, "loss": 0.8042, "step": 484 }, { "epoch": 0.15033515440350265, "grad_norm": 0.6794556727448318, "learning_rate": 3.805459912934356e-05, "loss": 0.824, "step": 485 }, { "epoch": 0.15064512379402534, "grad_norm": 0.5421390534649524, "learning_rate": 3.804612476565644e-05, "loss": 0.8009, "step": 486 }, { "epoch": 0.15095509318454803, "grad_norm": 0.37620211069164994, "learning_rate": 3.803763293225395e-05, "loss": 0.7529, "step": 487 }, { "epoch": 0.15126506257507072, "grad_norm": 0.4803748809702832, "learning_rate": 3.802912363735671e-05, "loss": 0.827, "step": 488 }, { "epoch": 0.1515750319655934, "grad_norm": 0.6347852114724724, "learning_rate": 3.80205968892022e-05, "loss": 0.7754, "step": 489 }, { "epoch": 0.15188500135611607, "grad_norm": 0.47710691561708773, "learning_rate": 3.801205269604483e-05, "loss": 0.7525, "step": 490 }, { "epoch": 0.15219497074663876, "grad_norm": 0.38775762600894964, "learning_rate": 3.8003491066155874e-05, "loss": 0.8469, "step": 491 }, { "epoch": 0.15250494013716145, "grad_norm": 0.39685612640825685, "learning_rate": 3.79949120078235e-05, "loss": 0.7528, "step": 492 }, { "epoch": 0.15281490952768415, "grad_norm": 1.0382639383673347, "learning_rate": 3.7986315529352746e-05, "loss": 0.7836, "step": 493 }, { "epoch": 0.15312487891820684, "grad_norm": 0.4293065295131116, "learning_rate": 3.7977701639065505e-05, "loss": 0.7967, "step": 494 }, { "epoch": 0.15343484830872953, "grad_norm": 0.42195488295062644, "learning_rate": 3.7969070345300536e-05, "loss": 0.8016, "step": 495 }, { "epoch": 0.1537448176992522, "grad_norm": 0.4860012967066258, "learning_rate": 3.796042165641345e-05, "loss": 0.7938, "step": 496 }, { "epoch": 0.15405478708977488, "grad_norm": 0.3819822421011022, "learning_rate": 3.795175558077667e-05, "loss": 0.7776, "step": 497 }, { "epoch": 0.15436475648029757, "grad_norm": 0.4434464951976022, "learning_rate": 3.794307212677949e-05, "loss": 0.8014, "step": 498 }, { "epoch": 0.15467472587082026, "grad_norm": 0.3851231674367315, "learning_rate": 3.793437130282799e-05, "loss": 0.8027, "step": 499 }, { "epoch": 0.15498469526134295, "grad_norm": 0.9141666451259747, "learning_rate": 3.7925653117345085e-05, "loss": 0.8282, "step": 500 }, { "epoch": 0.15529466465186562, "grad_norm": 0.31614693252836823, "learning_rate": 3.791691757877051e-05, "loss": 0.7886, "step": 501 }, { "epoch": 0.1556046340423883, "grad_norm": 0.33830123679597085, "learning_rate": 3.7908164695560755e-05, "loss": 0.7464, "step": 502 }, { "epoch": 0.155914603432911, "grad_norm": 0.35601017133283097, "learning_rate": 3.7899394476189145e-05, "loss": 0.7946, "step": 503 }, { "epoch": 0.1562245728234337, "grad_norm": 0.3219001430666695, "learning_rate": 3.7890606929145774e-05, "loss": 0.7827, "step": 504 }, { "epoch": 0.15653454221395638, "grad_norm": 0.4686742090328973, "learning_rate": 3.78818020629375e-05, "loss": 0.8099, "step": 505 }, { "epoch": 0.15684451160447907, "grad_norm": 1.1215918030014407, "learning_rate": 3.787297988608796e-05, "loss": 0.8055, "step": 506 }, { "epoch": 0.15715448099500173, "grad_norm": 0.32655975764563433, "learning_rate": 3.786414040713753e-05, "loss": 0.8024, "step": 507 }, { "epoch": 0.15746445038552442, "grad_norm": 0.30773559484780133, "learning_rate": 3.785528363464336e-05, "loss": 0.8252, "step": 508 }, { "epoch": 0.15777441977604711, "grad_norm": 0.2873577328889441, "learning_rate": 3.7846409577179325e-05, "loss": 0.7852, "step": 509 }, { "epoch": 0.1580843891665698, "grad_norm": 0.37976711022513365, "learning_rate": 3.783751824333604e-05, "loss": 0.7898, "step": 510 }, { "epoch": 0.1583943585570925, "grad_norm": 0.27668989654056003, "learning_rate": 3.782860964172083e-05, "loss": 0.8052, "step": 511 }, { "epoch": 0.15870432794761516, "grad_norm": 0.31383868470751597, "learning_rate": 3.781968378095777e-05, "loss": 0.7655, "step": 512 }, { "epoch": 0.15901429733813785, "grad_norm": 0.30318591343320483, "learning_rate": 3.78107406696876e-05, "loss": 0.8131, "step": 513 }, { "epoch": 0.15932426672866054, "grad_norm": 0.294054596832995, "learning_rate": 3.780178031656779e-05, "loss": 0.7666, "step": 514 }, { "epoch": 0.15963423611918323, "grad_norm": 0.3229765839784485, "learning_rate": 3.779280273027249e-05, "loss": 0.7946, "step": 515 }, { "epoch": 0.15994420550970592, "grad_norm": 0.2979879277039262, "learning_rate": 3.778380791949253e-05, "loss": 0.7926, "step": 516 }, { "epoch": 0.1602541749002286, "grad_norm": 0.2701957051394202, "learning_rate": 3.777479589293543e-05, "loss": 0.7421, "step": 517 }, { "epoch": 0.16056414429075128, "grad_norm": 0.30446800125587115, "learning_rate": 3.7765766659325354e-05, "loss": 0.7935, "step": 518 }, { "epoch": 0.16087411368127397, "grad_norm": 0.24121354620462285, "learning_rate": 3.775672022740313e-05, "loss": 0.7582, "step": 519 }, { "epoch": 0.16118408307179666, "grad_norm": 0.26331496279983563, "learning_rate": 3.774765660592625e-05, "loss": 0.8026, "step": 520 }, { "epoch": 0.16149405246231935, "grad_norm": 0.24718742294651952, "learning_rate": 3.7738575803668834e-05, "loss": 0.7879, "step": 521 }, { "epoch": 0.16180402185284204, "grad_norm": 0.29276238592042, "learning_rate": 3.7729477829421623e-05, "loss": 0.7673, "step": 522 }, { "epoch": 0.16211399124336473, "grad_norm": 0.28255198340705706, "learning_rate": 3.772036269199201e-05, "loss": 0.7804, "step": 523 }, { "epoch": 0.1624239606338874, "grad_norm": 0.25130205269502387, "learning_rate": 3.771123040020397e-05, "loss": 0.7713, "step": 524 }, { "epoch": 0.16273393002441008, "grad_norm": 0.26468287168805077, "learning_rate": 3.770208096289812e-05, "loss": 0.7748, "step": 525 }, { "epoch": 0.16304389941493277, "grad_norm": 0.29232055263612305, "learning_rate": 3.769291438893164e-05, "loss": 0.7697, "step": 526 }, { "epoch": 0.16335386880545547, "grad_norm": 0.28303676671660577, "learning_rate": 3.768373068717833e-05, "loss": 0.7659, "step": 527 }, { "epoch": 0.16366383819597816, "grad_norm": 0.25280761743266217, "learning_rate": 3.767452986652854e-05, "loss": 0.7661, "step": 528 }, { "epoch": 0.16397380758650082, "grad_norm": 0.28335258003898417, "learning_rate": 3.7665311935889214e-05, "loss": 0.7858, "step": 529 }, { "epoch": 0.1642837769770235, "grad_norm": 0.26183345010579656, "learning_rate": 3.765607690418387e-05, "loss": 0.8005, "step": 530 }, { "epoch": 0.1645937463675462, "grad_norm": 0.27447811839436476, "learning_rate": 3.7646824780352534e-05, "loss": 0.7791, "step": 531 }, { "epoch": 0.1649037157580689, "grad_norm": 0.24387751891473242, "learning_rate": 3.7637555573351835e-05, "loss": 0.7822, "step": 532 }, { "epoch": 0.16521368514859158, "grad_norm": 0.2796978074547317, "learning_rate": 3.76282692921549e-05, "loss": 0.7606, "step": 533 }, { "epoch": 0.16552365453911427, "grad_norm": 0.2823735350644033, "learning_rate": 3.7618965945751396e-05, "loss": 0.8067, "step": 534 }, { "epoch": 0.16583362392963694, "grad_norm": 0.24272713893935313, "learning_rate": 3.760964554314753e-05, "loss": 0.8137, "step": 535 }, { "epoch": 0.16614359332015963, "grad_norm": 0.3308768924459356, "learning_rate": 3.760030809336598e-05, "loss": 0.8192, "step": 536 }, { "epoch": 0.16645356271068232, "grad_norm": 0.2945463524094961, "learning_rate": 3.7590953605445955e-05, "loss": 0.762, "step": 537 }, { "epoch": 0.166763532101205, "grad_norm": 0.2664505615814581, "learning_rate": 3.7581582088443166e-05, "loss": 0.7714, "step": 538 }, { "epoch": 0.1670735014917277, "grad_norm": 0.27360343856172586, "learning_rate": 3.7572193551429785e-05, "loss": 0.7907, "step": 539 }, { "epoch": 0.1673834708822504, "grad_norm": 0.2488288555385127, "learning_rate": 3.756278800349447e-05, "loss": 0.7895, "step": 540 }, { "epoch": 0.16769344027277305, "grad_norm": 0.25419420209807086, "learning_rate": 3.7553365453742346e-05, "loss": 0.7928, "step": 541 }, { "epoch": 0.16800340966329574, "grad_norm": 0.3019821796426921, "learning_rate": 3.7543925911295006e-05, "loss": 0.7801, "step": 542 }, { "epoch": 0.16831337905381843, "grad_norm": 0.2949515298749417, "learning_rate": 3.753446938529048e-05, "loss": 0.7668, "step": 543 }, { "epoch": 0.16862334844434113, "grad_norm": 0.2679777266326525, "learning_rate": 3.7524995884883235e-05, "loss": 0.7482, "step": 544 }, { "epoch": 0.16893331783486382, "grad_norm": 0.3009443588945842, "learning_rate": 3.75155054192442e-05, "loss": 0.7908, "step": 545 }, { "epoch": 0.16924328722538648, "grad_norm": 0.2747678791825462, "learning_rate": 3.750599799756069e-05, "loss": 0.7862, "step": 546 }, { "epoch": 0.16955325661590917, "grad_norm": 0.293339788793943, "learning_rate": 3.749647362903645e-05, "loss": 0.7714, "step": 547 }, { "epoch": 0.16986322600643186, "grad_norm": 0.3102187247850659, "learning_rate": 3.7486932322891646e-05, "loss": 0.7813, "step": 548 }, { "epoch": 0.17017319539695455, "grad_norm": 0.2815168758918942, "learning_rate": 3.747737408836281e-05, "loss": 0.7492, "step": 549 }, { "epoch": 0.17048316478747724, "grad_norm": 0.28082359421744385, "learning_rate": 3.746779893470289e-05, "loss": 0.7785, "step": 550 }, { "epoch": 0.17079313417799993, "grad_norm": 0.34901162927527285, "learning_rate": 3.74582068711812e-05, "loss": 0.7531, "step": 551 }, { "epoch": 0.1711031035685226, "grad_norm": 0.34464303815942443, "learning_rate": 3.744859790708342e-05, "loss": 0.7874, "step": 552 }, { "epoch": 0.1714130729590453, "grad_norm": 0.38926412781851155, "learning_rate": 3.74389720517116e-05, "loss": 0.7928, "step": 553 }, { "epoch": 0.17172304234956798, "grad_norm": 0.2689071539194968, "learning_rate": 3.742932931438413e-05, "loss": 0.7704, "step": 554 }, { "epoch": 0.17203301174009067, "grad_norm": 0.3244733282866573, "learning_rate": 3.741966970443574e-05, "loss": 0.7778, "step": 555 }, { "epoch": 0.17234298113061336, "grad_norm": 0.6170585382968542, "learning_rate": 3.7409993231217535e-05, "loss": 0.8133, "step": 556 }, { "epoch": 0.17265295052113605, "grad_norm": 0.2434312497514771, "learning_rate": 3.7400299904096885e-05, "loss": 0.7347, "step": 557 }, { "epoch": 0.1729629199116587, "grad_norm": 0.3326075646746564, "learning_rate": 3.739058973245751e-05, "loss": 0.7849, "step": 558 }, { "epoch": 0.1732728893021814, "grad_norm": 0.3713194338442772, "learning_rate": 3.738086272569944e-05, "loss": 0.7928, "step": 559 }, { "epoch": 0.1735828586927041, "grad_norm": 0.31952814713357847, "learning_rate": 3.737111889323898e-05, "loss": 0.7733, "step": 560 }, { "epoch": 0.17389282808322679, "grad_norm": 0.31106055998886134, "learning_rate": 3.736135824450874e-05, "loss": 0.7688, "step": 561 }, { "epoch": 0.17420279747374948, "grad_norm": 0.30662490506962475, "learning_rate": 3.7351580788957606e-05, "loss": 0.7536, "step": 562 }, { "epoch": 0.17451276686427214, "grad_norm": 0.3462808280840969, "learning_rate": 3.7341786536050734e-05, "loss": 0.8159, "step": 563 }, { "epoch": 0.17482273625479483, "grad_norm": 0.67652904281416, "learning_rate": 3.733197549526954e-05, "loss": 0.8006, "step": 564 }, { "epoch": 0.17513270564531752, "grad_norm": 0.28857185407329855, "learning_rate": 3.732214767611169e-05, "loss": 0.7746, "step": 565 }, { "epoch": 0.1754426750358402, "grad_norm": 0.35837018966614187, "learning_rate": 3.7312303088091095e-05, "loss": 0.7688, "step": 566 }, { "epoch": 0.1757526444263629, "grad_norm": 0.39784508886050873, "learning_rate": 3.730244174073789e-05, "loss": 0.8039, "step": 567 }, { "epoch": 0.1760626138168856, "grad_norm": 0.31439774424209715, "learning_rate": 3.729256364359845e-05, "loss": 0.786, "step": 568 }, { "epoch": 0.17637258320740826, "grad_norm": 0.280282088298547, "learning_rate": 3.7282668806235355e-05, "loss": 0.7752, "step": 569 }, { "epoch": 0.17668255259793095, "grad_norm": 0.34713796740676356, "learning_rate": 3.7272757238227395e-05, "loss": 0.7657, "step": 570 }, { "epoch": 0.17699252198845364, "grad_norm": 0.3100544461298075, "learning_rate": 3.726282894916955e-05, "loss": 0.7535, "step": 571 }, { "epoch": 0.17730249137897633, "grad_norm": 0.2557731730458684, "learning_rate": 3.7252883948673e-05, "loss": 0.7796, "step": 572 }, { "epoch": 0.17761246076949902, "grad_norm": 0.26785043584493473, "learning_rate": 3.724292224636507e-05, "loss": 0.7605, "step": 573 }, { "epoch": 0.1779224301600217, "grad_norm": 0.568230421233692, "learning_rate": 3.7232943851889304e-05, "loss": 0.7929, "step": 574 }, { "epoch": 0.17823239955054437, "grad_norm": 0.2822028908118053, "learning_rate": 3.7222948774905365e-05, "loss": 0.7692, "step": 575 }, { "epoch": 0.17854236894106706, "grad_norm": 0.27375148623956713, "learning_rate": 3.7212937025089074e-05, "loss": 0.7652, "step": 576 }, { "epoch": 0.17885233833158976, "grad_norm": 0.2845793443647388, "learning_rate": 3.72029086121324e-05, "loss": 0.8135, "step": 577 }, { "epoch": 0.17916230772211245, "grad_norm": 0.28103552309775703, "learning_rate": 3.719286354574345e-05, "loss": 0.7737, "step": 578 }, { "epoch": 0.17947227711263514, "grad_norm": 0.27146718265347086, "learning_rate": 3.718280183564642e-05, "loss": 0.7138, "step": 579 }, { "epoch": 0.1797822465031578, "grad_norm": 0.3063488583039922, "learning_rate": 3.717272349158166e-05, "loss": 0.7557, "step": 580 }, { "epoch": 0.1800922158936805, "grad_norm": 0.26470589642749415, "learning_rate": 3.716262852330559e-05, "loss": 0.7709, "step": 581 }, { "epoch": 0.18040218528420318, "grad_norm": 0.2582936035612825, "learning_rate": 3.715251694059075e-05, "loss": 0.7617, "step": 582 }, { "epoch": 0.18071215467472587, "grad_norm": 0.2883946752742215, "learning_rate": 3.7142388753225735e-05, "loss": 0.7739, "step": 583 }, { "epoch": 0.18102212406524856, "grad_norm": 0.2993880941748182, "learning_rate": 3.713224397101524e-05, "loss": 0.7781, "step": 584 }, { "epoch": 0.18133209345577125, "grad_norm": 0.28119002043068986, "learning_rate": 3.7122082603780005e-05, "loss": 0.7796, "step": 585 }, { "epoch": 0.18164206284629392, "grad_norm": 0.2546942154480431, "learning_rate": 3.711190466135684e-05, "loss": 0.8275, "step": 586 }, { "epoch": 0.1819520322368166, "grad_norm": 0.3149133430077025, "learning_rate": 3.710171015359859e-05, "loss": 0.7193, "step": 587 }, { "epoch": 0.1822620016273393, "grad_norm": 0.29847667206354384, "learning_rate": 3.7091499090374146e-05, "loss": 0.7587, "step": 588 }, { "epoch": 0.182571971017862, "grad_norm": 0.255610940253817, "learning_rate": 3.708127148156841e-05, "loss": 0.774, "step": 589 }, { "epoch": 0.18288194040838468, "grad_norm": 0.2744185627093472, "learning_rate": 3.707102733708233e-05, "loss": 0.7652, "step": 590 }, { "epoch": 0.18319190979890737, "grad_norm": 0.26596774366164605, "learning_rate": 3.706076666683284e-05, "loss": 0.7707, "step": 591 }, { "epoch": 0.18350187918943003, "grad_norm": 0.26192164288233233, "learning_rate": 3.705048948075286e-05, "loss": 0.8168, "step": 592 }, { "epoch": 0.18381184857995272, "grad_norm": 0.26669566897530195, "learning_rate": 3.704019578879131e-05, "loss": 0.7963, "step": 593 }, { "epoch": 0.18412181797047542, "grad_norm": 0.28968129025811146, "learning_rate": 3.7029885600913127e-05, "loss": 0.7664, "step": 594 }, { "epoch": 0.1844317873609981, "grad_norm": 0.26700194450773995, "learning_rate": 3.701955892709915e-05, "loss": 0.7586, "step": 595 }, { "epoch": 0.1847417567515208, "grad_norm": 0.2652912428223736, "learning_rate": 3.700921577734622e-05, "loss": 0.7779, "step": 596 }, { "epoch": 0.18505172614204346, "grad_norm": 0.2407735918880382, "learning_rate": 3.699885616166711e-05, "loss": 0.7752, "step": 597 }, { "epoch": 0.18536169553256615, "grad_norm": 0.26607519777081207, "learning_rate": 3.698848009009056e-05, "loss": 0.7692, "step": 598 }, { "epoch": 0.18567166492308884, "grad_norm": 0.35143050318506863, "learning_rate": 3.69780875726612e-05, "loss": 0.766, "step": 599 }, { "epoch": 0.18598163431361153, "grad_norm": 0.2975917061325769, "learning_rate": 3.696767861943961e-05, "loss": 0.7884, "step": 600 }, { "epoch": 0.18629160370413422, "grad_norm": 0.24176415958719175, "learning_rate": 3.6957253240502263e-05, "loss": 0.7833, "step": 601 }, { "epoch": 0.18660157309465691, "grad_norm": 0.2565237496343686, "learning_rate": 3.6946811445941556e-05, "loss": 0.7759, "step": 602 }, { "epoch": 0.18691154248517958, "grad_norm": 0.25371936850288046, "learning_rate": 3.693635324586576e-05, "loss": 0.7409, "step": 603 }, { "epoch": 0.18722151187570227, "grad_norm": 13.702109439944163, "learning_rate": 3.6925878650399024e-05, "loss": 0.7561, "step": 604 }, { "epoch": 0.18753148126622496, "grad_norm": 0.37328410984369803, "learning_rate": 3.691538766968138e-05, "loss": 0.7538, "step": 605 }, { "epoch": 0.18784145065674765, "grad_norm": 0.350162706701504, "learning_rate": 3.6904880313868714e-05, "loss": 0.8219, "step": 606 }, { "epoch": 0.18815142004727034, "grad_norm": 0.35890966937304963, "learning_rate": 3.689435659313278e-05, "loss": 0.7993, "step": 607 }, { "epoch": 0.18846138943779303, "grad_norm": 0.31651354829376915, "learning_rate": 3.688381651766114e-05, "loss": 0.7724, "step": 608 }, { "epoch": 0.1887713588283157, "grad_norm": 0.3049886005878354, "learning_rate": 3.687326009765722e-05, "loss": 0.761, "step": 609 }, { "epoch": 0.18908132821883838, "grad_norm": 0.3124108453739762, "learning_rate": 3.686268734334027e-05, "loss": 0.7603, "step": 610 }, { "epoch": 0.18939129760936108, "grad_norm": 0.2967940944246881, "learning_rate": 3.685209826494532e-05, "loss": 0.791, "step": 611 }, { "epoch": 0.18970126699988377, "grad_norm": 0.3237788593458754, "learning_rate": 3.684149287272325e-05, "loss": 0.7315, "step": 612 }, { "epoch": 0.19001123639040646, "grad_norm": 0.2678637253273264, "learning_rate": 3.683087117694068e-05, "loss": 0.792, "step": 613 }, { "epoch": 0.19032120578092912, "grad_norm": 0.27358289558179094, "learning_rate": 3.682023318788005e-05, "loss": 0.7445, "step": 614 }, { "epoch": 0.1906311751714518, "grad_norm": 0.2690654768591916, "learning_rate": 3.680957891583957e-05, "loss": 0.7535, "step": 615 }, { "epoch": 0.1909411445619745, "grad_norm": 0.2983683479976357, "learning_rate": 3.679890837113319e-05, "loss": 0.7857, "step": 616 }, { "epoch": 0.1912511139524972, "grad_norm": 0.2686323552040142, "learning_rate": 3.678822156409064e-05, "loss": 0.7762, "step": 617 }, { "epoch": 0.19156108334301988, "grad_norm": 0.5012022385811049, "learning_rate": 3.677751850505738e-05, "loss": 0.8104, "step": 618 }, { "epoch": 0.19187105273354257, "grad_norm": 0.28748023796822053, "learning_rate": 3.676679920439459e-05, "loss": 0.7959, "step": 619 }, { "epoch": 0.19218102212406524, "grad_norm": 0.2639582314275832, "learning_rate": 3.675606367247921e-05, "loss": 0.7536, "step": 620 }, { "epoch": 0.19249099151458793, "grad_norm": 0.26114050572469677, "learning_rate": 3.6745311919703846e-05, "loss": 0.7809, "step": 621 }, { "epoch": 0.19280096090511062, "grad_norm": 0.2487046919077591, "learning_rate": 3.6734543956476844e-05, "loss": 0.7624, "step": 622 }, { "epoch": 0.1931109302956333, "grad_norm": 0.27050069600351334, "learning_rate": 3.672375979322222e-05, "loss": 0.7706, "step": 623 }, { "epoch": 0.193420899686156, "grad_norm": 0.2454279899238048, "learning_rate": 3.6712959440379694e-05, "loss": 0.7862, "step": 624 }, { "epoch": 0.1937308690766787, "grad_norm": 0.25709899573344913, "learning_rate": 3.670214290840465e-05, "loss": 0.7589, "step": 625 }, { "epoch": 0.19404083846720135, "grad_norm": 0.25343187093681524, "learning_rate": 3.6691310207768095e-05, "loss": 0.7659, "step": 626 }, { "epoch": 0.19435080785772404, "grad_norm": 0.23646580290656158, "learning_rate": 3.668046134895676e-05, "loss": 0.7601, "step": 627 }, { "epoch": 0.19466077724824674, "grad_norm": 0.31654827465648927, "learning_rate": 3.666959634247297e-05, "loss": 0.7754, "step": 628 }, { "epoch": 0.19497074663876943, "grad_norm": 0.2638664865049239, "learning_rate": 3.6658715198834685e-05, "loss": 0.7534, "step": 629 }, { "epoch": 0.19528071602929212, "grad_norm": 0.25103168873230186, "learning_rate": 3.6647817928575506e-05, "loss": 0.7611, "step": 630 }, { "epoch": 0.19559068541981478, "grad_norm": 0.28363588103107024, "learning_rate": 3.6636904542244634e-05, "loss": 0.8106, "step": 631 }, { "epoch": 0.19590065481033747, "grad_norm": 0.2771570308892168, "learning_rate": 3.6625975050406866e-05, "loss": 0.7449, "step": 632 }, { "epoch": 0.19621062420086016, "grad_norm": 0.27526081978599864, "learning_rate": 3.66150294636426e-05, "loss": 0.7859, "step": 633 }, { "epoch": 0.19652059359138285, "grad_norm": 0.2830024685155179, "learning_rate": 3.660406779254781e-05, "loss": 0.7483, "step": 634 }, { "epoch": 0.19683056298190554, "grad_norm": 0.29457702775406003, "learning_rate": 3.659309004773405e-05, "loss": 0.7522, "step": 635 }, { "epoch": 0.19714053237242823, "grad_norm": 0.30195310510042034, "learning_rate": 3.65820962398284e-05, "loss": 0.7809, "step": 636 }, { "epoch": 0.1974505017629509, "grad_norm": 0.26016437671593345, "learning_rate": 3.657108637947355e-05, "loss": 0.7632, "step": 637 }, { "epoch": 0.1977604711534736, "grad_norm": 0.28440292488645924, "learning_rate": 3.656006047732766e-05, "loss": 0.7643, "step": 638 }, { "epoch": 0.19807044054399628, "grad_norm": 0.2734093164758963, "learning_rate": 3.654901854406449e-05, "loss": 0.7994, "step": 639 }, { "epoch": 0.19838040993451897, "grad_norm": 0.25833712454934155, "learning_rate": 3.653796059037326e-05, "loss": 0.7867, "step": 640 }, { "epoch": 0.19869037932504166, "grad_norm": 0.25627397146751935, "learning_rate": 3.652688662695873e-05, "loss": 0.7815, "step": 641 }, { "epoch": 0.19900034871556435, "grad_norm": 0.3074577949769034, "learning_rate": 3.651579666454116e-05, "loss": 0.7821, "step": 642 }, { "epoch": 0.19931031810608701, "grad_norm": 0.26501937427252564, "learning_rate": 3.650469071385627e-05, "loss": 0.727, "step": 643 }, { "epoch": 0.1996202874966097, "grad_norm": 0.25878824264939415, "learning_rate": 3.64935687856553e-05, "loss": 0.7535, "step": 644 }, { "epoch": 0.1999302568871324, "grad_norm": 0.32011452514578564, "learning_rate": 3.6482430890704906e-05, "loss": 0.7747, "step": 645 }, { "epoch": 0.2002402262776551, "grad_norm": 0.27403824734776006, "learning_rate": 3.647127703978725e-05, "loss": 0.7564, "step": 646 }, { "epoch": 0.20055019566817778, "grad_norm": 0.5965303839014678, "learning_rate": 3.6460107243699916e-05, "loss": 0.7608, "step": 647 }, { "epoch": 0.20086016505870044, "grad_norm": 0.26328480640731305, "learning_rate": 3.644892151325592e-05, "loss": 0.777, "step": 648 }, { "epoch": 0.20117013444922313, "grad_norm": 0.28210637867990807, "learning_rate": 3.643771985928371e-05, "loss": 0.7829, "step": 649 }, { "epoch": 0.20148010383974582, "grad_norm": 0.2838235103056316, "learning_rate": 3.642650229262716e-05, "loss": 0.7605, "step": 650 }, { "epoch": 0.2017900732302685, "grad_norm": 0.3677230891475276, "learning_rate": 3.641526882414553e-05, "loss": 0.7574, "step": 651 }, { "epoch": 0.2021000426207912, "grad_norm": 0.24002295290679088, "learning_rate": 3.640401946471348e-05, "loss": 0.7574, "step": 652 }, { "epoch": 0.2024100120113139, "grad_norm": 0.27256836542703783, "learning_rate": 3.639275422522105e-05, "loss": 0.7638, "step": 653 }, { "epoch": 0.20271998140183656, "grad_norm": 0.28458572005415494, "learning_rate": 3.638147311657367e-05, "loss": 0.788, "step": 654 }, { "epoch": 0.20302995079235925, "grad_norm": 0.2604096745298751, "learning_rate": 3.637017614969212e-05, "loss": 0.7771, "step": 655 }, { "epoch": 0.20333992018288194, "grad_norm": 0.22357894035586898, "learning_rate": 3.6358863335512516e-05, "loss": 0.7546, "step": 656 }, { "epoch": 0.20364988957340463, "grad_norm": 0.25440842805311786, "learning_rate": 3.6347534684986346e-05, "loss": 0.7598, "step": 657 }, { "epoch": 0.20395985896392732, "grad_norm": 0.29999074420688526, "learning_rate": 3.6336190209080405e-05, "loss": 0.7501, "step": 658 }, { "epoch": 0.20426982835445, "grad_norm": 0.25285771482750025, "learning_rate": 3.6324829918776815e-05, "loss": 0.7619, "step": 659 }, { "epoch": 0.20457979774497267, "grad_norm": 0.27310430623825244, "learning_rate": 3.631345382507302e-05, "loss": 0.7875, "step": 660 }, { "epoch": 0.20488976713549537, "grad_norm": 0.2504477681646785, "learning_rate": 3.6302061938981734e-05, "loss": 0.7775, "step": 661 }, { "epoch": 0.20519973652601806, "grad_norm": 0.27673149907533257, "learning_rate": 3.6290654271530996e-05, "loss": 0.7857, "step": 662 }, { "epoch": 0.20550970591654075, "grad_norm": 0.3045167710603624, "learning_rate": 3.627923083376408e-05, "loss": 0.7779, "step": 663 }, { "epoch": 0.20581967530706344, "grad_norm": 0.3504673035729479, "learning_rate": 3.626779163673957e-05, "loss": 0.7679, "step": 664 }, { "epoch": 0.2061296446975861, "grad_norm": 0.2686522207746343, "learning_rate": 3.625633669153128e-05, "loss": 0.7624, "step": 665 }, { "epoch": 0.2064396140881088, "grad_norm": 0.2970753118714501, "learning_rate": 3.624486600922826e-05, "loss": 0.7329, "step": 666 }, { "epoch": 0.20674958347863148, "grad_norm": 0.2381172130524566, "learning_rate": 3.6233379600934816e-05, "loss": 0.7535, "step": 667 }, { "epoch": 0.20705955286915417, "grad_norm": 0.2565332164155875, "learning_rate": 3.622187747777047e-05, "loss": 0.7886, "step": 668 }, { "epoch": 0.20736952225967686, "grad_norm": 0.24569263075252032, "learning_rate": 3.6210359650869965e-05, "loss": 0.7871, "step": 669 }, { "epoch": 0.20767949165019955, "grad_norm": 0.34988868819093627, "learning_rate": 3.6198826131383234e-05, "loss": 0.7475, "step": 670 }, { "epoch": 0.20798946104072222, "grad_norm": 0.2605564386510665, "learning_rate": 3.61872769304754e-05, "loss": 0.7621, "step": 671 }, { "epoch": 0.2082994304312449, "grad_norm": 0.25258425282545044, "learning_rate": 3.617571205932678e-05, "loss": 0.7656, "step": 672 }, { "epoch": 0.2086093998217676, "grad_norm": 0.2536250178326735, "learning_rate": 3.616413152913285e-05, "loss": 0.7685, "step": 673 }, { "epoch": 0.2089193692122903, "grad_norm": 0.23645965920138468, "learning_rate": 3.6152535351104236e-05, "loss": 0.7438, "step": 674 }, { "epoch": 0.20922933860281298, "grad_norm": 0.29826849811849926, "learning_rate": 3.6140923536466746e-05, "loss": 0.7632, "step": 675 }, { "epoch": 0.20953930799333567, "grad_norm": 0.2977893471163173, "learning_rate": 3.612929609646128e-05, "loss": 0.7765, "step": 676 }, { "epoch": 0.20984927738385833, "grad_norm": 0.24282783942844702, "learning_rate": 3.61176530423439e-05, "loss": 0.7347, "step": 677 }, { "epoch": 0.21015924677438103, "grad_norm": 0.2880115645784661, "learning_rate": 3.610599438538577e-05, "loss": 0.7825, "step": 678 }, { "epoch": 0.21046921616490372, "grad_norm": 0.2723058538688928, "learning_rate": 3.6094320136873164e-05, "loss": 0.772, "step": 679 }, { "epoch": 0.2107791855554264, "grad_norm": 0.314982428165282, "learning_rate": 3.608263030810743e-05, "loss": 0.7632, "step": 680 }, { "epoch": 0.2110891549459491, "grad_norm": 0.26835444064723446, "learning_rate": 3.6070924910405025e-05, "loss": 0.7499, "step": 681 }, { "epoch": 0.21139912433647176, "grad_norm": 0.2530242562884304, "learning_rate": 3.605920395509746e-05, "loss": 0.7662, "step": 682 }, { "epoch": 0.21170909372699445, "grad_norm": 0.2772144466555683, "learning_rate": 3.604746745353131e-05, "loss": 0.7781, "step": 683 }, { "epoch": 0.21201906311751714, "grad_norm": 0.2624554460988767, "learning_rate": 3.6035715417068214e-05, "loss": 0.7723, "step": 684 }, { "epoch": 0.21232903250803983, "grad_norm": 0.25876871252398786, "learning_rate": 3.602394785708483e-05, "loss": 0.767, "step": 685 }, { "epoch": 0.21263900189856252, "grad_norm": 0.23368532188710192, "learning_rate": 3.601216478497285e-05, "loss": 0.7774, "step": 686 }, { "epoch": 0.21294897128908521, "grad_norm": 0.26203371013711885, "learning_rate": 3.6000366212138984e-05, "loss": 0.7752, "step": 687 }, { "epoch": 0.21325894067960788, "grad_norm": 0.2470513994929163, "learning_rate": 3.598855215000496e-05, "loss": 0.7835, "step": 688 }, { "epoch": 0.21356891007013057, "grad_norm": 0.22899959971381986, "learning_rate": 3.597672261000747e-05, "loss": 0.7428, "step": 689 }, { "epoch": 0.21387887946065326, "grad_norm": 0.25247042843418926, "learning_rate": 3.5964877603598215e-05, "loss": 0.7975, "step": 690 }, { "epoch": 0.21418884885117595, "grad_norm": 0.24047382050711094, "learning_rate": 3.595301714224387e-05, "loss": 0.7416, "step": 691 }, { "epoch": 0.21449881824169864, "grad_norm": 0.26655082012243403, "learning_rate": 3.5941141237426045e-05, "loss": 0.7598, "step": 692 }, { "epoch": 0.21480878763222133, "grad_norm": 0.6755114596451073, "learning_rate": 3.592924990064134e-05, "loss": 0.7498, "step": 693 }, { "epoch": 0.215118757022744, "grad_norm": 0.2687161859094378, "learning_rate": 3.591734314340125e-05, "loss": 0.7418, "step": 694 }, { "epoch": 0.21542872641326669, "grad_norm": 0.2528647161520822, "learning_rate": 3.5905420977232236e-05, "loss": 0.7782, "step": 695 }, { "epoch": 0.21573869580378938, "grad_norm": 0.2461511353059048, "learning_rate": 3.589348341367565e-05, "loss": 0.7475, "step": 696 }, { "epoch": 0.21604866519431207, "grad_norm": 0.2872303121100371, "learning_rate": 3.588153046428776e-05, "loss": 0.7657, "step": 697 }, { "epoch": 0.21635863458483476, "grad_norm": 0.23761834642085802, "learning_rate": 3.586956214063974e-05, "loss": 0.7496, "step": 698 }, { "epoch": 0.21666860397535742, "grad_norm": 0.27100198256114433, "learning_rate": 3.5857578454317614e-05, "loss": 0.7378, "step": 699 }, { "epoch": 0.2169785733658801, "grad_norm": 0.2699243461468857, "learning_rate": 3.5845579416922306e-05, "loss": 0.7436, "step": 700 }, { "epoch": 0.2172885427564028, "grad_norm": 0.2791248265212446, "learning_rate": 3.5833565040069605e-05, "loss": 0.7694, "step": 701 }, { "epoch": 0.2175985121469255, "grad_norm": 0.26598045921909214, "learning_rate": 3.582153533539013e-05, "loss": 0.7336, "step": 702 }, { "epoch": 0.21790848153744818, "grad_norm": 0.26128336528712504, "learning_rate": 3.5809490314529336e-05, "loss": 0.78, "step": 703 }, { "epoch": 0.21821845092797088, "grad_norm": 0.26197545831646485, "learning_rate": 3.5797429989147535e-05, "loss": 0.7598, "step": 704 }, { "epoch": 0.21852842031849354, "grad_norm": 0.28655788960152273, "learning_rate": 3.578535437091982e-05, "loss": 0.7648, "step": 705 }, { "epoch": 0.21883838970901623, "grad_norm": 0.2877828810157509, "learning_rate": 3.5773263471536104e-05, "loss": 0.7556, "step": 706 }, { "epoch": 0.21914835909953892, "grad_norm": 0.25588907345715617, "learning_rate": 3.5761157302701096e-05, "loss": 0.7515, "step": 707 }, { "epoch": 0.2194583284900616, "grad_norm": 0.26446262318324393, "learning_rate": 3.5749035876134284e-05, "loss": 0.7654, "step": 708 }, { "epoch": 0.2197682978805843, "grad_norm": 0.2756265189284901, "learning_rate": 3.5736899203569926e-05, "loss": 0.7942, "step": 709 }, { "epoch": 0.220078267271107, "grad_norm": 0.27039277681517443, "learning_rate": 3.572474729675704e-05, "loss": 0.7604, "step": 710 }, { "epoch": 0.22038823666162966, "grad_norm": 0.2730921308266887, "learning_rate": 3.5712580167459375e-05, "loss": 0.7712, "step": 711 }, { "epoch": 0.22069820605215235, "grad_norm": 0.24453894903370857, "learning_rate": 3.5700397827455454e-05, "loss": 0.7856, "step": 712 }, { "epoch": 0.22100817544267504, "grad_norm": 0.24326899858243498, "learning_rate": 3.568820028853848e-05, "loss": 0.7845, "step": 713 }, { "epoch": 0.22131814483319773, "grad_norm": 0.27023763334706097, "learning_rate": 3.56759875625164e-05, "loss": 0.7681, "step": 714 }, { "epoch": 0.22162811422372042, "grad_norm": 0.29502209883292524, "learning_rate": 3.566375966121188e-05, "loss": 0.7476, "step": 715 }, { "epoch": 0.22193808361424308, "grad_norm": 0.2809174494963929, "learning_rate": 3.565151659646221e-05, "loss": 0.7746, "step": 716 }, { "epoch": 0.22224805300476577, "grad_norm": 0.3297219348409969, "learning_rate": 3.563925838011942e-05, "loss": 0.7559, "step": 717 }, { "epoch": 0.22255802239528846, "grad_norm": 0.2577508423860054, "learning_rate": 3.562698502405019e-05, "loss": 0.7721, "step": 718 }, { "epoch": 0.22286799178581115, "grad_norm": 0.2605217418986504, "learning_rate": 3.561469654013584e-05, "loss": 0.7457, "step": 719 }, { "epoch": 0.22317796117633384, "grad_norm": 0.2994581331785838, "learning_rate": 3.560239294027237e-05, "loss": 0.7526, "step": 720 }, { "epoch": 0.22348793056685654, "grad_norm": 0.3022073750569211, "learning_rate": 3.559007423637037e-05, "loss": 0.7515, "step": 721 }, { "epoch": 0.2237978999573792, "grad_norm": 0.22050978634291482, "learning_rate": 3.5577740440355095e-05, "loss": 0.73, "step": 722 }, { "epoch": 0.2241078693479019, "grad_norm": 0.3920412975743932, "learning_rate": 3.5565391564166375e-05, "loss": 0.7688, "step": 723 }, { "epoch": 0.22441783873842458, "grad_norm": 0.24388708044414326, "learning_rate": 3.555302761975865e-05, "loss": 0.7558, "step": 724 }, { "epoch": 0.22472780812894727, "grad_norm": 0.2546533690383233, "learning_rate": 3.5540648619100956e-05, "loss": 0.7337, "step": 725 }, { "epoch": 0.22503777751946996, "grad_norm": 0.23068512727308793, "learning_rate": 3.55282545741769e-05, "loss": 0.7078, "step": 726 }, { "epoch": 0.22534774690999262, "grad_norm": 0.23521203645255595, "learning_rate": 3.551584549698464e-05, "loss": 0.7615, "step": 727 }, { "epoch": 0.22565771630051532, "grad_norm": 0.26293567224220066, "learning_rate": 3.5503421399536904e-05, "loss": 0.7969, "step": 728 }, { "epoch": 0.225967685691038, "grad_norm": 0.21782608277835164, "learning_rate": 3.549098229386095e-05, "loss": 0.7673, "step": 729 }, { "epoch": 0.2262776550815607, "grad_norm": 0.26732243215655677, "learning_rate": 3.547852819199856e-05, "loss": 0.773, "step": 730 }, { "epoch": 0.2265876244720834, "grad_norm": 0.2630783379665148, "learning_rate": 3.546605910600606e-05, "loss": 0.7746, "step": 731 }, { "epoch": 0.22689759386260608, "grad_norm": 0.2585885184033892, "learning_rate": 3.545357504795424e-05, "loss": 0.7783, "step": 732 }, { "epoch": 0.22720756325312874, "grad_norm": 0.28216952646281884, "learning_rate": 3.544107602992843e-05, "loss": 0.7461, "step": 733 }, { "epoch": 0.22751753264365143, "grad_norm": 0.2488640459825002, "learning_rate": 3.542856206402839e-05, "loss": 0.7493, "step": 734 }, { "epoch": 0.22782750203417412, "grad_norm": 0.29317182760484817, "learning_rate": 3.54160331623684e-05, "loss": 0.7619, "step": 735 }, { "epoch": 0.2281374714246968, "grad_norm": 0.291981693984087, "learning_rate": 3.5403489337077165e-05, "loss": 0.7502, "step": 736 }, { "epoch": 0.2284474408152195, "grad_norm": 0.2546267450278339, "learning_rate": 3.539093060029786e-05, "loss": 0.7551, "step": 737 }, { "epoch": 0.2287574102057422, "grad_norm": 0.22893520427604347, "learning_rate": 3.537835696418808e-05, "loss": 0.7779, "step": 738 }, { "epoch": 0.22906737959626486, "grad_norm": 0.25446128975594057, "learning_rate": 3.536576844091984e-05, "loss": 0.7834, "step": 739 }, { "epoch": 0.22937734898678755, "grad_norm": 0.2517486508545758, "learning_rate": 3.535316504267959e-05, "loss": 0.7547, "step": 740 }, { "epoch": 0.22968731837731024, "grad_norm": 0.23902017141158682, "learning_rate": 3.534054678166815e-05, "loss": 0.758, "step": 741 }, { "epoch": 0.22999728776783293, "grad_norm": 0.23224485813978812, "learning_rate": 3.532791367010075e-05, "loss": 0.7375, "step": 742 }, { "epoch": 0.23030725715835562, "grad_norm": 0.24986918622691023, "learning_rate": 3.531526572020698e-05, "loss": 0.7557, "step": 743 }, { "epoch": 0.23061722654887828, "grad_norm": 0.2683827065773706, "learning_rate": 3.530260294423082e-05, "loss": 0.7772, "step": 744 }, { "epoch": 0.23092719593940098, "grad_norm": 0.2534963270064156, "learning_rate": 3.528992535443058e-05, "loss": 0.7705, "step": 745 }, { "epoch": 0.23123716532992367, "grad_norm": 0.23902490892648784, "learning_rate": 3.52772329630789e-05, "loss": 0.778, "step": 746 }, { "epoch": 0.23154713472044636, "grad_norm": 0.24883719163787069, "learning_rate": 3.526452578246278e-05, "loss": 0.7797, "step": 747 }, { "epoch": 0.23185710411096905, "grad_norm": 0.24789139331234167, "learning_rate": 3.525180382488352e-05, "loss": 0.7662, "step": 748 }, { "epoch": 0.23216707350149174, "grad_norm": 0.22599417821191162, "learning_rate": 3.523906710265672e-05, "loss": 0.7683, "step": 749 }, { "epoch": 0.2324770428920144, "grad_norm": 0.2503667278437097, "learning_rate": 3.5226315628112294e-05, "loss": 0.7766, "step": 750 }, { "epoch": 0.2327870122825371, "grad_norm": 0.244672384624875, "learning_rate": 3.5213549413594416e-05, "loss": 0.766, "step": 751 }, { "epoch": 0.23309698167305978, "grad_norm": 0.2317792938893582, "learning_rate": 3.520076847146153e-05, "loss": 0.7722, "step": 752 }, { "epoch": 0.23340695106358247, "grad_norm": 0.25823782486961433, "learning_rate": 3.5187972814086346e-05, "loss": 0.7646, "step": 753 }, { "epoch": 0.23371692045410516, "grad_norm": 0.24530030395507857, "learning_rate": 3.517516245385582e-05, "loss": 0.7525, "step": 754 }, { "epoch": 0.23402688984462786, "grad_norm": 0.22706903171712198, "learning_rate": 3.516233740317114e-05, "loss": 0.7568, "step": 755 }, { "epoch": 0.23433685923515052, "grad_norm": 0.23596597482330012, "learning_rate": 3.5149497674447705e-05, "loss": 0.7569, "step": 756 }, { "epoch": 0.2346468286256732, "grad_norm": 0.2413388161419918, "learning_rate": 3.513664328011514e-05, "loss": 0.7392, "step": 757 }, { "epoch": 0.2349567980161959, "grad_norm": 0.2284671895036096, "learning_rate": 3.512377423261725e-05, "loss": 0.8021, "step": 758 }, { "epoch": 0.2352667674067186, "grad_norm": 0.22097526417184665, "learning_rate": 3.511089054441204e-05, "loss": 0.7637, "step": 759 }, { "epoch": 0.23557673679724128, "grad_norm": 0.2502375350642723, "learning_rate": 3.509799222797168e-05, "loss": 0.7827, "step": 760 }, { "epoch": 0.23588670618776394, "grad_norm": 0.2565083097299287, "learning_rate": 3.50850792957825e-05, "loss": 0.7598, "step": 761 }, { "epoch": 0.23619667557828664, "grad_norm": 0.23560053720601806, "learning_rate": 3.507215176034498e-05, "loss": 0.7293, "step": 762 }, { "epoch": 0.23650664496880933, "grad_norm": 0.23989527614580536, "learning_rate": 3.5059209634173746e-05, "loss": 0.8046, "step": 763 }, { "epoch": 0.23681661435933202, "grad_norm": 0.23177480139011722, "learning_rate": 3.504625292979754e-05, "loss": 0.7818, "step": 764 }, { "epoch": 0.2371265837498547, "grad_norm": 0.23351795716411344, "learning_rate": 3.503328165975921e-05, "loss": 0.7608, "step": 765 }, { "epoch": 0.2374365531403774, "grad_norm": 0.24026417680348194, "learning_rate": 3.502029583661572e-05, "loss": 0.7359, "step": 766 }, { "epoch": 0.23774652253090006, "grad_norm": 0.2431358597605013, "learning_rate": 3.5007295472938114e-05, "loss": 0.7531, "step": 767 }, { "epoch": 0.23805649192142275, "grad_norm": 0.23588309927370985, "learning_rate": 3.4994280581311505e-05, "loss": 0.7593, "step": 768 }, { "epoch": 0.23836646131194544, "grad_norm": 0.2386523484103554, "learning_rate": 3.498125117433508e-05, "loss": 0.7235, "step": 769 }, { "epoch": 0.23867643070246813, "grad_norm": 0.2287794699595061, "learning_rate": 3.496820726462208e-05, "loss": 0.7561, "step": 770 }, { "epoch": 0.23898640009299082, "grad_norm": 0.2345143012429222, "learning_rate": 3.495514886479978e-05, "loss": 0.7142, "step": 771 }, { "epoch": 0.23929636948351352, "grad_norm": 0.24747943623860738, "learning_rate": 3.494207598750947e-05, "loss": 0.7413, "step": 772 }, { "epoch": 0.23960633887403618, "grad_norm": 0.2371402167821787, "learning_rate": 3.492898864540648e-05, "loss": 0.7387, "step": 773 }, { "epoch": 0.23991630826455887, "grad_norm": 0.24160380801514494, "learning_rate": 3.491588685116014e-05, "loss": 0.7618, "step": 774 }, { "epoch": 0.24022627765508156, "grad_norm": 0.23067231142576958, "learning_rate": 3.4902770617453746e-05, "loss": 0.7716, "step": 775 }, { "epoch": 0.24053624704560425, "grad_norm": 0.22486987556864327, "learning_rate": 3.4889639956984585e-05, "loss": 0.7526, "step": 776 }, { "epoch": 0.24084621643612694, "grad_norm": 0.2476971845581763, "learning_rate": 3.487649488246392e-05, "loss": 0.7586, "step": 777 }, { "epoch": 0.2411561858266496, "grad_norm": 0.24545354909251815, "learning_rate": 3.4863335406616965e-05, "loss": 0.7494, "step": 778 }, { "epoch": 0.2414661552171723, "grad_norm": 0.23007279421773968, "learning_rate": 3.485016154218286e-05, "loss": 0.7809, "step": 779 }, { "epoch": 0.241776124607695, "grad_norm": 0.22551099556975807, "learning_rate": 3.483697330191469e-05, "loss": 0.744, "step": 780 }, { "epoch": 0.24208609399821768, "grad_norm": 0.2674026862992863, "learning_rate": 3.4823770698579454e-05, "loss": 0.7908, "step": 781 }, { "epoch": 0.24239606338874037, "grad_norm": 0.26842664835956515, "learning_rate": 3.4810553744958037e-05, "loss": 0.7745, "step": 782 }, { "epoch": 0.24270603277926306, "grad_norm": 0.21878804394724163, "learning_rate": 3.479732245384525e-05, "loss": 0.7704, "step": 783 }, { "epoch": 0.24301600216978572, "grad_norm": 0.2042055809091645, "learning_rate": 3.4784076838049755e-05, "loss": 0.7487, "step": 784 }, { "epoch": 0.2433259715603084, "grad_norm": 0.24349189658299514, "learning_rate": 3.4770816910394095e-05, "loss": 0.7868, "step": 785 }, { "epoch": 0.2436359409508311, "grad_norm": 0.23100875705389182, "learning_rate": 3.4757542683714657e-05, "loss": 0.787, "step": 786 }, { "epoch": 0.2439459103413538, "grad_norm": 0.233129854655048, "learning_rate": 3.474425417086168e-05, "loss": 0.7724, "step": 787 }, { "epoch": 0.24425587973187649, "grad_norm": 0.5758799361528429, "learning_rate": 3.473095138469923e-05, "loss": 0.7491, "step": 788 }, { "epoch": 0.24456584912239918, "grad_norm": 0.24343741535091776, "learning_rate": 3.47176343381052e-05, "loss": 0.7694, "step": 789 }, { "epoch": 0.24487581851292184, "grad_norm": 0.24512973639510297, "learning_rate": 3.470430304397127e-05, "loss": 0.7743, "step": 790 }, { "epoch": 0.24518578790344453, "grad_norm": 0.23964197380259153, "learning_rate": 3.469095751520292e-05, "loss": 0.7424, "step": 791 }, { "epoch": 0.24549575729396722, "grad_norm": 0.21629547226119125, "learning_rate": 3.467759776471941e-05, "loss": 0.7347, "step": 792 }, { "epoch": 0.2458057266844899, "grad_norm": 0.3204766442599704, "learning_rate": 3.466422380545376e-05, "loss": 0.7454, "step": 793 }, { "epoch": 0.2461156960750126, "grad_norm": 0.2425703773716094, "learning_rate": 3.465083565035278e-05, "loss": 0.7794, "step": 794 }, { "epoch": 0.24642566546553527, "grad_norm": 0.23684076107682514, "learning_rate": 3.463743331237697e-05, "loss": 0.7211, "step": 795 }, { "epoch": 0.24673563485605796, "grad_norm": 0.2790069496103027, "learning_rate": 3.462401680450059e-05, "loss": 0.7834, "step": 796 }, { "epoch": 0.24704560424658065, "grad_norm": 0.4144088315949293, "learning_rate": 3.461058613971161e-05, "loss": 0.8008, "step": 797 }, { "epoch": 0.24735557363710334, "grad_norm": 0.24663626508375722, "learning_rate": 3.459714133101172e-05, "loss": 0.7402, "step": 798 }, { "epoch": 0.24766554302762603, "grad_norm": 0.2652452761431135, "learning_rate": 3.4583682391416286e-05, "loss": 0.7328, "step": 799 }, { "epoch": 0.24797551241814872, "grad_norm": 0.2631466272414165, "learning_rate": 3.457020933395435e-05, "loss": 0.7715, "step": 800 }, { "epoch": 0.24828548180867138, "grad_norm": 0.24842029943599325, "learning_rate": 3.455672217166862e-05, "loss": 0.7511, "step": 801 }, { "epoch": 0.24859545119919407, "grad_norm": 0.2559125702869346, "learning_rate": 3.45432209176155e-05, "loss": 0.7499, "step": 802 }, { "epoch": 0.24890542058971676, "grad_norm": 0.25471127468657745, "learning_rate": 3.4529705584864964e-05, "loss": 0.7628, "step": 803 }, { "epoch": 0.24921538998023945, "grad_norm": 0.25094245154212613, "learning_rate": 3.451617618650067e-05, "loss": 0.7719, "step": 804 }, { "epoch": 0.24952535937076215, "grad_norm": 0.2919665543211642, "learning_rate": 3.450263273561987e-05, "loss": 0.77, "step": 805 }, { "epoch": 0.24983532876128484, "grad_norm": 0.23870006255901793, "learning_rate": 3.448907524533344e-05, "loss": 0.7337, "step": 806 }, { "epoch": 0.2501452981518075, "grad_norm": 0.24297818941524366, "learning_rate": 3.447550372876581e-05, "loss": 0.7635, "step": 807 }, { "epoch": 0.2504552675423302, "grad_norm": 0.21441480217811798, "learning_rate": 3.4461918199055025e-05, "loss": 0.752, "step": 808 }, { "epoch": 0.2507652369328529, "grad_norm": 0.28367786532944467, "learning_rate": 3.444831866935268e-05, "loss": 0.7719, "step": 809 }, { "epoch": 0.25107520632337554, "grad_norm": 0.22613301746308723, "learning_rate": 3.443470515282391e-05, "loss": 0.7397, "step": 810 }, { "epoch": 0.25138517571389823, "grad_norm": 0.2523253263517585, "learning_rate": 3.442107766264742e-05, "loss": 0.7893, "step": 811 }, { "epoch": 0.2516951451044209, "grad_norm": 0.23970380407041414, "learning_rate": 3.440743621201541e-05, "loss": 0.7641, "step": 812 }, { "epoch": 0.2520051144949436, "grad_norm": 0.5440887867297499, "learning_rate": 3.439378081413363e-05, "loss": 0.7604, "step": 813 }, { "epoch": 0.2523150838854663, "grad_norm": 0.23544300035481452, "learning_rate": 3.43801114822213e-05, "loss": 0.7589, "step": 814 }, { "epoch": 0.252625053275989, "grad_norm": 0.23193959430161015, "learning_rate": 3.4366428229511146e-05, "loss": 0.7754, "step": 815 }, { "epoch": 0.2529350226665117, "grad_norm": 0.28618561484341165, "learning_rate": 3.435273106924936e-05, "loss": 0.7753, "step": 816 }, { "epoch": 0.2532449920570344, "grad_norm": 0.37405444473794863, "learning_rate": 3.433902001469561e-05, "loss": 0.7733, "step": 817 }, { "epoch": 0.25355496144755707, "grad_norm": 0.23354986714273307, "learning_rate": 3.432529507912299e-05, "loss": 0.759, "step": 818 }, { "epoch": 0.25386493083807976, "grad_norm": 0.23565297061421792, "learning_rate": 3.431155627581807e-05, "loss": 0.777, "step": 819 }, { "epoch": 0.25417490022860245, "grad_norm": 0.4088827257843897, "learning_rate": 3.4297803618080806e-05, "loss": 0.7476, "step": 820 }, { "epoch": 0.2544848696191251, "grad_norm": 0.26923969780920487, "learning_rate": 3.4284037119224604e-05, "loss": 0.7404, "step": 821 }, { "epoch": 0.2547948390096478, "grad_norm": 0.2666646436772437, "learning_rate": 3.4270256792576226e-05, "loss": 0.765, "step": 822 }, { "epoch": 0.25510480840017047, "grad_norm": 0.2680260696891564, "learning_rate": 3.4256462651475865e-05, "loss": 0.7464, "step": 823 }, { "epoch": 0.25541477779069316, "grad_norm": 0.2627656040619322, "learning_rate": 3.424265470927704e-05, "loss": 0.7142, "step": 824 }, { "epoch": 0.25572474718121585, "grad_norm": 0.2559384795661323, "learning_rate": 3.422883297934668e-05, "loss": 0.7408, "step": 825 }, { "epoch": 0.25603471657173854, "grad_norm": 0.25388874742176587, "learning_rate": 3.4214997475065024e-05, "loss": 0.7822, "step": 826 }, { "epoch": 0.25634468596226123, "grad_norm": 0.23540945995415943, "learning_rate": 3.4201148209825645e-05, "loss": 0.7846, "step": 827 }, { "epoch": 0.2566546553527839, "grad_norm": 0.26342776284927943, "learning_rate": 3.418728519703548e-05, "loss": 0.7223, "step": 828 }, { "epoch": 0.2569646247433066, "grad_norm": 0.2599428737373883, "learning_rate": 3.417340845011473e-05, "loss": 0.7434, "step": 829 }, { "epoch": 0.2572745941338293, "grad_norm": 0.22810269650616527, "learning_rate": 3.415951798249689e-05, "loss": 0.7676, "step": 830 }, { "epoch": 0.257584563524352, "grad_norm": 0.23954974060627526, "learning_rate": 3.414561380762878e-05, "loss": 0.7641, "step": 831 }, { "epoch": 0.25789453291487463, "grad_norm": 0.24410853128416823, "learning_rate": 3.413169593897045e-05, "loss": 0.7424, "step": 832 }, { "epoch": 0.2582045023053973, "grad_norm": 0.23596569200349804, "learning_rate": 3.4117764389995215e-05, "loss": 0.7269, "step": 833 }, { "epoch": 0.25851447169592, "grad_norm": 0.2225917434812326, "learning_rate": 3.410381917418965e-05, "loss": 0.7259, "step": 834 }, { "epoch": 0.2588244410864427, "grad_norm": 0.24902787705997853, "learning_rate": 3.408986030505355e-05, "loss": 0.7787, "step": 835 }, { "epoch": 0.2591344104769654, "grad_norm": 0.36779275766373765, "learning_rate": 3.40758877960999e-05, "loss": 0.7267, "step": 836 }, { "epoch": 0.2594443798674881, "grad_norm": 0.231071530281904, "learning_rate": 3.406190166085494e-05, "loss": 0.753, "step": 837 }, { "epoch": 0.2597543492580108, "grad_norm": 0.23560389891950548, "learning_rate": 3.4047901912858065e-05, "loss": 0.7305, "step": 838 }, { "epoch": 0.26006431864853347, "grad_norm": 0.24984960421745725, "learning_rate": 3.403388856566187e-05, "loss": 0.7695, "step": 839 }, { "epoch": 0.26037428803905616, "grad_norm": 0.230192642367729, "learning_rate": 3.401986163283209e-05, "loss": 0.7517, "step": 840 }, { "epoch": 0.26068425742957885, "grad_norm": 0.23295160420935745, "learning_rate": 3.400582112794762e-05, "loss": 0.7607, "step": 841 }, { "epoch": 0.26099422682010154, "grad_norm": 0.26133309614042183, "learning_rate": 3.399176706460051e-05, "loss": 0.764, "step": 842 }, { "epoch": 0.2613041962106242, "grad_norm": 0.41435897033744723, "learning_rate": 3.397769945639594e-05, "loss": 0.7516, "step": 843 }, { "epoch": 0.26161416560114686, "grad_norm": 0.23330209091884166, "learning_rate": 3.396361831695215e-05, "loss": 0.7873, "step": 844 }, { "epoch": 0.26192413499166955, "grad_norm": 0.24558516524428756, "learning_rate": 3.3949523659900534e-05, "loss": 0.7267, "step": 845 }, { "epoch": 0.26223410438219225, "grad_norm": 0.234724763850448, "learning_rate": 3.3935415498885555e-05, "loss": 0.7512, "step": 846 }, { "epoch": 0.26254407377271494, "grad_norm": 0.2535411884325549, "learning_rate": 3.3921293847564746e-05, "loss": 0.7576, "step": 847 }, { "epoch": 0.2628540431632376, "grad_norm": 0.21968618381874006, "learning_rate": 3.3907158719608704e-05, "loss": 0.7501, "step": 848 }, { "epoch": 0.2631640125537603, "grad_norm": 0.22933918180630125, "learning_rate": 3.3893010128701055e-05, "loss": 0.7575, "step": 849 }, { "epoch": 0.263473981944283, "grad_norm": 0.25261595810193094, "learning_rate": 3.387884808853848e-05, "loss": 0.7677, "step": 850 }, { "epoch": 0.2637839513348057, "grad_norm": 0.2582385199491574, "learning_rate": 3.3864672612830666e-05, "loss": 0.7321, "step": 851 }, { "epoch": 0.2640939207253284, "grad_norm": 0.2646975063840269, "learning_rate": 3.385048371530032e-05, "loss": 0.7541, "step": 852 }, { "epoch": 0.2644038901158511, "grad_norm": 0.27325303600579187, "learning_rate": 3.3836281409683126e-05, "loss": 0.7658, "step": 853 }, { "epoch": 0.26471385950637377, "grad_norm": 0.2637567495536503, "learning_rate": 3.382206570972776e-05, "loss": 0.7228, "step": 854 }, { "epoch": 0.2650238288968964, "grad_norm": 0.24902765508068914, "learning_rate": 3.380783662919586e-05, "loss": 0.7621, "step": 855 }, { "epoch": 0.2653337982874191, "grad_norm": 0.22956223493246322, "learning_rate": 3.3793594181862e-05, "loss": 0.7738, "step": 856 }, { "epoch": 0.2656437676779418, "grad_norm": 0.24338897647292845, "learning_rate": 3.377933838151374e-05, "loss": 0.755, "step": 857 }, { "epoch": 0.2659537370684645, "grad_norm": 0.23132534900282462, "learning_rate": 3.376506924195151e-05, "loss": 0.7474, "step": 858 }, { "epoch": 0.26626370645898717, "grad_norm": 0.22033756863238757, "learning_rate": 3.37507867769887e-05, "loss": 0.7523, "step": 859 }, { "epoch": 0.26657367584950986, "grad_norm": 0.22841370529091318, "learning_rate": 3.373649100045157e-05, "loss": 0.7514, "step": 860 }, { "epoch": 0.26688364524003255, "grad_norm": 0.35280224440303787, "learning_rate": 3.372218192617928e-05, "loss": 0.7422, "step": 861 }, { "epoch": 0.26719361463055524, "grad_norm": 0.23228896137795332, "learning_rate": 3.370785956802386e-05, "loss": 0.7644, "step": 862 }, { "epoch": 0.26750358402107793, "grad_norm": 0.23813693812245942, "learning_rate": 3.36935239398502e-05, "loss": 0.7586, "step": 863 }, { "epoch": 0.2678135534116006, "grad_norm": 0.23803709023675967, "learning_rate": 3.367917505553603e-05, "loss": 0.7503, "step": 864 }, { "epoch": 0.2681235228021233, "grad_norm": 0.27371606886529326, "learning_rate": 3.3664812928971936e-05, "loss": 0.756, "step": 865 }, { "epoch": 0.26843349219264595, "grad_norm": 0.24084321729305508, "learning_rate": 3.365043757406127e-05, "loss": 0.7703, "step": 866 }, { "epoch": 0.26874346158316864, "grad_norm": 0.21636299862129504, "learning_rate": 3.363604900472026e-05, "loss": 0.7628, "step": 867 }, { "epoch": 0.26905343097369133, "grad_norm": 0.22361778011246639, "learning_rate": 3.362164723487787e-05, "loss": 0.7413, "step": 868 }, { "epoch": 0.269363400364214, "grad_norm": 0.22937950493510711, "learning_rate": 3.3607232278475885e-05, "loss": 0.7765, "step": 869 }, { "epoch": 0.2696733697547367, "grad_norm": 0.22559438320537462, "learning_rate": 3.359280414946881e-05, "loss": 0.735, "step": 870 }, { "epoch": 0.2699833391452594, "grad_norm": 0.2291684524875121, "learning_rate": 3.3578362861823945e-05, "loss": 0.7588, "step": 871 }, { "epoch": 0.2702933085357821, "grad_norm": 0.2520785971321808, "learning_rate": 3.35639084295213e-05, "loss": 0.7621, "step": 872 }, { "epoch": 0.2706032779263048, "grad_norm": 0.22353260929195956, "learning_rate": 3.354944086655362e-05, "loss": 0.7703, "step": 873 }, { "epoch": 0.2709132473168275, "grad_norm": 0.25004415309416633, "learning_rate": 3.353496018692637e-05, "loss": 0.7945, "step": 874 }, { "epoch": 0.27122321670735017, "grad_norm": 0.2603041425534796, "learning_rate": 3.3520466404657704e-05, "loss": 0.7601, "step": 875 }, { "epoch": 0.27153318609787286, "grad_norm": 0.2548418284687178, "learning_rate": 3.350595953377844e-05, "loss": 0.7472, "step": 876 }, { "epoch": 0.2718431554883955, "grad_norm": 0.25560395956843984, "learning_rate": 3.3491439588332106e-05, "loss": 0.7466, "step": 877 }, { "epoch": 0.2721531248789182, "grad_norm": 0.2622722824588928, "learning_rate": 3.347690658237487e-05, "loss": 0.7687, "step": 878 }, { "epoch": 0.2724630942694409, "grad_norm": 0.27788127600793466, "learning_rate": 3.346236052997552e-05, "loss": 0.7879, "step": 879 }, { "epoch": 0.27277306365996357, "grad_norm": 0.2438936089681872, "learning_rate": 3.344780144521552e-05, "loss": 0.7543, "step": 880 }, { "epoch": 0.27308303305048626, "grad_norm": 0.24440207364061828, "learning_rate": 3.34332293421889e-05, "loss": 0.7588, "step": 881 }, { "epoch": 0.27339300244100895, "grad_norm": 0.278949536675913, "learning_rate": 3.341864423500233e-05, "loss": 0.746, "step": 882 }, { "epoch": 0.27370297183153164, "grad_norm": 0.2518475953523928, "learning_rate": 3.340404613777506e-05, "loss": 0.7466, "step": 883 }, { "epoch": 0.27401294122205433, "grad_norm": 0.28627656064213813, "learning_rate": 3.3389435064638905e-05, "loss": 0.7896, "step": 884 }, { "epoch": 0.274322910612577, "grad_norm": 0.2185711746451371, "learning_rate": 3.337481102973824e-05, "loss": 0.778, "step": 885 }, { "epoch": 0.2746328800030997, "grad_norm": 0.2706019506236542, "learning_rate": 3.336017404723001e-05, "loss": 0.7721, "step": 886 }, { "epoch": 0.2749428493936224, "grad_norm": 0.2671721487406345, "learning_rate": 3.334552413128367e-05, "loss": 0.8032, "step": 887 }, { "epoch": 0.2752528187841451, "grad_norm": 0.26903202515864766, "learning_rate": 3.333086129608121e-05, "loss": 0.7755, "step": 888 }, { "epoch": 0.2755627881746677, "grad_norm": 0.24964233666265923, "learning_rate": 3.331618555581711e-05, "loss": 0.7683, "step": 889 }, { "epoch": 0.2758727575651904, "grad_norm": 0.24883053951806633, "learning_rate": 3.330149692469836e-05, "loss": 0.7291, "step": 890 }, { "epoch": 0.2761827269557131, "grad_norm": 0.2540042838217714, "learning_rate": 3.3286795416944434e-05, "loss": 0.7402, "step": 891 }, { "epoch": 0.2764926963462358, "grad_norm": 0.28066243374122485, "learning_rate": 3.3272081046787245e-05, "loss": 0.7527, "step": 892 }, { "epoch": 0.2768026657367585, "grad_norm": 0.23858484992537418, "learning_rate": 3.3257353828471185e-05, "loss": 0.7566, "step": 893 }, { "epoch": 0.2771126351272812, "grad_norm": 0.23452596576217588, "learning_rate": 3.324261377625305e-05, "loss": 0.723, "step": 894 }, { "epoch": 0.2774226045178039, "grad_norm": 0.24599756376213916, "learning_rate": 3.3227860904402114e-05, "loss": 0.7515, "step": 895 }, { "epoch": 0.27773257390832656, "grad_norm": 0.24801336087681558, "learning_rate": 3.321309522720001e-05, "loss": 0.7605, "step": 896 }, { "epoch": 0.27804254329884925, "grad_norm": 0.23337432569014574, "learning_rate": 3.319831675894078e-05, "loss": 0.7458, "step": 897 }, { "epoch": 0.27835251268937194, "grad_norm": 0.26540376265460974, "learning_rate": 3.318352551393087e-05, "loss": 0.7497, "step": 898 }, { "epoch": 0.27866248207989464, "grad_norm": 0.29613144912965383, "learning_rate": 3.316872150648907e-05, "loss": 0.7644, "step": 899 }, { "epoch": 0.27897245147041727, "grad_norm": 0.24694818721741707, "learning_rate": 3.315390475094654e-05, "loss": 0.758, "step": 900 }, { "epoch": 0.27928242086093996, "grad_norm": 0.24397925701331755, "learning_rate": 3.313907526164677e-05, "loss": 0.7594, "step": 901 }, { "epoch": 0.27959239025146265, "grad_norm": 0.25484663199143237, "learning_rate": 3.3124233052945584e-05, "loss": 0.7532, "step": 902 }, { "epoch": 0.27990235964198534, "grad_norm": 0.2409789672880315, "learning_rate": 3.310937813921111e-05, "loss": 0.7608, "step": 903 }, { "epoch": 0.28021232903250803, "grad_norm": 0.3795913573441996, "learning_rate": 3.30945105348238e-05, "loss": 0.7327, "step": 904 }, { "epoch": 0.2805222984230307, "grad_norm": 0.25636411187815433, "learning_rate": 3.307963025417636e-05, "loss": 0.7838, "step": 905 }, { "epoch": 0.2808322678135534, "grad_norm": 0.25144652306037996, "learning_rate": 3.3064737311673786e-05, "loss": 0.7528, "step": 906 }, { "epoch": 0.2811422372040761, "grad_norm": 0.236251698431929, "learning_rate": 3.3049831721733316e-05, "loss": 0.7521, "step": 907 }, { "epoch": 0.2814522065945988, "grad_norm": 0.26315581340854777, "learning_rate": 3.3034913498784465e-05, "loss": 0.7301, "step": 908 }, { "epoch": 0.2817621759851215, "grad_norm": 0.2252468941168773, "learning_rate": 3.3019982657268934e-05, "loss": 0.7437, "step": 909 }, { "epoch": 0.2820721453756442, "grad_norm": 0.25477499223901073, "learning_rate": 3.300503921164067e-05, "loss": 0.7362, "step": 910 }, { "epoch": 0.2823821147661668, "grad_norm": 0.272979847226357, "learning_rate": 3.29900831763658e-05, "loss": 0.7522, "step": 911 }, { "epoch": 0.2826920841566895, "grad_norm": 0.272718648316629, "learning_rate": 3.297511456592267e-05, "loss": 0.7525, "step": 912 }, { "epoch": 0.2830020535472122, "grad_norm": 0.22288931169241966, "learning_rate": 3.296013339480176e-05, "loss": 0.7299, "step": 913 }, { "epoch": 0.2833120229377349, "grad_norm": 0.26231819492728803, "learning_rate": 3.294513967750574e-05, "loss": 0.7647, "step": 914 }, { "epoch": 0.2836219923282576, "grad_norm": 0.30006747502945486, "learning_rate": 3.293013342854941e-05, "loss": 0.7456, "step": 915 }, { "epoch": 0.28393196171878027, "grad_norm": 0.25120546553051637, "learning_rate": 3.291511466245971e-05, "loss": 0.759, "step": 916 }, { "epoch": 0.28424193110930296, "grad_norm": 0.24171307158585356, "learning_rate": 3.2900083393775684e-05, "loss": 0.7577, "step": 917 }, { "epoch": 0.28455190049982565, "grad_norm": 0.26305177280825137, "learning_rate": 3.288503963704851e-05, "loss": 0.7368, "step": 918 }, { "epoch": 0.28486186989034834, "grad_norm": 0.2524570045331103, "learning_rate": 3.2869983406841406e-05, "loss": 0.7591, "step": 919 }, { "epoch": 0.28517183928087103, "grad_norm": 0.24452523043278895, "learning_rate": 3.28549147177297e-05, "loss": 0.7605, "step": 920 }, { "epoch": 0.2854818086713937, "grad_norm": 0.223356312262565, "learning_rate": 3.283983358430079e-05, "loss": 0.7327, "step": 921 }, { "epoch": 0.2857917780619164, "grad_norm": 0.24749829700278117, "learning_rate": 3.282474002115409e-05, "loss": 0.7425, "step": 922 }, { "epoch": 0.28610174745243905, "grad_norm": 0.22758995569364876, "learning_rate": 3.280963404290107e-05, "loss": 0.7815, "step": 923 }, { "epoch": 0.28641171684296174, "grad_norm": 0.24042262316147678, "learning_rate": 3.27945156641652e-05, "loss": 0.737, "step": 924 }, { "epoch": 0.28672168623348443, "grad_norm": 0.26237077249548096, "learning_rate": 3.277938489958198e-05, "loss": 0.7468, "step": 925 }, { "epoch": 0.2870316556240071, "grad_norm": 0.2818517103580507, "learning_rate": 3.276424176379886e-05, "loss": 0.7852, "step": 926 }, { "epoch": 0.2873416250145298, "grad_norm": 0.260421242933528, "learning_rate": 3.27490862714753e-05, "loss": 0.7562, "step": 927 }, { "epoch": 0.2876515944050525, "grad_norm": 0.22686946363767718, "learning_rate": 3.2733918437282735e-05, "loss": 0.7758, "step": 928 }, { "epoch": 0.2879615637955752, "grad_norm": 0.25280044636795385, "learning_rate": 3.271873827590449e-05, "loss": 0.7654, "step": 929 }, { "epoch": 0.2882715331860979, "grad_norm": 0.24480577558301395, "learning_rate": 3.270354580203588e-05, "loss": 0.7282, "step": 930 }, { "epoch": 0.2885815025766206, "grad_norm": 0.24480857792434194, "learning_rate": 3.268834103038411e-05, "loss": 0.7576, "step": 931 }, { "epoch": 0.28889147196714327, "grad_norm": 0.2264836436915036, "learning_rate": 3.267312397566831e-05, "loss": 0.7437, "step": 932 }, { "epoch": 0.28920144135766596, "grad_norm": 0.22584354256430736, "learning_rate": 3.2657894652619467e-05, "loss": 0.7312, "step": 933 }, { "epoch": 0.2895114107481886, "grad_norm": 0.2423090267999244, "learning_rate": 3.264265307598048e-05, "loss": 0.71, "step": 934 }, { "epoch": 0.2898213801387113, "grad_norm": 0.21697751241077787, "learning_rate": 3.262739926050609e-05, "loss": 0.7657, "step": 935 }, { "epoch": 0.290131349529234, "grad_norm": 0.24193791778370433, "learning_rate": 3.26121332209629e-05, "loss": 0.7375, "step": 936 }, { "epoch": 0.29044131891975666, "grad_norm": 0.21767398515490471, "learning_rate": 3.259685497212933e-05, "loss": 0.7274, "step": 937 }, { "epoch": 0.29075128831027935, "grad_norm": 0.24419832974306843, "learning_rate": 3.258156452879563e-05, "loss": 0.7617, "step": 938 }, { "epoch": 0.29106125770080205, "grad_norm": 0.21292127413559273, "learning_rate": 3.256626190576386e-05, "loss": 0.7621, "step": 939 }, { "epoch": 0.29137122709132474, "grad_norm": 0.22579053486324593, "learning_rate": 3.2550947117847845e-05, "loss": 0.777, "step": 940 }, { "epoch": 0.2916811964818474, "grad_norm": 0.2088900194639143, "learning_rate": 3.2535620179873235e-05, "loss": 0.761, "step": 941 }, { "epoch": 0.2919911658723701, "grad_norm": 0.23527037720839672, "learning_rate": 3.2520281106677385e-05, "loss": 0.7446, "step": 942 }, { "epoch": 0.2923011352628928, "grad_norm": 0.22473142669449764, "learning_rate": 3.250492991310943e-05, "loss": 0.7513, "step": 943 }, { "epoch": 0.2926111046534155, "grad_norm": 0.23756215551182833, "learning_rate": 3.248956661403024e-05, "loss": 0.7436, "step": 944 }, { "epoch": 0.29292107404393813, "grad_norm": 0.250523347475261, "learning_rate": 3.2474191224312394e-05, "loss": 0.7231, "step": 945 }, { "epoch": 0.2932310434344608, "grad_norm": 0.24286505522903745, "learning_rate": 3.245880375884018e-05, "loss": 0.7107, "step": 946 }, { "epoch": 0.2935410128249835, "grad_norm": 0.24669075622500197, "learning_rate": 3.244340423250957e-05, "loss": 0.7653, "step": 947 }, { "epoch": 0.2938509822155062, "grad_norm": 0.25960939077807504, "learning_rate": 3.242799266022821e-05, "loss": 0.7476, "step": 948 }, { "epoch": 0.2941609516060289, "grad_norm": 0.2458134859538897, "learning_rate": 3.241256905691542e-05, "loss": 0.7581, "step": 949 }, { "epoch": 0.2944709209965516, "grad_norm": 0.22767030200583296, "learning_rate": 3.239713343750217e-05, "loss": 0.7481, "step": 950 }, { "epoch": 0.2947808903870743, "grad_norm": 0.24511953767291453, "learning_rate": 3.238168581693103e-05, "loss": 0.722, "step": 951 }, { "epoch": 0.29509085977759697, "grad_norm": 0.3726511751710222, "learning_rate": 3.2366226210156226e-05, "loss": 0.7869, "step": 952 }, { "epoch": 0.29540082916811966, "grad_norm": 0.24664013262158416, "learning_rate": 3.2350754632143565e-05, "loss": 0.7702, "step": 953 }, { "epoch": 0.29571079855864235, "grad_norm": 0.25318777337034376, "learning_rate": 3.233527109787045e-05, "loss": 0.7625, "step": 954 }, { "epoch": 0.29602076794916504, "grad_norm": 0.2450554802200368, "learning_rate": 3.2319775622325854e-05, "loss": 0.763, "step": 955 }, { "epoch": 0.29633073733968773, "grad_norm": 0.22616782456640486, "learning_rate": 3.230426822051032e-05, "loss": 0.7431, "step": 956 }, { "epoch": 0.29664070673021037, "grad_norm": 0.26333467445875214, "learning_rate": 3.228874890743592e-05, "loss": 0.7656, "step": 957 }, { "epoch": 0.29695067612073306, "grad_norm": 0.26427832123681605, "learning_rate": 3.227321769812628e-05, "loss": 0.7519, "step": 958 }, { "epoch": 0.29726064551125575, "grad_norm": 0.23825009284806262, "learning_rate": 3.225767460761651e-05, "loss": 0.7698, "step": 959 }, { "epoch": 0.29757061490177844, "grad_norm": 0.24179539958391214, "learning_rate": 3.224211965095326e-05, "loss": 0.7758, "step": 960 }, { "epoch": 0.29788058429230113, "grad_norm": 0.2677300746177885, "learning_rate": 3.222655284319464e-05, "loss": 0.7293, "step": 961 }, { "epoch": 0.2981905536828238, "grad_norm": 0.49263528695583697, "learning_rate": 3.221097419941023e-05, "loss": 0.7373, "step": 962 }, { "epoch": 0.2985005230733465, "grad_norm": 0.24300352375299628, "learning_rate": 3.2195383734681095e-05, "loss": 0.7531, "step": 963 }, { "epoch": 0.2988104924638692, "grad_norm": 0.22571297259187206, "learning_rate": 3.21797814640997e-05, "loss": 0.7368, "step": 964 }, { "epoch": 0.2991204618543919, "grad_norm": 0.22328401308212834, "learning_rate": 3.2164167402769997e-05, "loss": 0.7417, "step": 965 }, { "epoch": 0.2994304312449146, "grad_norm": 0.24106998947980787, "learning_rate": 3.214854156580731e-05, "loss": 0.7413, "step": 966 }, { "epoch": 0.2997404006354373, "grad_norm": 0.25225683672856647, "learning_rate": 3.213290396833836e-05, "loss": 0.7731, "step": 967 }, { "epoch": 0.3000503700259599, "grad_norm": 0.23663759564592565, "learning_rate": 3.211725462550127e-05, "loss": 0.7321, "step": 968 }, { "epoch": 0.3003603394164826, "grad_norm": 0.23507241590081665, "learning_rate": 3.210159355244554e-05, "loss": 0.7352, "step": 969 }, { "epoch": 0.3006703088070053, "grad_norm": 0.2231734232907866, "learning_rate": 3.2085920764331996e-05, "loss": 0.7616, "step": 970 }, { "epoch": 0.300980278197528, "grad_norm": 0.22835699038364082, "learning_rate": 3.2070236276332846e-05, "loss": 0.7646, "step": 971 }, { "epoch": 0.3012902475880507, "grad_norm": 0.2660411707425027, "learning_rate": 3.205454010363159e-05, "loss": 0.7617, "step": 972 }, { "epoch": 0.30160021697857337, "grad_norm": 0.24652774916569453, "learning_rate": 3.203883226142305e-05, "loss": 0.7676, "step": 973 }, { "epoch": 0.30191018636909606, "grad_norm": 0.23892757506477638, "learning_rate": 3.202311276491334e-05, "loss": 0.7478, "step": 974 }, { "epoch": 0.30222015575961875, "grad_norm": 0.9794261909249787, "learning_rate": 3.2007381629319875e-05, "loss": 0.7671, "step": 975 }, { "epoch": 0.30253012515014144, "grad_norm": 0.2455414349854984, "learning_rate": 3.199163886987132e-05, "loss": 0.7497, "step": 976 }, { "epoch": 0.30284009454066413, "grad_norm": 0.24092230892214198, "learning_rate": 3.1975884501807576e-05, "loss": 0.7375, "step": 977 }, { "epoch": 0.3031500639311868, "grad_norm": 0.24217232980655312, "learning_rate": 3.196011854037983e-05, "loss": 0.7641, "step": 978 }, { "epoch": 0.30346003332170945, "grad_norm": 0.2883323387035198, "learning_rate": 3.194434100085046e-05, "loss": 0.7648, "step": 979 }, { "epoch": 0.30377000271223215, "grad_norm": 0.2729902374482473, "learning_rate": 3.192855189849304e-05, "loss": 0.732, "step": 980 }, { "epoch": 0.30407997210275484, "grad_norm": 0.22637964019346565, "learning_rate": 3.191275124859238e-05, "loss": 0.7673, "step": 981 }, { "epoch": 0.3043899414932775, "grad_norm": 0.3209522134669654, "learning_rate": 3.189693906644442e-05, "loss": 0.7452, "step": 982 }, { "epoch": 0.3046999108838002, "grad_norm": 0.2879443396228846, "learning_rate": 3.1881115367356295e-05, "loss": 0.7933, "step": 983 }, { "epoch": 0.3050098802743229, "grad_norm": 0.2576259325087488, "learning_rate": 3.186528016664628e-05, "loss": 0.7752, "step": 984 }, { "epoch": 0.3053198496648456, "grad_norm": 0.2494876052524028, "learning_rate": 3.184943347964379e-05, "loss": 0.7112, "step": 985 }, { "epoch": 0.3056298190553683, "grad_norm": 0.26921116070078294, "learning_rate": 3.183357532168934e-05, "loss": 0.7332, "step": 986 }, { "epoch": 0.305939788445891, "grad_norm": 0.23494077879367958, "learning_rate": 3.181770570813459e-05, "loss": 0.7574, "step": 987 }, { "epoch": 0.30624975783641367, "grad_norm": 0.25164123402380645, "learning_rate": 3.180182465434224e-05, "loss": 0.7265, "step": 988 }, { "epoch": 0.30655972722693636, "grad_norm": 0.2908136197523213, "learning_rate": 3.178593217568611e-05, "loss": 0.7212, "step": 989 }, { "epoch": 0.30686969661745905, "grad_norm": 0.23636146839530967, "learning_rate": 3.1770028287551035e-05, "loss": 0.7582, "step": 990 }, { "epoch": 0.3071796660079817, "grad_norm": 0.25354446638955486, "learning_rate": 3.175411300533293e-05, "loss": 0.7204, "step": 991 }, { "epoch": 0.3074896353985044, "grad_norm": 0.230939422531496, "learning_rate": 3.1738186344438733e-05, "loss": 0.728, "step": 992 }, { "epoch": 0.30779960478902707, "grad_norm": 0.24018558028336165, "learning_rate": 3.172224832028639e-05, "loss": 0.7345, "step": 993 }, { "epoch": 0.30810957417954976, "grad_norm": 0.2298163928566889, "learning_rate": 3.170629894830484e-05, "loss": 0.732, "step": 994 }, { "epoch": 0.30841954357007245, "grad_norm": 0.21619479560992938, "learning_rate": 3.169033824393404e-05, "loss": 0.7095, "step": 995 }, { "epoch": 0.30872951296059514, "grad_norm": 0.25089796020563854, "learning_rate": 3.167436622262487e-05, "loss": 0.7692, "step": 996 }, { "epoch": 0.30903948235111783, "grad_norm": 0.24966286391832243, "learning_rate": 3.1658382899839216e-05, "loss": 0.7275, "step": 997 }, { "epoch": 0.3093494517416405, "grad_norm": 0.22581210955369257, "learning_rate": 3.164238829104986e-05, "loss": 0.7441, "step": 998 }, { "epoch": 0.3096594211321632, "grad_norm": 0.2518054515977403, "learning_rate": 3.162638241174053e-05, "loss": 0.7409, "step": 999 }, { "epoch": 0.3099693905226859, "grad_norm": 0.23929613102677205, "learning_rate": 3.1610365277405874e-05, "loss": 0.7448, "step": 1000 }, { "epoch": 0.3102793599132086, "grad_norm": 2.6790041342816786, "learning_rate": 3.159433690355141e-05, "loss": 0.7637, "step": 1001 }, { "epoch": 0.31058932930373123, "grad_norm": 0.2810726343437977, "learning_rate": 3.1578297305693575e-05, "loss": 0.7407, "step": 1002 }, { "epoch": 0.3108992986942539, "grad_norm": 0.34146784164694005, "learning_rate": 3.156224649935962e-05, "loss": 0.7559, "step": 1003 }, { "epoch": 0.3112092680847766, "grad_norm": 0.2473135891919964, "learning_rate": 3.1546184500087685e-05, "loss": 0.7525, "step": 1004 }, { "epoch": 0.3115192374752993, "grad_norm": 0.2755680265549058, "learning_rate": 3.1530111323426746e-05, "loss": 0.7336, "step": 1005 }, { "epoch": 0.311829206865822, "grad_norm": 0.3011783747409944, "learning_rate": 3.1514026984936573e-05, "loss": 0.7339, "step": 1006 }, { "epoch": 0.3121391762563447, "grad_norm": 0.2814140829840387, "learning_rate": 3.149793150018776e-05, "loss": 0.7479, "step": 1007 }, { "epoch": 0.3124491456468674, "grad_norm": 0.23373519025430378, "learning_rate": 3.148182488476169e-05, "loss": 0.7511, "step": 1008 }, { "epoch": 0.31275911503739007, "grad_norm": 0.2577542375075003, "learning_rate": 3.1465707154250526e-05, "loss": 0.7587, "step": 1009 }, { "epoch": 0.31306908442791276, "grad_norm": 0.2973148654175549, "learning_rate": 3.1449578324257166e-05, "loss": 0.7366, "step": 1010 }, { "epoch": 0.31337905381843545, "grad_norm": 0.22393694343022819, "learning_rate": 3.143343841039529e-05, "loss": 0.7649, "step": 1011 }, { "epoch": 0.31368902320895814, "grad_norm": 0.2889320019735483, "learning_rate": 3.141728742828926e-05, "loss": 0.7739, "step": 1012 }, { "epoch": 0.3139989925994808, "grad_norm": 0.26992016630162996, "learning_rate": 3.140112539357421e-05, "loss": 0.7694, "step": 1013 }, { "epoch": 0.31430896199000347, "grad_norm": 0.8874246280868295, "learning_rate": 3.1384952321895946e-05, "loss": 0.7509, "step": 1014 }, { "epoch": 0.31461893138052616, "grad_norm": 0.28279556481784074, "learning_rate": 3.1368768228910935e-05, "loss": 0.7332, "step": 1015 }, { "epoch": 0.31492890077104885, "grad_norm": 0.262665835869936, "learning_rate": 3.135257313028635e-05, "loss": 0.7385, "step": 1016 }, { "epoch": 0.31523887016157154, "grad_norm": 0.250570913380528, "learning_rate": 3.13363670417e-05, "loss": 0.7645, "step": 1017 }, { "epoch": 0.31554883955209423, "grad_norm": 0.24511645211154903, "learning_rate": 3.132014997884033e-05, "loss": 0.7173, "step": 1018 }, { "epoch": 0.3158588089426169, "grad_norm": 0.25114967517044523, "learning_rate": 3.130392195740643e-05, "loss": 0.7725, "step": 1019 }, { "epoch": 0.3161687783331396, "grad_norm": 0.22041694576517554, "learning_rate": 3.128768299310798e-05, "loss": 0.7688, "step": 1020 }, { "epoch": 0.3164787477236623, "grad_norm": 0.2672916234429763, "learning_rate": 3.127143310166524e-05, "loss": 0.7502, "step": 1021 }, { "epoch": 0.316788717114185, "grad_norm": 0.23730279855207254, "learning_rate": 3.125517229880909e-05, "loss": 0.774, "step": 1022 }, { "epoch": 0.3170986865047077, "grad_norm": 0.2168677602386088, "learning_rate": 3.123890060028093e-05, "loss": 0.7604, "step": 1023 }, { "epoch": 0.3174086558952303, "grad_norm": 0.6924638485109834, "learning_rate": 3.122261802183274e-05, "loss": 0.7388, "step": 1024 }, { "epoch": 0.317718625285753, "grad_norm": 0.2781563257082715, "learning_rate": 3.1206324579227007e-05, "loss": 0.7525, "step": 1025 }, { "epoch": 0.3180285946762757, "grad_norm": 5.441606271526102, "learning_rate": 3.1190020288236746e-05, "loss": 0.7009, "step": 1026 }, { "epoch": 0.3183385640667984, "grad_norm": 0.3699839431520652, "learning_rate": 3.117370516464548e-05, "loss": 0.7557, "step": 1027 }, { "epoch": 0.3186485334573211, "grad_norm": 0.42243674882184784, "learning_rate": 3.115737922424721e-05, "loss": 0.7511, "step": 1028 }, { "epoch": 0.3189585028478438, "grad_norm": 0.2519808056951553, "learning_rate": 3.114104248284643e-05, "loss": 0.749, "step": 1029 }, { "epoch": 0.31926847223836646, "grad_norm": 0.2914828530648478, "learning_rate": 3.1124694956258036e-05, "loss": 0.7641, "step": 1030 }, { "epoch": 0.31957844162888915, "grad_norm": 0.3166894147805065, "learning_rate": 3.110833666030742e-05, "loss": 0.7515, "step": 1031 }, { "epoch": 0.31988841101941184, "grad_norm": 0.2540334726785875, "learning_rate": 3.1091967610830386e-05, "loss": 0.7413, "step": 1032 }, { "epoch": 0.32019838040993454, "grad_norm": 0.2812282306987321, "learning_rate": 3.107558782367312e-05, "loss": 0.7735, "step": 1033 }, { "epoch": 0.3205083498004572, "grad_norm": 0.2892612452954716, "learning_rate": 3.105919731469225e-05, "loss": 0.7352, "step": 1034 }, { "epoch": 0.3208183191909799, "grad_norm": 0.21610873051303356, "learning_rate": 3.104279609975474e-05, "loss": 0.7316, "step": 1035 }, { "epoch": 0.32112828858150255, "grad_norm": 0.30755228955974323, "learning_rate": 3.1026384194737944e-05, "loss": 0.7383, "step": 1036 }, { "epoch": 0.32143825797202524, "grad_norm": 0.2963356124481978, "learning_rate": 3.100996161552955e-05, "loss": 0.7632, "step": 1037 }, { "epoch": 0.32174822736254793, "grad_norm": 2.4274618582188503, "learning_rate": 3.0993528378027576e-05, "loss": 0.782, "step": 1038 }, { "epoch": 0.3220581967530706, "grad_norm": 0.41288040339987575, "learning_rate": 3.097708449814039e-05, "loss": 0.7379, "step": 1039 }, { "epoch": 0.3223681661435933, "grad_norm": 0.3553005448623427, "learning_rate": 3.0960629991786625e-05, "loss": 0.7409, "step": 1040 }, { "epoch": 0.322678135534116, "grad_norm": 0.27111502383736497, "learning_rate": 3.094416487489521e-05, "loss": 0.703, "step": 1041 }, { "epoch": 0.3229881049246387, "grad_norm": 0.3438883997265298, "learning_rate": 3.0927689163405377e-05, "loss": 0.7645, "step": 1042 }, { "epoch": 0.3232980743151614, "grad_norm": 0.3499914314464935, "learning_rate": 3.0911202873266556e-05, "loss": 0.7627, "step": 1043 }, { "epoch": 0.3236080437056841, "grad_norm": 0.2895451276187434, "learning_rate": 3.089470602043847e-05, "loss": 0.7504, "step": 1044 }, { "epoch": 0.32391801309620677, "grad_norm": 0.26373494284831794, "learning_rate": 3.087819862089105e-05, "loss": 0.7233, "step": 1045 }, { "epoch": 0.32422798248672946, "grad_norm": 0.8490386218762093, "learning_rate": 3.0861680690604415e-05, "loss": 0.7597, "step": 1046 }, { "epoch": 0.3245379518772521, "grad_norm": 0.28094004394597466, "learning_rate": 3.0845152245568915e-05, "loss": 0.7313, "step": 1047 }, { "epoch": 0.3248479212677748, "grad_norm": 0.25266281007417385, "learning_rate": 3.0828613301785066e-05, "loss": 0.7528, "step": 1048 }, { "epoch": 0.3251578906582975, "grad_norm": 0.259746567813253, "learning_rate": 3.081206387526353e-05, "loss": 0.7313, "step": 1049 }, { "epoch": 0.32546786004882017, "grad_norm": 0.25762494144753356, "learning_rate": 3.0795503982025125e-05, "loss": 0.7669, "step": 1050 }, { "epoch": 0.32577782943934286, "grad_norm": 0.2477585726349949, "learning_rate": 3.077893363810082e-05, "loss": 0.7465, "step": 1051 }, { "epoch": 0.32608779882986555, "grad_norm": 0.27739486141268443, "learning_rate": 3.076235285953169e-05, "loss": 0.7189, "step": 1052 }, { "epoch": 0.32639776822038824, "grad_norm": 0.2677545428646783, "learning_rate": 3.074576166236889e-05, "loss": 0.7355, "step": 1053 }, { "epoch": 0.32670773761091093, "grad_norm": 0.8813776545664329, "learning_rate": 3.07291600626737e-05, "loss": 0.7433, "step": 1054 }, { "epoch": 0.3270177070014336, "grad_norm": 0.275200572736372, "learning_rate": 3.071254807651744e-05, "loss": 0.7486, "step": 1055 }, { "epoch": 0.3273276763919563, "grad_norm": 0.25412069671476284, "learning_rate": 3.069592571998149e-05, "loss": 0.7188, "step": 1056 }, { "epoch": 0.327637645782479, "grad_norm": 0.21421894812288114, "learning_rate": 3.067929300915728e-05, "loss": 0.7787, "step": 1057 }, { "epoch": 0.32794761517300164, "grad_norm": 0.25331315295790735, "learning_rate": 3.066264996014625e-05, "loss": 0.7961, "step": 1058 }, { "epoch": 0.32825758456352433, "grad_norm": 0.3499978269555264, "learning_rate": 3.0645996589059875e-05, "loss": 0.7443, "step": 1059 }, { "epoch": 0.328567553954047, "grad_norm": 0.23176683572512144, "learning_rate": 3.0629332912019574e-05, "loss": 0.7001, "step": 1060 }, { "epoch": 0.3288775233445697, "grad_norm": 0.27098402230316615, "learning_rate": 3.061265894515679e-05, "loss": 0.7966, "step": 1061 }, { "epoch": 0.3291874927350924, "grad_norm": 0.22198273590853412, "learning_rate": 3.059597470461291e-05, "loss": 0.7324, "step": 1062 }, { "epoch": 0.3294974621256151, "grad_norm": 0.2548378121125497, "learning_rate": 3.057928020653925e-05, "loss": 0.7532, "step": 1063 }, { "epoch": 0.3298074315161378, "grad_norm": 0.2572084060998425, "learning_rate": 3.056257546709709e-05, "loss": 0.7529, "step": 1064 }, { "epoch": 0.3301174009066605, "grad_norm": 0.2266720289023458, "learning_rate": 3.0545860502457585e-05, "loss": 0.7277, "step": 1065 }, { "epoch": 0.33042737029718316, "grad_norm": 0.27281798817947117, "learning_rate": 3.052913532880182e-05, "loss": 0.7403, "step": 1066 }, { "epoch": 0.33073733968770586, "grad_norm": 0.30942752557427544, "learning_rate": 3.0512399962320755e-05, "loss": 0.7087, "step": 1067 }, { "epoch": 0.33104730907822855, "grad_norm": 0.26354446897254175, "learning_rate": 3.0495654419215204e-05, "loss": 0.7649, "step": 1068 }, { "epoch": 0.33135727846875124, "grad_norm": 0.27542770247373477, "learning_rate": 3.0478898715695852e-05, "loss": 0.7382, "step": 1069 }, { "epoch": 0.3316672478592739, "grad_norm": 0.25876766029244386, "learning_rate": 3.0462132867983205e-05, "loss": 0.7151, "step": 1070 }, { "epoch": 0.33197721724979656, "grad_norm": 0.2292348294676987, "learning_rate": 3.044535689230759e-05, "loss": 0.767, "step": 1071 }, { "epoch": 0.33228718664031925, "grad_norm": 0.28205052215113974, "learning_rate": 3.0428570804909152e-05, "loss": 0.7374, "step": 1072 }, { "epoch": 0.33259715603084195, "grad_norm": 0.28265870976901275, "learning_rate": 3.0411774622037816e-05, "loss": 0.7491, "step": 1073 }, { "epoch": 0.33290712542136464, "grad_norm": 0.22228880319491698, "learning_rate": 3.0394968359953263e-05, "loss": 0.7445, "step": 1074 }, { "epoch": 0.3332170948118873, "grad_norm": 0.3081913694355383, "learning_rate": 3.037815203492497e-05, "loss": 0.733, "step": 1075 }, { "epoch": 0.33352706420241, "grad_norm": 0.3228628428648717, "learning_rate": 3.036132566323212e-05, "loss": 0.7741, "step": 1076 }, { "epoch": 0.3338370335929327, "grad_norm": 0.23851203080215833, "learning_rate": 3.0344489261163646e-05, "loss": 0.7248, "step": 1077 }, { "epoch": 0.3341470029834554, "grad_norm": 0.2842482555480414, "learning_rate": 3.0327642845018167e-05, "loss": 0.7278, "step": 1078 }, { "epoch": 0.3344569723739781, "grad_norm": 0.3206323697078852, "learning_rate": 3.031078643110402e-05, "loss": 0.763, "step": 1079 }, { "epoch": 0.3347669417645008, "grad_norm": 0.23487951661036732, "learning_rate": 3.0293920035739206e-05, "loss": 0.721, "step": 1080 }, { "epoch": 0.3350769111550234, "grad_norm": 0.27054750762194635, "learning_rate": 3.0277043675251395e-05, "loss": 0.7392, "step": 1081 }, { "epoch": 0.3353868805455461, "grad_norm": 0.2804668629372328, "learning_rate": 3.0260157365977893e-05, "loss": 0.737, "step": 1082 }, { "epoch": 0.3356968499360688, "grad_norm": 0.21720251570795898, "learning_rate": 3.0243261124265655e-05, "loss": 0.7439, "step": 1083 }, { "epoch": 0.3360068193265915, "grad_norm": 0.2771375270622577, "learning_rate": 3.0226354966471237e-05, "loss": 0.7324, "step": 1084 }, { "epoch": 0.3363167887171142, "grad_norm": 0.3039986091673437, "learning_rate": 3.0209438908960803e-05, "loss": 0.7502, "step": 1085 }, { "epoch": 0.33662675810763687, "grad_norm": 0.204611967056365, "learning_rate": 3.019251296811009e-05, "loss": 0.7537, "step": 1086 }, { "epoch": 0.33693672749815956, "grad_norm": 0.2857187030618154, "learning_rate": 3.0175577160304414e-05, "loss": 0.7402, "step": 1087 }, { "epoch": 0.33724669688868225, "grad_norm": 0.3051334172385636, "learning_rate": 3.0158631501938634e-05, "loss": 0.6995, "step": 1088 }, { "epoch": 0.33755666627920494, "grad_norm": 1.0111050777163988, "learning_rate": 3.0141676009417157e-05, "loss": 0.7866, "step": 1089 }, { "epoch": 0.33786663566972763, "grad_norm": 0.24117439289989703, "learning_rate": 3.012471069915389e-05, "loss": 0.7329, "step": 1090 }, { "epoch": 0.3381766050602503, "grad_norm": 0.2614849861856249, "learning_rate": 3.010773558757227e-05, "loss": 0.7361, "step": 1091 }, { "epoch": 0.33848657445077296, "grad_norm": 0.23561459347548863, "learning_rate": 3.0090750691105196e-05, "loss": 0.7346, "step": 1092 }, { "epoch": 0.33879654384129565, "grad_norm": 0.2518686226984406, "learning_rate": 3.0073756026195053e-05, "loss": 0.7291, "step": 1093 }, { "epoch": 0.33910651323181834, "grad_norm": 0.2707352108451459, "learning_rate": 3.005675160929369e-05, "loss": 0.763, "step": 1094 }, { "epoch": 0.33941648262234103, "grad_norm": 0.6837674157319147, "learning_rate": 3.003973745686238e-05, "loss": 0.7954, "step": 1095 }, { "epoch": 0.3397264520128637, "grad_norm": 0.28832602131855717, "learning_rate": 3.002271358537183e-05, "loss": 0.7716, "step": 1096 }, { "epoch": 0.3400364214033864, "grad_norm": 0.24433893863786485, "learning_rate": 3.0005680011302155e-05, "loss": 0.7661, "step": 1097 }, { "epoch": 0.3403463907939091, "grad_norm": 0.22742257585282194, "learning_rate": 2.9988636751142863e-05, "loss": 0.7387, "step": 1098 }, { "epoch": 0.3406563601844318, "grad_norm": 0.2747350498450583, "learning_rate": 2.997158382139283e-05, "loss": 0.7778, "step": 1099 }, { "epoch": 0.3409663295749545, "grad_norm": 0.22463080720147466, "learning_rate": 2.99545212385603e-05, "loss": 0.7618, "step": 1100 }, { "epoch": 0.3412762989654772, "grad_norm": 0.2357528741484417, "learning_rate": 2.9937449019162873e-05, "loss": 0.7579, "step": 1101 }, { "epoch": 0.34158626835599987, "grad_norm": 0.27103609099147863, "learning_rate": 2.9920367179727458e-05, "loss": 0.7482, "step": 1102 }, { "epoch": 0.34189623774652256, "grad_norm": 0.23064526887196252, "learning_rate": 2.9903275736790288e-05, "loss": 0.7419, "step": 1103 }, { "epoch": 0.3422062071370452, "grad_norm": 0.2504010803545433, "learning_rate": 2.9886174706896883e-05, "loss": 0.7579, "step": 1104 }, { "epoch": 0.3425161765275679, "grad_norm": 0.2809888457410378, "learning_rate": 2.986906410660206e-05, "loss": 0.7474, "step": 1105 }, { "epoch": 0.3428261459180906, "grad_norm": 0.204125007968823, "learning_rate": 2.985194395246989e-05, "loss": 0.7467, "step": 1106 }, { "epoch": 0.34313611530861327, "grad_norm": 0.25745215800658616, "learning_rate": 2.983481426107369e-05, "loss": 0.7252, "step": 1107 }, { "epoch": 0.34344608469913596, "grad_norm": 0.22150319509717886, "learning_rate": 2.981767504899601e-05, "loss": 0.7422, "step": 1108 }, { "epoch": 0.34375605408965865, "grad_norm": 0.22695963685452122, "learning_rate": 2.980052633282863e-05, "loss": 0.7263, "step": 1109 }, { "epoch": 0.34406602348018134, "grad_norm": 0.3030831920143564, "learning_rate": 2.978336812917252e-05, "loss": 0.7353, "step": 1110 }, { "epoch": 0.34437599287070403, "grad_norm": 0.21413029577495268, "learning_rate": 2.976620045463783e-05, "loss": 0.7554, "step": 1111 }, { "epoch": 0.3446859622612267, "grad_norm": 0.2063883673237585, "learning_rate": 2.974902332584388e-05, "loss": 0.747, "step": 1112 }, { "epoch": 0.3449959316517494, "grad_norm": 0.22681006785622881, "learning_rate": 2.973183675941915e-05, "loss": 0.7436, "step": 1113 }, { "epoch": 0.3453059010422721, "grad_norm": 0.20834478817377095, "learning_rate": 2.971464077200126e-05, "loss": 0.7253, "step": 1114 }, { "epoch": 0.34561587043279474, "grad_norm": 0.21532761873822415, "learning_rate": 2.9697435380236933e-05, "loss": 0.7618, "step": 1115 }, { "epoch": 0.3459258398233174, "grad_norm": 0.2351279348791468, "learning_rate": 2.9680220600782003e-05, "loss": 0.7638, "step": 1116 }, { "epoch": 0.3462358092138401, "grad_norm": 0.20573529035269236, "learning_rate": 2.966299645030141e-05, "loss": 0.729, "step": 1117 }, { "epoch": 0.3465457786043628, "grad_norm": 0.22883096757995225, "learning_rate": 2.9645762945469126e-05, "loss": 0.737, "step": 1118 }, { "epoch": 0.3468557479948855, "grad_norm": 0.2339928024320759, "learning_rate": 2.9628520102968223e-05, "loss": 0.7207, "step": 1119 }, { "epoch": 0.3471657173854082, "grad_norm": 0.21245021780648382, "learning_rate": 2.9611267939490776e-05, "loss": 0.7321, "step": 1120 }, { "epoch": 0.3474756867759309, "grad_norm": 0.230827337459178, "learning_rate": 2.9594006471737908e-05, "loss": 0.731, "step": 1121 }, { "epoch": 0.34778565616645357, "grad_norm": 0.20718065701815577, "learning_rate": 2.9576735716419735e-05, "loss": 0.7313, "step": 1122 }, { "epoch": 0.34809562555697626, "grad_norm": 0.2182115122005175, "learning_rate": 2.955945569025538e-05, "loss": 0.7377, "step": 1123 }, { "epoch": 0.34840559494749895, "grad_norm": 0.20805626608955557, "learning_rate": 2.9542166409972915e-05, "loss": 0.7267, "step": 1124 }, { "epoch": 0.34871556433802164, "grad_norm": 0.23880196967416106, "learning_rate": 2.9524867892309395e-05, "loss": 0.769, "step": 1125 }, { "epoch": 0.3490255337285443, "grad_norm": 0.19675391857457653, "learning_rate": 2.95075601540108e-05, "loss": 0.6934, "step": 1126 }, { "epoch": 0.34933550311906697, "grad_norm": 0.4862217566798732, "learning_rate": 2.949024321183205e-05, "loss": 0.7588, "step": 1127 }, { "epoch": 0.34964547250958966, "grad_norm": 0.515075565894092, "learning_rate": 2.9472917082536967e-05, "loss": 0.7341, "step": 1128 }, { "epoch": 0.34995544190011235, "grad_norm": 0.2631116259161958, "learning_rate": 2.945558178289827e-05, "loss": 0.7404, "step": 1129 }, { "epoch": 0.35026541129063504, "grad_norm": 0.21085597284550237, "learning_rate": 2.9438237329697546e-05, "loss": 0.7669, "step": 1130 }, { "epoch": 0.35057538068115773, "grad_norm": 0.9818371184279339, "learning_rate": 2.9420883739725255e-05, "loss": 0.7612, "step": 1131 }, { "epoch": 0.3508853500716804, "grad_norm": 0.2595826450426671, "learning_rate": 2.94035210297807e-05, "loss": 0.713, "step": 1132 }, { "epoch": 0.3511953194622031, "grad_norm": 0.2436045799133853, "learning_rate": 2.9386149216672014e-05, "loss": 0.7179, "step": 1133 }, { "epoch": 0.3515052888527258, "grad_norm": 0.24400480975350253, "learning_rate": 2.936876831721613e-05, "loss": 0.7452, "step": 1134 }, { "epoch": 0.3518152582432485, "grad_norm": 0.26580383884918835, "learning_rate": 2.9351378348238787e-05, "loss": 0.7829, "step": 1135 }, { "epoch": 0.3521252276337712, "grad_norm": 0.24138296451527305, "learning_rate": 2.933397932657451e-05, "loss": 0.7424, "step": 1136 }, { "epoch": 0.3524351970242939, "grad_norm": 0.23098850834012627, "learning_rate": 2.9316571269066572e-05, "loss": 0.7136, "step": 1137 }, { "epoch": 0.3527451664148165, "grad_norm": 0.21878621553844999, "learning_rate": 2.9299154192567014e-05, "loss": 0.7215, "step": 1138 }, { "epoch": 0.3530551358053392, "grad_norm": 0.20438136228372825, "learning_rate": 2.9281728113936575e-05, "loss": 0.7369, "step": 1139 }, { "epoch": 0.3533651051958619, "grad_norm": 0.21320919473080224, "learning_rate": 2.926429305004475e-05, "loss": 0.7442, "step": 1140 }, { "epoch": 0.3536750745863846, "grad_norm": 0.23196561791948825, "learning_rate": 2.9246849017769697e-05, "loss": 0.7578, "step": 1141 }, { "epoch": 0.3539850439769073, "grad_norm": 0.2190893481940896, "learning_rate": 2.9229396033998277e-05, "loss": 0.7215, "step": 1142 }, { "epoch": 0.35429501336742997, "grad_norm": 0.21929217601798584, "learning_rate": 2.9211934115626008e-05, "loss": 0.753, "step": 1143 }, { "epoch": 0.35460498275795266, "grad_norm": 0.22282171080643928, "learning_rate": 2.9194463279557063e-05, "loss": 0.7348, "step": 1144 }, { "epoch": 0.35491495214847535, "grad_norm": 0.22784075522748054, "learning_rate": 2.9176983542704237e-05, "loss": 0.7481, "step": 1145 }, { "epoch": 0.35522492153899804, "grad_norm": 0.39591026247946765, "learning_rate": 2.915949492198895e-05, "loss": 0.7331, "step": 1146 }, { "epoch": 0.35553489092952073, "grad_norm": 0.2213789587588288, "learning_rate": 2.9141997434341224e-05, "loss": 0.7618, "step": 1147 }, { "epoch": 0.3558448603200434, "grad_norm": 0.2958969107334082, "learning_rate": 2.9124491096699654e-05, "loss": 0.7646, "step": 1148 }, { "epoch": 0.35615482971056606, "grad_norm": 0.23316018634328137, "learning_rate": 2.9106975926011423e-05, "loss": 0.7647, "step": 1149 }, { "epoch": 0.35646479910108875, "grad_norm": 0.2121420382509286, "learning_rate": 2.9089451939232246e-05, "loss": 0.7537, "step": 1150 }, { "epoch": 0.35677476849161144, "grad_norm": 0.24417197961950893, "learning_rate": 2.9071919153326367e-05, "loss": 0.7331, "step": 1151 }, { "epoch": 0.35708473788213413, "grad_norm": 0.20849397859581126, "learning_rate": 2.9054377585266568e-05, "loss": 0.7487, "step": 1152 }, { "epoch": 0.3573947072726568, "grad_norm": 0.22015395239132388, "learning_rate": 2.903682725203412e-05, "loss": 0.7693, "step": 1153 }, { "epoch": 0.3577046766631795, "grad_norm": 0.2137959558161097, "learning_rate": 2.9019268170618793e-05, "loss": 0.7429, "step": 1154 }, { "epoch": 0.3580146460537022, "grad_norm": 0.21789929664409824, "learning_rate": 2.9001700358018808e-05, "loss": 0.752, "step": 1155 }, { "epoch": 0.3583246154442249, "grad_norm": 0.2141247820500741, "learning_rate": 2.8984123831240843e-05, "loss": 0.7382, "step": 1156 }, { "epoch": 0.3586345848347476, "grad_norm": 0.2053788494869095, "learning_rate": 2.896653860730002e-05, "loss": 0.725, "step": 1157 }, { "epoch": 0.3589445542252703, "grad_norm": 0.2070862243426331, "learning_rate": 2.8948944703219872e-05, "loss": 0.7162, "step": 1158 }, { "epoch": 0.35925452361579296, "grad_norm": 0.23339684220463022, "learning_rate": 2.893134213603234e-05, "loss": 0.7327, "step": 1159 }, { "epoch": 0.3595644930063156, "grad_norm": 0.2134565152404374, "learning_rate": 2.8913730922777746e-05, "loss": 0.762, "step": 1160 }, { "epoch": 0.3598744623968383, "grad_norm": 0.23369721649545722, "learning_rate": 2.8896111080504786e-05, "loss": 0.7673, "step": 1161 }, { "epoch": 0.360184431787361, "grad_norm": 0.26013143676013945, "learning_rate": 2.8878482626270515e-05, "loss": 0.7079, "step": 1162 }, { "epoch": 0.36049440117788367, "grad_norm": 0.1987406963436504, "learning_rate": 2.8860845577140315e-05, "loss": 0.7373, "step": 1163 }, { "epoch": 0.36080437056840636, "grad_norm": 0.2094562271929691, "learning_rate": 2.884319995018789e-05, "loss": 0.7106, "step": 1164 }, { "epoch": 0.36111433995892905, "grad_norm": 0.20476309804588452, "learning_rate": 2.882554576249525e-05, "loss": 0.7102, "step": 1165 }, { "epoch": 0.36142430934945174, "grad_norm": 0.21140552301961169, "learning_rate": 2.880788303115269e-05, "loss": 0.7166, "step": 1166 }, { "epoch": 0.36173427873997444, "grad_norm": 0.20211760395339498, "learning_rate": 2.879021177325878e-05, "loss": 0.7469, "step": 1167 }, { "epoch": 0.3620442481304971, "grad_norm": 0.21991309591771477, "learning_rate": 2.8772532005920347e-05, "loss": 0.7383, "step": 1168 }, { "epoch": 0.3623542175210198, "grad_norm": 0.21108271023399724, "learning_rate": 2.875484374625245e-05, "loss": 0.7375, "step": 1169 }, { "epoch": 0.3626641869115425, "grad_norm": 0.20920765227821897, "learning_rate": 2.873714701137836e-05, "loss": 0.7433, "step": 1170 }, { "epoch": 0.3629741563020652, "grad_norm": 0.31800455922242804, "learning_rate": 2.8719441818429573e-05, "loss": 0.7704, "step": 1171 }, { "epoch": 0.36328412569258783, "grad_norm": 0.363277730388085, "learning_rate": 2.8701728184545755e-05, "loss": 0.7248, "step": 1172 }, { "epoch": 0.3635940950831105, "grad_norm": 0.22164221715989155, "learning_rate": 2.8684006126874756e-05, "loss": 0.7471, "step": 1173 }, { "epoch": 0.3639040644736332, "grad_norm": 0.21711164221609935, "learning_rate": 2.866627566257257e-05, "loss": 0.7161, "step": 1174 }, { "epoch": 0.3642140338641559, "grad_norm": 0.2283193890760758, "learning_rate": 2.864853680880334e-05, "loss": 0.7497, "step": 1175 }, { "epoch": 0.3645240032546786, "grad_norm": 0.21257013673336045, "learning_rate": 2.863078958273932e-05, "loss": 0.7392, "step": 1176 }, { "epoch": 0.3648339726452013, "grad_norm": 0.21416499157033753, "learning_rate": 2.861303400156088e-05, "loss": 0.7288, "step": 1177 }, { "epoch": 0.365143942035724, "grad_norm": 0.21039287145055047, "learning_rate": 2.859527008245646e-05, "loss": 0.724, "step": 1178 }, { "epoch": 0.36545391142624667, "grad_norm": 0.23288070397546928, "learning_rate": 2.8577497842622596e-05, "loss": 0.7299, "step": 1179 }, { "epoch": 0.36576388081676936, "grad_norm": 0.21110461696131624, "learning_rate": 2.855971729926386e-05, "loss": 0.7282, "step": 1180 }, { "epoch": 0.36607385020729205, "grad_norm": 0.2370808431830124, "learning_rate": 2.8541928469592872e-05, "loss": 0.7465, "step": 1181 }, { "epoch": 0.36638381959781474, "grad_norm": 0.2181764971747298, "learning_rate": 2.8524131370830274e-05, "loss": 0.7319, "step": 1182 }, { "epoch": 0.3666937889883374, "grad_norm": 0.22182909834350165, "learning_rate": 2.8506326020204697e-05, "loss": 0.7698, "step": 1183 }, { "epoch": 0.36700375837886007, "grad_norm": 0.23108194659252604, "learning_rate": 2.8488512434952783e-05, "loss": 0.7467, "step": 1184 }, { "epoch": 0.36731372776938276, "grad_norm": 0.21600723557346702, "learning_rate": 2.8470690632319136e-05, "loss": 0.7367, "step": 1185 }, { "epoch": 0.36762369715990545, "grad_norm": 0.23940787755798265, "learning_rate": 2.845286062955631e-05, "loss": 0.7401, "step": 1186 }, { "epoch": 0.36793366655042814, "grad_norm": 0.29357810409286145, "learning_rate": 2.84350224439248e-05, "loss": 0.7809, "step": 1187 }, { "epoch": 0.36824363594095083, "grad_norm": 0.2776701490295169, "learning_rate": 2.8417176092693025e-05, "loss": 0.7662, "step": 1188 }, { "epoch": 0.3685536053314735, "grad_norm": 0.23167478110344467, "learning_rate": 2.839932159313731e-05, "loss": 0.7517, "step": 1189 }, { "epoch": 0.3688635747219962, "grad_norm": 0.24763473012099987, "learning_rate": 2.838145896254187e-05, "loss": 0.742, "step": 1190 }, { "epoch": 0.3691735441125189, "grad_norm": 0.24792890034752216, "learning_rate": 2.836358821819878e-05, "loss": 0.7233, "step": 1191 }, { "epoch": 0.3694835135030416, "grad_norm": 0.23795495356381705, "learning_rate": 2.8345709377407974e-05, "loss": 0.7163, "step": 1192 }, { "epoch": 0.3697934828935643, "grad_norm": 0.27680649094050275, "learning_rate": 2.8327822457477237e-05, "loss": 0.767, "step": 1193 }, { "epoch": 0.3701034522840869, "grad_norm": 0.23115513141029634, "learning_rate": 2.8309927475722162e-05, "loss": 0.7246, "step": 1194 }, { "epoch": 0.3704134216746096, "grad_norm": 0.24948264507318044, "learning_rate": 2.8292024449466143e-05, "loss": 0.7289, "step": 1195 }, { "epoch": 0.3707233910651323, "grad_norm": 0.2522418614754411, "learning_rate": 2.827411339604037e-05, "loss": 0.7291, "step": 1196 }, { "epoch": 0.371033360455655, "grad_norm": 0.21877528421895495, "learning_rate": 2.82561943327838e-05, "loss": 0.7284, "step": 1197 }, { "epoch": 0.3713433298461777, "grad_norm": 0.23974598779764564, "learning_rate": 2.8238267277043156e-05, "loss": 0.7565, "step": 1198 }, { "epoch": 0.3716532992367004, "grad_norm": 0.23099057108375565, "learning_rate": 2.822033224617287e-05, "loss": 0.743, "step": 1199 }, { "epoch": 0.37196326862722306, "grad_norm": 0.22349977851649436, "learning_rate": 2.820238925753513e-05, "loss": 0.7339, "step": 1200 }, { "epoch": 0.37227323801774576, "grad_norm": 0.22461289978443816, "learning_rate": 2.818443832849979e-05, "loss": 0.7218, "step": 1201 }, { "epoch": 0.37258320740826845, "grad_norm": 0.2520335545726753, "learning_rate": 2.8166479476444423e-05, "loss": 0.743, "step": 1202 }, { "epoch": 0.37289317679879114, "grad_norm": 0.19625812588307115, "learning_rate": 2.8148512718754266e-05, "loss": 0.7336, "step": 1203 }, { "epoch": 0.37320314618931383, "grad_norm": 0.25223990327976503, "learning_rate": 2.8130538072822186e-05, "loss": 0.7302, "step": 1204 }, { "epoch": 0.3735131155798365, "grad_norm": 0.2672607692917908, "learning_rate": 2.811255555604871e-05, "loss": 0.7592, "step": 1205 }, { "epoch": 0.37382308497035915, "grad_norm": 0.2094917656206123, "learning_rate": 2.8094565185841974e-05, "loss": 0.764, "step": 1206 }, { "epoch": 0.37413305436088184, "grad_norm": 0.23503844113883854, "learning_rate": 2.807656697961773e-05, "loss": 0.7396, "step": 1207 }, { "epoch": 0.37444302375140454, "grad_norm": 0.21899992255390996, "learning_rate": 2.8058560954799296e-05, "loss": 0.7289, "step": 1208 }, { "epoch": 0.3747529931419272, "grad_norm": 0.19972637813015184, "learning_rate": 2.8040547128817562e-05, "loss": 0.7311, "step": 1209 }, { "epoch": 0.3750629625324499, "grad_norm": 0.24719483286893504, "learning_rate": 2.8022525519110996e-05, "loss": 0.7459, "step": 1210 }, { "epoch": 0.3753729319229726, "grad_norm": 0.22506472028240526, "learning_rate": 2.800449614312556e-05, "loss": 0.7565, "step": 1211 }, { "epoch": 0.3756829013134953, "grad_norm": 0.20662771194314178, "learning_rate": 2.798645901831477e-05, "loss": 0.758, "step": 1212 }, { "epoch": 0.375992870704018, "grad_norm": 0.22688806644639006, "learning_rate": 2.796841416213963e-05, "loss": 0.7557, "step": 1213 }, { "epoch": 0.3763028400945407, "grad_norm": 0.2123360217997782, "learning_rate": 2.795036159206861e-05, "loss": 0.7398, "step": 1214 }, { "epoch": 0.37661280948506337, "grad_norm": 0.20541136490997866, "learning_rate": 2.793230132557768e-05, "loss": 0.7416, "step": 1215 }, { "epoch": 0.37692277887558606, "grad_norm": 0.2007534187514477, "learning_rate": 2.791423338015024e-05, "loss": 0.7405, "step": 1216 }, { "epoch": 0.3772327482661087, "grad_norm": 0.2150922867439891, "learning_rate": 2.7896157773277138e-05, "loss": 0.7365, "step": 1217 }, { "epoch": 0.3775427176566314, "grad_norm": 0.20291188952521672, "learning_rate": 2.7878074522456616e-05, "loss": 0.7759, "step": 1218 }, { "epoch": 0.3778526870471541, "grad_norm": 0.2106834934256671, "learning_rate": 2.785998364519434e-05, "loss": 0.7232, "step": 1219 }, { "epoch": 0.37816265643767677, "grad_norm": 0.19629307695787743, "learning_rate": 2.7841885159003345e-05, "loss": 0.7134, "step": 1220 }, { "epoch": 0.37847262582819946, "grad_norm": 0.20396010052631758, "learning_rate": 2.782377908140403e-05, "loss": 0.7433, "step": 1221 }, { "epoch": 0.37878259521872215, "grad_norm": 0.2003752029786654, "learning_rate": 2.7805665429924157e-05, "loss": 0.7389, "step": 1222 }, { "epoch": 0.37909256460924484, "grad_norm": 0.20047411628675751, "learning_rate": 2.7787544222098806e-05, "loss": 0.6867, "step": 1223 }, { "epoch": 0.37940253399976753, "grad_norm": 0.18898166030919034, "learning_rate": 2.776941547547038e-05, "loss": 0.7295, "step": 1224 }, { "epoch": 0.3797125033902902, "grad_norm": 0.19879134135296328, "learning_rate": 2.7751279207588574e-05, "loss": 0.7085, "step": 1225 }, { "epoch": 0.3800224727808129, "grad_norm": 0.19796828497630295, "learning_rate": 2.773313543601037e-05, "loss": 0.7381, "step": 1226 }, { "epoch": 0.3803324421713356, "grad_norm": 0.21266407073135613, "learning_rate": 2.7714984178300003e-05, "loss": 0.7155, "step": 1227 }, { "epoch": 0.38064241156185824, "grad_norm": 0.19511408638934988, "learning_rate": 2.7696825452028975e-05, "loss": 0.7615, "step": 1228 }, { "epoch": 0.38095238095238093, "grad_norm": 0.22184520095968313, "learning_rate": 2.7678659274776e-05, "loss": 0.7226, "step": 1229 }, { "epoch": 0.3812623503429036, "grad_norm": 0.22044602628055462, "learning_rate": 2.7660485664127024e-05, "loss": 0.7324, "step": 1230 }, { "epoch": 0.3815723197334263, "grad_norm": 0.21001710729757259, "learning_rate": 2.7642304637675153e-05, "loss": 0.7311, "step": 1231 }, { "epoch": 0.381882289123949, "grad_norm": 0.21596277739301867, "learning_rate": 2.762411621302071e-05, "loss": 0.7478, "step": 1232 }, { "epoch": 0.3821922585144717, "grad_norm": 0.2192145747651128, "learning_rate": 2.7605920407771165e-05, "loss": 0.7385, "step": 1233 }, { "epoch": 0.3825022279049944, "grad_norm": 0.20795482445199834, "learning_rate": 2.7587717239541128e-05, "loss": 0.7334, "step": 1234 }, { "epoch": 0.3828121972955171, "grad_norm": 0.20929337329575728, "learning_rate": 2.7569506725952346e-05, "loss": 0.745, "step": 1235 }, { "epoch": 0.38312216668603977, "grad_norm": 0.22387613476239335, "learning_rate": 2.755128888463367e-05, "loss": 0.7303, "step": 1236 }, { "epoch": 0.38343213607656246, "grad_norm": 0.20831870020279636, "learning_rate": 2.7533063733221045e-05, "loss": 0.7405, "step": 1237 }, { "epoch": 0.38374210546708515, "grad_norm": 0.2280180092320844, "learning_rate": 2.7514831289357498e-05, "loss": 0.7358, "step": 1238 }, { "epoch": 0.3840520748576078, "grad_norm": 0.20038158036254608, "learning_rate": 2.7496591570693113e-05, "loss": 0.7649, "step": 1239 }, { "epoch": 0.3843620442481305, "grad_norm": 0.3163387378898734, "learning_rate": 2.747834459488501e-05, "loss": 0.7507, "step": 1240 }, { "epoch": 0.38467201363865317, "grad_norm": 0.1865418293114199, "learning_rate": 2.7460090379597352e-05, "loss": 0.7162, "step": 1241 }, { "epoch": 0.38498198302917586, "grad_norm": 0.22475306662726338, "learning_rate": 2.7441828942501287e-05, "loss": 0.7176, "step": 1242 }, { "epoch": 0.38529195241969855, "grad_norm": 0.2051961195007581, "learning_rate": 2.7423560301274966e-05, "loss": 0.7454, "step": 1243 }, { "epoch": 0.38560192181022124, "grad_norm": 0.37871666831198564, "learning_rate": 2.7405284473603524e-05, "loss": 0.7767, "step": 1244 }, { "epoch": 0.38591189120074393, "grad_norm": 0.1968615316257365, "learning_rate": 2.7387001477179026e-05, "loss": 0.717, "step": 1245 }, { "epoch": 0.3862218605912666, "grad_norm": 0.210883507300382, "learning_rate": 2.7368711329700505e-05, "loss": 0.7072, "step": 1246 }, { "epoch": 0.3865318299817893, "grad_norm": 0.3076896521443727, "learning_rate": 2.7350414048873903e-05, "loss": 0.755, "step": 1247 }, { "epoch": 0.386841799372312, "grad_norm": 0.2326575803573776, "learning_rate": 2.7332109652412067e-05, "loss": 0.7192, "step": 1248 }, { "epoch": 0.3871517687628347, "grad_norm": 0.212022912218315, "learning_rate": 2.7313798158034736e-05, "loss": 0.7327, "step": 1249 }, { "epoch": 0.3874617381533574, "grad_norm": 0.22377895896069722, "learning_rate": 2.729547958346852e-05, "loss": 0.7168, "step": 1250 }, { "epoch": 0.38777170754388, "grad_norm": 0.2407962816344513, "learning_rate": 2.727715394644688e-05, "loss": 0.7271, "step": 1251 }, { "epoch": 0.3880816769344027, "grad_norm": 0.19999766820306664, "learning_rate": 2.725882126471011e-05, "loss": 0.7604, "step": 1252 }, { "epoch": 0.3883916463249254, "grad_norm": 0.24639768788877026, "learning_rate": 2.7240481556005338e-05, "loss": 0.7237, "step": 1253 }, { "epoch": 0.3887016157154481, "grad_norm": 0.23690633378921502, "learning_rate": 2.7222134838086485e-05, "loss": 0.7122, "step": 1254 }, { "epoch": 0.3890115851059708, "grad_norm": 0.21093665788800267, "learning_rate": 2.720378112871426e-05, "loss": 0.7321, "step": 1255 }, { "epoch": 0.38932155449649347, "grad_norm": 0.33836145199311946, "learning_rate": 2.718542044565613e-05, "loss": 0.7498, "step": 1256 }, { "epoch": 0.38963152388701616, "grad_norm": 0.2358035446099482, "learning_rate": 2.7167052806686337e-05, "loss": 0.7135, "step": 1257 }, { "epoch": 0.38994149327753885, "grad_norm": 0.24550572633371945, "learning_rate": 2.7148678229585813e-05, "loss": 0.7482, "step": 1258 }, { "epoch": 0.39025146266806154, "grad_norm": 0.21854157818908912, "learning_rate": 2.7130296732142265e-05, "loss": 0.7318, "step": 1259 }, { "epoch": 0.39056143205858423, "grad_norm": 0.2152986602570693, "learning_rate": 2.7111908332150054e-05, "loss": 0.7367, "step": 1260 }, { "epoch": 0.3908714014491069, "grad_norm": 0.20476331849351304, "learning_rate": 2.7093513047410236e-05, "loss": 0.7328, "step": 1261 }, { "epoch": 0.39118137083962956, "grad_norm": 0.21878718642753175, "learning_rate": 2.707511089573054e-05, "loss": 0.7576, "step": 1262 }, { "epoch": 0.39149134023015225, "grad_norm": 0.2077102929110779, "learning_rate": 2.705670189492534e-05, "loss": 0.7402, "step": 1263 }, { "epoch": 0.39180130962067494, "grad_norm": 0.21267304485523703, "learning_rate": 2.7038286062815628e-05, "loss": 0.7464, "step": 1264 }, { "epoch": 0.39211127901119763, "grad_norm": 0.21474956357644098, "learning_rate": 2.701986341722902e-05, "loss": 0.7227, "step": 1265 }, { "epoch": 0.3924212484017203, "grad_norm": 0.20628788278579546, "learning_rate": 2.7001433975999723e-05, "loss": 0.7381, "step": 1266 }, { "epoch": 0.392731217792243, "grad_norm": 0.2101582529274012, "learning_rate": 2.698299775696853e-05, "loss": 0.7304, "step": 1267 }, { "epoch": 0.3930411871827657, "grad_norm": 0.22871974676656717, "learning_rate": 2.6964554777982798e-05, "loss": 0.7511, "step": 1268 }, { "epoch": 0.3933511565732884, "grad_norm": 0.19813643756553795, "learning_rate": 2.6946105056896406e-05, "loss": 0.7428, "step": 1269 }, { "epoch": 0.3936611259638111, "grad_norm": 0.2249203208982518, "learning_rate": 2.6927648611569783e-05, "loss": 0.7315, "step": 1270 }, { "epoch": 0.3939710953543338, "grad_norm": 0.20693205399374723, "learning_rate": 2.690918545986986e-05, "loss": 0.7259, "step": 1271 }, { "epoch": 0.39428106474485647, "grad_norm": 0.21126373954302222, "learning_rate": 2.6890715619670054e-05, "loss": 0.7641, "step": 1272 }, { "epoch": 0.3945910341353791, "grad_norm": 0.21143477637459776, "learning_rate": 2.6872239108850266e-05, "loss": 0.7643, "step": 1273 }, { "epoch": 0.3949010035259018, "grad_norm": 0.2133353656226426, "learning_rate": 2.6853755945296844e-05, "loss": 0.7178, "step": 1274 }, { "epoch": 0.3952109729164245, "grad_norm": 0.20725932626975047, "learning_rate": 2.6835266146902585e-05, "loss": 0.7415, "step": 1275 }, { "epoch": 0.3955209423069472, "grad_norm": 0.20874948242201924, "learning_rate": 2.681676973156672e-05, "loss": 0.7249, "step": 1276 }, { "epoch": 0.39583091169746987, "grad_norm": 0.21815472764548915, "learning_rate": 2.6798266717194856e-05, "loss": 0.7125, "step": 1277 }, { "epoch": 0.39614088108799256, "grad_norm": 0.21107716807142188, "learning_rate": 2.677975712169901e-05, "loss": 0.74, "step": 1278 }, { "epoch": 0.39645085047851525, "grad_norm": 0.21076963003709562, "learning_rate": 2.676124096299756e-05, "loss": 0.7483, "step": 1279 }, { "epoch": 0.39676081986903794, "grad_norm": 0.20061888264267919, "learning_rate": 2.674271825901525e-05, "loss": 0.7173, "step": 1280 }, { "epoch": 0.39707078925956063, "grad_norm": 0.19936066600613725, "learning_rate": 2.6724189027683145e-05, "loss": 0.7284, "step": 1281 }, { "epoch": 0.3973807586500833, "grad_norm": 0.2168290157234397, "learning_rate": 2.670565328693864e-05, "loss": 0.7485, "step": 1282 }, { "epoch": 0.397690728040606, "grad_norm": 0.19448257153937648, "learning_rate": 2.668711105472543e-05, "loss": 0.742, "step": 1283 }, { "epoch": 0.3980006974311287, "grad_norm": 0.2268105526986316, "learning_rate": 2.666856234899349e-05, "loss": 0.7309, "step": 1284 }, { "epoch": 0.39831066682165134, "grad_norm": 0.349653119321168, "learning_rate": 2.6650007187699055e-05, "loss": 0.7487, "step": 1285 }, { "epoch": 0.39862063621217403, "grad_norm": 0.20904847343371957, "learning_rate": 2.663144558880463e-05, "loss": 0.7434, "step": 1286 }, { "epoch": 0.3989306056026967, "grad_norm": 0.21955416070988856, "learning_rate": 2.6612877570278936e-05, "loss": 0.7686, "step": 1287 }, { "epoch": 0.3992405749932194, "grad_norm": 0.29570933700223745, "learning_rate": 2.6594303150096907e-05, "loss": 0.7523, "step": 1288 }, { "epoch": 0.3995505443837421, "grad_norm": 0.20365364724792587, "learning_rate": 2.6575722346239686e-05, "loss": 0.7436, "step": 1289 }, { "epoch": 0.3998605137742648, "grad_norm": 0.20756612177885367, "learning_rate": 2.655713517669459e-05, "loss": 0.7218, "step": 1290 }, { "epoch": 0.4001704831647875, "grad_norm": 0.3135166704671891, "learning_rate": 2.653854165945509e-05, "loss": 0.7221, "step": 1291 }, { "epoch": 0.4004804525553102, "grad_norm": 0.20969693707466924, "learning_rate": 2.651994181252082e-05, "loss": 0.7401, "step": 1292 }, { "epoch": 0.40079042194583286, "grad_norm": 0.2128795219758573, "learning_rate": 2.6501335653897515e-05, "loss": 0.7424, "step": 1293 }, { "epoch": 0.40110039133635556, "grad_norm": 0.21661897777522085, "learning_rate": 2.6482723201597053e-05, "loss": 0.7292, "step": 1294 }, { "epoch": 0.40141036072687825, "grad_norm": 0.20305625206684524, "learning_rate": 2.6464104473637382e-05, "loss": 0.7294, "step": 1295 }, { "epoch": 0.4017203301174009, "grad_norm": 0.2583222307529841, "learning_rate": 2.644547948804253e-05, "loss": 0.7056, "step": 1296 }, { "epoch": 0.40203029950792357, "grad_norm": 0.21337763093592074, "learning_rate": 2.642684826284258e-05, "loss": 0.7579, "step": 1297 }, { "epoch": 0.40234026889844626, "grad_norm": 0.3503505948857035, "learning_rate": 2.6408210816073673e-05, "loss": 0.7341, "step": 1298 }, { "epoch": 0.40265023828896895, "grad_norm": 0.23512865496827953, "learning_rate": 2.638956716577794e-05, "loss": 0.7201, "step": 1299 }, { "epoch": 0.40296020767949164, "grad_norm": 0.20460856983198553, "learning_rate": 2.6370917330003557e-05, "loss": 0.6988, "step": 1300 }, { "epoch": 0.40327017707001434, "grad_norm": 0.23959478148375887, "learning_rate": 2.6352261326804645e-05, "loss": 0.7236, "step": 1301 }, { "epoch": 0.403580146460537, "grad_norm": 0.23298889332511194, "learning_rate": 2.633359917424133e-05, "loss": 0.7584, "step": 1302 }, { "epoch": 0.4038901158510597, "grad_norm": 0.2183636719257126, "learning_rate": 2.6314930890379685e-05, "loss": 0.7524, "step": 1303 }, { "epoch": 0.4042000852415824, "grad_norm": 0.23410915117891476, "learning_rate": 2.62962564932917e-05, "loss": 0.7725, "step": 1304 }, { "epoch": 0.4045100546321051, "grad_norm": 0.21621119990249565, "learning_rate": 2.6277576001055306e-05, "loss": 0.7296, "step": 1305 }, { "epoch": 0.4048200240226278, "grad_norm": 0.20335803011394057, "learning_rate": 2.6258889431754306e-05, "loss": 0.709, "step": 1306 }, { "epoch": 0.4051299934131504, "grad_norm": 0.23491721628343337, "learning_rate": 2.6240196803478424e-05, "loss": 0.7576, "step": 1307 }, { "epoch": 0.4054399628036731, "grad_norm": 0.21800786658487994, "learning_rate": 2.622149813432322e-05, "loss": 0.7457, "step": 1308 }, { "epoch": 0.4057499321941958, "grad_norm": 0.2330970066815105, "learning_rate": 2.620279344239011e-05, "loss": 0.7432, "step": 1309 }, { "epoch": 0.4060599015847185, "grad_norm": 0.23321277699602003, "learning_rate": 2.6184082745786333e-05, "loss": 0.72, "step": 1310 }, { "epoch": 0.4063698709752412, "grad_norm": 0.2163230528648886, "learning_rate": 2.6165366062624973e-05, "loss": 0.717, "step": 1311 }, { "epoch": 0.4066798403657639, "grad_norm": 0.30870884720387276, "learning_rate": 2.6146643411024864e-05, "loss": 0.7374, "step": 1312 }, { "epoch": 0.40698980975628657, "grad_norm": 0.242985610918623, "learning_rate": 2.6127914809110645e-05, "loss": 0.7458, "step": 1313 }, { "epoch": 0.40729977914680926, "grad_norm": 0.2441708996001541, "learning_rate": 2.6109180275012712e-05, "loss": 0.7361, "step": 1314 }, { "epoch": 0.40760974853733195, "grad_norm": 0.2392428836428483, "learning_rate": 2.6090439826867197e-05, "loss": 0.7519, "step": 1315 }, { "epoch": 0.40791971792785464, "grad_norm": 0.21093465269392375, "learning_rate": 2.6071693482815967e-05, "loss": 0.7428, "step": 1316 }, { "epoch": 0.40822968731837733, "grad_norm": 0.2354455142088759, "learning_rate": 2.6052941261006594e-05, "loss": 0.7508, "step": 1317 }, { "epoch": 0.4085396567089, "grad_norm": 0.21360058019628742, "learning_rate": 2.6034183179592326e-05, "loss": 0.7228, "step": 1318 }, { "epoch": 0.40884962609942266, "grad_norm": 0.22046807463698465, "learning_rate": 2.6015419256732095e-05, "loss": 0.7296, "step": 1319 }, { "epoch": 0.40915959548994535, "grad_norm": 0.2272569889897426, "learning_rate": 2.59966495105905e-05, "loss": 0.7356, "step": 1320 }, { "epoch": 0.40946956488046804, "grad_norm": 0.22004116785695496, "learning_rate": 2.5977873959337753e-05, "loss": 0.7316, "step": 1321 }, { "epoch": 0.40977953427099073, "grad_norm": 0.21071076720319626, "learning_rate": 2.5959092621149706e-05, "loss": 0.748, "step": 1322 }, { "epoch": 0.4100895036615134, "grad_norm": 0.2113257988898608, "learning_rate": 2.5940305514207797e-05, "loss": 0.7304, "step": 1323 }, { "epoch": 0.4103994730520361, "grad_norm": 0.20044773230379356, "learning_rate": 2.5921512656699056e-05, "loss": 0.7714, "step": 1324 }, { "epoch": 0.4107094424425588, "grad_norm": 0.21760711400610488, "learning_rate": 2.5902714066816087e-05, "loss": 0.7281, "step": 1325 }, { "epoch": 0.4110194118330815, "grad_norm": 0.23666526440930366, "learning_rate": 2.5883909762757027e-05, "loss": 0.7385, "step": 1326 }, { "epoch": 0.4113293812236042, "grad_norm": 0.246197211380527, "learning_rate": 2.5865099762725564e-05, "loss": 0.7459, "step": 1327 }, { "epoch": 0.4116393506141269, "grad_norm": 0.24276788670127533, "learning_rate": 2.5846284084930875e-05, "loss": 0.7571, "step": 1328 }, { "epoch": 0.41194932000464957, "grad_norm": 0.20042140723799656, "learning_rate": 2.5827462747587663e-05, "loss": 0.7164, "step": 1329 }, { "epoch": 0.4122592893951722, "grad_norm": 0.24305383819936113, "learning_rate": 2.5808635768916094e-05, "loss": 0.7615, "step": 1330 }, { "epoch": 0.4125692587856949, "grad_norm": 0.2566790129631054, "learning_rate": 2.5789803167141782e-05, "loss": 0.7616, "step": 1331 }, { "epoch": 0.4128792281762176, "grad_norm": 0.19255529124530493, "learning_rate": 2.577096496049581e-05, "loss": 0.6875, "step": 1332 }, { "epoch": 0.4131891975667403, "grad_norm": 0.24009929567596863, "learning_rate": 2.575212116721467e-05, "loss": 0.7493, "step": 1333 }, { "epoch": 0.41349916695726296, "grad_norm": 0.2456967866420787, "learning_rate": 2.573327180554028e-05, "loss": 0.7438, "step": 1334 }, { "epoch": 0.41380913634778566, "grad_norm": 0.2085156637393531, "learning_rate": 2.571441689371992e-05, "loss": 0.752, "step": 1335 }, { "epoch": 0.41411910573830835, "grad_norm": 0.2299647821950016, "learning_rate": 2.569555645000627e-05, "loss": 0.718, "step": 1336 }, { "epoch": 0.41442907512883104, "grad_norm": 0.23568110218864435, "learning_rate": 2.5676690492657348e-05, "loss": 0.743, "step": 1337 }, { "epoch": 0.41473904451935373, "grad_norm": 0.2143232307690934, "learning_rate": 2.5657819039936515e-05, "loss": 0.7455, "step": 1338 }, { "epoch": 0.4150490139098764, "grad_norm": 0.28596638253982304, "learning_rate": 2.5638942110112458e-05, "loss": 0.7153, "step": 1339 }, { "epoch": 0.4153589833003991, "grad_norm": 0.2060172188209476, "learning_rate": 2.562005972145916e-05, "loss": 0.7726, "step": 1340 }, { "epoch": 0.41566895269092174, "grad_norm": 0.2649639928612651, "learning_rate": 2.5601171892255883e-05, "loss": 0.7273, "step": 1341 }, { "epoch": 0.41597892208144444, "grad_norm": 0.25532618859697437, "learning_rate": 2.5582278640787174e-05, "loss": 0.6917, "step": 1342 }, { "epoch": 0.4162888914719671, "grad_norm": 0.20280451137525435, "learning_rate": 2.5563379985342808e-05, "loss": 0.7521, "step": 1343 }, { "epoch": 0.4165988608624898, "grad_norm": 0.26375599694857077, "learning_rate": 2.5544475944217802e-05, "loss": 0.7291, "step": 1344 }, { "epoch": 0.4169088302530125, "grad_norm": 0.20437615879401735, "learning_rate": 2.5525566535712373e-05, "loss": 0.6929, "step": 1345 }, { "epoch": 0.4172187996435352, "grad_norm": 0.23991131053846904, "learning_rate": 2.550665177813197e-05, "loss": 0.7224, "step": 1346 }, { "epoch": 0.4175287690340579, "grad_norm": 0.25157987152453787, "learning_rate": 2.5487731689787184e-05, "loss": 0.7218, "step": 1347 }, { "epoch": 0.4178387384245806, "grad_norm": 0.2071617689931208, "learning_rate": 2.546880628899378e-05, "loss": 0.7118, "step": 1348 }, { "epoch": 0.41814870781510327, "grad_norm": 0.24599638798473858, "learning_rate": 2.544987559407266e-05, "loss": 0.7191, "step": 1349 }, { "epoch": 0.41845867720562596, "grad_norm": 0.20617181294687173, "learning_rate": 2.543093962334986e-05, "loss": 0.7124, "step": 1350 }, { "epoch": 0.41876864659614865, "grad_norm": 0.26712317926784335, "learning_rate": 2.541199839515652e-05, "loss": 0.7217, "step": 1351 }, { "epoch": 0.41907861598667134, "grad_norm": 0.2342211928658147, "learning_rate": 2.5393051927828864e-05, "loss": 0.7421, "step": 1352 }, { "epoch": 0.419388585377194, "grad_norm": 0.22708917136616327, "learning_rate": 2.5374100239708198e-05, "loss": 0.7894, "step": 1353 }, { "epoch": 0.41969855476771667, "grad_norm": 0.24551954857855726, "learning_rate": 2.535514334914087e-05, "loss": 0.7205, "step": 1354 }, { "epoch": 0.42000852415823936, "grad_norm": 0.22411805716370967, "learning_rate": 2.5336181274478273e-05, "loss": 0.7452, "step": 1355 }, { "epoch": 0.42031849354876205, "grad_norm": 0.6968041321694265, "learning_rate": 2.5317214034076817e-05, "loss": 0.7613, "step": 1356 }, { "epoch": 0.42062846293928474, "grad_norm": 0.22271714059222486, "learning_rate": 2.5298241646297912e-05, "loss": 0.7303, "step": 1357 }, { "epoch": 0.42093843232980743, "grad_norm": 0.2069290295578089, "learning_rate": 2.527926412950795e-05, "loss": 0.7651, "step": 1358 }, { "epoch": 0.4212484017203301, "grad_norm": 0.23082396947885403, "learning_rate": 2.526028150207829e-05, "loss": 0.7621, "step": 1359 }, { "epoch": 0.4215583711108528, "grad_norm": 0.2189734814859488, "learning_rate": 2.5241293782385234e-05, "loss": 0.7381, "step": 1360 }, { "epoch": 0.4218683405013755, "grad_norm": 0.2639665751489641, "learning_rate": 2.5222300988810017e-05, "loss": 0.7102, "step": 1361 }, { "epoch": 0.4221783098918982, "grad_norm": 0.2208304237485797, "learning_rate": 2.520330313973879e-05, "loss": 0.7229, "step": 1362 }, { "epoch": 0.4224882792824209, "grad_norm": 0.21713997186517306, "learning_rate": 2.518430025356259e-05, "loss": 0.7222, "step": 1363 }, { "epoch": 0.4227982486729435, "grad_norm": 0.2174251046101084, "learning_rate": 2.5165292348677338e-05, "loss": 0.7427, "step": 1364 }, { "epoch": 0.4231082180634662, "grad_norm": 0.2132426254891767, "learning_rate": 2.5146279443483804e-05, "loss": 0.7329, "step": 1365 }, { "epoch": 0.4234181874539889, "grad_norm": 0.22428138301589337, "learning_rate": 2.512726155638761e-05, "loss": 0.7036, "step": 1366 }, { "epoch": 0.4237281568445116, "grad_norm": 0.2216530575993946, "learning_rate": 2.5108238705799184e-05, "loss": 0.7565, "step": 1367 }, { "epoch": 0.4240381262350343, "grad_norm": 0.19952396222023377, "learning_rate": 2.5089210910133782e-05, "loss": 0.7645, "step": 1368 }, { "epoch": 0.424348095625557, "grad_norm": 0.2562479607004519, "learning_rate": 2.507017818781143e-05, "loss": 0.7111, "step": 1369 }, { "epoch": 0.42465806501607967, "grad_norm": 0.21108842147316093, "learning_rate": 2.5051140557256936e-05, "loss": 0.7413, "step": 1370 }, { "epoch": 0.42496803440660236, "grad_norm": 0.20818493452668244, "learning_rate": 2.5032098036899833e-05, "loss": 0.7296, "step": 1371 }, { "epoch": 0.42527800379712505, "grad_norm": 0.21507387035423442, "learning_rate": 2.5013050645174414e-05, "loss": 0.7372, "step": 1372 }, { "epoch": 0.42558797318764774, "grad_norm": 0.22677893043895297, "learning_rate": 2.4993998400519684e-05, "loss": 0.7319, "step": 1373 }, { "epoch": 0.42589794257817043, "grad_norm": 0.21661284512861398, "learning_rate": 2.4974941321379346e-05, "loss": 0.7305, "step": 1374 }, { "epoch": 0.42620791196869307, "grad_norm": 0.213616015320891, "learning_rate": 2.4955879426201773e-05, "loss": 0.7168, "step": 1375 }, { "epoch": 0.42651788135921576, "grad_norm": 0.2155422003967378, "learning_rate": 2.4936812733440006e-05, "loss": 0.7403, "step": 1376 }, { "epoch": 0.42682785074973845, "grad_norm": 0.214293853903226, "learning_rate": 2.4917741261551732e-05, "loss": 0.7004, "step": 1377 }, { "epoch": 0.42713782014026114, "grad_norm": 0.2180369292115733, "learning_rate": 2.4898665028999257e-05, "loss": 0.742, "step": 1378 }, { "epoch": 0.42744778953078383, "grad_norm": 0.2249705158964885, "learning_rate": 2.487958405424951e-05, "loss": 0.7572, "step": 1379 }, { "epoch": 0.4277577589213065, "grad_norm": 0.20138394071486468, "learning_rate": 2.4860498355773995e-05, "loss": 0.7515, "step": 1380 }, { "epoch": 0.4280677283118292, "grad_norm": 0.2632663637588241, "learning_rate": 2.4841407952048805e-05, "loss": 0.7347, "step": 1381 }, { "epoch": 0.4283776977023519, "grad_norm": 0.2170291316216236, "learning_rate": 2.4822312861554572e-05, "loss": 0.722, "step": 1382 }, { "epoch": 0.4286876670928746, "grad_norm": 0.20295183561425653, "learning_rate": 2.4803213102776473e-05, "loss": 0.7305, "step": 1383 }, { "epoch": 0.4289976364833973, "grad_norm": 0.18914327671620518, "learning_rate": 2.4784108694204213e-05, "loss": 0.7063, "step": 1384 }, { "epoch": 0.42930760587392, "grad_norm": 0.1994627641215986, "learning_rate": 2.476499965433197e-05, "loss": 0.7306, "step": 1385 }, { "epoch": 0.42961757526444266, "grad_norm": 0.20457291667461974, "learning_rate": 2.474588600165844e-05, "loss": 0.7552, "step": 1386 }, { "epoch": 0.4299275446549653, "grad_norm": 0.20029747566482675, "learning_rate": 2.4726767754686764e-05, "loss": 0.7622, "step": 1387 }, { "epoch": 0.430237514045488, "grad_norm": 0.20925021208042938, "learning_rate": 2.470764493192453e-05, "loss": 0.731, "step": 1388 }, { "epoch": 0.4305474834360107, "grad_norm": 0.19372642293327114, "learning_rate": 2.4688517551883767e-05, "loss": 0.7419, "step": 1389 }, { "epoch": 0.43085745282653337, "grad_norm": 0.18538372112241505, "learning_rate": 2.4669385633080906e-05, "loss": 0.7323, "step": 1390 }, { "epoch": 0.43116742221705606, "grad_norm": 0.19670106733705423, "learning_rate": 2.4650249194036773e-05, "loss": 0.7326, "step": 1391 }, { "epoch": 0.43147739160757875, "grad_norm": 0.1863431863571904, "learning_rate": 2.4631108253276584e-05, "loss": 0.7076, "step": 1392 }, { "epoch": 0.43178736099810144, "grad_norm": 0.19705077870979804, "learning_rate": 2.461196282932988e-05, "loss": 0.7576, "step": 1393 }, { "epoch": 0.43209733038862413, "grad_norm": 0.1791231239141325, "learning_rate": 2.4592812940730582e-05, "loss": 0.6888, "step": 1394 }, { "epoch": 0.4324072997791468, "grad_norm": 0.3538490394668464, "learning_rate": 2.457365860601691e-05, "loss": 0.7462, "step": 1395 }, { "epoch": 0.4327172691696695, "grad_norm": 0.18856737633345866, "learning_rate": 2.455449984373138e-05, "loss": 0.7254, "step": 1396 }, { "epoch": 0.4330272385601922, "grad_norm": 0.2121670442461179, "learning_rate": 2.4535336672420822e-05, "loss": 0.7296, "step": 1397 }, { "epoch": 0.43333720795071484, "grad_norm": 0.20612936988936564, "learning_rate": 2.451616911063631e-05, "loss": 0.7419, "step": 1398 }, { "epoch": 0.43364717734123753, "grad_norm": 0.20969230703822359, "learning_rate": 2.4496997176933183e-05, "loss": 0.7269, "step": 1399 }, { "epoch": 0.4339571467317602, "grad_norm": 0.21427405001995295, "learning_rate": 2.4477820889870994e-05, "loss": 0.7554, "step": 1400 }, { "epoch": 0.4342671161222829, "grad_norm": 0.21417446886234195, "learning_rate": 2.4458640268013533e-05, "loss": 0.7336, "step": 1401 }, { "epoch": 0.4345770855128056, "grad_norm": 0.2212170252201521, "learning_rate": 2.443945532992877e-05, "loss": 0.7185, "step": 1402 }, { "epoch": 0.4348870549033283, "grad_norm": 0.20915062434196552, "learning_rate": 2.4420266094188865e-05, "loss": 0.7305, "step": 1403 }, { "epoch": 0.435197024293851, "grad_norm": 0.232408940101745, "learning_rate": 2.4401072579370125e-05, "loss": 0.7154, "step": 1404 }, { "epoch": 0.4355069936843737, "grad_norm": 0.19696995679944257, "learning_rate": 2.438187480405301e-05, "loss": 0.7058, "step": 1405 }, { "epoch": 0.43581696307489637, "grad_norm": 0.26074882939594146, "learning_rate": 2.4362672786822108e-05, "loss": 0.7282, "step": 1406 }, { "epoch": 0.43612693246541906, "grad_norm": 0.2435245803956274, "learning_rate": 2.4343466546266088e-05, "loss": 0.7207, "step": 1407 }, { "epoch": 0.43643690185594175, "grad_norm": 0.2337669400558605, "learning_rate": 2.4324256100977745e-05, "loss": 0.7267, "step": 1408 }, { "epoch": 0.4367468712464644, "grad_norm": 0.22230201437484903, "learning_rate": 2.430504146955392e-05, "loss": 0.7596, "step": 1409 }, { "epoch": 0.4370568406369871, "grad_norm": 0.23481457061899566, "learning_rate": 2.428582267059551e-05, "loss": 0.7325, "step": 1410 }, { "epoch": 0.43736681002750977, "grad_norm": 0.2365464512862241, "learning_rate": 2.426659972270745e-05, "loss": 0.7423, "step": 1411 }, { "epoch": 0.43767677941803246, "grad_norm": 0.2457160790035583, "learning_rate": 2.4247372644498683e-05, "loss": 0.7392, "step": 1412 }, { "epoch": 0.43798674880855515, "grad_norm": 0.2139555811342418, "learning_rate": 2.4228141454582167e-05, "loss": 0.6994, "step": 1413 }, { "epoch": 0.43829671819907784, "grad_norm": 0.22580875513134208, "learning_rate": 2.4208906171574822e-05, "loss": 0.7318, "step": 1414 }, { "epoch": 0.43860668758960053, "grad_norm": 0.2274976851511962, "learning_rate": 2.418966681409754e-05, "loss": 0.7293, "step": 1415 }, { "epoch": 0.4389166569801232, "grad_norm": 0.21970870248406596, "learning_rate": 2.4170423400775168e-05, "loss": 0.726, "step": 1416 }, { "epoch": 0.4392266263706459, "grad_norm": 0.2173377841145302, "learning_rate": 2.4151175950236453e-05, "loss": 0.722, "step": 1417 }, { "epoch": 0.4395365957611686, "grad_norm": 0.2168635716708119, "learning_rate": 2.4131924481114074e-05, "loss": 0.7432, "step": 1418 }, { "epoch": 0.4398465651516913, "grad_norm": 0.3405161229183264, "learning_rate": 2.4112669012044584e-05, "loss": 0.7443, "step": 1419 }, { "epoch": 0.440156534542214, "grad_norm": 0.24047900760383803, "learning_rate": 2.409340956166841e-05, "loss": 0.7251, "step": 1420 }, { "epoch": 0.4404665039327366, "grad_norm": 0.2191842177554391, "learning_rate": 2.4074146148629853e-05, "loss": 0.6957, "step": 1421 }, { "epoch": 0.4407764733232593, "grad_norm": 0.25770634399085396, "learning_rate": 2.405487879157703e-05, "loss": 0.7219, "step": 1422 }, { "epoch": 0.441086442713782, "grad_norm": 0.21554626411846683, "learning_rate": 2.4035607509161882e-05, "loss": 0.7215, "step": 1423 }, { "epoch": 0.4413964121043047, "grad_norm": 0.21466073665394297, "learning_rate": 2.4016332320040144e-05, "loss": 0.7326, "step": 1424 }, { "epoch": 0.4417063814948274, "grad_norm": 0.2158138694195358, "learning_rate": 2.3997053242871342e-05, "loss": 0.7234, "step": 1425 }, { "epoch": 0.4420163508853501, "grad_norm": 0.28352424806053084, "learning_rate": 2.3977770296318763e-05, "loss": 0.7185, "step": 1426 }, { "epoch": 0.44232632027587276, "grad_norm": 0.2167297492872719, "learning_rate": 2.395848349904944e-05, "loss": 0.7262, "step": 1427 }, { "epoch": 0.44263628966639545, "grad_norm": 0.2032744970188301, "learning_rate": 2.3939192869734126e-05, "loss": 0.7133, "step": 1428 }, { "epoch": 0.44294625905691815, "grad_norm": 0.2121882040668938, "learning_rate": 2.391989842704729e-05, "loss": 0.7103, "step": 1429 }, { "epoch": 0.44325622844744084, "grad_norm": 0.20613239976517453, "learning_rate": 2.3900600189667112e-05, "loss": 0.7355, "step": 1430 }, { "epoch": 0.4435661978379635, "grad_norm": 0.2239110216437738, "learning_rate": 2.3881298176275402e-05, "loss": 0.7559, "step": 1431 }, { "epoch": 0.44387616722848616, "grad_norm": 0.19358919131725183, "learning_rate": 2.3861992405557662e-05, "loss": 0.6987, "step": 1432 }, { "epoch": 0.44418613661900885, "grad_norm": 0.22228015365327, "learning_rate": 2.3842682896203012e-05, "loss": 0.7088, "step": 1433 }, { "epoch": 0.44449610600953154, "grad_norm": 0.20704002885870362, "learning_rate": 2.382336966690421e-05, "loss": 0.722, "step": 1434 }, { "epoch": 0.44480607540005423, "grad_norm": 0.21720937015748298, "learning_rate": 2.3804052736357593e-05, "loss": 0.7029, "step": 1435 }, { "epoch": 0.4451160447905769, "grad_norm": 0.2061765164268522, "learning_rate": 2.3784732123263098e-05, "loss": 0.7288, "step": 1436 }, { "epoch": 0.4454260141810996, "grad_norm": 0.28691393147996785, "learning_rate": 2.3765407846324218e-05, "loss": 0.7159, "step": 1437 }, { "epoch": 0.4457359835716223, "grad_norm": 0.2203438465954715, "learning_rate": 2.3746079924247992e-05, "loss": 0.7061, "step": 1438 }, { "epoch": 0.446045952962145, "grad_norm": 0.22055573243176468, "learning_rate": 2.3726748375744997e-05, "loss": 0.7339, "step": 1439 }, { "epoch": 0.4463559223526677, "grad_norm": 0.23217386763007894, "learning_rate": 2.370741321952931e-05, "loss": 0.7338, "step": 1440 }, { "epoch": 0.4466658917431904, "grad_norm": 0.2025547426820029, "learning_rate": 2.3688074474318504e-05, "loss": 0.7025, "step": 1441 }, { "epoch": 0.44697586113371307, "grad_norm": 0.22700693471153174, "learning_rate": 2.3668732158833626e-05, "loss": 0.7193, "step": 1442 }, { "epoch": 0.4472858305242357, "grad_norm": 0.20923541892838604, "learning_rate": 2.364938629179919e-05, "loss": 0.7372, "step": 1443 }, { "epoch": 0.4475957999147584, "grad_norm": 0.20919845422719777, "learning_rate": 2.3630036891943133e-05, "loss": 0.7435, "step": 1444 }, { "epoch": 0.4479057693052811, "grad_norm": 0.21992194484023744, "learning_rate": 2.361068397799682e-05, "loss": 0.7293, "step": 1445 }, { "epoch": 0.4482157386958038, "grad_norm": 0.196299413647983, "learning_rate": 2.3591327568695e-05, "loss": 0.7155, "step": 1446 }, { "epoch": 0.44852570808632647, "grad_norm": 0.21955731391628322, "learning_rate": 2.3571967682775844e-05, "loss": 0.7438, "step": 1447 }, { "epoch": 0.44883567747684916, "grad_norm": 0.19406488160735175, "learning_rate": 2.3552604338980857e-05, "loss": 0.7305, "step": 1448 }, { "epoch": 0.44914564686737185, "grad_norm": 0.21035905935500898, "learning_rate": 2.3533237556054895e-05, "loss": 0.7108, "step": 1449 }, { "epoch": 0.44945561625789454, "grad_norm": 0.2399806003362241, "learning_rate": 2.3513867352746154e-05, "loss": 0.7336, "step": 1450 }, { "epoch": 0.44976558564841723, "grad_norm": 0.2165976342307417, "learning_rate": 2.349449374780614e-05, "loss": 0.7657, "step": 1451 }, { "epoch": 0.4500755550389399, "grad_norm": 0.18937240494805513, "learning_rate": 2.3475116759989635e-05, "loss": 0.7447, "step": 1452 }, { "epoch": 0.4503855244294626, "grad_norm": 0.20828926307854473, "learning_rate": 2.3455736408054715e-05, "loss": 0.7338, "step": 1453 }, { "epoch": 0.45069549381998525, "grad_norm": 0.19539751930407134, "learning_rate": 2.343635271076271e-05, "loss": 0.7173, "step": 1454 }, { "epoch": 0.45100546321050794, "grad_norm": 0.2399133176050341, "learning_rate": 2.3416965686878177e-05, "loss": 0.7283, "step": 1455 }, { "epoch": 0.45131543260103063, "grad_norm": 0.2553492599881148, "learning_rate": 2.339757535516891e-05, "loss": 0.7492, "step": 1456 }, { "epoch": 0.4516254019915533, "grad_norm": 0.24319633144511707, "learning_rate": 2.337818173440589e-05, "loss": 0.718, "step": 1457 }, { "epoch": 0.451935371382076, "grad_norm": 0.18867853057271616, "learning_rate": 2.3358784843363292e-05, "loss": 0.7571, "step": 1458 }, { "epoch": 0.4522453407725987, "grad_norm": 0.22292709212968584, "learning_rate": 2.3339384700818447e-05, "loss": 0.7327, "step": 1459 }, { "epoch": 0.4525553101631214, "grad_norm": 0.21204466234022432, "learning_rate": 2.331998132555184e-05, "loss": 0.7211, "step": 1460 }, { "epoch": 0.4528652795536441, "grad_norm": 0.1929829215645478, "learning_rate": 2.330057473634709e-05, "loss": 0.737, "step": 1461 }, { "epoch": 0.4531752489441668, "grad_norm": 0.22395580327421424, "learning_rate": 2.3281164951990922e-05, "loss": 0.7078, "step": 1462 }, { "epoch": 0.45348521833468947, "grad_norm": 0.2032686272144793, "learning_rate": 2.3261751991273155e-05, "loss": 0.7386, "step": 1463 }, { "epoch": 0.45379518772521216, "grad_norm": 0.231516324017996, "learning_rate": 2.3242335872986676e-05, "loss": 0.738, "step": 1464 }, { "epoch": 0.45410515711573485, "grad_norm": 0.23056557828666838, "learning_rate": 2.3222916615927442e-05, "loss": 0.7215, "step": 1465 }, { "epoch": 0.4544151265062575, "grad_norm": 0.20029887206501146, "learning_rate": 2.320349423889444e-05, "loss": 0.7483, "step": 1466 }, { "epoch": 0.4547250958967802, "grad_norm": 0.2329072369790247, "learning_rate": 2.3184068760689674e-05, "loss": 0.7256, "step": 1467 }, { "epoch": 0.45503506528730286, "grad_norm": 0.18652085263544768, "learning_rate": 2.3164640200118152e-05, "loss": 0.7204, "step": 1468 }, { "epoch": 0.45534503467782556, "grad_norm": 0.21646664620255857, "learning_rate": 2.3145208575987885e-05, "loss": 0.7374, "step": 1469 }, { "epoch": 0.45565500406834825, "grad_norm": 0.27280010021600537, "learning_rate": 2.312577390710983e-05, "loss": 0.7478, "step": 1470 }, { "epoch": 0.45596497345887094, "grad_norm": 0.19673655682771807, "learning_rate": 2.3106336212297883e-05, "loss": 0.7104, "step": 1471 }, { "epoch": 0.4562749428493936, "grad_norm": 0.21934826527821855, "learning_rate": 2.308689551036888e-05, "loss": 0.7251, "step": 1472 }, { "epoch": 0.4565849122399163, "grad_norm": 0.1919693987238332, "learning_rate": 2.3067451820142587e-05, "loss": 0.7199, "step": 1473 }, { "epoch": 0.456894881630439, "grad_norm": 0.20169288244800987, "learning_rate": 2.3048005160441634e-05, "loss": 0.7248, "step": 1474 }, { "epoch": 0.4572048510209617, "grad_norm": 0.18728480523061813, "learning_rate": 2.3028555550091536e-05, "loss": 0.7644, "step": 1475 }, { "epoch": 0.4575148204114844, "grad_norm": 0.21228647080679544, "learning_rate": 2.300910300792067e-05, "loss": 0.7189, "step": 1476 }, { "epoch": 0.457824789802007, "grad_norm": 0.28139825356326026, "learning_rate": 2.2989647552760243e-05, "loss": 0.7558, "step": 1477 }, { "epoch": 0.4581347591925297, "grad_norm": 0.2066477120466219, "learning_rate": 2.297018920344429e-05, "loss": 0.7094, "step": 1478 }, { "epoch": 0.4584447285830524, "grad_norm": 0.2014336870308051, "learning_rate": 2.2950727978809635e-05, "loss": 0.7246, "step": 1479 }, { "epoch": 0.4587546979735751, "grad_norm": 0.19082871140815935, "learning_rate": 2.2931263897695903e-05, "loss": 0.727, "step": 1480 }, { "epoch": 0.4590646673640978, "grad_norm": 0.19403951039787767, "learning_rate": 2.2911796978945463e-05, "loss": 0.7179, "step": 1481 }, { "epoch": 0.4593746367546205, "grad_norm": 0.19713398696207743, "learning_rate": 2.2892327241403456e-05, "loss": 0.7306, "step": 1482 }, { "epoch": 0.45968460614514317, "grad_norm": 0.19101199062192661, "learning_rate": 2.287285470391774e-05, "loss": 0.7272, "step": 1483 }, { "epoch": 0.45999457553566586, "grad_norm": 0.2016081832861803, "learning_rate": 2.285337938533887e-05, "loss": 0.7114, "step": 1484 }, { "epoch": 0.46030454492618855, "grad_norm": 0.21116443529533735, "learning_rate": 2.2833901304520106e-05, "loss": 0.7218, "step": 1485 }, { "epoch": 0.46061451431671124, "grad_norm": 0.19801452476038695, "learning_rate": 2.281442048031739e-05, "loss": 0.695, "step": 1486 }, { "epoch": 0.46092448370723393, "grad_norm": 0.19773736809368972, "learning_rate": 2.2794936931589308e-05, "loss": 0.7378, "step": 1487 }, { "epoch": 0.46123445309775657, "grad_norm": 0.1952868346927464, "learning_rate": 2.2775450677197087e-05, "loss": 0.7073, "step": 1488 }, { "epoch": 0.46154442248827926, "grad_norm": 0.18544686050923076, "learning_rate": 2.275596173600457e-05, "loss": 0.7233, "step": 1489 }, { "epoch": 0.46185439187880195, "grad_norm": 0.21768059485054841, "learning_rate": 2.2736470126878206e-05, "loss": 0.7425, "step": 1490 }, { "epoch": 0.46216436126932464, "grad_norm": 0.2029028626281952, "learning_rate": 2.2716975868687027e-05, "loss": 0.7393, "step": 1491 }, { "epoch": 0.46247433065984733, "grad_norm": 0.2016962636234101, "learning_rate": 2.2697478980302625e-05, "loss": 0.7303, "step": 1492 }, { "epoch": 0.46278430005037, "grad_norm": 0.2831649325089382, "learning_rate": 2.2677979480599137e-05, "loss": 0.7419, "step": 1493 }, { "epoch": 0.4630942694408927, "grad_norm": 0.2065207005157215, "learning_rate": 2.2658477388453233e-05, "loss": 0.7153, "step": 1494 }, { "epoch": 0.4634042388314154, "grad_norm": 0.2361063909543952, "learning_rate": 2.2638972722744094e-05, "loss": 0.7388, "step": 1495 }, { "epoch": 0.4637142082219381, "grad_norm": 0.1942590332703988, "learning_rate": 2.261946550235339e-05, "loss": 0.6911, "step": 1496 }, { "epoch": 0.4640241776124608, "grad_norm": 0.20824821336594707, "learning_rate": 2.2599955746165266e-05, "loss": 0.7719, "step": 1497 }, { "epoch": 0.4643341470029835, "grad_norm": 0.20142762994766256, "learning_rate": 2.2580443473066308e-05, "loss": 0.7004, "step": 1498 }, { "epoch": 0.46464411639350617, "grad_norm": 0.19193228977784166, "learning_rate": 2.256092870194555e-05, "loss": 0.7218, "step": 1499 }, { "epoch": 0.4649540857840288, "grad_norm": 0.21286053376453937, "learning_rate": 2.2541411451694452e-05, "loss": 0.7659, "step": 1500 }, { "epoch": 0.4652640551745515, "grad_norm": 0.19470038476069573, "learning_rate": 2.2521891741206864e-05, "loss": 0.7356, "step": 1501 }, { "epoch": 0.4655740245650742, "grad_norm": 0.21944226806476055, "learning_rate": 2.250236958937902e-05, "loss": 0.7343, "step": 1502 }, { "epoch": 0.4658839939555969, "grad_norm": 0.30025297616479496, "learning_rate": 2.2482845015109517e-05, "loss": 0.7014, "step": 1503 }, { "epoch": 0.46619396334611957, "grad_norm": 0.20111867130139216, "learning_rate": 2.246331803729929e-05, "loss": 0.7268, "step": 1504 }, { "epoch": 0.46650393273664226, "grad_norm": 0.1860718392272302, "learning_rate": 2.2443788674851614e-05, "loss": 0.7303, "step": 1505 }, { "epoch": 0.46681390212716495, "grad_norm": 0.2034961006297702, "learning_rate": 2.242425694667207e-05, "loss": 0.7246, "step": 1506 }, { "epoch": 0.46712387151768764, "grad_norm": 0.18426487174291087, "learning_rate": 2.240472287166851e-05, "loss": 0.7281, "step": 1507 }, { "epoch": 0.46743384090821033, "grad_norm": 0.2224182280214145, "learning_rate": 2.2385186468751088e-05, "loss": 0.7322, "step": 1508 }, { "epoch": 0.467743810298733, "grad_norm": 0.19101473508562122, "learning_rate": 2.2365647756832195e-05, "loss": 0.7168, "step": 1509 }, { "epoch": 0.4680537796892557, "grad_norm": 0.20022708250751492, "learning_rate": 2.2346106754826454e-05, "loss": 0.7318, "step": 1510 }, { "epoch": 0.46836374907977835, "grad_norm": 0.19587257757743182, "learning_rate": 2.2326563481650716e-05, "loss": 0.7151, "step": 1511 }, { "epoch": 0.46867371847030104, "grad_norm": 0.19060946080237273, "learning_rate": 2.230701795622401e-05, "loss": 0.7171, "step": 1512 }, { "epoch": 0.46898368786082373, "grad_norm": 0.19642997195482112, "learning_rate": 2.2287470197467576e-05, "loss": 0.7178, "step": 1513 }, { "epoch": 0.4692936572513464, "grad_norm": 0.1825139427540615, "learning_rate": 2.2267920224304792e-05, "loss": 0.7277, "step": 1514 }, { "epoch": 0.4696036266418691, "grad_norm": 0.2131880946620328, "learning_rate": 2.2248368055661192e-05, "loss": 0.7419, "step": 1515 }, { "epoch": 0.4699135960323918, "grad_norm": 0.18779650487138486, "learning_rate": 2.2228813710464428e-05, "loss": 0.7316, "step": 1516 }, { "epoch": 0.4702235654229145, "grad_norm": 0.24711829865854493, "learning_rate": 2.220925720764426e-05, "loss": 0.7506, "step": 1517 }, { "epoch": 0.4705335348134372, "grad_norm": 0.18248907475384207, "learning_rate": 2.2189698566132542e-05, "loss": 0.7288, "step": 1518 }, { "epoch": 0.4708435042039599, "grad_norm": 0.2089139502419096, "learning_rate": 2.217013780486319e-05, "loss": 0.6948, "step": 1519 }, { "epoch": 0.47115347359448256, "grad_norm": 0.20193562368007564, "learning_rate": 2.2150574942772187e-05, "loss": 0.7582, "step": 1520 }, { "epoch": 0.47146344298500525, "grad_norm": 0.20514012193784623, "learning_rate": 2.2131009998797524e-05, "loss": 0.7247, "step": 1521 }, { "epoch": 0.4717734123755279, "grad_norm": 0.18469437938344846, "learning_rate": 2.211144299187924e-05, "loss": 0.7004, "step": 1522 }, { "epoch": 0.4720833817660506, "grad_norm": 0.20458105326497456, "learning_rate": 2.2091873940959344e-05, "loss": 0.758, "step": 1523 }, { "epoch": 0.47239335115657327, "grad_norm": 0.17754927889893998, "learning_rate": 2.207230286498184e-05, "loss": 0.7124, "step": 1524 }, { "epoch": 0.47270332054709596, "grad_norm": 0.24268325178714265, "learning_rate": 2.2052729782892677e-05, "loss": 0.755, "step": 1525 }, { "epoch": 0.47301328993761865, "grad_norm": 0.6647531099564885, "learning_rate": 2.2033154713639765e-05, "loss": 0.7714, "step": 1526 }, { "epoch": 0.47332325932814134, "grad_norm": 0.20867418688939873, "learning_rate": 2.2013577676172923e-05, "loss": 0.7177, "step": 1527 }, { "epoch": 0.47363322871866403, "grad_norm": 0.19778072188361837, "learning_rate": 2.199399868944388e-05, "loss": 0.6964, "step": 1528 }, { "epoch": 0.4739431981091867, "grad_norm": 0.20459207869204254, "learning_rate": 2.197441777240626e-05, "loss": 0.7452, "step": 1529 }, { "epoch": 0.4742531674997094, "grad_norm": 0.1911198202162672, "learning_rate": 2.1954834944015535e-05, "loss": 0.7286, "step": 1530 }, { "epoch": 0.4745631368902321, "grad_norm": 0.2017350589398839, "learning_rate": 2.1935250223229048e-05, "loss": 0.6912, "step": 1531 }, { "epoch": 0.4748731062807548, "grad_norm": 0.18745396635518183, "learning_rate": 2.191566362900597e-05, "loss": 0.7226, "step": 1532 }, { "epoch": 0.4751830756712775, "grad_norm": 0.19990471241903876, "learning_rate": 2.189607518030727e-05, "loss": 0.7194, "step": 1533 }, { "epoch": 0.4754930450618001, "grad_norm": 0.1816047891850118, "learning_rate": 2.1876484896095727e-05, "loss": 0.6871, "step": 1534 }, { "epoch": 0.4758030144523228, "grad_norm": 0.32834907263538393, "learning_rate": 2.1856892795335906e-05, "loss": 0.7456, "step": 1535 }, { "epoch": 0.4761129838428455, "grad_norm": 0.19004002659394398, "learning_rate": 2.183729889699411e-05, "loss": 0.7398, "step": 1536 }, { "epoch": 0.4764229532333682, "grad_norm": 0.1977422957284132, "learning_rate": 2.1817703220038398e-05, "loss": 0.7548, "step": 1537 }, { "epoch": 0.4767329226238909, "grad_norm": 0.1880179894608937, "learning_rate": 2.1798105783438528e-05, "loss": 0.726, "step": 1538 }, { "epoch": 0.4770428920144136, "grad_norm": 0.17585581072127984, "learning_rate": 2.1778506606165988e-05, "loss": 0.7134, "step": 1539 }, { "epoch": 0.47735286140493627, "grad_norm": 0.18490484124293588, "learning_rate": 2.1758905707193936e-05, "loss": 0.7538, "step": 1540 }, { "epoch": 0.47766283079545896, "grad_norm": 0.17329990682108173, "learning_rate": 2.1739303105497203e-05, "loss": 0.7226, "step": 1541 }, { "epoch": 0.47797280018598165, "grad_norm": 0.18046712533322629, "learning_rate": 2.171969882005226e-05, "loss": 0.7108, "step": 1542 }, { "epoch": 0.47828276957650434, "grad_norm": 0.18979787368004253, "learning_rate": 2.1700092869837236e-05, "loss": 0.711, "step": 1543 }, { "epoch": 0.47859273896702703, "grad_norm": 0.18106811352324265, "learning_rate": 2.168048527383182e-05, "loss": 0.7329, "step": 1544 }, { "epoch": 0.47890270835754967, "grad_norm": 0.20026238611642044, "learning_rate": 2.166087605101734e-05, "loss": 0.7343, "step": 1545 }, { "epoch": 0.47921267774807236, "grad_norm": 0.1744681259850024, "learning_rate": 2.1641265220376675e-05, "loss": 0.7262, "step": 1546 }, { "epoch": 0.47952264713859505, "grad_norm": 0.1918734870172085, "learning_rate": 2.1621652800894272e-05, "loss": 0.7216, "step": 1547 }, { "epoch": 0.47983261652911774, "grad_norm": 0.19365316828228185, "learning_rate": 2.160203881155612e-05, "loss": 0.7365, "step": 1548 }, { "epoch": 0.48014258591964043, "grad_norm": 0.19966315622212358, "learning_rate": 2.158242327134971e-05, "loss": 0.6792, "step": 1549 }, { "epoch": 0.4804525553101631, "grad_norm": 0.1758049726276636, "learning_rate": 2.1562806199264043e-05, "loss": 0.7293, "step": 1550 }, { "epoch": 0.4807625247006858, "grad_norm": 0.1899515515953791, "learning_rate": 2.1543187614289613e-05, "loss": 0.7135, "step": 1551 }, { "epoch": 0.4810724940912085, "grad_norm": 0.1896586383131061, "learning_rate": 2.1523567535418364e-05, "loss": 0.7175, "step": 1552 }, { "epoch": 0.4813824634817312, "grad_norm": 0.19323242536140864, "learning_rate": 2.1503945981643686e-05, "loss": 0.7167, "step": 1553 }, { "epoch": 0.4816924328722539, "grad_norm": 0.18383452685473903, "learning_rate": 2.1484322971960417e-05, "loss": 0.7185, "step": 1554 }, { "epoch": 0.4820024022627766, "grad_norm": 0.19102970984680848, "learning_rate": 2.146469852536478e-05, "loss": 0.7232, "step": 1555 }, { "epoch": 0.4823123716532992, "grad_norm": 0.170609729024298, "learning_rate": 2.144507266085441e-05, "loss": 0.7125, "step": 1556 }, { "epoch": 0.4826223410438219, "grad_norm": 0.192697450196366, "learning_rate": 2.1425445397428285e-05, "loss": 0.7325, "step": 1557 }, { "epoch": 0.4829323104343446, "grad_norm": 0.1796042920005577, "learning_rate": 2.1405816754086773e-05, "loss": 0.735, "step": 1558 }, { "epoch": 0.4832422798248673, "grad_norm": 0.18325196034008365, "learning_rate": 2.138618674983155e-05, "loss": 0.6942, "step": 1559 }, { "epoch": 0.48355224921539, "grad_norm": 0.2590352686869652, "learning_rate": 2.1366555403665626e-05, "loss": 0.7402, "step": 1560 }, { "epoch": 0.48386221860591266, "grad_norm": 0.18871659660882206, "learning_rate": 2.1346922734593302e-05, "loss": 0.7677, "step": 1561 }, { "epoch": 0.48417218799643535, "grad_norm": 0.18687013596826554, "learning_rate": 2.1327288761620166e-05, "loss": 0.7554, "step": 1562 }, { "epoch": 0.48448215738695805, "grad_norm": 0.21004159014124946, "learning_rate": 2.130765350375306e-05, "loss": 0.6941, "step": 1563 }, { "epoch": 0.48479212677748074, "grad_norm": 0.1870121053899386, "learning_rate": 2.1288016980000078e-05, "loss": 0.7236, "step": 1564 }, { "epoch": 0.4851020961680034, "grad_norm": 0.20192921331822142, "learning_rate": 2.1268379209370536e-05, "loss": 0.7361, "step": 1565 }, { "epoch": 0.4854120655585261, "grad_norm": 0.19088593155862338, "learning_rate": 2.124874021087495e-05, "loss": 0.6995, "step": 1566 }, { "epoch": 0.4857220349490488, "grad_norm": 0.1989649341722726, "learning_rate": 2.1229100003525032e-05, "loss": 0.7303, "step": 1567 }, { "epoch": 0.48603200433957144, "grad_norm": 0.7783013934142011, "learning_rate": 2.1209458606333674e-05, "loss": 0.7406, "step": 1568 }, { "epoch": 0.48634197373009413, "grad_norm": 0.1857965478202906, "learning_rate": 2.1189816038314894e-05, "loss": 0.7418, "step": 1569 }, { "epoch": 0.4866519431206168, "grad_norm": 0.1923053135124555, "learning_rate": 2.1170172318483876e-05, "loss": 0.732, "step": 1570 }, { "epoch": 0.4869619125111395, "grad_norm": 0.18808212780889658, "learning_rate": 2.1150527465856883e-05, "loss": 0.7179, "step": 1571 }, { "epoch": 0.4872718819016622, "grad_norm": 0.19289121584229177, "learning_rate": 2.113088149945131e-05, "loss": 0.7567, "step": 1572 }, { "epoch": 0.4875818512921849, "grad_norm": 0.19257605258390298, "learning_rate": 2.11112344382856e-05, "loss": 0.7162, "step": 1573 }, { "epoch": 0.4878918206827076, "grad_norm": 0.20857287093029023, "learning_rate": 2.109158630137928e-05, "loss": 0.7308, "step": 1574 }, { "epoch": 0.4882017900732303, "grad_norm": 0.19606742034715693, "learning_rate": 2.107193710775291e-05, "loss": 0.7633, "step": 1575 }, { "epoch": 0.48851175946375297, "grad_norm": 0.19756061321342377, "learning_rate": 2.105228687642806e-05, "loss": 0.7157, "step": 1576 }, { "epoch": 0.48882172885427566, "grad_norm": 0.24206634305931496, "learning_rate": 2.1032635626427317e-05, "loss": 0.7689, "step": 1577 }, { "epoch": 0.48913169824479835, "grad_norm": 0.19267412697798642, "learning_rate": 2.1012983376774255e-05, "loss": 0.6707, "step": 1578 }, { "epoch": 0.489441667635321, "grad_norm": 0.20679781907821632, "learning_rate": 2.099333014649342e-05, "loss": 0.7256, "step": 1579 }, { "epoch": 0.4897516370258437, "grad_norm": 0.2033661856932757, "learning_rate": 2.097367595461029e-05, "loss": 0.6831, "step": 1580 }, { "epoch": 0.49006160641636637, "grad_norm": 0.1978426569807425, "learning_rate": 2.0954020820151287e-05, "loss": 0.7261, "step": 1581 }, { "epoch": 0.49037157580688906, "grad_norm": 0.22822808766410746, "learning_rate": 2.0934364762143745e-05, "loss": 0.7061, "step": 1582 }, { "epoch": 0.49068154519741175, "grad_norm": 0.2895377754865252, "learning_rate": 2.0914707799615894e-05, "loss": 0.721, "step": 1583 }, { "epoch": 0.49099151458793444, "grad_norm": 0.4396582339559346, "learning_rate": 2.0895049951596826e-05, "loss": 0.6856, "step": 1584 }, { "epoch": 0.49130148397845713, "grad_norm": 0.1848055133170772, "learning_rate": 2.0875391237116505e-05, "loss": 0.7365, "step": 1585 }, { "epoch": 0.4916114533689798, "grad_norm": 0.23013252482353652, "learning_rate": 2.0855731675205723e-05, "loss": 0.7414, "step": 1586 }, { "epoch": 0.4919214227595025, "grad_norm": 0.1973786858553773, "learning_rate": 2.0836071284896112e-05, "loss": 0.7274, "step": 1587 }, { "epoch": 0.4922313921500252, "grad_norm": 0.2779977330718178, "learning_rate": 2.081641008522008e-05, "loss": 0.7024, "step": 1588 }, { "epoch": 0.4925413615405479, "grad_norm": 0.21488880620253692, "learning_rate": 2.079674809521083e-05, "loss": 0.7307, "step": 1589 }, { "epoch": 0.49285133093107053, "grad_norm": 0.22054042201289498, "learning_rate": 2.077708533390234e-05, "loss": 0.7349, "step": 1590 }, { "epoch": 0.4931613003215932, "grad_norm": 0.20191276694685717, "learning_rate": 2.075742182032931e-05, "loss": 0.7238, "step": 1591 }, { "epoch": 0.4934712697121159, "grad_norm": 0.20116197658638432, "learning_rate": 2.0737757573527197e-05, "loss": 0.7185, "step": 1592 }, { "epoch": 0.4937812391026386, "grad_norm": 0.21982860482076008, "learning_rate": 2.071809261253215e-05, "loss": 0.728, "step": 1593 }, { "epoch": 0.4940912084931613, "grad_norm": 0.19486821569839902, "learning_rate": 2.0698426956381007e-05, "loss": 0.7376, "step": 1594 }, { "epoch": 0.494401177883684, "grad_norm": 0.21866311434293112, "learning_rate": 2.067876062411129e-05, "loss": 0.709, "step": 1595 }, { "epoch": 0.4947111472742067, "grad_norm": 0.18130061468487338, "learning_rate": 2.065909363476118e-05, "loss": 0.7261, "step": 1596 }, { "epoch": 0.49502111666472937, "grad_norm": 0.21829736030605634, "learning_rate": 2.0639426007369473e-05, "loss": 0.7225, "step": 1597 }, { "epoch": 0.49533108605525206, "grad_norm": 0.1964026143780699, "learning_rate": 2.06197577609756e-05, "loss": 0.7578, "step": 1598 }, { "epoch": 0.49564105544577475, "grad_norm": 0.348654774920395, "learning_rate": 2.0600088914619576e-05, "loss": 0.6949, "step": 1599 }, { "epoch": 0.49595102483629744, "grad_norm": 0.21058209001355763, "learning_rate": 2.058041948734202e-05, "loss": 0.7421, "step": 1600 }, { "epoch": 0.49626099422682013, "grad_norm": 0.29299306251146634, "learning_rate": 2.05607494981841e-05, "loss": 0.7166, "step": 1601 }, { "epoch": 0.49657096361734276, "grad_norm": 0.22026389937094631, "learning_rate": 2.0541078966187524e-05, "loss": 0.7058, "step": 1602 }, { "epoch": 0.49688093300786546, "grad_norm": 0.17655766563452424, "learning_rate": 2.0521407910394527e-05, "loss": 0.7326, "step": 1603 }, { "epoch": 0.49719090239838815, "grad_norm": 0.19911883320803317, "learning_rate": 2.0501736349847852e-05, "loss": 0.7578, "step": 1604 }, { "epoch": 0.49750087178891084, "grad_norm": 0.17931150498114928, "learning_rate": 2.0482064303590743e-05, "loss": 0.6922, "step": 1605 }, { "epoch": 0.4978108411794335, "grad_norm": 0.19789530859007046, "learning_rate": 2.046239179066689e-05, "loss": 0.7081, "step": 1606 }, { "epoch": 0.4981208105699562, "grad_norm": 0.18153533962758212, "learning_rate": 2.044271883012046e-05, "loss": 0.7143, "step": 1607 }, { "epoch": 0.4984307799604789, "grad_norm": 0.48177855311776463, "learning_rate": 2.042304544099603e-05, "loss": 0.7159, "step": 1608 }, { "epoch": 0.4987407493510016, "grad_norm": 0.18594610924410043, "learning_rate": 2.0403371642338615e-05, "loss": 0.6991, "step": 1609 }, { "epoch": 0.4990507187415243, "grad_norm": 0.19111472037446203, "learning_rate": 2.0383697453193612e-05, "loss": 0.726, "step": 1610 }, { "epoch": 0.499360688132047, "grad_norm": 0.1942979238440848, "learning_rate": 2.0364022892606793e-05, "loss": 0.717, "step": 1611 }, { "epoch": 0.4996706575225697, "grad_norm": 0.17939223923970873, "learning_rate": 2.0344347979624296e-05, "loss": 0.7391, "step": 1612 }, { "epoch": 0.4999806269130923, "grad_norm": 0.19357566736413784, "learning_rate": 2.0324672733292606e-05, "loss": 0.72, "step": 1613 }, { "epoch": 0.500290596303615, "grad_norm": 0.19579314424887076, "learning_rate": 2.030499717265852e-05, "loss": 0.6977, "step": 1614 }, { "epoch": 0.5006005656941377, "grad_norm": 0.1882382351041294, "learning_rate": 2.0285321316769148e-05, "loss": 0.7045, "step": 1615 }, { "epoch": 0.5009105350846604, "grad_norm": 0.22809729592411726, "learning_rate": 2.0265645184671876e-05, "loss": 0.7195, "step": 1616 }, { "epoch": 0.5012205044751831, "grad_norm": 0.1838420790016605, "learning_rate": 2.024596879541436e-05, "loss": 0.7274, "step": 1617 }, { "epoch": 0.5015304738657058, "grad_norm": 0.2110009709245855, "learning_rate": 2.0226292168044518e-05, "loss": 0.7206, "step": 1618 }, { "epoch": 0.5018404432562285, "grad_norm": 0.4660076399298664, "learning_rate": 2.0206615321610484e-05, "loss": 0.714, "step": 1619 }, { "epoch": 0.5021504126467511, "grad_norm": 0.19637073650067302, "learning_rate": 2.01869382751606e-05, "loss": 0.7109, "step": 1620 }, { "epoch": 0.5024603820372738, "grad_norm": 0.1990905772147558, "learning_rate": 2.016726104774342e-05, "loss": 0.7561, "step": 1621 }, { "epoch": 0.5027703514277965, "grad_norm": 0.2117310028858786, "learning_rate": 2.0147583658407658e-05, "loss": 0.7503, "step": 1622 }, { "epoch": 0.5030803208183192, "grad_norm": 0.19340394950989767, "learning_rate": 2.0127906126202204e-05, "loss": 0.7211, "step": 1623 }, { "epoch": 0.5033902902088419, "grad_norm": 0.19054609414798193, "learning_rate": 2.0108228470176048e-05, "loss": 0.7037, "step": 1624 }, { "epoch": 0.5037002595993646, "grad_norm": 0.19159720975053873, "learning_rate": 2.008855070937834e-05, "loss": 0.7023, "step": 1625 }, { "epoch": 0.5040102289898872, "grad_norm": 0.19673014506284917, "learning_rate": 2.0068872862858305e-05, "loss": 0.7548, "step": 1626 }, { "epoch": 0.50432019838041, "grad_norm": 0.23998461102031104, "learning_rate": 2.0049194949665275e-05, "loss": 0.7215, "step": 1627 }, { "epoch": 0.5046301677709326, "grad_norm": 0.19180752482550664, "learning_rate": 2.002951698884863e-05, "loss": 0.7316, "step": 1628 }, { "epoch": 0.5049401371614554, "grad_norm": 0.1812090280154094, "learning_rate": 2.0009838999457787e-05, "loss": 0.7283, "step": 1629 }, { "epoch": 0.505250106551978, "grad_norm": 0.18816095065598926, "learning_rate": 1.9990161000542217e-05, "loss": 0.7235, "step": 1630 }, { "epoch": 0.5055600759425006, "grad_norm": 0.18785016164374832, "learning_rate": 1.9970483011151383e-05, "loss": 0.7174, "step": 1631 }, { "epoch": 0.5058700453330234, "grad_norm": 0.3171322201052858, "learning_rate": 1.995080505033473e-05, "loss": 0.7246, "step": 1632 }, { "epoch": 0.506180014723546, "grad_norm": 0.18714169097396832, "learning_rate": 1.9931127137141692e-05, "loss": 0.7174, "step": 1633 }, { "epoch": 0.5064899841140688, "grad_norm": 0.1841514321260491, "learning_rate": 1.991144929062167e-05, "loss": 0.7105, "step": 1634 }, { "epoch": 0.5067999535045914, "grad_norm": 0.19175410424689854, "learning_rate": 1.9891771529823956e-05, "loss": 0.7334, "step": 1635 }, { "epoch": 0.5071099228951141, "grad_norm": 0.1958862786496548, "learning_rate": 1.987209387379781e-05, "loss": 0.7432, "step": 1636 }, { "epoch": 0.5074198922856368, "grad_norm": 0.20155261669816665, "learning_rate": 1.9852416341592345e-05, "loss": 0.7081, "step": 1637 }, { "epoch": 0.5077298616761595, "grad_norm": 0.17907699767487956, "learning_rate": 1.9832738952256582e-05, "loss": 0.7416, "step": 1638 }, { "epoch": 0.5080398310666822, "grad_norm": 0.18738609756640173, "learning_rate": 1.981306172483941e-05, "loss": 0.7463, "step": 1639 }, { "epoch": 0.5083498004572049, "grad_norm": 0.18212336246964206, "learning_rate": 1.9793384678389523e-05, "loss": 0.7438, "step": 1640 }, { "epoch": 0.5086597698477275, "grad_norm": 0.1838765379316398, "learning_rate": 1.9773707831955482e-05, "loss": 0.7253, "step": 1641 }, { "epoch": 0.5089697392382502, "grad_norm": 0.24924729664920386, "learning_rate": 1.9754031204585646e-05, "loss": 0.7539, "step": 1642 }, { "epoch": 0.5092797086287729, "grad_norm": 0.19145291637256975, "learning_rate": 1.973435481532813e-05, "loss": 0.7432, "step": 1643 }, { "epoch": 0.5095896780192956, "grad_norm": 0.18699770362490503, "learning_rate": 1.9714678683230862e-05, "loss": 0.7194, "step": 1644 }, { "epoch": 0.5098996474098183, "grad_norm": 0.23888246808924593, "learning_rate": 1.9695002827341484e-05, "loss": 0.719, "step": 1645 }, { "epoch": 0.5102096168003409, "grad_norm": 0.19250997869028935, "learning_rate": 1.9675327266707394e-05, "loss": 0.7232, "step": 1646 }, { "epoch": 0.5105195861908637, "grad_norm": 0.1962208148247768, "learning_rate": 1.9655652020375707e-05, "loss": 0.7295, "step": 1647 }, { "epoch": 0.5108295555813863, "grad_norm": 0.19181320941442195, "learning_rate": 1.963597710739321e-05, "loss": 0.7226, "step": 1648 }, { "epoch": 0.5111395249719091, "grad_norm": 0.1888733301818893, "learning_rate": 1.9616302546806398e-05, "loss": 0.719, "step": 1649 }, { "epoch": 0.5114494943624317, "grad_norm": 0.18390752042671005, "learning_rate": 1.9596628357661388e-05, "loss": 0.7158, "step": 1650 }, { "epoch": 0.5117594637529544, "grad_norm": 0.2119977515608807, "learning_rate": 1.9576954559003967e-05, "loss": 0.7442, "step": 1651 }, { "epoch": 0.5120694331434771, "grad_norm": 0.18118628025772782, "learning_rate": 1.955728116987955e-05, "loss": 0.7513, "step": 1652 }, { "epoch": 0.5123794025339997, "grad_norm": 0.19431903729047992, "learning_rate": 1.9537608209333113e-05, "loss": 0.7459, "step": 1653 }, { "epoch": 0.5126893719245225, "grad_norm": 0.1858550540592433, "learning_rate": 1.9517935696409256e-05, "loss": 0.7521, "step": 1654 }, { "epoch": 0.5129993413150451, "grad_norm": 0.18920198764480986, "learning_rate": 1.949826365015215e-05, "loss": 0.7574, "step": 1655 }, { "epoch": 0.5133093107055678, "grad_norm": 0.19334141507774052, "learning_rate": 1.947859208960548e-05, "loss": 0.7051, "step": 1656 }, { "epoch": 0.5136192800960905, "grad_norm": 0.1767332630605365, "learning_rate": 1.945892103381248e-05, "loss": 0.7376, "step": 1657 }, { "epoch": 0.5139292494866132, "grad_norm": 0.21554883333003833, "learning_rate": 1.9439250501815903e-05, "loss": 0.716, "step": 1658 }, { "epoch": 0.5142392188771359, "grad_norm": 0.1806750752759098, "learning_rate": 1.941958051265798e-05, "loss": 0.7321, "step": 1659 }, { "epoch": 0.5145491882676586, "grad_norm": 0.18039599823835104, "learning_rate": 1.939991108538043e-05, "loss": 0.6852, "step": 1660 }, { "epoch": 0.5148591576581812, "grad_norm": 0.18975087453471765, "learning_rate": 1.9380242239024406e-05, "loss": 0.7334, "step": 1661 }, { "epoch": 0.515169127048704, "grad_norm": 0.19401795031154245, "learning_rate": 1.9360573992630537e-05, "loss": 0.7602, "step": 1662 }, { "epoch": 0.5154790964392266, "grad_norm": 0.19347682630410148, "learning_rate": 1.9340906365238824e-05, "loss": 0.6983, "step": 1663 }, { "epoch": 0.5157890658297493, "grad_norm": 0.18315432638596207, "learning_rate": 1.9321239375888713e-05, "loss": 0.738, "step": 1664 }, { "epoch": 0.516099035220272, "grad_norm": 0.1958342532324687, "learning_rate": 1.9301573043619003e-05, "loss": 0.7344, "step": 1665 }, { "epoch": 0.5164090046107946, "grad_norm": 0.18597653898057898, "learning_rate": 1.928190738746786e-05, "loss": 0.732, "step": 1666 }, { "epoch": 0.5167189740013174, "grad_norm": 0.1822064326855474, "learning_rate": 1.9262242426472807e-05, "loss": 0.7023, "step": 1667 }, { "epoch": 0.51702894339184, "grad_norm": 0.24183287957510896, "learning_rate": 1.9242578179670695e-05, "loss": 0.7142, "step": 1668 }, { "epoch": 0.5173389127823628, "grad_norm": 0.17539912903490648, "learning_rate": 1.922291466609767e-05, "loss": 0.7162, "step": 1669 }, { "epoch": 0.5176488821728854, "grad_norm": 0.1772538558167344, "learning_rate": 1.9203251904789175e-05, "loss": 0.7066, "step": 1670 }, { "epoch": 0.5179588515634082, "grad_norm": 0.18117132135691852, "learning_rate": 1.9183589914779927e-05, "loss": 0.7221, "step": 1671 }, { "epoch": 0.5182688209539308, "grad_norm": 0.19273875591075698, "learning_rate": 1.916392871510389e-05, "loss": 0.7589, "step": 1672 }, { "epoch": 0.5185787903444535, "grad_norm": 0.18753943944876816, "learning_rate": 1.914426832479428e-05, "loss": 0.7149, "step": 1673 }, { "epoch": 0.5188887597349762, "grad_norm": 0.23346583179569855, "learning_rate": 1.91246087628835e-05, "loss": 0.7109, "step": 1674 }, { "epoch": 0.5191987291254988, "grad_norm": 0.19364045296957666, "learning_rate": 1.9104950048403184e-05, "loss": 0.7064, "step": 1675 }, { "epoch": 0.5195086985160215, "grad_norm": 0.1910577677950634, "learning_rate": 1.9085292200384112e-05, "loss": 0.7172, "step": 1676 }, { "epoch": 0.5198186679065442, "grad_norm": 0.17542428622255768, "learning_rate": 1.906563523785626e-05, "loss": 0.7364, "step": 1677 }, { "epoch": 0.5201286372970669, "grad_norm": 0.19129893277543467, "learning_rate": 1.9045979179848723e-05, "loss": 0.6927, "step": 1678 }, { "epoch": 0.5204386066875896, "grad_norm": 0.18104008407019037, "learning_rate": 1.9026324045389717e-05, "loss": 0.7247, "step": 1679 }, { "epoch": 0.5207485760781123, "grad_norm": 0.18949512901710916, "learning_rate": 1.9006669853506584e-05, "loss": 0.7384, "step": 1680 }, { "epoch": 0.521058545468635, "grad_norm": 0.1826096333268497, "learning_rate": 1.8987016623225748e-05, "loss": 0.7461, "step": 1681 }, { "epoch": 0.5213685148591577, "grad_norm": 0.18418855735295622, "learning_rate": 1.8967364373572686e-05, "loss": 0.6995, "step": 1682 }, { "epoch": 0.5216784842496803, "grad_norm": 0.19127597986417277, "learning_rate": 1.894771312357195e-05, "loss": 0.7273, "step": 1683 }, { "epoch": 0.5219884536402031, "grad_norm": 0.18797825992682085, "learning_rate": 1.89280628922471e-05, "loss": 0.7598, "step": 1684 }, { "epoch": 0.5222984230307257, "grad_norm": 0.18442731316129674, "learning_rate": 1.890841369862072e-05, "loss": 0.7511, "step": 1685 }, { "epoch": 0.5226083924212483, "grad_norm": 0.18737502926449723, "learning_rate": 1.8888765561714404e-05, "loss": 0.719, "step": 1686 }, { "epoch": 0.5229183618117711, "grad_norm": 0.195823502221825, "learning_rate": 1.8869118500548697e-05, "loss": 0.7514, "step": 1687 }, { "epoch": 0.5232283312022937, "grad_norm": 0.24560616995918824, "learning_rate": 1.8849472534143123e-05, "loss": 0.7357, "step": 1688 }, { "epoch": 0.5235383005928165, "grad_norm": 0.19324530018791583, "learning_rate": 1.882982768151613e-05, "loss": 0.7323, "step": 1689 }, { "epoch": 0.5238482699833391, "grad_norm": 0.31588035451463065, "learning_rate": 1.881018396168511e-05, "loss": 0.7309, "step": 1690 }, { "epoch": 0.5241582393738619, "grad_norm": 0.18782213775487502, "learning_rate": 1.8790541393666336e-05, "loss": 0.7062, "step": 1691 }, { "epoch": 0.5244682087643845, "grad_norm": 0.20658647507975872, "learning_rate": 1.877089999647497e-05, "loss": 0.714, "step": 1692 }, { "epoch": 0.5247781781549072, "grad_norm": 0.2871507932545251, "learning_rate": 1.875125978912506e-05, "loss": 0.722, "step": 1693 }, { "epoch": 0.5250881475454299, "grad_norm": 0.19433059132399286, "learning_rate": 1.873162079062947e-05, "loss": 0.7108, "step": 1694 }, { "epoch": 0.5253981169359526, "grad_norm": 0.41457821717438287, "learning_rate": 1.871198301999992e-05, "loss": 0.709, "step": 1695 }, { "epoch": 0.5257080863264753, "grad_norm": 0.18734510706656757, "learning_rate": 1.8692346496246944e-05, "loss": 0.7051, "step": 1696 }, { "epoch": 0.526018055716998, "grad_norm": 0.18747032659551552, "learning_rate": 1.867271123837984e-05, "loss": 0.7365, "step": 1697 }, { "epoch": 0.5263280251075206, "grad_norm": 0.197073283747461, "learning_rate": 1.8653077265406698e-05, "loss": 0.7624, "step": 1698 }, { "epoch": 0.5266379944980433, "grad_norm": 0.3112649872516709, "learning_rate": 1.8633444596334384e-05, "loss": 0.7411, "step": 1699 }, { "epoch": 0.526947963888566, "grad_norm": 0.19518510243656104, "learning_rate": 1.8613813250168457e-05, "loss": 0.699, "step": 1700 }, { "epoch": 0.5272579332790887, "grad_norm": 0.1743777761308997, "learning_rate": 1.859418324591324e-05, "loss": 0.7211, "step": 1701 }, { "epoch": 0.5275679026696114, "grad_norm": 0.21426107301493102, "learning_rate": 1.8574554602571718e-05, "loss": 0.73, "step": 1702 }, { "epoch": 0.527877872060134, "grad_norm": 0.3237030008709417, "learning_rate": 1.85549273391456e-05, "loss": 0.7255, "step": 1703 }, { "epoch": 0.5281878414506568, "grad_norm": 0.18691392951512392, "learning_rate": 1.8535301474635225e-05, "loss": 0.7128, "step": 1704 }, { "epoch": 0.5284978108411794, "grad_norm": 0.1721823844522374, "learning_rate": 1.8515677028039586e-05, "loss": 0.7025, "step": 1705 }, { "epoch": 0.5288077802317022, "grad_norm": 0.18567006025031033, "learning_rate": 1.849605401835632e-05, "loss": 0.7263, "step": 1706 }, { "epoch": 0.5291177496222248, "grad_norm": 0.18425041186337526, "learning_rate": 1.8476432464581643e-05, "loss": 0.7165, "step": 1707 }, { "epoch": 0.5294277190127475, "grad_norm": 0.18526847212147876, "learning_rate": 1.845681238571039e-05, "loss": 0.7126, "step": 1708 }, { "epoch": 0.5297376884032702, "grad_norm": 0.1856979617972345, "learning_rate": 1.843719380073596e-05, "loss": 0.7194, "step": 1709 }, { "epoch": 0.5300476577937928, "grad_norm": 0.18042090206752925, "learning_rate": 1.84175767286503e-05, "loss": 0.7345, "step": 1710 }, { "epoch": 0.5303576271843156, "grad_norm": 0.17499420838630725, "learning_rate": 1.839796118844388e-05, "loss": 0.6915, "step": 1711 }, { "epoch": 0.5306675965748382, "grad_norm": 0.18391963238870035, "learning_rate": 1.837834719910573e-05, "loss": 0.7132, "step": 1712 }, { "epoch": 0.5309775659653609, "grad_norm": 0.21467260180751757, "learning_rate": 1.8358734779623328e-05, "loss": 0.729, "step": 1713 }, { "epoch": 0.5312875353558836, "grad_norm": 0.17188842972984072, "learning_rate": 1.833912394898267e-05, "loss": 0.717, "step": 1714 }, { "epoch": 0.5315975047464063, "grad_norm": 0.17945583912991833, "learning_rate": 1.8319514726168188e-05, "loss": 0.7336, "step": 1715 }, { "epoch": 0.531907474136929, "grad_norm": 0.17993767513429995, "learning_rate": 1.8299907130162774e-05, "loss": 0.7067, "step": 1716 }, { "epoch": 0.5322174435274517, "grad_norm": 0.17421095557874036, "learning_rate": 1.8280301179947743e-05, "loss": 0.7096, "step": 1717 }, { "epoch": 0.5325274129179743, "grad_norm": 0.1780851558749167, "learning_rate": 1.82606968945028e-05, "loss": 0.6948, "step": 1718 }, { "epoch": 0.5328373823084971, "grad_norm": 0.19785008374897536, "learning_rate": 1.8241094292806074e-05, "loss": 0.7091, "step": 1719 }, { "epoch": 0.5331473516990197, "grad_norm": 0.18007330535593963, "learning_rate": 1.822149339383402e-05, "loss": 0.7012, "step": 1720 }, { "epoch": 0.5334573210895424, "grad_norm": 0.2020797945551986, "learning_rate": 1.8201894216561476e-05, "loss": 0.7263, "step": 1721 }, { "epoch": 0.5337672904800651, "grad_norm": 0.17693678774191052, "learning_rate": 1.8182296779961612e-05, "loss": 0.7357, "step": 1722 }, { "epoch": 0.5340772598705877, "grad_norm": 1.249639119732263, "learning_rate": 1.8162701103005892e-05, "loss": 0.7805, "step": 1723 }, { "epoch": 0.5343872292611105, "grad_norm": 0.18462124037833635, "learning_rate": 1.8143107204664094e-05, "loss": 0.7509, "step": 1724 }, { "epoch": 0.5346971986516331, "grad_norm": 0.2052073868928146, "learning_rate": 1.8123515103904276e-05, "loss": 0.7334, "step": 1725 }, { "epoch": 0.5350071680421559, "grad_norm": 0.40887561447829107, "learning_rate": 1.8103924819692737e-05, "loss": 0.7109, "step": 1726 }, { "epoch": 0.5353171374326785, "grad_norm": 0.22161981395578656, "learning_rate": 1.8084336370994045e-05, "loss": 0.7092, "step": 1727 }, { "epoch": 0.5356271068232012, "grad_norm": 0.19502421466718858, "learning_rate": 1.806474977677096e-05, "loss": 0.7299, "step": 1728 }, { "epoch": 0.5359370762137239, "grad_norm": 0.19532374456135593, "learning_rate": 1.8045165055984472e-05, "loss": 0.7029, "step": 1729 }, { "epoch": 0.5362470456042466, "grad_norm": 0.19388772941426255, "learning_rate": 1.802558222759375e-05, "loss": 0.7036, "step": 1730 }, { "epoch": 0.5365570149947693, "grad_norm": 0.2004008025297023, "learning_rate": 1.8006001310556125e-05, "loss": 0.7158, "step": 1731 }, { "epoch": 0.5368669843852919, "grad_norm": 0.19107577254182675, "learning_rate": 1.7986422323827087e-05, "loss": 0.7093, "step": 1732 }, { "epoch": 0.5371769537758146, "grad_norm": 0.1970571693241457, "learning_rate": 1.7966845286360242e-05, "loss": 0.7515, "step": 1733 }, { "epoch": 0.5374869231663373, "grad_norm": 0.21055070321196365, "learning_rate": 1.7947270217107326e-05, "loss": 0.725, "step": 1734 }, { "epoch": 0.53779689255686, "grad_norm": 0.17853413300699383, "learning_rate": 1.792769713501817e-05, "loss": 0.7255, "step": 1735 }, { "epoch": 0.5381068619473827, "grad_norm": 0.21943848070447805, "learning_rate": 1.7908126059040663e-05, "loss": 0.7002, "step": 1736 }, { "epoch": 0.5384168313379054, "grad_norm": 0.19949178315624347, "learning_rate": 1.7888557008120765e-05, "loss": 0.7212, "step": 1737 }, { "epoch": 0.538726800728428, "grad_norm": 0.18730687797051276, "learning_rate": 1.7868990001202482e-05, "loss": 0.7355, "step": 1738 }, { "epoch": 0.5390367701189508, "grad_norm": 0.19108954496114378, "learning_rate": 1.784942505722782e-05, "loss": 0.7215, "step": 1739 }, { "epoch": 0.5393467395094734, "grad_norm": 0.19172650646633627, "learning_rate": 1.7829862195136817e-05, "loss": 0.7264, "step": 1740 }, { "epoch": 0.5396567088999962, "grad_norm": 0.1985712162710161, "learning_rate": 1.7810301433867464e-05, "loss": 0.7023, "step": 1741 }, { "epoch": 0.5399666782905188, "grad_norm": 0.18719889380473043, "learning_rate": 1.7790742792355748e-05, "loss": 0.7311, "step": 1742 }, { "epoch": 0.5402766476810414, "grad_norm": 0.18815561985443383, "learning_rate": 1.777118628953558e-05, "loss": 0.7176, "step": 1743 }, { "epoch": 0.5405866170715642, "grad_norm": 0.196179017422479, "learning_rate": 1.7751631944338814e-05, "loss": 0.7158, "step": 1744 }, { "epoch": 0.5408965864620868, "grad_norm": 0.18432765436717394, "learning_rate": 1.7732079775695218e-05, "loss": 0.7284, "step": 1745 }, { "epoch": 0.5412065558526096, "grad_norm": 0.20907963954862308, "learning_rate": 1.771252980253243e-05, "loss": 0.7437, "step": 1746 }, { "epoch": 0.5415165252431322, "grad_norm": 0.19919914348515852, "learning_rate": 1.769298204377599e-05, "loss": 0.7053, "step": 1747 }, { "epoch": 0.541826494633655, "grad_norm": 0.2289687347541544, "learning_rate": 1.7673436518349298e-05, "loss": 0.708, "step": 1748 }, { "epoch": 0.5421364640241776, "grad_norm": 0.19716820716565073, "learning_rate": 1.765389324517355e-05, "loss": 0.7509, "step": 1749 }, { "epoch": 0.5424464334147003, "grad_norm": 0.1761514184351862, "learning_rate": 1.7634352243167808e-05, "loss": 0.7083, "step": 1750 }, { "epoch": 0.542756402805223, "grad_norm": 0.2953178205393578, "learning_rate": 1.7614813531248915e-05, "loss": 0.7213, "step": 1751 }, { "epoch": 0.5430663721957457, "grad_norm": 0.42457860340383735, "learning_rate": 1.7595277128331493e-05, "loss": 0.7353, "step": 1752 }, { "epoch": 0.5433763415862684, "grad_norm": 0.19372800529363587, "learning_rate": 1.7575743053327944e-05, "loss": 0.6981, "step": 1753 }, { "epoch": 0.543686310976791, "grad_norm": 0.1918792716168009, "learning_rate": 1.755621132514839e-05, "loss": 0.7312, "step": 1754 }, { "epoch": 0.5439962803673137, "grad_norm": 0.21086854787596446, "learning_rate": 1.7536681962700716e-05, "loss": 0.7415, "step": 1755 }, { "epoch": 0.5443062497578364, "grad_norm": 0.177169380951802, "learning_rate": 1.751715498489049e-05, "loss": 0.7215, "step": 1756 }, { "epoch": 0.5446162191483591, "grad_norm": 0.21908031581576354, "learning_rate": 1.7497630410620985e-05, "loss": 0.7314, "step": 1757 }, { "epoch": 0.5449261885388818, "grad_norm": 0.17386887989294925, "learning_rate": 1.7478108258793146e-05, "loss": 0.7282, "step": 1758 }, { "epoch": 0.5452361579294045, "grad_norm": 0.2043370252455661, "learning_rate": 1.745858854830555e-05, "loss": 0.7401, "step": 1759 }, { "epoch": 0.5455461273199271, "grad_norm": 0.17485683995409, "learning_rate": 1.7439071298054453e-05, "loss": 0.6995, "step": 1760 }, { "epoch": 0.5458560967104499, "grad_norm": 0.2239935814833675, "learning_rate": 1.7419556526933702e-05, "loss": 0.7396, "step": 1761 }, { "epoch": 0.5461660661009725, "grad_norm": 0.1732591131975747, "learning_rate": 1.740004425383474e-05, "loss": 0.7067, "step": 1762 }, { "epoch": 0.5464760354914953, "grad_norm": 0.2022495436705838, "learning_rate": 1.7380534497646616e-05, "loss": 0.6686, "step": 1763 }, { "epoch": 0.5467860048820179, "grad_norm": 0.1789117826184266, "learning_rate": 1.7361027277255912e-05, "loss": 0.7389, "step": 1764 }, { "epoch": 0.5470959742725406, "grad_norm": 0.20360973378087907, "learning_rate": 1.734152261154677e-05, "loss": 0.7473, "step": 1765 }, { "epoch": 0.5474059436630633, "grad_norm": 0.27064423419244954, "learning_rate": 1.7322020519400874e-05, "loss": 0.7161, "step": 1766 }, { "epoch": 0.5477159130535859, "grad_norm": 0.20765170859623922, "learning_rate": 1.7302521019697382e-05, "loss": 0.713, "step": 1767 }, { "epoch": 0.5480258824441087, "grad_norm": 0.1862193225985003, "learning_rate": 1.7283024131312976e-05, "loss": 0.7111, "step": 1768 }, { "epoch": 0.5483358518346313, "grad_norm": 0.18668443679675234, "learning_rate": 1.72635298731218e-05, "loss": 0.6651, "step": 1769 }, { "epoch": 0.548645821225154, "grad_norm": 0.18930445332085136, "learning_rate": 1.7244038263995436e-05, "loss": 0.7231, "step": 1770 }, { "epoch": 0.5489557906156767, "grad_norm": 0.18866814675049026, "learning_rate": 1.7224549322802926e-05, "loss": 0.72, "step": 1771 }, { "epoch": 0.5492657600061994, "grad_norm": 0.18139334538709742, "learning_rate": 1.72050630684107e-05, "loss": 0.7198, "step": 1772 }, { "epoch": 0.5495757293967221, "grad_norm": 0.18714033131201965, "learning_rate": 1.7185579519682614e-05, "loss": 0.7159, "step": 1773 }, { "epoch": 0.5498856987872448, "grad_norm": 0.18350193144938318, "learning_rate": 1.71660986954799e-05, "loss": 0.7565, "step": 1774 }, { "epoch": 0.5501956681777674, "grad_norm": 0.17649681525235697, "learning_rate": 1.7146620614661138e-05, "loss": 0.7372, "step": 1775 }, { "epoch": 0.5505056375682902, "grad_norm": 0.18306207953859416, "learning_rate": 1.712714529608227e-05, "loss": 0.7401, "step": 1776 }, { "epoch": 0.5508156069588128, "grad_norm": 0.20216236452903905, "learning_rate": 1.710767275859655e-05, "loss": 0.759, "step": 1777 }, { "epoch": 0.5511255763493355, "grad_norm": 0.1814536847403557, "learning_rate": 1.708820302105454e-05, "loss": 0.7141, "step": 1778 }, { "epoch": 0.5514355457398582, "grad_norm": 0.2013054993600907, "learning_rate": 1.7068736102304107e-05, "loss": 0.7458, "step": 1779 }, { "epoch": 0.5517455151303808, "grad_norm": 0.16751551489373315, "learning_rate": 1.7049272021190372e-05, "loss": 0.7084, "step": 1780 }, { "epoch": 0.5520554845209036, "grad_norm": 0.21458448215997156, "learning_rate": 1.7029810796555714e-05, "loss": 0.6854, "step": 1781 }, { "epoch": 0.5523654539114262, "grad_norm": 0.19310808319873407, "learning_rate": 1.7010352447239763e-05, "loss": 0.7519, "step": 1782 }, { "epoch": 0.552675423301949, "grad_norm": 0.1937837152096258, "learning_rate": 1.6990896992079336e-05, "loss": 0.693, "step": 1783 }, { "epoch": 0.5529853926924716, "grad_norm": 0.1920148055669668, "learning_rate": 1.6971444449908474e-05, "loss": 0.7282, "step": 1784 }, { "epoch": 0.5532953620829943, "grad_norm": 0.1813893676021912, "learning_rate": 1.6951994839558376e-05, "loss": 0.7165, "step": 1785 }, { "epoch": 0.553605331473517, "grad_norm": 0.19884918956981026, "learning_rate": 1.6932548179857416e-05, "loss": 0.7384, "step": 1786 }, { "epoch": 0.5539153008640397, "grad_norm": 0.1882786714593342, "learning_rate": 1.6913104489631122e-05, "loss": 0.7106, "step": 1787 }, { "epoch": 0.5542252702545624, "grad_norm": 0.18325817047741969, "learning_rate": 1.6893663787702124e-05, "loss": 0.718, "step": 1788 }, { "epoch": 0.554535239645085, "grad_norm": 0.18290945019656096, "learning_rate": 1.687422609289018e-05, "loss": 0.7058, "step": 1789 }, { "epoch": 0.5548452090356077, "grad_norm": 0.1895036060026172, "learning_rate": 1.685479142401212e-05, "loss": 0.7468, "step": 1790 }, { "epoch": 0.5551551784261304, "grad_norm": 0.18960215743491388, "learning_rate": 1.6835359799881844e-05, "loss": 0.7274, "step": 1791 }, { "epoch": 0.5554651478166531, "grad_norm": 0.19395026818505506, "learning_rate": 1.6815931239310336e-05, "loss": 0.729, "step": 1792 }, { "epoch": 0.5557751172071758, "grad_norm": 0.18317088751220864, "learning_rate": 1.679650576110557e-05, "loss": 0.7255, "step": 1793 }, { "epoch": 0.5560850865976985, "grad_norm": 0.1838478309929949, "learning_rate": 1.6777083384072557e-05, "loss": 0.7164, "step": 1794 }, { "epoch": 0.5563950559882211, "grad_norm": 0.18518538568078496, "learning_rate": 1.6757664127013328e-05, "loss": 0.7199, "step": 1795 }, { "epoch": 0.5567050253787439, "grad_norm": 0.17664863718627055, "learning_rate": 1.673824800872685e-05, "loss": 0.7046, "step": 1796 }, { "epoch": 0.5570149947692665, "grad_norm": 0.19276628453598976, "learning_rate": 1.671883504800908e-05, "loss": 0.7035, "step": 1797 }, { "epoch": 0.5573249641597893, "grad_norm": 0.1823592546225962, "learning_rate": 1.6699425263652913e-05, "loss": 0.7451, "step": 1798 }, { "epoch": 0.5576349335503119, "grad_norm": 0.1881900217380686, "learning_rate": 1.668001867444816e-05, "loss": 0.7121, "step": 1799 }, { "epoch": 0.5579449029408345, "grad_norm": 0.18902769914012876, "learning_rate": 1.6660615299181563e-05, "loss": 0.6985, "step": 1800 }, { "epoch": 0.5582548723313573, "grad_norm": 0.19105103428714146, "learning_rate": 1.6641215156636714e-05, "loss": 0.6956, "step": 1801 }, { "epoch": 0.5585648417218799, "grad_norm": 0.20651195244933696, "learning_rate": 1.6621818265594116e-05, "loss": 0.7072, "step": 1802 }, { "epoch": 0.5588748111124027, "grad_norm": 0.1820297291181603, "learning_rate": 1.6602424644831097e-05, "loss": 0.7337, "step": 1803 }, { "epoch": 0.5591847805029253, "grad_norm": 0.18439564201112887, "learning_rate": 1.6583034313121823e-05, "loss": 0.7388, "step": 1804 }, { "epoch": 0.559494749893448, "grad_norm": 0.18291425793122074, "learning_rate": 1.65636472892373e-05, "loss": 0.6904, "step": 1805 }, { "epoch": 0.5598047192839707, "grad_norm": 0.18781752650253883, "learning_rate": 1.6544263591945288e-05, "loss": 0.7456, "step": 1806 }, { "epoch": 0.5601146886744934, "grad_norm": 0.18600348302281092, "learning_rate": 1.652488324001037e-05, "loss": 0.6957, "step": 1807 }, { "epoch": 0.5604246580650161, "grad_norm": 0.19249503261371453, "learning_rate": 1.650550625219387e-05, "loss": 0.709, "step": 1808 }, { "epoch": 0.5607346274555388, "grad_norm": 0.17518237160438002, "learning_rate": 1.648613264725385e-05, "loss": 0.712, "step": 1809 }, { "epoch": 0.5610445968460614, "grad_norm": 0.17417362441119677, "learning_rate": 1.646676244394511e-05, "loss": 0.7028, "step": 1810 }, { "epoch": 0.5613545662365841, "grad_norm": 0.17303722345130618, "learning_rate": 1.6447395661019153e-05, "loss": 0.7051, "step": 1811 }, { "epoch": 0.5616645356271068, "grad_norm": 0.1715167474187788, "learning_rate": 1.642803231722416e-05, "loss": 0.7186, "step": 1812 }, { "epoch": 0.5619745050176295, "grad_norm": 0.20196123740838037, "learning_rate": 1.6408672431305004e-05, "loss": 0.7607, "step": 1813 }, { "epoch": 0.5622844744081522, "grad_norm": 0.1718503439171141, "learning_rate": 1.638931602200319e-05, "loss": 0.7103, "step": 1814 }, { "epoch": 0.5625944437986748, "grad_norm": 0.17738149214590276, "learning_rate": 1.6369963108056877e-05, "loss": 0.7146, "step": 1815 }, { "epoch": 0.5629044131891976, "grad_norm": 0.18359728621608062, "learning_rate": 1.6350613708200813e-05, "loss": 0.6912, "step": 1816 }, { "epoch": 0.5632143825797202, "grad_norm": 0.19336562236862617, "learning_rate": 1.6331267841166377e-05, "loss": 0.7171, "step": 1817 }, { "epoch": 0.563524351970243, "grad_norm": 0.17255689883819086, "learning_rate": 1.6311925525681506e-05, "loss": 0.698, "step": 1818 }, { "epoch": 0.5638343213607656, "grad_norm": 0.18151549026892835, "learning_rate": 1.62925867804707e-05, "loss": 0.7275, "step": 1819 }, { "epoch": 0.5641442907512884, "grad_norm": 0.17803031774297592, "learning_rate": 1.6273251624255017e-05, "loss": 0.7355, "step": 1820 }, { "epoch": 0.564454260141811, "grad_norm": 0.18837853709996827, "learning_rate": 1.625392007575201e-05, "loss": 0.7363, "step": 1821 }, { "epoch": 0.5647642295323336, "grad_norm": 0.1967933563475986, "learning_rate": 1.6234592153675785e-05, "loss": 0.732, "step": 1822 }, { "epoch": 0.5650741989228564, "grad_norm": 0.1842879545410301, "learning_rate": 1.6215267876736905e-05, "loss": 0.7072, "step": 1823 }, { "epoch": 0.565384168313379, "grad_norm": 0.21011456949523544, "learning_rate": 1.619594726364241e-05, "loss": 0.7021, "step": 1824 }, { "epoch": 0.5656941377039018, "grad_norm": 0.1933500746031847, "learning_rate": 1.617663033309579e-05, "loss": 0.6902, "step": 1825 }, { "epoch": 0.5660041070944244, "grad_norm": 0.1853664271627677, "learning_rate": 1.615731710379699e-05, "loss": 0.7078, "step": 1826 }, { "epoch": 0.5663140764849471, "grad_norm": 0.19263445578514748, "learning_rate": 1.613800759444234e-05, "loss": 0.7173, "step": 1827 }, { "epoch": 0.5666240458754698, "grad_norm": 0.1837459670998477, "learning_rate": 1.6118701823724608e-05, "loss": 0.7372, "step": 1828 }, { "epoch": 0.5669340152659925, "grad_norm": 0.1829161392938192, "learning_rate": 1.6099399810332898e-05, "loss": 0.709, "step": 1829 }, { "epoch": 0.5672439846565152, "grad_norm": 0.18671617195527576, "learning_rate": 1.6080101572952708e-05, "loss": 0.7303, "step": 1830 }, { "epoch": 0.5675539540470379, "grad_norm": 0.2819791624319467, "learning_rate": 1.6060807130265884e-05, "loss": 0.7094, "step": 1831 }, { "epoch": 0.5678639234375605, "grad_norm": 0.1724630247860942, "learning_rate": 1.6041516500950568e-05, "loss": 0.7198, "step": 1832 }, { "epoch": 0.5681738928280832, "grad_norm": 0.17984616817426285, "learning_rate": 1.6022229703681247e-05, "loss": 0.7193, "step": 1833 }, { "epoch": 0.5684838622186059, "grad_norm": 0.19421573790793575, "learning_rate": 1.6002946757128665e-05, "loss": 0.7656, "step": 1834 }, { "epoch": 0.5687938316091286, "grad_norm": 0.17585646535858607, "learning_rate": 1.5983667679959856e-05, "loss": 0.7336, "step": 1835 }, { "epoch": 0.5691038009996513, "grad_norm": 0.18254098311172126, "learning_rate": 1.5964392490838124e-05, "loss": 0.709, "step": 1836 }, { "epoch": 0.5694137703901739, "grad_norm": 0.1797255045485471, "learning_rate": 1.5945121208422975e-05, "loss": 0.698, "step": 1837 }, { "epoch": 0.5697237397806967, "grad_norm": 0.18764811505601925, "learning_rate": 1.5925853851370146e-05, "loss": 0.7201, "step": 1838 }, { "epoch": 0.5700337091712193, "grad_norm": 0.18081286219417725, "learning_rate": 1.5906590438331595e-05, "loss": 0.7647, "step": 1839 }, { "epoch": 0.5703436785617421, "grad_norm": 0.2434808466164941, "learning_rate": 1.5887330987955426e-05, "loss": 0.7463, "step": 1840 }, { "epoch": 0.5706536479522647, "grad_norm": 0.1883752455842645, "learning_rate": 1.586807551888594e-05, "loss": 0.7267, "step": 1841 }, { "epoch": 0.5709636173427874, "grad_norm": 0.17916146352734125, "learning_rate": 1.584882404976355e-05, "loss": 0.7066, "step": 1842 }, { "epoch": 0.5712735867333101, "grad_norm": 0.20348364016589648, "learning_rate": 1.582957659922484e-05, "loss": 0.767, "step": 1843 }, { "epoch": 0.5715835561238328, "grad_norm": 0.1800735369307771, "learning_rate": 1.5810333185902465e-05, "loss": 0.7103, "step": 1844 }, { "epoch": 0.5718935255143555, "grad_norm": 0.1804115316111104, "learning_rate": 1.5791093828425185e-05, "loss": 0.6947, "step": 1845 }, { "epoch": 0.5722034949048781, "grad_norm": 0.18309829196222766, "learning_rate": 1.5771858545417846e-05, "loss": 0.6943, "step": 1846 }, { "epoch": 0.5725134642954008, "grad_norm": 0.18042910015282673, "learning_rate": 1.5752627355501323e-05, "loss": 0.7092, "step": 1847 }, { "epoch": 0.5728234336859235, "grad_norm": 0.18421299058822943, "learning_rate": 1.5733400277292553e-05, "loss": 0.7055, "step": 1848 }, { "epoch": 0.5731334030764462, "grad_norm": 0.1837408308496378, "learning_rate": 1.5714177329404496e-05, "loss": 0.7127, "step": 1849 }, { "epoch": 0.5734433724669689, "grad_norm": 0.16960879614215546, "learning_rate": 1.5694958530446085e-05, "loss": 0.7037, "step": 1850 }, { "epoch": 0.5737533418574916, "grad_norm": 0.18354431971848417, "learning_rate": 1.5675743899022255e-05, "loss": 0.7173, "step": 1851 }, { "epoch": 0.5740633112480142, "grad_norm": 0.1801371400210947, "learning_rate": 1.5656533453733915e-05, "loss": 0.7212, "step": 1852 }, { "epoch": 0.574373280638537, "grad_norm": 0.22837551000122552, "learning_rate": 1.56373272131779e-05, "loss": 0.7195, "step": 1853 }, { "epoch": 0.5746832500290596, "grad_norm": 0.2711845331426683, "learning_rate": 1.5618125195946995e-05, "loss": 0.7113, "step": 1854 }, { "epoch": 0.5749932194195824, "grad_norm": 0.17974794417312098, "learning_rate": 1.5598927420629878e-05, "loss": 0.707, "step": 1855 }, { "epoch": 0.575303188810105, "grad_norm": 0.17553237617357326, "learning_rate": 1.557973390581114e-05, "loss": 0.7157, "step": 1856 }, { "epoch": 0.5756131582006276, "grad_norm": 0.18203739931494334, "learning_rate": 1.556054467007124e-05, "loss": 0.757, "step": 1857 }, { "epoch": 0.5759231275911504, "grad_norm": 0.17567948794303304, "learning_rate": 1.554135973198647e-05, "loss": 0.7287, "step": 1858 }, { "epoch": 0.576233096981673, "grad_norm": 0.1727725117968627, "learning_rate": 1.5522179110129016e-05, "loss": 0.7273, "step": 1859 }, { "epoch": 0.5765430663721958, "grad_norm": 0.19058966130412403, "learning_rate": 1.5503002823066827e-05, "loss": 0.7517, "step": 1860 }, { "epoch": 0.5768530357627184, "grad_norm": 0.17303217103507665, "learning_rate": 1.548383088936369e-05, "loss": 0.7379, "step": 1861 }, { "epoch": 0.5771630051532411, "grad_norm": 0.1795746243819627, "learning_rate": 1.5464663327579184e-05, "loss": 0.7406, "step": 1862 }, { "epoch": 0.5774729745437638, "grad_norm": 0.18186891346516626, "learning_rate": 1.5445500156268624e-05, "loss": 0.7161, "step": 1863 }, { "epoch": 0.5777829439342865, "grad_norm": 0.1818149269933372, "learning_rate": 1.5426341393983095e-05, "loss": 0.7004, "step": 1864 }, { "epoch": 0.5780929133248092, "grad_norm": 0.1777873733675098, "learning_rate": 1.5407187059269424e-05, "loss": 0.7294, "step": 1865 }, { "epoch": 0.5784028827153319, "grad_norm": 0.1830052052885661, "learning_rate": 1.538803717067012e-05, "loss": 0.7317, "step": 1866 }, { "epoch": 0.5787128521058545, "grad_norm": 0.20434333482297193, "learning_rate": 1.536889174672343e-05, "loss": 0.7244, "step": 1867 }, { "epoch": 0.5790228214963772, "grad_norm": 0.1711835545484821, "learning_rate": 1.534975080596323e-05, "loss": 0.7303, "step": 1868 }, { "epoch": 0.5793327908868999, "grad_norm": 0.29612996179693746, "learning_rate": 1.5330614366919098e-05, "loss": 0.7344, "step": 1869 }, { "epoch": 0.5796427602774226, "grad_norm": 0.1854857813974233, "learning_rate": 1.531148244811624e-05, "loss": 0.7104, "step": 1870 }, { "epoch": 0.5799527296679453, "grad_norm": 0.177907669166859, "learning_rate": 1.5292355068075475e-05, "loss": 0.6868, "step": 1871 }, { "epoch": 0.580262699058468, "grad_norm": 0.1755774295157113, "learning_rate": 1.5273232245313246e-05, "loss": 0.7382, "step": 1872 }, { "epoch": 0.5805726684489907, "grad_norm": 0.19051703528565367, "learning_rate": 1.5254113998341564e-05, "loss": 0.7074, "step": 1873 }, { "epoch": 0.5808826378395133, "grad_norm": 0.1808492891105276, "learning_rate": 1.523500034566803e-05, "loss": 0.7364, "step": 1874 }, { "epoch": 0.5811926072300361, "grad_norm": 0.17892157637935166, "learning_rate": 1.5215891305795795e-05, "loss": 0.7208, "step": 1875 }, { "epoch": 0.5815025766205587, "grad_norm": 0.17910665874641135, "learning_rate": 1.519678689722353e-05, "loss": 0.7081, "step": 1876 }, { "epoch": 0.5818125460110815, "grad_norm": 0.18171365660914723, "learning_rate": 1.517768713844543e-05, "loss": 0.7028, "step": 1877 }, { "epoch": 0.5821225154016041, "grad_norm": 0.21337538067100403, "learning_rate": 1.5158592047951202e-05, "loss": 0.7628, "step": 1878 }, { "epoch": 0.5824324847921267, "grad_norm": 0.191844377069721, "learning_rate": 1.5139501644226006e-05, "loss": 0.6937, "step": 1879 }, { "epoch": 0.5827424541826495, "grad_norm": 0.17872359996827256, "learning_rate": 1.51204159457505e-05, "loss": 0.7083, "step": 1880 }, { "epoch": 0.5830524235731721, "grad_norm": 0.1868263718003891, "learning_rate": 1.5101334971000748e-05, "loss": 0.7096, "step": 1881 }, { "epoch": 0.5833623929636949, "grad_norm": 0.17476970174758283, "learning_rate": 1.5082258738448277e-05, "loss": 0.6704, "step": 1882 }, { "epoch": 0.5836723623542175, "grad_norm": 0.1832466959520012, "learning_rate": 1.5063187266560003e-05, "loss": 0.7337, "step": 1883 }, { "epoch": 0.5839823317447402, "grad_norm": 0.17600453401147018, "learning_rate": 1.5044120573798233e-05, "loss": 0.6946, "step": 1884 }, { "epoch": 0.5842923011352629, "grad_norm": 0.17742787383581196, "learning_rate": 1.5025058678620662e-05, "loss": 0.7252, "step": 1885 }, { "epoch": 0.5846022705257856, "grad_norm": 0.20107719721612924, "learning_rate": 1.5006001599480317e-05, "loss": 0.7241, "step": 1886 }, { "epoch": 0.5849122399163083, "grad_norm": 0.17902385088077255, "learning_rate": 1.498694935482559e-05, "loss": 0.721, "step": 1887 }, { "epoch": 0.585222209306831, "grad_norm": 0.1677693729257124, "learning_rate": 1.496790196310018e-05, "loss": 0.7391, "step": 1888 }, { "epoch": 0.5855321786973536, "grad_norm": 0.18105556437647033, "learning_rate": 1.4948859442743073e-05, "loss": 0.698, "step": 1889 }, { "epoch": 0.5858421480878763, "grad_norm": 0.16153493682271766, "learning_rate": 1.4929821812188575e-05, "loss": 0.7236, "step": 1890 }, { "epoch": 0.586152117478399, "grad_norm": 0.17168777112271066, "learning_rate": 1.4910789089866224e-05, "loss": 0.7076, "step": 1891 }, { "epoch": 0.5864620868689217, "grad_norm": 0.17244291552924898, "learning_rate": 1.4891761294200818e-05, "loss": 0.7309, "step": 1892 }, { "epoch": 0.5867720562594444, "grad_norm": 0.17281808354741646, "learning_rate": 1.4872738443612403e-05, "loss": 0.7455, "step": 1893 }, { "epoch": 0.587082025649967, "grad_norm": 0.17449429926840518, "learning_rate": 1.48537205565162e-05, "loss": 0.7183, "step": 1894 }, { "epoch": 0.5873919950404898, "grad_norm": 0.1759009131298335, "learning_rate": 1.4834707651322666e-05, "loss": 0.6982, "step": 1895 }, { "epoch": 0.5877019644310124, "grad_norm": 0.17325481798072587, "learning_rate": 1.4815699746437414e-05, "loss": 0.7232, "step": 1896 }, { "epoch": 0.5880119338215352, "grad_norm": 0.25295996964420653, "learning_rate": 1.4796696860261214e-05, "loss": 0.7392, "step": 1897 }, { "epoch": 0.5883219032120578, "grad_norm": 0.1678078467240471, "learning_rate": 1.4777699011189993e-05, "loss": 0.7286, "step": 1898 }, { "epoch": 0.5886318726025805, "grad_norm": 0.16790308103366983, "learning_rate": 1.4758706217614773e-05, "loss": 0.6993, "step": 1899 }, { "epoch": 0.5889418419931032, "grad_norm": 0.17090334006759694, "learning_rate": 1.4739718497921713e-05, "loss": 0.7187, "step": 1900 }, { "epoch": 0.5892518113836258, "grad_norm": 0.285218697230415, "learning_rate": 1.4720735870492057e-05, "loss": 0.7351, "step": 1901 }, { "epoch": 0.5895617807741486, "grad_norm": 0.17792115838574182, "learning_rate": 1.4701758353702089e-05, "loss": 0.7113, "step": 1902 }, { "epoch": 0.5898717501646712, "grad_norm": 0.1814823239077753, "learning_rate": 1.468278596592319e-05, "loss": 0.7335, "step": 1903 }, { "epoch": 0.5901817195551939, "grad_norm": 0.1785394944476355, "learning_rate": 1.4663818725521735e-05, "loss": 0.7056, "step": 1904 }, { "epoch": 0.5904916889457166, "grad_norm": 0.17610368838324955, "learning_rate": 1.4644856650859133e-05, "loss": 0.7367, "step": 1905 }, { "epoch": 0.5908016583362393, "grad_norm": 0.18147346816609325, "learning_rate": 1.462589976029181e-05, "loss": 0.7254, "step": 1906 }, { "epoch": 0.591111627726762, "grad_norm": 0.1694905472434542, "learning_rate": 1.460694807217114e-05, "loss": 0.7145, "step": 1907 }, { "epoch": 0.5914215971172847, "grad_norm": 0.17667095214163694, "learning_rate": 1.4588001604843482e-05, "loss": 0.7356, "step": 1908 }, { "epoch": 0.5917315665078073, "grad_norm": 0.1814981504558404, "learning_rate": 1.4569060376650144e-05, "loss": 0.7402, "step": 1909 }, { "epoch": 0.5920415358983301, "grad_norm": 0.17485650011961612, "learning_rate": 1.4550124405927345e-05, "loss": 0.7431, "step": 1910 }, { "epoch": 0.5923515052888527, "grad_norm": 0.34317276106328126, "learning_rate": 1.4531193711006232e-05, "loss": 0.7363, "step": 1911 }, { "epoch": 0.5926614746793755, "grad_norm": 0.17118407680297024, "learning_rate": 1.4512268310212823e-05, "loss": 0.7137, "step": 1912 }, { "epoch": 0.5929714440698981, "grad_norm": 0.17588394902133966, "learning_rate": 1.4493348221868031e-05, "loss": 0.7341, "step": 1913 }, { "epoch": 0.5932814134604207, "grad_norm": 0.17847611798544918, "learning_rate": 1.4474433464287629e-05, "loss": 0.7018, "step": 1914 }, { "epoch": 0.5935913828509435, "grad_norm": 0.2843275597761347, "learning_rate": 1.4455524055782207e-05, "loss": 0.7314, "step": 1915 }, { "epoch": 0.5939013522414661, "grad_norm": 0.17817939823390155, "learning_rate": 1.4436620014657203e-05, "loss": 0.6834, "step": 1916 }, { "epoch": 0.5942113216319889, "grad_norm": 0.17262062953010612, "learning_rate": 1.4417721359212836e-05, "loss": 0.7376, "step": 1917 }, { "epoch": 0.5945212910225115, "grad_norm": 0.1738452838765574, "learning_rate": 1.4398828107744116e-05, "loss": 0.7123, "step": 1918 }, { "epoch": 0.5948312604130342, "grad_norm": 0.17122197103522957, "learning_rate": 1.4379940278540847e-05, "loss": 0.7216, "step": 1919 }, { "epoch": 0.5951412298035569, "grad_norm": 0.16792429738344195, "learning_rate": 1.4361057889887544e-05, "loss": 0.6972, "step": 1920 }, { "epoch": 0.5954511991940796, "grad_norm": 0.1728807264130131, "learning_rate": 1.4342180960063483e-05, "loss": 0.7089, "step": 1921 }, { "epoch": 0.5957611685846023, "grad_norm": 0.1702321262818614, "learning_rate": 1.4323309507342659e-05, "loss": 0.7292, "step": 1922 }, { "epoch": 0.596071137975125, "grad_norm": 0.1677635676591947, "learning_rate": 1.4304443549993737e-05, "loss": 0.7049, "step": 1923 }, { "epoch": 0.5963811073656476, "grad_norm": 0.1706133264198535, "learning_rate": 1.428558310628009e-05, "loss": 0.6972, "step": 1924 }, { "epoch": 0.5966910767561703, "grad_norm": 0.17361118628549754, "learning_rate": 1.4266728194459729e-05, "loss": 0.724, "step": 1925 }, { "epoch": 0.597001046146693, "grad_norm": 0.36217565530280676, "learning_rate": 1.4247878832785329e-05, "loss": 0.6758, "step": 1926 }, { "epoch": 0.5973110155372157, "grad_norm": 0.1700566689675756, "learning_rate": 1.4229035039504198e-05, "loss": 0.7163, "step": 1927 }, { "epoch": 0.5976209849277384, "grad_norm": 0.17597615395747143, "learning_rate": 1.4210196832858223e-05, "loss": 0.7167, "step": 1928 }, { "epoch": 0.597930954318261, "grad_norm": 0.18826775278018815, "learning_rate": 1.419136423108392e-05, "loss": 0.7259, "step": 1929 }, { "epoch": 0.5982409237087838, "grad_norm": 0.1768702988728839, "learning_rate": 1.4172537252412344e-05, "loss": 0.7098, "step": 1930 }, { "epoch": 0.5985508930993064, "grad_norm": 0.17844854249134218, "learning_rate": 1.4153715915069128e-05, "loss": 0.7248, "step": 1931 }, { "epoch": 0.5988608624898292, "grad_norm": 0.17297620045450737, "learning_rate": 1.413490023727445e-05, "loss": 0.7115, "step": 1932 }, { "epoch": 0.5991708318803518, "grad_norm": 0.19573304065241834, "learning_rate": 1.4116090237242976e-05, "loss": 0.7244, "step": 1933 }, { "epoch": 0.5994808012708746, "grad_norm": 0.23002842250701153, "learning_rate": 1.4097285933183915e-05, "loss": 0.7677, "step": 1934 }, { "epoch": 0.5997907706613972, "grad_norm": 0.18065945099374645, "learning_rate": 1.4078487343300948e-05, "loss": 0.6803, "step": 1935 }, { "epoch": 0.6001007400519198, "grad_norm": 0.3340366680174546, "learning_rate": 1.405969448579221e-05, "loss": 0.6983, "step": 1936 }, { "epoch": 0.6004107094424426, "grad_norm": 0.18399884360067692, "learning_rate": 1.40409073788503e-05, "loss": 0.7086, "step": 1937 }, { "epoch": 0.6007206788329652, "grad_norm": 0.18746608341727308, "learning_rate": 1.4022126040662252e-05, "loss": 0.7086, "step": 1938 }, { "epoch": 0.601030648223488, "grad_norm": 0.17574798059249058, "learning_rate": 1.4003350489409501e-05, "loss": 0.674, "step": 1939 }, { "epoch": 0.6013406176140106, "grad_norm": 0.19613650594414422, "learning_rate": 1.3984580743267908e-05, "loss": 0.7422, "step": 1940 }, { "epoch": 0.6016505870045333, "grad_norm": 0.17481713288883519, "learning_rate": 1.3965816820407681e-05, "loss": 0.7166, "step": 1941 }, { "epoch": 0.601960556395056, "grad_norm": 0.19306245084314852, "learning_rate": 1.3947058738993418e-05, "loss": 0.7168, "step": 1942 }, { "epoch": 0.6022705257855787, "grad_norm": 0.16803363076012134, "learning_rate": 1.3928306517184038e-05, "loss": 0.712, "step": 1943 }, { "epoch": 0.6025804951761013, "grad_norm": 0.17914182612841034, "learning_rate": 1.3909560173132806e-05, "loss": 0.6938, "step": 1944 }, { "epoch": 0.6028904645666241, "grad_norm": 0.20310395526085828, "learning_rate": 1.3890819724987298e-05, "loss": 0.7062, "step": 1945 }, { "epoch": 0.6032004339571467, "grad_norm": 0.17939315088401314, "learning_rate": 1.3872085190889362e-05, "loss": 0.7207, "step": 1946 }, { "epoch": 0.6035104033476694, "grad_norm": 0.18334678711861066, "learning_rate": 1.3853356588975149e-05, "loss": 0.7212, "step": 1947 }, { "epoch": 0.6038203727381921, "grad_norm": 0.1868777880242582, "learning_rate": 1.3834633937375035e-05, "loss": 0.7334, "step": 1948 }, { "epoch": 0.6041303421287147, "grad_norm": 0.17565798538942784, "learning_rate": 1.3815917254213669e-05, "loss": 0.7167, "step": 1949 }, { "epoch": 0.6044403115192375, "grad_norm": 0.18975787215868803, "learning_rate": 1.3797206557609899e-05, "loss": 0.7217, "step": 1950 }, { "epoch": 0.6047502809097601, "grad_norm": 0.17164595331473353, "learning_rate": 1.3778501865676788e-05, "loss": 0.6877, "step": 1951 }, { "epoch": 0.6050602503002829, "grad_norm": 0.18723674888260813, "learning_rate": 1.375980319652158e-05, "loss": 0.7318, "step": 1952 }, { "epoch": 0.6053702196908055, "grad_norm": 0.17405441992907172, "learning_rate": 1.3741110568245697e-05, "loss": 0.7004, "step": 1953 }, { "epoch": 0.6056801890813283, "grad_norm": 0.16966799492519602, "learning_rate": 1.37224239989447e-05, "loss": 0.6796, "step": 1954 }, { "epoch": 0.6059901584718509, "grad_norm": 0.17019617349109092, "learning_rate": 1.3703743506708307e-05, "loss": 0.7222, "step": 1955 }, { "epoch": 0.6063001278623736, "grad_norm": 0.1781228651817907, "learning_rate": 1.368506910962032e-05, "loss": 0.6949, "step": 1956 }, { "epoch": 0.6066100972528963, "grad_norm": 0.17687575152743448, "learning_rate": 1.366640082575867e-05, "loss": 0.7106, "step": 1957 }, { "epoch": 0.6069200666434189, "grad_norm": 0.18945935025271768, "learning_rate": 1.3647738673195363e-05, "loss": 0.7051, "step": 1958 }, { "epoch": 0.6072300360339417, "grad_norm": 0.1706463707423477, "learning_rate": 1.3629082669996451e-05, "loss": 0.7276, "step": 1959 }, { "epoch": 0.6075400054244643, "grad_norm": 0.20007429446223132, "learning_rate": 1.3610432834222067e-05, "loss": 0.7173, "step": 1960 }, { "epoch": 0.607849974814987, "grad_norm": 0.17103697022738928, "learning_rate": 1.3591789183926336e-05, "loss": 0.7079, "step": 1961 }, { "epoch": 0.6081599442055097, "grad_norm": 0.1924384054519446, "learning_rate": 1.3573151737157422e-05, "loss": 0.718, "step": 1962 }, { "epoch": 0.6084699135960324, "grad_norm": 0.1839799346908743, "learning_rate": 1.3554520511957476e-05, "loss": 0.7293, "step": 1963 }, { "epoch": 0.608779882986555, "grad_norm": 0.337377354724682, "learning_rate": 1.3535895526362626e-05, "loss": 0.7477, "step": 1964 }, { "epoch": 0.6090898523770778, "grad_norm": 0.17813982798327324, "learning_rate": 1.3517276798402947e-05, "loss": 0.7117, "step": 1965 }, { "epoch": 0.6093998217676004, "grad_norm": 0.19138291521673947, "learning_rate": 1.3498664346102492e-05, "loss": 0.7066, "step": 1966 }, { "epoch": 0.6097097911581232, "grad_norm": 0.18013830794288938, "learning_rate": 1.3480058187479188e-05, "loss": 0.7158, "step": 1967 }, { "epoch": 0.6100197605486458, "grad_norm": 0.17067153556446737, "learning_rate": 1.3461458340544917e-05, "loss": 0.7245, "step": 1968 }, { "epoch": 0.6103297299391685, "grad_norm": 0.1818899205948349, "learning_rate": 1.3442864823305418e-05, "loss": 0.6915, "step": 1969 }, { "epoch": 0.6106396993296912, "grad_norm": 0.16417331798153872, "learning_rate": 1.3424277653760318e-05, "loss": 0.7067, "step": 1970 }, { "epoch": 0.6109496687202138, "grad_norm": 0.1945579950597434, "learning_rate": 1.3405696849903098e-05, "loss": 0.7097, "step": 1971 }, { "epoch": 0.6112596381107366, "grad_norm": 0.16980108883198353, "learning_rate": 1.3387122429721069e-05, "loss": 0.716, "step": 1972 }, { "epoch": 0.6115696075012592, "grad_norm": 0.19760494695076902, "learning_rate": 1.3368554411195378e-05, "loss": 0.7151, "step": 1973 }, { "epoch": 0.611879576891782, "grad_norm": 0.17417044488354183, "learning_rate": 1.3349992812300946e-05, "loss": 0.7321, "step": 1974 }, { "epoch": 0.6121895462823046, "grad_norm": 0.18146320745182506, "learning_rate": 1.3331437651006513e-05, "loss": 0.7408, "step": 1975 }, { "epoch": 0.6124995156728273, "grad_norm": 0.1801308741825105, "learning_rate": 1.3312888945274576e-05, "loss": 0.7076, "step": 1976 }, { "epoch": 0.61280948506335, "grad_norm": 0.170213356448548, "learning_rate": 1.3294346713061366e-05, "loss": 0.6794, "step": 1977 }, { "epoch": 0.6131194544538727, "grad_norm": 0.18217630908573024, "learning_rate": 1.3275810972316859e-05, "loss": 0.7132, "step": 1978 }, { "epoch": 0.6134294238443954, "grad_norm": 0.16787453904063, "learning_rate": 1.325728174098476e-05, "loss": 0.7196, "step": 1979 }, { "epoch": 0.6137393932349181, "grad_norm": 0.1766939704750766, "learning_rate": 1.3238759037002445e-05, "loss": 0.7574, "step": 1980 }, { "epoch": 0.6140493626254407, "grad_norm": 0.18252512468280396, "learning_rate": 1.3220242878301002e-05, "loss": 0.7193, "step": 1981 }, { "epoch": 0.6143593320159634, "grad_norm": 0.18763751743218232, "learning_rate": 1.320173328280515e-05, "loss": 0.7205, "step": 1982 }, { "epoch": 0.6146693014064861, "grad_norm": 0.16582860175160558, "learning_rate": 1.3183230268433284e-05, "loss": 0.7211, "step": 1983 }, { "epoch": 0.6149792707970088, "grad_norm": 0.17462414373017038, "learning_rate": 1.3164733853097418e-05, "loss": 0.7025, "step": 1984 }, { "epoch": 0.6152892401875315, "grad_norm": 0.1813048991215406, "learning_rate": 1.3146244054703161e-05, "loss": 0.6979, "step": 1985 }, { "epoch": 0.6155992095780541, "grad_norm": 0.17258621027636453, "learning_rate": 1.3127760891149746e-05, "loss": 0.7225, "step": 1986 }, { "epoch": 0.6159091789685769, "grad_norm": 0.17025232013773955, "learning_rate": 1.310928438032995e-05, "loss": 0.7237, "step": 1987 }, { "epoch": 0.6162191483590995, "grad_norm": 0.17579773877800556, "learning_rate": 1.309081454013014e-05, "loss": 0.7126, "step": 1988 }, { "epoch": 0.6165291177496223, "grad_norm": 0.17398470979316216, "learning_rate": 1.307235138843022e-05, "loss": 0.7346, "step": 1989 }, { "epoch": 0.6168390871401449, "grad_norm": 0.177285599490572, "learning_rate": 1.3053894943103598e-05, "loss": 0.7224, "step": 1990 }, { "epoch": 0.6171490565306677, "grad_norm": 0.1798632587697535, "learning_rate": 1.3035445222017204e-05, "loss": 0.7083, "step": 1991 }, { "epoch": 0.6174590259211903, "grad_norm": 0.21801767465749108, "learning_rate": 1.301700224303147e-05, "loss": 0.6884, "step": 1992 }, { "epoch": 0.6177689953117129, "grad_norm": 0.191092168418143, "learning_rate": 1.2998566024000279e-05, "loss": 0.6885, "step": 1993 }, { "epoch": 0.6180789647022357, "grad_norm": 0.1699156123216027, "learning_rate": 1.298013658277099e-05, "loss": 0.7273, "step": 1994 }, { "epoch": 0.6183889340927583, "grad_norm": 0.17770054142317807, "learning_rate": 1.2961713937184377e-05, "loss": 0.7178, "step": 1995 }, { "epoch": 0.618698903483281, "grad_norm": 0.17532742120029557, "learning_rate": 1.2943298105074664e-05, "loss": 0.7005, "step": 1996 }, { "epoch": 0.6190088728738037, "grad_norm": 0.17671466019786553, "learning_rate": 1.2924889104269465e-05, "loss": 0.7431, "step": 1997 }, { "epoch": 0.6193188422643264, "grad_norm": 0.16310812243645184, "learning_rate": 1.2906486952589767e-05, "loss": 0.7132, "step": 1998 }, { "epoch": 0.6196288116548491, "grad_norm": 0.17331724863401404, "learning_rate": 1.2888091667849958e-05, "loss": 0.7013, "step": 1999 }, { "epoch": 0.6199387810453718, "grad_norm": 0.1691175546676313, "learning_rate": 1.286970326785774e-05, "loss": 0.6967, "step": 2000 }, { "epoch": 0.6202487504358944, "grad_norm": 0.17719341790772813, "learning_rate": 1.2851321770414185e-05, "loss": 0.7004, "step": 2001 }, { "epoch": 0.6205587198264172, "grad_norm": 0.17790988113448727, "learning_rate": 1.2832947193313677e-05, "loss": 0.7437, "step": 2002 }, { "epoch": 0.6208686892169398, "grad_norm": 0.16879641907904108, "learning_rate": 1.2814579554343874e-05, "loss": 0.7008, "step": 2003 }, { "epoch": 0.6211786586074625, "grad_norm": 0.1815271696401953, "learning_rate": 1.2796218871285742e-05, "loss": 0.7006, "step": 2004 }, { "epoch": 0.6214886279979852, "grad_norm": 0.16783827810606974, "learning_rate": 1.2777865161913518e-05, "loss": 0.6993, "step": 2005 }, { "epoch": 0.6217985973885078, "grad_norm": 0.17205722585453895, "learning_rate": 1.275951844399466e-05, "loss": 0.7172, "step": 2006 }, { "epoch": 0.6221085667790306, "grad_norm": 0.18030030586092968, "learning_rate": 1.2741178735289897e-05, "loss": 0.7231, "step": 2007 }, { "epoch": 0.6224185361695532, "grad_norm": 0.1635110386273002, "learning_rate": 1.2722846053553127e-05, "loss": 0.7331, "step": 2008 }, { "epoch": 0.622728505560076, "grad_norm": 0.17837215175612925, "learning_rate": 1.2704520416531487e-05, "loss": 0.7331, "step": 2009 }, { "epoch": 0.6230384749505986, "grad_norm": 0.18793571850413665, "learning_rate": 1.2686201841965273e-05, "loss": 0.7137, "step": 2010 }, { "epoch": 0.6233484443411214, "grad_norm": 0.1709532947049663, "learning_rate": 1.266789034758794e-05, "loss": 0.6673, "step": 2011 }, { "epoch": 0.623658413731644, "grad_norm": 0.1749242825586796, "learning_rate": 1.2649585951126106e-05, "loss": 0.7031, "step": 2012 }, { "epoch": 0.6239683831221667, "grad_norm": 0.18123364613587253, "learning_rate": 1.26312886702995e-05, "loss": 0.7187, "step": 2013 }, { "epoch": 0.6242783525126894, "grad_norm": 0.17723949872123937, "learning_rate": 1.2612998522820975e-05, "loss": 0.729, "step": 2014 }, { "epoch": 0.624588321903212, "grad_norm": 0.1889890606300318, "learning_rate": 1.2594715526396486e-05, "loss": 0.7054, "step": 2015 }, { "epoch": 0.6248982912937348, "grad_norm": 0.17611601963983634, "learning_rate": 1.2576439698725041e-05, "loss": 0.7316, "step": 2016 }, { "epoch": 0.6252082606842574, "grad_norm": 0.18886733798067923, "learning_rate": 1.2558171057498721e-05, "loss": 0.7115, "step": 2017 }, { "epoch": 0.6255182300747801, "grad_norm": 0.24289403364566933, "learning_rate": 1.2539909620402655e-05, "loss": 0.7308, "step": 2018 }, { "epoch": 0.6258281994653028, "grad_norm": 0.17567838594314059, "learning_rate": 1.252165540511499e-05, "loss": 0.7171, "step": 2019 }, { "epoch": 0.6261381688558255, "grad_norm": 0.17607248845863968, "learning_rate": 1.2503408429306894e-05, "loss": 0.7255, "step": 2020 }, { "epoch": 0.6264481382463482, "grad_norm": 0.17486112389609165, "learning_rate": 1.2485168710642506e-05, "loss": 0.6926, "step": 2021 }, { "epoch": 0.6267581076368709, "grad_norm": 0.1778667720440063, "learning_rate": 1.2466936266778959e-05, "loss": 0.7278, "step": 2022 }, { "epoch": 0.6270680770273935, "grad_norm": 0.1737634614326015, "learning_rate": 1.2448711115366336e-05, "loss": 0.733, "step": 2023 }, { "epoch": 0.6273780464179163, "grad_norm": 0.16286092294563562, "learning_rate": 1.243049327404766e-05, "loss": 0.7049, "step": 2024 }, { "epoch": 0.6276880158084389, "grad_norm": 0.1792066928406184, "learning_rate": 1.241228276045888e-05, "loss": 0.6996, "step": 2025 }, { "epoch": 0.6279979851989616, "grad_norm": 0.15550394913970192, "learning_rate": 1.239407959222884e-05, "loss": 0.7113, "step": 2026 }, { "epoch": 0.6283079545894843, "grad_norm": 0.17570761797379825, "learning_rate": 1.237588378697929e-05, "loss": 0.7463, "step": 2027 }, { "epoch": 0.6286179239800069, "grad_norm": 0.16520090627991565, "learning_rate": 1.2357695362324853e-05, "loss": 0.7166, "step": 2028 }, { "epoch": 0.6289278933705297, "grad_norm": 0.16358956732604724, "learning_rate": 1.2339514335872981e-05, "loss": 0.7218, "step": 2029 }, { "epoch": 0.6292378627610523, "grad_norm": 0.165273406078975, "learning_rate": 1.2321340725224e-05, "loss": 0.7023, "step": 2030 }, { "epoch": 0.6295478321515751, "grad_norm": 0.16661922379917876, "learning_rate": 1.2303174547971031e-05, "loss": 0.7026, "step": 2031 }, { "epoch": 0.6298578015420977, "grad_norm": 0.16981828595541326, "learning_rate": 1.2285015821699999e-05, "loss": 0.7025, "step": 2032 }, { "epoch": 0.6301677709326204, "grad_norm": 0.16757643204324243, "learning_rate": 1.2266864563989642e-05, "loss": 0.7288, "step": 2033 }, { "epoch": 0.6304777403231431, "grad_norm": 0.16485980930694208, "learning_rate": 1.2248720792411433e-05, "loss": 0.6944, "step": 2034 }, { "epoch": 0.6307877097136658, "grad_norm": 0.1702532905948909, "learning_rate": 1.2230584524529627e-05, "loss": 0.732, "step": 2035 }, { "epoch": 0.6310976791041885, "grad_norm": 0.16877594042855584, "learning_rate": 1.22124557779012e-05, "loss": 0.7043, "step": 2036 }, { "epoch": 0.6314076484947111, "grad_norm": 0.17034839689296363, "learning_rate": 1.2194334570075848e-05, "loss": 0.75, "step": 2037 }, { "epoch": 0.6317176178852338, "grad_norm": 0.1835103461269933, "learning_rate": 1.2176220918595978e-05, "loss": 0.7048, "step": 2038 }, { "epoch": 0.6320275872757565, "grad_norm": 0.16440921384478313, "learning_rate": 1.2158114840996661e-05, "loss": 0.6853, "step": 2039 }, { "epoch": 0.6323375566662792, "grad_norm": 0.17447821213324916, "learning_rate": 1.2140016354805662e-05, "loss": 0.7161, "step": 2040 }, { "epoch": 0.6326475260568019, "grad_norm": 0.1818709743333573, "learning_rate": 1.2121925477543388e-05, "loss": 0.7223, "step": 2041 }, { "epoch": 0.6329574954473246, "grad_norm": 0.1791101686685577, "learning_rate": 1.2103842226722864e-05, "loss": 0.707, "step": 2042 }, { "epoch": 0.6332674648378472, "grad_norm": 0.18227538914164526, "learning_rate": 1.2085766619849762e-05, "loss": 0.7193, "step": 2043 }, { "epoch": 0.63357743422837, "grad_norm": 0.16236820519981596, "learning_rate": 1.2067698674422326e-05, "loss": 0.7025, "step": 2044 }, { "epoch": 0.6338874036188926, "grad_norm": 0.18477762723892632, "learning_rate": 1.2049638407931394e-05, "loss": 0.7268, "step": 2045 }, { "epoch": 0.6341973730094154, "grad_norm": 0.17849663855989883, "learning_rate": 1.2031585837860384e-05, "loss": 0.7279, "step": 2046 }, { "epoch": 0.634507342399938, "grad_norm": 0.1702340343726146, "learning_rate": 1.2013540981685235e-05, "loss": 0.6921, "step": 2047 }, { "epoch": 0.6348173117904606, "grad_norm": 0.18566061267706144, "learning_rate": 1.199550385687444e-05, "loss": 0.6952, "step": 2048 }, { "epoch": 0.6351272811809834, "grad_norm": 0.1668734909843924, "learning_rate": 1.1977474480889013e-05, "loss": 0.7308, "step": 2049 }, { "epoch": 0.635437250571506, "grad_norm": 0.17755788389742294, "learning_rate": 1.195945287118244e-05, "loss": 0.6971, "step": 2050 }, { "epoch": 0.6357472199620288, "grad_norm": 0.1871242052848891, "learning_rate": 1.1941439045200718e-05, "loss": 0.7382, "step": 2051 }, { "epoch": 0.6360571893525514, "grad_norm": 0.17343046233376855, "learning_rate": 1.1923433020382278e-05, "loss": 0.7093, "step": 2052 }, { "epoch": 0.6363671587430741, "grad_norm": 0.1796991394787321, "learning_rate": 1.1905434814158026e-05, "loss": 0.7206, "step": 2053 }, { "epoch": 0.6366771281335968, "grad_norm": 0.177397238628382, "learning_rate": 1.1887444443951299e-05, "loss": 0.7266, "step": 2054 }, { "epoch": 0.6369870975241195, "grad_norm": 0.1923558142361078, "learning_rate": 1.186946192717782e-05, "loss": 0.7356, "step": 2055 }, { "epoch": 0.6372970669146422, "grad_norm": 0.17391776882398732, "learning_rate": 1.1851487281245744e-05, "loss": 0.7189, "step": 2056 }, { "epoch": 0.6376070363051649, "grad_norm": 0.17171110999354086, "learning_rate": 1.183352052355558e-05, "loss": 0.695, "step": 2057 }, { "epoch": 0.6379170056956875, "grad_norm": 0.19016078725118118, "learning_rate": 1.1815561671500213e-05, "loss": 0.7106, "step": 2058 }, { "epoch": 0.6382269750862103, "grad_norm": 0.16821254643757982, "learning_rate": 1.1797610742464883e-05, "loss": 0.7166, "step": 2059 }, { "epoch": 0.6385369444767329, "grad_norm": 0.18379510795436885, "learning_rate": 1.1779667753827134e-05, "loss": 0.7336, "step": 2060 }, { "epoch": 0.6388469138672556, "grad_norm": 0.1718647773999133, "learning_rate": 1.1761732722956846e-05, "loss": 0.7009, "step": 2061 }, { "epoch": 0.6391568832577783, "grad_norm": 0.17158526910129046, "learning_rate": 1.1743805667216202e-05, "loss": 0.7259, "step": 2062 }, { "epoch": 0.6394668526483009, "grad_norm": 0.17067168882741746, "learning_rate": 1.1725886603959635e-05, "loss": 0.7059, "step": 2063 }, { "epoch": 0.6397768220388237, "grad_norm": 0.1755891584894269, "learning_rate": 1.1707975550533867e-05, "loss": 0.7415, "step": 2064 }, { "epoch": 0.6400867914293463, "grad_norm": 0.17007771038536829, "learning_rate": 1.1690072524277845e-05, "loss": 0.7334, "step": 2065 }, { "epoch": 0.6403967608198691, "grad_norm": 0.35141497005936156, "learning_rate": 1.1672177542522763e-05, "loss": 0.7257, "step": 2066 }, { "epoch": 0.6407067302103917, "grad_norm": 0.1680113015225973, "learning_rate": 1.1654290622592032e-05, "loss": 0.6987, "step": 2067 }, { "epoch": 0.6410166996009145, "grad_norm": 0.21539151473422952, "learning_rate": 1.1636411781801225e-05, "loss": 0.7177, "step": 2068 }, { "epoch": 0.6413266689914371, "grad_norm": 0.16792396952893437, "learning_rate": 1.1618541037458143e-05, "loss": 0.7114, "step": 2069 }, { "epoch": 0.6416366383819598, "grad_norm": 0.17845213656761152, "learning_rate": 1.1600678406862692e-05, "loss": 0.6921, "step": 2070 }, { "epoch": 0.6419466077724825, "grad_norm": 0.1752399388699735, "learning_rate": 1.158282390730698e-05, "loss": 0.6993, "step": 2071 }, { "epoch": 0.6422565771630051, "grad_norm": 0.17513574747604319, "learning_rate": 1.1564977556075206e-05, "loss": 0.6935, "step": 2072 }, { "epoch": 0.6425665465535279, "grad_norm": 0.1712626025876583, "learning_rate": 1.1547139370443698e-05, "loss": 0.6867, "step": 2073 }, { "epoch": 0.6428765159440505, "grad_norm": 0.1845689259454556, "learning_rate": 1.1529309367680873e-05, "loss": 0.7088, "step": 2074 }, { "epoch": 0.6431864853345732, "grad_norm": 0.1770177830891676, "learning_rate": 1.1511487565047222e-05, "loss": 0.7101, "step": 2075 }, { "epoch": 0.6434964547250959, "grad_norm": 0.1753168546566528, "learning_rate": 1.1493673979795308e-05, "loss": 0.7091, "step": 2076 }, { "epoch": 0.6438064241156186, "grad_norm": 0.18797494134001844, "learning_rate": 1.1475868629169735e-05, "loss": 0.7283, "step": 2077 }, { "epoch": 0.6441163935061412, "grad_norm": 0.16927589922576536, "learning_rate": 1.1458071530407131e-05, "loss": 0.6998, "step": 2078 }, { "epoch": 0.644426362896664, "grad_norm": 0.1686112748746439, "learning_rate": 1.1440282700736144e-05, "loss": 0.7254, "step": 2079 }, { "epoch": 0.6447363322871866, "grad_norm": 0.17966121262607251, "learning_rate": 1.1422502157377409e-05, "loss": 0.7329, "step": 2080 }, { "epoch": 0.6450463016777094, "grad_norm": 0.1569286966940516, "learning_rate": 1.1404729917543546e-05, "loss": 0.6732, "step": 2081 }, { "epoch": 0.645356271068232, "grad_norm": 0.1829455670941984, "learning_rate": 1.138696599843913e-05, "loss": 0.7199, "step": 2082 }, { "epoch": 0.6456662404587546, "grad_norm": 0.15833811461235348, "learning_rate": 1.1369210417260685e-05, "loss": 0.7054, "step": 2083 }, { "epoch": 0.6459762098492774, "grad_norm": 0.17207601945727402, "learning_rate": 1.1351463191196661e-05, "loss": 0.697, "step": 2084 }, { "epoch": 0.6462861792398, "grad_norm": 0.4187917410149319, "learning_rate": 1.1333724337427437e-05, "loss": 0.7363, "step": 2085 }, { "epoch": 0.6465961486303228, "grad_norm": 0.16675873327977309, "learning_rate": 1.1315993873125252e-05, "loss": 0.7203, "step": 2086 }, { "epoch": 0.6469061180208454, "grad_norm": 0.16600110650843347, "learning_rate": 1.1298271815454252e-05, "loss": 0.7288, "step": 2087 }, { "epoch": 0.6472160874113682, "grad_norm": 0.16458904153076725, "learning_rate": 1.1280558181570438e-05, "loss": 0.7118, "step": 2088 }, { "epoch": 0.6475260568018908, "grad_norm": 0.16171358961633645, "learning_rate": 1.126285298862164e-05, "loss": 0.6877, "step": 2089 }, { "epoch": 0.6478360261924135, "grad_norm": 0.16783341941879343, "learning_rate": 1.1245156253747562e-05, "loss": 0.6918, "step": 2090 }, { "epoch": 0.6481459955829362, "grad_norm": 0.16026332282820144, "learning_rate": 1.1227467994079653e-05, "loss": 0.7047, "step": 2091 }, { "epoch": 0.6484559649734589, "grad_norm": 0.16788694579547325, "learning_rate": 1.1209788226741219e-05, "loss": 0.7268, "step": 2092 }, { "epoch": 0.6487659343639816, "grad_norm": 0.1640567394516124, "learning_rate": 1.1192116968847313e-05, "loss": 0.7071, "step": 2093 }, { "epoch": 0.6490759037545042, "grad_norm": 0.16369670647119186, "learning_rate": 1.1174454237504757e-05, "loss": 0.7162, "step": 2094 }, { "epoch": 0.6493858731450269, "grad_norm": 0.16212367811263864, "learning_rate": 1.1156800049812123e-05, "loss": 0.6978, "step": 2095 }, { "epoch": 0.6496958425355496, "grad_norm": 0.1713097834386035, "learning_rate": 1.113915442285969e-05, "loss": 0.7169, "step": 2096 }, { "epoch": 0.6500058119260723, "grad_norm": 0.16668438850075065, "learning_rate": 1.112151737372949e-05, "loss": 0.717, "step": 2097 }, { "epoch": 0.650315781316595, "grad_norm": 1.3947317500734489, "learning_rate": 1.1103888919495218e-05, "loss": 0.7212, "step": 2098 }, { "epoch": 0.6506257507071177, "grad_norm": 0.1624164000494737, "learning_rate": 1.108626907722226e-05, "loss": 0.6882, "step": 2099 }, { "epoch": 0.6509357200976403, "grad_norm": 0.16823788511542515, "learning_rate": 1.1068657863967669e-05, "loss": 0.7248, "step": 2100 }, { "epoch": 0.6512456894881631, "grad_norm": 0.16414642843958036, "learning_rate": 1.1051055296780135e-05, "loss": 0.7332, "step": 2101 }, { "epoch": 0.6515556588786857, "grad_norm": 0.17062018735454265, "learning_rate": 1.1033461392699987e-05, "loss": 0.7081, "step": 2102 }, { "epoch": 0.6518656282692085, "grad_norm": 0.16440421621622914, "learning_rate": 1.1015876168759163e-05, "loss": 0.6995, "step": 2103 }, { "epoch": 0.6521755976597311, "grad_norm": 0.16974197629370266, "learning_rate": 1.0998299641981199e-05, "loss": 0.7328, "step": 2104 }, { "epoch": 0.6524855670502537, "grad_norm": 0.17029137724800453, "learning_rate": 1.098073182938121e-05, "loss": 0.7232, "step": 2105 }, { "epoch": 0.6527955364407765, "grad_norm": 0.17597771846336335, "learning_rate": 1.0963172747965882e-05, "loss": 0.7584, "step": 2106 }, { "epoch": 0.6531055058312991, "grad_norm": 0.1625814865070022, "learning_rate": 1.0945622414733439e-05, "loss": 0.6859, "step": 2107 }, { "epoch": 0.6534154752218219, "grad_norm": 0.1738372990114597, "learning_rate": 1.0928080846673641e-05, "loss": 0.7454, "step": 2108 }, { "epoch": 0.6537254446123445, "grad_norm": 0.2321973115452579, "learning_rate": 1.0910548060767764e-05, "loss": 0.7343, "step": 2109 }, { "epoch": 0.6540354140028672, "grad_norm": 0.8213732502121437, "learning_rate": 1.0893024073988575e-05, "loss": 0.7177, "step": 2110 }, { "epoch": 0.6543453833933899, "grad_norm": 0.16531970867088852, "learning_rate": 1.087550890330035e-05, "loss": 0.6956, "step": 2111 }, { "epoch": 0.6546553527839126, "grad_norm": 0.16866690829563738, "learning_rate": 1.0858002565658779e-05, "loss": 0.7305, "step": 2112 }, { "epoch": 0.6549653221744353, "grad_norm": 0.17165749279668174, "learning_rate": 1.084050507801106e-05, "loss": 0.7328, "step": 2113 }, { "epoch": 0.655275291564958, "grad_norm": 0.17113016376478585, "learning_rate": 1.0823016457295775e-05, "loss": 0.6988, "step": 2114 }, { "epoch": 0.6555852609554806, "grad_norm": 0.17172014505796496, "learning_rate": 1.0805536720442942e-05, "loss": 0.6971, "step": 2115 }, { "epoch": 0.6558952303460033, "grad_norm": 0.4601203222206292, "learning_rate": 1.0788065884374e-05, "loss": 0.7219, "step": 2116 }, { "epoch": 0.656205199736526, "grad_norm": 0.17488866875777284, "learning_rate": 1.0770603966001725e-05, "loss": 0.719, "step": 2117 }, { "epoch": 0.6565151691270487, "grad_norm": 0.1783739242803969, "learning_rate": 1.0753150982230304e-05, "loss": 0.7206, "step": 2118 }, { "epoch": 0.6568251385175714, "grad_norm": 0.17687710769791987, "learning_rate": 1.0735706949955254e-05, "loss": 0.7531, "step": 2119 }, { "epoch": 0.657135107908094, "grad_norm": 0.1831150413779917, "learning_rate": 1.0718271886063425e-05, "loss": 0.7112, "step": 2120 }, { "epoch": 0.6574450772986168, "grad_norm": 0.17891508379676432, "learning_rate": 1.0700845807433002e-05, "loss": 0.7006, "step": 2121 }, { "epoch": 0.6577550466891394, "grad_norm": 0.2934696525918074, "learning_rate": 1.068342873093343e-05, "loss": 0.7296, "step": 2122 }, { "epoch": 0.6580650160796622, "grad_norm": 0.18685029560459743, "learning_rate": 1.0666020673425495e-05, "loss": 0.7354, "step": 2123 }, { "epoch": 0.6583749854701848, "grad_norm": 0.17213168722205968, "learning_rate": 1.0648621651761215e-05, "loss": 0.7217, "step": 2124 }, { "epoch": 0.6586849548607075, "grad_norm": 0.17857614569831134, "learning_rate": 1.0631231682783876e-05, "loss": 0.6925, "step": 2125 }, { "epoch": 0.6589949242512302, "grad_norm": 0.18031338613967707, "learning_rate": 1.0613850783327992e-05, "loss": 0.7172, "step": 2126 }, { "epoch": 0.6593048936417529, "grad_norm": 0.17066899761201546, "learning_rate": 1.0596478970219301e-05, "loss": 0.6892, "step": 2127 }, { "epoch": 0.6596148630322756, "grad_norm": 0.17035954406899678, "learning_rate": 1.0579116260274748e-05, "loss": 0.6891, "step": 2128 }, { "epoch": 0.6599248324227982, "grad_norm": 0.18067788335840973, "learning_rate": 1.0561762670302462e-05, "loss": 0.7096, "step": 2129 }, { "epoch": 0.660234801813321, "grad_norm": 0.178499303571393, "learning_rate": 1.054441821710174e-05, "loss": 0.7041, "step": 2130 }, { "epoch": 0.6605447712038436, "grad_norm": 0.22984778469889822, "learning_rate": 1.0527082917463041e-05, "loss": 0.7034, "step": 2131 }, { "epoch": 0.6608547405943663, "grad_norm": 0.18453904337236768, "learning_rate": 1.0509756788167956e-05, "loss": 0.6696, "step": 2132 }, { "epoch": 0.661164709984889, "grad_norm": 0.1945314040203912, "learning_rate": 1.0492439845989206e-05, "loss": 0.7062, "step": 2133 }, { "epoch": 0.6614746793754117, "grad_norm": 0.17219768354224518, "learning_rate": 1.0475132107690613e-05, "loss": 0.7631, "step": 2134 }, { "epoch": 0.6617846487659343, "grad_norm": 0.17978909960909037, "learning_rate": 1.0457833590027093e-05, "loss": 0.7332, "step": 2135 }, { "epoch": 0.6620946181564571, "grad_norm": 0.1723387107097079, "learning_rate": 1.0440544309744622e-05, "loss": 0.7245, "step": 2136 }, { "epoch": 0.6624045875469797, "grad_norm": 0.17146497183800707, "learning_rate": 1.042326428358027e-05, "loss": 0.7231, "step": 2137 }, { "epoch": 0.6627145569375025, "grad_norm": 0.19653462447211711, "learning_rate": 1.0405993528262095e-05, "loss": 0.6931, "step": 2138 }, { "epoch": 0.6630245263280251, "grad_norm": 0.169899119455615, "learning_rate": 1.0388732060509235e-05, "loss": 0.7293, "step": 2139 }, { "epoch": 0.6633344957185477, "grad_norm": 0.17264905607700848, "learning_rate": 1.037147989703179e-05, "loss": 0.7286, "step": 2140 }, { "epoch": 0.6636444651090705, "grad_norm": 0.17040768355853048, "learning_rate": 1.0354237054530876e-05, "loss": 0.6967, "step": 2141 }, { "epoch": 0.6639544344995931, "grad_norm": 0.16931464316422506, "learning_rate": 1.0337003549698603e-05, "loss": 0.7205, "step": 2142 }, { "epoch": 0.6642644038901159, "grad_norm": 0.1721058431712841, "learning_rate": 1.0319779399217995e-05, "loss": 0.7519, "step": 2143 }, { "epoch": 0.6645743732806385, "grad_norm": 0.1755237635950328, "learning_rate": 1.0302564619763078e-05, "loss": 0.6969, "step": 2144 }, { "epoch": 0.6648843426711613, "grad_norm": 0.16415784807000292, "learning_rate": 1.0285359227998743e-05, "loss": 0.7125, "step": 2145 }, { "epoch": 0.6651943120616839, "grad_norm": 0.1810757344119179, "learning_rate": 1.026816324058085e-05, "loss": 0.7476, "step": 2146 }, { "epoch": 0.6655042814522066, "grad_norm": 0.16480253025125355, "learning_rate": 1.0250976674156123e-05, "loss": 0.7166, "step": 2147 }, { "epoch": 0.6658142508427293, "grad_norm": 0.1698376437829576, "learning_rate": 1.0233799545362179e-05, "loss": 0.6921, "step": 2148 }, { "epoch": 0.666124220233252, "grad_norm": 0.16421330036855739, "learning_rate": 1.0216631870827486e-05, "loss": 0.7125, "step": 2149 }, { "epoch": 0.6664341896237747, "grad_norm": 0.15846548071286698, "learning_rate": 1.0199473667171373e-05, "loss": 0.7243, "step": 2150 }, { "epoch": 0.6667441590142973, "grad_norm": 0.1615894981765816, "learning_rate": 1.0182324951003992e-05, "loss": 0.7106, "step": 2151 }, { "epoch": 0.66705412840482, "grad_norm": 0.16339718414995863, "learning_rate": 1.0165185738926318e-05, "loss": 0.6928, "step": 2152 }, { "epoch": 0.6673640977953427, "grad_norm": 0.1642798710602855, "learning_rate": 1.0148056047530117e-05, "loss": 0.7223, "step": 2153 }, { "epoch": 0.6676740671858654, "grad_norm": 0.16141827999628586, "learning_rate": 1.0130935893397944e-05, "loss": 0.7066, "step": 2154 }, { "epoch": 0.667984036576388, "grad_norm": 0.16546315897475086, "learning_rate": 1.0113825293103122e-05, "loss": 0.7252, "step": 2155 }, { "epoch": 0.6682940059669108, "grad_norm": 0.16412218840692058, "learning_rate": 1.009672426320972e-05, "loss": 0.7236, "step": 2156 }, { "epoch": 0.6686039753574334, "grad_norm": 0.15344563592833815, "learning_rate": 1.0079632820272547e-05, "loss": 0.7186, "step": 2157 }, { "epoch": 0.6689139447479562, "grad_norm": 0.16494677182625994, "learning_rate": 1.0062550980837132e-05, "loss": 0.722, "step": 2158 }, { "epoch": 0.6692239141384788, "grad_norm": 0.1649006981663391, "learning_rate": 1.0045478761439704e-05, "loss": 0.7106, "step": 2159 }, { "epoch": 0.6695338835290016, "grad_norm": 0.16351557436763697, "learning_rate": 1.0028416178607179e-05, "loss": 0.7243, "step": 2160 }, { "epoch": 0.6698438529195242, "grad_norm": 0.15877332300985958, "learning_rate": 1.0011363248857147e-05, "loss": 0.6949, "step": 2161 }, { "epoch": 0.6701538223100468, "grad_norm": 0.16754507745706668, "learning_rate": 9.994319988697844e-06, "loss": 0.7284, "step": 2162 }, { "epoch": 0.6704637917005696, "grad_norm": 0.1633818576686753, "learning_rate": 9.977286414628178e-06, "loss": 0.6888, "step": 2163 }, { "epoch": 0.6707737610910922, "grad_norm": 0.17827744319472186, "learning_rate": 9.96026254313762e-06, "loss": 0.7304, "step": 2164 }, { "epoch": 0.671083730481615, "grad_norm": 0.16627242837592998, "learning_rate": 9.943248390706317e-06, "loss": 0.7088, "step": 2165 }, { "epoch": 0.6713936998721376, "grad_norm": 0.6259117236261188, "learning_rate": 9.926243973804946e-06, "loss": 0.6946, "step": 2166 }, { "epoch": 0.6717036692626603, "grad_norm": 0.16965255969996887, "learning_rate": 9.909249308894805e-06, "loss": 0.7162, "step": 2167 }, { "epoch": 0.672013638653183, "grad_norm": 0.1622884877706002, "learning_rate": 9.892264412427742e-06, "loss": 0.6933, "step": 2168 }, { "epoch": 0.6723236080437057, "grad_norm": 0.1633930679112389, "learning_rate": 9.87528930084611e-06, "loss": 0.7235, "step": 2169 }, { "epoch": 0.6726335774342284, "grad_norm": 0.1659444930818219, "learning_rate": 9.858323990582854e-06, "loss": 0.6984, "step": 2170 }, { "epoch": 0.6729435468247511, "grad_norm": 0.1686045041994269, "learning_rate": 9.841368498061368e-06, "loss": 0.7471, "step": 2171 }, { "epoch": 0.6732535162152737, "grad_norm": 0.16028831032318694, "learning_rate": 9.82442283969559e-06, "loss": 0.6964, "step": 2172 }, { "epoch": 0.6735634856057964, "grad_norm": 0.1653258698637878, "learning_rate": 9.807487031889913e-06, "loss": 0.7082, "step": 2173 }, { "epoch": 0.6738734549963191, "grad_norm": 0.17904292913282216, "learning_rate": 9.790561091039204e-06, "loss": 0.7159, "step": 2174 }, { "epoch": 0.6741834243868418, "grad_norm": 0.19251297891177818, "learning_rate": 9.773645033528766e-06, "loss": 0.7361, "step": 2175 }, { "epoch": 0.6744933937773645, "grad_norm": 0.16255924692139467, "learning_rate": 9.75673887573435e-06, "loss": 0.7243, "step": 2176 }, { "epoch": 0.6748033631678871, "grad_norm": 0.1625652885255522, "learning_rate": 9.739842634022112e-06, "loss": 0.708, "step": 2177 }, { "epoch": 0.6751133325584099, "grad_norm": 0.1692405503869704, "learning_rate": 9.722956324748613e-06, "loss": 0.7246, "step": 2178 }, { "epoch": 0.6754233019489325, "grad_norm": 0.17209513968165982, "learning_rate": 9.706079964260799e-06, "loss": 0.7265, "step": 2179 }, { "epoch": 0.6757332713394553, "grad_norm": 0.16099746862017872, "learning_rate": 9.689213568895983e-06, "loss": 0.6891, "step": 2180 }, { "epoch": 0.6760432407299779, "grad_norm": 0.1791722466874798, "learning_rate": 9.672357154981837e-06, "loss": 0.7043, "step": 2181 }, { "epoch": 0.6763532101205006, "grad_norm": 0.17314023846036494, "learning_rate": 9.65551073883636e-06, "loss": 0.6841, "step": 2182 }, { "epoch": 0.6766631795110233, "grad_norm": 0.16696291126029858, "learning_rate": 9.638674336767884e-06, "loss": 0.7641, "step": 2183 }, { "epoch": 0.6769731489015459, "grad_norm": 0.1620459667841101, "learning_rate": 9.621847965075035e-06, "loss": 0.6994, "step": 2184 }, { "epoch": 0.6772831182920687, "grad_norm": 0.18076375502485909, "learning_rate": 9.605031640046733e-06, "loss": 0.7364, "step": 2185 }, { "epoch": 0.6775930876825913, "grad_norm": 0.15649389251841225, "learning_rate": 9.588225377962197e-06, "loss": 0.6869, "step": 2186 }, { "epoch": 0.677903057073114, "grad_norm": 0.17458022924764452, "learning_rate": 9.571429195090855e-06, "loss": 0.7285, "step": 2187 }, { "epoch": 0.6782130264636367, "grad_norm": 0.16822122746476662, "learning_rate": 9.55464310769241e-06, "loss": 0.7199, "step": 2188 }, { "epoch": 0.6785229958541594, "grad_norm": 0.17812128384771114, "learning_rate": 9.537867132016807e-06, "loss": 0.7225, "step": 2189 }, { "epoch": 0.6788329652446821, "grad_norm": 0.16734202062295817, "learning_rate": 9.521101284304151e-06, "loss": 0.7111, "step": 2190 }, { "epoch": 0.6791429346352048, "grad_norm": 0.16707923770602068, "learning_rate": 9.504345580784804e-06, "loss": 0.7211, "step": 2191 }, { "epoch": 0.6794529040257274, "grad_norm": 0.18318648720511063, "learning_rate": 9.487600037679248e-06, "loss": 0.7199, "step": 2192 }, { "epoch": 0.6797628734162502, "grad_norm": 0.16642232563079587, "learning_rate": 9.470864671198178e-06, "loss": 0.7175, "step": 2193 }, { "epoch": 0.6800728428067728, "grad_norm": 0.17703023142740315, "learning_rate": 9.454139497542425e-06, "loss": 0.6982, "step": 2194 }, { "epoch": 0.6803828121972956, "grad_norm": 0.16754189432143282, "learning_rate": 9.437424532902916e-06, "loss": 0.6959, "step": 2195 }, { "epoch": 0.6806927815878182, "grad_norm": 0.1703245554049261, "learning_rate": 9.420719793460758e-06, "loss": 0.7463, "step": 2196 }, { "epoch": 0.6810027509783408, "grad_norm": 0.16296406934202776, "learning_rate": 9.404025295387096e-06, "loss": 0.6756, "step": 2197 }, { "epoch": 0.6813127203688636, "grad_norm": 0.1643000992725967, "learning_rate": 9.387341054843211e-06, "loss": 0.719, "step": 2198 }, { "epoch": 0.6816226897593862, "grad_norm": 0.1691597085732946, "learning_rate": 9.37066708798043e-06, "loss": 0.7325, "step": 2199 }, { "epoch": 0.681932659149909, "grad_norm": 0.16793554639583577, "learning_rate": 9.354003410940134e-06, "loss": 0.6854, "step": 2200 }, { "epoch": 0.6822426285404316, "grad_norm": 0.15938944673465952, "learning_rate": 9.337350039853759e-06, "loss": 0.6985, "step": 2201 }, { "epoch": 0.6825525979309544, "grad_norm": 0.16978636705342354, "learning_rate": 9.320706990842728e-06, "loss": 0.7328, "step": 2202 }, { "epoch": 0.682862567321477, "grad_norm": 0.1627067475844741, "learning_rate": 9.304074280018518e-06, "loss": 0.7152, "step": 2203 }, { "epoch": 0.6831725367119997, "grad_norm": 0.16650140239447883, "learning_rate": 9.287451923482571e-06, "loss": 0.695, "step": 2204 }, { "epoch": 0.6834825061025224, "grad_norm": 0.17208689896282797, "learning_rate": 9.270839937326308e-06, "loss": 0.6942, "step": 2205 }, { "epoch": 0.6837924754930451, "grad_norm": 0.1628449072251215, "learning_rate": 9.254238337631115e-06, "loss": 0.7014, "step": 2206 }, { "epoch": 0.6841024448835678, "grad_norm": 0.2295757313735982, "learning_rate": 9.23764714046832e-06, "loss": 0.7305, "step": 2207 }, { "epoch": 0.6844124142740904, "grad_norm": 0.16284355729180955, "learning_rate": 9.221066361899185e-06, "loss": 0.7029, "step": 2208 }, { "epoch": 0.6847223836646131, "grad_norm": 0.1664891031336953, "learning_rate": 9.204496017974882e-06, "loss": 0.7161, "step": 2209 }, { "epoch": 0.6850323530551358, "grad_norm": 0.1665164349353457, "learning_rate": 9.187936124736483e-06, "loss": 0.7391, "step": 2210 }, { "epoch": 0.6853423224456585, "grad_norm": 0.16763868519713207, "learning_rate": 9.171386698214937e-06, "loss": 0.7306, "step": 2211 }, { "epoch": 0.6856522918361811, "grad_norm": 0.17106542186736037, "learning_rate": 9.154847754431088e-06, "loss": 0.696, "step": 2212 }, { "epoch": 0.6859622612267039, "grad_norm": 0.1628636199214089, "learning_rate": 9.138319309395591e-06, "loss": 0.7105, "step": 2213 }, { "epoch": 0.6862722306172265, "grad_norm": 0.16360463716680018, "learning_rate": 9.121801379108963e-06, "loss": 0.7093, "step": 2214 }, { "epoch": 0.6865822000077493, "grad_norm": 0.16995964854709028, "learning_rate": 9.105293979561538e-06, "loss": 0.7312, "step": 2215 }, { "epoch": 0.6868921693982719, "grad_norm": 0.15821292408452803, "learning_rate": 9.088797126733446e-06, "loss": 0.7204, "step": 2216 }, { "epoch": 0.6872021387887947, "grad_norm": 0.15822299199894088, "learning_rate": 9.072310836594637e-06, "loss": 0.7052, "step": 2217 }, { "epoch": 0.6875121081793173, "grad_norm": 0.15921458841070918, "learning_rate": 9.055835125104789e-06, "loss": 0.728, "step": 2218 }, { "epoch": 0.6878220775698399, "grad_norm": 0.1602857736483488, "learning_rate": 9.039370008213382e-06, "loss": 0.7055, "step": 2219 }, { "epoch": 0.6881320469603627, "grad_norm": 0.16100680587045355, "learning_rate": 9.022915501859622e-06, "loss": 0.7148, "step": 2220 }, { "epoch": 0.6884420163508853, "grad_norm": 0.16060040887663546, "learning_rate": 9.006471621972426e-06, "loss": 0.6939, "step": 2221 }, { "epoch": 0.6887519857414081, "grad_norm": 0.15366728392966178, "learning_rate": 8.990038384470468e-06, "loss": 0.7089, "step": 2222 }, { "epoch": 0.6890619551319307, "grad_norm": 0.15783763515589225, "learning_rate": 8.973615805262064e-06, "loss": 0.7178, "step": 2223 }, { "epoch": 0.6893719245224534, "grad_norm": 0.15623419603982128, "learning_rate": 8.957203900245262e-06, "loss": 0.6876, "step": 2224 }, { "epoch": 0.6896818939129761, "grad_norm": 0.16037484087835868, "learning_rate": 8.940802685307752e-06, "loss": 0.7242, "step": 2225 }, { "epoch": 0.6899918633034988, "grad_norm": 0.25632058472830066, "learning_rate": 8.924412176326877e-06, "loss": 0.6992, "step": 2226 }, { "epoch": 0.6903018326940215, "grad_norm": 0.15888555882801747, "learning_rate": 8.908032389169619e-06, "loss": 0.7057, "step": 2227 }, { "epoch": 0.6906118020845442, "grad_norm": 0.16217327572102186, "learning_rate": 8.89166333969258e-06, "loss": 0.6887, "step": 2228 }, { "epoch": 0.6909217714750668, "grad_norm": 0.1593382182950899, "learning_rate": 8.875305043741969e-06, "loss": 0.7164, "step": 2229 }, { "epoch": 0.6912317408655895, "grad_norm": 0.15402413371698143, "learning_rate": 8.85895751715358e-06, "loss": 0.7075, "step": 2230 }, { "epoch": 0.6915417102561122, "grad_norm": 0.16310382170161747, "learning_rate": 8.842620775752789e-06, "loss": 0.7134, "step": 2231 }, { "epoch": 0.6918516796466349, "grad_norm": 0.16134954690261089, "learning_rate": 8.826294835354524e-06, "loss": 0.6862, "step": 2232 }, { "epoch": 0.6921616490371576, "grad_norm": 0.16272211056927044, "learning_rate": 8.80997971176326e-06, "loss": 0.7087, "step": 2233 }, { "epoch": 0.6924716184276802, "grad_norm": 0.16151030421669332, "learning_rate": 8.793675420773005e-06, "loss": 0.6867, "step": 2234 }, { "epoch": 0.692781587818203, "grad_norm": 0.16520266606328085, "learning_rate": 8.777381978167273e-06, "loss": 0.7178, "step": 2235 }, { "epoch": 0.6930915572087256, "grad_norm": 0.1589921954968959, "learning_rate": 8.761099399719077e-06, "loss": 0.7054, "step": 2236 }, { "epoch": 0.6934015265992484, "grad_norm": 0.17661394744007586, "learning_rate": 8.744827701190913e-06, "loss": 0.6956, "step": 2237 }, { "epoch": 0.693711495989771, "grad_norm": 0.1683295780655638, "learning_rate": 8.728566898334767e-06, "loss": 0.722, "step": 2238 }, { "epoch": 0.6940214653802937, "grad_norm": 0.16689455646458928, "learning_rate": 8.712317006892035e-06, "loss": 0.694, "step": 2239 }, { "epoch": 0.6943314347708164, "grad_norm": 0.16488142182200619, "learning_rate": 8.696078042593578e-06, "loss": 0.7051, "step": 2240 }, { "epoch": 0.694641404161339, "grad_norm": 0.1635399089135149, "learning_rate": 8.679850021159677e-06, "loss": 0.7046, "step": 2241 }, { "epoch": 0.6949513735518618, "grad_norm": 0.17330417599988976, "learning_rate": 8.663632958300005e-06, "loss": 0.7238, "step": 2242 }, { "epoch": 0.6952613429423844, "grad_norm": 0.17146740107487193, "learning_rate": 8.647426869713663e-06, "loss": 0.6907, "step": 2243 }, { "epoch": 0.6955713123329071, "grad_norm": 0.1664613825105276, "learning_rate": 8.631231771089069e-06, "loss": 0.7211, "step": 2244 }, { "epoch": 0.6958812817234298, "grad_norm": 0.16059536402835478, "learning_rate": 8.615047678104059e-06, "loss": 0.7075, "step": 2245 }, { "epoch": 0.6961912511139525, "grad_norm": 0.17292345164953277, "learning_rate": 8.598874606425785e-06, "loss": 0.7466, "step": 2246 }, { "epoch": 0.6965012205044752, "grad_norm": 0.15929372175690182, "learning_rate": 8.582712571710737e-06, "loss": 0.6949, "step": 2247 }, { "epoch": 0.6968111898949979, "grad_norm": 0.16511206155699362, "learning_rate": 8.566561589604727e-06, "loss": 0.6973, "step": 2248 }, { "epoch": 0.6971211592855205, "grad_norm": 0.17371538012084436, "learning_rate": 8.550421675742837e-06, "loss": 0.751, "step": 2249 }, { "epoch": 0.6974311286760433, "grad_norm": 0.16261989375594318, "learning_rate": 8.53429284574948e-06, "loss": 0.7244, "step": 2250 }, { "epoch": 0.6977410980665659, "grad_norm": 0.1567734690789464, "learning_rate": 8.51817511523831e-06, "loss": 0.7083, "step": 2251 }, { "epoch": 0.6980510674570886, "grad_norm": 0.16326489910838254, "learning_rate": 8.50206849981224e-06, "loss": 0.7197, "step": 2252 }, { "epoch": 0.6983610368476113, "grad_norm": 0.15803673092565274, "learning_rate": 8.485973015063428e-06, "loss": 0.6945, "step": 2253 }, { "epoch": 0.6986710062381339, "grad_norm": 0.15149937809988134, "learning_rate": 8.46988867657326e-06, "loss": 0.6832, "step": 2254 }, { "epoch": 0.6989809756286567, "grad_norm": 0.16017640614843678, "learning_rate": 8.453815499912314e-06, "loss": 0.7195, "step": 2255 }, { "epoch": 0.6992909450191793, "grad_norm": 0.1632426884437499, "learning_rate": 8.437753500640384e-06, "loss": 0.7215, "step": 2256 }, { "epoch": 0.6996009144097021, "grad_norm": 0.1629175450014321, "learning_rate": 8.421702694306435e-06, "loss": 0.7227, "step": 2257 }, { "epoch": 0.6999108838002247, "grad_norm": 0.16509508832541003, "learning_rate": 8.405663096448591e-06, "loss": 0.7101, "step": 2258 }, { "epoch": 0.7002208531907474, "grad_norm": 1.893824919806484, "learning_rate": 8.389634722594134e-06, "loss": 0.7077, "step": 2259 }, { "epoch": 0.7005308225812701, "grad_norm": 0.17291526877330357, "learning_rate": 8.373617588259475e-06, "loss": 0.7127, "step": 2260 }, { "epoch": 0.7008407919717928, "grad_norm": 0.16430943311216697, "learning_rate": 8.35761170895015e-06, "loss": 0.7236, "step": 2261 }, { "epoch": 0.7011507613623155, "grad_norm": 0.16194683782878325, "learning_rate": 8.341617100160792e-06, "loss": 0.7209, "step": 2262 }, { "epoch": 0.7014607307528381, "grad_norm": 0.15931202677703213, "learning_rate": 8.325633777375127e-06, "loss": 0.7126, "step": 2263 }, { "epoch": 0.7017707001433608, "grad_norm": 0.28579767183416643, "learning_rate": 8.30966175606597e-06, "loss": 0.7142, "step": 2264 }, { "epoch": 0.7020806695338835, "grad_norm": 0.16758318405091144, "learning_rate": 8.293701051695157e-06, "loss": 0.722, "step": 2265 }, { "epoch": 0.7023906389244062, "grad_norm": 0.1863056020962609, "learning_rate": 8.277751679713621e-06, "loss": 0.7265, "step": 2266 }, { "epoch": 0.7027006083149289, "grad_norm": 0.15697806788061167, "learning_rate": 8.261813655561277e-06, "loss": 0.7138, "step": 2267 }, { "epoch": 0.7030105777054516, "grad_norm": 0.16177564498170607, "learning_rate": 8.245886994667072e-06, "loss": 0.7328, "step": 2268 }, { "epoch": 0.7033205470959742, "grad_norm": 0.15924091723666972, "learning_rate": 8.229971712448976e-06, "loss": 0.7119, "step": 2269 }, { "epoch": 0.703630516486497, "grad_norm": 0.16007083126854954, "learning_rate": 8.214067824313899e-06, "loss": 0.7108, "step": 2270 }, { "epoch": 0.7039404858770196, "grad_norm": 0.162165621786686, "learning_rate": 8.198175345657766e-06, "loss": 0.6847, "step": 2271 }, { "epoch": 0.7042504552675424, "grad_norm": 0.200756863939925, "learning_rate": 8.182294291865412e-06, "loss": 0.7326, "step": 2272 }, { "epoch": 0.704560424658065, "grad_norm": 0.16590758139488898, "learning_rate": 8.166424678310658e-06, "loss": 0.7171, "step": 2273 }, { "epoch": 0.7048703940485878, "grad_norm": 0.15612352880354213, "learning_rate": 8.150566520356224e-06, "loss": 0.6818, "step": 2274 }, { "epoch": 0.7051803634391104, "grad_norm": 0.15820415113821476, "learning_rate": 8.134719833353727e-06, "loss": 0.6956, "step": 2275 }, { "epoch": 0.705490332829633, "grad_norm": 0.16740969266016648, "learning_rate": 8.118884632643712e-06, "loss": 0.7534, "step": 2276 }, { "epoch": 0.7058003022201558, "grad_norm": 0.20541231708113294, "learning_rate": 8.103060933555588e-06, "loss": 0.7296, "step": 2277 }, { "epoch": 0.7061102716106784, "grad_norm": 0.1587600421373133, "learning_rate": 8.087248751407628e-06, "loss": 0.7096, "step": 2278 }, { "epoch": 0.7064202410012012, "grad_norm": 0.1541546986193987, "learning_rate": 8.07144810150696e-06, "loss": 0.7294, "step": 2279 }, { "epoch": 0.7067302103917238, "grad_norm": 0.16091934272482256, "learning_rate": 8.055658999149548e-06, "loss": 0.6927, "step": 2280 }, { "epoch": 0.7070401797822465, "grad_norm": 0.2416517367538878, "learning_rate": 8.039881459620171e-06, "loss": 0.6886, "step": 2281 }, { "epoch": 0.7073501491727692, "grad_norm": 0.16267122899929107, "learning_rate": 8.024115498192426e-06, "loss": 0.7197, "step": 2282 }, { "epoch": 0.7076601185632919, "grad_norm": 0.16394897051590207, "learning_rate": 8.008361130128695e-06, "loss": 0.7049, "step": 2283 }, { "epoch": 0.7079700879538146, "grad_norm": 0.16692311062010734, "learning_rate": 7.992618370680132e-06, "loss": 0.7226, "step": 2284 }, { "epoch": 0.7082800573443373, "grad_norm": 0.1686564292278113, "learning_rate": 7.976887235086665e-06, "loss": 0.6913, "step": 2285 }, { "epoch": 0.7085900267348599, "grad_norm": 0.16519727755501776, "learning_rate": 7.961167738576959e-06, "loss": 0.7225, "step": 2286 }, { "epoch": 0.7088999961253826, "grad_norm": 0.15880354630554921, "learning_rate": 7.945459896368417e-06, "loss": 0.7281, "step": 2287 }, { "epoch": 0.7092099655159053, "grad_norm": 0.1630860621710222, "learning_rate": 7.929763723667156e-06, "loss": 0.704, "step": 2288 }, { "epoch": 0.709519934906428, "grad_norm": 0.15402990311000567, "learning_rate": 7.914079235667997e-06, "loss": 0.7022, "step": 2289 }, { "epoch": 0.7098299042969507, "grad_norm": 0.15919571220732107, "learning_rate": 7.89840644755447e-06, "loss": 0.735, "step": 2290 }, { "epoch": 0.7101398736874733, "grad_norm": 0.16860701796472025, "learning_rate": 7.882745374498731e-06, "loss": 0.757, "step": 2291 }, { "epoch": 0.7104498430779961, "grad_norm": 0.1647023288654531, "learning_rate": 7.867096031661655e-06, "loss": 0.7038, "step": 2292 }, { "epoch": 0.7107598124685187, "grad_norm": 0.15890229052664598, "learning_rate": 7.851458434192705e-06, "loss": 0.7037, "step": 2293 }, { "epoch": 0.7110697818590415, "grad_norm": 0.17004430174061583, "learning_rate": 7.835832597230005e-06, "loss": 0.7284, "step": 2294 }, { "epoch": 0.7113797512495641, "grad_norm": 0.15349723811628568, "learning_rate": 7.820218535900306e-06, "loss": 0.6893, "step": 2295 }, { "epoch": 0.7116897206400868, "grad_norm": 0.15878306788821622, "learning_rate": 7.804616265318914e-06, "loss": 0.6966, "step": 2296 }, { "epoch": 0.7119996900306095, "grad_norm": 0.18436966945759628, "learning_rate": 7.789025800589783e-06, "loss": 0.7276, "step": 2297 }, { "epoch": 0.7123096594211321, "grad_norm": 0.17443796691567062, "learning_rate": 7.773447156805368e-06, "loss": 0.6798, "step": 2298 }, { "epoch": 0.7126196288116549, "grad_norm": 0.16274576117741188, "learning_rate": 7.757880349046742e-06, "loss": 0.7225, "step": 2299 }, { "epoch": 0.7129295982021775, "grad_norm": 0.16772624287505142, "learning_rate": 7.74232539238349e-06, "loss": 0.7059, "step": 2300 }, { "epoch": 0.7132395675927002, "grad_norm": 0.15598877169190095, "learning_rate": 7.726782301873728e-06, "loss": 0.7119, "step": 2301 }, { "epoch": 0.7135495369832229, "grad_norm": 0.1562340367960554, "learning_rate": 7.711251092564084e-06, "loss": 0.7183, "step": 2302 }, { "epoch": 0.7138595063737456, "grad_norm": 0.16613917168209222, "learning_rate": 7.695731779489686e-06, "loss": 0.7384, "step": 2303 }, { "epoch": 0.7141694757642683, "grad_norm": 0.15929605637308977, "learning_rate": 7.68022437767415e-06, "loss": 0.7392, "step": 2304 }, { "epoch": 0.714479445154791, "grad_norm": 0.26122952284694406, "learning_rate": 7.664728902129557e-06, "loss": 0.7112, "step": 2305 }, { "epoch": 0.7147894145453136, "grad_norm": 0.16540587583763816, "learning_rate": 7.649245367856442e-06, "loss": 0.7438, "step": 2306 }, { "epoch": 0.7150993839358364, "grad_norm": 0.16177008476837174, "learning_rate": 7.633773789843779e-06, "loss": 0.7146, "step": 2307 }, { "epoch": 0.715409353326359, "grad_norm": 0.20299353145289686, "learning_rate": 7.6183141830689754e-06, "loss": 0.7312, "step": 2308 }, { "epoch": 0.7157193227168817, "grad_norm": 0.16390008852936203, "learning_rate": 7.6028665624978395e-06, "loss": 0.7111, "step": 2309 }, { "epoch": 0.7160292921074044, "grad_norm": 0.16193080356917758, "learning_rate": 7.587430943084582e-06, "loss": 0.6894, "step": 2310 }, { "epoch": 0.716339261497927, "grad_norm": 0.16989062455431786, "learning_rate": 7.572007339771796e-06, "loss": 0.7116, "step": 2311 }, { "epoch": 0.7166492308884498, "grad_norm": 0.15767776010421886, "learning_rate": 7.55659576749044e-06, "loss": 0.6797, "step": 2312 }, { "epoch": 0.7169592002789724, "grad_norm": 0.16627412142413653, "learning_rate": 7.541196241159827e-06, "loss": 0.6984, "step": 2313 }, { "epoch": 0.7172691696694952, "grad_norm": 0.3096879756431862, "learning_rate": 7.5258087756876105e-06, "loss": 0.6853, "step": 2314 }, { "epoch": 0.7175791390600178, "grad_norm": 0.16820204586091034, "learning_rate": 7.5104333859697574e-06, "loss": 0.7269, "step": 2315 }, { "epoch": 0.7178891084505405, "grad_norm": 0.17820874586882318, "learning_rate": 7.495070086890577e-06, "loss": 0.7278, "step": 2316 }, { "epoch": 0.7181990778410632, "grad_norm": 0.16310343642308725, "learning_rate": 7.479718893322618e-06, "loss": 0.699, "step": 2317 }, { "epoch": 0.7185090472315859, "grad_norm": 0.15934729867492342, "learning_rate": 7.464379820126777e-06, "loss": 0.6945, "step": 2318 }, { "epoch": 0.7188190166221086, "grad_norm": 0.16602266641239824, "learning_rate": 7.44905288215215e-06, "loss": 0.7306, "step": 2319 }, { "epoch": 0.7191289860126312, "grad_norm": 0.16573214279989254, "learning_rate": 7.433738094236145e-06, "loss": 0.7219, "step": 2320 }, { "epoch": 0.719438955403154, "grad_norm": 0.16577162492323302, "learning_rate": 7.418435471204377e-06, "loss": 0.6917, "step": 2321 }, { "epoch": 0.7197489247936766, "grad_norm": 0.19798826286176519, "learning_rate": 7.403145027870673e-06, "loss": 0.7185, "step": 2322 }, { "epoch": 0.7200588941841993, "grad_norm": 0.2733407822851321, "learning_rate": 7.387866779037112e-06, "loss": 0.7084, "step": 2323 }, { "epoch": 0.720368863574722, "grad_norm": 0.1619027066363392, "learning_rate": 7.3726007394939114e-06, "loss": 0.7332, "step": 2324 }, { "epoch": 0.7206788329652447, "grad_norm": 0.17026564094123992, "learning_rate": 7.3573469240195235e-06, "loss": 0.7059, "step": 2325 }, { "epoch": 0.7209888023557673, "grad_norm": 0.32681775507902344, "learning_rate": 7.342105347380537e-06, "loss": 0.7078, "step": 2326 }, { "epoch": 0.7212987717462901, "grad_norm": 0.15896995254484894, "learning_rate": 7.326876024331697e-06, "loss": 0.6894, "step": 2327 }, { "epoch": 0.7216087411368127, "grad_norm": 0.16877759410409895, "learning_rate": 7.311658969615896e-06, "loss": 0.7345, "step": 2328 }, { "epoch": 0.7219187105273355, "grad_norm": 0.15661368404632448, "learning_rate": 7.296454197964123e-06, "loss": 0.7046, "step": 2329 }, { "epoch": 0.7222286799178581, "grad_norm": 0.17041498865022414, "learning_rate": 7.281261724095512e-06, "loss": 0.711, "step": 2330 }, { "epoch": 0.7225386493083807, "grad_norm": 0.16025758764758707, "learning_rate": 7.266081562717271e-06, "loss": 0.6902, "step": 2331 }, { "epoch": 0.7228486186989035, "grad_norm": 0.18798268694492132, "learning_rate": 7.2509137285246956e-06, "loss": 0.6959, "step": 2332 }, { "epoch": 0.7231585880894261, "grad_norm": 0.16336624571248134, "learning_rate": 7.235758236201145e-06, "loss": 0.7165, "step": 2333 }, { "epoch": 0.7234685574799489, "grad_norm": 0.1628072874794801, "learning_rate": 7.2206151004180295e-06, "loss": 0.7029, "step": 2334 }, { "epoch": 0.7237785268704715, "grad_norm": 0.17305686082000907, "learning_rate": 7.2054843358348004e-06, "loss": 0.7245, "step": 2335 }, { "epoch": 0.7240884962609943, "grad_norm": 0.16275632029827272, "learning_rate": 7.1903659570989325e-06, "loss": 0.7032, "step": 2336 }, { "epoch": 0.7243984656515169, "grad_norm": 0.150786787581354, "learning_rate": 7.17525997884591e-06, "loss": 0.6862, "step": 2337 }, { "epoch": 0.7247084350420396, "grad_norm": 0.16118437304660094, "learning_rate": 7.160166415699206e-06, "loss": 0.702, "step": 2338 }, { "epoch": 0.7250184044325623, "grad_norm": 0.1590626882077515, "learning_rate": 7.145085282270301e-06, "loss": 0.6926, "step": 2339 }, { "epoch": 0.725328373823085, "grad_norm": 0.1622922646988146, "learning_rate": 7.130016593158606e-06, "loss": 0.7332, "step": 2340 }, { "epoch": 0.7256383432136077, "grad_norm": 0.1590657632670749, "learning_rate": 7.1149603629515065e-06, "loss": 0.708, "step": 2341 }, { "epoch": 0.7259483126041304, "grad_norm": 0.36774793808733186, "learning_rate": 7.099916606224322e-06, "loss": 0.7559, "step": 2342 }, { "epoch": 0.726258281994653, "grad_norm": 0.15548498765587518, "learning_rate": 7.0848853375402946e-06, "loss": 0.6873, "step": 2343 }, { "epoch": 0.7265682513851757, "grad_norm": 0.15879629820538918, "learning_rate": 7.069866571450601e-06, "loss": 0.7169, "step": 2344 }, { "epoch": 0.7268782207756984, "grad_norm": 0.1721510409287817, "learning_rate": 7.054860322494266e-06, "loss": 0.7269, "step": 2345 }, { "epoch": 0.727188190166221, "grad_norm": 0.1538873290485504, "learning_rate": 7.039866605198245e-06, "loss": 0.7259, "step": 2346 }, { "epoch": 0.7274981595567438, "grad_norm": 0.1579022564767543, "learning_rate": 7.024885434077344e-06, "loss": 0.6962, "step": 2347 }, { "epoch": 0.7278081289472664, "grad_norm": 0.15896467935181968, "learning_rate": 7.009916823634202e-06, "loss": 0.6864, "step": 2348 }, { "epoch": 0.7281180983377892, "grad_norm": 0.16618422070927277, "learning_rate": 6.994960788359344e-06, "loss": 0.7349, "step": 2349 }, { "epoch": 0.7284280677283118, "grad_norm": 0.15125770957344906, "learning_rate": 6.980017342731071e-06, "loss": 0.6901, "step": 2350 }, { "epoch": 0.7287380371188346, "grad_norm": 0.16573874536187339, "learning_rate": 6.965086501215541e-06, "loss": 0.71, "step": 2351 }, { "epoch": 0.7290480065093572, "grad_norm": 0.1572321058386756, "learning_rate": 6.950168278266683e-06, "loss": 0.6955, "step": 2352 }, { "epoch": 0.7293579758998799, "grad_norm": 0.16495520444207665, "learning_rate": 6.935262688326221e-06, "loss": 0.7191, "step": 2353 }, { "epoch": 0.7296679452904026, "grad_norm": 0.15302723546301336, "learning_rate": 6.920369745823645e-06, "loss": 0.6908, "step": 2354 }, { "epoch": 0.7299779146809252, "grad_norm": 0.1634625290069426, "learning_rate": 6.905489465176205e-06, "loss": 0.7128, "step": 2355 }, { "epoch": 0.730287884071448, "grad_norm": 0.17331083661117355, "learning_rate": 6.890621860788893e-06, "loss": 0.7109, "step": 2356 }, { "epoch": 0.7305978534619706, "grad_norm": 0.15146122424184363, "learning_rate": 6.875766947054425e-06, "loss": 0.7313, "step": 2357 }, { "epoch": 0.7309078228524933, "grad_norm": 0.1545351369830959, "learning_rate": 6.8609247383532386e-06, "loss": 0.6981, "step": 2358 }, { "epoch": 0.731217792243016, "grad_norm": 0.15383438566699578, "learning_rate": 6.8460952490534685e-06, "loss": 0.7256, "step": 2359 }, { "epoch": 0.7315277616335387, "grad_norm": 0.15521661908295922, "learning_rate": 6.831278493510935e-06, "loss": 0.7279, "step": 2360 }, { "epoch": 0.7318377310240614, "grad_norm": 0.15625289645161466, "learning_rate": 6.816474486069138e-06, "loss": 0.6981, "step": 2361 }, { "epoch": 0.7321477004145841, "grad_norm": 0.1564598751203104, "learning_rate": 6.801683241059225e-06, "loss": 0.699, "step": 2362 }, { "epoch": 0.7324576698051067, "grad_norm": 0.153682245506985, "learning_rate": 6.786904772800001e-06, "loss": 0.7065, "step": 2363 }, { "epoch": 0.7327676391956295, "grad_norm": 0.15572909673625418, "learning_rate": 6.7721390955978875e-06, "loss": 0.7128, "step": 2364 }, { "epoch": 0.7330776085861521, "grad_norm": 0.1616814019173995, "learning_rate": 6.757386223746951e-06, "loss": 0.7152, "step": 2365 }, { "epoch": 0.7333875779766748, "grad_norm": 0.1585211561970485, "learning_rate": 6.742646171528828e-06, "loss": 0.7177, "step": 2366 }, { "epoch": 0.7336975473671975, "grad_norm": 0.16261238346803783, "learning_rate": 6.727918953212762e-06, "loss": 0.7367, "step": 2367 }, { "epoch": 0.7340075167577201, "grad_norm": 0.16007070797634482, "learning_rate": 6.713204583055574e-06, "loss": 0.7444, "step": 2368 }, { "epoch": 0.7343174861482429, "grad_norm": 0.1491846883446134, "learning_rate": 6.698503075301639e-06, "loss": 0.6978, "step": 2369 }, { "epoch": 0.7346274555387655, "grad_norm": 0.15848196826907437, "learning_rate": 6.683814444182901e-06, "loss": 0.7196, "step": 2370 }, { "epoch": 0.7349374249292883, "grad_norm": 0.15546528876785393, "learning_rate": 6.669138703918798e-06, "loss": 0.7294, "step": 2371 }, { "epoch": 0.7352473943198109, "grad_norm": 0.1517435563227221, "learning_rate": 6.654475868716335e-06, "loss": 0.7024, "step": 2372 }, { "epoch": 0.7355573637103336, "grad_norm": 0.15118258715903973, "learning_rate": 6.639825952769994e-06, "loss": 0.6931, "step": 2373 }, { "epoch": 0.7358673331008563, "grad_norm": 0.15637224571461678, "learning_rate": 6.625188970261762e-06, "loss": 0.7282, "step": 2374 }, { "epoch": 0.736177302491379, "grad_norm": 0.15387405918281358, "learning_rate": 6.610564935361108e-06, "loss": 0.6946, "step": 2375 }, { "epoch": 0.7364872718819017, "grad_norm": 0.14997087649192947, "learning_rate": 6.595953862224944e-06, "loss": 0.6925, "step": 2376 }, { "epoch": 0.7367972412724243, "grad_norm": 0.15370875802806172, "learning_rate": 6.581355764997672e-06, "loss": 0.7159, "step": 2377 }, { "epoch": 0.737107210662947, "grad_norm": 0.15363204319531637, "learning_rate": 6.566770657811105e-06, "loss": 0.6952, "step": 2378 }, { "epoch": 0.7374171800534697, "grad_norm": 0.1521416849881024, "learning_rate": 6.55219855478449e-06, "loss": 0.6998, "step": 2379 }, { "epoch": 0.7377271494439924, "grad_norm": 0.1523460871719531, "learning_rate": 6.537639470024484e-06, "loss": 0.6981, "step": 2380 }, { "epoch": 0.7380371188345151, "grad_norm": 0.15478801897191882, "learning_rate": 6.5230934176251395e-06, "loss": 0.7303, "step": 2381 }, { "epoch": 0.7383470882250378, "grad_norm": 0.15973719383516147, "learning_rate": 6.508560411667897e-06, "loss": 0.7307, "step": 2382 }, { "epoch": 0.7386570576155604, "grad_norm": 0.30146307883418955, "learning_rate": 6.494040466221565e-06, "loss": 0.7055, "step": 2383 }, { "epoch": 0.7389670270060832, "grad_norm": 0.15262097067098737, "learning_rate": 6.479533595342307e-06, "loss": 0.7107, "step": 2384 }, { "epoch": 0.7392769963966058, "grad_norm": 0.15849838871716926, "learning_rate": 6.4650398130736345e-06, "loss": 0.7286, "step": 2385 }, { "epoch": 0.7395869657871286, "grad_norm": 0.15761351666241016, "learning_rate": 6.450559133446383e-06, "loss": 0.7161, "step": 2386 }, { "epoch": 0.7398969351776512, "grad_norm": 0.16373521942291597, "learning_rate": 6.436091570478709e-06, "loss": 0.7142, "step": 2387 }, { "epoch": 0.7402069045681738, "grad_norm": 0.15158523557783649, "learning_rate": 6.421637138176065e-06, "loss": 0.6737, "step": 2388 }, { "epoch": 0.7405168739586966, "grad_norm": 0.1536109800535527, "learning_rate": 6.407195850531196e-06, "loss": 0.7243, "step": 2389 }, { "epoch": 0.7408268433492192, "grad_norm": 0.15313623431248394, "learning_rate": 6.392767721524118e-06, "loss": 0.7004, "step": 2390 }, { "epoch": 0.741136812739742, "grad_norm": 0.1550477188167231, "learning_rate": 6.378352765122131e-06, "loss": 0.721, "step": 2391 }, { "epoch": 0.7414467821302646, "grad_norm": 0.15431705680642777, "learning_rate": 6.363950995279739e-06, "loss": 0.6908, "step": 2392 }, { "epoch": 0.7417567515207873, "grad_norm": 0.16876686375197636, "learning_rate": 6.349562425938733e-06, "loss": 0.7172, "step": 2393 }, { "epoch": 0.74206672091131, "grad_norm": 0.16189973188133816, "learning_rate": 6.335187071028078e-06, "loss": 0.6997, "step": 2394 }, { "epoch": 0.7423766903018327, "grad_norm": 0.17889873355162478, "learning_rate": 6.320824944463966e-06, "loss": 0.7013, "step": 2395 }, { "epoch": 0.7426866596923554, "grad_norm": 0.15811439181615491, "learning_rate": 6.306476060149804e-06, "loss": 0.7123, "step": 2396 }, { "epoch": 0.7429966290828781, "grad_norm": 0.16783779767358212, "learning_rate": 6.292140431976137e-06, "loss": 0.7195, "step": 2397 }, { "epoch": 0.7433065984734007, "grad_norm": 0.16169679552079658, "learning_rate": 6.277818073820725e-06, "loss": 0.7262, "step": 2398 }, { "epoch": 0.7436165678639234, "grad_norm": 0.1601339909379381, "learning_rate": 6.263508999548431e-06, "loss": 0.7454, "step": 2399 }, { "epoch": 0.7439265372544461, "grad_norm": 0.1611178500980144, "learning_rate": 6.249213223011303e-06, "loss": 0.7009, "step": 2400 }, { "epoch": 0.7442365066449688, "grad_norm": 0.157950987712499, "learning_rate": 6.234930758048499e-06, "loss": 0.6762, "step": 2401 }, { "epoch": 0.7445464760354915, "grad_norm": 0.15543042354364803, "learning_rate": 6.220661618486268e-06, "loss": 0.7067, "step": 2402 }, { "epoch": 0.7448564454260141, "grad_norm": 0.1614426078318928, "learning_rate": 6.206405818138004e-06, "loss": 0.696, "step": 2403 }, { "epoch": 0.7451664148165369, "grad_norm": 0.1664991008260914, "learning_rate": 6.1921633708041516e-06, "loss": 0.7457, "step": 2404 }, { "epoch": 0.7454763842070595, "grad_norm": 0.15526762726122637, "learning_rate": 6.177934290272247e-06, "loss": 0.7162, "step": 2405 }, { "epoch": 0.7457863535975823, "grad_norm": 0.15838033099131713, "learning_rate": 6.163718590316878e-06, "loss": 0.7106, "step": 2406 }, { "epoch": 0.7460963229881049, "grad_norm": 0.1623181733526835, "learning_rate": 6.149516284699686e-06, "loss": 0.7192, "step": 2407 }, { "epoch": 0.7464062923786277, "grad_norm": 0.15511429173795033, "learning_rate": 6.135327387169339e-06, "loss": 0.6941, "step": 2408 }, { "epoch": 0.7467162617691503, "grad_norm": 0.15579266608915923, "learning_rate": 6.121151911461527e-06, "loss": 0.6819, "step": 2409 }, { "epoch": 0.747026231159673, "grad_norm": 0.1555711540390522, "learning_rate": 6.106989871298951e-06, "loss": 0.7198, "step": 2410 }, { "epoch": 0.7473362005501957, "grad_norm": 0.16059158028300272, "learning_rate": 6.092841280391304e-06, "loss": 0.7337, "step": 2411 }, { "epoch": 0.7476461699407183, "grad_norm": 0.15816697179012337, "learning_rate": 6.0787061524352566e-06, "loss": 0.7196, "step": 2412 }, { "epoch": 0.747956139331241, "grad_norm": 0.15696840144989374, "learning_rate": 6.064584501114446e-06, "loss": 0.7042, "step": 2413 }, { "epoch": 0.7482661087217637, "grad_norm": 0.162664789597097, "learning_rate": 6.05047634009947e-06, "loss": 0.7369, "step": 2414 }, { "epoch": 0.7485760781122864, "grad_norm": 0.15765587816952162, "learning_rate": 6.036381683047858e-06, "loss": 0.7293, "step": 2415 }, { "epoch": 0.7488860475028091, "grad_norm": 0.15640675265621942, "learning_rate": 6.022300543604067e-06, "loss": 0.6888, "step": 2416 }, { "epoch": 0.7491960168933318, "grad_norm": 0.16888248936281744, "learning_rate": 6.00823293539949e-06, "loss": 0.7291, "step": 2417 }, { "epoch": 0.7495059862838545, "grad_norm": 0.15756767995311563, "learning_rate": 5.9941788720523786e-06, "loss": 0.695, "step": 2418 }, { "epoch": 0.7498159556743772, "grad_norm": 0.16331221166637075, "learning_rate": 5.980138367167922e-06, "loss": 0.6975, "step": 2419 }, { "epoch": 0.7501259250648998, "grad_norm": 0.1592642865752872, "learning_rate": 5.96611143433814e-06, "loss": 0.7233, "step": 2420 }, { "epoch": 0.7504358944554226, "grad_norm": 0.1732083907156217, "learning_rate": 5.952098087141933e-06, "loss": 0.7511, "step": 2421 }, { "epoch": 0.7507458638459452, "grad_norm": 0.16058948598367817, "learning_rate": 5.938098339145069e-06, "loss": 0.6926, "step": 2422 }, { "epoch": 0.7510558332364679, "grad_norm": 0.15120200565383488, "learning_rate": 5.9241122039001005e-06, "loss": 0.6948, "step": 2423 }, { "epoch": 0.7513658026269906, "grad_norm": 0.1631862157818821, "learning_rate": 5.910139694946466e-06, "loss": 0.7132, "step": 2424 }, { "epoch": 0.7516757720175132, "grad_norm": 0.1620092800725162, "learning_rate": 5.896180825810351e-06, "loss": 0.7236, "step": 2425 }, { "epoch": 0.751985741408036, "grad_norm": 0.16260237076278083, "learning_rate": 5.882235610004785e-06, "loss": 0.7109, "step": 2426 }, { "epoch": 0.7522957107985586, "grad_norm": 0.15421051991979404, "learning_rate": 5.868304061029562e-06, "loss": 0.7187, "step": 2427 }, { "epoch": 0.7526056801890814, "grad_norm": 0.1589871943286513, "learning_rate": 5.854386192371224e-06, "loss": 0.7076, "step": 2428 }, { "epoch": 0.752915649579604, "grad_norm": 0.15862032022712189, "learning_rate": 5.840482017503113e-06, "loss": 0.7164, "step": 2429 }, { "epoch": 0.7532256189701267, "grad_norm": 0.15739953762262138, "learning_rate": 5.826591549885281e-06, "loss": 0.7193, "step": 2430 }, { "epoch": 0.7535355883606494, "grad_norm": 0.15464515052653302, "learning_rate": 5.812714802964525e-06, "loss": 0.6723, "step": 2431 }, { "epoch": 0.7538455577511721, "grad_norm": 0.15554167604378688, "learning_rate": 5.798851790174354e-06, "loss": 0.7022, "step": 2432 }, { "epoch": 0.7541555271416948, "grad_norm": 0.16006819699798672, "learning_rate": 5.785002524934986e-06, "loss": 0.7194, "step": 2433 }, { "epoch": 0.7544654965322174, "grad_norm": 0.21350297781498018, "learning_rate": 5.771167020653325e-06, "loss": 0.6813, "step": 2434 }, { "epoch": 0.7547754659227401, "grad_norm": 0.15897233195518684, "learning_rate": 5.757345290722962e-06, "loss": 0.7053, "step": 2435 }, { "epoch": 0.7550854353132628, "grad_norm": 0.14944555647871618, "learning_rate": 5.7435373485241445e-06, "loss": 0.6927, "step": 2436 }, { "epoch": 0.7553954047037855, "grad_norm": 0.1561123990456676, "learning_rate": 5.729743207423777e-06, "loss": 0.694, "step": 2437 }, { "epoch": 0.7557053740943082, "grad_norm": 0.15449555708120377, "learning_rate": 5.715962880775405e-06, "loss": 0.6885, "step": 2438 }, { "epoch": 0.7560153434848309, "grad_norm": 0.15566533100650679, "learning_rate": 5.702196381919198e-06, "loss": 0.7293, "step": 2439 }, { "epoch": 0.7563253128753535, "grad_norm": 0.1574761685165961, "learning_rate": 5.68844372418194e-06, "loss": 0.6934, "step": 2440 }, { "epoch": 0.7566352822658763, "grad_norm": 0.15655359655835033, "learning_rate": 5.674704920877016e-06, "loss": 0.7052, "step": 2441 }, { "epoch": 0.7569452516563989, "grad_norm": 0.1976044887991165, "learning_rate": 5.660979985304398e-06, "loss": 0.7012, "step": 2442 }, { "epoch": 0.7572552210469217, "grad_norm": 0.14949797388003497, "learning_rate": 5.647268930750649e-06, "loss": 0.7034, "step": 2443 }, { "epoch": 0.7575651904374443, "grad_norm": 0.16086489210381874, "learning_rate": 5.6335717704888545e-06, "loss": 0.7495, "step": 2444 }, { "epoch": 0.7578751598279669, "grad_norm": 0.1620028405987147, "learning_rate": 5.619888517778704e-06, "loss": 0.6945, "step": 2445 }, { "epoch": 0.7581851292184897, "grad_norm": 0.1588083304309638, "learning_rate": 5.606219185866373e-06, "loss": 0.6969, "step": 2446 }, { "epoch": 0.7584950986090123, "grad_norm": 0.14971574730406026, "learning_rate": 5.592563787984584e-06, "loss": 0.7051, "step": 2447 }, { "epoch": 0.7588050679995351, "grad_norm": 0.15620816077244465, "learning_rate": 5.578922337352588e-06, "loss": 0.6758, "step": 2448 }, { "epoch": 0.7591150373900577, "grad_norm": 0.16187572676632114, "learning_rate": 5.56529484717609e-06, "loss": 0.7245, "step": 2449 }, { "epoch": 0.7594250067805804, "grad_norm": 0.15192598418239736, "learning_rate": 5.551681330647332e-06, "loss": 0.7011, "step": 2450 }, { "epoch": 0.7597349761711031, "grad_norm": 0.1542629085747123, "learning_rate": 5.538081800944977e-06, "loss": 0.6796, "step": 2451 }, { "epoch": 0.7600449455616258, "grad_norm": 0.15173306528839103, "learning_rate": 5.52449627123419e-06, "loss": 0.7216, "step": 2452 }, { "epoch": 0.7603549149521485, "grad_norm": 0.1565620382808827, "learning_rate": 5.510924754666565e-06, "loss": 0.6984, "step": 2453 }, { "epoch": 0.7606648843426712, "grad_norm": 0.15981717141322738, "learning_rate": 5.497367264380129e-06, "loss": 0.6997, "step": 2454 }, { "epoch": 0.7609748537331938, "grad_norm": 0.15093907588738695, "learning_rate": 5.48382381349934e-06, "loss": 0.7004, "step": 2455 }, { "epoch": 0.7612848231237165, "grad_norm": 0.1544492843218365, "learning_rate": 5.4702944151350405e-06, "loss": 0.7375, "step": 2456 }, { "epoch": 0.7615947925142392, "grad_norm": 0.15645129378809172, "learning_rate": 5.456779082384509e-06, "loss": 0.7061, "step": 2457 }, { "epoch": 0.7619047619047619, "grad_norm": 0.2862652998037389, "learning_rate": 5.443277828331377e-06, "loss": 0.7042, "step": 2458 }, { "epoch": 0.7622147312952846, "grad_norm": 0.15139135153561561, "learning_rate": 5.4297906660456575e-06, "loss": 0.7079, "step": 2459 }, { "epoch": 0.7625247006858072, "grad_norm": 0.1515873680797615, "learning_rate": 5.416317608583721e-06, "loss": 0.7057, "step": 2460 }, { "epoch": 0.76283467007633, "grad_norm": 0.16735022341606676, "learning_rate": 5.4028586689882824e-06, "loss": 0.7189, "step": 2461 }, { "epoch": 0.7631446394668526, "grad_norm": 0.15754154269687312, "learning_rate": 5.389413860288393e-06, "loss": 0.6896, "step": 2462 }, { "epoch": 0.7634546088573754, "grad_norm": 0.15809919829979782, "learning_rate": 5.375983195499419e-06, "loss": 0.7347, "step": 2463 }, { "epoch": 0.763764578247898, "grad_norm": 0.1514560290995207, "learning_rate": 5.362566687623041e-06, "loss": 0.6819, "step": 2464 }, { "epoch": 0.7640745476384208, "grad_norm": 0.15295679311602117, "learning_rate": 5.349164349647223e-06, "loss": 0.6935, "step": 2465 }, { "epoch": 0.7643845170289434, "grad_norm": 0.15626531856031442, "learning_rate": 5.3357761945462385e-06, "loss": 0.698, "step": 2466 }, { "epoch": 0.764694486419466, "grad_norm": 0.15572876818241432, "learning_rate": 5.322402235280597e-06, "loss": 0.7148, "step": 2467 }, { "epoch": 0.7650044558099888, "grad_norm": 0.15447251263492665, "learning_rate": 5.309042484797087e-06, "loss": 0.7197, "step": 2468 }, { "epoch": 0.7653144252005114, "grad_norm": 0.15077775532340973, "learning_rate": 5.295696956028735e-06, "loss": 0.7031, "step": 2469 }, { "epoch": 0.7656243945910342, "grad_norm": 0.16129085543873825, "learning_rate": 5.2823656618947974e-06, "loss": 0.7228, "step": 2470 }, { "epoch": 0.7659343639815568, "grad_norm": 0.15635176573695767, "learning_rate": 5.269048615300769e-06, "loss": 0.6872, "step": 2471 }, { "epoch": 0.7662443333720795, "grad_norm": 0.15380372610758677, "learning_rate": 5.255745829138319e-06, "loss": 0.6944, "step": 2472 }, { "epoch": 0.7665543027626022, "grad_norm": 0.15242872004447416, "learning_rate": 5.2424573162853476e-06, "loss": 0.6893, "step": 2473 }, { "epoch": 0.7668642721531249, "grad_norm": 0.15529730383506357, "learning_rate": 5.229183089605918e-06, "loss": 0.695, "step": 2474 }, { "epoch": 0.7671742415436476, "grad_norm": 0.16515344733915976, "learning_rate": 5.215923161950249e-06, "loss": 0.7114, "step": 2475 }, { "epoch": 0.7674842109341703, "grad_norm": 0.16385641692810998, "learning_rate": 5.202677546154762e-06, "loss": 0.7423, "step": 2476 }, { "epoch": 0.7677941803246929, "grad_norm": 0.1518366039774961, "learning_rate": 5.189446255041966e-06, "loss": 0.6922, "step": 2477 }, { "epoch": 0.7681041497152156, "grad_norm": 0.1542697415817334, "learning_rate": 5.176229301420554e-06, "loss": 0.6885, "step": 2478 }, { "epoch": 0.7684141191057383, "grad_norm": 0.16043347794023086, "learning_rate": 5.163026698085314e-06, "loss": 0.7137, "step": 2479 }, { "epoch": 0.768724088496261, "grad_norm": 0.15891685390696286, "learning_rate": 5.149838457817142e-06, "loss": 0.718, "step": 2480 }, { "epoch": 0.7690340578867837, "grad_norm": 0.1581078657066591, "learning_rate": 5.136664593383045e-06, "loss": 0.7162, "step": 2481 }, { "epoch": 0.7693440272773063, "grad_norm": 0.15521967878929677, "learning_rate": 5.12350511753608e-06, "loss": 0.7211, "step": 2482 }, { "epoch": 0.7696539966678291, "grad_norm": 0.15553927917252738, "learning_rate": 5.110360043015416e-06, "loss": 0.7364, "step": 2483 }, { "epoch": 0.7699639660583517, "grad_norm": 0.15974120895669522, "learning_rate": 5.09722938254626e-06, "loss": 0.7503, "step": 2484 }, { "epoch": 0.7702739354488745, "grad_norm": 0.1592513985017166, "learning_rate": 5.0841131488398645e-06, "loss": 0.7154, "step": 2485 }, { "epoch": 0.7705839048393971, "grad_norm": 0.1548263771118547, "learning_rate": 5.071011354593518e-06, "loss": 0.6936, "step": 2486 }, { "epoch": 0.7708938742299198, "grad_norm": 0.15461385877180944, "learning_rate": 5.057924012490532e-06, "loss": 0.676, "step": 2487 }, { "epoch": 0.7712038436204425, "grad_norm": 0.1547661983949544, "learning_rate": 5.04485113520023e-06, "loss": 0.7024, "step": 2488 }, { "epoch": 0.7715138130109652, "grad_norm": 0.15012379528461756, "learning_rate": 5.031792735377927e-06, "loss": 0.6894, "step": 2489 }, { "epoch": 0.7718237824014879, "grad_norm": 0.15228512999933888, "learning_rate": 5.018748825664925e-06, "loss": 0.7241, "step": 2490 }, { "epoch": 0.7721337517920105, "grad_norm": 0.154186978081359, "learning_rate": 5.005719418688497e-06, "loss": 0.7016, "step": 2491 }, { "epoch": 0.7724437211825332, "grad_norm": 0.1566103213931097, "learning_rate": 4.992704527061896e-06, "loss": 0.7075, "step": 2492 }, { "epoch": 0.7727536905730559, "grad_norm": 0.16285610514705123, "learning_rate": 4.979704163384284e-06, "loss": 0.7158, "step": 2493 }, { "epoch": 0.7730636599635786, "grad_norm": 0.15719878701375575, "learning_rate": 4.966718340240795e-06, "loss": 0.7419, "step": 2494 }, { "epoch": 0.7733736293541013, "grad_norm": 0.15273487151989082, "learning_rate": 4.953747070202468e-06, "loss": 0.705, "step": 2495 }, { "epoch": 0.773683598744624, "grad_norm": 0.15188024727130428, "learning_rate": 4.940790365826253e-06, "loss": 0.6818, "step": 2496 }, { "epoch": 0.7739935681351466, "grad_norm": 0.14939966965717438, "learning_rate": 4.9278482396550265e-06, "loss": 0.6874, "step": 2497 }, { "epoch": 0.7743035375256694, "grad_norm": 0.1531278001591817, "learning_rate": 4.914920704217505e-06, "loss": 0.7341, "step": 2498 }, { "epoch": 0.774613506916192, "grad_norm": 0.15913951872163065, "learning_rate": 4.902007772028325e-06, "loss": 0.7449, "step": 2499 }, { "epoch": 0.7749234763067148, "grad_norm": 0.16276669723148554, "learning_rate": 4.88910945558797e-06, "loss": 0.7206, "step": 2500 }, { "epoch": 0.7752334456972374, "grad_norm": 0.15473456101159394, "learning_rate": 4.876225767382754e-06, "loss": 0.7339, "step": 2501 }, { "epoch": 0.77554341508776, "grad_norm": 0.14956846929611942, "learning_rate": 4.863356719884871e-06, "loss": 0.7095, "step": 2502 }, { "epoch": 0.7758533844782828, "grad_norm": 0.15652159893482767, "learning_rate": 4.850502325552298e-06, "loss": 0.6832, "step": 2503 }, { "epoch": 0.7761633538688054, "grad_norm": 0.15966386005021638, "learning_rate": 4.837662596828865e-06, "loss": 0.7315, "step": 2504 }, { "epoch": 0.7764733232593282, "grad_norm": 0.15364120575902843, "learning_rate": 4.824837546144183e-06, "loss": 0.7212, "step": 2505 }, { "epoch": 0.7767832926498508, "grad_norm": 0.1522117857253803, "learning_rate": 4.812027185913657e-06, "loss": 0.71, "step": 2506 }, { "epoch": 0.7770932620403735, "grad_norm": 0.16342088088656595, "learning_rate": 4.799231528538475e-06, "loss": 0.7202, "step": 2507 }, { "epoch": 0.7774032314308962, "grad_norm": 0.2156592360481282, "learning_rate": 4.786450586405589e-06, "loss": 0.7148, "step": 2508 }, { "epoch": 0.7777132008214189, "grad_norm": 0.15503521868092582, "learning_rate": 4.773684371887706e-06, "loss": 0.6953, "step": 2509 }, { "epoch": 0.7780231702119416, "grad_norm": 0.15014616842116707, "learning_rate": 4.760932897343278e-06, "loss": 0.6829, "step": 2510 }, { "epoch": 0.7783331396024643, "grad_norm": 0.16266600900560305, "learning_rate": 4.748196175116484e-06, "loss": 0.6981, "step": 2511 }, { "epoch": 0.7786431089929869, "grad_norm": 0.1561672083796911, "learning_rate": 4.735474217537224e-06, "loss": 0.7068, "step": 2512 }, { "epoch": 0.7789530783835096, "grad_norm": 0.15713032123232415, "learning_rate": 4.722767036921105e-06, "loss": 0.7246, "step": 2513 }, { "epoch": 0.7792630477740323, "grad_norm": 0.15849043978200167, "learning_rate": 4.710074645569429e-06, "loss": 0.7154, "step": 2514 }, { "epoch": 0.779573017164555, "grad_norm": 0.15400877492265558, "learning_rate": 4.6973970557691814e-06, "loss": 0.7004, "step": 2515 }, { "epoch": 0.7798829865550777, "grad_norm": 0.1509305140609206, "learning_rate": 4.68473427979302e-06, "loss": 0.7088, "step": 2516 }, { "epoch": 0.7801929559456003, "grad_norm": 0.15727839009633315, "learning_rate": 4.67208632989925e-06, "loss": 0.6956, "step": 2517 }, { "epoch": 0.7805029253361231, "grad_norm": 0.16280651282027506, "learning_rate": 4.659453218331858e-06, "loss": 0.7082, "step": 2518 }, { "epoch": 0.7808128947266457, "grad_norm": 0.15290420971021448, "learning_rate": 4.646834957320419e-06, "loss": 0.6748, "step": 2519 }, { "epoch": 0.7811228641171685, "grad_norm": 0.16745272508890643, "learning_rate": 4.634231559080164e-06, "loss": 0.7351, "step": 2520 }, { "epoch": 0.7814328335076911, "grad_norm": 0.15256345889760375, "learning_rate": 4.621643035811929e-06, "loss": 0.6966, "step": 2521 }, { "epoch": 0.7817428028982139, "grad_norm": 0.1580580020102626, "learning_rate": 4.60906939970214e-06, "loss": 0.7189, "step": 2522 }, { "epoch": 0.7820527722887365, "grad_norm": 0.15561711227716454, "learning_rate": 4.59651066292284e-06, "loss": 0.7293, "step": 2523 }, { "epoch": 0.7823627416792591, "grad_norm": 0.1524131334857925, "learning_rate": 4.5839668376316015e-06, "loss": 0.7077, "step": 2524 }, { "epoch": 0.7826727110697819, "grad_norm": 0.14577379380150635, "learning_rate": 4.571437935971616e-06, "loss": 0.6887, "step": 2525 }, { "epoch": 0.7829826804603045, "grad_norm": 0.15453248968821934, "learning_rate": 4.558923970071576e-06, "loss": 0.6834, "step": 2526 }, { "epoch": 0.7832926498508272, "grad_norm": 0.1626457401931501, "learning_rate": 4.546424952045756e-06, "loss": 0.7467, "step": 2527 }, { "epoch": 0.7836026192413499, "grad_norm": 0.15405231312775702, "learning_rate": 4.5339408939939465e-06, "loss": 0.7056, "step": 2528 }, { "epoch": 0.7839125886318726, "grad_norm": 0.1529977027368526, "learning_rate": 4.521471808001437e-06, "loss": 0.7324, "step": 2529 }, { "epoch": 0.7842225580223953, "grad_norm": 0.15458403622435918, "learning_rate": 4.5090177061390515e-06, "loss": 0.6812, "step": 2530 }, { "epoch": 0.784532527412918, "grad_norm": 0.15698710666910648, "learning_rate": 4.496578600463097e-06, "loss": 0.6982, "step": 2531 }, { "epoch": 0.7848424968034406, "grad_norm": 0.15058329325792, "learning_rate": 4.484154503015361e-06, "loss": 0.6958, "step": 2532 }, { "epoch": 0.7851524661939634, "grad_norm": 0.15720853137804808, "learning_rate": 4.4717454258231015e-06, "loss": 0.7337, "step": 2533 }, { "epoch": 0.785462435584486, "grad_norm": 0.15133831961435562, "learning_rate": 4.4593513808990444e-06, "loss": 0.6849, "step": 2534 }, { "epoch": 0.7857724049750087, "grad_norm": 0.15599034884571925, "learning_rate": 4.446972380241352e-06, "loss": 0.7318, "step": 2535 }, { "epoch": 0.7860823743655314, "grad_norm": 0.1544615488799161, "learning_rate": 4.434608435833631e-06, "loss": 0.733, "step": 2536 }, { "epoch": 0.786392343756054, "grad_norm": 0.14947265919014757, "learning_rate": 4.42225955964491e-06, "loss": 0.7172, "step": 2537 }, { "epoch": 0.7867023131465768, "grad_norm": 0.15609659471938198, "learning_rate": 4.409925763629632e-06, "loss": 0.729, "step": 2538 }, { "epoch": 0.7870122825370994, "grad_norm": 0.1563710912902964, "learning_rate": 4.39760705972764e-06, "loss": 0.7125, "step": 2539 }, { "epoch": 0.7873222519276222, "grad_norm": 0.15211560393517995, "learning_rate": 4.385303459864165e-06, "loss": 0.6896, "step": 2540 }, { "epoch": 0.7876322213181448, "grad_norm": 0.15458324787723568, "learning_rate": 4.373014975949823e-06, "loss": 0.7164, "step": 2541 }, { "epoch": 0.7879421907086676, "grad_norm": 0.6875522025670716, "learning_rate": 4.360741619880591e-06, "loss": 0.7195, "step": 2542 }, { "epoch": 0.7882521600991902, "grad_norm": 0.14748956328769344, "learning_rate": 4.348483403537796e-06, "loss": 0.6741, "step": 2543 }, { "epoch": 0.7885621294897129, "grad_norm": 0.15470565529730024, "learning_rate": 4.336240338788133e-06, "loss": 0.7064, "step": 2544 }, { "epoch": 0.7888720988802356, "grad_norm": 0.1477112561011346, "learning_rate": 4.324012437483591e-06, "loss": 0.7145, "step": 2545 }, { "epoch": 0.7891820682707582, "grad_norm": 0.15698802690981728, "learning_rate": 4.3117997114615265e-06, "loss": 0.7332, "step": 2546 }, { "epoch": 0.789492037661281, "grad_norm": 0.1568529479694275, "learning_rate": 4.299602172544557e-06, "loss": 0.7092, "step": 2547 }, { "epoch": 0.7898020070518036, "grad_norm": 0.15742466420850726, "learning_rate": 4.2874198325406245e-06, "loss": 0.7251, "step": 2548 }, { "epoch": 0.7901119764423263, "grad_norm": 0.1491894183278506, "learning_rate": 4.275252703242971e-06, "loss": 0.7489, "step": 2549 }, { "epoch": 0.790421945832849, "grad_norm": 0.1470024422141086, "learning_rate": 4.263100796430075e-06, "loss": 0.6907, "step": 2550 }, { "epoch": 0.7907319152233717, "grad_norm": 0.1551551229247707, "learning_rate": 4.250964123865722e-06, "loss": 0.6893, "step": 2551 }, { "epoch": 0.7910418846138944, "grad_norm": 0.15818376195347944, "learning_rate": 4.238842697298906e-06, "loss": 0.7145, "step": 2552 }, { "epoch": 0.7913518540044171, "grad_norm": 0.14813945522998795, "learning_rate": 4.2267365284639e-06, "loss": 0.6897, "step": 2553 }, { "epoch": 0.7916618233949397, "grad_norm": 0.15072615785625593, "learning_rate": 4.214645629080192e-06, "loss": 0.6921, "step": 2554 }, { "epoch": 0.7919717927854625, "grad_norm": 0.15684367216644027, "learning_rate": 4.202570010852471e-06, "loss": 0.7356, "step": 2555 }, { "epoch": 0.7922817621759851, "grad_norm": 0.24729308226996022, "learning_rate": 4.190509685470665e-06, "loss": 0.7086, "step": 2556 }, { "epoch": 0.7925917315665079, "grad_norm": 0.15272011772440947, "learning_rate": 4.178464664609878e-06, "loss": 0.6802, "step": 2557 }, { "epoch": 0.7929017009570305, "grad_norm": 0.45616332894649336, "learning_rate": 4.166434959930399e-06, "loss": 0.7089, "step": 2558 }, { "epoch": 0.7932116703475531, "grad_norm": 0.15655878881240753, "learning_rate": 4.154420583077696e-06, "loss": 0.7154, "step": 2559 }, { "epoch": 0.7935216397380759, "grad_norm": 0.15672233889308226, "learning_rate": 4.1424215456823935e-06, "loss": 0.707, "step": 2560 }, { "epoch": 0.7938316091285985, "grad_norm": 0.15636933016651133, "learning_rate": 4.13043785936027e-06, "loss": 0.7017, "step": 2561 }, { "epoch": 0.7941415785191213, "grad_norm": 0.16152411346271048, "learning_rate": 4.118469535712244e-06, "loss": 0.7214, "step": 2562 }, { "epoch": 0.7944515479096439, "grad_norm": 0.14808807413237357, "learning_rate": 4.106516586324356e-06, "loss": 0.6983, "step": 2563 }, { "epoch": 0.7947615173001666, "grad_norm": 0.15901965199983434, "learning_rate": 4.09457902276777e-06, "loss": 0.7068, "step": 2564 }, { "epoch": 0.7950714866906893, "grad_norm": 0.15375243725554932, "learning_rate": 4.082656856598754e-06, "loss": 0.7089, "step": 2565 }, { "epoch": 0.795381456081212, "grad_norm": 0.15618327185341174, "learning_rate": 4.070750099358669e-06, "loss": 0.705, "step": 2566 }, { "epoch": 0.7956914254717347, "grad_norm": 0.18309462492256184, "learning_rate": 4.058858762573958e-06, "loss": 0.6967, "step": 2567 }, { "epoch": 0.7960013948622574, "grad_norm": 0.40034600056888064, "learning_rate": 4.046982857756139e-06, "loss": 0.6844, "step": 2568 }, { "epoch": 0.79631136425278, "grad_norm": 0.15727707118486245, "learning_rate": 4.035122396401789e-06, "loss": 0.7271, "step": 2569 }, { "epoch": 0.7966213336433027, "grad_norm": 0.15011613027237594, "learning_rate": 4.023277389992539e-06, "loss": 0.718, "step": 2570 }, { "epoch": 0.7969313030338254, "grad_norm": 0.14806706860670077, "learning_rate": 4.011447849995045e-06, "loss": 0.6941, "step": 2571 }, { "epoch": 0.7972412724243481, "grad_norm": 0.15280780163431154, "learning_rate": 3.999633787861019e-06, "loss": 0.6929, "step": 2572 }, { "epoch": 0.7975512418148708, "grad_norm": 0.15217349240361455, "learning_rate": 3.9878352150271556e-06, "loss": 0.7011, "step": 2573 }, { "epoch": 0.7978612112053934, "grad_norm": 0.15417136001309226, "learning_rate": 3.976052142915172e-06, "loss": 0.7292, "step": 2574 }, { "epoch": 0.7981711805959162, "grad_norm": 0.15407366043690163, "learning_rate": 3.964284582931792e-06, "loss": 0.6846, "step": 2575 }, { "epoch": 0.7984811499864388, "grad_norm": 0.15546981596416032, "learning_rate": 3.952532546468688e-06, "loss": 0.735, "step": 2576 }, { "epoch": 0.7987911193769616, "grad_norm": 0.15101167965037726, "learning_rate": 3.940796044902548e-06, "loss": 0.7035, "step": 2577 }, { "epoch": 0.7991010887674842, "grad_norm": 0.15026983643530198, "learning_rate": 3.9290750895949805e-06, "loss": 0.6948, "step": 2578 }, { "epoch": 0.799411058158007, "grad_norm": 0.15391727573824118, "learning_rate": 3.917369691892574e-06, "loss": 0.692, "step": 2579 }, { "epoch": 0.7997210275485296, "grad_norm": 0.15542578100773213, "learning_rate": 3.905679863126841e-06, "loss": 0.7076, "step": 2580 }, { "epoch": 0.8000309969390522, "grad_norm": 0.15683114341340065, "learning_rate": 3.894005614614232e-06, "loss": 0.7257, "step": 2581 }, { "epoch": 0.800340966329575, "grad_norm": 0.16404282422056668, "learning_rate": 3.882346957656107e-06, "loss": 0.7123, "step": 2582 }, { "epoch": 0.8006509357200976, "grad_norm": 0.16366835263661628, "learning_rate": 3.870703903538724e-06, "loss": 0.7177, "step": 2583 }, { "epoch": 0.8009609051106203, "grad_norm": 0.14994511403995317, "learning_rate": 3.859076463533265e-06, "loss": 0.7158, "step": 2584 }, { "epoch": 0.801270874501143, "grad_norm": 0.15268077588427298, "learning_rate": 3.847464648895769e-06, "loss": 0.7308, "step": 2585 }, { "epoch": 0.8015808438916657, "grad_norm": 0.15542396721511093, "learning_rate": 3.835868470867159e-06, "loss": 0.7363, "step": 2586 }, { "epoch": 0.8018908132821884, "grad_norm": 0.34169315616878426, "learning_rate": 3.824287940673226e-06, "loss": 0.7016, "step": 2587 }, { "epoch": 0.8022007826727111, "grad_norm": 0.1556869588297132, "learning_rate": 3.8127230695246044e-06, "loss": 0.6766, "step": 2588 }, { "epoch": 0.8025107520632337, "grad_norm": 0.15356761737430094, "learning_rate": 3.8011738686167698e-06, "loss": 0.7233, "step": 2589 }, { "epoch": 0.8028207214537565, "grad_norm": 0.15180054623342878, "learning_rate": 3.789640349130037e-06, "loss": 0.7185, "step": 2590 }, { "epoch": 0.8031306908442791, "grad_norm": 0.15378109838559925, "learning_rate": 3.7781225222295324e-06, "loss": 0.6941, "step": 2591 }, { "epoch": 0.8034406602348018, "grad_norm": 0.15217879265897685, "learning_rate": 3.766620399065193e-06, "loss": 0.6756, "step": 2592 }, { "epoch": 0.8037506296253245, "grad_norm": 0.15246318776885617, "learning_rate": 3.755133990771751e-06, "loss": 0.7247, "step": 2593 }, { "epoch": 0.8040605990158471, "grad_norm": 0.23032086626443896, "learning_rate": 3.7436633084687346e-06, "loss": 0.6829, "step": 2594 }, { "epoch": 0.8043705684063699, "grad_norm": 0.1514371926516028, "learning_rate": 3.7322083632604368e-06, "loss": 0.6976, "step": 2595 }, { "epoch": 0.8046805377968925, "grad_norm": 0.1455122114155944, "learning_rate": 3.7207691662359247e-06, "loss": 0.6891, "step": 2596 }, { "epoch": 0.8049905071874153, "grad_norm": 0.15090604756379147, "learning_rate": 3.7093457284690094e-06, "loss": 0.713, "step": 2597 }, { "epoch": 0.8053004765779379, "grad_norm": 0.14866834980192536, "learning_rate": 3.6979380610182714e-06, "loss": 0.728, "step": 2598 }, { "epoch": 0.8056104459684607, "grad_norm": 0.15026166922350703, "learning_rate": 3.686546174926986e-06, "loss": 0.6798, "step": 2599 }, { "epoch": 0.8059204153589833, "grad_norm": 0.1543376166022528, "learning_rate": 3.675170081223187e-06, "loss": 0.7299, "step": 2600 }, { "epoch": 0.806230384749506, "grad_norm": 0.15797135562224457, "learning_rate": 3.6638097909196056e-06, "loss": 0.7483, "step": 2601 }, { "epoch": 0.8065403541400287, "grad_norm": 0.15162041855191566, "learning_rate": 3.6524653150136604e-06, "loss": 0.7174, "step": 2602 }, { "epoch": 0.8068503235305513, "grad_norm": 0.14672082390898392, "learning_rate": 3.641136664487492e-06, "loss": 0.7005, "step": 2603 }, { "epoch": 0.807160292921074, "grad_norm": 0.1453047057595835, "learning_rate": 3.6298238503078853e-06, "loss": 0.7228, "step": 2604 }, { "epoch": 0.8074702623115967, "grad_norm": 0.15237701134944587, "learning_rate": 3.61852688342633e-06, "loss": 0.6917, "step": 2605 }, { "epoch": 0.8077802317021194, "grad_norm": 0.15261475961125706, "learning_rate": 3.607245774778949e-06, "loss": 0.7107, "step": 2606 }, { "epoch": 0.8080902010926421, "grad_norm": 0.1464176242538122, "learning_rate": 3.595980535286525e-06, "loss": 0.6919, "step": 2607 }, { "epoch": 0.8084001704831648, "grad_norm": 0.1483454848507405, "learning_rate": 3.584731175854479e-06, "loss": 0.6996, "step": 2608 }, { "epoch": 0.8087101398736875, "grad_norm": 0.1529550251472833, "learning_rate": 3.5734977073728415e-06, "loss": 0.7173, "step": 2609 }, { "epoch": 0.8090201092642102, "grad_norm": 0.15283665785104164, "learning_rate": 3.5622801407162876e-06, "loss": 0.6928, "step": 2610 }, { "epoch": 0.8093300786547328, "grad_norm": 0.15028889280524754, "learning_rate": 3.551078486744084e-06, "loss": 0.7109, "step": 2611 }, { "epoch": 0.8096400480452556, "grad_norm": 0.14672387838241016, "learning_rate": 3.5398927563000874e-06, "loss": 0.6758, "step": 2612 }, { "epoch": 0.8099500174357782, "grad_norm": 0.14623994599295553, "learning_rate": 3.5287229602127514e-06, "loss": 0.7, "step": 2613 }, { "epoch": 0.8102599868263008, "grad_norm": 0.15403428165141653, "learning_rate": 3.5175691092950958e-06, "loss": 0.7454, "step": 2614 }, { "epoch": 0.8105699562168236, "grad_norm": 0.14949994153949842, "learning_rate": 3.5064312143447097e-06, "loss": 0.6932, "step": 2615 }, { "epoch": 0.8108799256073462, "grad_norm": 0.15411123038180588, "learning_rate": 3.4953092861437333e-06, "loss": 0.7006, "step": 2616 }, { "epoch": 0.811189894997869, "grad_norm": 0.1546784744961899, "learning_rate": 3.4842033354588466e-06, "loss": 0.6994, "step": 2617 }, { "epoch": 0.8114998643883916, "grad_norm": 0.15238043560321962, "learning_rate": 3.4731133730412657e-06, "loss": 0.6835, "step": 2618 }, { "epoch": 0.8118098337789144, "grad_norm": 0.14922232665584978, "learning_rate": 3.4620394096267453e-06, "loss": 0.7082, "step": 2619 }, { "epoch": 0.812119803169437, "grad_norm": 0.15201156834277046, "learning_rate": 3.450981455935516e-06, "loss": 0.7069, "step": 2620 }, { "epoch": 0.8124297725599597, "grad_norm": 0.3123149804071823, "learning_rate": 3.439939522672342e-06, "loss": 0.6928, "step": 2621 }, { "epoch": 0.8127397419504824, "grad_norm": 0.15080197727567113, "learning_rate": 3.4289136205264638e-06, "loss": 0.7182, "step": 2622 }, { "epoch": 0.8130497113410051, "grad_norm": 0.15447189315557722, "learning_rate": 3.417903760171599e-06, "loss": 0.7062, "step": 2623 }, { "epoch": 0.8133596807315278, "grad_norm": 0.1508186363008385, "learning_rate": 3.406909952265964e-06, "loss": 0.7108, "step": 2624 }, { "epoch": 0.8136696501220505, "grad_norm": 0.1483541442504289, "learning_rate": 3.3959322074521907e-06, "loss": 0.6897, "step": 2625 }, { "epoch": 0.8139796195125731, "grad_norm": 0.14805160583841592, "learning_rate": 3.3849705363574014e-06, "loss": 0.7092, "step": 2626 }, { "epoch": 0.8142895889030958, "grad_norm": 0.15283378534035272, "learning_rate": 3.37402494959314e-06, "loss": 0.6947, "step": 2627 }, { "epoch": 0.8145995582936185, "grad_norm": 0.14668803151476734, "learning_rate": 3.3630954577553674e-06, "loss": 0.6971, "step": 2628 }, { "epoch": 0.8149095276841412, "grad_norm": 0.1573568410259997, "learning_rate": 3.352182071424499e-06, "loss": 0.6815, "step": 2629 }, { "epoch": 0.8152194970746639, "grad_norm": 0.1604402310597049, "learning_rate": 3.3412848011653166e-06, "loss": 0.7313, "step": 2630 }, { "epoch": 0.8155294664651865, "grad_norm": 0.15076593036463598, "learning_rate": 3.330403657527035e-06, "loss": 0.7091, "step": 2631 }, { "epoch": 0.8158394358557093, "grad_norm": 0.150736707196884, "learning_rate": 3.319538651043244e-06, "loss": 0.7029, "step": 2632 }, { "epoch": 0.8161494052462319, "grad_norm": 0.1556133483044027, "learning_rate": 3.308689792231907e-06, "loss": 0.7155, "step": 2633 }, { "epoch": 0.8164593746367547, "grad_norm": 0.1611638046857171, "learning_rate": 3.297857091595367e-06, "loss": 0.7358, "step": 2634 }, { "epoch": 0.8167693440272773, "grad_norm": 0.14907101492038366, "learning_rate": 3.2870405596203046e-06, "loss": 0.7032, "step": 2635 }, { "epoch": 0.8170793134178, "grad_norm": 0.23306302074437155, "learning_rate": 3.2762402067777787e-06, "loss": 0.7227, "step": 2636 }, { "epoch": 0.8173892828083227, "grad_norm": 0.15139768844909002, "learning_rate": 3.2654560435231587e-06, "loss": 0.6939, "step": 2637 }, { "epoch": 0.8176992521988453, "grad_norm": 0.1531293779498242, "learning_rate": 3.2546880802961578e-06, "loss": 0.7028, "step": 2638 }, { "epoch": 0.8180092215893681, "grad_norm": 0.1507465598808232, "learning_rate": 3.243936327520798e-06, "loss": 0.7018, "step": 2639 }, { "epoch": 0.8183191909798907, "grad_norm": 0.15081253516603899, "learning_rate": 3.233200795605411e-06, "loss": 0.6823, "step": 2640 }, { "epoch": 0.8186291603704134, "grad_norm": 0.1524572266738776, "learning_rate": 3.2224814949426287e-06, "loss": 0.6987, "step": 2641 }, { "epoch": 0.8189391297609361, "grad_norm": 0.14856209265675843, "learning_rate": 3.211778435909365e-06, "loss": 0.706, "step": 2642 }, { "epoch": 0.8192490991514588, "grad_norm": 0.14754028378106465, "learning_rate": 3.201091628866815e-06, "loss": 0.7008, "step": 2643 }, { "epoch": 0.8195590685419815, "grad_norm": 0.16017782063144032, "learning_rate": 3.1904210841604334e-06, "loss": 0.7043, "step": 2644 }, { "epoch": 0.8198690379325042, "grad_norm": 0.1545434935158577, "learning_rate": 3.1797668121199555e-06, "loss": 0.7077, "step": 2645 }, { "epoch": 0.8201790073230268, "grad_norm": 0.1495966250089218, "learning_rate": 3.1691288230593286e-06, "loss": 0.6739, "step": 2646 }, { "epoch": 0.8204889767135496, "grad_norm": 0.14885525733269112, "learning_rate": 3.158507127276762e-06, "loss": 0.6668, "step": 2647 }, { "epoch": 0.8207989461040722, "grad_norm": 0.1528718166259496, "learning_rate": 3.1479017350546815e-06, "loss": 0.6926, "step": 2648 }, { "epoch": 0.8211089154945949, "grad_norm": 0.15678703574553615, "learning_rate": 3.1373126566597347e-06, "loss": 0.73, "step": 2649 }, { "epoch": 0.8214188848851176, "grad_norm": 0.18589895672588685, "learning_rate": 3.1267399023427834e-06, "loss": 0.7348, "step": 2650 }, { "epoch": 0.8217288542756402, "grad_norm": 0.15112488855994433, "learning_rate": 3.116183482338866e-06, "loss": 0.745, "step": 2651 }, { "epoch": 0.822038823666163, "grad_norm": 0.8063885014076018, "learning_rate": 3.1056434068672335e-06, "loss": 0.7135, "step": 2652 }, { "epoch": 0.8223487930566856, "grad_norm": 0.15091136653262255, "learning_rate": 3.0951196861312917e-06, "loss": 0.6896, "step": 2653 }, { "epoch": 0.8226587624472084, "grad_norm": 0.1477774316759618, "learning_rate": 3.084612330318624e-06, "loss": 0.6832, "step": 2654 }, { "epoch": 0.822968731837731, "grad_norm": 0.152405596235546, "learning_rate": 3.074121349600985e-06, "loss": 0.6943, "step": 2655 }, { "epoch": 0.8232787012282538, "grad_norm": 0.14745374756167545, "learning_rate": 3.063646754134244e-06, "loss": 0.6948, "step": 2656 }, { "epoch": 0.8235886706187764, "grad_norm": 0.14857922671849022, "learning_rate": 3.053188554058446e-06, "loss": 0.6902, "step": 2657 }, { "epoch": 0.8238986400092991, "grad_norm": 0.15524621473358374, "learning_rate": 3.042746759497739e-06, "loss": 0.6863, "step": 2658 }, { "epoch": 0.8242086093998218, "grad_norm": 0.15160853701503188, "learning_rate": 3.032321380560399e-06, "loss": 0.7277, "step": 2659 }, { "epoch": 0.8245185787903444, "grad_norm": 0.20623052431967753, "learning_rate": 3.021912427338807e-06, "loss": 0.7161, "step": 2660 }, { "epoch": 0.8248285481808671, "grad_norm": 0.15122873986608013, "learning_rate": 3.0115199099094483e-06, "loss": 0.7295, "step": 2661 }, { "epoch": 0.8251385175713898, "grad_norm": 0.14833770098822946, "learning_rate": 3.0011438383328915e-06, "loss": 0.6999, "step": 2662 }, { "epoch": 0.8254484869619125, "grad_norm": 0.14919222538477558, "learning_rate": 2.9907842226537887e-06, "loss": 0.6832, "step": 2663 }, { "epoch": 0.8257584563524352, "grad_norm": 0.1530945405577264, "learning_rate": 2.980441072900857e-06, "loss": 0.6968, "step": 2664 }, { "epoch": 0.8260684257429579, "grad_norm": 0.1506702413732101, "learning_rate": 2.970114399086881e-06, "loss": 0.6967, "step": 2665 }, { "epoch": 0.8263783951334805, "grad_norm": 0.15289125233022272, "learning_rate": 2.959804211208688e-06, "loss": 0.7041, "step": 2666 }, { "epoch": 0.8266883645240033, "grad_norm": 0.14436035636107947, "learning_rate": 2.949510519247152e-06, "loss": 0.7011, "step": 2667 }, { "epoch": 0.8269983339145259, "grad_norm": 0.14871344191635377, "learning_rate": 2.9392333331671707e-06, "loss": 0.7142, "step": 2668 }, { "epoch": 0.8273083033050487, "grad_norm": 0.14936636951966248, "learning_rate": 2.928972662917673e-06, "loss": 0.7053, "step": 2669 }, { "epoch": 0.8276182726955713, "grad_norm": 0.14958145955940308, "learning_rate": 2.918728518431586e-06, "loss": 0.7003, "step": 2670 }, { "epoch": 0.827928242086094, "grad_norm": 0.14550834180870978, "learning_rate": 2.908500909625862e-06, "loss": 0.7048, "step": 2671 }, { "epoch": 0.8282382114766167, "grad_norm": 0.14867880279172632, "learning_rate": 2.898289846401412e-06, "loss": 0.7251, "step": 2672 }, { "epoch": 0.8285481808671393, "grad_norm": 0.14862534051358392, "learning_rate": 2.888095338643169e-06, "loss": 0.6711, "step": 2673 }, { "epoch": 0.8288581502576621, "grad_norm": 0.14864853092979155, "learning_rate": 2.8779173962200024e-06, "loss": 0.714, "step": 2674 }, { "epoch": 0.8291681196481847, "grad_norm": 0.14842951921722336, "learning_rate": 2.867756028984765e-06, "loss": 0.7148, "step": 2675 }, { "epoch": 0.8294780890387075, "grad_norm": 0.1519419229821755, "learning_rate": 2.85761124677427e-06, "loss": 0.7147, "step": 2676 }, { "epoch": 0.8297880584292301, "grad_norm": 0.14709100902556838, "learning_rate": 2.8474830594092528e-06, "loss": 0.6957, "step": 2677 }, { "epoch": 0.8300980278197528, "grad_norm": 0.14232137168616701, "learning_rate": 2.837371476694413e-06, "loss": 0.6929, "step": 2678 }, { "epoch": 0.8304079972102755, "grad_norm": 0.14775386780946434, "learning_rate": 2.827276508418344e-06, "loss": 0.704, "step": 2679 }, { "epoch": 0.8307179666007982, "grad_norm": 0.14750996697596447, "learning_rate": 2.817198164353583e-06, "loss": 0.7111, "step": 2680 }, { "epoch": 0.8310279359913209, "grad_norm": 0.1512557706623397, "learning_rate": 2.8071364542565626e-06, "loss": 0.7082, "step": 2681 }, { "epoch": 0.8313379053818435, "grad_norm": 0.15221517290979775, "learning_rate": 2.797091387867601e-06, "loss": 0.6958, "step": 2682 }, { "epoch": 0.8316478747723662, "grad_norm": 0.18818480265778315, "learning_rate": 2.7870629749109303e-06, "loss": 0.7071, "step": 2683 }, { "epoch": 0.8319578441628889, "grad_norm": 0.15037913995166166, "learning_rate": 2.7770512250946403e-06, "loss": 0.7063, "step": 2684 }, { "epoch": 0.8322678135534116, "grad_norm": 0.14914343177951878, "learning_rate": 2.7670561481106982e-06, "loss": 0.7071, "step": 2685 }, { "epoch": 0.8325777829439343, "grad_norm": 0.1473310406802609, "learning_rate": 2.757077753634929e-06, "loss": 0.6789, "step": 2686 }, { "epoch": 0.832887752334457, "grad_norm": 0.1571394084083097, "learning_rate": 2.747116051327008e-06, "loss": 0.7546, "step": 2687 }, { "epoch": 0.8331977217249796, "grad_norm": 0.14784843067720863, "learning_rate": 2.7371710508304494e-06, "loss": 0.6924, "step": 2688 }, { "epoch": 0.8335076911155024, "grad_norm": 0.15037891675318937, "learning_rate": 2.727242761772606e-06, "loss": 0.7423, "step": 2689 }, { "epoch": 0.833817660506025, "grad_norm": 0.14633997500377727, "learning_rate": 2.7173311937646473e-06, "loss": 0.7199, "step": 2690 }, { "epoch": 0.8341276298965478, "grad_norm": 0.14649697656010097, "learning_rate": 2.7074363564015536e-06, "loss": 0.6931, "step": 2691 }, { "epoch": 0.8344375992870704, "grad_norm": 0.14683579208049194, "learning_rate": 2.697558259262114e-06, "loss": 0.6877, "step": 2692 }, { "epoch": 0.834747568677593, "grad_norm": 0.29585217050740953, "learning_rate": 2.6876969119089125e-06, "loss": 0.6966, "step": 2693 }, { "epoch": 0.8350575380681158, "grad_norm": 0.14685842432287932, "learning_rate": 2.6778523238883146e-06, "loss": 0.7207, "step": 2694 }, { "epoch": 0.8353675074586384, "grad_norm": 0.14793687965586177, "learning_rate": 2.6680245047304643e-06, "loss": 0.6706, "step": 2695 }, { "epoch": 0.8356774768491612, "grad_norm": 0.1472537910916512, "learning_rate": 2.6582134639492686e-06, "loss": 0.7164, "step": 2696 }, { "epoch": 0.8359874462396838, "grad_norm": 0.14857230979693367, "learning_rate": 2.648419211042397e-06, "loss": 0.699, "step": 2697 }, { "epoch": 0.8362974156302065, "grad_norm": 0.14572985347120457, "learning_rate": 2.63864175549126e-06, "loss": 0.7149, "step": 2698 }, { "epoch": 0.8366073850207292, "grad_norm": 0.14791422462591233, "learning_rate": 2.6288811067610276e-06, "loss": 0.7097, "step": 2699 }, { "epoch": 0.8369173544112519, "grad_norm": 0.14709931076583133, "learning_rate": 2.6191372743005696e-06, "loss": 0.7195, "step": 2700 }, { "epoch": 0.8372273238017746, "grad_norm": 0.14695918833482827, "learning_rate": 2.6094102675424895e-06, "loss": 0.7035, "step": 2701 }, { "epoch": 0.8375372931922973, "grad_norm": 0.1483026280934615, "learning_rate": 2.5997000959031238e-06, "loss": 0.7211, "step": 2702 }, { "epoch": 0.8378472625828199, "grad_norm": 0.14677665224945333, "learning_rate": 2.5900067687824693e-06, "loss": 0.6997, "step": 2703 }, { "epoch": 0.8381572319733427, "grad_norm": 0.14427339695938166, "learning_rate": 2.5803302955642616e-06, "loss": 0.7099, "step": 2704 }, { "epoch": 0.8384672013638653, "grad_norm": 0.14268931484744252, "learning_rate": 2.570670685615877e-06, "loss": 0.6771, "step": 2705 }, { "epoch": 0.838777170754388, "grad_norm": 0.1480036278352376, "learning_rate": 2.561027948288406e-06, "loss": 0.7039, "step": 2706 }, { "epoch": 0.8390871401449107, "grad_norm": 0.14503664985841963, "learning_rate": 2.551402092916586e-06, "loss": 0.7131, "step": 2707 }, { "epoch": 0.8393971095354333, "grad_norm": 0.14886714778507493, "learning_rate": 2.5417931288187992e-06, "loss": 0.7162, "step": 2708 }, { "epoch": 0.8397070789259561, "grad_norm": 0.1439206309629118, "learning_rate": 2.532201065297113e-06, "loss": 0.684, "step": 2709 }, { "epoch": 0.8400170483164787, "grad_norm": 0.14839413909954716, "learning_rate": 2.522625911637189e-06, "loss": 0.7151, "step": 2710 }, { "epoch": 0.8403270177070015, "grad_norm": 0.1473935986171068, "learning_rate": 2.5130676771083585e-06, "loss": 0.7286, "step": 2711 }, { "epoch": 0.8406369870975241, "grad_norm": 0.14347335352374804, "learning_rate": 2.5035263709635516e-06, "loss": 0.699, "step": 2712 }, { "epoch": 0.8409469564880468, "grad_norm": 0.15091148182965075, "learning_rate": 2.4940020024393175e-06, "loss": 0.7046, "step": 2713 }, { "epoch": 0.8412569258785695, "grad_norm": 0.14130037549280314, "learning_rate": 2.4844945807558074e-06, "loss": 0.6796, "step": 2714 }, { "epoch": 0.8415668952690922, "grad_norm": 0.14723496804777383, "learning_rate": 2.475004115116766e-06, "loss": 0.7111, "step": 2715 }, { "epoch": 0.8418768646596149, "grad_norm": 0.14612761739406802, "learning_rate": 2.465530614709528e-06, "loss": 0.7017, "step": 2716 }, { "epoch": 0.8421868340501375, "grad_norm": 0.1496913379100503, "learning_rate": 2.4560740887049983e-06, "loss": 0.7189, "step": 2717 }, { "epoch": 0.8424968034406602, "grad_norm": 0.15249238895449335, "learning_rate": 2.4466345462576557e-06, "loss": 0.7136, "step": 2718 }, { "epoch": 0.8428067728311829, "grad_norm": 0.1430592891344091, "learning_rate": 2.437211996505535e-06, "loss": 0.6885, "step": 2719 }, { "epoch": 0.8431167422217056, "grad_norm": 0.1483962970977551, "learning_rate": 2.42780644857022e-06, "loss": 0.7188, "step": 2720 }, { "epoch": 0.8434267116122283, "grad_norm": 0.14139277413585513, "learning_rate": 2.4184179115568364e-06, "loss": 0.6926, "step": 2721 }, { "epoch": 0.843736681002751, "grad_norm": 0.14433582864874941, "learning_rate": 2.4090463945540465e-06, "loss": 0.725, "step": 2722 }, { "epoch": 0.8440466503932736, "grad_norm": 0.14585713470734446, "learning_rate": 2.3996919066340276e-06, "loss": 0.7035, "step": 2723 }, { "epoch": 0.8443566197837964, "grad_norm": 0.14528714939303738, "learning_rate": 2.390354456852475e-06, "loss": 0.7152, "step": 2724 }, { "epoch": 0.844666589174319, "grad_norm": 0.15569806569065525, "learning_rate": 2.381034054248608e-06, "loss": 0.6866, "step": 2725 }, { "epoch": 0.8449765585648418, "grad_norm": 0.14715330188067, "learning_rate": 2.371730707845108e-06, "loss": 0.6855, "step": 2726 }, { "epoch": 0.8452865279553644, "grad_norm": 0.1477005556048246, "learning_rate": 2.3624444266481696e-06, "loss": 0.6812, "step": 2727 }, { "epoch": 0.845596497345887, "grad_norm": 0.14478168762517538, "learning_rate": 2.35317521964747e-06, "loss": 0.7133, "step": 2728 }, { "epoch": 0.8459064667364098, "grad_norm": 0.191124727436846, "learning_rate": 2.3439230958161363e-06, "loss": 0.7061, "step": 2729 }, { "epoch": 0.8462164361269324, "grad_norm": 0.14534632668790576, "learning_rate": 2.3346880641107883e-06, "loss": 0.7193, "step": 2730 }, { "epoch": 0.8465264055174552, "grad_norm": 0.1461244892595093, "learning_rate": 2.3254701334714636e-06, "loss": 0.7189, "step": 2731 }, { "epoch": 0.8468363749079778, "grad_norm": 0.1469186961550674, "learning_rate": 2.316269312821675e-06, "loss": 0.6762, "step": 2732 }, { "epoch": 0.8471463442985006, "grad_norm": 0.1437852086174647, "learning_rate": 2.3070856110683605e-06, "loss": 0.7057, "step": 2733 }, { "epoch": 0.8474563136890232, "grad_norm": 0.14740607678785586, "learning_rate": 2.2979190371018832e-06, "loss": 0.6893, "step": 2734 }, { "epoch": 0.8477662830795459, "grad_norm": 0.1463438803455395, "learning_rate": 2.2887695997960326e-06, "loss": 0.6785, "step": 2735 }, { "epoch": 0.8480762524700686, "grad_norm": 0.1475318072231885, "learning_rate": 2.279637308007996e-06, "loss": 0.7189, "step": 2736 }, { "epoch": 0.8483862218605913, "grad_norm": 0.14812318075340447, "learning_rate": 2.2705221705783798e-06, "loss": 0.7127, "step": 2737 }, { "epoch": 0.848696191251114, "grad_norm": 0.1441383248791917, "learning_rate": 2.2614241963311723e-06, "loss": 0.7082, "step": 2738 }, { "epoch": 0.8490061606416366, "grad_norm": 0.2716983129369128, "learning_rate": 2.2523433940737525e-06, "loss": 0.6967, "step": 2739 }, { "epoch": 0.8493161300321593, "grad_norm": 0.14577850320541066, "learning_rate": 2.243279772596871e-06, "loss": 0.6892, "step": 2740 }, { "epoch": 0.849626099422682, "grad_norm": 0.1431387525328625, "learning_rate": 2.234233340674652e-06, "loss": 0.6859, "step": 2741 }, { "epoch": 0.8499360688132047, "grad_norm": 0.1426032479212949, "learning_rate": 2.2252041070645736e-06, "loss": 0.7066, "step": 2742 }, { "epoch": 0.8502460382037274, "grad_norm": 0.14568916329707135, "learning_rate": 2.216192080507471e-06, "loss": 0.7143, "step": 2743 }, { "epoch": 0.8505560075942501, "grad_norm": 0.20900092460285263, "learning_rate": 2.2071972697275144e-06, "loss": 0.7206, "step": 2744 }, { "epoch": 0.8508659769847727, "grad_norm": 0.1460716570907594, "learning_rate": 2.1982196834322146e-06, "loss": 0.6974, "step": 2745 }, { "epoch": 0.8511759463752955, "grad_norm": 0.15705089011498208, "learning_rate": 2.189259330312403e-06, "loss": 0.7033, "step": 2746 }, { "epoch": 0.8514859157658181, "grad_norm": 0.1496546488668491, "learning_rate": 2.1803162190422354e-06, "loss": 0.7262, "step": 2747 }, { "epoch": 0.8517958851563409, "grad_norm": 0.14226900151621444, "learning_rate": 2.1713903582791707e-06, "loss": 0.7031, "step": 2748 }, { "epoch": 0.8521058545468635, "grad_norm": 0.14559126228539765, "learning_rate": 2.162481756663968e-06, "loss": 0.7124, "step": 2749 }, { "epoch": 0.8524158239373861, "grad_norm": 0.1505722207015279, "learning_rate": 2.1535904228206773e-06, "loss": 0.7051, "step": 2750 }, { "epoch": 0.8527257933279089, "grad_norm": 0.14914066569726975, "learning_rate": 2.144716365356645e-06, "loss": 0.7185, "step": 2751 }, { "epoch": 0.8530357627184315, "grad_norm": 0.30116123731567995, "learning_rate": 2.1358595928624724e-06, "loss": 0.7309, "step": 2752 }, { "epoch": 0.8533457321089543, "grad_norm": 0.15032624149642174, "learning_rate": 2.1270201139120463e-06, "loss": 0.7004, "step": 2753 }, { "epoch": 0.8536557014994769, "grad_norm": 0.14549384878488206, "learning_rate": 2.118197937062505e-06, "loss": 0.7039, "step": 2754 }, { "epoch": 0.8539656708899996, "grad_norm": 0.14802601403381743, "learning_rate": 2.1093930708542286e-06, "loss": 0.705, "step": 2755 }, { "epoch": 0.8542756402805223, "grad_norm": 0.15031520031617135, "learning_rate": 2.1006055238108592e-06, "loss": 0.7286, "step": 2756 }, { "epoch": 0.854585609671045, "grad_norm": 0.14625038253854025, "learning_rate": 2.091835304439249e-06, "loss": 0.6981, "step": 2757 }, { "epoch": 0.8548955790615677, "grad_norm": 0.15392116292498867, "learning_rate": 2.0830824212295007e-06, "loss": 0.7235, "step": 2758 }, { "epoch": 0.8552055484520904, "grad_norm": 0.1482402313732994, "learning_rate": 2.0743468826549164e-06, "loss": 0.7177, "step": 2759 }, { "epoch": 0.855515517842613, "grad_norm": 0.1604175010021036, "learning_rate": 2.065628697172015e-06, "loss": 0.7361, "step": 2760 }, { "epoch": 0.8558254872331357, "grad_norm": 0.14498787895762116, "learning_rate": 2.05692787322052e-06, "loss": 0.6998, "step": 2761 }, { "epoch": 0.8561354566236584, "grad_norm": 0.14875064906566982, "learning_rate": 2.048244419223331e-06, "loss": 0.6903, "step": 2762 }, { "epoch": 0.856445426014181, "grad_norm": 0.14685356862433105, "learning_rate": 2.0395783435865545e-06, "loss": 0.7251, "step": 2763 }, { "epoch": 0.8567553954047038, "grad_norm": 0.14804662117101708, "learning_rate": 2.030929654699463e-06, "loss": 0.7291, "step": 2764 }, { "epoch": 0.8570653647952264, "grad_norm": 0.14324798903792402, "learning_rate": 2.022298360934496e-06, "loss": 0.6955, "step": 2765 }, { "epoch": 0.8573753341857492, "grad_norm": 0.15048418957688123, "learning_rate": 2.013684470647259e-06, "loss": 0.7072, "step": 2766 }, { "epoch": 0.8576853035762718, "grad_norm": 0.14910766426194857, "learning_rate": 2.0050879921765044e-06, "loss": 0.6986, "step": 2767 }, { "epoch": 0.8579952729667946, "grad_norm": 0.14419821334443025, "learning_rate": 1.9965089338441323e-06, "loss": 0.7125, "step": 2768 }, { "epoch": 0.8583052423573172, "grad_norm": 0.1513509894608633, "learning_rate": 1.9879473039551777e-06, "loss": 0.7041, "step": 2769 }, { "epoch": 0.85861521174784, "grad_norm": 0.285800167721655, "learning_rate": 1.979403110797804e-06, "loss": 0.7181, "step": 2770 }, { "epoch": 0.8589251811383626, "grad_norm": 0.14734748225036048, "learning_rate": 1.9708763626432924e-06, "loss": 0.6917, "step": 2771 }, { "epoch": 0.8592351505288853, "grad_norm": 0.14994790967608884, "learning_rate": 1.9623670677460494e-06, "loss": 0.6977, "step": 2772 }, { "epoch": 0.859545119919408, "grad_norm": 0.3837969353215318, "learning_rate": 1.9538752343435674e-06, "loss": 0.7214, "step": 2773 }, { "epoch": 0.8598550893099306, "grad_norm": 0.14735200848631275, "learning_rate": 1.945400870656442e-06, "loss": 0.6998, "step": 2774 }, { "epoch": 0.8601650587004533, "grad_norm": 0.14740375777599538, "learning_rate": 1.9369439848883596e-06, "loss": 0.7098, "step": 2775 }, { "epoch": 0.860475028090976, "grad_norm": 0.3496736978592514, "learning_rate": 1.928504585226083e-06, "loss": 0.7322, "step": 2776 }, { "epoch": 0.8607849974814987, "grad_norm": 0.14354516144768997, "learning_rate": 1.9200826798394613e-06, "loss": 0.7081, "step": 2777 }, { "epoch": 0.8610949668720214, "grad_norm": 0.14869464095013277, "learning_rate": 1.9116782768813812e-06, "loss": 0.7443, "step": 2778 }, { "epoch": 0.8614049362625441, "grad_norm": 0.1465039548730289, "learning_rate": 1.9032913844878153e-06, "loss": 0.7171, "step": 2779 }, { "epoch": 0.8617149056530667, "grad_norm": 0.14872498963333575, "learning_rate": 1.89492201077776e-06, "loss": 0.7261, "step": 2780 }, { "epoch": 0.8620248750435895, "grad_norm": 0.14662933832224231, "learning_rate": 1.8865701638532651e-06, "loss": 0.6914, "step": 2781 }, { "epoch": 0.8623348444341121, "grad_norm": 0.14386903688921573, "learning_rate": 1.8782358517994238e-06, "loss": 0.6805, "step": 2782 }, { "epoch": 0.8626448138246349, "grad_norm": 0.1446941847925351, "learning_rate": 1.869919082684324e-06, "loss": 0.6878, "step": 2783 }, { "epoch": 0.8629547832151575, "grad_norm": 0.15157058754787978, "learning_rate": 1.8616198645591054e-06, "loss": 0.7058, "step": 2784 }, { "epoch": 0.8632647526056801, "grad_norm": 0.14613493289499313, "learning_rate": 1.8533382054578953e-06, "loss": 0.725, "step": 2785 }, { "epoch": 0.8635747219962029, "grad_norm": 0.23639484252827667, "learning_rate": 1.8450741133978312e-06, "loss": 0.7058, "step": 2786 }, { "epoch": 0.8638846913867255, "grad_norm": 0.14827148791817865, "learning_rate": 1.8368275963790406e-06, "loss": 0.7138, "step": 2787 }, { "epoch": 0.8641946607772483, "grad_norm": 0.1446383127256864, "learning_rate": 1.8285986623846397e-06, "loss": 0.7138, "step": 2788 }, { "epoch": 0.8645046301677709, "grad_norm": 0.14735808734035533, "learning_rate": 1.8203873193807252e-06, "loss": 0.7106, "step": 2789 }, { "epoch": 0.8648145995582937, "grad_norm": 0.15208478789094873, "learning_rate": 1.8121935753163588e-06, "loss": 0.7556, "step": 2790 }, { "epoch": 0.8651245689488163, "grad_norm": 0.1426746353717772, "learning_rate": 1.8040174381235708e-06, "loss": 0.6828, "step": 2791 }, { "epoch": 0.865434538339339, "grad_norm": 0.14486496825644105, "learning_rate": 1.7958589157173477e-06, "loss": 0.7117, "step": 2792 }, { "epoch": 0.8657445077298617, "grad_norm": 0.14333766941917733, "learning_rate": 1.7877180159956164e-06, "loss": 0.6708, "step": 2793 }, { "epoch": 0.8660544771203844, "grad_norm": 0.20303278523953094, "learning_rate": 1.7795947468392526e-06, "loss": 0.6871, "step": 2794 }, { "epoch": 0.866364446510907, "grad_norm": 0.14287209084474287, "learning_rate": 1.7714891161120618e-06, "loss": 0.7045, "step": 2795 }, { "epoch": 0.8666744159014297, "grad_norm": 0.27989144594354654, "learning_rate": 1.763401131660769e-06, "loss": 0.7163, "step": 2796 }, { "epoch": 0.8669843852919524, "grad_norm": 0.16402072990364525, "learning_rate": 1.7553308013150228e-06, "loss": 0.7042, "step": 2797 }, { "epoch": 0.8672943546824751, "grad_norm": 0.15532192231190176, "learning_rate": 1.7472781328873867e-06, "loss": 0.6954, "step": 2798 }, { "epoch": 0.8676043240729978, "grad_norm": 0.14782167221133183, "learning_rate": 1.7392431341733095e-06, "loss": 0.7213, "step": 2799 }, { "epoch": 0.8679142934635204, "grad_norm": 0.14777441436483751, "learning_rate": 1.7312258129511516e-06, "loss": 0.7118, "step": 2800 }, { "epoch": 0.8682242628540432, "grad_norm": 0.15115160598544383, "learning_rate": 1.7232261769821512e-06, "loss": 0.6844, "step": 2801 }, { "epoch": 0.8685342322445658, "grad_norm": 0.14838045714706863, "learning_rate": 1.7152442340104247e-06, "loss": 0.6988, "step": 2802 }, { "epoch": 0.8688442016350886, "grad_norm": 0.17719941619060153, "learning_rate": 1.707279991762978e-06, "loss": 0.7176, "step": 2803 }, { "epoch": 0.8691541710256112, "grad_norm": 0.15137841114769277, "learning_rate": 1.69933345794965e-06, "loss": 0.6936, "step": 2804 }, { "epoch": 0.869464140416134, "grad_norm": 0.13945160593438916, "learning_rate": 1.6914046402631745e-06, "loss": 0.6878, "step": 2805 }, { "epoch": 0.8697741098066566, "grad_norm": 0.1469638488194714, "learning_rate": 1.6834935463790959e-06, "loss": 0.7207, "step": 2806 }, { "epoch": 0.8700840791971792, "grad_norm": 0.14651992539381656, "learning_rate": 1.6756001839558367e-06, "loss": 0.7161, "step": 2807 }, { "epoch": 0.870394048587702, "grad_norm": 0.23344320349692133, "learning_rate": 1.6677245606346338e-06, "loss": 0.7365, "step": 2808 }, { "epoch": 0.8707040179782246, "grad_norm": 0.1418349809303907, "learning_rate": 1.659866684039546e-06, "loss": 0.7221, "step": 2809 }, { "epoch": 0.8710139873687474, "grad_norm": 0.14958677181525493, "learning_rate": 1.6520265617774756e-06, "loss": 0.7239, "step": 2810 }, { "epoch": 0.87132395675927, "grad_norm": 0.1405656195670991, "learning_rate": 1.64420420143812e-06, "loss": 0.6848, "step": 2811 }, { "epoch": 0.8716339261497927, "grad_norm": 0.14803083132502462, "learning_rate": 1.636399610593984e-06, "loss": 0.6897, "step": 2812 }, { "epoch": 0.8719438955403154, "grad_norm": 0.14393596735135805, "learning_rate": 1.6286127968003752e-06, "loss": 0.7011, "step": 2813 }, { "epoch": 0.8722538649308381, "grad_norm": 0.14521606212776286, "learning_rate": 1.620843767595388e-06, "loss": 0.721, "step": 2814 }, { "epoch": 0.8725638343213608, "grad_norm": 0.14708748984113615, "learning_rate": 1.6130925304999024e-06, "loss": 0.7243, "step": 2815 }, { "epoch": 0.8728738037118835, "grad_norm": 0.14428650228044232, "learning_rate": 1.6053590930175756e-06, "loss": 0.7124, "step": 2816 }, { "epoch": 0.8731837731024061, "grad_norm": 0.1381571551908202, "learning_rate": 1.5976434626348303e-06, "loss": 0.6627, "step": 2817 }, { "epoch": 0.8734937424929288, "grad_norm": 0.1450057630178747, "learning_rate": 1.5899456468208541e-06, "loss": 0.6777, "step": 2818 }, { "epoch": 0.8738037118834515, "grad_norm": 0.14569534186187003, "learning_rate": 1.5822656530275837e-06, "loss": 0.7026, "step": 2819 }, { "epoch": 0.8741136812739742, "grad_norm": 0.14357047149658017, "learning_rate": 1.5746034886897121e-06, "loss": 0.7154, "step": 2820 }, { "epoch": 0.8744236506644969, "grad_norm": 0.14169179292387912, "learning_rate": 1.566959161224666e-06, "loss": 0.7071, "step": 2821 }, { "epoch": 0.8747336200550195, "grad_norm": 0.1513019631772212, "learning_rate": 1.5593326780326057e-06, "loss": 0.7185, "step": 2822 }, { "epoch": 0.8750435894455423, "grad_norm": 0.1466421368341544, "learning_rate": 1.5517240464964167e-06, "loss": 0.6822, "step": 2823 }, { "epoch": 0.8753535588360649, "grad_norm": 0.14103634840517615, "learning_rate": 1.5441332739817028e-06, "loss": 0.7087, "step": 2824 }, { "epoch": 0.8756635282265877, "grad_norm": 0.14205462254222853, "learning_rate": 1.5365603678367813e-06, "loss": 0.702, "step": 2825 }, { "epoch": 0.8759734976171103, "grad_norm": 0.14507791528402197, "learning_rate": 1.5290053353926814e-06, "loss": 0.7188, "step": 2826 }, { "epoch": 0.876283467007633, "grad_norm": 0.14307683031656104, "learning_rate": 1.5214681839631085e-06, "loss": 0.6953, "step": 2827 }, { "epoch": 0.8765934363981557, "grad_norm": 0.14387439302354013, "learning_rate": 1.5139489208444724e-06, "loss": 0.6945, "step": 2828 }, { "epoch": 0.8769034057886783, "grad_norm": 0.15964048800128852, "learning_rate": 1.506447553315875e-06, "loss": 0.7264, "step": 2829 }, { "epoch": 0.8772133751792011, "grad_norm": 0.14659370612627098, "learning_rate": 1.49896408863907e-06, "loss": 0.6853, "step": 2830 }, { "epoch": 0.8775233445697237, "grad_norm": 0.14753039838898302, "learning_rate": 1.4914985340585042e-06, "loss": 0.7243, "step": 2831 }, { "epoch": 0.8778333139602464, "grad_norm": 0.1451285880492613, "learning_rate": 1.4840508968012657e-06, "loss": 0.7134, "step": 2832 }, { "epoch": 0.8781432833507691, "grad_norm": 0.1401905505439032, "learning_rate": 1.4766211840771162e-06, "loss": 0.6908, "step": 2833 }, { "epoch": 0.8784532527412918, "grad_norm": 0.14789312590887174, "learning_rate": 1.4692094030784577e-06, "loss": 0.7073, "step": 2834 }, { "epoch": 0.8787632221318145, "grad_norm": 0.14244504811699898, "learning_rate": 1.4618155609803198e-06, "loss": 0.6717, "step": 2835 }, { "epoch": 0.8790731915223372, "grad_norm": 0.1445495074843753, "learning_rate": 1.4544396649403924e-06, "loss": 0.7083, "step": 2836 }, { "epoch": 0.8793831609128598, "grad_norm": 0.1415316513913604, "learning_rate": 1.4470817220989687e-06, "loss": 0.6944, "step": 2837 }, { "epoch": 0.8796931303033826, "grad_norm": 0.14673800567842982, "learning_rate": 1.4397417395789793e-06, "loss": 0.7248, "step": 2838 }, { "epoch": 0.8800030996939052, "grad_norm": 0.14373568297553793, "learning_rate": 1.4324197244859583e-06, "loss": 0.7148, "step": 2839 }, { "epoch": 0.880313069084428, "grad_norm": 0.1444558648610033, "learning_rate": 1.4251156839080493e-06, "loss": 0.7304, "step": 2840 }, { "epoch": 0.8806230384749506, "grad_norm": 0.14562424046396774, "learning_rate": 1.4178296249159961e-06, "loss": 0.7155, "step": 2841 }, { "epoch": 0.8809330078654732, "grad_norm": 0.14061247891282136, "learning_rate": 1.4105615545631346e-06, "loss": 0.6605, "step": 2842 }, { "epoch": 0.881242977255996, "grad_norm": 0.1439185137706457, "learning_rate": 1.403311479885383e-06, "loss": 0.6784, "step": 2843 }, { "epoch": 0.8815529466465186, "grad_norm": 0.14586315093545676, "learning_rate": 1.3960794079012452e-06, "loss": 0.6841, "step": 2844 }, { "epoch": 0.8818629160370414, "grad_norm": 0.1425796778970297, "learning_rate": 1.388865345611794e-06, "loss": 0.702, "step": 2845 }, { "epoch": 0.882172885427564, "grad_norm": 0.14209708476569755, "learning_rate": 1.3816693000006699e-06, "loss": 0.6915, "step": 2846 }, { "epoch": 0.8824828548180867, "grad_norm": 0.1417699922980872, "learning_rate": 1.3744912780340648e-06, "loss": 0.7054, "step": 2847 }, { "epoch": 0.8827928242086094, "grad_norm": 0.41288665317588913, "learning_rate": 1.3673312866607336e-06, "loss": 0.7215, "step": 2848 }, { "epoch": 0.8831027935991321, "grad_norm": 0.2049237186061246, "learning_rate": 1.3601893328119674e-06, "loss": 0.7218, "step": 2849 }, { "epoch": 0.8834127629896548, "grad_norm": 0.14219819155016253, "learning_rate": 1.353065423401605e-06, "loss": 0.6796, "step": 2850 }, { "epoch": 0.8837227323801775, "grad_norm": 0.15064133873384958, "learning_rate": 1.3459595653260027e-06, "loss": 0.7452, "step": 2851 }, { "epoch": 0.8840327017707001, "grad_norm": 0.14521627178871505, "learning_rate": 1.3388717654640626e-06, "loss": 0.7273, "step": 2852 }, { "epoch": 0.8843426711612228, "grad_norm": 0.14050700298748617, "learning_rate": 1.3318020306771851e-06, "loss": 0.6897, "step": 2853 }, { "epoch": 0.8846526405517455, "grad_norm": 0.14245196479535202, "learning_rate": 1.324750367809291e-06, "loss": 0.686, "step": 2854 }, { "epoch": 0.8849626099422682, "grad_norm": 0.14432364771972614, "learning_rate": 1.3177167836868155e-06, "loss": 0.6851, "step": 2855 }, { "epoch": 0.8852725793327909, "grad_norm": 0.14396956800005842, "learning_rate": 1.3107012851186718e-06, "loss": 0.6951, "step": 2856 }, { "epoch": 0.8855825487233135, "grad_norm": 0.14840633898660247, "learning_rate": 1.3037038788962896e-06, "loss": 0.7279, "step": 2857 }, { "epoch": 0.8858925181138363, "grad_norm": 0.14302075896052202, "learning_rate": 1.2967245717935594e-06, "loss": 0.7121, "step": 2858 }, { "epoch": 0.8862024875043589, "grad_norm": 0.14230749890667335, "learning_rate": 1.2897633705668722e-06, "loss": 0.7259, "step": 2859 }, { "epoch": 0.8865124568948817, "grad_norm": 0.14395575802558397, "learning_rate": 1.2828202819550839e-06, "loss": 0.6993, "step": 2860 }, { "epoch": 0.8868224262854043, "grad_norm": 0.14041848321105083, "learning_rate": 1.2758953126795026e-06, "loss": 0.7092, "step": 2861 }, { "epoch": 0.887132395675927, "grad_norm": 0.14251489361730793, "learning_rate": 1.2689884694439192e-06, "loss": 0.699, "step": 2862 }, { "epoch": 0.8874423650664497, "grad_norm": 0.14299859635892168, "learning_rate": 1.2620997589345585e-06, "loss": 0.6973, "step": 2863 }, { "epoch": 0.8877523344569723, "grad_norm": 0.14174758644323082, "learning_rate": 1.255229187820104e-06, "loss": 0.6851, "step": 2864 }, { "epoch": 0.8880623038474951, "grad_norm": 0.147174048914162, "learning_rate": 1.2483767627516752e-06, "loss": 0.7199, "step": 2865 }, { "epoch": 0.8883722732380177, "grad_norm": 0.14296898855818654, "learning_rate": 1.2415424903628237e-06, "loss": 0.6923, "step": 2866 }, { "epoch": 0.8886822426285405, "grad_norm": 0.14221401560420774, "learning_rate": 1.2347263772695262e-06, "loss": 0.7105, "step": 2867 }, { "epoch": 0.8889922120190631, "grad_norm": 0.14754256991748185, "learning_rate": 1.2279284300701866e-06, "loss": 0.7344, "step": 2868 }, { "epoch": 0.8893021814095858, "grad_norm": 0.14597877408801146, "learning_rate": 1.2211486553456164e-06, "loss": 0.7261, "step": 2869 }, { "epoch": 0.8896121508001085, "grad_norm": 0.1460626935489509, "learning_rate": 1.2143870596590413e-06, "loss": 0.708, "step": 2870 }, { "epoch": 0.8899221201906312, "grad_norm": 0.13904638499219563, "learning_rate": 1.2076436495560828e-06, "loss": 0.6776, "step": 2871 }, { "epoch": 0.8902320895811539, "grad_norm": 0.14326126479189488, "learning_rate": 1.2009184315647637e-06, "loss": 0.726, "step": 2872 }, { "epoch": 0.8905420589716766, "grad_norm": 0.14110057860804753, "learning_rate": 1.1942114121954895e-06, "loss": 0.6913, "step": 2873 }, { "epoch": 0.8908520283621992, "grad_norm": 0.14349538921902685, "learning_rate": 1.1875225979410532e-06, "loss": 0.6957, "step": 2874 }, { "epoch": 0.8911619977527219, "grad_norm": 0.13975491986338062, "learning_rate": 1.1808519952766217e-06, "loss": 0.6908, "step": 2875 }, { "epoch": 0.8914719671432446, "grad_norm": 0.14209021175613837, "learning_rate": 1.174199610659732e-06, "loss": 0.7075, "step": 2876 }, { "epoch": 0.8917819365337672, "grad_norm": 0.14588328898724393, "learning_rate": 1.1675654505302836e-06, "loss": 0.7162, "step": 2877 }, { "epoch": 0.89209190592429, "grad_norm": 0.1462741836254337, "learning_rate": 1.1609495213105438e-06, "loss": 0.7043, "step": 2878 }, { "epoch": 0.8924018753148126, "grad_norm": 0.1404570642174073, "learning_rate": 1.1543518294051115e-06, "loss": 0.7151, "step": 2879 }, { "epoch": 0.8927118447053354, "grad_norm": 0.1454828743578135, "learning_rate": 1.1477723812009534e-06, "loss": 0.7255, "step": 2880 }, { "epoch": 0.893021814095858, "grad_norm": 0.13940108806253118, "learning_rate": 1.1412111830673588e-06, "loss": 0.7085, "step": 2881 }, { "epoch": 0.8933317834863808, "grad_norm": 0.14118534689231177, "learning_rate": 1.1346682413559518e-06, "loss": 0.6873, "step": 2882 }, { "epoch": 0.8936417528769034, "grad_norm": 0.14695794690538408, "learning_rate": 1.1281435624006964e-06, "loss": 0.7244, "step": 2883 }, { "epoch": 0.8939517222674261, "grad_norm": 0.1451108458767314, "learning_rate": 1.121637152517856e-06, "loss": 0.7339, "step": 2884 }, { "epoch": 0.8942616916579488, "grad_norm": 0.14445852775994958, "learning_rate": 1.1151490180060276e-06, "loss": 0.7022, "step": 2885 }, { "epoch": 0.8945716610484714, "grad_norm": 0.14587151407241455, "learning_rate": 1.108679165146107e-06, "loss": 0.6919, "step": 2886 }, { "epoch": 0.8948816304389942, "grad_norm": 0.1366250309318356, "learning_rate": 1.1022276002012933e-06, "loss": 0.6937, "step": 2887 }, { "epoch": 0.8951915998295168, "grad_norm": 0.15887196651161084, "learning_rate": 1.0957943294170882e-06, "loss": 0.7154, "step": 2888 }, { "epoch": 0.8955015692200395, "grad_norm": 0.14400026196541338, "learning_rate": 1.0893793590212653e-06, "loss": 0.7191, "step": 2889 }, { "epoch": 0.8958115386105622, "grad_norm": 0.14133735218860974, "learning_rate": 1.0829826952239086e-06, "loss": 0.6832, "step": 2890 }, { "epoch": 0.8961215080010849, "grad_norm": 0.1465812002852079, "learning_rate": 1.0766043442173578e-06, "loss": 0.7222, "step": 2891 }, { "epoch": 0.8964314773916076, "grad_norm": 0.1470660241071487, "learning_rate": 1.0702443121762406e-06, "loss": 0.6913, "step": 2892 }, { "epoch": 0.8967414467821303, "grad_norm": 0.14305620551862747, "learning_rate": 1.0639026052574386e-06, "loss": 0.721, "step": 2893 }, { "epoch": 0.8970514161726529, "grad_norm": 0.13971444336108363, "learning_rate": 1.0575792296000987e-06, "loss": 0.7014, "step": 2894 }, { "epoch": 0.8973613855631757, "grad_norm": 0.1407155383836549, "learning_rate": 1.0512741913256264e-06, "loss": 0.6999, "step": 2895 }, { "epoch": 0.8976713549536983, "grad_norm": 0.14343855844422632, "learning_rate": 1.0449874965376683e-06, "loss": 0.7084, "step": 2896 }, { "epoch": 0.897981324344221, "grad_norm": 0.14121246168589072, "learning_rate": 1.0387191513221184e-06, "loss": 0.7067, "step": 2897 }, { "epoch": 0.8982912937347437, "grad_norm": 0.14337327430152474, "learning_rate": 1.0324691617471005e-06, "loss": 0.7083, "step": 2898 }, { "epoch": 0.8986012631252663, "grad_norm": 0.14326515165223105, "learning_rate": 1.0262375338629837e-06, "loss": 0.7047, "step": 2899 }, { "epoch": 0.8989112325157891, "grad_norm": 0.16513942514830732, "learning_rate": 1.0200242737023447e-06, "loss": 0.7181, "step": 2900 }, { "epoch": 0.8992212019063117, "grad_norm": 0.1441663326895941, "learning_rate": 1.0138293872799875e-06, "loss": 0.7069, "step": 2901 }, { "epoch": 0.8995311712968345, "grad_norm": 0.14727137059301276, "learning_rate": 1.0076528805929265e-06, "loss": 0.7091, "step": 2902 }, { "epoch": 0.8998411406873571, "grad_norm": 0.14203259535204849, "learning_rate": 1.0014947596203873e-06, "loss": 0.6999, "step": 2903 }, { "epoch": 0.9001511100778798, "grad_norm": 0.14289746326207997, "learning_rate": 9.95355030323799e-07, "loss": 0.7202, "step": 2904 }, { "epoch": 0.9004610794684025, "grad_norm": 0.14662762182536784, "learning_rate": 9.892336986467743e-07, "loss": 0.7057, "step": 2905 }, { "epoch": 0.9007710488589252, "grad_norm": 0.14158397715804175, "learning_rate": 9.831307705151328e-07, "loss": 0.688, "step": 2906 }, { "epoch": 0.9010810182494479, "grad_norm": 0.14087477102808935, "learning_rate": 9.77046251836864e-07, "loss": 0.6668, "step": 2907 }, { "epoch": 0.9013909876399705, "grad_norm": 0.14366980777509858, "learning_rate": 9.709801485021386e-07, "loss": 0.7133, "step": 2908 }, { "epoch": 0.9017009570304932, "grad_norm": 0.1447438590875137, "learning_rate": 9.64932466383315e-07, "loss": 0.7136, "step": 2909 }, { "epoch": 0.9020109264210159, "grad_norm": 0.13983292301531625, "learning_rate": 9.589032113348918e-07, "loss": 0.7001, "step": 2910 }, { "epoch": 0.9023208958115386, "grad_norm": 0.14251365533747126, "learning_rate": 9.52892389193556e-07, "loss": 0.6933, "step": 2911 }, { "epoch": 0.9026308652020613, "grad_norm": 0.13941908356403343, "learning_rate": 9.46900005778133e-07, "loss": 0.6942, "step": 2912 }, { "epoch": 0.902940834592584, "grad_norm": 0.1470687663064787, "learning_rate": 9.409260668896048e-07, "loss": 0.7148, "step": 2913 }, { "epoch": 0.9032508039831066, "grad_norm": 0.14324437353866726, "learning_rate": 9.34970578311103e-07, "loss": 0.7319, "step": 2914 }, { "epoch": 0.9035607733736294, "grad_norm": 0.13996851673818278, "learning_rate": 9.290335458078803e-07, "loss": 0.6889, "step": 2915 }, { "epoch": 0.903870742764152, "grad_norm": 0.1385953167895268, "learning_rate": 9.23114975127346e-07, "loss": 0.6914, "step": 2916 }, { "epoch": 0.9041807121546748, "grad_norm": 0.14054769485995225, "learning_rate": 9.172148719990237e-07, "loss": 0.705, "step": 2917 }, { "epoch": 0.9044906815451974, "grad_norm": 0.14479171607113622, "learning_rate": 9.1133324213456e-07, "loss": 0.6993, "step": 2918 }, { "epoch": 0.9048006509357202, "grad_norm": 0.14334978214799352, "learning_rate": 9.054700912277203e-07, "loss": 0.7078, "step": 2919 }, { "epoch": 0.9051106203262428, "grad_norm": 0.14042412210163663, "learning_rate": 8.996254249543823e-07, "loss": 0.6904, "step": 2920 }, { "epoch": 0.9054205897167654, "grad_norm": 0.14650532445397876, "learning_rate": 8.937992489725289e-07, "loss": 0.7808, "step": 2921 }, { "epoch": 0.9057305591072882, "grad_norm": 0.14529270391139298, "learning_rate": 8.879915689222418e-07, "loss": 0.7286, "step": 2922 }, { "epoch": 0.9060405284978108, "grad_norm": 0.14220326090103746, "learning_rate": 8.822023904256994e-07, "loss": 0.6957, "step": 2923 }, { "epoch": 0.9063504978883336, "grad_norm": 0.14487182563832052, "learning_rate": 8.764317190871652e-07, "loss": 0.7163, "step": 2924 }, { "epoch": 0.9066604672788562, "grad_norm": 0.1424721558606499, "learning_rate": 8.706795604929974e-07, "loss": 0.7043, "step": 2925 }, { "epoch": 0.9069704366693789, "grad_norm": 0.14677025473317787, "learning_rate": 8.649459202116195e-07, "loss": 0.701, "step": 2926 }, { "epoch": 0.9072804060599016, "grad_norm": 0.14503323060118412, "learning_rate": 8.592308037935359e-07, "loss": 0.7064, "step": 2927 }, { "epoch": 0.9075903754504243, "grad_norm": 0.1436800877738954, "learning_rate": 8.535342167713168e-07, "loss": 0.7028, "step": 2928 }, { "epoch": 0.907900344840947, "grad_norm": 0.14783259082543848, "learning_rate": 8.478561646595907e-07, "loss": 0.687, "step": 2929 }, { "epoch": 0.9082103142314697, "grad_norm": 0.1468394586971382, "learning_rate": 8.421966529550607e-07, "loss": 0.726, "step": 2930 }, { "epoch": 0.9085202836219923, "grad_norm": 0.14519920785222004, "learning_rate": 8.365556871364511e-07, "loss": 0.7283, "step": 2931 }, { "epoch": 0.908830253012515, "grad_norm": 0.14726186341593736, "learning_rate": 8.30933272664567e-07, "loss": 0.7164, "step": 2932 }, { "epoch": 0.9091402224030377, "grad_norm": 0.14120853452509763, "learning_rate": 8.253294149822277e-07, "loss": 0.6678, "step": 2933 }, { "epoch": 0.9094501917935603, "grad_norm": 0.14475794946845338, "learning_rate": 8.197441195142963e-07, "loss": 0.7289, "step": 2934 }, { "epoch": 0.9097601611840831, "grad_norm": 0.14368916971805054, "learning_rate": 8.141773916676809e-07, "loss": 0.7055, "step": 2935 }, { "epoch": 0.9100701305746057, "grad_norm": 0.16573631855760723, "learning_rate": 8.086292368312909e-07, "loss": 0.7139, "step": 2936 }, { "epoch": 0.9103800999651285, "grad_norm": 0.14349804672193572, "learning_rate": 8.030996603760744e-07, "loss": 0.6939, "step": 2937 }, { "epoch": 0.9106900693556511, "grad_norm": 0.14122109794460783, "learning_rate": 7.975886676549871e-07, "loss": 0.6755, "step": 2938 }, { "epoch": 0.9110000387461739, "grad_norm": 0.1442215607629505, "learning_rate": 7.92096264002995e-07, "loss": 0.6929, "step": 2939 }, { "epoch": 0.9113100081366965, "grad_norm": 0.2125378727302207, "learning_rate": 7.866224547370716e-07, "loss": 0.7124, "step": 2940 }, { "epoch": 0.9116199775272192, "grad_norm": 0.14285284650674937, "learning_rate": 7.811672451561847e-07, "loss": 0.7429, "step": 2941 }, { "epoch": 0.9119299469177419, "grad_norm": 0.1408902857541205, "learning_rate": 7.757306405413012e-07, "loss": 0.6826, "step": 2942 }, { "epoch": 0.9122399163082645, "grad_norm": 0.14456706675944475, "learning_rate": 7.703126461553756e-07, "loss": 0.7222, "step": 2943 }, { "epoch": 0.9125498856987873, "grad_norm": 0.14261239950986626, "learning_rate": 7.649132672433457e-07, "loss": 0.7133, "step": 2944 }, { "epoch": 0.9128598550893099, "grad_norm": 0.14389957500265643, "learning_rate": 7.595325090321304e-07, "loss": 0.7016, "step": 2945 }, { "epoch": 0.9131698244798326, "grad_norm": 0.14080231862912107, "learning_rate": 7.54170376730623e-07, "loss": 0.7195, "step": 2946 }, { "epoch": 0.9134797938703553, "grad_norm": 0.14178389061529534, "learning_rate": 7.488268755296823e-07, "loss": 0.7118, "step": 2947 }, { "epoch": 0.913789763260878, "grad_norm": 0.14562063725842567, "learning_rate": 7.435020106021329e-07, "loss": 0.7493, "step": 2948 }, { "epoch": 0.9140997326514007, "grad_norm": 0.1426697533102575, "learning_rate": 7.381957871027623e-07, "loss": 0.7014, "step": 2949 }, { "epoch": 0.9144097020419234, "grad_norm": 0.14278732198568345, "learning_rate": 7.329082101683038e-07, "loss": 0.6906, "step": 2950 }, { "epoch": 0.914719671432446, "grad_norm": 0.289138030538667, "learning_rate": 7.276392849174473e-07, "loss": 0.7221, "step": 2951 }, { "epoch": 0.9150296408229688, "grad_norm": 0.3026668745802733, "learning_rate": 7.22389016450824e-07, "loss": 0.6929, "step": 2952 }, { "epoch": 0.9153396102134914, "grad_norm": 0.1480921345385168, "learning_rate": 7.171574098510015e-07, "loss": 0.6812, "step": 2953 }, { "epoch": 0.915649579604014, "grad_norm": 0.14616129475864123, "learning_rate": 7.119444701824885e-07, "loss": 0.7042, "step": 2954 }, { "epoch": 0.9159595489945368, "grad_norm": 0.14211674722329665, "learning_rate": 7.067502024917106e-07, "loss": 0.7325, "step": 2955 }, { "epoch": 0.9162695183850594, "grad_norm": 0.1462025327736644, "learning_rate": 7.015746118070388e-07, "loss": 0.7032, "step": 2956 }, { "epoch": 0.9165794877755822, "grad_norm": 0.1590046773020044, "learning_rate": 6.964177031387387e-07, "loss": 0.6946, "step": 2957 }, { "epoch": 0.9168894571661048, "grad_norm": 0.17360772120786258, "learning_rate": 6.912794814790102e-07, "loss": 0.6804, "step": 2958 }, { "epoch": 0.9171994265566276, "grad_norm": 0.13785571721264583, "learning_rate": 6.861599518019501e-07, "loss": 0.6865, "step": 2959 }, { "epoch": 0.9175093959471502, "grad_norm": 0.14135870448176055, "learning_rate": 6.810591190635696e-07, "loss": 0.7001, "step": 2960 }, { "epoch": 0.9178193653376729, "grad_norm": 0.14408651666361846, "learning_rate": 6.759769882017764e-07, "loss": 0.7249, "step": 2961 }, { "epoch": 0.9181293347281956, "grad_norm": 0.14845338222737586, "learning_rate": 6.709135641363685e-07, "loss": 0.7354, "step": 2962 }, { "epoch": 0.9184393041187183, "grad_norm": 0.14390405527843583, "learning_rate": 6.658688517690493e-07, "loss": 0.7024, "step": 2963 }, { "epoch": 0.918749273509241, "grad_norm": 0.14777056737449656, "learning_rate": 6.608428559833879e-07, "loss": 0.7269, "step": 2964 }, { "epoch": 0.9190592428997636, "grad_norm": 0.13840677892467645, "learning_rate": 6.558355816448502e-07, "loss": 0.6782, "step": 2965 }, { "epoch": 0.9193692122902863, "grad_norm": 0.13901772576958957, "learning_rate": 6.508470336007744e-07, "loss": 0.7025, "step": 2966 }, { "epoch": 0.919679181680809, "grad_norm": 0.14388039465451624, "learning_rate": 6.458772166803706e-07, "loss": 0.7138, "step": 2967 }, { "epoch": 0.9199891510713317, "grad_norm": 0.21632026971408708, "learning_rate": 6.409261356947105e-07, "loss": 0.712, "step": 2968 }, { "epoch": 0.9202991204618544, "grad_norm": 0.14068681689770413, "learning_rate": 6.359937954367379e-07, "loss": 0.6868, "step": 2969 }, { "epoch": 0.9206090898523771, "grad_norm": 0.14284863698045533, "learning_rate": 6.310802006812488e-07, "loss": 0.7037, "step": 2970 }, { "epoch": 0.9209190592428997, "grad_norm": 0.14191966802703582, "learning_rate": 6.261853561848918e-07, "loss": 0.7068, "step": 2971 }, { "epoch": 0.9212290286334225, "grad_norm": 0.14102239496878954, "learning_rate": 6.213092666861676e-07, "loss": 0.7082, "step": 2972 }, { "epoch": 0.9215389980239451, "grad_norm": 0.14650635058107173, "learning_rate": 6.164519369054156e-07, "loss": 0.7005, "step": 2973 }, { "epoch": 0.9218489674144679, "grad_norm": 0.1530129835963939, "learning_rate": 6.116133715448213e-07, "loss": 0.7327, "step": 2974 }, { "epoch": 0.9221589368049905, "grad_norm": 0.1407008511411501, "learning_rate": 6.067935752884025e-07, "loss": 0.6827, "step": 2975 }, { "epoch": 0.9224689061955131, "grad_norm": 0.13856122941988716, "learning_rate": 6.019925528020044e-07, "loss": 0.6879, "step": 2976 }, { "epoch": 0.9227788755860359, "grad_norm": 0.1442437085618463, "learning_rate": 5.972103087333003e-07, "loss": 0.7061, "step": 2977 }, { "epoch": 0.9230888449765585, "grad_norm": 0.14025844188472686, "learning_rate": 5.924468477117851e-07, "loss": 0.697, "step": 2978 }, { "epoch": 0.9233988143670813, "grad_norm": 0.14587040643278365, "learning_rate": 5.877021743487766e-07, "loss": 0.7158, "step": 2979 }, { "epoch": 0.9237087837576039, "grad_norm": 0.13770924330030415, "learning_rate": 5.829762932373917e-07, "loss": 0.7028, "step": 2980 }, { "epoch": 0.9240187531481266, "grad_norm": 0.14219213614658457, "learning_rate": 5.782692089525643e-07, "loss": 0.7126, "step": 2981 }, { "epoch": 0.9243287225386493, "grad_norm": 0.1449216765105502, "learning_rate": 5.735809260510339e-07, "loss": 0.7195, "step": 2982 }, { "epoch": 0.924638691929172, "grad_norm": 0.14133444490652156, "learning_rate": 5.689114490713277e-07, "loss": 0.7059, "step": 2983 }, { "epoch": 0.9249486613196947, "grad_norm": 0.14292362853413376, "learning_rate": 5.642607825337853e-07, "loss": 0.7361, "step": 2984 }, { "epoch": 0.9252586307102174, "grad_norm": 0.1608699561938007, "learning_rate": 5.596289309405189e-07, "loss": 0.716, "step": 2985 }, { "epoch": 0.92556860010074, "grad_norm": 0.14810269745424265, "learning_rate": 5.550158987754372e-07, "loss": 0.6876, "step": 2986 }, { "epoch": 0.9258785694912628, "grad_norm": 0.13951566703879947, "learning_rate": 5.504216905042325e-07, "loss": 0.7071, "step": 2987 }, { "epoch": 0.9261885388817854, "grad_norm": 0.14934032233780706, "learning_rate": 5.458463105743605e-07, "loss": 0.7359, "step": 2988 }, { "epoch": 0.9264985082723081, "grad_norm": 0.14327521021172643, "learning_rate": 5.412897634150694e-07, "loss": 0.7173, "step": 2989 }, { "epoch": 0.9268084776628308, "grad_norm": 0.13905902047169127, "learning_rate": 5.367520534373571e-07, "loss": 0.687, "step": 2990 }, { "epoch": 0.9271184470533534, "grad_norm": 0.14463540832724894, "learning_rate": 5.32233185033999e-07, "loss": 0.7012, "step": 2991 }, { "epoch": 0.9274284164438762, "grad_norm": 0.14906469085226723, "learning_rate": 5.27733162579529e-07, "loss": 0.7206, "step": 2992 }, { "epoch": 0.9277383858343988, "grad_norm": 0.13514483980094089, "learning_rate": 5.232519904302336e-07, "loss": 0.6782, "step": 2993 }, { "epoch": 0.9280483552249216, "grad_norm": 0.1412876002792785, "learning_rate": 5.187896729241515e-07, "loss": 0.7245, "step": 2994 }, { "epoch": 0.9283583246154442, "grad_norm": 0.14436629518113936, "learning_rate": 5.143462143810696e-07, "loss": 0.7085, "step": 2995 }, { "epoch": 0.928668294005967, "grad_norm": 0.14709286139296285, "learning_rate": 5.09921619102518e-07, "loss": 0.7122, "step": 2996 }, { "epoch": 0.9289782633964896, "grad_norm": 0.13914794640945444, "learning_rate": 5.055158913717684e-07, "loss": 0.6995, "step": 2997 }, { "epoch": 0.9292882327870123, "grad_norm": 0.15056557448921776, "learning_rate": 5.011290354538223e-07, "loss": 0.7346, "step": 2998 }, { "epoch": 0.929598202177535, "grad_norm": 0.14256244248779548, "learning_rate": 4.967610555954206e-07, "loss": 0.7194, "step": 2999 }, { "epoch": 0.9299081715680576, "grad_norm": 0.1422612671126717, "learning_rate": 4.924119560250207e-07, "loss": 0.7066, "step": 3000 }, { "epoch": 0.9302181409585804, "grad_norm": 0.14193715880766822, "learning_rate": 4.880817409528105e-07, "loss": 0.7017, "step": 3001 }, { "epoch": 0.930528110349103, "grad_norm": 0.14326320597029255, "learning_rate": 4.837704145706946e-07, "loss": 0.7098, "step": 3002 }, { "epoch": 0.9308380797396257, "grad_norm": 0.16462735332802514, "learning_rate": 4.794779810522899e-07, "loss": 0.7011, "step": 3003 }, { "epoch": 0.9311480491301484, "grad_norm": 0.1392753063203525, "learning_rate": 4.752044445529258e-07, "loss": 0.7063, "step": 3004 }, { "epoch": 0.9314580185206711, "grad_norm": 0.14447679207618255, "learning_rate": 4.7094980920964204e-07, "loss": 0.7274, "step": 3005 }, { "epoch": 0.9317679879111938, "grad_norm": 0.14073595336644878, "learning_rate": 4.667140791411728e-07, "loss": 0.677, "step": 3006 }, { "epoch": 0.9320779573017165, "grad_norm": 0.1439583686154386, "learning_rate": 4.624972584479581e-07, "loss": 0.7081, "step": 3007 }, { "epoch": 0.9323879266922391, "grad_norm": 0.1421560551680614, "learning_rate": 4.582993512121281e-07, "loss": 0.7151, "step": 3008 }, { "epoch": 0.9326978960827619, "grad_norm": 0.1386174049850885, "learning_rate": 4.541203614975009e-07, "loss": 0.6854, "step": 3009 }, { "epoch": 0.9330078654732845, "grad_norm": 0.13997704228081456, "learning_rate": 4.499602933495961e-07, "loss": 0.6997, "step": 3010 }, { "epoch": 0.9333178348638071, "grad_norm": 0.14001933302397349, "learning_rate": 4.458191507955945e-07, "loss": 0.7032, "step": 3011 }, { "epoch": 0.9336278042543299, "grad_norm": 0.14288782043567835, "learning_rate": 4.4169693784437363e-07, "loss": 0.6975, "step": 3012 }, { "epoch": 0.9339377736448525, "grad_norm": 0.14369756037774317, "learning_rate": 4.3759365848647704e-07, "loss": 0.7153, "step": 3013 }, { "epoch": 0.9342477430353753, "grad_norm": 0.13978495750569148, "learning_rate": 4.3350931669412066e-07, "loss": 0.714, "step": 3014 }, { "epoch": 0.9345577124258979, "grad_norm": 0.14062990480681278, "learning_rate": 4.29443916421195e-07, "loss": 0.7023, "step": 3015 }, { "epoch": 0.9348676818164207, "grad_norm": 0.13965768676621745, "learning_rate": 4.2539746160323636e-07, "loss": 0.6771, "step": 3016 }, { "epoch": 0.9351776512069433, "grad_norm": 0.1395984976516905, "learning_rate": 4.213699561574602e-07, "loss": 0.7176, "step": 3017 }, { "epoch": 0.935487620597466, "grad_norm": 0.15259745736889951, "learning_rate": 4.1736140398273004e-07, "loss": 0.7366, "step": 3018 }, { "epoch": 0.9357975899879887, "grad_norm": 0.13988033665467248, "learning_rate": 4.133718089595595e-07, "loss": 0.7028, "step": 3019 }, { "epoch": 0.9361075593785114, "grad_norm": 0.14230325243700978, "learning_rate": 4.094011749501103e-07, "loss": 0.7161, "step": 3020 }, { "epoch": 0.9364175287690341, "grad_norm": 0.14134815534964332, "learning_rate": 4.0544950579819443e-07, "loss": 0.6954, "step": 3021 }, { "epoch": 0.9367274981595567, "grad_norm": 0.1404918058829581, "learning_rate": 4.015168053292584e-07, "loss": 0.6963, "step": 3022 }, { "epoch": 0.9370374675500794, "grad_norm": 0.14354382732617726, "learning_rate": 3.9760307735039027e-07, "loss": 0.7171, "step": 3023 }, { "epoch": 0.9373474369406021, "grad_norm": 0.14407886248702453, "learning_rate": 3.9370832565031045e-07, "loss": 0.7076, "step": 3024 }, { "epoch": 0.9376574063311248, "grad_norm": 0.14986246447896917, "learning_rate": 3.8983255399936747e-07, "loss": 0.7113, "step": 3025 }, { "epoch": 0.9379673757216475, "grad_norm": 0.14822892918892144, "learning_rate": 3.859757661495378e-07, "loss": 0.7606, "step": 3026 }, { "epoch": 0.9382773451121702, "grad_norm": 0.13819804534849564, "learning_rate": 3.821379658344215e-07, "loss": 0.6914, "step": 3027 }, { "epoch": 0.9385873145026928, "grad_norm": 0.14462698568818871, "learning_rate": 3.7831915676923347e-07, "loss": 0.7106, "step": 3028 }, { "epoch": 0.9388972838932156, "grad_norm": 0.1403643516617219, "learning_rate": 3.745193426508076e-07, "loss": 0.6908, "step": 3029 }, { "epoch": 0.9392072532837382, "grad_norm": 0.16937051061096986, "learning_rate": 3.7073852715758804e-07, "loss": 0.711, "step": 3030 }, { "epoch": 0.939517222674261, "grad_norm": 0.2392308421055891, "learning_rate": 3.669767139496294e-07, "loss": 0.7046, "step": 3031 }, { "epoch": 0.9398271920647836, "grad_norm": 0.14509617335871497, "learning_rate": 3.632339066685875e-07, "loss": 0.7368, "step": 3032 }, { "epoch": 0.9401371614553062, "grad_norm": 0.1442226760400472, "learning_rate": 3.5951010893772396e-07, "loss": 0.7038, "step": 3033 }, { "epoch": 0.940447130845829, "grad_norm": 0.14014048570568186, "learning_rate": 3.5580532436189084e-07, "loss": 0.6896, "step": 3034 }, { "epoch": 0.9407571002363516, "grad_norm": 0.1382646142046346, "learning_rate": 3.5211955652753925e-07, "loss": 0.6601, "step": 3035 }, { "epoch": 0.9410670696268744, "grad_norm": 0.13948978534631273, "learning_rate": 3.4845280900271506e-07, "loss": 0.6902, "step": 3036 }, { "epoch": 0.941377039017397, "grad_norm": 0.14283865603342846, "learning_rate": 3.4480508533703884e-07, "loss": 0.7137, "step": 3037 }, { "epoch": 0.9416870084079197, "grad_norm": 0.14093178823649105, "learning_rate": 3.4117638906173035e-07, "loss": 0.7202, "step": 3038 }, { "epoch": 0.9419969777984424, "grad_norm": 0.1427331582510297, "learning_rate": 3.3756672368957746e-07, "loss": 0.736, "step": 3039 }, { "epoch": 0.9423069471889651, "grad_norm": 0.13968527107086082, "learning_rate": 3.339760927149516e-07, "loss": 0.6871, "step": 3040 }, { "epoch": 0.9426169165794878, "grad_norm": 0.1409750959202098, "learning_rate": 3.304044996137967e-07, "loss": 0.6929, "step": 3041 }, { "epoch": 0.9429268859700105, "grad_norm": 0.1420026490088744, "learning_rate": 3.2685194784362053e-07, "loss": 0.7061, "step": 3042 }, { "epoch": 0.9432368553605331, "grad_norm": 0.1427147810545726, "learning_rate": 3.233184408435075e-07, "loss": 0.7372, "step": 3043 }, { "epoch": 0.9435468247510558, "grad_norm": 0.14023479740615577, "learning_rate": 3.198039820341015e-07, "loss": 0.6959, "step": 3044 }, { "epoch": 0.9438567941415785, "grad_norm": 0.13912076160280454, "learning_rate": 3.1630857481760535e-07, "loss": 0.6923, "step": 3045 }, { "epoch": 0.9441667635321012, "grad_norm": 0.14230400632207219, "learning_rate": 3.128322225777791e-07, "loss": 0.7208, "step": 3046 }, { "epoch": 0.9444767329226239, "grad_norm": 0.141199280372135, "learning_rate": 3.0937492867993966e-07, "loss": 0.6981, "step": 3047 }, { "epoch": 0.9447867023131465, "grad_norm": 0.1381062014171425, "learning_rate": 3.059366964709498e-07, "loss": 0.7187, "step": 3048 }, { "epoch": 0.9450966717036693, "grad_norm": 0.14359047047917098, "learning_rate": 3.025175292792204e-07, "loss": 0.7132, "step": 3049 }, { "epoch": 0.9454066410941919, "grad_norm": 0.14320001741647181, "learning_rate": 2.9911743041471044e-07, "loss": 0.7151, "step": 3050 }, { "epoch": 0.9457166104847147, "grad_norm": 0.14356369169220262, "learning_rate": 2.957364031689136e-07, "loss": 0.7257, "step": 3051 }, { "epoch": 0.9460265798752373, "grad_norm": 0.13595579496273644, "learning_rate": 2.923744508148696e-07, "loss": 0.6567, "step": 3052 }, { "epoch": 0.94633654926576, "grad_norm": 0.13789186660654631, "learning_rate": 2.8903157660713944e-07, "loss": 0.6948, "step": 3053 }, { "epoch": 0.9466465186562827, "grad_norm": 0.13868742875284962, "learning_rate": 2.8570778378182786e-07, "loss": 0.7054, "step": 3054 }, { "epoch": 0.9469564880468053, "grad_norm": 0.1412460958947571, "learning_rate": 2.8240307555656097e-07, "loss": 0.6921, "step": 3055 }, { "epoch": 0.9472664574373281, "grad_norm": 0.1420068215882202, "learning_rate": 2.791174551304887e-07, "loss": 0.6983, "step": 3056 }, { "epoch": 0.9475764268278507, "grad_norm": 0.14965567611276823, "learning_rate": 2.758509256842934e-07, "loss": 0.7151, "step": 3057 }, { "epoch": 0.9478863962183735, "grad_norm": 0.14002650437223996, "learning_rate": 2.726034903801633e-07, "loss": 0.6961, "step": 3058 }, { "epoch": 0.9481963656088961, "grad_norm": 0.16272354618100426, "learning_rate": 2.693751523618104e-07, "loss": 0.7107, "step": 3059 }, { "epoch": 0.9485063349994188, "grad_norm": 0.1358589442295398, "learning_rate": 2.661659147544526e-07, "loss": 0.6637, "step": 3060 }, { "epoch": 0.9488163043899415, "grad_norm": 0.15156010488824062, "learning_rate": 2.6297578066482254e-07, "loss": 0.7054, "step": 3061 }, { "epoch": 0.9491262737804642, "grad_norm": 0.14256575742881603, "learning_rate": 2.598047531811654e-07, "loss": 0.7103, "step": 3062 }, { "epoch": 0.9494362431709868, "grad_norm": 0.14167474479692682, "learning_rate": 2.5665283537321227e-07, "loss": 0.7271, "step": 3063 }, { "epoch": 0.9497462125615096, "grad_norm": 0.13693040852305713, "learning_rate": 2.5352003029221584e-07, "loss": 0.7002, "step": 3064 }, { "epoch": 0.9500561819520322, "grad_norm": 0.1438513485252067, "learning_rate": 2.504063409709101e-07, "loss": 0.7165, "step": 3065 }, { "epoch": 0.950366151342555, "grad_norm": 0.14125443763527837, "learning_rate": 2.473117704235328e-07, "loss": 0.7133, "step": 3066 }, { "epoch": 0.9506761207330776, "grad_norm": 0.1401201495340611, "learning_rate": 2.4423632164581213e-07, "loss": 0.683, "step": 3067 }, { "epoch": 0.9509860901236002, "grad_norm": 0.13974259857303045, "learning_rate": 2.4117999761496205e-07, "loss": 0.6951, "step": 3068 }, { "epoch": 0.951296059514123, "grad_norm": 0.14026971009959996, "learning_rate": 2.381428012896847e-07, "loss": 0.7046, "step": 3069 }, { "epoch": 0.9516060289046456, "grad_norm": 0.14528149943733026, "learning_rate": 2.3512473561016823e-07, "loss": 0.7193, "step": 3070 }, { "epoch": 0.9519159982951684, "grad_norm": 0.14050624045533575, "learning_rate": 2.321258034980778e-07, "loss": 0.7009, "step": 3071 }, { "epoch": 0.952225967685691, "grad_norm": 0.13849663435477844, "learning_rate": 2.291460078565555e-07, "loss": 0.6846, "step": 3072 }, { "epoch": 0.9525359370762138, "grad_norm": 0.13895523314290806, "learning_rate": 2.2618535157022058e-07, "loss": 0.6806, "step": 3073 }, { "epoch": 0.9528459064667364, "grad_norm": 0.1382744523353227, "learning_rate": 2.2324383750516264e-07, "loss": 0.7014, "step": 3074 }, { "epoch": 0.9531558758572591, "grad_norm": 0.17054706501392158, "learning_rate": 2.2032146850894166e-07, "loss": 0.7095, "step": 3075 }, { "epoch": 0.9534658452477818, "grad_norm": 0.14036012466532802, "learning_rate": 2.174182474105835e-07, "loss": 0.7234, "step": 3076 }, { "epoch": 0.9537758146383045, "grad_norm": 0.1394740433188166, "learning_rate": 2.1453417702057556e-07, "loss": 0.7134, "step": 3077 }, { "epoch": 0.9540857840288272, "grad_norm": 0.14065277933662043, "learning_rate": 2.116692601308734e-07, "loss": 0.7013, "step": 3078 }, { "epoch": 0.9543957534193498, "grad_norm": 0.13835387358206416, "learning_rate": 2.08823499514883e-07, "loss": 0.6877, "step": 3079 }, { "epoch": 0.9547057228098725, "grad_norm": 0.14600796851596617, "learning_rate": 2.0599689792746956e-07, "loss": 0.6977, "step": 3080 }, { "epoch": 0.9550156922003952, "grad_norm": 0.14196485364244552, "learning_rate": 2.0318945810494873e-07, "loss": 0.7085, "step": 3081 }, { "epoch": 0.9553256615909179, "grad_norm": 0.1411175444354257, "learning_rate": 2.0040118276508647e-07, "loss": 0.7151, "step": 3082 }, { "epoch": 0.9556356309814406, "grad_norm": 0.13912975556255366, "learning_rate": 1.9763207460710587e-07, "loss": 0.7117, "step": 3083 }, { "epoch": 0.9559456003719633, "grad_norm": 0.14136246228668348, "learning_rate": 1.9488213631166043e-07, "loss": 0.6871, "step": 3084 }, { "epoch": 0.9562555697624859, "grad_norm": 0.1374614256509199, "learning_rate": 1.921513705408562e-07, "loss": 0.6976, "step": 3085 }, { "epoch": 0.9565655391530087, "grad_norm": 0.14217941081481922, "learning_rate": 1.8943977993823193e-07, "loss": 0.6992, "step": 3086 }, { "epoch": 0.9568755085435313, "grad_norm": 0.13957423477004016, "learning_rate": 1.8674736712877006e-07, "loss": 0.6999, "step": 3087 }, { "epoch": 0.9571854779340541, "grad_norm": 0.14384236579829257, "learning_rate": 1.8407413471889012e-07, "loss": 0.7258, "step": 3088 }, { "epoch": 0.9574954473245767, "grad_norm": 0.1388488311194664, "learning_rate": 1.8142008529642875e-07, "loss": 0.6876, "step": 3089 }, { "epoch": 0.9578054167150993, "grad_norm": 0.13997364950913477, "learning_rate": 1.787852214306729e-07, "loss": 0.6999, "step": 3090 }, { "epoch": 0.9581153861056221, "grad_norm": 0.14162637397184863, "learning_rate": 1.7616954567232003e-07, "loss": 0.6986, "step": 3091 }, { "epoch": 0.9584253554961447, "grad_norm": 0.13732679561608782, "learning_rate": 1.735730605535002e-07, "loss": 0.6794, "step": 3092 }, { "epoch": 0.9587353248866675, "grad_norm": 0.13836189115574218, "learning_rate": 1.70995768587765e-07, "loss": 0.7037, "step": 3093 }, { "epoch": 0.9590452942771901, "grad_norm": 0.15440544183597715, "learning_rate": 1.6843767227008756e-07, "loss": 0.7004, "step": 3094 }, { "epoch": 0.9593552636677128, "grad_norm": 0.1459122588245586, "learning_rate": 1.658987740768514e-07, "loss": 0.6809, "step": 3095 }, { "epoch": 0.9596652330582355, "grad_norm": 0.1450516764994146, "learning_rate": 1.6337907646586381e-07, "loss": 0.712, "step": 3096 }, { "epoch": 0.9599752024487582, "grad_norm": 0.14209216108866893, "learning_rate": 1.6087858187634252e-07, "loss": 0.7126, "step": 3097 }, { "epoch": 0.9602851718392809, "grad_norm": 0.14299260707319678, "learning_rate": 1.5839729272890903e-07, "loss": 0.736, "step": 3098 }, { "epoch": 0.9605951412298036, "grad_norm": 0.14495437030087657, "learning_rate": 1.5593521142559964e-07, "loss": 0.71, "step": 3099 }, { "epoch": 0.9609051106203262, "grad_norm": 0.14142191016008693, "learning_rate": 1.534923403498567e-07, "loss": 0.7082, "step": 3100 }, { "epoch": 0.9612150800108489, "grad_norm": 0.1431236322323679, "learning_rate": 1.5106868186652412e-07, "loss": 0.7086, "step": 3101 }, { "epoch": 0.9615250494013716, "grad_norm": 0.13652430231156434, "learning_rate": 1.4866423832184285e-07, "loss": 0.6972, "step": 3102 }, { "epoch": 0.9618350187918943, "grad_norm": 0.13986521146472464, "learning_rate": 1.4627901204345763e-07, "loss": 0.7005, "step": 3103 }, { "epoch": 0.962144988182417, "grad_norm": 0.14524501094612377, "learning_rate": 1.439130053404103e-07, "loss": 0.7239, "step": 3104 }, { "epoch": 0.9624549575729396, "grad_norm": 0.13866524475213618, "learning_rate": 1.4156622050313317e-07, "loss": 0.6995, "step": 3105 }, { "epoch": 0.9627649269634624, "grad_norm": 0.1392802013473805, "learning_rate": 1.3923865980345564e-07, "loss": 0.7269, "step": 3106 }, { "epoch": 0.963074896353985, "grad_norm": 0.13896611592221686, "learning_rate": 1.3693032549459306e-07, "loss": 0.6723, "step": 3107 }, { "epoch": 0.9633848657445078, "grad_norm": 0.1432886481134157, "learning_rate": 1.3464121981114463e-07, "loss": 0.7352, "step": 3108 }, { "epoch": 0.9636948351350304, "grad_norm": 0.13986533146186322, "learning_rate": 1.3237134496910664e-07, "loss": 0.6781, "step": 3109 }, { "epoch": 0.9640048045255531, "grad_norm": 0.13788655616486448, "learning_rate": 1.301207031658458e-07, "loss": 0.6994, "step": 3110 }, { "epoch": 0.9643147739160758, "grad_norm": 0.14224381532070568, "learning_rate": 1.278892965801237e-07, "loss": 0.7152, "step": 3111 }, { "epoch": 0.9646247433065984, "grad_norm": 0.13956951209110993, "learning_rate": 1.2567712737206804e-07, "loss": 0.6874, "step": 3112 }, { "epoch": 0.9649347126971212, "grad_norm": 0.1406415149858836, "learning_rate": 1.234841976831902e-07, "loss": 0.6789, "step": 3113 }, { "epoch": 0.9652446820876438, "grad_norm": 0.14178711807139094, "learning_rate": 1.2131050963638092e-07, "loss": 0.7204, "step": 3114 }, { "epoch": 0.9655546514781665, "grad_norm": 0.14032218936932103, "learning_rate": 1.191560653358903e-07, "loss": 0.6623, "step": 3115 }, { "epoch": 0.9658646208686892, "grad_norm": 0.14376151618789987, "learning_rate": 1.1702086686735448e-07, "loss": 0.7159, "step": 3116 }, { "epoch": 0.9661745902592119, "grad_norm": 0.13518101110358852, "learning_rate": 1.1490491629776667e-07, "loss": 0.6749, "step": 3117 }, { "epoch": 0.9664845596497346, "grad_norm": 0.1365801613825561, "learning_rate": 1.1280821567549505e-07, "loss": 0.6934, "step": 3118 }, { "epoch": 0.9667945290402573, "grad_norm": 0.13909238239014285, "learning_rate": 1.1073076703027153e-07, "loss": 0.7114, "step": 3119 }, { "epoch": 0.96710449843078, "grad_norm": 0.1423320612595569, "learning_rate": 1.0867257237318519e-07, "loss": 0.7442, "step": 3120 }, { "epoch": 0.9674144678213027, "grad_norm": 0.1411785599582699, "learning_rate": 1.0663363369669333e-07, "loss": 0.6732, "step": 3121 }, { "epoch": 0.9677244372118253, "grad_norm": 0.139296404033744, "learning_rate": 1.0461395297460597e-07, "loss": 0.7277, "step": 3122 }, { "epoch": 0.968034406602348, "grad_norm": 0.1387170670264265, "learning_rate": 1.0261353216209691e-07, "loss": 0.7067, "step": 3123 }, { "epoch": 0.9683443759928707, "grad_norm": 0.1395275523472799, "learning_rate": 1.0063237319569042e-07, "loss": 0.7166, "step": 3124 }, { "epoch": 0.9686543453833933, "grad_norm": 0.1420292044086915, "learning_rate": 9.867047799326346e-08, "loss": 0.6971, "step": 3125 }, { "epoch": 0.9689643147739161, "grad_norm": 0.14103692606516674, "learning_rate": 9.672784845404792e-08, "loss": 0.7031, "step": 3126 }, { "epoch": 0.9692742841644387, "grad_norm": 0.14235033550216997, "learning_rate": 9.480448645862617e-08, "loss": 0.7146, "step": 3127 }, { "epoch": 0.9695842535549615, "grad_norm": 0.1361285619633862, "learning_rate": 9.290039386892213e-08, "loss": 0.694, "step": 3128 }, { "epoch": 0.9698942229454841, "grad_norm": 0.16433420204626917, "learning_rate": 9.101557252821247e-08, "loss": 0.7336, "step": 3129 }, { "epoch": 0.9702041923360069, "grad_norm": 0.13879607318803916, "learning_rate": 8.915002426111763e-08, "loss": 0.7032, "step": 3130 }, { "epoch": 0.9705141617265295, "grad_norm": 0.14056001251004796, "learning_rate": 8.7303750873593e-08, "loss": 0.6967, "step": 3131 }, { "epoch": 0.9708241311170522, "grad_norm": 0.13805765142599968, "learning_rate": 8.547675415294665e-08, "loss": 0.7187, "step": 3132 }, { "epoch": 0.9711341005075749, "grad_norm": 0.14051772003127058, "learning_rate": 8.366903586781494e-08, "loss": 0.7121, "step": 3133 }, { "epoch": 0.9714440698980976, "grad_norm": 0.13961482682230023, "learning_rate": 8.188059776817803e-08, "loss": 0.7123, "step": 3134 }, { "epoch": 0.9717540392886203, "grad_norm": 0.1384155844460683, "learning_rate": 8.011144158534878e-08, "loss": 0.6784, "step": 3135 }, { "epoch": 0.9720640086791429, "grad_norm": 0.13965523056957604, "learning_rate": 7.836156903197279e-08, "loss": 0.7161, "step": 3136 }, { "epoch": 0.9723739780696656, "grad_norm": 0.1404738204093845, "learning_rate": 7.663098180203721e-08, "loss": 0.6918, "step": 3137 }, { "epoch": 0.9726839474601883, "grad_norm": 0.1443083503422536, "learning_rate": 7.491968157084418e-08, "loss": 0.7045, "step": 3138 }, { "epoch": 0.972993916850711, "grad_norm": 0.14292133064132068, "learning_rate": 7.322766999503961e-08, "loss": 0.7036, "step": 3139 }, { "epoch": 0.9733038862412337, "grad_norm": 0.14580924597282777, "learning_rate": 7.155494871258884e-08, "loss": 0.7342, "step": 3140 }, { "epoch": 0.9736138556317564, "grad_norm": 0.1413259861181631, "learning_rate": 6.990151934278322e-08, "loss": 0.7021, "step": 3141 }, { "epoch": 0.973923825022279, "grad_norm": 0.14003966115672023, "learning_rate": 6.82673834862424e-08, "loss": 0.6894, "step": 3142 }, { "epoch": 0.9742337944128018, "grad_norm": 0.13863876090171567, "learning_rate": 6.66525427249054e-08, "loss": 0.6889, "step": 3143 }, { "epoch": 0.9745437638033244, "grad_norm": 0.14483172477483355, "learning_rate": 6.505699862203285e-08, "loss": 0.7497, "step": 3144 }, { "epoch": 0.9748537331938472, "grad_norm": 0.14053753730021376, "learning_rate": 6.348075272220922e-08, "loss": 0.7234, "step": 3145 }, { "epoch": 0.9751637025843698, "grad_norm": 0.13715406217273918, "learning_rate": 6.192380655132946e-08, "loss": 0.7021, "step": 3146 }, { "epoch": 0.9754736719748924, "grad_norm": 0.13989200700189383, "learning_rate": 6.038616161661015e-08, "loss": 0.7178, "step": 3147 }, { "epoch": 0.9757836413654152, "grad_norm": 0.1377535155429957, "learning_rate": 5.886781940658504e-08, "loss": 0.6792, "step": 3148 }, { "epoch": 0.9760936107559378, "grad_norm": 0.13615308274019458, "learning_rate": 5.736878139109614e-08, "loss": 0.6874, "step": 3149 }, { "epoch": 0.9764035801464606, "grad_norm": 0.22480000549187368, "learning_rate": 5.588904902130266e-08, "loss": 0.681, "step": 3150 }, { "epoch": 0.9767135495369832, "grad_norm": 0.14087384338224176, "learning_rate": 5.442862372967428e-08, "loss": 0.7193, "step": 3151 }, { "epoch": 0.9770235189275059, "grad_norm": 0.14417976022511447, "learning_rate": 5.298750692998456e-08, "loss": 0.7194, "step": 3152 }, { "epoch": 0.9773334883180286, "grad_norm": 0.14408880435500598, "learning_rate": 5.1565700017324196e-08, "loss": 0.7046, "step": 3153 }, { "epoch": 0.9776434577085513, "grad_norm": 0.14315426780358176, "learning_rate": 5.0163204368083305e-08, "loss": 0.7372, "step": 3154 }, { "epoch": 0.977953427099074, "grad_norm": 0.1396946917454336, "learning_rate": 4.878002133996251e-08, "loss": 0.7039, "step": 3155 }, { "epoch": 0.9782633964895967, "grad_norm": 0.14072017229662015, "learning_rate": 4.741615227196627e-08, "loss": 0.7029, "step": 3156 }, { "epoch": 0.9785733658801193, "grad_norm": 0.13730850315221726, "learning_rate": 4.607159848439402e-08, "loss": 0.6822, "step": 3157 }, { "epoch": 0.978883335270642, "grad_norm": 0.14438727610883542, "learning_rate": 4.4746361278860116e-08, "loss": 0.707, "step": 3158 }, { "epoch": 0.9791933046611647, "grad_norm": 0.31448166173166364, "learning_rate": 4.344044193826946e-08, "loss": 0.6831, "step": 3159 }, { "epoch": 0.9795032740516874, "grad_norm": 0.13945752038415152, "learning_rate": 4.215384172683079e-08, "loss": 0.6972, "step": 3160 }, { "epoch": 0.9798132434422101, "grad_norm": 0.13682009392197242, "learning_rate": 4.088656189004558e-08, "loss": 0.6802, "step": 3161 }, { "epoch": 0.9801232128327327, "grad_norm": 0.2353046633896138, "learning_rate": 3.9638603654719163e-08, "loss": 0.6864, "step": 3162 }, { "epoch": 0.9804331822232555, "grad_norm": 0.13873261546522186, "learning_rate": 3.840996822894738e-08, "loss": 0.6835, "step": 3163 }, { "epoch": 0.9807431516137781, "grad_norm": 0.1427618166733423, "learning_rate": 3.720065680212326e-08, "loss": 0.7106, "step": 3164 }, { "epoch": 0.9810531210043009, "grad_norm": 0.1734397217251119, "learning_rate": 3.6010670544930346e-08, "loss": 0.7048, "step": 3165 }, { "epoch": 0.9813630903948235, "grad_norm": 0.3251096329784772, "learning_rate": 3.4840010609344944e-08, "loss": 0.671, "step": 3166 }, { "epoch": 0.9816730597853462, "grad_norm": 0.13810708077377465, "learning_rate": 3.36886781286383e-08, "loss": 0.6922, "step": 3167 }, { "epoch": 0.9819830291758689, "grad_norm": 0.14846487421408708, "learning_rate": 3.255667421736552e-08, "loss": 0.7713, "step": 3168 }, { "epoch": 0.9822929985663915, "grad_norm": 0.14601645008298336, "learning_rate": 3.1443999971372265e-08, "loss": 0.7392, "step": 3169 }, { "epoch": 0.9826029679569143, "grad_norm": 0.13788252919404181, "learning_rate": 3.035065646779467e-08, "loss": 0.704, "step": 3170 }, { "epoch": 0.9829129373474369, "grad_norm": 0.13722516784698982, "learning_rate": 2.9276644765054985e-08, "loss": 0.6916, "step": 3171 }, { "epoch": 0.9832229067379596, "grad_norm": 0.1414097676896304, "learning_rate": 2.8221965902859306e-08, "loss": 0.7244, "step": 3172 }, { "epoch": 0.9835328761284823, "grad_norm": 0.1355990670075512, "learning_rate": 2.718662090219759e-08, "loss": 0.6628, "step": 3173 }, { "epoch": 0.983842845519005, "grad_norm": 0.13986196462339545, "learning_rate": 2.6170610765348102e-08, "loss": 0.7195, "step": 3174 }, { "epoch": 0.9841528149095277, "grad_norm": 0.14263849228037276, "learning_rate": 2.517393647586408e-08, "loss": 0.7191, "step": 3175 }, { "epoch": 0.9844627843000504, "grad_norm": 0.1561205437758421, "learning_rate": 2.4196598998589283e-08, "loss": 0.7293, "step": 3176 }, { "epoch": 0.984772753690573, "grad_norm": 0.13709827366862978, "learning_rate": 2.323859927964245e-08, "loss": 0.7012, "step": 3177 }, { "epoch": 0.9850827230810958, "grad_norm": 0.1405594946858409, "learning_rate": 2.2299938246423958e-08, "loss": 0.703, "step": 3178 }, { "epoch": 0.9853926924716184, "grad_norm": 0.14154641695085518, "learning_rate": 2.1380616807613607e-08, "loss": 0.7021, "step": 3179 }, { "epoch": 0.9857026618621411, "grad_norm": 0.1363007487311652, "learning_rate": 2.0480635853168397e-08, "loss": 0.701, "step": 3180 }, { "epoch": 0.9860126312526638, "grad_norm": 0.13728536353826892, "learning_rate": 1.9599996254322518e-08, "loss": 0.6998, "step": 3181 }, { "epoch": 0.9863226006431864, "grad_norm": 0.13766796077231558, "learning_rate": 1.873869886358959e-08, "loss": 0.6802, "step": 3182 }, { "epoch": 0.9866325700337092, "grad_norm": 0.14009898084070954, "learning_rate": 1.789674451475154e-08, "loss": 0.7001, "step": 3183 }, { "epoch": 0.9869425394242318, "grad_norm": 0.13543642966091046, "learning_rate": 1.707413402287639e-08, "loss": 0.6938, "step": 3184 }, { "epoch": 0.9872525088147546, "grad_norm": 0.14867385096585084, "learning_rate": 1.6270868184296017e-08, "loss": 0.7049, "step": 3185 }, { "epoch": 0.9875624782052772, "grad_norm": 0.14129426189002597, "learning_rate": 1.54869477766173e-08, "loss": 0.71, "step": 3186 }, { "epoch": 0.9878724475958, "grad_norm": 0.1427118468610065, "learning_rate": 1.472237355872652e-08, "loss": 0.6643, "step": 3187 }, { "epoch": 0.9881824169863226, "grad_norm": 0.1432432174446173, "learning_rate": 1.3977146270771625e-08, "loss": 0.7341, "step": 3188 }, { "epoch": 0.9884923863768453, "grad_norm": 0.1383759268654872, "learning_rate": 1.3251266634182191e-08, "loss": 0.693, "step": 3189 }, { "epoch": 0.988802355767368, "grad_norm": 0.1381597502951157, "learning_rate": 1.2544735351647241e-08, "loss": 0.6971, "step": 3190 }, { "epoch": 0.9891123251578906, "grad_norm": 0.1458657391460303, "learning_rate": 1.1857553107132991e-08, "loss": 0.7193, "step": 3191 }, { "epoch": 0.9894222945484134, "grad_norm": 0.14025368535441352, "learning_rate": 1.1189720565873974e-08, "loss": 0.7173, "step": 3192 }, { "epoch": 0.989732263938936, "grad_norm": 0.14211139651476465, "learning_rate": 1.05412383743686e-08, "loss": 0.7205, "step": 3193 }, { "epoch": 0.9900422333294587, "grad_norm": 0.1427218346279645, "learning_rate": 9.91210716038804e-09, "loss": 0.7329, "step": 3194 }, { "epoch": 0.9903522027199814, "grad_norm": 0.14037645964453083, "learning_rate": 9.302327532969558e-09, "loss": 0.7058, "step": 3195 }, { "epoch": 0.9906621721105041, "grad_norm": 0.14681801300583638, "learning_rate": 8.711900082412072e-09, "loss": 0.691, "step": 3196 }, { "epoch": 0.9909721415010267, "grad_norm": 0.13628660308821575, "learning_rate": 8.140825380287266e-09, "loss": 0.6721, "step": 3197 }, { "epoch": 0.9912821108915495, "grad_norm": 0.14037957489718692, "learning_rate": 7.58910397942847e-09, "loss": 0.7323, "step": 3198 }, { "epoch": 0.9915920802820721, "grad_norm": 0.16783891718568586, "learning_rate": 7.056736413935117e-09, "loss": 0.7044, "step": 3199 }, { "epoch": 0.9919020496725949, "grad_norm": 0.13758182412342512, "learning_rate": 6.543723199170515e-09, "loss": 0.6924, "step": 3200 }, { "epoch": 0.9922120190631175, "grad_norm": 0.14024468713569335, "learning_rate": 6.050064831759628e-09, "loss": 0.7135, "step": 3201 }, { "epoch": 0.9925219884536403, "grad_norm": 0.1421734295267628, "learning_rate": 5.5757617895935144e-09, "loss": 0.702, "step": 3202 }, { "epoch": 0.9928319578441629, "grad_norm": 0.13714219177197312, "learning_rate": 5.120814531829332e-09, "loss": 0.6885, "step": 3203 }, { "epoch": 0.9931419272346855, "grad_norm": 0.13957758890616162, "learning_rate": 4.685223498877012e-09, "loss": 0.7112, "step": 3204 }, { "epoch": 0.9934518966252083, "grad_norm": 0.14096912954963134, "learning_rate": 4.268989112419242e-09, "loss": 0.7447, "step": 3205 }, { "epoch": 0.9937618660157309, "grad_norm": 0.13859188555154953, "learning_rate": 3.872111775393705e-09, "loss": 0.6935, "step": 3206 }, { "epoch": 0.9940718354062537, "grad_norm": 0.1396603293614873, "learning_rate": 3.4945918720019622e-09, "loss": 0.6795, "step": 3207 }, { "epoch": 0.9943818047967763, "grad_norm": 0.13671215985307633, "learning_rate": 3.136429767705007e-09, "loss": 0.6747, "step": 3208 }, { "epoch": 0.994691774187299, "grad_norm": 0.14389896170043612, "learning_rate": 2.79762580922327e-09, "loss": 0.7122, "step": 3209 }, { "epoch": 0.9950017435778217, "grad_norm": 0.13749058219676877, "learning_rate": 2.4781803245410574e-09, "loss": 0.6877, "step": 3210 }, { "epoch": 0.9953117129683444, "grad_norm": 0.1419618727430252, "learning_rate": 2.1780936228998904e-09, "loss": 0.6964, "step": 3211 }, { "epoch": 0.995621682358867, "grad_norm": 0.1456234611968163, "learning_rate": 1.897365994800726e-09, "loss": 0.71, "step": 3212 }, { "epoch": 0.9959316517493898, "grad_norm": 0.24228252176056686, "learning_rate": 1.6359977120061765e-09, "loss": 0.7095, "step": 3213 }, { "epoch": 0.9962416211399124, "grad_norm": 0.14023253827014284, "learning_rate": 1.3939890275338486e-09, "loss": 0.6801, "step": 3214 }, { "epoch": 0.9965515905304351, "grad_norm": 0.13828874141836353, "learning_rate": 1.1713401756652253e-09, "loss": 0.7088, "step": 3215 }, { "epoch": 0.9968615599209578, "grad_norm": 0.13998005665225852, "learning_rate": 9.680513719345642e-10, "loss": 0.7, "step": 3216 }, { "epoch": 0.9971715293114805, "grad_norm": 0.14332863384549646, "learning_rate": 7.841228131399981e-10, "loss": 0.7342, "step": 3217 }, { "epoch": 0.9974814987020032, "grad_norm": 0.1536877273250328, "learning_rate": 6.19554677334655e-10, "loss": 0.6986, "step": 3218 }, { "epoch": 0.9977914680925258, "grad_norm": 0.14066594022196796, "learning_rate": 4.743471238288777e-10, "loss": 0.71, "step": 3219 }, { "epoch": 0.9981014374830486, "grad_norm": 0.1393181259403004, "learning_rate": 3.485002931946646e-10, "loss": 0.7046, "step": 3220 }, { "epoch": 0.9984114068735712, "grad_norm": 0.14475342977298608, "learning_rate": 2.420143072567882e-10, "loss": 0.708, "step": 3221 }, { "epoch": 0.998721376264094, "grad_norm": 0.13884400547252093, "learning_rate": 1.548892691016768e-10, "loss": 0.707, "step": 3222 }, { "epoch": 0.9990313456546166, "grad_norm": 0.14232022519031393, "learning_rate": 8.712526306853264e-11, "loss": 0.7189, "step": 3223 }, { "epoch": 0.9993413150451393, "grad_norm": 0.13757338039181813, "learning_rate": 3.872235476043429e-11, "loss": 0.7071, "step": 3224 }, { "epoch": 0.999651284435662, "grad_norm": 0.13631778303533673, "learning_rate": 9.680591033234265e-12, "loss": 0.6814, "step": 3225 }, { "epoch": 0.9999612538261846, "grad_norm": 0.14775163834268715, "learning_rate": 0.0, "loss": 0.7025, "step": 3226 }, { "epoch": 0.9999612538261846, "step": 3226, "total_flos": 6219175525613568.0, "train_loss": 0.7459644535204231, "train_runtime": 59291.6308, "train_samples_per_second": 24.376, "train_steps_per_second": 0.054 } ], "logging_steps": 1, "max_steps": 3226, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6219175525613568.0, "train_batch_size": 7, "trial_name": null, "trial_params": null }