{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4764, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020990764063811922, "grad_norm": 0.8211116790771484, "learning_rate": 2.09643605870021e-07, "loss": 0.9775, "step": 1 }, { "epoch": 0.00041981528127623844, "grad_norm": 0.7017123103141785, "learning_rate": 4.19287211740042e-07, "loss": 0.868, "step": 2 }, { "epoch": 0.0006297229219143577, "grad_norm": 0.6900752782821655, "learning_rate": 6.28930817610063e-07, "loss": 0.8862, "step": 3 }, { "epoch": 0.0008396305625524769, "grad_norm": 0.7597951292991638, "learning_rate": 8.38574423480084e-07, "loss": 0.923, "step": 4 }, { "epoch": 0.0010495382031905961, "grad_norm": 0.8734421730041504, "learning_rate": 1.048218029350105e-06, "loss": 1.0745, "step": 5 }, { "epoch": 0.0012594458438287153, "grad_norm": 0.8313158750534058, "learning_rate": 1.257861635220126e-06, "loss": 0.9817, "step": 6 }, { "epoch": 0.0014693534844668346, "grad_norm": 0.656913697719574, "learning_rate": 1.467505241090147e-06, "loss": 0.8544, "step": 7 }, { "epoch": 0.0016792611251049538, "grad_norm": 0.7147523164749146, "learning_rate": 1.677148846960168e-06, "loss": 0.8955, "step": 8 }, { "epoch": 0.001889168765743073, "grad_norm": 0.753795325756073, "learning_rate": 1.8867924528301887e-06, "loss": 0.881, "step": 9 }, { "epoch": 0.0020990764063811922, "grad_norm": 0.7141973972320557, "learning_rate": 2.09643605870021e-06, "loss": 0.9229, "step": 10 }, { "epoch": 0.0023089840470193117, "grad_norm": 0.8047632575035095, "learning_rate": 2.306079664570231e-06, "loss": 0.9438, "step": 11 }, { "epoch": 0.0025188916876574307, "grad_norm": 0.8492773175239563, "learning_rate": 2.515723270440252e-06, "loss": 0.9977, "step": 12 }, { "epoch": 0.00272879932829555, "grad_norm": 0.744103729724884, "learning_rate": 2.7253668763102727e-06, "loss": 0.9049, "step": 13 }, { "epoch": 0.002938706968933669, "grad_norm": 0.7073290944099426, "learning_rate": 2.935010482180294e-06, "loss": 0.8824, "step": 14 }, { "epoch": 0.0031486146095717885, "grad_norm": 0.6808626651763916, "learning_rate": 3.1446540880503146e-06, "loss": 0.8854, "step": 15 }, { "epoch": 0.0033585222502099076, "grad_norm": 0.6526097655296326, "learning_rate": 3.354297693920336e-06, "loss": 0.8219, "step": 16 }, { "epoch": 0.003568429890848027, "grad_norm": 0.8246333003044128, "learning_rate": 3.563941299790356e-06, "loss": 0.9948, "step": 17 }, { "epoch": 0.003778337531486146, "grad_norm": 0.8494473099708557, "learning_rate": 3.7735849056603773e-06, "loss": 1.0187, "step": 18 }, { "epoch": 0.003988245172124265, "grad_norm": 0.9259452223777771, "learning_rate": 3.9832285115303985e-06, "loss": 1.0864, "step": 19 }, { "epoch": 0.0041981528127623844, "grad_norm": 0.8235483765602112, "learning_rate": 4.19287211740042e-06, "loss": 1.0255, "step": 20 }, { "epoch": 0.004408060453400504, "grad_norm": 0.8101536631584167, "learning_rate": 4.40251572327044e-06, "loss": 0.9892, "step": 21 }, { "epoch": 0.004617968094038623, "grad_norm": 0.7406070232391357, "learning_rate": 4.612159329140462e-06, "loss": 0.9266, "step": 22 }, { "epoch": 0.004827875734676742, "grad_norm": 0.7151230573654175, "learning_rate": 4.821802935010482e-06, "loss": 0.902, "step": 23 }, { "epoch": 0.005037783375314861, "grad_norm": 0.6921148896217346, "learning_rate": 5.031446540880504e-06, "loss": 0.8621, "step": 24 }, { "epoch": 0.005247691015952981, "grad_norm": 0.6169761419296265, "learning_rate": 5.241090146750524e-06, "loss": 0.8296, "step": 25 }, { "epoch": 0.0054575986565911, "grad_norm": 0.679093599319458, "learning_rate": 5.4507337526205454e-06, "loss": 0.8593, "step": 26 }, { "epoch": 0.005667506297229219, "grad_norm": 0.8058671951293945, "learning_rate": 5.660377358490566e-06, "loss": 0.9504, "step": 27 }, { "epoch": 0.005877413937867338, "grad_norm": 0.8705667853355408, "learning_rate": 5.870020964360588e-06, "loss": 1.0245, "step": 28 }, { "epoch": 0.006087321578505458, "grad_norm": 0.7334048748016357, "learning_rate": 6.079664570230608e-06, "loss": 0.9172, "step": 29 }, { "epoch": 0.006297229219143577, "grad_norm": 0.7490406632423401, "learning_rate": 6.289308176100629e-06, "loss": 0.9329, "step": 30 }, { "epoch": 0.006507136859781696, "grad_norm": 0.8739404678344727, "learning_rate": 6.49895178197065e-06, "loss": 0.9584, "step": 31 }, { "epoch": 0.006717044500419815, "grad_norm": 0.7686614990234375, "learning_rate": 6.708595387840672e-06, "loss": 0.9493, "step": 32 }, { "epoch": 0.0069269521410579345, "grad_norm": 0.9184572100639343, "learning_rate": 6.918238993710692e-06, "loss": 1.0678, "step": 33 }, { "epoch": 0.007136859781696054, "grad_norm": 0.8303453326225281, "learning_rate": 7.127882599580712e-06, "loss": 0.9834, "step": 34 }, { "epoch": 0.0073467674223341725, "grad_norm": 0.6670796871185303, "learning_rate": 7.337526205450735e-06, "loss": 0.8434, "step": 35 }, { "epoch": 0.007556675062972292, "grad_norm": 0.6518784761428833, "learning_rate": 7.547169811320755e-06, "loss": 0.8445, "step": 36 }, { "epoch": 0.007766582703610411, "grad_norm": 0.8396414518356323, "learning_rate": 7.756813417190776e-06, "loss": 0.9229, "step": 37 }, { "epoch": 0.00797649034424853, "grad_norm": 0.9019722938537598, "learning_rate": 7.966457023060797e-06, "loss": 0.9545, "step": 38 }, { "epoch": 0.00818639798488665, "grad_norm": 0.6634522676467896, "learning_rate": 8.176100628930818e-06, "loss": 0.8279, "step": 39 }, { "epoch": 0.008396305625524769, "grad_norm": 0.86155104637146, "learning_rate": 8.38574423480084e-06, "loss": 0.9326, "step": 40 }, { "epoch": 0.008606213266162888, "grad_norm": 0.6448208689689636, "learning_rate": 8.59538784067086e-06, "loss": 0.8261, "step": 41 }, { "epoch": 0.008816120906801008, "grad_norm": 0.7442598938941956, "learning_rate": 8.80503144654088e-06, "loss": 0.9352, "step": 42 }, { "epoch": 0.009026028547439127, "grad_norm": 0.8854546546936035, "learning_rate": 9.014675052410902e-06, "loss": 1.017, "step": 43 }, { "epoch": 0.009235936188077247, "grad_norm": 0.66485196352005, "learning_rate": 9.224318658280923e-06, "loss": 0.8096, "step": 44 }, { "epoch": 0.009445843828715366, "grad_norm": 0.5998132824897766, "learning_rate": 9.433962264150944e-06, "loss": 0.7932, "step": 45 }, { "epoch": 0.009655751469353484, "grad_norm": 0.6402536034584045, "learning_rate": 9.643605870020965e-06, "loss": 0.8146, "step": 46 }, { "epoch": 0.009865659109991603, "grad_norm": 0.6665769219398499, "learning_rate": 9.853249475890985e-06, "loss": 0.8435, "step": 47 }, { "epoch": 0.010075566750629723, "grad_norm": 0.6947203874588013, "learning_rate": 1.0062893081761008e-05, "loss": 0.8712, "step": 48 }, { "epoch": 0.010285474391267842, "grad_norm": 0.6707759499549866, "learning_rate": 1.0272536687631027e-05, "loss": 0.8382, "step": 49 }, { "epoch": 0.010495382031905962, "grad_norm": 0.6716253161430359, "learning_rate": 1.0482180293501048e-05, "loss": 0.8887, "step": 50 }, { "epoch": 0.010705289672544081, "grad_norm": 0.6954374313354492, "learning_rate": 1.069182389937107e-05, "loss": 0.895, "step": 51 }, { "epoch": 0.0109151973131822, "grad_norm": 0.5633372068405151, "learning_rate": 1.0901467505241091e-05, "loss": 0.7971, "step": 52 }, { "epoch": 0.01112510495382032, "grad_norm": 0.5513401031494141, "learning_rate": 1.1111111111111112e-05, "loss": 0.7848, "step": 53 }, { "epoch": 0.011335012594458438, "grad_norm": 0.6735419034957886, "learning_rate": 1.1320754716981132e-05, "loss": 0.8558, "step": 54 }, { "epoch": 0.011544920235096557, "grad_norm": 0.5273690223693848, "learning_rate": 1.1530398322851153e-05, "loss": 0.7531, "step": 55 }, { "epoch": 0.011754827875734676, "grad_norm": 0.612610936164856, "learning_rate": 1.1740041928721176e-05, "loss": 0.8009, "step": 56 }, { "epoch": 0.011964735516372796, "grad_norm": 0.5250133275985718, "learning_rate": 1.1949685534591196e-05, "loss": 0.7411, "step": 57 }, { "epoch": 0.012174643157010915, "grad_norm": 0.6210602521896362, "learning_rate": 1.2159329140461215e-05, "loss": 0.814, "step": 58 }, { "epoch": 0.012384550797649035, "grad_norm": 0.5767892003059387, "learning_rate": 1.2368972746331238e-05, "loss": 0.7512, "step": 59 }, { "epoch": 0.012594458438287154, "grad_norm": 0.5112836360931396, "learning_rate": 1.2578616352201259e-05, "loss": 0.7405, "step": 60 }, { "epoch": 0.012804366078925274, "grad_norm": 0.5214811563491821, "learning_rate": 1.2788259958071281e-05, "loss": 0.7152, "step": 61 }, { "epoch": 0.013014273719563391, "grad_norm": 0.5820367932319641, "learning_rate": 1.29979035639413e-05, "loss": 0.7359, "step": 62 }, { "epoch": 0.01322418136020151, "grad_norm": 0.5728296637535095, "learning_rate": 1.320754716981132e-05, "loss": 0.7096, "step": 63 }, { "epoch": 0.01343408900083963, "grad_norm": 0.514997124671936, "learning_rate": 1.3417190775681343e-05, "loss": 0.6945, "step": 64 }, { "epoch": 0.01364399664147775, "grad_norm": 0.5707572102546692, "learning_rate": 1.3626834381551362e-05, "loss": 0.7065, "step": 65 }, { "epoch": 0.013853904282115869, "grad_norm": 0.5674712657928467, "learning_rate": 1.3836477987421385e-05, "loss": 0.6759, "step": 66 }, { "epoch": 0.014063811922753989, "grad_norm": 0.5445975661277771, "learning_rate": 1.4046121593291406e-05, "loss": 0.6487, "step": 67 }, { "epoch": 0.014273719563392108, "grad_norm": 0.5629355311393738, "learning_rate": 1.4255765199161425e-05, "loss": 0.6635, "step": 68 }, { "epoch": 0.014483627204030227, "grad_norm": 0.47151610255241394, "learning_rate": 1.4465408805031447e-05, "loss": 0.5976, "step": 69 }, { "epoch": 0.014693534844668345, "grad_norm": 0.44633767008781433, "learning_rate": 1.467505241090147e-05, "loss": 0.5738, "step": 70 }, { "epoch": 0.014903442485306465, "grad_norm": 0.48507657647132874, "learning_rate": 1.488469601677149e-05, "loss": 0.5859, "step": 71 }, { "epoch": 0.015113350125944584, "grad_norm": 0.4147733151912689, "learning_rate": 1.509433962264151e-05, "loss": 0.5386, "step": 72 }, { "epoch": 0.015323257766582703, "grad_norm": 0.635608434677124, "learning_rate": 1.530398322851153e-05, "loss": 0.6529, "step": 73 }, { "epoch": 0.015533165407220823, "grad_norm": 0.5556919574737549, "learning_rate": 1.5513626834381552e-05, "loss": 0.5935, "step": 74 }, { "epoch": 0.015743073047858942, "grad_norm": 0.5627433657646179, "learning_rate": 1.572327044025157e-05, "loss": 0.5677, "step": 75 }, { "epoch": 0.01595298068849706, "grad_norm": 0.5727344155311584, "learning_rate": 1.5932914046121594e-05, "loss": 0.5701, "step": 76 }, { "epoch": 0.01616288832913518, "grad_norm": 0.5192092657089233, "learning_rate": 1.6142557651991616e-05, "loss": 0.537, "step": 77 }, { "epoch": 0.0163727959697733, "grad_norm": 0.6583610773086548, "learning_rate": 1.6352201257861635e-05, "loss": 0.5718, "step": 78 }, { "epoch": 0.01658270361041142, "grad_norm": 0.4762994050979614, "learning_rate": 1.6561844863731658e-05, "loss": 0.5016, "step": 79 }, { "epoch": 0.016792611251049538, "grad_norm": 0.7903013825416565, "learning_rate": 1.677148846960168e-05, "loss": 0.6123, "step": 80 }, { "epoch": 0.01700251889168766, "grad_norm": 0.6027877330780029, "learning_rate": 1.69811320754717e-05, "loss": 0.5085, "step": 81 }, { "epoch": 0.017212426532325777, "grad_norm": 0.6400225162506104, "learning_rate": 1.719077568134172e-05, "loss": 0.5325, "step": 82 }, { "epoch": 0.017422334172963894, "grad_norm": 0.5193424224853516, "learning_rate": 1.740041928721174e-05, "loss": 0.476, "step": 83 }, { "epoch": 0.017632241813602016, "grad_norm": 0.5318325757980347, "learning_rate": 1.761006289308176e-05, "loss": 0.4574, "step": 84 }, { "epoch": 0.017842149454240133, "grad_norm": 0.5530166029930115, "learning_rate": 1.7819706498951782e-05, "loss": 0.4331, "step": 85 }, { "epoch": 0.018052057094878254, "grad_norm": 0.5483909845352173, "learning_rate": 1.8029350104821805e-05, "loss": 0.4254, "step": 86 }, { "epoch": 0.018261964735516372, "grad_norm": 0.4599871039390564, "learning_rate": 1.8238993710691824e-05, "loss": 0.3828, "step": 87 }, { "epoch": 0.018471872376154493, "grad_norm": 0.47489672899246216, "learning_rate": 1.8448637316561846e-05, "loss": 0.3989, "step": 88 }, { "epoch": 0.01868178001679261, "grad_norm": 0.6532279253005981, "learning_rate": 1.865828092243187e-05, "loss": 0.3968, "step": 89 }, { "epoch": 0.018891687657430732, "grad_norm": 0.707245945930481, "learning_rate": 1.8867924528301888e-05, "loss": 0.4099, "step": 90 }, { "epoch": 0.01910159529806885, "grad_norm": 0.4290582835674286, "learning_rate": 1.9077568134171907e-05, "loss": 0.3546, "step": 91 }, { "epoch": 0.019311502938706968, "grad_norm": 0.4370197057723999, "learning_rate": 1.928721174004193e-05, "loss": 0.3519, "step": 92 }, { "epoch": 0.01952141057934509, "grad_norm": 0.4862878918647766, "learning_rate": 1.9496855345911952e-05, "loss": 0.3494, "step": 93 }, { "epoch": 0.019731318219983206, "grad_norm": 0.514576256275177, "learning_rate": 1.970649895178197e-05, "loss": 0.326, "step": 94 }, { "epoch": 0.019941225860621328, "grad_norm": 0.3785364031791687, "learning_rate": 1.9916142557651993e-05, "loss": 0.3196, "step": 95 }, { "epoch": 0.020151133501259445, "grad_norm": 0.4459572732448578, "learning_rate": 2.0125786163522016e-05, "loss": 0.3245, "step": 96 }, { "epoch": 0.020361041141897566, "grad_norm": 0.3634876310825348, "learning_rate": 2.0335429769392035e-05, "loss": 0.2999, "step": 97 }, { "epoch": 0.020570948782535684, "grad_norm": 0.39789989590644836, "learning_rate": 2.0545073375262054e-05, "loss": 0.2908, "step": 98 }, { "epoch": 0.020780856423173802, "grad_norm": 0.3628767430782318, "learning_rate": 2.0754716981132076e-05, "loss": 0.278, "step": 99 }, { "epoch": 0.020990764063811923, "grad_norm": 0.3945654332637787, "learning_rate": 2.0964360587002095e-05, "loss": 0.2902, "step": 100 }, { "epoch": 0.02120067170445004, "grad_norm": 0.2995467185974121, "learning_rate": 2.1174004192872118e-05, "loss": 0.2748, "step": 101 }, { "epoch": 0.021410579345088162, "grad_norm": 0.2776371240615845, "learning_rate": 2.138364779874214e-05, "loss": 0.2772, "step": 102 }, { "epoch": 0.02162048698572628, "grad_norm": 0.292316734790802, "learning_rate": 2.159329140461216e-05, "loss": 0.2732, "step": 103 }, { "epoch": 0.0218303946263644, "grad_norm": 0.26235565543174744, "learning_rate": 2.1802935010482182e-05, "loss": 0.2592, "step": 104 }, { "epoch": 0.02204030226700252, "grad_norm": 0.2782291769981384, "learning_rate": 2.2012578616352204e-05, "loss": 0.2586, "step": 105 }, { "epoch": 0.02225020990764064, "grad_norm": 0.24781855940818787, "learning_rate": 2.2222222222222223e-05, "loss": 0.2571, "step": 106 }, { "epoch": 0.022460117548278757, "grad_norm": 0.3645837604999542, "learning_rate": 2.2431865828092242e-05, "loss": 0.247, "step": 107 }, { "epoch": 0.022670025188916875, "grad_norm": 0.4096992313861847, "learning_rate": 2.2641509433962265e-05, "loss": 0.2648, "step": 108 }, { "epoch": 0.022879932829554996, "grad_norm": 0.36801740527153015, "learning_rate": 2.2851153039832284e-05, "loss": 0.2677, "step": 109 }, { "epoch": 0.023089840470193114, "grad_norm": 0.221563920378685, "learning_rate": 2.3060796645702306e-05, "loss": 0.2544, "step": 110 }, { "epoch": 0.023299748110831235, "grad_norm": 0.17734010517597198, "learning_rate": 2.327044025157233e-05, "loss": 0.2458, "step": 111 }, { "epoch": 0.023509655751469353, "grad_norm": 0.23815220594406128, "learning_rate": 2.348008385744235e-05, "loss": 0.2249, "step": 112 }, { "epoch": 0.023719563392107474, "grad_norm": 0.24534103274345398, "learning_rate": 2.368972746331237e-05, "loss": 0.2283, "step": 113 }, { "epoch": 0.02392947103274559, "grad_norm": 0.17044654488563538, "learning_rate": 2.3899371069182393e-05, "loss": 0.2543, "step": 114 }, { "epoch": 0.02413937867338371, "grad_norm": 0.22411420941352844, "learning_rate": 2.4109014675052412e-05, "loss": 0.2537, "step": 115 }, { "epoch": 0.02434928631402183, "grad_norm": 0.17107880115509033, "learning_rate": 2.431865828092243e-05, "loss": 0.2477, "step": 116 }, { "epoch": 0.02455919395465995, "grad_norm": 0.19663883745670319, "learning_rate": 2.4528301886792453e-05, "loss": 0.2503, "step": 117 }, { "epoch": 0.02476910159529807, "grad_norm": 0.2755718231201172, "learning_rate": 2.4737945492662476e-05, "loss": 0.201, "step": 118 }, { "epoch": 0.024979009235936187, "grad_norm": 0.16535572707653046, "learning_rate": 2.4947589098532495e-05, "loss": 0.2364, "step": 119 }, { "epoch": 0.02518891687657431, "grad_norm": 0.15798090398311615, "learning_rate": 2.5157232704402517e-05, "loss": 0.214, "step": 120 }, { "epoch": 0.025398824517212426, "grad_norm": 0.1900860220193863, "learning_rate": 2.5366876310272536e-05, "loss": 0.2182, "step": 121 }, { "epoch": 0.025608732157850547, "grad_norm": 0.18855144083499908, "learning_rate": 2.5576519916142562e-05, "loss": 0.2123, "step": 122 }, { "epoch": 0.025818639798488665, "grad_norm": 0.1780049353837967, "learning_rate": 2.578616352201258e-05, "loss": 0.2197, "step": 123 }, { "epoch": 0.026028547439126783, "grad_norm": 0.27040061354637146, "learning_rate": 2.59958071278826e-05, "loss": 0.2395, "step": 124 }, { "epoch": 0.026238455079764904, "grad_norm": 0.1983417570590973, "learning_rate": 2.6205450733752623e-05, "loss": 0.2197, "step": 125 }, { "epoch": 0.02644836272040302, "grad_norm": 0.1723383665084839, "learning_rate": 2.641509433962264e-05, "loss": 0.2183, "step": 126 }, { "epoch": 0.026658270361041143, "grad_norm": 0.24477605521678925, "learning_rate": 2.662473794549266e-05, "loss": 0.2293, "step": 127 }, { "epoch": 0.02686817800167926, "grad_norm": 0.15110079944133759, "learning_rate": 2.6834381551362687e-05, "loss": 0.2245, "step": 128 }, { "epoch": 0.02707808564231738, "grad_norm": 0.22366492450237274, "learning_rate": 2.7044025157232706e-05, "loss": 0.2031, "step": 129 }, { "epoch": 0.0272879932829555, "grad_norm": 0.32891571521759033, "learning_rate": 2.7253668763102725e-05, "loss": 0.1821, "step": 130 }, { "epoch": 0.02749790092359362, "grad_norm": 0.20027081668376923, "learning_rate": 2.746331236897275e-05, "loss": 0.223, "step": 131 }, { "epoch": 0.027707808564231738, "grad_norm": 0.2269366830587387, "learning_rate": 2.767295597484277e-05, "loss": 0.2246, "step": 132 }, { "epoch": 0.027917716204869856, "grad_norm": 0.1355280727148056, "learning_rate": 2.788259958071279e-05, "loss": 0.2183, "step": 133 }, { "epoch": 0.028127623845507977, "grad_norm": 0.29291626811027527, "learning_rate": 2.809224318658281e-05, "loss": 0.2049, "step": 134 }, { "epoch": 0.028337531486146095, "grad_norm": 0.1778186410665512, "learning_rate": 2.830188679245283e-05, "loss": 0.2095, "step": 135 }, { "epoch": 0.028547439126784216, "grad_norm": 0.23263931274414062, "learning_rate": 2.851153039832285e-05, "loss": 0.2372, "step": 136 }, { "epoch": 0.028757346767422334, "grad_norm": 0.2121749222278595, "learning_rate": 2.8721174004192875e-05, "loss": 0.2107, "step": 137 }, { "epoch": 0.028967254408060455, "grad_norm": 0.19954991340637207, "learning_rate": 2.8930817610062894e-05, "loss": 0.2181, "step": 138 }, { "epoch": 0.029177162048698572, "grad_norm": 0.15431921184062958, "learning_rate": 2.9140461215932913e-05, "loss": 0.2198, "step": 139 }, { "epoch": 0.02938706968933669, "grad_norm": 0.17603729665279388, "learning_rate": 2.935010482180294e-05, "loss": 0.2339, "step": 140 }, { "epoch": 0.02959697732997481, "grad_norm": 0.1604471355676651, "learning_rate": 2.9559748427672958e-05, "loss": 0.2064, "step": 141 }, { "epoch": 0.02980688497061293, "grad_norm": 0.17169184982776642, "learning_rate": 2.976939203354298e-05, "loss": 0.1987, "step": 142 }, { "epoch": 0.03001679261125105, "grad_norm": 0.1285230815410614, "learning_rate": 2.9979035639413e-05, "loss": 0.2342, "step": 143 }, { "epoch": 0.030226700251889168, "grad_norm": 0.1755395084619522, "learning_rate": 3.018867924528302e-05, "loss": 0.222, "step": 144 }, { "epoch": 0.03043660789252729, "grad_norm": 0.15474987030029297, "learning_rate": 3.0398322851153044e-05, "loss": 0.2159, "step": 145 }, { "epoch": 0.030646515533165407, "grad_norm": 0.12986472249031067, "learning_rate": 3.060796645702306e-05, "loss": 0.2124, "step": 146 }, { "epoch": 0.030856423173803528, "grad_norm": 0.1458188146352768, "learning_rate": 3.081761006289308e-05, "loss": 0.214, "step": 147 }, { "epoch": 0.031066330814441646, "grad_norm": 0.1323792040348053, "learning_rate": 3.1027253668763105e-05, "loss": 0.2153, "step": 148 }, { "epoch": 0.03127623845507976, "grad_norm": 0.16542711853981018, "learning_rate": 3.1236897274633124e-05, "loss": 0.2154, "step": 149 }, { "epoch": 0.031486146095717885, "grad_norm": 0.17730407416820526, "learning_rate": 3.144654088050314e-05, "loss": 0.2202, "step": 150 }, { "epoch": 0.031696053736356006, "grad_norm": 0.15039502084255219, "learning_rate": 3.165618448637317e-05, "loss": 0.2056, "step": 151 }, { "epoch": 0.03190596137699412, "grad_norm": 0.20309150218963623, "learning_rate": 3.186582809224319e-05, "loss": 0.2175, "step": 152 }, { "epoch": 0.03211586901763224, "grad_norm": 0.16652604937553406, "learning_rate": 3.207547169811321e-05, "loss": 0.2076, "step": 153 }, { "epoch": 0.03232577665827036, "grad_norm": 0.14530467987060547, "learning_rate": 3.228511530398323e-05, "loss": 0.2167, "step": 154 }, { "epoch": 0.032535684298908484, "grad_norm": 0.13003528118133545, "learning_rate": 3.249475890985325e-05, "loss": 0.2089, "step": 155 }, { "epoch": 0.0327455919395466, "grad_norm": 0.16985855996608734, "learning_rate": 3.270440251572327e-05, "loss": 0.1994, "step": 156 }, { "epoch": 0.03295549958018472, "grad_norm": 0.18479777872562408, "learning_rate": 3.29140461215933e-05, "loss": 0.2127, "step": 157 }, { "epoch": 0.03316540722082284, "grad_norm": 0.1541491150856018, "learning_rate": 3.3123689727463316e-05, "loss": 0.2208, "step": 158 }, { "epoch": 0.033375314861460954, "grad_norm": 0.13511165976524353, "learning_rate": 3.3333333333333335e-05, "loss": 0.218, "step": 159 }, { "epoch": 0.033585222502099076, "grad_norm": 0.1392865628004074, "learning_rate": 3.354297693920336e-05, "loss": 0.226, "step": 160 }, { "epoch": 0.0337951301427372, "grad_norm": 0.1614847183227539, "learning_rate": 3.375262054507338e-05, "loss": 0.2072, "step": 161 }, { "epoch": 0.03400503778337532, "grad_norm": 0.12186679244041443, "learning_rate": 3.39622641509434e-05, "loss": 0.2193, "step": 162 }, { "epoch": 0.03421494542401343, "grad_norm": 0.1777278333902359, "learning_rate": 3.417190775681342e-05, "loss": 0.202, "step": 163 }, { "epoch": 0.03442485306465155, "grad_norm": 0.13323499262332916, "learning_rate": 3.438155136268344e-05, "loss": 0.1983, "step": 164 }, { "epoch": 0.034634760705289674, "grad_norm": 0.22301942110061646, "learning_rate": 3.4591194968553456e-05, "loss": 0.2252, "step": 165 }, { "epoch": 0.03484466834592779, "grad_norm": 0.1538372039794922, "learning_rate": 3.480083857442348e-05, "loss": 0.2071, "step": 166 }, { "epoch": 0.03505457598656591, "grad_norm": 0.128251314163208, "learning_rate": 3.50104821802935e-05, "loss": 0.214, "step": 167 }, { "epoch": 0.03526448362720403, "grad_norm": 0.26556313037872314, "learning_rate": 3.522012578616352e-05, "loss": 0.1863, "step": 168 }, { "epoch": 0.03547439126784215, "grad_norm": 0.14938302338123322, "learning_rate": 3.5429769392033546e-05, "loss": 0.2152, "step": 169 }, { "epoch": 0.035684298908480266, "grad_norm": 0.19645771384239197, "learning_rate": 3.5639412997903565e-05, "loss": 0.1977, "step": 170 }, { "epoch": 0.03589420654911839, "grad_norm": 0.1725340038537979, "learning_rate": 3.5849056603773584e-05, "loss": 0.1981, "step": 171 }, { "epoch": 0.03610411418975651, "grad_norm": 0.16048069298267365, "learning_rate": 3.605870020964361e-05, "loss": 0.1983, "step": 172 }, { "epoch": 0.03631402183039462, "grad_norm": 0.21855585277080536, "learning_rate": 3.626834381551363e-05, "loss": 0.2321, "step": 173 }, { "epoch": 0.036523929471032744, "grad_norm": 0.1136787012219429, "learning_rate": 3.647798742138365e-05, "loss": 0.1973, "step": 174 }, { "epoch": 0.036733837111670865, "grad_norm": 0.15145623683929443, "learning_rate": 3.6687631027253674e-05, "loss": 0.1947, "step": 175 }, { "epoch": 0.036943744752308987, "grad_norm": 0.21631890535354614, "learning_rate": 3.689727463312369e-05, "loss": 0.2151, "step": 176 }, { "epoch": 0.0371536523929471, "grad_norm": 0.2623152434825897, "learning_rate": 3.710691823899371e-05, "loss": 0.1814, "step": 177 }, { "epoch": 0.03736356003358522, "grad_norm": 0.1753605455160141, "learning_rate": 3.731656184486374e-05, "loss": 0.203, "step": 178 }, { "epoch": 0.03757346767422334, "grad_norm": 0.10878176242113113, "learning_rate": 3.752620545073376e-05, "loss": 0.2052, "step": 179 }, { "epoch": 0.037783375314861464, "grad_norm": 0.13699688017368317, "learning_rate": 3.7735849056603776e-05, "loss": 0.2004, "step": 180 }, { "epoch": 0.03799328295549958, "grad_norm": 0.14288806915283203, "learning_rate": 3.7945492662473795e-05, "loss": 0.1745, "step": 181 }, { "epoch": 0.0382031905961377, "grad_norm": 0.12457548081874847, "learning_rate": 3.8155136268343814e-05, "loss": 0.194, "step": 182 }, { "epoch": 0.03841309823677582, "grad_norm": 0.167145237326622, "learning_rate": 3.836477987421384e-05, "loss": 0.2027, "step": 183 }, { "epoch": 0.038623005877413935, "grad_norm": 0.12857979536056519, "learning_rate": 3.857442348008386e-05, "loss": 0.2126, "step": 184 }, { "epoch": 0.038832913518052056, "grad_norm": 0.16190126538276672, "learning_rate": 3.878406708595388e-05, "loss": 0.2037, "step": 185 }, { "epoch": 0.03904282115869018, "grad_norm": 0.168744757771492, "learning_rate": 3.8993710691823904e-05, "loss": 0.2108, "step": 186 }, { "epoch": 0.0392527287993283, "grad_norm": 0.1676539033651352, "learning_rate": 3.920335429769392e-05, "loss": 0.216, "step": 187 }, { "epoch": 0.03946263643996641, "grad_norm": 0.13556820154190063, "learning_rate": 3.941299790356394e-05, "loss": 0.2085, "step": 188 }, { "epoch": 0.039672544080604534, "grad_norm": 0.1797979772090912, "learning_rate": 3.962264150943397e-05, "loss": 0.1933, "step": 189 }, { "epoch": 0.039882451721242655, "grad_norm": 0.20826327800750732, "learning_rate": 3.983228511530399e-05, "loss": 0.2214, "step": 190 }, { "epoch": 0.04009235936188077, "grad_norm": 0.19972363114356995, "learning_rate": 4.0041928721174006e-05, "loss": 0.1941, "step": 191 }, { "epoch": 0.04030226700251889, "grad_norm": 0.149556502699852, "learning_rate": 4.025157232704403e-05, "loss": 0.1956, "step": 192 }, { "epoch": 0.04051217464315701, "grad_norm": 0.22496013343334198, "learning_rate": 4.046121593291405e-05, "loss": 0.1973, "step": 193 }, { "epoch": 0.04072208228379513, "grad_norm": 0.16132576763629913, "learning_rate": 4.067085953878407e-05, "loss": 0.203, "step": 194 }, { "epoch": 0.04093198992443325, "grad_norm": 0.17156128585338593, "learning_rate": 4.088050314465409e-05, "loss": 0.1942, "step": 195 }, { "epoch": 0.04114189756507137, "grad_norm": 0.14846180379390717, "learning_rate": 4.109014675052411e-05, "loss": 0.2005, "step": 196 }, { "epoch": 0.04135180520570949, "grad_norm": 0.20252752304077148, "learning_rate": 4.129979035639413e-05, "loss": 0.186, "step": 197 }, { "epoch": 0.041561712846347604, "grad_norm": 0.16286462545394897, "learning_rate": 4.150943396226415e-05, "loss": 0.1997, "step": 198 }, { "epoch": 0.041771620486985725, "grad_norm": 0.13008786737918854, "learning_rate": 4.171907756813417e-05, "loss": 0.2052, "step": 199 }, { "epoch": 0.041981528127623846, "grad_norm": 0.13853180408477783, "learning_rate": 4.192872117400419e-05, "loss": 0.1907, "step": 200 }, { "epoch": 0.04219143576826197, "grad_norm": 0.20382314920425415, "learning_rate": 4.213836477987422e-05, "loss": 0.1931, "step": 201 }, { "epoch": 0.04240134340890008, "grad_norm": 0.23206844925880432, "learning_rate": 4.2348008385744236e-05, "loss": 0.1849, "step": 202 }, { "epoch": 0.0426112510495382, "grad_norm": 0.2456827312707901, "learning_rate": 4.2557651991614255e-05, "loss": 0.2088, "step": 203 }, { "epoch": 0.042821158690176324, "grad_norm": 0.15247821807861328, "learning_rate": 4.276729559748428e-05, "loss": 0.1944, "step": 204 }, { "epoch": 0.043031066330814445, "grad_norm": 0.174981027841568, "learning_rate": 4.29769392033543e-05, "loss": 0.2161, "step": 205 }, { "epoch": 0.04324097397145256, "grad_norm": 0.20193031430244446, "learning_rate": 4.318658280922432e-05, "loss": 0.2084, "step": 206 }, { "epoch": 0.04345088161209068, "grad_norm": 0.20125791430473328, "learning_rate": 4.3396226415094345e-05, "loss": 0.189, "step": 207 }, { "epoch": 0.0436607892527288, "grad_norm": 0.16958673298358917, "learning_rate": 4.3605870020964364e-05, "loss": 0.1952, "step": 208 }, { "epoch": 0.043870696893366916, "grad_norm": 0.20714177191257477, "learning_rate": 4.381551362683438e-05, "loss": 0.1832, "step": 209 }, { "epoch": 0.04408060453400504, "grad_norm": 0.14229562878608704, "learning_rate": 4.402515723270441e-05, "loss": 0.1806, "step": 210 }, { "epoch": 0.04429051217464316, "grad_norm": 0.1985626220703125, "learning_rate": 4.423480083857443e-05, "loss": 0.1973, "step": 211 }, { "epoch": 0.04450041981528128, "grad_norm": 0.13714846968650818, "learning_rate": 4.4444444444444447e-05, "loss": 0.2018, "step": 212 }, { "epoch": 0.044710327455919394, "grad_norm": 0.25591611862182617, "learning_rate": 4.4654088050314466e-05, "loss": 0.2001, "step": 213 }, { "epoch": 0.044920235096557515, "grad_norm": 0.29017260670661926, "learning_rate": 4.4863731656184485e-05, "loss": 0.2278, "step": 214 }, { "epoch": 0.045130142737195636, "grad_norm": 0.12594066560268402, "learning_rate": 4.5073375262054504e-05, "loss": 0.2023, "step": 215 }, { "epoch": 0.04534005037783375, "grad_norm": 0.1602821797132492, "learning_rate": 4.528301886792453e-05, "loss": 0.2101, "step": 216 }, { "epoch": 0.04554995801847187, "grad_norm": 0.36664336919784546, "learning_rate": 4.549266247379455e-05, "loss": 0.1665, "step": 217 }, { "epoch": 0.04575986565910999, "grad_norm": 0.20665952563285828, "learning_rate": 4.570230607966457e-05, "loss": 0.1965, "step": 218 }, { "epoch": 0.045969773299748114, "grad_norm": 0.15664128959178925, "learning_rate": 4.5911949685534594e-05, "loss": 0.2104, "step": 219 }, { "epoch": 0.04617968094038623, "grad_norm": 0.14733830094337463, "learning_rate": 4.612159329140461e-05, "loss": 0.1955, "step": 220 }, { "epoch": 0.04638958858102435, "grad_norm": 0.19135522842407227, "learning_rate": 4.633123689727464e-05, "loss": 0.1947, "step": 221 }, { "epoch": 0.04659949622166247, "grad_norm": 0.24508413672447205, "learning_rate": 4.654088050314466e-05, "loss": 0.2005, "step": 222 }, { "epoch": 0.046809403862300585, "grad_norm": 0.16794531047344208, "learning_rate": 4.6750524109014677e-05, "loss": 0.1958, "step": 223 }, { "epoch": 0.047019311502938706, "grad_norm": 0.1870536506175995, "learning_rate": 4.69601677148847e-05, "loss": 0.1868, "step": 224 }, { "epoch": 0.04722921914357683, "grad_norm": 0.16214439272880554, "learning_rate": 4.716981132075472e-05, "loss": 0.2003, "step": 225 }, { "epoch": 0.04743912678421495, "grad_norm": 0.24978841841220856, "learning_rate": 4.737945492662474e-05, "loss": 0.1909, "step": 226 }, { "epoch": 0.04764903442485306, "grad_norm": 0.163265198469162, "learning_rate": 4.7589098532494766e-05, "loss": 0.2056, "step": 227 }, { "epoch": 0.04785894206549118, "grad_norm": 0.24885287880897522, "learning_rate": 4.7798742138364785e-05, "loss": 0.2035, "step": 228 }, { "epoch": 0.048068849706129305, "grad_norm": 0.1393681764602661, "learning_rate": 4.8008385744234804e-05, "loss": 0.1976, "step": 229 }, { "epoch": 0.04827875734676742, "grad_norm": 0.17042241990566254, "learning_rate": 4.8218029350104823e-05, "loss": 0.1824, "step": 230 }, { "epoch": 0.04848866498740554, "grad_norm": 0.1625502109527588, "learning_rate": 4.842767295597484e-05, "loss": 0.1904, "step": 231 }, { "epoch": 0.04869857262804366, "grad_norm": 0.14994169771671295, "learning_rate": 4.863731656184486e-05, "loss": 0.1926, "step": 232 }, { "epoch": 0.04890848026868178, "grad_norm": 0.15602821111679077, "learning_rate": 4.884696016771489e-05, "loss": 0.1828, "step": 233 }, { "epoch": 0.0491183879093199, "grad_norm": 0.13405688107013702, "learning_rate": 4.9056603773584906e-05, "loss": 0.1989, "step": 234 }, { "epoch": 0.04932829554995802, "grad_norm": 0.18559689819812775, "learning_rate": 4.9266247379454926e-05, "loss": 0.2131, "step": 235 }, { "epoch": 0.04953820319059614, "grad_norm": 0.1557319462299347, "learning_rate": 4.947589098532495e-05, "loss": 0.19, "step": 236 }, { "epoch": 0.04974811083123426, "grad_norm": 0.2162303477525711, "learning_rate": 4.968553459119497e-05, "loss": 0.196, "step": 237 }, { "epoch": 0.049958018471872374, "grad_norm": 0.17403477430343628, "learning_rate": 4.989517819706499e-05, "loss": 0.1997, "step": 238 }, { "epoch": 0.050167926112510496, "grad_norm": 0.11738390475511551, "learning_rate": 5.010482180293501e-05, "loss": 0.1955, "step": 239 }, { "epoch": 0.05037783375314862, "grad_norm": 0.15942999720573425, "learning_rate": 5.0314465408805034e-05, "loss": 0.1899, "step": 240 }, { "epoch": 0.05058774139378673, "grad_norm": 0.14695511758327484, "learning_rate": 5.052410901467506e-05, "loss": 0.2032, "step": 241 }, { "epoch": 0.05079764903442485, "grad_norm": 0.16291062533855438, "learning_rate": 5.073375262054507e-05, "loss": 0.1929, "step": 242 }, { "epoch": 0.05100755667506297, "grad_norm": 0.18514905869960785, "learning_rate": 5.09433962264151e-05, "loss": 0.1899, "step": 243 }, { "epoch": 0.051217464315701094, "grad_norm": 0.2196233868598938, "learning_rate": 5.1153039832285124e-05, "loss": 0.1741, "step": 244 }, { "epoch": 0.05142737195633921, "grad_norm": 0.19183433055877686, "learning_rate": 5.1362683438155136e-05, "loss": 0.1985, "step": 245 }, { "epoch": 0.05163727959697733, "grad_norm": 0.1604142189025879, "learning_rate": 5.157232704402516e-05, "loss": 0.1865, "step": 246 }, { "epoch": 0.05184718723761545, "grad_norm": 0.18311725556850433, "learning_rate": 5.178197064989518e-05, "loss": 0.2005, "step": 247 }, { "epoch": 0.052057094878253565, "grad_norm": 0.21732251346111298, "learning_rate": 5.19916142557652e-05, "loss": 0.2032, "step": 248 }, { "epoch": 0.052267002518891686, "grad_norm": 0.2600694000720978, "learning_rate": 5.220125786163522e-05, "loss": 0.1838, "step": 249 }, { "epoch": 0.05247691015952981, "grad_norm": 0.18634290993213654, "learning_rate": 5.2410901467505245e-05, "loss": 0.1809, "step": 250 }, { "epoch": 0.05268681780016793, "grad_norm": 0.14735780656337738, "learning_rate": 5.262054507337526e-05, "loss": 0.1981, "step": 251 }, { "epoch": 0.05289672544080604, "grad_norm": 0.16118381917476654, "learning_rate": 5.283018867924528e-05, "loss": 0.1836, "step": 252 }, { "epoch": 0.053106633081444164, "grad_norm": 0.17707999050617218, "learning_rate": 5.303983228511531e-05, "loss": 0.2044, "step": 253 }, { "epoch": 0.053316540722082285, "grad_norm": 0.25897523760795593, "learning_rate": 5.324947589098532e-05, "loss": 0.1753, "step": 254 }, { "epoch": 0.0535264483627204, "grad_norm": 0.1371389776468277, "learning_rate": 5.345911949685535e-05, "loss": 0.1839, "step": 255 }, { "epoch": 0.05373635600335852, "grad_norm": 0.21849682927131653, "learning_rate": 5.366876310272537e-05, "loss": 0.1938, "step": 256 }, { "epoch": 0.05394626364399664, "grad_norm": 0.16861748695373535, "learning_rate": 5.3878406708595385e-05, "loss": 0.1751, "step": 257 }, { "epoch": 0.05415617128463476, "grad_norm": 0.19400931894779205, "learning_rate": 5.408805031446541e-05, "loss": 0.2117, "step": 258 }, { "epoch": 0.05436607892527288, "grad_norm": 0.18074113130569458, "learning_rate": 5.429769392033544e-05, "loss": 0.1949, "step": 259 }, { "epoch": 0.054575986565911, "grad_norm": 0.17707990109920502, "learning_rate": 5.450733752620545e-05, "loss": 0.199, "step": 260 }, { "epoch": 0.05478589420654912, "grad_norm": 0.16568966209888458, "learning_rate": 5.4716981132075475e-05, "loss": 0.2027, "step": 261 }, { "epoch": 0.05499580184718724, "grad_norm": 0.24486149847507477, "learning_rate": 5.49266247379455e-05, "loss": 0.2042, "step": 262 }, { "epoch": 0.055205709487825355, "grad_norm": 0.20431551337242126, "learning_rate": 5.513626834381551e-05, "loss": 0.2013, "step": 263 }, { "epoch": 0.055415617128463476, "grad_norm": 0.21347559988498688, "learning_rate": 5.534591194968554e-05, "loss": 0.1687, "step": 264 }, { "epoch": 0.0556255247691016, "grad_norm": 0.22354485094547272, "learning_rate": 5.555555555555556e-05, "loss": 0.1819, "step": 265 }, { "epoch": 0.05583543240973971, "grad_norm": 0.2891826033592224, "learning_rate": 5.576519916142558e-05, "loss": 0.1927, "step": 266 }, { "epoch": 0.05604534005037783, "grad_norm": 0.2465565800666809, "learning_rate": 5.5974842767295596e-05, "loss": 0.1852, "step": 267 }, { "epoch": 0.056255247691015954, "grad_norm": 0.1743604689836502, "learning_rate": 5.618448637316562e-05, "loss": 0.1722, "step": 268 }, { "epoch": 0.056465155331654075, "grad_norm": 0.2461417019367218, "learning_rate": 5.6394129979035634e-05, "loss": 0.1816, "step": 269 }, { "epoch": 0.05667506297229219, "grad_norm": 0.21032604575157166, "learning_rate": 5.660377358490566e-05, "loss": 0.182, "step": 270 }, { "epoch": 0.05688497061293031, "grad_norm": 0.2683754861354828, "learning_rate": 5.6813417190775686e-05, "loss": 0.1718, "step": 271 }, { "epoch": 0.05709487825356843, "grad_norm": 0.18455228209495544, "learning_rate": 5.70230607966457e-05, "loss": 0.1954, "step": 272 }, { "epoch": 0.057304785894206546, "grad_norm": 0.22255247831344604, "learning_rate": 5.7232704402515724e-05, "loss": 0.1922, "step": 273 }, { "epoch": 0.05751469353484467, "grad_norm": 0.22789119184017181, "learning_rate": 5.744234800838575e-05, "loss": 0.1882, "step": 274 }, { "epoch": 0.05772460117548279, "grad_norm": 0.20874802768230438, "learning_rate": 5.765199161425576e-05, "loss": 0.1859, "step": 275 }, { "epoch": 0.05793450881612091, "grad_norm": 0.17554089426994324, "learning_rate": 5.786163522012579e-05, "loss": 0.1793, "step": 276 }, { "epoch": 0.058144416456759024, "grad_norm": 0.2011173665523529, "learning_rate": 5.8071278825995814e-05, "loss": 0.2009, "step": 277 }, { "epoch": 0.058354324097397145, "grad_norm": 0.28261420130729675, "learning_rate": 5.8280922431865826e-05, "loss": 0.2002, "step": 278 }, { "epoch": 0.058564231738035266, "grad_norm": 0.2356766015291214, "learning_rate": 5.849056603773585e-05, "loss": 0.2095, "step": 279 }, { "epoch": 0.05877413937867338, "grad_norm": 0.15072722733020782, "learning_rate": 5.870020964360588e-05, "loss": 0.1781, "step": 280 }, { "epoch": 0.0589840470193115, "grad_norm": 0.1446981132030487, "learning_rate": 5.89098532494759e-05, "loss": 0.1813, "step": 281 }, { "epoch": 0.05919395465994962, "grad_norm": 0.16516901552677155, "learning_rate": 5.9119496855345916e-05, "loss": 0.1946, "step": 282 }, { "epoch": 0.059403862300587744, "grad_norm": 0.16732774674892426, "learning_rate": 5.9329140461215935e-05, "loss": 0.1813, "step": 283 }, { "epoch": 0.05961376994122586, "grad_norm": 0.2000836730003357, "learning_rate": 5.953878406708596e-05, "loss": 0.1918, "step": 284 }, { "epoch": 0.05982367758186398, "grad_norm": 0.15576116740703583, "learning_rate": 5.974842767295597e-05, "loss": 0.1814, "step": 285 }, { "epoch": 0.0600335852225021, "grad_norm": 0.18421867489814758, "learning_rate": 5.9958071278826e-05, "loss": 0.1745, "step": 286 }, { "epoch": 0.06024349286314022, "grad_norm": 0.2663988471031189, "learning_rate": 6.0167714884696025e-05, "loss": 0.1889, "step": 287 }, { "epoch": 0.060453400503778336, "grad_norm": 0.18096649646759033, "learning_rate": 6.037735849056604e-05, "loss": 0.191, "step": 288 }, { "epoch": 0.06066330814441646, "grad_norm": 0.24025796353816986, "learning_rate": 6.058700209643606e-05, "loss": 0.1907, "step": 289 }, { "epoch": 0.06087321578505458, "grad_norm": 0.1682557910680771, "learning_rate": 6.079664570230609e-05, "loss": 0.1747, "step": 290 }, { "epoch": 0.06108312342569269, "grad_norm": 0.2801767587661743, "learning_rate": 6.10062893081761e-05, "loss": 0.1813, "step": 291 }, { "epoch": 0.061293031066330814, "grad_norm": 0.2288123071193695, "learning_rate": 6.121593291404612e-05, "loss": 0.1681, "step": 292 }, { "epoch": 0.061502938706968935, "grad_norm": 0.14664186537265778, "learning_rate": 6.142557651991615e-05, "loss": 0.1723, "step": 293 }, { "epoch": 0.061712846347607056, "grad_norm": 0.15858514606952667, "learning_rate": 6.163522012578616e-05, "loss": 0.198, "step": 294 }, { "epoch": 0.06192275398824517, "grad_norm": 0.24689258635044098, "learning_rate": 6.184486373165618e-05, "loss": 0.1865, "step": 295 }, { "epoch": 0.06213266162888329, "grad_norm": 0.19141900539398193, "learning_rate": 6.205450733752621e-05, "loss": 0.2103, "step": 296 }, { "epoch": 0.06234256926952141, "grad_norm": 0.22445173561573029, "learning_rate": 6.226415094339622e-05, "loss": 0.1778, "step": 297 }, { "epoch": 0.06255247691015953, "grad_norm": 0.1805533766746521, "learning_rate": 6.247379454926625e-05, "loss": 0.1752, "step": 298 }, { "epoch": 0.06276238455079765, "grad_norm": 0.17849349975585938, "learning_rate": 6.268343815513627e-05, "loss": 0.1735, "step": 299 }, { "epoch": 0.06297229219143577, "grad_norm": 0.2601464092731476, "learning_rate": 6.289308176100629e-05, "loss": 0.1868, "step": 300 }, { "epoch": 0.06318219983207389, "grad_norm": 0.2865089178085327, "learning_rate": 6.310272536687631e-05, "loss": 0.2058, "step": 301 }, { "epoch": 0.06339210747271201, "grad_norm": 0.1764407902956009, "learning_rate": 6.331236897274634e-05, "loss": 0.1828, "step": 302 }, { "epoch": 0.06360201511335013, "grad_norm": 0.136027991771698, "learning_rate": 6.352201257861635e-05, "loss": 0.1841, "step": 303 }, { "epoch": 0.06381192275398824, "grad_norm": 0.26960527896881104, "learning_rate": 6.373165618448638e-05, "loss": 0.1806, "step": 304 }, { "epoch": 0.06402183039462636, "grad_norm": 0.2371356189250946, "learning_rate": 6.39412997903564e-05, "loss": 0.1755, "step": 305 }, { "epoch": 0.06423173803526448, "grad_norm": 0.16067345440387726, "learning_rate": 6.415094339622641e-05, "loss": 0.1899, "step": 306 }, { "epoch": 0.0644416456759026, "grad_norm": 0.1733190417289734, "learning_rate": 6.436058700209644e-05, "loss": 0.1829, "step": 307 }, { "epoch": 0.06465155331654072, "grad_norm": 0.19170600175857544, "learning_rate": 6.457023060796647e-05, "loss": 0.1777, "step": 308 }, { "epoch": 0.06486146095717885, "grad_norm": 0.17290905117988586, "learning_rate": 6.477987421383648e-05, "loss": 0.187, "step": 309 }, { "epoch": 0.06507136859781697, "grad_norm": 0.3149113059043884, "learning_rate": 6.49895178197065e-05, "loss": 0.2037, "step": 310 }, { "epoch": 0.06528127623845507, "grad_norm": 0.15404744446277618, "learning_rate": 6.519916142557653e-05, "loss": 0.1909, "step": 311 }, { "epoch": 0.0654911838790932, "grad_norm": 0.157347172498703, "learning_rate": 6.540880503144654e-05, "loss": 0.191, "step": 312 }, { "epoch": 0.06570109151973132, "grad_norm": 0.23342733085155487, "learning_rate": 6.561844863731657e-05, "loss": 0.1867, "step": 313 }, { "epoch": 0.06591099916036944, "grad_norm": 0.2597595155239105, "learning_rate": 6.58280922431866e-05, "loss": 0.1949, "step": 314 }, { "epoch": 0.06612090680100756, "grad_norm": 0.23665842413902283, "learning_rate": 6.60377358490566e-05, "loss": 0.1898, "step": 315 }, { "epoch": 0.06633081444164568, "grad_norm": 0.18460237979888916, "learning_rate": 6.624737945492663e-05, "loss": 0.1824, "step": 316 }, { "epoch": 0.0665407220822838, "grad_norm": 0.1677280068397522, "learning_rate": 6.645702306079666e-05, "loss": 0.1871, "step": 317 }, { "epoch": 0.06675062972292191, "grad_norm": 0.1769377440214157, "learning_rate": 6.666666666666667e-05, "loss": 0.1779, "step": 318 }, { "epoch": 0.06696053736356003, "grad_norm": 0.1881011724472046, "learning_rate": 6.68763102725367e-05, "loss": 0.1711, "step": 319 }, { "epoch": 0.06717044500419815, "grad_norm": 0.19110549986362457, "learning_rate": 6.708595387840672e-05, "loss": 0.1823, "step": 320 }, { "epoch": 0.06738035264483627, "grad_norm": 0.26796162128448486, "learning_rate": 6.729559748427673e-05, "loss": 0.1863, "step": 321 }, { "epoch": 0.0675902602854744, "grad_norm": 0.17290090024471283, "learning_rate": 6.750524109014676e-05, "loss": 0.1816, "step": 322 }, { "epoch": 0.06780016792611251, "grad_norm": 0.2324109524488449, "learning_rate": 6.771488469601677e-05, "loss": 0.1944, "step": 323 }, { "epoch": 0.06801007556675064, "grad_norm": 0.24944299459457397, "learning_rate": 6.79245283018868e-05, "loss": 0.1965, "step": 324 }, { "epoch": 0.06821998320738874, "grad_norm": 0.2102229744195938, "learning_rate": 6.813417190775681e-05, "loss": 0.221, "step": 325 }, { "epoch": 0.06842989084802686, "grad_norm": 0.22497773170471191, "learning_rate": 6.834381551362684e-05, "loss": 0.1818, "step": 326 }, { "epoch": 0.06863979848866499, "grad_norm": 0.19047041237354279, "learning_rate": 6.855345911949685e-05, "loss": 0.1817, "step": 327 }, { "epoch": 0.0688497061293031, "grad_norm": 0.19890040159225464, "learning_rate": 6.876310272536687e-05, "loss": 0.2054, "step": 328 }, { "epoch": 0.06905961376994123, "grad_norm": 0.18274420499801636, "learning_rate": 6.89727463312369e-05, "loss": 0.1732, "step": 329 }, { "epoch": 0.06926952141057935, "grad_norm": 0.20556879043579102, "learning_rate": 6.918238993710691e-05, "loss": 0.1773, "step": 330 }, { "epoch": 0.06947942905121747, "grad_norm": 0.19632075726985931, "learning_rate": 6.939203354297694e-05, "loss": 0.1841, "step": 331 }, { "epoch": 0.06968933669185558, "grad_norm": 0.25757917761802673, "learning_rate": 6.960167714884696e-05, "loss": 0.1861, "step": 332 }, { "epoch": 0.0698992443324937, "grad_norm": 0.1654757410287857, "learning_rate": 6.981132075471698e-05, "loss": 0.1797, "step": 333 }, { "epoch": 0.07010915197313182, "grad_norm": 0.1424175500869751, "learning_rate": 7.0020964360587e-05, "loss": 0.1659, "step": 334 }, { "epoch": 0.07031905961376994, "grad_norm": 0.21559248864650726, "learning_rate": 7.023060796645703e-05, "loss": 0.175, "step": 335 }, { "epoch": 0.07052896725440806, "grad_norm": 0.3100188076496124, "learning_rate": 7.044025157232704e-05, "loss": 0.1644, "step": 336 }, { "epoch": 0.07073887489504618, "grad_norm": 0.2530849277973175, "learning_rate": 7.064989517819707e-05, "loss": 0.1697, "step": 337 }, { "epoch": 0.0709487825356843, "grad_norm": 0.2111438810825348, "learning_rate": 7.085953878406709e-05, "loss": 0.202, "step": 338 }, { "epoch": 0.07115869017632241, "grad_norm": 0.21798165142536163, "learning_rate": 7.10691823899371e-05, "loss": 0.1832, "step": 339 }, { "epoch": 0.07136859781696053, "grad_norm": 0.24779516458511353, "learning_rate": 7.127882599580713e-05, "loss": 0.1934, "step": 340 }, { "epoch": 0.07157850545759865, "grad_norm": 0.21718356013298035, "learning_rate": 7.148846960167716e-05, "loss": 0.1951, "step": 341 }, { "epoch": 0.07178841309823678, "grad_norm": 0.22320568561553955, "learning_rate": 7.169811320754717e-05, "loss": 0.1619, "step": 342 }, { "epoch": 0.0719983207388749, "grad_norm": 0.19393590092658997, "learning_rate": 7.19077568134172e-05, "loss": 0.201, "step": 343 }, { "epoch": 0.07220822837951302, "grad_norm": 0.1626208871603012, "learning_rate": 7.211740041928722e-05, "loss": 0.1848, "step": 344 }, { "epoch": 0.07241813602015114, "grad_norm": 0.2256711721420288, "learning_rate": 7.232704402515723e-05, "loss": 0.192, "step": 345 }, { "epoch": 0.07262804366078925, "grad_norm": 0.2225414514541626, "learning_rate": 7.253668763102726e-05, "loss": 0.1907, "step": 346 }, { "epoch": 0.07283795130142737, "grad_norm": 0.1786690205335617, "learning_rate": 7.274633123689728e-05, "loss": 0.1756, "step": 347 }, { "epoch": 0.07304785894206549, "grad_norm": 0.2155577391386032, "learning_rate": 7.29559748427673e-05, "loss": 0.1739, "step": 348 }, { "epoch": 0.07325776658270361, "grad_norm": 0.32533329725265503, "learning_rate": 7.316561844863732e-05, "loss": 0.1988, "step": 349 }, { "epoch": 0.07346767422334173, "grad_norm": 0.1870083510875702, "learning_rate": 7.337526205450735e-05, "loss": 0.1615, "step": 350 }, { "epoch": 0.07367758186397985, "grad_norm": 0.2160840779542923, "learning_rate": 7.358490566037736e-05, "loss": 0.1901, "step": 351 }, { "epoch": 0.07388748950461797, "grad_norm": 0.19049416482448578, "learning_rate": 7.379454926624739e-05, "loss": 0.1763, "step": 352 }, { "epoch": 0.0740973971452561, "grad_norm": 0.15733250975608826, "learning_rate": 7.400419287211741e-05, "loss": 0.1812, "step": 353 }, { "epoch": 0.0743073047858942, "grad_norm": 0.22470858693122864, "learning_rate": 7.421383647798742e-05, "loss": 0.194, "step": 354 }, { "epoch": 0.07451721242653232, "grad_norm": 0.3271860182285309, "learning_rate": 7.442348008385745e-05, "loss": 0.1708, "step": 355 }, { "epoch": 0.07472712006717044, "grad_norm": 0.17839424312114716, "learning_rate": 7.463312368972748e-05, "loss": 0.182, "step": 356 }, { "epoch": 0.07493702770780857, "grad_norm": 0.1907908171415329, "learning_rate": 7.484276729559749e-05, "loss": 0.163, "step": 357 }, { "epoch": 0.07514693534844669, "grad_norm": 0.20342503488063812, "learning_rate": 7.505241090146751e-05, "loss": 0.2029, "step": 358 }, { "epoch": 0.07535684298908481, "grad_norm": 0.21872438490390778, "learning_rate": 7.526205450733753e-05, "loss": 0.192, "step": 359 }, { "epoch": 0.07556675062972293, "grad_norm": 0.22313977777957916, "learning_rate": 7.547169811320755e-05, "loss": 0.1823, "step": 360 }, { "epoch": 0.07577665827036104, "grad_norm": 0.1931924819946289, "learning_rate": 7.568134171907756e-05, "loss": 0.1992, "step": 361 }, { "epoch": 0.07598656591099916, "grad_norm": 0.2859954535961151, "learning_rate": 7.589098532494759e-05, "loss": 0.1684, "step": 362 }, { "epoch": 0.07619647355163728, "grad_norm": 0.18601499497890472, "learning_rate": 7.610062893081762e-05, "loss": 0.1871, "step": 363 }, { "epoch": 0.0764063811922754, "grad_norm": 0.26345667243003845, "learning_rate": 7.631027253668763e-05, "loss": 0.1883, "step": 364 }, { "epoch": 0.07661628883291352, "grad_norm": 0.24455974996089935, "learning_rate": 7.651991614255765e-05, "loss": 0.1991, "step": 365 }, { "epoch": 0.07682619647355164, "grad_norm": 0.1787412464618683, "learning_rate": 7.672955974842768e-05, "loss": 0.1895, "step": 366 }, { "epoch": 0.07703610411418976, "grad_norm": 0.2711624205112457, "learning_rate": 7.693920335429769e-05, "loss": 0.1889, "step": 367 }, { "epoch": 0.07724601175482787, "grad_norm": 0.2764052152633667, "learning_rate": 7.714884696016772e-05, "loss": 0.1895, "step": 368 }, { "epoch": 0.07745591939546599, "grad_norm": 0.15490169823169708, "learning_rate": 7.735849056603774e-05, "loss": 0.1812, "step": 369 }, { "epoch": 0.07766582703610411, "grad_norm": 0.18659183382987976, "learning_rate": 7.756813417190776e-05, "loss": 0.1871, "step": 370 }, { "epoch": 0.07787573467674223, "grad_norm": 0.19188903272151947, "learning_rate": 7.777777777777778e-05, "loss": 0.1716, "step": 371 }, { "epoch": 0.07808564231738035, "grad_norm": 0.22174161672592163, "learning_rate": 7.798742138364781e-05, "loss": 0.1795, "step": 372 }, { "epoch": 0.07829554995801848, "grad_norm": 0.1882723569869995, "learning_rate": 7.819706498951782e-05, "loss": 0.1874, "step": 373 }, { "epoch": 0.0785054575986566, "grad_norm": 0.1762145459651947, "learning_rate": 7.840670859538785e-05, "loss": 0.1852, "step": 374 }, { "epoch": 0.0787153652392947, "grad_norm": 0.17931701242923737, "learning_rate": 7.861635220125787e-05, "loss": 0.1817, "step": 375 }, { "epoch": 0.07892527287993283, "grad_norm": 0.1833990216255188, "learning_rate": 7.882599580712788e-05, "loss": 0.1852, "step": 376 }, { "epoch": 0.07913518052057095, "grad_norm": 0.2758026421070099, "learning_rate": 7.903563941299791e-05, "loss": 0.1868, "step": 377 }, { "epoch": 0.07934508816120907, "grad_norm": 0.24222204089164734, "learning_rate": 7.924528301886794e-05, "loss": 0.186, "step": 378 }, { "epoch": 0.07955499580184719, "grad_norm": 0.17609156668186188, "learning_rate": 7.945492662473795e-05, "loss": 0.1807, "step": 379 }, { "epoch": 0.07976490344248531, "grad_norm": 0.23695167899131775, "learning_rate": 7.966457023060797e-05, "loss": 0.1799, "step": 380 }, { "epoch": 0.07997481108312343, "grad_norm": 0.25356245040893555, "learning_rate": 7.9874213836478e-05, "loss": 0.1899, "step": 381 }, { "epoch": 0.08018471872376154, "grad_norm": 0.23144365847110748, "learning_rate": 8.008385744234801e-05, "loss": 0.192, "step": 382 }, { "epoch": 0.08039462636439966, "grad_norm": 0.1521812379360199, "learning_rate": 8.029350104821804e-05, "loss": 0.1864, "step": 383 }, { "epoch": 0.08060453400503778, "grad_norm": 0.16725748777389526, "learning_rate": 8.050314465408806e-05, "loss": 0.202, "step": 384 }, { "epoch": 0.0808144416456759, "grad_norm": 0.21173058450222015, "learning_rate": 8.071278825995808e-05, "loss": 0.1751, "step": 385 }, { "epoch": 0.08102434928631402, "grad_norm": 0.15676653385162354, "learning_rate": 8.09224318658281e-05, "loss": 0.1946, "step": 386 }, { "epoch": 0.08123425692695214, "grad_norm": 0.21838362514972687, "learning_rate": 8.113207547169813e-05, "loss": 0.1889, "step": 387 }, { "epoch": 0.08144416456759027, "grad_norm": 0.19586238265037537, "learning_rate": 8.134171907756814e-05, "loss": 0.1884, "step": 388 }, { "epoch": 0.08165407220822837, "grad_norm": 0.21012739837169647, "learning_rate": 8.155136268343817e-05, "loss": 0.1822, "step": 389 }, { "epoch": 0.0818639798488665, "grad_norm": 0.2092917114496231, "learning_rate": 8.176100628930818e-05, "loss": 0.1783, "step": 390 }, { "epoch": 0.08207388748950462, "grad_norm": 0.3745954930782318, "learning_rate": 8.19706498951782e-05, "loss": 0.1976, "step": 391 }, { "epoch": 0.08228379513014274, "grad_norm": 0.2579379081726074, "learning_rate": 8.218029350104822e-05, "loss": 0.197, "step": 392 }, { "epoch": 0.08249370277078086, "grad_norm": 0.18806852400302887, "learning_rate": 8.238993710691824e-05, "loss": 0.1873, "step": 393 }, { "epoch": 0.08270361041141898, "grad_norm": 0.24592849612236023, "learning_rate": 8.259958071278825e-05, "loss": 0.2083, "step": 394 }, { "epoch": 0.0829135180520571, "grad_norm": 0.2678208649158478, "learning_rate": 8.280922431865828e-05, "loss": 0.1878, "step": 395 }, { "epoch": 0.08312342569269521, "grad_norm": 0.2023075520992279, "learning_rate": 8.30188679245283e-05, "loss": 0.1861, "step": 396 }, { "epoch": 0.08333333333333333, "grad_norm": 0.2390558272600174, "learning_rate": 8.322851153039832e-05, "loss": 0.1802, "step": 397 }, { "epoch": 0.08354324097397145, "grad_norm": 0.175222247838974, "learning_rate": 8.343815513626834e-05, "loss": 0.1674, "step": 398 }, { "epoch": 0.08375314861460957, "grad_norm": 0.2340380698442459, "learning_rate": 8.364779874213837e-05, "loss": 0.17, "step": 399 }, { "epoch": 0.08396305625524769, "grad_norm": 0.20814655721187592, "learning_rate": 8.385744234800838e-05, "loss": 0.1513, "step": 400 }, { "epoch": 0.08417296389588581, "grad_norm": 0.34867948293685913, "learning_rate": 8.406708595387841e-05, "loss": 0.1644, "step": 401 }, { "epoch": 0.08438287153652393, "grad_norm": 0.21573619544506073, "learning_rate": 8.427672955974843e-05, "loss": 0.1785, "step": 402 }, { "epoch": 0.08459277917716204, "grad_norm": 0.21437713503837585, "learning_rate": 8.448637316561845e-05, "loss": 0.1523, "step": 403 }, { "epoch": 0.08480268681780016, "grad_norm": 0.2250152826309204, "learning_rate": 8.469601677148847e-05, "loss": 0.1728, "step": 404 }, { "epoch": 0.08501259445843828, "grad_norm": 0.2514733672142029, "learning_rate": 8.49056603773585e-05, "loss": 0.1755, "step": 405 }, { "epoch": 0.0852225020990764, "grad_norm": 0.16003377735614777, "learning_rate": 8.511530398322851e-05, "loss": 0.1935, "step": 406 }, { "epoch": 0.08543240973971453, "grad_norm": 0.1792365163564682, "learning_rate": 8.532494758909854e-05, "loss": 0.1991, "step": 407 }, { "epoch": 0.08564231738035265, "grad_norm": 0.16854703426361084, "learning_rate": 8.553459119496856e-05, "loss": 0.1904, "step": 408 }, { "epoch": 0.08585222502099077, "grad_norm": 0.19401603937149048, "learning_rate": 8.574423480083857e-05, "loss": 0.1681, "step": 409 }, { "epoch": 0.08606213266162889, "grad_norm": 0.15639828145503998, "learning_rate": 8.59538784067086e-05, "loss": 0.1628, "step": 410 }, { "epoch": 0.086272040302267, "grad_norm": 0.15357258915901184, "learning_rate": 8.616352201257863e-05, "loss": 0.1905, "step": 411 }, { "epoch": 0.08648194794290512, "grad_norm": 0.256944477558136, "learning_rate": 8.637316561844864e-05, "loss": 0.2015, "step": 412 }, { "epoch": 0.08669185558354324, "grad_norm": 0.13482192158699036, "learning_rate": 8.658280922431866e-05, "loss": 0.1718, "step": 413 }, { "epoch": 0.08690176322418136, "grad_norm": 0.16525831818580627, "learning_rate": 8.679245283018869e-05, "loss": 0.1778, "step": 414 }, { "epoch": 0.08711167086481948, "grad_norm": 0.2145531326532364, "learning_rate": 8.70020964360587e-05, "loss": 0.1714, "step": 415 }, { "epoch": 0.0873215785054576, "grad_norm": 0.20481255650520325, "learning_rate": 8.721174004192873e-05, "loss": 0.1832, "step": 416 }, { "epoch": 0.08753148614609572, "grad_norm": 0.14865756034851074, "learning_rate": 8.742138364779875e-05, "loss": 0.1659, "step": 417 }, { "epoch": 0.08774139378673383, "grad_norm": 0.14713706076145172, "learning_rate": 8.763102725366877e-05, "loss": 0.1674, "step": 418 }, { "epoch": 0.08795130142737195, "grad_norm": 0.1728464812040329, "learning_rate": 8.784067085953879e-05, "loss": 0.1727, "step": 419 }, { "epoch": 0.08816120906801007, "grad_norm": 0.2646033465862274, "learning_rate": 8.805031446540882e-05, "loss": 0.1973, "step": 420 }, { "epoch": 0.0883711167086482, "grad_norm": 0.2262433022260666, "learning_rate": 8.825995807127883e-05, "loss": 0.1817, "step": 421 }, { "epoch": 0.08858102434928632, "grad_norm": 0.16398945450782776, "learning_rate": 8.846960167714886e-05, "loss": 0.1628, "step": 422 }, { "epoch": 0.08879093198992444, "grad_norm": 0.15976634621620178, "learning_rate": 8.867924528301888e-05, "loss": 0.1678, "step": 423 }, { "epoch": 0.08900083963056256, "grad_norm": 0.19314904510974884, "learning_rate": 8.888888888888889e-05, "loss": 0.1753, "step": 424 }, { "epoch": 0.08921074727120067, "grad_norm": 0.21701818704605103, "learning_rate": 8.909853249475892e-05, "loss": 0.19, "step": 425 }, { "epoch": 0.08942065491183879, "grad_norm": 0.22768795490264893, "learning_rate": 8.930817610062893e-05, "loss": 0.1613, "step": 426 }, { "epoch": 0.08963056255247691, "grad_norm": 0.13172288239002228, "learning_rate": 8.951781970649896e-05, "loss": 0.1749, "step": 427 }, { "epoch": 0.08984047019311503, "grad_norm": 0.2015480250120163, "learning_rate": 8.972746331236897e-05, "loss": 0.1857, "step": 428 }, { "epoch": 0.09005037783375315, "grad_norm": 0.20239531993865967, "learning_rate": 8.9937106918239e-05, "loss": 0.1804, "step": 429 }, { "epoch": 0.09026028547439127, "grad_norm": 0.17528317868709564, "learning_rate": 9.014675052410901e-05, "loss": 0.1729, "step": 430 }, { "epoch": 0.0904701931150294, "grad_norm": 0.16663801670074463, "learning_rate": 9.035639412997903e-05, "loss": 0.1896, "step": 431 }, { "epoch": 0.0906801007556675, "grad_norm": 0.18777558207511902, "learning_rate": 9.056603773584906e-05, "loss": 0.1801, "step": 432 }, { "epoch": 0.09089000839630562, "grad_norm": 0.1440989226102829, "learning_rate": 9.077568134171907e-05, "loss": 0.179, "step": 433 }, { "epoch": 0.09109991603694374, "grad_norm": 0.1786854863166809, "learning_rate": 9.09853249475891e-05, "loss": 0.1799, "step": 434 }, { "epoch": 0.09130982367758186, "grad_norm": 0.20794442296028137, "learning_rate": 9.119496855345912e-05, "loss": 0.1731, "step": 435 }, { "epoch": 0.09151973131821999, "grad_norm": 0.16377133131027222, "learning_rate": 9.140461215932914e-05, "loss": 0.1815, "step": 436 }, { "epoch": 0.0917296389588581, "grad_norm": 0.174666628241539, "learning_rate": 9.161425576519916e-05, "loss": 0.1798, "step": 437 }, { "epoch": 0.09193954659949623, "grad_norm": 0.2127188891172409, "learning_rate": 9.182389937106919e-05, "loss": 0.1805, "step": 438 }, { "epoch": 0.09214945424013433, "grad_norm": 0.1936446726322174, "learning_rate": 9.203354297693921e-05, "loss": 0.1723, "step": 439 }, { "epoch": 0.09235936188077246, "grad_norm": 0.18736332654953003, "learning_rate": 9.224318658280923e-05, "loss": 0.1849, "step": 440 }, { "epoch": 0.09256926952141058, "grad_norm": 0.16817238926887512, "learning_rate": 9.245283018867925e-05, "loss": 0.1746, "step": 441 }, { "epoch": 0.0927791771620487, "grad_norm": 0.18249107897281647, "learning_rate": 9.266247379454928e-05, "loss": 0.1628, "step": 442 }, { "epoch": 0.09298908480268682, "grad_norm": 0.1728898137807846, "learning_rate": 9.287211740041929e-05, "loss": 0.203, "step": 443 }, { "epoch": 0.09319899244332494, "grad_norm": 0.16144797205924988, "learning_rate": 9.308176100628931e-05, "loss": 0.1843, "step": 444 }, { "epoch": 0.09340890008396306, "grad_norm": 0.19680747389793396, "learning_rate": 9.329140461215934e-05, "loss": 0.168, "step": 445 }, { "epoch": 0.09361880772460117, "grad_norm": 0.16198395192623138, "learning_rate": 9.350104821802935e-05, "loss": 0.1598, "step": 446 }, { "epoch": 0.09382871536523929, "grad_norm": 0.17398878931999207, "learning_rate": 9.371069182389938e-05, "loss": 0.17, "step": 447 }, { "epoch": 0.09403862300587741, "grad_norm": 0.18602675199508667, "learning_rate": 9.39203354297694e-05, "loss": 0.1716, "step": 448 }, { "epoch": 0.09424853064651553, "grad_norm": 0.18403322994709015, "learning_rate": 9.412997903563942e-05, "loss": 0.1948, "step": 449 }, { "epoch": 0.09445843828715365, "grad_norm": 0.18783587217330933, "learning_rate": 9.433962264150944e-05, "loss": 0.1812, "step": 450 }, { "epoch": 0.09466834592779177, "grad_norm": 0.20252300798892975, "learning_rate": 9.454926624737947e-05, "loss": 0.1666, "step": 451 }, { "epoch": 0.0948782535684299, "grad_norm": 0.20129899680614471, "learning_rate": 9.475890985324948e-05, "loss": 0.1695, "step": 452 }, { "epoch": 0.095088161209068, "grad_norm": 0.17035968601703644, "learning_rate": 9.496855345911951e-05, "loss": 0.1678, "step": 453 }, { "epoch": 0.09529806884970612, "grad_norm": 0.20403030514717102, "learning_rate": 9.517819706498953e-05, "loss": 0.1843, "step": 454 }, { "epoch": 0.09550797649034425, "grad_norm": 0.17489562928676605, "learning_rate": 9.538784067085954e-05, "loss": 0.1657, "step": 455 }, { "epoch": 0.09571788413098237, "grad_norm": 0.19699983298778534, "learning_rate": 9.559748427672957e-05, "loss": 0.1634, "step": 456 }, { "epoch": 0.09592779177162049, "grad_norm": 0.17113354802131653, "learning_rate": 9.58071278825996e-05, "loss": 0.1879, "step": 457 }, { "epoch": 0.09613769941225861, "grad_norm": 0.2290397435426712, "learning_rate": 9.601677148846961e-05, "loss": 0.1784, "step": 458 }, { "epoch": 0.09634760705289673, "grad_norm": 0.2173147350549698, "learning_rate": 9.622641509433963e-05, "loss": 0.1673, "step": 459 }, { "epoch": 0.09655751469353484, "grad_norm": 0.18280835449695587, "learning_rate": 9.643605870020965e-05, "loss": 0.1806, "step": 460 }, { "epoch": 0.09676742233417296, "grad_norm": 0.14638672769069672, "learning_rate": 9.664570230607967e-05, "loss": 0.1833, "step": 461 }, { "epoch": 0.09697732997481108, "grad_norm": 0.16228064894676208, "learning_rate": 9.685534591194969e-05, "loss": 0.1762, "step": 462 }, { "epoch": 0.0971872376154492, "grad_norm": 0.1836690902709961, "learning_rate": 9.706498951781971e-05, "loss": 0.1782, "step": 463 }, { "epoch": 0.09739714525608732, "grad_norm": 0.19470515847206116, "learning_rate": 9.727463312368972e-05, "loss": 0.185, "step": 464 }, { "epoch": 0.09760705289672544, "grad_norm": 0.1833791732788086, "learning_rate": 9.748427672955975e-05, "loss": 0.1753, "step": 465 }, { "epoch": 0.09781696053736356, "grad_norm": 0.22608265280723572, "learning_rate": 9.769392033542977e-05, "loss": 0.1792, "step": 466 }, { "epoch": 0.09802686817800169, "grad_norm": 0.16552825272083282, "learning_rate": 9.790356394129979e-05, "loss": 0.1808, "step": 467 }, { "epoch": 0.0982367758186398, "grad_norm": 0.2294851690530777, "learning_rate": 9.811320754716981e-05, "loss": 0.1935, "step": 468 }, { "epoch": 0.09844668345927791, "grad_norm": 0.26589101552963257, "learning_rate": 9.832285115303984e-05, "loss": 0.1676, "step": 469 }, { "epoch": 0.09865659109991604, "grad_norm": 0.22315791249275208, "learning_rate": 9.853249475890985e-05, "loss": 0.1806, "step": 470 }, { "epoch": 0.09886649874055416, "grad_norm": 0.16855137050151825, "learning_rate": 9.874213836477988e-05, "loss": 0.1816, "step": 471 }, { "epoch": 0.09907640638119228, "grad_norm": 0.19197392463684082, "learning_rate": 9.89517819706499e-05, "loss": 0.1923, "step": 472 }, { "epoch": 0.0992863140218304, "grad_norm": 0.18722014129161835, "learning_rate": 9.916142557651992e-05, "loss": 0.1823, "step": 473 }, { "epoch": 0.09949622166246852, "grad_norm": 0.15668706595897675, "learning_rate": 9.937106918238994e-05, "loss": 0.1896, "step": 474 }, { "epoch": 0.09970612930310663, "grad_norm": 0.17297013103961945, "learning_rate": 9.958071278825997e-05, "loss": 0.1907, "step": 475 }, { "epoch": 0.09991603694374475, "grad_norm": 0.23546694219112396, "learning_rate": 9.979035639412998e-05, "loss": 0.1647, "step": 476 }, { "epoch": 0.10012594458438287, "grad_norm": 0.1627054661512375, "learning_rate": 0.0001, "loss": 0.1686, "step": 477 }, { "epoch": 0.10033585222502099, "grad_norm": 0.21043647825717926, "learning_rate": 9.999998657442895e-05, "loss": 0.1865, "step": 478 }, { "epoch": 0.10054575986565911, "grad_norm": 0.19615764915943146, "learning_rate": 9.999994629772298e-05, "loss": 0.1683, "step": 479 }, { "epoch": 0.10075566750629723, "grad_norm": 0.2884671092033386, "learning_rate": 9.999987916990372e-05, "loss": 0.1858, "step": 480 }, { "epoch": 0.10096557514693535, "grad_norm": 0.2381323128938675, "learning_rate": 9.999978519100723e-05, "loss": 0.1879, "step": 481 }, { "epoch": 0.10117548278757346, "grad_norm": 0.19187557697296143, "learning_rate": 9.999966436108398e-05, "loss": 0.1808, "step": 482 }, { "epoch": 0.10138539042821158, "grad_norm": 0.19443491101264954, "learning_rate": 9.999951668019887e-05, "loss": 0.1659, "step": 483 }, { "epoch": 0.1015952980688497, "grad_norm": 0.20151716470718384, "learning_rate": 9.999934214843116e-05, "loss": 0.1679, "step": 484 }, { "epoch": 0.10180520570948783, "grad_norm": 0.2896507978439331, "learning_rate": 9.999914076587464e-05, "loss": 0.1734, "step": 485 }, { "epoch": 0.10201511335012595, "grad_norm": 0.21598441898822784, "learning_rate": 9.999891253263741e-05, "loss": 0.1779, "step": 486 }, { "epoch": 0.10222502099076407, "grad_norm": 0.196011021733284, "learning_rate": 9.999865744884207e-05, "loss": 0.1815, "step": 487 }, { "epoch": 0.10243492863140219, "grad_norm": 0.20962318778038025, "learning_rate": 9.999837551462558e-05, "loss": 0.1727, "step": 488 }, { "epoch": 0.1026448362720403, "grad_norm": 0.18339572846889496, "learning_rate": 9.999806673013935e-05, "loss": 0.1689, "step": 489 }, { "epoch": 0.10285474391267842, "grad_norm": 0.20994813740253448, "learning_rate": 9.999773109554922e-05, "loss": 0.1768, "step": 490 }, { "epoch": 0.10306465155331654, "grad_norm": 0.163935124874115, "learning_rate": 9.999736861103541e-05, "loss": 0.1777, "step": 491 }, { "epoch": 0.10327455919395466, "grad_norm": 0.1465967446565628, "learning_rate": 9.99969792767926e-05, "loss": 0.1846, "step": 492 }, { "epoch": 0.10348446683459278, "grad_norm": 0.21443922817707062, "learning_rate": 9.999656309302987e-05, "loss": 0.1807, "step": 493 }, { "epoch": 0.1036943744752309, "grad_norm": 0.18504248559474945, "learning_rate": 9.999612005997071e-05, "loss": 0.1548, "step": 494 }, { "epoch": 0.10390428211586902, "grad_norm": 0.15490441024303436, "learning_rate": 9.999565017785305e-05, "loss": 0.1696, "step": 495 }, { "epoch": 0.10411418975650713, "grad_norm": 0.1881389319896698, "learning_rate": 9.999515344692923e-05, "loss": 0.1852, "step": 496 }, { "epoch": 0.10432409739714525, "grad_norm": 0.16337451338768005, "learning_rate": 9.999462986746598e-05, "loss": 0.1834, "step": 497 }, { "epoch": 0.10453400503778337, "grad_norm": 0.16641898453235626, "learning_rate": 9.99940794397445e-05, "loss": 0.187, "step": 498 }, { "epoch": 0.1047439126784215, "grad_norm": 0.15948446094989777, "learning_rate": 9.999350216406038e-05, "loss": 0.1835, "step": 499 }, { "epoch": 0.10495382031905962, "grad_norm": 0.1550200879573822, "learning_rate": 9.999289804072363e-05, "loss": 0.1706, "step": 500 }, { "epoch": 0.10516372795969774, "grad_norm": 0.16597698628902435, "learning_rate": 9.999226707005867e-05, "loss": 0.1811, "step": 501 }, { "epoch": 0.10537363560033586, "grad_norm": 0.17551501095294952, "learning_rate": 9.999160925240434e-05, "loss": 0.1677, "step": 502 }, { "epoch": 0.10558354324097396, "grad_norm": 0.15515847504138947, "learning_rate": 9.999092458811393e-05, "loss": 0.1789, "step": 503 }, { "epoch": 0.10579345088161209, "grad_norm": 0.18121638894081116, "learning_rate": 9.99902130775551e-05, "loss": 0.1684, "step": 504 }, { "epoch": 0.10600335852225021, "grad_norm": 0.1853945255279541, "learning_rate": 9.998947472110994e-05, "loss": 0.1907, "step": 505 }, { "epoch": 0.10621326616288833, "grad_norm": 0.21303139626979828, "learning_rate": 9.998870951917496e-05, "loss": 0.1712, "step": 506 }, { "epoch": 0.10642317380352645, "grad_norm": 0.16773764789104462, "learning_rate": 9.998791747216113e-05, "loss": 0.1756, "step": 507 }, { "epoch": 0.10663308144416457, "grad_norm": 0.18033501505851746, "learning_rate": 9.998709858049376e-05, "loss": 0.1654, "step": 508 }, { "epoch": 0.10684298908480269, "grad_norm": 0.14199328422546387, "learning_rate": 9.998625284461263e-05, "loss": 0.1587, "step": 509 }, { "epoch": 0.1070528967254408, "grad_norm": 0.19968685507774353, "learning_rate": 9.998538026497192e-05, "loss": 0.1796, "step": 510 }, { "epoch": 0.10726280436607892, "grad_norm": 0.17311611771583557, "learning_rate": 9.998448084204021e-05, "loss": 0.1864, "step": 511 }, { "epoch": 0.10747271200671704, "grad_norm": 0.20124119520187378, "learning_rate": 9.998355457630053e-05, "loss": 0.1829, "step": 512 }, { "epoch": 0.10768261964735516, "grad_norm": 0.12473297864198685, "learning_rate": 9.998260146825029e-05, "loss": 0.175, "step": 513 }, { "epoch": 0.10789252728799328, "grad_norm": 0.1696644425392151, "learning_rate": 9.998162151840135e-05, "loss": 0.1762, "step": 514 }, { "epoch": 0.1081024349286314, "grad_norm": 0.1781477928161621, "learning_rate": 9.998061472727996e-05, "loss": 0.1679, "step": 515 }, { "epoch": 0.10831234256926953, "grad_norm": 0.19112960994243622, "learning_rate": 9.997958109542675e-05, "loss": 0.1553, "step": 516 }, { "epoch": 0.10852225020990765, "grad_norm": 0.1417030394077301, "learning_rate": 9.997852062339685e-05, "loss": 0.1737, "step": 517 }, { "epoch": 0.10873215785054575, "grad_norm": 0.15080858767032623, "learning_rate": 9.997743331175976e-05, "loss": 0.1595, "step": 518 }, { "epoch": 0.10894206549118388, "grad_norm": 0.2046668380498886, "learning_rate": 9.997631916109937e-05, "loss": 0.1839, "step": 519 }, { "epoch": 0.109151973131822, "grad_norm": 0.19941595196723938, "learning_rate": 9.997517817201401e-05, "loss": 0.1718, "step": 520 }, { "epoch": 0.10936188077246012, "grad_norm": 0.15989692509174347, "learning_rate": 9.997401034511642e-05, "loss": 0.1613, "step": 521 }, { "epoch": 0.10957178841309824, "grad_norm": 0.1697997897863388, "learning_rate": 9.997281568103374e-05, "loss": 0.1603, "step": 522 }, { "epoch": 0.10978169605373636, "grad_norm": 0.1840822696685791, "learning_rate": 9.997159418040754e-05, "loss": 0.1735, "step": 523 }, { "epoch": 0.10999160369437448, "grad_norm": 0.20991730690002441, "learning_rate": 9.99703458438938e-05, "loss": 0.154, "step": 524 }, { "epoch": 0.11020151133501259, "grad_norm": 0.16802968084812164, "learning_rate": 9.99690706721629e-05, "loss": 0.1761, "step": 525 }, { "epoch": 0.11041141897565071, "grad_norm": 0.18329255282878876, "learning_rate": 9.996776866589962e-05, "loss": 0.1609, "step": 526 }, { "epoch": 0.11062132661628883, "grad_norm": 0.18645748496055603, "learning_rate": 9.996643982580318e-05, "loss": 0.1793, "step": 527 }, { "epoch": 0.11083123425692695, "grad_norm": 0.1966720074415207, "learning_rate": 9.996508415258722e-05, "loss": 0.1714, "step": 528 }, { "epoch": 0.11104114189756507, "grad_norm": 0.18155452609062195, "learning_rate": 9.996370164697974e-05, "loss": 0.1673, "step": 529 }, { "epoch": 0.1112510495382032, "grad_norm": 0.2004195600748062, "learning_rate": 9.996229230972317e-05, "loss": 0.1865, "step": 530 }, { "epoch": 0.11146095717884132, "grad_norm": 0.15521694719791412, "learning_rate": 9.996085614157438e-05, "loss": 0.1757, "step": 531 }, { "epoch": 0.11167086481947942, "grad_norm": 0.1686578243970871, "learning_rate": 9.995939314330462e-05, "loss": 0.1768, "step": 532 }, { "epoch": 0.11188077246011754, "grad_norm": 0.20034368336200714, "learning_rate": 9.995790331569954e-05, "loss": 0.1823, "step": 533 }, { "epoch": 0.11209068010075567, "grad_norm": 0.1494702696800232, "learning_rate": 9.995638665955922e-05, "loss": 0.175, "step": 534 }, { "epoch": 0.11230058774139379, "grad_norm": 0.16365233063697815, "learning_rate": 9.995484317569814e-05, "loss": 0.1716, "step": 535 }, { "epoch": 0.11251049538203191, "grad_norm": 0.19227434694766998, "learning_rate": 9.995327286494521e-05, "loss": 0.1605, "step": 536 }, { "epoch": 0.11272040302267003, "grad_norm": 0.21946166455745697, "learning_rate": 9.995167572814365e-05, "loss": 0.182, "step": 537 }, { "epoch": 0.11293031066330815, "grad_norm": 0.2211793065071106, "learning_rate": 9.995005176615124e-05, "loss": 0.1783, "step": 538 }, { "epoch": 0.11314021830394626, "grad_norm": 0.2154102325439453, "learning_rate": 9.994840097984006e-05, "loss": 0.1888, "step": 539 }, { "epoch": 0.11335012594458438, "grad_norm": 0.20600587129592896, "learning_rate": 9.994672337009658e-05, "loss": 0.1871, "step": 540 }, { "epoch": 0.1135600335852225, "grad_norm": 0.22028079628944397, "learning_rate": 9.994501893782176e-05, "loss": 0.1855, "step": 541 }, { "epoch": 0.11376994122586062, "grad_norm": 0.23957398533821106, "learning_rate": 9.99432876839309e-05, "loss": 0.1616, "step": 542 }, { "epoch": 0.11397984886649874, "grad_norm": 0.14516577124595642, "learning_rate": 9.994152960935375e-05, "loss": 0.1864, "step": 543 }, { "epoch": 0.11418975650713686, "grad_norm": 0.14327426254749298, "learning_rate": 9.99397447150344e-05, "loss": 0.1596, "step": 544 }, { "epoch": 0.11439966414777498, "grad_norm": 0.15387804806232452, "learning_rate": 9.99379330019314e-05, "loss": 0.1638, "step": 545 }, { "epoch": 0.11460957178841309, "grad_norm": 0.14998720586299896, "learning_rate": 9.993609447101767e-05, "loss": 0.1456, "step": 546 }, { "epoch": 0.11481947942905121, "grad_norm": 0.21649526059627533, "learning_rate": 9.993422912328054e-05, "loss": 0.1656, "step": 547 }, { "epoch": 0.11502938706968933, "grad_norm": 0.19370022416114807, "learning_rate": 9.993233695972175e-05, "loss": 0.173, "step": 548 }, { "epoch": 0.11523929471032746, "grad_norm": 0.20101432502269745, "learning_rate": 9.993041798135745e-05, "loss": 0.1914, "step": 549 }, { "epoch": 0.11544920235096558, "grad_norm": 0.18047718703746796, "learning_rate": 9.992847218921816e-05, "loss": 0.2084, "step": 550 }, { "epoch": 0.1156591099916037, "grad_norm": 0.18755191564559937, "learning_rate": 9.99264995843488e-05, "loss": 0.1646, "step": 551 }, { "epoch": 0.11586901763224182, "grad_norm": 0.2605830430984497, "learning_rate": 9.992450016780876e-05, "loss": 0.1767, "step": 552 }, { "epoch": 0.11607892527287993, "grad_norm": 0.1701487898826599, "learning_rate": 9.99224739406717e-05, "loss": 0.173, "step": 553 }, { "epoch": 0.11628883291351805, "grad_norm": 0.19497598707675934, "learning_rate": 9.99204209040258e-05, "loss": 0.1857, "step": 554 }, { "epoch": 0.11649874055415617, "grad_norm": 0.16379636526107788, "learning_rate": 9.991834105897356e-05, "loss": 0.154, "step": 555 }, { "epoch": 0.11670864819479429, "grad_norm": 0.18630164861679077, "learning_rate": 9.991623440663192e-05, "loss": 0.1871, "step": 556 }, { "epoch": 0.11691855583543241, "grad_norm": 0.219542995095253, "learning_rate": 9.991410094813221e-05, "loss": 0.1877, "step": 557 }, { "epoch": 0.11712846347607053, "grad_norm": 0.1855912059545517, "learning_rate": 9.991194068462011e-05, "loss": 0.1771, "step": 558 }, { "epoch": 0.11733837111670865, "grad_norm": 0.15919265151023865, "learning_rate": 9.990975361725577e-05, "loss": 0.1684, "step": 559 }, { "epoch": 0.11754827875734676, "grad_norm": 0.14570386707782745, "learning_rate": 9.990753974721366e-05, "loss": 0.1656, "step": 560 }, { "epoch": 0.11775818639798488, "grad_norm": 0.14153516292572021, "learning_rate": 9.990529907568272e-05, "loss": 0.1642, "step": 561 }, { "epoch": 0.117968094038623, "grad_norm": 0.14015786349773407, "learning_rate": 9.99030316038662e-05, "loss": 0.1704, "step": 562 }, { "epoch": 0.11817800167926112, "grad_norm": 0.2615254819393158, "learning_rate": 9.99007373329818e-05, "loss": 0.1601, "step": 563 }, { "epoch": 0.11838790931989925, "grad_norm": 0.2077956199645996, "learning_rate": 9.989841626426162e-05, "loss": 0.1715, "step": 564 }, { "epoch": 0.11859781696053737, "grad_norm": 0.1743435561656952, "learning_rate": 9.989606839895208e-05, "loss": 0.1725, "step": 565 }, { "epoch": 0.11880772460117549, "grad_norm": 0.14333437383174896, "learning_rate": 9.989369373831407e-05, "loss": 0.1731, "step": 566 }, { "epoch": 0.1190176322418136, "grad_norm": 0.16472546756267548, "learning_rate": 9.989129228362284e-05, "loss": 0.1641, "step": 567 }, { "epoch": 0.11922753988245172, "grad_norm": 0.19595298171043396, "learning_rate": 9.988886403616802e-05, "loss": 0.1789, "step": 568 }, { "epoch": 0.11943744752308984, "grad_norm": 0.15337203443050385, "learning_rate": 9.988640899725361e-05, "loss": 0.166, "step": 569 }, { "epoch": 0.11964735516372796, "grad_norm": 0.13052469491958618, "learning_rate": 9.988392716819806e-05, "loss": 0.1762, "step": 570 }, { "epoch": 0.11985726280436608, "grad_norm": 0.17160564661026, "learning_rate": 9.988141855033415e-05, "loss": 0.1742, "step": 571 }, { "epoch": 0.1200671704450042, "grad_norm": 0.15600250661373138, "learning_rate": 9.987888314500906e-05, "loss": 0.1819, "step": 572 }, { "epoch": 0.12027707808564232, "grad_norm": 0.17436926066875458, "learning_rate": 9.987632095358437e-05, "loss": 0.154, "step": 573 }, { "epoch": 0.12048698572628044, "grad_norm": 0.1583249568939209, "learning_rate": 9.987373197743603e-05, "loss": 0.1812, "step": 574 }, { "epoch": 0.12069689336691855, "grad_norm": 0.14877773821353912, "learning_rate": 9.987111621795437e-05, "loss": 0.1591, "step": 575 }, { "epoch": 0.12090680100755667, "grad_norm": 0.13348308205604553, "learning_rate": 9.986847367654414e-05, "loss": 0.164, "step": 576 }, { "epoch": 0.12111670864819479, "grad_norm": 0.16104738414287567, "learning_rate": 9.986580435462443e-05, "loss": 0.1568, "step": 577 }, { "epoch": 0.12132661628883291, "grad_norm": 0.1393367350101471, "learning_rate": 9.98631082536287e-05, "loss": 0.1504, "step": 578 }, { "epoch": 0.12153652392947104, "grad_norm": 0.17166094481945038, "learning_rate": 9.986038537500488e-05, "loss": 0.1746, "step": 579 }, { "epoch": 0.12174643157010916, "grad_norm": 0.19518902897834778, "learning_rate": 9.985763572021516e-05, "loss": 0.1961, "step": 580 }, { "epoch": 0.12195633921074728, "grad_norm": 0.17867891490459442, "learning_rate": 9.985485929073619e-05, "loss": 0.1836, "step": 581 }, { "epoch": 0.12216624685138538, "grad_norm": 0.2270008772611618, "learning_rate": 9.9852056088059e-05, "loss": 0.1847, "step": 582 }, { "epoch": 0.1223761544920235, "grad_norm": 0.16426697373390198, "learning_rate": 9.984922611368892e-05, "loss": 0.1587, "step": 583 }, { "epoch": 0.12258606213266163, "grad_norm": 0.16188107430934906, "learning_rate": 9.984636936914575e-05, "loss": 0.1783, "step": 584 }, { "epoch": 0.12279596977329975, "grad_norm": 0.16157647967338562, "learning_rate": 9.984348585596361e-05, "loss": 0.1612, "step": 585 }, { "epoch": 0.12300587741393787, "grad_norm": 0.17590083181858063, "learning_rate": 9.984057557569104e-05, "loss": 0.1852, "step": 586 }, { "epoch": 0.12321578505457599, "grad_norm": 0.13868169486522675, "learning_rate": 9.983763852989088e-05, "loss": 0.1915, "step": 587 }, { "epoch": 0.12342569269521411, "grad_norm": 0.15027277171611786, "learning_rate": 9.983467472014043e-05, "loss": 0.1924, "step": 588 }, { "epoch": 0.12363560033585222, "grad_norm": 0.16492144763469696, "learning_rate": 9.983168414803132e-05, "loss": 0.1768, "step": 589 }, { "epoch": 0.12384550797649034, "grad_norm": 0.16169002652168274, "learning_rate": 9.982866681516954e-05, "loss": 0.1704, "step": 590 }, { "epoch": 0.12405541561712846, "grad_norm": 0.16292813420295715, "learning_rate": 9.982562272317546e-05, "loss": 0.1687, "step": 591 }, { "epoch": 0.12426532325776658, "grad_norm": 0.16432535648345947, "learning_rate": 9.982255187368386e-05, "loss": 0.1702, "step": 592 }, { "epoch": 0.1244752308984047, "grad_norm": 0.13967812061309814, "learning_rate": 9.981945426834382e-05, "loss": 0.1858, "step": 593 }, { "epoch": 0.12468513853904283, "grad_norm": 0.16663575172424316, "learning_rate": 9.981632990881885e-05, "loss": 0.158, "step": 594 }, { "epoch": 0.12489504617968095, "grad_norm": 0.17746829986572266, "learning_rate": 9.981317879678679e-05, "loss": 0.1679, "step": 595 }, { "epoch": 0.12510495382031905, "grad_norm": 0.15006081759929657, "learning_rate": 9.981000093393986e-05, "loss": 0.1659, "step": 596 }, { "epoch": 0.1253148614609572, "grad_norm": 0.15521980822086334, "learning_rate": 9.980679632198466e-05, "loss": 0.1587, "step": 597 }, { "epoch": 0.1255247691015953, "grad_norm": 0.17882917821407318, "learning_rate": 9.980356496264212e-05, "loss": 0.1571, "step": 598 }, { "epoch": 0.12573467674223343, "grad_norm": 0.17295823991298676, "learning_rate": 9.980030685764754e-05, "loss": 0.1936, "step": 599 }, { "epoch": 0.12594458438287154, "grad_norm": 0.1699322760105133, "learning_rate": 9.979702200875065e-05, "loss": 0.1555, "step": 600 }, { "epoch": 0.12615449202350965, "grad_norm": 0.15549197793006897, "learning_rate": 9.979371041771543e-05, "loss": 0.1569, "step": 601 }, { "epoch": 0.12636439966414778, "grad_norm": 0.16207100450992584, "learning_rate": 9.979037208632034e-05, "loss": 0.1609, "step": 602 }, { "epoch": 0.1265743073047859, "grad_norm": 0.156686931848526, "learning_rate": 9.978700701635807e-05, "loss": 0.165, "step": 603 }, { "epoch": 0.12678421494542402, "grad_norm": 0.16895289719104767, "learning_rate": 9.97836152096358e-05, "loss": 0.1623, "step": 604 }, { "epoch": 0.12699412258606213, "grad_norm": 0.14077980816364288, "learning_rate": 9.978019666797498e-05, "loss": 0.1573, "step": 605 }, { "epoch": 0.12720403022670027, "grad_norm": 0.14448733627796173, "learning_rate": 9.977675139321146e-05, "loss": 0.1671, "step": 606 }, { "epoch": 0.12741393786733837, "grad_norm": 0.1879337728023529, "learning_rate": 9.977327938719541e-05, "loss": 0.1644, "step": 607 }, { "epoch": 0.12762384550797648, "grad_norm": 0.22603319585323334, "learning_rate": 9.976978065179138e-05, "loss": 0.1817, "step": 608 }, { "epoch": 0.12783375314861462, "grad_norm": 0.19939488172531128, "learning_rate": 9.976625518887828e-05, "loss": 0.1925, "step": 609 }, { "epoch": 0.12804366078925272, "grad_norm": 0.17629611492156982, "learning_rate": 9.976270300034936e-05, "loss": 0.1603, "step": 610 }, { "epoch": 0.12825356842989086, "grad_norm": 0.17778918147087097, "learning_rate": 9.975912408811223e-05, "loss": 0.1849, "step": 611 }, { "epoch": 0.12846347607052896, "grad_norm": 0.2002590447664261, "learning_rate": 9.975551845408886e-05, "loss": 0.166, "step": 612 }, { "epoch": 0.1286733837111671, "grad_norm": 0.20324808359146118, "learning_rate": 9.975188610021553e-05, "loss": 0.1673, "step": 613 }, { "epoch": 0.1288832913518052, "grad_norm": 0.20010024309158325, "learning_rate": 9.974822702844291e-05, "loss": 0.1529, "step": 614 }, { "epoch": 0.12909319899244331, "grad_norm": 0.18944194912910461, "learning_rate": 9.974454124073603e-05, "loss": 0.1652, "step": 615 }, { "epoch": 0.12930310663308145, "grad_norm": 0.16956883668899536, "learning_rate": 9.974082873907418e-05, "loss": 0.1812, "step": 616 }, { "epoch": 0.12951301427371956, "grad_norm": 0.21157757937908173, "learning_rate": 9.973708952545111e-05, "loss": 0.1862, "step": 617 }, { "epoch": 0.1297229219143577, "grad_norm": 0.20554983615875244, "learning_rate": 9.973332360187486e-05, "loss": 0.1705, "step": 618 }, { "epoch": 0.1299328295549958, "grad_norm": 0.21964187920093536, "learning_rate": 9.97295309703678e-05, "loss": 0.1807, "step": 619 }, { "epoch": 0.13014273719563393, "grad_norm": 0.15438182651996613, "learning_rate": 9.972571163296666e-05, "loss": 0.1951, "step": 620 }, { "epoch": 0.13035264483627204, "grad_norm": 0.17805755138397217, "learning_rate": 9.972186559172253e-05, "loss": 0.1543, "step": 621 }, { "epoch": 0.13056255247691015, "grad_norm": 0.1914507895708084, "learning_rate": 9.97179928487008e-05, "loss": 0.1815, "step": 622 }, { "epoch": 0.13077246011754828, "grad_norm": 0.18952740728855133, "learning_rate": 9.971409340598123e-05, "loss": 0.1601, "step": 623 }, { "epoch": 0.1309823677581864, "grad_norm": 0.1550108641386032, "learning_rate": 9.971016726565791e-05, "loss": 0.1726, "step": 624 }, { "epoch": 0.13119227539882453, "grad_norm": 0.15205375850200653, "learning_rate": 9.970621442983929e-05, "loss": 0.1536, "step": 625 }, { "epoch": 0.13140218303946263, "grad_norm": 0.15574775636196136, "learning_rate": 9.970223490064809e-05, "loss": 0.167, "step": 626 }, { "epoch": 0.13161209068010077, "grad_norm": 0.18050767481327057, "learning_rate": 9.969822868022143e-05, "loss": 0.1752, "step": 627 }, { "epoch": 0.13182199832073888, "grad_norm": 0.1507614701986313, "learning_rate": 9.969419577071076e-05, "loss": 0.1712, "step": 628 }, { "epoch": 0.13203190596137698, "grad_norm": 0.20433206856250763, "learning_rate": 9.96901361742818e-05, "loss": 0.1782, "step": 629 }, { "epoch": 0.13224181360201512, "grad_norm": 0.14705337584018707, "learning_rate": 9.968604989311467e-05, "loss": 0.1554, "step": 630 }, { "epoch": 0.13245172124265323, "grad_norm": 0.1573822945356369, "learning_rate": 9.968193692940382e-05, "loss": 0.1762, "step": 631 }, { "epoch": 0.13266162888329136, "grad_norm": 0.17816203832626343, "learning_rate": 9.967779728535797e-05, "loss": 0.1783, "step": 632 }, { "epoch": 0.13287153652392947, "grad_norm": 0.1705409288406372, "learning_rate": 9.967363096320022e-05, "loss": 0.1734, "step": 633 }, { "epoch": 0.1330814441645676, "grad_norm": 0.14574241638183594, "learning_rate": 9.966943796516798e-05, "loss": 0.1599, "step": 634 }, { "epoch": 0.1332913518052057, "grad_norm": 0.1341760754585266, "learning_rate": 9.966521829351297e-05, "loss": 0.1833, "step": 635 }, { "epoch": 0.13350125944584382, "grad_norm": 0.1680106520652771, "learning_rate": 9.966097195050128e-05, "loss": 0.1701, "step": 636 }, { "epoch": 0.13371116708648195, "grad_norm": 0.1687798947095871, "learning_rate": 9.965669893841326e-05, "loss": 0.1669, "step": 637 }, { "epoch": 0.13392107472712006, "grad_norm": 0.1792893260717392, "learning_rate": 9.965239925954364e-05, "loss": 0.1705, "step": 638 }, { "epoch": 0.1341309823677582, "grad_norm": 0.15732963383197784, "learning_rate": 9.964807291620144e-05, "loss": 0.176, "step": 639 }, { "epoch": 0.1343408900083963, "grad_norm": 0.14176061749458313, "learning_rate": 9.964371991070999e-05, "loss": 0.1755, "step": 640 }, { "epoch": 0.13455079764903444, "grad_norm": 0.1526860147714615, "learning_rate": 9.963934024540698e-05, "loss": 0.1589, "step": 641 }, { "epoch": 0.13476070528967254, "grad_norm": 0.15608391165733337, "learning_rate": 9.963493392264435e-05, "loss": 0.1652, "step": 642 }, { "epoch": 0.13497061293031065, "grad_norm": 0.14473848044872284, "learning_rate": 9.963050094478845e-05, "loss": 0.1742, "step": 643 }, { "epoch": 0.1351805205709488, "grad_norm": 0.1382717341184616, "learning_rate": 9.962604131421984e-05, "loss": 0.1713, "step": 644 }, { "epoch": 0.1353904282115869, "grad_norm": 0.14345118403434753, "learning_rate": 9.962155503333348e-05, "loss": 0.1648, "step": 645 }, { "epoch": 0.13560033585222503, "grad_norm": 0.1398243010044098, "learning_rate": 9.96170421045386e-05, "loss": 0.1889, "step": 646 }, { "epoch": 0.13581024349286314, "grad_norm": 0.14956693351268768, "learning_rate": 9.96125025302587e-05, "loss": 0.1744, "step": 647 }, { "epoch": 0.13602015113350127, "grad_norm": 0.15064063668251038, "learning_rate": 9.96079363129317e-05, "loss": 0.1625, "step": 648 }, { "epoch": 0.13623005877413938, "grad_norm": 0.15413698554039001, "learning_rate": 9.960334345500974e-05, "loss": 0.1745, "step": 649 }, { "epoch": 0.13643996641477749, "grad_norm": 0.18636789917945862, "learning_rate": 9.959872395895929e-05, "loss": 0.1639, "step": 650 }, { "epoch": 0.13664987405541562, "grad_norm": 0.2444021999835968, "learning_rate": 9.959407782726108e-05, "loss": 0.1826, "step": 651 }, { "epoch": 0.13685978169605373, "grad_norm": 0.16845685243606567, "learning_rate": 9.958940506241026e-05, "loss": 0.1793, "step": 652 }, { "epoch": 0.13706968933669186, "grad_norm": 0.16233941912651062, "learning_rate": 9.958470566691618e-05, "loss": 0.18, "step": 653 }, { "epoch": 0.13727959697732997, "grad_norm": 0.18665330111980438, "learning_rate": 9.95799796433025e-05, "loss": 0.1662, "step": 654 }, { "epoch": 0.1374895046179681, "grad_norm": 0.18732531368732452, "learning_rate": 9.957522699410723e-05, "loss": 0.1646, "step": 655 }, { "epoch": 0.1376994122586062, "grad_norm": 0.12580807507038116, "learning_rate": 9.957044772188266e-05, "loss": 0.1616, "step": 656 }, { "epoch": 0.13790931989924432, "grad_norm": 0.17410576343536377, "learning_rate": 9.956564182919535e-05, "loss": 0.1628, "step": 657 }, { "epoch": 0.13811922753988246, "grad_norm": 0.1753510683774948, "learning_rate": 9.95608093186262e-05, "loss": 0.1867, "step": 658 }, { "epoch": 0.13832913518052056, "grad_norm": 0.17664988338947296, "learning_rate": 9.955595019277032e-05, "loss": 0.1819, "step": 659 }, { "epoch": 0.1385390428211587, "grad_norm": 0.1787986010313034, "learning_rate": 9.955106445423722e-05, "loss": 0.1668, "step": 660 }, { "epoch": 0.1387489504617968, "grad_norm": 0.14035004377365112, "learning_rate": 9.954615210565065e-05, "loss": 0.1651, "step": 661 }, { "epoch": 0.13895885810243494, "grad_norm": 0.20008955895900726, "learning_rate": 9.954121314964864e-05, "loss": 0.179, "step": 662 }, { "epoch": 0.13916876574307305, "grad_norm": 0.22378800809383392, "learning_rate": 9.953624758888352e-05, "loss": 0.1729, "step": 663 }, { "epoch": 0.13937867338371115, "grad_norm": 0.18687045574188232, "learning_rate": 9.953125542602193e-05, "loss": 0.1756, "step": 664 }, { "epoch": 0.1395885810243493, "grad_norm": 0.1506877988576889, "learning_rate": 9.952623666374475e-05, "loss": 0.1616, "step": 665 }, { "epoch": 0.1397984886649874, "grad_norm": 0.20918136835098267, "learning_rate": 9.95211913047472e-05, "loss": 0.1888, "step": 666 }, { "epoch": 0.14000839630562553, "grad_norm": 0.17521612346172333, "learning_rate": 9.951611935173872e-05, "loss": 0.1886, "step": 667 }, { "epoch": 0.14021830394626364, "grad_norm": 0.14788690209388733, "learning_rate": 9.951102080744308e-05, "loss": 0.162, "step": 668 }, { "epoch": 0.14042821158690177, "grad_norm": 0.18560314178466797, "learning_rate": 9.950589567459832e-05, "loss": 0.1573, "step": 669 }, { "epoch": 0.14063811922753988, "grad_norm": 0.16913674771785736, "learning_rate": 9.950074395595675e-05, "loss": 0.1713, "step": 670 }, { "epoch": 0.140848026868178, "grad_norm": 0.13768184185028076, "learning_rate": 9.949556565428496e-05, "loss": 0.1733, "step": 671 }, { "epoch": 0.14105793450881612, "grad_norm": 0.19472239911556244, "learning_rate": 9.949036077236382e-05, "loss": 0.1638, "step": 672 }, { "epoch": 0.14126784214945423, "grad_norm": 0.17684867978096008, "learning_rate": 9.948512931298846e-05, "loss": 0.1686, "step": 673 }, { "epoch": 0.14147774979009237, "grad_norm": 0.20061515271663666, "learning_rate": 9.94798712789683e-05, "loss": 0.1921, "step": 674 }, { "epoch": 0.14168765743073047, "grad_norm": 0.17213481664657593, "learning_rate": 9.9474586673127e-05, "loss": 0.1636, "step": 675 }, { "epoch": 0.1418975650713686, "grad_norm": 0.16144217550754547, "learning_rate": 9.946927549830258e-05, "loss": 0.1594, "step": 676 }, { "epoch": 0.14210747271200672, "grad_norm": 0.16045495867729187, "learning_rate": 9.946393775734719e-05, "loss": 0.1585, "step": 677 }, { "epoch": 0.14231738035264482, "grad_norm": 0.168419748544693, "learning_rate": 9.945857345312735e-05, "loss": 0.1618, "step": 678 }, { "epoch": 0.14252728799328296, "grad_norm": 0.16631141304969788, "learning_rate": 9.945318258852383e-05, "loss": 0.1648, "step": 679 }, { "epoch": 0.14273719563392107, "grad_norm": 0.17133933305740356, "learning_rate": 9.944776516643161e-05, "loss": 0.1902, "step": 680 }, { "epoch": 0.1429471032745592, "grad_norm": 0.144994854927063, "learning_rate": 9.944232118976e-05, "loss": 0.1645, "step": 681 }, { "epoch": 0.1431570109151973, "grad_norm": 0.13521502912044525, "learning_rate": 9.943685066143252e-05, "loss": 0.1679, "step": 682 }, { "epoch": 0.14336691855583544, "grad_norm": 0.1505574733018875, "learning_rate": 9.943135358438698e-05, "loss": 0.1497, "step": 683 }, { "epoch": 0.14357682619647355, "grad_norm": 0.1701841652393341, "learning_rate": 9.942582996157544e-05, "loss": 0.141, "step": 684 }, { "epoch": 0.14378673383711166, "grad_norm": 0.16892337799072266, "learning_rate": 9.94202797959642e-05, "loss": 0.1845, "step": 685 }, { "epoch": 0.1439966414777498, "grad_norm": 0.1322741061449051, "learning_rate": 9.941470309053384e-05, "loss": 0.1635, "step": 686 }, { "epoch": 0.1442065491183879, "grad_norm": 0.18180270493030548, "learning_rate": 9.940909984827915e-05, "loss": 0.1521, "step": 687 }, { "epoch": 0.14441645675902604, "grad_norm": 0.17136745154857635, "learning_rate": 9.940347007220924e-05, "loss": 0.1694, "step": 688 }, { "epoch": 0.14462636439966414, "grad_norm": 0.16122983396053314, "learning_rate": 9.93978137653474e-05, "loss": 0.1726, "step": 689 }, { "epoch": 0.14483627204030228, "grad_norm": 0.15024663507938385, "learning_rate": 9.939213093073118e-05, "loss": 0.1703, "step": 690 }, { "epoch": 0.14504617968094038, "grad_norm": 0.14193399250507355, "learning_rate": 9.938642157141245e-05, "loss": 0.1837, "step": 691 }, { "epoch": 0.1452560873215785, "grad_norm": 0.15650463104248047, "learning_rate": 9.938068569045721e-05, "loss": 0.1665, "step": 692 }, { "epoch": 0.14546599496221663, "grad_norm": 0.14254000782966614, "learning_rate": 9.937492329094577e-05, "loss": 0.175, "step": 693 }, { "epoch": 0.14567590260285473, "grad_norm": 0.17051447927951813, "learning_rate": 9.93691343759727e-05, "loss": 0.1664, "step": 694 }, { "epoch": 0.14588581024349287, "grad_norm": 0.16692955791950226, "learning_rate": 9.936331894864677e-05, "loss": 0.1682, "step": 695 }, { "epoch": 0.14609571788413098, "grad_norm": 0.20158237218856812, "learning_rate": 9.935747701209096e-05, "loss": 0.1574, "step": 696 }, { "epoch": 0.1463056255247691, "grad_norm": 0.1351911425590515, "learning_rate": 9.935160856944257e-05, "loss": 0.1618, "step": 697 }, { "epoch": 0.14651553316540722, "grad_norm": 0.1611570417881012, "learning_rate": 9.934571362385305e-05, "loss": 0.164, "step": 698 }, { "epoch": 0.14672544080604533, "grad_norm": 0.22555968165397644, "learning_rate": 9.933979217848815e-05, "loss": 0.2044, "step": 699 }, { "epoch": 0.14693534844668346, "grad_norm": 0.17471735179424286, "learning_rate": 9.93338442365278e-05, "loss": 0.1809, "step": 700 }, { "epoch": 0.14714525608732157, "grad_norm": 0.12285126745700836, "learning_rate": 9.93278698011662e-05, "loss": 0.1644, "step": 701 }, { "epoch": 0.1473551637279597, "grad_norm": 0.1364145576953888, "learning_rate": 9.932186887561175e-05, "loss": 0.1655, "step": 702 }, { "epoch": 0.1475650713685978, "grad_norm": 0.14189580082893372, "learning_rate": 9.931584146308708e-05, "loss": 0.1729, "step": 703 }, { "epoch": 0.14777497900923595, "grad_norm": 0.2254076600074768, "learning_rate": 9.930978756682905e-05, "loss": 0.1731, "step": 704 }, { "epoch": 0.14798488664987405, "grad_norm": 0.15398503839969635, "learning_rate": 9.930370719008875e-05, "loss": 0.1813, "step": 705 }, { "epoch": 0.1481947942905122, "grad_norm": 0.15267014503479004, "learning_rate": 9.929760033613146e-05, "loss": 0.1681, "step": 706 }, { "epoch": 0.1484047019311503, "grad_norm": 0.1611442267894745, "learning_rate": 9.929146700823671e-05, "loss": 0.1925, "step": 707 }, { "epoch": 0.1486146095717884, "grad_norm": 0.15641498565673828, "learning_rate": 9.928530720969827e-05, "loss": 0.1733, "step": 708 }, { "epoch": 0.14882451721242654, "grad_norm": 0.12724818289279938, "learning_rate": 9.927912094382403e-05, "loss": 0.1735, "step": 709 }, { "epoch": 0.14903442485306465, "grad_norm": 0.17570020258426666, "learning_rate": 9.92729082139362e-05, "loss": 0.1732, "step": 710 }, { "epoch": 0.14924433249370278, "grad_norm": 0.19477427005767822, "learning_rate": 9.926666902337115e-05, "loss": 0.1551, "step": 711 }, { "epoch": 0.1494542401343409, "grad_norm": 0.16798420250415802, "learning_rate": 9.926040337547946e-05, "loss": 0.1906, "step": 712 }, { "epoch": 0.14966414777497902, "grad_norm": 0.17551501095294952, "learning_rate": 9.925411127362594e-05, "loss": 0.1747, "step": 713 }, { "epoch": 0.14987405541561713, "grad_norm": 0.1842852234840393, "learning_rate": 9.924779272118957e-05, "loss": 0.1563, "step": 714 }, { "epoch": 0.15008396305625524, "grad_norm": 0.1805478185415268, "learning_rate": 9.924144772156358e-05, "loss": 0.165, "step": 715 }, { "epoch": 0.15029387069689337, "grad_norm": 0.21542084217071533, "learning_rate": 9.923507627815536e-05, "loss": 0.1469, "step": 716 }, { "epoch": 0.15050377833753148, "grad_norm": 0.15649262070655823, "learning_rate": 9.922867839438654e-05, "loss": 0.1706, "step": 717 }, { "epoch": 0.15071368597816961, "grad_norm": 0.14313052594661713, "learning_rate": 9.92222540736929e-05, "loss": 0.1737, "step": 718 }, { "epoch": 0.15092359361880772, "grad_norm": 0.13190749287605286, "learning_rate": 9.92158033195245e-05, "loss": 0.1742, "step": 719 }, { "epoch": 0.15113350125944586, "grad_norm": 0.18187177181243896, "learning_rate": 9.920932613534549e-05, "loss": 0.1748, "step": 720 }, { "epoch": 0.15134340890008396, "grad_norm": 0.14302362501621246, "learning_rate": 9.920282252463429e-05, "loss": 0.1711, "step": 721 }, { "epoch": 0.15155331654072207, "grad_norm": 0.16898708045482635, "learning_rate": 9.919629249088347e-05, "loss": 0.1616, "step": 722 }, { "epoch": 0.1517632241813602, "grad_norm": 0.15528154373168945, "learning_rate": 9.918973603759984e-05, "loss": 0.1652, "step": 723 }, { "epoch": 0.15197313182199831, "grad_norm": 0.1718195527791977, "learning_rate": 9.918315316830434e-05, "loss": 0.1758, "step": 724 }, { "epoch": 0.15218303946263645, "grad_norm": 0.15404529869556427, "learning_rate": 9.917654388653211e-05, "loss": 0.162, "step": 725 }, { "epoch": 0.15239294710327456, "grad_norm": 0.19148094952106476, "learning_rate": 9.916990819583252e-05, "loss": 0.1887, "step": 726 }, { "epoch": 0.1526028547439127, "grad_norm": 0.21298371255397797, "learning_rate": 9.916324609976906e-05, "loss": 0.1712, "step": 727 }, { "epoch": 0.1528127623845508, "grad_norm": 0.2041487991809845, "learning_rate": 9.915655760191944e-05, "loss": 0.167, "step": 728 }, { "epoch": 0.1530226700251889, "grad_norm": 0.17242367565631866, "learning_rate": 9.914984270587552e-05, "loss": 0.163, "step": 729 }, { "epoch": 0.15323257766582704, "grad_norm": 0.1646365076303482, "learning_rate": 9.914310141524339e-05, "loss": 0.1638, "step": 730 }, { "epoch": 0.15344248530646515, "grad_norm": 0.1947726458311081, "learning_rate": 9.913633373364324e-05, "loss": 0.1828, "step": 731 }, { "epoch": 0.15365239294710328, "grad_norm": 0.17962804436683655, "learning_rate": 9.912953966470948e-05, "loss": 0.1699, "step": 732 }, { "epoch": 0.1538623005877414, "grad_norm": 0.17121249437332153, "learning_rate": 9.912271921209068e-05, "loss": 0.1605, "step": 733 }, { "epoch": 0.15407220822837953, "grad_norm": 0.17192313075065613, "learning_rate": 9.911587237944959e-05, "loss": 0.1746, "step": 734 }, { "epoch": 0.15428211586901763, "grad_norm": 0.1262722760438919, "learning_rate": 9.910899917046311e-05, "loss": 0.1677, "step": 735 }, { "epoch": 0.15449202350965574, "grad_norm": 0.15364903211593628, "learning_rate": 9.910209958882231e-05, "loss": 0.1792, "step": 736 }, { "epoch": 0.15470193115029388, "grad_norm": 0.17038469016551971, "learning_rate": 9.909517363823241e-05, "loss": 0.1588, "step": 737 }, { "epoch": 0.15491183879093198, "grad_norm": 0.14676110446453094, "learning_rate": 9.908822132241281e-05, "loss": 0.1639, "step": 738 }, { "epoch": 0.15512174643157012, "grad_norm": 0.15300148725509644, "learning_rate": 9.908124264509707e-05, "loss": 0.1752, "step": 739 }, { "epoch": 0.15533165407220823, "grad_norm": 0.1372271180152893, "learning_rate": 9.90742376100329e-05, "loss": 0.1728, "step": 740 }, { "epoch": 0.15554156171284636, "grad_norm": 0.13885709643363953, "learning_rate": 9.906720622098215e-05, "loss": 0.1625, "step": 741 }, { "epoch": 0.15575146935348447, "grad_norm": 0.12582066655158997, "learning_rate": 9.906014848172086e-05, "loss": 0.17, "step": 742 }, { "epoch": 0.15596137699412257, "grad_norm": 0.13245654106140137, "learning_rate": 9.905306439603918e-05, "loss": 0.1711, "step": 743 }, { "epoch": 0.1561712846347607, "grad_norm": 0.12488420307636261, "learning_rate": 9.904595396774142e-05, "loss": 0.1725, "step": 744 }, { "epoch": 0.15638119227539882, "grad_norm": 0.1577821522951126, "learning_rate": 9.903881720064606e-05, "loss": 0.1707, "step": 745 }, { "epoch": 0.15659109991603695, "grad_norm": 0.16098785400390625, "learning_rate": 9.903165409858567e-05, "loss": 0.1608, "step": 746 }, { "epoch": 0.15680100755667506, "grad_norm": 0.14498507976531982, "learning_rate": 9.902446466540707e-05, "loss": 0.1776, "step": 747 }, { "epoch": 0.1570109151973132, "grad_norm": 0.15581083297729492, "learning_rate": 9.901724890497109e-05, "loss": 0.1746, "step": 748 }, { "epoch": 0.1572208228379513, "grad_norm": 0.16081508994102478, "learning_rate": 9.901000682115276e-05, "loss": 0.1869, "step": 749 }, { "epoch": 0.1574307304785894, "grad_norm": 0.16249604523181915, "learning_rate": 9.900273841784126e-05, "loss": 0.1629, "step": 750 }, { "epoch": 0.15764063811922754, "grad_norm": 0.1613830178976059, "learning_rate": 9.899544369893992e-05, "loss": 0.1588, "step": 751 }, { "epoch": 0.15785054575986565, "grad_norm": 0.15741245448589325, "learning_rate": 9.898812266836613e-05, "loss": 0.1708, "step": 752 }, { "epoch": 0.1580604534005038, "grad_norm": 0.12306373566389084, "learning_rate": 9.898077533005144e-05, "loss": 0.1751, "step": 753 }, { "epoch": 0.1582703610411419, "grad_norm": 0.15071968734264374, "learning_rate": 9.897340168794155e-05, "loss": 0.1686, "step": 754 }, { "epoch": 0.15848026868178003, "grad_norm": 0.14102348685264587, "learning_rate": 9.896600174599632e-05, "loss": 0.1701, "step": 755 }, { "epoch": 0.15869017632241814, "grad_norm": 0.16288180649280548, "learning_rate": 9.895857550818963e-05, "loss": 0.1652, "step": 756 }, { "epoch": 0.15890008396305624, "grad_norm": 0.17342409491539001, "learning_rate": 9.895112297850956e-05, "loss": 0.1702, "step": 757 }, { "epoch": 0.15910999160369438, "grad_norm": 0.13485445082187653, "learning_rate": 9.894364416095829e-05, "loss": 0.1705, "step": 758 }, { "epoch": 0.15931989924433249, "grad_norm": 0.17127734422683716, "learning_rate": 9.893613905955211e-05, "loss": 0.1831, "step": 759 }, { "epoch": 0.15952980688497062, "grad_norm": 0.13848379254341125, "learning_rate": 9.892860767832144e-05, "loss": 0.161, "step": 760 }, { "epoch": 0.15973971452560873, "grad_norm": 0.12154096364974976, "learning_rate": 9.892105002131081e-05, "loss": 0.1637, "step": 761 }, { "epoch": 0.15994962216624686, "grad_norm": 0.15433917939662933, "learning_rate": 9.891346609257882e-05, "loss": 0.1765, "step": 762 }, { "epoch": 0.16015952980688497, "grad_norm": 0.16946150362491608, "learning_rate": 9.890585589619825e-05, "loss": 0.1766, "step": 763 }, { "epoch": 0.16036943744752308, "grad_norm": 0.14503143727779388, "learning_rate": 9.889821943625594e-05, "loss": 0.1458, "step": 764 }, { "epoch": 0.1605793450881612, "grad_norm": 0.16491486132144928, "learning_rate": 9.889055671685283e-05, "loss": 0.1861, "step": 765 }, { "epoch": 0.16078925272879932, "grad_norm": 0.15087951719760895, "learning_rate": 9.888286774210398e-05, "loss": 0.1745, "step": 766 }, { "epoch": 0.16099916036943746, "grad_norm": 0.19266510009765625, "learning_rate": 9.887515251613857e-05, "loss": 0.1711, "step": 767 }, { "epoch": 0.16120906801007556, "grad_norm": 0.19419905543327332, "learning_rate": 9.886741104309981e-05, "loss": 0.1705, "step": 768 }, { "epoch": 0.1614189756507137, "grad_norm": 0.13408306241035461, "learning_rate": 9.885964332714508e-05, "loss": 0.1629, "step": 769 }, { "epoch": 0.1616288832913518, "grad_norm": 0.17757900059223175, "learning_rate": 9.885184937244581e-05, "loss": 0.1643, "step": 770 }, { "epoch": 0.1618387909319899, "grad_norm": 0.16376204788684845, "learning_rate": 9.884402918318754e-05, "loss": 0.1869, "step": 771 }, { "epoch": 0.16204869857262805, "grad_norm": 0.1362319141626358, "learning_rate": 9.883618276356988e-05, "loss": 0.1736, "step": 772 }, { "epoch": 0.16225860621326615, "grad_norm": 0.15015824139118195, "learning_rate": 9.882831011780653e-05, "loss": 0.1598, "step": 773 }, { "epoch": 0.1624685138539043, "grad_norm": 0.1815597116947174, "learning_rate": 9.882041125012528e-05, "loss": 0.1597, "step": 774 }, { "epoch": 0.1626784214945424, "grad_norm": 0.16493958234786987, "learning_rate": 9.881248616476803e-05, "loss": 0.181, "step": 775 }, { "epoch": 0.16288832913518053, "grad_norm": 0.1213487982749939, "learning_rate": 9.880453486599072e-05, "loss": 0.1644, "step": 776 }, { "epoch": 0.16309823677581864, "grad_norm": 0.1440252661705017, "learning_rate": 9.879655735806337e-05, "loss": 0.1657, "step": 777 }, { "epoch": 0.16330814441645675, "grad_norm": 0.1406947672367096, "learning_rate": 9.878855364527007e-05, "loss": 0.172, "step": 778 }, { "epoch": 0.16351805205709488, "grad_norm": 0.17073537409305573, "learning_rate": 9.878052373190902e-05, "loss": 0.1852, "step": 779 }, { "epoch": 0.163727959697733, "grad_norm": 0.15924111008644104, "learning_rate": 9.877246762229247e-05, "loss": 0.18, "step": 780 }, { "epoch": 0.16393786733837112, "grad_norm": 0.14216449856758118, "learning_rate": 9.876438532074672e-05, "loss": 0.1575, "step": 781 }, { "epoch": 0.16414777497900923, "grad_norm": 0.12731285393238068, "learning_rate": 9.875627683161217e-05, "loss": 0.161, "step": 782 }, { "epoch": 0.16435768261964737, "grad_norm": 0.16211947798728943, "learning_rate": 9.874814215924324e-05, "loss": 0.1592, "step": 783 }, { "epoch": 0.16456759026028547, "grad_norm": 0.1463523507118225, "learning_rate": 9.873998130800844e-05, "loss": 0.1773, "step": 784 }, { "epoch": 0.16477749790092358, "grad_norm": 0.14601489901542664, "learning_rate": 9.873179428229033e-05, "loss": 0.1685, "step": 785 }, { "epoch": 0.16498740554156172, "grad_norm": 0.1590069830417633, "learning_rate": 9.872358108648557e-05, "loss": 0.1709, "step": 786 }, { "epoch": 0.16519731318219982, "grad_norm": 0.14027854800224304, "learning_rate": 9.871534172500479e-05, "loss": 0.1842, "step": 787 }, { "epoch": 0.16540722082283796, "grad_norm": 0.12902334332466125, "learning_rate": 9.870707620227271e-05, "loss": 0.1484, "step": 788 }, { "epoch": 0.16561712846347607, "grad_norm": 0.13427670300006866, "learning_rate": 9.869878452272812e-05, "loss": 0.1599, "step": 789 }, { "epoch": 0.1658270361041142, "grad_norm": 0.17016825079917908, "learning_rate": 9.869046669082386e-05, "loss": 0.1506, "step": 790 }, { "epoch": 0.1660369437447523, "grad_norm": 0.13757233321666718, "learning_rate": 9.868212271102678e-05, "loss": 0.1574, "step": 791 }, { "epoch": 0.16624685138539042, "grad_norm": 0.19485127925872803, "learning_rate": 9.867375258781778e-05, "loss": 0.1686, "step": 792 }, { "epoch": 0.16645675902602855, "grad_norm": 0.16131381690502167, "learning_rate": 9.866535632569182e-05, "loss": 0.1693, "step": 793 }, { "epoch": 0.16666666666666666, "grad_norm": 0.16035085916519165, "learning_rate": 9.865693392915787e-05, "loss": 0.1589, "step": 794 }, { "epoch": 0.1668765743073048, "grad_norm": 0.16287830471992493, "learning_rate": 9.864848540273897e-05, "loss": 0.1749, "step": 795 }, { "epoch": 0.1670864819479429, "grad_norm": 0.16353754699230194, "learning_rate": 9.864001075097214e-05, "loss": 0.1846, "step": 796 }, { "epoch": 0.16729638958858103, "grad_norm": 0.19826947152614594, "learning_rate": 9.863150997840849e-05, "loss": 0.1624, "step": 797 }, { "epoch": 0.16750629722921914, "grad_norm": 0.14296779036521912, "learning_rate": 9.862298308961313e-05, "loss": 0.1823, "step": 798 }, { "epoch": 0.16771620486985725, "grad_norm": 0.1371513456106186, "learning_rate": 9.861443008916517e-05, "loss": 0.1742, "step": 799 }, { "epoch": 0.16792611251049538, "grad_norm": 0.17043526470661163, "learning_rate": 9.86058509816578e-05, "loss": 0.1661, "step": 800 }, { "epoch": 0.1681360201511335, "grad_norm": 0.17045053839683533, "learning_rate": 9.859724577169815e-05, "loss": 0.1773, "step": 801 }, { "epoch": 0.16834592779177163, "grad_norm": 0.1564718633890152, "learning_rate": 9.858861446390748e-05, "loss": 0.1636, "step": 802 }, { "epoch": 0.16855583543240973, "grad_norm": 0.13335298001766205, "learning_rate": 9.857995706292092e-05, "loss": 0.18, "step": 803 }, { "epoch": 0.16876574307304787, "grad_norm": 0.14552432298660278, "learning_rate": 9.857127357338775e-05, "loss": 0.1599, "step": 804 }, { "epoch": 0.16897565071368598, "grad_norm": 0.1437654346227646, "learning_rate": 9.856256399997119e-05, "loss": 0.178, "step": 805 }, { "epoch": 0.16918555835432408, "grad_norm": 0.143344447016716, "learning_rate": 9.855382834734848e-05, "loss": 0.159, "step": 806 }, { "epoch": 0.16939546599496222, "grad_norm": 0.15504872798919678, "learning_rate": 9.854506662021085e-05, "loss": 0.1944, "step": 807 }, { "epoch": 0.16960537363560033, "grad_norm": 0.12894919514656067, "learning_rate": 9.853627882326357e-05, "loss": 0.1677, "step": 808 }, { "epoch": 0.16981528127623846, "grad_norm": 0.1339186728000641, "learning_rate": 9.852746496122587e-05, "loss": 0.1666, "step": 809 }, { "epoch": 0.17002518891687657, "grad_norm": 0.11661262065172195, "learning_rate": 9.8518625038831e-05, "loss": 0.1611, "step": 810 }, { "epoch": 0.1702350965575147, "grad_norm": 0.14091713726520538, "learning_rate": 9.85097590608262e-05, "loss": 0.1483, "step": 811 }, { "epoch": 0.1704450041981528, "grad_norm": 0.13658110797405243, "learning_rate": 9.85008670319727e-05, "loss": 0.1667, "step": 812 }, { "epoch": 0.17065491183879095, "grad_norm": 0.13288968801498413, "learning_rate": 9.849194895704575e-05, "loss": 0.1685, "step": 813 }, { "epoch": 0.17086481947942905, "grad_norm": 0.1870601624250412, "learning_rate": 9.84830048408345e-05, "loss": 0.1794, "step": 814 }, { "epoch": 0.17107472712006716, "grad_norm": 0.15742088854312897, "learning_rate": 9.84740346881422e-05, "loss": 0.1546, "step": 815 }, { "epoch": 0.1712846347607053, "grad_norm": 0.1625792235136032, "learning_rate": 9.846503850378602e-05, "loss": 0.1506, "step": 816 }, { "epoch": 0.1714945424013434, "grad_norm": 0.1315995752811432, "learning_rate": 9.845601629259708e-05, "loss": 0.1611, "step": 817 }, { "epoch": 0.17170445004198154, "grad_norm": 0.16079425811767578, "learning_rate": 9.844696805942055e-05, "loss": 0.1588, "step": 818 }, { "epoch": 0.17191435768261965, "grad_norm": 0.16755861043930054, "learning_rate": 9.843789380911554e-05, "loss": 0.1629, "step": 819 }, { "epoch": 0.17212426532325778, "grad_norm": 0.12923209369182587, "learning_rate": 9.84287935465551e-05, "loss": 0.1675, "step": 820 }, { "epoch": 0.1723341729638959, "grad_norm": 0.14494512975215912, "learning_rate": 9.84196672766263e-05, "loss": 0.1674, "step": 821 }, { "epoch": 0.172544080604534, "grad_norm": 0.13111597299575806, "learning_rate": 9.841051500423014e-05, "loss": 0.1835, "step": 822 }, { "epoch": 0.17275398824517213, "grad_norm": 0.1468263864517212, "learning_rate": 9.840133673428162e-05, "loss": 0.1635, "step": 823 }, { "epoch": 0.17296389588581024, "grad_norm": 0.14243565499782562, "learning_rate": 9.839213247170967e-05, "loss": 0.1617, "step": 824 }, { "epoch": 0.17317380352644837, "grad_norm": 0.14563241600990295, "learning_rate": 9.838290222145718e-05, "loss": 0.1614, "step": 825 }, { "epoch": 0.17338371116708648, "grad_norm": 0.15182636678218842, "learning_rate": 9.837364598848102e-05, "loss": 0.1712, "step": 826 }, { "epoch": 0.17359361880772461, "grad_norm": 0.15270951390266418, "learning_rate": 9.8364363777752e-05, "loss": 0.1721, "step": 827 }, { "epoch": 0.17380352644836272, "grad_norm": 0.14734511077404022, "learning_rate": 9.835505559425487e-05, "loss": 0.1751, "step": 828 }, { "epoch": 0.17401343408900083, "grad_norm": 0.13699984550476074, "learning_rate": 9.834572144298834e-05, "loss": 0.1895, "step": 829 }, { "epoch": 0.17422334172963896, "grad_norm": 0.1422858089208603, "learning_rate": 9.833636132896505e-05, "loss": 0.1757, "step": 830 }, { "epoch": 0.17443324937027707, "grad_norm": 0.1300913542509079, "learning_rate": 9.832697525721161e-05, "loss": 0.1774, "step": 831 }, { "epoch": 0.1746431570109152, "grad_norm": 0.16247855126857758, "learning_rate": 9.831756323276856e-05, "loss": 0.1499, "step": 832 }, { "epoch": 0.1748530646515533, "grad_norm": 0.13687850534915924, "learning_rate": 9.830812526069036e-05, "loss": 0.1739, "step": 833 }, { "epoch": 0.17506297229219145, "grad_norm": 0.16242316365242004, "learning_rate": 9.829866134604543e-05, "loss": 0.1703, "step": 834 }, { "epoch": 0.17527287993282956, "grad_norm": 0.1983039677143097, "learning_rate": 9.82891714939161e-05, "loss": 0.1726, "step": 835 }, { "epoch": 0.17548278757346766, "grad_norm": 0.17525269091129303, "learning_rate": 9.827965570939861e-05, "loss": 0.1839, "step": 836 }, { "epoch": 0.1756926952141058, "grad_norm": 0.1517772525548935, "learning_rate": 9.827011399760319e-05, "loss": 0.17, "step": 837 }, { "epoch": 0.1759026028547439, "grad_norm": 0.17186671495437622, "learning_rate": 9.826054636365396e-05, "loss": 0.1526, "step": 838 }, { "epoch": 0.17611251049538204, "grad_norm": 0.15066854655742645, "learning_rate": 9.825095281268894e-05, "loss": 0.1442, "step": 839 }, { "epoch": 0.17632241813602015, "grad_norm": 0.14937272667884827, "learning_rate": 9.82413333498601e-05, "loss": 0.1641, "step": 840 }, { "epoch": 0.17653232577665828, "grad_norm": 0.1843566745519638, "learning_rate": 9.823168798033328e-05, "loss": 0.1674, "step": 841 }, { "epoch": 0.1767422334172964, "grad_norm": 0.15555419027805328, "learning_rate": 9.82220167092883e-05, "loss": 0.1551, "step": 842 }, { "epoch": 0.1769521410579345, "grad_norm": 0.1585703343153, "learning_rate": 9.821231954191885e-05, "loss": 0.1578, "step": 843 }, { "epoch": 0.17716204869857263, "grad_norm": 0.1662077009677887, "learning_rate": 9.82025964834325e-05, "loss": 0.1778, "step": 844 }, { "epoch": 0.17737195633921074, "grad_norm": 0.1385970562696457, "learning_rate": 9.819284753905078e-05, "loss": 0.1653, "step": 845 }, { "epoch": 0.17758186397984888, "grad_norm": 0.14429986476898193, "learning_rate": 9.81830727140091e-05, "loss": 0.1801, "step": 846 }, { "epoch": 0.17779177162048698, "grad_norm": 0.16140583157539368, "learning_rate": 9.817327201355675e-05, "loss": 0.1808, "step": 847 }, { "epoch": 0.17800167926112512, "grad_norm": 0.13548845052719116, "learning_rate": 9.816344544295692e-05, "loss": 0.1567, "step": 848 }, { "epoch": 0.17821158690176322, "grad_norm": 0.14740432798862457, "learning_rate": 9.815359300748674e-05, "loss": 0.1737, "step": 849 }, { "epoch": 0.17842149454240133, "grad_norm": 0.15687352418899536, "learning_rate": 9.814371471243715e-05, "loss": 0.1775, "step": 850 }, { "epoch": 0.17863140218303947, "grad_norm": 0.14518170058727264, "learning_rate": 9.813381056311307e-05, "loss": 0.1536, "step": 851 }, { "epoch": 0.17884130982367757, "grad_norm": 0.1569397896528244, "learning_rate": 9.812388056483319e-05, "loss": 0.1667, "step": 852 }, { "epoch": 0.1790512174643157, "grad_norm": 0.13371489942073822, "learning_rate": 9.81139247229302e-05, "loss": 0.175, "step": 853 }, { "epoch": 0.17926112510495382, "grad_norm": 0.15128138661384583, "learning_rate": 9.810394304275058e-05, "loss": 0.1601, "step": 854 }, { "epoch": 0.17947103274559195, "grad_norm": 0.12941020727157593, "learning_rate": 9.809393552965476e-05, "loss": 0.1579, "step": 855 }, { "epoch": 0.17968094038623006, "grad_norm": 0.1644691675901413, "learning_rate": 9.808390218901696e-05, "loss": 0.1737, "step": 856 }, { "epoch": 0.17989084802686817, "grad_norm": 0.16271840035915375, "learning_rate": 9.807384302622533e-05, "loss": 0.1846, "step": 857 }, { "epoch": 0.1801007556675063, "grad_norm": 0.15787339210510254, "learning_rate": 9.806375804668189e-05, "loss": 0.1733, "step": 858 }, { "epoch": 0.1803106633081444, "grad_norm": 0.16740775108337402, "learning_rate": 9.805364725580248e-05, "loss": 0.1625, "step": 859 }, { "epoch": 0.18052057094878254, "grad_norm": 0.11970685422420502, "learning_rate": 9.804351065901682e-05, "loss": 0.1554, "step": 860 }, { "epoch": 0.18073047858942065, "grad_norm": 0.21359467506408691, "learning_rate": 9.803334826176852e-05, "loss": 0.1658, "step": 861 }, { "epoch": 0.1809403862300588, "grad_norm": 0.1699121594429016, "learning_rate": 9.8023160069515e-05, "loss": 0.1608, "step": 862 }, { "epoch": 0.1811502938706969, "grad_norm": 0.15073060989379883, "learning_rate": 9.801294608772755e-05, "loss": 0.156, "step": 863 }, { "epoch": 0.181360201511335, "grad_norm": 0.17342859506607056, "learning_rate": 9.800270632189133e-05, "loss": 0.1829, "step": 864 }, { "epoch": 0.18157010915197314, "grad_norm": 0.20463357865810394, "learning_rate": 9.799244077750531e-05, "loss": 0.1839, "step": 865 }, { "epoch": 0.18178001679261124, "grad_norm": 0.17843082547187805, "learning_rate": 9.798214946008234e-05, "loss": 0.1707, "step": 866 }, { "epoch": 0.18198992443324938, "grad_norm": 0.14360542595386505, "learning_rate": 9.797183237514907e-05, "loss": 0.1687, "step": 867 }, { "epoch": 0.18219983207388749, "grad_norm": 0.15701772272586823, "learning_rate": 9.796148952824603e-05, "loss": 0.1584, "step": 868 }, { "epoch": 0.18240973971452562, "grad_norm": 0.12531954050064087, "learning_rate": 9.795112092492755e-05, "loss": 0.1467, "step": 869 }, { "epoch": 0.18261964735516373, "grad_norm": 0.16143764555454254, "learning_rate": 9.794072657076182e-05, "loss": 0.1687, "step": 870 }, { "epoch": 0.18282955499580184, "grad_norm": 0.13665997982025146, "learning_rate": 9.793030647133084e-05, "loss": 0.1644, "step": 871 }, { "epoch": 0.18303946263643997, "grad_norm": 0.15797339379787445, "learning_rate": 9.791986063223045e-05, "loss": 0.1737, "step": 872 }, { "epoch": 0.18324937027707808, "grad_norm": 0.13726350665092468, "learning_rate": 9.790938905907027e-05, "loss": 0.1842, "step": 873 }, { "epoch": 0.1834592779177162, "grad_norm": 0.13706736266613007, "learning_rate": 9.78988917574738e-05, "loss": 0.1699, "step": 874 }, { "epoch": 0.18366918555835432, "grad_norm": 0.13394543528556824, "learning_rate": 9.788836873307835e-05, "loss": 0.1757, "step": 875 }, { "epoch": 0.18387909319899245, "grad_norm": 0.13909773528575897, "learning_rate": 9.7877819991535e-05, "loss": 0.1761, "step": 876 }, { "epoch": 0.18408900083963056, "grad_norm": 0.1416359543800354, "learning_rate": 9.786724553850865e-05, "loss": 0.1657, "step": 877 }, { "epoch": 0.18429890848026867, "grad_norm": 0.13720481097698212, "learning_rate": 9.785664537967806e-05, "loss": 0.173, "step": 878 }, { "epoch": 0.1845088161209068, "grad_norm": 0.14136558771133423, "learning_rate": 9.784601952073573e-05, "loss": 0.1625, "step": 879 }, { "epoch": 0.1847187237615449, "grad_norm": 0.1424713134765625, "learning_rate": 9.783536796738802e-05, "loss": 0.1676, "step": 880 }, { "epoch": 0.18492863140218305, "grad_norm": 0.163283571600914, "learning_rate": 9.782469072535502e-05, "loss": 0.1574, "step": 881 }, { "epoch": 0.18513853904282115, "grad_norm": 0.16645042598247528, "learning_rate": 9.781398780037067e-05, "loss": 0.1576, "step": 882 }, { "epoch": 0.1853484466834593, "grad_norm": 0.17722882330417633, "learning_rate": 9.780325919818268e-05, "loss": 0.1815, "step": 883 }, { "epoch": 0.1855583543240974, "grad_norm": 0.1435573697090149, "learning_rate": 9.779250492455257e-05, "loss": 0.168, "step": 884 }, { "epoch": 0.1857682619647355, "grad_norm": 0.13388660550117493, "learning_rate": 9.778172498525559e-05, "loss": 0.1493, "step": 885 }, { "epoch": 0.18597816960537364, "grad_norm": 0.15198923647403717, "learning_rate": 9.777091938608088e-05, "loss": 0.1759, "step": 886 }, { "epoch": 0.18618807724601175, "grad_norm": 0.14781691133975983, "learning_rate": 9.776008813283125e-05, "loss": 0.1521, "step": 887 }, { "epoch": 0.18639798488664988, "grad_norm": 0.18348653614521027, "learning_rate": 9.774923123132332e-05, "loss": 0.1522, "step": 888 }, { "epoch": 0.186607892527288, "grad_norm": 0.15346892178058624, "learning_rate": 9.773834868738752e-05, "loss": 0.1593, "step": 889 }, { "epoch": 0.18681780016792612, "grad_norm": 0.1563442200422287, "learning_rate": 9.7727440506868e-05, "loss": 0.1569, "step": 890 }, { "epoch": 0.18702770780856423, "grad_norm": 0.12280000746250153, "learning_rate": 9.771650669562274e-05, "loss": 0.1593, "step": 891 }, { "epoch": 0.18723761544920234, "grad_norm": 0.1622755527496338, "learning_rate": 9.770554725952341e-05, "loss": 0.1736, "step": 892 }, { "epoch": 0.18744752308984047, "grad_norm": 0.19185318052768707, "learning_rate": 9.769456220445549e-05, "loss": 0.165, "step": 893 }, { "epoch": 0.18765743073047858, "grad_norm": 0.16040024161338806, "learning_rate": 9.768355153631822e-05, "loss": 0.1837, "step": 894 }, { "epoch": 0.18786733837111672, "grad_norm": 0.11211330443620682, "learning_rate": 9.767251526102456e-05, "loss": 0.152, "step": 895 }, { "epoch": 0.18807724601175482, "grad_norm": 0.16628898680210114, "learning_rate": 9.766145338450125e-05, "loss": 0.1729, "step": 896 }, { "epoch": 0.18828715365239296, "grad_norm": 0.14260315895080566, "learning_rate": 9.765036591268877e-05, "loss": 0.165, "step": 897 }, { "epoch": 0.18849706129303107, "grad_norm": 0.12734055519104004, "learning_rate": 9.763925285154135e-05, "loss": 0.1714, "step": 898 }, { "epoch": 0.18870696893366917, "grad_norm": 0.1465056985616684, "learning_rate": 9.762811420702693e-05, "loss": 0.1805, "step": 899 }, { "epoch": 0.1889168765743073, "grad_norm": 0.12369433790445328, "learning_rate": 9.761694998512727e-05, "loss": 0.1737, "step": 900 }, { "epoch": 0.18912678421494541, "grad_norm": 0.13893358409404755, "learning_rate": 9.760576019183775e-05, "loss": 0.1502, "step": 901 }, { "epoch": 0.18933669185558355, "grad_norm": 0.13517262041568756, "learning_rate": 9.759454483316761e-05, "loss": 0.1648, "step": 902 }, { "epoch": 0.18954659949622166, "grad_norm": 0.13170479238033295, "learning_rate": 9.75833039151397e-05, "loss": 0.159, "step": 903 }, { "epoch": 0.1897565071368598, "grad_norm": 0.13293495774269104, "learning_rate": 9.757203744379067e-05, "loss": 0.1667, "step": 904 }, { "epoch": 0.1899664147774979, "grad_norm": 0.12381456047296524, "learning_rate": 9.756074542517088e-05, "loss": 0.1684, "step": 905 }, { "epoch": 0.190176322418136, "grad_norm": 0.12754730880260468, "learning_rate": 9.75494278653444e-05, "loss": 0.1559, "step": 906 }, { "epoch": 0.19038623005877414, "grad_norm": 0.17824846506118774, "learning_rate": 9.753808477038899e-05, "loss": 0.1533, "step": 907 }, { "epoch": 0.19059613769941225, "grad_norm": 0.13958828151226044, "learning_rate": 9.752671614639619e-05, "loss": 0.1494, "step": 908 }, { "epoch": 0.19080604534005038, "grad_norm": 0.14593012630939484, "learning_rate": 9.75153219994712e-05, "loss": 0.1598, "step": 909 }, { "epoch": 0.1910159529806885, "grad_norm": 0.19262051582336426, "learning_rate": 9.750390233573293e-05, "loss": 0.1638, "step": 910 }, { "epoch": 0.19122586062132663, "grad_norm": 0.15087257325649261, "learning_rate": 9.7492457161314e-05, "loss": 0.1644, "step": 911 }, { "epoch": 0.19143576826196473, "grad_norm": 0.17646470665931702, "learning_rate": 9.748098648236072e-05, "loss": 0.1644, "step": 912 }, { "epoch": 0.19164567590260284, "grad_norm": 0.15336105227470398, "learning_rate": 9.746949030503312e-05, "loss": 0.1717, "step": 913 }, { "epoch": 0.19185558354324098, "grad_norm": 0.15364870429039001, "learning_rate": 9.745796863550492e-05, "loss": 0.1721, "step": 914 }, { "epoch": 0.19206549118387908, "grad_norm": 0.1769437938928604, "learning_rate": 9.74464214799635e-05, "loss": 0.1485, "step": 915 }, { "epoch": 0.19227539882451722, "grad_norm": 0.18178406357765198, "learning_rate": 9.743484884460993e-05, "loss": 0.1523, "step": 916 }, { "epoch": 0.19248530646515533, "grad_norm": 0.16841888427734375, "learning_rate": 9.742325073565905e-05, "loss": 0.148, "step": 917 }, { "epoch": 0.19269521410579346, "grad_norm": 0.13603579998016357, "learning_rate": 9.741162715933924e-05, "loss": 0.161, "step": 918 }, { "epoch": 0.19290512174643157, "grad_norm": 0.1348285973072052, "learning_rate": 9.739997812189265e-05, "loss": 0.145, "step": 919 }, { "epoch": 0.19311502938706968, "grad_norm": 0.13922441005706787, "learning_rate": 9.738830362957508e-05, "loss": 0.1607, "step": 920 }, { "epoch": 0.1933249370277078, "grad_norm": 0.12017328292131424, "learning_rate": 9.7376603688656e-05, "loss": 0.152, "step": 921 }, { "epoch": 0.19353484466834592, "grad_norm": 0.14594700932502747, "learning_rate": 9.736487830541853e-05, "loss": 0.1638, "step": 922 }, { "epoch": 0.19374475230898405, "grad_norm": 0.14390654861927032, "learning_rate": 9.73531274861595e-05, "loss": 0.1608, "step": 923 }, { "epoch": 0.19395465994962216, "grad_norm": 0.12394702434539795, "learning_rate": 9.734135123718933e-05, "loss": 0.1612, "step": 924 }, { "epoch": 0.1941645675902603, "grad_norm": 0.16754676401615143, "learning_rate": 9.732954956483218e-05, "loss": 0.1791, "step": 925 }, { "epoch": 0.1943744752308984, "grad_norm": 0.2816343605518341, "learning_rate": 9.731772247542576e-05, "loss": 0.1489, "step": 926 }, { "epoch": 0.19458438287153654, "grad_norm": 0.15909412503242493, "learning_rate": 9.730586997532155e-05, "loss": 0.1531, "step": 927 }, { "epoch": 0.19479429051217464, "grad_norm": 0.16626602411270142, "learning_rate": 9.729399207088457e-05, "loss": 0.1729, "step": 928 }, { "epoch": 0.19500419815281275, "grad_norm": 0.14581038057804108, "learning_rate": 9.728208876849354e-05, "loss": 0.1616, "step": 929 }, { "epoch": 0.1952141057934509, "grad_norm": 0.14985312521457672, "learning_rate": 9.727016007454079e-05, "loss": 0.1583, "step": 930 }, { "epoch": 0.195424013434089, "grad_norm": 0.13958559930324554, "learning_rate": 9.725820599543234e-05, "loss": 0.1646, "step": 931 }, { "epoch": 0.19563392107472713, "grad_norm": 0.1607862412929535, "learning_rate": 9.724622653758777e-05, "loss": 0.1549, "step": 932 }, { "epoch": 0.19584382871536524, "grad_norm": 0.17007960379123688, "learning_rate": 9.723422170744031e-05, "loss": 0.1718, "step": 933 }, { "epoch": 0.19605373635600337, "grad_norm": 0.1419927030801773, "learning_rate": 9.722219151143688e-05, "loss": 0.1689, "step": 934 }, { "epoch": 0.19626364399664148, "grad_norm": 0.1631292998790741, "learning_rate": 9.721013595603793e-05, "loss": 0.1611, "step": 935 }, { "epoch": 0.1964735516372796, "grad_norm": 0.19870012998580933, "learning_rate": 9.719805504771758e-05, "loss": 0.1836, "step": 936 }, { "epoch": 0.19668345927791772, "grad_norm": 0.14150285720825195, "learning_rate": 9.718594879296355e-05, "loss": 0.1718, "step": 937 }, { "epoch": 0.19689336691855583, "grad_norm": 0.1416793018579483, "learning_rate": 9.717381719827716e-05, "loss": 0.1511, "step": 938 }, { "epoch": 0.19710327455919396, "grad_norm": 0.14615508913993835, "learning_rate": 9.716166027017339e-05, "loss": 0.1599, "step": 939 }, { "epoch": 0.19731318219983207, "grad_norm": 0.13773533701896667, "learning_rate": 9.714947801518076e-05, "loss": 0.1765, "step": 940 }, { "epoch": 0.1975230898404702, "grad_norm": 0.11306725442409515, "learning_rate": 9.713727043984143e-05, "loss": 0.1678, "step": 941 }, { "epoch": 0.1977329974811083, "grad_norm": 0.14150364696979523, "learning_rate": 9.712503755071115e-05, "loss": 0.1535, "step": 942 }, { "epoch": 0.19794290512174642, "grad_norm": 0.15026667714118958, "learning_rate": 9.711277935435925e-05, "loss": 0.155, "step": 943 }, { "epoch": 0.19815281276238456, "grad_norm": 0.1540324091911316, "learning_rate": 9.710049585736866e-05, "loss": 0.1866, "step": 944 }, { "epoch": 0.19836272040302266, "grad_norm": 0.12859384715557098, "learning_rate": 9.708818706633591e-05, "loss": 0.1512, "step": 945 }, { "epoch": 0.1985726280436608, "grad_norm": 0.14280495047569275, "learning_rate": 9.707585298787109e-05, "loss": 0.1558, "step": 946 }, { "epoch": 0.1987825356842989, "grad_norm": 0.14329122006893158, "learning_rate": 9.70634936285979e-05, "loss": 0.153, "step": 947 }, { "epoch": 0.19899244332493704, "grad_norm": 0.18129687011241913, "learning_rate": 9.705110899515359e-05, "loss": 0.1592, "step": 948 }, { "epoch": 0.19920235096557515, "grad_norm": 0.13191339373588562, "learning_rate": 9.7038699094189e-05, "loss": 0.1565, "step": 949 }, { "epoch": 0.19941225860621326, "grad_norm": 0.1381015032529831, "learning_rate": 9.702626393236849e-05, "loss": 0.1704, "step": 950 }, { "epoch": 0.1996221662468514, "grad_norm": 0.12249460071325302, "learning_rate": 9.701380351637007e-05, "loss": 0.1534, "step": 951 }, { "epoch": 0.1998320738874895, "grad_norm": 0.1820680946111679, "learning_rate": 9.700131785288525e-05, "loss": 0.1805, "step": 952 }, { "epoch": 0.20004198152812763, "grad_norm": 0.13920508325099945, "learning_rate": 9.698880694861913e-05, "loss": 0.1652, "step": 953 }, { "epoch": 0.20025188916876574, "grad_norm": 0.14306975901126862, "learning_rate": 9.697627081029033e-05, "loss": 0.1397, "step": 954 }, { "epoch": 0.20046179680940387, "grad_norm": 0.13642224669456482, "learning_rate": 9.696370944463104e-05, "loss": 0.174, "step": 955 }, { "epoch": 0.20067170445004198, "grad_norm": 0.12083397060632706, "learning_rate": 9.695112285838704e-05, "loss": 0.1807, "step": 956 }, { "epoch": 0.2008816120906801, "grad_norm": 0.17191384732723236, "learning_rate": 9.693851105831757e-05, "loss": 0.1656, "step": 957 }, { "epoch": 0.20109151973131822, "grad_norm": 0.1415136307477951, "learning_rate": 9.692587405119549e-05, "loss": 0.1726, "step": 958 }, { "epoch": 0.20130142737195633, "grad_norm": 0.15210871398448944, "learning_rate": 9.691321184380713e-05, "loss": 0.1614, "step": 959 }, { "epoch": 0.20151133501259447, "grad_norm": 0.14295780658721924, "learning_rate": 9.690052444295239e-05, "loss": 0.1884, "step": 960 }, { "epoch": 0.20172124265323257, "grad_norm": 0.13754574954509735, "learning_rate": 9.688781185544471e-05, "loss": 0.1719, "step": 961 }, { "epoch": 0.2019311502938707, "grad_norm": 0.1425933688879013, "learning_rate": 9.687507408811104e-05, "loss": 0.1564, "step": 962 }, { "epoch": 0.20214105793450882, "grad_norm": 0.1202697604894638, "learning_rate": 9.686231114779184e-05, "loss": 0.1584, "step": 963 }, { "epoch": 0.20235096557514692, "grad_norm": 0.12710309028625488, "learning_rate": 9.684952304134111e-05, "loss": 0.168, "step": 964 }, { "epoch": 0.20256087321578506, "grad_norm": 0.14089150726795197, "learning_rate": 9.683670977562633e-05, "loss": 0.1852, "step": 965 }, { "epoch": 0.20277078085642317, "grad_norm": 0.13602103292942047, "learning_rate": 9.682387135752856e-05, "loss": 0.1673, "step": 966 }, { "epoch": 0.2029806884970613, "grad_norm": 0.13352209329605103, "learning_rate": 9.68110077939423e-05, "loss": 0.1571, "step": 967 }, { "epoch": 0.2031905961376994, "grad_norm": 0.13534000515937805, "learning_rate": 9.679811909177556e-05, "loss": 0.1706, "step": 968 }, { "epoch": 0.20340050377833754, "grad_norm": 0.12475798279047012, "learning_rate": 9.67852052579499e-05, "loss": 0.1532, "step": 969 }, { "epoch": 0.20361041141897565, "grad_norm": 0.1589164137840271, "learning_rate": 9.677226629940032e-05, "loss": 0.1597, "step": 970 }, { "epoch": 0.20382031905961376, "grad_norm": 0.14976951479911804, "learning_rate": 9.675930222307537e-05, "loss": 0.1731, "step": 971 }, { "epoch": 0.2040302267002519, "grad_norm": 0.12664476037025452, "learning_rate": 9.6746313035937e-05, "loss": 0.1363, "step": 972 }, { "epoch": 0.20424013434089, "grad_norm": 0.12275015562772751, "learning_rate": 9.673329874496075e-05, "loss": 0.1574, "step": 973 }, { "epoch": 0.20445004198152814, "grad_norm": 0.13222196698188782, "learning_rate": 9.672025935713556e-05, "loss": 0.1535, "step": 974 }, { "epoch": 0.20465994962216624, "grad_norm": 0.15173639357089996, "learning_rate": 9.670719487946389e-05, "loss": 0.1792, "step": 975 }, { "epoch": 0.20486985726280438, "grad_norm": 0.17748403549194336, "learning_rate": 9.669410531896167e-05, "loss": 0.1891, "step": 976 }, { "epoch": 0.20507976490344249, "grad_norm": 0.14893555641174316, "learning_rate": 9.66809906826583e-05, "loss": 0.1438, "step": 977 }, { "epoch": 0.2052896725440806, "grad_norm": 0.16388140618801117, "learning_rate": 9.66678509775966e-05, "loss": 0.1641, "step": 978 }, { "epoch": 0.20549958018471873, "grad_norm": 0.17024089395999908, "learning_rate": 9.665468621083293e-05, "loss": 0.1839, "step": 979 }, { "epoch": 0.20570948782535683, "grad_norm": 0.14790277183055878, "learning_rate": 9.664149638943707e-05, "loss": 0.1701, "step": 980 }, { "epoch": 0.20591939546599497, "grad_norm": 0.14114373922348022, "learning_rate": 9.662828152049223e-05, "loss": 0.1699, "step": 981 }, { "epoch": 0.20612930310663308, "grad_norm": 0.12556609511375427, "learning_rate": 9.661504161109513e-05, "loss": 0.1607, "step": 982 }, { "epoch": 0.2063392107472712, "grad_norm": 0.11868204176425934, "learning_rate": 9.660177666835585e-05, "loss": 0.1487, "step": 983 }, { "epoch": 0.20654911838790932, "grad_norm": 0.12117776274681091, "learning_rate": 9.658848669939805e-05, "loss": 0.1668, "step": 984 }, { "epoch": 0.20675902602854743, "grad_norm": 0.126139298081398, "learning_rate": 9.657517171135866e-05, "loss": 0.1535, "step": 985 }, { "epoch": 0.20696893366918556, "grad_norm": 0.15042434632778168, "learning_rate": 9.656183171138818e-05, "loss": 0.168, "step": 986 }, { "epoch": 0.20717884130982367, "grad_norm": 0.1438342183828354, "learning_rate": 9.65484667066505e-05, "loss": 0.1628, "step": 987 }, { "epoch": 0.2073887489504618, "grad_norm": 0.1280035823583603, "learning_rate": 9.65350767043229e-05, "loss": 0.1655, "step": 988 }, { "epoch": 0.2075986565910999, "grad_norm": 0.13997893035411835, "learning_rate": 9.652166171159614e-05, "loss": 0.1766, "step": 989 }, { "epoch": 0.20780856423173805, "grad_norm": 0.1653011292219162, "learning_rate": 9.650822173567438e-05, "loss": 0.1744, "step": 990 }, { "epoch": 0.20801847187237615, "grad_norm": 0.1532358080148697, "learning_rate": 9.64947567837752e-05, "loss": 0.1529, "step": 991 }, { "epoch": 0.20822837951301426, "grad_norm": 0.13623711466789246, "learning_rate": 9.648126686312955e-05, "loss": 0.1632, "step": 992 }, { "epoch": 0.2084382871536524, "grad_norm": 0.13220278918743134, "learning_rate": 9.646775198098186e-05, "loss": 0.1462, "step": 993 }, { "epoch": 0.2086481947942905, "grad_norm": 0.16712597012519836, "learning_rate": 9.645421214458992e-05, "loss": 0.1658, "step": 994 }, { "epoch": 0.20885810243492864, "grad_norm": 0.16033975780010223, "learning_rate": 9.644064736122493e-05, "loss": 0.1523, "step": 995 }, { "epoch": 0.20906801007556675, "grad_norm": 0.13283276557922363, "learning_rate": 9.64270576381715e-05, "loss": 0.1738, "step": 996 }, { "epoch": 0.20927791771620488, "grad_norm": 0.2072938233613968, "learning_rate": 9.64134429827276e-05, "loss": 0.1589, "step": 997 }, { "epoch": 0.209487825356843, "grad_norm": 0.15322737395763397, "learning_rate": 9.639980340220462e-05, "loss": 0.1772, "step": 998 }, { "epoch": 0.2096977329974811, "grad_norm": 0.21009685099124908, "learning_rate": 9.638613890392734e-05, "loss": 0.1619, "step": 999 }, { "epoch": 0.20990764063811923, "grad_norm": 0.1521667093038559, "learning_rate": 9.63724494952339e-05, "loss": 0.1878, "step": 1000 }, { "epoch": 0.21011754827875734, "grad_norm": 0.11604970693588257, "learning_rate": 9.635873518347581e-05, "loss": 0.1621, "step": 1001 }, { "epoch": 0.21032745591939547, "grad_norm": 0.16362364590168, "learning_rate": 9.634499597601798e-05, "loss": 0.1779, "step": 1002 }, { "epoch": 0.21053736356003358, "grad_norm": 0.15217158198356628, "learning_rate": 9.63312318802387e-05, "loss": 0.1657, "step": 1003 }, { "epoch": 0.21074727120067172, "grad_norm": 0.15649117529392242, "learning_rate": 9.631744290352956e-05, "loss": 0.1817, "step": 1004 }, { "epoch": 0.21095717884130982, "grad_norm": 0.15730923414230347, "learning_rate": 9.63036290532956e-05, "loss": 0.1646, "step": 1005 }, { "epoch": 0.21116708648194793, "grad_norm": 0.1645728051662445, "learning_rate": 9.628979033695513e-05, "loss": 0.1546, "step": 1006 }, { "epoch": 0.21137699412258606, "grad_norm": 0.13902553915977478, "learning_rate": 9.627592676193988e-05, "loss": 0.157, "step": 1007 }, { "epoch": 0.21158690176322417, "grad_norm": 0.11890498548746109, "learning_rate": 9.626203833569491e-05, "loss": 0.1668, "step": 1008 }, { "epoch": 0.2117968094038623, "grad_norm": 0.14101600646972656, "learning_rate": 9.624812506567863e-05, "loss": 0.1477, "step": 1009 }, { "epoch": 0.21200671704450041, "grad_norm": 0.1529974490404129, "learning_rate": 9.623418695936275e-05, "loss": 0.1675, "step": 1010 }, { "epoch": 0.21221662468513855, "grad_norm": 0.15540415048599243, "learning_rate": 9.622022402423239e-05, "loss": 0.1797, "step": 1011 }, { "epoch": 0.21242653232577666, "grad_norm": 0.16966544091701508, "learning_rate": 9.620623626778593e-05, "loss": 0.1642, "step": 1012 }, { "epoch": 0.21263643996641476, "grad_norm": 0.1261100172996521, "learning_rate": 9.619222369753515e-05, "loss": 0.1633, "step": 1013 }, { "epoch": 0.2128463476070529, "grad_norm": 0.15353427827358246, "learning_rate": 9.617818632100508e-05, "loss": 0.1638, "step": 1014 }, { "epoch": 0.213056255247691, "grad_norm": 0.17301282286643982, "learning_rate": 9.616412414573416e-05, "loss": 0.1637, "step": 1015 }, { "epoch": 0.21326616288832914, "grad_norm": 0.14541825652122498, "learning_rate": 9.615003717927406e-05, "loss": 0.1677, "step": 1016 }, { "epoch": 0.21347607052896725, "grad_norm": 0.13302357494831085, "learning_rate": 9.613592542918981e-05, "loss": 0.1595, "step": 1017 }, { "epoch": 0.21368597816960538, "grad_norm": 0.17005443572998047, "learning_rate": 9.612178890305976e-05, "loss": 0.1799, "step": 1018 }, { "epoch": 0.2138958858102435, "grad_norm": 0.20354509353637695, "learning_rate": 9.610762760847553e-05, "loss": 0.1549, "step": 1019 }, { "epoch": 0.2141057934508816, "grad_norm": 0.16336257755756378, "learning_rate": 9.609344155304206e-05, "loss": 0.1404, "step": 1020 }, { "epoch": 0.21431570109151973, "grad_norm": 0.14682699739933014, "learning_rate": 9.607923074437758e-05, "loss": 0.1666, "step": 1021 }, { "epoch": 0.21452560873215784, "grad_norm": 0.15628460049629211, "learning_rate": 9.606499519011367e-05, "loss": 0.1723, "step": 1022 }, { "epoch": 0.21473551637279598, "grad_norm": 0.18369454145431519, "learning_rate": 9.605073489789506e-05, "loss": 0.1622, "step": 1023 }, { "epoch": 0.21494542401343408, "grad_norm": 0.14383664727210999, "learning_rate": 9.603644987537994e-05, "loss": 0.1776, "step": 1024 }, { "epoch": 0.21515533165407222, "grad_norm": 0.12850511074066162, "learning_rate": 9.602214013023963e-05, "loss": 0.1552, "step": 1025 }, { "epoch": 0.21536523929471033, "grad_norm": 0.12851618230342865, "learning_rate": 9.60078056701588e-05, "loss": 0.1634, "step": 1026 }, { "epoch": 0.21557514693534843, "grad_norm": 0.1252540647983551, "learning_rate": 9.599344650283544e-05, "loss": 0.1553, "step": 1027 }, { "epoch": 0.21578505457598657, "grad_norm": 0.14331619441509247, "learning_rate": 9.597906263598067e-05, "loss": 0.1681, "step": 1028 }, { "epoch": 0.21599496221662468, "grad_norm": 0.13295383751392365, "learning_rate": 9.5964654077319e-05, "loss": 0.1748, "step": 1029 }, { "epoch": 0.2162048698572628, "grad_norm": 0.14106275141239166, "learning_rate": 9.595022083458815e-05, "loss": 0.1642, "step": 1030 }, { "epoch": 0.21641477749790092, "grad_norm": 0.21775981783866882, "learning_rate": 9.59357629155391e-05, "loss": 0.1465, "step": 1031 }, { "epoch": 0.21662468513853905, "grad_norm": 0.19797000288963318, "learning_rate": 9.592128032793607e-05, "loss": 0.1559, "step": 1032 }, { "epoch": 0.21683459277917716, "grad_norm": 0.14408251643180847, "learning_rate": 9.590677307955656e-05, "loss": 0.1696, "step": 1033 }, { "epoch": 0.2170445004198153, "grad_norm": 0.14695291221141815, "learning_rate": 9.589224117819128e-05, "loss": 0.1415, "step": 1034 }, { "epoch": 0.2172544080604534, "grad_norm": 0.1755986511707306, "learning_rate": 9.58776846316442e-05, "loss": 0.1565, "step": 1035 }, { "epoch": 0.2174643157010915, "grad_norm": 0.2155688852071762, "learning_rate": 9.58631034477325e-05, "loss": 0.1631, "step": 1036 }, { "epoch": 0.21767422334172964, "grad_norm": 0.16168251633644104, "learning_rate": 9.584849763428665e-05, "loss": 0.174, "step": 1037 }, { "epoch": 0.21788413098236775, "grad_norm": 0.14332053065299988, "learning_rate": 9.583386719915026e-05, "loss": 0.1733, "step": 1038 }, { "epoch": 0.2180940386230059, "grad_norm": 0.1143033355474472, "learning_rate": 9.581921215018023e-05, "loss": 0.1431, "step": 1039 }, { "epoch": 0.218303946263644, "grad_norm": 0.16438762843608856, "learning_rate": 9.580453249524667e-05, "loss": 0.1731, "step": 1040 }, { "epoch": 0.21851385390428213, "grad_norm": 0.16054236888885498, "learning_rate": 9.578982824223285e-05, "loss": 0.1706, "step": 1041 }, { "epoch": 0.21872376154492024, "grad_norm": 0.14396212995052338, "learning_rate": 9.577509939903535e-05, "loss": 0.1694, "step": 1042 }, { "epoch": 0.21893366918555834, "grad_norm": 0.14557866752147675, "learning_rate": 9.576034597356382e-05, "loss": 0.1692, "step": 1043 }, { "epoch": 0.21914357682619648, "grad_norm": 0.1378258913755417, "learning_rate": 9.574556797374124e-05, "loss": 0.1616, "step": 1044 }, { "epoch": 0.2193534844668346, "grad_norm": 0.11917531490325928, "learning_rate": 9.573076540750371e-05, "loss": 0.1523, "step": 1045 }, { "epoch": 0.21956339210747272, "grad_norm": 0.12740926444530487, "learning_rate": 9.571593828280058e-05, "loss": 0.1581, "step": 1046 }, { "epoch": 0.21977329974811083, "grad_norm": 0.12773928046226501, "learning_rate": 9.570108660759432e-05, "loss": 0.1657, "step": 1047 }, { "epoch": 0.21998320738874896, "grad_norm": 0.15594597160816193, "learning_rate": 9.568621038986061e-05, "loss": 0.175, "step": 1048 }, { "epoch": 0.22019311502938707, "grad_norm": 0.1410975605249405, "learning_rate": 9.567130963758834e-05, "loss": 0.1527, "step": 1049 }, { "epoch": 0.22040302267002518, "grad_norm": 0.1446082442998886, "learning_rate": 9.565638435877957e-05, "loss": 0.1553, "step": 1050 }, { "epoch": 0.2206129303106633, "grad_norm": 0.14107242226600647, "learning_rate": 9.564143456144949e-05, "loss": 0.1628, "step": 1051 }, { "epoch": 0.22082283795130142, "grad_norm": 0.15437398850917816, "learning_rate": 9.562646025362649e-05, "loss": 0.1704, "step": 1052 }, { "epoch": 0.22103274559193956, "grad_norm": 0.13233573734760284, "learning_rate": 9.561146144335212e-05, "loss": 0.1588, "step": 1053 }, { "epoch": 0.22124265323257766, "grad_norm": 0.13578198850154877, "learning_rate": 9.559643813868107e-05, "loss": 0.1649, "step": 1054 }, { "epoch": 0.2214525608732158, "grad_norm": 0.17958106100559235, "learning_rate": 9.55813903476812e-05, "loss": 0.1707, "step": 1055 }, { "epoch": 0.2216624685138539, "grad_norm": 0.14567284286022186, "learning_rate": 9.556631807843352e-05, "loss": 0.1622, "step": 1056 }, { "epoch": 0.221872376154492, "grad_norm": 0.12384296208620071, "learning_rate": 9.55512213390322e-05, "loss": 0.1763, "step": 1057 }, { "epoch": 0.22208228379513015, "grad_norm": 0.14242222905158997, "learning_rate": 9.553610013758453e-05, "loss": 0.1695, "step": 1058 }, { "epoch": 0.22229219143576825, "grad_norm": 0.13062936067581177, "learning_rate": 9.552095448221093e-05, "loss": 0.1631, "step": 1059 }, { "epoch": 0.2225020990764064, "grad_norm": 0.158433735370636, "learning_rate": 9.550578438104494e-05, "loss": 0.1679, "step": 1060 }, { "epoch": 0.2227120067170445, "grad_norm": 0.14844875037670135, "learning_rate": 9.549058984223329e-05, "loss": 0.1606, "step": 1061 }, { "epoch": 0.22292191435768263, "grad_norm": 0.1360979825258255, "learning_rate": 9.547537087393579e-05, "loss": 0.1528, "step": 1062 }, { "epoch": 0.22313182199832074, "grad_norm": 0.1500774621963501, "learning_rate": 9.546012748432535e-05, "loss": 0.1639, "step": 1063 }, { "epoch": 0.22334172963895885, "grad_norm": 0.1147661879658699, "learning_rate": 9.544485968158803e-05, "loss": 0.1729, "step": 1064 }, { "epoch": 0.22355163727959698, "grad_norm": 0.13184726238250732, "learning_rate": 9.542956747392299e-05, "loss": 0.1733, "step": 1065 }, { "epoch": 0.2237615449202351, "grad_norm": 0.14958997070789337, "learning_rate": 9.541425086954251e-05, "loss": 0.1621, "step": 1066 }, { "epoch": 0.22397145256087322, "grad_norm": 0.13268455862998962, "learning_rate": 9.539890987667193e-05, "loss": 0.1542, "step": 1067 }, { "epoch": 0.22418136020151133, "grad_norm": 0.12983419001102448, "learning_rate": 9.538354450354974e-05, "loss": 0.1608, "step": 1068 }, { "epoch": 0.22439126784214947, "grad_norm": 0.11506141722202301, "learning_rate": 9.536815475842747e-05, "loss": 0.1521, "step": 1069 }, { "epoch": 0.22460117548278757, "grad_norm": 0.16174156963825226, "learning_rate": 9.535274064956976e-05, "loss": 0.176, "step": 1070 }, { "epoch": 0.22481108312342568, "grad_norm": 0.14904946088790894, "learning_rate": 9.533730218525438e-05, "loss": 0.1743, "step": 1071 }, { "epoch": 0.22502099076406382, "grad_norm": 0.1392981857061386, "learning_rate": 9.53218393737721e-05, "loss": 0.1619, "step": 1072 }, { "epoch": 0.22523089840470192, "grad_norm": 0.1343872994184494, "learning_rate": 9.530635222342682e-05, "loss": 0.1436, "step": 1073 }, { "epoch": 0.22544080604534006, "grad_norm": 0.13077078759670258, "learning_rate": 9.529084074253549e-05, "loss": 0.1501, "step": 1074 }, { "epoch": 0.22565071368597817, "grad_norm": 0.13800117373466492, "learning_rate": 9.527530493942812e-05, "loss": 0.1634, "step": 1075 }, { "epoch": 0.2258606213266163, "grad_norm": 0.15503515303134918, "learning_rate": 9.525974482244782e-05, "loss": 0.1682, "step": 1076 }, { "epoch": 0.2260705289672544, "grad_norm": 0.13676904141902924, "learning_rate": 9.524416039995068e-05, "loss": 0.1648, "step": 1077 }, { "epoch": 0.22628043660789252, "grad_norm": 0.1415819525718689, "learning_rate": 9.522855168030595e-05, "loss": 0.1732, "step": 1078 }, { "epoch": 0.22649034424853065, "grad_norm": 0.14641550183296204, "learning_rate": 9.521291867189581e-05, "loss": 0.1535, "step": 1079 }, { "epoch": 0.22670025188916876, "grad_norm": 0.13009630143642426, "learning_rate": 9.519726138311558e-05, "loss": 0.1624, "step": 1080 }, { "epoch": 0.2269101595298069, "grad_norm": 0.12057724595069885, "learning_rate": 9.51815798223736e-05, "loss": 0.1626, "step": 1081 }, { "epoch": 0.227120067170445, "grad_norm": 0.13820677995681763, "learning_rate": 9.516587399809116e-05, "loss": 0.1566, "step": 1082 }, { "epoch": 0.22732997481108314, "grad_norm": 0.19474509358406067, "learning_rate": 9.515014391870269e-05, "loss": 0.1591, "step": 1083 }, { "epoch": 0.22753988245172124, "grad_norm": 0.13335439562797546, "learning_rate": 9.513438959265561e-05, "loss": 0.1559, "step": 1084 }, { "epoch": 0.22774979009235935, "grad_norm": 0.16238383948802948, "learning_rate": 9.511861102841036e-05, "loss": 0.1701, "step": 1085 }, { "epoch": 0.22795969773299748, "grad_norm": 0.129498690366745, "learning_rate": 9.510280823444034e-05, "loss": 0.1542, "step": 1086 }, { "epoch": 0.2281696053736356, "grad_norm": 0.13683976233005524, "learning_rate": 9.508698121923204e-05, "loss": 0.1647, "step": 1087 }, { "epoch": 0.22837951301427373, "grad_norm": 0.15396280586719513, "learning_rate": 9.507112999128493e-05, "loss": 0.1519, "step": 1088 }, { "epoch": 0.22858942065491183, "grad_norm": 0.12210803478956223, "learning_rate": 9.50552545591115e-05, "loss": 0.1579, "step": 1089 }, { "epoch": 0.22879932829554997, "grad_norm": 0.1511714905500412, "learning_rate": 9.503935493123719e-05, "loss": 0.155, "step": 1090 }, { "epoch": 0.22900923593618808, "grad_norm": 0.14200669527053833, "learning_rate": 9.502343111620047e-05, "loss": 0.174, "step": 1091 }, { "epoch": 0.22921914357682618, "grad_norm": 0.12680545449256897, "learning_rate": 9.500748312255281e-05, "loss": 0.1844, "step": 1092 }, { "epoch": 0.22942905121746432, "grad_norm": 0.14180205762386322, "learning_rate": 9.499151095885861e-05, "loss": 0.1619, "step": 1093 }, { "epoch": 0.22963895885810243, "grad_norm": 0.12461036443710327, "learning_rate": 9.497551463369532e-05, "loss": 0.1558, "step": 1094 }, { "epoch": 0.22984886649874056, "grad_norm": 0.17455211281776428, "learning_rate": 9.495949415565332e-05, "loss": 0.172, "step": 1095 }, { "epoch": 0.23005877413937867, "grad_norm": 0.15062375366687775, "learning_rate": 9.494344953333599e-05, "loss": 0.1656, "step": 1096 }, { "epoch": 0.2302686817800168, "grad_norm": 0.14742077887058258, "learning_rate": 9.492738077535963e-05, "loss": 0.1751, "step": 1097 }, { "epoch": 0.2304785894206549, "grad_norm": 0.1343587338924408, "learning_rate": 9.491128789035355e-05, "loss": 0.16, "step": 1098 }, { "epoch": 0.23068849706129302, "grad_norm": 0.13116249442100525, "learning_rate": 9.489517088696e-05, "loss": 0.1495, "step": 1099 }, { "epoch": 0.23089840470193115, "grad_norm": 0.17240563035011292, "learning_rate": 9.487902977383415e-05, "loss": 0.171, "step": 1100 }, { "epoch": 0.23110831234256926, "grad_norm": 0.1725616306066513, "learning_rate": 9.486286455964417e-05, "loss": 0.1672, "step": 1101 }, { "epoch": 0.2313182199832074, "grad_norm": 0.16195368766784668, "learning_rate": 9.484667525307116e-05, "loss": 0.1697, "step": 1102 }, { "epoch": 0.2315281276238455, "grad_norm": 0.15474824607372284, "learning_rate": 9.483046186280912e-05, "loss": 0.1757, "step": 1103 }, { "epoch": 0.23173803526448364, "grad_norm": 0.13763071596622467, "learning_rate": 9.481422439756503e-05, "loss": 0.1545, "step": 1104 }, { "epoch": 0.23194794290512175, "grad_norm": 0.15999558568000793, "learning_rate": 9.479796286605876e-05, "loss": 0.168, "step": 1105 }, { "epoch": 0.23215785054575985, "grad_norm": 0.13550569117069244, "learning_rate": 9.478167727702315e-05, "loss": 0.1642, "step": 1106 }, { "epoch": 0.232367758186398, "grad_norm": 0.1440218985080719, "learning_rate": 9.47653676392039e-05, "loss": 0.1763, "step": 1107 }, { "epoch": 0.2325776658270361, "grad_norm": 0.13457611203193665, "learning_rate": 9.47490339613597e-05, "loss": 0.1604, "step": 1108 }, { "epoch": 0.23278757346767423, "grad_norm": 0.13920889794826508, "learning_rate": 9.473267625226207e-05, "loss": 0.158, "step": 1109 }, { "epoch": 0.23299748110831234, "grad_norm": 0.1562851071357727, "learning_rate": 9.47162945206955e-05, "loss": 0.1647, "step": 1110 }, { "epoch": 0.23320738874895047, "grad_norm": 0.13289715349674225, "learning_rate": 9.469988877545735e-05, "loss": 0.1535, "step": 1111 }, { "epoch": 0.23341729638958858, "grad_norm": 0.13001225888729095, "learning_rate": 9.468345902535787e-05, "loss": 0.1513, "step": 1112 }, { "epoch": 0.2336272040302267, "grad_norm": 0.1498699188232422, "learning_rate": 9.466700527922021e-05, "loss": 0.1711, "step": 1113 }, { "epoch": 0.23383711167086482, "grad_norm": 0.14534981548786163, "learning_rate": 9.465052754588042e-05, "loss": 0.1663, "step": 1114 }, { "epoch": 0.23404701931150293, "grad_norm": 0.18299245834350586, "learning_rate": 9.463402583418741e-05, "loss": 0.1478, "step": 1115 }, { "epoch": 0.23425692695214106, "grad_norm": 0.12957628071308136, "learning_rate": 9.461750015300296e-05, "loss": 0.1528, "step": 1116 }, { "epoch": 0.23446683459277917, "grad_norm": 0.13237446546554565, "learning_rate": 9.460095051120179e-05, "loss": 0.1749, "step": 1117 }, { "epoch": 0.2346767422334173, "grad_norm": 0.12967370450496674, "learning_rate": 9.458437691767138e-05, "loss": 0.1663, "step": 1118 }, { "epoch": 0.23488664987405541, "grad_norm": 0.14041563868522644, "learning_rate": 9.456777938131216e-05, "loss": 0.1581, "step": 1119 }, { "epoch": 0.23509655751469352, "grad_norm": 0.1613098829984665, "learning_rate": 9.455115791103736e-05, "loss": 0.1581, "step": 1120 }, { "epoch": 0.23530646515533166, "grad_norm": 0.12044283002614975, "learning_rate": 9.453451251577312e-05, "loss": 0.1667, "step": 1121 }, { "epoch": 0.23551637279596976, "grad_norm": 0.11583617329597473, "learning_rate": 9.451784320445838e-05, "loss": 0.1703, "step": 1122 }, { "epoch": 0.2357262804366079, "grad_norm": 0.13826625049114227, "learning_rate": 9.450114998604493e-05, "loss": 0.1634, "step": 1123 }, { "epoch": 0.235936188077246, "grad_norm": 0.1239803358912468, "learning_rate": 9.448443286949745e-05, "loss": 0.1593, "step": 1124 }, { "epoch": 0.23614609571788414, "grad_norm": 0.11571444571018219, "learning_rate": 9.446769186379338e-05, "loss": 0.1718, "step": 1125 }, { "epoch": 0.23635600335852225, "grad_norm": 0.13045620918273926, "learning_rate": 9.445092697792302e-05, "loss": 0.1767, "step": 1126 }, { "epoch": 0.23656591099916036, "grad_norm": 0.14445173740386963, "learning_rate": 9.44341382208895e-05, "loss": 0.1518, "step": 1127 }, { "epoch": 0.2367758186397985, "grad_norm": 0.13226744532585144, "learning_rate": 9.441732560170879e-05, "loss": 0.1575, "step": 1128 }, { "epoch": 0.2369857262804366, "grad_norm": 0.12810592353343964, "learning_rate": 9.440048912940961e-05, "loss": 0.1772, "step": 1129 }, { "epoch": 0.23719563392107473, "grad_norm": 0.12598370015621185, "learning_rate": 9.438362881303358e-05, "loss": 0.1663, "step": 1130 }, { "epoch": 0.23740554156171284, "grad_norm": 0.11832118779420853, "learning_rate": 9.436674466163503e-05, "loss": 0.1495, "step": 1131 }, { "epoch": 0.23761544920235098, "grad_norm": 0.13778379559516907, "learning_rate": 9.434983668428116e-05, "loss": 0.1637, "step": 1132 }, { "epoch": 0.23782535684298908, "grad_norm": 0.12428466975688934, "learning_rate": 9.433290489005193e-05, "loss": 0.1552, "step": 1133 }, { "epoch": 0.2380352644836272, "grad_norm": 0.11398124694824219, "learning_rate": 9.431594928804011e-05, "loss": 0.1814, "step": 1134 }, { "epoch": 0.23824517212426533, "grad_norm": 0.12184252589941025, "learning_rate": 9.429896988735123e-05, "loss": 0.1618, "step": 1135 }, { "epoch": 0.23845507976490343, "grad_norm": 0.12840035557746887, "learning_rate": 9.428196669710363e-05, "loss": 0.1858, "step": 1136 }, { "epoch": 0.23866498740554157, "grad_norm": 0.12335941940546036, "learning_rate": 9.42649397264284e-05, "loss": 0.1605, "step": 1137 }, { "epoch": 0.23887489504617967, "grad_norm": 0.13650208711624146, "learning_rate": 9.42478889844694e-05, "loss": 0.1811, "step": 1138 }, { "epoch": 0.2390848026868178, "grad_norm": 0.14578799903392792, "learning_rate": 9.423081448038334e-05, "loss": 0.1781, "step": 1139 }, { "epoch": 0.23929471032745592, "grad_norm": 0.13492166996002197, "learning_rate": 9.421371622333952e-05, "loss": 0.1644, "step": 1140 }, { "epoch": 0.23950461796809405, "grad_norm": 0.12785859405994415, "learning_rate": 9.419659422252017e-05, "loss": 0.1614, "step": 1141 }, { "epoch": 0.23971452560873216, "grad_norm": 0.1437363475561142, "learning_rate": 9.417944848712014e-05, "loss": 0.1737, "step": 1142 }, { "epoch": 0.23992443324937027, "grad_norm": 0.12084683775901794, "learning_rate": 9.416227902634711e-05, "loss": 0.16, "step": 1143 }, { "epoch": 0.2401343408900084, "grad_norm": 0.13587456941604614, "learning_rate": 9.414508584942147e-05, "loss": 0.1666, "step": 1144 }, { "epoch": 0.2403442485306465, "grad_norm": 0.1418214589357376, "learning_rate": 9.412786896557638e-05, "loss": 0.1526, "step": 1145 }, { "epoch": 0.24055415617128464, "grad_norm": 0.13639256358146667, "learning_rate": 9.411062838405762e-05, "loss": 0.1739, "step": 1146 }, { "epoch": 0.24076406381192275, "grad_norm": 0.1474551260471344, "learning_rate": 9.409336411412386e-05, "loss": 0.1626, "step": 1147 }, { "epoch": 0.2409739714525609, "grad_norm": 0.14461228251457214, "learning_rate": 9.407607616504635e-05, "loss": 0.1693, "step": 1148 }, { "epoch": 0.241183879093199, "grad_norm": 0.15202075242996216, "learning_rate": 9.405876454610915e-05, "loss": 0.1607, "step": 1149 }, { "epoch": 0.2413937867338371, "grad_norm": 0.14336565136909485, "learning_rate": 9.404142926660897e-05, "loss": 0.1363, "step": 1150 }, { "epoch": 0.24160369437447524, "grad_norm": 0.15390141308307648, "learning_rate": 9.402407033585527e-05, "loss": 0.1666, "step": 1151 }, { "epoch": 0.24181360201511334, "grad_norm": 0.16369935870170593, "learning_rate": 9.400668776317017e-05, "loss": 0.1584, "step": 1152 }, { "epoch": 0.24202350965575148, "grad_norm": 0.13057595491409302, "learning_rate": 9.398928155788853e-05, "loss": 0.1638, "step": 1153 }, { "epoch": 0.24223341729638959, "grad_norm": 0.13949787616729736, "learning_rate": 9.397185172935787e-05, "loss": 0.1558, "step": 1154 }, { "epoch": 0.24244332493702772, "grad_norm": 0.1358940303325653, "learning_rate": 9.395439828693841e-05, "loss": 0.1482, "step": 1155 }, { "epoch": 0.24265323257766583, "grad_norm": 0.1706434041261673, "learning_rate": 9.393692124000305e-05, "loss": 0.1663, "step": 1156 }, { "epoch": 0.24286314021830394, "grad_norm": 0.16532990336418152, "learning_rate": 9.391942059793734e-05, "loss": 0.169, "step": 1157 }, { "epoch": 0.24307304785894207, "grad_norm": 0.13570407032966614, "learning_rate": 9.390189637013958e-05, "loss": 0.1546, "step": 1158 }, { "epoch": 0.24328295549958018, "grad_norm": 0.12366579473018646, "learning_rate": 9.388434856602061e-05, "loss": 0.1517, "step": 1159 }, { "epoch": 0.2434928631402183, "grad_norm": 0.13216842710971832, "learning_rate": 9.386677719500406e-05, "loss": 0.1765, "step": 1160 }, { "epoch": 0.24370277078085642, "grad_norm": 0.15686704218387604, "learning_rate": 9.38491822665261e-05, "loss": 0.1783, "step": 1161 }, { "epoch": 0.24391267842149456, "grad_norm": 0.12082766741514206, "learning_rate": 9.383156379003566e-05, "loss": 0.1667, "step": 1162 }, { "epoch": 0.24412258606213266, "grad_norm": 0.12626822292804718, "learning_rate": 9.381392177499426e-05, "loss": 0.1589, "step": 1163 }, { "epoch": 0.24433249370277077, "grad_norm": 0.13412562012672424, "learning_rate": 9.379625623087603e-05, "loss": 0.1548, "step": 1164 }, { "epoch": 0.2445424013434089, "grad_norm": 0.14654812216758728, "learning_rate": 9.37785671671678e-05, "loss": 0.1521, "step": 1165 }, { "epoch": 0.244752308984047, "grad_norm": 0.14660955965518951, "learning_rate": 9.376085459336898e-05, "loss": 0.173, "step": 1166 }, { "epoch": 0.24496221662468515, "grad_norm": 0.1366211324930191, "learning_rate": 9.374311851899166e-05, "loss": 0.1795, "step": 1167 }, { "epoch": 0.24517212426532325, "grad_norm": 0.1468217670917511, "learning_rate": 9.37253589535605e-05, "loss": 0.1606, "step": 1168 }, { "epoch": 0.2453820319059614, "grad_norm": 0.13741639256477356, "learning_rate": 9.370757590661277e-05, "loss": 0.1653, "step": 1169 }, { "epoch": 0.2455919395465995, "grad_norm": 0.14487077295780182, "learning_rate": 9.368976938769839e-05, "loss": 0.1638, "step": 1170 }, { "epoch": 0.2458018471872376, "grad_norm": 0.14959833025932312, "learning_rate": 9.367193940637988e-05, "loss": 0.1839, "step": 1171 }, { "epoch": 0.24601175482787574, "grad_norm": 0.14737145602703094, "learning_rate": 9.365408597223233e-05, "loss": 0.1737, "step": 1172 }, { "epoch": 0.24622166246851385, "grad_norm": 0.1258353590965271, "learning_rate": 9.363620909484345e-05, "loss": 0.1574, "step": 1173 }, { "epoch": 0.24643157010915198, "grad_norm": 0.12692254781723022, "learning_rate": 9.361830878381352e-05, "loss": 0.1676, "step": 1174 }, { "epoch": 0.2466414777497901, "grad_norm": 0.13535474240779877, "learning_rate": 9.360038504875545e-05, "loss": 0.162, "step": 1175 }, { "epoch": 0.24685138539042822, "grad_norm": 0.1385834813117981, "learning_rate": 9.358243789929465e-05, "loss": 0.1611, "step": 1176 }, { "epoch": 0.24706129303106633, "grad_norm": 0.15537822246551514, "learning_rate": 9.356446734506918e-05, "loss": 0.1561, "step": 1177 }, { "epoch": 0.24727120067170444, "grad_norm": 0.13240060210227966, "learning_rate": 9.354647339572961e-05, "loss": 0.1566, "step": 1178 }, { "epoch": 0.24748110831234257, "grad_norm": 0.12547266483306885, "learning_rate": 9.352845606093914e-05, "loss": 0.1743, "step": 1179 }, { "epoch": 0.24769101595298068, "grad_norm": 0.132894828915596, "learning_rate": 9.351041535037347e-05, "loss": 0.1733, "step": 1180 }, { "epoch": 0.24790092359361882, "grad_norm": 0.1465143859386444, "learning_rate": 9.349235127372086e-05, "loss": 0.1776, "step": 1181 }, { "epoch": 0.24811083123425692, "grad_norm": 0.15798325836658478, "learning_rate": 9.347426384068216e-05, "loss": 0.1811, "step": 1182 }, { "epoch": 0.24832073887489506, "grad_norm": 0.11733090877532959, "learning_rate": 9.345615306097071e-05, "loss": 0.158, "step": 1183 }, { "epoch": 0.24853064651553317, "grad_norm": 0.11072125285863876, "learning_rate": 9.343801894431244e-05, "loss": 0.1688, "step": 1184 }, { "epoch": 0.24874055415617127, "grad_norm": 0.11508002877235413, "learning_rate": 9.341986150044574e-05, "loss": 0.1538, "step": 1185 }, { "epoch": 0.2489504617968094, "grad_norm": 0.13166356086730957, "learning_rate": 9.340168073912162e-05, "loss": 0.1489, "step": 1186 }, { "epoch": 0.24916036943744752, "grad_norm": 0.14776529371738434, "learning_rate": 9.338347667010352e-05, "loss": 0.1459, "step": 1187 }, { "epoch": 0.24937027707808565, "grad_norm": 0.13042989373207092, "learning_rate": 9.336524930316749e-05, "loss": 0.1595, "step": 1188 }, { "epoch": 0.24958018471872376, "grad_norm": 0.11727176606655121, "learning_rate": 9.334699864810201e-05, "loss": 0.1525, "step": 1189 }, { "epoch": 0.2497900923593619, "grad_norm": 0.14481998980045319, "learning_rate": 9.332872471470811e-05, "loss": 0.1627, "step": 1190 }, { "epoch": 0.25, "grad_norm": 0.13064126670360565, "learning_rate": 9.331042751279929e-05, "loss": 0.1711, "step": 1191 }, { "epoch": 0.2502099076406381, "grad_norm": 0.1243947371840477, "learning_rate": 9.329210705220159e-05, "loss": 0.1592, "step": 1192 }, { "epoch": 0.2504198152812762, "grad_norm": 0.10476188361644745, "learning_rate": 9.327376334275351e-05, "loss": 0.1548, "step": 1193 }, { "epoch": 0.2506297229219144, "grad_norm": 0.1116849035024643, "learning_rate": 9.325539639430602e-05, "loss": 0.1621, "step": 1194 }, { "epoch": 0.2508396305625525, "grad_norm": 0.134584441781044, "learning_rate": 9.323700621672263e-05, "loss": 0.1657, "step": 1195 }, { "epoch": 0.2510495382031906, "grad_norm": 0.14008194208145142, "learning_rate": 9.321859281987926e-05, "loss": 0.1569, "step": 1196 }, { "epoch": 0.2512594458438287, "grad_norm": 0.136927530169487, "learning_rate": 9.320015621366432e-05, "loss": 0.1612, "step": 1197 }, { "epoch": 0.25146935348446686, "grad_norm": 0.12949542701244354, "learning_rate": 9.31816964079787e-05, "loss": 0.1612, "step": 1198 }, { "epoch": 0.25167926112510497, "grad_norm": 0.131092369556427, "learning_rate": 9.316321341273574e-05, "loss": 0.1663, "step": 1199 }, { "epoch": 0.2518891687657431, "grad_norm": 0.15064486861228943, "learning_rate": 9.314470723786121e-05, "loss": 0.1626, "step": 1200 }, { "epoch": 0.2520990764063812, "grad_norm": 0.14182250201702118, "learning_rate": 9.312617789329339e-05, "loss": 0.1775, "step": 1201 }, { "epoch": 0.2523089840470193, "grad_norm": 0.14778292179107666, "learning_rate": 9.310762538898292e-05, "loss": 0.1699, "step": 1202 }, { "epoch": 0.25251889168765745, "grad_norm": 0.13292577862739563, "learning_rate": 9.308904973489292e-05, "loss": 0.1598, "step": 1203 }, { "epoch": 0.25272879932829556, "grad_norm": 0.14361746609210968, "learning_rate": 9.307045094099898e-05, "loss": 0.1701, "step": 1204 }, { "epoch": 0.25293870696893367, "grad_norm": 0.18234290182590485, "learning_rate": 9.305182901728904e-05, "loss": 0.1454, "step": 1205 }, { "epoch": 0.2531486146095718, "grad_norm": 0.13576486706733704, "learning_rate": 9.30331839737635e-05, "loss": 0.1751, "step": 1206 }, { "epoch": 0.2533585222502099, "grad_norm": 0.14371106028556824, "learning_rate": 9.30145158204352e-05, "loss": 0.1625, "step": 1207 }, { "epoch": 0.25356842989084805, "grad_norm": 0.14439460635185242, "learning_rate": 9.299582456732933e-05, "loss": 0.1687, "step": 1208 }, { "epoch": 0.25377833753148615, "grad_norm": 0.12489113956689835, "learning_rate": 9.297711022448357e-05, "loss": 0.1578, "step": 1209 }, { "epoch": 0.25398824517212426, "grad_norm": 0.11665599048137665, "learning_rate": 9.29583728019479e-05, "loss": 0.1585, "step": 1210 }, { "epoch": 0.25419815281276237, "grad_norm": 0.1369842141866684, "learning_rate": 9.293961230978476e-05, "loss": 0.1685, "step": 1211 }, { "epoch": 0.25440806045340053, "grad_norm": 0.1267922818660736, "learning_rate": 9.292082875806895e-05, "loss": 0.1512, "step": 1212 }, { "epoch": 0.25461796809403864, "grad_norm": 0.13632832467556, "learning_rate": 9.29020221568877e-05, "loss": 0.1588, "step": 1213 }, { "epoch": 0.25482787573467675, "grad_norm": 0.13703875243663788, "learning_rate": 9.288319251634057e-05, "loss": 0.1653, "step": 1214 }, { "epoch": 0.25503778337531485, "grad_norm": 0.1690177023410797, "learning_rate": 9.28643398465395e-05, "loss": 0.1452, "step": 1215 }, { "epoch": 0.25524769101595296, "grad_norm": 0.13149164617061615, "learning_rate": 9.28454641576088e-05, "loss": 0.1585, "step": 1216 }, { "epoch": 0.2554575986565911, "grad_norm": 0.11899828165769577, "learning_rate": 9.282656545968516e-05, "loss": 0.1666, "step": 1217 }, { "epoch": 0.25566750629722923, "grad_norm": 0.1439366489648819, "learning_rate": 9.28076437629176e-05, "loss": 0.17, "step": 1218 }, { "epoch": 0.25587741393786734, "grad_norm": 0.1383795440196991, "learning_rate": 9.278869907746751e-05, "loss": 0.1609, "step": 1219 }, { "epoch": 0.25608732157850544, "grad_norm": 0.13621094822883606, "learning_rate": 9.276973141350862e-05, "loss": 0.1657, "step": 1220 }, { "epoch": 0.25629722921914355, "grad_norm": 0.10558905452489853, "learning_rate": 9.2750740781227e-05, "loss": 0.1548, "step": 1221 }, { "epoch": 0.2565071368597817, "grad_norm": 0.1380019336938858, "learning_rate": 9.273172719082105e-05, "loss": 0.1647, "step": 1222 }, { "epoch": 0.2567170445004198, "grad_norm": 0.1091807410120964, "learning_rate": 9.27126906525015e-05, "loss": 0.1598, "step": 1223 }, { "epoch": 0.25692695214105793, "grad_norm": 0.13246595859527588, "learning_rate": 9.269363117649139e-05, "loss": 0.1684, "step": 1224 }, { "epoch": 0.25713685978169604, "grad_norm": 0.1521276980638504, "learning_rate": 9.267454877302614e-05, "loss": 0.1745, "step": 1225 }, { "epoch": 0.2573467674223342, "grad_norm": 0.13144977390766144, "learning_rate": 9.26554434523534e-05, "loss": 0.1652, "step": 1226 }, { "epoch": 0.2575566750629723, "grad_norm": 0.14672890305519104, "learning_rate": 9.263631522473317e-05, "loss": 0.1563, "step": 1227 }, { "epoch": 0.2577665827036104, "grad_norm": 0.1289316564798355, "learning_rate": 9.261716410043774e-05, "loss": 0.1594, "step": 1228 }, { "epoch": 0.2579764903442485, "grad_norm": 0.13553071022033691, "learning_rate": 9.259799008975173e-05, "loss": 0.1457, "step": 1229 }, { "epoch": 0.25818639798488663, "grad_norm": 0.11398561298847198, "learning_rate": 9.257879320297198e-05, "loss": 0.1583, "step": 1230 }, { "epoch": 0.2583963056255248, "grad_norm": 0.13140664994716644, "learning_rate": 9.255957345040769e-05, "loss": 0.1667, "step": 1231 }, { "epoch": 0.2586062132661629, "grad_norm": 0.11999331414699554, "learning_rate": 9.25403308423803e-05, "loss": 0.1656, "step": 1232 }, { "epoch": 0.258816120906801, "grad_norm": 0.14149609208106995, "learning_rate": 9.25210653892235e-05, "loss": 0.1605, "step": 1233 }, { "epoch": 0.2590260285474391, "grad_norm": 0.1261209398508072, "learning_rate": 9.250177710128332e-05, "loss": 0.1594, "step": 1234 }, { "epoch": 0.2592359361880772, "grad_norm": 0.15169532597064972, "learning_rate": 9.248246598891798e-05, "loss": 0.1744, "step": 1235 }, { "epoch": 0.2594458438287154, "grad_norm": 0.12228883057832718, "learning_rate": 9.246313206249802e-05, "loss": 0.1753, "step": 1236 }, { "epoch": 0.2596557514693535, "grad_norm": 0.14006085693836212, "learning_rate": 9.244377533240618e-05, "loss": 0.1614, "step": 1237 }, { "epoch": 0.2598656591099916, "grad_norm": 0.11327286064624786, "learning_rate": 9.242439580903745e-05, "loss": 0.1447, "step": 1238 }, { "epoch": 0.2600755667506297, "grad_norm": 0.11881330609321594, "learning_rate": 9.24049935027991e-05, "loss": 0.153, "step": 1239 }, { "epoch": 0.26028547439126787, "grad_norm": 0.11056238412857056, "learning_rate": 9.238556842411061e-05, "loss": 0.1607, "step": 1240 }, { "epoch": 0.260495382031906, "grad_norm": 0.1527404934167862, "learning_rate": 9.23661205834037e-05, "loss": 0.182, "step": 1241 }, { "epoch": 0.2607052896725441, "grad_norm": 0.11118711531162262, "learning_rate": 9.234664999112229e-05, "loss": 0.1756, "step": 1242 }, { "epoch": 0.2609151973131822, "grad_norm": 0.1669243574142456, "learning_rate": 9.232715665772252e-05, "loss": 0.1475, "step": 1243 }, { "epoch": 0.2611251049538203, "grad_norm": 0.14391769468784332, "learning_rate": 9.230764059367276e-05, "loss": 0.1807, "step": 1244 }, { "epoch": 0.26133501259445846, "grad_norm": 0.11989136785268784, "learning_rate": 9.228810180945362e-05, "loss": 0.1689, "step": 1245 }, { "epoch": 0.26154492023509657, "grad_norm": 0.1314937323331833, "learning_rate": 9.226854031555783e-05, "loss": 0.1672, "step": 1246 }, { "epoch": 0.2617548278757347, "grad_norm": 0.13055700063705444, "learning_rate": 9.224895612249039e-05, "loss": 0.1807, "step": 1247 }, { "epoch": 0.2619647355163728, "grad_norm": 0.1459030658006668, "learning_rate": 9.222934924076843e-05, "loss": 0.1669, "step": 1248 }, { "epoch": 0.2621746431570109, "grad_norm": 0.14526896178722382, "learning_rate": 9.22097196809213e-05, "loss": 0.1546, "step": 1249 }, { "epoch": 0.26238455079764905, "grad_norm": 0.12496902793645859, "learning_rate": 9.219006745349053e-05, "loss": 0.1733, "step": 1250 }, { "epoch": 0.26259445843828716, "grad_norm": 0.12503020465373993, "learning_rate": 9.217039256902983e-05, "loss": 0.1697, "step": 1251 }, { "epoch": 0.26280436607892527, "grad_norm": 0.1265639215707779, "learning_rate": 9.215069503810505e-05, "loss": 0.1688, "step": 1252 }, { "epoch": 0.2630142737195634, "grad_norm": 0.1370651125907898, "learning_rate": 9.213097487129418e-05, "loss": 0.138, "step": 1253 }, { "epoch": 0.26322418136020154, "grad_norm": 0.1354241967201233, "learning_rate": 9.211123207918746e-05, "loss": 0.1563, "step": 1254 }, { "epoch": 0.26343408900083964, "grad_norm": 0.13176575303077698, "learning_rate": 9.209146667238719e-05, "loss": 0.1683, "step": 1255 }, { "epoch": 0.26364399664147775, "grad_norm": 0.1484147608280182, "learning_rate": 9.207167866150783e-05, "loss": 0.1617, "step": 1256 }, { "epoch": 0.26385390428211586, "grad_norm": 0.1321289986371994, "learning_rate": 9.205186805717603e-05, "loss": 0.1592, "step": 1257 }, { "epoch": 0.26406381192275397, "grad_norm": 0.13387440145015717, "learning_rate": 9.203203487003049e-05, "loss": 0.1758, "step": 1258 }, { "epoch": 0.26427371956339213, "grad_norm": 0.15501335263252258, "learning_rate": 9.201217911072213e-05, "loss": 0.1748, "step": 1259 }, { "epoch": 0.26448362720403024, "grad_norm": 0.13924849033355713, "learning_rate": 9.199230078991393e-05, "loss": 0.1526, "step": 1260 }, { "epoch": 0.26469353484466834, "grad_norm": 0.155021071434021, "learning_rate": 9.197239991828099e-05, "loss": 0.1566, "step": 1261 }, { "epoch": 0.26490344248530645, "grad_norm": 0.10661531239748001, "learning_rate": 9.195247650651055e-05, "loss": 0.159, "step": 1262 }, { "epoch": 0.26511335012594456, "grad_norm": 0.15488192439079285, "learning_rate": 9.193253056530194e-05, "loss": 0.1772, "step": 1263 }, { "epoch": 0.2653232577665827, "grad_norm": 0.11265954375267029, "learning_rate": 9.191256210536655e-05, "loss": 0.1549, "step": 1264 }, { "epoch": 0.26553316540722083, "grad_norm": 0.1617230325937271, "learning_rate": 9.189257113742793e-05, "loss": 0.1737, "step": 1265 }, { "epoch": 0.26574307304785894, "grad_norm": 0.11783073842525482, "learning_rate": 9.187255767222169e-05, "loss": 0.1615, "step": 1266 }, { "epoch": 0.26595298068849704, "grad_norm": 0.14353324472904205, "learning_rate": 9.185252172049553e-05, "loss": 0.1578, "step": 1267 }, { "epoch": 0.2661628883291352, "grad_norm": 0.1482020765542984, "learning_rate": 9.183246329300917e-05, "loss": 0.1483, "step": 1268 }, { "epoch": 0.2663727959697733, "grad_norm": 0.1308319866657257, "learning_rate": 9.181238240053446e-05, "loss": 0.1742, "step": 1269 }, { "epoch": 0.2665827036104114, "grad_norm": 0.13996942341327667, "learning_rate": 9.179227905385532e-05, "loss": 0.1649, "step": 1270 }, { "epoch": 0.2667926112510495, "grad_norm": 0.13277165591716766, "learning_rate": 9.177215326376769e-05, "loss": 0.1732, "step": 1271 }, { "epoch": 0.26700251889168763, "grad_norm": 0.12486343085765839, "learning_rate": 9.175200504107957e-05, "loss": 0.1493, "step": 1272 }, { "epoch": 0.2672124265323258, "grad_norm": 0.12765567004680634, "learning_rate": 9.173183439661103e-05, "loss": 0.1533, "step": 1273 }, { "epoch": 0.2674223341729639, "grad_norm": 0.11738407611846924, "learning_rate": 9.171164134119417e-05, "loss": 0.1551, "step": 1274 }, { "epoch": 0.267632241813602, "grad_norm": 0.12661415338516235, "learning_rate": 9.16914258856731e-05, "loss": 0.1655, "step": 1275 }, { "epoch": 0.2678421494542401, "grad_norm": 0.10266046971082687, "learning_rate": 9.167118804090401e-05, "loss": 0.1506, "step": 1276 }, { "epoch": 0.2680520570948782, "grad_norm": 0.13607458770275116, "learning_rate": 9.165092781775506e-05, "loss": 0.1621, "step": 1277 }, { "epoch": 0.2682619647355164, "grad_norm": 0.14382155239582062, "learning_rate": 9.163064522710649e-05, "loss": 0.1756, "step": 1278 }, { "epoch": 0.2684718723761545, "grad_norm": 0.14696674048900604, "learning_rate": 9.161034027985047e-05, "loss": 0.1506, "step": 1279 }, { "epoch": 0.2686817800167926, "grad_norm": 0.12927404046058655, "learning_rate": 9.159001298689122e-05, "loss": 0.1693, "step": 1280 }, { "epoch": 0.2688916876574307, "grad_norm": 0.1403432935476303, "learning_rate": 9.1569663359145e-05, "loss": 0.178, "step": 1281 }, { "epoch": 0.2691015952980689, "grad_norm": 0.12026988714933395, "learning_rate": 9.154929140754e-05, "loss": 0.1611, "step": 1282 }, { "epoch": 0.269311502938707, "grad_norm": 0.1373865157365799, "learning_rate": 9.152889714301645e-05, "loss": 0.1696, "step": 1283 }, { "epoch": 0.2695214105793451, "grad_norm": 0.13119877874851227, "learning_rate": 9.150848057652648e-05, "loss": 0.178, "step": 1284 }, { "epoch": 0.2697313182199832, "grad_norm": 0.12198714911937714, "learning_rate": 9.148804171903432e-05, "loss": 0.1579, "step": 1285 }, { "epoch": 0.2699412258606213, "grad_norm": 0.12499682605266571, "learning_rate": 9.146758058151603e-05, "loss": 0.1669, "step": 1286 }, { "epoch": 0.27015113350125947, "grad_norm": 0.12025036662817001, "learning_rate": 9.144709717495978e-05, "loss": 0.1602, "step": 1287 }, { "epoch": 0.2703610411418976, "grad_norm": 0.1325342059135437, "learning_rate": 9.142659151036558e-05, "loss": 0.1559, "step": 1288 }, { "epoch": 0.2705709487825357, "grad_norm": 0.14255130290985107, "learning_rate": 9.140606359874546e-05, "loss": 0.1441, "step": 1289 }, { "epoch": 0.2707808564231738, "grad_norm": 0.13800451159477234, "learning_rate": 9.138551345112337e-05, "loss": 0.1543, "step": 1290 }, { "epoch": 0.2709907640638119, "grad_norm": 0.10976230353116989, "learning_rate": 9.136494107853521e-05, "loss": 0.1513, "step": 1291 }, { "epoch": 0.27120067170445006, "grad_norm": 0.1330142766237259, "learning_rate": 9.134434649202882e-05, "loss": 0.1527, "step": 1292 }, { "epoch": 0.27141057934508817, "grad_norm": 0.13712665438652039, "learning_rate": 9.132372970266397e-05, "loss": 0.1621, "step": 1293 }, { "epoch": 0.2716204869857263, "grad_norm": 0.15420737862586975, "learning_rate": 9.130309072151231e-05, "loss": 0.1773, "step": 1294 }, { "epoch": 0.2718303946263644, "grad_norm": 0.14155341684818268, "learning_rate": 9.12824295596575e-05, "loss": 0.1615, "step": 1295 }, { "epoch": 0.27204030226700254, "grad_norm": 0.14036308228969574, "learning_rate": 9.126174622819499e-05, "loss": 0.1496, "step": 1296 }, { "epoch": 0.27225020990764065, "grad_norm": 0.13568180799484253, "learning_rate": 9.124104073823225e-05, "loss": 0.1562, "step": 1297 }, { "epoch": 0.27246011754827876, "grad_norm": 0.12056776136159897, "learning_rate": 9.122031310088861e-05, "loss": 0.1722, "step": 1298 }, { "epoch": 0.27267002518891686, "grad_norm": 0.139260396361351, "learning_rate": 9.119956332729524e-05, "loss": 0.1797, "step": 1299 }, { "epoch": 0.27287993282955497, "grad_norm": 0.16409984230995178, "learning_rate": 9.117879142859527e-05, "loss": 0.1661, "step": 1300 }, { "epoch": 0.27308984047019313, "grad_norm": 0.1330333650112152, "learning_rate": 9.115799741594368e-05, "loss": 0.1794, "step": 1301 }, { "epoch": 0.27329974811083124, "grad_norm": 0.12173225730657578, "learning_rate": 9.113718130050734e-05, "loss": 0.1449, "step": 1302 }, { "epoch": 0.27350965575146935, "grad_norm": 0.10996419191360474, "learning_rate": 9.111634309346496e-05, "loss": 0.1626, "step": 1303 }, { "epoch": 0.27371956339210746, "grad_norm": 0.13602428138256073, "learning_rate": 9.109548280600715e-05, "loss": 0.1769, "step": 1304 }, { "epoch": 0.2739294710327456, "grad_norm": 0.11914105713367462, "learning_rate": 9.107460044933635e-05, "loss": 0.1664, "step": 1305 }, { "epoch": 0.2741393786733837, "grad_norm": 0.12417447566986084, "learning_rate": 9.105369603466686e-05, "loss": 0.1455, "step": 1306 }, { "epoch": 0.27434928631402183, "grad_norm": 0.13093994557857513, "learning_rate": 9.103276957322484e-05, "loss": 0.1721, "step": 1307 }, { "epoch": 0.27455919395465994, "grad_norm": 0.13235576450824738, "learning_rate": 9.101182107624827e-05, "loss": 0.1614, "step": 1308 }, { "epoch": 0.27476910159529805, "grad_norm": 0.11165154725313187, "learning_rate": 9.099085055498696e-05, "loss": 0.1656, "step": 1309 }, { "epoch": 0.2749790092359362, "grad_norm": 0.10962437093257904, "learning_rate": 9.096985802070259e-05, "loss": 0.1633, "step": 1310 }, { "epoch": 0.2751889168765743, "grad_norm": 0.12218458205461502, "learning_rate": 9.09488434846686e-05, "loss": 0.1542, "step": 1311 }, { "epoch": 0.2753988245172124, "grad_norm": 0.11284459382295609, "learning_rate": 9.09278069581703e-05, "loss": 0.1591, "step": 1312 }, { "epoch": 0.27560873215785053, "grad_norm": 0.10450571775436401, "learning_rate": 9.090674845250475e-05, "loss": 0.1571, "step": 1313 }, { "epoch": 0.27581863979848864, "grad_norm": 0.12627463042736053, "learning_rate": 9.088566797898089e-05, "loss": 0.1613, "step": 1314 }, { "epoch": 0.2760285474391268, "grad_norm": 0.1578662246465683, "learning_rate": 9.08645655489194e-05, "loss": 0.1506, "step": 1315 }, { "epoch": 0.2762384550797649, "grad_norm": 0.11859230697154999, "learning_rate": 9.084344117365276e-05, "loss": 0.1502, "step": 1316 }, { "epoch": 0.276448362720403, "grad_norm": 0.11622706055641174, "learning_rate": 9.082229486452524e-05, "loss": 0.1674, "step": 1317 }, { "epoch": 0.2766582703610411, "grad_norm": 0.12626247107982635, "learning_rate": 9.08011266328929e-05, "loss": 0.1522, "step": 1318 }, { "epoch": 0.2768681780016793, "grad_norm": 0.13337282836437225, "learning_rate": 9.077993649012357e-05, "loss": 0.1647, "step": 1319 }, { "epoch": 0.2770780856423174, "grad_norm": 0.1255621761083603, "learning_rate": 9.075872444759683e-05, "loss": 0.1564, "step": 1320 }, { "epoch": 0.2772879932829555, "grad_norm": 0.12364380061626434, "learning_rate": 9.073749051670403e-05, "loss": 0.1555, "step": 1321 }, { "epoch": 0.2774979009235936, "grad_norm": 0.11966480314731598, "learning_rate": 9.07162347088483e-05, "loss": 0.163, "step": 1322 }, { "epoch": 0.2777078085642317, "grad_norm": 0.10513235628604889, "learning_rate": 9.069495703544443e-05, "loss": 0.1578, "step": 1323 }, { "epoch": 0.2779177162048699, "grad_norm": 0.10422056168317795, "learning_rate": 9.06736575079191e-05, "loss": 0.1435, "step": 1324 }, { "epoch": 0.278127623845508, "grad_norm": 0.12099135667085648, "learning_rate": 9.065233613771059e-05, "loss": 0.1606, "step": 1325 }, { "epoch": 0.2783375314861461, "grad_norm": 0.10527081042528152, "learning_rate": 9.063099293626898e-05, "loss": 0.1606, "step": 1326 }, { "epoch": 0.2785474391267842, "grad_norm": 0.15932826697826385, "learning_rate": 9.060962791505605e-05, "loss": 0.1626, "step": 1327 }, { "epoch": 0.2787573467674223, "grad_norm": 0.14808166027069092, "learning_rate": 9.05882410855453e-05, "loss": 0.1648, "step": 1328 }, { "epoch": 0.27896725440806047, "grad_norm": 0.1467159539461136, "learning_rate": 9.056683245922196e-05, "loss": 0.1533, "step": 1329 }, { "epoch": 0.2791771620486986, "grad_norm": 0.13326837122440338, "learning_rate": 9.054540204758295e-05, "loss": 0.1638, "step": 1330 }, { "epoch": 0.2793870696893367, "grad_norm": 0.14456981420516968, "learning_rate": 9.052394986213688e-05, "loss": 0.1639, "step": 1331 }, { "epoch": 0.2795969773299748, "grad_norm": 0.1200864389538765, "learning_rate": 9.050247591440407e-05, "loss": 0.1556, "step": 1332 }, { "epoch": 0.27980688497061296, "grad_norm": 0.1340358853340149, "learning_rate": 9.04809802159165e-05, "loss": 0.1627, "step": 1333 }, { "epoch": 0.28001679261125106, "grad_norm": 0.14233215153217316, "learning_rate": 9.045946277821791e-05, "loss": 0.1604, "step": 1334 }, { "epoch": 0.28022670025188917, "grad_norm": 0.15358757972717285, "learning_rate": 9.04379236128636e-05, "loss": 0.1709, "step": 1335 }, { "epoch": 0.2804366078925273, "grad_norm": 0.14173360168933868, "learning_rate": 9.041636273142061e-05, "loss": 0.1659, "step": 1336 }, { "epoch": 0.2806465155331654, "grad_norm": 0.13441811501979828, "learning_rate": 9.039478014546761e-05, "loss": 0.171, "step": 1337 }, { "epoch": 0.28085642317380355, "grad_norm": 0.12063397467136383, "learning_rate": 9.037317586659498e-05, "loss": 0.1514, "step": 1338 }, { "epoch": 0.28106633081444166, "grad_norm": 0.15047864615917206, "learning_rate": 9.035154990640466e-05, "loss": 0.179, "step": 1339 }, { "epoch": 0.28127623845507976, "grad_norm": 0.1325361728668213, "learning_rate": 9.032990227651034e-05, "loss": 0.1659, "step": 1340 }, { "epoch": 0.28148614609571787, "grad_norm": 0.14775165915489197, "learning_rate": 9.030823298853725e-05, "loss": 0.1732, "step": 1341 }, { "epoch": 0.281696053736356, "grad_norm": 0.1677199900150299, "learning_rate": 9.028654205412233e-05, "loss": 0.1696, "step": 1342 }, { "epoch": 0.28190596137699414, "grad_norm": 0.1279178410768509, "learning_rate": 9.026482948491407e-05, "loss": 0.1529, "step": 1343 }, { "epoch": 0.28211586901763225, "grad_norm": 0.12682883441448212, "learning_rate": 9.024309529257262e-05, "loss": 0.1513, "step": 1344 }, { "epoch": 0.28232577665827036, "grad_norm": 0.10392288863658905, "learning_rate": 9.022133948876975e-05, "loss": 0.1611, "step": 1345 }, { "epoch": 0.28253568429890846, "grad_norm": 0.11686202883720398, "learning_rate": 9.019956208518883e-05, "loss": 0.1672, "step": 1346 }, { "epoch": 0.2827455919395466, "grad_norm": 0.12934426963329315, "learning_rate": 9.017776309352481e-05, "loss": 0.1433, "step": 1347 }, { "epoch": 0.28295549958018473, "grad_norm": 0.11639254540205002, "learning_rate": 9.015594252548426e-05, "loss": 0.1707, "step": 1348 }, { "epoch": 0.28316540722082284, "grad_norm": 0.12937045097351074, "learning_rate": 9.013410039278531e-05, "loss": 0.1553, "step": 1349 }, { "epoch": 0.28337531486146095, "grad_norm": 0.1247095987200737, "learning_rate": 9.01122367071577e-05, "loss": 0.1679, "step": 1350 }, { "epoch": 0.28358522250209905, "grad_norm": 0.12007705122232437, "learning_rate": 9.00903514803427e-05, "loss": 0.1554, "step": 1351 }, { "epoch": 0.2837951301427372, "grad_norm": 0.17036741971969604, "learning_rate": 9.00684447240932e-05, "loss": 0.1698, "step": 1352 }, { "epoch": 0.2840050377833753, "grad_norm": 0.11295659095048904, "learning_rate": 9.004651645017363e-05, "loss": 0.1635, "step": 1353 }, { "epoch": 0.28421494542401343, "grad_norm": 0.1332911103963852, "learning_rate": 9.002456667035997e-05, "loss": 0.169, "step": 1354 }, { "epoch": 0.28442485306465154, "grad_norm": 0.1429576724767685, "learning_rate": 9.000259539643972e-05, "loss": 0.1641, "step": 1355 }, { "epoch": 0.28463476070528965, "grad_norm": 0.14206334948539734, "learning_rate": 8.998060264021201e-05, "loss": 0.1692, "step": 1356 }, { "epoch": 0.2848446683459278, "grad_norm": 0.11948953568935394, "learning_rate": 8.995858841348743e-05, "loss": 0.1594, "step": 1357 }, { "epoch": 0.2850545759865659, "grad_norm": 0.12463600933551788, "learning_rate": 8.993655272808811e-05, "loss": 0.1657, "step": 1358 }, { "epoch": 0.285264483627204, "grad_norm": 0.15918885171413422, "learning_rate": 8.991449559584775e-05, "loss": 0.1532, "step": 1359 }, { "epoch": 0.28547439126784213, "grad_norm": 0.12887489795684814, "learning_rate": 8.989241702861149e-05, "loss": 0.1724, "step": 1360 }, { "epoch": 0.2856842989084803, "grad_norm": 0.1456436812877655, "learning_rate": 8.987031703823606e-05, "loss": 0.1529, "step": 1361 }, { "epoch": 0.2858942065491184, "grad_norm": 0.16147381067276, "learning_rate": 8.984819563658964e-05, "loss": 0.1655, "step": 1362 }, { "epoch": 0.2861041141897565, "grad_norm": 0.18096451461315155, "learning_rate": 8.982605283555192e-05, "loss": 0.152, "step": 1363 }, { "epoch": 0.2863140218303946, "grad_norm": 0.12937211990356445, "learning_rate": 8.980388864701412e-05, "loss": 0.1627, "step": 1364 }, { "epoch": 0.2865239294710327, "grad_norm": 0.16723109781742096, "learning_rate": 8.97817030828789e-05, "loss": 0.1666, "step": 1365 }, { "epoch": 0.2867338371116709, "grad_norm": 0.1125638410449028, "learning_rate": 8.97594961550604e-05, "loss": 0.1514, "step": 1366 }, { "epoch": 0.286943744752309, "grad_norm": 0.12200607359409332, "learning_rate": 8.973726787548427e-05, "loss": 0.1593, "step": 1367 }, { "epoch": 0.2871536523929471, "grad_norm": 0.12197429686784744, "learning_rate": 8.971501825608762e-05, "loss": 0.1662, "step": 1368 }, { "epoch": 0.2873635600335852, "grad_norm": 0.11950329691171646, "learning_rate": 8.969274730881893e-05, "loss": 0.1619, "step": 1369 }, { "epoch": 0.2875734676742233, "grad_norm": 0.13871550559997559, "learning_rate": 8.96704550456383e-05, "loss": 0.1467, "step": 1370 }, { "epoch": 0.2877833753148615, "grad_norm": 0.11093387007713318, "learning_rate": 8.964814147851713e-05, "loss": 0.1703, "step": 1371 }, { "epoch": 0.2879932829554996, "grad_norm": 0.13531048595905304, "learning_rate": 8.962580661943831e-05, "loss": 0.1587, "step": 1372 }, { "epoch": 0.2882031905961377, "grad_norm": 0.11192765831947327, "learning_rate": 8.96034504803962e-05, "loss": 0.1537, "step": 1373 }, { "epoch": 0.2884130982367758, "grad_norm": 0.12293984740972519, "learning_rate": 8.958107307339653e-05, "loss": 0.1644, "step": 1374 }, { "epoch": 0.28862300587741396, "grad_norm": 0.12963594496250153, "learning_rate": 8.955867441045652e-05, "loss": 0.16, "step": 1375 }, { "epoch": 0.28883291351805207, "grad_norm": 0.11687605082988739, "learning_rate": 8.95362545036047e-05, "loss": 0.157, "step": 1376 }, { "epoch": 0.2890428211586902, "grad_norm": 0.11696937680244446, "learning_rate": 8.951381336488114e-05, "loss": 0.1572, "step": 1377 }, { "epoch": 0.2892527287993283, "grad_norm": 0.12374909222126007, "learning_rate": 8.949135100633718e-05, "loss": 0.1584, "step": 1378 }, { "epoch": 0.2894626364399664, "grad_norm": 0.133868008852005, "learning_rate": 8.946886744003569e-05, "loss": 0.1415, "step": 1379 }, { "epoch": 0.28967254408060455, "grad_norm": 0.11039866507053375, "learning_rate": 8.94463626780508e-05, "loss": 0.1783, "step": 1380 }, { "epoch": 0.28988245172124266, "grad_norm": 0.1570889800786972, "learning_rate": 8.94238367324681e-05, "loss": 0.1707, "step": 1381 }, { "epoch": 0.29009235936188077, "grad_norm": 0.1414021998643875, "learning_rate": 8.940128961538454e-05, "loss": 0.1339, "step": 1382 }, { "epoch": 0.2903022670025189, "grad_norm": 0.12162122875452042, "learning_rate": 8.937872133890843e-05, "loss": 0.1595, "step": 1383 }, { "epoch": 0.290512174643157, "grad_norm": 0.1278192549943924, "learning_rate": 8.935613191515946e-05, "loss": 0.1386, "step": 1384 }, { "epoch": 0.29072208228379515, "grad_norm": 0.12220022082328796, "learning_rate": 8.933352135626867e-05, "loss": 0.1663, "step": 1385 }, { "epoch": 0.29093198992443325, "grad_norm": 0.15517336130142212, "learning_rate": 8.931088967437844e-05, "loss": 0.1596, "step": 1386 }, { "epoch": 0.29114189756507136, "grad_norm": 0.13512621819972992, "learning_rate": 8.928823688164249e-05, "loss": 0.1412, "step": 1387 }, { "epoch": 0.29135180520570947, "grad_norm": 0.10588439553976059, "learning_rate": 8.92655629902259e-05, "loss": 0.1502, "step": 1388 }, { "epoch": 0.29156171284634763, "grad_norm": 0.16245464980602264, "learning_rate": 8.924286801230506e-05, "loss": 0.1669, "step": 1389 }, { "epoch": 0.29177162048698574, "grad_norm": 0.12498240917921066, "learning_rate": 8.92201519600677e-05, "loss": 0.1518, "step": 1390 }, { "epoch": 0.29198152812762385, "grad_norm": 0.13893572986125946, "learning_rate": 8.919741484571286e-05, "loss": 0.18, "step": 1391 }, { "epoch": 0.29219143576826195, "grad_norm": 0.1361815184354782, "learning_rate": 8.917465668145086e-05, "loss": 0.166, "step": 1392 }, { "epoch": 0.29240134340890006, "grad_norm": 0.12212132662534714, "learning_rate": 8.915187747950339e-05, "loss": 0.1606, "step": 1393 }, { "epoch": 0.2926112510495382, "grad_norm": 0.12465030699968338, "learning_rate": 8.912907725210342e-05, "loss": 0.1454, "step": 1394 }, { "epoch": 0.29282115869017633, "grad_norm": 0.15186457335948944, "learning_rate": 8.910625601149512e-05, "loss": 0.1581, "step": 1395 }, { "epoch": 0.29303106633081444, "grad_norm": 0.1369972676038742, "learning_rate": 8.908341376993409e-05, "loss": 0.1636, "step": 1396 }, { "epoch": 0.29324097397145255, "grad_norm": 0.12631604075431824, "learning_rate": 8.90605505396871e-05, "loss": 0.1536, "step": 1397 }, { "epoch": 0.29345088161209065, "grad_norm": 0.13023649156093597, "learning_rate": 8.903766633303222e-05, "loss": 0.1615, "step": 1398 }, { "epoch": 0.2936607892527288, "grad_norm": 0.13841018080711365, "learning_rate": 8.90147611622588e-05, "loss": 0.1756, "step": 1399 }, { "epoch": 0.2938706968933669, "grad_norm": 0.14210903644561768, "learning_rate": 8.899183503966746e-05, "loss": 0.1661, "step": 1400 }, { "epoch": 0.29408060453400503, "grad_norm": 0.13981324434280396, "learning_rate": 8.896888797757003e-05, "loss": 0.1547, "step": 1401 }, { "epoch": 0.29429051217464314, "grad_norm": 0.11732388287782669, "learning_rate": 8.89459199882896e-05, "loss": 0.1648, "step": 1402 }, { "epoch": 0.2945004198152813, "grad_norm": 0.13775314390659332, "learning_rate": 8.892293108416055e-05, "loss": 0.171, "step": 1403 }, { "epoch": 0.2947103274559194, "grad_norm": 0.11232218891382217, "learning_rate": 8.889992127752839e-05, "loss": 0.1813, "step": 1404 }, { "epoch": 0.2949202350965575, "grad_norm": 0.12403729557991028, "learning_rate": 8.887689058074994e-05, "loss": 0.1694, "step": 1405 }, { "epoch": 0.2951301427371956, "grad_norm": 0.12742987275123596, "learning_rate": 8.885383900619321e-05, "loss": 0.1584, "step": 1406 }, { "epoch": 0.29534005037783373, "grad_norm": 0.12318763881921768, "learning_rate": 8.883076656623741e-05, "loss": 0.1484, "step": 1407 }, { "epoch": 0.2955499580184719, "grad_norm": 0.1265798807144165, "learning_rate": 8.880767327327297e-05, "loss": 0.1708, "step": 1408 }, { "epoch": 0.29575986565911, "grad_norm": 0.12394752353429794, "learning_rate": 8.878455913970154e-05, "loss": 0.1599, "step": 1409 }, { "epoch": 0.2959697732997481, "grad_norm": 0.1458372324705124, "learning_rate": 8.876142417793591e-05, "loss": 0.1751, "step": 1410 }, { "epoch": 0.2961796809403862, "grad_norm": 0.14673961699008942, "learning_rate": 8.87382684004001e-05, "loss": 0.1651, "step": 1411 }, { "epoch": 0.2963895885810244, "grad_norm": 0.15783408284187317, "learning_rate": 8.871509181952925e-05, "loss": 0.1788, "step": 1412 }, { "epoch": 0.2965994962216625, "grad_norm": 0.1385459154844284, "learning_rate": 8.869189444776979e-05, "loss": 0.158, "step": 1413 }, { "epoch": 0.2968094038623006, "grad_norm": 0.12669824063777924, "learning_rate": 8.866867629757916e-05, "loss": 0.1556, "step": 1414 }, { "epoch": 0.2970193115029387, "grad_norm": 0.12607593834400177, "learning_rate": 8.86454373814261e-05, "loss": 0.145, "step": 1415 }, { "epoch": 0.2972292191435768, "grad_norm": 0.15276843309402466, "learning_rate": 8.86221777117904e-05, "loss": 0.1644, "step": 1416 }, { "epoch": 0.29743912678421497, "grad_norm": 0.154046893119812, "learning_rate": 8.859889730116304e-05, "loss": 0.162, "step": 1417 }, { "epoch": 0.2976490344248531, "grad_norm": 0.13313519954681396, "learning_rate": 8.857559616204613e-05, "loss": 0.1659, "step": 1418 }, { "epoch": 0.2978589420654912, "grad_norm": 0.11869316548109055, "learning_rate": 8.855227430695294e-05, "loss": 0.149, "step": 1419 }, { "epoch": 0.2980688497061293, "grad_norm": 0.13355213403701782, "learning_rate": 8.852893174840782e-05, "loss": 0.1674, "step": 1420 }, { "epoch": 0.2982787573467674, "grad_norm": 0.11961048096418381, "learning_rate": 8.850556849894625e-05, "loss": 0.1596, "step": 1421 }, { "epoch": 0.29848866498740556, "grad_norm": 0.15859396755695343, "learning_rate": 8.848218457111483e-05, "loss": 0.1688, "step": 1422 }, { "epoch": 0.29869857262804367, "grad_norm": 0.12413342297077179, "learning_rate": 8.845877997747127e-05, "loss": 0.1764, "step": 1423 }, { "epoch": 0.2989084802686818, "grad_norm": 0.1361013799905777, "learning_rate": 8.843535473058437e-05, "loss": 0.166, "step": 1424 }, { "epoch": 0.2991183879093199, "grad_norm": 0.17420732975006104, "learning_rate": 8.841190884303402e-05, "loss": 0.1699, "step": 1425 }, { "epoch": 0.29932829554995805, "grad_norm": 0.13347145915031433, "learning_rate": 8.838844232741122e-05, "loss": 0.1642, "step": 1426 }, { "epoch": 0.29953820319059615, "grad_norm": 0.1271234154701233, "learning_rate": 8.8364955196318e-05, "loss": 0.1425, "step": 1427 }, { "epoch": 0.29974811083123426, "grad_norm": 0.17551806569099426, "learning_rate": 8.834144746236748e-05, "loss": 0.1771, "step": 1428 }, { "epoch": 0.29995801847187237, "grad_norm": 0.16244301199913025, "learning_rate": 8.831791913818387e-05, "loss": 0.1598, "step": 1429 }, { "epoch": 0.3001679261125105, "grad_norm": 0.1273384541273117, "learning_rate": 8.829437023640239e-05, "loss": 0.1448, "step": 1430 }, { "epoch": 0.30037783375314864, "grad_norm": 0.10530916601419449, "learning_rate": 8.827080076966938e-05, "loss": 0.1502, "step": 1431 }, { "epoch": 0.30058774139378674, "grad_norm": 0.10858159512281418, "learning_rate": 8.824721075064215e-05, "loss": 0.1674, "step": 1432 }, { "epoch": 0.30079764903442485, "grad_norm": 0.12510398030281067, "learning_rate": 8.82236001919891e-05, "loss": 0.1739, "step": 1433 }, { "epoch": 0.30100755667506296, "grad_norm": 0.12490396946668625, "learning_rate": 8.819996910638962e-05, "loss": 0.1441, "step": 1434 }, { "epoch": 0.30121746431570107, "grad_norm": 0.13503123819828033, "learning_rate": 8.817631750653412e-05, "loss": 0.1545, "step": 1435 }, { "epoch": 0.30142737195633923, "grad_norm": 0.12778516113758087, "learning_rate": 8.815264540512411e-05, "loss": 0.1618, "step": 1436 }, { "epoch": 0.30163727959697734, "grad_norm": 0.12481486797332764, "learning_rate": 8.812895281487201e-05, "loss": 0.1541, "step": 1437 }, { "epoch": 0.30184718723761544, "grad_norm": 0.15225140750408173, "learning_rate": 8.810523974850131e-05, "loss": 0.1737, "step": 1438 }, { "epoch": 0.30205709487825355, "grad_norm": 0.12728790938854218, "learning_rate": 8.808150621874643e-05, "loss": 0.1627, "step": 1439 }, { "epoch": 0.3022670025188917, "grad_norm": 0.1369866281747818, "learning_rate": 8.805775223835284e-05, "loss": 0.1685, "step": 1440 }, { "epoch": 0.3024769101595298, "grad_norm": 0.13604697585105896, "learning_rate": 8.803397782007697e-05, "loss": 0.1623, "step": 1441 }, { "epoch": 0.30268681780016793, "grad_norm": 0.12286148965358734, "learning_rate": 8.801018297668622e-05, "loss": 0.1544, "step": 1442 }, { "epoch": 0.30289672544080604, "grad_norm": 0.12319783121347427, "learning_rate": 8.798636772095896e-05, "loss": 0.1559, "step": 1443 }, { "epoch": 0.30310663308144414, "grad_norm": 0.13532286882400513, "learning_rate": 8.796253206568454e-05, "loss": 0.1462, "step": 1444 }, { "epoch": 0.3033165407220823, "grad_norm": 0.14293217658996582, "learning_rate": 8.793867602366326e-05, "loss": 0.167, "step": 1445 }, { "epoch": 0.3035264483627204, "grad_norm": 0.1477852612733841, "learning_rate": 8.791479960770633e-05, "loss": 0.166, "step": 1446 }, { "epoch": 0.3037363560033585, "grad_norm": 0.1326388120651245, "learning_rate": 8.789090283063595e-05, "loss": 0.1737, "step": 1447 }, { "epoch": 0.30394626364399663, "grad_norm": 0.14401552081108093, "learning_rate": 8.786698570528522e-05, "loss": 0.1645, "step": 1448 }, { "epoch": 0.30415617128463474, "grad_norm": 0.13078097999095917, "learning_rate": 8.784304824449819e-05, "loss": 0.1712, "step": 1449 }, { "epoch": 0.3043660789252729, "grad_norm": 0.12756723165512085, "learning_rate": 8.781909046112983e-05, "loss": 0.1606, "step": 1450 }, { "epoch": 0.304575986565911, "grad_norm": 0.13873012363910675, "learning_rate": 8.779511236804603e-05, "loss": 0.1682, "step": 1451 }, { "epoch": 0.3047858942065491, "grad_norm": 0.12623098492622375, "learning_rate": 8.777111397812354e-05, "loss": 0.1617, "step": 1452 }, { "epoch": 0.3049958018471872, "grad_norm": 0.1419798731803894, "learning_rate": 8.774709530425006e-05, "loss": 0.1446, "step": 1453 }, { "epoch": 0.3052057094878254, "grad_norm": 0.1524171084165573, "learning_rate": 8.772305635932416e-05, "loss": 0.1606, "step": 1454 }, { "epoch": 0.3054156171284635, "grad_norm": 0.1456252783536911, "learning_rate": 8.769899715625533e-05, "loss": 0.1491, "step": 1455 }, { "epoch": 0.3056255247691016, "grad_norm": 0.16533705592155457, "learning_rate": 8.767491770796388e-05, "loss": 0.1455, "step": 1456 }, { "epoch": 0.3058354324097397, "grad_norm": 0.1010533794760704, "learning_rate": 8.765081802738104e-05, "loss": 0.1599, "step": 1457 }, { "epoch": 0.3060453400503778, "grad_norm": 0.1445004791021347, "learning_rate": 8.762669812744887e-05, "loss": 0.1788, "step": 1458 }, { "epoch": 0.306255247691016, "grad_norm": 0.1463691145181656, "learning_rate": 8.760255802112032e-05, "loss": 0.1554, "step": 1459 }, { "epoch": 0.3064651553316541, "grad_norm": 0.14523173868656158, "learning_rate": 8.757839772135919e-05, "loss": 0.1768, "step": 1460 }, { "epoch": 0.3066750629722922, "grad_norm": 0.12887312471866608, "learning_rate": 8.75542172411401e-05, "loss": 0.1385, "step": 1461 }, { "epoch": 0.3068849706129303, "grad_norm": 0.12244197726249695, "learning_rate": 8.753001659344852e-05, "loss": 0.177, "step": 1462 }, { "epoch": 0.3070948782535684, "grad_norm": 0.1390761286020279, "learning_rate": 8.750579579128077e-05, "loss": 0.152, "step": 1463 }, { "epoch": 0.30730478589420657, "grad_norm": 0.1180199533700943, "learning_rate": 8.748155484764393e-05, "loss": 0.1822, "step": 1464 }, { "epoch": 0.3075146935348447, "grad_norm": 0.12501199543476105, "learning_rate": 8.745729377555598e-05, "loss": 0.1585, "step": 1465 }, { "epoch": 0.3077246011754828, "grad_norm": 0.1500568687915802, "learning_rate": 8.743301258804567e-05, "loss": 0.1571, "step": 1466 }, { "epoch": 0.3079345088161209, "grad_norm": 0.10966819524765015, "learning_rate": 8.740871129815253e-05, "loss": 0.1654, "step": 1467 }, { "epoch": 0.30814441645675905, "grad_norm": 0.14994241297245026, "learning_rate": 8.73843899189269e-05, "loss": 0.156, "step": 1468 }, { "epoch": 0.30835432409739716, "grad_norm": 0.13417619466781616, "learning_rate": 8.736004846342996e-05, "loss": 0.16, "step": 1469 }, { "epoch": 0.30856423173803527, "grad_norm": 0.12869034707546234, "learning_rate": 8.733568694473359e-05, "loss": 0.1526, "step": 1470 }, { "epoch": 0.3087741393786734, "grad_norm": 0.14110779762268066, "learning_rate": 8.731130537592048e-05, "loss": 0.1677, "step": 1471 }, { "epoch": 0.3089840470193115, "grad_norm": 0.11759759485721588, "learning_rate": 8.728690377008412e-05, "loss": 0.1583, "step": 1472 }, { "epoch": 0.30919395465994964, "grad_norm": 0.13631226122379303, "learning_rate": 8.726248214032871e-05, "loss": 0.151, "step": 1473 }, { "epoch": 0.30940386230058775, "grad_norm": 0.13227972388267517, "learning_rate": 8.723804049976922e-05, "loss": 0.1616, "step": 1474 }, { "epoch": 0.30961376994122586, "grad_norm": 0.15408119559288025, "learning_rate": 8.721357886153137e-05, "loss": 0.1731, "step": 1475 }, { "epoch": 0.30982367758186397, "grad_norm": 0.14567722380161285, "learning_rate": 8.718909723875163e-05, "loss": 0.1555, "step": 1476 }, { "epoch": 0.3100335852225021, "grad_norm": 0.1292102187871933, "learning_rate": 8.71645956445772e-05, "loss": 0.1566, "step": 1477 }, { "epoch": 0.31024349286314024, "grad_norm": 0.12155589461326599, "learning_rate": 8.714007409216595e-05, "loss": 0.168, "step": 1478 }, { "epoch": 0.31045340050377834, "grad_norm": 0.1343718022108078, "learning_rate": 8.711553259468657e-05, "loss": 0.1673, "step": 1479 }, { "epoch": 0.31066330814441645, "grad_norm": 0.11374703049659729, "learning_rate": 8.709097116531835e-05, "loss": 0.1655, "step": 1480 }, { "epoch": 0.31087321578505456, "grad_norm": 0.13738465309143066, "learning_rate": 8.706638981725139e-05, "loss": 0.1573, "step": 1481 }, { "epoch": 0.3110831234256927, "grad_norm": 0.13506565988063812, "learning_rate": 8.70417885636864e-05, "loss": 0.1649, "step": 1482 }, { "epoch": 0.3112930310663308, "grad_norm": 0.1244024857878685, "learning_rate": 8.701716741783483e-05, "loss": 0.1605, "step": 1483 }, { "epoch": 0.31150293870696893, "grad_norm": 0.15039315819740295, "learning_rate": 8.69925263929188e-05, "loss": 0.1634, "step": 1484 }, { "epoch": 0.31171284634760704, "grad_norm": 0.12742501497268677, "learning_rate": 8.69678655021711e-05, "loss": 0.1649, "step": 1485 }, { "epoch": 0.31192275398824515, "grad_norm": 0.1437070369720459, "learning_rate": 8.694318475883518e-05, "loss": 0.1565, "step": 1486 }, { "epoch": 0.3121326616288833, "grad_norm": 0.13695649802684784, "learning_rate": 8.691848417616517e-05, "loss": 0.1628, "step": 1487 }, { "epoch": 0.3123425692695214, "grad_norm": 0.12154117226600647, "learning_rate": 8.689376376742586e-05, "loss": 0.159, "step": 1488 }, { "epoch": 0.3125524769101595, "grad_norm": 0.12522564828395844, "learning_rate": 8.686902354589266e-05, "loss": 0.1567, "step": 1489 }, { "epoch": 0.31276238455079763, "grad_norm": 0.14949142932891846, "learning_rate": 8.684426352485165e-05, "loss": 0.1602, "step": 1490 }, { "epoch": 0.31297229219143574, "grad_norm": 0.13096019625663757, "learning_rate": 8.68194837175995e-05, "loss": 0.157, "step": 1491 }, { "epoch": 0.3131821998320739, "grad_norm": 0.12589357793331146, "learning_rate": 8.679468413744356e-05, "loss": 0.1711, "step": 1492 }, { "epoch": 0.313392107472712, "grad_norm": 0.1338101178407669, "learning_rate": 8.676986479770175e-05, "loss": 0.1628, "step": 1493 }, { "epoch": 0.3136020151133501, "grad_norm": 0.13691024482250214, "learning_rate": 8.674502571170262e-05, "loss": 0.1604, "step": 1494 }, { "epoch": 0.3138119227539882, "grad_norm": 0.14150124788284302, "learning_rate": 8.672016689278535e-05, "loss": 0.1707, "step": 1495 }, { "epoch": 0.3140218303946264, "grad_norm": 0.13233508169651031, "learning_rate": 8.669528835429969e-05, "loss": 0.1421, "step": 1496 }, { "epoch": 0.3142317380352645, "grad_norm": 0.11874301731586456, "learning_rate": 8.667039010960596e-05, "loss": 0.1576, "step": 1497 }, { "epoch": 0.3144416456759026, "grad_norm": 0.10090667009353638, "learning_rate": 8.66454721720751e-05, "loss": 0.158, "step": 1498 }, { "epoch": 0.3146515533165407, "grad_norm": 0.126051664352417, "learning_rate": 8.662053455508862e-05, "loss": 0.1725, "step": 1499 }, { "epoch": 0.3148614609571788, "grad_norm": 0.11627226322889328, "learning_rate": 8.659557727203858e-05, "loss": 0.1572, "step": 1500 }, { "epoch": 0.315071368597817, "grad_norm": 0.12283121049404144, "learning_rate": 8.657060033632763e-05, "loss": 0.1633, "step": 1501 }, { "epoch": 0.3152812762384551, "grad_norm": 0.11624766141176224, "learning_rate": 8.654560376136891e-05, "loss": 0.1762, "step": 1502 }, { "epoch": 0.3154911838790932, "grad_norm": 0.13653838634490967, "learning_rate": 8.65205875605862e-05, "loss": 0.1784, "step": 1503 }, { "epoch": 0.3157010915197313, "grad_norm": 0.12123393267393112, "learning_rate": 8.649555174741374e-05, "loss": 0.1551, "step": 1504 }, { "epoch": 0.3159109991603694, "grad_norm": 0.12023040652275085, "learning_rate": 8.647049633529635e-05, "loss": 0.174, "step": 1505 }, { "epoch": 0.3161209068010076, "grad_norm": 0.12046807259321213, "learning_rate": 8.644542133768936e-05, "loss": 0.1703, "step": 1506 }, { "epoch": 0.3163308144416457, "grad_norm": 0.11224795132875443, "learning_rate": 8.64203267680586e-05, "loss": 0.1435, "step": 1507 }, { "epoch": 0.3165407220822838, "grad_norm": 0.1572083830833435, "learning_rate": 8.639521263988044e-05, "loss": 0.153, "step": 1508 }, { "epoch": 0.3167506297229219, "grad_norm": 0.1247357502579689, "learning_rate": 8.637007896664173e-05, "loss": 0.1566, "step": 1509 }, { "epoch": 0.31696053736356006, "grad_norm": 0.12798605859279633, "learning_rate": 8.634492576183984e-05, "loss": 0.1526, "step": 1510 }, { "epoch": 0.31717044500419816, "grad_norm": 0.1474895477294922, "learning_rate": 8.631975303898261e-05, "loss": 0.169, "step": 1511 }, { "epoch": 0.31738035264483627, "grad_norm": 0.11636695265769958, "learning_rate": 8.629456081158838e-05, "loss": 0.1613, "step": 1512 }, { "epoch": 0.3175902602854744, "grad_norm": 0.11031098663806915, "learning_rate": 8.626934909318591e-05, "loss": 0.1576, "step": 1513 }, { "epoch": 0.3178001679261125, "grad_norm": 0.12879802286624908, "learning_rate": 8.624411789731452e-05, "loss": 0.1556, "step": 1514 }, { "epoch": 0.31801007556675065, "grad_norm": 0.1240730881690979, "learning_rate": 8.621886723752392e-05, "loss": 0.1633, "step": 1515 }, { "epoch": 0.31821998320738876, "grad_norm": 0.10870420932769775, "learning_rate": 8.619359712737427e-05, "loss": 0.1385, "step": 1516 }, { "epoch": 0.31842989084802686, "grad_norm": 0.11011097580194473, "learning_rate": 8.616830758043622e-05, "loss": 0.1655, "step": 1517 }, { "epoch": 0.31863979848866497, "grad_norm": 0.17963182926177979, "learning_rate": 8.614299861029083e-05, "loss": 0.1651, "step": 1518 }, { "epoch": 0.31884970612930313, "grad_norm": 0.12011148035526276, "learning_rate": 8.61176702305296e-05, "loss": 0.1657, "step": 1519 }, { "epoch": 0.31905961376994124, "grad_norm": 0.1365176886320114, "learning_rate": 8.609232245475443e-05, "loss": 0.1839, "step": 1520 }, { "epoch": 0.31926952141057935, "grad_norm": 0.1179775819182396, "learning_rate": 8.606695529657767e-05, "loss": 0.1581, "step": 1521 }, { "epoch": 0.31947942905121746, "grad_norm": 0.15118494629859924, "learning_rate": 8.604156876962206e-05, "loss": 0.1831, "step": 1522 }, { "epoch": 0.31968933669185556, "grad_norm": 0.14029067754745483, "learning_rate": 8.601616288752073e-05, "loss": 0.1708, "step": 1523 }, { "epoch": 0.3198992443324937, "grad_norm": 0.14659684896469116, "learning_rate": 8.599073766391725e-05, "loss": 0.1599, "step": 1524 }, { "epoch": 0.32010915197313183, "grad_norm": 0.1260727345943451, "learning_rate": 8.596529311246551e-05, "loss": 0.1623, "step": 1525 }, { "epoch": 0.32031905961376994, "grad_norm": 0.12665653228759766, "learning_rate": 8.593982924682986e-05, "loss": 0.1672, "step": 1526 }, { "epoch": 0.32052896725440805, "grad_norm": 0.12388867139816284, "learning_rate": 8.591434608068493e-05, "loss": 0.1738, "step": 1527 }, { "epoch": 0.32073887489504616, "grad_norm": 0.12167280912399292, "learning_rate": 8.58888436277158e-05, "loss": 0.1627, "step": 1528 }, { "epoch": 0.3209487825356843, "grad_norm": 0.12876589596271515, "learning_rate": 8.586332190161786e-05, "loss": 0.1646, "step": 1529 }, { "epoch": 0.3211586901763224, "grad_norm": 0.12370327860116959, "learning_rate": 8.583778091609683e-05, "loss": 0.156, "step": 1530 }, { "epoch": 0.32136859781696053, "grad_norm": 0.11784835159778595, "learning_rate": 8.581222068486884e-05, "loss": 0.1664, "step": 1531 }, { "epoch": 0.32157850545759864, "grad_norm": 0.14363597333431244, "learning_rate": 8.578664122166029e-05, "loss": 0.1724, "step": 1532 }, { "epoch": 0.3217884130982368, "grad_norm": 0.11469676345586777, "learning_rate": 8.576104254020796e-05, "loss": 0.1469, "step": 1533 }, { "epoch": 0.3219983207388749, "grad_norm": 0.12534739077091217, "learning_rate": 8.573542465425892e-05, "loss": 0.1687, "step": 1534 }, { "epoch": 0.322208228379513, "grad_norm": 0.12772725522518158, "learning_rate": 8.570978757757057e-05, "loss": 0.159, "step": 1535 }, { "epoch": 0.3224181360201511, "grad_norm": 0.14892329275608063, "learning_rate": 8.568413132391058e-05, "loss": 0.1586, "step": 1536 }, { "epoch": 0.32262804366078923, "grad_norm": 0.1334220916032791, "learning_rate": 8.565845590705695e-05, "loss": 0.1692, "step": 1537 }, { "epoch": 0.3228379513014274, "grad_norm": 0.1165885403752327, "learning_rate": 8.563276134079798e-05, "loss": 0.1673, "step": 1538 }, { "epoch": 0.3230478589420655, "grad_norm": 0.11613230407238007, "learning_rate": 8.560704763893224e-05, "loss": 0.1512, "step": 1539 }, { "epoch": 0.3232577665827036, "grad_norm": 0.11873317509889603, "learning_rate": 8.558131481526856e-05, "loss": 0.1556, "step": 1540 }, { "epoch": 0.3234676742233417, "grad_norm": 0.12491460889577866, "learning_rate": 8.555556288362605e-05, "loss": 0.1728, "step": 1541 }, { "epoch": 0.3236775818639798, "grad_norm": 0.1342668980360031, "learning_rate": 8.552979185783412e-05, "loss": 0.1746, "step": 1542 }, { "epoch": 0.323887489504618, "grad_norm": 0.1082087978720665, "learning_rate": 8.550400175173236e-05, "loss": 0.1668, "step": 1543 }, { "epoch": 0.3240973971452561, "grad_norm": 0.12464690953493118, "learning_rate": 8.547819257917065e-05, "loss": 0.1642, "step": 1544 }, { "epoch": 0.3243073047858942, "grad_norm": 0.11872904002666473, "learning_rate": 8.545236435400914e-05, "loss": 0.1586, "step": 1545 }, { "epoch": 0.3245172124265323, "grad_norm": 0.14073589444160461, "learning_rate": 8.542651709011814e-05, "loss": 0.1733, "step": 1546 }, { "epoch": 0.32472712006717047, "grad_norm": 0.11652261763811111, "learning_rate": 8.540065080137824e-05, "loss": 0.1689, "step": 1547 }, { "epoch": 0.3249370277078086, "grad_norm": 0.14177857339382172, "learning_rate": 8.537476550168022e-05, "loss": 0.1504, "step": 1548 }, { "epoch": 0.3251469353484467, "grad_norm": 0.11179566383361816, "learning_rate": 8.534886120492509e-05, "loss": 0.1504, "step": 1549 }, { "epoch": 0.3253568429890848, "grad_norm": 0.141026109457016, "learning_rate": 8.532293792502403e-05, "loss": 0.1642, "step": 1550 }, { "epoch": 0.3255667506297229, "grad_norm": 0.11241170763969421, "learning_rate": 8.529699567589844e-05, "loss": 0.1662, "step": 1551 }, { "epoch": 0.32577665827036106, "grad_norm": 0.11532200872898102, "learning_rate": 8.52710344714799e-05, "loss": 0.1588, "step": 1552 }, { "epoch": 0.32598656591099917, "grad_norm": 0.13776087760925293, "learning_rate": 8.524505432571019e-05, "loss": 0.1654, "step": 1553 }, { "epoch": 0.3261964735516373, "grad_norm": 0.11376765370368958, "learning_rate": 8.521905525254123e-05, "loss": 0.1536, "step": 1554 }, { "epoch": 0.3264063811922754, "grad_norm": 0.11074686050415039, "learning_rate": 8.519303726593508e-05, "loss": 0.1589, "step": 1555 }, { "epoch": 0.3266162888329135, "grad_norm": 0.11213414371013641, "learning_rate": 8.516700037986406e-05, "loss": 0.1548, "step": 1556 }, { "epoch": 0.32682619647355166, "grad_norm": 0.14128370583057404, "learning_rate": 8.514094460831052e-05, "loss": 0.1556, "step": 1557 }, { "epoch": 0.32703610411418976, "grad_norm": 0.1197216585278511, "learning_rate": 8.5114869965267e-05, "loss": 0.146, "step": 1558 }, { "epoch": 0.32724601175482787, "grad_norm": 0.12456157803535461, "learning_rate": 8.508877646473623e-05, "loss": 0.1586, "step": 1559 }, { "epoch": 0.327455919395466, "grad_norm": 0.11759025603532791, "learning_rate": 8.506266412073099e-05, "loss": 0.1564, "step": 1560 }, { "epoch": 0.32766582703610414, "grad_norm": 0.1229950562119484, "learning_rate": 8.503653294727418e-05, "loss": 0.159, "step": 1561 }, { "epoch": 0.32787573467674225, "grad_norm": 0.11062034219503403, "learning_rate": 8.501038295839887e-05, "loss": 0.1536, "step": 1562 }, { "epoch": 0.32808564231738035, "grad_norm": 0.1223401203751564, "learning_rate": 8.498421416814818e-05, "loss": 0.1473, "step": 1563 }, { "epoch": 0.32829554995801846, "grad_norm": 0.13364095985889435, "learning_rate": 8.495802659057536e-05, "loss": 0.169, "step": 1564 }, { "epoch": 0.32850545759865657, "grad_norm": 0.1344279795885086, "learning_rate": 8.493182023974372e-05, "loss": 0.1537, "step": 1565 }, { "epoch": 0.32871536523929473, "grad_norm": 0.17328642308712006, "learning_rate": 8.490559512972671e-05, "loss": 0.1476, "step": 1566 }, { "epoch": 0.32892527287993284, "grad_norm": 0.1484242081642151, "learning_rate": 8.487935127460779e-05, "loss": 0.1666, "step": 1567 }, { "epoch": 0.32913518052057095, "grad_norm": 0.17023669183254242, "learning_rate": 8.485308868848049e-05, "loss": 0.1653, "step": 1568 }, { "epoch": 0.32934508816120905, "grad_norm": 0.14054429531097412, "learning_rate": 8.482680738544843e-05, "loss": 0.1675, "step": 1569 }, { "epoch": 0.32955499580184716, "grad_norm": 0.1140383705496788, "learning_rate": 8.480050737962531e-05, "loss": 0.1612, "step": 1570 }, { "epoch": 0.3297649034424853, "grad_norm": 0.13256023824214935, "learning_rate": 8.477418868513476e-05, "loss": 0.1595, "step": 1571 }, { "epoch": 0.32997481108312343, "grad_norm": 0.13051024079322815, "learning_rate": 8.474785131611058e-05, "loss": 0.1621, "step": 1572 }, { "epoch": 0.33018471872376154, "grad_norm": 0.11545020341873169, "learning_rate": 8.472149528669651e-05, "loss": 0.1683, "step": 1573 }, { "epoch": 0.33039462636439965, "grad_norm": 0.13742703199386597, "learning_rate": 8.469512061104635e-05, "loss": 0.1611, "step": 1574 }, { "epoch": 0.3306045340050378, "grad_norm": 0.13308970630168915, "learning_rate": 8.46687273033239e-05, "loss": 0.148, "step": 1575 }, { "epoch": 0.3308144416456759, "grad_norm": 0.13026942312717438, "learning_rate": 8.464231537770298e-05, "loss": 0.1772, "step": 1576 }, { "epoch": 0.331024349286314, "grad_norm": 0.11242230236530304, "learning_rate": 8.461588484836738e-05, "loss": 0.1632, "step": 1577 }, { "epoch": 0.33123425692695213, "grad_norm": 0.13627904653549194, "learning_rate": 8.45894357295109e-05, "loss": 0.1644, "step": 1578 }, { "epoch": 0.33144416456759024, "grad_norm": 0.11995008587837219, "learning_rate": 8.456296803533734e-05, "loss": 0.1532, "step": 1579 }, { "epoch": 0.3316540722082284, "grad_norm": 0.12165706604719162, "learning_rate": 8.453648178006044e-05, "loss": 0.1596, "step": 1580 }, { "epoch": 0.3318639798488665, "grad_norm": 0.1462729126214981, "learning_rate": 8.450997697790393e-05, "loss": 0.1572, "step": 1581 }, { "epoch": 0.3320738874895046, "grad_norm": 0.11989284306764603, "learning_rate": 8.448345364310149e-05, "loss": 0.1631, "step": 1582 }, { "epoch": 0.3322837951301427, "grad_norm": 0.1307172328233719, "learning_rate": 8.445691178989674e-05, "loss": 0.1639, "step": 1583 }, { "epoch": 0.33249370277078083, "grad_norm": 0.13907490670681, "learning_rate": 8.44303514325433e-05, "loss": 0.1466, "step": 1584 }, { "epoch": 0.332703610411419, "grad_norm": 0.21999351680278778, "learning_rate": 8.440377258530465e-05, "loss": 0.1596, "step": 1585 }, { "epoch": 0.3329135180520571, "grad_norm": 0.11224652826786041, "learning_rate": 8.437717526245428e-05, "loss": 0.1543, "step": 1586 }, { "epoch": 0.3331234256926952, "grad_norm": 0.12237223982810974, "learning_rate": 8.435055947827552e-05, "loss": 0.149, "step": 1587 }, { "epoch": 0.3333333333333333, "grad_norm": 0.1236414983868599, "learning_rate": 8.432392524706168e-05, "loss": 0.1421, "step": 1588 }, { "epoch": 0.3335432409739715, "grad_norm": 0.12933699786663055, "learning_rate": 8.429727258311593e-05, "loss": 0.152, "step": 1589 }, { "epoch": 0.3337531486146096, "grad_norm": 0.14159703254699707, "learning_rate": 8.427060150075137e-05, "loss": 0.1705, "step": 1590 }, { "epoch": 0.3339630562552477, "grad_norm": 0.1670205146074295, "learning_rate": 8.424391201429099e-05, "loss": 0.1544, "step": 1591 }, { "epoch": 0.3341729638958858, "grad_norm": 0.11887665838003159, "learning_rate": 8.421720413806764e-05, "loss": 0.147, "step": 1592 }, { "epoch": 0.3343828715365239, "grad_norm": 0.11611668765544891, "learning_rate": 8.419047788642407e-05, "loss": 0.1758, "step": 1593 }, { "epoch": 0.33459277917716207, "grad_norm": 0.15789294242858887, "learning_rate": 8.416373327371287e-05, "loss": 0.1533, "step": 1594 }, { "epoch": 0.3348026868178002, "grad_norm": 0.13557223975658417, "learning_rate": 8.413697031429653e-05, "loss": 0.1588, "step": 1595 }, { "epoch": 0.3350125944584383, "grad_norm": 0.29409486055374146, "learning_rate": 8.411018902254736e-05, "loss": 0.1686, "step": 1596 }, { "epoch": 0.3352225020990764, "grad_norm": 0.13911570608615875, "learning_rate": 8.408338941284752e-05, "loss": 0.168, "step": 1597 }, { "epoch": 0.3354324097397145, "grad_norm": 0.1477447748184204, "learning_rate": 8.405657149958902e-05, "loss": 0.1611, "step": 1598 }, { "epoch": 0.33564231738035266, "grad_norm": 0.13031038641929626, "learning_rate": 8.402973529717368e-05, "loss": 0.1676, "step": 1599 }, { "epoch": 0.33585222502099077, "grad_norm": 0.12024478614330292, "learning_rate": 8.400288082001319e-05, "loss": 0.1567, "step": 1600 }, { "epoch": 0.3360621326616289, "grad_norm": 0.12338093668222427, "learning_rate": 8.397600808252897e-05, "loss": 0.1585, "step": 1601 }, { "epoch": 0.336272040302267, "grad_norm": 0.12878558039665222, "learning_rate": 8.394911709915232e-05, "loss": 0.1624, "step": 1602 }, { "epoch": 0.33648194794290515, "grad_norm": 0.12042135745286942, "learning_rate": 8.392220788432431e-05, "loss": 0.1719, "step": 1603 }, { "epoch": 0.33669185558354325, "grad_norm": 0.14576420187950134, "learning_rate": 8.389528045249579e-05, "loss": 0.1669, "step": 1604 }, { "epoch": 0.33690176322418136, "grad_norm": 0.13084185123443604, "learning_rate": 8.386833481812744e-05, "loss": 0.1556, "step": 1605 }, { "epoch": 0.33711167086481947, "grad_norm": 0.15233568847179413, "learning_rate": 8.384137099568965e-05, "loss": 0.1674, "step": 1606 }, { "epoch": 0.3373215785054576, "grad_norm": 0.15160681307315826, "learning_rate": 8.381438899966261e-05, "loss": 0.1727, "step": 1607 }, { "epoch": 0.33753148614609574, "grad_norm": 0.13782384991645813, "learning_rate": 8.378738884453627e-05, "loss": 0.1754, "step": 1608 }, { "epoch": 0.33774139378673385, "grad_norm": 0.12946204841136932, "learning_rate": 8.376037054481034e-05, "loss": 0.176, "step": 1609 }, { "epoch": 0.33795130142737195, "grad_norm": 0.11807133257389069, "learning_rate": 8.373333411499426e-05, "loss": 0.1465, "step": 1610 }, { "epoch": 0.33816120906801006, "grad_norm": 0.1325267106294632, "learning_rate": 8.370627956960721e-05, "loss": 0.1475, "step": 1611 }, { "epoch": 0.33837111670864817, "grad_norm": 0.12463680654764175, "learning_rate": 8.36792069231781e-05, "loss": 0.1691, "step": 1612 }, { "epoch": 0.33858102434928633, "grad_norm": 0.1311996728181839, "learning_rate": 8.365211619024555e-05, "loss": 0.1562, "step": 1613 }, { "epoch": 0.33879093198992444, "grad_norm": 0.13213030993938446, "learning_rate": 8.362500738535792e-05, "loss": 0.1611, "step": 1614 }, { "epoch": 0.33900083963056254, "grad_norm": 0.1333753913640976, "learning_rate": 8.359788052307324e-05, "loss": 0.1612, "step": 1615 }, { "epoch": 0.33921074727120065, "grad_norm": 0.11926085501909256, "learning_rate": 8.357073561795928e-05, "loss": 0.1621, "step": 1616 }, { "epoch": 0.3394206549118388, "grad_norm": 0.14635923504829407, "learning_rate": 8.354357268459343e-05, "loss": 0.1716, "step": 1617 }, { "epoch": 0.3396305625524769, "grad_norm": 0.1318664848804474, "learning_rate": 8.351639173756284e-05, "loss": 0.171, "step": 1618 }, { "epoch": 0.33984047019311503, "grad_norm": 0.13774079084396362, "learning_rate": 8.34891927914643e-05, "loss": 0.1527, "step": 1619 }, { "epoch": 0.34005037783375314, "grad_norm": 0.1164809837937355, "learning_rate": 8.346197586090426e-05, "loss": 0.161, "step": 1620 }, { "epoch": 0.34026028547439124, "grad_norm": 0.13626568019390106, "learning_rate": 8.343474096049881e-05, "loss": 0.1573, "step": 1621 }, { "epoch": 0.3404701931150294, "grad_norm": 0.10750927031040192, "learning_rate": 8.340748810487375e-05, "loss": 0.1516, "step": 1622 }, { "epoch": 0.3406801007556675, "grad_norm": 0.1209016814827919, "learning_rate": 8.338021730866448e-05, "loss": 0.1604, "step": 1623 }, { "epoch": 0.3408900083963056, "grad_norm": 0.12268110364675522, "learning_rate": 8.335292858651601e-05, "loss": 0.1723, "step": 1624 }, { "epoch": 0.34109991603694373, "grad_norm": 0.1716935932636261, "learning_rate": 8.332562195308305e-05, "loss": 0.1617, "step": 1625 }, { "epoch": 0.3413098236775819, "grad_norm": 0.12799715995788574, "learning_rate": 8.329829742302984e-05, "loss": 0.1677, "step": 1626 }, { "epoch": 0.34151973131822, "grad_norm": 0.12748028337955475, "learning_rate": 8.327095501103029e-05, "loss": 0.1587, "step": 1627 }, { "epoch": 0.3417296389588581, "grad_norm": 0.15869416296482086, "learning_rate": 8.324359473176793e-05, "loss": 0.1644, "step": 1628 }, { "epoch": 0.3419395465994962, "grad_norm": 0.14217017590999603, "learning_rate": 8.321621659993583e-05, "loss": 0.1688, "step": 1629 }, { "epoch": 0.3421494542401343, "grad_norm": 0.11858684569597244, "learning_rate": 8.318882063023669e-05, "loss": 0.1513, "step": 1630 }, { "epoch": 0.3423593618807725, "grad_norm": 0.11514747142791748, "learning_rate": 8.316140683738274e-05, "loss": 0.146, "step": 1631 }, { "epoch": 0.3425692695214106, "grad_norm": 0.13995614647865295, "learning_rate": 8.313397523609585e-05, "loss": 0.1567, "step": 1632 }, { "epoch": 0.3427791771620487, "grad_norm": 0.10559704899787903, "learning_rate": 8.310652584110738e-05, "loss": 0.1577, "step": 1633 }, { "epoch": 0.3429890848026868, "grad_norm": 0.11493990570306778, "learning_rate": 8.30790586671583e-05, "loss": 0.1431, "step": 1634 }, { "epoch": 0.3431989924433249, "grad_norm": 0.14548490941524506, "learning_rate": 8.305157372899913e-05, "loss": 0.1791, "step": 1635 }, { "epoch": 0.3434089000839631, "grad_norm": 0.13234320282936096, "learning_rate": 8.302407104138988e-05, "loss": 0.1568, "step": 1636 }, { "epoch": 0.3436188077246012, "grad_norm": 0.1269916445016861, "learning_rate": 8.299655061910012e-05, "loss": 0.1494, "step": 1637 }, { "epoch": 0.3438287153652393, "grad_norm": 0.12883053719997406, "learning_rate": 8.296901247690896e-05, "loss": 0.1633, "step": 1638 }, { "epoch": 0.3440386230058774, "grad_norm": 0.12479805201292038, "learning_rate": 8.294145662960502e-05, "loss": 0.1744, "step": 1639 }, { "epoch": 0.34424853064651556, "grad_norm": 0.13455580174922943, "learning_rate": 8.29138830919864e-05, "loss": 0.1787, "step": 1640 }, { "epoch": 0.34445843828715367, "grad_norm": 0.11923965066671371, "learning_rate": 8.288629187886073e-05, "loss": 0.1703, "step": 1641 }, { "epoch": 0.3446683459277918, "grad_norm": 0.1303246170282364, "learning_rate": 8.285868300504511e-05, "loss": 0.1669, "step": 1642 }, { "epoch": 0.3448782535684299, "grad_norm": 0.09619605541229248, "learning_rate": 8.283105648536616e-05, "loss": 0.163, "step": 1643 }, { "epoch": 0.345088161209068, "grad_norm": 0.11518269032239914, "learning_rate": 8.280341233465992e-05, "loss": 0.1511, "step": 1644 }, { "epoch": 0.34529806884970615, "grad_norm": 0.10729756206274033, "learning_rate": 8.277575056777195e-05, "loss": 0.1671, "step": 1645 }, { "epoch": 0.34550797649034426, "grad_norm": 0.13708291947841644, "learning_rate": 8.274807119955725e-05, "loss": 0.1719, "step": 1646 }, { "epoch": 0.34571788413098237, "grad_norm": 0.11630459129810333, "learning_rate": 8.272037424488028e-05, "loss": 0.1505, "step": 1647 }, { "epoch": 0.3459277917716205, "grad_norm": 0.15268699824810028, "learning_rate": 8.26926597186149e-05, "loss": 0.1714, "step": 1648 }, { "epoch": 0.3461376994122586, "grad_norm": 0.1177772805094719, "learning_rate": 8.266492763564451e-05, "loss": 0.1651, "step": 1649 }, { "epoch": 0.34634760705289674, "grad_norm": 0.12576864659786224, "learning_rate": 8.263717801086182e-05, "loss": 0.1542, "step": 1650 }, { "epoch": 0.34655751469353485, "grad_norm": 0.10988842695951462, "learning_rate": 8.260941085916901e-05, "loss": 0.1436, "step": 1651 }, { "epoch": 0.34676742233417296, "grad_norm": 0.12295406311750412, "learning_rate": 8.258162619547771e-05, "loss": 0.1611, "step": 1652 }, { "epoch": 0.34697732997481107, "grad_norm": 0.12765416502952576, "learning_rate": 8.25538240347089e-05, "loss": 0.1707, "step": 1653 }, { "epoch": 0.34718723761544923, "grad_norm": 0.128097802400589, "learning_rate": 8.252600439179295e-05, "loss": 0.1777, "step": 1654 }, { "epoch": 0.34739714525608734, "grad_norm": 0.12253068387508392, "learning_rate": 8.24981672816697e-05, "loss": 0.1625, "step": 1655 }, { "epoch": 0.34760705289672544, "grad_norm": 0.12777206301689148, "learning_rate": 8.247031271928826e-05, "loss": 0.1822, "step": 1656 }, { "epoch": 0.34781696053736355, "grad_norm": 0.10205968469381332, "learning_rate": 8.24424407196072e-05, "loss": 0.1459, "step": 1657 }, { "epoch": 0.34802686817800166, "grad_norm": 0.11570538580417633, "learning_rate": 8.241455129759441e-05, "loss": 0.1719, "step": 1658 }, { "epoch": 0.3482367758186398, "grad_norm": 0.12801894545555115, "learning_rate": 8.238664446822715e-05, "loss": 0.1628, "step": 1659 }, { "epoch": 0.34844668345927793, "grad_norm": 0.13114169239997864, "learning_rate": 8.2358720246492e-05, "loss": 0.1768, "step": 1660 }, { "epoch": 0.34865659109991604, "grad_norm": 0.14204199612140656, "learning_rate": 8.233077864738495e-05, "loss": 0.1359, "step": 1661 }, { "epoch": 0.34886649874055414, "grad_norm": 0.13539434969425201, "learning_rate": 8.230281968591125e-05, "loss": 0.1696, "step": 1662 }, { "epoch": 0.34907640638119225, "grad_norm": 0.10501176118850708, "learning_rate": 8.22748433770855e-05, "loss": 0.1593, "step": 1663 }, { "epoch": 0.3492863140218304, "grad_norm": 0.10992568731307983, "learning_rate": 8.224684973593161e-05, "loss": 0.167, "step": 1664 }, { "epoch": 0.3494962216624685, "grad_norm": 0.10650011897087097, "learning_rate": 8.221883877748285e-05, "loss": 0.1533, "step": 1665 }, { "epoch": 0.3497061293031066, "grad_norm": 0.10955175757408142, "learning_rate": 8.219081051678167e-05, "loss": 0.1684, "step": 1666 }, { "epoch": 0.34991603694374473, "grad_norm": 0.11864706128835678, "learning_rate": 8.216276496887995e-05, "loss": 0.1662, "step": 1667 }, { "epoch": 0.3501259445843829, "grad_norm": 0.11310485005378723, "learning_rate": 8.213470214883876e-05, "loss": 0.1551, "step": 1668 }, { "epoch": 0.350335852225021, "grad_norm": 0.11496865004301071, "learning_rate": 8.210662207172848e-05, "loss": 0.1599, "step": 1669 }, { "epoch": 0.3505457598656591, "grad_norm": 0.16101498901844025, "learning_rate": 8.207852475262876e-05, "loss": 0.1599, "step": 1670 }, { "epoch": 0.3507556675062972, "grad_norm": 0.12098553776741028, "learning_rate": 8.205041020662849e-05, "loss": 0.1541, "step": 1671 }, { "epoch": 0.3509655751469353, "grad_norm": 0.14668092131614685, "learning_rate": 8.202227844882583e-05, "loss": 0.1625, "step": 1672 }, { "epoch": 0.3511754827875735, "grad_norm": 0.11160324513912201, "learning_rate": 8.199412949432817e-05, "loss": 0.1534, "step": 1673 }, { "epoch": 0.3513853904282116, "grad_norm": 0.1357201784849167, "learning_rate": 8.196596335825217e-05, "loss": 0.1702, "step": 1674 }, { "epoch": 0.3515952980688497, "grad_norm": 0.11901015788316727, "learning_rate": 8.193778005572365e-05, "loss": 0.1581, "step": 1675 }, { "epoch": 0.3518052057094878, "grad_norm": 0.12563678622245789, "learning_rate": 8.19095796018777e-05, "loss": 0.1681, "step": 1676 }, { "epoch": 0.3520151133501259, "grad_norm": 0.12322632968425751, "learning_rate": 8.188136201185863e-05, "loss": 0.1503, "step": 1677 }, { "epoch": 0.3522250209907641, "grad_norm": 0.13357025384902954, "learning_rate": 8.185312730081991e-05, "loss": 0.1704, "step": 1678 }, { "epoch": 0.3524349286314022, "grad_norm": 0.12442915886640549, "learning_rate": 8.182487548392422e-05, "loss": 0.163, "step": 1679 }, { "epoch": 0.3526448362720403, "grad_norm": 0.12422217428684235, "learning_rate": 8.179660657634342e-05, "loss": 0.1492, "step": 1680 }, { "epoch": 0.3528547439126784, "grad_norm": 0.12983065843582153, "learning_rate": 8.176832059325861e-05, "loss": 0.1509, "step": 1681 }, { "epoch": 0.35306465155331657, "grad_norm": 0.10647305846214294, "learning_rate": 8.174001754985996e-05, "loss": 0.1775, "step": 1682 }, { "epoch": 0.3532745591939547, "grad_norm": 0.1141040176153183, "learning_rate": 8.171169746134686e-05, "loss": 0.1617, "step": 1683 }, { "epoch": 0.3534844668345928, "grad_norm": 0.13015101850032806, "learning_rate": 8.168336034292786e-05, "loss": 0.16, "step": 1684 }, { "epoch": 0.3536943744752309, "grad_norm": 0.1377038061618805, "learning_rate": 8.165500620982062e-05, "loss": 0.1524, "step": 1685 }, { "epoch": 0.353904282115869, "grad_norm": 0.1097029447555542, "learning_rate": 8.162663507725198e-05, "loss": 0.1697, "step": 1686 }, { "epoch": 0.35411418975650716, "grad_norm": 0.1175478920340538, "learning_rate": 8.159824696045788e-05, "loss": 0.1826, "step": 1687 }, { "epoch": 0.35432409739714527, "grad_norm": 0.10951685905456543, "learning_rate": 8.156984187468337e-05, "loss": 0.156, "step": 1688 }, { "epoch": 0.3545340050377834, "grad_norm": 0.11676156520843506, "learning_rate": 8.154141983518265e-05, "loss": 0.1443, "step": 1689 }, { "epoch": 0.3547439126784215, "grad_norm": 0.10480359196662903, "learning_rate": 8.151298085721899e-05, "loss": 0.1675, "step": 1690 }, { "epoch": 0.3549538203190596, "grad_norm": 0.11646043509244919, "learning_rate": 8.148452495606478e-05, "loss": 0.1576, "step": 1691 }, { "epoch": 0.35516372795969775, "grad_norm": 0.1242692843079567, "learning_rate": 8.14560521470015e-05, "loss": 0.1486, "step": 1692 }, { "epoch": 0.35537363560033586, "grad_norm": 0.09616512805223465, "learning_rate": 8.142756244531967e-05, "loss": 0.1522, "step": 1693 }, { "epoch": 0.35558354324097396, "grad_norm": 0.139311745762825, "learning_rate": 8.13990558663189e-05, "loss": 0.1613, "step": 1694 }, { "epoch": 0.35579345088161207, "grad_norm": 0.11522821336984634, "learning_rate": 8.137053242530795e-05, "loss": 0.1603, "step": 1695 }, { "epoch": 0.35600335852225024, "grad_norm": 0.11472437530755997, "learning_rate": 8.134199213760447e-05, "loss": 0.1537, "step": 1696 }, { "epoch": 0.35621326616288834, "grad_norm": 0.14459721744060516, "learning_rate": 8.13134350185353e-05, "loss": 0.151, "step": 1697 }, { "epoch": 0.35642317380352645, "grad_norm": 0.12816378474235535, "learning_rate": 8.128486108343625e-05, "loss": 0.1677, "step": 1698 }, { "epoch": 0.35663308144416456, "grad_norm": 0.11897154897451401, "learning_rate": 8.125627034765218e-05, "loss": 0.1802, "step": 1699 }, { "epoch": 0.35684298908480266, "grad_norm": 0.12595590949058533, "learning_rate": 8.122766282653696e-05, "loss": 0.1668, "step": 1700 }, { "epoch": 0.3570528967254408, "grad_norm": 0.1388867050409317, "learning_rate": 8.119903853545348e-05, "loss": 0.1826, "step": 1701 }, { "epoch": 0.35726280436607893, "grad_norm": 0.1385848969221115, "learning_rate": 8.117039748977366e-05, "loss": 0.1581, "step": 1702 }, { "epoch": 0.35747271200671704, "grad_norm": 0.14706510305404663, "learning_rate": 8.114173970487838e-05, "loss": 0.1618, "step": 1703 }, { "epoch": 0.35768261964735515, "grad_norm": 0.15850181877613068, "learning_rate": 8.111306519615751e-05, "loss": 0.1527, "step": 1704 }, { "epoch": 0.35789252728799326, "grad_norm": 0.11479350924491882, "learning_rate": 8.108437397900995e-05, "loss": 0.166, "step": 1705 }, { "epoch": 0.3581024349286314, "grad_norm": 0.14089131355285645, "learning_rate": 8.10556660688435e-05, "loss": 0.1599, "step": 1706 }, { "epoch": 0.3583123425692695, "grad_norm": 0.11980936676263809, "learning_rate": 8.102694148107499e-05, "loss": 0.1685, "step": 1707 }, { "epoch": 0.35852225020990763, "grad_norm": 0.12391865998506546, "learning_rate": 8.099820023113018e-05, "loss": 0.164, "step": 1708 }, { "epoch": 0.35873215785054574, "grad_norm": 0.11180952191352844, "learning_rate": 8.096944233444379e-05, "loss": 0.1498, "step": 1709 }, { "epoch": 0.3589420654911839, "grad_norm": 0.12841565907001495, "learning_rate": 8.094066780645943e-05, "loss": 0.1606, "step": 1710 }, { "epoch": 0.359151973131822, "grad_norm": 0.11051752418279648, "learning_rate": 8.09118766626297e-05, "loss": 0.1519, "step": 1711 }, { "epoch": 0.3593618807724601, "grad_norm": 0.12816421687602997, "learning_rate": 8.088306891841609e-05, "loss": 0.1657, "step": 1712 }, { "epoch": 0.3595717884130982, "grad_norm": 0.10819518566131592, "learning_rate": 8.085424458928904e-05, "loss": 0.1805, "step": 1713 }, { "epoch": 0.35978169605373633, "grad_norm": 0.10757758468389511, "learning_rate": 8.082540369072786e-05, "loss": 0.1524, "step": 1714 }, { "epoch": 0.3599916036943745, "grad_norm": 0.13418933749198914, "learning_rate": 8.079654623822077e-05, "loss": 0.1623, "step": 1715 }, { "epoch": 0.3602015113350126, "grad_norm": 0.11752188950777054, "learning_rate": 8.076767224726487e-05, "loss": 0.1736, "step": 1716 }, { "epoch": 0.3604114189756507, "grad_norm": 0.1459878832101822, "learning_rate": 8.073878173336618e-05, "loss": 0.1502, "step": 1717 }, { "epoch": 0.3606213266162888, "grad_norm": 0.10812754929065704, "learning_rate": 8.070987471203955e-05, "loss": 0.1484, "step": 1718 }, { "epoch": 0.3608312342569269, "grad_norm": 0.1168069913983345, "learning_rate": 8.06809511988087e-05, "loss": 0.1355, "step": 1719 }, { "epoch": 0.3610411418975651, "grad_norm": 0.133521169424057, "learning_rate": 8.065201120920625e-05, "loss": 0.1707, "step": 1720 }, { "epoch": 0.3612510495382032, "grad_norm": 0.12936338782310486, "learning_rate": 8.062305475877358e-05, "loss": 0.1518, "step": 1721 }, { "epoch": 0.3614609571788413, "grad_norm": 0.11167915165424347, "learning_rate": 8.059408186306104e-05, "loss": 0.1365, "step": 1722 }, { "epoch": 0.3616708648194794, "grad_norm": 0.1492997705936432, "learning_rate": 8.056509253762768e-05, "loss": 0.1719, "step": 1723 }, { "epoch": 0.3618807724601176, "grad_norm": 0.1078883707523346, "learning_rate": 8.053608679804144e-05, "loss": 0.1466, "step": 1724 }, { "epoch": 0.3620906801007557, "grad_norm": 0.11772193014621735, "learning_rate": 8.050706465987907e-05, "loss": 0.1607, "step": 1725 }, { "epoch": 0.3623005877413938, "grad_norm": 0.11475538462400436, "learning_rate": 8.047802613872615e-05, "loss": 0.1727, "step": 1726 }, { "epoch": 0.3625104953820319, "grad_norm": 0.10466550290584564, "learning_rate": 8.044897125017696e-05, "loss": 0.1586, "step": 1727 }, { "epoch": 0.36272040302267, "grad_norm": 0.13003644347190857, "learning_rate": 8.041990000983472e-05, "loss": 0.168, "step": 1728 }, { "epoch": 0.36293031066330816, "grad_norm": 0.11580996215343475, "learning_rate": 8.039081243331128e-05, "loss": 0.1532, "step": 1729 }, { "epoch": 0.36314021830394627, "grad_norm": 0.11512108892202377, "learning_rate": 8.036170853622738e-05, "loss": 0.153, "step": 1730 }, { "epoch": 0.3633501259445844, "grad_norm": 0.14258354902267456, "learning_rate": 8.033258833421246e-05, "loss": 0.1542, "step": 1731 }, { "epoch": 0.3635600335852225, "grad_norm": 0.11443816125392914, "learning_rate": 8.030345184290473e-05, "loss": 0.1523, "step": 1732 }, { "epoch": 0.36376994122586065, "grad_norm": 0.098850317299366, "learning_rate": 8.027429907795116e-05, "loss": 0.1413, "step": 1733 }, { "epoch": 0.36397984886649876, "grad_norm": 0.09741919487714767, "learning_rate": 8.024513005500744e-05, "loss": 0.1482, "step": 1734 }, { "epoch": 0.36418975650713686, "grad_norm": 0.12056808918714523, "learning_rate": 8.021594478973801e-05, "loss": 0.1621, "step": 1735 }, { "epoch": 0.36439966414777497, "grad_norm": 0.10978755354881287, "learning_rate": 8.018674329781603e-05, "loss": 0.1578, "step": 1736 }, { "epoch": 0.3646095717884131, "grad_norm": 0.12888531386852264, "learning_rate": 8.015752559492335e-05, "loss": 0.1552, "step": 1737 }, { "epoch": 0.36481947942905124, "grad_norm": 0.12127751111984253, "learning_rate": 8.012829169675056e-05, "loss": 0.165, "step": 1738 }, { "epoch": 0.36502938706968935, "grad_norm": 0.10316291451454163, "learning_rate": 8.009904161899692e-05, "loss": 0.1306, "step": 1739 }, { "epoch": 0.36523929471032746, "grad_norm": 0.12621663510799408, "learning_rate": 8.006977537737043e-05, "loss": 0.1586, "step": 1740 }, { "epoch": 0.36544920235096556, "grad_norm": 0.13226893544197083, "learning_rate": 8.004049298758767e-05, "loss": 0.1432, "step": 1741 }, { "epoch": 0.36565910999160367, "grad_norm": 0.11602537333965302, "learning_rate": 8.001119446537397e-05, "loss": 0.152, "step": 1742 }, { "epoch": 0.36586901763224183, "grad_norm": 0.10899031907320023, "learning_rate": 7.998187982646332e-05, "loss": 0.1489, "step": 1743 }, { "epoch": 0.36607892527287994, "grad_norm": 0.11780837923288345, "learning_rate": 7.995254908659836e-05, "loss": 0.166, "step": 1744 }, { "epoch": 0.36628883291351805, "grad_norm": 0.13604290783405304, "learning_rate": 7.992320226153032e-05, "loss": 0.1731, "step": 1745 }, { "epoch": 0.36649874055415615, "grad_norm": 0.1298649162054062, "learning_rate": 7.989383936701916e-05, "loss": 0.151, "step": 1746 }, { "epoch": 0.3667086481947943, "grad_norm": 0.12146075814962387, "learning_rate": 7.986446041883342e-05, "loss": 0.1674, "step": 1747 }, { "epoch": 0.3669185558354324, "grad_norm": 0.13171765208244324, "learning_rate": 7.983506543275025e-05, "loss": 0.147, "step": 1748 }, { "epoch": 0.36712846347607053, "grad_norm": 0.13356606662273407, "learning_rate": 7.980565442455545e-05, "loss": 0.1724, "step": 1749 }, { "epoch": 0.36733837111670864, "grad_norm": 0.13275739550590515, "learning_rate": 7.977622741004338e-05, "loss": 0.1586, "step": 1750 }, { "epoch": 0.36754827875734675, "grad_norm": 0.1208769828081131, "learning_rate": 7.974678440501703e-05, "loss": 0.1558, "step": 1751 }, { "epoch": 0.3677581863979849, "grad_norm": 0.15097500383853912, "learning_rate": 7.971732542528798e-05, "loss": 0.157, "step": 1752 }, { "epoch": 0.367968094038623, "grad_norm": 0.13939009606838226, "learning_rate": 7.968785048667634e-05, "loss": 0.1483, "step": 1753 }, { "epoch": 0.3681780016792611, "grad_norm": 0.1147853285074234, "learning_rate": 7.965835960501086e-05, "loss": 0.1638, "step": 1754 }, { "epoch": 0.36838790931989923, "grad_norm": 0.10615186393260956, "learning_rate": 7.96288527961288e-05, "loss": 0.1674, "step": 1755 }, { "epoch": 0.36859781696053734, "grad_norm": 0.10610269010066986, "learning_rate": 7.959933007587598e-05, "loss": 0.1598, "step": 1756 }, { "epoch": 0.3688077246011755, "grad_norm": 0.11840075999498367, "learning_rate": 7.956979146010683e-05, "loss": 0.1673, "step": 1757 }, { "epoch": 0.3690176322418136, "grad_norm": 0.1263597160577774, "learning_rate": 7.954023696468418e-05, "loss": 0.1332, "step": 1758 }, { "epoch": 0.3692275398824517, "grad_norm": 0.13218241930007935, "learning_rate": 7.951066660547952e-05, "loss": 0.165, "step": 1759 }, { "epoch": 0.3694374475230898, "grad_norm": 0.11182461678981781, "learning_rate": 7.948108039837281e-05, "loss": 0.175, "step": 1760 }, { "epoch": 0.369647355163728, "grad_norm": 0.12093862891197205, "learning_rate": 7.94514783592525e-05, "loss": 0.17, "step": 1761 }, { "epoch": 0.3698572628043661, "grad_norm": 0.1129969134926796, "learning_rate": 7.942186050401556e-05, "loss": 0.1605, "step": 1762 }, { "epoch": 0.3700671704450042, "grad_norm": 0.11315370351076126, "learning_rate": 7.939222684856747e-05, "loss": 0.1724, "step": 1763 }, { "epoch": 0.3702770780856423, "grad_norm": 0.12104691565036774, "learning_rate": 7.936257740882218e-05, "loss": 0.1693, "step": 1764 }, { "epoch": 0.3704869857262804, "grad_norm": 0.12000355124473572, "learning_rate": 7.933291220070209e-05, "loss": 0.1855, "step": 1765 }, { "epoch": 0.3706968933669186, "grad_norm": 0.11460446566343307, "learning_rate": 7.930323124013812e-05, "loss": 0.1531, "step": 1766 }, { "epoch": 0.3709068010075567, "grad_norm": 0.12112744152545929, "learning_rate": 7.927353454306962e-05, "loss": 0.166, "step": 1767 }, { "epoch": 0.3711167086481948, "grad_norm": 0.11343761533498764, "learning_rate": 7.92438221254444e-05, "loss": 0.1718, "step": 1768 }, { "epoch": 0.3713266162888329, "grad_norm": 0.09574848413467407, "learning_rate": 7.921409400321869e-05, "loss": 0.1552, "step": 1769 }, { "epoch": 0.371536523929471, "grad_norm": 0.12521985173225403, "learning_rate": 7.918435019235719e-05, "loss": 0.173, "step": 1770 }, { "epoch": 0.37174643157010917, "grad_norm": 0.13208284974098206, "learning_rate": 7.915459070883297e-05, "loss": 0.1461, "step": 1771 }, { "epoch": 0.3719563392107473, "grad_norm": 0.13827428221702576, "learning_rate": 7.91248155686276e-05, "loss": 0.1763, "step": 1772 }, { "epoch": 0.3721662468513854, "grad_norm": 0.11231647431850433, "learning_rate": 7.909502478773099e-05, "loss": 0.1507, "step": 1773 }, { "epoch": 0.3723761544920235, "grad_norm": 0.13902410864830017, "learning_rate": 7.906521838214147e-05, "loss": 0.1658, "step": 1774 }, { "epoch": 0.37258606213266166, "grad_norm": 0.1284346878528595, "learning_rate": 7.903539636786574e-05, "loss": 0.1473, "step": 1775 }, { "epoch": 0.37279596977329976, "grad_norm": 0.09582658857107162, "learning_rate": 7.900555876091894e-05, "loss": 0.1578, "step": 1776 }, { "epoch": 0.37300587741393787, "grad_norm": 0.11832595616579056, "learning_rate": 7.897570557732452e-05, "loss": 0.1528, "step": 1777 }, { "epoch": 0.373215785054576, "grad_norm": 0.12966689467430115, "learning_rate": 7.894583683311434e-05, "loss": 0.1861, "step": 1778 }, { "epoch": 0.3734256926952141, "grad_norm": 0.11145227402448654, "learning_rate": 7.891595254432857e-05, "loss": 0.1555, "step": 1779 }, { "epoch": 0.37363560033585225, "grad_norm": 0.13669519126415253, "learning_rate": 7.888605272701578e-05, "loss": 0.1618, "step": 1780 }, { "epoch": 0.37384550797649035, "grad_norm": 0.12473908811807632, "learning_rate": 7.885613739723285e-05, "loss": 0.1604, "step": 1781 }, { "epoch": 0.37405541561712846, "grad_norm": 0.10881557315587997, "learning_rate": 7.8826206571045e-05, "loss": 0.1701, "step": 1782 }, { "epoch": 0.37426532325776657, "grad_norm": 0.11604516953229904, "learning_rate": 7.879626026452576e-05, "loss": 0.1467, "step": 1783 }, { "epoch": 0.3744752308984047, "grad_norm": 0.11594154685735703, "learning_rate": 7.876629849375696e-05, "loss": 0.181, "step": 1784 }, { "epoch": 0.37468513853904284, "grad_norm": 0.11912267655134201, "learning_rate": 7.87363212748288e-05, "loss": 0.1451, "step": 1785 }, { "epoch": 0.37489504617968095, "grad_norm": 0.11206775903701782, "learning_rate": 7.87063286238397e-05, "loss": 0.1706, "step": 1786 }, { "epoch": 0.37510495382031905, "grad_norm": 0.12397405505180359, "learning_rate": 7.86763205568964e-05, "loss": 0.1599, "step": 1787 }, { "epoch": 0.37531486146095716, "grad_norm": 0.1209942027926445, "learning_rate": 7.864629709011395e-05, "loss": 0.139, "step": 1788 }, { "epoch": 0.3755247691015953, "grad_norm": 0.10950622707605362, "learning_rate": 7.86162582396156e-05, "loss": 0.1625, "step": 1789 }, { "epoch": 0.37573467674223343, "grad_norm": 0.12611068785190582, "learning_rate": 7.85862040215329e-05, "loss": 0.1719, "step": 1790 }, { "epoch": 0.37594458438287154, "grad_norm": 0.1125999391078949, "learning_rate": 7.855613445200567e-05, "loss": 0.1732, "step": 1791 }, { "epoch": 0.37615449202350965, "grad_norm": 0.12845373153686523, "learning_rate": 7.852604954718195e-05, "loss": 0.1477, "step": 1792 }, { "epoch": 0.37636439966414775, "grad_norm": 0.17373661696910858, "learning_rate": 7.8495949323218e-05, "loss": 0.158, "step": 1793 }, { "epoch": 0.3765743073047859, "grad_norm": 0.10421296954154968, "learning_rate": 7.846583379627836e-05, "loss": 0.1683, "step": 1794 }, { "epoch": 0.376784214945424, "grad_norm": 0.1311960518360138, "learning_rate": 7.843570298253577e-05, "loss": 0.1361, "step": 1795 }, { "epoch": 0.37699412258606213, "grad_norm": 0.17410288751125336, "learning_rate": 7.840555689817111e-05, "loss": 0.1678, "step": 1796 }, { "epoch": 0.37720403022670024, "grad_norm": 0.11635412275791168, "learning_rate": 7.837539555937354e-05, "loss": 0.1582, "step": 1797 }, { "epoch": 0.37741393786733834, "grad_norm": 0.1161051094532013, "learning_rate": 7.83452189823404e-05, "loss": 0.1686, "step": 1798 }, { "epoch": 0.3776238455079765, "grad_norm": 0.11930254846811295, "learning_rate": 7.831502718327719e-05, "loss": 0.1717, "step": 1799 }, { "epoch": 0.3778337531486146, "grad_norm": 0.12649065256118774, "learning_rate": 7.82848201783976e-05, "loss": 0.1418, "step": 1800 }, { "epoch": 0.3780436607892527, "grad_norm": 0.1233733594417572, "learning_rate": 7.825459798392349e-05, "loss": 0.1412, "step": 1801 }, { "epoch": 0.37825356842989083, "grad_norm": 0.12751632928848267, "learning_rate": 7.822436061608485e-05, "loss": 0.1591, "step": 1802 }, { "epoch": 0.378463476070529, "grad_norm": 0.11433545500040054, "learning_rate": 7.819410809111984e-05, "loss": 0.1608, "step": 1803 }, { "epoch": 0.3786733837111671, "grad_norm": 0.12332860380411148, "learning_rate": 7.816384042527479e-05, "loss": 0.1544, "step": 1804 }, { "epoch": 0.3788832913518052, "grad_norm": 0.14029990136623383, "learning_rate": 7.813355763480407e-05, "loss": 0.1652, "step": 1805 }, { "epoch": 0.3790931989924433, "grad_norm": 0.12804871797561646, "learning_rate": 7.810325973597027e-05, "loss": 0.1464, "step": 1806 }, { "epoch": 0.3793031066330814, "grad_norm": 0.10844779014587402, "learning_rate": 7.807294674504405e-05, "loss": 0.1517, "step": 1807 }, { "epoch": 0.3795130142737196, "grad_norm": 0.1057022362947464, "learning_rate": 7.804261867830418e-05, "loss": 0.1558, "step": 1808 }, { "epoch": 0.3797229219143577, "grad_norm": 0.1034378632903099, "learning_rate": 7.801227555203751e-05, "loss": 0.1519, "step": 1809 }, { "epoch": 0.3799328295549958, "grad_norm": 0.13120600581169128, "learning_rate": 7.7981917382539e-05, "loss": 0.1689, "step": 1810 }, { "epoch": 0.3801427371956339, "grad_norm": 0.14298629760742188, "learning_rate": 7.795154418611169e-05, "loss": 0.1609, "step": 1811 }, { "epoch": 0.380352644836272, "grad_norm": 0.1206570416688919, "learning_rate": 7.792115597906666e-05, "loss": 0.1481, "step": 1812 }, { "epoch": 0.3805625524769102, "grad_norm": 0.12700314819812775, "learning_rate": 7.789075277772309e-05, "loss": 0.1526, "step": 1813 }, { "epoch": 0.3807724601175483, "grad_norm": 0.13419455289840698, "learning_rate": 7.786033459840817e-05, "loss": 0.1665, "step": 1814 }, { "epoch": 0.3809823677581864, "grad_norm": 0.11350502818822861, "learning_rate": 7.78299014574572e-05, "loss": 0.1293, "step": 1815 }, { "epoch": 0.3811922753988245, "grad_norm": 0.12667128443717957, "learning_rate": 7.779945337121343e-05, "loss": 0.1474, "step": 1816 }, { "epoch": 0.38140218303946266, "grad_norm": 0.14447607100009918, "learning_rate": 7.77689903560282e-05, "loss": 0.1642, "step": 1817 }, { "epoch": 0.38161209068010077, "grad_norm": 0.1254795789718628, "learning_rate": 7.773851242826083e-05, "loss": 0.1565, "step": 1818 }, { "epoch": 0.3818219983207389, "grad_norm": 0.1166321262717247, "learning_rate": 7.770801960427869e-05, "loss": 0.1583, "step": 1819 }, { "epoch": 0.382031905961377, "grad_norm": 0.11959258466959, "learning_rate": 7.767751190045706e-05, "loss": 0.1623, "step": 1820 }, { "epoch": 0.3822418136020151, "grad_norm": 0.09385194629430771, "learning_rate": 7.764698933317935e-05, "loss": 0.1525, "step": 1821 }, { "epoch": 0.38245172124265325, "grad_norm": 0.1065850630402565, "learning_rate": 7.761645191883684e-05, "loss": 0.1538, "step": 1822 }, { "epoch": 0.38266162888329136, "grad_norm": 0.13342434167861938, "learning_rate": 7.758589967382883e-05, "loss": 0.1488, "step": 1823 }, { "epoch": 0.38287153652392947, "grad_norm": 0.13576741516590118, "learning_rate": 7.755533261456256e-05, "loss": 0.1693, "step": 1824 }, { "epoch": 0.3830814441645676, "grad_norm": 0.12756936252117157, "learning_rate": 7.752475075745325e-05, "loss": 0.1573, "step": 1825 }, { "epoch": 0.3832913518052057, "grad_norm": 0.1328425407409668, "learning_rate": 7.749415411892404e-05, "loss": 0.142, "step": 1826 }, { "epoch": 0.38350125944584385, "grad_norm": 0.1451667994260788, "learning_rate": 7.746354271540605e-05, "loss": 0.1544, "step": 1827 }, { "epoch": 0.38371116708648195, "grad_norm": 0.13675998151302338, "learning_rate": 7.743291656333826e-05, "loss": 0.1576, "step": 1828 }, { "epoch": 0.38392107472712006, "grad_norm": 0.11071248352527618, "learning_rate": 7.740227567916767e-05, "loss": 0.1636, "step": 1829 }, { "epoch": 0.38413098236775817, "grad_norm": 0.13009920716285706, "learning_rate": 7.73716200793491e-05, "loss": 0.1705, "step": 1830 }, { "epoch": 0.38434089000839633, "grad_norm": 0.13077101111412048, "learning_rate": 7.734094978034532e-05, "loss": 0.1715, "step": 1831 }, { "epoch": 0.38455079764903444, "grad_norm": 0.11513860523700714, "learning_rate": 7.731026479862696e-05, "loss": 0.1584, "step": 1832 }, { "epoch": 0.38476070528967254, "grad_norm": 0.10579150915145874, "learning_rate": 7.727956515067259e-05, "loss": 0.1618, "step": 1833 }, { "epoch": 0.38497061293031065, "grad_norm": 0.1078706756234169, "learning_rate": 7.724885085296858e-05, "loss": 0.167, "step": 1834 }, { "epoch": 0.38518052057094876, "grad_norm": 0.131106436252594, "learning_rate": 7.721812192200924e-05, "loss": 0.1525, "step": 1835 }, { "epoch": 0.3853904282115869, "grad_norm": 0.15161815285682678, "learning_rate": 7.718737837429668e-05, "loss": 0.1523, "step": 1836 }, { "epoch": 0.38560033585222503, "grad_norm": 0.11949379742145538, "learning_rate": 7.715662022634094e-05, "loss": 0.1543, "step": 1837 }, { "epoch": 0.38581024349286314, "grad_norm": 0.16841568052768707, "learning_rate": 7.712584749465981e-05, "loss": 0.1754, "step": 1838 }, { "epoch": 0.38602015113350124, "grad_norm": 0.1119508296251297, "learning_rate": 7.709506019577895e-05, "loss": 0.16, "step": 1839 }, { "epoch": 0.38623005877413935, "grad_norm": 0.12148314714431763, "learning_rate": 7.706425834623184e-05, "loss": 0.1545, "step": 1840 }, { "epoch": 0.3864399664147775, "grad_norm": 0.13948237895965576, "learning_rate": 7.703344196255979e-05, "loss": 0.156, "step": 1841 }, { "epoch": 0.3866498740554156, "grad_norm": 0.11464816331863403, "learning_rate": 7.700261106131191e-05, "loss": 0.172, "step": 1842 }, { "epoch": 0.38685978169605373, "grad_norm": 0.11518428474664688, "learning_rate": 7.697176565904507e-05, "loss": 0.171, "step": 1843 }, { "epoch": 0.38706968933669184, "grad_norm": 0.11466710269451141, "learning_rate": 7.694090577232398e-05, "loss": 0.1545, "step": 1844 }, { "epoch": 0.38727959697733, "grad_norm": 0.13466320931911469, "learning_rate": 7.691003141772108e-05, "loss": 0.1487, "step": 1845 }, { "epoch": 0.3874895046179681, "grad_norm": 0.1101074293255806, "learning_rate": 7.687914261181664e-05, "loss": 0.1659, "step": 1846 }, { "epoch": 0.3876994122586062, "grad_norm": 0.12081106752157211, "learning_rate": 7.684823937119862e-05, "loss": 0.1558, "step": 1847 }, { "epoch": 0.3879093198992443, "grad_norm": 0.12375544011592865, "learning_rate": 7.681732171246276e-05, "loss": 0.1626, "step": 1848 }, { "epoch": 0.3881192275398824, "grad_norm": 0.13245245814323425, "learning_rate": 7.678638965221262e-05, "loss": 0.1445, "step": 1849 }, { "epoch": 0.3883291351805206, "grad_norm": 0.12885937094688416, "learning_rate": 7.675544320705932e-05, "loss": 0.151, "step": 1850 }, { "epoch": 0.3885390428211587, "grad_norm": 0.15650451183319092, "learning_rate": 7.67244823936219e-05, "loss": 0.1456, "step": 1851 }, { "epoch": 0.3887489504617968, "grad_norm": 0.13623982667922974, "learning_rate": 7.669350722852693e-05, "loss": 0.1726, "step": 1852 }, { "epoch": 0.3889588581024349, "grad_norm": 0.1482025533914566, "learning_rate": 7.666251772840887e-05, "loss": 0.1594, "step": 1853 }, { "epoch": 0.3891687657430731, "grad_norm": 0.12205847352743149, "learning_rate": 7.663151390990975e-05, "loss": 0.1656, "step": 1854 }, { "epoch": 0.3893786733837112, "grad_norm": 0.11122436821460724, "learning_rate": 7.660049578967934e-05, "loss": 0.1674, "step": 1855 }, { "epoch": 0.3895885810243493, "grad_norm": 0.13141264021396637, "learning_rate": 7.656946338437505e-05, "loss": 0.1543, "step": 1856 }, { "epoch": 0.3897984886649874, "grad_norm": 0.12650364637374878, "learning_rate": 7.653841671066203e-05, "loss": 0.1469, "step": 1857 }, { "epoch": 0.3900083963056255, "grad_norm": 0.12014992535114288, "learning_rate": 7.650735578521302e-05, "loss": 0.1541, "step": 1858 }, { "epoch": 0.39021830394626367, "grad_norm": 0.1951693892478943, "learning_rate": 7.647628062470848e-05, "loss": 0.1653, "step": 1859 }, { "epoch": 0.3904282115869018, "grad_norm": 0.1394760012626648, "learning_rate": 7.644519124583643e-05, "loss": 0.1573, "step": 1860 }, { "epoch": 0.3906381192275399, "grad_norm": 0.14827001094818115, "learning_rate": 7.641408766529266e-05, "loss": 0.1585, "step": 1861 }, { "epoch": 0.390848026868178, "grad_norm": 0.1227707639336586, "learning_rate": 7.63829698997804e-05, "loss": 0.1548, "step": 1862 }, { "epoch": 0.3910579345088161, "grad_norm": 0.13828247785568237, "learning_rate": 7.635183796601068e-05, "loss": 0.1789, "step": 1863 }, { "epoch": 0.39126784214945426, "grad_norm": 0.13925831019878387, "learning_rate": 7.632069188070204e-05, "loss": 0.1699, "step": 1864 }, { "epoch": 0.39147774979009237, "grad_norm": 0.13822948932647705, "learning_rate": 7.628953166058062e-05, "loss": 0.1674, "step": 1865 }, { "epoch": 0.3916876574307305, "grad_norm": 0.14011485874652863, "learning_rate": 7.625835732238018e-05, "loss": 0.1778, "step": 1866 }, { "epoch": 0.3918975650713686, "grad_norm": 0.1188986524939537, "learning_rate": 7.622716888284205e-05, "loss": 0.1563, "step": 1867 }, { "epoch": 0.39210747271200674, "grad_norm": 0.10080984979867935, "learning_rate": 7.619596635871513e-05, "loss": 0.169, "step": 1868 }, { "epoch": 0.39231738035264485, "grad_norm": 0.11640693992376328, "learning_rate": 7.616474976675592e-05, "loss": 0.1699, "step": 1869 }, { "epoch": 0.39252728799328296, "grad_norm": 0.1325574517250061, "learning_rate": 7.613351912372842e-05, "loss": 0.1727, "step": 1870 }, { "epoch": 0.39273719563392107, "grad_norm": 0.12673214077949524, "learning_rate": 7.610227444640417e-05, "loss": 0.1579, "step": 1871 }, { "epoch": 0.3929471032745592, "grad_norm": 0.10220383107662201, "learning_rate": 7.607101575156232e-05, "loss": 0.1559, "step": 1872 }, { "epoch": 0.39315701091519734, "grad_norm": 0.11445717513561249, "learning_rate": 7.603974305598948e-05, "loss": 0.1609, "step": 1873 }, { "epoch": 0.39336691855583544, "grad_norm": 0.10287608951330185, "learning_rate": 7.600845637647981e-05, "loss": 0.1442, "step": 1874 }, { "epoch": 0.39357682619647355, "grad_norm": 0.11501127481460571, "learning_rate": 7.597715572983499e-05, "loss": 0.162, "step": 1875 }, { "epoch": 0.39378673383711166, "grad_norm": 0.18507914245128632, "learning_rate": 7.594584113286413e-05, "loss": 0.1611, "step": 1876 }, { "epoch": 0.39399664147774976, "grad_norm": 0.11232730746269226, "learning_rate": 7.591451260238394e-05, "loss": 0.1555, "step": 1877 }, { "epoch": 0.39420654911838793, "grad_norm": 0.0994970053434372, "learning_rate": 7.588317015521853e-05, "loss": 0.1331, "step": 1878 }, { "epoch": 0.39441645675902604, "grad_norm": 0.11033397912979126, "learning_rate": 7.58518138081995e-05, "loss": 0.1611, "step": 1879 }, { "epoch": 0.39462636439966414, "grad_norm": 0.12892796099185944, "learning_rate": 7.582044357816593e-05, "loss": 0.1597, "step": 1880 }, { "epoch": 0.39483627204030225, "grad_norm": 0.13372941315174103, "learning_rate": 7.578905948196437e-05, "loss": 0.1605, "step": 1881 }, { "epoch": 0.3950461796809404, "grad_norm": 0.11890514194965363, "learning_rate": 7.57576615364488e-05, "loss": 0.1598, "step": 1882 }, { "epoch": 0.3952560873215785, "grad_norm": 0.14822334051132202, "learning_rate": 7.57262497584806e-05, "loss": 0.1801, "step": 1883 }, { "epoch": 0.3954659949622166, "grad_norm": 0.1431664526462555, "learning_rate": 7.569482416492863e-05, "loss": 0.1632, "step": 1884 }, { "epoch": 0.39567590260285473, "grad_norm": 0.11748656630516052, "learning_rate": 7.566338477266914e-05, "loss": 0.1479, "step": 1885 }, { "epoch": 0.39588581024349284, "grad_norm": 0.11131531000137329, "learning_rate": 7.563193159858584e-05, "loss": 0.1545, "step": 1886 }, { "epoch": 0.396095717884131, "grad_norm": 0.11406468600034714, "learning_rate": 7.560046465956975e-05, "loss": 0.1505, "step": 1887 }, { "epoch": 0.3963056255247691, "grad_norm": 0.12816153466701508, "learning_rate": 7.556898397251938e-05, "loss": 0.1709, "step": 1888 }, { "epoch": 0.3965155331654072, "grad_norm": 0.11210453510284424, "learning_rate": 7.553748955434054e-05, "loss": 0.1575, "step": 1889 }, { "epoch": 0.3967254408060453, "grad_norm": 0.11992809921503067, "learning_rate": 7.55059814219465e-05, "loss": 0.1396, "step": 1890 }, { "epoch": 0.39693534844668343, "grad_norm": 0.11076368391513824, "learning_rate": 7.54744595922578e-05, "loss": 0.1574, "step": 1891 }, { "epoch": 0.3971452560873216, "grad_norm": 0.12722079455852509, "learning_rate": 7.544292408220241e-05, "loss": 0.1595, "step": 1892 }, { "epoch": 0.3973551637279597, "grad_norm": 0.12932024896144867, "learning_rate": 7.541137490871559e-05, "loss": 0.1822, "step": 1893 }, { "epoch": 0.3975650713685978, "grad_norm": 0.1031857281923294, "learning_rate": 7.537981208874002e-05, "loss": 0.1624, "step": 1894 }, { "epoch": 0.3977749790092359, "grad_norm": 0.10605569183826447, "learning_rate": 7.534823563922558e-05, "loss": 0.1553, "step": 1895 }, { "epoch": 0.3979848866498741, "grad_norm": 0.11709260195493698, "learning_rate": 7.531664557712962e-05, "loss": 0.1636, "step": 1896 }, { "epoch": 0.3981947942905122, "grad_norm": 0.12204380333423615, "learning_rate": 7.528504191941668e-05, "loss": 0.1446, "step": 1897 }, { "epoch": 0.3984047019311503, "grad_norm": 0.1183619275689125, "learning_rate": 7.525342468305865e-05, "loss": 0.1715, "step": 1898 }, { "epoch": 0.3986146095717884, "grad_norm": 0.15471158921718597, "learning_rate": 7.522179388503471e-05, "loss": 0.1648, "step": 1899 }, { "epoch": 0.3988245172124265, "grad_norm": 0.11495169252157211, "learning_rate": 7.519014954233136e-05, "loss": 0.1752, "step": 1900 }, { "epoch": 0.3990344248530647, "grad_norm": 0.1107286810874939, "learning_rate": 7.515849167194227e-05, "loss": 0.1535, "step": 1901 }, { "epoch": 0.3992443324937028, "grad_norm": 0.13978683948516846, "learning_rate": 7.51268202908685e-05, "loss": 0.1535, "step": 1902 }, { "epoch": 0.3994542401343409, "grad_norm": 0.13453927636146545, "learning_rate": 7.509513541611826e-05, "loss": 0.154, "step": 1903 }, { "epoch": 0.399664147774979, "grad_norm": 0.14580851793289185, "learning_rate": 7.506343706470707e-05, "loss": 0.1758, "step": 1904 }, { "epoch": 0.3998740554156171, "grad_norm": 0.14356370270252228, "learning_rate": 7.503172525365766e-05, "loss": 0.1781, "step": 1905 }, { "epoch": 0.40008396305625527, "grad_norm": 0.1313529908657074, "learning_rate": 7.500000000000001e-05, "loss": 0.1627, "step": 1906 }, { "epoch": 0.4002938706968934, "grad_norm": 0.11975359171628952, "learning_rate": 7.49682613207713e-05, "loss": 0.1542, "step": 1907 }, { "epoch": 0.4005037783375315, "grad_norm": 0.1087488904595375, "learning_rate": 7.493650923301592e-05, "loss": 0.1641, "step": 1908 }, { "epoch": 0.4007136859781696, "grad_norm": 0.1269085705280304, "learning_rate": 7.490474375378546e-05, "loss": 0.1627, "step": 1909 }, { "epoch": 0.40092359361880775, "grad_norm": 0.1457659900188446, "learning_rate": 7.487296490013875e-05, "loss": 0.1808, "step": 1910 }, { "epoch": 0.40113350125944586, "grad_norm": 0.1324755698442459, "learning_rate": 7.484117268914171e-05, "loss": 0.1625, "step": 1911 }, { "epoch": 0.40134340890008396, "grad_norm": 0.14338278770446777, "learning_rate": 7.48093671378675e-05, "loss": 0.1551, "step": 1912 }, { "epoch": 0.40155331654072207, "grad_norm": 0.13183006644248962, "learning_rate": 7.477754826339645e-05, "loss": 0.1611, "step": 1913 }, { "epoch": 0.4017632241813602, "grad_norm": 0.11733553558588028, "learning_rate": 7.474571608281599e-05, "loss": 0.1514, "step": 1914 }, { "epoch": 0.40197313182199834, "grad_norm": 0.12164483219385147, "learning_rate": 7.471387061322074e-05, "loss": 0.1609, "step": 1915 }, { "epoch": 0.40218303946263645, "grad_norm": 0.1412167251110077, "learning_rate": 7.468201187171245e-05, "loss": 0.1594, "step": 1916 }, { "epoch": 0.40239294710327456, "grad_norm": 0.14494703710079193, "learning_rate": 7.46501398754e-05, "loss": 0.1702, "step": 1917 }, { "epoch": 0.40260285474391266, "grad_norm": 0.1307775378227234, "learning_rate": 7.461825464139934e-05, "loss": 0.1626, "step": 1918 }, { "epoch": 0.40281276238455077, "grad_norm": 0.11550642549991608, "learning_rate": 7.458635618683362e-05, "loss": 0.1708, "step": 1919 }, { "epoch": 0.40302267002518893, "grad_norm": 0.11663535982370377, "learning_rate": 7.4554444528833e-05, "loss": 0.1467, "step": 1920 }, { "epoch": 0.40323257766582704, "grad_norm": 0.11326033622026443, "learning_rate": 7.452251968453481e-05, "loss": 0.1555, "step": 1921 }, { "epoch": 0.40344248530646515, "grad_norm": 0.10349076241254807, "learning_rate": 7.449058167108337e-05, "loss": 0.1725, "step": 1922 }, { "epoch": 0.40365239294710326, "grad_norm": 0.12648595869541168, "learning_rate": 7.445863050563017e-05, "loss": 0.1535, "step": 1923 }, { "epoch": 0.4038623005877414, "grad_norm": 0.12781694531440735, "learning_rate": 7.442666620533368e-05, "loss": 0.1718, "step": 1924 }, { "epoch": 0.4040722082283795, "grad_norm": 0.14075273275375366, "learning_rate": 7.439468878735947e-05, "loss": 0.1496, "step": 1925 }, { "epoch": 0.40428211586901763, "grad_norm": 0.10260710120201111, "learning_rate": 7.436269826888015e-05, "loss": 0.1569, "step": 1926 }, { "epoch": 0.40449202350965574, "grad_norm": 0.15809062123298645, "learning_rate": 7.433069466707537e-05, "loss": 0.1714, "step": 1927 }, { "epoch": 0.40470193115029385, "grad_norm": 0.12049717456102371, "learning_rate": 7.429867799913177e-05, "loss": 0.1624, "step": 1928 }, { "epoch": 0.404911838790932, "grad_norm": 0.11036177724599838, "learning_rate": 7.426664828224304e-05, "loss": 0.1672, "step": 1929 }, { "epoch": 0.4051217464315701, "grad_norm": 0.12535598874092102, "learning_rate": 7.423460553360988e-05, "loss": 0.1677, "step": 1930 }, { "epoch": 0.4053316540722082, "grad_norm": 0.11823215335607529, "learning_rate": 7.420254977043998e-05, "loss": 0.159, "step": 1931 }, { "epoch": 0.40554156171284633, "grad_norm": 0.13293218612670898, "learning_rate": 7.4170481009948e-05, "loss": 0.1688, "step": 1932 }, { "epoch": 0.40575146935348444, "grad_norm": 0.11967580020427704, "learning_rate": 7.41383992693556e-05, "loss": 0.1649, "step": 1933 }, { "epoch": 0.4059613769941226, "grad_norm": 0.12903308868408203, "learning_rate": 7.410630456589141e-05, "loss": 0.1527, "step": 1934 }, { "epoch": 0.4061712846347607, "grad_norm": 0.13130050897598267, "learning_rate": 7.407419691679104e-05, "loss": 0.1607, "step": 1935 }, { "epoch": 0.4063811922753988, "grad_norm": 0.12664926052093506, "learning_rate": 7.404207633929699e-05, "loss": 0.151, "step": 1936 }, { "epoch": 0.4065910999160369, "grad_norm": 0.11870288848876953, "learning_rate": 7.400994285065878e-05, "loss": 0.151, "step": 1937 }, { "epoch": 0.4068010075566751, "grad_norm": 0.10910684615373611, "learning_rate": 7.397779646813282e-05, "loss": 0.1393, "step": 1938 }, { "epoch": 0.4070109151973132, "grad_norm": 0.13992692530155182, "learning_rate": 7.394563720898245e-05, "loss": 0.18, "step": 1939 }, { "epoch": 0.4072208228379513, "grad_norm": 0.1333228498697281, "learning_rate": 7.39134650904779e-05, "loss": 0.1536, "step": 1940 }, { "epoch": 0.4074307304785894, "grad_norm": 0.11630193144083023, "learning_rate": 7.388128012989636e-05, "loss": 0.1603, "step": 1941 }, { "epoch": 0.4076406381192275, "grad_norm": 0.12996290624141693, "learning_rate": 7.38490823445219e-05, "loss": 0.1532, "step": 1942 }, { "epoch": 0.4078505457598657, "grad_norm": 0.12526121735572815, "learning_rate": 7.381687175164544e-05, "loss": 0.1684, "step": 1943 }, { "epoch": 0.4080604534005038, "grad_norm": 0.13063152134418488, "learning_rate": 7.378464836856481e-05, "loss": 0.1637, "step": 1944 }, { "epoch": 0.4082703610411419, "grad_norm": 0.11410169303417206, "learning_rate": 7.375241221258471e-05, "loss": 0.1621, "step": 1945 }, { "epoch": 0.40848026868178, "grad_norm": 0.1353740096092224, "learning_rate": 7.37201633010167e-05, "loss": 0.1274, "step": 1946 }, { "epoch": 0.4086901763224181, "grad_norm": 0.1723635345697403, "learning_rate": 7.368790165117916e-05, "loss": 0.1589, "step": 1947 }, { "epoch": 0.40890008396305627, "grad_norm": 0.111173614859581, "learning_rate": 7.365562728039734e-05, "loss": 0.1626, "step": 1948 }, { "epoch": 0.4091099916036944, "grad_norm": 0.10644244402647018, "learning_rate": 7.362334020600334e-05, "loss": 0.1709, "step": 1949 }, { "epoch": 0.4093198992443325, "grad_norm": 0.12528212368488312, "learning_rate": 7.359104044533601e-05, "loss": 0.1688, "step": 1950 }, { "epoch": 0.4095298068849706, "grad_norm": 0.11388932913541794, "learning_rate": 7.35587280157411e-05, "loss": 0.1624, "step": 1951 }, { "epoch": 0.40973971452560876, "grad_norm": 0.11479576677083969, "learning_rate": 7.35264029345711e-05, "loss": 0.1728, "step": 1952 }, { "epoch": 0.40994962216624686, "grad_norm": 0.11558470875024796, "learning_rate": 7.349406521918533e-05, "loss": 0.1286, "step": 1953 }, { "epoch": 0.41015952980688497, "grad_norm": 0.12116646766662598, "learning_rate": 7.346171488694988e-05, "loss": 0.1704, "step": 1954 }, { "epoch": 0.4103694374475231, "grad_norm": 0.12693926692008972, "learning_rate": 7.34293519552376e-05, "loss": 0.165, "step": 1955 }, { "epoch": 0.4105793450881612, "grad_norm": 0.11315865814685822, "learning_rate": 7.339697644142815e-05, "loss": 0.1562, "step": 1956 }, { "epoch": 0.41078925272879935, "grad_norm": 0.13625770807266235, "learning_rate": 7.33645883629079e-05, "loss": 0.1792, "step": 1957 }, { "epoch": 0.41099916036943746, "grad_norm": 0.11685556173324585, "learning_rate": 7.333218773707e-05, "loss": 0.1626, "step": 1958 }, { "epoch": 0.41120906801007556, "grad_norm": 0.13109751045703888, "learning_rate": 7.329977458131431e-05, "loss": 0.1563, "step": 1959 }, { "epoch": 0.41141897565071367, "grad_norm": 0.1340368390083313, "learning_rate": 7.326734891304745e-05, "loss": 0.1559, "step": 1960 }, { "epoch": 0.41162888329135183, "grad_norm": 0.12216036021709442, "learning_rate": 7.323491074968275e-05, "loss": 0.1604, "step": 1961 }, { "epoch": 0.41183879093198994, "grad_norm": 0.11884403228759766, "learning_rate": 7.320246010864023e-05, "loss": 0.1572, "step": 1962 }, { "epoch": 0.41204869857262805, "grad_norm": 0.11110524088144302, "learning_rate": 7.316999700734663e-05, "loss": 0.1606, "step": 1963 }, { "epoch": 0.41225860621326615, "grad_norm": 0.12798446416854858, "learning_rate": 7.313752146323539e-05, "loss": 0.1625, "step": 1964 }, { "epoch": 0.41246851385390426, "grad_norm": 0.11968263983726501, "learning_rate": 7.31050334937466e-05, "loss": 0.1749, "step": 1965 }, { "epoch": 0.4126784214945424, "grad_norm": 0.10930939763784409, "learning_rate": 7.307253311632704e-05, "loss": 0.1722, "step": 1966 }, { "epoch": 0.41288832913518053, "grad_norm": 0.12315979599952698, "learning_rate": 7.304002034843015e-05, "loss": 0.1776, "step": 1967 }, { "epoch": 0.41309823677581864, "grad_norm": 0.1320822834968567, "learning_rate": 7.300749520751608e-05, "loss": 0.1539, "step": 1968 }, { "epoch": 0.41330814441645675, "grad_norm": 0.12075715512037277, "learning_rate": 7.297495771105152e-05, "loss": 0.1414, "step": 1969 }, { "epoch": 0.41351805205709485, "grad_norm": 0.1056932806968689, "learning_rate": 7.294240787650986e-05, "loss": 0.1485, "step": 1970 }, { "epoch": 0.413727959697733, "grad_norm": 0.10487929731607437, "learning_rate": 7.290984572137111e-05, "loss": 0.1728, "step": 1971 }, { "epoch": 0.4139378673383711, "grad_norm": 0.10982023179531097, "learning_rate": 7.287727126312191e-05, "loss": 0.1608, "step": 1972 }, { "epoch": 0.41414777497900923, "grad_norm": 0.09598106890916824, "learning_rate": 7.284468451925546e-05, "loss": 0.1503, "step": 1973 }, { "epoch": 0.41435768261964734, "grad_norm": 0.14356835186481476, "learning_rate": 7.281208550727159e-05, "loss": 0.1495, "step": 1974 }, { "epoch": 0.4145675902602855, "grad_norm": 0.10420846194028854, "learning_rate": 7.277947424467672e-05, "loss": 0.1586, "step": 1975 }, { "epoch": 0.4147774979009236, "grad_norm": 0.12487095594406128, "learning_rate": 7.274685074898386e-05, "loss": 0.175, "step": 1976 }, { "epoch": 0.4149874055415617, "grad_norm": 0.11194780468940735, "learning_rate": 7.271421503771251e-05, "loss": 0.1574, "step": 1977 }, { "epoch": 0.4151973131821998, "grad_norm": 0.12557700276374817, "learning_rate": 7.268156712838888e-05, "loss": 0.1529, "step": 1978 }, { "epoch": 0.41540722082283793, "grad_norm": 0.10330946743488312, "learning_rate": 7.264890703854559e-05, "loss": 0.1311, "step": 1979 }, { "epoch": 0.4156171284634761, "grad_norm": 0.11576497554779053, "learning_rate": 7.261623478572185e-05, "loss": 0.1518, "step": 1980 }, { "epoch": 0.4158270361041142, "grad_norm": 0.1512029618024826, "learning_rate": 7.258355038746345e-05, "loss": 0.1629, "step": 1981 }, { "epoch": 0.4160369437447523, "grad_norm": 0.142082080245018, "learning_rate": 7.25508538613226e-05, "loss": 0.1607, "step": 1982 }, { "epoch": 0.4162468513853904, "grad_norm": 0.13623762130737305, "learning_rate": 7.251814522485813e-05, "loss": 0.1695, "step": 1983 }, { "epoch": 0.4164567590260285, "grad_norm": 0.1192348450422287, "learning_rate": 7.248542449563529e-05, "loss": 0.1668, "step": 1984 }, { "epoch": 0.4166666666666667, "grad_norm": 0.12478368729352951, "learning_rate": 7.245269169122588e-05, "loss": 0.1427, "step": 1985 }, { "epoch": 0.4168765743073048, "grad_norm": 0.1585802137851715, "learning_rate": 7.241994682920816e-05, "loss": 0.1778, "step": 1986 }, { "epoch": 0.4170864819479429, "grad_norm": 0.10179055482149124, "learning_rate": 7.238718992716687e-05, "loss": 0.1358, "step": 1987 }, { "epoch": 0.417296389588581, "grad_norm": 0.11688593029975891, "learning_rate": 7.23544210026932e-05, "loss": 0.1603, "step": 1988 }, { "epoch": 0.41750629722921917, "grad_norm": 0.1158967837691307, "learning_rate": 7.232164007338484e-05, "loss": 0.1408, "step": 1989 }, { "epoch": 0.4177162048698573, "grad_norm": 0.1265357881784439, "learning_rate": 7.228884715684588e-05, "loss": 0.1402, "step": 1990 }, { "epoch": 0.4179261125104954, "grad_norm": 0.10598758608102798, "learning_rate": 7.225604227068684e-05, "loss": 0.1431, "step": 1991 }, { "epoch": 0.4181360201511335, "grad_norm": 0.12342094630002975, "learning_rate": 7.222322543252474e-05, "loss": 0.1539, "step": 1992 }, { "epoch": 0.4183459277917716, "grad_norm": 0.12338914722204208, "learning_rate": 7.219039665998295e-05, "loss": 0.1719, "step": 1993 }, { "epoch": 0.41855583543240976, "grad_norm": 0.11439438164234161, "learning_rate": 7.215755597069126e-05, "loss": 0.1639, "step": 1994 }, { "epoch": 0.41876574307304787, "grad_norm": 0.12324292212724686, "learning_rate": 7.212470338228589e-05, "loss": 0.1759, "step": 1995 }, { "epoch": 0.418975650713686, "grad_norm": 0.10522071272134781, "learning_rate": 7.209183891240941e-05, "loss": 0.1475, "step": 1996 }, { "epoch": 0.4191855583543241, "grad_norm": 0.09488867968320847, "learning_rate": 7.205896257871082e-05, "loss": 0.1564, "step": 1997 }, { "epoch": 0.4193954659949622, "grad_norm": 0.12088136374950409, "learning_rate": 7.202607439884543e-05, "loss": 0.1671, "step": 1998 }, { "epoch": 0.41960537363560035, "grad_norm": 0.11763715744018555, "learning_rate": 7.199317439047499e-05, "loss": 0.1463, "step": 1999 }, { "epoch": 0.41981528127623846, "grad_norm": 0.10898829251527786, "learning_rate": 7.196026257126749e-05, "loss": 0.1697, "step": 2000 }, { "epoch": 0.42002518891687657, "grad_norm": 0.114218570291996, "learning_rate": 7.192733895889737e-05, "loss": 0.1693, "step": 2001 }, { "epoch": 0.4202350965575147, "grad_norm": 0.11072574555873871, "learning_rate": 7.189440357104536e-05, "loss": 0.1599, "step": 2002 }, { "epoch": 0.42044500419815284, "grad_norm": 0.118053138256073, "learning_rate": 7.186145642539852e-05, "loss": 0.1697, "step": 2003 }, { "epoch": 0.42065491183879095, "grad_norm": 0.11998876184225082, "learning_rate": 7.18284975396502e-05, "loss": 0.1666, "step": 2004 }, { "epoch": 0.42086481947942905, "grad_norm": 0.12326207011938095, "learning_rate": 7.179552693150009e-05, "loss": 0.137, "step": 2005 }, { "epoch": 0.42107472712006716, "grad_norm": 0.11958475410938263, "learning_rate": 7.176254461865415e-05, "loss": 0.1571, "step": 2006 }, { "epoch": 0.42128463476070527, "grad_norm": 0.1109607145190239, "learning_rate": 7.172955061882465e-05, "loss": 0.1359, "step": 2007 }, { "epoch": 0.42149454240134343, "grad_norm": 0.1003115177154541, "learning_rate": 7.16965449497301e-05, "loss": 0.1643, "step": 2008 }, { "epoch": 0.42170445004198154, "grad_norm": 0.10504936426877975, "learning_rate": 7.166352762909532e-05, "loss": 0.1469, "step": 2009 }, { "epoch": 0.42191435768261965, "grad_norm": 0.11309293657541275, "learning_rate": 7.163049867465135e-05, "loss": 0.1474, "step": 2010 }, { "epoch": 0.42212426532325775, "grad_norm": 0.10572872310876846, "learning_rate": 7.15974581041355e-05, "loss": 0.1622, "step": 2011 }, { "epoch": 0.42233417296389586, "grad_norm": 0.10934752225875854, "learning_rate": 7.156440593529132e-05, "loss": 0.162, "step": 2012 }, { "epoch": 0.422544080604534, "grad_norm": 0.12137974798679352, "learning_rate": 7.153134218586857e-05, "loss": 0.1438, "step": 2013 }, { "epoch": 0.42275398824517213, "grad_norm": 0.12691549956798553, "learning_rate": 7.149826687362322e-05, "loss": 0.1494, "step": 2014 }, { "epoch": 0.42296389588581024, "grad_norm": 0.12283293902873993, "learning_rate": 7.146518001631751e-05, "loss": 0.1578, "step": 2015 }, { "epoch": 0.42317380352644834, "grad_norm": 0.13865120708942413, "learning_rate": 7.143208163171979e-05, "loss": 0.1652, "step": 2016 }, { "epoch": 0.4233837111670865, "grad_norm": 0.11391118913888931, "learning_rate": 7.139897173760469e-05, "loss": 0.148, "step": 2017 }, { "epoch": 0.4235936188077246, "grad_norm": 0.12447668612003326, "learning_rate": 7.136585035175294e-05, "loss": 0.1672, "step": 2018 }, { "epoch": 0.4238035264483627, "grad_norm": 0.1198267936706543, "learning_rate": 7.133271749195153e-05, "loss": 0.1572, "step": 2019 }, { "epoch": 0.42401343408900083, "grad_norm": 0.10948833078145981, "learning_rate": 7.129957317599352e-05, "loss": 0.1612, "step": 2020 }, { "epoch": 0.42422334172963894, "grad_norm": 0.12680894136428833, "learning_rate": 7.126641742167819e-05, "loss": 0.1648, "step": 2021 }, { "epoch": 0.4244332493702771, "grad_norm": 0.14302381873130798, "learning_rate": 7.123325024681093e-05, "loss": 0.1617, "step": 2022 }, { "epoch": 0.4246431570109152, "grad_norm": 0.13583184778690338, "learning_rate": 7.120007166920326e-05, "loss": 0.1666, "step": 2023 }, { "epoch": 0.4248530646515533, "grad_norm": 0.11573361605405807, "learning_rate": 7.116688170667284e-05, "loss": 0.1523, "step": 2024 }, { "epoch": 0.4250629722921914, "grad_norm": 0.13895514607429504, "learning_rate": 7.113368037704345e-05, "loss": 0.158, "step": 2025 }, { "epoch": 0.42527287993282953, "grad_norm": 0.1131201833486557, "learning_rate": 7.110046769814496e-05, "loss": 0.1764, "step": 2026 }, { "epoch": 0.4254827875734677, "grad_norm": 0.13642175495624542, "learning_rate": 7.106724368781331e-05, "loss": 0.1797, "step": 2027 }, { "epoch": 0.4256926952141058, "grad_norm": 0.11173267662525177, "learning_rate": 7.103400836389059e-05, "loss": 0.1685, "step": 2028 }, { "epoch": 0.4259026028547439, "grad_norm": 0.11620894074440002, "learning_rate": 7.10007617442249e-05, "loss": 0.1552, "step": 2029 }, { "epoch": 0.426112510495382, "grad_norm": 0.11680643260478973, "learning_rate": 7.096750384667044e-05, "loss": 0.1576, "step": 2030 }, { "epoch": 0.4263224181360202, "grad_norm": 0.13483844697475433, "learning_rate": 7.093423468908748e-05, "loss": 0.1609, "step": 2031 }, { "epoch": 0.4265323257766583, "grad_norm": 0.11245349794626236, "learning_rate": 7.09009542893423e-05, "loss": 0.165, "step": 2032 }, { "epoch": 0.4267422334172964, "grad_norm": 0.1436799019575119, "learning_rate": 7.086766266530723e-05, "loss": 0.1439, "step": 2033 }, { "epoch": 0.4269521410579345, "grad_norm": 0.11214408278465271, "learning_rate": 7.083435983486066e-05, "loss": 0.1678, "step": 2034 }, { "epoch": 0.4271620486985726, "grad_norm": 0.11312124133110046, "learning_rate": 7.080104581588693e-05, "loss": 0.1683, "step": 2035 }, { "epoch": 0.42737195633921077, "grad_norm": 0.12795935571193695, "learning_rate": 7.076772062627647e-05, "loss": 0.1657, "step": 2036 }, { "epoch": 0.4275818639798489, "grad_norm": 0.10281221568584442, "learning_rate": 7.073438428392562e-05, "loss": 0.1394, "step": 2037 }, { "epoch": 0.427791771620487, "grad_norm": 0.10964559763669968, "learning_rate": 7.070103680673683e-05, "loss": 0.1598, "step": 2038 }, { "epoch": 0.4280016792611251, "grad_norm": 0.10823410749435425, "learning_rate": 7.066767821261837e-05, "loss": 0.1601, "step": 2039 }, { "epoch": 0.4282115869017632, "grad_norm": 0.11483200639486313, "learning_rate": 7.063430851948463e-05, "loss": 0.14, "step": 2040 }, { "epoch": 0.42842149454240136, "grad_norm": 0.1289099007844925, "learning_rate": 7.060092774525588e-05, "loss": 0.1567, "step": 2041 }, { "epoch": 0.42863140218303947, "grad_norm": 0.11432711780071259, "learning_rate": 7.056753590785835e-05, "loss": 0.1559, "step": 2042 }, { "epoch": 0.4288413098236776, "grad_norm": 0.13700692355632782, "learning_rate": 7.053413302522423e-05, "loss": 0.1716, "step": 2043 }, { "epoch": 0.4290512174643157, "grad_norm": 0.12998148798942566, "learning_rate": 7.050071911529163e-05, "loss": 0.1539, "step": 2044 }, { "epoch": 0.42926112510495384, "grad_norm": 0.1319163739681244, "learning_rate": 7.046729419600457e-05, "loss": 0.1638, "step": 2045 }, { "epoch": 0.42947103274559195, "grad_norm": 0.1296456903219223, "learning_rate": 7.043385828531299e-05, "loss": 0.1765, "step": 2046 }, { "epoch": 0.42968094038623006, "grad_norm": 0.133979931473732, "learning_rate": 7.040041140117278e-05, "loss": 0.1721, "step": 2047 }, { "epoch": 0.42989084802686817, "grad_norm": 0.1405065357685089, "learning_rate": 7.036695356154564e-05, "loss": 0.1552, "step": 2048 }, { "epoch": 0.4301007556675063, "grad_norm": 0.13646145164966583, "learning_rate": 7.033348478439921e-05, "loss": 0.1515, "step": 2049 }, { "epoch": 0.43031066330814444, "grad_norm": 0.1469520926475525, "learning_rate": 7.030000508770698e-05, "loss": 0.1447, "step": 2050 }, { "epoch": 0.43052057094878254, "grad_norm": 0.13606610894203186, "learning_rate": 7.026651448944833e-05, "loss": 0.1728, "step": 2051 }, { "epoch": 0.43073047858942065, "grad_norm": 0.14016586542129517, "learning_rate": 7.023301300760843e-05, "loss": 0.161, "step": 2052 }, { "epoch": 0.43094038623005876, "grad_norm": 0.12709060311317444, "learning_rate": 7.019950066017839e-05, "loss": 0.1502, "step": 2053 }, { "epoch": 0.43115029387069687, "grad_norm": 0.11851731687784195, "learning_rate": 7.016597746515511e-05, "loss": 0.1493, "step": 2054 }, { "epoch": 0.43136020151133503, "grad_norm": 0.16392657160758972, "learning_rate": 7.013244344054127e-05, "loss": 0.1776, "step": 2055 }, { "epoch": 0.43157010915197314, "grad_norm": 0.122653529047966, "learning_rate": 7.009889860434544e-05, "loss": 0.1408, "step": 2056 }, { "epoch": 0.43178001679261124, "grad_norm": 0.12323731184005737, "learning_rate": 7.006534297458195e-05, "loss": 0.1467, "step": 2057 }, { "epoch": 0.43198992443324935, "grad_norm": 0.12453500181436539, "learning_rate": 7.003177656927095e-05, "loss": 0.1561, "step": 2058 }, { "epoch": 0.4321998320738875, "grad_norm": 0.12548905611038208, "learning_rate": 6.999819940643833e-05, "loss": 0.1551, "step": 2059 }, { "epoch": 0.4324097397145256, "grad_norm": 0.10663434118032455, "learning_rate": 6.996461150411584e-05, "loss": 0.1586, "step": 2060 }, { "epoch": 0.43261964735516373, "grad_norm": 0.11009252071380615, "learning_rate": 6.993101288034094e-05, "loss": 0.1428, "step": 2061 }, { "epoch": 0.43282955499580184, "grad_norm": 0.11233685165643692, "learning_rate": 6.989740355315683e-05, "loss": 0.1484, "step": 2062 }, { "epoch": 0.43303946263643994, "grad_norm": 0.12727439403533936, "learning_rate": 6.986378354061254e-05, "loss": 0.161, "step": 2063 }, { "epoch": 0.4332493702770781, "grad_norm": 0.13386842608451843, "learning_rate": 6.983015286076272e-05, "loss": 0.1355, "step": 2064 }, { "epoch": 0.4334592779177162, "grad_norm": 0.13135772943496704, "learning_rate": 6.979651153166786e-05, "loss": 0.1719, "step": 2065 }, { "epoch": 0.4336691855583543, "grad_norm": 0.12024439871311188, "learning_rate": 6.97628595713941e-05, "loss": 0.1664, "step": 2066 }, { "epoch": 0.4338790931989924, "grad_norm": 0.10455571860074997, "learning_rate": 6.972919699801333e-05, "loss": 0.1484, "step": 2067 }, { "epoch": 0.4340890008396306, "grad_norm": 0.15053103864192963, "learning_rate": 6.96955238296031e-05, "loss": 0.168, "step": 2068 }, { "epoch": 0.4342989084802687, "grad_norm": 0.11767338216304779, "learning_rate": 6.966184008424668e-05, "loss": 0.1642, "step": 2069 }, { "epoch": 0.4345088161209068, "grad_norm": 0.11430288106203079, "learning_rate": 6.9628145780033e-05, "loss": 0.1595, "step": 2070 }, { "epoch": 0.4347187237615449, "grad_norm": 0.1504775881767273, "learning_rate": 6.959444093505671e-05, "loss": 0.1553, "step": 2071 }, { "epoch": 0.434928631402183, "grad_norm": 0.12450301647186279, "learning_rate": 6.956072556741804e-05, "loss": 0.1516, "step": 2072 }, { "epoch": 0.4351385390428212, "grad_norm": 0.15835943818092346, "learning_rate": 6.952699969522292e-05, "loss": 0.1733, "step": 2073 }, { "epoch": 0.4353484466834593, "grad_norm": 0.1206754669547081, "learning_rate": 6.949326333658292e-05, "loss": 0.1613, "step": 2074 }, { "epoch": 0.4355583543240974, "grad_norm": 0.10483954846858978, "learning_rate": 6.945951650961524e-05, "loss": 0.1635, "step": 2075 }, { "epoch": 0.4357682619647355, "grad_norm": 0.11579787731170654, "learning_rate": 6.942575923244268e-05, "loss": 0.1521, "step": 2076 }, { "epoch": 0.4359781696053736, "grad_norm": 0.16092592477798462, "learning_rate": 6.93919915231937e-05, "loss": 0.1611, "step": 2077 }, { "epoch": 0.4361880772460118, "grad_norm": 0.11030247807502747, "learning_rate": 6.935821340000231e-05, "loss": 0.1465, "step": 2078 }, { "epoch": 0.4363979848866499, "grad_norm": 0.11840491741895676, "learning_rate": 6.932442488100813e-05, "loss": 0.1565, "step": 2079 }, { "epoch": 0.436607892527288, "grad_norm": 0.14345033466815948, "learning_rate": 6.929062598435636e-05, "loss": 0.1655, "step": 2080 }, { "epoch": 0.4368178001679261, "grad_norm": 0.12973609566688538, "learning_rate": 6.925681672819781e-05, "loss": 0.131, "step": 2081 }, { "epoch": 0.43702770780856426, "grad_norm": 0.11152931302785873, "learning_rate": 6.922299713068878e-05, "loss": 0.1545, "step": 2082 }, { "epoch": 0.43723761544920237, "grad_norm": 0.11546137928962708, "learning_rate": 6.918916720999123e-05, "loss": 0.1683, "step": 2083 }, { "epoch": 0.4374475230898405, "grad_norm": 0.11870436370372772, "learning_rate": 6.915532698427254e-05, "loss": 0.1493, "step": 2084 }, { "epoch": 0.4376574307304786, "grad_norm": 0.09569094330072403, "learning_rate": 6.912147647170571e-05, "loss": 0.1415, "step": 2085 }, { "epoch": 0.4378673383711167, "grad_norm": 0.1221393570303917, "learning_rate": 6.908761569046923e-05, "loss": 0.1463, "step": 2086 }, { "epoch": 0.43807724601175485, "grad_norm": 0.12907950580120087, "learning_rate": 6.90537446587471e-05, "loss": 0.1615, "step": 2087 }, { "epoch": 0.43828715365239296, "grad_norm": 0.12217225879430771, "learning_rate": 6.901986339472888e-05, "loss": 0.1592, "step": 2088 }, { "epoch": 0.43849706129303107, "grad_norm": 0.12174089252948761, "learning_rate": 6.898597191660956e-05, "loss": 0.1526, "step": 2089 }, { "epoch": 0.4387069689336692, "grad_norm": 0.11348958313465118, "learning_rate": 6.89520702425896e-05, "loss": 0.157, "step": 2090 }, { "epoch": 0.4389168765743073, "grad_norm": 0.11617136001586914, "learning_rate": 6.891815839087505e-05, "loss": 0.1511, "step": 2091 }, { "epoch": 0.43912678421494544, "grad_norm": 0.11774633824825287, "learning_rate": 6.888423637967728e-05, "loss": 0.1561, "step": 2092 }, { "epoch": 0.43933669185558355, "grad_norm": 0.11033912003040314, "learning_rate": 6.885030422721324e-05, "loss": 0.1508, "step": 2093 }, { "epoch": 0.43954659949622166, "grad_norm": 0.12567701935768127, "learning_rate": 6.881636195170521e-05, "loss": 0.1489, "step": 2094 }, { "epoch": 0.43975650713685976, "grad_norm": 0.1044977456331253, "learning_rate": 6.878240957138102e-05, "loss": 0.1499, "step": 2095 }, { "epoch": 0.4399664147774979, "grad_norm": 0.1258642077445984, "learning_rate": 6.874844710447386e-05, "loss": 0.1473, "step": 2096 }, { "epoch": 0.44017632241813603, "grad_norm": 0.11813251674175262, "learning_rate": 6.871447456922233e-05, "loss": 0.1404, "step": 2097 }, { "epoch": 0.44038623005877414, "grad_norm": 0.11908237636089325, "learning_rate": 6.868049198387047e-05, "loss": 0.1612, "step": 2098 }, { "epoch": 0.44059613769941225, "grad_norm": 0.1303856521844864, "learning_rate": 6.864649936666772e-05, "loss": 0.1722, "step": 2099 }, { "epoch": 0.44080604534005036, "grad_norm": 0.1194954439997673, "learning_rate": 6.861249673586887e-05, "loss": 0.1446, "step": 2100 }, { "epoch": 0.4410159529806885, "grad_norm": 0.1111513301730156, "learning_rate": 6.857848410973413e-05, "loss": 0.1462, "step": 2101 }, { "epoch": 0.4412258606213266, "grad_norm": 0.11029879748821259, "learning_rate": 6.854446150652905e-05, "loss": 0.1533, "step": 2102 }, { "epoch": 0.44143576826196473, "grad_norm": 0.1370946615934372, "learning_rate": 6.851042894452452e-05, "loss": 0.1477, "step": 2103 }, { "epoch": 0.44164567590260284, "grad_norm": 0.09855300188064575, "learning_rate": 6.847638644199684e-05, "loss": 0.1449, "step": 2104 }, { "epoch": 0.44185558354324095, "grad_norm": 0.11461108922958374, "learning_rate": 6.844233401722757e-05, "loss": 0.1602, "step": 2105 }, { "epoch": 0.4420654911838791, "grad_norm": 0.11520881950855255, "learning_rate": 6.840827168850368e-05, "loss": 0.1571, "step": 2106 }, { "epoch": 0.4422753988245172, "grad_norm": 0.11658153682947159, "learning_rate": 6.837419947411742e-05, "loss": 0.1533, "step": 2107 }, { "epoch": 0.4424853064651553, "grad_norm": 0.1229148879647255, "learning_rate": 6.834011739236632e-05, "loss": 0.1479, "step": 2108 }, { "epoch": 0.44269521410579343, "grad_norm": 0.12906445562839508, "learning_rate": 6.830602546155324e-05, "loss": 0.1719, "step": 2109 }, { "epoch": 0.4429051217464316, "grad_norm": 0.10934906452894211, "learning_rate": 6.827192369998636e-05, "loss": 0.1503, "step": 2110 }, { "epoch": 0.4431150293870697, "grad_norm": 0.12860219180583954, "learning_rate": 6.823781212597903e-05, "loss": 0.1578, "step": 2111 }, { "epoch": 0.4433249370277078, "grad_norm": 0.1294265240430832, "learning_rate": 6.820369075785003e-05, "loss": 0.1538, "step": 2112 }, { "epoch": 0.4435348446683459, "grad_norm": 0.10323406755924225, "learning_rate": 6.816955961392327e-05, "loss": 0.1406, "step": 2113 }, { "epoch": 0.443744752308984, "grad_norm": 0.10694414377212524, "learning_rate": 6.813541871252797e-05, "loss": 0.1636, "step": 2114 }, { "epoch": 0.4439546599496222, "grad_norm": 0.13600675761699677, "learning_rate": 6.810126807199854e-05, "loss": 0.1454, "step": 2115 }, { "epoch": 0.4441645675902603, "grad_norm": 0.12228574603796005, "learning_rate": 6.806710771067469e-05, "loss": 0.1637, "step": 2116 }, { "epoch": 0.4443744752308984, "grad_norm": 0.12474840879440308, "learning_rate": 6.803293764690131e-05, "loss": 0.1626, "step": 2117 }, { "epoch": 0.4445843828715365, "grad_norm": 0.1158130019903183, "learning_rate": 6.799875789902848e-05, "loss": 0.1538, "step": 2118 }, { "epoch": 0.4447942905121746, "grad_norm": 0.11266588419675827, "learning_rate": 6.796456848541152e-05, "loss": 0.1459, "step": 2119 }, { "epoch": 0.4450041981528128, "grad_norm": 0.12051046639680862, "learning_rate": 6.793036942441095e-05, "loss": 0.1488, "step": 2120 }, { "epoch": 0.4452141057934509, "grad_norm": 0.12589138746261597, "learning_rate": 6.789616073439243e-05, "loss": 0.1652, "step": 2121 }, { "epoch": 0.445424013434089, "grad_norm": 0.1199890673160553, "learning_rate": 6.78619424337268e-05, "loss": 0.1563, "step": 2122 }, { "epoch": 0.4456339210747271, "grad_norm": 0.11333800107240677, "learning_rate": 6.782771454079006e-05, "loss": 0.1578, "step": 2123 }, { "epoch": 0.44584382871536526, "grad_norm": 0.1303377002477646, "learning_rate": 6.77934770739634e-05, "loss": 0.1461, "step": 2124 }, { "epoch": 0.44605373635600337, "grad_norm": 0.15686602890491486, "learning_rate": 6.775923005163308e-05, "loss": 0.148, "step": 2125 }, { "epoch": 0.4462636439966415, "grad_norm": 0.11671863496303558, "learning_rate": 6.772497349219058e-05, "loss": 0.1668, "step": 2126 }, { "epoch": 0.4464735516372796, "grad_norm": 0.11080017685890198, "learning_rate": 6.769070741403242e-05, "loss": 0.1674, "step": 2127 }, { "epoch": 0.4466834592779177, "grad_norm": 0.10152662545442581, "learning_rate": 6.76564318355603e-05, "loss": 0.1476, "step": 2128 }, { "epoch": 0.44689336691855586, "grad_norm": 0.11543727666139603, "learning_rate": 6.762214677518096e-05, "loss": 0.154, "step": 2129 }, { "epoch": 0.44710327455919396, "grad_norm": 0.1334647685289383, "learning_rate": 6.758785225130626e-05, "loss": 0.1769, "step": 2130 }, { "epoch": 0.44731318219983207, "grad_norm": 0.1134229451417923, "learning_rate": 6.755354828235315e-05, "loss": 0.1508, "step": 2131 }, { "epoch": 0.4475230898404702, "grad_norm": 0.11326369643211365, "learning_rate": 6.751923488674366e-05, "loss": 0.1585, "step": 2132 }, { "epoch": 0.4477329974811083, "grad_norm": 0.12377768754959106, "learning_rate": 6.748491208290484e-05, "loss": 0.1558, "step": 2133 }, { "epoch": 0.44794290512174645, "grad_norm": 0.1556466668844223, "learning_rate": 6.745057988926886e-05, "loss": 0.1517, "step": 2134 }, { "epoch": 0.44815281276238456, "grad_norm": 0.11329285055398941, "learning_rate": 6.741623832427284e-05, "loss": 0.1695, "step": 2135 }, { "epoch": 0.44836272040302266, "grad_norm": 0.12504523992538452, "learning_rate": 6.738188740635902e-05, "loss": 0.1459, "step": 2136 }, { "epoch": 0.44857262804366077, "grad_norm": 0.10221695154905319, "learning_rate": 6.734752715397461e-05, "loss": 0.1462, "step": 2137 }, { "epoch": 0.44878253568429893, "grad_norm": 0.1422422230243683, "learning_rate": 6.731315758557187e-05, "loss": 0.1565, "step": 2138 }, { "epoch": 0.44899244332493704, "grad_norm": 0.12211299687623978, "learning_rate": 6.727877871960803e-05, "loss": 0.156, "step": 2139 }, { "epoch": 0.44920235096557515, "grad_norm": 0.11609075218439102, "learning_rate": 6.724439057454533e-05, "loss": 0.1679, "step": 2140 }, { "epoch": 0.44941225860621326, "grad_norm": 0.1233009323477745, "learning_rate": 6.720999316885098e-05, "loss": 0.1694, "step": 2141 }, { "epoch": 0.44962216624685136, "grad_norm": 0.1292707473039627, "learning_rate": 6.717558652099718e-05, "loss": 0.1669, "step": 2142 }, { "epoch": 0.4498320738874895, "grad_norm": 0.1204388439655304, "learning_rate": 6.71411706494611e-05, "loss": 0.1632, "step": 2143 }, { "epoch": 0.45004198152812763, "grad_norm": 0.12831495702266693, "learning_rate": 6.710674557272482e-05, "loss": 0.1447, "step": 2144 }, { "epoch": 0.45025188916876574, "grad_norm": 0.14179843664169312, "learning_rate": 6.707231130927542e-05, "loss": 0.159, "step": 2145 }, { "epoch": 0.45046179680940385, "grad_norm": 0.11582005023956299, "learning_rate": 6.703786787760487e-05, "loss": 0.1608, "step": 2146 }, { "epoch": 0.45067170445004195, "grad_norm": 0.11273009330034256, "learning_rate": 6.700341529621008e-05, "loss": 0.1467, "step": 2147 }, { "epoch": 0.4508816120906801, "grad_norm": 0.14487546682357788, "learning_rate": 6.696895358359288e-05, "loss": 0.1618, "step": 2148 }, { "epoch": 0.4510915197313182, "grad_norm": 0.11221778392791748, "learning_rate": 6.693448275826e-05, "loss": 0.155, "step": 2149 }, { "epoch": 0.45130142737195633, "grad_norm": 0.11724277585744858, "learning_rate": 6.690000283872304e-05, "loss": 0.1535, "step": 2150 }, { "epoch": 0.45151133501259444, "grad_norm": 0.13033629953861237, "learning_rate": 6.686551384349853e-05, "loss": 0.1818, "step": 2151 }, { "epoch": 0.4517212426532326, "grad_norm": 0.10972466319799423, "learning_rate": 6.683101579110783e-05, "loss": 0.1362, "step": 2152 }, { "epoch": 0.4519311502938707, "grad_norm": 0.12394952774047852, "learning_rate": 6.679650870007718e-05, "loss": 0.1563, "step": 2153 }, { "epoch": 0.4521410579345088, "grad_norm": 0.13337190449237823, "learning_rate": 6.67619925889377e-05, "loss": 0.1643, "step": 2154 }, { "epoch": 0.4523509655751469, "grad_norm": 0.10738395899534225, "learning_rate": 6.672746747622532e-05, "loss": 0.1372, "step": 2155 }, { "epoch": 0.45256087321578503, "grad_norm": 0.1167917475104332, "learning_rate": 6.669293338048079e-05, "loss": 0.1618, "step": 2156 }, { "epoch": 0.4527707808564232, "grad_norm": 0.12487170845270157, "learning_rate": 6.665839032024975e-05, "loss": 0.1601, "step": 2157 }, { "epoch": 0.4529806884970613, "grad_norm": 0.12744808197021484, "learning_rate": 6.662383831408258e-05, "loss": 0.1533, "step": 2158 }, { "epoch": 0.4531905961376994, "grad_norm": 0.113549143075943, "learning_rate": 6.658927738053451e-05, "loss": 0.1674, "step": 2159 }, { "epoch": 0.4534005037783375, "grad_norm": 0.12157785147428513, "learning_rate": 6.655470753816553e-05, "loss": 0.1648, "step": 2160 }, { "epoch": 0.4536104114189756, "grad_norm": 0.11910437792539597, "learning_rate": 6.652012880554047e-05, "loss": 0.1561, "step": 2161 }, { "epoch": 0.4538203190596138, "grad_norm": 0.12395650148391724, "learning_rate": 6.648554120122889e-05, "loss": 0.1508, "step": 2162 }, { "epoch": 0.4540302267002519, "grad_norm": 0.11354140937328339, "learning_rate": 6.64509447438051e-05, "loss": 0.1588, "step": 2163 }, { "epoch": 0.45424013434089, "grad_norm": 0.12797436118125916, "learning_rate": 6.64163394518482e-05, "loss": 0.166, "step": 2164 }, { "epoch": 0.4544500419815281, "grad_norm": 0.14938712120056152, "learning_rate": 6.638172534394204e-05, "loss": 0.1466, "step": 2165 }, { "epoch": 0.45465994962216627, "grad_norm": 0.12750832736492157, "learning_rate": 6.634710243867516e-05, "loss": 0.1657, "step": 2166 }, { "epoch": 0.4548698572628044, "grad_norm": 0.10431057959794998, "learning_rate": 6.631247075464086e-05, "loss": 0.1601, "step": 2167 }, { "epoch": 0.4550797649034425, "grad_norm": 0.12698952853679657, "learning_rate": 6.627783031043715e-05, "loss": 0.1571, "step": 2168 }, { "epoch": 0.4552896725440806, "grad_norm": 0.1288319081068039, "learning_rate": 6.624318112466676e-05, "loss": 0.1589, "step": 2169 }, { "epoch": 0.4554995801847187, "grad_norm": 0.12063757330179214, "learning_rate": 6.620852321593705e-05, "loss": 0.1415, "step": 2170 }, { "epoch": 0.45570948782535686, "grad_norm": 0.12932997941970825, "learning_rate": 6.617385660286014e-05, "loss": 0.1685, "step": 2171 }, { "epoch": 0.45591939546599497, "grad_norm": 0.12156279385089874, "learning_rate": 6.613918130405278e-05, "loss": 0.1682, "step": 2172 }, { "epoch": 0.4561293031066331, "grad_norm": 0.1219090148806572, "learning_rate": 6.61044973381364e-05, "loss": 0.1604, "step": 2173 }, { "epoch": 0.4563392107472712, "grad_norm": 0.10800813138484955, "learning_rate": 6.606980472373709e-05, "loss": 0.1334, "step": 2174 }, { "epoch": 0.45654911838790935, "grad_norm": 0.13099253177642822, "learning_rate": 6.603510347948556e-05, "loss": 0.1587, "step": 2175 }, { "epoch": 0.45675902602854745, "grad_norm": 0.1315564215183258, "learning_rate": 6.60003936240172e-05, "loss": 0.1729, "step": 2176 }, { "epoch": 0.45696893366918556, "grad_norm": 0.12284686416387558, "learning_rate": 6.596567517597196e-05, "loss": 0.1803, "step": 2177 }, { "epoch": 0.45717884130982367, "grad_norm": 0.1192544549703598, "learning_rate": 6.593094815399446e-05, "loss": 0.1685, "step": 2178 }, { "epoch": 0.4573887489504618, "grad_norm": 0.11488565057516098, "learning_rate": 6.58962125767339e-05, "loss": 0.1607, "step": 2179 }, { "epoch": 0.45759865659109994, "grad_norm": 0.12251152843236923, "learning_rate": 6.586146846284409e-05, "loss": 0.1671, "step": 2180 }, { "epoch": 0.45780856423173805, "grad_norm": 0.1218481957912445, "learning_rate": 6.582671583098337e-05, "loss": 0.1585, "step": 2181 }, { "epoch": 0.45801847187237615, "grad_norm": 0.11864989995956421, "learning_rate": 6.579195469981476e-05, "loss": 0.1597, "step": 2182 }, { "epoch": 0.45822837951301426, "grad_norm": 0.10890363156795502, "learning_rate": 6.575718508800574e-05, "loss": 0.1592, "step": 2183 }, { "epoch": 0.45843828715365237, "grad_norm": 0.11092708259820938, "learning_rate": 6.57224070142284e-05, "loss": 0.1653, "step": 2184 }, { "epoch": 0.45864819479429053, "grad_norm": 0.11390335112810135, "learning_rate": 6.568762049715933e-05, "loss": 0.1637, "step": 2185 }, { "epoch": 0.45885810243492864, "grad_norm": 0.12344920635223389, "learning_rate": 6.565282555547975e-05, "loss": 0.1569, "step": 2186 }, { "epoch": 0.45906801007556675, "grad_norm": 0.11731364578008652, "learning_rate": 6.561802220787526e-05, "loss": 0.167, "step": 2187 }, { "epoch": 0.45927791771620485, "grad_norm": 0.11007954180240631, "learning_rate": 6.558321047303612e-05, "loss": 0.1587, "step": 2188 }, { "epoch": 0.459487825356843, "grad_norm": 0.1331484168767929, "learning_rate": 6.554839036965696e-05, "loss": 0.139, "step": 2189 }, { "epoch": 0.4596977329974811, "grad_norm": 0.11050381511449814, "learning_rate": 6.551356191643705e-05, "loss": 0.1589, "step": 2190 }, { "epoch": 0.45990764063811923, "grad_norm": 0.12360599637031555, "learning_rate": 6.547872513208e-05, "loss": 0.1571, "step": 2191 }, { "epoch": 0.46011754827875734, "grad_norm": 0.11798024922609329, "learning_rate": 6.5443880035294e-05, "loss": 0.1574, "step": 2192 }, { "epoch": 0.46032745591939545, "grad_norm": 0.11354199051856995, "learning_rate": 6.540902664479164e-05, "loss": 0.1553, "step": 2193 }, { "epoch": 0.4605373635600336, "grad_norm": 0.131741002202034, "learning_rate": 6.537416497928998e-05, "loss": 0.1608, "step": 2194 }, { "epoch": 0.4607472712006717, "grad_norm": 0.122543103992939, "learning_rate": 6.533929505751055e-05, "loss": 0.1542, "step": 2195 }, { "epoch": 0.4609571788413098, "grad_norm": 0.11904080957174301, "learning_rate": 6.530441689817929e-05, "loss": 0.1704, "step": 2196 }, { "epoch": 0.46116708648194793, "grad_norm": 0.12378960847854614, "learning_rate": 6.526953052002657e-05, "loss": 0.1644, "step": 2197 }, { "epoch": 0.46137699412258604, "grad_norm": 0.11525364220142365, "learning_rate": 6.523463594178716e-05, "loss": 0.1706, "step": 2198 }, { "epoch": 0.4615869017632242, "grad_norm": 0.12415199726819992, "learning_rate": 6.519973318220025e-05, "loss": 0.1585, "step": 2199 }, { "epoch": 0.4617968094038623, "grad_norm": 0.12286436557769775, "learning_rate": 6.516482226000945e-05, "loss": 0.1526, "step": 2200 }, { "epoch": 0.4620067170445004, "grad_norm": 0.11900891363620758, "learning_rate": 6.512990319396266e-05, "loss": 0.1455, "step": 2201 }, { "epoch": 0.4622166246851385, "grad_norm": 0.12508201599121094, "learning_rate": 6.509497600281226e-05, "loss": 0.1577, "step": 2202 }, { "epoch": 0.4624265323257767, "grad_norm": 0.1479506492614746, "learning_rate": 6.506004070531494e-05, "loss": 0.1578, "step": 2203 }, { "epoch": 0.4626364399664148, "grad_norm": 0.1091904416680336, "learning_rate": 6.502509732023177e-05, "loss": 0.1482, "step": 2204 }, { "epoch": 0.4628463476070529, "grad_norm": 0.12727074325084686, "learning_rate": 6.499014586632812e-05, "loss": 0.1545, "step": 2205 }, { "epoch": 0.463056255247691, "grad_norm": 0.11460824310779572, "learning_rate": 6.495518636237373e-05, "loss": 0.1647, "step": 2206 }, { "epoch": 0.4632661628883291, "grad_norm": 0.12993770837783813, "learning_rate": 6.492021882714263e-05, "loss": 0.1628, "step": 2207 }, { "epoch": 0.4634760705289673, "grad_norm": 0.1012057512998581, "learning_rate": 6.488524327941323e-05, "loss": 0.1633, "step": 2208 }, { "epoch": 0.4636859781696054, "grad_norm": 0.09874848276376724, "learning_rate": 6.485025973796817e-05, "loss": 0.1434, "step": 2209 }, { "epoch": 0.4638958858102435, "grad_norm": 0.12227320671081543, "learning_rate": 6.48152682215944e-05, "loss": 0.1718, "step": 2210 }, { "epoch": 0.4641057934508816, "grad_norm": 0.11163394153118134, "learning_rate": 6.47802687490832e-05, "loss": 0.161, "step": 2211 }, { "epoch": 0.4643157010915197, "grad_norm": 0.12014634162187576, "learning_rate": 6.474526133923006e-05, "loss": 0.1624, "step": 2212 }, { "epoch": 0.46452560873215787, "grad_norm": 0.11568920314311981, "learning_rate": 6.471024601083475e-05, "loss": 0.1492, "step": 2213 }, { "epoch": 0.464735516372796, "grad_norm": 0.12237660586833954, "learning_rate": 6.467522278270133e-05, "loss": 0.1574, "step": 2214 }, { "epoch": 0.4649454240134341, "grad_norm": 0.0965287908911705, "learning_rate": 6.464019167363804e-05, "loss": 0.16, "step": 2215 }, { "epoch": 0.4651553316540722, "grad_norm": 0.12272460013628006, "learning_rate": 6.460515270245742e-05, "loss": 0.1649, "step": 2216 }, { "epoch": 0.46536523929471035, "grad_norm": 0.12838344275951385, "learning_rate": 6.457010588797619e-05, "loss": 0.1565, "step": 2217 }, { "epoch": 0.46557514693534846, "grad_norm": 0.13334384560585022, "learning_rate": 6.453505124901526e-05, "loss": 0.1387, "step": 2218 }, { "epoch": 0.46578505457598657, "grad_norm": 0.10292670875787735, "learning_rate": 6.44999888043998e-05, "loss": 0.1581, "step": 2219 }, { "epoch": 0.4659949622166247, "grad_norm": 0.1194579005241394, "learning_rate": 6.446491857295914e-05, "loss": 0.1673, "step": 2220 }, { "epoch": 0.4662048698572628, "grad_norm": 0.1133674830198288, "learning_rate": 6.442984057352679e-05, "loss": 0.1623, "step": 2221 }, { "epoch": 0.46641477749790095, "grad_norm": 0.10083403438329697, "learning_rate": 6.439475482494043e-05, "loss": 0.1565, "step": 2222 }, { "epoch": 0.46662468513853905, "grad_norm": 0.1386317014694214, "learning_rate": 6.435966134604195e-05, "loss": 0.18, "step": 2223 }, { "epoch": 0.46683459277917716, "grad_norm": 0.11142829060554504, "learning_rate": 6.432456015567728e-05, "loss": 0.167, "step": 2224 }, { "epoch": 0.46704450041981527, "grad_norm": 0.11028482764959335, "learning_rate": 6.428945127269663e-05, "loss": 0.1711, "step": 2225 }, { "epoch": 0.4672544080604534, "grad_norm": 0.11065543442964554, "learning_rate": 6.425433471595422e-05, "loss": 0.1526, "step": 2226 }, { "epoch": 0.46746431570109154, "grad_norm": 0.11224319040775299, "learning_rate": 6.421921050430849e-05, "loss": 0.1582, "step": 2227 }, { "epoch": 0.46767422334172964, "grad_norm": 0.12173019349575043, "learning_rate": 6.418407865662189e-05, "loss": 0.1714, "step": 2228 }, { "epoch": 0.46788413098236775, "grad_norm": 0.11253444850444794, "learning_rate": 6.414893919176106e-05, "loss": 0.1751, "step": 2229 }, { "epoch": 0.46809403862300586, "grad_norm": 0.1227567121386528, "learning_rate": 6.411379212859669e-05, "loss": 0.1558, "step": 2230 }, { "epoch": 0.468303946263644, "grad_norm": 0.12723511457443237, "learning_rate": 6.407863748600355e-05, "loss": 0.1574, "step": 2231 }, { "epoch": 0.46851385390428213, "grad_norm": 0.10584603250026703, "learning_rate": 6.40434752828605e-05, "loss": 0.156, "step": 2232 }, { "epoch": 0.46872376154492024, "grad_norm": 0.13455379009246826, "learning_rate": 6.400830553805043e-05, "loss": 0.157, "step": 2233 }, { "epoch": 0.46893366918555834, "grad_norm": 0.10209176689386368, "learning_rate": 6.397312827046029e-05, "loss": 0.161, "step": 2234 }, { "epoch": 0.46914357682619645, "grad_norm": 0.15381069481372833, "learning_rate": 6.393794349898111e-05, "loss": 0.1614, "step": 2235 }, { "epoch": 0.4693534844668346, "grad_norm": 0.12260998785495758, "learning_rate": 6.390275124250788e-05, "loss": 0.1451, "step": 2236 }, { "epoch": 0.4695633921074727, "grad_norm": 0.12964041531085968, "learning_rate": 6.386755151993967e-05, "loss": 0.1464, "step": 2237 }, { "epoch": 0.46977329974811083, "grad_norm": 0.14182746410369873, "learning_rate": 6.383234435017952e-05, "loss": 0.1666, "step": 2238 }, { "epoch": 0.46998320738874894, "grad_norm": 0.11773552000522614, "learning_rate": 6.37971297521345e-05, "loss": 0.1609, "step": 2239 }, { "epoch": 0.47019311502938704, "grad_norm": 0.12091661989688873, "learning_rate": 6.376190774471562e-05, "loss": 0.161, "step": 2240 }, { "epoch": 0.4704030226700252, "grad_norm": 0.12359412759542465, "learning_rate": 6.372667834683794e-05, "loss": 0.1407, "step": 2241 }, { "epoch": 0.4706129303106633, "grad_norm": 0.11386679857969284, "learning_rate": 6.369144157742045e-05, "loss": 0.1644, "step": 2242 }, { "epoch": 0.4708228379513014, "grad_norm": 0.12322553992271423, "learning_rate": 6.365619745538607e-05, "loss": 0.1661, "step": 2243 }, { "epoch": 0.47103274559193953, "grad_norm": 0.11949003487825394, "learning_rate": 6.362094599966171e-05, "loss": 0.1469, "step": 2244 }, { "epoch": 0.4712426532325777, "grad_norm": 0.11004175245761871, "learning_rate": 6.358568722917824e-05, "loss": 0.1452, "step": 2245 }, { "epoch": 0.4714525608732158, "grad_norm": 0.11459292471408844, "learning_rate": 6.355042116287038e-05, "loss": 0.166, "step": 2246 }, { "epoch": 0.4716624685138539, "grad_norm": 0.12747009098529816, "learning_rate": 6.351514781967682e-05, "loss": 0.1472, "step": 2247 }, { "epoch": 0.471872376154492, "grad_norm": 0.11906711012125015, "learning_rate": 6.347986721854018e-05, "loss": 0.1686, "step": 2248 }, { "epoch": 0.4720822837951301, "grad_norm": 0.12261734902858734, "learning_rate": 6.344457937840693e-05, "loss": 0.1578, "step": 2249 }, { "epoch": 0.4722921914357683, "grad_norm": 0.12668012082576752, "learning_rate": 6.340928431822744e-05, "loss": 0.1583, "step": 2250 }, { "epoch": 0.4725020990764064, "grad_norm": 0.14289303123950958, "learning_rate": 6.337398205695597e-05, "loss": 0.1392, "step": 2251 }, { "epoch": 0.4727120067170445, "grad_norm": 0.12961414456367493, "learning_rate": 6.333867261355064e-05, "loss": 0.1697, "step": 2252 }, { "epoch": 0.4729219143576826, "grad_norm": 0.11781413853168488, "learning_rate": 6.330335600697344e-05, "loss": 0.163, "step": 2253 }, { "epoch": 0.4731318219983207, "grad_norm": 0.13710007071495056, "learning_rate": 6.326803225619018e-05, "loss": 0.1705, "step": 2254 }, { "epoch": 0.4733417296389589, "grad_norm": 0.14093898236751556, "learning_rate": 6.323270138017052e-05, "loss": 0.1622, "step": 2255 }, { "epoch": 0.473551637279597, "grad_norm": 0.12070971727371216, "learning_rate": 6.319736339788795e-05, "loss": 0.1289, "step": 2256 }, { "epoch": 0.4737615449202351, "grad_norm": 0.12553520500659943, "learning_rate": 6.316201832831978e-05, "loss": 0.1514, "step": 2257 }, { "epoch": 0.4739714525608732, "grad_norm": 0.11646711826324463, "learning_rate": 6.312666619044712e-05, "loss": 0.1509, "step": 2258 }, { "epoch": 0.47418136020151136, "grad_norm": 0.12744106352329254, "learning_rate": 6.309130700325486e-05, "loss": 0.1599, "step": 2259 }, { "epoch": 0.47439126784214947, "grad_norm": 0.11816243827342987, "learning_rate": 6.305594078573172e-05, "loss": 0.1579, "step": 2260 }, { "epoch": 0.4746011754827876, "grad_norm": 0.1067761555314064, "learning_rate": 6.302056755687013e-05, "loss": 0.149, "step": 2261 }, { "epoch": 0.4748110831234257, "grad_norm": 0.1448090523481369, "learning_rate": 6.298518733566634e-05, "loss": 0.1471, "step": 2262 }, { "epoch": 0.4750209907640638, "grad_norm": 0.10772380977869034, "learning_rate": 6.294980014112035e-05, "loss": 0.1418, "step": 2263 }, { "epoch": 0.47523089840470195, "grad_norm": 0.11698709428310394, "learning_rate": 6.291440599223587e-05, "loss": 0.1535, "step": 2264 }, { "epoch": 0.47544080604534006, "grad_norm": 0.11132114380598068, "learning_rate": 6.287900490802038e-05, "loss": 0.1472, "step": 2265 }, { "epoch": 0.47565071368597817, "grad_norm": 0.10131499171257019, "learning_rate": 6.284359690748506e-05, "loss": 0.1649, "step": 2266 }, { "epoch": 0.4758606213266163, "grad_norm": 0.1225217878818512, "learning_rate": 6.280818200964484e-05, "loss": 0.1692, "step": 2267 }, { "epoch": 0.4760705289672544, "grad_norm": 0.11604775488376617, "learning_rate": 6.27727602335183e-05, "loss": 0.156, "step": 2268 }, { "epoch": 0.47628043660789254, "grad_norm": 0.09776950627565384, "learning_rate": 6.273733159812773e-05, "loss": 0.167, "step": 2269 }, { "epoch": 0.47649034424853065, "grad_norm": 0.127922922372818, "learning_rate": 6.27018961224992e-05, "loss": 0.1732, "step": 2270 }, { "epoch": 0.47670025188916876, "grad_norm": 0.1038019210100174, "learning_rate": 6.266645382566226e-05, "loss": 0.1538, "step": 2271 }, { "epoch": 0.47691015952980687, "grad_norm": 0.11374037712812424, "learning_rate": 6.26310047266503e-05, "loss": 0.1592, "step": 2272 }, { "epoch": 0.47712006717044503, "grad_norm": 0.1249479278922081, "learning_rate": 6.259554884450028e-05, "loss": 0.1638, "step": 2273 }, { "epoch": 0.47732997481108314, "grad_norm": 0.13736027479171753, "learning_rate": 6.25600861982528e-05, "loss": 0.181, "step": 2274 }, { "epoch": 0.47753988245172124, "grad_norm": 0.1172303855419159, "learning_rate": 6.252461680695214e-05, "loss": 0.1549, "step": 2275 }, { "epoch": 0.47774979009235935, "grad_norm": 0.1163245216012001, "learning_rate": 6.248914068964618e-05, "loss": 0.1821, "step": 2276 }, { "epoch": 0.47795969773299746, "grad_norm": 0.11509289592504501, "learning_rate": 6.245365786538636e-05, "loss": 0.1618, "step": 2277 }, { "epoch": 0.4781696053736356, "grad_norm": 0.11194191128015518, "learning_rate": 6.241816835322781e-05, "loss": 0.1384, "step": 2278 }, { "epoch": 0.4783795130142737, "grad_norm": 0.13771222531795502, "learning_rate": 6.238267217222919e-05, "loss": 0.1614, "step": 2279 }, { "epoch": 0.47858942065491183, "grad_norm": 0.1045057475566864, "learning_rate": 6.234716934145277e-05, "loss": 0.1522, "step": 2280 }, { "epoch": 0.47879932829554994, "grad_norm": 0.11645108461380005, "learning_rate": 6.231165987996436e-05, "loss": 0.1701, "step": 2281 }, { "epoch": 0.4790092359361881, "grad_norm": 0.12459120154380798, "learning_rate": 6.227614380683337e-05, "loss": 0.1764, "step": 2282 }, { "epoch": 0.4792191435768262, "grad_norm": 0.13852064311504364, "learning_rate": 6.224062114113275e-05, "loss": 0.1558, "step": 2283 }, { "epoch": 0.4794290512174643, "grad_norm": 0.10844726115465164, "learning_rate": 6.220509190193896e-05, "loss": 0.1665, "step": 2284 }, { "epoch": 0.4796389588581024, "grad_norm": 0.12302237004041672, "learning_rate": 6.216955610833203e-05, "loss": 0.1694, "step": 2285 }, { "epoch": 0.47984886649874053, "grad_norm": 0.11576231569051743, "learning_rate": 6.213401377939548e-05, "loss": 0.1683, "step": 2286 }, { "epoch": 0.4800587741393787, "grad_norm": 0.1322503685951233, "learning_rate": 6.209846493421637e-05, "loss": 0.167, "step": 2287 }, { "epoch": 0.4802686817800168, "grad_norm": 0.11987553536891937, "learning_rate": 6.206290959188523e-05, "loss": 0.1593, "step": 2288 }, { "epoch": 0.4804785894206549, "grad_norm": 0.11214146018028259, "learning_rate": 6.20273477714961e-05, "loss": 0.1829, "step": 2289 }, { "epoch": 0.480688497061293, "grad_norm": 0.11640086024999619, "learning_rate": 6.199177949214649e-05, "loss": 0.1542, "step": 2290 }, { "epoch": 0.4808984047019311, "grad_norm": 0.10756421089172363, "learning_rate": 6.195620477293734e-05, "loss": 0.1394, "step": 2291 }, { "epoch": 0.4811083123425693, "grad_norm": 0.11862246692180634, "learning_rate": 6.192062363297315e-05, "loss": 0.1585, "step": 2292 }, { "epoch": 0.4813182199832074, "grad_norm": 0.10187924653291702, "learning_rate": 6.188503609136175e-05, "loss": 0.1536, "step": 2293 }, { "epoch": 0.4815281276238455, "grad_norm": 0.1261720061302185, "learning_rate": 6.18494421672145e-05, "loss": 0.1619, "step": 2294 }, { "epoch": 0.4817380352644836, "grad_norm": 0.11360353976488113, "learning_rate": 6.181384187964613e-05, "loss": 0.1417, "step": 2295 }, { "epoch": 0.4819479429051218, "grad_norm": 0.1149376705288887, "learning_rate": 6.177823524777481e-05, "loss": 0.1569, "step": 2296 }, { "epoch": 0.4821578505457599, "grad_norm": 0.10513467341661453, "learning_rate": 6.174262229072212e-05, "loss": 0.138, "step": 2297 }, { "epoch": 0.482367758186398, "grad_norm": 0.10287132859230042, "learning_rate": 6.170700302761303e-05, "loss": 0.16, "step": 2298 }, { "epoch": 0.4825776658270361, "grad_norm": 0.12048831582069397, "learning_rate": 6.16713774775759e-05, "loss": 0.1764, "step": 2299 }, { "epoch": 0.4827875734676742, "grad_norm": 0.14703017473220825, "learning_rate": 6.163574565974246e-05, "loss": 0.1571, "step": 2300 }, { "epoch": 0.48299748110831237, "grad_norm": 0.10593768209218979, "learning_rate": 6.160010759324783e-05, "loss": 0.1544, "step": 2301 }, { "epoch": 0.4832073887489505, "grad_norm": 0.1123269572854042, "learning_rate": 6.156446329723042e-05, "loss": 0.1518, "step": 2302 }, { "epoch": 0.4834172963895886, "grad_norm": 0.12359534204006195, "learning_rate": 6.152881279083209e-05, "loss": 0.1573, "step": 2303 }, { "epoch": 0.4836272040302267, "grad_norm": 0.11115585267543793, "learning_rate": 6.149315609319793e-05, "loss": 0.1718, "step": 2304 }, { "epoch": 0.4838371116708648, "grad_norm": 0.12576039135456085, "learning_rate": 6.145749322347642e-05, "loss": 0.1581, "step": 2305 }, { "epoch": 0.48404701931150296, "grad_norm": 0.1161019429564476, "learning_rate": 6.142182420081931e-05, "loss": 0.1493, "step": 2306 }, { "epoch": 0.48425692695214106, "grad_norm": 0.10857407003641129, "learning_rate": 6.138614904438172e-05, "loss": 0.1642, "step": 2307 }, { "epoch": 0.48446683459277917, "grad_norm": 0.1394800990819931, "learning_rate": 6.1350467773322e-05, "loss": 0.1528, "step": 2308 }, { "epoch": 0.4846767422334173, "grad_norm": 0.11143230646848679, "learning_rate": 6.131478040680183e-05, "loss": 0.1509, "step": 2309 }, { "epoch": 0.48488664987405544, "grad_norm": 0.13411763310432434, "learning_rate": 6.12790869639861e-05, "loss": 0.1417, "step": 2310 }, { "epoch": 0.48509655751469355, "grad_norm": 0.1560184210538864, "learning_rate": 6.124338746404305e-05, "loss": 0.157, "step": 2311 }, { "epoch": 0.48530646515533166, "grad_norm": 0.13342639803886414, "learning_rate": 6.12076819261441e-05, "loss": 0.1601, "step": 2312 }, { "epoch": 0.48551637279596976, "grad_norm": 0.16688786447048187, "learning_rate": 6.117197036946394e-05, "loss": 0.168, "step": 2313 }, { "epoch": 0.48572628043660787, "grad_norm": 0.10276120901107788, "learning_rate": 6.11362528131805e-05, "loss": 0.1504, "step": 2314 }, { "epoch": 0.48593618807724603, "grad_norm": 0.1053510457277298, "learning_rate": 6.110052927647491e-05, "loss": 0.1517, "step": 2315 }, { "epoch": 0.48614609571788414, "grad_norm": 0.10654187202453613, "learning_rate": 6.106479977853154e-05, "loss": 0.1563, "step": 2316 }, { "epoch": 0.48635600335852225, "grad_norm": 0.11610613763332367, "learning_rate": 6.102906433853796e-05, "loss": 0.1549, "step": 2317 }, { "epoch": 0.48656591099916036, "grad_norm": 0.11274497210979462, "learning_rate": 6.099332297568488e-05, "loss": 0.1561, "step": 2318 }, { "epoch": 0.48677581863979846, "grad_norm": 0.12665775418281555, "learning_rate": 6.095757570916626e-05, "loss": 0.1618, "step": 2319 }, { "epoch": 0.4869857262804366, "grad_norm": 0.12388937175273895, "learning_rate": 6.092182255817918e-05, "loss": 0.1636, "step": 2320 }, { "epoch": 0.48719563392107473, "grad_norm": 0.09930404275655746, "learning_rate": 6.088606354192392e-05, "loss": 0.1542, "step": 2321 }, { "epoch": 0.48740554156171284, "grad_norm": 0.09602123498916626, "learning_rate": 6.0850298679603845e-05, "loss": 0.142, "step": 2322 }, { "epoch": 0.48761544920235095, "grad_norm": 0.1361406445503235, "learning_rate": 6.0814527990425555e-05, "loss": 0.1487, "step": 2323 }, { "epoch": 0.4878253568429891, "grad_norm": 0.11267097294330597, "learning_rate": 6.0778751493598686e-05, "loss": 0.1535, "step": 2324 }, { "epoch": 0.4880352644836272, "grad_norm": 0.11054635792970657, "learning_rate": 6.0742969208336074e-05, "loss": 0.1473, "step": 2325 }, { "epoch": 0.4882451721242653, "grad_norm": 0.12421461194753647, "learning_rate": 6.070718115385359e-05, "loss": 0.1659, "step": 2326 }, { "epoch": 0.48845507976490343, "grad_norm": 0.1250905990600586, "learning_rate": 6.0671387349370246e-05, "loss": 0.1605, "step": 2327 }, { "epoch": 0.48866498740554154, "grad_norm": 0.13226497173309326, "learning_rate": 6.0635587814108143e-05, "loss": 0.1671, "step": 2328 }, { "epoch": 0.4888748950461797, "grad_norm": 0.1366906613111496, "learning_rate": 6.059978256729244e-05, "loss": 0.1671, "step": 2329 }, { "epoch": 0.4890848026868178, "grad_norm": 0.11474860459566116, "learning_rate": 6.056397162815137e-05, "loss": 0.1408, "step": 2330 }, { "epoch": 0.4892947103274559, "grad_norm": 0.14364750683307648, "learning_rate": 6.052815501591622e-05, "loss": 0.1577, "step": 2331 }, { "epoch": 0.489504617968094, "grad_norm": 0.10247082263231277, "learning_rate": 6.0492332749821345e-05, "loss": 0.1471, "step": 2332 }, { "epoch": 0.48971452560873213, "grad_norm": 0.12132863700389862, "learning_rate": 6.045650484910412e-05, "loss": 0.1532, "step": 2333 }, { "epoch": 0.4899244332493703, "grad_norm": 0.11156227439641953, "learning_rate": 6.042067133300494e-05, "loss": 0.1455, "step": 2334 }, { "epoch": 0.4901343408900084, "grad_norm": 0.11364928632974625, "learning_rate": 6.038483222076722e-05, "loss": 0.166, "step": 2335 }, { "epoch": 0.4903442485306465, "grad_norm": 0.11692879348993301, "learning_rate": 6.034898753163738e-05, "loss": 0.1474, "step": 2336 }, { "epoch": 0.4905541561712846, "grad_norm": 0.13082164525985718, "learning_rate": 6.031313728486483e-05, "loss": 0.1598, "step": 2337 }, { "epoch": 0.4907640638119228, "grad_norm": 0.11700566112995148, "learning_rate": 6.0277281499701986e-05, "loss": 0.1662, "step": 2338 }, { "epoch": 0.4909739714525609, "grad_norm": 0.12339949607849121, "learning_rate": 6.0241420195404216e-05, "loss": 0.1474, "step": 2339 }, { "epoch": 0.491183879093199, "grad_norm": 0.12292884290218353, "learning_rate": 6.020555339122987e-05, "loss": 0.1556, "step": 2340 }, { "epoch": 0.4913937867338371, "grad_norm": 0.1513158679008484, "learning_rate": 6.0169681106440236e-05, "loss": 0.1432, "step": 2341 }, { "epoch": 0.4916036943744752, "grad_norm": 0.12808825075626373, "learning_rate": 6.0133803360299544e-05, "loss": 0.142, "step": 2342 }, { "epoch": 0.49181360201511337, "grad_norm": 0.10223434865474701, "learning_rate": 6.009792017207496e-05, "loss": 0.1619, "step": 2343 }, { "epoch": 0.4920235096557515, "grad_norm": 0.1480330377817154, "learning_rate": 6.006203156103659e-05, "loss": 0.1505, "step": 2344 }, { "epoch": 0.4922334172963896, "grad_norm": 0.11763409525156021, "learning_rate": 6.002613754645743e-05, "loss": 0.1559, "step": 2345 }, { "epoch": 0.4924433249370277, "grad_norm": 0.1263841837644577, "learning_rate": 5.99902381476134e-05, "loss": 0.147, "step": 2346 }, { "epoch": 0.4926532325776658, "grad_norm": 0.12160757929086685, "learning_rate": 5.9954333383783256e-05, "loss": 0.1588, "step": 2347 }, { "epoch": 0.49286314021830396, "grad_norm": 0.20746250450611115, "learning_rate": 5.991842327424871e-05, "loss": 0.1598, "step": 2348 }, { "epoch": 0.49307304785894207, "grad_norm": 0.12765191495418549, "learning_rate": 5.988250783829432e-05, "loss": 0.1618, "step": 2349 }, { "epoch": 0.4932829554995802, "grad_norm": 0.10818785429000854, "learning_rate": 5.9846587095207474e-05, "loss": 0.1521, "step": 2350 }, { "epoch": 0.4934928631402183, "grad_norm": 0.13110002875328064, "learning_rate": 5.981066106427843e-05, "loss": 0.1658, "step": 2351 }, { "epoch": 0.49370277078085645, "grad_norm": 0.11284149438142776, "learning_rate": 5.97747297648003e-05, "loss": 0.1578, "step": 2352 }, { "epoch": 0.49391267842149456, "grad_norm": 0.12948429584503174, "learning_rate": 5.973879321606901e-05, "loss": 0.1439, "step": 2353 }, { "epoch": 0.49412258606213266, "grad_norm": 0.1391395777463913, "learning_rate": 5.970285143738331e-05, "loss": 0.1697, "step": 2354 }, { "epoch": 0.49433249370277077, "grad_norm": 0.11676032096147537, "learning_rate": 5.966690444804474e-05, "loss": 0.1543, "step": 2355 }, { "epoch": 0.4945424013434089, "grad_norm": 0.1376785784959793, "learning_rate": 5.963095226735769e-05, "loss": 0.1547, "step": 2356 }, { "epoch": 0.49475230898404704, "grad_norm": 0.1433243304491043, "learning_rate": 5.9594994914629244e-05, "loss": 0.1516, "step": 2357 }, { "epoch": 0.49496221662468515, "grad_norm": 0.12082047760486603, "learning_rate": 5.955903240916938e-05, "loss": 0.1679, "step": 2358 }, { "epoch": 0.49517212426532325, "grad_norm": 0.0982949510216713, "learning_rate": 5.952306477029074e-05, "loss": 0.1559, "step": 2359 }, { "epoch": 0.49538203190596136, "grad_norm": 0.13121630251407623, "learning_rate": 5.9487092017308806e-05, "loss": 0.1696, "step": 2360 }, { "epoch": 0.49559193954659947, "grad_norm": 0.13477589190006256, "learning_rate": 5.9451114169541744e-05, "loss": 0.1565, "step": 2361 }, { "epoch": 0.49580184718723763, "grad_norm": 0.11858514696359634, "learning_rate": 5.94151312463105e-05, "loss": 0.1648, "step": 2362 }, { "epoch": 0.49601175482787574, "grad_norm": 0.0997430831193924, "learning_rate": 5.93791432669387e-05, "loss": 0.1547, "step": 2363 }, { "epoch": 0.49622166246851385, "grad_norm": 0.11332326382398605, "learning_rate": 5.934315025075272e-05, "loss": 0.1512, "step": 2364 }, { "epoch": 0.49643157010915195, "grad_norm": 0.1203446090221405, "learning_rate": 5.930715221708163e-05, "loss": 0.1677, "step": 2365 }, { "epoch": 0.4966414777497901, "grad_norm": 0.12282302975654602, "learning_rate": 5.927114918525722e-05, "loss": 0.1525, "step": 2366 }, { "epoch": 0.4968513853904282, "grad_norm": 0.10741288959980011, "learning_rate": 5.923514117461391e-05, "loss": 0.1345, "step": 2367 }, { "epoch": 0.49706129303106633, "grad_norm": 0.10739348083734512, "learning_rate": 5.919912820448884e-05, "loss": 0.1439, "step": 2368 }, { "epoch": 0.49727120067170444, "grad_norm": 0.11860400438308716, "learning_rate": 5.9163110294221804e-05, "loss": 0.1743, "step": 2369 }, { "epoch": 0.49748110831234255, "grad_norm": 0.1082897037267685, "learning_rate": 5.912708746315522e-05, "loss": 0.1542, "step": 2370 }, { "epoch": 0.4976910159529807, "grad_norm": 0.11466725915670395, "learning_rate": 5.909105973063419e-05, "loss": 0.16, "step": 2371 }, { "epoch": 0.4979009235936188, "grad_norm": 0.1288958340883255, "learning_rate": 5.905502711600641e-05, "loss": 0.1641, "step": 2372 }, { "epoch": 0.4981108312342569, "grad_norm": 0.1337483674287796, "learning_rate": 5.9018989638622234e-05, "loss": 0.1446, "step": 2373 }, { "epoch": 0.49832073887489503, "grad_norm": 0.12721973657608032, "learning_rate": 5.8982947317834615e-05, "loss": 0.1371, "step": 2374 }, { "epoch": 0.49853064651553314, "grad_norm": 0.13450993597507477, "learning_rate": 5.894690017299909e-05, "loss": 0.1605, "step": 2375 }, { "epoch": 0.4987405541561713, "grad_norm": 0.130893275141716, "learning_rate": 5.891084822347379e-05, "loss": 0.1582, "step": 2376 }, { "epoch": 0.4989504617968094, "grad_norm": 0.10963684320449829, "learning_rate": 5.887479148861948e-05, "loss": 0.17, "step": 2377 }, { "epoch": 0.4991603694374475, "grad_norm": 0.12141723185777664, "learning_rate": 5.883872998779939e-05, "loss": 0.1423, "step": 2378 }, { "epoch": 0.4993702770780856, "grad_norm": 0.12254346907138824, "learning_rate": 5.8802663740379414e-05, "loss": 0.1598, "step": 2379 }, { "epoch": 0.4995801847187238, "grad_norm": 0.1324474960565567, "learning_rate": 5.8766592765727934e-05, "loss": 0.1687, "step": 2380 }, { "epoch": 0.4997900923593619, "grad_norm": 0.13154703378677368, "learning_rate": 5.873051708321589e-05, "loss": 0.1434, "step": 2381 }, { "epoch": 0.5, "grad_norm": 0.10656025260686874, "learning_rate": 5.869443671221673e-05, "loss": 0.1556, "step": 2382 }, { "epoch": 0.5002099076406381, "grad_norm": 0.11154552549123764, "learning_rate": 5.865835167210647e-05, "loss": 0.1674, "step": 2383 }, { "epoch": 0.5004198152812762, "grad_norm": 0.12540102005004883, "learning_rate": 5.862226198226357e-05, "loss": 0.1599, "step": 2384 }, { "epoch": 0.5006297229219143, "grad_norm": 0.10726683586835861, "learning_rate": 5.858616766206905e-05, "loss": 0.164, "step": 2385 }, { "epoch": 0.5008396305625524, "grad_norm": 0.10406230390071869, "learning_rate": 5.855006873090635e-05, "loss": 0.1492, "step": 2386 }, { "epoch": 0.5010495382031906, "grad_norm": 0.12061581760644913, "learning_rate": 5.851396520816146e-05, "loss": 0.166, "step": 2387 }, { "epoch": 0.5012594458438288, "grad_norm": 0.12048531323671341, "learning_rate": 5.8477857113222745e-05, "loss": 0.1409, "step": 2388 }, { "epoch": 0.5014693534844669, "grad_norm": 0.11254918575286865, "learning_rate": 5.8441744465481115e-05, "loss": 0.1502, "step": 2389 }, { "epoch": 0.501679261125105, "grad_norm": 0.1056586280465126, "learning_rate": 5.840562728432988e-05, "loss": 0.1677, "step": 2390 }, { "epoch": 0.5018891687657431, "grad_norm": 0.1021779254078865, "learning_rate": 5.836950558916481e-05, "loss": 0.1671, "step": 2391 }, { "epoch": 0.5020990764063812, "grad_norm": 0.1173490360379219, "learning_rate": 5.8333379399384026e-05, "loss": 0.1707, "step": 2392 }, { "epoch": 0.5023089840470193, "grad_norm": 0.11009548604488373, "learning_rate": 5.829724873438816e-05, "loss": 0.1395, "step": 2393 }, { "epoch": 0.5025188916876574, "grad_norm": 0.10494174063205719, "learning_rate": 5.82611136135802e-05, "loss": 0.1598, "step": 2394 }, { "epoch": 0.5027287993282955, "grad_norm": 0.1159316897392273, "learning_rate": 5.822497405636553e-05, "loss": 0.1812, "step": 2395 }, { "epoch": 0.5029387069689337, "grad_norm": 0.12583115696907043, "learning_rate": 5.81888300821519e-05, "loss": 0.148, "step": 2396 }, { "epoch": 0.5031486146095718, "grad_norm": 0.13148032128810883, "learning_rate": 5.815268171034947e-05, "loss": 0.1589, "step": 2397 }, { "epoch": 0.5033585222502099, "grad_norm": 0.10882681608200073, "learning_rate": 5.811652896037073e-05, "loss": 0.1567, "step": 2398 }, { "epoch": 0.503568429890848, "grad_norm": 0.12563377618789673, "learning_rate": 5.808037185163054e-05, "loss": 0.1635, "step": 2399 }, { "epoch": 0.5037783375314862, "grad_norm": 0.11974649131298065, "learning_rate": 5.8044210403546096e-05, "loss": 0.1517, "step": 2400 }, { "epoch": 0.5039882451721243, "grad_norm": 0.12729738652706146, "learning_rate": 5.8008044635536916e-05, "loss": 0.1369, "step": 2401 }, { "epoch": 0.5041981528127624, "grad_norm": 0.13505300879478455, "learning_rate": 5.797187456702483e-05, "loss": 0.1695, "step": 2402 }, { "epoch": 0.5044080604534005, "grad_norm": 0.13184520602226257, "learning_rate": 5.7935700217434017e-05, "loss": 0.15, "step": 2403 }, { "epoch": 0.5046179680940386, "grad_norm": 0.13112464547157288, "learning_rate": 5.789952160619092e-05, "loss": 0.147, "step": 2404 }, { "epoch": 0.5048278757346767, "grad_norm": 0.12043753266334534, "learning_rate": 5.7863338752724275e-05, "loss": 0.1529, "step": 2405 }, { "epoch": 0.5050377833753149, "grad_norm": 0.11811091750860214, "learning_rate": 5.782715167646508e-05, "loss": 0.1357, "step": 2406 }, { "epoch": 0.505247691015953, "grad_norm": 0.10397012531757355, "learning_rate": 5.779096039684667e-05, "loss": 0.1423, "step": 2407 }, { "epoch": 0.5054575986565911, "grad_norm": 0.1074468344449997, "learning_rate": 5.7754764933304554e-05, "loss": 0.1691, "step": 2408 }, { "epoch": 0.5056675062972292, "grad_norm": 0.11894194036722183, "learning_rate": 5.7718565305276526e-05, "loss": 0.159, "step": 2409 }, { "epoch": 0.5058774139378673, "grad_norm": 0.11252366006374359, "learning_rate": 5.7682361532202634e-05, "loss": 0.1545, "step": 2410 }, { "epoch": 0.5060873215785054, "grad_norm": 0.14150729775428772, "learning_rate": 5.76461536335251e-05, "loss": 0.1618, "step": 2411 }, { "epoch": 0.5062972292191436, "grad_norm": 0.16456341743469238, "learning_rate": 5.7609941628688424e-05, "loss": 0.1666, "step": 2412 }, { "epoch": 0.5065071368597817, "grad_norm": 0.10939100384712219, "learning_rate": 5.757372553713923e-05, "loss": 0.1688, "step": 2413 }, { "epoch": 0.5067170445004198, "grad_norm": 0.11484073102474213, "learning_rate": 5.753750537832645e-05, "loss": 0.1712, "step": 2414 }, { "epoch": 0.506926952141058, "grad_norm": 0.1294430047273636, "learning_rate": 5.7501281171701106e-05, "loss": 0.1646, "step": 2415 }, { "epoch": 0.5071368597816961, "grad_norm": 0.15343913435935974, "learning_rate": 5.7465052936716425e-05, "loss": 0.1496, "step": 2416 }, { "epoch": 0.5073467674223342, "grad_norm": 0.10801581293344498, "learning_rate": 5.7428820692827776e-05, "loss": 0.144, "step": 2417 }, { "epoch": 0.5075566750629723, "grad_norm": 0.12166569381952286, "learning_rate": 5.739258445949275e-05, "loss": 0.1625, "step": 2418 }, { "epoch": 0.5077665827036104, "grad_norm": 0.0971064567565918, "learning_rate": 5.735634425617098e-05, "loss": 0.1623, "step": 2419 }, { "epoch": 0.5079764903442485, "grad_norm": 0.13411295413970947, "learning_rate": 5.7320100102324325e-05, "loss": 0.1556, "step": 2420 }, { "epoch": 0.5081863979848866, "grad_norm": 0.12255419790744781, "learning_rate": 5.7283852017416696e-05, "loss": 0.1536, "step": 2421 }, { "epoch": 0.5083963056255247, "grad_norm": 0.10330315679311752, "learning_rate": 5.724760002091417e-05, "loss": 0.1439, "step": 2422 }, { "epoch": 0.5086062132661628, "grad_norm": 0.1231582835316658, "learning_rate": 5.721134413228486e-05, "loss": 0.1585, "step": 2423 }, { "epoch": 0.5088161209068011, "grad_norm": 0.11223944276571274, "learning_rate": 5.717508437099903e-05, "loss": 0.1564, "step": 2424 }, { "epoch": 0.5090260285474392, "grad_norm": 0.12430944293737411, "learning_rate": 5.713882075652901e-05, "loss": 0.1611, "step": 2425 }, { "epoch": 0.5092359361880773, "grad_norm": 0.11220264434814453, "learning_rate": 5.7102553308349174e-05, "loss": 0.1527, "step": 2426 }, { "epoch": 0.5094458438287154, "grad_norm": 0.1385231614112854, "learning_rate": 5.706628204593596e-05, "loss": 0.15, "step": 2427 }, { "epoch": 0.5096557514693535, "grad_norm": 0.12652051448822021, "learning_rate": 5.703000698876788e-05, "loss": 0.148, "step": 2428 }, { "epoch": 0.5098656591099916, "grad_norm": 0.11996873468160629, "learning_rate": 5.6993728156325466e-05, "loss": 0.1569, "step": 2429 }, { "epoch": 0.5100755667506297, "grad_norm": 0.1229998916387558, "learning_rate": 5.6957445568091284e-05, "loss": 0.152, "step": 2430 }, { "epoch": 0.5102854743912678, "grad_norm": 0.10919797420501709, "learning_rate": 5.692115924354989e-05, "loss": 0.1454, "step": 2431 }, { "epoch": 0.5104953820319059, "grad_norm": 0.13112346827983856, "learning_rate": 5.6884869202187906e-05, "loss": 0.1603, "step": 2432 }, { "epoch": 0.510705289672544, "grad_norm": 0.11570712178945541, "learning_rate": 5.684857546349388e-05, "loss": 0.1649, "step": 2433 }, { "epoch": 0.5109151973131822, "grad_norm": 0.12106207013130188, "learning_rate": 5.68122780469584e-05, "loss": 0.1573, "step": 2434 }, { "epoch": 0.5111251049538204, "grad_norm": 0.13014116883277893, "learning_rate": 5.6775976972073985e-05, "loss": 0.1487, "step": 2435 }, { "epoch": 0.5113350125944585, "grad_norm": 0.12581390142440796, "learning_rate": 5.673967225833519e-05, "loss": 0.1679, "step": 2436 }, { "epoch": 0.5115449202350966, "grad_norm": 0.15242688357830048, "learning_rate": 5.6703363925238404e-05, "loss": 0.1614, "step": 2437 }, { "epoch": 0.5117548278757347, "grad_norm": 0.11680883914232254, "learning_rate": 5.6667051992282094e-05, "loss": 0.16, "step": 2438 }, { "epoch": 0.5119647355163728, "grad_norm": 0.10956526547670364, "learning_rate": 5.663073647896656e-05, "loss": 0.1488, "step": 2439 }, { "epoch": 0.5121746431570109, "grad_norm": 0.1150960773229599, "learning_rate": 5.6594417404794086e-05, "loss": 0.1479, "step": 2440 }, { "epoch": 0.512384550797649, "grad_norm": 0.11409033834934235, "learning_rate": 5.655809478926881e-05, "loss": 0.162, "step": 2441 }, { "epoch": 0.5125944584382871, "grad_norm": 0.13178421556949615, "learning_rate": 5.6521768651896855e-05, "loss": 0.1529, "step": 2442 }, { "epoch": 0.5128043660789253, "grad_norm": 0.10844630002975464, "learning_rate": 5.648543901218615e-05, "loss": 0.1551, "step": 2443 }, { "epoch": 0.5130142737195634, "grad_norm": 0.10883074998855591, "learning_rate": 5.644910588964656e-05, "loss": 0.1396, "step": 2444 }, { "epoch": 0.5132241813602015, "grad_norm": 0.11679378896951675, "learning_rate": 5.641276930378978e-05, "loss": 0.1572, "step": 2445 }, { "epoch": 0.5134340890008396, "grad_norm": 0.11967083066701889, "learning_rate": 5.637642927412942e-05, "loss": 0.1731, "step": 2446 }, { "epoch": 0.5136439966414778, "grad_norm": 0.10097982734441757, "learning_rate": 5.634008582018086e-05, "loss": 0.1567, "step": 2447 }, { "epoch": 0.5138539042821159, "grad_norm": 0.11456773430109024, "learning_rate": 5.630373896146142e-05, "loss": 0.1783, "step": 2448 }, { "epoch": 0.514063811922754, "grad_norm": 0.12176181375980377, "learning_rate": 5.6267388717490144e-05, "loss": 0.1558, "step": 2449 }, { "epoch": 0.5142737195633921, "grad_norm": 0.11129914224147797, "learning_rate": 5.623103510778798e-05, "loss": 0.1558, "step": 2450 }, { "epoch": 0.5144836272040302, "grad_norm": 0.11947834491729736, "learning_rate": 5.619467815187762e-05, "loss": 0.1612, "step": 2451 }, { "epoch": 0.5146935348446684, "grad_norm": 0.11735592782497406, "learning_rate": 5.615831786928358e-05, "loss": 0.1593, "step": 2452 }, { "epoch": 0.5149034424853065, "grad_norm": 0.10720401257276535, "learning_rate": 5.6121954279532206e-05, "loss": 0.1529, "step": 2453 }, { "epoch": 0.5151133501259446, "grad_norm": 0.11391155421733856, "learning_rate": 5.608558740215152e-05, "loss": 0.1464, "step": 2454 }, { "epoch": 0.5153232577665827, "grad_norm": 0.12067031115293503, "learning_rate": 5.604921725667139e-05, "loss": 0.1541, "step": 2455 }, { "epoch": 0.5155331654072208, "grad_norm": 0.12708424031734467, "learning_rate": 5.601284386262342e-05, "loss": 0.1629, "step": 2456 }, { "epoch": 0.5157430730478589, "grad_norm": 0.11665681004524231, "learning_rate": 5.597646723954095e-05, "loss": 0.1685, "step": 2457 }, { "epoch": 0.515952980688497, "grad_norm": 0.11358939856290817, "learning_rate": 5.5940087406959054e-05, "loss": 0.147, "step": 2458 }, { "epoch": 0.5161628883291351, "grad_norm": 0.11850309371948242, "learning_rate": 5.5903704384414535e-05, "loss": 0.1663, "step": 2459 }, { "epoch": 0.5163727959697733, "grad_norm": 0.10734523087739944, "learning_rate": 5.586731819144591e-05, "loss": 0.1735, "step": 2460 }, { "epoch": 0.5165827036104114, "grad_norm": 0.11942029744386673, "learning_rate": 5.5830928847593376e-05, "loss": 0.1494, "step": 2461 }, { "epoch": 0.5167926112510496, "grad_norm": 0.0976463109254837, "learning_rate": 5.579453637239886e-05, "loss": 0.1525, "step": 2462 }, { "epoch": 0.5170025188916877, "grad_norm": 0.12405764311552048, "learning_rate": 5.5758140785405974e-05, "loss": 0.1342, "step": 2463 }, { "epoch": 0.5172124265323258, "grad_norm": 0.10457320511341095, "learning_rate": 5.5721742106159934e-05, "loss": 0.1381, "step": 2464 }, { "epoch": 0.5174223341729639, "grad_norm": 0.11121641099452972, "learning_rate": 5.568534035420771e-05, "loss": 0.1613, "step": 2465 }, { "epoch": 0.517632241813602, "grad_norm": 0.1370898187160492, "learning_rate": 5.564893554909781e-05, "loss": 0.16, "step": 2466 }, { "epoch": 0.5178421494542401, "grad_norm": 0.14508257806301117, "learning_rate": 5.561252771038052e-05, "loss": 0.159, "step": 2467 }, { "epoch": 0.5180520570948782, "grad_norm": 0.12698331475257874, "learning_rate": 5.557611685760763e-05, "loss": 0.155, "step": 2468 }, { "epoch": 0.5182619647355163, "grad_norm": 0.12996205687522888, "learning_rate": 5.5539703010332624e-05, "loss": 0.1562, "step": 2469 }, { "epoch": 0.5184718723761544, "grad_norm": 0.10742896795272827, "learning_rate": 5.550328618811055e-05, "loss": 0.1368, "step": 2470 }, { "epoch": 0.5186817800167927, "grad_norm": 0.11978666484355927, "learning_rate": 5.5466866410498095e-05, "loss": 0.1434, "step": 2471 }, { "epoch": 0.5188916876574308, "grad_norm": 0.1411794275045395, "learning_rate": 5.5430443697053494e-05, "loss": 0.1494, "step": 2472 }, { "epoch": 0.5191015952980689, "grad_norm": 0.12275288999080658, "learning_rate": 5.5394018067336586e-05, "loss": 0.145, "step": 2473 }, { "epoch": 0.519311502938707, "grad_norm": 0.1017669290304184, "learning_rate": 5.535758954090876e-05, "loss": 0.1433, "step": 2474 }, { "epoch": 0.5195214105793451, "grad_norm": 0.10941984504461288, "learning_rate": 5.5321158137332975e-05, "loss": 0.1615, "step": 2475 }, { "epoch": 0.5197313182199832, "grad_norm": 0.13621585071086884, "learning_rate": 5.5284723876173705e-05, "loss": 0.1633, "step": 2476 }, { "epoch": 0.5199412258606213, "grad_norm": 0.12817037105560303, "learning_rate": 5.524828677699703e-05, "loss": 0.1513, "step": 2477 }, { "epoch": 0.5201511335012594, "grad_norm": 0.125384122133255, "learning_rate": 5.521184685937044e-05, "loss": 0.1695, "step": 2478 }, { "epoch": 0.5203610411418975, "grad_norm": 0.12680883705615997, "learning_rate": 5.517540414286306e-05, "loss": 0.167, "step": 2479 }, { "epoch": 0.5205709487825357, "grad_norm": 0.11645618081092834, "learning_rate": 5.5138958647045426e-05, "loss": 0.1475, "step": 2480 }, { "epoch": 0.5207808564231738, "grad_norm": 0.11582057178020477, "learning_rate": 5.510251039148961e-05, "loss": 0.1616, "step": 2481 }, { "epoch": 0.520990764063812, "grad_norm": 0.11036519706249237, "learning_rate": 5.5066059395769164e-05, "loss": 0.1553, "step": 2482 }, { "epoch": 0.5212006717044501, "grad_norm": 0.16732360422611237, "learning_rate": 5.502960567945909e-05, "loss": 0.1551, "step": 2483 }, { "epoch": 0.5214105793450882, "grad_norm": 0.13218390941619873, "learning_rate": 5.49931492621359e-05, "loss": 0.1705, "step": 2484 }, { "epoch": 0.5216204869857263, "grad_norm": 0.13806398212909698, "learning_rate": 5.495669016337749e-05, "loss": 0.1754, "step": 2485 }, { "epoch": 0.5218303946263644, "grad_norm": 0.13826845586299896, "learning_rate": 5.492022840276324e-05, "loss": 0.1556, "step": 2486 }, { "epoch": 0.5220403022670025, "grad_norm": 0.14759372174739838, "learning_rate": 5.488376399987394e-05, "loss": 0.1565, "step": 2487 }, { "epoch": 0.5222502099076406, "grad_norm": 0.1314990073442459, "learning_rate": 5.484729697429183e-05, "loss": 0.1668, "step": 2488 }, { "epoch": 0.5224601175482787, "grad_norm": 0.1437479555606842, "learning_rate": 5.4810827345600516e-05, "loss": 0.162, "step": 2489 }, { "epoch": 0.5226700251889169, "grad_norm": 0.10261430591344833, "learning_rate": 5.4774355133385015e-05, "loss": 0.1673, "step": 2490 }, { "epoch": 0.522879932829555, "grad_norm": 0.10956492274999619, "learning_rate": 5.473788035723176e-05, "loss": 0.1699, "step": 2491 }, { "epoch": 0.5230898404701931, "grad_norm": 0.14587171375751495, "learning_rate": 5.470140303672853e-05, "loss": 0.1696, "step": 2492 }, { "epoch": 0.5232997481108312, "grad_norm": 0.10633595287799835, "learning_rate": 5.466492319146447e-05, "loss": 0.1651, "step": 2493 }, { "epoch": 0.5235096557514693, "grad_norm": 0.1117524653673172, "learning_rate": 5.4628440841030096e-05, "loss": 0.1566, "step": 2494 }, { "epoch": 0.5237195633921075, "grad_norm": 0.09613008797168732, "learning_rate": 5.4591956005017276e-05, "loss": 0.1673, "step": 2495 }, { "epoch": 0.5239294710327456, "grad_norm": 0.0960233211517334, "learning_rate": 5.455546870301919e-05, "loss": 0.1304, "step": 2496 }, { "epoch": 0.5241393786733837, "grad_norm": 0.1320691704750061, "learning_rate": 5.451897895463034e-05, "loss": 0.1479, "step": 2497 }, { "epoch": 0.5243492863140218, "grad_norm": 0.09648372232913971, "learning_rate": 5.448248677944658e-05, "loss": 0.1512, "step": 2498 }, { "epoch": 0.52455919395466, "grad_norm": 0.12556767463684082, "learning_rate": 5.444599219706502e-05, "loss": 0.1654, "step": 2499 }, { "epoch": 0.5247691015952981, "grad_norm": 0.11492988467216492, "learning_rate": 5.440949522708409e-05, "loss": 0.1388, "step": 2500 }, { "epoch": 0.5249790092359362, "grad_norm": 0.11009164899587631, "learning_rate": 5.4372995889103504e-05, "loss": 0.1525, "step": 2501 }, { "epoch": 0.5251889168765743, "grad_norm": 0.11773049831390381, "learning_rate": 5.433649420272425e-05, "loss": 0.1393, "step": 2502 }, { "epoch": 0.5253988245172124, "grad_norm": 0.14838874340057373, "learning_rate": 5.4299990187548535e-05, "loss": 0.1666, "step": 2503 }, { "epoch": 0.5256087321578505, "grad_norm": 0.12586016952991486, "learning_rate": 5.426348386317986e-05, "loss": 0.1461, "step": 2504 }, { "epoch": 0.5258186397984886, "grad_norm": 0.12975488603115082, "learning_rate": 5.422697524922299e-05, "loss": 0.1463, "step": 2505 }, { "epoch": 0.5260285474391267, "grad_norm": 0.11897221952676773, "learning_rate": 5.419046436528384e-05, "loss": 0.1606, "step": 2506 }, { "epoch": 0.5262384550797649, "grad_norm": 0.12332843244075775, "learning_rate": 5.4153951230969604e-05, "loss": 0.1395, "step": 2507 }, { "epoch": 0.5264483627204031, "grad_norm": 0.1560123860836029, "learning_rate": 5.411743586588868e-05, "loss": 0.1745, "step": 2508 }, { "epoch": 0.5266582703610412, "grad_norm": 0.1406356692314148, "learning_rate": 5.408091828965063e-05, "loss": 0.1603, "step": 2509 }, { "epoch": 0.5268681780016793, "grad_norm": 0.11687145382165909, "learning_rate": 5.404439852186624e-05, "loss": 0.1644, "step": 2510 }, { "epoch": 0.5270780856423174, "grad_norm": 0.09882459789514542, "learning_rate": 5.4007876582147465e-05, "loss": 0.1432, "step": 2511 }, { "epoch": 0.5272879932829555, "grad_norm": 0.14484454691410065, "learning_rate": 5.397135249010744e-05, "loss": 0.1574, "step": 2512 }, { "epoch": 0.5274979009235936, "grad_norm": 0.11317654699087143, "learning_rate": 5.393482626536038e-05, "loss": 0.1544, "step": 2513 }, { "epoch": 0.5277078085642317, "grad_norm": 0.12852750718593597, "learning_rate": 5.389829792752175e-05, "loss": 0.1545, "step": 2514 }, { "epoch": 0.5279177162048698, "grad_norm": 0.11933259665966034, "learning_rate": 5.3861767496208085e-05, "loss": 0.1527, "step": 2515 }, { "epoch": 0.5281276238455079, "grad_norm": 0.12715746462345123, "learning_rate": 5.382523499103705e-05, "loss": 0.1668, "step": 2516 }, { "epoch": 0.5283375314861462, "grad_norm": 0.12052647024393082, "learning_rate": 5.378870043162746e-05, "loss": 0.153, "step": 2517 }, { "epoch": 0.5285474391267843, "grad_norm": 0.17828109860420227, "learning_rate": 5.375216383759919e-05, "loss": 0.1625, "step": 2518 }, { "epoch": 0.5287573467674224, "grad_norm": 0.13423284888267517, "learning_rate": 5.371562522857323e-05, "loss": 0.1547, "step": 2519 }, { "epoch": 0.5289672544080605, "grad_norm": 0.13876746594905853, "learning_rate": 5.367908462417166e-05, "loss": 0.1533, "step": 2520 }, { "epoch": 0.5291771620486986, "grad_norm": 0.11023764312267303, "learning_rate": 5.3642542044017593e-05, "loss": 0.1598, "step": 2521 }, { "epoch": 0.5293870696893367, "grad_norm": 0.09991083294153214, "learning_rate": 5.360599750773526e-05, "loss": 0.1496, "step": 2522 }, { "epoch": 0.5295969773299748, "grad_norm": 0.11138133704662323, "learning_rate": 5.356945103494988e-05, "loss": 0.1579, "step": 2523 }, { "epoch": 0.5298068849706129, "grad_norm": 0.09804137051105499, "learning_rate": 5.353290264528776e-05, "loss": 0.1649, "step": 2524 }, { "epoch": 0.530016792611251, "grad_norm": 0.1129557341337204, "learning_rate": 5.3496352358376225e-05, "loss": 0.172, "step": 2525 }, { "epoch": 0.5302267002518891, "grad_norm": 0.15934525430202484, "learning_rate": 5.345980019384361e-05, "loss": 0.1357, "step": 2526 }, { "epoch": 0.5304366078925273, "grad_norm": 0.11331132799386978, "learning_rate": 5.3423246171319264e-05, "loss": 0.1515, "step": 2527 }, { "epoch": 0.5306465155331654, "grad_norm": 0.11778911203145981, "learning_rate": 5.338669031043351e-05, "loss": 0.1574, "step": 2528 }, { "epoch": 0.5308564231738035, "grad_norm": 0.11816948652267456, "learning_rate": 5.3350132630817715e-05, "loss": 0.1473, "step": 2529 }, { "epoch": 0.5310663308144417, "grad_norm": 0.10855350643396378, "learning_rate": 5.331357315210417e-05, "loss": 0.1506, "step": 2530 }, { "epoch": 0.5312762384550798, "grad_norm": 0.09619951248168945, "learning_rate": 5.327701189392616e-05, "loss": 0.1481, "step": 2531 }, { "epoch": 0.5314861460957179, "grad_norm": 0.26531854271888733, "learning_rate": 5.324044887591788e-05, "loss": 0.1677, "step": 2532 }, { "epoch": 0.531696053736356, "grad_norm": 0.10267012566328049, "learning_rate": 5.320388411771458e-05, "loss": 0.1474, "step": 2533 }, { "epoch": 0.5319059613769941, "grad_norm": 0.12348689883947372, "learning_rate": 5.3167317638952285e-05, "loss": 0.1638, "step": 2534 }, { "epoch": 0.5321158690176322, "grad_norm": 0.10946419090032578, "learning_rate": 5.313074945926808e-05, "loss": 0.163, "step": 2535 }, { "epoch": 0.5323257766582704, "grad_norm": 0.12162724882364273, "learning_rate": 5.3094179598299906e-05, "loss": 0.1661, "step": 2536 }, { "epoch": 0.5325356842989085, "grad_norm": 0.11680992692708969, "learning_rate": 5.30576080756866e-05, "loss": 0.1595, "step": 2537 }, { "epoch": 0.5327455919395466, "grad_norm": 0.1095302477478981, "learning_rate": 5.3021034911067926e-05, "loss": 0.155, "step": 2538 }, { "epoch": 0.5329554995801847, "grad_norm": 0.12963855266571045, "learning_rate": 5.298446012408449e-05, "loss": 0.1228, "step": 2539 }, { "epoch": 0.5331654072208228, "grad_norm": 0.10611241310834885, "learning_rate": 5.294788373437779e-05, "loss": 0.1619, "step": 2540 }, { "epoch": 0.533375314861461, "grad_norm": 0.12048359215259552, "learning_rate": 5.29113057615902e-05, "loss": 0.1526, "step": 2541 }, { "epoch": 0.533585222502099, "grad_norm": 0.1172405406832695, "learning_rate": 5.28747262253649e-05, "loss": 0.1708, "step": 2542 }, { "epoch": 0.5337951301427372, "grad_norm": 0.11585681885480881, "learning_rate": 5.2838145145345976e-05, "loss": 0.1613, "step": 2543 }, { "epoch": 0.5340050377833753, "grad_norm": 0.14023533463478088, "learning_rate": 5.280156254117825e-05, "loss": 0.1613, "step": 2544 }, { "epoch": 0.5342149454240135, "grad_norm": 0.13292895257472992, "learning_rate": 5.2764978432507464e-05, "loss": 0.1618, "step": 2545 }, { "epoch": 0.5344248530646516, "grad_norm": 0.13747252523899078, "learning_rate": 5.272839283898009e-05, "loss": 0.1577, "step": 2546 }, { "epoch": 0.5346347607052897, "grad_norm": 0.12026356905698776, "learning_rate": 5.2691805780243454e-05, "loss": 0.1552, "step": 2547 }, { "epoch": 0.5348446683459278, "grad_norm": 0.11890849471092224, "learning_rate": 5.26552172759456e-05, "loss": 0.1541, "step": 2548 }, { "epoch": 0.5350545759865659, "grad_norm": 0.1233121007680893, "learning_rate": 5.2618627345735436e-05, "loss": 0.159, "step": 2549 }, { "epoch": 0.535264483627204, "grad_norm": 0.11645440012216568, "learning_rate": 5.258203600926257e-05, "loss": 0.1693, "step": 2550 }, { "epoch": 0.5354743912678421, "grad_norm": 0.10614755749702454, "learning_rate": 5.25454432861774e-05, "loss": 0.1595, "step": 2551 }, { "epoch": 0.5356842989084802, "grad_norm": 0.10766322165727615, "learning_rate": 5.2508849196131016e-05, "loss": 0.1567, "step": 2552 }, { "epoch": 0.5358942065491183, "grad_norm": 0.13056086003780365, "learning_rate": 5.2472253758775327e-05, "loss": 0.1524, "step": 2553 }, { "epoch": 0.5361041141897565, "grad_norm": 0.10615919530391693, "learning_rate": 5.243565699376287e-05, "loss": 0.1624, "step": 2554 }, { "epoch": 0.5363140218303947, "grad_norm": 0.10946070402860641, "learning_rate": 5.239905892074699e-05, "loss": 0.1697, "step": 2555 }, { "epoch": 0.5365239294710328, "grad_norm": 0.13687758147716522, "learning_rate": 5.236245955938166e-05, "loss": 0.1644, "step": 2556 }, { "epoch": 0.5367338371116709, "grad_norm": 0.13054099678993225, "learning_rate": 5.23258589293216e-05, "loss": 0.1557, "step": 2557 }, { "epoch": 0.536943744752309, "grad_norm": 0.10445389896631241, "learning_rate": 5.228925705022215e-05, "loss": 0.1549, "step": 2558 }, { "epoch": 0.5371536523929471, "grad_norm": 0.14048103988170624, "learning_rate": 5.225265394173938e-05, "loss": 0.1556, "step": 2559 }, { "epoch": 0.5373635600335852, "grad_norm": 0.12132364511489868, "learning_rate": 5.221604962352999e-05, "loss": 0.1713, "step": 2560 }, { "epoch": 0.5375734676742233, "grad_norm": 0.1090470552444458, "learning_rate": 5.217944411525133e-05, "loss": 0.1695, "step": 2561 }, { "epoch": 0.5377833753148614, "grad_norm": 0.11678619682788849, "learning_rate": 5.21428374365614e-05, "loss": 0.1702, "step": 2562 }, { "epoch": 0.5379932829554995, "grad_norm": 0.13385826349258423, "learning_rate": 5.2106229607118805e-05, "loss": 0.1614, "step": 2563 }, { "epoch": 0.5382031905961377, "grad_norm": 0.12655134499073029, "learning_rate": 5.2069620646582805e-05, "loss": 0.1385, "step": 2564 }, { "epoch": 0.5384130982367759, "grad_norm": 0.13991332054138184, "learning_rate": 5.203301057461325e-05, "loss": 0.1655, "step": 2565 }, { "epoch": 0.538623005877414, "grad_norm": 0.12059177458286285, "learning_rate": 5.1996399410870576e-05, "loss": 0.1637, "step": 2566 }, { "epoch": 0.5388329135180521, "grad_norm": 0.1222875714302063, "learning_rate": 5.195978717501582e-05, "loss": 0.1477, "step": 2567 }, { "epoch": 0.5390428211586902, "grad_norm": 0.11073408275842667, "learning_rate": 5.192317388671058e-05, "loss": 0.1579, "step": 2568 }, { "epoch": 0.5392527287993283, "grad_norm": 0.1314646303653717, "learning_rate": 5.188655956561701e-05, "loss": 0.1479, "step": 2569 }, { "epoch": 0.5394626364399664, "grad_norm": 0.12713418900966644, "learning_rate": 5.184994423139788e-05, "loss": 0.1586, "step": 2570 }, { "epoch": 0.5396725440806045, "grad_norm": 0.1251654475927353, "learning_rate": 5.181332790371645e-05, "loss": 0.1563, "step": 2571 }, { "epoch": 0.5398824517212426, "grad_norm": 0.12144947052001953, "learning_rate": 5.17767106022365e-05, "loss": 0.1537, "step": 2572 }, { "epoch": 0.5400923593618808, "grad_norm": 0.11639848351478577, "learning_rate": 5.174009234662237e-05, "loss": 0.1411, "step": 2573 }, { "epoch": 0.5403022670025189, "grad_norm": 0.11065072566270828, "learning_rate": 5.170347315653893e-05, "loss": 0.1625, "step": 2574 }, { "epoch": 0.540512174643157, "grad_norm": 0.11936859041452408, "learning_rate": 5.1666853051651456e-05, "loss": 0.161, "step": 2575 }, { "epoch": 0.5407220822837951, "grad_norm": 0.11808102577924728, "learning_rate": 5.163023205162584e-05, "loss": 0.1552, "step": 2576 }, { "epoch": 0.5409319899244333, "grad_norm": 0.09937819093465805, "learning_rate": 5.159361017612836e-05, "loss": 0.1362, "step": 2577 }, { "epoch": 0.5411418975650714, "grad_norm": 0.13057714700698853, "learning_rate": 5.1556987444825836e-05, "loss": 0.1691, "step": 2578 }, { "epoch": 0.5413518052057095, "grad_norm": 0.11962155252695084, "learning_rate": 5.1520363877385466e-05, "loss": 0.1598, "step": 2579 }, { "epoch": 0.5415617128463476, "grad_norm": 0.10816418379545212, "learning_rate": 5.148373949347498e-05, "loss": 0.1354, "step": 2580 }, { "epoch": 0.5417716204869857, "grad_norm": 0.1253233402967453, "learning_rate": 5.144711431276249e-05, "loss": 0.1731, "step": 2581 }, { "epoch": 0.5419815281276238, "grad_norm": 0.11587586253881454, "learning_rate": 5.1410488354916555e-05, "loss": 0.1617, "step": 2582 }, { "epoch": 0.542191435768262, "grad_norm": 0.09815829247236252, "learning_rate": 5.1373861639606145e-05, "loss": 0.1668, "step": 2583 }, { "epoch": 0.5424013434089001, "grad_norm": 0.10858689993619919, "learning_rate": 5.133723418650067e-05, "loss": 0.175, "step": 2584 }, { "epoch": 0.5426112510495382, "grad_norm": 0.09952938556671143, "learning_rate": 5.130060601526988e-05, "loss": 0.1695, "step": 2585 }, { "epoch": 0.5428211586901763, "grad_norm": 0.11876317113637924, "learning_rate": 5.1263977145583964e-05, "loss": 0.1524, "step": 2586 }, { "epoch": 0.5430310663308144, "grad_norm": 0.13846537470817566, "learning_rate": 5.122734759711343e-05, "loss": 0.1654, "step": 2587 }, { "epoch": 0.5432409739714525, "grad_norm": 0.12535333633422852, "learning_rate": 5.119071738952922e-05, "loss": 0.1583, "step": 2588 }, { "epoch": 0.5434508816120907, "grad_norm": 0.11854858696460724, "learning_rate": 5.115408654250255e-05, "loss": 0.1542, "step": 2589 }, { "epoch": 0.5436607892527288, "grad_norm": 0.12108839303255081, "learning_rate": 5.111745507570507e-05, "loss": 0.1707, "step": 2590 }, { "epoch": 0.5438706968933669, "grad_norm": 0.11636438220739365, "learning_rate": 5.1080823008808665e-05, "loss": 0.1692, "step": 2591 }, { "epoch": 0.5440806045340051, "grad_norm": 0.12799912691116333, "learning_rate": 5.104419036148564e-05, "loss": 0.1615, "step": 2592 }, { "epoch": 0.5442905121746432, "grad_norm": 0.13550209999084473, "learning_rate": 5.1007557153408514e-05, "loss": 0.1735, "step": 2593 }, { "epoch": 0.5445004198152813, "grad_norm": 0.1329326629638672, "learning_rate": 5.09709234042502e-05, "loss": 0.1626, "step": 2594 }, { "epoch": 0.5447103274559194, "grad_norm": 0.11522291600704193, "learning_rate": 5.093428913368383e-05, "loss": 0.1679, "step": 2595 }, { "epoch": 0.5449202350965575, "grad_norm": 0.12600111961364746, "learning_rate": 5.089765436138286e-05, "loss": 0.1675, "step": 2596 }, { "epoch": 0.5451301427371956, "grad_norm": 0.09551798552274704, "learning_rate": 5.086101910702099e-05, "loss": 0.139, "step": 2597 }, { "epoch": 0.5453400503778337, "grad_norm": 0.1303727775812149, "learning_rate": 5.08243833902722e-05, "loss": 0.1529, "step": 2598 }, { "epoch": 0.5455499580184718, "grad_norm": 0.12384460121393204, "learning_rate": 5.078774723081069e-05, "loss": 0.1598, "step": 2599 }, { "epoch": 0.5457598656591099, "grad_norm": 0.14714710414409637, "learning_rate": 5.075111064831092e-05, "loss": 0.1645, "step": 2600 }, { "epoch": 0.5459697732997482, "grad_norm": 0.11978684365749359, "learning_rate": 5.071447366244758e-05, "loss": 0.1509, "step": 2601 }, { "epoch": 0.5461796809403863, "grad_norm": 0.1111048012971878, "learning_rate": 5.0677836292895567e-05, "loss": 0.1582, "step": 2602 }, { "epoch": 0.5463895885810244, "grad_norm": 0.13465799391269684, "learning_rate": 5.064119855932998e-05, "loss": 0.1609, "step": 2603 }, { "epoch": 0.5465994962216625, "grad_norm": 0.1144922599196434, "learning_rate": 5.060456048142611e-05, "loss": 0.1664, "step": 2604 }, { "epoch": 0.5468094038623006, "grad_norm": 0.11836301535367966, "learning_rate": 5.056792207885945e-05, "loss": 0.1602, "step": 2605 }, { "epoch": 0.5470193115029387, "grad_norm": 0.10931508988142014, "learning_rate": 5.053128337130568e-05, "loss": 0.1613, "step": 2606 }, { "epoch": 0.5472292191435768, "grad_norm": 0.1221369057893753, "learning_rate": 5.0494644378440595e-05, "loss": 0.1648, "step": 2607 }, { "epoch": 0.5474391267842149, "grad_norm": 0.10257924348115921, "learning_rate": 5.045800511994017e-05, "loss": 0.1468, "step": 2608 }, { "epoch": 0.547649034424853, "grad_norm": 0.1801408976316452, "learning_rate": 5.0421365615480556e-05, "loss": 0.1319, "step": 2609 }, { "epoch": 0.5478589420654912, "grad_norm": 0.11400793492794037, "learning_rate": 5.038472588473796e-05, "loss": 0.1465, "step": 2610 }, { "epoch": 0.5480688497061293, "grad_norm": 0.13004319369792938, "learning_rate": 5.034808594738878e-05, "loss": 0.1747, "step": 2611 }, { "epoch": 0.5482787573467675, "grad_norm": 0.13634802401065826, "learning_rate": 5.03114458231095e-05, "loss": 0.1789, "step": 2612 }, { "epoch": 0.5484886649874056, "grad_norm": 0.1203138455748558, "learning_rate": 5.02748055315767e-05, "loss": 0.1697, "step": 2613 }, { "epoch": 0.5486985726280437, "grad_norm": 0.09810598939657211, "learning_rate": 5.0238165092467037e-05, "loss": 0.1586, "step": 2614 }, { "epoch": 0.5489084802686818, "grad_norm": 0.1147264614701271, "learning_rate": 5.02015245254573e-05, "loss": 0.1738, "step": 2615 }, { "epoch": 0.5491183879093199, "grad_norm": 0.11263872683048248, "learning_rate": 5.0164883850224274e-05, "loss": 0.1583, "step": 2616 }, { "epoch": 0.549328295549958, "grad_norm": 0.12805059552192688, "learning_rate": 5.012824308644487e-05, "loss": 0.1538, "step": 2617 }, { "epoch": 0.5495382031905961, "grad_norm": 0.10417455434799194, "learning_rate": 5.009160225379598e-05, "loss": 0.1664, "step": 2618 }, { "epoch": 0.5497481108312342, "grad_norm": 0.12117290496826172, "learning_rate": 5.005496137195461e-05, "loss": 0.1587, "step": 2619 }, { "epoch": 0.5499580184718724, "grad_norm": 0.11718293279409409, "learning_rate": 5.00183204605977e-05, "loss": 0.1491, "step": 2620 }, { "epoch": 0.5501679261125105, "grad_norm": 0.12543454766273499, "learning_rate": 4.9981679539402316e-05, "loss": 0.1582, "step": 2621 }, { "epoch": 0.5503778337531486, "grad_norm": 0.09536810219287872, "learning_rate": 4.9945038628045414e-05, "loss": 0.1578, "step": 2622 }, { "epoch": 0.5505877413937867, "grad_norm": 0.11349605768918991, "learning_rate": 4.990839774620403e-05, "loss": 0.1595, "step": 2623 }, { "epoch": 0.5507976490344249, "grad_norm": 0.11620128154754639, "learning_rate": 4.9871756913555146e-05, "loss": 0.1535, "step": 2624 }, { "epoch": 0.551007556675063, "grad_norm": 0.11295798420906067, "learning_rate": 4.983511614977574e-05, "loss": 0.1662, "step": 2625 }, { "epoch": 0.5512174643157011, "grad_norm": 0.13581858575344086, "learning_rate": 4.979847547454271e-05, "loss": 0.1761, "step": 2626 }, { "epoch": 0.5514273719563392, "grad_norm": 0.12520180642604828, "learning_rate": 4.976183490753296e-05, "loss": 0.1394, "step": 2627 }, { "epoch": 0.5516372795969773, "grad_norm": 0.17680776119232178, "learning_rate": 4.972519446842333e-05, "loss": 0.1417, "step": 2628 }, { "epoch": 0.5518471872376155, "grad_norm": 0.11935435235500336, "learning_rate": 4.9688554176890523e-05, "loss": 0.1684, "step": 2629 }, { "epoch": 0.5520570948782536, "grad_norm": 0.12541987001895905, "learning_rate": 4.965191405261123e-05, "loss": 0.1568, "step": 2630 }, { "epoch": 0.5522670025188917, "grad_norm": 0.09537042677402496, "learning_rate": 4.9615274115262056e-05, "loss": 0.1452, "step": 2631 }, { "epoch": 0.5524769101595298, "grad_norm": 0.10122919082641602, "learning_rate": 4.9578634384519456e-05, "loss": 0.1543, "step": 2632 }, { "epoch": 0.5526868178001679, "grad_norm": 0.12309399247169495, "learning_rate": 4.954199488005983e-05, "loss": 0.16, "step": 2633 }, { "epoch": 0.552896725440806, "grad_norm": 0.12081638723611832, "learning_rate": 4.9505355621559416e-05, "loss": 0.1726, "step": 2634 }, { "epoch": 0.5531066330814441, "grad_norm": 0.12514087557792664, "learning_rate": 4.946871662869434e-05, "loss": 0.1374, "step": 2635 }, { "epoch": 0.5533165407220823, "grad_norm": 0.12766608595848083, "learning_rate": 4.943207792114055e-05, "loss": 0.1481, "step": 2636 }, { "epoch": 0.5535264483627204, "grad_norm": 0.1229751855134964, "learning_rate": 4.9395439518573904e-05, "loss": 0.1447, "step": 2637 }, { "epoch": 0.5537363560033586, "grad_norm": 0.12326975166797638, "learning_rate": 4.935880144067003e-05, "loss": 0.1579, "step": 2638 }, { "epoch": 0.5539462636439967, "grad_norm": 0.12600168585777283, "learning_rate": 4.932216370710444e-05, "loss": 0.1614, "step": 2639 }, { "epoch": 0.5541561712846348, "grad_norm": 0.10681076347827911, "learning_rate": 4.928552633755243e-05, "loss": 0.169, "step": 2640 }, { "epoch": 0.5543660789252729, "grad_norm": 0.12379185855388641, "learning_rate": 4.9248889351689074e-05, "loss": 0.1744, "step": 2641 }, { "epoch": 0.554575986565911, "grad_norm": 0.1300443708896637, "learning_rate": 4.9212252769189324e-05, "loss": 0.1698, "step": 2642 }, { "epoch": 0.5547858942065491, "grad_norm": 0.12122491747140884, "learning_rate": 4.917561660972781e-05, "loss": 0.1606, "step": 2643 }, { "epoch": 0.5549958018471872, "grad_norm": 0.10484204441308975, "learning_rate": 4.913898089297903e-05, "loss": 0.1355, "step": 2644 }, { "epoch": 0.5552057094878253, "grad_norm": 0.12357357144355774, "learning_rate": 4.9102345638617145e-05, "loss": 0.1752, "step": 2645 }, { "epoch": 0.5554156171284634, "grad_norm": 0.09464117139577866, "learning_rate": 4.9065710866316175e-05, "loss": 0.1591, "step": 2646 }, { "epoch": 0.5556255247691015, "grad_norm": 0.11045074462890625, "learning_rate": 4.9029076595749804e-05, "loss": 0.1484, "step": 2647 }, { "epoch": 0.5558354324097398, "grad_norm": 0.16476228833198547, "learning_rate": 4.899244284659148e-05, "loss": 0.147, "step": 2648 }, { "epoch": 0.5560453400503779, "grad_norm": 0.13166958093643188, "learning_rate": 4.895580963851438e-05, "loss": 0.1555, "step": 2649 }, { "epoch": 0.556255247691016, "grad_norm": 0.12306392192840576, "learning_rate": 4.8919176991191346e-05, "loss": 0.1481, "step": 2650 }, { "epoch": 0.5564651553316541, "grad_norm": 0.11133769154548645, "learning_rate": 4.888254492429494e-05, "loss": 0.138, "step": 2651 }, { "epoch": 0.5566750629722922, "grad_norm": 0.11845657974481583, "learning_rate": 4.884591345749746e-05, "loss": 0.15, "step": 2652 }, { "epoch": 0.5568849706129303, "grad_norm": 0.10165873169898987, "learning_rate": 4.880928261047079e-05, "loss": 0.1562, "step": 2653 }, { "epoch": 0.5570948782535684, "grad_norm": 0.11259020119905472, "learning_rate": 4.877265240288657e-05, "loss": 0.1548, "step": 2654 }, { "epoch": 0.5573047858942065, "grad_norm": 0.12641459703445435, "learning_rate": 4.873602285441604e-05, "loss": 0.1682, "step": 2655 }, { "epoch": 0.5575146935348446, "grad_norm": 0.11751089245080948, "learning_rate": 4.869939398473014e-05, "loss": 0.1488, "step": 2656 }, { "epoch": 0.5577246011754828, "grad_norm": 0.13346509635448456, "learning_rate": 4.866276581349934e-05, "loss": 0.1501, "step": 2657 }, { "epoch": 0.5579345088161209, "grad_norm": 0.10571077466011047, "learning_rate": 4.8626138360393867e-05, "loss": 0.1431, "step": 2658 }, { "epoch": 0.558144416456759, "grad_norm": 0.12459591776132584, "learning_rate": 4.858951164508346e-05, "loss": 0.1528, "step": 2659 }, { "epoch": 0.5583543240973972, "grad_norm": 0.15180979669094086, "learning_rate": 4.8552885687237525e-05, "loss": 0.1573, "step": 2660 }, { "epoch": 0.5585642317380353, "grad_norm": 0.1342502236366272, "learning_rate": 4.8516260506525024e-05, "loss": 0.1638, "step": 2661 }, { "epoch": 0.5587741393786734, "grad_norm": 0.11851493269205093, "learning_rate": 4.847963612261454e-05, "loss": 0.1759, "step": 2662 }, { "epoch": 0.5589840470193115, "grad_norm": 0.1613032966852188, "learning_rate": 4.844301255517419e-05, "loss": 0.1781, "step": 2663 }, { "epoch": 0.5591939546599496, "grad_norm": 0.11613575369119644, "learning_rate": 4.840638982387165e-05, "loss": 0.1406, "step": 2664 }, { "epoch": 0.5594038623005877, "grad_norm": 0.11966195702552795, "learning_rate": 4.8369767948374173e-05, "loss": 0.1617, "step": 2665 }, { "epoch": 0.5596137699412259, "grad_norm": 0.11400119960308075, "learning_rate": 4.833314694834855e-05, "loss": 0.1483, "step": 2666 }, { "epoch": 0.559823677581864, "grad_norm": 0.10645996034145355, "learning_rate": 4.829652684346109e-05, "loss": 0.1568, "step": 2667 }, { "epoch": 0.5600335852225021, "grad_norm": 0.1172577515244484, "learning_rate": 4.825990765337763e-05, "loss": 0.1661, "step": 2668 }, { "epoch": 0.5602434928631402, "grad_norm": 0.15190641582012177, "learning_rate": 4.8223289397763514e-05, "loss": 0.1571, "step": 2669 }, { "epoch": 0.5604534005037783, "grad_norm": 0.10944955050945282, "learning_rate": 4.818667209628358e-05, "loss": 0.1562, "step": 2670 }, { "epoch": 0.5606633081444165, "grad_norm": 0.11813446134328842, "learning_rate": 4.815005576860212e-05, "loss": 0.1637, "step": 2671 }, { "epoch": 0.5608732157850546, "grad_norm": 0.11116372793912888, "learning_rate": 4.8113440434383e-05, "loss": 0.1475, "step": 2672 }, { "epoch": 0.5610831234256927, "grad_norm": 0.14472845196723938, "learning_rate": 4.807682611328944e-05, "loss": 0.1577, "step": 2673 }, { "epoch": 0.5612930310663308, "grad_norm": 0.11451143026351929, "learning_rate": 4.80402128249842e-05, "loss": 0.1526, "step": 2674 }, { "epoch": 0.5615029387069689, "grad_norm": 0.09532547742128372, "learning_rate": 4.8003600589129435e-05, "loss": 0.1426, "step": 2675 }, { "epoch": 0.5617128463476071, "grad_norm": 0.11534955352544785, "learning_rate": 4.796698942538677e-05, "loss": 0.1606, "step": 2676 }, { "epoch": 0.5619227539882452, "grad_norm": 0.10720717161893845, "learning_rate": 4.7930379353417206e-05, "loss": 0.1667, "step": 2677 }, { "epoch": 0.5621326616288833, "grad_norm": 0.09807775169610977, "learning_rate": 4.789377039288121e-05, "loss": 0.1641, "step": 2678 }, { "epoch": 0.5623425692695214, "grad_norm": 0.12788937985897064, "learning_rate": 4.7857162563438614e-05, "loss": 0.1721, "step": 2679 }, { "epoch": 0.5625524769101595, "grad_norm": 0.11621350049972534, "learning_rate": 4.782055588474868e-05, "loss": 0.163, "step": 2680 }, { "epoch": 0.5627623845507976, "grad_norm": 0.10405416041612625, "learning_rate": 4.778395037647002e-05, "loss": 0.1434, "step": 2681 }, { "epoch": 0.5629722921914357, "grad_norm": 0.13004978001117706, "learning_rate": 4.7747346058260615e-05, "loss": 0.1514, "step": 2682 }, { "epoch": 0.5631821998320738, "grad_norm": 0.10238310694694519, "learning_rate": 4.771074294977786e-05, "loss": 0.1637, "step": 2683 }, { "epoch": 0.563392107472712, "grad_norm": 0.1132587268948555, "learning_rate": 4.767414107067841e-05, "loss": 0.1593, "step": 2684 }, { "epoch": 0.5636020151133502, "grad_norm": 0.12371500581502914, "learning_rate": 4.763754044061834e-05, "loss": 0.1538, "step": 2685 }, { "epoch": 0.5638119227539883, "grad_norm": 0.11944448202848434, "learning_rate": 4.7600941079253016e-05, "loss": 0.1496, "step": 2686 }, { "epoch": 0.5640218303946264, "grad_norm": 0.11663755774497986, "learning_rate": 4.7564343006237136e-05, "loss": 0.1606, "step": 2687 }, { "epoch": 0.5642317380352645, "grad_norm": 0.10922671854496002, "learning_rate": 4.7527746241224685e-05, "loss": 0.1746, "step": 2688 }, { "epoch": 0.5644416456759026, "grad_norm": 0.11024421453475952, "learning_rate": 4.749115080386899e-05, "loss": 0.1628, "step": 2689 }, { "epoch": 0.5646515533165407, "grad_norm": 0.12064459174871445, "learning_rate": 4.745455671382263e-05, "loss": 0.1449, "step": 2690 }, { "epoch": 0.5648614609571788, "grad_norm": 0.11907904595136642, "learning_rate": 4.7417963990737445e-05, "loss": 0.1435, "step": 2691 }, { "epoch": 0.5650713685978169, "grad_norm": 0.13502107560634613, "learning_rate": 4.738137265426457e-05, "loss": 0.149, "step": 2692 }, { "epoch": 0.565281276238455, "grad_norm": 0.13327385485172272, "learning_rate": 4.734478272405441e-05, "loss": 0.1581, "step": 2693 }, { "epoch": 0.5654911838790933, "grad_norm": 0.10342027992010117, "learning_rate": 4.730819421975656e-05, "loss": 0.1312, "step": 2694 }, { "epoch": 0.5657010915197314, "grad_norm": 0.1462603062391281, "learning_rate": 4.7271607161019915e-05, "loss": 0.1568, "step": 2695 }, { "epoch": 0.5659109991603695, "grad_norm": 0.10993596166372299, "learning_rate": 4.723502156749254e-05, "loss": 0.1525, "step": 2696 }, { "epoch": 0.5661209068010076, "grad_norm": 0.11225152015686035, "learning_rate": 4.719843745882177e-05, "loss": 0.1591, "step": 2697 }, { "epoch": 0.5663308144416457, "grad_norm": 0.1225719004869461, "learning_rate": 4.716185485465405e-05, "loss": 0.1305, "step": 2698 }, { "epoch": 0.5665407220822838, "grad_norm": 0.11367135494947433, "learning_rate": 4.712527377463511e-05, "loss": 0.1628, "step": 2699 }, { "epoch": 0.5667506297229219, "grad_norm": 0.13585880398750305, "learning_rate": 4.708869423840981e-05, "loss": 0.1495, "step": 2700 }, { "epoch": 0.56696053736356, "grad_norm": 0.14989891648292542, "learning_rate": 4.7052116265622214e-05, "loss": 0.1574, "step": 2701 }, { "epoch": 0.5671704450041981, "grad_norm": 0.1249023824930191, "learning_rate": 4.7015539875915515e-05, "loss": 0.1598, "step": 2702 }, { "epoch": 0.5673803526448362, "grad_norm": 0.10144059360027313, "learning_rate": 4.697896508893208e-05, "loss": 0.1474, "step": 2703 }, { "epoch": 0.5675902602854744, "grad_norm": 0.11371605843305588, "learning_rate": 4.694239192431341e-05, "loss": 0.153, "step": 2704 }, { "epoch": 0.5678001679261125, "grad_norm": 0.11912527680397034, "learning_rate": 4.690582040170012e-05, "loss": 0.1626, "step": 2705 }, { "epoch": 0.5680100755667506, "grad_norm": 0.11181552708148956, "learning_rate": 4.686925054073193e-05, "loss": 0.1519, "step": 2706 }, { "epoch": 0.5682199832073888, "grad_norm": 0.10603776574134827, "learning_rate": 4.683268236104773e-05, "loss": 0.1548, "step": 2707 }, { "epoch": 0.5684298908480269, "grad_norm": 0.12990587949752808, "learning_rate": 4.679611588228544e-05, "loss": 0.1661, "step": 2708 }, { "epoch": 0.568639798488665, "grad_norm": 0.10635796189308167, "learning_rate": 4.675955112408211e-05, "loss": 0.1363, "step": 2709 }, { "epoch": 0.5688497061293031, "grad_norm": 0.13790719211101532, "learning_rate": 4.6722988106073855e-05, "loss": 0.1739, "step": 2710 }, { "epoch": 0.5690596137699412, "grad_norm": 0.1612750142812729, "learning_rate": 4.668642684789585e-05, "loss": 0.1429, "step": 2711 }, { "epoch": 0.5692695214105793, "grad_norm": 0.09567128121852875, "learning_rate": 4.664986736918229e-05, "loss": 0.1346, "step": 2712 }, { "epoch": 0.5694794290512175, "grad_norm": 0.10926074534654617, "learning_rate": 4.6613309689566494e-05, "loss": 0.168, "step": 2713 }, { "epoch": 0.5696893366918556, "grad_norm": 0.12982285022735596, "learning_rate": 4.657675382868075e-05, "loss": 0.1559, "step": 2714 }, { "epoch": 0.5698992443324937, "grad_norm": 0.11957161128520966, "learning_rate": 4.65401998061564e-05, "loss": 0.1705, "step": 2715 }, { "epoch": 0.5701091519731318, "grad_norm": 0.1028459444642067, "learning_rate": 4.650364764162378e-05, "loss": 0.146, "step": 2716 }, { "epoch": 0.5703190596137699, "grad_norm": 0.1195150837302208, "learning_rate": 4.6467097354712244e-05, "loss": 0.1469, "step": 2717 }, { "epoch": 0.570528967254408, "grad_norm": 0.11445170640945435, "learning_rate": 4.6430548965050136e-05, "loss": 0.1748, "step": 2718 }, { "epoch": 0.5707388748950462, "grad_norm": 0.10902340710163116, "learning_rate": 4.6394002492264767e-05, "loss": 0.145, "step": 2719 }, { "epoch": 0.5709487825356843, "grad_norm": 0.09901425987482071, "learning_rate": 4.635745795598241e-05, "loss": 0.1384, "step": 2720 }, { "epoch": 0.5711586901763224, "grad_norm": 0.1242944598197937, "learning_rate": 4.6320915375828356e-05, "loss": 0.1573, "step": 2721 }, { "epoch": 0.5713685978169606, "grad_norm": 0.11540963500738144, "learning_rate": 4.6284374771426775e-05, "loss": 0.1307, "step": 2722 }, { "epoch": 0.5715785054575987, "grad_norm": 0.10758115351200104, "learning_rate": 4.624783616240081e-05, "loss": 0.165, "step": 2723 }, { "epoch": 0.5717884130982368, "grad_norm": 0.11912379413843155, "learning_rate": 4.6211299568372545e-05, "loss": 0.1502, "step": 2724 }, { "epoch": 0.5719983207388749, "grad_norm": 0.10736312717199326, "learning_rate": 4.617476500896296e-05, "loss": 0.1508, "step": 2725 }, { "epoch": 0.572208228379513, "grad_norm": 0.10583416372537613, "learning_rate": 4.613823250379194e-05, "loss": 0.1646, "step": 2726 }, { "epoch": 0.5724181360201511, "grad_norm": 0.10542289167642593, "learning_rate": 4.610170207247826e-05, "loss": 0.1598, "step": 2727 }, { "epoch": 0.5726280436607892, "grad_norm": 0.12792536616325378, "learning_rate": 4.606517373463963e-05, "loss": 0.1554, "step": 2728 }, { "epoch": 0.5728379513014273, "grad_norm": 0.10677836835384369, "learning_rate": 4.6028647509892574e-05, "loss": 0.1621, "step": 2729 }, { "epoch": 0.5730478589420654, "grad_norm": 0.10829446464776993, "learning_rate": 4.5992123417852526e-05, "loss": 0.1507, "step": 2730 }, { "epoch": 0.5732577665827037, "grad_norm": 0.1175784021615982, "learning_rate": 4.595560147813375e-05, "loss": 0.1704, "step": 2731 }, { "epoch": 0.5734676742233418, "grad_norm": 0.10802135616540909, "learning_rate": 4.5919081710349396e-05, "loss": 0.1707, "step": 2732 }, { "epoch": 0.5736775818639799, "grad_norm": 0.11632566154003143, "learning_rate": 4.588256413411134e-05, "loss": 0.1642, "step": 2733 }, { "epoch": 0.573887489504618, "grad_norm": 0.12177208811044693, "learning_rate": 4.5846048769030414e-05, "loss": 0.156, "step": 2734 }, { "epoch": 0.5740973971452561, "grad_norm": 0.11091190576553345, "learning_rate": 4.5809535634716175e-05, "loss": 0.1666, "step": 2735 }, { "epoch": 0.5743073047858942, "grad_norm": 0.11668343096971512, "learning_rate": 4.577302475077703e-05, "loss": 0.1381, "step": 2736 }, { "epoch": 0.5745172124265323, "grad_norm": 0.10720910876989365, "learning_rate": 4.573651613682013e-05, "loss": 0.1391, "step": 2737 }, { "epoch": 0.5747271200671704, "grad_norm": 0.12557905912399292, "learning_rate": 4.570000981245147e-05, "loss": 0.1548, "step": 2738 }, { "epoch": 0.5749370277078085, "grad_norm": 0.11367420852184296, "learning_rate": 4.566350579727577e-05, "loss": 0.1329, "step": 2739 }, { "epoch": 0.5751469353484466, "grad_norm": 0.11300312727689743, "learning_rate": 4.562700411089651e-05, "loss": 0.164, "step": 2740 }, { "epoch": 0.5753568429890848, "grad_norm": 0.1259835660457611, "learning_rate": 4.559050477291591e-05, "loss": 0.1534, "step": 2741 }, { "epoch": 0.575566750629723, "grad_norm": 0.10597500950098038, "learning_rate": 4.555400780293499e-05, "loss": 0.148, "step": 2742 }, { "epoch": 0.5757766582703611, "grad_norm": 0.11727175116539001, "learning_rate": 4.551751322055343e-05, "loss": 0.1628, "step": 2743 }, { "epoch": 0.5759865659109992, "grad_norm": 0.10633343458175659, "learning_rate": 4.5481021045369664e-05, "loss": 0.1497, "step": 2744 }, { "epoch": 0.5761964735516373, "grad_norm": 0.12353439629077911, "learning_rate": 4.5444531296980824e-05, "loss": 0.149, "step": 2745 }, { "epoch": 0.5764063811922754, "grad_norm": 0.1032649576663971, "learning_rate": 4.540804399498275e-05, "loss": 0.1385, "step": 2746 }, { "epoch": 0.5766162888329135, "grad_norm": 0.13375771045684814, "learning_rate": 4.537155915896991e-05, "loss": 0.1561, "step": 2747 }, { "epoch": 0.5768261964735516, "grad_norm": 0.11897927522659302, "learning_rate": 4.533507680853555e-05, "loss": 0.1527, "step": 2748 }, { "epoch": 0.5770361041141897, "grad_norm": 0.12122731655836105, "learning_rate": 4.5298596963271485e-05, "loss": 0.1456, "step": 2749 }, { "epoch": 0.5772460117548279, "grad_norm": 0.11372151225805283, "learning_rate": 4.5262119642768255e-05, "loss": 0.1489, "step": 2750 }, { "epoch": 0.577455919395466, "grad_norm": 0.10282732546329498, "learning_rate": 4.5225644866615e-05, "loss": 0.1572, "step": 2751 }, { "epoch": 0.5776658270361041, "grad_norm": 0.10694164037704468, "learning_rate": 4.518917265439949e-05, "loss": 0.1388, "step": 2752 }, { "epoch": 0.5778757346767422, "grad_norm": 0.11439422518014908, "learning_rate": 4.515270302570819e-05, "loss": 0.143, "step": 2753 }, { "epoch": 0.5780856423173804, "grad_norm": 0.11277041584253311, "learning_rate": 4.511623600012607e-05, "loss": 0.1586, "step": 2754 }, { "epoch": 0.5782955499580185, "grad_norm": 0.11806753277778625, "learning_rate": 4.507977159723677e-05, "loss": 0.1676, "step": 2755 }, { "epoch": 0.5785054575986566, "grad_norm": 0.1432192325592041, "learning_rate": 4.504330983662251e-05, "loss": 0.166, "step": 2756 }, { "epoch": 0.5787153652392947, "grad_norm": 0.11583569645881653, "learning_rate": 4.500685073786411e-05, "loss": 0.1629, "step": 2757 }, { "epoch": 0.5789252728799328, "grad_norm": 0.09797297418117523, "learning_rate": 4.49703943205409e-05, "loss": 0.1543, "step": 2758 }, { "epoch": 0.579135180520571, "grad_norm": 0.12834526598453522, "learning_rate": 4.493394060423084e-05, "loss": 0.1506, "step": 2759 }, { "epoch": 0.5793450881612091, "grad_norm": 0.1404927521944046, "learning_rate": 4.4897489608510406e-05, "loss": 0.1588, "step": 2760 }, { "epoch": 0.5795549958018472, "grad_norm": 0.11288987100124359, "learning_rate": 4.486104135295458e-05, "loss": 0.1622, "step": 2761 }, { "epoch": 0.5797649034424853, "grad_norm": 0.11807925254106522, "learning_rate": 4.482459585713695e-05, "loss": 0.1612, "step": 2762 }, { "epoch": 0.5799748110831234, "grad_norm": 0.142020121216774, "learning_rate": 4.4788153140629564e-05, "loss": 0.1351, "step": 2763 }, { "epoch": 0.5801847187237615, "grad_norm": 0.10823410749435425, "learning_rate": 4.4751713223002975e-05, "loss": 0.1501, "step": 2764 }, { "epoch": 0.5803946263643996, "grad_norm": 0.11584329605102539, "learning_rate": 4.4715276123826286e-05, "loss": 0.153, "step": 2765 }, { "epoch": 0.5806045340050378, "grad_norm": 0.11066281795501709, "learning_rate": 4.467884186266705e-05, "loss": 0.1457, "step": 2766 }, { "epoch": 0.5808144416456759, "grad_norm": 0.1182861477136612, "learning_rate": 4.464241045909125e-05, "loss": 0.1589, "step": 2767 }, { "epoch": 0.581024349286314, "grad_norm": 0.13916128873825073, "learning_rate": 4.460598193266343e-05, "loss": 0.1642, "step": 2768 }, { "epoch": 0.5812342569269522, "grad_norm": 0.12714353203773499, "learning_rate": 4.4569556302946525e-05, "loss": 0.1486, "step": 2769 }, { "epoch": 0.5814441645675903, "grad_norm": 0.10147153586149216, "learning_rate": 4.4533133589501916e-05, "loss": 0.1613, "step": 2770 }, { "epoch": 0.5816540722082284, "grad_norm": 0.1153976321220398, "learning_rate": 4.4496713811889455e-05, "loss": 0.1864, "step": 2771 }, { "epoch": 0.5818639798488665, "grad_norm": 0.0999862402677536, "learning_rate": 4.446029698966738e-05, "loss": 0.1459, "step": 2772 }, { "epoch": 0.5820738874895046, "grad_norm": 0.11894406378269196, "learning_rate": 4.442388314239239e-05, "loss": 0.151, "step": 2773 }, { "epoch": 0.5822837951301427, "grad_norm": 0.13212604820728302, "learning_rate": 4.4387472289619496e-05, "loss": 0.1583, "step": 2774 }, { "epoch": 0.5824937027707808, "grad_norm": 0.11380186676979065, "learning_rate": 4.43510644509022e-05, "loss": 0.1678, "step": 2775 }, { "epoch": 0.5827036104114189, "grad_norm": 0.11516433954238892, "learning_rate": 4.431465964579231e-05, "loss": 0.1676, "step": 2776 }, { "epoch": 0.582913518052057, "grad_norm": 0.14080291986465454, "learning_rate": 4.427825789384007e-05, "loss": 0.1407, "step": 2777 }, { "epoch": 0.5831234256926953, "grad_norm": 0.13508716225624084, "learning_rate": 4.424185921459403e-05, "loss": 0.1608, "step": 2778 }, { "epoch": 0.5833333333333334, "grad_norm": 0.11875400692224503, "learning_rate": 4.4205463627601135e-05, "loss": 0.1561, "step": 2779 }, { "epoch": 0.5835432409739715, "grad_norm": 0.12027221918106079, "learning_rate": 4.4169071152406636e-05, "loss": 0.1537, "step": 2780 }, { "epoch": 0.5837531486146096, "grad_norm": 0.09931481629610062, "learning_rate": 4.4132681808554124e-05, "loss": 0.1623, "step": 2781 }, { "epoch": 0.5839630562552477, "grad_norm": 0.12637794017791748, "learning_rate": 4.4096295615585483e-05, "loss": 0.1591, "step": 2782 }, { "epoch": 0.5841729638958858, "grad_norm": 0.11060679703950882, "learning_rate": 4.4059912593040964e-05, "loss": 0.1452, "step": 2783 }, { "epoch": 0.5843828715365239, "grad_norm": 0.12010272592306137, "learning_rate": 4.4023532760459055e-05, "loss": 0.179, "step": 2784 }, { "epoch": 0.584592779177162, "grad_norm": 0.10742159187793732, "learning_rate": 4.3987156137376585e-05, "loss": 0.1607, "step": 2785 }, { "epoch": 0.5848026868178001, "grad_norm": 0.11892808228731155, "learning_rate": 4.395078274332862e-05, "loss": 0.1546, "step": 2786 }, { "epoch": 0.5850125944584383, "grad_norm": 0.12276134639978409, "learning_rate": 4.3914412597848506e-05, "loss": 0.159, "step": 2787 }, { "epoch": 0.5852225020990764, "grad_norm": 0.10355465859174728, "learning_rate": 4.387804572046781e-05, "loss": 0.1518, "step": 2788 }, { "epoch": 0.5854324097397146, "grad_norm": 0.1348765641450882, "learning_rate": 4.384168213071642e-05, "loss": 0.1628, "step": 2789 }, { "epoch": 0.5856423173803527, "grad_norm": 0.10627730190753937, "learning_rate": 4.380532184812239e-05, "loss": 0.1587, "step": 2790 }, { "epoch": 0.5858522250209908, "grad_norm": 0.13369868695735931, "learning_rate": 4.376896489221203e-05, "loss": 0.155, "step": 2791 }, { "epoch": 0.5860621326616289, "grad_norm": 0.10591692477464676, "learning_rate": 4.373261128250987e-05, "loss": 0.1605, "step": 2792 }, { "epoch": 0.586272040302267, "grad_norm": 0.09910458326339722, "learning_rate": 4.369626103853859e-05, "loss": 0.1597, "step": 2793 }, { "epoch": 0.5864819479429051, "grad_norm": 0.1167030781507492, "learning_rate": 4.365991417981915e-05, "loss": 0.1627, "step": 2794 }, { "epoch": 0.5866918555835432, "grad_norm": 0.10456610471010208, "learning_rate": 4.362357072587061e-05, "loss": 0.1488, "step": 2795 }, { "epoch": 0.5869017632241813, "grad_norm": 0.11257223784923553, "learning_rate": 4.3587230696210224e-05, "loss": 0.1537, "step": 2796 }, { "epoch": 0.5871116708648195, "grad_norm": 0.1202881932258606, "learning_rate": 4.355089411035346e-05, "loss": 0.1685, "step": 2797 }, { "epoch": 0.5873215785054576, "grad_norm": 0.1485619693994522, "learning_rate": 4.3514560987813864e-05, "loss": 0.16, "step": 2798 }, { "epoch": 0.5875314861460957, "grad_norm": 0.12521955370903015, "learning_rate": 4.347823134810315e-05, "loss": 0.1631, "step": 2799 }, { "epoch": 0.5877413937867338, "grad_norm": 0.09790458530187607, "learning_rate": 4.344190521073119e-05, "loss": 0.1359, "step": 2800 }, { "epoch": 0.587951301427372, "grad_norm": 0.11199049651622772, "learning_rate": 4.340558259520594e-05, "loss": 0.1458, "step": 2801 }, { "epoch": 0.5881612090680101, "grad_norm": 0.1020960882306099, "learning_rate": 4.336926352103345e-05, "loss": 0.1559, "step": 2802 }, { "epoch": 0.5883711167086482, "grad_norm": 0.12979748845100403, "learning_rate": 4.3332948007717925e-05, "loss": 0.1654, "step": 2803 }, { "epoch": 0.5885810243492863, "grad_norm": 0.1190989539027214, "learning_rate": 4.32966360747616e-05, "loss": 0.1682, "step": 2804 }, { "epoch": 0.5887909319899244, "grad_norm": 0.10461704432964325, "learning_rate": 4.3260327741664824e-05, "loss": 0.1759, "step": 2805 }, { "epoch": 0.5890008396305626, "grad_norm": 0.11184418946504593, "learning_rate": 4.3224023027926007e-05, "loss": 0.1606, "step": 2806 }, { "epoch": 0.5892107472712007, "grad_norm": 0.10270063579082489, "learning_rate": 4.31877219530416e-05, "loss": 0.1488, "step": 2807 }, { "epoch": 0.5894206549118388, "grad_norm": 0.12192819267511368, "learning_rate": 4.315142453650613e-05, "loss": 0.1723, "step": 2808 }, { "epoch": 0.5896305625524769, "grad_norm": 0.11914799362421036, "learning_rate": 4.311513079781211e-05, "loss": 0.1651, "step": 2809 }, { "epoch": 0.589840470193115, "grad_norm": 0.11079401522874832, "learning_rate": 4.307884075645012e-05, "loss": 0.1475, "step": 2810 }, { "epoch": 0.5900503778337531, "grad_norm": 0.13855847716331482, "learning_rate": 4.3042554431908735e-05, "loss": 0.1488, "step": 2811 }, { "epoch": 0.5902602854743912, "grad_norm": 0.12921065092086792, "learning_rate": 4.3006271843674546e-05, "loss": 0.1688, "step": 2812 }, { "epoch": 0.5904701931150294, "grad_norm": 0.15366600453853607, "learning_rate": 4.296999301123212e-05, "loss": 0.1582, "step": 2813 }, { "epoch": 0.5906801007556675, "grad_norm": 0.12695199251174927, "learning_rate": 4.293371795406404e-05, "loss": 0.1645, "step": 2814 }, { "epoch": 0.5908900083963057, "grad_norm": 0.13746286928653717, "learning_rate": 4.289744669165085e-05, "loss": 0.1688, "step": 2815 }, { "epoch": 0.5910999160369438, "grad_norm": 0.09618178009986877, "learning_rate": 4.286117924347101e-05, "loss": 0.1746, "step": 2816 }, { "epoch": 0.5913098236775819, "grad_norm": 0.12498153746128082, "learning_rate": 4.282491562900097e-05, "loss": 0.1399, "step": 2817 }, { "epoch": 0.59151973131822, "grad_norm": 0.11736780405044556, "learning_rate": 4.278865586771515e-05, "loss": 0.145, "step": 2818 }, { "epoch": 0.5917296389588581, "grad_norm": 0.12359510362148285, "learning_rate": 4.2752399979085836e-05, "loss": 0.1406, "step": 2819 }, { "epoch": 0.5919395465994962, "grad_norm": 0.13295608758926392, "learning_rate": 4.27161479825833e-05, "loss": 0.1425, "step": 2820 }, { "epoch": 0.5921494542401343, "grad_norm": 0.12067288160324097, "learning_rate": 4.267989989767568e-05, "loss": 0.1526, "step": 2821 }, { "epoch": 0.5923593618807724, "grad_norm": 0.12987972795963287, "learning_rate": 4.264365574382903e-05, "loss": 0.1488, "step": 2822 }, { "epoch": 0.5925692695214105, "grad_norm": 0.13814137876033783, "learning_rate": 4.2607415540507264e-05, "loss": 0.1575, "step": 2823 }, { "epoch": 0.5927791771620488, "grad_norm": 0.10198640078306198, "learning_rate": 4.2571179307172235e-05, "loss": 0.165, "step": 2824 }, { "epoch": 0.5929890848026869, "grad_norm": 0.1260635405778885, "learning_rate": 4.2534947063283594e-05, "loss": 0.1578, "step": 2825 }, { "epoch": 0.593198992443325, "grad_norm": 0.11537054181098938, "learning_rate": 4.2498718828298906e-05, "loss": 0.1623, "step": 2826 }, { "epoch": 0.5934089000839631, "grad_norm": 0.11723814159631729, "learning_rate": 4.246249462167356e-05, "loss": 0.1391, "step": 2827 }, { "epoch": 0.5936188077246012, "grad_norm": 0.11966104060411453, "learning_rate": 4.242627446286076e-05, "loss": 0.1635, "step": 2828 }, { "epoch": 0.5938287153652393, "grad_norm": 0.11630399525165558, "learning_rate": 4.23900583713116e-05, "loss": 0.1578, "step": 2829 }, { "epoch": 0.5940386230058774, "grad_norm": 0.14845402538776398, "learning_rate": 4.235384636647492e-05, "loss": 0.1327, "step": 2830 }, { "epoch": 0.5942485306465155, "grad_norm": 0.1327534019947052, "learning_rate": 4.2317638467797384e-05, "loss": 0.1416, "step": 2831 }, { "epoch": 0.5944584382871536, "grad_norm": 0.11541690677404404, "learning_rate": 4.228143469472348e-05, "loss": 0.1712, "step": 2832 }, { "epoch": 0.5946683459277917, "grad_norm": 0.11513253301382065, "learning_rate": 4.224523506669545e-05, "loss": 0.1337, "step": 2833 }, { "epoch": 0.5948782535684299, "grad_norm": 0.127321258187294, "learning_rate": 4.220903960315333e-05, "loss": 0.1663, "step": 2834 }, { "epoch": 0.595088161209068, "grad_norm": 0.12505748867988586, "learning_rate": 4.2172848323534915e-05, "loss": 0.1545, "step": 2835 }, { "epoch": 0.5952980688497062, "grad_norm": 0.10772762447595596, "learning_rate": 4.213666124727575e-05, "loss": 0.1606, "step": 2836 }, { "epoch": 0.5955079764903443, "grad_norm": 0.12374143302440643, "learning_rate": 4.210047839380909e-05, "loss": 0.1502, "step": 2837 }, { "epoch": 0.5957178841309824, "grad_norm": 0.11770914494991302, "learning_rate": 4.206429978256599e-05, "loss": 0.1639, "step": 2838 }, { "epoch": 0.5959277917716205, "grad_norm": 0.12015975266695023, "learning_rate": 4.2028125432975175e-05, "loss": 0.1571, "step": 2839 }, { "epoch": 0.5961376994122586, "grad_norm": 0.10522496700286865, "learning_rate": 4.1991955364463096e-05, "loss": 0.1538, "step": 2840 }, { "epoch": 0.5963476070528967, "grad_norm": 0.1105801910161972, "learning_rate": 4.195578959645391e-05, "loss": 0.1461, "step": 2841 }, { "epoch": 0.5965575146935348, "grad_norm": 0.11777419596910477, "learning_rate": 4.191962814836945e-05, "loss": 0.1573, "step": 2842 }, { "epoch": 0.596767422334173, "grad_norm": 0.12069468200206757, "learning_rate": 4.188347103962928e-05, "loss": 0.1567, "step": 2843 }, { "epoch": 0.5969773299748111, "grad_norm": 0.11869499832391739, "learning_rate": 4.184731828965054e-05, "loss": 0.1645, "step": 2844 }, { "epoch": 0.5971872376154492, "grad_norm": 0.1150764673948288, "learning_rate": 4.181116991784811e-05, "loss": 0.1523, "step": 2845 }, { "epoch": 0.5973971452560873, "grad_norm": 0.1119702160358429, "learning_rate": 4.177502594363448e-05, "loss": 0.144, "step": 2846 }, { "epoch": 0.5976070528967254, "grad_norm": 0.17013654112815857, "learning_rate": 4.173888638641981e-05, "loss": 0.1726, "step": 2847 }, { "epoch": 0.5978169605373636, "grad_norm": 0.10498031228780746, "learning_rate": 4.1702751265611835e-05, "loss": 0.1413, "step": 2848 }, { "epoch": 0.5980268681780017, "grad_norm": 0.11212137341499329, "learning_rate": 4.166662060061597e-05, "loss": 0.137, "step": 2849 }, { "epoch": 0.5982367758186398, "grad_norm": 0.12234228104352951, "learning_rate": 4.163049441083522e-05, "loss": 0.1719, "step": 2850 }, { "epoch": 0.5984466834592779, "grad_norm": 0.10817383229732513, "learning_rate": 4.1594372715670124e-05, "loss": 0.1584, "step": 2851 }, { "epoch": 0.5986565910999161, "grad_norm": 0.13407209515571594, "learning_rate": 4.155825553451889e-05, "loss": 0.1636, "step": 2852 }, { "epoch": 0.5988664987405542, "grad_norm": 0.15183603763580322, "learning_rate": 4.152214288677726e-05, "loss": 0.1723, "step": 2853 }, { "epoch": 0.5990764063811923, "grad_norm": 0.11314138770103455, "learning_rate": 4.1486034791838554e-05, "loss": 0.1527, "step": 2854 }, { "epoch": 0.5992863140218304, "grad_norm": 0.11711379885673523, "learning_rate": 4.144993126909365e-05, "loss": 0.1605, "step": 2855 }, { "epoch": 0.5994962216624685, "grad_norm": 0.13769389688968658, "learning_rate": 4.141383233793097e-05, "loss": 0.1506, "step": 2856 }, { "epoch": 0.5997061293031066, "grad_norm": 0.16857290267944336, "learning_rate": 4.137773801773644e-05, "loss": 0.1744, "step": 2857 }, { "epoch": 0.5999160369437447, "grad_norm": 0.1259475201368332, "learning_rate": 4.1341648327893544e-05, "loss": 0.1668, "step": 2858 }, { "epoch": 0.6001259445843828, "grad_norm": 0.10995698720216751, "learning_rate": 4.1305563287783284e-05, "loss": 0.146, "step": 2859 }, { "epoch": 0.600335852225021, "grad_norm": 0.10539478063583374, "learning_rate": 4.1269482916784124e-05, "loss": 0.146, "step": 2860 }, { "epoch": 0.6005457598656591, "grad_norm": 0.11225578933954239, "learning_rate": 4.1233407234272084e-05, "loss": 0.1416, "step": 2861 }, { "epoch": 0.6007556675062973, "grad_norm": 0.11747094988822937, "learning_rate": 4.11973362596206e-05, "loss": 0.1529, "step": 2862 }, { "epoch": 0.6009655751469354, "grad_norm": 0.13106967508792877, "learning_rate": 4.116127001220063e-05, "loss": 0.1593, "step": 2863 }, { "epoch": 0.6011754827875735, "grad_norm": 0.1032341942191124, "learning_rate": 4.112520851138054e-05, "loss": 0.1539, "step": 2864 }, { "epoch": 0.6013853904282116, "grad_norm": 0.11255158483982086, "learning_rate": 4.1089151776526215e-05, "loss": 0.1546, "step": 2865 }, { "epoch": 0.6015952980688497, "grad_norm": 0.11748593300580978, "learning_rate": 4.1053099827000915e-05, "loss": 0.1536, "step": 2866 }, { "epoch": 0.6018052057094878, "grad_norm": 0.13722585141658783, "learning_rate": 4.10170526821654e-05, "loss": 0.1613, "step": 2867 }, { "epoch": 0.6020151133501259, "grad_norm": 0.12346901744604111, "learning_rate": 4.098101036137778e-05, "loss": 0.1535, "step": 2868 }, { "epoch": 0.602225020990764, "grad_norm": 0.11565665900707245, "learning_rate": 4.0944972883993594e-05, "loss": 0.1492, "step": 2869 }, { "epoch": 0.6024349286314021, "grad_norm": 0.12450307607650757, "learning_rate": 4.0908940269365835e-05, "loss": 0.1507, "step": 2870 }, { "epoch": 0.6026448362720404, "grad_norm": 0.10263592004776001, "learning_rate": 4.0872912536844805e-05, "loss": 0.1552, "step": 2871 }, { "epoch": 0.6028547439126785, "grad_norm": 0.10213368386030197, "learning_rate": 4.0836889705778215e-05, "loss": 0.1548, "step": 2872 }, { "epoch": 0.6030646515533166, "grad_norm": 0.12833476066589355, "learning_rate": 4.0800871795511164e-05, "loss": 0.1657, "step": 2873 }, { "epoch": 0.6032745591939547, "grad_norm": 0.13922633230686188, "learning_rate": 4.07648588253861e-05, "loss": 0.14, "step": 2874 }, { "epoch": 0.6034844668345928, "grad_norm": 0.12405204772949219, "learning_rate": 4.072885081474278e-05, "loss": 0.1756, "step": 2875 }, { "epoch": 0.6036943744752309, "grad_norm": 0.12056063115596771, "learning_rate": 4.069284778291837e-05, "loss": 0.1625, "step": 2876 }, { "epoch": 0.603904282115869, "grad_norm": 0.1193803995847702, "learning_rate": 4.06568497492473e-05, "loss": 0.162, "step": 2877 }, { "epoch": 0.6041141897565071, "grad_norm": 0.1092044785618782, "learning_rate": 4.062085673306132e-05, "loss": 0.1538, "step": 2878 }, { "epoch": 0.6043240973971452, "grad_norm": 0.1371615082025528, "learning_rate": 4.058486875368952e-05, "loss": 0.1605, "step": 2879 }, { "epoch": 0.6045340050377834, "grad_norm": 0.11028730124235153, "learning_rate": 4.054888583045827e-05, "loss": 0.1572, "step": 2880 }, { "epoch": 0.6047439126784215, "grad_norm": 0.13068102300167084, "learning_rate": 4.05129079826912e-05, "loss": 0.1578, "step": 2881 }, { "epoch": 0.6049538203190596, "grad_norm": 0.10060226172208786, "learning_rate": 4.0476935229709265e-05, "loss": 0.1732, "step": 2882 }, { "epoch": 0.6051637279596978, "grad_norm": 0.10314224660396576, "learning_rate": 4.0440967590830625e-05, "loss": 0.1646, "step": 2883 }, { "epoch": 0.6053736356003359, "grad_norm": 0.13616135716438293, "learning_rate": 4.040500508537077e-05, "loss": 0.1778, "step": 2884 }, { "epoch": 0.605583543240974, "grad_norm": 0.21042796969413757, "learning_rate": 4.036904773264234e-05, "loss": 0.1501, "step": 2885 }, { "epoch": 0.6057934508816121, "grad_norm": 0.115667924284935, "learning_rate": 4.033309555195527e-05, "loss": 0.1707, "step": 2886 }, { "epoch": 0.6060033585222502, "grad_norm": 0.17195206880569458, "learning_rate": 4.02971485626167e-05, "loss": 0.1539, "step": 2887 }, { "epoch": 0.6062132661628883, "grad_norm": 0.10804500430822372, "learning_rate": 4.0261206783931e-05, "loss": 0.1494, "step": 2888 }, { "epoch": 0.6064231738035264, "grad_norm": 0.1175246611237526, "learning_rate": 4.02252702351997e-05, "loss": 0.1579, "step": 2889 }, { "epoch": 0.6066330814441646, "grad_norm": 0.11100801080465317, "learning_rate": 4.018933893572157e-05, "loss": 0.1525, "step": 2890 }, { "epoch": 0.6068429890848027, "grad_norm": 0.1287776231765747, "learning_rate": 4.015341290479255e-05, "loss": 0.1573, "step": 2891 }, { "epoch": 0.6070528967254408, "grad_norm": 0.0996914654970169, "learning_rate": 4.01174921617057e-05, "loss": 0.1667, "step": 2892 }, { "epoch": 0.6072628043660789, "grad_norm": 0.11938051134347916, "learning_rate": 4.0081576725751294e-05, "loss": 0.1585, "step": 2893 }, { "epoch": 0.607472712006717, "grad_norm": 0.13560616970062256, "learning_rate": 4.004566661621676e-05, "loss": 0.1533, "step": 2894 }, { "epoch": 0.6076826196473551, "grad_norm": 0.15640297532081604, "learning_rate": 4.000976185238662e-05, "loss": 0.1637, "step": 2895 }, { "epoch": 0.6078925272879933, "grad_norm": 0.12788182497024536, "learning_rate": 3.9973862453542575e-05, "loss": 0.1464, "step": 2896 }, { "epoch": 0.6081024349286314, "grad_norm": 0.1404847800731659, "learning_rate": 3.9937968438963416e-05, "loss": 0.1727, "step": 2897 }, { "epoch": 0.6083123425692695, "grad_norm": 0.1205708235502243, "learning_rate": 3.990207982792505e-05, "loss": 0.1554, "step": 2898 }, { "epoch": 0.6085222502099077, "grad_norm": 0.13461466133594513, "learning_rate": 3.986619663970047e-05, "loss": 0.1713, "step": 2899 }, { "epoch": 0.6087321578505458, "grad_norm": 0.12866820394992828, "learning_rate": 3.983031889355978e-05, "loss": 0.1588, "step": 2900 }, { "epoch": 0.6089420654911839, "grad_norm": 0.11548592895269394, "learning_rate": 3.9794446608770134e-05, "loss": 0.1507, "step": 2901 }, { "epoch": 0.609151973131822, "grad_norm": 0.12713722884655, "learning_rate": 3.975857980459579e-05, "loss": 0.1499, "step": 2902 }, { "epoch": 0.6093618807724601, "grad_norm": 0.13451896607875824, "learning_rate": 3.9722718500298026e-05, "loss": 0.1675, "step": 2903 }, { "epoch": 0.6095717884130982, "grad_norm": 0.12704823911190033, "learning_rate": 3.9686862715135176e-05, "loss": 0.1434, "step": 2904 }, { "epoch": 0.6097816960537363, "grad_norm": 0.11345957219600677, "learning_rate": 3.965101246836265e-05, "loss": 0.1603, "step": 2905 }, { "epoch": 0.6099916036943744, "grad_norm": 0.10510886460542679, "learning_rate": 3.96151677792328e-05, "loss": 0.1541, "step": 2906 }, { "epoch": 0.6102015113350125, "grad_norm": 0.1235707551240921, "learning_rate": 3.957932866699508e-05, "loss": 0.1409, "step": 2907 }, { "epoch": 0.6104114189756508, "grad_norm": 0.11852505803108215, "learning_rate": 3.954349515089589e-05, "loss": 0.167, "step": 2908 }, { "epoch": 0.6106213266162889, "grad_norm": 0.10151386260986328, "learning_rate": 3.950766725017866e-05, "loss": 0.1484, "step": 2909 }, { "epoch": 0.610831234256927, "grad_norm": 0.1326039880514145, "learning_rate": 3.947184498408378e-05, "loss": 0.1552, "step": 2910 }, { "epoch": 0.6110411418975651, "grad_norm": 0.14236897230148315, "learning_rate": 3.9436028371848646e-05, "loss": 0.1583, "step": 2911 }, { "epoch": 0.6112510495382032, "grad_norm": 0.1296806037425995, "learning_rate": 3.940021743270759e-05, "loss": 0.1542, "step": 2912 }, { "epoch": 0.6114609571788413, "grad_norm": 0.10085690766572952, "learning_rate": 3.936441218589187e-05, "loss": 0.1518, "step": 2913 }, { "epoch": 0.6116708648194794, "grad_norm": 0.1088392585515976, "learning_rate": 3.9328612650629765e-05, "loss": 0.1605, "step": 2914 }, { "epoch": 0.6118807724601175, "grad_norm": 0.09953225404024124, "learning_rate": 3.929281884614643e-05, "loss": 0.1409, "step": 2915 }, { "epoch": 0.6120906801007556, "grad_norm": 0.10369337350130081, "learning_rate": 3.925703079166394e-05, "loss": 0.1579, "step": 2916 }, { "epoch": 0.6123005877413937, "grad_norm": 0.10897854715585709, "learning_rate": 3.922124850640131e-05, "loss": 0.1493, "step": 2917 }, { "epoch": 0.612510495382032, "grad_norm": 0.09800262004137039, "learning_rate": 3.918547200957445e-05, "loss": 0.1606, "step": 2918 }, { "epoch": 0.6127204030226701, "grad_norm": 0.09800999611616135, "learning_rate": 3.914970132039616e-05, "loss": 0.1465, "step": 2919 }, { "epoch": 0.6129303106633082, "grad_norm": 0.11520504951477051, "learning_rate": 3.9113936458076106e-05, "loss": 0.1544, "step": 2920 }, { "epoch": 0.6131402183039463, "grad_norm": 0.12162473052740097, "learning_rate": 3.9078177441820834e-05, "loss": 0.1511, "step": 2921 }, { "epoch": 0.6133501259445844, "grad_norm": 0.14628733694553375, "learning_rate": 3.9042424290833746e-05, "loss": 0.1564, "step": 2922 }, { "epoch": 0.6135600335852225, "grad_norm": 0.11655198037624359, "learning_rate": 3.900667702431512e-05, "loss": 0.1665, "step": 2923 }, { "epoch": 0.6137699412258606, "grad_norm": 0.11280401051044464, "learning_rate": 3.897093566146204e-05, "loss": 0.1672, "step": 2924 }, { "epoch": 0.6139798488664987, "grad_norm": 0.13265039026737213, "learning_rate": 3.8935200221468446e-05, "loss": 0.1703, "step": 2925 }, { "epoch": 0.6141897565071368, "grad_norm": 0.10153202712535858, "learning_rate": 3.88994707235251e-05, "loss": 0.1498, "step": 2926 }, { "epoch": 0.614399664147775, "grad_norm": 0.1125868558883667, "learning_rate": 3.886374718681952e-05, "loss": 0.1737, "step": 2927 }, { "epoch": 0.6146095717884131, "grad_norm": 0.1055431067943573, "learning_rate": 3.882802963053607e-05, "loss": 0.1531, "step": 2928 }, { "epoch": 0.6148194794290512, "grad_norm": 0.12137924879789352, "learning_rate": 3.8792318073855913e-05, "loss": 0.1498, "step": 2929 }, { "epoch": 0.6150293870696893, "grad_norm": 0.11451124399900436, "learning_rate": 3.8756612535956954e-05, "loss": 0.1613, "step": 2930 }, { "epoch": 0.6152392947103275, "grad_norm": 0.10448075830936432, "learning_rate": 3.87209130360139e-05, "loss": 0.1565, "step": 2931 }, { "epoch": 0.6154492023509656, "grad_norm": 0.11644694209098816, "learning_rate": 3.868521959319817e-05, "loss": 0.1691, "step": 2932 }, { "epoch": 0.6156591099916037, "grad_norm": 0.11724112182855606, "learning_rate": 3.8649532226678015e-05, "loss": 0.1637, "step": 2933 }, { "epoch": 0.6158690176322418, "grad_norm": 0.1371581256389618, "learning_rate": 3.861385095561829e-05, "loss": 0.143, "step": 2934 }, { "epoch": 0.6160789252728799, "grad_norm": 0.12549670040607452, "learning_rate": 3.85781757991807e-05, "loss": 0.1523, "step": 2935 }, { "epoch": 0.6162888329135181, "grad_norm": 0.10449302196502686, "learning_rate": 3.85425067765236e-05, "loss": 0.1508, "step": 2936 }, { "epoch": 0.6164987405541562, "grad_norm": 0.12108724564313889, "learning_rate": 3.8506843906802085e-05, "loss": 0.1502, "step": 2937 }, { "epoch": 0.6167086481947943, "grad_norm": 0.10048652440309525, "learning_rate": 3.8471187209167925e-05, "loss": 0.1491, "step": 2938 }, { "epoch": 0.6169185558354324, "grad_norm": 0.13422608375549316, "learning_rate": 3.843553670276957e-05, "loss": 0.1629, "step": 2939 }, { "epoch": 0.6171284634760705, "grad_norm": 0.14751432836055756, "learning_rate": 3.8399892406752183e-05, "loss": 0.1563, "step": 2940 }, { "epoch": 0.6173383711167086, "grad_norm": 0.11447716504335403, "learning_rate": 3.836425434025754e-05, "loss": 0.1604, "step": 2941 }, { "epoch": 0.6175482787573467, "grad_norm": 0.13063320517539978, "learning_rate": 3.83286225224241e-05, "loss": 0.1692, "step": 2942 }, { "epoch": 0.6177581863979849, "grad_norm": 0.13792036473751068, "learning_rate": 3.8292996972386976e-05, "loss": 0.1506, "step": 2943 }, { "epoch": 0.617968094038623, "grad_norm": 0.12366560101509094, "learning_rate": 3.825737770927789e-05, "loss": 0.1476, "step": 2944 }, { "epoch": 0.6181780016792612, "grad_norm": 0.1087699830532074, "learning_rate": 3.8221764752225195e-05, "loss": 0.1565, "step": 2945 }, { "epoch": 0.6183879093198993, "grad_norm": 0.1199265792965889, "learning_rate": 3.8186158120353884e-05, "loss": 0.174, "step": 2946 }, { "epoch": 0.6185978169605374, "grad_norm": 0.10412439703941345, "learning_rate": 3.815055783278553e-05, "loss": 0.1285, "step": 2947 }, { "epoch": 0.6188077246011755, "grad_norm": 0.12668044865131378, "learning_rate": 3.8114963908638266e-05, "loss": 0.1644, "step": 2948 }, { "epoch": 0.6190176322418136, "grad_norm": 0.13107618689537048, "learning_rate": 3.807937636702688e-05, "loss": 0.1654, "step": 2949 }, { "epoch": 0.6192275398824517, "grad_norm": 0.11218323558568954, "learning_rate": 3.8043795227062675e-05, "loss": 0.1364, "step": 2950 }, { "epoch": 0.6194374475230898, "grad_norm": 0.10924772173166275, "learning_rate": 3.800822050785353e-05, "loss": 0.1565, "step": 2951 }, { "epoch": 0.6196473551637279, "grad_norm": 0.12488337606191635, "learning_rate": 3.797265222850391e-05, "loss": 0.1632, "step": 2952 }, { "epoch": 0.619857262804366, "grad_norm": 0.12105172872543335, "learning_rate": 3.7937090408114784e-05, "loss": 0.1601, "step": 2953 }, { "epoch": 0.6200671704450041, "grad_norm": 0.13021235167980194, "learning_rate": 3.7901535065783633e-05, "loss": 0.1697, "step": 2954 }, { "epoch": 0.6202770780856424, "grad_norm": 0.12432187795639038, "learning_rate": 3.786598622060453e-05, "loss": 0.1648, "step": 2955 }, { "epoch": 0.6204869857262805, "grad_norm": 0.10438645631074905, "learning_rate": 3.7830443891667986e-05, "loss": 0.1515, "step": 2956 }, { "epoch": 0.6206968933669186, "grad_norm": 0.10381833463907242, "learning_rate": 3.7794908098061046e-05, "loss": 0.1398, "step": 2957 }, { "epoch": 0.6209068010075567, "grad_norm": 0.1131964772939682, "learning_rate": 3.775937885886726e-05, "loss": 0.1674, "step": 2958 }, { "epoch": 0.6211167086481948, "grad_norm": 0.09784296900033951, "learning_rate": 3.772385619316663e-05, "loss": 0.1519, "step": 2959 }, { "epoch": 0.6213266162888329, "grad_norm": 0.10954361408948898, "learning_rate": 3.7688340120035654e-05, "loss": 0.1376, "step": 2960 }, { "epoch": 0.621536523929471, "grad_norm": 0.10720100998878479, "learning_rate": 3.7652830658547254e-05, "loss": 0.1494, "step": 2961 }, { "epoch": 0.6217464315701091, "grad_norm": 0.11651554703712463, "learning_rate": 3.761732782777083e-05, "loss": 0.1522, "step": 2962 }, { "epoch": 0.6219563392107472, "grad_norm": 0.12341861426830292, "learning_rate": 3.7581831646772205e-05, "loss": 0.1486, "step": 2963 }, { "epoch": 0.6221662468513854, "grad_norm": 0.12296581268310547, "learning_rate": 3.754634213461365e-05, "loss": 0.1553, "step": 2964 }, { "epoch": 0.6223761544920235, "grad_norm": 0.11492616683244705, "learning_rate": 3.7510859310353836e-05, "loss": 0.1526, "step": 2965 }, { "epoch": 0.6225860621326617, "grad_norm": 0.13258479535579681, "learning_rate": 3.7475383193047866e-05, "loss": 0.1521, "step": 2966 }, { "epoch": 0.6227959697732998, "grad_norm": 0.11529724299907684, "learning_rate": 3.7439913801747214e-05, "loss": 0.1423, "step": 2967 }, { "epoch": 0.6230058774139379, "grad_norm": 0.12614604830741882, "learning_rate": 3.7404451155499755e-05, "loss": 0.1416, "step": 2968 }, { "epoch": 0.623215785054576, "grad_norm": 0.13627269864082336, "learning_rate": 3.7368995273349716e-05, "loss": 0.1701, "step": 2969 }, { "epoch": 0.6234256926952141, "grad_norm": 0.09332132339477539, "learning_rate": 3.733354617433776e-05, "loss": 0.1567, "step": 2970 }, { "epoch": 0.6236356003358522, "grad_norm": 0.1580103486776352, "learning_rate": 3.729810387750082e-05, "loss": 0.1779, "step": 2971 }, { "epoch": 0.6238455079764903, "grad_norm": 0.10894285887479782, "learning_rate": 3.726266840187226e-05, "loss": 0.1634, "step": 2972 }, { "epoch": 0.6240554156171285, "grad_norm": 0.14514589309692383, "learning_rate": 3.72272397664817e-05, "loss": 0.1551, "step": 2973 }, { "epoch": 0.6242653232577666, "grad_norm": 0.13061858713626862, "learning_rate": 3.719181799035519e-05, "loss": 0.1612, "step": 2974 }, { "epoch": 0.6244752308984047, "grad_norm": 0.1479208767414093, "learning_rate": 3.715640309251495e-05, "loss": 0.159, "step": 2975 }, { "epoch": 0.6246851385390428, "grad_norm": 0.11024679988622665, "learning_rate": 3.712099509197964e-05, "loss": 0.1415, "step": 2976 }, { "epoch": 0.624895046179681, "grad_norm": 0.12445060163736343, "learning_rate": 3.7085594007764134e-05, "loss": 0.1511, "step": 2977 }, { "epoch": 0.625104953820319, "grad_norm": 0.10473346710205078, "learning_rate": 3.705019985887966e-05, "loss": 0.1575, "step": 2978 }, { "epoch": 0.6253148614609572, "grad_norm": 0.11598935723304749, "learning_rate": 3.701481266433367e-05, "loss": 0.1517, "step": 2979 }, { "epoch": 0.6255247691015953, "grad_norm": 0.1171158105134964, "learning_rate": 3.697943244312988e-05, "loss": 0.1755, "step": 2980 }, { "epoch": 0.6257346767422334, "grad_norm": 0.12433173507452011, "learning_rate": 3.694405921426831e-05, "loss": 0.1409, "step": 2981 }, { "epoch": 0.6259445843828715, "grad_norm": 0.13838061690330505, "learning_rate": 3.690869299674515e-05, "loss": 0.1632, "step": 2982 }, { "epoch": 0.6261544920235097, "grad_norm": 0.1209079921245575, "learning_rate": 3.687333380955289e-05, "loss": 0.15, "step": 2983 }, { "epoch": 0.6263643996641478, "grad_norm": 0.11533726751804352, "learning_rate": 3.683798167168023e-05, "loss": 0.1593, "step": 2984 }, { "epoch": 0.6265743073047859, "grad_norm": 0.11141890287399292, "learning_rate": 3.680263660211206e-05, "loss": 0.1448, "step": 2985 }, { "epoch": 0.626784214945424, "grad_norm": 0.10416781902313232, "learning_rate": 3.676729861982948e-05, "loss": 0.1492, "step": 2986 }, { "epoch": 0.6269941225860621, "grad_norm": 0.13024356961250305, "learning_rate": 3.673196774380983e-05, "loss": 0.15, "step": 2987 }, { "epoch": 0.6272040302267002, "grad_norm": 0.1203446090221405, "learning_rate": 3.669664399302658e-05, "loss": 0.1673, "step": 2988 }, { "epoch": 0.6274139378673383, "grad_norm": 0.11524245887994766, "learning_rate": 3.6661327386449365e-05, "loss": 0.1347, "step": 2989 }, { "epoch": 0.6276238455079765, "grad_norm": 0.1355539858341217, "learning_rate": 3.662601794304404e-05, "loss": 0.1643, "step": 2990 }, { "epoch": 0.6278337531486146, "grad_norm": 0.11982898414134979, "learning_rate": 3.659071568177258e-05, "loss": 0.1396, "step": 2991 }, { "epoch": 0.6280436607892528, "grad_norm": 0.11782204359769821, "learning_rate": 3.655542062159308e-05, "loss": 0.1611, "step": 2992 }, { "epoch": 0.6282535684298909, "grad_norm": 0.13236179947853088, "learning_rate": 3.652013278145983e-05, "loss": 0.1534, "step": 2993 }, { "epoch": 0.628463476070529, "grad_norm": 0.11964340507984161, "learning_rate": 3.648485218032317e-05, "loss": 0.1508, "step": 2994 }, { "epoch": 0.6286733837111671, "grad_norm": 0.11381798982620239, "learning_rate": 3.644957883712964e-05, "loss": 0.1849, "step": 2995 }, { "epoch": 0.6288832913518052, "grad_norm": 0.13884767889976501, "learning_rate": 3.641431277082178e-05, "loss": 0.1621, "step": 2996 }, { "epoch": 0.6290931989924433, "grad_norm": 0.13538014888763428, "learning_rate": 3.63790540003383e-05, "loss": 0.1598, "step": 2997 }, { "epoch": 0.6293031066330814, "grad_norm": 0.11793238669633865, "learning_rate": 3.6343802544613936e-05, "loss": 0.1484, "step": 2998 }, { "epoch": 0.6295130142737195, "grad_norm": 0.15728533267974854, "learning_rate": 3.6308558422579565e-05, "loss": 0.1581, "step": 2999 }, { "epoch": 0.6297229219143576, "grad_norm": 0.10782770812511444, "learning_rate": 3.6273321653162055e-05, "loss": 0.1559, "step": 3000 }, { "epoch": 0.6299328295549959, "grad_norm": 0.11745624989271164, "learning_rate": 3.623809225528438e-05, "loss": 0.1651, "step": 3001 }, { "epoch": 0.630142737195634, "grad_norm": 0.1334666609764099, "learning_rate": 3.620287024786553e-05, "loss": 0.1458, "step": 3002 }, { "epoch": 0.6303526448362721, "grad_norm": 0.1259807050228119, "learning_rate": 3.6167655649820495e-05, "loss": 0.1555, "step": 3003 }, { "epoch": 0.6305625524769102, "grad_norm": 0.13792045414447784, "learning_rate": 3.6132448480060346e-05, "loss": 0.1523, "step": 3004 }, { "epoch": 0.6307724601175483, "grad_norm": 0.10497776418924332, "learning_rate": 3.609724875749213e-05, "loss": 0.1446, "step": 3005 }, { "epoch": 0.6309823677581864, "grad_norm": 0.11485005170106888, "learning_rate": 3.606205650101889e-05, "loss": 0.15, "step": 3006 }, { "epoch": 0.6311922753988245, "grad_norm": 0.12223370373249054, "learning_rate": 3.602687172953971e-05, "loss": 0.1607, "step": 3007 }, { "epoch": 0.6314021830394626, "grad_norm": 0.12655122578144073, "learning_rate": 3.599169446194958e-05, "loss": 0.1636, "step": 3008 }, { "epoch": 0.6316120906801007, "grad_norm": 0.11615359783172607, "learning_rate": 3.595652471713953e-05, "loss": 0.1651, "step": 3009 }, { "epoch": 0.6318219983207388, "grad_norm": 0.11752969771623611, "learning_rate": 3.592136251399646e-05, "loss": 0.1485, "step": 3010 }, { "epoch": 0.632031905961377, "grad_norm": 0.11900593340396881, "learning_rate": 3.5886207871403326e-05, "loss": 0.159, "step": 3011 }, { "epoch": 0.6322418136020151, "grad_norm": 0.10577517747879028, "learning_rate": 3.5851060808238944e-05, "loss": 0.1333, "step": 3012 }, { "epoch": 0.6324517212426533, "grad_norm": 0.104424849152565, "learning_rate": 3.5815921343378125e-05, "loss": 0.1541, "step": 3013 }, { "epoch": 0.6326616288832914, "grad_norm": 0.1216193214058876, "learning_rate": 3.5780789495691526e-05, "loss": 0.1536, "step": 3014 }, { "epoch": 0.6328715365239295, "grad_norm": 0.11798959970474243, "learning_rate": 3.574566528404577e-05, "loss": 0.1673, "step": 3015 }, { "epoch": 0.6330814441645676, "grad_norm": 0.15715515613555908, "learning_rate": 3.5710548727303384e-05, "loss": 0.1482, "step": 3016 }, { "epoch": 0.6332913518052057, "grad_norm": 0.10306575894355774, "learning_rate": 3.567543984432272e-05, "loss": 0.1473, "step": 3017 }, { "epoch": 0.6335012594458438, "grad_norm": 0.1256084144115448, "learning_rate": 3.5640338653958064e-05, "loss": 0.1462, "step": 3018 }, { "epoch": 0.6337111670864819, "grad_norm": 0.11586036533117294, "learning_rate": 3.5605245175059564e-05, "loss": 0.1608, "step": 3019 }, { "epoch": 0.6339210747271201, "grad_norm": 0.13672485947608948, "learning_rate": 3.5570159426473217e-05, "loss": 0.1662, "step": 3020 }, { "epoch": 0.6341309823677582, "grad_norm": 0.1345447152853012, "learning_rate": 3.553508142704086e-05, "loss": 0.1672, "step": 3021 }, { "epoch": 0.6343408900083963, "grad_norm": 0.11291158199310303, "learning_rate": 3.55000111956002e-05, "loss": 0.1299, "step": 3022 }, { "epoch": 0.6345507976490344, "grad_norm": 0.1844807267189026, "learning_rate": 3.546494875098476e-05, "loss": 0.153, "step": 3023 }, { "epoch": 0.6347607052896725, "grad_norm": 0.11099909991025925, "learning_rate": 3.542989411202383e-05, "loss": 0.1448, "step": 3024 }, { "epoch": 0.6349706129303107, "grad_norm": 0.1145697608590126, "learning_rate": 3.539484729754259e-05, "loss": 0.1591, "step": 3025 }, { "epoch": 0.6351805205709488, "grad_norm": 0.11213824152946472, "learning_rate": 3.5359808326361966e-05, "loss": 0.1506, "step": 3026 }, { "epoch": 0.6353904282115869, "grad_norm": 0.11795682460069656, "learning_rate": 3.5324777217298674e-05, "loss": 0.167, "step": 3027 }, { "epoch": 0.635600335852225, "grad_norm": 0.10587096959352493, "learning_rate": 3.528975398916525e-05, "loss": 0.152, "step": 3028 }, { "epoch": 0.6358102434928632, "grad_norm": 0.1358242630958557, "learning_rate": 3.5254738660769945e-05, "loss": 0.1631, "step": 3029 }, { "epoch": 0.6360201511335013, "grad_norm": 0.1284114420413971, "learning_rate": 3.521973125091681e-05, "loss": 0.1721, "step": 3030 }, { "epoch": 0.6362300587741394, "grad_norm": 0.11894943565130234, "learning_rate": 3.5184731778405606e-05, "loss": 0.1525, "step": 3031 }, { "epoch": 0.6364399664147775, "grad_norm": 0.13244931399822235, "learning_rate": 3.514974026203185e-05, "loss": 0.1662, "step": 3032 }, { "epoch": 0.6366498740554156, "grad_norm": 0.1697516143321991, "learning_rate": 3.5114756720586775e-05, "loss": 0.1602, "step": 3033 }, { "epoch": 0.6368597816960537, "grad_norm": 0.13253948092460632, "learning_rate": 3.507978117285737e-05, "loss": 0.1611, "step": 3034 }, { "epoch": 0.6370696893366918, "grad_norm": 0.09245211631059647, "learning_rate": 3.5044813637626287e-05, "loss": 0.143, "step": 3035 }, { "epoch": 0.6372795969773299, "grad_norm": 0.10185031592845917, "learning_rate": 3.5009854133671895e-05, "loss": 0.1428, "step": 3036 }, { "epoch": 0.637489504617968, "grad_norm": 0.11988876760005951, "learning_rate": 3.497490267976825e-05, "loss": 0.1596, "step": 3037 }, { "epoch": 0.6376994122586063, "grad_norm": 0.10822691023349762, "learning_rate": 3.4939959294685075e-05, "loss": 0.1648, "step": 3038 }, { "epoch": 0.6379093198992444, "grad_norm": 0.12224753201007843, "learning_rate": 3.490502399718775e-05, "loss": 0.1777, "step": 3039 }, { "epoch": 0.6381192275398825, "grad_norm": 0.11879666894674301, "learning_rate": 3.487009680603736e-05, "loss": 0.1783, "step": 3040 }, { "epoch": 0.6383291351805206, "grad_norm": 0.1191924661397934, "learning_rate": 3.483517773999057e-05, "loss": 0.1533, "step": 3041 }, { "epoch": 0.6385390428211587, "grad_norm": 0.10266808420419693, "learning_rate": 3.480026681779975e-05, "loss": 0.1683, "step": 3042 }, { "epoch": 0.6387489504617968, "grad_norm": 0.11044494807720184, "learning_rate": 3.476536405821286e-05, "loss": 0.1344, "step": 3043 }, { "epoch": 0.6389588581024349, "grad_norm": 0.12145400047302246, "learning_rate": 3.473046947997345e-05, "loss": 0.1396, "step": 3044 }, { "epoch": 0.639168765743073, "grad_norm": 0.13350744545459747, "learning_rate": 3.469558310182072e-05, "loss": 0.1552, "step": 3045 }, { "epoch": 0.6393786733837111, "grad_norm": 0.12254119664430618, "learning_rate": 3.4660704942489463e-05, "loss": 0.1474, "step": 3046 }, { "epoch": 0.6395885810243492, "grad_norm": 0.11156780272722244, "learning_rate": 3.462583502071002e-05, "loss": 0.153, "step": 3047 }, { "epoch": 0.6397984886649875, "grad_norm": 0.10157320648431778, "learning_rate": 3.459097335520837e-05, "loss": 0.15, "step": 3048 }, { "epoch": 0.6400083963056256, "grad_norm": 0.11385181546211243, "learning_rate": 3.4556119964706e-05, "loss": 0.1669, "step": 3049 }, { "epoch": 0.6402183039462637, "grad_norm": 0.12311366945505142, "learning_rate": 3.4521274867920014e-05, "loss": 0.1607, "step": 3050 }, { "epoch": 0.6404282115869018, "grad_norm": 0.11515744030475616, "learning_rate": 3.4486438083562964e-05, "loss": 0.1471, "step": 3051 }, { "epoch": 0.6406381192275399, "grad_norm": 0.1235782578587532, "learning_rate": 3.445160963034304e-05, "loss": 0.1597, "step": 3052 }, { "epoch": 0.640848026868178, "grad_norm": 0.1385941505432129, "learning_rate": 3.44167895269639e-05, "loss": 0.1529, "step": 3053 }, { "epoch": 0.6410579345088161, "grad_norm": 0.24027882516384125, "learning_rate": 3.438197779212475e-05, "loss": 0.1648, "step": 3054 }, { "epoch": 0.6412678421494542, "grad_norm": 0.10963878035545349, "learning_rate": 3.4347174444520266e-05, "loss": 0.1673, "step": 3055 }, { "epoch": 0.6414777497900923, "grad_norm": 0.12933169305324554, "learning_rate": 3.4312379502840665e-05, "loss": 0.1478, "step": 3056 }, { "epoch": 0.6416876574307305, "grad_norm": 0.11227238178253174, "learning_rate": 3.427759298577163e-05, "loss": 0.1534, "step": 3057 }, { "epoch": 0.6418975650713686, "grad_norm": 0.12470339983701706, "learning_rate": 3.4242814911994284e-05, "loss": 0.1521, "step": 3058 }, { "epoch": 0.6421074727120067, "grad_norm": 0.12207306921482086, "learning_rate": 3.4208045300185256e-05, "loss": 0.1701, "step": 3059 }, { "epoch": 0.6423173803526449, "grad_norm": 0.1703091263771057, "learning_rate": 3.417328416901663e-05, "loss": 0.1528, "step": 3060 }, { "epoch": 0.642527287993283, "grad_norm": 0.12895244359970093, "learning_rate": 3.413853153715593e-05, "loss": 0.1616, "step": 3061 }, { "epoch": 0.6427371956339211, "grad_norm": 0.10771409422159195, "learning_rate": 3.4103787423266106e-05, "loss": 0.1451, "step": 3062 }, { "epoch": 0.6429471032745592, "grad_norm": 0.1198326051235199, "learning_rate": 3.406905184600555e-05, "loss": 0.1571, "step": 3063 }, { "epoch": 0.6431570109151973, "grad_norm": 0.11176072061061859, "learning_rate": 3.403432482402806e-05, "loss": 0.1602, "step": 3064 }, { "epoch": 0.6433669185558354, "grad_norm": 0.11805977672338486, "learning_rate": 3.3999606375982816e-05, "loss": 0.1562, "step": 3065 }, { "epoch": 0.6435768261964736, "grad_norm": 0.12517598271369934, "learning_rate": 3.3964896520514445e-05, "loss": 0.1566, "step": 3066 }, { "epoch": 0.6437867338371117, "grad_norm": 0.09705498069524765, "learning_rate": 3.393019527626292e-05, "loss": 0.154, "step": 3067 }, { "epoch": 0.6439966414777498, "grad_norm": 0.13115854561328888, "learning_rate": 3.38955026618636e-05, "loss": 0.156, "step": 3068 }, { "epoch": 0.6442065491183879, "grad_norm": 0.10653633624315262, "learning_rate": 3.386081869594723e-05, "loss": 0.146, "step": 3069 }, { "epoch": 0.644416456759026, "grad_norm": 0.12678882479667664, "learning_rate": 3.382614339713986e-05, "loss": 0.1642, "step": 3070 }, { "epoch": 0.6446263643996641, "grad_norm": 0.10640235245227814, "learning_rate": 3.379147678406296e-05, "loss": 0.1381, "step": 3071 }, { "epoch": 0.6448362720403022, "grad_norm": 0.13540606200695038, "learning_rate": 3.3756818875333264e-05, "loss": 0.1519, "step": 3072 }, { "epoch": 0.6450461796809404, "grad_norm": 0.14184518158435822, "learning_rate": 3.372216968956286e-05, "loss": 0.1577, "step": 3073 }, { "epoch": 0.6452560873215785, "grad_norm": 0.1309628039598465, "learning_rate": 3.368752924535914e-05, "loss": 0.1513, "step": 3074 }, { "epoch": 0.6454659949622166, "grad_norm": 0.11215526610612869, "learning_rate": 3.3652897561324854e-05, "loss": 0.1553, "step": 3075 }, { "epoch": 0.6456759026028548, "grad_norm": 0.13833361864089966, "learning_rate": 3.361827465605797e-05, "loss": 0.1704, "step": 3076 }, { "epoch": 0.6458858102434929, "grad_norm": 0.10265146195888519, "learning_rate": 3.35836605481518e-05, "loss": 0.1483, "step": 3077 }, { "epoch": 0.646095717884131, "grad_norm": 0.10432995110750198, "learning_rate": 3.354905525619492e-05, "loss": 0.1549, "step": 3078 }, { "epoch": 0.6463056255247691, "grad_norm": 0.15146160125732422, "learning_rate": 3.3514458798771135e-05, "loss": 0.1563, "step": 3079 }, { "epoch": 0.6465155331654072, "grad_norm": 0.13524983823299408, "learning_rate": 3.347987119445953e-05, "loss": 0.1638, "step": 3080 }, { "epoch": 0.6467254408060453, "grad_norm": 0.11744452267885208, "learning_rate": 3.344529246183448e-05, "loss": 0.1476, "step": 3081 }, { "epoch": 0.6469353484466834, "grad_norm": 0.11828684061765671, "learning_rate": 3.341072261946549e-05, "loss": 0.1505, "step": 3082 }, { "epoch": 0.6471452560873215, "grad_norm": 0.1145751029253006, "learning_rate": 3.337616168591743e-05, "loss": 0.1522, "step": 3083 }, { "epoch": 0.6473551637279596, "grad_norm": 0.10468050092458725, "learning_rate": 3.334160967975025e-05, "loss": 0.1472, "step": 3084 }, { "epoch": 0.6475650713685979, "grad_norm": 0.10044942796230316, "learning_rate": 3.3307066619519225e-05, "loss": 0.1608, "step": 3085 }, { "epoch": 0.647774979009236, "grad_norm": 0.11021267622709274, "learning_rate": 3.327253252377469e-05, "loss": 0.1476, "step": 3086 }, { "epoch": 0.6479848866498741, "grad_norm": 0.11376648396253586, "learning_rate": 3.323800741106231e-05, "loss": 0.1584, "step": 3087 }, { "epoch": 0.6481947942905122, "grad_norm": 0.12721943855285645, "learning_rate": 3.320349129992282e-05, "loss": 0.1635, "step": 3088 }, { "epoch": 0.6484047019311503, "grad_norm": 0.11218689382076263, "learning_rate": 3.3168984208892186e-05, "loss": 0.1543, "step": 3089 }, { "epoch": 0.6486146095717884, "grad_norm": 0.10654529184103012, "learning_rate": 3.3134486156501474e-05, "loss": 0.1484, "step": 3090 }, { "epoch": 0.6488245172124265, "grad_norm": 0.12575599551200867, "learning_rate": 3.309999716127696e-05, "loss": 0.1611, "step": 3091 }, { "epoch": 0.6490344248530646, "grad_norm": 0.1281367838382721, "learning_rate": 3.306551724174002e-05, "loss": 0.1555, "step": 3092 }, { "epoch": 0.6492443324937027, "grad_norm": 0.12584231793880463, "learning_rate": 3.303104641640714e-05, "loss": 0.143, "step": 3093 }, { "epoch": 0.6494542401343409, "grad_norm": 0.11681025475263596, "learning_rate": 3.299658470378993e-05, "loss": 0.1556, "step": 3094 }, { "epoch": 0.649664147774979, "grad_norm": 0.12307055294513702, "learning_rate": 3.2962132122395144e-05, "loss": 0.1556, "step": 3095 }, { "epoch": 0.6498740554156172, "grad_norm": 0.123899906873703, "learning_rate": 3.2927688690724584e-05, "loss": 0.1494, "step": 3096 }, { "epoch": 0.6500839630562553, "grad_norm": 0.13147427141666412, "learning_rate": 3.2893254427275186e-05, "loss": 0.1506, "step": 3097 }, { "epoch": 0.6502938706968934, "grad_norm": 0.11978549510240555, "learning_rate": 3.285882935053891e-05, "loss": 0.1497, "step": 3098 }, { "epoch": 0.6505037783375315, "grad_norm": 0.12688401341438293, "learning_rate": 3.2824413479002837e-05, "loss": 0.162, "step": 3099 }, { "epoch": 0.6507136859781696, "grad_norm": 0.13381405174732208, "learning_rate": 3.279000683114903e-05, "loss": 0.1558, "step": 3100 }, { "epoch": 0.6509235936188077, "grad_norm": 0.11726606637239456, "learning_rate": 3.2755609425454695e-05, "loss": 0.1566, "step": 3101 }, { "epoch": 0.6511335012594458, "grad_norm": 0.14988724887371063, "learning_rate": 3.272122128039198e-05, "loss": 0.1487, "step": 3102 }, { "epoch": 0.6513434089000839, "grad_norm": 0.15168358385562897, "learning_rate": 3.268684241442814e-05, "loss": 0.1637, "step": 3103 }, { "epoch": 0.6515533165407221, "grad_norm": 0.10664239525794983, "learning_rate": 3.2652472846025394e-05, "loss": 0.1514, "step": 3104 }, { "epoch": 0.6517632241813602, "grad_norm": 0.12004858255386353, "learning_rate": 3.261811259364098e-05, "loss": 0.177, "step": 3105 }, { "epoch": 0.6519731318219983, "grad_norm": 0.11165560036897659, "learning_rate": 3.258376167572718e-05, "loss": 0.1578, "step": 3106 }, { "epoch": 0.6521830394626364, "grad_norm": 0.1488139033317566, "learning_rate": 3.2549420110731166e-05, "loss": 0.1524, "step": 3107 }, { "epoch": 0.6523929471032746, "grad_norm": 0.12690328061580658, "learning_rate": 3.251508791709517e-05, "loss": 0.171, "step": 3108 }, { "epoch": 0.6526028547439127, "grad_norm": 0.12381751090288162, "learning_rate": 3.248076511325635e-05, "loss": 0.1619, "step": 3109 }, { "epoch": 0.6528127623845508, "grad_norm": 0.10168858617544174, "learning_rate": 3.244645171764685e-05, "loss": 0.1493, "step": 3110 }, { "epoch": 0.6530226700251889, "grad_norm": 0.1144690215587616, "learning_rate": 3.2412147748693746e-05, "loss": 0.1627, "step": 3111 }, { "epoch": 0.653232577665827, "grad_norm": 0.12090957164764404, "learning_rate": 3.2377853224819056e-05, "loss": 0.1716, "step": 3112 }, { "epoch": 0.6534424853064652, "grad_norm": 0.11143061518669128, "learning_rate": 3.2343568164439716e-05, "loss": 0.1552, "step": 3113 }, { "epoch": 0.6536523929471033, "grad_norm": 0.12665286660194397, "learning_rate": 3.2309292585967585e-05, "loss": 0.1642, "step": 3114 }, { "epoch": 0.6538623005877414, "grad_norm": 0.12189696729183197, "learning_rate": 3.227502650780943e-05, "loss": 0.1598, "step": 3115 }, { "epoch": 0.6540722082283795, "grad_norm": 0.12147743999958038, "learning_rate": 3.224076994836692e-05, "loss": 0.1518, "step": 3116 }, { "epoch": 0.6542821158690176, "grad_norm": 0.10486441850662231, "learning_rate": 3.2206522926036616e-05, "loss": 0.1327, "step": 3117 }, { "epoch": 0.6544920235096557, "grad_norm": 0.13366806507110596, "learning_rate": 3.217228545920994e-05, "loss": 0.169, "step": 3118 }, { "epoch": 0.6547019311502938, "grad_norm": 0.12645749747753143, "learning_rate": 3.21380575662732e-05, "loss": 0.1449, "step": 3119 }, { "epoch": 0.654911838790932, "grad_norm": 0.11253198981285095, "learning_rate": 3.210383926560759e-05, "loss": 0.1596, "step": 3120 }, { "epoch": 0.6551217464315701, "grad_norm": 0.1114022433757782, "learning_rate": 3.2069630575589046e-05, "loss": 0.1436, "step": 3121 }, { "epoch": 0.6553316540722083, "grad_norm": 0.11162886768579483, "learning_rate": 3.203543151458847e-05, "loss": 0.1565, "step": 3122 }, { "epoch": 0.6555415617128464, "grad_norm": 0.09162106364965439, "learning_rate": 3.200124210097152e-05, "loss": 0.1511, "step": 3123 }, { "epoch": 0.6557514693534845, "grad_norm": 0.10009972006082535, "learning_rate": 3.196706235309871e-05, "loss": 0.1409, "step": 3124 }, { "epoch": 0.6559613769941226, "grad_norm": 0.11142247170209885, "learning_rate": 3.193289228932531e-05, "loss": 0.1459, "step": 3125 }, { "epoch": 0.6561712846347607, "grad_norm": 0.11290930956602097, "learning_rate": 3.189873192800147e-05, "loss": 0.171, "step": 3126 }, { "epoch": 0.6563811922753988, "grad_norm": 0.11958450824022293, "learning_rate": 3.186458128747206e-05, "loss": 0.1613, "step": 3127 }, { "epoch": 0.6565910999160369, "grad_norm": 0.1151280552148819, "learning_rate": 3.1830440386076745e-05, "loss": 0.1648, "step": 3128 }, { "epoch": 0.656801007556675, "grad_norm": 0.11343618482351303, "learning_rate": 3.179630924214998e-05, "loss": 0.1578, "step": 3129 }, { "epoch": 0.6570109151973131, "grad_norm": 0.1188909262418747, "learning_rate": 3.176218787402097e-05, "loss": 0.1682, "step": 3130 }, { "epoch": 0.6572208228379512, "grad_norm": 0.13589932024478912, "learning_rate": 3.172807630001366e-05, "loss": 0.1731, "step": 3131 }, { "epoch": 0.6574307304785895, "grad_norm": 0.12900803983211517, "learning_rate": 3.1693974538446756e-05, "loss": 0.1457, "step": 3132 }, { "epoch": 0.6576406381192276, "grad_norm": 0.12908852100372314, "learning_rate": 3.165988260763369e-05, "loss": 0.159, "step": 3133 }, { "epoch": 0.6578505457598657, "grad_norm": 0.14849267899990082, "learning_rate": 3.16258005258826e-05, "loss": 0.1688, "step": 3134 }, { "epoch": 0.6580604534005038, "grad_norm": 0.1258804351091385, "learning_rate": 3.159172831149631e-05, "loss": 0.1725, "step": 3135 }, { "epoch": 0.6582703610411419, "grad_norm": 0.12530116736888885, "learning_rate": 3.155766598277243e-05, "loss": 0.1408, "step": 3136 }, { "epoch": 0.65848026868178, "grad_norm": 0.14081692695617676, "learning_rate": 3.1523613558003174e-05, "loss": 0.1622, "step": 3137 }, { "epoch": 0.6586901763224181, "grad_norm": 0.1500757932662964, "learning_rate": 3.1489571055475486e-05, "loss": 0.1671, "step": 3138 }, { "epoch": 0.6589000839630562, "grad_norm": 0.11542218178510666, "learning_rate": 3.145553849347097e-05, "loss": 0.1495, "step": 3139 }, { "epoch": 0.6591099916036943, "grad_norm": 0.12322347611188889, "learning_rate": 3.1421515890265893e-05, "loss": 0.139, "step": 3140 }, { "epoch": 0.6593198992443325, "grad_norm": 0.1026599332690239, "learning_rate": 3.1387503264131137e-05, "loss": 0.1599, "step": 3141 }, { "epoch": 0.6595298068849706, "grad_norm": 0.13048797845840454, "learning_rate": 3.135350063333229e-05, "loss": 0.177, "step": 3142 }, { "epoch": 0.6597397145256088, "grad_norm": 0.11457457393407822, "learning_rate": 3.1319508016129526e-05, "loss": 0.176, "step": 3143 }, { "epoch": 0.6599496221662469, "grad_norm": 0.11056394875049591, "learning_rate": 3.1285525430777686e-05, "loss": 0.1572, "step": 3144 }, { "epoch": 0.660159529806885, "grad_norm": 0.12389667332172394, "learning_rate": 3.125155289552616e-05, "loss": 0.1479, "step": 3145 }, { "epoch": 0.6603694374475231, "grad_norm": 0.1127491444349289, "learning_rate": 3.121759042861898e-05, "loss": 0.1665, "step": 3146 }, { "epoch": 0.6605793450881612, "grad_norm": 0.11531811952590942, "learning_rate": 3.11836380482948e-05, "loss": 0.1625, "step": 3147 }, { "epoch": 0.6607892527287993, "grad_norm": 0.1351030021905899, "learning_rate": 3.114969577278679e-05, "loss": 0.1437, "step": 3148 }, { "epoch": 0.6609991603694374, "grad_norm": 0.13135118782520294, "learning_rate": 3.111576362032273e-05, "loss": 0.1598, "step": 3149 }, { "epoch": 0.6612090680100756, "grad_norm": 0.112535759806633, "learning_rate": 3.1081841609124964e-05, "loss": 0.1584, "step": 3150 }, { "epoch": 0.6614189756507137, "grad_norm": 0.1222812607884407, "learning_rate": 3.10479297574104e-05, "loss": 0.1773, "step": 3151 }, { "epoch": 0.6616288832913518, "grad_norm": 0.11697451025247574, "learning_rate": 3.1014028083390456e-05, "loss": 0.1705, "step": 3152 }, { "epoch": 0.6618387909319899, "grad_norm": 0.12584251165390015, "learning_rate": 3.0980136605271127e-05, "loss": 0.1519, "step": 3153 }, { "epoch": 0.662048698572628, "grad_norm": 0.12311432510614395, "learning_rate": 3.0946255341252914e-05, "loss": 0.1631, "step": 3154 }, { "epoch": 0.6622586062132662, "grad_norm": 0.1087951511144638, "learning_rate": 3.09123843095308e-05, "loss": 0.1443, "step": 3155 }, { "epoch": 0.6624685138539043, "grad_norm": 0.11787470430135727, "learning_rate": 3.087852352829431e-05, "loss": 0.1655, "step": 3156 }, { "epoch": 0.6626784214945424, "grad_norm": 0.13332132995128632, "learning_rate": 3.0844673015727476e-05, "loss": 0.1495, "step": 3157 }, { "epoch": 0.6628883291351805, "grad_norm": 0.09370341897010803, "learning_rate": 3.081083279000878e-05, "loss": 0.1664, "step": 3158 }, { "epoch": 0.6630982367758187, "grad_norm": 0.1299756020307541, "learning_rate": 3.0777002869311214e-05, "loss": 0.1561, "step": 3159 }, { "epoch": 0.6633081444164568, "grad_norm": 0.1356882005929947, "learning_rate": 3.074318327180219e-05, "loss": 0.1468, "step": 3160 }, { "epoch": 0.6635180520570949, "grad_norm": 0.12208224087953568, "learning_rate": 3.070937401564365e-05, "loss": 0.1571, "step": 3161 }, { "epoch": 0.663727959697733, "grad_norm": 0.1734291911125183, "learning_rate": 3.0675575118991886e-05, "loss": 0.1561, "step": 3162 }, { "epoch": 0.6639378673383711, "grad_norm": 0.11419927328824997, "learning_rate": 3.064178659999771e-05, "loss": 0.1735, "step": 3163 }, { "epoch": 0.6641477749790092, "grad_norm": 0.10793764144182205, "learning_rate": 3.06080084768063e-05, "loss": 0.1581, "step": 3164 }, { "epoch": 0.6643576826196473, "grad_norm": 0.12163906544446945, "learning_rate": 3.0574240767557315e-05, "loss": 0.1681, "step": 3165 }, { "epoch": 0.6645675902602854, "grad_norm": 0.12725725769996643, "learning_rate": 3.054048349038476e-05, "loss": 0.1486, "step": 3166 }, { "epoch": 0.6647774979009236, "grad_norm": 0.1115790456533432, "learning_rate": 3.050673666341708e-05, "loss": 0.1458, "step": 3167 }, { "epoch": 0.6649874055415617, "grad_norm": 0.12533406913280487, "learning_rate": 3.04730003047771e-05, "loss": 0.1533, "step": 3168 }, { "epoch": 0.6651973131821999, "grad_norm": 0.11061311513185501, "learning_rate": 3.0439274432581987e-05, "loss": 0.157, "step": 3169 }, { "epoch": 0.665407220822838, "grad_norm": 0.11690240353345871, "learning_rate": 3.04055590649433e-05, "loss": 0.1543, "step": 3170 }, { "epoch": 0.6656171284634761, "grad_norm": 0.11588296294212341, "learning_rate": 3.0371854219967e-05, "loss": 0.15, "step": 3171 }, { "epoch": 0.6658270361041142, "grad_norm": 0.13207639753818512, "learning_rate": 3.0338159915753328e-05, "loss": 0.1495, "step": 3172 }, { "epoch": 0.6660369437447523, "grad_norm": 0.14322279393672943, "learning_rate": 3.0304476170396912e-05, "loss": 0.1324, "step": 3173 }, { "epoch": 0.6662468513853904, "grad_norm": 0.1106521412730217, "learning_rate": 3.0270803001986682e-05, "loss": 0.134, "step": 3174 }, { "epoch": 0.6664567590260285, "grad_norm": 0.1531291902065277, "learning_rate": 3.0237140428605913e-05, "loss": 0.1617, "step": 3175 }, { "epoch": 0.6666666666666666, "grad_norm": 0.1315673142671585, "learning_rate": 3.0203488468332154e-05, "loss": 0.1656, "step": 3176 }, { "epoch": 0.6668765743073047, "grad_norm": 0.1372612863779068, "learning_rate": 3.0169847139237294e-05, "loss": 0.1671, "step": 3177 }, { "epoch": 0.667086481947943, "grad_norm": 0.10959561169147491, "learning_rate": 3.0136216459387477e-05, "loss": 0.1617, "step": 3178 }, { "epoch": 0.6672963895885811, "grad_norm": 0.11745753139257431, "learning_rate": 3.010259644684317e-05, "loss": 0.1597, "step": 3179 }, { "epoch": 0.6675062972292192, "grad_norm": 0.12485745549201965, "learning_rate": 3.006898711965907e-05, "loss": 0.14, "step": 3180 }, { "epoch": 0.6677162048698573, "grad_norm": 0.12715327739715576, "learning_rate": 3.0035388495884154e-05, "loss": 0.1428, "step": 3181 }, { "epoch": 0.6679261125104954, "grad_norm": 0.12533673644065857, "learning_rate": 3.000180059356168e-05, "loss": 0.1351, "step": 3182 }, { "epoch": 0.6681360201511335, "grad_norm": 0.11587972939014435, "learning_rate": 2.9968223430729076e-05, "loss": 0.168, "step": 3183 }, { "epoch": 0.6683459277917716, "grad_norm": 0.12353124469518661, "learning_rate": 2.9934657025418055e-05, "loss": 0.1468, "step": 3184 }, { "epoch": 0.6685558354324097, "grad_norm": 0.13154150545597076, "learning_rate": 2.990110139565457e-05, "loss": 0.1688, "step": 3185 }, { "epoch": 0.6687657430730478, "grad_norm": 0.13129670917987823, "learning_rate": 2.9867556559458738e-05, "loss": 0.1636, "step": 3186 }, { "epoch": 0.668975650713686, "grad_norm": 0.1262674629688263, "learning_rate": 2.9834022534844897e-05, "loss": 0.178, "step": 3187 }, { "epoch": 0.6691855583543241, "grad_norm": 0.13294339179992676, "learning_rate": 2.9800499339821603e-05, "loss": 0.1688, "step": 3188 }, { "epoch": 0.6693954659949622, "grad_norm": 0.11901882290840149, "learning_rate": 2.9766986992391583e-05, "loss": 0.1786, "step": 3189 }, { "epoch": 0.6696053736356004, "grad_norm": 0.13393592834472656, "learning_rate": 2.9733485510551706e-05, "loss": 0.1481, "step": 3190 }, { "epoch": 0.6698152812762385, "grad_norm": 0.1557946652173996, "learning_rate": 2.9699994912293038e-05, "loss": 0.1651, "step": 3191 }, { "epoch": 0.6700251889168766, "grad_norm": 0.13933785259723663, "learning_rate": 2.966651521560081e-05, "loss": 0.1512, "step": 3192 }, { "epoch": 0.6702350965575147, "grad_norm": 0.12215977162122726, "learning_rate": 2.963304643845437e-05, "loss": 0.1748, "step": 3193 }, { "epoch": 0.6704450041981528, "grad_norm": 0.13210149109363556, "learning_rate": 2.9599588598827233e-05, "loss": 0.176, "step": 3194 }, { "epoch": 0.6706549118387909, "grad_norm": 0.14156967401504517, "learning_rate": 2.9566141714687e-05, "loss": 0.1388, "step": 3195 }, { "epoch": 0.670864819479429, "grad_norm": 0.12155076861381531, "learning_rate": 2.9532705803995464e-05, "loss": 0.1399, "step": 3196 }, { "epoch": 0.6710747271200672, "grad_norm": 0.12318848073482513, "learning_rate": 2.9499280884708393e-05, "loss": 0.1541, "step": 3197 }, { "epoch": 0.6712846347607053, "grad_norm": 0.1175316795706749, "learning_rate": 2.946586697477579e-05, "loss": 0.1486, "step": 3198 }, { "epoch": 0.6714945424013434, "grad_norm": 0.12203275412321091, "learning_rate": 2.9432464092141655e-05, "loss": 0.1647, "step": 3199 }, { "epoch": 0.6717044500419815, "grad_norm": 0.15709438920021057, "learning_rate": 2.939907225474413e-05, "loss": 0.1512, "step": 3200 }, { "epoch": 0.6719143576826196, "grad_norm": 0.11067797243595123, "learning_rate": 2.9365691480515368e-05, "loss": 0.1693, "step": 3201 }, { "epoch": 0.6721242653232578, "grad_norm": 0.0939958468079567, "learning_rate": 2.9332321787381622e-05, "loss": 0.1538, "step": 3202 }, { "epoch": 0.6723341729638959, "grad_norm": 0.12024837732315063, "learning_rate": 2.9298963193263195e-05, "loss": 0.1575, "step": 3203 }, { "epoch": 0.672544080604534, "grad_norm": 0.10305708646774292, "learning_rate": 2.926561571607438e-05, "loss": 0.1482, "step": 3204 }, { "epoch": 0.6727539882451721, "grad_norm": 0.12283535301685333, "learning_rate": 2.923227937372354e-05, "loss": 0.1531, "step": 3205 }, { "epoch": 0.6729638958858103, "grad_norm": 0.12914074957370758, "learning_rate": 2.9198954184113074e-05, "loss": 0.1724, "step": 3206 }, { "epoch": 0.6731738035264484, "grad_norm": 0.12399044632911682, "learning_rate": 2.9165640165139346e-05, "loss": 0.1426, "step": 3207 }, { "epoch": 0.6733837111670865, "grad_norm": 0.12031276524066925, "learning_rate": 2.9132337334692767e-05, "loss": 0.1614, "step": 3208 }, { "epoch": 0.6735936188077246, "grad_norm": 0.10775694251060486, "learning_rate": 2.9099045710657702e-05, "loss": 0.1346, "step": 3209 }, { "epoch": 0.6738035264483627, "grad_norm": 0.11117519438266754, "learning_rate": 2.9065765310912523e-05, "loss": 0.1516, "step": 3210 }, { "epoch": 0.6740134340890008, "grad_norm": 0.12548020482063293, "learning_rate": 2.9032496153329558e-05, "loss": 0.1512, "step": 3211 }, { "epoch": 0.6742233417296389, "grad_norm": 0.11416387557983398, "learning_rate": 2.8999238255775113e-05, "loss": 0.1363, "step": 3212 }, { "epoch": 0.674433249370277, "grad_norm": 0.12334737926721573, "learning_rate": 2.896599163610943e-05, "loss": 0.157, "step": 3213 }, { "epoch": 0.6746431570109152, "grad_norm": 0.13314662873744965, "learning_rate": 2.8932756312186703e-05, "loss": 0.1614, "step": 3214 }, { "epoch": 0.6748530646515534, "grad_norm": 0.1167728453874588, "learning_rate": 2.889953230185505e-05, "loss": 0.1456, "step": 3215 }, { "epoch": 0.6750629722921915, "grad_norm": 0.10373178124427795, "learning_rate": 2.886631962295655e-05, "loss": 0.1516, "step": 3216 }, { "epoch": 0.6752728799328296, "grad_norm": 0.12552447617053986, "learning_rate": 2.8833118293327177e-05, "loss": 0.1517, "step": 3217 }, { "epoch": 0.6754827875734677, "grad_norm": 0.12713772058486938, "learning_rate": 2.8799928330796742e-05, "loss": 0.1542, "step": 3218 }, { "epoch": 0.6756926952141058, "grad_norm": 0.13052749633789062, "learning_rate": 2.876674975318908e-05, "loss": 0.1585, "step": 3219 }, { "epoch": 0.6759026028547439, "grad_norm": 0.1303415447473526, "learning_rate": 2.8733582578321817e-05, "loss": 0.164, "step": 3220 }, { "epoch": 0.676112510495382, "grad_norm": 0.11603204160928726, "learning_rate": 2.8700426824006487e-05, "loss": 0.1487, "step": 3221 }, { "epoch": 0.6763224181360201, "grad_norm": 0.11492080241441727, "learning_rate": 2.8667282508048488e-05, "loss": 0.1541, "step": 3222 }, { "epoch": 0.6765323257766582, "grad_norm": 0.1157570406794548, "learning_rate": 2.8634149648247045e-05, "loss": 0.1717, "step": 3223 }, { "epoch": 0.6767422334172963, "grad_norm": 0.11512907594442368, "learning_rate": 2.8601028262395336e-05, "loss": 0.1701, "step": 3224 }, { "epoch": 0.6769521410579346, "grad_norm": 0.10905471444129944, "learning_rate": 2.8567918368280233e-05, "loss": 0.1583, "step": 3225 }, { "epoch": 0.6771620486985727, "grad_norm": 0.12866859138011932, "learning_rate": 2.8534819983682525e-05, "loss": 0.1519, "step": 3226 }, { "epoch": 0.6773719563392108, "grad_norm": 0.12510065734386444, "learning_rate": 2.8501733126376783e-05, "loss": 0.1347, "step": 3227 }, { "epoch": 0.6775818639798489, "grad_norm": 0.10416650027036667, "learning_rate": 2.8468657814131443e-05, "loss": 0.149, "step": 3228 }, { "epoch": 0.677791771620487, "grad_norm": 0.12856334447860718, "learning_rate": 2.843559406470869e-05, "loss": 0.1694, "step": 3229 }, { "epoch": 0.6780016792611251, "grad_norm": 0.13767604529857635, "learning_rate": 2.84025418958645e-05, "loss": 0.1659, "step": 3230 }, { "epoch": 0.6782115869017632, "grad_norm": 0.12578238546848297, "learning_rate": 2.8369501325348658e-05, "loss": 0.1686, "step": 3231 }, { "epoch": 0.6784214945424013, "grad_norm": 0.11950796842575073, "learning_rate": 2.8336472370904688e-05, "loss": 0.1435, "step": 3232 }, { "epoch": 0.6786314021830394, "grad_norm": 0.1251773238182068, "learning_rate": 2.8303455050269912e-05, "loss": 0.154, "step": 3233 }, { "epoch": 0.6788413098236776, "grad_norm": 0.13571852445602417, "learning_rate": 2.8270449381175367e-05, "loss": 0.1417, "step": 3234 }, { "epoch": 0.6790512174643157, "grad_norm": 0.12125960737466812, "learning_rate": 2.8237455381345846e-05, "loss": 0.1635, "step": 3235 }, { "epoch": 0.6792611251049538, "grad_norm": 0.1407996267080307, "learning_rate": 2.8204473068499914e-05, "loss": 0.1663, "step": 3236 }, { "epoch": 0.679471032745592, "grad_norm": 0.09970130771398544, "learning_rate": 2.8171502460349825e-05, "loss": 0.1593, "step": 3237 }, { "epoch": 0.6796809403862301, "grad_norm": 0.10902740061283112, "learning_rate": 2.8138543574601506e-05, "loss": 0.1619, "step": 3238 }, { "epoch": 0.6798908480268682, "grad_norm": 0.10807805508375168, "learning_rate": 2.8105596428954645e-05, "loss": 0.1743, "step": 3239 }, { "epoch": 0.6801007556675063, "grad_norm": 0.11692219227552414, "learning_rate": 2.807266104110264e-05, "loss": 0.1482, "step": 3240 }, { "epoch": 0.6803106633081444, "grad_norm": 0.13072606921195984, "learning_rate": 2.8039737428732526e-05, "loss": 0.1614, "step": 3241 }, { "epoch": 0.6805205709487825, "grad_norm": 0.12482010573148727, "learning_rate": 2.800682560952504e-05, "loss": 0.1431, "step": 3242 }, { "epoch": 0.6807304785894207, "grad_norm": 0.11568538844585419, "learning_rate": 2.797392560115456e-05, "loss": 0.1525, "step": 3243 }, { "epoch": 0.6809403862300588, "grad_norm": 0.12819020450115204, "learning_rate": 2.7941037421289197e-05, "loss": 0.134, "step": 3244 }, { "epoch": 0.6811502938706969, "grad_norm": 0.13712668418884277, "learning_rate": 2.79081610875906e-05, "loss": 0.1565, "step": 3245 }, { "epoch": 0.681360201511335, "grad_norm": 0.1392073929309845, "learning_rate": 2.787529661771413e-05, "loss": 0.1685, "step": 3246 }, { "epoch": 0.6815701091519731, "grad_norm": 0.12498795986175537, "learning_rate": 2.7842444029308746e-05, "loss": 0.1477, "step": 3247 }, { "epoch": 0.6817800167926112, "grad_norm": 0.13243268430233002, "learning_rate": 2.7809603340017064e-05, "loss": 0.1549, "step": 3248 }, { "epoch": 0.6819899244332494, "grad_norm": 0.1170286163687706, "learning_rate": 2.7776774567475273e-05, "loss": 0.143, "step": 3249 }, { "epoch": 0.6821998320738875, "grad_norm": 0.10857724398374557, "learning_rate": 2.7743957729313165e-05, "loss": 0.1534, "step": 3250 }, { "epoch": 0.6824097397145256, "grad_norm": 0.13726240396499634, "learning_rate": 2.771115284315414e-05, "loss": 0.1608, "step": 3251 }, { "epoch": 0.6826196473551638, "grad_norm": 0.1267462968826294, "learning_rate": 2.7678359926615173e-05, "loss": 0.1562, "step": 3252 }, { "epoch": 0.6828295549958019, "grad_norm": 0.12552182376384735, "learning_rate": 2.764557899730681e-05, "loss": 0.1796, "step": 3253 }, { "epoch": 0.68303946263644, "grad_norm": 0.15919016301631927, "learning_rate": 2.7612810072833146e-05, "loss": 0.1457, "step": 3254 }, { "epoch": 0.6832493702770781, "grad_norm": 0.1230967566370964, "learning_rate": 2.7580053170791853e-05, "loss": 0.1374, "step": 3255 }, { "epoch": 0.6834592779177162, "grad_norm": 0.11543905735015869, "learning_rate": 2.7547308308774112e-05, "loss": 0.1452, "step": 3256 }, { "epoch": 0.6836691855583543, "grad_norm": 0.11295941472053528, "learning_rate": 2.7514575504364702e-05, "loss": 0.164, "step": 3257 }, { "epoch": 0.6838790931989924, "grad_norm": 0.11988019198179245, "learning_rate": 2.7481854775141896e-05, "loss": 0.1611, "step": 3258 }, { "epoch": 0.6840890008396305, "grad_norm": 0.1507832258939743, "learning_rate": 2.7449146138677397e-05, "loss": 0.1701, "step": 3259 }, { "epoch": 0.6842989084802686, "grad_norm": 0.12370573729276657, "learning_rate": 2.7416449612536566e-05, "loss": 0.1511, "step": 3260 }, { "epoch": 0.6845088161209067, "grad_norm": 0.11700747907161713, "learning_rate": 2.7383765214278144e-05, "loss": 0.158, "step": 3261 }, { "epoch": 0.684718723761545, "grad_norm": 0.10759387165307999, "learning_rate": 2.7351092961454418e-05, "loss": 0.152, "step": 3262 }, { "epoch": 0.6849286314021831, "grad_norm": 0.10964877158403397, "learning_rate": 2.7318432871611126e-05, "loss": 0.1713, "step": 3263 }, { "epoch": 0.6851385390428212, "grad_norm": 0.12055815756320953, "learning_rate": 2.7285784962287465e-05, "loss": 0.1623, "step": 3264 }, { "epoch": 0.6853484466834593, "grad_norm": 0.10874463617801666, "learning_rate": 2.7253149251016163e-05, "loss": 0.1453, "step": 3265 }, { "epoch": 0.6855583543240974, "grad_norm": 0.1362292468547821, "learning_rate": 2.722052575532329e-05, "loss": 0.1497, "step": 3266 }, { "epoch": 0.6857682619647355, "grad_norm": 0.11515046656131744, "learning_rate": 2.718791449272843e-05, "loss": 0.1507, "step": 3267 }, { "epoch": 0.6859781696053736, "grad_norm": 0.10474392026662827, "learning_rate": 2.7155315480744542e-05, "loss": 0.1558, "step": 3268 }, { "epoch": 0.6861880772460117, "grad_norm": 0.10753299295902252, "learning_rate": 2.7122728736878088e-05, "loss": 0.164, "step": 3269 }, { "epoch": 0.6863979848866498, "grad_norm": 0.11624909192323685, "learning_rate": 2.7090154278628884e-05, "loss": 0.1512, "step": 3270 }, { "epoch": 0.686607892527288, "grad_norm": 0.1127047911286354, "learning_rate": 2.7057592123490138e-05, "loss": 0.1813, "step": 3271 }, { "epoch": 0.6868178001679262, "grad_norm": 0.18497790396213531, "learning_rate": 2.702504228894849e-05, "loss": 0.1857, "step": 3272 }, { "epoch": 0.6870277078085643, "grad_norm": 0.12313918769359589, "learning_rate": 2.699250479248393e-05, "loss": 0.1577, "step": 3273 }, { "epoch": 0.6872376154492024, "grad_norm": 0.10727359354496002, "learning_rate": 2.695997965156985e-05, "loss": 0.1543, "step": 3274 }, { "epoch": 0.6874475230898405, "grad_norm": 0.13982883095741272, "learning_rate": 2.692746688367298e-05, "loss": 0.1453, "step": 3275 }, { "epoch": 0.6876574307304786, "grad_norm": 0.12415811419487, "learning_rate": 2.689496650625341e-05, "loss": 0.1614, "step": 3276 }, { "epoch": 0.6878673383711167, "grad_norm": 0.13363641500473022, "learning_rate": 2.6862478536764625e-05, "loss": 0.1731, "step": 3277 }, { "epoch": 0.6880772460117548, "grad_norm": 0.12584188580513, "learning_rate": 2.6830002992653375e-05, "loss": 0.1698, "step": 3278 }, { "epoch": 0.6882871536523929, "grad_norm": 0.11803454160690308, "learning_rate": 2.6797539891359803e-05, "loss": 0.1713, "step": 3279 }, { "epoch": 0.6884970612930311, "grad_norm": 0.11334547400474548, "learning_rate": 2.6765089250317266e-05, "loss": 0.1586, "step": 3280 }, { "epoch": 0.6887069689336692, "grad_norm": 0.12662799656391144, "learning_rate": 2.673265108695256e-05, "loss": 0.165, "step": 3281 }, { "epoch": 0.6889168765743073, "grad_norm": 0.11635055392980576, "learning_rate": 2.670022541868571e-05, "loss": 0.1566, "step": 3282 }, { "epoch": 0.6891267842149454, "grad_norm": 0.12273656576871872, "learning_rate": 2.6667812262930027e-05, "loss": 0.1591, "step": 3283 }, { "epoch": 0.6893366918555835, "grad_norm": 0.10126462578773499, "learning_rate": 2.66354116370921e-05, "loss": 0.1567, "step": 3284 }, { "epoch": 0.6895465994962217, "grad_norm": 0.10264413803815842, "learning_rate": 2.660302355857186e-05, "loss": 0.1605, "step": 3285 }, { "epoch": 0.6897565071368598, "grad_norm": 0.11424882709980011, "learning_rate": 2.6570648044762426e-05, "loss": 0.1309, "step": 3286 }, { "epoch": 0.6899664147774979, "grad_norm": 0.135597363114357, "learning_rate": 2.653828511305015e-05, "loss": 0.1575, "step": 3287 }, { "epoch": 0.690176322418136, "grad_norm": 0.1263936311006546, "learning_rate": 2.650593478081468e-05, "loss": 0.1459, "step": 3288 }, { "epoch": 0.6903862300587741, "grad_norm": 0.10977193713188171, "learning_rate": 2.6473597065428903e-05, "loss": 0.163, "step": 3289 }, { "epoch": 0.6905961376994123, "grad_norm": 0.13382400572299957, "learning_rate": 2.644127198425891e-05, "loss": 0.1532, "step": 3290 }, { "epoch": 0.6908060453400504, "grad_norm": 0.12279796600341797, "learning_rate": 2.6408959554663993e-05, "loss": 0.1564, "step": 3291 }, { "epoch": 0.6910159529806885, "grad_norm": 0.09982291609048843, "learning_rate": 2.6376659793996676e-05, "loss": 0.1521, "step": 3292 }, { "epoch": 0.6912258606213266, "grad_norm": 0.11847012490034103, "learning_rate": 2.6344372719602666e-05, "loss": 0.1573, "step": 3293 }, { "epoch": 0.6914357682619647, "grad_norm": 0.10027361661195755, "learning_rate": 2.6312098348820856e-05, "loss": 0.1491, "step": 3294 }, { "epoch": 0.6916456759026028, "grad_norm": 0.10242442041635513, "learning_rate": 2.6279836698983318e-05, "loss": 0.164, "step": 3295 }, { "epoch": 0.691855583543241, "grad_norm": 0.11035119742155075, "learning_rate": 2.6247587787415306e-05, "loss": 0.1445, "step": 3296 }, { "epoch": 0.6920654911838791, "grad_norm": 0.12222526222467422, "learning_rate": 2.621535163143519e-05, "loss": 0.1509, "step": 3297 }, { "epoch": 0.6922753988245172, "grad_norm": 0.11721225827932358, "learning_rate": 2.6183128248354566e-05, "loss": 0.1492, "step": 3298 }, { "epoch": 0.6924853064651554, "grad_norm": 0.09510528296232224, "learning_rate": 2.6150917655478107e-05, "loss": 0.1509, "step": 3299 }, { "epoch": 0.6926952141057935, "grad_norm": 0.13749557733535767, "learning_rate": 2.6118719870103638e-05, "loss": 0.1633, "step": 3300 }, { "epoch": 0.6929051217464316, "grad_norm": 0.1405005007982254, "learning_rate": 2.608653490952211e-05, "loss": 0.1508, "step": 3301 }, { "epoch": 0.6931150293870697, "grad_norm": 0.13257022202014923, "learning_rate": 2.6054362791017573e-05, "loss": 0.1527, "step": 3302 }, { "epoch": 0.6933249370277078, "grad_norm": 0.12121597677469254, "learning_rate": 2.6022203531867195e-05, "loss": 0.1613, "step": 3303 }, { "epoch": 0.6935348446683459, "grad_norm": 0.11893527954816818, "learning_rate": 2.599005714934123e-05, "loss": 0.1416, "step": 3304 }, { "epoch": 0.693744752308984, "grad_norm": 0.12664799392223358, "learning_rate": 2.5957923660703e-05, "loss": 0.1539, "step": 3305 }, { "epoch": 0.6939546599496221, "grad_norm": 0.14884023368358612, "learning_rate": 2.5925803083208966e-05, "loss": 0.1489, "step": 3306 }, { "epoch": 0.6941645675902602, "grad_norm": 0.13241392374038696, "learning_rate": 2.58936954341086e-05, "loss": 0.1585, "step": 3307 }, { "epoch": 0.6943744752308985, "grad_norm": 0.12439943104982376, "learning_rate": 2.586160073064442e-05, "loss": 0.1614, "step": 3308 }, { "epoch": 0.6945843828715366, "grad_norm": 0.10723230987787247, "learning_rate": 2.582951899005201e-05, "loss": 0.1402, "step": 3309 }, { "epoch": 0.6947942905121747, "grad_norm": 0.12096869945526123, "learning_rate": 2.579745022956003e-05, "loss": 0.1592, "step": 3310 }, { "epoch": 0.6950041981528128, "grad_norm": 0.11293628066778183, "learning_rate": 2.5765394466390126e-05, "loss": 0.1835, "step": 3311 }, { "epoch": 0.6952141057934509, "grad_norm": 0.11762647330760956, "learning_rate": 2.573335171775697e-05, "loss": 0.1518, "step": 3312 }, { "epoch": 0.695424013434089, "grad_norm": 0.13241271674633026, "learning_rate": 2.5701322000868223e-05, "loss": 0.161, "step": 3313 }, { "epoch": 0.6956339210747271, "grad_norm": 0.13515324890613556, "learning_rate": 2.566930533292465e-05, "loss": 0.1468, "step": 3314 }, { "epoch": 0.6958438287153652, "grad_norm": 0.11079118400812149, "learning_rate": 2.5637301731119862e-05, "loss": 0.145, "step": 3315 }, { "epoch": 0.6960537363560033, "grad_norm": 0.16726547479629517, "learning_rate": 2.5605311212640547e-05, "loss": 0.1358, "step": 3316 }, { "epoch": 0.6962636439966414, "grad_norm": 0.1353970617055893, "learning_rate": 2.5573333794666326e-05, "loss": 0.1537, "step": 3317 }, { "epoch": 0.6964735516372796, "grad_norm": 0.10820521414279938, "learning_rate": 2.554136949436984e-05, "loss": 0.1382, "step": 3318 }, { "epoch": 0.6966834592779177, "grad_norm": 0.20719845592975616, "learning_rate": 2.5509418328916635e-05, "loss": 0.1446, "step": 3319 }, { "epoch": 0.6968933669185559, "grad_norm": 0.12657080590724945, "learning_rate": 2.5477480315465217e-05, "loss": 0.1575, "step": 3320 }, { "epoch": 0.697103274559194, "grad_norm": 0.14084367454051971, "learning_rate": 2.5445555471167e-05, "loss": 0.136, "step": 3321 }, { "epoch": 0.6973131821998321, "grad_norm": 0.12968450784683228, "learning_rate": 2.541364381316639e-05, "loss": 0.1644, "step": 3322 }, { "epoch": 0.6975230898404702, "grad_norm": 0.1417001336812973, "learning_rate": 2.5381745358600662e-05, "loss": 0.155, "step": 3323 }, { "epoch": 0.6977329974811083, "grad_norm": 0.13210801780223846, "learning_rate": 2.5349860124600024e-05, "loss": 0.1715, "step": 3324 }, { "epoch": 0.6979429051217464, "grad_norm": 0.11423230171203613, "learning_rate": 2.531798812828754e-05, "loss": 0.1522, "step": 3325 }, { "epoch": 0.6981528127623845, "grad_norm": 0.1215028241276741, "learning_rate": 2.5286129386779256e-05, "loss": 0.1583, "step": 3326 }, { "epoch": 0.6983627204030227, "grad_norm": 0.1341346949338913, "learning_rate": 2.525428391718403e-05, "loss": 0.1684, "step": 3327 }, { "epoch": 0.6985726280436608, "grad_norm": 0.1249391958117485, "learning_rate": 2.5222451736603575e-05, "loss": 0.1583, "step": 3328 }, { "epoch": 0.6987825356842989, "grad_norm": 0.12421222776174545, "learning_rate": 2.5190632862132495e-05, "loss": 0.155, "step": 3329 }, { "epoch": 0.698992443324937, "grad_norm": 0.10891323536634445, "learning_rate": 2.5158827310858297e-05, "loss": 0.1539, "step": 3330 }, { "epoch": 0.6992023509655751, "grad_norm": 0.11101838946342468, "learning_rate": 2.5127035099861262e-05, "loss": 0.1493, "step": 3331 }, { "epoch": 0.6994122586062133, "grad_norm": 0.12890756130218506, "learning_rate": 2.5095256246214537e-05, "loss": 0.159, "step": 3332 }, { "epoch": 0.6996221662468514, "grad_norm": 0.10890180617570877, "learning_rate": 2.50634907669841e-05, "loss": 0.1376, "step": 3333 }, { "epoch": 0.6998320738874895, "grad_norm": 0.11944495141506195, "learning_rate": 2.503173867922872e-05, "loss": 0.1747, "step": 3334 }, { "epoch": 0.7000419815281276, "grad_norm": 0.10909661650657654, "learning_rate": 2.500000000000001e-05, "loss": 0.1483, "step": 3335 }, { "epoch": 0.7002518891687658, "grad_norm": 0.12965939939022064, "learning_rate": 2.496827474634236e-05, "loss": 0.1651, "step": 3336 }, { "epoch": 0.7004617968094039, "grad_norm": 0.13666345179080963, "learning_rate": 2.4936562935292958e-05, "loss": 0.1501, "step": 3337 }, { "epoch": 0.700671704450042, "grad_norm": 0.112842857837677, "learning_rate": 2.490486458388175e-05, "loss": 0.1633, "step": 3338 }, { "epoch": 0.7008816120906801, "grad_norm": 0.11788749694824219, "learning_rate": 2.487317970913151e-05, "loss": 0.1611, "step": 3339 }, { "epoch": 0.7010915197313182, "grad_norm": 0.11510729044675827, "learning_rate": 2.484150832805773e-05, "loss": 0.155, "step": 3340 }, { "epoch": 0.7013014273719563, "grad_norm": 0.1199810653924942, "learning_rate": 2.4809850457668653e-05, "loss": 0.1567, "step": 3341 }, { "epoch": 0.7015113350125944, "grad_norm": 0.13416971266269684, "learning_rate": 2.4778206114965284e-05, "loss": 0.146, "step": 3342 }, { "epoch": 0.7017212426532325, "grad_norm": 0.09663829207420349, "learning_rate": 2.474657531694136e-05, "loss": 0.1597, "step": 3343 }, { "epoch": 0.7019311502938707, "grad_norm": 0.1087060421705246, "learning_rate": 2.4714958080583338e-05, "loss": 0.1514, "step": 3344 }, { "epoch": 0.7021410579345088, "grad_norm": 0.12510518729686737, "learning_rate": 2.4683354422870403e-05, "loss": 0.1674, "step": 3345 }, { "epoch": 0.702350965575147, "grad_norm": 0.11586936563253403, "learning_rate": 2.465176436077441e-05, "loss": 0.1534, "step": 3346 }, { "epoch": 0.7025608732157851, "grad_norm": 0.15616261959075928, "learning_rate": 2.4620187911259996e-05, "loss": 0.1715, "step": 3347 }, { "epoch": 0.7027707808564232, "grad_norm": 0.11882652342319489, "learning_rate": 2.4588625091284428e-05, "loss": 0.1618, "step": 3348 }, { "epoch": 0.7029806884970613, "grad_norm": 0.11008409410715103, "learning_rate": 2.4557075917797624e-05, "loss": 0.1489, "step": 3349 }, { "epoch": 0.7031905961376994, "grad_norm": 0.11380871385335922, "learning_rate": 2.452554040774221e-05, "loss": 0.1601, "step": 3350 }, { "epoch": 0.7034005037783375, "grad_norm": 0.12558501958847046, "learning_rate": 2.4494018578053514e-05, "loss": 0.1624, "step": 3351 }, { "epoch": 0.7036104114189756, "grad_norm": 0.13026200234889984, "learning_rate": 2.446251044565946e-05, "loss": 0.1596, "step": 3352 }, { "epoch": 0.7038203190596137, "grad_norm": 0.10429004579782486, "learning_rate": 2.443101602748063e-05, "loss": 0.1502, "step": 3353 }, { "epoch": 0.7040302267002518, "grad_norm": 0.13567504286766052, "learning_rate": 2.439953534043024e-05, "loss": 0.151, "step": 3354 }, { "epoch": 0.7042401343408901, "grad_norm": 0.12509050965309143, "learning_rate": 2.436806840141418e-05, "loss": 0.155, "step": 3355 }, { "epoch": 0.7044500419815282, "grad_norm": 0.13032452762126923, "learning_rate": 2.4336615227330868e-05, "loss": 0.1724, "step": 3356 }, { "epoch": 0.7046599496221663, "grad_norm": 0.11473406106233597, "learning_rate": 2.4305175835071393e-05, "loss": 0.1438, "step": 3357 }, { "epoch": 0.7048698572628044, "grad_norm": 0.12625393271446228, "learning_rate": 2.427375024151941e-05, "loss": 0.1428, "step": 3358 }, { "epoch": 0.7050797649034425, "grad_norm": 0.12183254957199097, "learning_rate": 2.4242338463551212e-05, "loss": 0.1523, "step": 3359 }, { "epoch": 0.7052896725440806, "grad_norm": 0.11879599094390869, "learning_rate": 2.4210940518035628e-05, "loss": 0.1682, "step": 3360 }, { "epoch": 0.7054995801847187, "grad_norm": 0.1198955848813057, "learning_rate": 2.417955642183407e-05, "loss": 0.1378, "step": 3361 }, { "epoch": 0.7057094878253568, "grad_norm": 0.11626074463129044, "learning_rate": 2.4148186191800516e-05, "loss": 0.155, "step": 3362 }, { "epoch": 0.7059193954659949, "grad_norm": 0.12635177373886108, "learning_rate": 2.411682984478149e-05, "loss": 0.1619, "step": 3363 }, { "epoch": 0.7061293031066331, "grad_norm": 0.11214794963598251, "learning_rate": 2.4085487397616076e-05, "loss": 0.1556, "step": 3364 }, { "epoch": 0.7063392107472712, "grad_norm": 0.11064722388982773, "learning_rate": 2.4054158867135878e-05, "loss": 0.1426, "step": 3365 }, { "epoch": 0.7065491183879093, "grad_norm": 0.12300856411457062, "learning_rate": 2.4022844270165018e-05, "loss": 0.1535, "step": 3366 }, { "epoch": 0.7067590260285475, "grad_norm": 0.10280261933803558, "learning_rate": 2.3991543623520187e-05, "loss": 0.1531, "step": 3367 }, { "epoch": 0.7069689336691856, "grad_norm": 0.12246986478567123, "learning_rate": 2.3960256944010516e-05, "loss": 0.1625, "step": 3368 }, { "epoch": 0.7071788413098237, "grad_norm": 0.11043044179677963, "learning_rate": 2.39289842484377e-05, "loss": 0.1463, "step": 3369 }, { "epoch": 0.7073887489504618, "grad_norm": 0.11431458592414856, "learning_rate": 2.3897725553595833e-05, "loss": 0.163, "step": 3370 }, { "epoch": 0.7075986565910999, "grad_norm": 0.1473897248506546, "learning_rate": 2.38664808762716e-05, "loss": 0.1758, "step": 3371 }, { "epoch": 0.707808564231738, "grad_norm": 0.12143264710903168, "learning_rate": 2.3835250233244093e-05, "loss": 0.1776, "step": 3372 }, { "epoch": 0.7080184718723762, "grad_norm": 0.11493881046772003, "learning_rate": 2.380403364128487e-05, "loss": 0.1555, "step": 3373 }, { "epoch": 0.7082283795130143, "grad_norm": 0.12290722876787186, "learning_rate": 2.3772831117157967e-05, "loss": 0.1693, "step": 3374 }, { "epoch": 0.7084382871536524, "grad_norm": 0.09523230046033859, "learning_rate": 2.3741642677619824e-05, "loss": 0.1408, "step": 3375 }, { "epoch": 0.7086481947942905, "grad_norm": 0.1283012479543686, "learning_rate": 2.3710468339419406e-05, "loss": 0.1473, "step": 3376 }, { "epoch": 0.7088581024349286, "grad_norm": 0.10908805578947067, "learning_rate": 2.3679308119297988e-05, "loss": 0.1586, "step": 3377 }, { "epoch": 0.7090680100755667, "grad_norm": 0.12288475781679153, "learning_rate": 2.364816203398934e-05, "loss": 0.1382, "step": 3378 }, { "epoch": 0.7092779177162049, "grad_norm": 0.11164416372776031, "learning_rate": 2.36170301002196e-05, "loss": 0.1518, "step": 3379 }, { "epoch": 0.709487825356843, "grad_norm": 0.1325451135635376, "learning_rate": 2.358591233470736e-05, "loss": 0.152, "step": 3380 }, { "epoch": 0.7096977329974811, "grad_norm": 0.11789526045322418, "learning_rate": 2.3554808754163566e-05, "loss": 0.1634, "step": 3381 }, { "epoch": 0.7099076406381192, "grad_norm": 0.11117194592952728, "learning_rate": 2.3523719375291535e-05, "loss": 0.1599, "step": 3382 }, { "epoch": 0.7101175482787574, "grad_norm": 0.12263230234384537, "learning_rate": 2.349264421478699e-05, "loss": 0.146, "step": 3383 }, { "epoch": 0.7103274559193955, "grad_norm": 0.1571977734565735, "learning_rate": 2.346158328933799e-05, "loss": 0.1534, "step": 3384 }, { "epoch": 0.7105373635600336, "grad_norm": 0.1306907832622528, "learning_rate": 2.3430536615624966e-05, "loss": 0.1579, "step": 3385 }, { "epoch": 0.7107472712006717, "grad_norm": 0.12317048013210297, "learning_rate": 2.339950421032069e-05, "loss": 0.138, "step": 3386 }, { "epoch": 0.7109571788413098, "grad_norm": 0.1279805451631546, "learning_rate": 2.3368486090090248e-05, "loss": 0.158, "step": 3387 }, { "epoch": 0.7111670864819479, "grad_norm": 0.11604040861129761, "learning_rate": 2.3337482271591125e-05, "loss": 0.1573, "step": 3388 }, { "epoch": 0.711376994122586, "grad_norm": 0.10710416734218597, "learning_rate": 2.3306492771473064e-05, "loss": 0.1292, "step": 3389 }, { "epoch": 0.7115869017632241, "grad_norm": 0.11916560679674149, "learning_rate": 2.3275517606378145e-05, "loss": 0.1635, "step": 3390 }, { "epoch": 0.7117968094038623, "grad_norm": 0.1225900799036026, "learning_rate": 2.3244556792940685e-05, "loss": 0.1712, "step": 3391 }, { "epoch": 0.7120067170445005, "grad_norm": 0.14129529893398285, "learning_rate": 2.3213610347787406e-05, "loss": 0.1793, "step": 3392 }, { "epoch": 0.7122166246851386, "grad_norm": 0.11461693048477173, "learning_rate": 2.318267828753724e-05, "loss": 0.154, "step": 3393 }, { "epoch": 0.7124265323257767, "grad_norm": 0.10360202938318253, "learning_rate": 2.31517606288014e-05, "loss": 0.1342, "step": 3394 }, { "epoch": 0.7126364399664148, "grad_norm": 0.1024770587682724, "learning_rate": 2.3120857388183365e-05, "loss": 0.1675, "step": 3395 }, { "epoch": 0.7128463476070529, "grad_norm": 0.1129373162984848, "learning_rate": 2.3089968582278915e-05, "loss": 0.1643, "step": 3396 }, { "epoch": 0.713056255247691, "grad_norm": 0.13601639866828918, "learning_rate": 2.3059094227676047e-05, "loss": 0.1457, "step": 3397 }, { "epoch": 0.7132661628883291, "grad_norm": 0.109441839158535, "learning_rate": 2.3028234340954953e-05, "loss": 0.1654, "step": 3398 }, { "epoch": 0.7134760705289672, "grad_norm": 0.12384672462940216, "learning_rate": 2.2997388938688104e-05, "loss": 0.1711, "step": 3399 }, { "epoch": 0.7136859781696053, "grad_norm": 0.11933963000774384, "learning_rate": 2.296655803744021e-05, "loss": 0.1605, "step": 3400 }, { "epoch": 0.7138958858102435, "grad_norm": 0.11621814221143723, "learning_rate": 2.2935741653768167e-05, "loss": 0.1433, "step": 3401 }, { "epoch": 0.7141057934508817, "grad_norm": 0.14022648334503174, "learning_rate": 2.2904939804221064e-05, "loss": 0.1663, "step": 3402 }, { "epoch": 0.7143157010915198, "grad_norm": 0.11107058823108673, "learning_rate": 2.2874152505340206e-05, "loss": 0.1586, "step": 3403 }, { "epoch": 0.7145256087321579, "grad_norm": 0.10179081559181213, "learning_rate": 2.2843379773659075e-05, "loss": 0.162, "step": 3404 }, { "epoch": 0.714735516372796, "grad_norm": 0.12742018699645996, "learning_rate": 2.2812621625703323e-05, "loss": 0.1654, "step": 3405 }, { "epoch": 0.7149454240134341, "grad_norm": 0.12928232550621033, "learning_rate": 2.278187807799079e-05, "loss": 0.1617, "step": 3406 }, { "epoch": 0.7151553316540722, "grad_norm": 0.12862341105937958, "learning_rate": 2.2751149147031432e-05, "loss": 0.1622, "step": 3407 }, { "epoch": 0.7153652392947103, "grad_norm": 0.11547853797674179, "learning_rate": 2.2720434849327427e-05, "loss": 0.1541, "step": 3408 }, { "epoch": 0.7155751469353484, "grad_norm": 0.14156000316143036, "learning_rate": 2.2689735201373047e-05, "loss": 0.1556, "step": 3409 }, { "epoch": 0.7157850545759865, "grad_norm": 0.10587289184331894, "learning_rate": 2.2659050219654692e-05, "loss": 0.156, "step": 3410 }, { "epoch": 0.7159949622166247, "grad_norm": 0.11178750544786453, "learning_rate": 2.2628379920650906e-05, "loss": 0.1572, "step": 3411 }, { "epoch": 0.7162048698572628, "grad_norm": 0.16399018466472626, "learning_rate": 2.2597724320832338e-05, "loss": 0.1598, "step": 3412 }, { "epoch": 0.716414777497901, "grad_norm": 0.14096342027187347, "learning_rate": 2.256708343666174e-05, "loss": 0.1566, "step": 3413 }, { "epoch": 0.716624685138539, "grad_norm": 0.12449552118778229, "learning_rate": 2.253645728459397e-05, "loss": 0.1673, "step": 3414 }, { "epoch": 0.7168345927791772, "grad_norm": 0.12170739471912384, "learning_rate": 2.250584588107597e-05, "loss": 0.1691, "step": 3415 }, { "epoch": 0.7170445004198153, "grad_norm": 0.12910914421081543, "learning_rate": 2.2475249242546755e-05, "loss": 0.1628, "step": 3416 }, { "epoch": 0.7172544080604534, "grad_norm": 0.10747946798801422, "learning_rate": 2.2444667385437455e-05, "loss": 0.1692, "step": 3417 }, { "epoch": 0.7174643157010915, "grad_norm": 0.1221894845366478, "learning_rate": 2.2414100326171188e-05, "loss": 0.1652, "step": 3418 }, { "epoch": 0.7176742233417296, "grad_norm": 0.11467088758945465, "learning_rate": 2.238354808116316e-05, "loss": 0.1522, "step": 3419 }, { "epoch": 0.7178841309823678, "grad_norm": 0.12537196278572083, "learning_rate": 2.2353010666820645e-05, "loss": 0.1529, "step": 3420 }, { "epoch": 0.7180940386230059, "grad_norm": 0.11724396049976349, "learning_rate": 2.2322488099542938e-05, "loss": 0.1618, "step": 3421 }, { "epoch": 0.718303946263644, "grad_norm": 0.11824364215135574, "learning_rate": 2.2291980395721336e-05, "loss": 0.1535, "step": 3422 }, { "epoch": 0.7185138539042821, "grad_norm": 0.13034173846244812, "learning_rate": 2.226148757173918e-05, "loss": 0.1525, "step": 3423 }, { "epoch": 0.7187237615449202, "grad_norm": 0.1236094981431961, "learning_rate": 2.2231009643971817e-05, "loss": 0.1649, "step": 3424 }, { "epoch": 0.7189336691855583, "grad_norm": 0.1695818454027176, "learning_rate": 2.2200546628786585e-05, "loss": 0.1362, "step": 3425 }, { "epoch": 0.7191435768261965, "grad_norm": 0.12300975620746613, "learning_rate": 2.2170098542542815e-05, "loss": 0.1496, "step": 3426 }, { "epoch": 0.7193534844668346, "grad_norm": 0.11368977278470993, "learning_rate": 2.2139665401591837e-05, "loss": 0.1646, "step": 3427 }, { "epoch": 0.7195633921074727, "grad_norm": 0.10919873416423798, "learning_rate": 2.2109247222276912e-05, "loss": 0.1437, "step": 3428 }, { "epoch": 0.7197732997481109, "grad_norm": 0.11324533820152283, "learning_rate": 2.207884402093334e-05, "loss": 0.1471, "step": 3429 }, { "epoch": 0.719983207388749, "grad_norm": 0.11208245158195496, "learning_rate": 2.204845581388832e-05, "loss": 0.1497, "step": 3430 }, { "epoch": 0.7201931150293871, "grad_norm": 0.1129378080368042, "learning_rate": 2.201808261746102e-05, "loss": 0.1445, "step": 3431 }, { "epoch": 0.7204030226700252, "grad_norm": 0.12027370184659958, "learning_rate": 2.19877244479625e-05, "loss": 0.1725, "step": 3432 }, { "epoch": 0.7206129303106633, "grad_norm": 0.11807487159967422, "learning_rate": 2.195738132169583e-05, "loss": 0.1366, "step": 3433 }, { "epoch": 0.7208228379513014, "grad_norm": 0.11906223744153976, "learning_rate": 2.1927053254955955e-05, "loss": 0.1593, "step": 3434 }, { "epoch": 0.7210327455919395, "grad_norm": 0.1283073127269745, "learning_rate": 2.1896740264029737e-05, "loss": 0.1578, "step": 3435 }, { "epoch": 0.7212426532325776, "grad_norm": 0.1256849616765976, "learning_rate": 2.1866442365195927e-05, "loss": 0.1289, "step": 3436 }, { "epoch": 0.7214525608732157, "grad_norm": 0.11481695622205734, "learning_rate": 2.1836159574725218e-05, "loss": 0.1416, "step": 3437 }, { "epoch": 0.7216624685138538, "grad_norm": 0.10437635332345963, "learning_rate": 2.1805891908880172e-05, "loss": 0.1439, "step": 3438 }, { "epoch": 0.7218723761544921, "grad_norm": 0.13188210129737854, "learning_rate": 2.1775639383915168e-05, "loss": 0.1446, "step": 3439 }, { "epoch": 0.7220822837951302, "grad_norm": 0.10859008878469467, "learning_rate": 2.1745402016076515e-05, "loss": 0.1658, "step": 3440 }, { "epoch": 0.7222921914357683, "grad_norm": 0.12801185250282288, "learning_rate": 2.17151798216024e-05, "loss": 0.1517, "step": 3441 }, { "epoch": 0.7225020990764064, "grad_norm": 0.12271418422460556, "learning_rate": 2.1684972816722815e-05, "loss": 0.1544, "step": 3442 }, { "epoch": 0.7227120067170445, "grad_norm": 0.11636804044246674, "learning_rate": 2.165478101765961e-05, "loss": 0.1609, "step": 3443 }, { "epoch": 0.7229219143576826, "grad_norm": 0.11120007187128067, "learning_rate": 2.1624604440626472e-05, "loss": 0.1422, "step": 3444 }, { "epoch": 0.7231318219983207, "grad_norm": 0.11704123020172119, "learning_rate": 2.159444310182891e-05, "loss": 0.1685, "step": 3445 }, { "epoch": 0.7233417296389588, "grad_norm": 0.12351725250482559, "learning_rate": 2.156429701746426e-05, "loss": 0.1706, "step": 3446 }, { "epoch": 0.7235516372795969, "grad_norm": 0.13515010476112366, "learning_rate": 2.1534166203721644e-05, "loss": 0.1618, "step": 3447 }, { "epoch": 0.7237615449202351, "grad_norm": 0.13099972903728485, "learning_rate": 2.150405067678199e-05, "loss": 0.1554, "step": 3448 }, { "epoch": 0.7239714525608733, "grad_norm": 0.11865659058094025, "learning_rate": 2.147395045281806e-05, "loss": 0.1353, "step": 3449 }, { "epoch": 0.7241813602015114, "grad_norm": 0.10976529866456985, "learning_rate": 2.1443865547994336e-05, "loss": 0.1384, "step": 3450 }, { "epoch": 0.7243912678421495, "grad_norm": 0.1451970487833023, "learning_rate": 2.1413795978467105e-05, "loss": 0.1531, "step": 3451 }, { "epoch": 0.7246011754827876, "grad_norm": 0.1169748529791832, "learning_rate": 2.138374176038442e-05, "loss": 0.1566, "step": 3452 }, { "epoch": 0.7248110831234257, "grad_norm": 0.12926514446735382, "learning_rate": 2.1353702909886066e-05, "loss": 0.1495, "step": 3453 }, { "epoch": 0.7250209907640638, "grad_norm": 0.11117440462112427, "learning_rate": 2.13236794431036e-05, "loss": 0.1586, "step": 3454 }, { "epoch": 0.7252308984047019, "grad_norm": 0.12877735495567322, "learning_rate": 2.1293671376160308e-05, "loss": 0.1579, "step": 3455 }, { "epoch": 0.72544080604534, "grad_norm": 0.10876293480396271, "learning_rate": 2.1263678725171216e-05, "loss": 0.1548, "step": 3456 }, { "epoch": 0.7256507136859782, "grad_norm": 0.1247049942612648, "learning_rate": 2.123370150624303e-05, "loss": 0.1566, "step": 3457 }, { "epoch": 0.7258606213266163, "grad_norm": 0.12170092016458511, "learning_rate": 2.1203739735474248e-05, "loss": 0.1521, "step": 3458 }, { "epoch": 0.7260705289672544, "grad_norm": 0.1315077245235443, "learning_rate": 2.1173793428955023e-05, "loss": 0.1673, "step": 3459 }, { "epoch": 0.7262804366078925, "grad_norm": 0.12094591557979584, "learning_rate": 2.114386260276715e-05, "loss": 0.1611, "step": 3460 }, { "epoch": 0.7264903442485307, "grad_norm": 0.1278723180294037, "learning_rate": 2.1113947272984224e-05, "loss": 0.1564, "step": 3461 }, { "epoch": 0.7267002518891688, "grad_norm": 0.1344061940908432, "learning_rate": 2.1084047455671435e-05, "loss": 0.1566, "step": 3462 }, { "epoch": 0.7269101595298069, "grad_norm": 0.11192744970321655, "learning_rate": 2.1054163166885675e-05, "loss": 0.17, "step": 3463 }, { "epoch": 0.727120067170445, "grad_norm": 0.12571950256824493, "learning_rate": 2.102429442267549e-05, "loss": 0.1429, "step": 3464 }, { "epoch": 0.7273299748110831, "grad_norm": 0.13342975080013275, "learning_rate": 2.099444123908106e-05, "loss": 0.1457, "step": 3465 }, { "epoch": 0.7275398824517213, "grad_norm": 0.1195952519774437, "learning_rate": 2.0964603632134265e-05, "loss": 0.1709, "step": 3466 }, { "epoch": 0.7277497900923594, "grad_norm": 0.1186537891626358, "learning_rate": 2.093478161785855e-05, "loss": 0.1511, "step": 3467 }, { "epoch": 0.7279596977329975, "grad_norm": 0.09371523559093475, "learning_rate": 2.0904975212269024e-05, "loss": 0.146, "step": 3468 }, { "epoch": 0.7281696053736356, "grad_norm": 0.10283226519823074, "learning_rate": 2.087518443137239e-05, "loss": 0.1652, "step": 3469 }, { "epoch": 0.7283795130142737, "grad_norm": 0.11636362224817276, "learning_rate": 2.0845409291167023e-05, "loss": 0.1381, "step": 3470 }, { "epoch": 0.7285894206549118, "grad_norm": 0.12790684401988983, "learning_rate": 2.0815649807642824e-05, "loss": 0.1555, "step": 3471 }, { "epoch": 0.7287993282955499, "grad_norm": 0.1255161166191101, "learning_rate": 2.0785905996781312e-05, "loss": 0.1752, "step": 3472 }, { "epoch": 0.729009235936188, "grad_norm": 0.1269216686487198, "learning_rate": 2.075617787455561e-05, "loss": 0.1698, "step": 3473 }, { "epoch": 0.7292191435768262, "grad_norm": 0.11415970325469971, "learning_rate": 2.072646545693038e-05, "loss": 0.157, "step": 3474 }, { "epoch": 0.7294290512174643, "grad_norm": 0.12151431292295456, "learning_rate": 2.0696768759861884e-05, "loss": 0.1682, "step": 3475 }, { "epoch": 0.7296389588581025, "grad_norm": 0.13548152148723602, "learning_rate": 2.0667087799297923e-05, "loss": 0.1282, "step": 3476 }, { "epoch": 0.7298488664987406, "grad_norm": 0.1168590858578682, "learning_rate": 2.063742259117783e-05, "loss": 0.149, "step": 3477 }, { "epoch": 0.7300587741393787, "grad_norm": 0.1430933028459549, "learning_rate": 2.060777315143253e-05, "loss": 0.1624, "step": 3478 }, { "epoch": 0.7302686817800168, "grad_norm": 0.1568112075328827, "learning_rate": 2.0578139495984437e-05, "loss": 0.1584, "step": 3479 }, { "epoch": 0.7304785894206549, "grad_norm": 0.13265225291252136, "learning_rate": 2.0548521640747525e-05, "loss": 0.1516, "step": 3480 }, { "epoch": 0.730688497061293, "grad_norm": 0.13450318574905396, "learning_rate": 2.0518919601627197e-05, "loss": 0.1408, "step": 3481 }, { "epoch": 0.7308984047019311, "grad_norm": 0.11844109743833542, "learning_rate": 2.0489333394520483e-05, "loss": 0.1582, "step": 3482 }, { "epoch": 0.7311083123425692, "grad_norm": 0.13041557371616364, "learning_rate": 2.0459763035315828e-05, "loss": 0.1563, "step": 3483 }, { "epoch": 0.7313182199832073, "grad_norm": 0.1173725575208664, "learning_rate": 2.043020853989319e-05, "loss": 0.1603, "step": 3484 }, { "epoch": 0.7315281276238456, "grad_norm": 0.12801702320575714, "learning_rate": 2.0400669924124016e-05, "loss": 0.1765, "step": 3485 }, { "epoch": 0.7317380352644837, "grad_norm": 0.11436662822961807, "learning_rate": 2.0371147203871195e-05, "loss": 0.1773, "step": 3486 }, { "epoch": 0.7319479429051218, "grad_norm": 0.10988820344209671, "learning_rate": 2.0341640394989154e-05, "loss": 0.1451, "step": 3487 }, { "epoch": 0.7321578505457599, "grad_norm": 0.13320259749889374, "learning_rate": 2.031214951332367e-05, "loss": 0.1696, "step": 3488 }, { "epoch": 0.732367758186398, "grad_norm": 0.11017803847789764, "learning_rate": 2.0282674574712024e-05, "loss": 0.1373, "step": 3489 }, { "epoch": 0.7325776658270361, "grad_norm": 0.1179022565484047, "learning_rate": 2.0253215594982965e-05, "loss": 0.1649, "step": 3490 }, { "epoch": 0.7327875734676742, "grad_norm": 0.11736958473920822, "learning_rate": 2.0223772589956623e-05, "loss": 0.1576, "step": 3491 }, { "epoch": 0.7329974811083123, "grad_norm": 0.1266537308692932, "learning_rate": 2.0194345575444556e-05, "loss": 0.1714, "step": 3492 }, { "epoch": 0.7332073887489504, "grad_norm": 0.12094676494598389, "learning_rate": 2.0164934567249756e-05, "loss": 0.1656, "step": 3493 }, { "epoch": 0.7334172963895886, "grad_norm": 0.10691839456558228, "learning_rate": 2.0135539581166592e-05, "loss": 0.1653, "step": 3494 }, { "epoch": 0.7336272040302267, "grad_norm": 0.1287030428647995, "learning_rate": 2.0106160632980848e-05, "loss": 0.1445, "step": 3495 }, { "epoch": 0.7338371116708649, "grad_norm": 0.14120900630950928, "learning_rate": 2.0076797738469692e-05, "loss": 0.1625, "step": 3496 }, { "epoch": 0.734047019311503, "grad_norm": 0.1141500473022461, "learning_rate": 2.0047450913401672e-05, "loss": 0.1485, "step": 3497 }, { "epoch": 0.7342569269521411, "grad_norm": 0.11120381951332092, "learning_rate": 2.001812017353668e-05, "loss": 0.1557, "step": 3498 }, { "epoch": 0.7344668345927792, "grad_norm": 0.14555326104164124, "learning_rate": 1.998880553462604e-05, "loss": 0.1643, "step": 3499 }, { "epoch": 0.7346767422334173, "grad_norm": 0.14281868934631348, "learning_rate": 1.9959507012412343e-05, "loss": 0.1435, "step": 3500 }, { "epoch": 0.7348866498740554, "grad_norm": 0.11729402840137482, "learning_rate": 1.9930224622629583e-05, "loss": 0.145, "step": 3501 }, { "epoch": 0.7350965575146935, "grad_norm": 0.1215299442410469, "learning_rate": 1.990095838100307e-05, "loss": 0.1502, "step": 3502 }, { "epoch": 0.7353064651553316, "grad_norm": 0.13514041900634766, "learning_rate": 1.9871708303249437e-05, "loss": 0.153, "step": 3503 }, { "epoch": 0.7355163727959698, "grad_norm": 0.12567518651485443, "learning_rate": 1.9842474405076655e-05, "loss": 0.1486, "step": 3504 }, { "epoch": 0.7357262804366079, "grad_norm": 0.11353006213903427, "learning_rate": 1.981325670218398e-05, "loss": 0.1442, "step": 3505 }, { "epoch": 0.735936188077246, "grad_norm": 0.09958070516586304, "learning_rate": 1.9784055210261983e-05, "loss": 0.1425, "step": 3506 }, { "epoch": 0.7361460957178841, "grad_norm": 0.13688147068023682, "learning_rate": 1.9754869944992577e-05, "loss": 0.1615, "step": 3507 }, { "epoch": 0.7363560033585222, "grad_norm": 0.11260102689266205, "learning_rate": 1.972570092204886e-05, "loss": 0.1495, "step": 3508 }, { "epoch": 0.7365659109991604, "grad_norm": 0.12669405341148376, "learning_rate": 1.969654815709529e-05, "loss": 0.1451, "step": 3509 }, { "epoch": 0.7367758186397985, "grad_norm": 0.1202203780412674, "learning_rate": 1.9667411665787545e-05, "loss": 0.1581, "step": 3510 }, { "epoch": 0.7369857262804366, "grad_norm": 0.1210731789469719, "learning_rate": 1.9638291463772623e-05, "loss": 0.1559, "step": 3511 }, { "epoch": 0.7371956339210747, "grad_norm": 0.11393287032842636, "learning_rate": 1.9609187566688724e-05, "loss": 0.1584, "step": 3512 }, { "epoch": 0.7374055415617129, "grad_norm": 0.11847749352455139, "learning_rate": 1.9580099990165296e-05, "loss": 0.1697, "step": 3513 }, { "epoch": 0.737615449202351, "grad_norm": 0.11857393383979797, "learning_rate": 1.955102874982304e-05, "loss": 0.1537, "step": 3514 }, { "epoch": 0.7378253568429891, "grad_norm": 0.12403122335672379, "learning_rate": 1.9521973861273875e-05, "loss": 0.1598, "step": 3515 }, { "epoch": 0.7380352644836272, "grad_norm": 0.12605972588062286, "learning_rate": 1.9492935340120933e-05, "loss": 0.1617, "step": 3516 }, { "epoch": 0.7382451721242653, "grad_norm": 0.1054360643029213, "learning_rate": 1.946391320195857e-05, "loss": 0.156, "step": 3517 }, { "epoch": 0.7384550797649034, "grad_norm": 0.13782882690429688, "learning_rate": 1.943490746237232e-05, "loss": 0.1562, "step": 3518 }, { "epoch": 0.7386649874055415, "grad_norm": 0.12123388051986694, "learning_rate": 1.9405918136938962e-05, "loss": 0.1664, "step": 3519 }, { "epoch": 0.7388748950461796, "grad_norm": 0.11024918407201767, "learning_rate": 1.937694524122641e-05, "loss": 0.1486, "step": 3520 }, { "epoch": 0.7390848026868178, "grad_norm": 0.11437639594078064, "learning_rate": 1.934798879079378e-05, "loss": 0.1682, "step": 3521 }, { "epoch": 0.739294710327456, "grad_norm": 0.10866285115480423, "learning_rate": 1.9319048801191304e-05, "loss": 0.1601, "step": 3522 }, { "epoch": 0.7395046179680941, "grad_norm": 0.1192096620798111, "learning_rate": 1.929012528796046e-05, "loss": 0.1647, "step": 3523 }, { "epoch": 0.7397145256087322, "grad_norm": 0.1007518619298935, "learning_rate": 1.926121826663383e-05, "loss": 0.1511, "step": 3524 }, { "epoch": 0.7399244332493703, "grad_norm": 0.13826365768909454, "learning_rate": 1.9232327752735137e-05, "loss": 0.1582, "step": 3525 }, { "epoch": 0.7401343408900084, "grad_norm": 0.11610054224729538, "learning_rate": 1.920345376177925e-05, "loss": 0.1566, "step": 3526 }, { "epoch": 0.7403442485306465, "grad_norm": 0.10525956749916077, "learning_rate": 1.9174596309272143e-05, "loss": 0.161, "step": 3527 }, { "epoch": 0.7405541561712846, "grad_norm": 0.1056981310248375, "learning_rate": 1.914575541071098e-05, "loss": 0.1545, "step": 3528 }, { "epoch": 0.7407640638119227, "grad_norm": 0.12345774471759796, "learning_rate": 1.9116931081583928e-05, "loss": 0.1598, "step": 3529 }, { "epoch": 0.7409739714525608, "grad_norm": 0.10237154364585876, "learning_rate": 1.9088123337370313e-05, "loss": 0.1456, "step": 3530 }, { "epoch": 0.7411838790931989, "grad_norm": 0.1328197717666626, "learning_rate": 1.905933219354058e-05, "loss": 0.1298, "step": 3531 }, { "epoch": 0.7413937867338372, "grad_norm": 0.10656782239675522, "learning_rate": 1.9030557665556226e-05, "loss": 0.1612, "step": 3532 }, { "epoch": 0.7416036943744753, "grad_norm": 0.1428254395723343, "learning_rate": 1.900179976886982e-05, "loss": 0.1552, "step": 3533 }, { "epoch": 0.7418136020151134, "grad_norm": 0.12702079117298126, "learning_rate": 1.8973058518925013e-05, "loss": 0.1551, "step": 3534 }, { "epoch": 0.7420235096557515, "grad_norm": 0.1408890336751938, "learning_rate": 1.894433393115651e-05, "loss": 0.143, "step": 3535 }, { "epoch": 0.7422334172963896, "grad_norm": 0.11404120922088623, "learning_rate": 1.8915626020990073e-05, "loss": 0.1605, "step": 3536 }, { "epoch": 0.7424433249370277, "grad_norm": 0.13507436215877533, "learning_rate": 1.8886934803842504e-05, "loss": 0.1709, "step": 3537 }, { "epoch": 0.7426532325776658, "grad_norm": 0.11132592707872391, "learning_rate": 1.8858260295121643e-05, "loss": 0.1439, "step": 3538 }, { "epoch": 0.7428631402183039, "grad_norm": 0.10579460859298706, "learning_rate": 1.882960251022634e-05, "loss": 0.1681, "step": 3539 }, { "epoch": 0.743073047858942, "grad_norm": 0.12509001791477203, "learning_rate": 1.8800961464546518e-05, "loss": 0.1598, "step": 3540 }, { "epoch": 0.7432829554995802, "grad_norm": 0.1258598417043686, "learning_rate": 1.8772337173463045e-05, "loss": 0.1458, "step": 3541 }, { "epoch": 0.7434928631402183, "grad_norm": 0.11713558435440063, "learning_rate": 1.874372965234783e-05, "loss": 0.1467, "step": 3542 }, { "epoch": 0.7437027707808564, "grad_norm": 0.15114378929138184, "learning_rate": 1.8715138916563758e-05, "loss": 0.1623, "step": 3543 }, { "epoch": 0.7439126784214946, "grad_norm": 0.12825213372707367, "learning_rate": 1.8686564981464706e-05, "loss": 0.1508, "step": 3544 }, { "epoch": 0.7441225860621327, "grad_norm": 0.11679855734109879, "learning_rate": 1.865800786239554e-05, "loss": 0.1595, "step": 3545 }, { "epoch": 0.7443324937027708, "grad_norm": 0.12408013641834259, "learning_rate": 1.8629467574692073e-05, "loss": 0.1753, "step": 3546 }, { "epoch": 0.7445424013434089, "grad_norm": 0.10703358054161072, "learning_rate": 1.860094413368108e-05, "loss": 0.153, "step": 3547 }, { "epoch": 0.744752308984047, "grad_norm": 0.10585933923721313, "learning_rate": 1.857243755468034e-05, "loss": 0.1504, "step": 3548 }, { "epoch": 0.7449622166246851, "grad_norm": 0.12033756822347641, "learning_rate": 1.8543947852998528e-05, "loss": 0.1558, "step": 3549 }, { "epoch": 0.7451721242653233, "grad_norm": 0.1374211460351944, "learning_rate": 1.8515475043935237e-05, "loss": 0.163, "step": 3550 }, { "epoch": 0.7453820319059614, "grad_norm": 0.11834236234426498, "learning_rate": 1.848701914278101e-05, "loss": 0.1593, "step": 3551 }, { "epoch": 0.7455919395465995, "grad_norm": 0.11231713742017746, "learning_rate": 1.8458580164817357e-05, "loss": 0.1468, "step": 3552 }, { "epoch": 0.7458018471872376, "grad_norm": 0.12200900167226791, "learning_rate": 1.843015812531663e-05, "loss": 0.1597, "step": 3553 }, { "epoch": 0.7460117548278757, "grad_norm": 0.10779359191656113, "learning_rate": 1.840175303954213e-05, "loss": 0.1655, "step": 3554 }, { "epoch": 0.7462216624685138, "grad_norm": 0.12323888391256332, "learning_rate": 1.8373364922748027e-05, "loss": 0.1652, "step": 3555 }, { "epoch": 0.746431570109152, "grad_norm": 0.12674176692962646, "learning_rate": 1.8344993790179387e-05, "loss": 0.1576, "step": 3556 }, { "epoch": 0.7466414777497901, "grad_norm": 0.12774962186813354, "learning_rate": 1.8316639657072155e-05, "loss": 0.1443, "step": 3557 }, { "epoch": 0.7468513853904282, "grad_norm": 0.1553429812192917, "learning_rate": 1.828830253865316e-05, "loss": 0.1389, "step": 3558 }, { "epoch": 0.7470612930310663, "grad_norm": 0.15105120837688446, "learning_rate": 1.825998245014005e-05, "loss": 0.1585, "step": 3559 }, { "epoch": 0.7472712006717045, "grad_norm": 0.11354130506515503, "learning_rate": 1.8231679406741402e-05, "loss": 0.1407, "step": 3560 }, { "epoch": 0.7474811083123426, "grad_norm": 0.12431042641401291, "learning_rate": 1.820339342365658e-05, "loss": 0.1613, "step": 3561 }, { "epoch": 0.7476910159529807, "grad_norm": 0.11633086949586868, "learning_rate": 1.8175124516075796e-05, "loss": 0.1369, "step": 3562 }, { "epoch": 0.7479009235936188, "grad_norm": 0.11281375586986542, "learning_rate": 1.8146872699180108e-05, "loss": 0.1599, "step": 3563 }, { "epoch": 0.7481108312342569, "grad_norm": 0.14105822145938873, "learning_rate": 1.811863798814138e-05, "loss": 0.1585, "step": 3564 }, { "epoch": 0.748320738874895, "grad_norm": 0.1503167301416397, "learning_rate": 1.8090420398122304e-05, "loss": 0.1506, "step": 3565 }, { "epoch": 0.7485306465155331, "grad_norm": 0.12038581073284149, "learning_rate": 1.8062219944276365e-05, "loss": 0.1624, "step": 3566 }, { "epoch": 0.7487405541561712, "grad_norm": 0.12901602685451508, "learning_rate": 1.8034036641747847e-05, "loss": 0.1505, "step": 3567 }, { "epoch": 0.7489504617968094, "grad_norm": 0.12743479013442993, "learning_rate": 1.8005870505671823e-05, "loss": 0.1685, "step": 3568 }, { "epoch": 0.7491603694374476, "grad_norm": 0.13382984697818756, "learning_rate": 1.797772155117417e-05, "loss": 0.1702, "step": 3569 }, { "epoch": 0.7493702770780857, "grad_norm": 0.10113348811864853, "learning_rate": 1.7949589793371535e-05, "loss": 0.1412, "step": 3570 }, { "epoch": 0.7495801847187238, "grad_norm": 0.13199184834957123, "learning_rate": 1.792147524737125e-05, "loss": 0.1587, "step": 3571 }, { "epoch": 0.7497900923593619, "grad_norm": 0.11780549585819244, "learning_rate": 1.7893377928271526e-05, "loss": 0.1481, "step": 3572 }, { "epoch": 0.75, "grad_norm": 0.12867914140224457, "learning_rate": 1.786529785116125e-05, "loss": 0.1594, "step": 3573 }, { "epoch": 0.7502099076406381, "grad_norm": 0.11833992600440979, "learning_rate": 1.7837235031120063e-05, "loss": 0.148, "step": 3574 }, { "epoch": 0.7504198152812762, "grad_norm": 0.12939533591270447, "learning_rate": 1.7809189483218336e-05, "loss": 0.1623, "step": 3575 }, { "epoch": 0.7506297229219143, "grad_norm": 0.11650772392749786, "learning_rate": 1.7781161222517163e-05, "loss": 0.1444, "step": 3576 }, { "epoch": 0.7508396305625524, "grad_norm": 0.13511453568935394, "learning_rate": 1.7753150264068398e-05, "loss": 0.1568, "step": 3577 }, { "epoch": 0.7510495382031906, "grad_norm": 0.13567636907100677, "learning_rate": 1.7725156622914518e-05, "loss": 0.157, "step": 3578 }, { "epoch": 0.7512594458438288, "grad_norm": 0.1252739429473877, "learning_rate": 1.7697180314088767e-05, "loss": 0.1641, "step": 3579 }, { "epoch": 0.7514693534844669, "grad_norm": 0.11738979071378708, "learning_rate": 1.766922135261505e-05, "loss": 0.1527, "step": 3580 }, { "epoch": 0.751679261125105, "grad_norm": 0.13429807126522064, "learning_rate": 1.7641279753507993e-05, "loss": 0.1539, "step": 3581 }, { "epoch": 0.7518891687657431, "grad_norm": 0.11173707246780396, "learning_rate": 1.761335553177286e-05, "loss": 0.1585, "step": 3582 }, { "epoch": 0.7520990764063812, "grad_norm": 0.12978778779506683, "learning_rate": 1.75854487024056e-05, "loss": 0.1543, "step": 3583 }, { "epoch": 0.7523089840470193, "grad_norm": 0.11753234267234802, "learning_rate": 1.7557559280392806e-05, "loss": 0.1657, "step": 3584 }, { "epoch": 0.7525188916876574, "grad_norm": 0.11019283533096313, "learning_rate": 1.7529687280711747e-05, "loss": 0.16, "step": 3585 }, { "epoch": 0.7527287993282955, "grad_norm": 0.120496965944767, "learning_rate": 1.750183271833032e-05, "loss": 0.1477, "step": 3586 }, { "epoch": 0.7529387069689337, "grad_norm": 0.1287451982498169, "learning_rate": 1.7473995608207056e-05, "loss": 0.1575, "step": 3587 }, { "epoch": 0.7531486146095718, "grad_norm": 0.11368725448846817, "learning_rate": 1.744617596529111e-05, "loss": 0.1489, "step": 3588 }, { "epoch": 0.7533585222502099, "grad_norm": 0.12659108638763428, "learning_rate": 1.7418373804522293e-05, "loss": 0.1504, "step": 3589 }, { "epoch": 0.753568429890848, "grad_norm": 0.1425846368074417, "learning_rate": 1.7390589140830987e-05, "loss": 0.1535, "step": 3590 }, { "epoch": 0.7537783375314862, "grad_norm": 0.11686164885759354, "learning_rate": 1.7362821989138204e-05, "loss": 0.1522, "step": 3591 }, { "epoch": 0.7539882451721243, "grad_norm": 0.12282045185565948, "learning_rate": 1.7335072364355498e-05, "loss": 0.1387, "step": 3592 }, { "epoch": 0.7541981528127624, "grad_norm": 0.11782212555408478, "learning_rate": 1.7307340281385093e-05, "loss": 0.1715, "step": 3593 }, { "epoch": 0.7544080604534005, "grad_norm": 0.10659587383270264, "learning_rate": 1.7279625755119733e-05, "loss": 0.1621, "step": 3594 }, { "epoch": 0.7546179680940386, "grad_norm": 0.13287796080112457, "learning_rate": 1.7251928800442757e-05, "loss": 0.1571, "step": 3595 }, { "epoch": 0.7548278757346767, "grad_norm": 0.10035673528909683, "learning_rate": 1.7224249432228063e-05, "loss": 0.146, "step": 3596 }, { "epoch": 0.7550377833753149, "grad_norm": 0.12282916158437729, "learning_rate": 1.719658766534008e-05, "loss": 0.1582, "step": 3597 }, { "epoch": 0.755247691015953, "grad_norm": 0.10245541483163834, "learning_rate": 1.7168943514633863e-05, "loss": 0.1493, "step": 3598 }, { "epoch": 0.7554575986565911, "grad_norm": 0.14495441317558289, "learning_rate": 1.7141316994954904e-05, "loss": 0.158, "step": 3599 }, { "epoch": 0.7556675062972292, "grad_norm": 0.11242242157459259, "learning_rate": 1.7113708121139278e-05, "loss": 0.1492, "step": 3600 }, { "epoch": 0.7558774139378673, "grad_norm": 0.12160933762788773, "learning_rate": 1.7086116908013606e-05, "loss": 0.1626, "step": 3601 }, { "epoch": 0.7560873215785054, "grad_norm": 0.12219791114330292, "learning_rate": 1.7058543370394986e-05, "loss": 0.1785, "step": 3602 }, { "epoch": 0.7562972292191436, "grad_norm": 0.13026191294193268, "learning_rate": 1.7030987523091042e-05, "loss": 0.1561, "step": 3603 }, { "epoch": 0.7565071368597817, "grad_norm": 0.10388700664043427, "learning_rate": 1.700344938089989e-05, "loss": 0.158, "step": 3604 }, { "epoch": 0.7567170445004198, "grad_norm": 0.10586515814065933, "learning_rate": 1.697592895861014e-05, "loss": 0.1507, "step": 3605 }, { "epoch": 0.756926952141058, "grad_norm": 0.11730484664440155, "learning_rate": 1.694842627100089e-05, "loss": 0.1702, "step": 3606 }, { "epoch": 0.7571368597816961, "grad_norm": 0.1099337711930275, "learning_rate": 1.6920941332841706e-05, "loss": 0.1574, "step": 3607 }, { "epoch": 0.7573467674223342, "grad_norm": 0.11451583355665207, "learning_rate": 1.6893474158892636e-05, "loss": 0.1607, "step": 3608 }, { "epoch": 0.7575566750629723, "grad_norm": 0.10995732247829437, "learning_rate": 1.6866024763904158e-05, "loss": 0.1393, "step": 3609 }, { "epoch": 0.7577665827036104, "grad_norm": 0.1097119078040123, "learning_rate": 1.683859316261726e-05, "loss": 0.1537, "step": 3610 }, { "epoch": 0.7579764903442485, "grad_norm": 0.09406376630067825, "learning_rate": 1.6811179369763334e-05, "loss": 0.1495, "step": 3611 }, { "epoch": 0.7581863979848866, "grad_norm": 0.12109964340925217, "learning_rate": 1.6783783400064168e-05, "loss": 0.1702, "step": 3612 }, { "epoch": 0.7583963056255247, "grad_norm": 0.1099427118897438, "learning_rate": 1.6756405268232073e-05, "loss": 0.1639, "step": 3613 }, { "epoch": 0.7586062132661628, "grad_norm": 0.11676996946334839, "learning_rate": 1.672904498896971e-05, "loss": 0.1662, "step": 3614 }, { "epoch": 0.7588161209068011, "grad_norm": 0.1413038820028305, "learning_rate": 1.670170257697018e-05, "loss": 0.1614, "step": 3615 }, { "epoch": 0.7590260285474392, "grad_norm": 0.11984378844499588, "learning_rate": 1.6674378046916983e-05, "loss": 0.1526, "step": 3616 }, { "epoch": 0.7592359361880773, "grad_norm": 0.13852879405021667, "learning_rate": 1.6647071413483988e-05, "loss": 0.1705, "step": 3617 }, { "epoch": 0.7594458438287154, "grad_norm": 0.12046951800584793, "learning_rate": 1.6619782691335546e-05, "loss": 0.1518, "step": 3618 }, { "epoch": 0.7596557514693535, "grad_norm": 0.1124371588230133, "learning_rate": 1.6592511895126263e-05, "loss": 0.1517, "step": 3619 }, { "epoch": 0.7598656591099916, "grad_norm": 0.12519319355487823, "learning_rate": 1.6565259039501195e-05, "loss": 0.1564, "step": 3620 }, { "epoch": 0.7600755667506297, "grad_norm": 0.11457381397485733, "learning_rate": 1.6538024139095748e-05, "loss": 0.1466, "step": 3621 }, { "epoch": 0.7602854743912678, "grad_norm": 0.12174105644226074, "learning_rate": 1.65108072085357e-05, "loss": 0.1555, "step": 3622 }, { "epoch": 0.7604953820319059, "grad_norm": 0.11587201803922653, "learning_rate": 1.6483608262437154e-05, "loss": 0.1525, "step": 3623 }, { "epoch": 0.760705289672544, "grad_norm": 0.10113554447889328, "learning_rate": 1.645642731540657e-05, "loss": 0.1589, "step": 3624 }, { "epoch": 0.7609151973131822, "grad_norm": 0.12906788289546967, "learning_rate": 1.6429264382040737e-05, "loss": 0.1556, "step": 3625 }, { "epoch": 0.7611251049538204, "grad_norm": 0.1424580067396164, "learning_rate": 1.6402119476926765e-05, "loss": 0.1488, "step": 3626 }, { "epoch": 0.7613350125944585, "grad_norm": 0.12937603890895844, "learning_rate": 1.637499261464209e-05, "loss": 0.15, "step": 3627 }, { "epoch": 0.7615449202350966, "grad_norm": 0.1439439356327057, "learning_rate": 1.6347883809754455e-05, "loss": 0.1582, "step": 3628 }, { "epoch": 0.7617548278757347, "grad_norm": 0.15245719254016876, "learning_rate": 1.6320793076821893e-05, "loss": 0.1646, "step": 3629 }, { "epoch": 0.7619647355163728, "grad_norm": 0.11606140434741974, "learning_rate": 1.6293720430392783e-05, "loss": 0.1685, "step": 3630 }, { "epoch": 0.7621746431570109, "grad_norm": 0.1192961186170578, "learning_rate": 1.6266665885005738e-05, "loss": 0.1885, "step": 3631 }, { "epoch": 0.762384550797649, "grad_norm": 0.15543906390666962, "learning_rate": 1.6239629455189677e-05, "loss": 0.1428, "step": 3632 }, { "epoch": 0.7625944584382871, "grad_norm": 0.1320749819278717, "learning_rate": 1.621261115546373e-05, "loss": 0.1582, "step": 3633 }, { "epoch": 0.7628043660789253, "grad_norm": 0.1201024129986763, "learning_rate": 1.6185611000337397e-05, "loss": 0.1593, "step": 3634 }, { "epoch": 0.7630142737195634, "grad_norm": 0.13097235560417175, "learning_rate": 1.6158629004310362e-05, "loss": 0.1602, "step": 3635 }, { "epoch": 0.7632241813602015, "grad_norm": 0.14049144089221954, "learning_rate": 1.6131665181872567e-05, "loss": 0.1642, "step": 3636 }, { "epoch": 0.7634340890008396, "grad_norm": 0.11801209300756454, "learning_rate": 1.610471954750421e-05, "loss": 0.1708, "step": 3637 }, { "epoch": 0.7636439966414778, "grad_norm": 0.11156057566404343, "learning_rate": 1.6077792115675683e-05, "loss": 0.1614, "step": 3638 }, { "epoch": 0.7638539042821159, "grad_norm": 0.12610188126564026, "learning_rate": 1.605088290084769e-05, "loss": 0.1494, "step": 3639 }, { "epoch": 0.764063811922754, "grad_norm": 0.11332854628562927, "learning_rate": 1.602399191747104e-05, "loss": 0.1535, "step": 3640 }, { "epoch": 0.7642737195633921, "grad_norm": 0.12335386872291565, "learning_rate": 1.599711917998682e-05, "loss": 0.1569, "step": 3641 }, { "epoch": 0.7644836272040302, "grad_norm": 0.13249602913856506, "learning_rate": 1.5970264702826317e-05, "loss": 0.164, "step": 3642 }, { "epoch": 0.7646935348446684, "grad_norm": 0.10993882268667221, "learning_rate": 1.5943428500410986e-05, "loss": 0.1546, "step": 3643 }, { "epoch": 0.7649034424853065, "grad_norm": 0.11593450605869293, "learning_rate": 1.5916610587152485e-05, "loss": 0.1827, "step": 3644 }, { "epoch": 0.7651133501259446, "grad_norm": 0.1301153302192688, "learning_rate": 1.5889810977452652e-05, "loss": 0.1627, "step": 3645 }, { "epoch": 0.7653232577665827, "grad_norm": 0.13022619485855103, "learning_rate": 1.5863029685703477e-05, "loss": 0.158, "step": 3646 }, { "epoch": 0.7655331654072208, "grad_norm": 0.13365799188613892, "learning_rate": 1.583626672628714e-05, "loss": 0.1686, "step": 3647 }, { "epoch": 0.7657430730478589, "grad_norm": 0.12435203790664673, "learning_rate": 1.5809522113575946e-05, "loss": 0.1338, "step": 3648 }, { "epoch": 0.765952980688497, "grad_norm": 0.12281453609466553, "learning_rate": 1.5782795861932376e-05, "loss": 0.1623, "step": 3649 }, { "epoch": 0.7661628883291351, "grad_norm": 0.14482922852039337, "learning_rate": 1.5756087985709013e-05, "loss": 0.1542, "step": 3650 }, { "epoch": 0.7663727959697733, "grad_norm": 0.1194080114364624, "learning_rate": 1.5729398499248626e-05, "loss": 0.1378, "step": 3651 }, { "epoch": 0.7665827036104114, "grad_norm": 0.12180230766534805, "learning_rate": 1.570272741688408e-05, "loss": 0.1674, "step": 3652 }, { "epoch": 0.7667926112510496, "grad_norm": 0.1125471219420433, "learning_rate": 1.5676074752938337e-05, "loss": 0.1681, "step": 3653 }, { "epoch": 0.7670025188916877, "grad_norm": 0.13706545531749725, "learning_rate": 1.564944052172449e-05, "loss": 0.1549, "step": 3654 }, { "epoch": 0.7672124265323258, "grad_norm": 0.10688710957765579, "learning_rate": 1.5622824737545734e-05, "loss": 0.1571, "step": 3655 }, { "epoch": 0.7674223341729639, "grad_norm": 0.12797637283802032, "learning_rate": 1.5596227414695353e-05, "loss": 0.1723, "step": 3656 }, { "epoch": 0.767632241813602, "grad_norm": 0.12048980593681335, "learning_rate": 1.5569648567456718e-05, "loss": 0.1547, "step": 3657 }, { "epoch": 0.7678421494542401, "grad_norm": 0.1186976432800293, "learning_rate": 1.5543088210103257e-05, "loss": 0.1509, "step": 3658 }, { "epoch": 0.7680520570948782, "grad_norm": 0.1264502853155136, "learning_rate": 1.5516546356898527e-05, "loss": 0.1488, "step": 3659 }, { "epoch": 0.7682619647355163, "grad_norm": 0.11536482721567154, "learning_rate": 1.5490023022096097e-05, "loss": 0.1657, "step": 3660 }, { "epoch": 0.7684718723761544, "grad_norm": 0.12747111916542053, "learning_rate": 1.5463518219939584e-05, "loss": 0.1364, "step": 3661 }, { "epoch": 0.7686817800167927, "grad_norm": 0.12515799701213837, "learning_rate": 1.5437031964662662e-05, "loss": 0.1437, "step": 3662 }, { "epoch": 0.7688916876574308, "grad_norm": 0.10816127061843872, "learning_rate": 1.5410564270489102e-05, "loss": 0.1616, "step": 3663 }, { "epoch": 0.7691015952980689, "grad_norm": 0.12977659702301025, "learning_rate": 1.5384115151632628e-05, "loss": 0.1623, "step": 3664 }, { "epoch": 0.769311502938707, "grad_norm": 0.1383851319551468, "learning_rate": 1.535768462229703e-05, "loss": 0.1327, "step": 3665 }, { "epoch": 0.7695214105793451, "grad_norm": 0.11733153462409973, "learning_rate": 1.533127269667609e-05, "loss": 0.156, "step": 3666 }, { "epoch": 0.7697313182199832, "grad_norm": 0.12266400456428528, "learning_rate": 1.5304879388953663e-05, "loss": 0.1505, "step": 3667 }, { "epoch": 0.7699412258606213, "grad_norm": 0.13448049128055573, "learning_rate": 1.5278504713303505e-05, "loss": 0.1525, "step": 3668 }, { "epoch": 0.7701511335012594, "grad_norm": 0.12451707571744919, "learning_rate": 1.5252148683889439e-05, "loss": 0.1507, "step": 3669 }, { "epoch": 0.7703610411418975, "grad_norm": 0.12748034298419952, "learning_rate": 1.5225811314865241e-05, "loss": 0.1613, "step": 3670 }, { "epoch": 0.7705709487825357, "grad_norm": 0.11085929721593857, "learning_rate": 1.5199492620374705e-05, "loss": 0.1532, "step": 3671 }, { "epoch": 0.7707808564231738, "grad_norm": 0.11221485584974289, "learning_rate": 1.5173192614551562e-05, "loss": 0.1467, "step": 3672 }, { "epoch": 0.770990764063812, "grad_norm": 0.12698894739151, "learning_rate": 1.5146911311519518e-05, "loss": 0.1642, "step": 3673 }, { "epoch": 0.7712006717044501, "grad_norm": 0.12469375878572464, "learning_rate": 1.5120648725392223e-05, "loss": 0.1677, "step": 3674 }, { "epoch": 0.7714105793450882, "grad_norm": 0.13764609396457672, "learning_rate": 1.5094404870273293e-05, "loss": 0.166, "step": 3675 }, { "epoch": 0.7716204869857263, "grad_norm": 0.16062691807746887, "learning_rate": 1.5068179760256273e-05, "loss": 0.168, "step": 3676 }, { "epoch": 0.7718303946263644, "grad_norm": 0.11727982014417648, "learning_rate": 1.5041973409424653e-05, "loss": 0.1555, "step": 3677 }, { "epoch": 0.7720403022670025, "grad_norm": 0.11587007343769073, "learning_rate": 1.5015785831851837e-05, "loss": 0.1606, "step": 3678 }, { "epoch": 0.7722502099076406, "grad_norm": 0.10293179005384445, "learning_rate": 1.498961704160114e-05, "loss": 0.1486, "step": 3679 }, { "epoch": 0.7724601175482787, "grad_norm": 0.11385567486286163, "learning_rate": 1.4963467052725827e-05, "loss": 0.1658, "step": 3680 }, { "epoch": 0.7726700251889169, "grad_norm": 0.11889761686325073, "learning_rate": 1.4937335879269038e-05, "loss": 0.1651, "step": 3681 }, { "epoch": 0.772879932829555, "grad_norm": 0.10438660532236099, "learning_rate": 1.4911223535263774e-05, "loss": 0.1619, "step": 3682 }, { "epoch": 0.7730898404701931, "grad_norm": 0.11689276248216629, "learning_rate": 1.4885130034732992e-05, "loss": 0.1635, "step": 3683 }, { "epoch": 0.7732997481108312, "grad_norm": 0.10763343423604965, "learning_rate": 1.4859055391689497e-05, "loss": 0.1542, "step": 3684 }, { "epoch": 0.7735096557514693, "grad_norm": 0.11056458950042725, "learning_rate": 1.4832999620135957e-05, "loss": 0.1441, "step": 3685 }, { "epoch": 0.7737195633921075, "grad_norm": 0.13629314303398132, "learning_rate": 1.4806962734064917e-05, "loss": 0.1298, "step": 3686 }, { "epoch": 0.7739294710327456, "grad_norm": 0.12725608050823212, "learning_rate": 1.4780944747458774e-05, "loss": 0.1537, "step": 3687 }, { "epoch": 0.7741393786733837, "grad_norm": 0.1429300606250763, "learning_rate": 1.4754945674289817e-05, "loss": 0.1577, "step": 3688 }, { "epoch": 0.7743492863140218, "grad_norm": 0.12305251508951187, "learning_rate": 1.47289655285201e-05, "loss": 0.1615, "step": 3689 }, { "epoch": 0.77455919395466, "grad_norm": 0.13106046617031097, "learning_rate": 1.470300432410157e-05, "loss": 0.1672, "step": 3690 }, { "epoch": 0.7747691015952981, "grad_norm": 0.14498285949230194, "learning_rate": 1.4677062074975972e-05, "loss": 0.1691, "step": 3691 }, { "epoch": 0.7749790092359362, "grad_norm": 0.11447465419769287, "learning_rate": 1.465113879507492e-05, "loss": 0.1575, "step": 3692 }, { "epoch": 0.7751889168765743, "grad_norm": 0.14588482677936554, "learning_rate": 1.4625234498319784e-05, "loss": 0.1709, "step": 3693 }, { "epoch": 0.7753988245172124, "grad_norm": 0.12316252291202545, "learning_rate": 1.459934919862177e-05, "loss": 0.1603, "step": 3694 }, { "epoch": 0.7756087321578505, "grad_norm": 0.12461654841899872, "learning_rate": 1.457348290988187e-05, "loss": 0.1408, "step": 3695 }, { "epoch": 0.7758186397984886, "grad_norm": 0.10895299166440964, "learning_rate": 1.4547635645990875e-05, "loss": 0.1746, "step": 3696 }, { "epoch": 0.7760285474391267, "grad_norm": 0.11470239609479904, "learning_rate": 1.452180742082936e-05, "loss": 0.1567, "step": 3697 }, { "epoch": 0.7762384550797649, "grad_norm": 0.1124269962310791, "learning_rate": 1.4495998248267662e-05, "loss": 0.1804, "step": 3698 }, { "epoch": 0.7764483627204031, "grad_norm": 0.11764495819807053, "learning_rate": 1.4470208142165892e-05, "loss": 0.1562, "step": 3699 }, { "epoch": 0.7766582703610412, "grad_norm": 0.1293368935585022, "learning_rate": 1.444443711637395e-05, "loss": 0.1509, "step": 3700 }, { "epoch": 0.7768681780016793, "grad_norm": 0.11314057558774948, "learning_rate": 1.4418685184731462e-05, "loss": 0.1511, "step": 3701 }, { "epoch": 0.7770780856423174, "grad_norm": 0.12856042385101318, "learning_rate": 1.4392952361067785e-05, "loss": 0.1564, "step": 3702 }, { "epoch": 0.7772879932829555, "grad_norm": 0.1180645078420639, "learning_rate": 1.436723865920202e-05, "loss": 0.1642, "step": 3703 }, { "epoch": 0.7774979009235936, "grad_norm": 0.11847218871116638, "learning_rate": 1.4341544092943055e-05, "loss": 0.149, "step": 3704 }, { "epoch": 0.7777078085642317, "grad_norm": 0.12634143233299255, "learning_rate": 1.4315868676089427e-05, "loss": 0.166, "step": 3705 }, { "epoch": 0.7779177162048698, "grad_norm": 0.10073088109493256, "learning_rate": 1.4290212422429444e-05, "loss": 0.1506, "step": 3706 }, { "epoch": 0.7781276238455079, "grad_norm": 0.12493392080068588, "learning_rate": 1.4264575345741065e-05, "loss": 0.1443, "step": 3707 }, { "epoch": 0.7783375314861462, "grad_norm": 0.15417474508285522, "learning_rate": 1.4238957459792041e-05, "loss": 0.1462, "step": 3708 }, { "epoch": 0.7785474391267843, "grad_norm": 0.10922432690858841, "learning_rate": 1.4213358778339719e-05, "loss": 0.154, "step": 3709 }, { "epoch": 0.7787573467674224, "grad_norm": 0.11463958770036697, "learning_rate": 1.4187779315131184e-05, "loss": 0.1524, "step": 3710 }, { "epoch": 0.7789672544080605, "grad_norm": 0.11828161031007767, "learning_rate": 1.4162219083903172e-05, "loss": 0.1409, "step": 3711 }, { "epoch": 0.7791771620486986, "grad_norm": 0.09642691910266876, "learning_rate": 1.4136678098382156e-05, "loss": 0.1422, "step": 3712 }, { "epoch": 0.7793870696893367, "grad_norm": 0.12066272646188736, "learning_rate": 1.4111156372284207e-05, "loss": 0.1521, "step": 3713 }, { "epoch": 0.7795969773299748, "grad_norm": 0.12618005275726318, "learning_rate": 1.4085653919315072e-05, "loss": 0.1534, "step": 3714 }, { "epoch": 0.7798068849706129, "grad_norm": 0.14962835609912872, "learning_rate": 1.4060170753170154e-05, "loss": 0.1598, "step": 3715 }, { "epoch": 0.780016792611251, "grad_norm": 0.12119120359420776, "learning_rate": 1.4034706887534493e-05, "loss": 0.1585, "step": 3716 }, { "epoch": 0.7802267002518891, "grad_norm": 0.1072816252708435, "learning_rate": 1.4009262336082767e-05, "loss": 0.153, "step": 3717 }, { "epoch": 0.7804366078925273, "grad_norm": 0.1178685873746872, "learning_rate": 1.3983837112479282e-05, "loss": 0.1649, "step": 3718 }, { "epoch": 0.7806465155331654, "grad_norm": 0.11183042824268341, "learning_rate": 1.3958431230377966e-05, "loss": 0.1508, "step": 3719 }, { "epoch": 0.7808564231738035, "grad_norm": 0.12451529502868652, "learning_rate": 1.3933044703422337e-05, "loss": 0.1617, "step": 3720 }, { "epoch": 0.7810663308144417, "grad_norm": 0.132112517952919, "learning_rate": 1.3907677545245579e-05, "loss": 0.1497, "step": 3721 }, { "epoch": 0.7812762384550798, "grad_norm": 0.12987057864665985, "learning_rate": 1.3882329769470425e-05, "loss": 0.1535, "step": 3722 }, { "epoch": 0.7814861460957179, "grad_norm": 0.11654311418533325, "learning_rate": 1.3857001389709174e-05, "loss": 0.1488, "step": 3723 }, { "epoch": 0.781696053736356, "grad_norm": 0.12595154345035553, "learning_rate": 1.3831692419563786e-05, "loss": 0.1533, "step": 3724 }, { "epoch": 0.7819059613769941, "grad_norm": 0.14091216027736664, "learning_rate": 1.380640287262574e-05, "loss": 0.1734, "step": 3725 }, { "epoch": 0.7821158690176322, "grad_norm": 0.14180079102516174, "learning_rate": 1.3781132762476096e-05, "loss": 0.1464, "step": 3726 }, { "epoch": 0.7823257766582704, "grad_norm": 0.12052421271800995, "learning_rate": 1.3755882102685486e-05, "loss": 0.1564, "step": 3727 }, { "epoch": 0.7825356842989085, "grad_norm": 0.1349646896123886, "learning_rate": 1.373065090681408e-05, "loss": 0.1486, "step": 3728 }, { "epoch": 0.7827455919395466, "grad_norm": 0.14333775639533997, "learning_rate": 1.3705439188411645e-05, "loss": 0.1527, "step": 3729 }, { "epoch": 0.7829554995801847, "grad_norm": 0.13959290087223053, "learning_rate": 1.3680246961017401e-05, "loss": 0.1659, "step": 3730 }, { "epoch": 0.7831654072208228, "grad_norm": 0.11381187289953232, "learning_rate": 1.3655074238160175e-05, "loss": 0.1587, "step": 3731 }, { "epoch": 0.783375314861461, "grad_norm": 0.14192262291908264, "learning_rate": 1.3629921033358272e-05, "loss": 0.1367, "step": 3732 }, { "epoch": 0.783585222502099, "grad_norm": 0.10846276581287384, "learning_rate": 1.3604787360119574e-05, "loss": 0.1476, "step": 3733 }, { "epoch": 0.7837951301427372, "grad_norm": 0.10947103053331375, "learning_rate": 1.3579673231941415e-05, "loss": 0.1479, "step": 3734 }, { "epoch": 0.7840050377833753, "grad_norm": 0.11812829971313477, "learning_rate": 1.3554578662310657e-05, "loss": 0.1748, "step": 3735 }, { "epoch": 0.7842149454240135, "grad_norm": 0.13791900873184204, "learning_rate": 1.3529503664703664e-05, "loss": 0.1586, "step": 3736 }, { "epoch": 0.7844248530646516, "grad_norm": 0.12275446206331253, "learning_rate": 1.3504448252586272e-05, "loss": 0.1587, "step": 3737 }, { "epoch": 0.7846347607052897, "grad_norm": 0.1119173988699913, "learning_rate": 1.347941243941382e-05, "loss": 0.1353, "step": 3738 }, { "epoch": 0.7848446683459278, "grad_norm": 0.1312474012374878, "learning_rate": 1.3454396238631101e-05, "loss": 0.1613, "step": 3739 }, { "epoch": 0.7850545759865659, "grad_norm": 0.1218884214758873, "learning_rate": 1.3429399663672382e-05, "loss": 0.1622, "step": 3740 }, { "epoch": 0.785264483627204, "grad_norm": 0.12461695820093155, "learning_rate": 1.3404422727961413e-05, "loss": 0.1413, "step": 3741 }, { "epoch": 0.7854743912678421, "grad_norm": 0.12566153705120087, "learning_rate": 1.3379465444911376e-05, "loss": 0.1357, "step": 3742 }, { "epoch": 0.7856842989084802, "grad_norm": 0.10288352519273758, "learning_rate": 1.3354527827924912e-05, "loss": 0.1728, "step": 3743 }, { "epoch": 0.7858942065491183, "grad_norm": 0.11198544502258301, "learning_rate": 1.3329609890394046e-05, "loss": 0.1342, "step": 3744 }, { "epoch": 0.7861041141897565, "grad_norm": 0.10878617316484451, "learning_rate": 1.3304711645700324e-05, "loss": 0.1562, "step": 3745 }, { "epoch": 0.7863140218303947, "grad_norm": 0.13761012256145477, "learning_rate": 1.327983310721465e-05, "loss": 0.1676, "step": 3746 }, { "epoch": 0.7865239294710328, "grad_norm": 0.10911747813224792, "learning_rate": 1.3254974288297384e-05, "loss": 0.1426, "step": 3747 }, { "epoch": 0.7867338371116709, "grad_norm": 0.10680964589118958, "learning_rate": 1.3230135202298256e-05, "loss": 0.1717, "step": 3748 }, { "epoch": 0.786943744752309, "grad_norm": 0.12579379975795746, "learning_rate": 1.3205315862556444e-05, "loss": 0.1505, "step": 3749 }, { "epoch": 0.7871536523929471, "grad_norm": 0.11058539152145386, "learning_rate": 1.3180516282400513e-05, "loss": 0.1435, "step": 3750 }, { "epoch": 0.7873635600335852, "grad_norm": 0.14114023745059967, "learning_rate": 1.3155736475148372e-05, "loss": 0.1517, "step": 3751 }, { "epoch": 0.7875734676742233, "grad_norm": 0.11331699043512344, "learning_rate": 1.3130976454107336e-05, "loss": 0.1407, "step": 3752 }, { "epoch": 0.7877833753148614, "grad_norm": 0.1149875819683075, "learning_rate": 1.3106236232574138e-05, "loss": 0.1596, "step": 3753 }, { "epoch": 0.7879932829554995, "grad_norm": 0.13585644960403442, "learning_rate": 1.3081515823834828e-05, "loss": 0.1616, "step": 3754 }, { "epoch": 0.7882031905961377, "grad_norm": 0.12654295563697815, "learning_rate": 1.3056815241164826e-05, "loss": 0.166, "step": 3755 }, { "epoch": 0.7884130982367759, "grad_norm": 0.1386784017086029, "learning_rate": 1.3032134497828913e-05, "loss": 0.1668, "step": 3756 }, { "epoch": 0.788623005877414, "grad_norm": 0.14663977921009064, "learning_rate": 1.300747360708121e-05, "loss": 0.1635, "step": 3757 }, { "epoch": 0.7888329135180521, "grad_norm": 0.14930498600006104, "learning_rate": 1.2982832582165177e-05, "loss": 0.1521, "step": 3758 }, { "epoch": 0.7890428211586902, "grad_norm": 0.10799439996480942, "learning_rate": 1.295821143631361e-05, "loss": 0.1494, "step": 3759 }, { "epoch": 0.7892527287993283, "grad_norm": 0.1401253193616867, "learning_rate": 1.2933610182748629e-05, "loss": 0.1447, "step": 3760 }, { "epoch": 0.7894626364399664, "grad_norm": 0.12463638931512833, "learning_rate": 1.2909028834681646e-05, "loss": 0.1575, "step": 3761 }, { "epoch": 0.7896725440806045, "grad_norm": 0.10640936344861984, "learning_rate": 1.2884467405313444e-05, "loss": 0.1617, "step": 3762 }, { "epoch": 0.7898824517212426, "grad_norm": 0.11918281018733978, "learning_rate": 1.2859925907834053e-05, "loss": 0.1667, "step": 3763 }, { "epoch": 0.7900923593618808, "grad_norm": 0.12711642682552338, "learning_rate": 1.2835404355422814e-05, "loss": 0.1687, "step": 3764 }, { "epoch": 0.7903022670025189, "grad_norm": 0.11989998072385788, "learning_rate": 1.2810902761248373e-05, "loss": 0.1669, "step": 3765 }, { "epoch": 0.790512174643157, "grad_norm": 0.15876778960227966, "learning_rate": 1.2786421138468636e-05, "loss": 0.1621, "step": 3766 }, { "epoch": 0.7907220822837951, "grad_norm": 0.11356843262910843, "learning_rate": 1.2761959500230791e-05, "loss": 0.1512, "step": 3767 }, { "epoch": 0.7909319899244333, "grad_norm": 0.1321927160024643, "learning_rate": 1.2737517859671305e-05, "loss": 0.1434, "step": 3768 }, { "epoch": 0.7911418975650714, "grad_norm": 0.1526881903409958, "learning_rate": 1.2713096229915883e-05, "loss": 0.1544, "step": 3769 }, { "epoch": 0.7913518052057095, "grad_norm": 0.12897861003875732, "learning_rate": 1.2688694624079516e-05, "loss": 0.1725, "step": 3770 }, { "epoch": 0.7915617128463476, "grad_norm": 0.1343359798192978, "learning_rate": 1.2664313055266436e-05, "loss": 0.1426, "step": 3771 }, { "epoch": 0.7917716204869857, "grad_norm": 0.1096104085445404, "learning_rate": 1.2639951536570066e-05, "loss": 0.1809, "step": 3772 }, { "epoch": 0.7919815281276238, "grad_norm": 0.1288156807422638, "learning_rate": 1.2615610081073104e-05, "loss": 0.1642, "step": 3773 }, { "epoch": 0.792191435768262, "grad_norm": 0.1227651983499527, "learning_rate": 1.2591288701847487e-05, "loss": 0.1484, "step": 3774 }, { "epoch": 0.7924013434089001, "grad_norm": 0.1349715292453766, "learning_rate": 1.2566987411954345e-05, "loss": 0.1542, "step": 3775 }, { "epoch": 0.7926112510495382, "grad_norm": 0.1394587904214859, "learning_rate": 1.2542706224444023e-05, "loss": 0.1617, "step": 3776 }, { "epoch": 0.7928211586901763, "grad_norm": 0.11542592942714691, "learning_rate": 1.2518445152356062e-05, "loss": 0.1578, "step": 3777 }, { "epoch": 0.7930310663308144, "grad_norm": 0.11972792446613312, "learning_rate": 1.2494204208719251e-05, "loss": 0.1503, "step": 3778 }, { "epoch": 0.7932409739714525, "grad_norm": 0.11570564657449722, "learning_rate": 1.2469983406551484e-05, "loss": 0.1738, "step": 3779 }, { "epoch": 0.7934508816120907, "grad_norm": 0.12285133451223373, "learning_rate": 1.2445782758859909e-05, "loss": 0.1399, "step": 3780 }, { "epoch": 0.7936607892527288, "grad_norm": 0.12845146656036377, "learning_rate": 1.2421602278640804e-05, "loss": 0.1371, "step": 3781 }, { "epoch": 0.7938706968933669, "grad_norm": 0.12982353568077087, "learning_rate": 1.2397441978879676e-05, "loss": 0.1551, "step": 3782 }, { "epoch": 0.7940806045340051, "grad_norm": 0.12580209970474243, "learning_rate": 1.2373301872551134e-05, "loss": 0.1566, "step": 3783 }, { "epoch": 0.7942905121746432, "grad_norm": 0.18834897875785828, "learning_rate": 1.2349181972618973e-05, "loss": 0.1583, "step": 3784 }, { "epoch": 0.7945004198152813, "grad_norm": 0.11271581053733826, "learning_rate": 1.2325082292036127e-05, "loss": 0.146, "step": 3785 }, { "epoch": 0.7947103274559194, "grad_norm": 0.12085813283920288, "learning_rate": 1.230100284374468e-05, "loss": 0.1465, "step": 3786 }, { "epoch": 0.7949202350965575, "grad_norm": 0.11245040595531464, "learning_rate": 1.2276943640675842e-05, "loss": 0.1355, "step": 3787 }, { "epoch": 0.7951301427371956, "grad_norm": 0.14471422135829926, "learning_rate": 1.2252904695749951e-05, "loss": 0.1607, "step": 3788 }, { "epoch": 0.7953400503778337, "grad_norm": 0.11694999784231186, "learning_rate": 1.2228886021876463e-05, "loss": 0.1618, "step": 3789 }, { "epoch": 0.7955499580184718, "grad_norm": 0.10486170649528503, "learning_rate": 1.2204887631953975e-05, "loss": 0.1332, "step": 3790 }, { "epoch": 0.7957598656591099, "grad_norm": 0.10166893154382706, "learning_rate": 1.2180909538870177e-05, "loss": 0.1565, "step": 3791 }, { "epoch": 0.7959697732997482, "grad_norm": 0.13252127170562744, "learning_rate": 1.2156951755501822e-05, "loss": 0.1525, "step": 3792 }, { "epoch": 0.7961796809403863, "grad_norm": 0.13535131514072418, "learning_rate": 1.213301429471479e-05, "loss": 0.1439, "step": 3793 }, { "epoch": 0.7963895885810244, "grad_norm": 0.1082453727722168, "learning_rate": 1.2109097169364064e-05, "loss": 0.1517, "step": 3794 }, { "epoch": 0.7965994962216625, "grad_norm": 0.11083705723285675, "learning_rate": 1.2085200392293683e-05, "loss": 0.1339, "step": 3795 }, { "epoch": 0.7968094038623006, "grad_norm": 0.12013557553291321, "learning_rate": 1.2061323976336752e-05, "loss": 0.1669, "step": 3796 }, { "epoch": 0.7970193115029387, "grad_norm": 0.12751804292201996, "learning_rate": 1.203746793431546e-05, "loss": 0.1488, "step": 3797 }, { "epoch": 0.7972292191435768, "grad_norm": 0.1310308873653412, "learning_rate": 1.2013632279041042e-05, "loss": 0.1618, "step": 3798 }, { "epoch": 0.7974391267842149, "grad_norm": 0.12405902147293091, "learning_rate": 1.1989817023313792e-05, "loss": 0.1654, "step": 3799 }, { "epoch": 0.797649034424853, "grad_norm": 0.11220216006040573, "learning_rate": 1.1966022179923043e-05, "loss": 0.1552, "step": 3800 }, { "epoch": 0.7978589420654912, "grad_norm": 0.11725794523954391, "learning_rate": 1.1942247761647174e-05, "loss": 0.1732, "step": 3801 }, { "epoch": 0.7980688497061293, "grad_norm": 0.11368782073259354, "learning_rate": 1.191849378125357e-05, "loss": 0.168, "step": 3802 }, { "epoch": 0.7982787573467675, "grad_norm": 0.13916854560375214, "learning_rate": 1.1894760251498699e-05, "loss": 0.1479, "step": 3803 }, { "epoch": 0.7984886649874056, "grad_norm": 0.10956691950559616, "learning_rate": 1.1871047185127987e-05, "loss": 0.146, "step": 3804 }, { "epoch": 0.7986985726280437, "grad_norm": 0.1180625706911087, "learning_rate": 1.1847354594875893e-05, "loss": 0.156, "step": 3805 }, { "epoch": 0.7989084802686818, "grad_norm": 0.11851587146520615, "learning_rate": 1.1823682493465876e-05, "loss": 0.1718, "step": 3806 }, { "epoch": 0.7991183879093199, "grad_norm": 0.10954640805721283, "learning_rate": 1.1800030893610404e-05, "loss": 0.1582, "step": 3807 }, { "epoch": 0.799328295549958, "grad_norm": 0.16035374999046326, "learning_rate": 1.1776399808010924e-05, "loss": 0.1674, "step": 3808 }, { "epoch": 0.7995382031905961, "grad_norm": 0.12425439804792404, "learning_rate": 1.1752789249357865e-05, "loss": 0.1442, "step": 3809 }, { "epoch": 0.7997481108312342, "grad_norm": 0.12865933775901794, "learning_rate": 1.1729199230330617e-05, "loss": 0.1663, "step": 3810 }, { "epoch": 0.7999580184718724, "grad_norm": 0.11566450446844101, "learning_rate": 1.1705629763597603e-05, "loss": 0.1532, "step": 3811 }, { "epoch": 0.8001679261125105, "grad_norm": 0.1077750027179718, "learning_rate": 1.1682080861816152e-05, "loss": 0.1584, "step": 3812 }, { "epoch": 0.8003778337531486, "grad_norm": 0.11821168661117554, "learning_rate": 1.165855253763254e-05, "loss": 0.1609, "step": 3813 }, { "epoch": 0.8005877413937867, "grad_norm": 0.11459922045469284, "learning_rate": 1.1635044803682011e-05, "loss": 0.1458, "step": 3814 }, { "epoch": 0.8007976490344249, "grad_norm": 0.11377918720245361, "learning_rate": 1.1611557672588785e-05, "loss": 0.1485, "step": 3815 }, { "epoch": 0.801007556675063, "grad_norm": 0.1579093039035797, "learning_rate": 1.1588091156965975e-05, "loss": 0.1708, "step": 3816 }, { "epoch": 0.8012174643157011, "grad_norm": 0.1275099813938141, "learning_rate": 1.1564645269415637e-05, "loss": 0.1549, "step": 3817 }, { "epoch": 0.8014273719563392, "grad_norm": 0.1207624152302742, "learning_rate": 1.1541220022528725e-05, "loss": 0.1688, "step": 3818 }, { "epoch": 0.8016372795969773, "grad_norm": 0.13440079987049103, "learning_rate": 1.1517815428885186e-05, "loss": 0.1575, "step": 3819 }, { "epoch": 0.8018471872376155, "grad_norm": 0.1259511113166809, "learning_rate": 1.149443150105377e-05, "loss": 0.1534, "step": 3820 }, { "epoch": 0.8020570948782536, "grad_norm": 0.10427981615066528, "learning_rate": 1.1471068251592204e-05, "loss": 0.148, "step": 3821 }, { "epoch": 0.8022670025188917, "grad_norm": 0.10866251587867737, "learning_rate": 1.1447725693047062e-05, "loss": 0.1467, "step": 3822 }, { "epoch": 0.8024769101595298, "grad_norm": 0.11648157238960266, "learning_rate": 1.1424403837953862e-05, "loss": 0.1399, "step": 3823 }, { "epoch": 0.8026868178001679, "grad_norm": 0.12202499806880951, "learning_rate": 1.1401102698836968e-05, "loss": 0.1513, "step": 3824 }, { "epoch": 0.802896725440806, "grad_norm": 0.1321491301059723, "learning_rate": 1.1377822288209611e-05, "loss": 0.1674, "step": 3825 }, { "epoch": 0.8031066330814441, "grad_norm": 0.11231104284524918, "learning_rate": 1.135456261857391e-05, "loss": 0.1501, "step": 3826 }, { "epoch": 0.8033165407220823, "grad_norm": 0.14203539490699768, "learning_rate": 1.1331323702420842e-05, "loss": 0.1549, "step": 3827 }, { "epoch": 0.8035264483627204, "grad_norm": 0.11880168318748474, "learning_rate": 1.1308105552230231e-05, "loss": 0.1549, "step": 3828 }, { "epoch": 0.8037363560033586, "grad_norm": 0.11236574500799179, "learning_rate": 1.1284908180470748e-05, "loss": 0.1486, "step": 3829 }, { "epoch": 0.8039462636439967, "grad_norm": 0.12606748938560486, "learning_rate": 1.1261731599599911e-05, "loss": 0.1531, "step": 3830 }, { "epoch": 0.8041561712846348, "grad_norm": 0.13274842500686646, "learning_rate": 1.1238575822064096e-05, "loss": 0.1598, "step": 3831 }, { "epoch": 0.8043660789252729, "grad_norm": 0.1181090697646141, "learning_rate": 1.1215440860298465e-05, "loss": 0.1613, "step": 3832 }, { "epoch": 0.804575986565911, "grad_norm": 0.12809668481349945, "learning_rate": 1.1192326726727042e-05, "loss": 0.1592, "step": 3833 }, { "epoch": 0.8047858942065491, "grad_norm": 0.11602126806974411, "learning_rate": 1.11692334337626e-05, "loss": 0.1514, "step": 3834 }, { "epoch": 0.8049958018471872, "grad_norm": 0.12484683841466904, "learning_rate": 1.1146160993806804e-05, "loss": 0.1589, "step": 3835 }, { "epoch": 0.8052057094878253, "grad_norm": 0.1220654621720314, "learning_rate": 1.1123109419250072e-05, "loss": 0.1499, "step": 3836 }, { "epoch": 0.8054156171284634, "grad_norm": 0.12533478438854218, "learning_rate": 1.1100078722471618e-05, "loss": 0.157, "step": 3837 }, { "epoch": 0.8056255247691015, "grad_norm": 0.15392319858074188, "learning_rate": 1.1077068915839467e-05, "loss": 0.1609, "step": 3838 }, { "epoch": 0.8058354324097398, "grad_norm": 0.16898441314697266, "learning_rate": 1.1054080011710383e-05, "loss": 0.1617, "step": 3839 }, { "epoch": 0.8060453400503779, "grad_norm": 0.1096329316496849, "learning_rate": 1.1031112022429984e-05, "loss": 0.1393, "step": 3840 }, { "epoch": 0.806255247691016, "grad_norm": 0.1226249560713768, "learning_rate": 1.1008164960332556e-05, "loss": 0.1633, "step": 3841 }, { "epoch": 0.8064651553316541, "grad_norm": 0.10499352961778641, "learning_rate": 1.0985238837741213e-05, "loss": 0.1456, "step": 3842 }, { "epoch": 0.8066750629722922, "grad_norm": 0.14240071177482605, "learning_rate": 1.0962333666967789e-05, "loss": 0.1511, "step": 3843 }, { "epoch": 0.8068849706129303, "grad_norm": 0.11764491349458694, "learning_rate": 1.0939449460312918e-05, "loss": 0.1681, "step": 3844 }, { "epoch": 0.8070948782535684, "grad_norm": 0.1263282299041748, "learning_rate": 1.0916586230065922e-05, "loss": 0.135, "step": 3845 }, { "epoch": 0.8073047858942065, "grad_norm": 0.14005941152572632, "learning_rate": 1.0893743988504884e-05, "loss": 0.1527, "step": 3846 }, { "epoch": 0.8075146935348446, "grad_norm": 0.1181284487247467, "learning_rate": 1.08709227478966e-05, "loss": 0.1388, "step": 3847 }, { "epoch": 0.8077246011754828, "grad_norm": 0.10536040365695953, "learning_rate": 1.0848122520496607e-05, "loss": 0.1736, "step": 3848 }, { "epoch": 0.8079345088161209, "grad_norm": 0.10224374383687973, "learning_rate": 1.0825343318549141e-05, "loss": 0.1439, "step": 3849 }, { "epoch": 0.808144416456759, "grad_norm": 0.10331021994352341, "learning_rate": 1.080258515428716e-05, "loss": 0.1465, "step": 3850 }, { "epoch": 0.8083543240973972, "grad_norm": 0.1074858009815216, "learning_rate": 1.07798480399323e-05, "loss": 0.1531, "step": 3851 }, { "epoch": 0.8085642317380353, "grad_norm": 0.1292872577905655, "learning_rate": 1.075713198769494e-05, "loss": 0.1535, "step": 3852 }, { "epoch": 0.8087741393786734, "grad_norm": 0.1513303965330124, "learning_rate": 1.0734437009774106e-05, "loss": 0.1667, "step": 3853 }, { "epoch": 0.8089840470193115, "grad_norm": 0.12170423567295074, "learning_rate": 1.0711763118357527e-05, "loss": 0.16, "step": 3854 }, { "epoch": 0.8091939546599496, "grad_norm": 0.12069430202245712, "learning_rate": 1.068911032562157e-05, "loss": 0.1503, "step": 3855 }, { "epoch": 0.8094038623005877, "grad_norm": 0.11853886395692825, "learning_rate": 1.0666478643731332e-05, "loss": 0.1626, "step": 3856 }, { "epoch": 0.8096137699412259, "grad_norm": 0.1364051252603531, "learning_rate": 1.0643868084840542e-05, "loss": 0.1507, "step": 3857 }, { "epoch": 0.809823677581864, "grad_norm": 0.11953744292259216, "learning_rate": 1.0621278661091571e-05, "loss": 0.1444, "step": 3858 }, { "epoch": 0.8100335852225021, "grad_norm": 0.14346711337566376, "learning_rate": 1.0598710384615457e-05, "loss": 0.1534, "step": 3859 }, { "epoch": 0.8102434928631402, "grad_norm": 0.14622189104557037, "learning_rate": 1.0576163267531901e-05, "loss": 0.1648, "step": 3860 }, { "epoch": 0.8104534005037783, "grad_norm": 0.1627495288848877, "learning_rate": 1.0553637321949217e-05, "loss": 0.1608, "step": 3861 }, { "epoch": 0.8106633081444165, "grad_norm": 0.10121377557516098, "learning_rate": 1.0531132559964329e-05, "loss": 0.1524, "step": 3862 }, { "epoch": 0.8108732157850546, "grad_norm": 0.13579495251178741, "learning_rate": 1.0508648993662806e-05, "loss": 0.1506, "step": 3863 }, { "epoch": 0.8110831234256927, "grad_norm": 0.14970248937606812, "learning_rate": 1.0486186635118866e-05, "loss": 0.16, "step": 3864 }, { "epoch": 0.8112930310663308, "grad_norm": 0.14687329530715942, "learning_rate": 1.0463745496395294e-05, "loss": 0.1777, "step": 3865 }, { "epoch": 0.8115029387069689, "grad_norm": 0.1286310851573944, "learning_rate": 1.0441325589543494e-05, "loss": 0.1405, "step": 3866 }, { "epoch": 0.8117128463476071, "grad_norm": 0.12971775233745575, "learning_rate": 1.0418926926603467e-05, "loss": 0.1536, "step": 3867 }, { "epoch": 0.8119227539882452, "grad_norm": 0.11588292568922043, "learning_rate": 1.039654951960381e-05, "loss": 0.1585, "step": 3868 }, { "epoch": 0.8121326616288833, "grad_norm": 0.11025537550449371, "learning_rate": 1.0374193380561704e-05, "loss": 0.158, "step": 3869 }, { "epoch": 0.8123425692695214, "grad_norm": 0.11403115838766098, "learning_rate": 1.0351858521482893e-05, "loss": 0.1537, "step": 3870 }, { "epoch": 0.8125524769101595, "grad_norm": 0.11629325896501541, "learning_rate": 1.0329544954361709e-05, "loss": 0.1495, "step": 3871 }, { "epoch": 0.8127623845507976, "grad_norm": 0.12189958244562149, "learning_rate": 1.0307252691181063e-05, "loss": 0.166, "step": 3872 }, { "epoch": 0.8129722921914357, "grad_norm": 0.11428286880254745, "learning_rate": 1.0284981743912397e-05, "loss": 0.1617, "step": 3873 }, { "epoch": 0.8131821998320738, "grad_norm": 0.13367648422718048, "learning_rate": 1.0262732124515729e-05, "loss": 0.1518, "step": 3874 }, { "epoch": 0.813392107472712, "grad_norm": 0.09968928247690201, "learning_rate": 1.0240503844939603e-05, "loss": 0.1431, "step": 3875 }, { "epoch": 0.8136020151133502, "grad_norm": 0.12892545759677887, "learning_rate": 1.0218296917121111e-05, "loss": 0.1501, "step": 3876 }, { "epoch": 0.8138119227539883, "grad_norm": 0.14232294261455536, "learning_rate": 1.0196111352985887e-05, "loss": 0.1384, "step": 3877 }, { "epoch": 0.8140218303946264, "grad_norm": 0.12474681437015533, "learning_rate": 1.0173947164448089e-05, "loss": 0.1441, "step": 3878 }, { "epoch": 0.8142317380352645, "grad_norm": 0.11398620903491974, "learning_rate": 1.0151804363410383e-05, "loss": 0.1494, "step": 3879 }, { "epoch": 0.8144416456759026, "grad_norm": 0.1219918429851532, "learning_rate": 1.0129682961763948e-05, "loss": 0.1552, "step": 3880 }, { "epoch": 0.8146515533165407, "grad_norm": 0.12299709022045135, "learning_rate": 1.0107582971388524e-05, "loss": 0.1359, "step": 3881 }, { "epoch": 0.8148614609571788, "grad_norm": 0.12771455943584442, "learning_rate": 1.0085504404152273e-05, "loss": 0.1404, "step": 3882 }, { "epoch": 0.8150713685978169, "grad_norm": 0.12867510318756104, "learning_rate": 1.0063447271911897e-05, "loss": 0.1677, "step": 3883 }, { "epoch": 0.815281276238455, "grad_norm": 0.12489780783653259, "learning_rate": 1.0041411586512572e-05, "loss": 0.1458, "step": 3884 }, { "epoch": 0.8154911838790933, "grad_norm": 0.11988552659749985, "learning_rate": 1.0019397359787991e-05, "loss": 0.1367, "step": 3885 }, { "epoch": 0.8157010915197314, "grad_norm": 0.11586282402276993, "learning_rate": 9.997404603560278e-06, "loss": 0.1635, "step": 3886 }, { "epoch": 0.8159109991603695, "grad_norm": 0.11998885124921799, "learning_rate": 9.975433329640054e-06, "loss": 0.1705, "step": 3887 }, { "epoch": 0.8161209068010076, "grad_norm": 0.12188997864723206, "learning_rate": 9.953483549826387e-06, "loss": 0.1435, "step": 3888 }, { "epoch": 0.8163308144416457, "grad_norm": 0.12801359593868256, "learning_rate": 9.931555275906812e-06, "loss": 0.155, "step": 3889 }, { "epoch": 0.8165407220822838, "grad_norm": 0.12145894020795822, "learning_rate": 9.909648519657316e-06, "loss": 0.1516, "step": 3890 }, { "epoch": 0.8167506297229219, "grad_norm": 0.13459019362926483, "learning_rate": 9.887763292842323e-06, "loss": 0.1392, "step": 3891 }, { "epoch": 0.81696053736356, "grad_norm": 0.10860946029424667, "learning_rate": 9.86589960721469e-06, "loss": 0.1655, "step": 3892 }, { "epoch": 0.8171704450041981, "grad_norm": 0.15670441091060638, "learning_rate": 9.844057474515745e-06, "loss": 0.1706, "step": 3893 }, { "epoch": 0.8173803526448362, "grad_norm": 0.10645842552185059, "learning_rate": 9.822236906475191e-06, "loss": 0.154, "step": 3894 }, { "epoch": 0.8175902602854744, "grad_norm": 0.11920606344938278, "learning_rate": 9.800437914811189e-06, "loss": 0.1782, "step": 3895 }, { "epoch": 0.8178001679261125, "grad_norm": 0.11042717099189758, "learning_rate": 9.778660511230253e-06, "loss": 0.1434, "step": 3896 }, { "epoch": 0.8180100755667506, "grad_norm": 0.13635507225990295, "learning_rate": 9.756904707427395e-06, "loss": 0.1564, "step": 3897 }, { "epoch": 0.8182199832073888, "grad_norm": 0.11731656640768051, "learning_rate": 9.73517051508595e-06, "loss": 0.1452, "step": 3898 }, { "epoch": 0.8184298908480269, "grad_norm": 0.1272083967924118, "learning_rate": 9.713457945877691e-06, "loss": 0.1511, "step": 3899 }, { "epoch": 0.818639798488665, "grad_norm": 0.11312077194452286, "learning_rate": 9.691767011462743e-06, "loss": 0.1552, "step": 3900 }, { "epoch": 0.8188497061293031, "grad_norm": 0.12853741645812988, "learning_rate": 9.670097723489662e-06, "loss": 0.1598, "step": 3901 }, { "epoch": 0.8190596137699412, "grad_norm": 0.10277300328016281, "learning_rate": 9.648450093595346e-06, "loss": 0.1612, "step": 3902 }, { "epoch": 0.8192695214105793, "grad_norm": 0.1202654018998146, "learning_rate": 9.626824133405043e-06, "loss": 0.1625, "step": 3903 }, { "epoch": 0.8194794290512175, "grad_norm": 0.1286875158548355, "learning_rate": 9.605219854532393e-06, "loss": 0.1555, "step": 3904 }, { "epoch": 0.8196893366918556, "grad_norm": 0.1292945295572281, "learning_rate": 9.583637268579405e-06, "loss": 0.1494, "step": 3905 }, { "epoch": 0.8198992443324937, "grad_norm": 0.12690283358097076, "learning_rate": 9.562076387136414e-06, "loss": 0.1494, "step": 3906 }, { "epoch": 0.8201091519731318, "grad_norm": 0.0949103832244873, "learning_rate": 9.540537221782098e-06, "loss": 0.1495, "step": 3907 }, { "epoch": 0.8203190596137699, "grad_norm": 0.11503642797470093, "learning_rate": 9.519019784083488e-06, "loss": 0.1485, "step": 3908 }, { "epoch": 0.820528967254408, "grad_norm": 0.12526684999465942, "learning_rate": 9.497524085595943e-06, "loss": 0.1641, "step": 3909 }, { "epoch": 0.8207388748950462, "grad_norm": 0.1197986826300621, "learning_rate": 9.476050137863136e-06, "loss": 0.1546, "step": 3910 }, { "epoch": 0.8209487825356843, "grad_norm": 0.13118648529052734, "learning_rate": 9.454597952417066e-06, "loss": 0.153, "step": 3911 }, { "epoch": 0.8211586901763224, "grad_norm": 0.11788372695446014, "learning_rate": 9.433167540778037e-06, "loss": 0.151, "step": 3912 }, { "epoch": 0.8213685978169606, "grad_norm": 0.12314417958259583, "learning_rate": 9.411758914454698e-06, "loss": 0.1612, "step": 3913 }, { "epoch": 0.8215785054575987, "grad_norm": 0.12039327621459961, "learning_rate": 9.39037208494396e-06, "loss": 0.1559, "step": 3914 }, { "epoch": 0.8217884130982368, "grad_norm": 0.11974295228719711, "learning_rate": 9.369007063731029e-06, "loss": 0.1501, "step": 3915 }, { "epoch": 0.8219983207388749, "grad_norm": 0.11983851343393326, "learning_rate": 9.34766386228942e-06, "loss": 0.1537, "step": 3916 }, { "epoch": 0.822208228379513, "grad_norm": 0.1342703104019165, "learning_rate": 9.326342492080908e-06, "loss": 0.1545, "step": 3917 }, { "epoch": 0.8224181360201511, "grad_norm": 0.13480761647224426, "learning_rate": 9.305042964555567e-06, "loss": 0.1538, "step": 3918 }, { "epoch": 0.8226280436607892, "grad_norm": 0.1398279070854187, "learning_rate": 9.283765291151724e-06, "loss": 0.1599, "step": 3919 }, { "epoch": 0.8228379513014273, "grad_norm": 0.12116552144289017, "learning_rate": 9.262509483295977e-06, "loss": 0.1463, "step": 3920 }, { "epoch": 0.8230478589420654, "grad_norm": 0.11443023383617401, "learning_rate": 9.241275552403168e-06, "loss": 0.1425, "step": 3921 }, { "epoch": 0.8232577665827037, "grad_norm": 0.13521820306777954, "learning_rate": 9.220063509876431e-06, "loss": 0.1679, "step": 3922 }, { "epoch": 0.8234676742233418, "grad_norm": 0.13272807002067566, "learning_rate": 9.198873367107109e-06, "loss": 0.1609, "step": 3923 }, { "epoch": 0.8236775818639799, "grad_norm": 0.1287853866815567, "learning_rate": 9.177705135474774e-06, "loss": 0.1554, "step": 3924 }, { "epoch": 0.823887489504618, "grad_norm": 0.11695458739995956, "learning_rate": 9.15655882634725e-06, "loss": 0.1451, "step": 3925 }, { "epoch": 0.8240973971452561, "grad_norm": 0.11539758741855621, "learning_rate": 9.13543445108061e-06, "loss": 0.1518, "step": 3926 }, { "epoch": 0.8243073047858942, "grad_norm": 0.1678249090909958, "learning_rate": 9.114332021019117e-06, "loss": 0.1645, "step": 3927 }, { "epoch": 0.8245172124265323, "grad_norm": 0.13638179004192352, "learning_rate": 9.093251547495257e-06, "loss": 0.1731, "step": 3928 }, { "epoch": 0.8247271200671704, "grad_norm": 0.12491890788078308, "learning_rate": 9.072193041829708e-06, "loss": 0.1337, "step": 3929 }, { "epoch": 0.8249370277078085, "grad_norm": 0.1244591474533081, "learning_rate": 9.051156515331417e-06, "loss": 0.1621, "step": 3930 }, { "epoch": 0.8251469353484466, "grad_norm": 0.11335837841033936, "learning_rate": 9.030141979297429e-06, "loss": 0.1443, "step": 3931 }, { "epoch": 0.8253568429890848, "grad_norm": 0.12867414951324463, "learning_rate": 9.009149445013054e-06, "loss": 0.1667, "step": 3932 }, { "epoch": 0.825566750629723, "grad_norm": 0.09924521297216415, "learning_rate": 8.988178923751745e-06, "loss": 0.1433, "step": 3933 }, { "epoch": 0.8257766582703611, "grad_norm": 0.11446885764598846, "learning_rate": 8.96723042677517e-06, "loss": 0.1481, "step": 3934 }, { "epoch": 0.8259865659109992, "grad_norm": 0.15089714527130127, "learning_rate": 8.946303965333147e-06, "loss": 0.1504, "step": 3935 }, { "epoch": 0.8261964735516373, "grad_norm": 0.12573710083961487, "learning_rate": 8.925399550663661e-06, "loss": 0.1535, "step": 3936 }, { "epoch": 0.8264063811922754, "grad_norm": 0.10890112817287445, "learning_rate": 8.904517193992862e-06, "loss": 0.1465, "step": 3937 }, { "epoch": 0.8266162888329135, "grad_norm": 0.12931612133979797, "learning_rate": 8.883656906535042e-06, "loss": 0.1582, "step": 3938 }, { "epoch": 0.8268261964735516, "grad_norm": 0.12019342184066772, "learning_rate": 8.86281869949267e-06, "loss": 0.1655, "step": 3939 }, { "epoch": 0.8270361041141897, "grad_norm": 0.12204300612211227, "learning_rate": 8.842002584056325e-06, "loss": 0.1582, "step": 3940 }, { "epoch": 0.8272460117548279, "grad_norm": 0.11083068698644638, "learning_rate": 8.821208571404726e-06, "loss": 0.139, "step": 3941 }, { "epoch": 0.827455919395466, "grad_norm": 0.10661565512418747, "learning_rate": 8.800436672704759e-06, "loss": 0.1469, "step": 3942 }, { "epoch": 0.8276658270361041, "grad_norm": 0.13081884384155273, "learning_rate": 8.7796868991114e-06, "loss": 0.1551, "step": 3943 }, { "epoch": 0.8278757346767422, "grad_norm": 0.09872268885374069, "learning_rate": 8.758959261767758e-06, "loss": 0.1479, "step": 3944 }, { "epoch": 0.8280856423173804, "grad_norm": 0.11020808666944504, "learning_rate": 8.73825377180501e-06, "loss": 0.1536, "step": 3945 }, { "epoch": 0.8282955499580185, "grad_norm": 0.11102128773927689, "learning_rate": 8.717570440342521e-06, "loss": 0.1503, "step": 3946 }, { "epoch": 0.8285054575986566, "grad_norm": 0.11287751793861389, "learning_rate": 8.696909278487697e-06, "loss": 0.144, "step": 3947 }, { "epoch": 0.8287153652392947, "grad_norm": 0.13401024043560028, "learning_rate": 8.676270297336054e-06, "loss": 0.1656, "step": 3948 }, { "epoch": 0.8289252728799328, "grad_norm": 0.13312119245529175, "learning_rate": 8.65565350797119e-06, "loss": 0.1551, "step": 3949 }, { "epoch": 0.829135180520571, "grad_norm": 0.1267576813697815, "learning_rate": 8.63505892146479e-06, "loss": 0.1588, "step": 3950 }, { "epoch": 0.8293450881612091, "grad_norm": 0.11618844419717789, "learning_rate": 8.614486548876644e-06, "loss": 0.1496, "step": 3951 }, { "epoch": 0.8295549958018472, "grad_norm": 0.12536628544330597, "learning_rate": 8.59393640125456e-06, "loss": 0.1727, "step": 3952 }, { "epoch": 0.8297649034424853, "grad_norm": 0.13767768442630768, "learning_rate": 8.573408489634427e-06, "loss": 0.1791, "step": 3953 }, { "epoch": 0.8299748110831234, "grad_norm": 0.12117113918066025, "learning_rate": 8.552902825040233e-06, "loss": 0.1372, "step": 3954 }, { "epoch": 0.8301847187237615, "grad_norm": 0.1230069175362587, "learning_rate": 8.532419418483972e-06, "loss": 0.1698, "step": 3955 }, { "epoch": 0.8303946263643996, "grad_norm": 0.11761808395385742, "learning_rate": 8.511958280965703e-06, "loss": 0.1598, "step": 3956 }, { "epoch": 0.8306045340050378, "grad_norm": 0.11687527596950531, "learning_rate": 8.491519423473526e-06, "loss": 0.1521, "step": 3957 }, { "epoch": 0.8308144416456759, "grad_norm": 0.12229825556278229, "learning_rate": 8.47110285698357e-06, "loss": 0.1605, "step": 3958 }, { "epoch": 0.831024349286314, "grad_norm": 0.11104927211999893, "learning_rate": 8.450708592459999e-06, "loss": 0.1315, "step": 3959 }, { "epoch": 0.8312342569269522, "grad_norm": 0.11745012551546097, "learning_rate": 8.430336640855007e-06, "loss": 0.1548, "step": 3960 }, { "epoch": 0.8314441645675903, "grad_norm": 0.1281951665878296, "learning_rate": 8.409987013108784e-06, "loss": 0.1455, "step": 3961 }, { "epoch": 0.8316540722082284, "grad_norm": 0.11386459320783615, "learning_rate": 8.389659720149546e-06, "loss": 0.159, "step": 3962 }, { "epoch": 0.8318639798488665, "grad_norm": 0.13125190138816833, "learning_rate": 8.369354772893523e-06, "loss": 0.1468, "step": 3963 }, { "epoch": 0.8320738874895046, "grad_norm": 0.1297435313463211, "learning_rate": 8.349072182244939e-06, "loss": 0.167, "step": 3964 }, { "epoch": 0.8322837951301427, "grad_norm": 0.12241479754447937, "learning_rate": 8.328811959095995e-06, "loss": 0.1691, "step": 3965 }, { "epoch": 0.8324937027707808, "grad_norm": 0.16956380009651184, "learning_rate": 8.308574114326906e-06, "loss": 0.1632, "step": 3966 }, { "epoch": 0.8327036104114189, "grad_norm": 0.1364806890487671, "learning_rate": 8.288358658805845e-06, "loss": 0.1449, "step": 3967 }, { "epoch": 0.832913518052057, "grad_norm": 0.12498215585947037, "learning_rate": 8.268165603388983e-06, "loss": 0.1588, "step": 3968 }, { "epoch": 0.8331234256926953, "grad_norm": 0.12091932445764542, "learning_rate": 8.247994958920446e-06, "loss": 0.1536, "step": 3969 }, { "epoch": 0.8333333333333334, "grad_norm": 0.11790355294942856, "learning_rate": 8.22784673623232e-06, "loss": 0.1526, "step": 3970 }, { "epoch": 0.8335432409739715, "grad_norm": 0.12362008541822433, "learning_rate": 8.207720946144687e-06, "loss": 0.1575, "step": 3971 }, { "epoch": 0.8337531486146096, "grad_norm": 0.11366129666566849, "learning_rate": 8.187617599465558e-06, "loss": 0.15, "step": 3972 }, { "epoch": 0.8339630562552477, "grad_norm": 0.12639477849006653, "learning_rate": 8.167536706990858e-06, "loss": 0.1588, "step": 3973 }, { "epoch": 0.8341729638958858, "grad_norm": 0.10803087800741196, "learning_rate": 8.147478279504489e-06, "loss": 0.1534, "step": 3974 }, { "epoch": 0.8343828715365239, "grad_norm": 0.11064185947179794, "learning_rate": 8.127442327778306e-06, "loss": 0.1582, "step": 3975 }, { "epoch": 0.834592779177162, "grad_norm": 0.12956248223781586, "learning_rate": 8.10742886257207e-06, "loss": 0.1537, "step": 3976 }, { "epoch": 0.8348026868178001, "grad_norm": 0.1277211457490921, "learning_rate": 8.087437894633459e-06, "loss": 0.1398, "step": 3977 }, { "epoch": 0.8350125944584383, "grad_norm": 0.1317525953054428, "learning_rate": 8.067469434698083e-06, "loss": 0.1636, "step": 3978 }, { "epoch": 0.8352225020990764, "grad_norm": 0.10670098662376404, "learning_rate": 8.047523493489462e-06, "loss": 0.1457, "step": 3979 }, { "epoch": 0.8354324097397146, "grad_norm": 0.11436796188354492, "learning_rate": 8.027600081719017e-06, "loss": 0.1444, "step": 3980 }, { "epoch": 0.8356423173803527, "grad_norm": 0.10503967851400375, "learning_rate": 8.007699210086084e-06, "loss": 0.1571, "step": 3981 }, { "epoch": 0.8358522250209908, "grad_norm": 0.1279638260602951, "learning_rate": 7.98782088927787e-06, "loss": 0.1493, "step": 3982 }, { "epoch": 0.8360621326616289, "grad_norm": 0.1345963031053543, "learning_rate": 7.967965129969506e-06, "loss": 0.1509, "step": 3983 }, { "epoch": 0.836272040302267, "grad_norm": 0.12100664526224136, "learning_rate": 7.948131942823984e-06, "loss": 0.1646, "step": 3984 }, { "epoch": 0.8364819479429051, "grad_norm": 0.11500248312950134, "learning_rate": 7.928321338492185e-06, "loss": 0.1506, "step": 3985 }, { "epoch": 0.8366918555835432, "grad_norm": 0.10120087116956711, "learning_rate": 7.90853332761282e-06, "loss": 0.1467, "step": 3986 }, { "epoch": 0.8369017632241813, "grad_norm": 0.13616779446601868, "learning_rate": 7.888767920812545e-06, "loss": 0.1419, "step": 3987 }, { "epoch": 0.8371116708648195, "grad_norm": 0.20254676043987274, "learning_rate": 7.869025128705815e-06, "loss": 0.1465, "step": 3988 }, { "epoch": 0.8373215785054576, "grad_norm": 0.13580451905727386, "learning_rate": 7.849304961894966e-06, "loss": 0.1713, "step": 3989 }, { "epoch": 0.8375314861460957, "grad_norm": 0.13808149099349976, "learning_rate": 7.829607430970177e-06, "loss": 0.1477, "step": 3990 }, { "epoch": 0.8377413937867338, "grad_norm": 0.12043146789073944, "learning_rate": 7.809932546509463e-06, "loss": 0.1573, "step": 3991 }, { "epoch": 0.837951301427372, "grad_norm": 0.12428410351276398, "learning_rate": 7.790280319078713e-06, "loss": 0.1787, "step": 3992 }, { "epoch": 0.8381612090680101, "grad_norm": 0.16309167444705963, "learning_rate": 7.77065075923159e-06, "loss": 0.1482, "step": 3993 }, { "epoch": 0.8383711167086482, "grad_norm": 0.10672210156917572, "learning_rate": 7.751043877509623e-06, "loss": 0.1473, "step": 3994 }, { "epoch": 0.8385810243492863, "grad_norm": 0.11658763885498047, "learning_rate": 7.731459684442171e-06, "loss": 0.1476, "step": 3995 }, { "epoch": 0.8387909319899244, "grad_norm": 0.13712359964847565, "learning_rate": 7.71189819054639e-06, "loss": 0.1629, "step": 3996 }, { "epoch": 0.8390008396305626, "grad_norm": 0.11185979843139648, "learning_rate": 7.692359406327238e-06, "loss": 0.1694, "step": 3997 }, { "epoch": 0.8392107472712007, "grad_norm": 0.12185225635766983, "learning_rate": 7.672843342277498e-06, "loss": 0.1613, "step": 3998 }, { "epoch": 0.8394206549118388, "grad_norm": 0.1134178638458252, "learning_rate": 7.65335000887773e-06, "loss": 0.1537, "step": 3999 }, { "epoch": 0.8396305625524769, "grad_norm": 0.11603078246116638, "learning_rate": 7.633879416596312e-06, "loss": 0.1592, "step": 4000 }, { "epoch": 0.839840470193115, "grad_norm": 0.11461564153432846, "learning_rate": 7.6144315758893916e-06, "loss": 0.1722, "step": 4001 }, { "epoch": 0.8400503778337531, "grad_norm": 0.12421731650829315, "learning_rate": 7.595006497200907e-06, "loss": 0.1532, "step": 4002 }, { "epoch": 0.8402602854743912, "grad_norm": 0.12341675162315369, "learning_rate": 7.575604190962549e-06, "loss": 0.1564, "step": 4003 }, { "epoch": 0.8404701931150294, "grad_norm": 0.12224044650793076, "learning_rate": 7.556224667593831e-06, "loss": 0.1335, "step": 4004 }, { "epoch": 0.8406801007556675, "grad_norm": 0.12325417250394821, "learning_rate": 7.536867937501985e-06, "loss": 0.1567, "step": 4005 }, { "epoch": 0.8408900083963057, "grad_norm": 0.15603864192962646, "learning_rate": 7.517534011082017e-06, "loss": 0.1484, "step": 4006 }, { "epoch": 0.8410999160369438, "grad_norm": 0.1234329491853714, "learning_rate": 7.498222898716683e-06, "loss": 0.1608, "step": 4007 }, { "epoch": 0.8413098236775819, "grad_norm": 0.16258041560649872, "learning_rate": 7.4789346107765e-06, "loss": 0.149, "step": 4008 }, { "epoch": 0.84151973131822, "grad_norm": 0.10971637070178986, "learning_rate": 7.4596691576197145e-06, "loss": 0.1506, "step": 4009 }, { "epoch": 0.8417296389588581, "grad_norm": 0.1123458594083786, "learning_rate": 7.440426549592316e-06, "loss": 0.1536, "step": 4010 }, { "epoch": 0.8419395465994962, "grad_norm": 0.11952552199363708, "learning_rate": 7.421206797028013e-06, "loss": 0.1577, "step": 4011 }, { "epoch": 0.8421494542401343, "grad_norm": 0.12449429929256439, "learning_rate": 7.402009910248276e-06, "loss": 0.1518, "step": 4012 }, { "epoch": 0.8423593618807724, "grad_norm": 0.11453278362751007, "learning_rate": 7.382835899562263e-06, "loss": 0.1605, "step": 4013 }, { "epoch": 0.8425692695214105, "grad_norm": 0.10088256746530533, "learning_rate": 7.363684775266844e-06, "loss": 0.1573, "step": 4014 }, { "epoch": 0.8427791771620488, "grad_norm": 0.11597353219985962, "learning_rate": 7.344556547646608e-06, "loss": 0.1635, "step": 4015 }, { "epoch": 0.8429890848026869, "grad_norm": 0.11462254077196121, "learning_rate": 7.325451226973867e-06, "loss": 0.1471, "step": 4016 }, { "epoch": 0.843198992443325, "grad_norm": 0.10850072652101517, "learning_rate": 7.306368823508608e-06, "loss": 0.1614, "step": 4017 }, { "epoch": 0.8434089000839631, "grad_norm": 0.1240755096077919, "learning_rate": 7.287309347498517e-06, "loss": 0.1563, "step": 4018 }, { "epoch": 0.8436188077246012, "grad_norm": 0.1336059272289276, "learning_rate": 7.2682728091789665e-06, "loss": 0.1644, "step": 4019 }, { "epoch": 0.8438287153652393, "grad_norm": 0.11125699430704117, "learning_rate": 7.2492592187730136e-06, "loss": 0.1544, "step": 4020 }, { "epoch": 0.8440386230058774, "grad_norm": 0.12155328691005707, "learning_rate": 7.23026858649139e-06, "loss": 0.1489, "step": 4021 }, { "epoch": 0.8442485306465155, "grad_norm": 0.14981696009635925, "learning_rate": 7.211300922532505e-06, "loss": 0.1455, "step": 4022 }, { "epoch": 0.8444584382871536, "grad_norm": 0.11799201369285583, "learning_rate": 7.192356237082404e-06, "loss": 0.141, "step": 4023 }, { "epoch": 0.8446683459277917, "grad_norm": 0.12192690372467041, "learning_rate": 7.17343454031485e-06, "loss": 0.1511, "step": 4024 }, { "epoch": 0.8448782535684299, "grad_norm": 0.12700971961021423, "learning_rate": 7.154535842391208e-06, "loss": 0.1443, "step": 4025 }, { "epoch": 0.845088161209068, "grad_norm": 0.12561286985874176, "learning_rate": 7.1356601534605105e-06, "loss": 0.1444, "step": 4026 }, { "epoch": 0.8452980688497062, "grad_norm": 0.10910797864198685, "learning_rate": 7.1168074836594375e-06, "loss": 0.1626, "step": 4027 }, { "epoch": 0.8455079764903443, "grad_norm": 0.13664831221103668, "learning_rate": 7.097977843112302e-06, "loss": 0.164, "step": 4028 }, { "epoch": 0.8457178841309824, "grad_norm": 0.131288543343544, "learning_rate": 7.0791712419310545e-06, "loss": 0.1524, "step": 4029 }, { "epoch": 0.8459277917716205, "grad_norm": 0.14658468961715698, "learning_rate": 7.060387690215258e-06, "loss": 0.1503, "step": 4030 }, { "epoch": 0.8461376994122586, "grad_norm": 0.1412951648235321, "learning_rate": 7.04162719805212e-06, "loss": 0.1398, "step": 4031 }, { "epoch": 0.8463476070528967, "grad_norm": 0.1180790513753891, "learning_rate": 7.022889775516439e-06, "loss": 0.146, "step": 4032 }, { "epoch": 0.8465575146935348, "grad_norm": 0.1320628523826599, "learning_rate": 7.0041754326706625e-06, "loss": 0.1591, "step": 4033 }, { "epoch": 0.846767422334173, "grad_norm": 0.1340452879667282, "learning_rate": 6.9854841795648166e-06, "loss": 0.138, "step": 4034 }, { "epoch": 0.8469773299748111, "grad_norm": 0.10865829885005951, "learning_rate": 6.9668160262365e-06, "loss": 0.1525, "step": 4035 }, { "epoch": 0.8471872376154492, "grad_norm": 0.1448470950126648, "learning_rate": 6.948170982710972e-06, "loss": 0.1516, "step": 4036 }, { "epoch": 0.8473971452560873, "grad_norm": 0.13538743555545807, "learning_rate": 6.929549059001028e-06, "loss": 0.1629, "step": 4037 }, { "epoch": 0.8476070528967254, "grad_norm": 0.10407985746860504, "learning_rate": 6.91095026510708e-06, "loss": 0.1506, "step": 4038 }, { "epoch": 0.8478169605373636, "grad_norm": 0.11444611102342606, "learning_rate": 6.892374611017094e-06, "loss": 0.1356, "step": 4039 }, { "epoch": 0.8480268681780017, "grad_norm": 0.11654292047023773, "learning_rate": 6.873822106706612e-06, "loss": 0.1708, "step": 4040 }, { "epoch": 0.8482367758186398, "grad_norm": 0.11343565583229065, "learning_rate": 6.855292762138793e-06, "loss": 0.1588, "step": 4041 }, { "epoch": 0.8484466834592779, "grad_norm": 0.11720237135887146, "learning_rate": 6.836786587264271e-06, "loss": 0.1587, "step": 4042 }, { "epoch": 0.8486565910999161, "grad_norm": 0.11687421053647995, "learning_rate": 6.818303592021308e-06, "loss": 0.1495, "step": 4043 }, { "epoch": 0.8488664987405542, "grad_norm": 0.1309545636177063, "learning_rate": 6.79984378633568e-06, "loss": 0.1577, "step": 4044 }, { "epoch": 0.8490764063811923, "grad_norm": 0.13915085792541504, "learning_rate": 6.7814071801207464e-06, "loss": 0.1716, "step": 4045 }, { "epoch": 0.8492863140218304, "grad_norm": 0.1628192812204361, "learning_rate": 6.762993783277371e-06, "loss": 0.1372, "step": 4046 }, { "epoch": 0.8494962216624685, "grad_norm": 0.11110463738441467, "learning_rate": 6.7446036056939775e-06, "loss": 0.1683, "step": 4047 }, { "epoch": 0.8497061293031066, "grad_norm": 0.12355831265449524, "learning_rate": 6.726236657246504e-06, "loss": 0.1418, "step": 4048 }, { "epoch": 0.8499160369437447, "grad_norm": 0.131812185049057, "learning_rate": 6.707892947798422e-06, "loss": 0.1572, "step": 4049 }, { "epoch": 0.8501259445843828, "grad_norm": 0.14924898743629456, "learning_rate": 6.689572487200718e-06, "loss": 0.1597, "step": 4050 }, { "epoch": 0.850335852225021, "grad_norm": 0.10449478030204773, "learning_rate": 6.671275285291911e-06, "loss": 0.1476, "step": 4051 }, { "epoch": 0.8505457598656591, "grad_norm": 0.11552565544843674, "learning_rate": 6.653001351897991e-06, "loss": 0.1545, "step": 4052 }, { "epoch": 0.8507556675062973, "grad_norm": 0.11661684513092041, "learning_rate": 6.634750696832514e-06, "loss": 0.1701, "step": 4053 }, { "epoch": 0.8509655751469354, "grad_norm": 0.10988535732030869, "learning_rate": 6.616523329896474e-06, "loss": 0.1623, "step": 4054 }, { "epoch": 0.8511754827875735, "grad_norm": 0.1000858023762703, "learning_rate": 6.598319260878405e-06, "loss": 0.1403, "step": 4055 }, { "epoch": 0.8513853904282116, "grad_norm": 0.13062427937984467, "learning_rate": 6.580138499554267e-06, "loss": 0.1551, "step": 4056 }, { "epoch": 0.8515952980688497, "grad_norm": 0.11242105811834335, "learning_rate": 6.561981055687577e-06, "loss": 0.1615, "step": 4057 }, { "epoch": 0.8518052057094878, "grad_norm": 0.11550282686948776, "learning_rate": 6.543846939029296e-06, "loss": 0.1584, "step": 4058 }, { "epoch": 0.8520151133501259, "grad_norm": 0.12232505530118942, "learning_rate": 6.525736159317852e-06, "loss": 0.1653, "step": 4059 }, { "epoch": 0.852225020990764, "grad_norm": 0.11430735141038895, "learning_rate": 6.5076487262791415e-06, "loss": 0.1486, "step": 4060 }, { "epoch": 0.8524349286314021, "grad_norm": 0.11951231956481934, "learning_rate": 6.489584649626534e-06, "loss": 0.1535, "step": 4061 }, { "epoch": 0.8526448362720404, "grad_norm": 0.10858254134654999, "learning_rate": 6.4715439390608644e-06, "loss": 0.1517, "step": 4062 }, { "epoch": 0.8528547439126785, "grad_norm": 0.09623479098081589, "learning_rate": 6.453526604270394e-06, "loss": 0.1449, "step": 4063 }, { "epoch": 0.8530646515533166, "grad_norm": 0.14456307888031006, "learning_rate": 6.435532654930826e-06, "loss": 0.1549, "step": 4064 }, { "epoch": 0.8532745591939547, "grad_norm": 0.1130475103855133, "learning_rate": 6.417562100705354e-06, "loss": 0.1563, "step": 4065 }, { "epoch": 0.8534844668345928, "grad_norm": 0.11575406044721603, "learning_rate": 6.399614951244559e-06, "loss": 0.1458, "step": 4066 }, { "epoch": 0.8536943744752309, "grad_norm": 0.11347532272338867, "learning_rate": 6.381691216186475e-06, "loss": 0.1507, "step": 4067 }, { "epoch": 0.853904282115869, "grad_norm": 0.1290491372346878, "learning_rate": 6.363790905156558e-06, "loss": 0.1645, "step": 4068 }, { "epoch": 0.8541141897565071, "grad_norm": 0.10183484852313995, "learning_rate": 6.34591402776768e-06, "loss": 0.1316, "step": 4069 }, { "epoch": 0.8543240973971452, "grad_norm": 0.14135852456092834, "learning_rate": 6.328060593620133e-06, "loss": 0.171, "step": 4070 }, { "epoch": 0.8545340050377834, "grad_norm": 0.11686300486326218, "learning_rate": 6.310230612301621e-06, "loss": 0.16, "step": 4071 }, { "epoch": 0.8547439126784215, "grad_norm": 0.13613487780094147, "learning_rate": 6.29242409338725e-06, "loss": 0.1665, "step": 4072 }, { "epoch": 0.8549538203190596, "grad_norm": 0.1191660612821579, "learning_rate": 6.274641046439517e-06, "loss": 0.1693, "step": 4073 }, { "epoch": 0.8551637279596978, "grad_norm": 0.106855608522892, "learning_rate": 6.256881481008348e-06, "loss": 0.1415, "step": 4074 }, { "epoch": 0.8553736356003359, "grad_norm": 0.11627862602472305, "learning_rate": 6.239145406631031e-06, "loss": 0.1502, "step": 4075 }, { "epoch": 0.855583543240974, "grad_norm": 0.16022130846977234, "learning_rate": 6.221432832832208e-06, "loss": 0.1395, "step": 4076 }, { "epoch": 0.8557934508816121, "grad_norm": 0.11979082226753235, "learning_rate": 6.203743769123982e-06, "loss": 0.1652, "step": 4077 }, { "epoch": 0.8560033585222502, "grad_norm": 0.12621894478797913, "learning_rate": 6.186078225005759e-06, "loss": 0.1387, "step": 4078 }, { "epoch": 0.8562132661628883, "grad_norm": 0.117207370698452, "learning_rate": 6.168436209964346e-06, "loss": 0.1399, "step": 4079 }, { "epoch": 0.8564231738035264, "grad_norm": 0.13264822959899902, "learning_rate": 6.150817733473907e-06, "loss": 0.1677, "step": 4080 }, { "epoch": 0.8566330814441646, "grad_norm": 0.1263989955186844, "learning_rate": 6.1332228049959525e-06, "loss": 0.1549, "step": 4081 }, { "epoch": 0.8568429890848027, "grad_norm": 0.11813917756080627, "learning_rate": 6.115651433979402e-06, "loss": 0.1543, "step": 4082 }, { "epoch": 0.8570528967254408, "grad_norm": 0.13481532037258148, "learning_rate": 6.098103629860441e-06, "loss": 0.1667, "step": 4083 }, { "epoch": 0.8572628043660789, "grad_norm": 0.13173486292362213, "learning_rate": 6.080579402062659e-06, "loss": 0.1572, "step": 4084 }, { "epoch": 0.857472712006717, "grad_norm": 0.11830896884202957, "learning_rate": 6.063078759996954e-06, "loss": 0.1664, "step": 4085 }, { "epoch": 0.8576826196473551, "grad_norm": 0.11736118793487549, "learning_rate": 6.0456017130615886e-06, "loss": 0.1507, "step": 4086 }, { "epoch": 0.8578925272879933, "grad_norm": 0.1546947956085205, "learning_rate": 6.028148270642131e-06, "loss": 0.1592, "step": 4087 }, { "epoch": 0.8581024349286314, "grad_norm": 0.1077655777335167, "learning_rate": 6.0107184421114734e-06, "loss": 0.1524, "step": 4088 }, { "epoch": 0.8583123425692695, "grad_norm": 0.14029689133167267, "learning_rate": 5.9933122368298346e-06, "loss": 0.1617, "step": 4089 }, { "epoch": 0.8585222502099077, "grad_norm": 0.12839555740356445, "learning_rate": 5.975929664144747e-06, "loss": 0.1648, "step": 4090 }, { "epoch": 0.8587321578505458, "grad_norm": 0.11472969502210617, "learning_rate": 5.958570733391039e-06, "loss": 0.158, "step": 4091 }, { "epoch": 0.8589420654911839, "grad_norm": 0.12960311770439148, "learning_rate": 5.941235453890864e-06, "loss": 0.1501, "step": 4092 }, { "epoch": 0.859151973131822, "grad_norm": 0.12296982854604721, "learning_rate": 5.923923834953648e-06, "loss": 0.1516, "step": 4093 }, { "epoch": 0.8593618807724601, "grad_norm": 0.14688728749752045, "learning_rate": 5.90663588587615e-06, "loss": 0.1536, "step": 4094 }, { "epoch": 0.8595717884130982, "grad_norm": 0.09548702090978622, "learning_rate": 5.889371615942379e-06, "loss": 0.1465, "step": 4095 }, { "epoch": 0.8597816960537363, "grad_norm": 0.1445847600698471, "learning_rate": 5.87213103442365e-06, "loss": 0.1558, "step": 4096 }, { "epoch": 0.8599916036943744, "grad_norm": 0.14124633371829987, "learning_rate": 5.854914150578527e-06, "loss": 0.1678, "step": 4097 }, { "epoch": 0.8602015113350125, "grad_norm": 0.12228516489267349, "learning_rate": 5.837720973652894e-06, "loss": 0.1665, "step": 4098 }, { "epoch": 0.8604114189756508, "grad_norm": 0.1366337239742279, "learning_rate": 5.820551512879873e-06, "loss": 0.1619, "step": 4099 }, { "epoch": 0.8606213266162889, "grad_norm": 0.15276916325092316, "learning_rate": 5.803405777479854e-06, "loss": 0.1687, "step": 4100 }, { "epoch": 0.860831234256927, "grad_norm": 0.12131724506616592, "learning_rate": 5.78628377666049e-06, "loss": 0.1653, "step": 4101 }, { "epoch": 0.8610411418975651, "grad_norm": 0.1318751573562622, "learning_rate": 5.769185519616671e-06, "loss": 0.1502, "step": 4102 }, { "epoch": 0.8612510495382032, "grad_norm": 0.12935899198055267, "learning_rate": 5.752111015530593e-06, "loss": 0.1631, "step": 4103 }, { "epoch": 0.8614609571788413, "grad_norm": 0.11931544542312622, "learning_rate": 5.735060273571618e-06, "loss": 0.1315, "step": 4104 }, { "epoch": 0.8616708648194794, "grad_norm": 0.11439146846532822, "learning_rate": 5.71803330289638e-06, "loss": 0.1484, "step": 4105 }, { "epoch": 0.8618807724601175, "grad_norm": 0.1328824758529663, "learning_rate": 5.701030112648781e-06, "loss": 0.1601, "step": 4106 }, { "epoch": 0.8620906801007556, "grad_norm": 0.13809579610824585, "learning_rate": 5.684050711959904e-06, "loss": 0.1609, "step": 4107 }, { "epoch": 0.8623005877413937, "grad_norm": 0.13090141117572784, "learning_rate": 5.667095109948078e-06, "loss": 0.1469, "step": 4108 }, { "epoch": 0.862510495382032, "grad_norm": 0.1246684342622757, "learning_rate": 5.650163315718854e-06, "loss": 0.1417, "step": 4109 }, { "epoch": 0.8627204030226701, "grad_norm": 0.12435910850763321, "learning_rate": 5.633255338364979e-06, "loss": 0.1511, "step": 4110 }, { "epoch": 0.8629303106633082, "grad_norm": 0.12332982569932938, "learning_rate": 5.6163711869664345e-06, "loss": 0.1438, "step": 4111 }, { "epoch": 0.8631402183039463, "grad_norm": 0.12709768116474152, "learning_rate": 5.599510870590391e-06, "loss": 0.1544, "step": 4112 }, { "epoch": 0.8633501259445844, "grad_norm": 0.12950602173805237, "learning_rate": 5.5826743982912265e-06, "loss": 0.1315, "step": 4113 }, { "epoch": 0.8635600335852225, "grad_norm": 0.12958264350891113, "learning_rate": 5.565861779110499e-06, "loss": 0.1665, "step": 4114 }, { "epoch": 0.8637699412258606, "grad_norm": 0.11284958571195602, "learning_rate": 5.549073022076989e-06, "loss": 0.1583, "step": 4115 }, { "epoch": 0.8639798488664987, "grad_norm": 0.12838315963745117, "learning_rate": 5.532308136206632e-06, "loss": 0.1599, "step": 4116 }, { "epoch": 0.8641897565071368, "grad_norm": 0.11664266884326935, "learning_rate": 5.515567130502558e-06, "loss": 0.1719, "step": 4117 }, { "epoch": 0.864399664147775, "grad_norm": 0.10369646549224854, "learning_rate": 5.498850013955065e-06, "loss": 0.1469, "step": 4118 }, { "epoch": 0.8646095717884131, "grad_norm": 0.11366793513298035, "learning_rate": 5.482156795541632e-06, "loss": 0.134, "step": 4119 }, { "epoch": 0.8648194794290512, "grad_norm": 0.11349675804376602, "learning_rate": 5.465487484226889e-06, "loss": 0.154, "step": 4120 }, { "epoch": 0.8650293870696893, "grad_norm": 0.12319415807723999, "learning_rate": 5.448842088962647e-06, "loss": 0.1545, "step": 4121 }, { "epoch": 0.8652392947103275, "grad_norm": 0.11100887507200241, "learning_rate": 5.4322206186878495e-06, "loss": 0.1574, "step": 4122 }, { "epoch": 0.8654492023509656, "grad_norm": 0.10524474829435349, "learning_rate": 5.415623082328625e-06, "loss": 0.1418, "step": 4123 }, { "epoch": 0.8656591099916037, "grad_norm": 0.1143570989370346, "learning_rate": 5.399049488798225e-06, "loss": 0.164, "step": 4124 }, { "epoch": 0.8658690176322418, "grad_norm": 0.12815353274345398, "learning_rate": 5.382499846997041e-06, "loss": 0.1611, "step": 4125 }, { "epoch": 0.8660789252728799, "grad_norm": 0.11535675823688507, "learning_rate": 5.365974165812599e-06, "loss": 0.1447, "step": 4126 }, { "epoch": 0.8662888329135181, "grad_norm": 0.13475079834461212, "learning_rate": 5.349472454119586e-06, "loss": 0.1869, "step": 4127 }, { "epoch": 0.8664987405541562, "grad_norm": 0.1347225308418274, "learning_rate": 5.332994720779794e-06, "loss": 0.158, "step": 4128 }, { "epoch": 0.8667086481947943, "grad_norm": 0.12452715635299683, "learning_rate": 5.316540974642137e-06, "loss": 0.1515, "step": 4129 }, { "epoch": 0.8669185558354324, "grad_norm": 0.13022229075431824, "learning_rate": 5.300111224542648e-06, "loss": 0.1605, "step": 4130 }, { "epoch": 0.8671284634760705, "grad_norm": 0.1267905980348587, "learning_rate": 5.2837054793045016e-06, "loss": 0.1723, "step": 4131 }, { "epoch": 0.8673383711167086, "grad_norm": 0.13651971518993378, "learning_rate": 5.2673237477379286e-06, "loss": 0.1691, "step": 4132 }, { "epoch": 0.8675482787573467, "grad_norm": 0.12502793967723846, "learning_rate": 5.250966038640315e-06, "loss": 0.1622, "step": 4133 }, { "epoch": 0.8677581863979849, "grad_norm": 0.12554888427257538, "learning_rate": 5.234632360796099e-06, "loss": 0.1553, "step": 4134 }, { "epoch": 0.867968094038623, "grad_norm": 0.11070720851421356, "learning_rate": 5.218322722976865e-06, "loss": 0.1463, "step": 4135 }, { "epoch": 0.8681780016792612, "grad_norm": 0.15806786715984344, "learning_rate": 5.202037133941251e-06, "loss": 0.1615, "step": 4136 }, { "epoch": 0.8683879093198993, "grad_norm": 0.11458860337734222, "learning_rate": 5.185775602434989e-06, "loss": 0.16, "step": 4137 }, { "epoch": 0.8685978169605374, "grad_norm": 0.12466058880090714, "learning_rate": 5.169538137190893e-06, "loss": 0.15, "step": 4138 }, { "epoch": 0.8688077246011755, "grad_norm": 0.11807677894830704, "learning_rate": 5.1533247469288555e-06, "loss": 0.1686, "step": 4139 }, { "epoch": 0.8690176322418136, "grad_norm": 0.1190905049443245, "learning_rate": 5.137135440355834e-06, "loss": 0.1494, "step": 4140 }, { "epoch": 0.8692275398824517, "grad_norm": 0.12761694192886353, "learning_rate": 5.120970226165861e-06, "loss": 0.1424, "step": 4141 }, { "epoch": 0.8694374475230898, "grad_norm": 0.13342903554439545, "learning_rate": 5.104829113040016e-06, "loss": 0.1581, "step": 4142 }, { "epoch": 0.8696473551637279, "grad_norm": 0.10999980568885803, "learning_rate": 5.088712109646449e-06, "loss": 0.1511, "step": 4143 }, { "epoch": 0.869857262804366, "grad_norm": 0.12726296484470367, "learning_rate": 5.072619224640363e-06, "loss": 0.1562, "step": 4144 }, { "epoch": 0.8700671704450041, "grad_norm": 0.11894024908542633, "learning_rate": 5.056550466664018e-06, "loss": 0.1649, "step": 4145 }, { "epoch": 0.8702770780856424, "grad_norm": 0.11655131727457047, "learning_rate": 5.040505844346671e-06, "loss": 0.1537, "step": 4146 }, { "epoch": 0.8704869857262805, "grad_norm": 0.14043211936950684, "learning_rate": 5.0244853663046765e-06, "loss": 0.15, "step": 4147 }, { "epoch": 0.8706968933669186, "grad_norm": 0.12577585875988007, "learning_rate": 5.00848904114139e-06, "loss": 0.1608, "step": 4148 }, { "epoch": 0.8709068010075567, "grad_norm": 0.1366935670375824, "learning_rate": 4.9925168774472065e-06, "loss": 0.162, "step": 4149 }, { "epoch": 0.8711167086481948, "grad_norm": 0.139494389295578, "learning_rate": 4.976568883799537e-06, "loss": 0.1458, "step": 4150 }, { "epoch": 0.8713266162888329, "grad_norm": 0.1221657320857048, "learning_rate": 4.960645068762809e-06, "loss": 0.1635, "step": 4151 }, { "epoch": 0.871536523929471, "grad_norm": 0.10618814080953598, "learning_rate": 4.9447454408885095e-06, "loss": 0.1581, "step": 4152 }, { "epoch": 0.8717464315701091, "grad_norm": 0.13304735720157623, "learning_rate": 4.9288700087150694e-06, "loss": 0.1416, "step": 4153 }, { "epoch": 0.8719563392107472, "grad_norm": 0.1176249235868454, "learning_rate": 4.913018780767969e-06, "loss": 0.1449, "step": 4154 }, { "epoch": 0.8721662468513854, "grad_norm": 0.11962371319532394, "learning_rate": 4.897191765559667e-06, "loss": 0.1606, "step": 4155 }, { "epoch": 0.8723761544920235, "grad_norm": 0.12428487837314606, "learning_rate": 4.881388971589656e-06, "loss": 0.156, "step": 4156 }, { "epoch": 0.8725860621326617, "grad_norm": 0.18144725263118744, "learning_rate": 4.865610407344384e-06, "loss": 0.1437, "step": 4157 }, { "epoch": 0.8727959697732998, "grad_norm": 0.14240662753582, "learning_rate": 4.8498560812973045e-06, "loss": 0.1378, "step": 4158 }, { "epoch": 0.8730058774139379, "grad_norm": 0.13271981477737427, "learning_rate": 4.834126001908845e-06, "loss": 0.1613, "step": 4159 }, { "epoch": 0.873215785054576, "grad_norm": 0.119957834482193, "learning_rate": 4.818420177626421e-06, "loss": 0.1441, "step": 4160 }, { "epoch": 0.8734256926952141, "grad_norm": 0.13145653903484344, "learning_rate": 4.802738616884417e-06, "loss": 0.1557, "step": 4161 }, { "epoch": 0.8736356003358522, "grad_norm": 0.13927792012691498, "learning_rate": 4.787081328104193e-06, "loss": 0.1488, "step": 4162 }, { "epoch": 0.8738455079764903, "grad_norm": 0.1195240393280983, "learning_rate": 4.771448319694061e-06, "loss": 0.1436, "step": 4163 }, { "epoch": 0.8740554156171285, "grad_norm": 0.11769021302461624, "learning_rate": 4.755839600049317e-06, "loss": 0.1546, "step": 4164 }, { "epoch": 0.8742653232577666, "grad_norm": 0.13738517463207245, "learning_rate": 4.7402551775522e-06, "loss": 0.1571, "step": 4165 }, { "epoch": 0.8744752308984047, "grad_norm": 0.13803741335868835, "learning_rate": 4.724695060571888e-06, "loss": 0.1429, "step": 4166 }, { "epoch": 0.8746851385390428, "grad_norm": 0.12481208890676498, "learning_rate": 4.7091592574645184e-06, "loss": 0.1498, "step": 4167 }, { "epoch": 0.874895046179681, "grad_norm": 0.10957111418247223, "learning_rate": 4.6936477765731905e-06, "loss": 0.1439, "step": 4168 }, { "epoch": 0.875104953820319, "grad_norm": 0.11496666073799133, "learning_rate": 4.6781606262279066e-06, "loss": 0.1538, "step": 4169 }, { "epoch": 0.8753148614609572, "grad_norm": 0.1191176176071167, "learning_rate": 4.662697814745631e-06, "loss": 0.1374, "step": 4170 }, { "epoch": 0.8755247691015953, "grad_norm": 0.14217902719974518, "learning_rate": 4.647259350430233e-06, "loss": 0.1665, "step": 4171 }, { "epoch": 0.8757346767422334, "grad_norm": 0.12614914774894714, "learning_rate": 4.631845241572552e-06, "loss": 0.1713, "step": 4172 }, { "epoch": 0.8759445843828715, "grad_norm": 0.14050200581550598, "learning_rate": 4.616455496450279e-06, "loss": 0.1798, "step": 4173 }, { "epoch": 0.8761544920235097, "grad_norm": 0.10861799865961075, "learning_rate": 4.601090123328078e-06, "loss": 0.1637, "step": 4174 }, { "epoch": 0.8763643996641478, "grad_norm": 0.12950311601161957, "learning_rate": 4.585749130457495e-06, "loss": 0.1709, "step": 4175 }, { "epoch": 0.8765743073047859, "grad_norm": 0.13461849093437195, "learning_rate": 4.570432526077006e-06, "loss": 0.157, "step": 4176 }, { "epoch": 0.876784214945424, "grad_norm": 0.1331222951412201, "learning_rate": 4.555140318411971e-06, "loss": 0.1447, "step": 4177 }, { "epoch": 0.8769941225860621, "grad_norm": 0.12269078940153122, "learning_rate": 4.539872515674659e-06, "loss": 0.1413, "step": 4178 }, { "epoch": 0.8772040302267002, "grad_norm": 0.11593735963106155, "learning_rate": 4.524629126064223e-06, "loss": 0.1412, "step": 4179 }, { "epoch": 0.8774139378673383, "grad_norm": 0.12780681252479553, "learning_rate": 4.509410157766714e-06, "loss": 0.1514, "step": 4180 }, { "epoch": 0.8776238455079765, "grad_norm": 0.12802265584468842, "learning_rate": 4.494215618955061e-06, "loss": 0.1486, "step": 4181 }, { "epoch": 0.8778337531486146, "grad_norm": 0.11089033633470535, "learning_rate": 4.479045517789088e-06, "loss": 0.156, "step": 4182 }, { "epoch": 0.8780436607892528, "grad_norm": 0.15606744587421417, "learning_rate": 4.463899862415483e-06, "loss": 0.1572, "step": 4183 }, { "epoch": 0.8782535684298909, "grad_norm": 0.11626232415437698, "learning_rate": 4.448778660967795e-06, "loss": 0.1608, "step": 4184 }, { "epoch": 0.878463476070529, "grad_norm": 0.13428574800491333, "learning_rate": 4.433681921566474e-06, "loss": 0.1445, "step": 4185 }, { "epoch": 0.8786733837111671, "grad_norm": 0.11833200603723526, "learning_rate": 4.418609652318817e-06, "loss": 0.1372, "step": 4186 }, { "epoch": 0.8788832913518052, "grad_norm": 0.12063129991292953, "learning_rate": 4.4035618613189444e-06, "loss": 0.1723, "step": 4187 }, { "epoch": 0.8790931989924433, "grad_norm": 0.11937091499567032, "learning_rate": 4.388538556647897e-06, "loss": 0.135, "step": 4188 }, { "epoch": 0.8793031066330814, "grad_norm": 0.1174740195274353, "learning_rate": 4.3735397463735175e-06, "loss": 0.1317, "step": 4189 }, { "epoch": 0.8795130142737195, "grad_norm": 0.1225159540772438, "learning_rate": 4.358565438550516e-06, "loss": 0.162, "step": 4190 }, { "epoch": 0.8797229219143576, "grad_norm": 0.11938232183456421, "learning_rate": 4.343615641220433e-06, "loss": 0.1599, "step": 4191 }, { "epoch": 0.8799328295549959, "grad_norm": 0.10696382075548172, "learning_rate": 4.328690362411647e-06, "loss": 0.148, "step": 4192 }, { "epoch": 0.880142737195634, "grad_norm": 0.15508660674095154, "learning_rate": 4.313789610139396e-06, "loss": 0.1693, "step": 4193 }, { "epoch": 0.8803526448362721, "grad_norm": 0.12078120559453964, "learning_rate": 4.298913392405702e-06, "loss": 0.1331, "step": 4194 }, { "epoch": 0.8805625524769102, "grad_norm": 0.1250450164079666, "learning_rate": 4.284061717199433e-06, "loss": 0.1589, "step": 4195 }, { "epoch": 0.8807724601175483, "grad_norm": 0.13361555337905884, "learning_rate": 4.26923459249628e-06, "loss": 0.1607, "step": 4196 }, { "epoch": 0.8809823677581864, "grad_norm": 0.13139186799526215, "learning_rate": 4.254432026258759e-06, "loss": 0.1457, "step": 4197 }, { "epoch": 0.8811922753988245, "grad_norm": 0.11095529049634933, "learning_rate": 4.23965402643618e-06, "loss": 0.1487, "step": 4198 }, { "epoch": 0.8814021830394626, "grad_norm": 0.11550076305866241, "learning_rate": 4.224900600964665e-06, "loss": 0.1418, "step": 4199 }, { "epoch": 0.8816120906801007, "grad_norm": 0.11938626319169998, "learning_rate": 4.210171757767145e-06, "loss": 0.1607, "step": 4200 }, { "epoch": 0.8818219983207388, "grad_norm": 0.11363077908754349, "learning_rate": 4.1954675047533365e-06, "loss": 0.1495, "step": 4201 }, { "epoch": 0.882031905961377, "grad_norm": 0.1299435794353485, "learning_rate": 4.180787849819767e-06, "loss": 0.1479, "step": 4202 }, { "epoch": 0.8822418136020151, "grad_norm": 0.11774297058582306, "learning_rate": 4.166132800849743e-06, "loss": 0.1397, "step": 4203 }, { "epoch": 0.8824517212426533, "grad_norm": 0.11965102702379227, "learning_rate": 4.151502365713356e-06, "loss": 0.157, "step": 4204 }, { "epoch": 0.8826616288832914, "grad_norm": 0.1356746107339859, "learning_rate": 4.136896552267494e-06, "loss": 0.1574, "step": 4205 }, { "epoch": 0.8828715365239295, "grad_norm": 0.12477898597717285, "learning_rate": 4.122315368355812e-06, "loss": 0.1287, "step": 4206 }, { "epoch": 0.8830814441645676, "grad_norm": 0.12956759333610535, "learning_rate": 4.107758821808738e-06, "loss": 0.1412, "step": 4207 }, { "epoch": 0.8832913518052057, "grad_norm": 0.12473218142986298, "learning_rate": 4.093226920443449e-06, "loss": 0.1597, "step": 4208 }, { "epoch": 0.8835012594458438, "grad_norm": 0.12124751508235931, "learning_rate": 4.0787196720639355e-06, "loss": 0.1483, "step": 4209 }, { "epoch": 0.8837111670864819, "grad_norm": 0.12192367017269135, "learning_rate": 4.064237084460915e-06, "loss": 0.1628, "step": 4210 }, { "epoch": 0.8839210747271201, "grad_norm": 0.113349050283432, "learning_rate": 4.04977916541186e-06, "loss": 0.137, "step": 4211 }, { "epoch": 0.8841309823677582, "grad_norm": 0.12009799480438232, "learning_rate": 4.035345922681e-06, "loss": 0.1451, "step": 4212 }, { "epoch": 0.8843408900083963, "grad_norm": 0.13234446942806244, "learning_rate": 4.020937364019328e-06, "loss": 0.1671, "step": 4213 }, { "epoch": 0.8845507976490344, "grad_norm": 0.11510760337114334, "learning_rate": 4.006553497164578e-06, "loss": 0.144, "step": 4214 }, { "epoch": 0.8847607052896725, "grad_norm": 0.1241830363869667, "learning_rate": 3.992194329841192e-06, "loss": 0.1509, "step": 4215 }, { "epoch": 0.8849706129303107, "grad_norm": 0.12140922993421555, "learning_rate": 3.977859869760375e-06, "loss": 0.1665, "step": 4216 }, { "epoch": 0.8851805205709488, "grad_norm": 0.11366656422615051, "learning_rate": 3.963550124620069e-06, "loss": 0.1619, "step": 4217 }, { "epoch": 0.8853904282115869, "grad_norm": 0.1194479912519455, "learning_rate": 3.949265102104932e-06, "loss": 0.1564, "step": 4218 }, { "epoch": 0.885600335852225, "grad_norm": 0.11060012876987457, "learning_rate": 3.935004809886344e-06, "loss": 0.1634, "step": 4219 }, { "epoch": 0.8858102434928632, "grad_norm": 0.12809467315673828, "learning_rate": 3.920769255622409e-06, "loss": 0.1583, "step": 4220 }, { "epoch": 0.8860201511335013, "grad_norm": 0.12603220343589783, "learning_rate": 3.906558446957947e-06, "loss": 0.1392, "step": 4221 }, { "epoch": 0.8862300587741394, "grad_norm": 0.19363883137702942, "learning_rate": 3.892372391524479e-06, "loss": 0.1511, "step": 4222 }, { "epoch": 0.8864399664147775, "grad_norm": 0.12570086121559143, "learning_rate": 3.878211096940254e-06, "loss": 0.1607, "step": 4223 }, { "epoch": 0.8866498740554156, "grad_norm": 0.17181172966957092, "learning_rate": 3.8640745708101996e-06, "loss": 0.1587, "step": 4224 }, { "epoch": 0.8868597816960537, "grad_norm": 0.11868228018283844, "learning_rate": 3.849962820725949e-06, "loss": 0.163, "step": 4225 }, { "epoch": 0.8870696893366918, "grad_norm": 0.12187950313091278, "learning_rate": 3.835875854265847e-06, "loss": 0.1495, "step": 4226 }, { "epoch": 0.8872795969773299, "grad_norm": 0.12567535042762756, "learning_rate": 3.8218136789949175e-06, "loss": 0.1573, "step": 4227 }, { "epoch": 0.887489504617968, "grad_norm": 0.1315799057483673, "learning_rate": 3.8077763024648606e-06, "loss": 0.1643, "step": 4228 }, { "epoch": 0.8876994122586063, "grad_norm": 0.11614145338535309, "learning_rate": 3.793763732214073e-06, "loss": 0.1656, "step": 4229 }, { "epoch": 0.8879093198992444, "grad_norm": 0.13281457126140594, "learning_rate": 3.779775975767619e-06, "loss": 0.1591, "step": 4230 }, { "epoch": 0.8881192275398825, "grad_norm": 0.10211368650197983, "learning_rate": 3.7658130406372503e-06, "loss": 0.1517, "step": 4231 }, { "epoch": 0.8883291351805206, "grad_norm": 0.10663280636072159, "learning_rate": 3.751874934321381e-06, "loss": 0.1539, "step": 4232 }, { "epoch": 0.8885390428211587, "grad_norm": 0.12947557866573334, "learning_rate": 3.7379616643050842e-06, "loss": 0.157, "step": 4233 }, { "epoch": 0.8887489504617968, "grad_norm": 0.10921616107225418, "learning_rate": 3.7240732380601162e-06, "loss": 0.1516, "step": 4234 }, { "epoch": 0.8889588581024349, "grad_norm": 0.1422072798013687, "learning_rate": 3.7102096630448835e-06, "loss": 0.1566, "step": 4235 }, { "epoch": 0.889168765743073, "grad_norm": 0.19969822466373444, "learning_rate": 3.6963709467044194e-06, "loss": 0.1541, "step": 4236 }, { "epoch": 0.8893786733837111, "grad_norm": 0.11462516337633133, "learning_rate": 3.6825570964704403e-06, "loss": 0.1422, "step": 4237 }, { "epoch": 0.8895885810243492, "grad_norm": 0.13612455129623413, "learning_rate": 3.6687681197613124e-06, "loss": 0.1509, "step": 4238 }, { "epoch": 0.8897984886649875, "grad_norm": 0.1351628452539444, "learning_rate": 3.655004023982017e-06, "loss": 0.1414, "step": 4239 }, { "epoch": 0.8900083963056256, "grad_norm": 0.13770541548728943, "learning_rate": 3.6412648165241923e-06, "loss": 0.1422, "step": 4240 }, { "epoch": 0.8902183039462637, "grad_norm": 0.13861404359340668, "learning_rate": 3.627550504766103e-06, "loss": 0.1543, "step": 4241 }, { "epoch": 0.8904282115869018, "grad_norm": 0.13922984898090363, "learning_rate": 3.6138610960726625e-06, "loss": 0.1676, "step": 4242 }, { "epoch": 0.8906381192275399, "grad_norm": 0.13794945180416107, "learning_rate": 3.60019659779538e-06, "loss": 0.1644, "step": 4243 }, { "epoch": 0.890848026868178, "grad_norm": 0.11094696819782257, "learning_rate": 3.586557017272413e-06, "loss": 0.1504, "step": 4244 }, { "epoch": 0.8910579345088161, "grad_norm": 0.12312065064907074, "learning_rate": 3.5729423618285074e-06, "loss": 0.1493, "step": 4245 }, { "epoch": 0.8912678421494542, "grad_norm": 0.12740106880664825, "learning_rate": 3.559352638775071e-06, "loss": 0.1575, "step": 4246 }, { "epoch": 0.8914777497900923, "grad_norm": 0.11770659685134888, "learning_rate": 3.545787855410082e-06, "loss": 0.152, "step": 4247 }, { "epoch": 0.8916876574307305, "grad_norm": 0.1185218021273613, "learning_rate": 3.5322480190181473e-06, "loss": 0.1632, "step": 4248 }, { "epoch": 0.8918975650713686, "grad_norm": 0.12321362644433975, "learning_rate": 3.5187331368704557e-06, "loss": 0.1471, "step": 4249 }, { "epoch": 0.8921074727120067, "grad_norm": 0.11762972176074982, "learning_rate": 3.505243216224818e-06, "loss": 0.1572, "step": 4250 }, { "epoch": 0.8923173803526449, "grad_norm": 0.10275141894817352, "learning_rate": 3.4917782643256237e-06, "loss": 0.1376, "step": 4251 }, { "epoch": 0.892527287993283, "grad_norm": 0.13671152293682098, "learning_rate": 3.478338288403865e-06, "loss": 0.1674, "step": 4252 }, { "epoch": 0.8927371956339211, "grad_norm": 0.13780957460403442, "learning_rate": 3.464923295677103e-06, "loss": 0.1413, "step": 4253 }, { "epoch": 0.8929471032745592, "grad_norm": 0.130636528134346, "learning_rate": 3.451533293349507e-06, "loss": 0.1513, "step": 4254 }, { "epoch": 0.8931570109151973, "grad_norm": 0.10833965986967087, "learning_rate": 3.438168288611826e-06, "loss": 0.1485, "step": 4255 }, { "epoch": 0.8933669185558354, "grad_norm": 0.12039763480424881, "learning_rate": 3.4248282886413517e-06, "loss": 0.1308, "step": 4256 }, { "epoch": 0.8935768261964736, "grad_norm": 0.12278316169977188, "learning_rate": 3.411513300601965e-06, "loss": 0.1373, "step": 4257 }, { "epoch": 0.8937867338371117, "grad_norm": 0.1298147439956665, "learning_rate": 3.39822333164414e-06, "loss": 0.1633, "step": 4258 }, { "epoch": 0.8939966414777498, "grad_norm": 0.11132081598043442, "learning_rate": 3.3849583889048853e-06, "loss": 0.1608, "step": 4259 }, { "epoch": 0.8942065491183879, "grad_norm": 0.12436234951019287, "learning_rate": 3.371718479507774e-06, "loss": 0.1603, "step": 4260 }, { "epoch": 0.894416456759026, "grad_norm": 0.12160570919513702, "learning_rate": 3.358503610562941e-06, "loss": 0.1515, "step": 4261 }, { "epoch": 0.8946263643996641, "grad_norm": 0.10866683721542358, "learning_rate": 3.3453137891670704e-06, "loss": 0.1364, "step": 4262 }, { "epoch": 0.8948362720403022, "grad_norm": 0.11041587591171265, "learning_rate": 3.3321490224034024e-06, "loss": 0.1435, "step": 4263 }, { "epoch": 0.8950461796809404, "grad_norm": 0.1161423921585083, "learning_rate": 3.3190093173417203e-06, "loss": 0.1515, "step": 4264 }, { "epoch": 0.8952560873215785, "grad_norm": 0.130842387676239, "learning_rate": 3.3058946810383406e-06, "loss": 0.161, "step": 4265 }, { "epoch": 0.8954659949622166, "grad_norm": 0.15605497360229492, "learning_rate": 3.2928051205361134e-06, "loss": 0.133, "step": 4266 }, { "epoch": 0.8956759026028548, "grad_norm": 0.10918981581926346, "learning_rate": 3.2797406428644483e-06, "loss": 0.1506, "step": 4267 }, { "epoch": 0.8958858102434929, "grad_norm": 0.11886347085237503, "learning_rate": 3.266701255039267e-06, "loss": 0.1506, "step": 4268 }, { "epoch": 0.896095717884131, "grad_norm": 0.12514930963516235, "learning_rate": 3.2536869640630074e-06, "loss": 0.1512, "step": 4269 }, { "epoch": 0.8963056255247691, "grad_norm": 0.14683309197425842, "learning_rate": 3.2406977769246505e-06, "loss": 0.1536, "step": 4270 }, { "epoch": 0.8965155331654072, "grad_norm": 0.13536719977855682, "learning_rate": 3.227733700599683e-06, "loss": 0.162, "step": 4271 }, { "epoch": 0.8967254408060453, "grad_norm": 0.11566172540187836, "learning_rate": 3.214794742050109e-06, "loss": 0.1523, "step": 4272 }, { "epoch": 0.8969353484466834, "grad_norm": 0.13411453366279602, "learning_rate": 3.201880908224447e-06, "loss": 0.1476, "step": 4273 }, { "epoch": 0.8971452560873215, "grad_norm": 0.1259935200214386, "learning_rate": 3.1889922060577115e-06, "loss": 0.1567, "step": 4274 }, { "epoch": 0.8973551637279596, "grad_norm": 0.10426774621009827, "learning_rate": 3.176128642471443e-06, "loss": 0.1411, "step": 4275 }, { "epoch": 0.8975650713685979, "grad_norm": 0.11820261180400848, "learning_rate": 3.163290224373672e-06, "loss": 0.1548, "step": 4276 }, { "epoch": 0.897774979009236, "grad_norm": 0.12722282111644745, "learning_rate": 3.15047695865891e-06, "loss": 0.1411, "step": 4277 }, { "epoch": 0.8979848866498741, "grad_norm": 0.11995641887187958, "learning_rate": 3.137688852208165e-06, "loss": 0.1528, "step": 4278 }, { "epoch": 0.8981947942905122, "grad_norm": 0.1154097393155098, "learning_rate": 3.124925911888971e-06, "loss": 0.1593, "step": 4279 }, { "epoch": 0.8984047019311503, "grad_norm": 0.15159837901592255, "learning_rate": 3.1121881445552947e-06, "loss": 0.1753, "step": 4280 }, { "epoch": 0.8986146095717884, "grad_norm": 0.11547110974788666, "learning_rate": 3.099475557047621e-06, "loss": 0.1609, "step": 4281 }, { "epoch": 0.8988245172124265, "grad_norm": 0.13358214497566223, "learning_rate": 3.0867881561928826e-06, "loss": 0.1488, "step": 4282 }, { "epoch": 0.8990344248530646, "grad_norm": 0.14517351984977722, "learning_rate": 3.0741259488045293e-06, "loss": 0.1549, "step": 4283 }, { "epoch": 0.8992443324937027, "grad_norm": 0.14740052819252014, "learning_rate": 3.061488941682439e-06, "loss": 0.1639, "step": 4284 }, { "epoch": 0.8994542401343409, "grad_norm": 0.11952229589223862, "learning_rate": 3.0488771416129767e-06, "loss": 0.1698, "step": 4285 }, { "epoch": 0.899664147774979, "grad_norm": 0.12098808586597443, "learning_rate": 3.0362905553689538e-06, "loss": 0.162, "step": 4286 }, { "epoch": 0.8998740554156172, "grad_norm": 0.11084499210119247, "learning_rate": 3.0237291897096788e-06, "loss": 0.1532, "step": 4287 }, { "epoch": 0.9000839630562553, "grad_norm": 0.12836486101150513, "learning_rate": 3.0111930513808785e-06, "loss": 0.1651, "step": 4288 }, { "epoch": 0.9002938706968934, "grad_norm": 0.14764492213726044, "learning_rate": 2.998682147114751e-06, "loss": 0.178, "step": 4289 }, { "epoch": 0.9005037783375315, "grad_norm": 0.1267375499010086, "learning_rate": 2.986196483629933e-06, "loss": 0.14, "step": 4290 }, { "epoch": 0.9007136859781696, "grad_norm": 0.1228199303150177, "learning_rate": 2.9737360676315106e-06, "loss": 0.1696, "step": 4291 }, { "epoch": 0.9009235936188077, "grad_norm": 0.1166803389787674, "learning_rate": 2.961300905811015e-06, "loss": 0.1489, "step": 4292 }, { "epoch": 0.9011335012594458, "grad_norm": 0.14002828299999237, "learning_rate": 2.94889100484641e-06, "loss": 0.168, "step": 4293 }, { "epoch": 0.9013434089000839, "grad_norm": 0.14958305656909943, "learning_rate": 2.9365063714020866e-06, "loss": 0.1609, "step": 4294 }, { "epoch": 0.9015533165407221, "grad_norm": 0.1365419179201126, "learning_rate": 2.924147012128897e-06, "loss": 0.1389, "step": 4295 }, { "epoch": 0.9017632241813602, "grad_norm": 0.1232651099562645, "learning_rate": 2.9118129336640874e-06, "loss": 0.1676, "step": 4296 }, { "epoch": 0.9019731318219983, "grad_norm": 0.10802242159843445, "learning_rate": 2.8995041426313486e-06, "loss": 0.1411, "step": 4297 }, { "epoch": 0.9021830394626364, "grad_norm": 0.10859952867031097, "learning_rate": 2.887220645640759e-06, "loss": 0.1423, "step": 4298 }, { "epoch": 0.9023929471032746, "grad_norm": 0.12386050075292587, "learning_rate": 2.8749624492888595e-06, "loss": 0.1526, "step": 4299 }, { "epoch": 0.9026028547439127, "grad_norm": 0.11284506320953369, "learning_rate": 2.862729560158578e-06, "loss": 0.1698, "step": 4300 }, { "epoch": 0.9028127623845508, "grad_norm": 0.13383999466896057, "learning_rate": 2.8505219848192488e-06, "loss": 0.1741, "step": 4301 }, { "epoch": 0.9030226700251889, "grad_norm": 0.09913118928670883, "learning_rate": 2.838339729826617e-06, "loss": 0.1575, "step": 4302 }, { "epoch": 0.903232577665827, "grad_norm": 0.14457812905311584, "learning_rate": 2.8261828017228375e-06, "loss": 0.1596, "step": 4303 }, { "epoch": 0.9034424853064652, "grad_norm": 0.12830603122711182, "learning_rate": 2.8140512070364665e-06, "loss": 0.1578, "step": 4304 }, { "epoch": 0.9036523929471033, "grad_norm": 0.11501707136631012, "learning_rate": 2.801944952282437e-06, "loss": 0.1354, "step": 4305 }, { "epoch": 0.9038623005877414, "grad_norm": 0.1554587483406067, "learning_rate": 2.7898640439620815e-06, "loss": 0.1448, "step": 4306 }, { "epoch": 0.9040722082283795, "grad_norm": 0.10302325338125229, "learning_rate": 2.7778084885631217e-06, "loss": 0.1709, "step": 4307 }, { "epoch": 0.9042821158690176, "grad_norm": 0.11777830123901367, "learning_rate": 2.7657782925596786e-06, "loss": 0.1542, "step": 4308 }, { "epoch": 0.9044920235096557, "grad_norm": 0.12227735668420792, "learning_rate": 2.7537734624122402e-06, "loss": 0.1687, "step": 4309 }, { "epoch": 0.9047019311502938, "grad_norm": 0.12648968398571014, "learning_rate": 2.741794004567666e-06, "loss": 0.1573, "step": 4310 }, { "epoch": 0.904911838790932, "grad_norm": 0.12902848422527313, "learning_rate": 2.7298399254592045e-06, "loss": 0.1565, "step": 4311 }, { "epoch": 0.9051217464315701, "grad_norm": 0.11057978123426437, "learning_rate": 2.7179112315064703e-06, "loss": 0.1469, "step": 4312 }, { "epoch": 0.9053316540722083, "grad_norm": 0.13053567707538605, "learning_rate": 2.7060079291154394e-06, "loss": 0.1511, "step": 4313 }, { "epoch": 0.9055415617128464, "grad_norm": 0.13333049416542053, "learning_rate": 2.6941300246784597e-06, "loss": 0.1542, "step": 4314 }, { "epoch": 0.9057514693534845, "grad_norm": 0.12608638405799866, "learning_rate": 2.6822775245742336e-06, "loss": 0.1535, "step": 4315 }, { "epoch": 0.9059613769941226, "grad_norm": 0.12417828291654587, "learning_rate": 2.6704504351678315e-06, "loss": 0.1688, "step": 4316 }, { "epoch": 0.9061712846347607, "grad_norm": 0.15067745745182037, "learning_rate": 2.658648762810667e-06, "loss": 0.1694, "step": 4317 }, { "epoch": 0.9063811922753988, "grad_norm": 0.1259569376707077, "learning_rate": 2.6468725138405148e-06, "loss": 0.1548, "step": 4318 }, { "epoch": 0.9065910999160369, "grad_norm": 0.129777729511261, "learning_rate": 2.6351216945814717e-06, "loss": 0.1444, "step": 4319 }, { "epoch": 0.906801007556675, "grad_norm": 0.11254890263080597, "learning_rate": 2.623396311344012e-06, "loss": 0.1666, "step": 4320 }, { "epoch": 0.9070109151973131, "grad_norm": 0.11656245589256287, "learning_rate": 2.611696370424932e-06, "loss": 0.1517, "step": 4321 }, { "epoch": 0.9072208228379512, "grad_norm": 0.1227530986070633, "learning_rate": 2.6000218781073615e-06, "loss": 0.1488, "step": 4322 }, { "epoch": 0.9074307304785895, "grad_norm": 0.14218954741954803, "learning_rate": 2.5883728406607634e-06, "loss": 0.1356, "step": 4323 }, { "epoch": 0.9076406381192276, "grad_norm": 0.13774141669273376, "learning_rate": 2.576749264340955e-06, "loss": 0.1685, "step": 4324 }, { "epoch": 0.9078505457598657, "grad_norm": 0.12770342826843262, "learning_rate": 2.5651511553900663e-06, "loss": 0.1664, "step": 4325 }, { "epoch": 0.9080604534005038, "grad_norm": 0.12919233739376068, "learning_rate": 2.5535785200365203e-06, "loss": 0.1515, "step": 4326 }, { "epoch": 0.9082703610411419, "grad_norm": 0.13036112487316132, "learning_rate": 2.5420313644950957e-06, "loss": 0.155, "step": 4327 }, { "epoch": 0.90848026868178, "grad_norm": 0.12815465033054352, "learning_rate": 2.5305096949668872e-06, "loss": 0.1449, "step": 4328 }, { "epoch": 0.9086901763224181, "grad_norm": 0.1162593811750412, "learning_rate": 2.5190135176392902e-06, "loss": 0.1529, "step": 4329 }, { "epoch": 0.9089000839630562, "grad_norm": 0.11832818388938904, "learning_rate": 2.507542838686022e-06, "loss": 0.161, "step": 4330 }, { "epoch": 0.9091099916036943, "grad_norm": 0.1431998759508133, "learning_rate": 2.496097664267083e-06, "loss": 0.1733, "step": 4331 }, { "epoch": 0.9093198992443325, "grad_norm": 0.1280035823583603, "learning_rate": 2.484678000528806e-06, "loss": 0.1432, "step": 4332 }, { "epoch": 0.9095298068849706, "grad_norm": 0.13008855283260345, "learning_rate": 2.473283853603808e-06, "loss": 0.1487, "step": 4333 }, { "epoch": 0.9097397145256088, "grad_norm": 0.12129084020853043, "learning_rate": 2.461915229611006e-06, "loss": 0.1565, "step": 4334 }, { "epoch": 0.9099496221662469, "grad_norm": 0.13218483328819275, "learning_rate": 2.450572134655604e-06, "loss": 0.1539, "step": 4335 }, { "epoch": 0.910159529806885, "grad_norm": 0.12636138498783112, "learning_rate": 2.4392545748291197e-06, "loss": 0.1521, "step": 4336 }, { "epoch": 0.9103694374475231, "grad_norm": 0.13400456309318542, "learning_rate": 2.4279625562093234e-06, "loss": 0.1657, "step": 4337 }, { "epoch": 0.9105793450881612, "grad_norm": 0.11741022765636444, "learning_rate": 2.4166960848603037e-06, "loss": 0.1578, "step": 4338 }, { "epoch": 0.9107892527287993, "grad_norm": 0.10905701667070389, "learning_rate": 2.4054551668323986e-06, "loss": 0.1492, "step": 4339 }, { "epoch": 0.9109991603694374, "grad_norm": 0.1346195489168167, "learning_rate": 2.394239808162241e-06, "loss": 0.1383, "step": 4340 }, { "epoch": 0.9112090680100756, "grad_norm": 0.13491378724575043, "learning_rate": 2.38305001487274e-06, "loss": 0.1678, "step": 4341 }, { "epoch": 0.9114189756507137, "grad_norm": 0.12403254956007004, "learning_rate": 2.371885792973072e-06, "loss": 0.154, "step": 4342 }, { "epoch": 0.9116288832913518, "grad_norm": 0.11544390022754669, "learning_rate": 2.3607471484586697e-06, "loss": 0.1582, "step": 4343 }, { "epoch": 0.9118387909319899, "grad_norm": 0.11051946133375168, "learning_rate": 2.3496340873112368e-06, "loss": 0.1465, "step": 4344 }, { "epoch": 0.912048698572628, "grad_norm": 0.12439233809709549, "learning_rate": 2.3385466154987655e-06, "loss": 0.1463, "step": 4345 }, { "epoch": 0.9122586062132662, "grad_norm": 0.10868985950946808, "learning_rate": 2.327484738975455e-06, "loss": 0.1615, "step": 4346 }, { "epoch": 0.9124685138539043, "grad_norm": 0.11184863746166229, "learning_rate": 2.3164484636817973e-06, "loss": 0.1413, "step": 4347 }, { "epoch": 0.9126784214945424, "grad_norm": 0.13447201251983643, "learning_rate": 2.305437795544513e-06, "loss": 0.1658, "step": 4348 }, { "epoch": 0.9128883291351805, "grad_norm": 0.11348041892051697, "learning_rate": 2.294452740476599e-06, "loss": 0.1634, "step": 4349 }, { "epoch": 0.9130982367758187, "grad_norm": 0.11615937948226929, "learning_rate": 2.2834933043772766e-06, "loss": 0.1795, "step": 4350 }, { "epoch": 0.9133081444164568, "grad_norm": 0.1186215952038765, "learning_rate": 2.2725594931320037e-06, "loss": 0.1595, "step": 4351 }, { "epoch": 0.9135180520570949, "grad_norm": 0.11456868052482605, "learning_rate": 2.2616513126125004e-06, "loss": 0.1648, "step": 4352 }, { "epoch": 0.913727959697733, "grad_norm": 0.10546039789915085, "learning_rate": 2.2507687686766975e-06, "loss": 0.1625, "step": 4353 }, { "epoch": 0.9139378673383711, "grad_norm": 0.13526315987110138, "learning_rate": 2.239911867168776e-06, "loss": 0.1654, "step": 4354 }, { "epoch": 0.9141477749790092, "grad_norm": 0.1123739704489708, "learning_rate": 2.2290806139191377e-06, "loss": 0.1558, "step": 4355 }, { "epoch": 0.9143576826196473, "grad_norm": 0.1591699868440628, "learning_rate": 2.2182750147444033e-06, "loss": 0.1424, "step": 4356 }, { "epoch": 0.9145675902602854, "grad_norm": 0.11519675701856613, "learning_rate": 2.2074950754474465e-06, "loss": 0.1592, "step": 4357 }, { "epoch": 0.9147774979009236, "grad_norm": 0.1345817595720291, "learning_rate": 2.1967408018173264e-06, "loss": 0.1672, "step": 4358 }, { "epoch": 0.9149874055415617, "grad_norm": 0.12809444963932037, "learning_rate": 2.1860121996293504e-06, "loss": 0.167, "step": 4359 }, { "epoch": 0.9151973131821999, "grad_norm": 0.11888294667005539, "learning_rate": 2.1753092746449933e-06, "loss": 0.1632, "step": 4360 }, { "epoch": 0.915407220822838, "grad_norm": 0.1209503635764122, "learning_rate": 2.1646320326119908e-06, "loss": 0.1538, "step": 4361 }, { "epoch": 0.9156171284634761, "grad_norm": 0.1321111023426056, "learning_rate": 2.153980479264267e-06, "loss": 0.1638, "step": 4362 }, { "epoch": 0.9158270361041142, "grad_norm": 0.1377587914466858, "learning_rate": 2.143354620321947e-06, "loss": 0.1631, "step": 4363 }, { "epoch": 0.9160369437447523, "grad_norm": 0.13363386690616608, "learning_rate": 2.132754461491343e-06, "loss": 0.1479, "step": 4364 }, { "epoch": 0.9162468513853904, "grad_norm": 0.12408062070608139, "learning_rate": 2.122180008465008e-06, "loss": 0.1529, "step": 4365 }, { "epoch": 0.9164567590260285, "grad_norm": 0.1353202611207962, "learning_rate": 2.11163126692166e-06, "loss": 0.1424, "step": 4366 }, { "epoch": 0.9166666666666666, "grad_norm": 0.12677113711833954, "learning_rate": 2.1011082425262007e-06, "loss": 0.1504, "step": 4367 }, { "epoch": 0.9168765743073047, "grad_norm": 0.11069560050964355, "learning_rate": 2.090610940929738e-06, "loss": 0.1541, "step": 4368 }, { "epoch": 0.917086481947943, "grad_norm": 0.11136747896671295, "learning_rate": 2.080139367769568e-06, "loss": 0.1599, "step": 4369 }, { "epoch": 0.9172963895885811, "grad_norm": 0.10797339677810669, "learning_rate": 2.0696935286691697e-06, "loss": 0.154, "step": 4370 }, { "epoch": 0.9175062972292192, "grad_norm": 0.1661231368780136, "learning_rate": 2.0592734292381844e-06, "loss": 0.1599, "step": 4371 }, { "epoch": 0.9177162048698573, "grad_norm": 0.11923778802156448, "learning_rate": 2.048879075072452e-06, "loss": 0.1685, "step": 4372 }, { "epoch": 0.9179261125104954, "grad_norm": 0.12219996005296707, "learning_rate": 2.038510471753979e-06, "loss": 0.1592, "step": 4373 }, { "epoch": 0.9181360201511335, "grad_norm": 0.1292019933462143, "learning_rate": 2.028167624850935e-06, "loss": 0.1608, "step": 4374 }, { "epoch": 0.9183459277917716, "grad_norm": 0.11530636996030807, "learning_rate": 2.0178505399176704e-06, "loss": 0.1483, "step": 4375 }, { "epoch": 0.9185558354324097, "grad_norm": 0.11238705366849899, "learning_rate": 2.007559222494687e-06, "loss": 0.1477, "step": 4376 }, { "epoch": 0.9187657430730478, "grad_norm": 0.11351519078016281, "learning_rate": 1.9972936781086693e-06, "loss": 0.1593, "step": 4377 }, { "epoch": 0.918975650713686, "grad_norm": 0.1325710266828537, "learning_rate": 1.987053912272446e-06, "loss": 0.142, "step": 4378 }, { "epoch": 0.9191855583543241, "grad_norm": 0.11840063333511353, "learning_rate": 1.976839930485008e-06, "loss": 0.1433, "step": 4379 }, { "epoch": 0.9193954659949622, "grad_norm": 0.114000603556633, "learning_rate": 1.9666517382314897e-06, "loss": 0.1491, "step": 4380 }, { "epoch": 0.9196053736356004, "grad_norm": 0.10446514934301376, "learning_rate": 1.956489340983181e-06, "loss": 0.1572, "step": 4381 }, { "epoch": 0.9198152812762385, "grad_norm": 0.11224140971899033, "learning_rate": 1.9463527441975325e-06, "loss": 0.1389, "step": 4382 }, { "epoch": 0.9200251889168766, "grad_norm": 0.13361458480358124, "learning_rate": 1.9362419533181187e-06, "loss": 0.1628, "step": 4383 }, { "epoch": 0.9202350965575147, "grad_norm": 0.11145322024822235, "learning_rate": 1.9261569737746675e-06, "loss": 0.1533, "step": 4384 }, { "epoch": 0.9204450041981528, "grad_norm": 0.09477204829454422, "learning_rate": 1.916097810983042e-06, "loss": 0.1589, "step": 4385 }, { "epoch": 0.9206549118387909, "grad_norm": 0.12341588735580444, "learning_rate": 1.906064470345248e-06, "loss": 0.1664, "step": 4386 }, { "epoch": 0.920864819479429, "grad_norm": 0.11354007571935654, "learning_rate": 1.8960569572494203e-06, "loss": 0.151, "step": 4387 }, { "epoch": 0.9210747271200672, "grad_norm": 0.10435278713703156, "learning_rate": 1.8860752770698155e-06, "loss": 0.1529, "step": 4388 }, { "epoch": 0.9212846347607053, "grad_norm": 0.11676231026649475, "learning_rate": 1.8761194351668176e-06, "loss": 0.1622, "step": 4389 }, { "epoch": 0.9214945424013434, "grad_norm": 0.11899908632040024, "learning_rate": 1.8661894368869505e-06, "loss": 0.1299, "step": 4390 }, { "epoch": 0.9217044500419815, "grad_norm": 0.12129631638526917, "learning_rate": 1.8562852875628534e-06, "loss": 0.1408, "step": 4391 }, { "epoch": 0.9219143576826196, "grad_norm": 0.12856750190258026, "learning_rate": 1.846406992513272e-06, "loss": 0.1555, "step": 4392 }, { "epoch": 0.9221242653232578, "grad_norm": 0.11216092109680176, "learning_rate": 1.8365545570430798e-06, "loss": 0.1479, "step": 4393 }, { "epoch": 0.9223341729638959, "grad_norm": 0.11643592268228531, "learning_rate": 1.8267279864432662e-06, "loss": 0.137, "step": 4394 }, { "epoch": 0.922544080604534, "grad_norm": 0.11467355489730835, "learning_rate": 1.8169272859909158e-06, "loss": 0.1541, "step": 4395 }, { "epoch": 0.9227539882451721, "grad_norm": 0.11284631490707397, "learning_rate": 1.8071524609492295e-06, "loss": 0.1328, "step": 4396 }, { "epoch": 0.9229638958858103, "grad_norm": 0.12459743767976761, "learning_rate": 1.797403516567503e-06, "loss": 0.1555, "step": 4397 }, { "epoch": 0.9231738035264484, "grad_norm": 0.13417114317417145, "learning_rate": 1.7876804580811645e-06, "loss": 0.1484, "step": 4398 }, { "epoch": 0.9233837111670865, "grad_norm": 0.11709386855363846, "learning_rate": 1.7779832907117045e-06, "loss": 0.1446, "step": 4399 }, { "epoch": 0.9235936188077246, "grad_norm": 0.14955921471118927, "learning_rate": 1.7683120196667235e-06, "loss": 0.185, "step": 4400 }, { "epoch": 0.9238035264483627, "grad_norm": 0.12120398879051208, "learning_rate": 1.7586666501399118e-06, "loss": 0.1655, "step": 4401 }, { "epoch": 0.9240134340890008, "grad_norm": 0.12226814031600952, "learning_rate": 1.7490471873110647e-06, "loss": 0.1701, "step": 4402 }, { "epoch": 0.9242233417296389, "grad_norm": 0.11423861980438232, "learning_rate": 1.7394536363460445e-06, "loss": 0.1466, "step": 4403 }, { "epoch": 0.924433249370277, "grad_norm": 0.14197257161140442, "learning_rate": 1.7298860023968078e-06, "loss": 0.1527, "step": 4404 }, { "epoch": 0.9246431570109152, "grad_norm": 0.13661980628967285, "learning_rate": 1.720344290601389e-06, "loss": 0.158, "step": 4405 }, { "epoch": 0.9248530646515534, "grad_norm": 0.1328696757555008, "learning_rate": 1.7108285060839169e-06, "loss": 0.1617, "step": 4406 }, { "epoch": 0.9250629722921915, "grad_norm": 0.1082594245672226, "learning_rate": 1.7013386539545761e-06, "loss": 0.1529, "step": 4407 }, { "epoch": 0.9252728799328296, "grad_norm": 0.12489262223243713, "learning_rate": 1.6918747393096456e-06, "loss": 0.1524, "step": 4408 }, { "epoch": 0.9254827875734677, "grad_norm": 0.1348544806241989, "learning_rate": 1.6824367672314433e-06, "loss": 0.1536, "step": 4409 }, { "epoch": 0.9256926952141058, "grad_norm": 0.12416580319404602, "learning_rate": 1.6730247427883872e-06, "loss": 0.1535, "step": 4410 }, { "epoch": 0.9259026028547439, "grad_norm": 0.12714757025241852, "learning_rate": 1.663638671034956e-06, "loss": 0.1826, "step": 4411 }, { "epoch": 0.926112510495382, "grad_norm": 0.13926753401756287, "learning_rate": 1.6542785570116736e-06, "loss": 0.1477, "step": 4412 }, { "epoch": 0.9263224181360201, "grad_norm": 0.11886262148618698, "learning_rate": 1.6449444057451414e-06, "loss": 0.1558, "step": 4413 }, { "epoch": 0.9265323257766582, "grad_norm": 0.11909741908311844, "learning_rate": 1.6356362222479993e-06, "loss": 0.1637, "step": 4414 }, { "epoch": 0.9267422334172963, "grad_norm": 0.1119212731719017, "learning_rate": 1.6263540115189823e-06, "loss": 0.1406, "step": 4415 }, { "epoch": 0.9269521410579346, "grad_norm": 0.11529149860143661, "learning_rate": 1.6170977785428254e-06, "loss": 0.1537, "step": 4416 }, { "epoch": 0.9271620486985727, "grad_norm": 0.10490969568490982, "learning_rate": 1.6078675282903354e-06, "loss": 0.1465, "step": 4417 }, { "epoch": 0.9273719563392108, "grad_norm": 0.11506490409374237, "learning_rate": 1.5986632657183865e-06, "loss": 0.166, "step": 4418 }, { "epoch": 0.9275818639798489, "grad_norm": 0.12108638882637024, "learning_rate": 1.5894849957698632e-06, "loss": 0.1598, "step": 4419 }, { "epoch": 0.927791771620487, "grad_norm": 0.10503645241260529, "learning_rate": 1.580332723373712e-06, "loss": 0.1662, "step": 4420 }, { "epoch": 0.9280016792611251, "grad_norm": 0.12522466480731964, "learning_rate": 1.5712064534449067e-06, "loss": 0.1527, "step": 4421 }, { "epoch": 0.9282115869017632, "grad_norm": 0.12102288752794266, "learning_rate": 1.5621061908844714e-06, "loss": 0.17, "step": 4422 }, { "epoch": 0.9284214945424013, "grad_norm": 0.12997739017009735, "learning_rate": 1.5530319405794525e-06, "loss": 0.1323, "step": 4423 }, { "epoch": 0.9286314021830394, "grad_norm": 0.12769199907779694, "learning_rate": 1.5439837074029239e-06, "loss": 0.1465, "step": 4424 }, { "epoch": 0.9288413098236776, "grad_norm": 0.10771086812019348, "learning_rate": 1.5349614962139936e-06, "loss": 0.155, "step": 4425 }, { "epoch": 0.9290512174643157, "grad_norm": 0.13490477204322815, "learning_rate": 1.525965311857802e-06, "loss": 0.1665, "step": 4426 }, { "epoch": 0.9292611251049538, "grad_norm": 0.11361309885978699, "learning_rate": 1.516995159165502e-06, "loss": 0.1578, "step": 4427 }, { "epoch": 0.929471032745592, "grad_norm": 0.1189640536904335, "learning_rate": 1.5080510429542672e-06, "loss": 0.1415, "step": 4428 }, { "epoch": 0.9296809403862301, "grad_norm": 0.11884358525276184, "learning_rate": 1.499132968027306e-06, "loss": 0.1617, "step": 4429 }, { "epoch": 0.9298908480268682, "grad_norm": 0.14828267693519592, "learning_rate": 1.4902409391738091e-06, "loss": 0.1596, "step": 4430 }, { "epoch": 0.9301007556675063, "grad_norm": 0.13310547173023224, "learning_rate": 1.4813749611690075e-06, "loss": 0.1535, "step": 4431 }, { "epoch": 0.9303106633081444, "grad_norm": 0.11745025217533112, "learning_rate": 1.472535038774142e-06, "loss": 0.1728, "step": 4432 }, { "epoch": 0.9305205709487825, "grad_norm": 0.11373081803321838, "learning_rate": 1.4637211767364378e-06, "loss": 0.1595, "step": 4433 }, { "epoch": 0.9307304785894207, "grad_norm": 0.1132584884762764, "learning_rate": 1.4549333797891472e-06, "loss": 0.1439, "step": 4434 }, { "epoch": 0.9309403862300588, "grad_norm": 0.11979074031114578, "learning_rate": 1.4461716526515234e-06, "loss": 0.1511, "step": 4435 }, { "epoch": 0.9311502938706969, "grad_norm": 0.13811197876930237, "learning_rate": 1.4374360000288134e-06, "loss": 0.15, "step": 4436 }, { "epoch": 0.931360201511335, "grad_norm": 0.10705514252185822, "learning_rate": 1.428726426612248e-06, "loss": 0.1465, "step": 4437 }, { "epoch": 0.9315701091519731, "grad_norm": 0.13573043048381805, "learning_rate": 1.4200429370790802e-06, "loss": 0.1302, "step": 4438 }, { "epoch": 0.9317800167926112, "grad_norm": 0.11842697113752365, "learning_rate": 1.4113855360925354e-06, "loss": 0.148, "step": 4439 }, { "epoch": 0.9319899244332494, "grad_norm": 0.11456546932458878, "learning_rate": 1.4027542283018448e-06, "loss": 0.1573, "step": 4440 }, { "epoch": 0.9321998320738875, "grad_norm": 0.1204330176115036, "learning_rate": 1.3941490183422113e-06, "loss": 0.1516, "step": 4441 }, { "epoch": 0.9324097397145256, "grad_norm": 0.126027449965477, "learning_rate": 1.385569910834833e-06, "loss": 0.1512, "step": 4442 }, { "epoch": 0.9326196473551638, "grad_norm": 0.1196598932147026, "learning_rate": 1.3770169103868802e-06, "loss": 0.165, "step": 4443 }, { "epoch": 0.9328295549958019, "grad_norm": 0.17786267399787903, "learning_rate": 1.368490021591512e-06, "loss": 0.1337, "step": 4444 }, { "epoch": 0.93303946263644, "grad_norm": 0.1896003633737564, "learning_rate": 1.3599892490278654e-06, "loss": 0.1614, "step": 4445 }, { "epoch": 0.9332493702770781, "grad_norm": 0.1261228770017624, "learning_rate": 1.3515145972610443e-06, "loss": 0.1517, "step": 4446 }, { "epoch": 0.9334592779177162, "grad_norm": 0.12102353572845459, "learning_rate": 1.343066070842136e-06, "loss": 0.1561, "step": 4447 }, { "epoch": 0.9336691855583543, "grad_norm": 0.12365438044071198, "learning_rate": 1.334643674308189e-06, "loss": 0.165, "step": 4448 }, { "epoch": 0.9338790931989924, "grad_norm": 0.1193314641714096, "learning_rate": 1.32624741218223e-06, "loss": 0.1686, "step": 4449 }, { "epoch": 0.9340890008396305, "grad_norm": 0.11541610211133957, "learning_rate": 1.3178772889732293e-06, "loss": 0.1624, "step": 4450 }, { "epoch": 0.9342989084802686, "grad_norm": 0.13764046132564545, "learning_rate": 1.309533309176142e-06, "loss": 0.1497, "step": 4451 }, { "epoch": 0.9345088161209067, "grad_norm": 0.1194557324051857, "learning_rate": 1.3012154772718777e-06, "loss": 0.1665, "step": 4452 }, { "epoch": 0.934718723761545, "grad_norm": 0.13757576048374176, "learning_rate": 1.2929237977273023e-06, "loss": 0.1688, "step": 4453 }, { "epoch": 0.9349286314021831, "grad_norm": 0.12633179128170013, "learning_rate": 1.2846582749952317e-06, "loss": 0.1358, "step": 4454 }, { "epoch": 0.9351385390428212, "grad_norm": 0.12188289314508438, "learning_rate": 1.2764189135144377e-06, "loss": 0.1544, "step": 4455 }, { "epoch": 0.9353484466834593, "grad_norm": 0.11736936867237091, "learning_rate": 1.268205717709664e-06, "loss": 0.1607, "step": 4456 }, { "epoch": 0.9355583543240974, "grad_norm": 0.12075451761484146, "learning_rate": 1.2600186919915712e-06, "loss": 0.1577, "step": 4457 }, { "epoch": 0.9357682619647355, "grad_norm": 0.12973789870738983, "learning_rate": 1.2518578407567706e-06, "loss": 0.1482, "step": 4458 }, { "epoch": 0.9359781696053736, "grad_norm": 0.12099910527467728, "learning_rate": 1.2437231683878393e-06, "loss": 0.1564, "step": 4459 }, { "epoch": 0.9361880772460117, "grad_norm": 0.13991937041282654, "learning_rate": 1.2356146792532775e-06, "loss": 0.1357, "step": 4460 }, { "epoch": 0.9363979848866498, "grad_norm": 0.13311533629894257, "learning_rate": 1.2275323777075297e-06, "loss": 0.1652, "step": 4461 }, { "epoch": 0.936607892527288, "grad_norm": 0.11790844053030014, "learning_rate": 1.2194762680909743e-06, "loss": 0.1722, "step": 4462 }, { "epoch": 0.9368178001679262, "grad_norm": 0.1021541953086853, "learning_rate": 1.2114463547299338e-06, "loss": 0.1554, "step": 4463 }, { "epoch": 0.9370277078085643, "grad_norm": 0.1211385428905487, "learning_rate": 1.2034426419366473e-06, "loss": 0.174, "step": 4464 }, { "epoch": 0.9372376154492024, "grad_norm": 0.16256774961948395, "learning_rate": 1.1954651340092936e-06, "loss": 0.1276, "step": 4465 }, { "epoch": 0.9374475230898405, "grad_norm": 0.12904593348503113, "learning_rate": 1.1875138352319737e-06, "loss": 0.1466, "step": 4466 }, { "epoch": 0.9376574307304786, "grad_norm": 0.12402849644422531, "learning_rate": 1.179588749874716e-06, "loss": 0.1295, "step": 4467 }, { "epoch": 0.9378673383711167, "grad_norm": 0.14143963158130646, "learning_rate": 1.171689882193483e-06, "loss": 0.165, "step": 4468 }, { "epoch": 0.9380772460117548, "grad_norm": 0.1427275836467743, "learning_rate": 1.163817236430137e-06, "loss": 0.1736, "step": 4469 }, { "epoch": 0.9382871536523929, "grad_norm": 0.12246612459421158, "learning_rate": 1.1559708168124793e-06, "loss": 0.1681, "step": 4470 }, { "epoch": 0.9384970612930311, "grad_norm": 0.11412835866212845, "learning_rate": 1.1481506275541953e-06, "loss": 0.1462, "step": 4471 }, { "epoch": 0.9387069689336692, "grad_norm": 0.13155506551265717, "learning_rate": 1.1403566728549253e-06, "loss": 0.1659, "step": 4472 }, { "epoch": 0.9389168765743073, "grad_norm": 0.11218184232711792, "learning_rate": 1.1325889569001935e-06, "loss": 0.1421, "step": 4473 }, { "epoch": 0.9391267842149454, "grad_norm": 0.13464675843715668, "learning_rate": 1.1248474838614465e-06, "loss": 0.1593, "step": 4474 }, { "epoch": 0.9393366918555835, "grad_norm": 0.11895395815372467, "learning_rate": 1.1171322578960197e-06, "loss": 0.1504, "step": 4475 }, { "epoch": 0.9395465994962217, "grad_norm": 0.12074901163578033, "learning_rate": 1.1094432831471768e-06, "loss": 0.1641, "step": 4476 }, { "epoch": 0.9397565071368598, "grad_norm": 0.1069439947605133, "learning_rate": 1.101780563744076e-06, "loss": 0.1479, "step": 4477 }, { "epoch": 0.9399664147774979, "grad_norm": 0.10945051163434982, "learning_rate": 1.0941441038017586e-06, "loss": 0.161, "step": 4478 }, { "epoch": 0.940176322418136, "grad_norm": 0.12555761635303497, "learning_rate": 1.0865339074211833e-06, "loss": 0.1463, "step": 4479 }, { "epoch": 0.9403862300587741, "grad_norm": 0.1251411885023117, "learning_rate": 1.0789499786892032e-06, "loss": 0.153, "step": 4480 }, { "epoch": 0.9405961376994123, "grad_norm": 0.10783831775188446, "learning_rate": 1.071392321678566e-06, "loss": 0.1574, "step": 4481 }, { "epoch": 0.9408060453400504, "grad_norm": 0.119156114757061, "learning_rate": 1.0638609404478916e-06, "loss": 0.1579, "step": 4482 }, { "epoch": 0.9410159529806885, "grad_norm": 0.12803539633750916, "learning_rate": 1.0563558390417172e-06, "loss": 0.1532, "step": 4483 }, { "epoch": 0.9412258606213266, "grad_norm": 0.11497396975755692, "learning_rate": 1.0488770214904465e-06, "loss": 0.1618, "step": 4484 }, { "epoch": 0.9414357682619647, "grad_norm": 0.15252766013145447, "learning_rate": 1.0414244918103777e-06, "loss": 0.1586, "step": 4485 }, { "epoch": 0.9416456759026028, "grad_norm": 0.11600207537412643, "learning_rate": 1.033998254003693e-06, "loss": 0.1373, "step": 4486 }, { "epoch": 0.941855583543241, "grad_norm": 0.11400733888149261, "learning_rate": 1.026598312058441e-06, "loss": 0.1475, "step": 4487 }, { "epoch": 0.9420654911838791, "grad_norm": 0.12781751155853271, "learning_rate": 1.0192246699485652e-06, "loss": 0.1657, "step": 4488 }, { "epoch": 0.9422753988245172, "grad_norm": 0.13421955704689026, "learning_rate": 1.0118773316338869e-06, "loss": 0.1624, "step": 4489 }, { "epoch": 0.9424853064651554, "grad_norm": 0.12550324201583862, "learning_rate": 1.004556301060089e-06, "loss": 0.1554, "step": 4490 }, { "epoch": 0.9426952141057935, "grad_norm": 0.13535472750663757, "learning_rate": 9.972615821587316e-07, "loss": 0.1354, "step": 4491 }, { "epoch": 0.9429051217464316, "grad_norm": 0.146240696310997, "learning_rate": 9.899931788472427e-07, "loss": 0.1489, "step": 4492 }, { "epoch": 0.9431150293870697, "grad_norm": 0.12862592935562134, "learning_rate": 9.827510950289275e-07, "loss": 0.1612, "step": 4493 }, { "epoch": 0.9433249370277078, "grad_norm": 0.11790810525417328, "learning_rate": 9.75535334592942e-07, "loss": 0.1415, "step": 4494 }, { "epoch": 0.9435348446683459, "grad_norm": 0.11090364307165146, "learning_rate": 9.683459014143248e-07, "loss": 0.1649, "step": 4495 }, { "epoch": 0.943744752308984, "grad_norm": 0.12426898628473282, "learning_rate": 9.611827993539547e-07, "loss": 0.1573, "step": 4496 }, { "epoch": 0.9439546599496221, "grad_norm": 0.15216581523418427, "learning_rate": 9.540460322585875e-07, "loss": 0.1518, "step": 4497 }, { "epoch": 0.9441645675902602, "grad_norm": 0.11171586066484451, "learning_rate": 9.469356039608357e-07, "loss": 0.1326, "step": 4498 }, { "epoch": 0.9443744752308985, "grad_norm": 0.12237754464149475, "learning_rate": 9.398515182791446e-07, "loss": 0.1544, "step": 4499 }, { "epoch": 0.9445843828715366, "grad_norm": 0.12762495875358582, "learning_rate": 9.327937790178487e-07, "loss": 0.1702, "step": 4500 }, { "epoch": 0.9447942905121747, "grad_norm": 0.12392231822013855, "learning_rate": 9.257623899671053e-07, "loss": 0.1591, "step": 4501 }, { "epoch": 0.9450041981528128, "grad_norm": 0.11078055948019028, "learning_rate": 9.187573549029327e-07, "loss": 0.1492, "step": 4502 }, { "epoch": 0.9452141057934509, "grad_norm": 0.14925430715084076, "learning_rate": 9.117786775871939e-07, "loss": 0.1325, "step": 4503 }, { "epoch": 0.945424013434089, "grad_norm": 0.12183661013841629, "learning_rate": 9.048263617675967e-07, "loss": 0.1605, "step": 4504 }, { "epoch": 0.9456339210747271, "grad_norm": 0.11003254354000092, "learning_rate": 8.97900411177699e-07, "loss": 0.1536, "step": 4505 }, { "epoch": 0.9458438287153652, "grad_norm": 0.11058861762285233, "learning_rate": 8.910008295368921e-07, "loss": 0.145, "step": 4506 }, { "epoch": 0.9460537363560033, "grad_norm": 0.12683282792568207, "learning_rate": 8.841276205504067e-07, "loss": 0.1614, "step": 4507 }, { "epoch": 0.9462636439966414, "grad_norm": 0.10946714878082275, "learning_rate": 8.772807879093126e-07, "loss": 0.1584, "step": 4508 }, { "epoch": 0.9464735516372796, "grad_norm": 0.11910434067249298, "learning_rate": 8.704603352905183e-07, "loss": 0.1635, "step": 4509 }, { "epoch": 0.9466834592779177, "grad_norm": 0.12066495418548584, "learning_rate": 8.636662663567608e-07, "loss": 0.1588, "step": 4510 }, { "epoch": 0.9468933669185559, "grad_norm": 0.12396088242530823, "learning_rate": 8.568985847566157e-07, "loss": 0.1564, "step": 4511 }, { "epoch": 0.947103274559194, "grad_norm": 0.12058629095554352, "learning_rate": 8.501572941244762e-07, "loss": 0.1484, "step": 4512 }, { "epoch": 0.9473131821998321, "grad_norm": 0.1340767741203308, "learning_rate": 8.434423980805684e-07, "loss": 0.1525, "step": 4513 }, { "epoch": 0.9475230898404702, "grad_norm": 0.14222848415374756, "learning_rate": 8.367539002309465e-07, "loss": 0.1531, "step": 4514 }, { "epoch": 0.9477329974811083, "grad_norm": 0.14000943303108215, "learning_rate": 8.300918041674932e-07, "loss": 0.1748, "step": 4515 }, { "epoch": 0.9479429051217464, "grad_norm": 0.1195470541715622, "learning_rate": 8.234561134678909e-07, "loss": 0.1471, "step": 4516 }, { "epoch": 0.9481528127623845, "grad_norm": 0.10464131087064743, "learning_rate": 8.168468316956723e-07, "loss": 0.1451, "step": 4517 }, { "epoch": 0.9483627204030227, "grad_norm": 0.11033995449542999, "learning_rate": 8.102639624001707e-07, "loss": 0.1417, "step": 4518 }, { "epoch": 0.9485726280436608, "grad_norm": 0.12074270844459534, "learning_rate": 8.037075091165358e-07, "loss": 0.1486, "step": 4519 }, { "epoch": 0.9487825356842989, "grad_norm": 0.1229119673371315, "learning_rate": 7.971774753657235e-07, "loss": 0.1597, "step": 4520 }, { "epoch": 0.948992443324937, "grad_norm": 0.12053476274013519, "learning_rate": 7.906738646545176e-07, "loss": 0.1463, "step": 4521 }, { "epoch": 0.9492023509655751, "grad_norm": 0.11490915715694427, "learning_rate": 7.841966804755129e-07, "loss": 0.1505, "step": 4522 }, { "epoch": 0.9494122586062133, "grad_norm": 0.1358073353767395, "learning_rate": 7.77745926307094e-07, "loss": 0.1653, "step": 4523 }, { "epoch": 0.9496221662468514, "grad_norm": 0.13225747644901276, "learning_rate": 7.713216056134731e-07, "loss": 0.1442, "step": 4524 }, { "epoch": 0.9498320738874895, "grad_norm": 0.1354922205209732, "learning_rate": 7.649237218446458e-07, "loss": 0.156, "step": 4525 }, { "epoch": 0.9500419815281276, "grad_norm": 0.11272266507148743, "learning_rate": 7.585522784364363e-07, "loss": 0.1669, "step": 4526 }, { "epoch": 0.9502518891687658, "grad_norm": 0.12645363807678223, "learning_rate": 7.522072788104406e-07, "loss": 0.1653, "step": 4527 }, { "epoch": 0.9504617968094039, "grad_norm": 0.12966369092464447, "learning_rate": 7.458887263740721e-07, "loss": 0.1421, "step": 4528 }, { "epoch": 0.950671704450042, "grad_norm": 0.1188059002161026, "learning_rate": 7.395966245205443e-07, "loss": 0.1632, "step": 4529 }, { "epoch": 0.9508816120906801, "grad_norm": 0.1147613450884819, "learning_rate": 7.333309766288599e-07, "loss": 0.1567, "step": 4530 }, { "epoch": 0.9510915197313182, "grad_norm": 0.12427303940057755, "learning_rate": 7.270917860638049e-07, "loss": 0.1619, "step": 4531 }, { "epoch": 0.9513014273719563, "grad_norm": 0.10449483245611191, "learning_rate": 7.20879056175977e-07, "loss": 0.1574, "step": 4532 }, { "epoch": 0.9515113350125944, "grad_norm": 0.10406437516212463, "learning_rate": 7.146927903017464e-07, "loss": 0.1531, "step": 4533 }, { "epoch": 0.9517212426532325, "grad_norm": 0.12593679130077362, "learning_rate": 7.085329917632888e-07, "loss": 0.1596, "step": 4534 }, { "epoch": 0.9519311502938707, "grad_norm": 0.15952709317207336, "learning_rate": 7.02399663868547e-07, "loss": 0.1507, "step": 4535 }, { "epoch": 0.9521410579345088, "grad_norm": 0.14857974648475647, "learning_rate": 6.962928099112643e-07, "loss": 0.1593, "step": 4536 }, { "epoch": 0.952350965575147, "grad_norm": 0.12613141536712646, "learning_rate": 6.902124331709503e-07, "loss": 0.1467, "step": 4537 }, { "epoch": 0.9525608732157851, "grad_norm": 0.12536998093128204, "learning_rate": 6.841585369129266e-07, "loss": 0.1616, "step": 4538 }, { "epoch": 0.9527707808564232, "grad_norm": 0.11849667876958847, "learning_rate": 6.781311243882593e-07, "loss": 0.1564, "step": 4539 }, { "epoch": 0.9529806884970613, "grad_norm": 0.13249583542346954, "learning_rate": 6.721301988338036e-07, "loss": 0.1581, "step": 4540 }, { "epoch": 0.9531905961376994, "grad_norm": 0.13716809451580048, "learning_rate": 6.661557634721982e-07, "loss": 0.1582, "step": 4541 }, { "epoch": 0.9534005037783375, "grad_norm": 0.11763902753591537, "learning_rate": 6.602078215118601e-07, "loss": 0.1544, "step": 4542 }, { "epoch": 0.9536104114189756, "grad_norm": 0.11533381044864655, "learning_rate": 6.542863761469565e-07, "loss": 0.1462, "step": 4543 }, { "epoch": 0.9538203190596137, "grad_norm": 0.10741002857685089, "learning_rate": 6.483914305574434e-07, "loss": 0.141, "step": 4544 }, { "epoch": 0.9540302267002518, "grad_norm": 0.11406449973583221, "learning_rate": 6.425229879090444e-07, "loss": 0.165, "step": 4545 }, { "epoch": 0.9542401343408901, "grad_norm": 0.13373495638370514, "learning_rate": 6.366810513532495e-07, "loss": 0.1569, "step": 4546 }, { "epoch": 0.9544500419815282, "grad_norm": 0.10696794092655182, "learning_rate": 6.308656240272992e-07, "loss": 0.1377, "step": 4547 }, { "epoch": 0.9546599496221663, "grad_norm": 0.12216789275407791, "learning_rate": 6.250767090542231e-07, "loss": 0.1471, "step": 4548 }, { "epoch": 0.9548698572628044, "grad_norm": 0.1208495944738388, "learning_rate": 6.193143095427956e-07, "loss": 0.1516, "step": 4549 }, { "epoch": 0.9550797649034425, "grad_norm": 0.11360879987478256, "learning_rate": 6.135784285875579e-07, "loss": 0.1607, "step": 4550 }, { "epoch": 0.9552896725440806, "grad_norm": 0.11046485602855682, "learning_rate": 6.078690692688127e-07, "loss": 0.1551, "step": 4551 }, { "epoch": 0.9554995801847187, "grad_norm": 0.12528584897518158, "learning_rate": 6.02186234652613e-07, "loss": 0.1537, "step": 4552 }, { "epoch": 0.9557094878253568, "grad_norm": 0.12984655797481537, "learning_rate": 5.965299277907677e-07, "loss": 0.1542, "step": 4553 }, { "epoch": 0.9559193954659949, "grad_norm": 0.1632155030965805, "learning_rate": 5.909001517208468e-07, "loss": 0.1564, "step": 4554 }, { "epoch": 0.9561293031066331, "grad_norm": 0.11997831612825394, "learning_rate": 5.852969094661709e-07, "loss": 0.1497, "step": 4555 }, { "epoch": 0.9563392107472712, "grad_norm": 0.11431669443845749, "learning_rate": 5.797202040358052e-07, "loss": 0.1433, "step": 4556 }, { "epoch": 0.9565491183879093, "grad_norm": 0.12266053259372711, "learning_rate": 5.741700384245596e-07, "loss": 0.1592, "step": 4557 }, { "epoch": 0.9567590260285475, "grad_norm": 0.09542907774448395, "learning_rate": 5.686464156130167e-07, "loss": 0.1376, "step": 4558 }, { "epoch": 0.9569689336691856, "grad_norm": 0.10986118763685226, "learning_rate": 5.631493385674814e-07, "loss": 0.173, "step": 4559 }, { "epoch": 0.9571788413098237, "grad_norm": 0.12995390594005585, "learning_rate": 5.576788102400144e-07, "loss": 0.1373, "step": 4560 }, { "epoch": 0.9573887489504618, "grad_norm": 0.11493270099163055, "learning_rate": 5.522348335683935e-07, "loss": 0.1528, "step": 4561 }, { "epoch": 0.9575986565910999, "grad_norm": 0.1076839342713356, "learning_rate": 5.468174114761859e-07, "loss": 0.1406, "step": 4562 }, { "epoch": 0.957808564231738, "grad_norm": 0.15728503465652466, "learning_rate": 5.414265468726531e-07, "loss": 0.1595, "step": 4563 }, { "epoch": 0.9580184718723762, "grad_norm": 0.12800243496894836, "learning_rate": 5.360622426528183e-07, "loss": 0.1685, "step": 4564 }, { "epoch": 0.9582283795130143, "grad_norm": 0.11527343839406967, "learning_rate": 5.307245016974382e-07, "loss": 0.1527, "step": 4565 }, { "epoch": 0.9584382871536524, "grad_norm": 0.12676075100898743, "learning_rate": 5.254133268729921e-07, "loss": 0.161, "step": 4566 }, { "epoch": 0.9586481947942905, "grad_norm": 0.13688981533050537, "learning_rate": 5.201287210317151e-07, "loss": 0.1551, "step": 4567 }, { "epoch": 0.9588581024349286, "grad_norm": 0.11639519780874252, "learning_rate": 5.148706870115539e-07, "loss": 0.1414, "step": 4568 }, { "epoch": 0.9590680100755667, "grad_norm": 0.11846988648176193, "learning_rate": 5.096392276361883e-07, "loss": 0.1552, "step": 4569 }, { "epoch": 0.9592779177162049, "grad_norm": 0.1394091099500656, "learning_rate": 5.044343457150436e-07, "loss": 0.1479, "step": 4570 }, { "epoch": 0.959487825356843, "grad_norm": 0.11145713180303574, "learning_rate": 4.992560440432503e-07, "loss": 0.1563, "step": 4571 }, { "epoch": 0.9596977329974811, "grad_norm": 0.11287238448858261, "learning_rate": 4.941043254016786e-07, "loss": 0.16, "step": 4572 }, { "epoch": 0.9599076406381192, "grad_norm": 0.11632929742336273, "learning_rate": 4.889791925569209e-07, "loss": 0.1571, "step": 4573 }, { "epoch": 0.9601175482787574, "grad_norm": 0.12415865063667297, "learning_rate": 4.838806482612867e-07, "loss": 0.1635, "step": 4574 }, { "epoch": 0.9603274559193955, "grad_norm": 0.1393875777721405, "learning_rate": 4.788086952528137e-07, "loss": 0.166, "step": 4575 }, { "epoch": 0.9605373635600336, "grad_norm": 0.11887677013874054, "learning_rate": 4.737633362552507e-07, "loss": 0.1635, "step": 4576 }, { "epoch": 0.9607472712006717, "grad_norm": 0.13373823463916779, "learning_rate": 4.6874457397808045e-07, "loss": 0.1603, "step": 4577 }, { "epoch": 0.9609571788413098, "grad_norm": 0.13354790210723877, "learning_rate": 4.637524111164804e-07, "loss": 0.1722, "step": 4578 }, { "epoch": 0.9611670864819479, "grad_norm": 0.11867116391658783, "learning_rate": 4.5878685035136706e-07, "loss": 0.1515, "step": 4579 }, { "epoch": 0.961376994122586, "grad_norm": 0.12191498279571533, "learning_rate": 4.5384789434935735e-07, "loss": 0.1505, "step": 4580 }, { "epoch": 0.9615869017632241, "grad_norm": 0.10336653143167496, "learning_rate": 4.489355457627853e-07, "loss": 0.1638, "step": 4581 }, { "epoch": 0.9617968094038623, "grad_norm": 0.12683366239070892, "learning_rate": 4.440498072296906e-07, "loss": 0.1631, "step": 4582 }, { "epoch": 0.9620067170445005, "grad_norm": 0.12422499805688858, "learning_rate": 4.391906813738245e-07, "loss": 0.1663, "step": 4583 }, { "epoch": 0.9622166246851386, "grad_norm": 0.12688611447811127, "learning_rate": 4.343581708046496e-07, "loss": 0.148, "step": 4584 }, { "epoch": 0.9624265323257767, "grad_norm": 0.12651990354061127, "learning_rate": 4.2955227811734e-07, "loss": 0.1581, "step": 4585 }, { "epoch": 0.9626364399664148, "grad_norm": 0.14679253101348877, "learning_rate": 4.24773005892759e-07, "loss": 0.1606, "step": 4586 }, { "epoch": 0.9628463476070529, "grad_norm": 0.13088642060756683, "learning_rate": 4.2002035669749806e-07, "loss": 0.1519, "step": 4587 }, { "epoch": 0.963056255247691, "grad_norm": 0.12559828162193298, "learning_rate": 4.152943330838377e-07, "loss": 0.1526, "step": 4588 }, { "epoch": 0.9632661628883291, "grad_norm": 0.10229134559631348, "learning_rate": 4.105949375897478e-07, "loss": 0.1408, "step": 4589 }, { "epoch": 0.9634760705289672, "grad_norm": 0.12710246443748474, "learning_rate": 4.059221727389151e-07, "loss": 0.1634, "step": 4590 }, { "epoch": 0.9636859781696053, "grad_norm": 0.1332884132862091, "learning_rate": 4.012760410407324e-07, "loss": 0.1715, "step": 4591 }, { "epoch": 0.9638958858102435, "grad_norm": 0.11965722590684891, "learning_rate": 3.9665654499026486e-07, "loss": 0.1553, "step": 4592 }, { "epoch": 0.9641057934508817, "grad_norm": 0.12464660406112671, "learning_rate": 3.920636870682948e-07, "loss": 0.1404, "step": 4593 }, { "epoch": 0.9643157010915198, "grad_norm": 0.11357168853282928, "learning_rate": 3.874974697412881e-07, "loss": 0.132, "step": 4594 }, { "epoch": 0.9645256087321579, "grad_norm": 0.12431029975414276, "learning_rate": 3.8295789546141656e-07, "loss": 0.1411, "step": 4595 }, { "epoch": 0.964735516372796, "grad_norm": 0.12060980498790741, "learning_rate": 3.7844496666651906e-07, "loss": 0.1745, "step": 4596 }, { "epoch": 0.9649454240134341, "grad_norm": 0.12177833914756775, "learning_rate": 3.7395868578015693e-07, "loss": 0.1497, "step": 4597 }, { "epoch": 0.9651553316540722, "grad_norm": 0.12002568691968918, "learning_rate": 3.694990552115585e-07, "loss": 0.1559, "step": 4598 }, { "epoch": 0.9653652392947103, "grad_norm": 0.11855290830135345, "learning_rate": 3.650660773556469e-07, "loss": 0.1494, "step": 4599 }, { "epoch": 0.9655751469353484, "grad_norm": 0.12736105918884277, "learning_rate": 3.6065975459303456e-07, "loss": 0.1301, "step": 4600 }, { "epoch": 0.9657850545759865, "grad_norm": 0.12847037613391876, "learning_rate": 3.5628008929001753e-07, "loss": 0.1367, "step": 4601 }, { "epoch": 0.9659949622166247, "grad_norm": 0.11419855803251266, "learning_rate": 3.5192708379856997e-07, "loss": 0.1432, "step": 4602 }, { "epoch": 0.9662048698572628, "grad_norm": 0.11686286330223083, "learning_rate": 3.4760074045636637e-07, "loss": 0.1561, "step": 4603 }, { "epoch": 0.966414777497901, "grad_norm": 0.12151585519313812, "learning_rate": 3.433010615867427e-07, "loss": 0.1552, "step": 4604 }, { "epoch": 0.966624685138539, "grad_norm": 0.14519096910953522, "learning_rate": 3.3902804949872967e-07, "loss": 0.1587, "step": 4605 }, { "epoch": 0.9668345927791772, "grad_norm": 0.12270315736532211, "learning_rate": 3.347817064870307e-07, "loss": 0.1582, "step": 4606 }, { "epoch": 0.9670445004198153, "grad_norm": 0.12619507312774658, "learning_rate": 3.3056203483202705e-07, "loss": 0.1546, "step": 4607 }, { "epoch": 0.9672544080604534, "grad_norm": 0.15550176799297333, "learning_rate": 3.2636903679978405e-07, "loss": 0.1434, "step": 4608 }, { "epoch": 0.9674643157010915, "grad_norm": 0.14259426295757294, "learning_rate": 3.222027146420337e-07, "loss": 0.1554, "step": 4609 }, { "epoch": 0.9676742233417296, "grad_norm": 0.12078682333230972, "learning_rate": 3.1806307059618625e-07, "loss": 0.1171, "step": 4610 }, { "epoch": 0.9678841309823678, "grad_norm": 0.11571485549211502, "learning_rate": 3.1395010688532454e-07, "loss": 0.153, "step": 4611 }, { "epoch": 0.9680940386230059, "grad_norm": 0.15199881792068481, "learning_rate": 3.0986382571820406e-07, "loss": 0.1633, "step": 4612 }, { "epoch": 0.968303946263644, "grad_norm": 0.12040861696004868, "learning_rate": 3.0580422928925824e-07, "loss": 0.166, "step": 4613 }, { "epoch": 0.9685138539042821, "grad_norm": 0.14831073582172394, "learning_rate": 3.01771319778571e-07, "loss": 0.1626, "step": 4614 }, { "epoch": 0.9687237615449202, "grad_norm": 0.12354908138513565, "learning_rate": 2.977650993519099e-07, "loss": 0.1603, "step": 4615 }, { "epoch": 0.9689336691855583, "grad_norm": 0.1218198910355568, "learning_rate": 2.937855701607206e-07, "loss": 0.1749, "step": 4616 }, { "epoch": 0.9691435768261965, "grad_norm": 0.1256335824728012, "learning_rate": 2.898327343420826e-07, "loss": 0.1417, "step": 4617 }, { "epoch": 0.9693534844668346, "grad_norm": 0.3487498164176941, "learning_rate": 2.8590659401876996e-07, "loss": 0.1604, "step": 4618 }, { "epoch": 0.9695633921074727, "grad_norm": 0.100483737885952, "learning_rate": 2.8200715129920175e-07, "loss": 0.1562, "step": 4619 }, { "epoch": 0.9697732997481109, "grad_norm": 0.13053487241268158, "learning_rate": 2.7813440827747504e-07, "loss": 0.1552, "step": 4620 }, { "epoch": 0.969983207388749, "grad_norm": 0.1330847293138504, "learning_rate": 2.7428836703333737e-07, "loss": 0.1602, "step": 4621 }, { "epoch": 0.9701931150293871, "grad_norm": 0.13313066959381104, "learning_rate": 2.704690296322032e-07, "loss": 0.1574, "step": 4622 }, { "epoch": 0.9704030226700252, "grad_norm": 0.11992690712213516, "learning_rate": 2.66676398125143e-07, "loss": 0.1513, "step": 4623 }, { "epoch": 0.9706129303106633, "grad_norm": 0.1217782199382782, "learning_rate": 2.629104745488886e-07, "loss": 0.1639, "step": 4624 }, { "epoch": 0.9708228379513014, "grad_norm": 0.13410820066928864, "learning_rate": 2.5917126092582234e-07, "loss": 0.1624, "step": 4625 }, { "epoch": 0.9710327455919395, "grad_norm": 0.1345527172088623, "learning_rate": 2.5545875926398786e-07, "loss": 0.14, "step": 4626 }, { "epoch": 0.9712426532325776, "grad_norm": 0.11989013850688934, "learning_rate": 2.5177297155709046e-07, "loss": 0.1574, "step": 4627 }, { "epoch": 0.9714525608732157, "grad_norm": 0.12046922743320465, "learning_rate": 2.481138997844745e-07, "loss": 0.1605, "step": 4628 }, { "epoch": 0.9716624685138538, "grad_norm": 0.12484660744667053, "learning_rate": 2.44481545911146e-07, "loss": 0.1545, "step": 4629 }, { "epoch": 0.9718723761544921, "grad_norm": 0.10974811762571335, "learning_rate": 2.4087591188776683e-07, "loss": 0.1552, "step": 4630 }, { "epoch": 0.9720822837951302, "grad_norm": 0.13422971963882446, "learning_rate": 2.3729699965063802e-07, "loss": 0.1461, "step": 4631 }, { "epoch": 0.9722921914357683, "grad_norm": 0.1330391764640808, "learning_rate": 2.3374481112172775e-07, "loss": 0.1688, "step": 4632 }, { "epoch": 0.9725020990764064, "grad_norm": 0.13906224071979523, "learning_rate": 2.3021934820862678e-07, "loss": 0.1665, "step": 4633 }, { "epoch": 0.9727120067170445, "grad_norm": 0.1329282969236374, "learning_rate": 2.2672061280460399e-07, "loss": 0.1581, "step": 4634 }, { "epoch": 0.9729219143576826, "grad_norm": 0.12213360518217087, "learning_rate": 2.2324860678855087e-07, "loss": 0.1712, "step": 4635 }, { "epoch": 0.9731318219983207, "grad_norm": 0.1564663201570511, "learning_rate": 2.1980333202502036e-07, "loss": 0.1634, "step": 4636 }, { "epoch": 0.9733417296389588, "grad_norm": 0.13800211250782013, "learning_rate": 2.1638479036419912e-07, "loss": 0.1504, "step": 4637 }, { "epoch": 0.9735516372795969, "grad_norm": 0.11233645677566528, "learning_rate": 2.129929836419242e-07, "loss": 0.1623, "step": 4638 }, { "epoch": 0.9737615449202351, "grad_norm": 0.13837553560733795, "learning_rate": 2.0962791367967195e-07, "loss": 0.1611, "step": 4639 }, { "epoch": 0.9739714525608733, "grad_norm": 0.11701783537864685, "learning_rate": 2.0628958228456341e-07, "loss": 0.1553, "step": 4640 }, { "epoch": 0.9741813602015114, "grad_norm": 0.13130126893520355, "learning_rate": 2.02977991249359e-07, "loss": 0.1379, "step": 4641 }, { "epoch": 0.9743912678421495, "grad_norm": 0.12259750068187714, "learning_rate": 1.996931423524584e-07, "loss": 0.1514, "step": 4642 }, { "epoch": 0.9746011754827876, "grad_norm": 0.14832593500614166, "learning_rate": 1.9643503735789493e-07, "loss": 0.1533, "step": 4643 }, { "epoch": 0.9748110831234257, "grad_norm": 0.12299138307571411, "learning_rate": 1.9320367801535232e-07, "loss": 0.1528, "step": 4644 }, { "epoch": 0.9750209907640638, "grad_norm": 0.11201336234807968, "learning_rate": 1.8999906606014806e-07, "loss": 0.1555, "step": 4645 }, { "epoch": 0.9752308984047019, "grad_norm": 0.11352569609880447, "learning_rate": 1.8682120321322217e-07, "loss": 0.1502, "step": 4646 }, { "epoch": 0.97544080604534, "grad_norm": 0.12598967552185059, "learning_rate": 1.8367009118115953e-07, "loss": 0.138, "step": 4647 }, { "epoch": 0.9756507136859782, "grad_norm": 0.10122178494930267, "learning_rate": 1.8054573165618426e-07, "loss": 0.1397, "step": 4648 }, { "epoch": 0.9758606213266163, "grad_norm": 0.15723147988319397, "learning_rate": 1.7744812631615425e-07, "loss": 0.1501, "step": 4649 }, { "epoch": 0.9760705289672544, "grad_norm": 0.13425561785697937, "learning_rate": 1.743772768245444e-07, "loss": 0.1795, "step": 4650 }, { "epoch": 0.9762804366078925, "grad_norm": 0.11695228517055511, "learning_rate": 1.7133318483047445e-07, "loss": 0.1559, "step": 4651 }, { "epoch": 0.9764903442485307, "grad_norm": 0.1349976360797882, "learning_rate": 1.6831585196869226e-07, "loss": 0.1651, "step": 4652 }, { "epoch": 0.9767002518891688, "grad_norm": 0.12021506577730179, "learning_rate": 1.6532527985957392e-07, "loss": 0.1567, "step": 4653 }, { "epoch": 0.9769101595298069, "grad_norm": 0.11645930260419846, "learning_rate": 1.6236147010912362e-07, "loss": 0.1484, "step": 4654 }, { "epoch": 0.977120067170445, "grad_norm": 0.12351713329553604, "learning_rate": 1.5942442430897376e-07, "loss": 0.1496, "step": 4655 }, { "epoch": 0.9773299748110831, "grad_norm": 0.11955785006284714, "learning_rate": 1.5651414403639043e-07, "loss": 0.155, "step": 4656 }, { "epoch": 0.9775398824517213, "grad_norm": 0.12057064473628998, "learning_rate": 1.5363063085425678e-07, "loss": 0.1698, "step": 4657 }, { "epoch": 0.9777497900923594, "grad_norm": 0.1271531581878662, "learning_rate": 1.507738863110897e-07, "loss": 0.1419, "step": 4658 }, { "epoch": 0.9779596977329975, "grad_norm": 0.1251571774482727, "learning_rate": 1.4794391194101754e-07, "loss": 0.1495, "step": 4659 }, { "epoch": 0.9781696053736356, "grad_norm": 0.14511661231517792, "learning_rate": 1.4514070926380795e-07, "loss": 0.1397, "step": 4660 }, { "epoch": 0.9783795130142737, "grad_norm": 0.10586293786764145, "learning_rate": 1.4236427978484013e-07, "loss": 0.1662, "step": 4661 }, { "epoch": 0.9785894206549118, "grad_norm": 0.12456677854061127, "learning_rate": 1.396146249951269e-07, "loss": 0.1655, "step": 4662 }, { "epoch": 0.9787993282955499, "grad_norm": 0.1349252313375473, "learning_rate": 1.3689174637129264e-07, "loss": 0.1753, "step": 4663 }, { "epoch": 0.979009235936188, "grad_norm": 0.11700873076915741, "learning_rate": 1.341956453755844e-07, "loss": 0.1436, "step": 4664 }, { "epoch": 0.9792191435768262, "grad_norm": 0.1258639246225357, "learning_rate": 1.3152632345586613e-07, "loss": 0.1508, "step": 4665 }, { "epoch": 0.9794290512174643, "grad_norm": 0.12187707424163818, "learning_rate": 1.2888378204563012e-07, "loss": 0.1591, "step": 4666 }, { "epoch": 0.9796389588581025, "grad_norm": 0.1171417310833931, "learning_rate": 1.2626802256398561e-07, "loss": 0.1586, "step": 4667 }, { "epoch": 0.9798488664987406, "grad_norm": 0.12227719277143478, "learning_rate": 1.236790464156423e-07, "loss": 0.1579, "step": 4668 }, { "epoch": 0.9800587741393787, "grad_norm": 0.12303393334150314, "learning_rate": 1.2111685499094915e-07, "loss": 0.1452, "step": 4669 }, { "epoch": 0.9802686817800168, "grad_norm": 0.12497860938310623, "learning_rate": 1.1858144966586104e-07, "loss": 0.174, "step": 4670 }, { "epoch": 0.9804785894206549, "grad_norm": 0.11376795172691345, "learning_rate": 1.160728318019444e-07, "loss": 0.1686, "step": 4671 }, { "epoch": 0.980688497061293, "grad_norm": 0.12611518800258636, "learning_rate": 1.1359100274638824e-07, "loss": 0.1358, "step": 4672 }, { "epoch": 0.9808984047019311, "grad_norm": 0.13577139377593994, "learning_rate": 1.1113596383199309e-07, "loss": 0.157, "step": 4673 }, { "epoch": 0.9811083123425692, "grad_norm": 0.13323377072811127, "learning_rate": 1.0870771637715993e-07, "loss": 0.1636, "step": 4674 }, { "epoch": 0.9813182199832073, "grad_norm": 0.11575714498758316, "learning_rate": 1.0630626168592894e-07, "loss": 0.1697, "step": 4675 }, { "epoch": 0.9815281276238456, "grad_norm": 0.11939375847578049, "learning_rate": 1.039316010479241e-07, "loss": 0.151, "step": 4676 }, { "epoch": 0.9817380352644837, "grad_norm": 0.13515305519104004, "learning_rate": 1.0158373573839197e-07, "loss": 0.158, "step": 4677 }, { "epoch": 0.9819479429051218, "grad_norm": 0.12420187145471573, "learning_rate": 9.926266701820175e-08, "loss": 0.1578, "step": 4678 }, { "epoch": 0.9821578505457599, "grad_norm": 0.14188450574874878, "learning_rate": 9.696839613381193e-08, "loss": 0.1716, "step": 4679 }, { "epoch": 0.982367758186398, "grad_norm": 0.11995568871498108, "learning_rate": 9.470092431729805e-08, "loss": 0.147, "step": 4680 }, { "epoch": 0.9825776658270361, "grad_norm": 0.1103273257613182, "learning_rate": 9.246025278634163e-08, "loss": 0.1414, "step": 4681 }, { "epoch": 0.9827875734676742, "grad_norm": 0.15861214697360992, "learning_rate": 9.024638274424125e-08, "loss": 0.1489, "step": 4682 }, { "epoch": 0.9829974811083123, "grad_norm": 0.18855759501457214, "learning_rate": 8.805931537989586e-08, "loss": 0.1508, "step": 4683 }, { "epoch": 0.9832073887489504, "grad_norm": 0.11294414848089218, "learning_rate": 8.589905186780489e-08, "loss": 0.1566, "step": 4684 }, { "epoch": 0.9834172963895886, "grad_norm": 0.11468642950057983, "learning_rate": 8.376559336807922e-08, "loss": 0.1548, "step": 4685 }, { "epoch": 0.9836272040302267, "grad_norm": 0.14178474247455597, "learning_rate": 8.165894102644123e-08, "loss": 0.1494, "step": 4686 }, { "epoch": 0.9838371116708649, "grad_norm": 0.12208564579486847, "learning_rate": 7.957909597420821e-08, "loss": 0.1638, "step": 4687 }, { "epoch": 0.984047019311503, "grad_norm": 0.132912278175354, "learning_rate": 7.752605932830337e-08, "loss": 0.1541, "step": 4688 }, { "epoch": 0.9842569269521411, "grad_norm": 0.11923284828662872, "learning_rate": 7.549983219125035e-08, "loss": 0.1585, "step": 4689 }, { "epoch": 0.9844668345927792, "grad_norm": 0.12066918611526489, "learning_rate": 7.350041565118981e-08, "loss": 0.1498, "step": 4690 }, { "epoch": 0.9846767422334173, "grad_norm": 0.16000604629516602, "learning_rate": 7.152781078184622e-08, "loss": 0.1571, "step": 4691 }, { "epoch": 0.9848866498740554, "grad_norm": 0.11645068973302841, "learning_rate": 6.958201864254999e-08, "loss": 0.1472, "step": 4692 }, { "epoch": 0.9850965575146935, "grad_norm": 0.11970088630914688, "learning_rate": 6.766304027824855e-08, "loss": 0.1431, "step": 4693 }, { "epoch": 0.9853064651553316, "grad_norm": 0.11811143159866333, "learning_rate": 6.577087671946758e-08, "loss": 0.1406, "step": 4694 }, { "epoch": 0.9855163727959698, "grad_norm": 0.13158312439918518, "learning_rate": 6.390552898234425e-08, "loss": 0.1644, "step": 4695 }, { "epoch": 0.9857262804366079, "grad_norm": 0.11352595686912537, "learning_rate": 6.206699806861061e-08, "loss": 0.1587, "step": 4696 }, { "epoch": 0.985936188077246, "grad_norm": 0.11427728831768036, "learning_rate": 6.025528496560462e-08, "loss": 0.1509, "step": 4697 }, { "epoch": 0.9861460957178841, "grad_norm": 0.1264103204011917, "learning_rate": 5.847039064625359e-08, "loss": 0.1508, "step": 4698 }, { "epoch": 0.9863560033585222, "grad_norm": 0.11748611181974411, "learning_rate": 5.671231606909078e-08, "loss": 0.1546, "step": 4699 }, { "epoch": 0.9865659109991604, "grad_norm": 0.1317833513021469, "learning_rate": 5.498106217823873e-08, "loss": 0.1345, "step": 4700 }, { "epoch": 0.9867758186397985, "grad_norm": 0.12351711839437485, "learning_rate": 5.327662990342042e-08, "loss": 0.1615, "step": 4701 }, { "epoch": 0.9869857262804366, "grad_norm": 0.11820079386234283, "learning_rate": 5.159902015995366e-08, "loss": 0.144, "step": 4702 }, { "epoch": 0.9871956339210747, "grad_norm": 0.13129431009292603, "learning_rate": 4.994823384876224e-08, "loss": 0.1539, "step": 4703 }, { "epoch": 0.9874055415617129, "grad_norm": 0.12318190187215805, "learning_rate": 4.832427185634259e-08, "loss": 0.1536, "step": 4704 }, { "epoch": 0.987615449202351, "grad_norm": 0.12245703488588333, "learning_rate": 4.672713505480819e-08, "loss": 0.1579, "step": 4705 }, { "epoch": 0.9878253568429891, "grad_norm": 0.12133902311325073, "learning_rate": 4.5156824301856306e-08, "loss": 0.1676, "step": 4706 }, { "epoch": 0.9880352644836272, "grad_norm": 0.10715369135141373, "learning_rate": 4.361334044077903e-08, "loss": 0.1446, "step": 4707 }, { "epoch": 0.9882451721242653, "grad_norm": 0.12753887474536896, "learning_rate": 4.209668430046332e-08, "loss": 0.1637, "step": 4708 }, { "epoch": 0.9884550797649034, "grad_norm": 0.15687881410121918, "learning_rate": 4.0606856695385444e-08, "loss": 0.1553, "step": 4709 }, { "epoch": 0.9886649874055415, "grad_norm": 0.12655100226402283, "learning_rate": 3.914385842562207e-08, "loss": 0.1387, "step": 4710 }, { "epoch": 0.9888748950461796, "grad_norm": 0.11232425272464752, "learning_rate": 3.770769027683363e-08, "loss": 0.1517, "step": 4711 }, { "epoch": 0.9890848026868178, "grad_norm": 0.11365073174238205, "learning_rate": 3.629835302026985e-08, "loss": 0.1573, "step": 4712 }, { "epoch": 0.989294710327456, "grad_norm": 0.1190265566110611, "learning_rate": 3.491584741278642e-08, "loss": 0.1549, "step": 4713 }, { "epoch": 0.9895046179680941, "grad_norm": 0.12508736550807953, "learning_rate": 3.356017419681723e-08, "loss": 0.1497, "step": 4714 }, { "epoch": 0.9897145256087322, "grad_norm": 0.11955531686544418, "learning_rate": 3.223133410038548e-08, "loss": 0.1601, "step": 4715 }, { "epoch": 0.9899244332493703, "grad_norm": 0.0987209752202034, "learning_rate": 3.0929327837114774e-08, "loss": 0.1482, "step": 4716 }, { "epoch": 0.9901343408900084, "grad_norm": 0.13037124276161194, "learning_rate": 2.9654156106206922e-08, "loss": 0.1531, "step": 4717 }, { "epoch": 0.9903442485306465, "grad_norm": 0.1238628700375557, "learning_rate": 2.8405819592464135e-08, "loss": 0.1519, "step": 4718 }, { "epoch": 0.9905541561712846, "grad_norm": 0.14739884436130524, "learning_rate": 2.718431896626683e-08, "loss": 0.1736, "step": 4719 }, { "epoch": 0.9907640638119227, "grad_norm": 0.11886577308177948, "learning_rate": 2.5989654883590286e-08, "loss": 0.1669, "step": 4720 }, { "epoch": 0.9909739714525608, "grad_norm": 0.09993169456720352, "learning_rate": 2.482182798599353e-08, "loss": 0.1513, "step": 4721 }, { "epoch": 0.9911838790931989, "grad_norm": 0.13532714545726776, "learning_rate": 2.3680838900635995e-08, "loss": 0.1597, "step": 4722 }, { "epoch": 0.9913937867338372, "grad_norm": 0.13656656444072723, "learning_rate": 2.2566688240244216e-08, "loss": 0.1385, "step": 4723 }, { "epoch": 0.9916036943744753, "grad_norm": 0.11988739669322968, "learning_rate": 2.147937660314514e-08, "loss": 0.1531, "step": 4724 }, { "epoch": 0.9918136020151134, "grad_norm": 0.15171854197978973, "learning_rate": 2.0418904573249464e-08, "loss": 0.154, "step": 4725 }, { "epoch": 0.9920235096557515, "grad_norm": 0.10903916507959366, "learning_rate": 1.938527272005719e-08, "loss": 0.1532, "step": 4726 }, { "epoch": 0.9922334172963896, "grad_norm": 0.13691365718841553, "learning_rate": 1.8378481598657627e-08, "loss": 0.1564, "step": 4727 }, { "epoch": 0.9924433249370277, "grad_norm": 0.10917693376541138, "learning_rate": 1.739853174970718e-08, "loss": 0.157, "step": 4728 }, { "epoch": 0.9926532325776658, "grad_norm": 0.11434437334537506, "learning_rate": 1.644542369947377e-08, "loss": 0.1492, "step": 4729 }, { "epoch": 0.9928631402183039, "grad_norm": 0.13532006740570068, "learning_rate": 1.5519157959792417e-08, "loss": 0.1622, "step": 4730 }, { "epoch": 0.993073047858942, "grad_norm": 0.11654216051101685, "learning_rate": 1.4619735028087445e-08, "loss": 0.173, "step": 4731 }, { "epoch": 0.9932829554995802, "grad_norm": 0.13726460933685303, "learning_rate": 1.374715538737248e-08, "loss": 0.148, "step": 4732 }, { "epoch": 0.9934928631402183, "grad_norm": 0.12805287539958954, "learning_rate": 1.2901419506239354e-08, "loss": 0.1477, "step": 4733 }, { "epoch": 0.9937027707808564, "grad_norm": 0.12175727635622025, "learning_rate": 1.2082527838874757e-08, "loss": 0.1736, "step": 4734 }, { "epoch": 0.9939126784214946, "grad_norm": 0.14802756905555725, "learning_rate": 1.129048082503803e-08, "loss": 0.1662, "step": 4735 }, { "epoch": 0.9941225860621327, "grad_norm": 0.09921912103891373, "learning_rate": 1.0525278890072265e-08, "loss": 0.1419, "step": 4736 }, { "epoch": 0.9943324937027708, "grad_norm": 0.1295527070760727, "learning_rate": 9.786922444915415e-09, "loss": 0.1565, "step": 4737 }, { "epoch": 0.9945424013434089, "grad_norm": 0.13671144843101501, "learning_rate": 9.075411886078078e-09, "loss": 0.1452, "step": 4738 }, { "epoch": 0.994752308984047, "grad_norm": 0.1243344098329544, "learning_rate": 8.390747595654613e-09, "loss": 0.1305, "step": 4739 }, { "epoch": 0.9949622166246851, "grad_norm": 0.12893415987491608, "learning_rate": 7.732929941334233e-09, "loss": 0.1589, "step": 4740 }, { "epoch": 0.9951721242653233, "grad_norm": 0.1381278932094574, "learning_rate": 7.101959276373249e-09, "loss": 0.159, "step": 4741 }, { "epoch": 0.9953820319059614, "grad_norm": 0.11679843813180923, "learning_rate": 6.4978359396172806e-09, "loss": 0.168, "step": 4742 }, { "epoch": 0.9955919395465995, "grad_norm": 0.13081474602222443, "learning_rate": 5.9205602554957e-09, "loss": 0.1489, "step": 4743 }, { "epoch": 0.9958018471872376, "grad_norm": 0.13281650841236115, "learning_rate": 5.370132534021632e-09, "loss": 0.1624, "step": 4744 }, { "epoch": 0.9960117548278757, "grad_norm": 0.10714995861053467, "learning_rate": 4.8465530707808575e-09, "loss": 0.1534, "step": 4745 }, { "epoch": 0.9962216624685138, "grad_norm": 0.11529166996479034, "learning_rate": 4.349822146954008e-09, "loss": 0.1494, "step": 4746 }, { "epoch": 0.996431570109152, "grad_norm": 0.11040196567773819, "learning_rate": 3.879940029294371e-09, "loss": 0.1583, "step": 4747 }, { "epoch": 0.9966414777497901, "grad_norm": 0.10761739313602448, "learning_rate": 3.436906970138987e-09, "loss": 0.163, "step": 4748 }, { "epoch": 0.9968513853904282, "grad_norm": 0.13290688395500183, "learning_rate": 3.020723207408649e-09, "loss": 0.1525, "step": 4749 }, { "epoch": 0.9970612930310663, "grad_norm": 0.10621462017297745, "learning_rate": 2.631388964596804e-09, "loss": 0.1414, "step": 4750 }, { "epoch": 0.9972712006717045, "grad_norm": 0.12779180705547333, "learning_rate": 2.2689044507917534e-09, "loss": 0.164, "step": 4751 }, { "epoch": 0.9974811083123426, "grad_norm": 0.12876422703266144, "learning_rate": 1.9332698606544517e-09, "loss": 0.1419, "step": 4752 }, { "epoch": 0.9976910159529807, "grad_norm": 0.12758982181549072, "learning_rate": 1.6244853744296073e-09, "loss": 0.1386, "step": 4753 }, { "epoch": 0.9979009235936188, "grad_norm": 0.11979212611913681, "learning_rate": 1.3425511579401307e-09, "loss": 0.1541, "step": 4754 }, { "epoch": 0.9981108312342569, "grad_norm": 0.12721380591392517, "learning_rate": 1.0874673625926868e-09, "loss": 0.1404, "step": 4755 }, { "epoch": 0.998320738874895, "grad_norm": 0.12794804573059082, "learning_rate": 8.592341253665925e-10, "loss": 0.159, "step": 4756 }, { "epoch": 0.9985306465155331, "grad_norm": 0.11918237060308456, "learning_rate": 6.578515688360209e-10, "loss": 0.1439, "step": 4757 }, { "epoch": 0.9987405541561712, "grad_norm": 0.1348177194595337, "learning_rate": 4.833198011422457e-10, "loss": 0.171, "step": 4758 }, { "epoch": 0.9989504617968094, "grad_norm": 0.1429336965084076, "learning_rate": 3.3563891602139685e-10, "loss": 0.1625, "step": 4759 }, { "epoch": 0.9991603694374476, "grad_norm": 0.10812459141016006, "learning_rate": 2.1480899277115385e-10, "loss": 0.1494, "step": 4760 }, { "epoch": 0.9993702770780857, "grad_norm": 0.12720505893230438, "learning_rate": 1.2083009628405252e-10, "loss": 0.1366, "step": 4761 }, { "epoch": 0.9995801847187238, "grad_norm": 0.10869085788726807, "learning_rate": 5.3702277030831524e-11, "loss": 0.1306, "step": 4762 }, { "epoch": 0.9997900923593619, "grad_norm": 0.13745978474617004, "learning_rate": 1.3425571060432518e-11, "loss": 0.1508, "step": 4763 }, { "epoch": 1.0, "grad_norm": 0.1369509994983673, "learning_rate": 0.0, "loss": 0.1475, "step": 4764 }, { "epoch": 1.0, "step": 4764, "total_flos": 3.4097062376237957e+18, "train_loss": 0.17368370302859773, "train_runtime": 25648.7488, "train_samples_per_second": 5.943, "train_steps_per_second": 0.186 } ], "logging_steps": 1.0, "max_steps": 4764, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.4097062376237957e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }