{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9985775248933146, "eval_steps": 500, "global_step": 3162, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000948316737790422, "grad_norm": 12.246086019020522, "learning_rate": 3.1545741324921134e-08, "loss": 0.3675, "step": 1 }, { "epoch": 0.001896633475580844, "grad_norm": 11.694026905196086, "learning_rate": 6.309148264984227e-08, "loss": 0.3513, "step": 2 }, { "epoch": 0.002844950213371266, "grad_norm": 10.688899348005608, "learning_rate": 9.463722397476342e-08, "loss": 0.3177, "step": 3 }, { "epoch": 0.003793266951161688, "grad_norm": 11.921298417211082, "learning_rate": 1.2618296529968454e-07, "loss": 0.3556, "step": 4 }, { "epoch": 0.00474158368895211, "grad_norm": 11.282902382225787, "learning_rate": 1.5772870662460568e-07, "loss": 0.3662, "step": 5 }, { "epoch": 0.005689900426742532, "grad_norm": 12.449826286939778, "learning_rate": 1.8927444794952683e-07, "loss": 0.3546, "step": 6 }, { "epoch": 0.006638217164532954, "grad_norm": 11.727097480206721, "learning_rate": 2.2082018927444798e-07, "loss": 0.3635, "step": 7 }, { "epoch": 0.007586533902323376, "grad_norm": 11.441751312661113, "learning_rate": 2.5236593059936907e-07, "loss": 0.3636, "step": 8 }, { "epoch": 0.008534850640113799, "grad_norm": 10.632930884848795, "learning_rate": 2.8391167192429027e-07, "loss": 0.2923, "step": 9 }, { "epoch": 0.00948316737790422, "grad_norm": 11.025857208188647, "learning_rate": 3.1545741324921137e-07, "loss": 0.3449, "step": 10 }, { "epoch": 0.010431484115694643, "grad_norm": 11.86359857266447, "learning_rate": 3.470031545741325e-07, "loss": 0.3354, "step": 11 }, { "epoch": 0.011379800853485065, "grad_norm": 11.01751351812872, "learning_rate": 3.7854889589905366e-07, "loss": 0.3369, "step": 12 }, { "epoch": 0.012328117591275486, "grad_norm": 9.502190495628849, "learning_rate": 4.100946372239748e-07, "loss": 0.3179, "step": 13 }, { "epoch": 0.013276434329065908, "grad_norm": 7.858408977040966, "learning_rate": 4.4164037854889596e-07, "loss": 0.2565, "step": 14 }, { "epoch": 0.01422475106685633, "grad_norm": 8.154333698814211, "learning_rate": 4.7318611987381705e-07, "loss": 0.2589, "step": 15 }, { "epoch": 0.015173067804646752, "grad_norm": 8.475444781638856, "learning_rate": 5.047318611987381e-07, "loss": 0.3001, "step": 16 }, { "epoch": 0.016121384542437174, "grad_norm": 7.13737669899092, "learning_rate": 5.362776025236594e-07, "loss": 0.2641, "step": 17 }, { "epoch": 0.017069701280227598, "grad_norm": 5.554684184392653, "learning_rate": 5.678233438485805e-07, "loss": 0.1902, "step": 18 }, { "epoch": 0.018018018018018018, "grad_norm": 4.568211300813283, "learning_rate": 5.993690851735017e-07, "loss": 0.208, "step": 19 }, { "epoch": 0.01896633475580844, "grad_norm": 4.7579569152913646, "learning_rate": 6.309148264984227e-07, "loss": 0.1994, "step": 20 }, { "epoch": 0.01991465149359886, "grad_norm": 4.7128465676673486, "learning_rate": 6.62460567823344e-07, "loss": 0.229, "step": 21 }, { "epoch": 0.020862968231389285, "grad_norm": 4.005405411985473, "learning_rate": 6.94006309148265e-07, "loss": 0.2095, "step": 22 }, { "epoch": 0.021811284969179705, "grad_norm": 4.676075338959145, "learning_rate": 7.255520504731863e-07, "loss": 0.2178, "step": 23 }, { "epoch": 0.02275960170697013, "grad_norm": 2.3652635706654435, "learning_rate": 7.570977917981073e-07, "loss": 0.1524, "step": 24 }, { "epoch": 0.02370791844476055, "grad_norm": 2.685337556789167, "learning_rate": 7.886435331230284e-07, "loss": 0.1672, "step": 25 }, { "epoch": 0.024656235182550973, "grad_norm": 2.430942973848189, "learning_rate": 8.201892744479496e-07, "loss": 0.1526, "step": 26 }, { "epoch": 0.025604551920341393, "grad_norm": 2.6467399445694286, "learning_rate": 8.517350157728707e-07, "loss": 0.1605, "step": 27 }, { "epoch": 0.026552868658131817, "grad_norm": 1.9805248826128374, "learning_rate": 8.832807570977919e-07, "loss": 0.1223, "step": 28 }, { "epoch": 0.027501185395922237, "grad_norm": 2.2695664454959785, "learning_rate": 9.148264984227131e-07, "loss": 0.1991, "step": 29 }, { "epoch": 0.02844950213371266, "grad_norm": 2.221333597086963, "learning_rate": 9.463722397476341e-07, "loss": 0.1466, "step": 30 }, { "epoch": 0.02939781887150308, "grad_norm": 2.4412316593782633, "learning_rate": 9.779179810725552e-07, "loss": 0.1757, "step": 31 }, { "epoch": 0.030346135609293504, "grad_norm": 2.3894901293863198, "learning_rate": 1.0094637223974763e-06, "loss": 0.1467, "step": 32 }, { "epoch": 0.031294452347083924, "grad_norm": 2.2254181707911003, "learning_rate": 1.0410094637223975e-06, "loss": 0.1403, "step": 33 }, { "epoch": 0.03224276908487435, "grad_norm": 2.0835670435267573, "learning_rate": 1.0725552050473188e-06, "loss": 0.1295, "step": 34 }, { "epoch": 0.03319108582266477, "grad_norm": 2.490534255553767, "learning_rate": 1.1041009463722398e-06, "loss": 0.1781, "step": 35 }, { "epoch": 0.034139402560455195, "grad_norm": 2.4852979753797526, "learning_rate": 1.135646687697161e-06, "loss": 0.1635, "step": 36 }, { "epoch": 0.03508771929824561, "grad_norm": 2.1821267836376093, "learning_rate": 1.1671924290220821e-06, "loss": 0.1536, "step": 37 }, { "epoch": 0.036036036036036036, "grad_norm": 2.0995981360286193, "learning_rate": 1.1987381703470034e-06, "loss": 0.1892, "step": 38 }, { "epoch": 0.03698435277382646, "grad_norm": 1.7671849396147046, "learning_rate": 1.2302839116719244e-06, "loss": 0.1441, "step": 39 }, { "epoch": 0.03793266951161688, "grad_norm": 1.9145592423222202, "learning_rate": 1.2618296529968455e-06, "loss": 0.1515, "step": 40 }, { "epoch": 0.0388809862494073, "grad_norm": 2.909485299628588, "learning_rate": 1.2933753943217667e-06, "loss": 0.176, "step": 41 }, { "epoch": 0.03982930298719772, "grad_norm": 1.6871888183428478, "learning_rate": 1.324921135646688e-06, "loss": 0.1215, "step": 42 }, { "epoch": 0.04077761972498815, "grad_norm": 1.5396564532901138, "learning_rate": 1.3564668769716088e-06, "loss": 0.1334, "step": 43 }, { "epoch": 0.04172593646277857, "grad_norm": 2.205033481070129, "learning_rate": 1.38801261829653e-06, "loss": 0.1397, "step": 44 }, { "epoch": 0.04267425320056899, "grad_norm": 1.8497757762613358, "learning_rate": 1.4195583596214513e-06, "loss": 0.1274, "step": 45 }, { "epoch": 0.04362256993835941, "grad_norm": 1.9376215434540043, "learning_rate": 1.4511041009463726e-06, "loss": 0.1228, "step": 46 }, { "epoch": 0.044570886676149835, "grad_norm": 1.594889970345864, "learning_rate": 1.4826498422712934e-06, "loss": 0.1137, "step": 47 }, { "epoch": 0.04551920341394026, "grad_norm": 1.7592570423176281, "learning_rate": 1.5141955835962146e-06, "loss": 0.141, "step": 48 }, { "epoch": 0.046467520151730675, "grad_norm": 1.6146283602515956, "learning_rate": 1.545741324921136e-06, "loss": 0.1428, "step": 49 }, { "epoch": 0.0474158368895211, "grad_norm": 1.503278573378982, "learning_rate": 1.5772870662460567e-06, "loss": 0.1318, "step": 50 }, { "epoch": 0.04836415362731152, "grad_norm": 1.37572777178569, "learning_rate": 1.608832807570978e-06, "loss": 0.1315, "step": 51 }, { "epoch": 0.049312470365101946, "grad_norm": 1.6002275154635794, "learning_rate": 1.6403785488958992e-06, "loss": 0.0935, "step": 52 }, { "epoch": 0.05026078710289237, "grad_norm": 1.9567696662008847, "learning_rate": 1.6719242902208203e-06, "loss": 0.1271, "step": 53 }, { "epoch": 0.051209103840682786, "grad_norm": 1.601626063178932, "learning_rate": 1.7034700315457413e-06, "loss": 0.0959, "step": 54 }, { "epoch": 0.05215742057847321, "grad_norm": 1.886431535590579, "learning_rate": 1.7350157728706626e-06, "loss": 0.1218, "step": 55 }, { "epoch": 0.05310573731626363, "grad_norm": 1.5354494166136305, "learning_rate": 1.7665615141955838e-06, "loss": 0.1139, "step": 56 }, { "epoch": 0.05405405405405406, "grad_norm": 2.311230053300576, "learning_rate": 1.7981072555205049e-06, "loss": 0.1426, "step": 57 }, { "epoch": 0.055002370791844474, "grad_norm": 1.6253071180005185, "learning_rate": 1.8296529968454261e-06, "loss": 0.1175, "step": 58 }, { "epoch": 0.0559506875296349, "grad_norm": 1.3821063491809322, "learning_rate": 1.8611987381703472e-06, "loss": 0.132, "step": 59 }, { "epoch": 0.05689900426742532, "grad_norm": 1.7624392868013044, "learning_rate": 1.8927444794952682e-06, "loss": 0.1221, "step": 60 }, { "epoch": 0.057847321005215745, "grad_norm": 1.3398437874784876, "learning_rate": 1.9242902208201892e-06, "loss": 0.125, "step": 61 }, { "epoch": 0.05879563774300616, "grad_norm": 1.562570182505017, "learning_rate": 1.9558359621451105e-06, "loss": 0.1413, "step": 62 }, { "epoch": 0.059743954480796585, "grad_norm": 1.6769755616188486, "learning_rate": 1.9873817034700317e-06, "loss": 0.1559, "step": 63 }, { "epoch": 0.06069227121858701, "grad_norm": 1.3917364499829268, "learning_rate": 2.0189274447949526e-06, "loss": 0.1377, "step": 64 }, { "epoch": 0.06164058795637743, "grad_norm": 1.8502674559797263, "learning_rate": 2.050473186119874e-06, "loss": 0.1487, "step": 65 }, { "epoch": 0.06258890469416785, "grad_norm": 3.158783977874437, "learning_rate": 2.082018927444795e-06, "loss": 0.119, "step": 66 }, { "epoch": 0.06353722143195828, "grad_norm": 1.811584236109641, "learning_rate": 2.1135646687697163e-06, "loss": 0.122, "step": 67 }, { "epoch": 0.0644855381697487, "grad_norm": 2.917344328319794, "learning_rate": 2.1451104100946376e-06, "loss": 0.1313, "step": 68 }, { "epoch": 0.06543385490753911, "grad_norm": 1.8029019845335916, "learning_rate": 2.1766561514195584e-06, "loss": 0.1138, "step": 69 }, { "epoch": 0.06638217164532954, "grad_norm": 1.6898543330406532, "learning_rate": 2.2082018927444797e-06, "loss": 0.1191, "step": 70 }, { "epoch": 0.06733048838311996, "grad_norm": 2.2925732127308214, "learning_rate": 2.239747634069401e-06, "loss": 0.1306, "step": 71 }, { "epoch": 0.06827880512091039, "grad_norm": 1.4433490292568716, "learning_rate": 2.271293375394322e-06, "loss": 0.1055, "step": 72 }, { "epoch": 0.06922712185870081, "grad_norm": 1.3862506183642664, "learning_rate": 2.302839116719243e-06, "loss": 0.1075, "step": 73 }, { "epoch": 0.07017543859649122, "grad_norm": 1.2816575561632197, "learning_rate": 2.3343848580441643e-06, "loss": 0.1028, "step": 74 }, { "epoch": 0.07112375533428165, "grad_norm": 1.893923472034316, "learning_rate": 2.3659305993690855e-06, "loss": 0.1011, "step": 75 }, { "epoch": 0.07207207207207207, "grad_norm": 1.6025824634915868, "learning_rate": 2.3974763406940068e-06, "loss": 0.1317, "step": 76 }, { "epoch": 0.07302038880986249, "grad_norm": 1.7176261068301808, "learning_rate": 2.4290220820189276e-06, "loss": 0.1447, "step": 77 }, { "epoch": 0.07396870554765292, "grad_norm": 2.4231160050612863, "learning_rate": 2.460567823343849e-06, "loss": 0.1384, "step": 78 }, { "epoch": 0.07491702228544334, "grad_norm": 1.2193411546548798, "learning_rate": 2.49211356466877e-06, "loss": 0.0992, "step": 79 }, { "epoch": 0.07586533902323377, "grad_norm": 1.5164983059809367, "learning_rate": 2.523659305993691e-06, "loss": 0.1001, "step": 80 }, { "epoch": 0.07681365576102418, "grad_norm": 1.6017905795769134, "learning_rate": 2.5552050473186126e-06, "loss": 0.1048, "step": 81 }, { "epoch": 0.0777619724988146, "grad_norm": 1.1836761079302904, "learning_rate": 2.5867507886435334e-06, "loss": 0.0982, "step": 82 }, { "epoch": 0.07871028923660503, "grad_norm": 3.3493572839513566, "learning_rate": 2.6182965299684543e-06, "loss": 0.1184, "step": 83 }, { "epoch": 0.07965860597439545, "grad_norm": 2.0313461386241722, "learning_rate": 2.649842271293376e-06, "loss": 0.1174, "step": 84 }, { "epoch": 0.08060692271218586, "grad_norm": 1.7152579326543271, "learning_rate": 2.6813880126182968e-06, "loss": 0.117, "step": 85 }, { "epoch": 0.0815552394499763, "grad_norm": 1.9082671591126898, "learning_rate": 2.7129337539432176e-06, "loss": 0.1538, "step": 86 }, { "epoch": 0.08250355618776671, "grad_norm": 1.1544236926306861, "learning_rate": 2.7444794952681393e-06, "loss": 0.0906, "step": 87 }, { "epoch": 0.08345187292555714, "grad_norm": 1.2516823902614436, "learning_rate": 2.77602523659306e-06, "loss": 0.1452, "step": 88 }, { "epoch": 0.08440018966334756, "grad_norm": 1.0339206815219761, "learning_rate": 2.807570977917981e-06, "loss": 0.0836, "step": 89 }, { "epoch": 0.08534850640113797, "grad_norm": 1.668394516826565, "learning_rate": 2.8391167192429026e-06, "loss": 0.1129, "step": 90 }, { "epoch": 0.0862968231389284, "grad_norm": 1.497152077632149, "learning_rate": 2.8706624605678234e-06, "loss": 0.1062, "step": 91 }, { "epoch": 0.08724513987671882, "grad_norm": 1.197731872894548, "learning_rate": 2.902208201892745e-06, "loss": 0.1102, "step": 92 }, { "epoch": 0.08819345661450925, "grad_norm": 1.4271367098608596, "learning_rate": 2.933753943217666e-06, "loss": 0.1175, "step": 93 }, { "epoch": 0.08914177335229967, "grad_norm": 1.6337936419255448, "learning_rate": 2.9652996845425868e-06, "loss": 0.1148, "step": 94 }, { "epoch": 0.09009009009009009, "grad_norm": 1.7427850789821318, "learning_rate": 2.9968454258675085e-06, "loss": 0.1246, "step": 95 }, { "epoch": 0.09103840682788052, "grad_norm": 1.2870967429199511, "learning_rate": 3.0283911671924293e-06, "loss": 0.087, "step": 96 }, { "epoch": 0.09198672356567093, "grad_norm": 1.1311991406490958, "learning_rate": 3.05993690851735e-06, "loss": 0.0996, "step": 97 }, { "epoch": 0.09293504030346135, "grad_norm": 1.3742928108454626, "learning_rate": 3.091482649842272e-06, "loss": 0.1293, "step": 98 }, { "epoch": 0.09388335704125178, "grad_norm": 1.516028333204866, "learning_rate": 3.1230283911671926e-06, "loss": 0.1078, "step": 99 }, { "epoch": 0.0948316737790422, "grad_norm": 1.1404699421620854, "learning_rate": 3.1545741324921135e-06, "loss": 0.1142, "step": 100 }, { "epoch": 0.09577999051683263, "grad_norm": 1.7924070029612504, "learning_rate": 3.186119873817035e-06, "loss": 0.1419, "step": 101 }, { "epoch": 0.09672830725462304, "grad_norm": 1.7297435544466835, "learning_rate": 3.217665615141956e-06, "loss": 0.117, "step": 102 }, { "epoch": 0.09767662399241346, "grad_norm": 1.2304625316537265, "learning_rate": 3.2492113564668772e-06, "loss": 0.0834, "step": 103 }, { "epoch": 0.09862494073020389, "grad_norm": 1.6554297434059837, "learning_rate": 3.2807570977917985e-06, "loss": 0.1251, "step": 104 }, { "epoch": 0.09957325746799431, "grad_norm": 1.9749022078409877, "learning_rate": 3.3123028391167193e-06, "loss": 0.1485, "step": 105 }, { "epoch": 0.10052157420578474, "grad_norm": 1.7816458729316766, "learning_rate": 3.3438485804416405e-06, "loss": 0.1168, "step": 106 }, { "epoch": 0.10146989094357516, "grad_norm": 1.6366795934026652, "learning_rate": 3.375394321766562e-06, "loss": 0.1221, "step": 107 }, { "epoch": 0.10241820768136557, "grad_norm": 1.0846701931516913, "learning_rate": 3.4069400630914826e-06, "loss": 0.086, "step": 108 }, { "epoch": 0.103366524419156, "grad_norm": 1.374499027535318, "learning_rate": 3.4384858044164043e-06, "loss": 0.1097, "step": 109 }, { "epoch": 0.10431484115694642, "grad_norm": 1.7733166976712489, "learning_rate": 3.470031545741325e-06, "loss": 0.1098, "step": 110 }, { "epoch": 0.10526315789473684, "grad_norm": 2.980678296553409, "learning_rate": 3.5015772870662464e-06, "loss": 0.0917, "step": 111 }, { "epoch": 0.10621147463252727, "grad_norm": 0.9904577188744437, "learning_rate": 3.5331230283911676e-06, "loss": 0.0777, "step": 112 }, { "epoch": 0.10715979137031768, "grad_norm": 1.4107631975145143, "learning_rate": 3.5646687697160885e-06, "loss": 0.0902, "step": 113 }, { "epoch": 0.10810810810810811, "grad_norm": 1.786967710835369, "learning_rate": 3.5962145110410097e-06, "loss": 0.0974, "step": 114 }, { "epoch": 0.10905642484589853, "grad_norm": 1.6278373703409408, "learning_rate": 3.627760252365931e-06, "loss": 0.1031, "step": 115 }, { "epoch": 0.11000474158368895, "grad_norm": 1.0856958566494381, "learning_rate": 3.6593059936908522e-06, "loss": 0.0872, "step": 116 }, { "epoch": 0.11095305832147938, "grad_norm": 1.0746142572780863, "learning_rate": 3.690851735015773e-06, "loss": 0.0753, "step": 117 }, { "epoch": 0.1119013750592698, "grad_norm": 1.794687772916648, "learning_rate": 3.7223974763406943e-06, "loss": 0.094, "step": 118 }, { "epoch": 0.11284969179706021, "grad_norm": 2.0574961246450543, "learning_rate": 3.7539432176656156e-06, "loss": 0.1032, "step": 119 }, { "epoch": 0.11379800853485064, "grad_norm": 1.0887603543641189, "learning_rate": 3.7854889589905364e-06, "loss": 0.0869, "step": 120 }, { "epoch": 0.11474632527264106, "grad_norm": 1.4381581196511768, "learning_rate": 3.817034700315458e-06, "loss": 0.105, "step": 121 }, { "epoch": 0.11569464201043149, "grad_norm": 2.0884869908112984, "learning_rate": 3.8485804416403785e-06, "loss": 0.1072, "step": 122 }, { "epoch": 0.1166429587482219, "grad_norm": 1.3918016525882038, "learning_rate": 3.8801261829653e-06, "loss": 0.0995, "step": 123 }, { "epoch": 0.11759127548601232, "grad_norm": 1.1199618265144746, "learning_rate": 3.911671924290221e-06, "loss": 0.0693, "step": 124 }, { "epoch": 0.11853959222380275, "grad_norm": 2.913976726787567, "learning_rate": 3.943217665615142e-06, "loss": 0.1203, "step": 125 }, { "epoch": 0.11948790896159317, "grad_norm": 1.4548880466216083, "learning_rate": 3.9747634069400635e-06, "loss": 0.0891, "step": 126 }, { "epoch": 0.1204362256993836, "grad_norm": 3.1711394720986235, "learning_rate": 4.006309148264985e-06, "loss": 0.1223, "step": 127 }, { "epoch": 0.12138454243717402, "grad_norm": 1.888765811166245, "learning_rate": 4.037854889589905e-06, "loss": 0.13, "step": 128 }, { "epoch": 0.12233285917496443, "grad_norm": 1.2398551211997078, "learning_rate": 4.069400630914827e-06, "loss": 0.1103, "step": 129 }, { "epoch": 0.12328117591275486, "grad_norm": 1.7438322556304724, "learning_rate": 4.100946372239748e-06, "loss": 0.1147, "step": 130 }, { "epoch": 0.12422949265054528, "grad_norm": 0.9363387889716617, "learning_rate": 4.132492113564669e-06, "loss": 0.0995, "step": 131 }, { "epoch": 0.1251778093883357, "grad_norm": 1.446859084810851, "learning_rate": 4.16403785488959e-06, "loss": 0.0994, "step": 132 }, { "epoch": 0.12612612612612611, "grad_norm": 1.1856203072681963, "learning_rate": 4.195583596214511e-06, "loss": 0.0927, "step": 133 }, { "epoch": 0.12707444286391656, "grad_norm": 1.103336827372462, "learning_rate": 4.227129337539433e-06, "loss": 0.0815, "step": 134 }, { "epoch": 0.12802275960170698, "grad_norm": 1.897384655096208, "learning_rate": 4.258675078864354e-06, "loss": 0.1248, "step": 135 }, { "epoch": 0.1289710763394974, "grad_norm": 1.6223901695891558, "learning_rate": 4.290220820189275e-06, "loss": 0.1456, "step": 136 }, { "epoch": 0.1299193930772878, "grad_norm": 1.93689861193564, "learning_rate": 4.321766561514196e-06, "loss": 0.1236, "step": 137 }, { "epoch": 0.13086770981507823, "grad_norm": 1.7202395942479507, "learning_rate": 4.353312302839117e-06, "loss": 0.0994, "step": 138 }, { "epoch": 0.13181602655286867, "grad_norm": 2.1336251410837717, "learning_rate": 4.384858044164038e-06, "loss": 0.0963, "step": 139 }, { "epoch": 0.1327643432906591, "grad_norm": 2.086908025505944, "learning_rate": 4.416403785488959e-06, "loss": 0.1397, "step": 140 }, { "epoch": 0.1337126600284495, "grad_norm": 1.903049841336412, "learning_rate": 4.447949526813881e-06, "loss": 0.1188, "step": 141 }, { "epoch": 0.13466097676623992, "grad_norm": 1.237639055790405, "learning_rate": 4.479495268138802e-06, "loss": 0.0864, "step": 142 }, { "epoch": 0.13560929350403034, "grad_norm": 1.533833989919448, "learning_rate": 4.511041009463723e-06, "loss": 0.1188, "step": 143 }, { "epoch": 0.13655761024182078, "grad_norm": 1.7546010414420699, "learning_rate": 4.542586750788644e-06, "loss": 0.1364, "step": 144 }, { "epoch": 0.1375059269796112, "grad_norm": 2.9799276151902645, "learning_rate": 4.574132492113565e-06, "loss": 0.1226, "step": 145 }, { "epoch": 0.13845424371740161, "grad_norm": 1.1723876001348499, "learning_rate": 4.605678233438486e-06, "loss": 0.086, "step": 146 }, { "epoch": 0.13940256045519203, "grad_norm": 2.069220754870492, "learning_rate": 4.637223974763407e-06, "loss": 0.1196, "step": 147 }, { "epoch": 0.14035087719298245, "grad_norm": 3.9795001087139124, "learning_rate": 4.6687697160883285e-06, "loss": 0.1152, "step": 148 }, { "epoch": 0.14129919393077287, "grad_norm": 1.4634422746453415, "learning_rate": 4.70031545741325e-06, "loss": 0.0916, "step": 149 }, { "epoch": 0.1422475106685633, "grad_norm": 1.3185726560010742, "learning_rate": 4.731861198738171e-06, "loss": 0.0904, "step": 150 }, { "epoch": 0.14319582740635373, "grad_norm": 1.5552910898557228, "learning_rate": 4.7634069400630914e-06, "loss": 0.0899, "step": 151 }, { "epoch": 0.14414414414414414, "grad_norm": 1.0997805514097108, "learning_rate": 4.7949526813880135e-06, "loss": 0.0795, "step": 152 }, { "epoch": 0.14509246088193456, "grad_norm": 1.7076641753438397, "learning_rate": 4.826498422712934e-06, "loss": 0.1081, "step": 153 }, { "epoch": 0.14604077761972498, "grad_norm": 1.6735518250841006, "learning_rate": 4.858044164037855e-06, "loss": 0.1068, "step": 154 }, { "epoch": 0.14698909435751542, "grad_norm": 1.2033878521779449, "learning_rate": 4.8895899053627764e-06, "loss": 0.0934, "step": 155 }, { "epoch": 0.14793741109530584, "grad_norm": 1.4908718795559122, "learning_rate": 4.921135646687698e-06, "loss": 0.1072, "step": 156 }, { "epoch": 0.14888572783309625, "grad_norm": 1.3234990953707453, "learning_rate": 4.952681388012618e-06, "loss": 0.104, "step": 157 }, { "epoch": 0.14983404457088667, "grad_norm": 1.3354249814975963, "learning_rate": 4.98422712933754e-06, "loss": 0.1189, "step": 158 }, { "epoch": 0.1507823613086771, "grad_norm": 1.224445144859879, "learning_rate": 5.015772870662461e-06, "loss": 0.1114, "step": 159 }, { "epoch": 0.15173067804646753, "grad_norm": 1.3554440133569026, "learning_rate": 5.047318611987382e-06, "loss": 0.1278, "step": 160 }, { "epoch": 0.15267899478425795, "grad_norm": 1.4393478098545054, "learning_rate": 5.078864353312303e-06, "loss": 0.1201, "step": 161 }, { "epoch": 0.15362731152204837, "grad_norm": 1.032684740456474, "learning_rate": 5.110410094637225e-06, "loss": 0.0841, "step": 162 }, { "epoch": 0.15457562825983878, "grad_norm": 1.2508286920209446, "learning_rate": 5.141955835962146e-06, "loss": 0.0863, "step": 163 }, { "epoch": 0.1555239449976292, "grad_norm": 1.899093372512286, "learning_rate": 5.173501577287067e-06, "loss": 0.1662, "step": 164 }, { "epoch": 0.15647226173541964, "grad_norm": 1.4000014551423334, "learning_rate": 5.205047318611987e-06, "loss": 0.0909, "step": 165 }, { "epoch": 0.15742057847321006, "grad_norm": 1.9418542456678585, "learning_rate": 5.2365930599369085e-06, "loss": 0.1013, "step": 166 }, { "epoch": 0.15836889521100048, "grad_norm": 1.5538903766146939, "learning_rate": 5.268138801261831e-06, "loss": 0.1177, "step": 167 }, { "epoch": 0.1593172119487909, "grad_norm": 1.3035129364423688, "learning_rate": 5.299684542586752e-06, "loss": 0.0961, "step": 168 }, { "epoch": 0.1602655286865813, "grad_norm": 1.273421849890499, "learning_rate": 5.331230283911672e-06, "loss": 0.1252, "step": 169 }, { "epoch": 0.16121384542437173, "grad_norm": 1.123016604976548, "learning_rate": 5.3627760252365935e-06, "loss": 0.0999, "step": 170 }, { "epoch": 0.16216216216216217, "grad_norm": 1.2409364166994, "learning_rate": 5.394321766561515e-06, "loss": 0.1095, "step": 171 }, { "epoch": 0.1631104788999526, "grad_norm": 1.1082140455460585, "learning_rate": 5.425867507886435e-06, "loss": 0.0736, "step": 172 }, { "epoch": 0.164058795637743, "grad_norm": 1.2872459579560394, "learning_rate": 5.457413249211357e-06, "loss": 0.0928, "step": 173 }, { "epoch": 0.16500711237553342, "grad_norm": 1.3830237110418746, "learning_rate": 5.4889589905362786e-06, "loss": 0.0973, "step": 174 }, { "epoch": 0.16595542911332384, "grad_norm": 1.2546887092347754, "learning_rate": 5.520504731861199e-06, "loss": 0.0832, "step": 175 }, { "epoch": 0.16690374585111428, "grad_norm": 1.1708284069676944, "learning_rate": 5.55205047318612e-06, "loss": 0.1075, "step": 176 }, { "epoch": 0.1678520625889047, "grad_norm": 1.101853335061695, "learning_rate": 5.5835962145110415e-06, "loss": 0.0897, "step": 177 }, { "epoch": 0.16880037932669512, "grad_norm": 1.015907357215909, "learning_rate": 5.615141955835962e-06, "loss": 0.0819, "step": 178 }, { "epoch": 0.16974869606448553, "grad_norm": 1.8752154604515816, "learning_rate": 5.646687697160884e-06, "loss": 0.1021, "step": 179 }, { "epoch": 0.17069701280227595, "grad_norm": 1.6971011710183759, "learning_rate": 5.678233438485805e-06, "loss": 0.0996, "step": 180 }, { "epoch": 0.1716453295400664, "grad_norm": 1.2212507178791898, "learning_rate": 5.709779179810726e-06, "loss": 0.1079, "step": 181 }, { "epoch": 0.1725936462778568, "grad_norm": 1.7343284525300247, "learning_rate": 5.741324921135647e-06, "loss": 0.1292, "step": 182 }, { "epoch": 0.17354196301564723, "grad_norm": 1.4376592014404461, "learning_rate": 5.772870662460568e-06, "loss": 0.1312, "step": 183 }, { "epoch": 0.17449027975343764, "grad_norm": 1.2528619821880524, "learning_rate": 5.80441640378549e-06, "loss": 0.0762, "step": 184 }, { "epoch": 0.17543859649122806, "grad_norm": 1.9247297159171304, "learning_rate": 5.835962145110411e-06, "loss": 0.1403, "step": 185 }, { "epoch": 0.1763869132290185, "grad_norm": 1.5028101353474104, "learning_rate": 5.867507886435332e-06, "loss": 0.1147, "step": 186 }, { "epoch": 0.17733522996680892, "grad_norm": 2.4179600186213714, "learning_rate": 5.899053627760253e-06, "loss": 0.0913, "step": 187 }, { "epoch": 0.17828354670459934, "grad_norm": 1.518835105924909, "learning_rate": 5.9305993690851736e-06, "loss": 0.0918, "step": 188 }, { "epoch": 0.17923186344238975, "grad_norm": 1.6543687104918372, "learning_rate": 5.962145110410095e-06, "loss": 0.122, "step": 189 }, { "epoch": 0.18018018018018017, "grad_norm": 1.4531807393638785, "learning_rate": 5.993690851735017e-06, "loss": 0.1228, "step": 190 }, { "epoch": 0.1811284969179706, "grad_norm": 1.4665808153812976, "learning_rate": 6.025236593059937e-06, "loss": 0.1014, "step": 191 }, { "epoch": 0.18207681365576103, "grad_norm": 1.2889682170490027, "learning_rate": 6.056782334384859e-06, "loss": 0.1055, "step": 192 }, { "epoch": 0.18302513039355145, "grad_norm": 1.3310497561635966, "learning_rate": 6.08832807570978e-06, "loss": 0.119, "step": 193 }, { "epoch": 0.18397344713134187, "grad_norm": 1.3246051325093873, "learning_rate": 6.1198738170347e-06, "loss": 0.1288, "step": 194 }, { "epoch": 0.18492176386913228, "grad_norm": 1.1979924093987135, "learning_rate": 6.1514195583596215e-06, "loss": 0.0877, "step": 195 }, { "epoch": 0.1858700806069227, "grad_norm": 1.1280419900810446, "learning_rate": 6.182965299684544e-06, "loss": 0.1085, "step": 196 }, { "epoch": 0.18681839734471314, "grad_norm": 1.3307017446168579, "learning_rate": 6.214511041009465e-06, "loss": 0.0853, "step": 197 }, { "epoch": 0.18776671408250356, "grad_norm": 1.1814823672365349, "learning_rate": 6.246056782334385e-06, "loss": 0.1066, "step": 198 }, { "epoch": 0.18871503082029398, "grad_norm": 0.7829348670836794, "learning_rate": 6.2776025236593065e-06, "loss": 0.0662, "step": 199 }, { "epoch": 0.1896633475580844, "grad_norm": 1.2435224715978643, "learning_rate": 6.309148264984227e-06, "loss": 0.088, "step": 200 }, { "epoch": 0.1906116642958748, "grad_norm": 1.0014149948809556, "learning_rate": 6.340694006309149e-06, "loss": 0.0975, "step": 201 }, { "epoch": 0.19155998103366526, "grad_norm": 0.9250673471848995, "learning_rate": 6.37223974763407e-06, "loss": 0.0877, "step": 202 }, { "epoch": 0.19250829777145567, "grad_norm": 1.056412139362465, "learning_rate": 6.4037854889589915e-06, "loss": 0.0763, "step": 203 }, { "epoch": 0.1934566145092461, "grad_norm": 0.9891782097788515, "learning_rate": 6.435331230283912e-06, "loss": 0.0834, "step": 204 }, { "epoch": 0.1944049312470365, "grad_norm": 1.0792725374885792, "learning_rate": 6.466876971608833e-06, "loss": 0.0885, "step": 205 }, { "epoch": 0.19535324798482692, "grad_norm": 1.2366811021393578, "learning_rate": 6.4984227129337544e-06, "loss": 0.0954, "step": 206 }, { "epoch": 0.19630156472261737, "grad_norm": 1.024115365006771, "learning_rate": 6.529968454258676e-06, "loss": 0.1215, "step": 207 }, { "epoch": 0.19724988146040778, "grad_norm": 1.2203185957532192, "learning_rate": 6.561514195583597e-06, "loss": 0.1202, "step": 208 }, { "epoch": 0.1981981981981982, "grad_norm": 0.9501403270885721, "learning_rate": 6.593059936908518e-06, "loss": 0.0715, "step": 209 }, { "epoch": 0.19914651493598862, "grad_norm": 1.5511308370546482, "learning_rate": 6.624605678233439e-06, "loss": 0.1089, "step": 210 }, { "epoch": 0.20009483167377903, "grad_norm": 0.9433860573102355, "learning_rate": 6.65615141955836e-06, "loss": 0.0648, "step": 211 }, { "epoch": 0.20104314841156948, "grad_norm": 1.0981902231687461, "learning_rate": 6.687697160883281e-06, "loss": 0.0663, "step": 212 }, { "epoch": 0.2019914651493599, "grad_norm": 1.064443363672458, "learning_rate": 6.719242902208203e-06, "loss": 0.077, "step": 213 }, { "epoch": 0.2029397818871503, "grad_norm": 1.3753290546304533, "learning_rate": 6.750788643533124e-06, "loss": 0.1093, "step": 214 }, { "epoch": 0.20388809862494073, "grad_norm": 1.2200081175269764, "learning_rate": 6.782334384858045e-06, "loss": 0.1094, "step": 215 }, { "epoch": 0.20483641536273114, "grad_norm": 0.9141258918864384, "learning_rate": 6.813880126182965e-06, "loss": 0.0911, "step": 216 }, { "epoch": 0.20578473210052156, "grad_norm": 2.528170753397052, "learning_rate": 6.8454258675078865e-06, "loss": 0.1079, "step": 217 }, { "epoch": 0.206733048838312, "grad_norm": 1.4430688823297448, "learning_rate": 6.876971608832809e-06, "loss": 0.1053, "step": 218 }, { "epoch": 0.20768136557610242, "grad_norm": 1.0186932336289805, "learning_rate": 6.90851735015773e-06, "loss": 0.0861, "step": 219 }, { "epoch": 0.20862968231389284, "grad_norm": 1.1420742589304766, "learning_rate": 6.94006309148265e-06, "loss": 0.094, "step": 220 }, { "epoch": 0.20957799905168326, "grad_norm": 1.2741420533987797, "learning_rate": 6.9716088328075715e-06, "loss": 0.0951, "step": 221 }, { "epoch": 0.21052631578947367, "grad_norm": 0.9075216722351295, "learning_rate": 7.003154574132493e-06, "loss": 0.0866, "step": 222 }, { "epoch": 0.21147463252726412, "grad_norm": 1.1980754719122302, "learning_rate": 7.034700315457413e-06, "loss": 0.0914, "step": 223 }, { "epoch": 0.21242294926505453, "grad_norm": 1.1939921471415105, "learning_rate": 7.066246056782335e-06, "loss": 0.1047, "step": 224 }, { "epoch": 0.21337126600284495, "grad_norm": 0.8519438677271276, "learning_rate": 7.0977917981072565e-06, "loss": 0.0941, "step": 225 }, { "epoch": 0.21431958274063537, "grad_norm": 0.789532854502906, "learning_rate": 7.129337539432177e-06, "loss": 0.0819, "step": 226 }, { "epoch": 0.21526789947842578, "grad_norm": 1.2111156014392817, "learning_rate": 7.160883280757098e-06, "loss": 0.1027, "step": 227 }, { "epoch": 0.21621621621621623, "grad_norm": 1.0588737043402552, "learning_rate": 7.1924290220820195e-06, "loss": 0.0952, "step": 228 }, { "epoch": 0.21716453295400664, "grad_norm": 0.933483217055125, "learning_rate": 7.22397476340694e-06, "loss": 0.0763, "step": 229 }, { "epoch": 0.21811284969179706, "grad_norm": 1.049586247769339, "learning_rate": 7.255520504731862e-06, "loss": 0.0789, "step": 230 }, { "epoch": 0.21906116642958748, "grad_norm": 1.1220808424289264, "learning_rate": 7.287066246056783e-06, "loss": 0.074, "step": 231 }, { "epoch": 0.2200094831673779, "grad_norm": 1.254391611101815, "learning_rate": 7.3186119873817045e-06, "loss": 0.093, "step": 232 }, { "epoch": 0.22095779990516834, "grad_norm": 1.274839766592392, "learning_rate": 7.350157728706625e-06, "loss": 0.0938, "step": 233 }, { "epoch": 0.22190611664295876, "grad_norm": 1.2629251738997191, "learning_rate": 7.381703470031546e-06, "loss": 0.1129, "step": 234 }, { "epoch": 0.22285443338074917, "grad_norm": 1.3595829605121952, "learning_rate": 7.413249211356468e-06, "loss": 0.1062, "step": 235 }, { "epoch": 0.2238027501185396, "grad_norm": 1.353026352957774, "learning_rate": 7.444794952681389e-06, "loss": 0.117, "step": 236 }, { "epoch": 0.22475106685633, "grad_norm": 1.3472351125895725, "learning_rate": 7.47634069400631e-06, "loss": 0.0827, "step": 237 }, { "epoch": 0.22569938359412042, "grad_norm": 0.9510770172761661, "learning_rate": 7.507886435331231e-06, "loss": 0.0759, "step": 238 }, { "epoch": 0.22664770033191087, "grad_norm": 1.2025915899822757, "learning_rate": 7.5394321766561515e-06, "loss": 0.0807, "step": 239 }, { "epoch": 0.22759601706970128, "grad_norm": 1.1640028047547857, "learning_rate": 7.570977917981073e-06, "loss": 0.0709, "step": 240 }, { "epoch": 0.2285443338074917, "grad_norm": 1.5223127858935517, "learning_rate": 7.602523659305995e-06, "loss": 0.1018, "step": 241 }, { "epoch": 0.22949265054528212, "grad_norm": 1.8495916864800697, "learning_rate": 7.634069400630916e-06, "loss": 0.0968, "step": 242 }, { "epoch": 0.23044096728307253, "grad_norm": 1.8476848640745251, "learning_rate": 7.665615141955837e-06, "loss": 0.086, "step": 243 }, { "epoch": 0.23138928402086298, "grad_norm": 1.4644626825262619, "learning_rate": 7.697160883280757e-06, "loss": 0.0974, "step": 244 }, { "epoch": 0.2323376007586534, "grad_norm": 1.8857810882326624, "learning_rate": 7.728706624605679e-06, "loss": 0.1036, "step": 245 }, { "epoch": 0.2332859174964438, "grad_norm": 1.7638762752182895, "learning_rate": 7.7602523659306e-06, "loss": 0.1097, "step": 246 }, { "epoch": 0.23423423423423423, "grad_norm": 1.2348758426158113, "learning_rate": 7.791798107255522e-06, "loss": 0.0866, "step": 247 }, { "epoch": 0.23518255097202465, "grad_norm": 1.1223471436540764, "learning_rate": 7.823343848580442e-06, "loss": 0.0564, "step": 248 }, { "epoch": 0.2361308677098151, "grad_norm": 0.8821001750676984, "learning_rate": 7.854889589905364e-06, "loss": 0.0696, "step": 249 }, { "epoch": 0.2370791844476055, "grad_norm": 0.9899264223411232, "learning_rate": 7.886435331230284e-06, "loss": 0.0702, "step": 250 }, { "epoch": 0.23802750118539592, "grad_norm": 0.9289219027994224, "learning_rate": 7.917981072555205e-06, "loss": 0.0843, "step": 251 }, { "epoch": 0.23897581792318634, "grad_norm": 1.0579670590751298, "learning_rate": 7.949526813880127e-06, "loss": 0.0921, "step": 252 }, { "epoch": 0.23992413466097676, "grad_norm": 1.4593486745973783, "learning_rate": 7.981072555205049e-06, "loss": 0.1229, "step": 253 }, { "epoch": 0.2408724513987672, "grad_norm": 0.9496576247693762, "learning_rate": 8.01261829652997e-06, "loss": 0.0861, "step": 254 }, { "epoch": 0.24182076813655762, "grad_norm": 1.1030565317688061, "learning_rate": 8.04416403785489e-06, "loss": 0.0893, "step": 255 }, { "epoch": 0.24276908487434803, "grad_norm": 0.9907604990146169, "learning_rate": 8.07570977917981e-06, "loss": 0.0928, "step": 256 }, { "epoch": 0.24371740161213845, "grad_norm": 0.9460810229319789, "learning_rate": 8.107255520504732e-06, "loss": 0.0974, "step": 257 }, { "epoch": 0.24466571834992887, "grad_norm": 0.8329291976282354, "learning_rate": 8.138801261829655e-06, "loss": 0.077, "step": 258 }, { "epoch": 0.24561403508771928, "grad_norm": 0.8587085474520708, "learning_rate": 8.170347003154575e-06, "loss": 0.0837, "step": 259 }, { "epoch": 0.24656235182550973, "grad_norm": 0.9113223159844124, "learning_rate": 8.201892744479495e-06, "loss": 0.088, "step": 260 }, { "epoch": 0.24751066856330015, "grad_norm": 0.8328940868524983, "learning_rate": 8.233438485804417e-06, "loss": 0.091, "step": 261 }, { "epoch": 0.24845898530109056, "grad_norm": 1.4264090310082065, "learning_rate": 8.264984227129338e-06, "loss": 0.1354, "step": 262 }, { "epoch": 0.24940730203888098, "grad_norm": 1.0550225951223755, "learning_rate": 8.296529968454258e-06, "loss": 0.0972, "step": 263 }, { "epoch": 0.2503556187766714, "grad_norm": 1.053508559451355, "learning_rate": 8.32807570977918e-06, "loss": 0.1035, "step": 264 }, { "epoch": 0.25130393551446184, "grad_norm": 1.4971087544821369, "learning_rate": 8.359621451104102e-06, "loss": 0.1001, "step": 265 }, { "epoch": 0.25225225225225223, "grad_norm": 1.075521297085326, "learning_rate": 8.391167192429023e-06, "loss": 0.0923, "step": 266 }, { "epoch": 0.2532005689900427, "grad_norm": 1.6910075728505873, "learning_rate": 8.422712933753943e-06, "loss": 0.1212, "step": 267 }, { "epoch": 0.2541488857278331, "grad_norm": 1.5073460991202734, "learning_rate": 8.454258675078865e-06, "loss": 0.087, "step": 268 }, { "epoch": 0.2550972024656235, "grad_norm": 1.0201575671512444, "learning_rate": 8.485804416403787e-06, "loss": 0.0871, "step": 269 }, { "epoch": 0.25604551920341395, "grad_norm": 1.1193230353064818, "learning_rate": 8.517350157728708e-06, "loss": 0.1031, "step": 270 }, { "epoch": 0.25699383594120434, "grad_norm": 1.3593779355277376, "learning_rate": 8.548895899053628e-06, "loss": 0.0861, "step": 271 }, { "epoch": 0.2579421526789948, "grad_norm": 1.5824627519870196, "learning_rate": 8.58044164037855e-06, "loss": 0.0998, "step": 272 }, { "epoch": 0.25889046941678523, "grad_norm": 2.316620691088296, "learning_rate": 8.61198738170347e-06, "loss": 0.1237, "step": 273 }, { "epoch": 0.2598387861545756, "grad_norm": 1.3708391836342668, "learning_rate": 8.643533123028391e-06, "loss": 0.0806, "step": 274 }, { "epoch": 0.26078710289236606, "grad_norm": 1.259879695037933, "learning_rate": 8.675078864353313e-06, "loss": 0.088, "step": 275 }, { "epoch": 0.26173541963015645, "grad_norm": 1.236718933875791, "learning_rate": 8.706624605678234e-06, "loss": 0.0842, "step": 276 }, { "epoch": 0.2626837363679469, "grad_norm": 1.438488419989871, "learning_rate": 8.738170347003156e-06, "loss": 0.0955, "step": 277 }, { "epoch": 0.26363205310573734, "grad_norm": 0.9563516338397714, "learning_rate": 8.769716088328076e-06, "loss": 0.0761, "step": 278 }, { "epoch": 0.26458036984352773, "grad_norm": 1.2728124128011007, "learning_rate": 8.801261829652997e-06, "loss": 0.0805, "step": 279 }, { "epoch": 0.2655286865813182, "grad_norm": 1.2205595373118223, "learning_rate": 8.832807570977919e-06, "loss": 0.0879, "step": 280 }, { "epoch": 0.26647700331910856, "grad_norm": 0.959493141925286, "learning_rate": 8.86435331230284e-06, "loss": 0.0728, "step": 281 }, { "epoch": 0.267425320056899, "grad_norm": 1.4340945839201555, "learning_rate": 8.895899053627761e-06, "loss": 0.0897, "step": 282 }, { "epoch": 0.26837363679468945, "grad_norm": 1.0061297486879381, "learning_rate": 8.927444794952682e-06, "loss": 0.0857, "step": 283 }, { "epoch": 0.26932195353247984, "grad_norm": 1.5459293734675696, "learning_rate": 8.958990536277604e-06, "loss": 0.1029, "step": 284 }, { "epoch": 0.2702702702702703, "grad_norm": 1.3222303946698841, "learning_rate": 8.990536277602524e-06, "loss": 0.084, "step": 285 }, { "epoch": 0.2712185870080607, "grad_norm": 1.185863549947665, "learning_rate": 9.022082018927446e-06, "loss": 0.1311, "step": 286 }, { "epoch": 0.2721669037458511, "grad_norm": 0.8959238307125761, "learning_rate": 9.053627760252367e-06, "loss": 0.067, "step": 287 }, { "epoch": 0.27311522048364156, "grad_norm": 1.369443136318961, "learning_rate": 9.085173501577289e-06, "loss": 0.1093, "step": 288 }, { "epoch": 0.27406353722143195, "grad_norm": 1.1052390238476015, "learning_rate": 9.116719242902209e-06, "loss": 0.103, "step": 289 }, { "epoch": 0.2750118539592224, "grad_norm": 1.325059650748033, "learning_rate": 9.14826498422713e-06, "loss": 0.1111, "step": 290 }, { "epoch": 0.2759601706970128, "grad_norm": 1.3248936963910136, "learning_rate": 9.17981072555205e-06, "loss": 0.0933, "step": 291 }, { "epoch": 0.27690848743480323, "grad_norm": 1.127118183479871, "learning_rate": 9.211356466876972e-06, "loss": 0.0891, "step": 292 }, { "epoch": 0.2778568041725936, "grad_norm": 1.3108916887707827, "learning_rate": 9.242902208201894e-06, "loss": 0.0939, "step": 293 }, { "epoch": 0.27880512091038406, "grad_norm": 1.0013886049046197, "learning_rate": 9.274447949526815e-06, "loss": 0.0692, "step": 294 }, { "epoch": 0.2797534376481745, "grad_norm": 1.1156101698361054, "learning_rate": 9.305993690851735e-06, "loss": 0.0868, "step": 295 }, { "epoch": 0.2807017543859649, "grad_norm": 1.2522202479933553, "learning_rate": 9.337539432176657e-06, "loss": 0.0914, "step": 296 }, { "epoch": 0.28165007112375534, "grad_norm": 1.3755827124206237, "learning_rate": 9.369085173501577e-06, "loss": 0.0936, "step": 297 }, { "epoch": 0.28259838786154573, "grad_norm": 1.4694162511089293, "learning_rate": 9.4006309148265e-06, "loss": 0.1071, "step": 298 }, { "epoch": 0.2835467045993362, "grad_norm": 1.255879045911956, "learning_rate": 9.43217665615142e-06, "loss": 0.0815, "step": 299 }, { "epoch": 0.2844950213371266, "grad_norm": 1.560204819302283, "learning_rate": 9.463722397476342e-06, "loss": 0.1234, "step": 300 }, { "epoch": 0.285443338074917, "grad_norm": 1.0121817898281276, "learning_rate": 9.495268138801262e-06, "loss": 0.0595, "step": 301 }, { "epoch": 0.28639165481270745, "grad_norm": 1.0711466156341418, "learning_rate": 9.526813880126183e-06, "loss": 0.0641, "step": 302 }, { "epoch": 0.28733997155049784, "grad_norm": 1.1496695710149105, "learning_rate": 9.558359621451105e-06, "loss": 0.0761, "step": 303 }, { "epoch": 0.2882882882882883, "grad_norm": 1.2059272704315518, "learning_rate": 9.589905362776027e-06, "loss": 0.0756, "step": 304 }, { "epoch": 0.28923660502607873, "grad_norm": 1.0424292745296735, "learning_rate": 9.621451104100947e-06, "loss": 0.0855, "step": 305 }, { "epoch": 0.2901849217638691, "grad_norm": 1.1497786768197902, "learning_rate": 9.652996845425868e-06, "loss": 0.071, "step": 306 }, { "epoch": 0.29113323850165956, "grad_norm": 1.3472444992692172, "learning_rate": 9.68454258675079e-06, "loss": 0.0934, "step": 307 }, { "epoch": 0.29208155523944995, "grad_norm": 1.3345310370843513, "learning_rate": 9.71608832807571e-06, "loss": 0.0998, "step": 308 }, { "epoch": 0.2930298719772404, "grad_norm": 1.01109508034154, "learning_rate": 9.747634069400632e-06, "loss": 0.0762, "step": 309 }, { "epoch": 0.29397818871503084, "grad_norm": 0.9249973635125475, "learning_rate": 9.779179810725553e-06, "loss": 0.074, "step": 310 }, { "epoch": 0.29492650545282123, "grad_norm": 0.804446344253587, "learning_rate": 9.810725552050473e-06, "loss": 0.0517, "step": 311 }, { "epoch": 0.2958748221906117, "grad_norm": 0.965596925556689, "learning_rate": 9.842271293375395e-06, "loss": 0.098, "step": 312 }, { "epoch": 0.29682313892840206, "grad_norm": 2.012807451707843, "learning_rate": 9.873817034700316e-06, "loss": 0.1038, "step": 313 }, { "epoch": 0.2977714556661925, "grad_norm": 1.2864066063043205, "learning_rate": 9.905362776025236e-06, "loss": 0.1102, "step": 314 }, { "epoch": 0.29871977240398295, "grad_norm": 0.8775284858258785, "learning_rate": 9.936908517350158e-06, "loss": 0.0913, "step": 315 }, { "epoch": 0.29966808914177334, "grad_norm": 0.9395466275555749, "learning_rate": 9.96845425867508e-06, "loss": 0.1156, "step": 316 }, { "epoch": 0.3006164058795638, "grad_norm": 1.031977177693936, "learning_rate": 1e-05, "loss": 0.0772, "step": 317 }, { "epoch": 0.3015647226173542, "grad_norm": 0.906696222035988, "learning_rate": 9.999996951577431e-06, "loss": 0.0745, "step": 318 }, { "epoch": 0.3025130393551446, "grad_norm": 1.6486632782552955, "learning_rate": 9.999987806313436e-06, "loss": 0.1295, "step": 319 }, { "epoch": 0.30346135609293506, "grad_norm": 1.0682004904191784, "learning_rate": 9.999972564219169e-06, "loss": 0.089, "step": 320 }, { "epoch": 0.30440967283072545, "grad_norm": 1.0160084965418597, "learning_rate": 9.999951225313217e-06, "loss": 0.0795, "step": 321 }, { "epoch": 0.3053579895685159, "grad_norm": 1.1229797355618714, "learning_rate": 9.999923789621598e-06, "loss": 0.0924, "step": 322 }, { "epoch": 0.3063063063063063, "grad_norm": 0.9925832526069106, "learning_rate": 9.999890257177766e-06, "loss": 0.0803, "step": 323 }, { "epoch": 0.30725462304409673, "grad_norm": 1.1785860516178814, "learning_rate": 9.999850628022611e-06, "loss": 0.0797, "step": 324 }, { "epoch": 0.3082029397818872, "grad_norm": 1.1520304204509717, "learning_rate": 9.999804902204455e-06, "loss": 0.0775, "step": 325 }, { "epoch": 0.30915125651967756, "grad_norm": 1.0880132191910508, "learning_rate": 9.999753079779054e-06, "loss": 0.0906, "step": 326 }, { "epoch": 0.310099573257468, "grad_norm": 1.5767657455822397, "learning_rate": 9.999695160809598e-06, "loss": 0.0956, "step": 327 }, { "epoch": 0.3110478899952584, "grad_norm": 0.7125012678361342, "learning_rate": 9.999631145366713e-06, "loss": 0.0661, "step": 328 }, { "epoch": 0.31199620673304884, "grad_norm": 1.088584252037159, "learning_rate": 9.999561033528457e-06, "loss": 0.1149, "step": 329 }, { "epoch": 0.3129445234708393, "grad_norm": 0.8523222222870042, "learning_rate": 9.999484825380323e-06, "loss": 0.0913, "step": 330 }, { "epoch": 0.3138928402086297, "grad_norm": 1.0164571883774136, "learning_rate": 9.999402521015236e-06, "loss": 0.0878, "step": 331 }, { "epoch": 0.3148411569464201, "grad_norm": 0.7164573705993513, "learning_rate": 9.999314120533557e-06, "loss": 0.0866, "step": 332 }, { "epoch": 0.3157894736842105, "grad_norm": 0.7954216406429697, "learning_rate": 9.999219624043075e-06, "loss": 0.0702, "step": 333 }, { "epoch": 0.31673779042200095, "grad_norm": 0.7996263107367133, "learning_rate": 9.99911903165902e-06, "loss": 0.0758, "step": 334 }, { "epoch": 0.3176861071597914, "grad_norm": 1.101451187378474, "learning_rate": 9.999012343504049e-06, "loss": 0.0957, "step": 335 }, { "epoch": 0.3186344238975818, "grad_norm": 0.7265535166036453, "learning_rate": 9.998899559708254e-06, "loss": 0.0743, "step": 336 }, { "epoch": 0.31958274063537223, "grad_norm": 1.272801256055057, "learning_rate": 9.998780680409161e-06, "loss": 0.0952, "step": 337 }, { "epoch": 0.3205310573731626, "grad_norm": 0.8770881337944402, "learning_rate": 9.99865570575173e-06, "loss": 0.066, "step": 338 }, { "epoch": 0.32147937411095306, "grad_norm": 1.0607119132841634, "learning_rate": 9.998524635888347e-06, "loss": 0.0913, "step": 339 }, { "epoch": 0.32242769084874345, "grad_norm": 0.9189346974278031, "learning_rate": 9.998387470978837e-06, "loss": 0.0881, "step": 340 }, { "epoch": 0.3233760075865339, "grad_norm": 0.7272168469454553, "learning_rate": 9.998244211190454e-06, "loss": 0.0713, "step": 341 }, { "epoch": 0.32432432432432434, "grad_norm": 0.9819255696828616, "learning_rate": 9.998094856697885e-06, "loss": 0.0834, "step": 342 }, { "epoch": 0.32527264106211473, "grad_norm": 0.6857773270509248, "learning_rate": 9.997939407683249e-06, "loss": 0.0524, "step": 343 }, { "epoch": 0.3262209577999052, "grad_norm": 1.0324591704355464, "learning_rate": 9.99777786433609e-06, "loss": 0.1108, "step": 344 }, { "epoch": 0.32716927453769556, "grad_norm": 1.1264206703681527, "learning_rate": 9.997610226853399e-06, "loss": 0.0987, "step": 345 }, { "epoch": 0.328117591275486, "grad_norm": 0.95789066514891, "learning_rate": 9.997436495439581e-06, "loss": 0.093, "step": 346 }, { "epoch": 0.32906590801327645, "grad_norm": 1.0448222803112024, "learning_rate": 9.997256670306478e-06, "loss": 0.0983, "step": 347 }, { "epoch": 0.33001422475106684, "grad_norm": 0.7737283316563024, "learning_rate": 9.997070751673367e-06, "loss": 0.0706, "step": 348 }, { "epoch": 0.3309625414888573, "grad_norm": 0.9596984880180834, "learning_rate": 9.99687873976695e-06, "loss": 0.0991, "step": 349 }, { "epoch": 0.3319108582266477, "grad_norm": 0.8411109119380658, "learning_rate": 9.99668063482136e-06, "loss": 0.0678, "step": 350 }, { "epoch": 0.3328591749644381, "grad_norm": 1.136491883808786, "learning_rate": 9.996476437078162e-06, "loss": 0.0986, "step": 351 }, { "epoch": 0.33380749170222856, "grad_norm": 3.03438587624818, "learning_rate": 9.996266146786344e-06, "loss": 0.0969, "step": 352 }, { "epoch": 0.33475580844001895, "grad_norm": 1.2333568047254937, "learning_rate": 9.996049764202332e-06, "loss": 0.0832, "step": 353 }, { "epoch": 0.3357041251778094, "grad_norm": 1.1301139087376384, "learning_rate": 9.995827289589974e-06, "loss": 0.0994, "step": 354 }, { "epoch": 0.3366524419155998, "grad_norm": 1.0303329732235522, "learning_rate": 9.995598723220548e-06, "loss": 0.0757, "step": 355 }, { "epoch": 0.33760075865339023, "grad_norm": 1.0605991674508604, "learning_rate": 9.995364065372762e-06, "loss": 0.0815, "step": 356 }, { "epoch": 0.3385490753911807, "grad_norm": 0.7941030771981634, "learning_rate": 9.995123316332752e-06, "loss": 0.0747, "step": 357 }, { "epoch": 0.33949739212897106, "grad_norm": 1.2313896272302265, "learning_rate": 9.994876476394075e-06, "loss": 0.0769, "step": 358 }, { "epoch": 0.3404457088667615, "grad_norm": 1.1944743493159886, "learning_rate": 9.994623545857727e-06, "loss": 0.0979, "step": 359 }, { "epoch": 0.3413940256045519, "grad_norm": 0.8285281294809631, "learning_rate": 9.994364525032116e-06, "loss": 0.0793, "step": 360 }, { "epoch": 0.34234234234234234, "grad_norm": 1.4761389910370195, "learning_rate": 9.994099414233091e-06, "loss": 0.0913, "step": 361 }, { "epoch": 0.3432906590801328, "grad_norm": 1.5408966458771916, "learning_rate": 9.993828213783915e-06, "loss": 0.0973, "step": 362 }, { "epoch": 0.3442389758179232, "grad_norm": 1.4559933930399096, "learning_rate": 9.993550924015283e-06, "loss": 0.0999, "step": 363 }, { "epoch": 0.3451872925557136, "grad_norm": 0.8454336561992738, "learning_rate": 9.993267545265314e-06, "loss": 0.0655, "step": 364 }, { "epoch": 0.346135609293504, "grad_norm": 0.796992439441769, "learning_rate": 9.992978077879552e-06, "loss": 0.0696, "step": 365 }, { "epoch": 0.34708392603129445, "grad_norm": 1.0553149426590827, "learning_rate": 9.992682522210963e-06, "loss": 0.0787, "step": 366 }, { "epoch": 0.3480322427690849, "grad_norm": 1.4860431297237584, "learning_rate": 9.992380878619939e-06, "loss": 0.106, "step": 367 }, { "epoch": 0.3489805595068753, "grad_norm": 1.3032907057151817, "learning_rate": 9.992073147474292e-06, "loss": 0.1021, "step": 368 }, { "epoch": 0.34992887624466573, "grad_norm": 1.0894704335759804, "learning_rate": 9.991759329149266e-06, "loss": 0.0905, "step": 369 }, { "epoch": 0.3508771929824561, "grad_norm": 1.1130576081628205, "learning_rate": 9.991439424027518e-06, "loss": 0.0846, "step": 370 }, { "epoch": 0.35182550972024657, "grad_norm": 0.9253664091514998, "learning_rate": 9.991113432499128e-06, "loss": 0.0882, "step": 371 }, { "epoch": 0.352773826458037, "grad_norm": 0.841899923853967, "learning_rate": 9.990781354961605e-06, "loss": 0.0806, "step": 372 }, { "epoch": 0.3537221431958274, "grad_norm": 0.9407729946270026, "learning_rate": 9.99044319181987e-06, "loss": 0.0939, "step": 373 }, { "epoch": 0.35467045993361784, "grad_norm": 0.9090058769044609, "learning_rate": 9.99009894348627e-06, "loss": 0.0891, "step": 374 }, { "epoch": 0.35561877667140823, "grad_norm": 0.6294083333837054, "learning_rate": 9.989748610380571e-06, "loss": 0.0706, "step": 375 }, { "epoch": 0.3565670934091987, "grad_norm": 0.9163781177038506, "learning_rate": 9.98939219292996e-06, "loss": 0.0697, "step": 376 }, { "epoch": 0.3575154101469891, "grad_norm": 1.1693511630739546, "learning_rate": 9.989029691569037e-06, "loss": 0.1056, "step": 377 }, { "epoch": 0.3584637268847795, "grad_norm": 1.0414233510818562, "learning_rate": 9.988661106739827e-06, "loss": 0.0988, "step": 378 }, { "epoch": 0.35941204362256995, "grad_norm": 1.2822153621266594, "learning_rate": 9.988286438891774e-06, "loss": 0.1189, "step": 379 }, { "epoch": 0.36036036036036034, "grad_norm": 0.63669429794073, "learning_rate": 9.987905688481732e-06, "loss": 0.0828, "step": 380 }, { "epoch": 0.3613086770981508, "grad_norm": 0.826754093590745, "learning_rate": 9.98751885597398e-06, "loss": 0.0848, "step": 381 }, { "epoch": 0.3622569938359412, "grad_norm": 0.8825949393702691, "learning_rate": 9.987125941840205e-06, "loss": 0.092, "step": 382 }, { "epoch": 0.3632053105737316, "grad_norm": 0.6103241173744877, "learning_rate": 9.986726946559517e-06, "loss": 0.08, "step": 383 }, { "epoch": 0.36415362731152207, "grad_norm": 0.7105367439957658, "learning_rate": 9.986321870618441e-06, "loss": 0.0685, "step": 384 }, { "epoch": 0.36510194404931245, "grad_norm": 1.802287343988455, "learning_rate": 9.985910714510908e-06, "loss": 0.0818, "step": 385 }, { "epoch": 0.3660502607871029, "grad_norm": 0.7732813708584271, "learning_rate": 9.985493478738275e-06, "loss": 0.07, "step": 386 }, { "epoch": 0.3669985775248933, "grad_norm": 0.8451643375246307, "learning_rate": 9.985070163809306e-06, "loss": 0.0744, "step": 387 }, { "epoch": 0.36794689426268373, "grad_norm": 1.126067442650852, "learning_rate": 9.984640770240173e-06, "loss": 0.1101, "step": 388 }, { "epoch": 0.3688952110004742, "grad_norm": 0.6652401258855057, "learning_rate": 9.984205298554467e-06, "loss": 0.0663, "step": 389 }, { "epoch": 0.36984352773826457, "grad_norm": 1.0802552975196003, "learning_rate": 9.983763749283193e-06, "loss": 0.0975, "step": 390 }, { "epoch": 0.370791844476055, "grad_norm": 0.7496808510910429, "learning_rate": 9.983316122964757e-06, "loss": 0.0701, "step": 391 }, { "epoch": 0.3717401612138454, "grad_norm": 0.6248602765248035, "learning_rate": 9.982862420144986e-06, "loss": 0.0643, "step": 392 }, { "epoch": 0.37268847795163584, "grad_norm": 1.7058022738803864, "learning_rate": 9.982402641377105e-06, "loss": 0.0936, "step": 393 }, { "epoch": 0.3736367946894263, "grad_norm": 1.205579756742393, "learning_rate": 9.98193678722176e-06, "loss": 0.0811, "step": 394 }, { "epoch": 0.3745851114272167, "grad_norm": 0.8021701752607538, "learning_rate": 9.981464858246993e-06, "loss": 0.0719, "step": 395 }, { "epoch": 0.3755334281650071, "grad_norm": 0.9210208736552777, "learning_rate": 9.980986855028267e-06, "loss": 0.0589, "step": 396 }, { "epoch": 0.3764817449027975, "grad_norm": 1.0458476195224804, "learning_rate": 9.980502778148438e-06, "loss": 0.0696, "step": 397 }, { "epoch": 0.37743006164058795, "grad_norm": 1.5095103680379303, "learning_rate": 9.980012628197778e-06, "loss": 0.0909, "step": 398 }, { "epoch": 0.3783783783783784, "grad_norm": 0.9521689001456719, "learning_rate": 9.979516405773956e-06, "loss": 0.0844, "step": 399 }, { "epoch": 0.3793266951161688, "grad_norm": 0.9909335290642662, "learning_rate": 9.979014111482057e-06, "loss": 0.079, "step": 400 }, { "epoch": 0.38027501185395923, "grad_norm": 1.300023515267878, "learning_rate": 9.978505745934559e-06, "loss": 0.1087, "step": 401 }, { "epoch": 0.3812233285917496, "grad_norm": 0.8905160216053487, "learning_rate": 9.977991309751347e-06, "loss": 0.0654, "step": 402 }, { "epoch": 0.38217164532954007, "grad_norm": 0.7908744916198801, "learning_rate": 9.97747080355971e-06, "loss": 0.0697, "step": 403 }, { "epoch": 0.3831199620673305, "grad_norm": 1.0819522254088034, "learning_rate": 9.976944227994337e-06, "loss": 0.0729, "step": 404 }, { "epoch": 0.3840682788051209, "grad_norm": 0.9319836261266163, "learning_rate": 9.976411583697316e-06, "loss": 0.077, "step": 405 }, { "epoch": 0.38501659554291134, "grad_norm": 0.7209233770781128, "learning_rate": 9.97587287131814e-06, "loss": 0.0708, "step": 406 }, { "epoch": 0.38596491228070173, "grad_norm": 0.8430932582390814, "learning_rate": 9.975328091513696e-06, "loss": 0.07, "step": 407 }, { "epoch": 0.3869132290184922, "grad_norm": 0.7932090811238357, "learning_rate": 9.974777244948271e-06, "loss": 0.0648, "step": 408 }, { "epoch": 0.3878615457562826, "grad_norm": 0.9213278429313838, "learning_rate": 9.974220332293554e-06, "loss": 0.0737, "step": 409 }, { "epoch": 0.388809862494073, "grad_norm": 0.4369389269684112, "learning_rate": 9.973657354228623e-06, "loss": 0.0509, "step": 410 }, { "epoch": 0.38975817923186346, "grad_norm": 0.7988805293653696, "learning_rate": 9.973088311439957e-06, "loss": 0.0684, "step": 411 }, { "epoch": 0.39070649596965384, "grad_norm": 0.9648310793568026, "learning_rate": 9.97251320462143e-06, "loss": 0.0849, "step": 412 }, { "epoch": 0.3916548127074443, "grad_norm": 0.7585613690692753, "learning_rate": 9.97193203447431e-06, "loss": 0.077, "step": 413 }, { "epoch": 0.39260312944523473, "grad_norm": 0.9380377046145346, "learning_rate": 9.971344801707256e-06, "loss": 0.0771, "step": 414 }, { "epoch": 0.3935514461830251, "grad_norm": 0.9822247506181627, "learning_rate": 9.970751507036323e-06, "loss": 0.1123, "step": 415 }, { "epoch": 0.39449976292081557, "grad_norm": 0.7156423865364446, "learning_rate": 9.970152151184956e-06, "loss": 0.0801, "step": 416 }, { "epoch": 0.39544807965860596, "grad_norm": 1.05912629502688, "learning_rate": 9.96954673488399e-06, "loss": 0.0804, "step": 417 }, { "epoch": 0.3963963963963964, "grad_norm": 1.1230479850270394, "learning_rate": 9.968935258871652e-06, "loss": 0.0799, "step": 418 }, { "epoch": 0.39734471313418684, "grad_norm": 1.0054642393242061, "learning_rate": 9.968317723893556e-06, "loss": 0.082, "step": 419 }, { "epoch": 0.39829302987197723, "grad_norm": 1.227859524837509, "learning_rate": 9.967694130702706e-06, "loss": 0.1069, "step": 420 }, { "epoch": 0.3992413466097677, "grad_norm": 1.2136272659300074, "learning_rate": 9.96706448005949e-06, "loss": 0.1112, "step": 421 }, { "epoch": 0.40018966334755807, "grad_norm": 0.9692912194018656, "learning_rate": 9.96642877273169e-06, "loss": 0.0837, "step": 422 }, { "epoch": 0.4011379800853485, "grad_norm": 0.7181203670103851, "learning_rate": 9.965787009494458e-06, "loss": 0.0648, "step": 423 }, { "epoch": 0.40208629682313896, "grad_norm": 0.9389223502528147, "learning_rate": 9.96513919113035e-06, "loss": 0.0846, "step": 424 }, { "epoch": 0.40303461356092934, "grad_norm": 0.6566856036851983, "learning_rate": 9.964485318429292e-06, "loss": 0.0776, "step": 425 }, { "epoch": 0.4039829302987198, "grad_norm": 1.0028156563396406, "learning_rate": 9.963825392188595e-06, "loss": 0.0719, "step": 426 }, { "epoch": 0.4049312470365102, "grad_norm": 0.9682157984093804, "learning_rate": 9.963159413212952e-06, "loss": 0.1058, "step": 427 }, { "epoch": 0.4058795637743006, "grad_norm": 1.1561667939356075, "learning_rate": 9.96248738231444e-06, "loss": 0.0982, "step": 428 }, { "epoch": 0.406827880512091, "grad_norm": 0.7960344078481167, "learning_rate": 9.961809300312512e-06, "loss": 0.0643, "step": 429 }, { "epoch": 0.40777619724988146, "grad_norm": 0.914323773268032, "learning_rate": 9.961125168034e-06, "loss": 0.0835, "step": 430 }, { "epoch": 0.4087245139876719, "grad_norm": 0.7441869330920762, "learning_rate": 9.960434986313113e-06, "loss": 0.0559, "step": 431 }, { "epoch": 0.4096728307254623, "grad_norm": 2.4732017252552367, "learning_rate": 9.959738755991437e-06, "loss": 0.1445, "step": 432 }, { "epoch": 0.41062114746325273, "grad_norm": 0.8533585342555405, "learning_rate": 9.959036477917935e-06, "loss": 0.0575, "step": 433 }, { "epoch": 0.4115694642010431, "grad_norm": 0.8190438451317316, "learning_rate": 9.95832815294894e-06, "loss": 0.0794, "step": 434 }, { "epoch": 0.41251778093883357, "grad_norm": 1.0046620676404385, "learning_rate": 9.957613781948164e-06, "loss": 0.0686, "step": 435 }, { "epoch": 0.413466097676624, "grad_norm": 0.9887051267008984, "learning_rate": 9.956893365786691e-06, "loss": 0.0618, "step": 436 }, { "epoch": 0.4144144144144144, "grad_norm": 0.6105909207601089, "learning_rate": 9.95616690534297e-06, "loss": 0.0572, "step": 437 }, { "epoch": 0.41536273115220484, "grad_norm": 1.5234824479103468, "learning_rate": 9.955434401502825e-06, "loss": 0.0994, "step": 438 }, { "epoch": 0.41631104788999523, "grad_norm": 1.1295839815001452, "learning_rate": 9.954695855159454e-06, "loss": 0.073, "step": 439 }, { "epoch": 0.4172593646277857, "grad_norm": 0.6583329952843571, "learning_rate": 9.95395126721341e-06, "loss": 0.0699, "step": 440 }, { "epoch": 0.4182076813655761, "grad_norm": 0.955937586299997, "learning_rate": 9.953200638572625e-06, "loss": 0.0815, "step": 441 }, { "epoch": 0.4191559981033665, "grad_norm": 1.5323108400108396, "learning_rate": 9.95244397015239e-06, "loss": 0.0732, "step": 442 }, { "epoch": 0.42010431484115696, "grad_norm": 1.677920724371183, "learning_rate": 9.951681262875365e-06, "loss": 0.0944, "step": 443 }, { "epoch": 0.42105263157894735, "grad_norm": 0.8926328574943209, "learning_rate": 9.95091251767157e-06, "loss": 0.0731, "step": 444 }, { "epoch": 0.4220009483167378, "grad_norm": 1.2692898943255297, "learning_rate": 9.950137735478389e-06, "loss": 0.1029, "step": 445 }, { "epoch": 0.42294926505452823, "grad_norm": 0.7345506207483801, "learning_rate": 9.949356917240569e-06, "loss": 0.0748, "step": 446 }, { "epoch": 0.4238975817923186, "grad_norm": 1.2435473519034808, "learning_rate": 9.948570063910216e-06, "loss": 0.1009, "step": 447 }, { "epoch": 0.42484589853010907, "grad_norm": 0.7650866909769807, "learning_rate": 9.947777176446792e-06, "loss": 0.0746, "step": 448 }, { "epoch": 0.42579421526789946, "grad_norm": 1.3807429981979404, "learning_rate": 9.946978255817121e-06, "loss": 0.0701, "step": 449 }, { "epoch": 0.4267425320056899, "grad_norm": 0.5315623424461096, "learning_rate": 9.946173302995382e-06, "loss": 0.0574, "step": 450 }, { "epoch": 0.42769084874348035, "grad_norm": 0.8562951763201797, "learning_rate": 9.94536231896311e-06, "loss": 0.0951, "step": 451 }, { "epoch": 0.42863916548127073, "grad_norm": 1.1965590998104225, "learning_rate": 9.944545304709192e-06, "loss": 0.0877, "step": 452 }, { "epoch": 0.4295874822190612, "grad_norm": 1.2735339749816497, "learning_rate": 9.943722261229872e-06, "loss": 0.0768, "step": 453 }, { "epoch": 0.43053579895685157, "grad_norm": 0.9370658659046329, "learning_rate": 9.942893189528743e-06, "loss": 0.0782, "step": 454 }, { "epoch": 0.431484115694642, "grad_norm": 1.5520551397042521, "learning_rate": 9.942058090616748e-06, "loss": 0.1039, "step": 455 }, { "epoch": 0.43243243243243246, "grad_norm": 1.3529615602541014, "learning_rate": 9.941216965512183e-06, "loss": 0.0867, "step": 456 }, { "epoch": 0.43338074917022285, "grad_norm": 1.192234505990805, "learning_rate": 9.940369815240688e-06, "loss": 0.0809, "step": 457 }, { "epoch": 0.4343290659080133, "grad_norm": 0.9763205758532367, "learning_rate": 9.939516640835254e-06, "loss": 0.0652, "step": 458 }, { "epoch": 0.4352773826458037, "grad_norm": 1.3415645605638937, "learning_rate": 9.938657443336212e-06, "loss": 0.109, "step": 459 }, { "epoch": 0.4362256993835941, "grad_norm": 1.1595154129634277, "learning_rate": 9.937792223791244e-06, "loss": 0.1002, "step": 460 }, { "epoch": 0.43717401612138457, "grad_norm": 1.33436975844217, "learning_rate": 9.936920983255372e-06, "loss": 0.114, "step": 461 }, { "epoch": 0.43812233285917496, "grad_norm": 1.0009653043703806, "learning_rate": 9.936043722790956e-06, "loss": 0.0827, "step": 462 }, { "epoch": 0.4390706495969654, "grad_norm": 1.1900315382859075, "learning_rate": 9.935160443467704e-06, "loss": 0.0991, "step": 463 }, { "epoch": 0.4400189663347558, "grad_norm": 0.7796648666540394, "learning_rate": 9.934271146362658e-06, "loss": 0.0729, "step": 464 }, { "epoch": 0.44096728307254623, "grad_norm": 0.7692033539386839, "learning_rate": 9.933375832560199e-06, "loss": 0.0752, "step": 465 }, { "epoch": 0.4419155998103367, "grad_norm": 0.7898679053377281, "learning_rate": 9.932474503152047e-06, "loss": 0.0557, "step": 466 }, { "epoch": 0.44286391654812707, "grad_norm": 1.308054442070126, "learning_rate": 9.931567159237252e-06, "loss": 0.1, "step": 467 }, { "epoch": 0.4438122332859175, "grad_norm": 0.8281027248286734, "learning_rate": 9.930653801922205e-06, "loss": 0.1066, "step": 468 }, { "epoch": 0.4447605500237079, "grad_norm": 0.6589498594732086, "learning_rate": 9.929734432320621e-06, "loss": 0.061, "step": 469 }, { "epoch": 0.44570886676149835, "grad_norm": 1.0105820136512023, "learning_rate": 9.928809051553554e-06, "loss": 0.0771, "step": 470 }, { "epoch": 0.4466571834992888, "grad_norm": 1.174475732403723, "learning_rate": 9.927877660749385e-06, "loss": 0.1029, "step": 471 }, { "epoch": 0.4476055002370792, "grad_norm": 0.7007588523937572, "learning_rate": 9.92694026104382e-06, "loss": 0.0548, "step": 472 }, { "epoch": 0.4485538169748696, "grad_norm": 0.7548622992450297, "learning_rate": 9.925996853579897e-06, "loss": 0.071, "step": 473 }, { "epoch": 0.44950213371266, "grad_norm": 0.9151211373906433, "learning_rate": 9.92504743950798e-06, "loss": 0.0728, "step": 474 }, { "epoch": 0.45045045045045046, "grad_norm": 1.3188113799099948, "learning_rate": 9.924092019985751e-06, "loss": 0.071, "step": 475 }, { "epoch": 0.45139876718824085, "grad_norm": 0.834826643366671, "learning_rate": 9.923130596178221e-06, "loss": 0.0827, "step": 476 }, { "epoch": 0.4523470839260313, "grad_norm": 0.8853088211117691, "learning_rate": 9.922163169257722e-06, "loss": 0.0714, "step": 477 }, { "epoch": 0.45329540066382173, "grad_norm": 0.9773650061711494, "learning_rate": 9.921189740403902e-06, "loss": 0.0902, "step": 478 }, { "epoch": 0.4542437174016121, "grad_norm": 0.8530429782086267, "learning_rate": 9.92021031080373e-06, "loss": 0.0896, "step": 479 }, { "epoch": 0.45519203413940257, "grad_norm": 0.6841245724165017, "learning_rate": 9.919224881651494e-06, "loss": 0.0574, "step": 480 }, { "epoch": 0.45614035087719296, "grad_norm": 0.8751901827667304, "learning_rate": 9.918233454148795e-06, "loss": 0.0712, "step": 481 }, { "epoch": 0.4570886676149834, "grad_norm": 0.8605318101074332, "learning_rate": 9.917236029504549e-06, "loss": 0.0758, "step": 482 }, { "epoch": 0.45803698435277385, "grad_norm": 0.6297402738230038, "learning_rate": 9.916232608934982e-06, "loss": 0.0835, "step": 483 }, { "epoch": 0.45898530109056423, "grad_norm": 1.2633792305334934, "learning_rate": 9.915223193663639e-06, "loss": 0.097, "step": 484 }, { "epoch": 0.4599336178283547, "grad_norm": 0.9453282561376489, "learning_rate": 9.914207784921366e-06, "loss": 0.0813, "step": 485 }, { "epoch": 0.46088193456614507, "grad_norm": 1.0981998450683066, "learning_rate": 9.913186383946322e-06, "loss": 0.0831, "step": 486 }, { "epoch": 0.4618302513039355, "grad_norm": 0.9453607555522517, "learning_rate": 9.91215899198397e-06, "loss": 0.0668, "step": 487 }, { "epoch": 0.46277856804172596, "grad_norm": 0.8480655824160724, "learning_rate": 9.911125610287085e-06, "loss": 0.0803, "step": 488 }, { "epoch": 0.46372688477951635, "grad_norm": 0.7365032755805906, "learning_rate": 9.910086240115738e-06, "loss": 0.0503, "step": 489 }, { "epoch": 0.4646752015173068, "grad_norm": 0.9926545138390478, "learning_rate": 9.909040882737301e-06, "loss": 0.0785, "step": 490 }, { "epoch": 0.4656235182550972, "grad_norm": 1.078153469225969, "learning_rate": 9.907989539426455e-06, "loss": 0.0942, "step": 491 }, { "epoch": 0.4665718349928876, "grad_norm": 0.891582918999742, "learning_rate": 9.906932211465173e-06, "loss": 0.0713, "step": 492 }, { "epoch": 0.46752015173067807, "grad_norm": 0.8352029023952229, "learning_rate": 9.90586890014273e-06, "loss": 0.0871, "step": 493 }, { "epoch": 0.46846846846846846, "grad_norm": 1.4543230270611818, "learning_rate": 9.904799606755695e-06, "loss": 0.1049, "step": 494 }, { "epoch": 0.4694167852062589, "grad_norm": 0.9571877161884975, "learning_rate": 9.90372433260793e-06, "loss": 0.0856, "step": 495 }, { "epoch": 0.4703651019440493, "grad_norm": 0.6657483404024113, "learning_rate": 9.90264307901059e-06, "loss": 0.0631, "step": 496 }, { "epoch": 0.47131341868183974, "grad_norm": 1.2493973473928695, "learning_rate": 9.901555847282123e-06, "loss": 0.0973, "step": 497 }, { "epoch": 0.4722617354196302, "grad_norm": 0.6689914382563446, "learning_rate": 9.900462638748266e-06, "loss": 0.0582, "step": 498 }, { "epoch": 0.47321005215742057, "grad_norm": 0.8246501895880392, "learning_rate": 9.899363454742044e-06, "loss": 0.0727, "step": 499 }, { "epoch": 0.474158368895211, "grad_norm": 1.442170890658491, "learning_rate": 9.898258296603769e-06, "loss": 0.0931, "step": 500 }, { "epoch": 0.4751066856330014, "grad_norm": 0.7582565389247256, "learning_rate": 9.897147165681034e-06, "loss": 0.0722, "step": 501 }, { "epoch": 0.47605500237079185, "grad_norm": 0.627525129279453, "learning_rate": 9.896030063328718e-06, "loss": 0.0597, "step": 502 }, { "epoch": 0.4770033191085823, "grad_norm": 0.6342149242840518, "learning_rate": 9.894906990908982e-06, "loss": 0.0725, "step": 503 }, { "epoch": 0.4779516358463727, "grad_norm": 0.8212079234115165, "learning_rate": 9.893777949791266e-06, "loss": 0.0649, "step": 504 }, { "epoch": 0.4788999525841631, "grad_norm": 0.8923951454231676, "learning_rate": 9.89264294135229e-06, "loss": 0.0595, "step": 505 }, { "epoch": 0.4798482693219535, "grad_norm": 1.0318440665130484, "learning_rate": 9.891501966976041e-06, "loss": 0.0842, "step": 506 }, { "epoch": 0.48079658605974396, "grad_norm": 0.6944537972828242, "learning_rate": 9.890355028053793e-06, "loss": 0.0752, "step": 507 }, { "epoch": 0.4817449027975344, "grad_norm": 1.0705584030604105, "learning_rate": 9.889202125984088e-06, "loss": 0.0647, "step": 508 }, { "epoch": 0.4826932195353248, "grad_norm": 0.9754252622446561, "learning_rate": 9.88804326217274e-06, "loss": 0.0687, "step": 509 }, { "epoch": 0.48364153627311524, "grad_norm": 0.9660762094606946, "learning_rate": 9.886878438032828e-06, "loss": 0.0789, "step": 510 }, { "epoch": 0.4845898530109056, "grad_norm": 0.5832722133461282, "learning_rate": 9.885707654984703e-06, "loss": 0.0636, "step": 511 }, { "epoch": 0.48553816974869607, "grad_norm": 0.7052006552554221, "learning_rate": 9.884530914455984e-06, "loss": 0.0586, "step": 512 }, { "epoch": 0.4864864864864865, "grad_norm": 0.9822072228951928, "learning_rate": 9.88334821788155e-06, "loss": 0.0645, "step": 513 }, { "epoch": 0.4874348032242769, "grad_norm": 0.9641946540266126, "learning_rate": 9.882159566703547e-06, "loss": 0.0885, "step": 514 }, { "epoch": 0.48838311996206735, "grad_norm": 0.6403136140606015, "learning_rate": 9.880964962371378e-06, "loss": 0.0678, "step": 515 }, { "epoch": 0.48933143669985774, "grad_norm": 0.7486541793123711, "learning_rate": 9.879764406341705e-06, "loss": 0.0741, "step": 516 }, { "epoch": 0.4902797534376482, "grad_norm": 0.5779229700891555, "learning_rate": 9.87855790007845e-06, "loss": 0.0646, "step": 517 }, { "epoch": 0.49122807017543857, "grad_norm": 0.7611283230447122, "learning_rate": 9.87734544505279e-06, "loss": 0.0768, "step": 518 }, { "epoch": 0.492176386913229, "grad_norm": 0.5823535883100547, "learning_rate": 9.876127042743155e-06, "loss": 0.0703, "step": 519 }, { "epoch": 0.49312470365101946, "grad_norm": 0.6827829977739827, "learning_rate": 9.874902694635226e-06, "loss": 0.0772, "step": 520 }, { "epoch": 0.49407302038880985, "grad_norm": 0.7254200544564426, "learning_rate": 9.873672402221937e-06, "loss": 0.0634, "step": 521 }, { "epoch": 0.4950213371266003, "grad_norm": 0.6425214796651868, "learning_rate": 9.872436167003468e-06, "loss": 0.064, "step": 522 }, { "epoch": 0.4959696538643907, "grad_norm": 0.623192525545158, "learning_rate": 9.871193990487242e-06, "loss": 0.077, "step": 523 }, { "epoch": 0.4969179706021811, "grad_norm": 0.7225947749173619, "learning_rate": 9.869945874187936e-06, "loss": 0.075, "step": 524 }, { "epoch": 0.49786628733997157, "grad_norm": 2.0516616577595435, "learning_rate": 9.868691819627462e-06, "loss": 0.0867, "step": 525 }, { "epoch": 0.49881460407776196, "grad_norm": 1.0257158284306434, "learning_rate": 9.867431828334974e-06, "loss": 0.0588, "step": 526 }, { "epoch": 0.4997629208155524, "grad_norm": 0.8403229438927825, "learning_rate": 9.86616590184687e-06, "loss": 0.0823, "step": 527 }, { "epoch": 0.5007112375533428, "grad_norm": 0.6449240492145598, "learning_rate": 9.864894041706779e-06, "loss": 0.0567, "step": 528 }, { "epoch": 0.5016595542911333, "grad_norm": 0.8789018684523284, "learning_rate": 9.863616249465567e-06, "loss": 0.0713, "step": 529 }, { "epoch": 0.5026078710289237, "grad_norm": 0.9524887983478211, "learning_rate": 9.862332526681336e-06, "loss": 0.0835, "step": 530 }, { "epoch": 0.5035561877667141, "grad_norm": 0.6422268170348604, "learning_rate": 9.861042874919417e-06, "loss": 0.0606, "step": 531 }, { "epoch": 0.5045045045045045, "grad_norm": 0.9032374038451735, "learning_rate": 9.859747295752374e-06, "loss": 0.0773, "step": 532 }, { "epoch": 0.505452821242295, "grad_norm": 0.9269404822199643, "learning_rate": 9.858445790759992e-06, "loss": 0.0822, "step": 533 }, { "epoch": 0.5064011379800853, "grad_norm": 0.7043514434980399, "learning_rate": 9.857138361529288e-06, "loss": 0.0688, "step": 534 }, { "epoch": 0.5073494547178757, "grad_norm": 0.8239211698855243, "learning_rate": 9.8558250096545e-06, "loss": 0.0542, "step": 535 }, { "epoch": 0.5082977714556662, "grad_norm": 0.8633975590563754, "learning_rate": 9.85450573673709e-06, "loss": 0.0744, "step": 536 }, { "epoch": 0.5092460881934566, "grad_norm": 0.6985004021466871, "learning_rate": 9.853180544385737e-06, "loss": 0.047, "step": 537 }, { "epoch": 0.510194404931247, "grad_norm": 0.5889042803503781, "learning_rate": 9.851849434216338e-06, "loss": 0.0557, "step": 538 }, { "epoch": 0.5111427216690374, "grad_norm": 0.7765705663935071, "learning_rate": 9.850512407852012e-06, "loss": 0.0669, "step": 539 }, { "epoch": 0.5120910384068279, "grad_norm": 0.8204550382112847, "learning_rate": 9.849169466923086e-06, "loss": 0.0685, "step": 540 }, { "epoch": 0.5130393551446183, "grad_norm": 0.5256883407913393, "learning_rate": 9.847820613067098e-06, "loss": 0.0537, "step": 541 }, { "epoch": 0.5139876718824087, "grad_norm": 0.6838576750776693, "learning_rate": 9.8464658479288e-06, "loss": 0.0704, "step": 542 }, { "epoch": 0.5149359886201992, "grad_norm": 0.8974806559813661, "learning_rate": 9.845105173160152e-06, "loss": 0.0899, "step": 543 }, { "epoch": 0.5158843053579896, "grad_norm": 0.7219053990698988, "learning_rate": 9.843738590420317e-06, "loss": 0.0468, "step": 544 }, { "epoch": 0.51683262209578, "grad_norm": 1.032987889739876, "learning_rate": 9.842366101375664e-06, "loss": 0.0562, "step": 545 }, { "epoch": 0.5177809388335705, "grad_norm": 0.7651951768284668, "learning_rate": 9.840987707699765e-06, "loss": 0.0669, "step": 546 }, { "epoch": 0.5187292555713608, "grad_norm": 0.6813496832389402, "learning_rate": 9.839603411073388e-06, "loss": 0.0706, "step": 547 }, { "epoch": 0.5196775723091512, "grad_norm": 0.7229692269198181, "learning_rate": 9.838213213184505e-06, "loss": 0.0771, "step": 548 }, { "epoch": 0.5206258890469416, "grad_norm": 1.157471128375012, "learning_rate": 9.836817115728277e-06, "loss": 0.0932, "step": 549 }, { "epoch": 0.5215742057847321, "grad_norm": 0.8058138449457062, "learning_rate": 9.835415120407063e-06, "loss": 0.0539, "step": 550 }, { "epoch": 0.5225225225225225, "grad_norm": 0.6915528599019737, "learning_rate": 9.834007228930414e-06, "loss": 0.0688, "step": 551 }, { "epoch": 0.5234708392603129, "grad_norm": 0.8835152385091712, "learning_rate": 9.832593443015068e-06, "loss": 0.0605, "step": 552 }, { "epoch": 0.5244191559981034, "grad_norm": 0.6896706794263241, "learning_rate": 9.83117376438495e-06, "loss": 0.0668, "step": 553 }, { "epoch": 0.5253674727358938, "grad_norm": 0.7651857964351815, "learning_rate": 9.829748194771175e-06, "loss": 0.064, "step": 554 }, { "epoch": 0.5263157894736842, "grad_norm": 0.6216741056003758, "learning_rate": 9.828316735912037e-06, "loss": 0.0541, "step": 555 }, { "epoch": 0.5272641062114747, "grad_norm": 0.6813673301708452, "learning_rate": 9.826879389553014e-06, "loss": 0.0574, "step": 556 }, { "epoch": 0.5282124229492651, "grad_norm": 0.7147998418504048, "learning_rate": 9.825436157446761e-06, "loss": 0.0576, "step": 557 }, { "epoch": 0.5291607396870555, "grad_norm": 0.6352148290105686, "learning_rate": 9.82398704135311e-06, "loss": 0.066, "step": 558 }, { "epoch": 0.5301090564248458, "grad_norm": 0.8511240887028577, "learning_rate": 9.822532043039068e-06, "loss": 0.0687, "step": 559 }, { "epoch": 0.5310573731626363, "grad_norm": 0.6876408977841421, "learning_rate": 9.821071164278815e-06, "loss": 0.0838, "step": 560 }, { "epoch": 0.5320056899004267, "grad_norm": 0.7354217835184531, "learning_rate": 9.819604406853703e-06, "loss": 0.0552, "step": 561 }, { "epoch": 0.5329540066382171, "grad_norm": 0.9572067784227991, "learning_rate": 9.818131772552249e-06, "loss": 0.1099, "step": 562 }, { "epoch": 0.5339023233760076, "grad_norm": 0.7931127239607592, "learning_rate": 9.816653263170137e-06, "loss": 0.0706, "step": 563 }, { "epoch": 0.534850640113798, "grad_norm": 0.8242420526129728, "learning_rate": 9.815168880510218e-06, "loss": 0.0946, "step": 564 }, { "epoch": 0.5357989568515884, "grad_norm": 1.0330372476146157, "learning_rate": 9.8136786263825e-06, "loss": 0.0951, "step": 565 }, { "epoch": 0.5367472735893789, "grad_norm": 0.7553297432270302, "learning_rate": 9.812182502604151e-06, "loss": 0.0663, "step": 566 }, { "epoch": 0.5376955903271693, "grad_norm": 0.8446853429895546, "learning_rate": 9.810680510999505e-06, "loss": 0.0728, "step": 567 }, { "epoch": 0.5386439070649597, "grad_norm": 0.5089680701907852, "learning_rate": 9.809172653400036e-06, "loss": 0.0501, "step": 568 }, { "epoch": 0.5395922238027501, "grad_norm": 0.7258180066288827, "learning_rate": 9.807658931644382e-06, "loss": 0.0752, "step": 569 }, { "epoch": 0.5405405405405406, "grad_norm": 0.7028402619162881, "learning_rate": 9.806139347578331e-06, "loss": 0.059, "step": 570 }, { "epoch": 0.541488857278331, "grad_norm": 0.7248854010393692, "learning_rate": 9.804613903054813e-06, "loss": 0.0851, "step": 571 }, { "epoch": 0.5424371740161213, "grad_norm": 0.7176555652391681, "learning_rate": 9.803082599933911e-06, "loss": 0.0697, "step": 572 }, { "epoch": 0.5433854907539118, "grad_norm": 0.4808404612456389, "learning_rate": 9.801545440082845e-06, "loss": 0.0569, "step": 573 }, { "epoch": 0.5443338074917022, "grad_norm": 0.8731137568130377, "learning_rate": 9.800002425375984e-06, "loss": 0.0657, "step": 574 }, { "epoch": 0.5452821242294926, "grad_norm": 0.7816194292982013, "learning_rate": 9.798453557694828e-06, "loss": 0.0724, "step": 575 }, { "epoch": 0.5462304409672831, "grad_norm": 0.9042436959378762, "learning_rate": 9.796898838928022e-06, "loss": 0.0784, "step": 576 }, { "epoch": 0.5471787577050735, "grad_norm": 1.0293154765529384, "learning_rate": 9.79533827097134e-06, "loss": 0.098, "step": 577 }, { "epoch": 0.5481270744428639, "grad_norm": 0.8678391414260259, "learning_rate": 9.793771855727691e-06, "loss": 0.0635, "step": 578 }, { "epoch": 0.5490753911806543, "grad_norm": 0.6041409950077287, "learning_rate": 9.792199595107115e-06, "loss": 0.0524, "step": 579 }, { "epoch": 0.5500237079184448, "grad_norm": 1.0292476772898875, "learning_rate": 9.790621491026773e-06, "loss": 0.0829, "step": 580 }, { "epoch": 0.5509720246562352, "grad_norm": 0.7074515600768486, "learning_rate": 9.78903754541096e-06, "loss": 0.0704, "step": 581 }, { "epoch": 0.5519203413940256, "grad_norm": 0.7603340975922476, "learning_rate": 9.787447760191092e-06, "loss": 0.0788, "step": 582 }, { "epoch": 0.5528686581318161, "grad_norm": 1.0766706695954442, "learning_rate": 9.785852137305699e-06, "loss": 0.079, "step": 583 }, { "epoch": 0.5538169748696065, "grad_norm": 0.7555731931730972, "learning_rate": 9.784250678700435e-06, "loss": 0.0705, "step": 584 }, { "epoch": 0.5547652916073968, "grad_norm": 0.7010961175305198, "learning_rate": 9.782643386328073e-06, "loss": 0.0713, "step": 585 }, { "epoch": 0.5557136083451872, "grad_norm": 1.0580272254821363, "learning_rate": 9.781030262148492e-06, "loss": 0.0671, "step": 586 }, { "epoch": 0.5566619250829777, "grad_norm": 0.6594876081209583, "learning_rate": 9.779411308128685e-06, "loss": 0.0867, "step": 587 }, { "epoch": 0.5576102418207681, "grad_norm": 1.3649847896410103, "learning_rate": 9.777786526242759e-06, "loss": 0.0847, "step": 588 }, { "epoch": 0.5585585585585585, "grad_norm": 0.6223880228627037, "learning_rate": 9.776155918471916e-06, "loss": 0.0579, "step": 589 }, { "epoch": 0.559506875296349, "grad_norm": 0.6862572922646061, "learning_rate": 9.774519486804476e-06, "loss": 0.053, "step": 590 }, { "epoch": 0.5604551920341394, "grad_norm": 0.6562455064809456, "learning_rate": 9.772877233235848e-06, "loss": 0.0651, "step": 591 }, { "epoch": 0.5614035087719298, "grad_norm": 0.7150505236504866, "learning_rate": 9.771229159768547e-06, "loss": 0.0697, "step": 592 }, { "epoch": 0.5623518255097203, "grad_norm": 0.7505406859172821, "learning_rate": 9.769575268412182e-06, "loss": 0.0691, "step": 593 }, { "epoch": 0.5633001422475107, "grad_norm": 0.7340490905887499, "learning_rate": 9.767915561183456e-06, "loss": 0.0748, "step": 594 }, { "epoch": 0.5642484589853011, "grad_norm": 0.7987611706335997, "learning_rate": 9.766250040106166e-06, "loss": 0.0682, "step": 595 }, { "epoch": 0.5651967757230915, "grad_norm": 1.2974449597341617, "learning_rate": 9.764578707211199e-06, "loss": 0.0751, "step": 596 }, { "epoch": 0.566145092460882, "grad_norm": 0.6191420122018653, "learning_rate": 9.762901564536523e-06, "loss": 0.0667, "step": 597 }, { "epoch": 0.5670934091986723, "grad_norm": 0.6903639931399153, "learning_rate": 9.761218614127193e-06, "loss": 0.0653, "step": 598 }, { "epoch": 0.5680417259364627, "grad_norm": 0.7974449669867185, "learning_rate": 9.759529858035351e-06, "loss": 0.0662, "step": 599 }, { "epoch": 0.5689900426742532, "grad_norm": 1.6445977802603875, "learning_rate": 9.75783529832021e-06, "loss": 0.0781, "step": 600 }, { "epoch": 0.5699383594120436, "grad_norm": 0.7682344601188886, "learning_rate": 9.756134937048066e-06, "loss": 0.0516, "step": 601 }, { "epoch": 0.570886676149834, "grad_norm": 0.6505039594954853, "learning_rate": 9.754428776292287e-06, "loss": 0.0522, "step": 602 }, { "epoch": 0.5718349928876245, "grad_norm": 1.0748139183671632, "learning_rate": 9.752716818133309e-06, "loss": 0.0787, "step": 603 }, { "epoch": 0.5727833096254149, "grad_norm": 0.7575374337239762, "learning_rate": 9.750999064658644e-06, "loss": 0.0618, "step": 604 }, { "epoch": 0.5737316263632053, "grad_norm": 0.5005741056916544, "learning_rate": 9.749275517962868e-06, "loss": 0.0579, "step": 605 }, { "epoch": 0.5746799431009957, "grad_norm": 0.9747236186565804, "learning_rate": 9.747546180147618e-06, "loss": 0.1137, "step": 606 }, { "epoch": 0.5756282598387862, "grad_norm": 0.5945741852680105, "learning_rate": 9.745811053321597e-06, "loss": 0.0528, "step": 607 }, { "epoch": 0.5765765765765766, "grad_norm": 0.8767385416979725, "learning_rate": 9.744070139600564e-06, "loss": 0.0756, "step": 608 }, { "epoch": 0.577524893314367, "grad_norm": 0.805183732938404, "learning_rate": 9.742323441107335e-06, "loss": 0.0796, "step": 609 }, { "epoch": 0.5784732100521575, "grad_norm": 0.4622182813428181, "learning_rate": 9.74057095997178e-06, "loss": 0.0466, "step": 610 }, { "epoch": 0.5794215267899478, "grad_norm": 1.323185570736391, "learning_rate": 9.738812698330821e-06, "loss": 0.0803, "step": 611 }, { "epoch": 0.5803698435277382, "grad_norm": 0.6017510939556475, "learning_rate": 9.737048658328428e-06, "loss": 0.0473, "step": 612 }, { "epoch": 0.5813181602655287, "grad_norm": 0.9340483579893749, "learning_rate": 9.735278842115616e-06, "loss": 0.0726, "step": 613 }, { "epoch": 0.5822664770033191, "grad_norm": 0.8017302866486061, "learning_rate": 9.733503251850443e-06, "loss": 0.0508, "step": 614 }, { "epoch": 0.5832147937411095, "grad_norm": 0.4915103436956615, "learning_rate": 9.73172188969801e-06, "loss": 0.0511, "step": 615 }, { "epoch": 0.5841631104788999, "grad_norm": 0.5454251857464146, "learning_rate": 9.729934757830455e-06, "loss": 0.043, "step": 616 }, { "epoch": 0.5851114272166904, "grad_norm": 0.45382702737394764, "learning_rate": 9.728141858426953e-06, "loss": 0.046, "step": 617 }, { "epoch": 0.5860597439544808, "grad_norm": 0.5609546349379012, "learning_rate": 9.726343193673707e-06, "loss": 0.0528, "step": 618 }, { "epoch": 0.5870080606922712, "grad_norm": 0.600673482298699, "learning_rate": 9.724538765763953e-06, "loss": 0.0539, "step": 619 }, { "epoch": 0.5879563774300617, "grad_norm": 0.9417089865736203, "learning_rate": 9.722728576897956e-06, "loss": 0.0583, "step": 620 }, { "epoch": 0.5889046941678521, "grad_norm": 0.4653439643190733, "learning_rate": 9.720912629283004e-06, "loss": 0.05, "step": 621 }, { "epoch": 0.5898530109056425, "grad_norm": 1.026549188147293, "learning_rate": 9.719090925133408e-06, "loss": 0.0643, "step": 622 }, { "epoch": 0.590801327643433, "grad_norm": 0.7947545630855374, "learning_rate": 9.717263466670496e-06, "loss": 0.0827, "step": 623 }, { "epoch": 0.5917496443812233, "grad_norm": 0.5505357789361721, "learning_rate": 9.715430256122616e-06, "loss": 0.057, "step": 624 }, { "epoch": 0.5926979611190137, "grad_norm": 0.6227650085275758, "learning_rate": 9.713591295725126e-06, "loss": 0.0613, "step": 625 }, { "epoch": 0.5936462778568041, "grad_norm": 0.8089764410308476, "learning_rate": 9.711746587720398e-06, "loss": 0.0575, "step": 626 }, { "epoch": 0.5945945945945946, "grad_norm": 0.8681782262186932, "learning_rate": 9.709896134357815e-06, "loss": 0.0664, "step": 627 }, { "epoch": 0.595542911332385, "grad_norm": 0.682165737662686, "learning_rate": 9.708039937893759e-06, "loss": 0.0558, "step": 628 }, { "epoch": 0.5964912280701754, "grad_norm": 0.6331915650172267, "learning_rate": 9.706178000591617e-06, "loss": 0.0628, "step": 629 }, { "epoch": 0.5974395448079659, "grad_norm": 0.5712611189361939, "learning_rate": 9.704310324721782e-06, "loss": 0.0741, "step": 630 }, { "epoch": 0.5983878615457563, "grad_norm": 0.4974903145873453, "learning_rate": 9.70243691256164e-06, "loss": 0.0569, "step": 631 }, { "epoch": 0.5993361782835467, "grad_norm": 0.8755421451427193, "learning_rate": 9.700557766395567e-06, "loss": 0.0884, "step": 632 }, { "epoch": 0.6002844950213371, "grad_norm": 0.5236784076286586, "learning_rate": 9.698672888514938e-06, "loss": 0.0493, "step": 633 }, { "epoch": 0.6012328117591276, "grad_norm": 0.6525012362182552, "learning_rate": 9.696782281218117e-06, "loss": 0.0683, "step": 634 }, { "epoch": 0.602181128496918, "grad_norm": 0.5119217968942416, "learning_rate": 9.69488594681045e-06, "loss": 0.0449, "step": 635 }, { "epoch": 0.6031294452347084, "grad_norm": 0.6576021927278618, "learning_rate": 9.692983887604269e-06, "loss": 0.0674, "step": 636 }, { "epoch": 0.6040777619724989, "grad_norm": 0.7157400695119305, "learning_rate": 9.691076105918885e-06, "loss": 0.0692, "step": 637 }, { "epoch": 0.6050260787102892, "grad_norm": 0.873028935018846, "learning_rate": 9.689162604080589e-06, "loss": 0.0999, "step": 638 }, { "epoch": 0.6059743954480796, "grad_norm": 0.8384167589559871, "learning_rate": 9.687243384422646e-06, "loss": 0.0771, "step": 639 }, { "epoch": 0.6069227121858701, "grad_norm": 0.5020655439555515, "learning_rate": 9.685318449285292e-06, "loss": 0.0512, "step": 640 }, { "epoch": 0.6078710289236605, "grad_norm": 0.36608001502573706, "learning_rate": 9.683387801015733e-06, "loss": 0.0377, "step": 641 }, { "epoch": 0.6088193456614509, "grad_norm": 0.7919506442179929, "learning_rate": 9.681451441968144e-06, "loss": 0.0775, "step": 642 }, { "epoch": 0.6097676623992413, "grad_norm": 0.6274619623629013, "learning_rate": 9.67950937450366e-06, "loss": 0.0645, "step": 643 }, { "epoch": 0.6107159791370318, "grad_norm": 0.5896565427831529, "learning_rate": 9.677561600990378e-06, "loss": 0.0595, "step": 644 }, { "epoch": 0.6116642958748222, "grad_norm": 0.5142338666265971, "learning_rate": 9.67560812380335e-06, "loss": 0.0597, "step": 645 }, { "epoch": 0.6126126126126126, "grad_norm": 0.6109668570207277, "learning_rate": 9.67364894532459e-06, "loss": 0.07, "step": 646 }, { "epoch": 0.6135609293504031, "grad_norm": 0.6756478515313759, "learning_rate": 9.671684067943056e-06, "loss": 0.0612, "step": 647 }, { "epoch": 0.6145092460881935, "grad_norm": 0.6142876685386528, "learning_rate": 9.669713494054662e-06, "loss": 0.06, "step": 648 }, { "epoch": 0.6154575628259839, "grad_norm": 0.8252522199066464, "learning_rate": 9.667737226062262e-06, "loss": 0.118, "step": 649 }, { "epoch": 0.6164058795637744, "grad_norm": 0.48924053020562824, "learning_rate": 9.665755266375657e-06, "loss": 0.0542, "step": 650 }, { "epoch": 0.6173541963015647, "grad_norm": 0.9087121397095356, "learning_rate": 9.663767617411587e-06, "loss": 0.0611, "step": 651 }, { "epoch": 0.6183025130393551, "grad_norm": 0.7764764902550111, "learning_rate": 9.66177428159373e-06, "loss": 0.0676, "step": 652 }, { "epoch": 0.6192508297771455, "grad_norm": 0.44918893065172116, "learning_rate": 9.659775261352697e-06, "loss": 0.0474, "step": 653 }, { "epoch": 0.620199146514936, "grad_norm": 0.9162652994629981, "learning_rate": 9.657770559126034e-06, "loss": 0.0981, "step": 654 }, { "epoch": 0.6211474632527264, "grad_norm": 0.6543823860401999, "learning_rate": 9.655760177358208e-06, "loss": 0.0744, "step": 655 }, { "epoch": 0.6220957799905168, "grad_norm": 0.44085186666179094, "learning_rate": 9.653744118500623e-06, "loss": 0.0532, "step": 656 }, { "epoch": 0.6230440967283073, "grad_norm": 0.7980175435844092, "learning_rate": 9.651722385011592e-06, "loss": 0.0807, "step": 657 }, { "epoch": 0.6239924134660977, "grad_norm": 0.4853866988799319, "learning_rate": 9.649694979356358e-06, "loss": 0.0454, "step": 658 }, { "epoch": 0.6249407302038881, "grad_norm": 0.5662361885259662, "learning_rate": 9.647661904007076e-06, "loss": 0.0621, "step": 659 }, { "epoch": 0.6258890469416786, "grad_norm": 0.8127269026146419, "learning_rate": 9.645623161442814e-06, "loss": 0.0773, "step": 660 }, { "epoch": 0.626837363679469, "grad_norm": 0.6294162739235921, "learning_rate": 9.643578754149552e-06, "loss": 0.0599, "step": 661 }, { "epoch": 0.6277856804172594, "grad_norm": 0.6965237350859914, "learning_rate": 9.641528684620179e-06, "loss": 0.0542, "step": 662 }, { "epoch": 0.6287339971550497, "grad_norm": 0.5265921422928361, "learning_rate": 9.639472955354483e-06, "loss": 0.0496, "step": 663 }, { "epoch": 0.6296823138928402, "grad_norm": 0.8663040094375097, "learning_rate": 9.63741156885916e-06, "loss": 0.0733, "step": 664 }, { "epoch": 0.6306306306306306, "grad_norm": 0.7508837936313448, "learning_rate": 9.635344527647798e-06, "loss": 0.08, "step": 665 }, { "epoch": 0.631578947368421, "grad_norm": 0.6827540936282853, "learning_rate": 9.633271834240885e-06, "loss": 0.0732, "step": 666 }, { "epoch": 0.6325272641062115, "grad_norm": 0.7441700461651841, "learning_rate": 9.631193491165798e-06, "loss": 0.0555, "step": 667 }, { "epoch": 0.6334755808440019, "grad_norm": 0.8313881844290032, "learning_rate": 9.629109500956803e-06, "loss": 0.0782, "step": 668 }, { "epoch": 0.6344238975817923, "grad_norm": 0.47754915650781987, "learning_rate": 9.627019866155056e-06, "loss": 0.0547, "step": 669 }, { "epoch": 0.6353722143195828, "grad_norm": 0.6618532396312571, "learning_rate": 9.624924589308591e-06, "loss": 0.0515, "step": 670 }, { "epoch": 0.6363205310573732, "grad_norm": 1.147117197534475, "learning_rate": 9.622823672972323e-06, "loss": 0.0882, "step": 671 }, { "epoch": 0.6372688477951636, "grad_norm": 0.5779383814129484, "learning_rate": 9.620717119708047e-06, "loss": 0.0659, "step": 672 }, { "epoch": 0.638217164532954, "grad_norm": 0.5799389859663083, "learning_rate": 9.618604932084427e-06, "loss": 0.0606, "step": 673 }, { "epoch": 0.6391654812707445, "grad_norm": 6.608545253943764, "learning_rate": 9.616487112677e-06, "loss": 0.066, "step": 674 }, { "epoch": 0.6401137980085349, "grad_norm": 0.7235578117181891, "learning_rate": 9.614363664068168e-06, "loss": 0.0628, "step": 675 }, { "epoch": 0.6410621147463252, "grad_norm": 0.6994528460712487, "learning_rate": 9.6122345888472e-06, "loss": 0.0628, "step": 676 }, { "epoch": 0.6420104314841157, "grad_norm": 0.6208663188504899, "learning_rate": 9.610099889610224e-06, "loss": 0.0554, "step": 677 }, { "epoch": 0.6429587482219061, "grad_norm": 0.6345977149189366, "learning_rate": 9.607959568960226e-06, "loss": 0.0632, "step": 678 }, { "epoch": 0.6439070649596965, "grad_norm": 0.8061055021711904, "learning_rate": 9.605813629507046e-06, "loss": 0.0684, "step": 679 }, { "epoch": 0.6448553816974869, "grad_norm": 0.6913423639588181, "learning_rate": 9.603662073867375e-06, "loss": 0.0673, "step": 680 }, { "epoch": 0.6458036984352774, "grad_norm": 0.7586179752230898, "learning_rate": 9.601504904664758e-06, "loss": 0.0702, "step": 681 }, { "epoch": 0.6467520151730678, "grad_norm": 0.5215807067369997, "learning_rate": 9.599342124529576e-06, "loss": 0.0484, "step": 682 }, { "epoch": 0.6477003319108582, "grad_norm": 0.4193899811291156, "learning_rate": 9.597173736099056e-06, "loss": 0.0455, "step": 683 }, { "epoch": 0.6486486486486487, "grad_norm": 1.0231627903377674, "learning_rate": 9.594999742017267e-06, "loss": 0.0755, "step": 684 }, { "epoch": 0.6495969653864391, "grad_norm": 0.5818860445113369, "learning_rate": 9.592820144935107e-06, "loss": 0.0457, "step": 685 }, { "epoch": 0.6505452821242295, "grad_norm": 0.8523614115619248, "learning_rate": 9.590634947510312e-06, "loss": 0.0666, "step": 686 }, { "epoch": 0.65149359886202, "grad_norm": 0.6819462318103672, "learning_rate": 9.588444152407441e-06, "loss": 0.0621, "step": 687 }, { "epoch": 0.6524419155998104, "grad_norm": 0.7350860734842137, "learning_rate": 9.586247762297882e-06, "loss": 0.0616, "step": 688 }, { "epoch": 0.6533902323376007, "grad_norm": 0.6877200427996193, "learning_rate": 9.584045779859848e-06, "loss": 0.0691, "step": 689 }, { "epoch": 0.6543385490753911, "grad_norm": 0.7777410132259543, "learning_rate": 9.581838207778367e-06, "loss": 0.0672, "step": 690 }, { "epoch": 0.6552868658131816, "grad_norm": 1.0340407583447775, "learning_rate": 9.579625048745281e-06, "loss": 0.0692, "step": 691 }, { "epoch": 0.656235182550972, "grad_norm": 0.6061769180463831, "learning_rate": 9.577406305459251e-06, "loss": 0.0519, "step": 692 }, { "epoch": 0.6571834992887624, "grad_norm": 0.7287017758175208, "learning_rate": 9.575181980625743e-06, "loss": 0.0626, "step": 693 }, { "epoch": 0.6581318160265529, "grad_norm": 0.6923184185544935, "learning_rate": 9.57295207695703e-06, "loss": 0.0602, "step": 694 }, { "epoch": 0.6590801327643433, "grad_norm": 0.7441802004305137, "learning_rate": 9.570716597172187e-06, "loss": 0.0785, "step": 695 }, { "epoch": 0.6600284495021337, "grad_norm": 0.5600328414907927, "learning_rate": 9.568475543997088e-06, "loss": 0.0525, "step": 696 }, { "epoch": 0.6609767662399242, "grad_norm": 0.6179093672887623, "learning_rate": 9.566228920164405e-06, "loss": 0.0498, "step": 697 }, { "epoch": 0.6619250829777146, "grad_norm": 1.0001632318997007, "learning_rate": 9.563976728413602e-06, "loss": 0.1065, "step": 698 }, { "epoch": 0.662873399715505, "grad_norm": 0.6197443639375237, "learning_rate": 9.56171897149093e-06, "loss": 0.0429, "step": 699 }, { "epoch": 0.6638217164532954, "grad_norm": 0.7426532648337794, "learning_rate": 9.55945565214943e-06, "loss": 0.0603, "step": 700 }, { "epoch": 0.6647700331910859, "grad_norm": 0.9809220324323352, "learning_rate": 9.557186773148922e-06, "loss": 0.0844, "step": 701 }, { "epoch": 0.6657183499288762, "grad_norm": 0.6596268576375636, "learning_rate": 9.554912337256007e-06, "loss": 0.0627, "step": 702 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6445430375796782, "learning_rate": 9.552632347244062e-06, "loss": 0.0621, "step": 703 }, { "epoch": 0.6676149834044571, "grad_norm": 0.6854389668990125, "learning_rate": 9.550346805893236e-06, "loss": 0.0709, "step": 704 }, { "epoch": 0.6685633001422475, "grad_norm": 0.9157472924094435, "learning_rate": 9.548055715990448e-06, "loss": 0.0669, "step": 705 }, { "epoch": 0.6695116168800379, "grad_norm": 0.6210182918721243, "learning_rate": 9.545759080329381e-06, "loss": 0.0642, "step": 706 }, { "epoch": 0.6704599336178284, "grad_norm": 0.5811606762164421, "learning_rate": 9.543456901710483e-06, "loss": 0.0734, "step": 707 }, { "epoch": 0.6714082503556188, "grad_norm": 0.6797271720519124, "learning_rate": 9.541149182940958e-06, "loss": 0.0543, "step": 708 }, { "epoch": 0.6723565670934092, "grad_norm": 0.5126068611905316, "learning_rate": 9.538835926834766e-06, "loss": 0.0504, "step": 709 }, { "epoch": 0.6733048838311996, "grad_norm": 0.6464058845065579, "learning_rate": 9.536517136212623e-06, "loss": 0.0596, "step": 710 }, { "epoch": 0.6742532005689901, "grad_norm": 0.5987248394746172, "learning_rate": 9.534192813901986e-06, "loss": 0.0561, "step": 711 }, { "epoch": 0.6752015173067805, "grad_norm": 0.5757268664620699, "learning_rate": 9.531862962737065e-06, "loss": 0.0662, "step": 712 }, { "epoch": 0.6761498340445709, "grad_norm": 0.6884820373956889, "learning_rate": 9.529527585558806e-06, "loss": 0.0734, "step": 713 }, { "epoch": 0.6770981507823614, "grad_norm": 0.5599551362853026, "learning_rate": 9.5271866852149e-06, "loss": 0.0497, "step": 714 }, { "epoch": 0.6780464675201517, "grad_norm": 1.2727013612767513, "learning_rate": 9.524840264559762e-06, "loss": 0.0806, "step": 715 }, { "epoch": 0.6789947842579421, "grad_norm": 0.5125594480614294, "learning_rate": 9.522488326454551e-06, "loss": 0.0464, "step": 716 }, { "epoch": 0.6799431009957326, "grad_norm": 0.9279881234599379, "learning_rate": 9.520130873767141e-06, "loss": 0.0466, "step": 717 }, { "epoch": 0.680891417733523, "grad_norm": 0.5884738866592291, "learning_rate": 9.517767909372143e-06, "loss": 0.0463, "step": 718 }, { "epoch": 0.6818397344713134, "grad_norm": 0.6405987798189022, "learning_rate": 9.515399436150879e-06, "loss": 0.0646, "step": 719 }, { "epoch": 0.6827880512091038, "grad_norm": 0.6141893191288851, "learning_rate": 9.513025456991394e-06, "loss": 0.0713, "step": 720 }, { "epoch": 0.6837363679468943, "grad_norm": 0.5294631004623913, "learning_rate": 9.510645974788441e-06, "loss": 0.0533, "step": 721 }, { "epoch": 0.6846846846846847, "grad_norm": 0.5983803884552171, "learning_rate": 9.508260992443492e-06, "loss": 0.0574, "step": 722 }, { "epoch": 0.6856330014224751, "grad_norm": 0.7168015362345571, "learning_rate": 9.505870512864715e-06, "loss": 0.0622, "step": 723 }, { "epoch": 0.6865813181602656, "grad_norm": 0.8061703745318712, "learning_rate": 9.503474538966992e-06, "loss": 0.072, "step": 724 }, { "epoch": 0.687529634898056, "grad_norm": 0.6410612258118752, "learning_rate": 9.501073073671896e-06, "loss": 0.0454, "step": 725 }, { "epoch": 0.6884779516358464, "grad_norm": 0.790215058142473, "learning_rate": 9.498666119907701e-06, "loss": 0.0677, "step": 726 }, { "epoch": 0.6894262683736367, "grad_norm": 0.6299133472058956, "learning_rate": 9.496253680609371e-06, "loss": 0.0585, "step": 727 }, { "epoch": 0.6903745851114272, "grad_norm": 1.0623017139889208, "learning_rate": 9.493835758718561e-06, "loss": 0.069, "step": 728 }, { "epoch": 0.6913229018492176, "grad_norm": 0.5536012592608316, "learning_rate": 9.491412357183607e-06, "loss": 0.0686, "step": 729 }, { "epoch": 0.692271218587008, "grad_norm": 0.6038206755461478, "learning_rate": 9.488983478959534e-06, "loss": 0.0706, "step": 730 }, { "epoch": 0.6932195353247985, "grad_norm": 0.6342419868913964, "learning_rate": 9.486549127008037e-06, "loss": 0.0496, "step": 731 }, { "epoch": 0.6941678520625889, "grad_norm": 1.1555208683238716, "learning_rate": 9.484109304297493e-06, "loss": 0.0834, "step": 732 }, { "epoch": 0.6951161688003793, "grad_norm": 0.8509380581545992, "learning_rate": 9.481664013802943e-06, "loss": 0.0794, "step": 733 }, { "epoch": 0.6960644855381698, "grad_norm": 0.8224046322343856, "learning_rate": 9.479213258506102e-06, "loss": 0.0869, "step": 734 }, { "epoch": 0.6970128022759602, "grad_norm": 0.6505920471844966, "learning_rate": 9.476757041395342e-06, "loss": 0.0642, "step": 735 }, { "epoch": 0.6979611190137506, "grad_norm": 0.5162948092375159, "learning_rate": 9.474295365465697e-06, "loss": 0.0539, "step": 736 }, { "epoch": 0.698909435751541, "grad_norm": 0.7194486779836317, "learning_rate": 9.471828233718863e-06, "loss": 0.0585, "step": 737 }, { "epoch": 0.6998577524893315, "grad_norm": 0.9014549238602243, "learning_rate": 9.46935564916318e-06, "loss": 0.0874, "step": 738 }, { "epoch": 0.7008060692271219, "grad_norm": 0.7378312572460828, "learning_rate": 9.466877614813645e-06, "loss": 0.0657, "step": 739 }, { "epoch": 0.7017543859649122, "grad_norm": 0.815800968244944, "learning_rate": 9.464394133691891e-06, "loss": 0.0538, "step": 740 }, { "epoch": 0.7027027027027027, "grad_norm": 0.5271528573688194, "learning_rate": 9.461905208826202e-06, "loss": 0.0619, "step": 741 }, { "epoch": 0.7036510194404931, "grad_norm": 0.9062594050922635, "learning_rate": 9.459410843251496e-06, "loss": 0.0659, "step": 742 }, { "epoch": 0.7045993361782835, "grad_norm": 0.6578698656781865, "learning_rate": 9.456911040009323e-06, "loss": 0.0577, "step": 743 }, { "epoch": 0.705547652916074, "grad_norm": 0.6791351680766123, "learning_rate": 9.454405802147864e-06, "loss": 0.0669, "step": 744 }, { "epoch": 0.7064959696538644, "grad_norm": 0.7662019136887008, "learning_rate": 9.451895132721933e-06, "loss": 0.0692, "step": 745 }, { "epoch": 0.7074442863916548, "grad_norm": 0.6997379483885225, "learning_rate": 9.449379034792961e-06, "loss": 0.0609, "step": 746 }, { "epoch": 0.7083926031294452, "grad_norm": 0.6231531262832446, "learning_rate": 9.446857511429e-06, "loss": 0.0568, "step": 747 }, { "epoch": 0.7093409198672357, "grad_norm": 0.638618143024491, "learning_rate": 9.444330565704715e-06, "loss": 0.0391, "step": 748 }, { "epoch": 0.7102892366050261, "grad_norm": 0.6101709327712237, "learning_rate": 9.441798200701388e-06, "loss": 0.0692, "step": 749 }, { "epoch": 0.7112375533428165, "grad_norm": 0.7771396965466206, "learning_rate": 9.439260419506906e-06, "loss": 0.0616, "step": 750 }, { "epoch": 0.712185870080607, "grad_norm": 0.663533581873393, "learning_rate": 9.436717225215761e-06, "loss": 0.0706, "step": 751 }, { "epoch": 0.7131341868183974, "grad_norm": 0.7406791150442034, "learning_rate": 9.434168620929045e-06, "loss": 0.0759, "step": 752 }, { "epoch": 0.7140825035561877, "grad_norm": 0.6589932311994989, "learning_rate": 9.431614609754446e-06, "loss": 0.0676, "step": 753 }, { "epoch": 0.7150308202939782, "grad_norm": 0.7873737037891946, "learning_rate": 9.429055194806247e-06, "loss": 0.0661, "step": 754 }, { "epoch": 0.7159791370317686, "grad_norm": 0.6588547169267579, "learning_rate": 9.42649037920532e-06, "loss": 0.068, "step": 755 }, { "epoch": 0.716927453769559, "grad_norm": 0.8208102856389554, "learning_rate": 9.423920166079122e-06, "loss": 0.0829, "step": 756 }, { "epoch": 0.7178757705073494, "grad_norm": 0.5652492127213, "learning_rate": 9.421344558561689e-06, "loss": 0.0754, "step": 757 }, { "epoch": 0.7188240872451399, "grad_norm": 2.03543668980321, "learning_rate": 9.418763559793639e-06, "loss": 0.0469, "step": 758 }, { "epoch": 0.7197724039829303, "grad_norm": 0.7132600676949169, "learning_rate": 9.41617717292216e-06, "loss": 0.058, "step": 759 }, { "epoch": 0.7207207207207207, "grad_norm": 0.5814418519545377, "learning_rate": 9.413585401101014e-06, "loss": 0.0676, "step": 760 }, { "epoch": 0.7216690374585112, "grad_norm": 0.778087468578043, "learning_rate": 9.410988247490527e-06, "loss": 0.0565, "step": 761 }, { "epoch": 0.7226173541963016, "grad_norm": 0.5978506887698309, "learning_rate": 9.408385715257589e-06, "loss": 0.0526, "step": 762 }, { "epoch": 0.723565670934092, "grad_norm": 0.7345386180038043, "learning_rate": 9.405777807575643e-06, "loss": 0.0779, "step": 763 }, { "epoch": 0.7245139876718824, "grad_norm": 0.6765882263629432, "learning_rate": 9.403164527624695e-06, "loss": 0.0739, "step": 764 }, { "epoch": 0.7254623044096729, "grad_norm": 0.6200059319183251, "learning_rate": 9.400545878591297e-06, "loss": 0.0425, "step": 765 }, { "epoch": 0.7264106211474632, "grad_norm": 0.5764913642807622, "learning_rate": 9.397921863668545e-06, "loss": 0.0525, "step": 766 }, { "epoch": 0.7273589378852536, "grad_norm": 0.5072870053545583, "learning_rate": 9.395292486056087e-06, "loss": 0.0466, "step": 767 }, { "epoch": 0.7283072546230441, "grad_norm": 0.6266493674563252, "learning_rate": 9.3926577489601e-06, "loss": 0.0564, "step": 768 }, { "epoch": 0.7292555713608345, "grad_norm": 0.6781903020718192, "learning_rate": 9.390017655593303e-06, "loss": 0.0625, "step": 769 }, { "epoch": 0.7302038880986249, "grad_norm": 0.6970906328583575, "learning_rate": 9.387372209174943e-06, "loss": 0.0499, "step": 770 }, { "epoch": 0.7311522048364154, "grad_norm": 0.4830643779006922, "learning_rate": 9.384721412930797e-06, "loss": 0.0522, "step": 771 }, { "epoch": 0.7321005215742058, "grad_norm": 0.5981146539751457, "learning_rate": 9.382065270093164e-06, "loss": 0.0503, "step": 772 }, { "epoch": 0.7330488383119962, "grad_norm": 0.6288690777841561, "learning_rate": 9.37940378390086e-06, "loss": 0.0505, "step": 773 }, { "epoch": 0.7339971550497866, "grad_norm": 0.6043657243192845, "learning_rate": 9.376736957599219e-06, "loss": 0.048, "step": 774 }, { "epoch": 0.7349454717875771, "grad_norm": 1.3199303132586044, "learning_rate": 9.37406479444009e-06, "loss": 0.0787, "step": 775 }, { "epoch": 0.7358937885253675, "grad_norm": 0.9970354985082576, "learning_rate": 9.37138729768182e-06, "loss": 0.0593, "step": 776 }, { "epoch": 0.7368421052631579, "grad_norm": 0.6154243426982743, "learning_rate": 9.36870447058927e-06, "loss": 0.0552, "step": 777 }, { "epoch": 0.7377904220009484, "grad_norm": 0.688917247579616, "learning_rate": 9.366016316433796e-06, "loss": 0.0688, "step": 778 }, { "epoch": 0.7387387387387387, "grad_norm": 0.8890574424533809, "learning_rate": 9.363322838493252e-06, "loss": 0.0616, "step": 779 }, { "epoch": 0.7396870554765291, "grad_norm": 0.5256518464793154, "learning_rate": 9.360624040051975e-06, "loss": 0.0449, "step": 780 }, { "epoch": 0.7406353722143196, "grad_norm": 0.7015686604630017, "learning_rate": 9.357919924400802e-06, "loss": 0.0744, "step": 781 }, { "epoch": 0.74158368895211, "grad_norm": 0.5444389461448026, "learning_rate": 9.355210494837046e-06, "loss": 0.058, "step": 782 }, { "epoch": 0.7425320056899004, "grad_norm": 0.8635005280396899, "learning_rate": 9.352495754664501e-06, "loss": 0.0817, "step": 783 }, { "epoch": 0.7434803224276908, "grad_norm": 0.3975227023619501, "learning_rate": 9.349775707193439e-06, "loss": 0.0325, "step": 784 }, { "epoch": 0.7444286391654813, "grad_norm": 0.9671794171858287, "learning_rate": 9.347050355740598e-06, "loss": 0.0942, "step": 785 }, { "epoch": 0.7453769559032717, "grad_norm": 0.8627076848581986, "learning_rate": 9.34431970362919e-06, "loss": 0.0603, "step": 786 }, { "epoch": 0.7463252726410621, "grad_norm": 0.676971569472859, "learning_rate": 9.341583754188887e-06, "loss": 0.0609, "step": 787 }, { "epoch": 0.7472735893788526, "grad_norm": 0.6234019106033082, "learning_rate": 9.338842510755822e-06, "loss": 0.0527, "step": 788 }, { "epoch": 0.748221906116643, "grad_norm": 0.5688808355503273, "learning_rate": 9.336095976672578e-06, "loss": 0.0746, "step": 789 }, { "epoch": 0.7491702228544334, "grad_norm": 0.8927220033190019, "learning_rate": 9.3333441552882e-06, "loss": 0.0663, "step": 790 }, { "epoch": 0.7501185395922239, "grad_norm": 0.6760705893906477, "learning_rate": 9.33058704995817e-06, "loss": 0.0607, "step": 791 }, { "epoch": 0.7510668563300142, "grad_norm": 0.6421619908578323, "learning_rate": 9.327824664044418e-06, "loss": 0.0601, "step": 792 }, { "epoch": 0.7520151730678046, "grad_norm": 0.7064042205046658, "learning_rate": 9.32505700091531e-06, "loss": 0.0656, "step": 793 }, { "epoch": 0.752963489805595, "grad_norm": 0.6789456621715841, "learning_rate": 9.322284063945651e-06, "loss": 0.0754, "step": 794 }, { "epoch": 0.7539118065433855, "grad_norm": 0.6349001762224292, "learning_rate": 9.319505856516674e-06, "loss": 0.055, "step": 795 }, { "epoch": 0.7548601232811759, "grad_norm": 0.7970733715082516, "learning_rate": 9.316722382016037e-06, "loss": 0.0606, "step": 796 }, { "epoch": 0.7558084400189663, "grad_norm": 0.6989262918440643, "learning_rate": 9.313933643837825e-06, "loss": 0.0419, "step": 797 }, { "epoch": 0.7567567567567568, "grad_norm": 0.45444386596780545, "learning_rate": 9.311139645382539e-06, "loss": 0.0499, "step": 798 }, { "epoch": 0.7577050734945472, "grad_norm": 0.7340919059070612, "learning_rate": 9.308340390057091e-06, "loss": 0.0605, "step": 799 }, { "epoch": 0.7586533902323376, "grad_norm": 0.49624254277855845, "learning_rate": 9.305535881274812e-06, "loss": 0.038, "step": 800 }, { "epoch": 0.7596017069701281, "grad_norm": 0.5558026345234958, "learning_rate": 9.302726122455425e-06, "loss": 0.0477, "step": 801 }, { "epoch": 0.7605500237079185, "grad_norm": 0.7377034794768454, "learning_rate": 9.299911117025071e-06, "loss": 0.0798, "step": 802 }, { "epoch": 0.7614983404457089, "grad_norm": 0.6642309640857783, "learning_rate": 9.297090868416276e-06, "loss": 0.0578, "step": 803 }, { "epoch": 0.7624466571834992, "grad_norm": 0.4901567357915514, "learning_rate": 9.294265380067965e-06, "loss": 0.0546, "step": 804 }, { "epoch": 0.7633949739212897, "grad_norm": 0.5504015183910195, "learning_rate": 9.291434655425452e-06, "loss": 0.0476, "step": 805 }, { "epoch": 0.7643432906590801, "grad_norm": 0.7880325712467479, "learning_rate": 9.288598697940433e-06, "loss": 0.0967, "step": 806 }, { "epoch": 0.7652916073968705, "grad_norm": 1.0094413699993006, "learning_rate": 9.285757511070987e-06, "loss": 0.0547, "step": 807 }, { "epoch": 0.766239924134661, "grad_norm": 0.5462824953438216, "learning_rate": 9.28291109828157e-06, "loss": 0.0622, "step": 808 }, { "epoch": 0.7671882408724514, "grad_norm": 0.6095693174069973, "learning_rate": 9.28005946304301e-06, "loss": 0.054, "step": 809 }, { "epoch": 0.7681365576102418, "grad_norm": 0.5522598480936777, "learning_rate": 9.277202608832502e-06, "loss": 0.0608, "step": 810 }, { "epoch": 0.7690848743480322, "grad_norm": 0.8887551561479244, "learning_rate": 9.274340539133604e-06, "loss": 0.0733, "step": 811 }, { "epoch": 0.7700331910858227, "grad_norm": 0.6536519593388536, "learning_rate": 9.271473257436239e-06, "loss": 0.0704, "step": 812 }, { "epoch": 0.7709815078236131, "grad_norm": 0.6903014054311826, "learning_rate": 9.268600767236677e-06, "loss": 0.0839, "step": 813 }, { "epoch": 0.7719298245614035, "grad_norm": 0.5929159416904847, "learning_rate": 9.265723072037546e-06, "loss": 0.0592, "step": 814 }, { "epoch": 0.772878141299194, "grad_norm": 0.7439638317959937, "learning_rate": 9.26284017534782e-06, "loss": 0.0568, "step": 815 }, { "epoch": 0.7738264580369844, "grad_norm": 0.5860050856048022, "learning_rate": 9.259952080682812e-06, "loss": 0.0667, "step": 816 }, { "epoch": 0.7747747747747747, "grad_norm": 0.4842910654706692, "learning_rate": 9.257058791564175e-06, "loss": 0.0513, "step": 817 }, { "epoch": 0.7757230915125652, "grad_norm": 0.789038697553299, "learning_rate": 9.254160311519896e-06, "loss": 0.0557, "step": 818 }, { "epoch": 0.7766714082503556, "grad_norm": 0.5387139258318481, "learning_rate": 9.251256644084292e-06, "loss": 0.0558, "step": 819 }, { "epoch": 0.777619724988146, "grad_norm": 0.8887946106511906, "learning_rate": 9.248347792798006e-06, "loss": 0.0776, "step": 820 }, { "epoch": 0.7785680417259364, "grad_norm": 0.7477907494684204, "learning_rate": 9.245433761208e-06, "loss": 0.0706, "step": 821 }, { "epoch": 0.7795163584637269, "grad_norm": 0.8176178183928178, "learning_rate": 9.242514552867556e-06, "loss": 0.0806, "step": 822 }, { "epoch": 0.7804646752015173, "grad_norm": 0.5104409829727489, "learning_rate": 9.239590171336262e-06, "loss": 0.0427, "step": 823 }, { "epoch": 0.7814129919393077, "grad_norm": 0.5922185838285359, "learning_rate": 9.236660620180024e-06, "loss": 0.0553, "step": 824 }, { "epoch": 0.7823613086770982, "grad_norm": 0.9414341871189567, "learning_rate": 9.23372590297104e-06, "loss": 0.0678, "step": 825 }, { "epoch": 0.7833096254148886, "grad_norm": 0.49939628701466243, "learning_rate": 9.230786023287819e-06, "loss": 0.0437, "step": 826 }, { "epoch": 0.784257942152679, "grad_norm": 0.519425273825053, "learning_rate": 9.227840984715154e-06, "loss": 0.0497, "step": 827 }, { "epoch": 0.7852062588904695, "grad_norm": 0.5443123255099412, "learning_rate": 9.224890790844137e-06, "loss": 0.0612, "step": 828 }, { "epoch": 0.7861545756282599, "grad_norm": 0.511905527310258, "learning_rate": 9.221935445272144e-06, "loss": 0.0449, "step": 829 }, { "epoch": 0.7871028923660502, "grad_norm": 0.6705781452415145, "learning_rate": 9.218974951602829e-06, "loss": 0.063, "step": 830 }, { "epoch": 0.7880512091038406, "grad_norm": 0.47754646141190604, "learning_rate": 9.216009313446125e-06, "loss": 0.0688, "step": 831 }, { "epoch": 0.7889995258416311, "grad_norm": 0.5705276893342319, "learning_rate": 9.213038534418244e-06, "loss": 0.0686, "step": 832 }, { "epoch": 0.7899478425794215, "grad_norm": 0.4253509537520698, "learning_rate": 9.21006261814166e-06, "loss": 0.0427, "step": 833 }, { "epoch": 0.7908961593172119, "grad_norm": 0.533220697742502, "learning_rate": 9.207081568245112e-06, "loss": 0.0394, "step": 834 }, { "epoch": 0.7918444760550024, "grad_norm": 0.5786737951816707, "learning_rate": 9.2040953883636e-06, "loss": 0.0556, "step": 835 }, { "epoch": 0.7927927927927928, "grad_norm": 1.05765776588404, "learning_rate": 9.20110408213838e-06, "loss": 0.0388, "step": 836 }, { "epoch": 0.7937411095305832, "grad_norm": 0.809530041430475, "learning_rate": 9.19810765321696e-06, "loss": 0.1042, "step": 837 }, { "epoch": 0.7946894262683737, "grad_norm": 0.4767483114016521, "learning_rate": 9.19510610525309e-06, "loss": 0.0586, "step": 838 }, { "epoch": 0.7956377430061641, "grad_norm": 0.6212000890855088, "learning_rate": 9.192099441906765e-06, "loss": 0.063, "step": 839 }, { "epoch": 0.7965860597439545, "grad_norm": 0.5793471462839893, "learning_rate": 9.189087666844219e-06, "loss": 0.0599, "step": 840 }, { "epoch": 0.7975343764817449, "grad_norm": 0.6109133021965912, "learning_rate": 9.186070783737915e-06, "loss": 0.0655, "step": 841 }, { "epoch": 0.7984826932195354, "grad_norm": 1.7579309929430755, "learning_rate": 9.183048796266547e-06, "loss": 0.0531, "step": 842 }, { "epoch": 0.7994310099573257, "grad_norm": 0.6305893305402994, "learning_rate": 9.180021708115034e-06, "loss": 0.069, "step": 843 }, { "epoch": 0.8003793266951161, "grad_norm": 0.5799218206040034, "learning_rate": 9.176989522974512e-06, "loss": 0.0548, "step": 844 }, { "epoch": 0.8013276434329066, "grad_norm": 0.5205329821796497, "learning_rate": 9.173952244542335e-06, "loss": 0.0551, "step": 845 }, { "epoch": 0.802275960170697, "grad_norm": 0.6401356176971456, "learning_rate": 9.170909876522067e-06, "loss": 0.0613, "step": 846 }, { "epoch": 0.8032242769084874, "grad_norm": 0.6283553782308525, "learning_rate": 9.167862422623474e-06, "loss": 0.0681, "step": 847 }, { "epoch": 0.8041725936462779, "grad_norm": 0.5291087716357314, "learning_rate": 9.164809886562532e-06, "loss": 0.0428, "step": 848 }, { "epoch": 0.8051209103840683, "grad_norm": 0.6176212098121372, "learning_rate": 9.161752272061405e-06, "loss": 0.0607, "step": 849 }, { "epoch": 0.8060692271218587, "grad_norm": 0.5258734780929885, "learning_rate": 9.158689582848454e-06, "loss": 0.0555, "step": 850 }, { "epoch": 0.8070175438596491, "grad_norm": 0.5473102285657928, "learning_rate": 9.155621822658229e-06, "loss": 0.0461, "step": 851 }, { "epoch": 0.8079658605974396, "grad_norm": 0.7147069989389465, "learning_rate": 9.15254899523146e-06, "loss": 0.0699, "step": 852 }, { "epoch": 0.80891417733523, "grad_norm": 0.5116476113725856, "learning_rate": 9.14947110431506e-06, "loss": 0.0593, "step": 853 }, { "epoch": 0.8098624940730204, "grad_norm": 0.599625799358922, "learning_rate": 9.146388153662109e-06, "loss": 0.0719, "step": 854 }, { "epoch": 0.8108108108108109, "grad_norm": 0.5657265833927722, "learning_rate": 9.143300147031866e-06, "loss": 0.0539, "step": 855 }, { "epoch": 0.8117591275486012, "grad_norm": 0.490285928003467, "learning_rate": 9.14020708818975e-06, "loss": 0.0551, "step": 856 }, { "epoch": 0.8127074442863916, "grad_norm": 0.5667257690255696, "learning_rate": 9.137108980907341e-06, "loss": 0.0485, "step": 857 }, { "epoch": 0.813655761024182, "grad_norm": 0.7288808283591064, "learning_rate": 9.134005828962373e-06, "loss": 0.0464, "step": 858 }, { "epoch": 0.8146040777619725, "grad_norm": 0.6578159612053353, "learning_rate": 9.130897636138736e-06, "loss": 0.0458, "step": 859 }, { "epoch": 0.8155523944997629, "grad_norm": 0.6699312438910595, "learning_rate": 9.127784406226462e-06, "loss": 0.0484, "step": 860 }, { "epoch": 0.8165007112375533, "grad_norm": 0.7577555099867996, "learning_rate": 9.124666143021728e-06, "loss": 0.0552, "step": 861 }, { "epoch": 0.8174490279753438, "grad_norm": 0.6571718231580975, "learning_rate": 9.121542850326849e-06, "loss": 0.0418, "step": 862 }, { "epoch": 0.8183973447131342, "grad_norm": 0.7375729716381728, "learning_rate": 9.118414531950268e-06, "loss": 0.0586, "step": 863 }, { "epoch": 0.8193456614509246, "grad_norm": 0.7778186396499422, "learning_rate": 9.115281191706563e-06, "loss": 0.0638, "step": 864 }, { "epoch": 0.8202939781887151, "grad_norm": 1.1753642296648885, "learning_rate": 9.11214283341643e-06, "loss": 0.0914, "step": 865 }, { "epoch": 0.8212422949265055, "grad_norm": 0.6221136965708439, "learning_rate": 9.108999460906687e-06, "loss": 0.0513, "step": 866 }, { "epoch": 0.8221906116642959, "grad_norm": 1.0126156537474953, "learning_rate": 9.105851078010265e-06, "loss": 0.0511, "step": 867 }, { "epoch": 0.8231389284020862, "grad_norm": 0.609505398312846, "learning_rate": 9.102697688566204e-06, "loss": 0.0607, "step": 868 }, { "epoch": 0.8240872451398767, "grad_norm": 0.6781545775462046, "learning_rate": 9.09953929641965e-06, "loss": 0.0537, "step": 869 }, { "epoch": 0.8250355618776671, "grad_norm": 0.6162914997785193, "learning_rate": 9.096375905421849e-06, "loss": 0.0514, "step": 870 }, { "epoch": 0.8259838786154575, "grad_norm": 0.9380195573648793, "learning_rate": 9.093207519430138e-06, "loss": 0.0592, "step": 871 }, { "epoch": 0.826932195353248, "grad_norm": 0.6891518456384623, "learning_rate": 9.090034142307955e-06, "loss": 0.0611, "step": 872 }, { "epoch": 0.8278805120910384, "grad_norm": 0.6860355795137043, "learning_rate": 9.086855777924813e-06, "loss": 0.0651, "step": 873 }, { "epoch": 0.8288288288288288, "grad_norm": 0.5941193542193252, "learning_rate": 9.083672430156313e-06, "loss": 0.0561, "step": 874 }, { "epoch": 0.8297771455666193, "grad_norm": 0.9859763647912905, "learning_rate": 9.080484102884132e-06, "loss": 0.0558, "step": 875 }, { "epoch": 0.8307254623044097, "grad_norm": 0.6607364577205248, "learning_rate": 9.077290799996015e-06, "loss": 0.0445, "step": 876 }, { "epoch": 0.8316737790422001, "grad_norm": 0.4579344621348973, "learning_rate": 9.074092525385777e-06, "loss": 0.0532, "step": 877 }, { "epoch": 0.8326220957799905, "grad_norm": 0.44020290978074095, "learning_rate": 9.070889282953297e-06, "loss": 0.0432, "step": 878 }, { "epoch": 0.833570412517781, "grad_norm": 0.7817453278171299, "learning_rate": 9.067681076604507e-06, "loss": 0.0622, "step": 879 }, { "epoch": 0.8345187292555714, "grad_norm": 0.6106825636941368, "learning_rate": 9.064467910251396e-06, "loss": 0.0499, "step": 880 }, { "epoch": 0.8354670459933617, "grad_norm": 0.5733918003298187, "learning_rate": 9.061249787812e-06, "loss": 0.058, "step": 881 }, { "epoch": 0.8364153627311522, "grad_norm": 0.734104839469145, "learning_rate": 9.058026713210396e-06, "loss": 0.0603, "step": 882 }, { "epoch": 0.8373636794689426, "grad_norm": 0.5863205921902287, "learning_rate": 9.054798690376702e-06, "loss": 0.0542, "step": 883 }, { "epoch": 0.838311996206733, "grad_norm": 0.6529541400114963, "learning_rate": 9.051565723247072e-06, "loss": 0.0546, "step": 884 }, { "epoch": 0.8392603129445235, "grad_norm": 0.8496840763418192, "learning_rate": 9.048327815763682e-06, "loss": 0.0499, "step": 885 }, { "epoch": 0.8402086296823139, "grad_norm": 0.4879463969986272, "learning_rate": 9.045084971874738e-06, "loss": 0.0404, "step": 886 }, { "epoch": 0.8411569464201043, "grad_norm": 0.48366631890428774, "learning_rate": 9.041837195534462e-06, "loss": 0.0438, "step": 887 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5668428745474414, "learning_rate": 9.038584490703095e-06, "loss": 0.0577, "step": 888 }, { "epoch": 0.8430535798956852, "grad_norm": 0.6139669391301276, "learning_rate": 9.03532686134688e-06, "loss": 0.0699, "step": 889 }, { "epoch": 0.8440018966334756, "grad_norm": 0.6191388281587789, "learning_rate": 9.032064311438073e-06, "loss": 0.0588, "step": 890 }, { "epoch": 0.844950213371266, "grad_norm": 0.6152385003685913, "learning_rate": 9.028796844954924e-06, "loss": 0.0537, "step": 891 }, { "epoch": 0.8458985301090565, "grad_norm": 0.755005932732524, "learning_rate": 9.025524465881683e-06, "loss": 0.0649, "step": 892 }, { "epoch": 0.8468468468468469, "grad_norm": 0.7676513218085563, "learning_rate": 9.022247178208585e-06, "loss": 0.0635, "step": 893 }, { "epoch": 0.8477951635846372, "grad_norm": 0.5920047067355723, "learning_rate": 9.018964985931856e-06, "loss": 0.06, "step": 894 }, { "epoch": 0.8487434803224277, "grad_norm": 0.5807083572715754, "learning_rate": 9.015677893053695e-06, "loss": 0.0505, "step": 895 }, { "epoch": 0.8496917970602181, "grad_norm": 0.7897487160161104, "learning_rate": 9.012385903582286e-06, "loss": 0.0714, "step": 896 }, { "epoch": 0.8506401137980085, "grad_norm": 0.5382652341176712, "learning_rate": 9.009089021531777e-06, "loss": 0.0512, "step": 897 }, { "epoch": 0.8515884305357989, "grad_norm": 0.8441756486986386, "learning_rate": 9.005787250922285e-06, "loss": 0.0766, "step": 898 }, { "epoch": 0.8525367472735894, "grad_norm": 0.42966299233294036, "learning_rate": 9.002480595779883e-06, "loss": 0.0469, "step": 899 }, { "epoch": 0.8534850640113798, "grad_norm": 0.5779848432711783, "learning_rate": 8.999169060136609e-06, "loss": 0.0549, "step": 900 }, { "epoch": 0.8544333807491702, "grad_norm": 0.49828391414464324, "learning_rate": 8.995852648030444e-06, "loss": 0.0513, "step": 901 }, { "epoch": 0.8553816974869607, "grad_norm": 0.5712972033755797, "learning_rate": 8.99253136350532e-06, "loss": 0.0642, "step": 902 }, { "epoch": 0.8563300142247511, "grad_norm": 0.7463859566833713, "learning_rate": 8.989205210611106e-06, "loss": 0.0669, "step": 903 }, { "epoch": 0.8572783309625415, "grad_norm": 0.6015754760898006, "learning_rate": 8.98587419340361e-06, "loss": 0.0518, "step": 904 }, { "epoch": 0.8582266477003319, "grad_norm": 0.7279488477743896, "learning_rate": 8.982538315944573e-06, "loss": 0.0603, "step": 905 }, { "epoch": 0.8591749644381224, "grad_norm": 0.41210687518386613, "learning_rate": 8.979197582301662e-06, "loss": 0.0508, "step": 906 }, { "epoch": 0.8601232811759127, "grad_norm": 1.4900739335277513, "learning_rate": 8.97585199654846e-06, "loss": 0.072, "step": 907 }, { "epoch": 0.8610715979137031, "grad_norm": 0.5450963951689192, "learning_rate": 8.972501562764476e-06, "loss": 0.0566, "step": 908 }, { "epoch": 0.8620199146514936, "grad_norm": 0.5356916948533633, "learning_rate": 8.969146285035119e-06, "loss": 0.0471, "step": 909 }, { "epoch": 0.862968231389284, "grad_norm": 0.6064958608566305, "learning_rate": 8.965786167451713e-06, "loss": 0.0586, "step": 910 }, { "epoch": 0.8639165481270744, "grad_norm": 0.6550030676781202, "learning_rate": 8.962421214111486e-06, "loss": 0.0622, "step": 911 }, { "epoch": 0.8648648648648649, "grad_norm": 0.5789487697080219, "learning_rate": 8.959051429117551e-06, "loss": 0.0587, "step": 912 }, { "epoch": 0.8658131816026553, "grad_norm": 0.6480466907010984, "learning_rate": 8.955676816578922e-06, "loss": 0.0596, "step": 913 }, { "epoch": 0.8667614983404457, "grad_norm": 0.703037972481164, "learning_rate": 8.9522973806105e-06, "loss": 0.0836, "step": 914 }, { "epoch": 0.8677098150782361, "grad_norm": 0.49499510899266297, "learning_rate": 8.94891312533306e-06, "loss": 0.0493, "step": 915 }, { "epoch": 0.8686581318160266, "grad_norm": 0.4679737716122778, "learning_rate": 8.945524054873261e-06, "loss": 0.0473, "step": 916 }, { "epoch": 0.869606448553817, "grad_norm": 0.4868047238192127, "learning_rate": 8.942130173363628e-06, "loss": 0.0617, "step": 917 }, { "epoch": 0.8705547652916074, "grad_norm": 0.48143223119722567, "learning_rate": 8.938731484942557e-06, "loss": 0.0459, "step": 918 }, { "epoch": 0.8715030820293979, "grad_norm": 0.5109365563225756, "learning_rate": 8.935327993754307e-06, "loss": 0.0603, "step": 919 }, { "epoch": 0.8724513987671882, "grad_norm": 0.5946328530954544, "learning_rate": 8.931919703948981e-06, "loss": 0.0663, "step": 920 }, { "epoch": 0.8733997155049786, "grad_norm": 0.6675396299202498, "learning_rate": 8.928506619682549e-06, "loss": 0.0522, "step": 921 }, { "epoch": 0.8743480322427691, "grad_norm": 0.5242785281728278, "learning_rate": 8.925088745116817e-06, "loss": 0.0477, "step": 922 }, { "epoch": 0.8752963489805595, "grad_norm": 0.4607255100157249, "learning_rate": 8.921666084419435e-06, "loss": 0.0444, "step": 923 }, { "epoch": 0.8762446657183499, "grad_norm": 0.6127086410246447, "learning_rate": 8.918238641763894e-06, "loss": 0.0505, "step": 924 }, { "epoch": 0.8771929824561403, "grad_norm": 0.7108664485212953, "learning_rate": 8.914806421329505e-06, "loss": 0.0372, "step": 925 }, { "epoch": 0.8781412991939308, "grad_norm": 0.48171514690034495, "learning_rate": 8.911369427301418e-06, "loss": 0.0467, "step": 926 }, { "epoch": 0.8790896159317212, "grad_norm": 0.5032020795283936, "learning_rate": 8.907927663870592e-06, "loss": 0.0383, "step": 927 }, { "epoch": 0.8800379326695116, "grad_norm": 0.6490864569323296, "learning_rate": 8.90448113523381e-06, "loss": 0.0703, "step": 928 }, { "epoch": 0.8809862494073021, "grad_norm": 0.5274849878368799, "learning_rate": 8.901029845593658e-06, "loss": 0.0497, "step": 929 }, { "epoch": 0.8819345661450925, "grad_norm": 0.7209898569229573, "learning_rate": 8.897573799158534e-06, "loss": 0.0845, "step": 930 }, { "epoch": 0.8828828828828829, "grad_norm": 0.653701403062353, "learning_rate": 8.894113000142636e-06, "loss": 0.0528, "step": 931 }, { "epoch": 0.8838311996206734, "grad_norm": 0.5252034559155617, "learning_rate": 8.890647452765954e-06, "loss": 0.054, "step": 932 }, { "epoch": 0.8847795163584637, "grad_norm": 0.6597062824750437, "learning_rate": 8.887177161254267e-06, "loss": 0.0508, "step": 933 }, { "epoch": 0.8857278330962541, "grad_norm": 0.9841434864966624, "learning_rate": 8.883702129839144e-06, "loss": 0.06, "step": 934 }, { "epoch": 0.8866761498340445, "grad_norm": 0.4716559195813748, "learning_rate": 8.880222362757928e-06, "loss": 0.0484, "step": 935 }, { "epoch": 0.887624466571835, "grad_norm": 0.6275887169553205, "learning_rate": 8.87673786425374e-06, "loss": 0.055, "step": 936 }, { "epoch": 0.8885727833096254, "grad_norm": 0.5480616561224483, "learning_rate": 8.87324863857547e-06, "loss": 0.0512, "step": 937 }, { "epoch": 0.8895211000474158, "grad_norm": 0.5716073816122306, "learning_rate": 8.869754689977774e-06, "loss": 0.0575, "step": 938 }, { "epoch": 0.8904694167852063, "grad_norm": 0.8761043849726794, "learning_rate": 8.866256022721062e-06, "loss": 0.0508, "step": 939 }, { "epoch": 0.8914177335229967, "grad_norm": 0.7017157731117182, "learning_rate": 8.862752641071499e-06, "loss": 0.0546, "step": 940 }, { "epoch": 0.8923660502607871, "grad_norm": 1.5138916151321196, "learning_rate": 8.859244549301005e-06, "loss": 0.0658, "step": 941 }, { "epoch": 0.8933143669985776, "grad_norm": 0.8433261605133346, "learning_rate": 8.855731751687233e-06, "loss": 0.0553, "step": 942 }, { "epoch": 0.894262683736368, "grad_norm": 0.5494966721887847, "learning_rate": 8.852214252513582e-06, "loss": 0.0494, "step": 943 }, { "epoch": 0.8952110004741584, "grad_norm": 0.6006177701179363, "learning_rate": 8.848692056069184e-06, "loss": 0.0612, "step": 944 }, { "epoch": 0.8961593172119487, "grad_norm": 0.6876171031491582, "learning_rate": 8.84516516664889e-06, "loss": 0.0609, "step": 945 }, { "epoch": 0.8971076339497392, "grad_norm": 0.846588378426009, "learning_rate": 8.841633588553287e-06, "loss": 0.0593, "step": 946 }, { "epoch": 0.8980559506875296, "grad_norm": 1.175631640532978, "learning_rate": 8.838097326088667e-06, "loss": 0.0767, "step": 947 }, { "epoch": 0.89900426742532, "grad_norm": 0.7010270158444133, "learning_rate": 8.834556383567042e-06, "loss": 0.0637, "step": 948 }, { "epoch": 0.8999525841631105, "grad_norm": 0.7103962193756044, "learning_rate": 8.831010765306124e-06, "loss": 0.047, "step": 949 }, { "epoch": 0.9009009009009009, "grad_norm": 0.9919713077792982, "learning_rate": 8.827460475629334e-06, "loss": 0.0699, "step": 950 }, { "epoch": 0.9018492176386913, "grad_norm": 0.9438936607800321, "learning_rate": 8.823905518865782e-06, "loss": 0.0962, "step": 951 }, { "epoch": 0.9027975343764817, "grad_norm": 0.41357107371942303, "learning_rate": 8.820345899350275e-06, "loss": 0.0393, "step": 952 }, { "epoch": 0.9037458511142722, "grad_norm": 0.6094306471098007, "learning_rate": 8.8167816214233e-06, "loss": 0.0547, "step": 953 }, { "epoch": 0.9046941678520626, "grad_norm": 0.45434395748515616, "learning_rate": 8.81321268943103e-06, "loss": 0.0458, "step": 954 }, { "epoch": 0.905642484589853, "grad_norm": 0.584662000585842, "learning_rate": 8.809639107725308e-06, "loss": 0.0684, "step": 955 }, { "epoch": 0.9065908013276435, "grad_norm": 0.6281479664499341, "learning_rate": 8.80606088066365e-06, "loss": 0.0485, "step": 956 }, { "epoch": 0.9075391180654339, "grad_norm": 0.5220137398785665, "learning_rate": 8.802478012609235e-06, "loss": 0.0478, "step": 957 }, { "epoch": 0.9084874348032242, "grad_norm": 0.7613507347001472, "learning_rate": 8.798890507930899e-06, "loss": 0.0534, "step": 958 }, { "epoch": 0.9094357515410147, "grad_norm": 0.5338153539509801, "learning_rate": 8.795298371003138e-06, "loss": 0.0467, "step": 959 }, { "epoch": 0.9103840682788051, "grad_norm": 0.508435320780577, "learning_rate": 8.791701606206092e-06, "loss": 0.05, "step": 960 }, { "epoch": 0.9113323850165955, "grad_norm": 0.6801979027503147, "learning_rate": 8.788100217925541e-06, "loss": 0.0654, "step": 961 }, { "epoch": 0.9122807017543859, "grad_norm": 0.5472159955708181, "learning_rate": 8.78449421055291e-06, "loss": 0.0566, "step": 962 }, { "epoch": 0.9132290184921764, "grad_norm": 0.5546852372370231, "learning_rate": 8.78088358848525e-06, "loss": 0.0544, "step": 963 }, { "epoch": 0.9141773352299668, "grad_norm": 0.7376086419870055, "learning_rate": 8.777268356125244e-06, "loss": 0.0618, "step": 964 }, { "epoch": 0.9151256519677572, "grad_norm": 0.461174714622349, "learning_rate": 8.773648517881194e-06, "loss": 0.0527, "step": 965 }, { "epoch": 0.9160739687055477, "grad_norm": 1.100649311314461, "learning_rate": 8.770024078167017e-06, "loss": 0.075, "step": 966 }, { "epoch": 0.9170222854433381, "grad_norm": 0.5385193734337945, "learning_rate": 8.766395041402245e-06, "loss": 0.056, "step": 967 }, { "epoch": 0.9179706021811285, "grad_norm": 0.4215583451342763, "learning_rate": 8.762761412012011e-06, "loss": 0.045, "step": 968 }, { "epoch": 0.918918918918919, "grad_norm": 0.5690890175604749, "learning_rate": 8.75912319442705e-06, "loss": 0.0568, "step": 969 }, { "epoch": 0.9198672356567094, "grad_norm": 0.5598668678593514, "learning_rate": 8.755480393083694e-06, "loss": 0.0629, "step": 970 }, { "epoch": 0.9208155523944997, "grad_norm": 0.4230299561301444, "learning_rate": 8.751833012423861e-06, "loss": 0.0402, "step": 971 }, { "epoch": 0.9217638691322901, "grad_norm": 0.8504416588391118, "learning_rate": 8.74818105689505e-06, "loss": 0.0521, "step": 972 }, { "epoch": 0.9227121858700806, "grad_norm": 0.461086821346764, "learning_rate": 8.744524530950351e-06, "loss": 0.0426, "step": 973 }, { "epoch": 0.923660502607871, "grad_norm": 0.5086789755859074, "learning_rate": 8.740863439048412e-06, "loss": 0.0487, "step": 974 }, { "epoch": 0.9246088193456614, "grad_norm": 0.45915883182777006, "learning_rate": 8.737197785653457e-06, "loss": 0.0444, "step": 975 }, { "epoch": 0.9255571360834519, "grad_norm": 0.6701095989032753, "learning_rate": 8.73352757523527e-06, "loss": 0.0707, "step": 976 }, { "epoch": 0.9265054528212423, "grad_norm": 0.460793794881083, "learning_rate": 8.729852812269192e-06, "loss": 0.0462, "step": 977 }, { "epoch": 0.9274537695590327, "grad_norm": 0.42146552351647865, "learning_rate": 8.726173501236115e-06, "loss": 0.0413, "step": 978 }, { "epoch": 0.9284020862968232, "grad_norm": 0.4515670497285217, "learning_rate": 8.722489646622477e-06, "loss": 0.0486, "step": 979 }, { "epoch": 0.9293504030346136, "grad_norm": 0.7298661971153528, "learning_rate": 8.718801252920257e-06, "loss": 0.0728, "step": 980 }, { "epoch": 0.930298719772404, "grad_norm": 0.6123325398467794, "learning_rate": 8.715108324626967e-06, "loss": 0.0528, "step": 981 }, { "epoch": 0.9312470365101944, "grad_norm": 0.5334963078534037, "learning_rate": 8.711410866245648e-06, "loss": 0.0409, "step": 982 }, { "epoch": 0.9321953532479849, "grad_norm": 0.44851971952458897, "learning_rate": 8.70770888228487e-06, "loss": 0.0509, "step": 983 }, { "epoch": 0.9331436699857752, "grad_norm": 0.9770313333004932, "learning_rate": 8.704002377258714e-06, "loss": 0.0463, "step": 984 }, { "epoch": 0.9340919867235656, "grad_norm": 0.7370636377202378, "learning_rate": 8.700291355686779e-06, "loss": 0.0637, "step": 985 }, { "epoch": 0.9350403034613561, "grad_norm": 0.6070776528057518, "learning_rate": 8.69657582209417e-06, "loss": 0.0488, "step": 986 }, { "epoch": 0.9359886201991465, "grad_norm": 0.7278417266877663, "learning_rate": 8.692855781011494e-06, "loss": 0.0501, "step": 987 }, { "epoch": 0.9369369369369369, "grad_norm": 0.4731052806759658, "learning_rate": 8.689131236974853e-06, "loss": 0.0417, "step": 988 }, { "epoch": 0.9378852536747273, "grad_norm": 0.45598792555472306, "learning_rate": 8.68540219452584e-06, "loss": 0.0396, "step": 989 }, { "epoch": 0.9388335704125178, "grad_norm": 0.5661429908370399, "learning_rate": 8.681668658211535e-06, "loss": 0.0577, "step": 990 }, { "epoch": 0.9397818871503082, "grad_norm": 0.41955875165931145, "learning_rate": 8.677930632584496e-06, "loss": 0.0432, "step": 991 }, { "epoch": 0.9407302038880986, "grad_norm": 0.4107826749470781, "learning_rate": 8.674188122202756e-06, "loss": 0.0535, "step": 992 }, { "epoch": 0.9416785206258891, "grad_norm": 0.47653411892607034, "learning_rate": 8.670441131629816e-06, "loss": 0.0586, "step": 993 }, { "epoch": 0.9426268373636795, "grad_norm": 0.53171021829938, "learning_rate": 8.66668966543464e-06, "loss": 0.0518, "step": 994 }, { "epoch": 0.9435751541014699, "grad_norm": 0.43148473645836083, "learning_rate": 8.662933728191651e-06, "loss": 0.0431, "step": 995 }, { "epoch": 0.9445234708392604, "grad_norm": 0.4471351558402442, "learning_rate": 8.659173324480722e-06, "loss": 0.0438, "step": 996 }, { "epoch": 0.9454717875770507, "grad_norm": 0.5782265716940447, "learning_rate": 8.65540845888717e-06, "loss": 0.0719, "step": 997 }, { "epoch": 0.9464201043148411, "grad_norm": 0.4141433604011682, "learning_rate": 8.651639136001762e-06, "loss": 0.0469, "step": 998 }, { "epoch": 0.9473684210526315, "grad_norm": 0.6061219180547935, "learning_rate": 8.647865360420686e-06, "loss": 0.0489, "step": 999 }, { "epoch": 0.948316737790422, "grad_norm": 0.48916698447496854, "learning_rate": 8.644087136745572e-06, "loss": 0.0513, "step": 1000 }, { "epoch": 0.9492650545282124, "grad_norm": 0.37441669864478105, "learning_rate": 8.640304469583469e-06, "loss": 0.0412, "step": 1001 }, { "epoch": 0.9502133712660028, "grad_norm": 1.0623152293680482, "learning_rate": 8.636517363546838e-06, "loss": 0.0655, "step": 1002 }, { "epoch": 0.9511616880037933, "grad_norm": 0.7061581986197312, "learning_rate": 8.63272582325357e-06, "loss": 0.0499, "step": 1003 }, { "epoch": 0.9521100047415837, "grad_norm": 0.5399127227606683, "learning_rate": 8.62892985332694e-06, "loss": 0.0518, "step": 1004 }, { "epoch": 0.9530583214793741, "grad_norm": 0.4559892605058489, "learning_rate": 8.625129458395643e-06, "loss": 0.0459, "step": 1005 }, { "epoch": 0.9540066382171646, "grad_norm": 0.485355373272851, "learning_rate": 8.621324643093762e-06, "loss": 0.0454, "step": 1006 }, { "epoch": 0.954954954954955, "grad_norm": 0.7459047370537332, "learning_rate": 8.617515412060771e-06, "loss": 0.06, "step": 1007 }, { "epoch": 0.9559032716927454, "grad_norm": 0.7374476556281685, "learning_rate": 8.613701769941526e-06, "loss": 0.0677, "step": 1008 }, { "epoch": 0.9568515884305357, "grad_norm": 0.5640575902917073, "learning_rate": 8.609883721386266e-06, "loss": 0.0464, "step": 1009 }, { "epoch": 0.9577999051683262, "grad_norm": 0.5329518829334081, "learning_rate": 8.606061271050601e-06, "loss": 0.0422, "step": 1010 }, { "epoch": 0.9587482219061166, "grad_norm": 0.5672285885118362, "learning_rate": 8.602234423595509e-06, "loss": 0.0432, "step": 1011 }, { "epoch": 0.959696538643907, "grad_norm": 0.49279890911522445, "learning_rate": 8.598403183687328e-06, "loss": 0.0411, "step": 1012 }, { "epoch": 0.9606448553816975, "grad_norm": 0.5195118583178678, "learning_rate": 8.594567555997755e-06, "loss": 0.0575, "step": 1013 }, { "epoch": 0.9615931721194879, "grad_norm": 1.372925234445775, "learning_rate": 8.590727545203833e-06, "loss": 0.0615, "step": 1014 }, { "epoch": 0.9625414888572783, "grad_norm": 0.7147315054833345, "learning_rate": 8.586883155987955e-06, "loss": 0.0712, "step": 1015 }, { "epoch": 0.9634898055950688, "grad_norm": 0.5802509696174448, "learning_rate": 8.583034393037848e-06, "loss": 0.0552, "step": 1016 }, { "epoch": 0.9644381223328592, "grad_norm": 0.49007583048635933, "learning_rate": 8.579181261046576e-06, "loss": 0.0449, "step": 1017 }, { "epoch": 0.9653864390706496, "grad_norm": 0.48751614831454176, "learning_rate": 8.57532376471253e-06, "loss": 0.0475, "step": 1018 }, { "epoch": 0.96633475580844, "grad_norm": 0.6496160692100631, "learning_rate": 8.571461908739415e-06, "loss": 0.0523, "step": 1019 }, { "epoch": 0.9672830725462305, "grad_norm": 0.481345745516473, "learning_rate": 8.567595697836266e-06, "loss": 0.0515, "step": 1020 }, { "epoch": 0.9682313892840209, "grad_norm": 0.5247818144993567, "learning_rate": 8.563725136717419e-06, "loss": 0.0494, "step": 1021 }, { "epoch": 0.9691797060218112, "grad_norm": 0.8474516614825078, "learning_rate": 8.559850230102513e-06, "loss": 0.0578, "step": 1022 }, { "epoch": 0.9701280227596017, "grad_norm": 0.7494686751693889, "learning_rate": 8.555970982716492e-06, "loss": 0.0613, "step": 1023 }, { "epoch": 0.9710763394973921, "grad_norm": 0.528161959351856, "learning_rate": 8.55208739928959e-06, "loss": 0.0446, "step": 1024 }, { "epoch": 0.9720246562351825, "grad_norm": 0.7556057248494816, "learning_rate": 8.54819948455733e-06, "loss": 0.0611, "step": 1025 }, { "epoch": 0.972972972972973, "grad_norm": 0.4857201457975449, "learning_rate": 8.54430724326051e-06, "loss": 0.0396, "step": 1026 }, { "epoch": 0.9739212897107634, "grad_norm": 0.4633933638270801, "learning_rate": 8.540410680145213e-06, "loss": 0.045, "step": 1027 }, { "epoch": 0.9748696064485538, "grad_norm": 0.5215732727679809, "learning_rate": 8.536509799962784e-06, "loss": 0.047, "step": 1028 }, { "epoch": 0.9758179231863442, "grad_norm": 1.5449712519877792, "learning_rate": 8.532604607469839e-06, "loss": 0.0717, "step": 1029 }, { "epoch": 0.9767662399241347, "grad_norm": 0.46693259860172376, "learning_rate": 8.528695107428247e-06, "loss": 0.0458, "step": 1030 }, { "epoch": 0.9777145566619251, "grad_norm": 0.5388054089062692, "learning_rate": 8.52478130460513e-06, "loss": 0.047, "step": 1031 }, { "epoch": 0.9786628733997155, "grad_norm": 0.5283181708144433, "learning_rate": 8.520863203772858e-06, "loss": 0.0496, "step": 1032 }, { "epoch": 0.979611190137506, "grad_norm": 0.5890035811704775, "learning_rate": 8.516940809709044e-06, "loss": 0.0437, "step": 1033 }, { "epoch": 0.9805595068752964, "grad_norm": 0.446739345865473, "learning_rate": 8.513014127196533e-06, "loss": 0.042, "step": 1034 }, { "epoch": 0.9815078236130867, "grad_norm": 0.49851759898580866, "learning_rate": 8.509083161023399e-06, "loss": 0.0553, "step": 1035 }, { "epoch": 0.9824561403508771, "grad_norm": 0.8986990099986447, "learning_rate": 8.505147915982943e-06, "loss": 0.0491, "step": 1036 }, { "epoch": 0.9834044570886676, "grad_norm": 0.4813313700157437, "learning_rate": 8.501208396873677e-06, "loss": 0.0524, "step": 1037 }, { "epoch": 0.984352773826458, "grad_norm": 0.7823009578163489, "learning_rate": 8.497264608499332e-06, "loss": 0.0542, "step": 1038 }, { "epoch": 0.9853010905642484, "grad_norm": 0.5256393060960738, "learning_rate": 8.49331655566884e-06, "loss": 0.0545, "step": 1039 }, { "epoch": 0.9862494073020389, "grad_norm": 0.5400471979930811, "learning_rate": 8.489364243196334e-06, "loss": 0.0495, "step": 1040 }, { "epoch": 0.9871977240398293, "grad_norm": 0.5862041954662611, "learning_rate": 8.485407675901142e-06, "loss": 0.0442, "step": 1041 }, { "epoch": 0.9881460407776197, "grad_norm": 0.6834922008296388, "learning_rate": 8.48144685860778e-06, "loss": 0.064, "step": 1042 }, { "epoch": 0.9890943575154102, "grad_norm": 0.8002369541010694, "learning_rate": 8.477481796145945e-06, "loss": 0.0464, "step": 1043 }, { "epoch": 0.9900426742532006, "grad_norm": 0.47393154077930216, "learning_rate": 8.47351249335051e-06, "loss": 0.0485, "step": 1044 }, { "epoch": 0.990990990990991, "grad_norm": 0.4987272807246751, "learning_rate": 8.469538955061525e-06, "loss": 0.0478, "step": 1045 }, { "epoch": 0.9919393077287814, "grad_norm": 0.6406968710094035, "learning_rate": 8.465561186124193e-06, "loss": 0.0494, "step": 1046 }, { "epoch": 0.9928876244665719, "grad_norm": 0.5319476049591959, "learning_rate": 8.46157919138889e-06, "loss": 0.038, "step": 1047 }, { "epoch": 0.9938359412043622, "grad_norm": 0.5377926003236448, "learning_rate": 8.457592975711128e-06, "loss": 0.0415, "step": 1048 }, { "epoch": 0.9947842579421526, "grad_norm": 0.5054973123174826, "learning_rate": 8.45360254395158e-06, "loss": 0.0509, "step": 1049 }, { "epoch": 0.9957325746799431, "grad_norm": 0.6511826899131821, "learning_rate": 8.449607900976056e-06, "loss": 0.0496, "step": 1050 }, { "epoch": 0.9966808914177335, "grad_norm": 0.34335574918053036, "learning_rate": 8.445609051655497e-06, "loss": 0.0322, "step": 1051 }, { "epoch": 0.9976292081555239, "grad_norm": 0.5324023086103392, "learning_rate": 8.441606000865978e-06, "loss": 0.0465, "step": 1052 }, { "epoch": 0.9985775248933144, "grad_norm": 0.3971741987281817, "learning_rate": 8.437598753488693e-06, "loss": 0.0316, "step": 1053 }, { "epoch": 0.9995258416311048, "grad_norm": 0.4702644191912913, "learning_rate": 8.43358731440996e-06, "loss": 0.0424, "step": 1054 }, { "epoch": 0.9995258416311048, "eval_loss": 0.05579984560608864, "eval_runtime": 205.6016, "eval_samples_per_second": 34.547, "eval_steps_per_second": 1.08, "step": 1054 }, { "epoch": 1.0004741583688952, "grad_norm": 0.4866459767505418, "learning_rate": 8.429571688521196e-06, "loss": 0.0508, "step": 1055 }, { "epoch": 1.0014224751066856, "grad_norm": 0.43980501175306935, "learning_rate": 8.425551880718938e-06, "loss": 0.0348, "step": 1056 }, { "epoch": 1.002370791844476, "grad_norm": 0.5671234502453784, "learning_rate": 8.421527895904811e-06, "loss": 0.0407, "step": 1057 }, { "epoch": 1.0033191085822666, "grad_norm": 0.47452207722793455, "learning_rate": 8.417499738985539e-06, "loss": 0.0481, "step": 1058 }, { "epoch": 1.004267425320057, "grad_norm": 0.5013037212797574, "learning_rate": 8.413467414872934e-06, "loss": 0.0433, "step": 1059 }, { "epoch": 1.0052157420578474, "grad_norm": 0.6889982998548566, "learning_rate": 8.409430928483881e-06, "loss": 0.0465, "step": 1060 }, { "epoch": 1.0061640587956378, "grad_norm": 0.6860518531915715, "learning_rate": 8.40539028474035e-06, "loss": 0.0463, "step": 1061 }, { "epoch": 1.0071123755334281, "grad_norm": 0.883560729451268, "learning_rate": 8.40134548856938e-06, "loss": 0.0503, "step": 1062 }, { "epoch": 1.0080606922712185, "grad_norm": 0.535065661745393, "learning_rate": 8.397296544903067e-06, "loss": 0.0516, "step": 1063 }, { "epoch": 1.009009009009009, "grad_norm": 0.5038131455825761, "learning_rate": 8.393243458678565e-06, "loss": 0.0446, "step": 1064 }, { "epoch": 1.0099573257467995, "grad_norm": 0.5572240876317798, "learning_rate": 8.389186234838086e-06, "loss": 0.0501, "step": 1065 }, { "epoch": 1.01090564248459, "grad_norm": 0.5298425351016989, "learning_rate": 8.385124878328881e-06, "loss": 0.049, "step": 1066 }, { "epoch": 1.0118539592223803, "grad_norm": 0.6691906659193966, "learning_rate": 8.381059394103244e-06, "loss": 0.054, "step": 1067 }, { "epoch": 1.0128022759601707, "grad_norm": 0.43516390524601356, "learning_rate": 8.376989787118498e-06, "loss": 0.0409, "step": 1068 }, { "epoch": 1.013750592697961, "grad_norm": 0.5385522154452346, "learning_rate": 8.372916062336998e-06, "loss": 0.0424, "step": 1069 }, { "epoch": 1.0146989094357515, "grad_norm": 0.48362376282426833, "learning_rate": 8.368838224726117e-06, "loss": 0.0555, "step": 1070 }, { "epoch": 1.0156472261735419, "grad_norm": 0.41768940684559036, "learning_rate": 8.364756279258245e-06, "loss": 0.0379, "step": 1071 }, { "epoch": 1.0165955429113325, "grad_norm": 0.5803726520999999, "learning_rate": 8.360670230910777e-06, "loss": 0.0434, "step": 1072 }, { "epoch": 1.0175438596491229, "grad_norm": 0.4357479407163672, "learning_rate": 8.356580084666114e-06, "loss": 0.0411, "step": 1073 }, { "epoch": 1.0184921763869133, "grad_norm": 0.5709121778249012, "learning_rate": 8.352485845511658e-06, "loss": 0.0409, "step": 1074 }, { "epoch": 1.0194404931247036, "grad_norm": 0.5523633969751309, "learning_rate": 8.348387518439794e-06, "loss": 0.0489, "step": 1075 }, { "epoch": 1.020388809862494, "grad_norm": 0.6742232849407267, "learning_rate": 8.344285108447896e-06, "loss": 0.0493, "step": 1076 }, { "epoch": 1.0213371266002844, "grad_norm": 0.3560104483717237, "learning_rate": 8.340178620538316e-06, "loss": 0.0367, "step": 1077 }, { "epoch": 1.0222854433380748, "grad_norm": 0.5324136816558409, "learning_rate": 8.336068059718375e-06, "loss": 0.0468, "step": 1078 }, { "epoch": 1.0232337600758654, "grad_norm": 0.506743337131701, "learning_rate": 8.331953431000368e-06, "loss": 0.0712, "step": 1079 }, { "epoch": 1.0241820768136558, "grad_norm": 0.415891887112626, "learning_rate": 8.327834739401543e-06, "loss": 0.0431, "step": 1080 }, { "epoch": 1.0251303935514462, "grad_norm": 0.62544359839528, "learning_rate": 8.323711989944107e-06, "loss": 0.0418, "step": 1081 }, { "epoch": 1.0260787102892366, "grad_norm": 0.5632716938219344, "learning_rate": 8.319585187655211e-06, "loss": 0.0547, "step": 1082 }, { "epoch": 1.027027027027027, "grad_norm": 0.4086615551677634, "learning_rate": 8.315454337566952e-06, "loss": 0.0427, "step": 1083 }, { "epoch": 1.0279753437648174, "grad_norm": 0.5161406978477902, "learning_rate": 8.311319444716358e-06, "loss": 0.06, "step": 1084 }, { "epoch": 1.028923660502608, "grad_norm": 0.604906094268659, "learning_rate": 8.307180514145392e-06, "loss": 0.052, "step": 1085 }, { "epoch": 1.0298719772403984, "grad_norm": 0.5968839553881752, "learning_rate": 8.303037550900935e-06, "loss": 0.0529, "step": 1086 }, { "epoch": 1.0308202939781888, "grad_norm": 0.5598009215343084, "learning_rate": 8.298890560034792e-06, "loss": 0.0397, "step": 1087 }, { "epoch": 1.0317686107159791, "grad_norm": 0.6572448390948811, "learning_rate": 8.29473954660367e-06, "loss": 0.0678, "step": 1088 }, { "epoch": 1.0327169274537695, "grad_norm": 0.6576638795258621, "learning_rate": 8.29058451566919e-06, "loss": 0.0598, "step": 1089 }, { "epoch": 1.03366524419156, "grad_norm": 0.7400846520221458, "learning_rate": 8.286425472297868e-06, "loss": 0.0415, "step": 1090 }, { "epoch": 1.0346135609293503, "grad_norm": 0.5570271824791166, "learning_rate": 8.282262421561111e-06, "loss": 0.0398, "step": 1091 }, { "epoch": 1.035561877667141, "grad_norm": 0.46875414138844956, "learning_rate": 8.278095368535215e-06, "loss": 0.0494, "step": 1092 }, { "epoch": 1.0365101944049313, "grad_norm": 0.5969883635346438, "learning_rate": 8.273924318301354e-06, "loss": 0.0747, "step": 1093 }, { "epoch": 1.0374585111427217, "grad_norm": 0.4752850264908405, "learning_rate": 8.26974927594558e-06, "loss": 0.0425, "step": 1094 }, { "epoch": 1.038406827880512, "grad_norm": 0.44930878997000867, "learning_rate": 8.26557024655881e-06, "loss": 0.0433, "step": 1095 }, { "epoch": 1.0393551446183025, "grad_norm": 0.6013056895336477, "learning_rate": 8.261387235236821e-06, "loss": 0.0329, "step": 1096 }, { "epoch": 1.0403034613560929, "grad_norm": 0.6657503155051709, "learning_rate": 8.257200247080249e-06, "loss": 0.0601, "step": 1097 }, { "epoch": 1.0412517780938833, "grad_norm": 0.6832978810435474, "learning_rate": 8.253009287194576e-06, "loss": 0.0466, "step": 1098 }, { "epoch": 1.0422000948316739, "grad_norm": 0.6455854633128209, "learning_rate": 8.248814360690129e-06, "loss": 0.0399, "step": 1099 }, { "epoch": 1.0431484115694643, "grad_norm": 0.4721829163859669, "learning_rate": 8.244615472682074e-06, "loss": 0.0442, "step": 1100 }, { "epoch": 1.0440967283072546, "grad_norm": 0.6488059712530941, "learning_rate": 8.240412628290405e-06, "loss": 0.0553, "step": 1101 }, { "epoch": 1.045045045045045, "grad_norm": 0.4881875750545383, "learning_rate": 8.236205832639936e-06, "loss": 0.0394, "step": 1102 }, { "epoch": 1.0459933617828354, "grad_norm": 0.5543410049132131, "learning_rate": 8.231995090860306e-06, "loss": 0.0496, "step": 1103 }, { "epoch": 1.0469416785206258, "grad_norm": 0.4693748617137878, "learning_rate": 8.227780408085964e-06, "loss": 0.0455, "step": 1104 }, { "epoch": 1.0478899952584164, "grad_norm": 0.43672702099083865, "learning_rate": 8.22356178945616e-06, "loss": 0.033, "step": 1105 }, { "epoch": 1.0488383119962068, "grad_norm": 0.53698914158417, "learning_rate": 8.219339240114955e-06, "loss": 0.0428, "step": 1106 }, { "epoch": 1.0497866287339972, "grad_norm": 0.4824835633653137, "learning_rate": 8.215112765211186e-06, "loss": 0.0454, "step": 1107 }, { "epoch": 1.0507349454717876, "grad_norm": 0.3929806462421986, "learning_rate": 8.210882369898487e-06, "loss": 0.0331, "step": 1108 }, { "epoch": 1.051683262209578, "grad_norm": 0.4717562604419889, "learning_rate": 8.206648059335276e-06, "loss": 0.0371, "step": 1109 }, { "epoch": 1.0526315789473684, "grad_norm": 0.48523700167872613, "learning_rate": 8.202409838684737e-06, "loss": 0.044, "step": 1110 }, { "epoch": 1.0535798956851588, "grad_norm": 0.5711347825035666, "learning_rate": 8.198167713114824e-06, "loss": 0.0482, "step": 1111 }, { "epoch": 1.0545282124229494, "grad_norm": 0.6497519216289137, "learning_rate": 8.193921687798257e-06, "loss": 0.0583, "step": 1112 }, { "epoch": 1.0554765291607398, "grad_norm": 0.41025741095361523, "learning_rate": 8.189671767912502e-06, "loss": 0.0391, "step": 1113 }, { "epoch": 1.0564248458985301, "grad_norm": 0.42565607076822326, "learning_rate": 8.185417958639787e-06, "loss": 0.0511, "step": 1114 }, { "epoch": 1.0573731626363205, "grad_norm": 0.6066779731441627, "learning_rate": 8.18116026516707e-06, "loss": 0.0425, "step": 1115 }, { "epoch": 1.058321479374111, "grad_norm": 0.614194232019259, "learning_rate": 8.17689869268605e-06, "loss": 0.0399, "step": 1116 }, { "epoch": 1.0592697961119013, "grad_norm": 0.654835728000199, "learning_rate": 8.17263324639316e-06, "loss": 0.0509, "step": 1117 }, { "epoch": 1.0602181128496917, "grad_norm": 2.390822654668922, "learning_rate": 8.168363931489554e-06, "loss": 0.0538, "step": 1118 }, { "epoch": 1.0611664295874823, "grad_norm": 0.5309786733261611, "learning_rate": 8.164090753181097e-06, "loss": 0.0438, "step": 1119 }, { "epoch": 1.0621147463252727, "grad_norm": 0.5765552296723528, "learning_rate": 8.159813716678375e-06, "loss": 0.0573, "step": 1120 }, { "epoch": 1.063063063063063, "grad_norm": 0.702317664551232, "learning_rate": 8.15553282719667e-06, "loss": 0.0498, "step": 1121 }, { "epoch": 1.0640113798008535, "grad_norm": 0.7582330479572109, "learning_rate": 8.15124808995597e-06, "loss": 0.0686, "step": 1122 }, { "epoch": 1.0649596965386439, "grad_norm": 2.8142295533632353, "learning_rate": 8.146959510180947e-06, "loss": 0.0448, "step": 1123 }, { "epoch": 1.0659080132764343, "grad_norm": 0.8554647159912601, "learning_rate": 8.142667093100966e-06, "loss": 0.0636, "step": 1124 }, { "epoch": 1.0668563300142249, "grad_norm": 0.4583216098085599, "learning_rate": 8.138370843950067e-06, "loss": 0.0407, "step": 1125 }, { "epoch": 1.0678046467520153, "grad_norm": 0.5879158711527235, "learning_rate": 8.13407076796696e-06, "loss": 0.0779, "step": 1126 }, { "epoch": 1.0687529634898056, "grad_norm": 0.39347630183739596, "learning_rate": 8.129766870395027e-06, "loss": 0.0364, "step": 1127 }, { "epoch": 1.069701280227596, "grad_norm": 0.42771084872916015, "learning_rate": 8.125459156482307e-06, "loss": 0.0413, "step": 1128 }, { "epoch": 1.0706495969653864, "grad_norm": 0.5517158326408007, "learning_rate": 8.121147631481491e-06, "loss": 0.037, "step": 1129 }, { "epoch": 1.0715979137031768, "grad_norm": 0.49889308342240923, "learning_rate": 8.116832300649924e-06, "loss": 0.0464, "step": 1130 }, { "epoch": 1.0725462304409672, "grad_norm": 0.5993137897003766, "learning_rate": 8.112513169249583e-06, "loss": 0.0609, "step": 1131 }, { "epoch": 1.0734945471787578, "grad_norm": 0.543205478981709, "learning_rate": 8.108190242547082e-06, "loss": 0.0469, "step": 1132 }, { "epoch": 1.0744428639165482, "grad_norm": 0.7091841906617682, "learning_rate": 8.103863525813667e-06, "loss": 0.0414, "step": 1133 }, { "epoch": 1.0753911806543386, "grad_norm": 0.4370481397065285, "learning_rate": 8.0995330243252e-06, "loss": 0.0458, "step": 1134 }, { "epoch": 1.076339497392129, "grad_norm": 0.4502863591933898, "learning_rate": 8.095198743362162e-06, "loss": 0.0473, "step": 1135 }, { "epoch": 1.0772878141299194, "grad_norm": 0.5013748238976868, "learning_rate": 8.090860688209641e-06, "loss": 0.0466, "step": 1136 }, { "epoch": 1.0782361308677098, "grad_norm": 0.6748865559678139, "learning_rate": 8.086518864157325e-06, "loss": 0.0553, "step": 1137 }, { "epoch": 1.0791844476055001, "grad_norm": 0.6409644237971479, "learning_rate": 8.0821732764995e-06, "loss": 0.049, "step": 1138 }, { "epoch": 1.0801327643432908, "grad_norm": 0.6146233989253884, "learning_rate": 8.077823930535045e-06, "loss": 0.0472, "step": 1139 }, { "epoch": 1.0810810810810811, "grad_norm": 0.4066821137097016, "learning_rate": 8.073470831567414e-06, "loss": 0.0326, "step": 1140 }, { "epoch": 1.0820293978188715, "grad_norm": 0.8437034804113782, "learning_rate": 8.069113984904642e-06, "loss": 0.054, "step": 1141 }, { "epoch": 1.082977714556662, "grad_norm": 0.5161472652823138, "learning_rate": 8.064753395859333e-06, "loss": 0.0436, "step": 1142 }, { "epoch": 1.0839260312944523, "grad_norm": 0.48759533161402335, "learning_rate": 8.060389069748653e-06, "loss": 0.0462, "step": 1143 }, { "epoch": 1.0848743480322427, "grad_norm": 0.4172865968707472, "learning_rate": 8.05602101189433e-06, "loss": 0.0376, "step": 1144 }, { "epoch": 1.085822664770033, "grad_norm": 0.6834848316978862, "learning_rate": 8.051649227622634e-06, "loss": 0.0474, "step": 1145 }, { "epoch": 1.0867709815078237, "grad_norm": 0.6135237968103057, "learning_rate": 8.047273722264384e-06, "loss": 0.0377, "step": 1146 }, { "epoch": 1.087719298245614, "grad_norm": 0.5735481827911786, "learning_rate": 8.042894501154937e-06, "loss": 0.0577, "step": 1147 }, { "epoch": 1.0886676149834045, "grad_norm": 0.5126797458774718, "learning_rate": 8.03851156963418e-06, "loss": 0.0342, "step": 1148 }, { "epoch": 1.0896159317211949, "grad_norm": 0.5394680925590052, "learning_rate": 8.034124933046523e-06, "loss": 0.0427, "step": 1149 }, { "epoch": 1.0905642484589853, "grad_norm": 1.0619045413666424, "learning_rate": 8.029734596740895e-06, "loss": 0.0347, "step": 1150 }, { "epoch": 1.0915125651967756, "grad_norm": 0.5219216100491317, "learning_rate": 8.025340566070737e-06, "loss": 0.0392, "step": 1151 }, { "epoch": 1.0924608819345663, "grad_norm": 0.47125200466301426, "learning_rate": 8.020942846393993e-06, "loss": 0.049, "step": 1152 }, { "epoch": 1.0934091986723566, "grad_norm": 0.4310790509560298, "learning_rate": 8.016541443073105e-06, "loss": 0.038, "step": 1153 }, { "epoch": 1.094357515410147, "grad_norm": 0.7534317067913003, "learning_rate": 8.01213636147501e-06, "loss": 0.0536, "step": 1154 }, { "epoch": 1.0953058321479374, "grad_norm": 0.6207928110119649, "learning_rate": 8.007727606971127e-06, "loss": 0.0434, "step": 1155 }, { "epoch": 1.0962541488857278, "grad_norm": 0.57563137722865, "learning_rate": 8.003315184937355e-06, "loss": 0.0364, "step": 1156 }, { "epoch": 1.0972024656235182, "grad_norm": 0.5509639511135459, "learning_rate": 7.998899100754065e-06, "loss": 0.0517, "step": 1157 }, { "epoch": 1.0981507823613086, "grad_norm": 0.4267295425494574, "learning_rate": 7.994479359806091e-06, "loss": 0.0405, "step": 1158 }, { "epoch": 1.0990990990990992, "grad_norm": 0.49807239389073216, "learning_rate": 7.990055967482733e-06, "loss": 0.0352, "step": 1159 }, { "epoch": 1.1000474158368896, "grad_norm": 0.7895890014997967, "learning_rate": 7.985628929177733e-06, "loss": 0.0373, "step": 1160 }, { "epoch": 1.10099573257468, "grad_norm": 0.9249656986473295, "learning_rate": 7.98119825028929e-06, "loss": 0.0457, "step": 1161 }, { "epoch": 1.1019440493124704, "grad_norm": 0.627936744238416, "learning_rate": 7.976763936220031e-06, "loss": 0.047, "step": 1162 }, { "epoch": 1.1028923660502608, "grad_norm": 0.6238528107028002, "learning_rate": 7.972325992377026e-06, "loss": 0.0488, "step": 1163 }, { "epoch": 1.1038406827880511, "grad_norm": 0.4657651602214921, "learning_rate": 7.967884424171764e-06, "loss": 0.0327, "step": 1164 }, { "epoch": 1.1047889995258415, "grad_norm": 0.5850593930568389, "learning_rate": 7.963439237020157e-06, "loss": 0.0521, "step": 1165 }, { "epoch": 1.1057373162636321, "grad_norm": 0.7614458412566218, "learning_rate": 7.958990436342525e-06, "loss": 0.0708, "step": 1166 }, { "epoch": 1.1066856330014225, "grad_norm": 0.5375594263604339, "learning_rate": 7.954538027563601e-06, "loss": 0.0427, "step": 1167 }, { "epoch": 1.107633949739213, "grad_norm": 0.6028510385361613, "learning_rate": 7.950082016112514e-06, "loss": 0.0461, "step": 1168 }, { "epoch": 1.1085822664770033, "grad_norm": 0.7339781460019883, "learning_rate": 7.945622407422787e-06, "loss": 0.0449, "step": 1169 }, { "epoch": 1.1095305832147937, "grad_norm": 0.5248693288710427, "learning_rate": 7.941159206932327e-06, "loss": 0.0451, "step": 1170 }, { "epoch": 1.110478899952584, "grad_norm": 0.500024623512, "learning_rate": 7.936692420083423e-06, "loss": 0.0438, "step": 1171 }, { "epoch": 1.1114272166903745, "grad_norm": 0.46620520422481104, "learning_rate": 7.932222052322736e-06, "loss": 0.0469, "step": 1172 }, { "epoch": 1.112375533428165, "grad_norm": 0.7140775122015546, "learning_rate": 7.927748109101296e-06, "loss": 0.0492, "step": 1173 }, { "epoch": 1.1133238501659555, "grad_norm": 0.5204616685358204, "learning_rate": 7.923270595874489e-06, "loss": 0.0516, "step": 1174 }, { "epoch": 1.1142721669037459, "grad_norm": 0.41118460218118136, "learning_rate": 7.918789518102057e-06, "loss": 0.0374, "step": 1175 }, { "epoch": 1.1152204836415363, "grad_norm": 0.5459512894558529, "learning_rate": 7.914304881248085e-06, "loss": 0.0506, "step": 1176 }, { "epoch": 1.1161688003793266, "grad_norm": 0.5574376078006338, "learning_rate": 7.909816690781005e-06, "loss": 0.0515, "step": 1177 }, { "epoch": 1.117117117117117, "grad_norm": 0.2933208036522205, "learning_rate": 7.905324952173573e-06, "loss": 0.0345, "step": 1178 }, { "epoch": 1.1180654338549076, "grad_norm": 0.4993016043563598, "learning_rate": 7.900829670902876e-06, "loss": 0.0487, "step": 1179 }, { "epoch": 1.119013750592698, "grad_norm": 0.4981848296785552, "learning_rate": 7.896330852450324e-06, "loss": 0.049, "step": 1180 }, { "epoch": 1.1199620673304884, "grad_norm": 0.5492037552465713, "learning_rate": 7.891828502301637e-06, "loss": 0.0626, "step": 1181 }, { "epoch": 1.1209103840682788, "grad_norm": 0.5945243855558169, "learning_rate": 7.887322625946836e-06, "loss": 0.0566, "step": 1182 }, { "epoch": 1.1218587008060692, "grad_norm": 0.5008044550443231, "learning_rate": 7.882813228880253e-06, "loss": 0.0365, "step": 1183 }, { "epoch": 1.1228070175438596, "grad_norm": 0.4974033363856913, "learning_rate": 7.878300316600504e-06, "loss": 0.0508, "step": 1184 }, { "epoch": 1.12375533428165, "grad_norm": 1.1163525573958883, "learning_rate": 7.873783894610496e-06, "loss": 0.0591, "step": 1185 }, { "epoch": 1.1247036510194406, "grad_norm": 0.529056835529291, "learning_rate": 7.869263968417412e-06, "loss": 0.041, "step": 1186 }, { "epoch": 1.125651967757231, "grad_norm": 0.39561323424421474, "learning_rate": 7.864740543532711e-06, "loss": 0.043, "step": 1187 }, { "epoch": 1.1266002844950214, "grad_norm": 0.5719661025649166, "learning_rate": 7.860213625472119e-06, "loss": 0.0394, "step": 1188 }, { "epoch": 1.1275486012328118, "grad_norm": 0.683758567602495, "learning_rate": 7.855683219755617e-06, "loss": 0.0596, "step": 1189 }, { "epoch": 1.1284969179706021, "grad_norm": 0.43615448430346, "learning_rate": 7.851149331907442e-06, "loss": 0.0458, "step": 1190 }, { "epoch": 1.1294452347083925, "grad_norm": 0.5455285097679818, "learning_rate": 7.84661196745608e-06, "loss": 0.0458, "step": 1191 }, { "epoch": 1.1303935514461831, "grad_norm": 0.46823267018975095, "learning_rate": 7.842071131934246e-06, "loss": 0.0395, "step": 1192 }, { "epoch": 1.1313418681839735, "grad_norm": 0.6410339405297693, "learning_rate": 7.837526830878901e-06, "loss": 0.0528, "step": 1193 }, { "epoch": 1.132290184921764, "grad_norm": 0.6329347246677137, "learning_rate": 7.83297906983122e-06, "loss": 0.0694, "step": 1194 }, { "epoch": 1.1332385016595543, "grad_norm": 1.1861963543383596, "learning_rate": 7.828427854336604e-06, "loss": 0.0748, "step": 1195 }, { "epoch": 1.1341868183973447, "grad_norm": 0.4102764477580429, "learning_rate": 7.823873189944664e-06, "loss": 0.0393, "step": 1196 }, { "epoch": 1.135135135135135, "grad_norm": 0.4452453730186256, "learning_rate": 7.819315082209217e-06, "loss": 0.0354, "step": 1197 }, { "epoch": 1.1360834518729255, "grad_norm": 0.45033548160883075, "learning_rate": 7.814753536688278e-06, "loss": 0.0366, "step": 1198 }, { "epoch": 1.1370317686107159, "grad_norm": 0.5479341929657664, "learning_rate": 7.810188558944054e-06, "loss": 0.0606, "step": 1199 }, { "epoch": 1.1379800853485065, "grad_norm": 0.5550674838469363, "learning_rate": 7.805620154542938e-06, "loss": 0.0406, "step": 1200 }, { "epoch": 1.1389284020862969, "grad_norm": 0.6478804122717543, "learning_rate": 7.801048329055502e-06, "loss": 0.0644, "step": 1201 }, { "epoch": 1.1398767188240873, "grad_norm": 0.40041584333311264, "learning_rate": 7.796473088056487e-06, "loss": 0.0378, "step": 1202 }, { "epoch": 1.1408250355618776, "grad_norm": 0.4223598793738602, "learning_rate": 7.7918944371248e-06, "loss": 0.039, "step": 1203 }, { "epoch": 1.141773352299668, "grad_norm": 0.4961808103718134, "learning_rate": 7.787312381843503e-06, "loss": 0.042, "step": 1204 }, { "epoch": 1.1427216690374584, "grad_norm": 0.4738964503477037, "learning_rate": 7.78272692779982e-06, "loss": 0.036, "step": 1205 }, { "epoch": 1.143669985775249, "grad_norm": 0.48670332456917464, "learning_rate": 7.778138080585107e-06, "loss": 0.0298, "step": 1206 }, { "epoch": 1.1446183025130394, "grad_norm": 0.5855118813585286, "learning_rate": 7.77354584579486e-06, "loss": 0.0421, "step": 1207 }, { "epoch": 1.1455666192508298, "grad_norm": 0.4306606042859337, "learning_rate": 7.768950229028713e-06, "loss": 0.0417, "step": 1208 }, { "epoch": 1.1465149359886202, "grad_norm": 0.4725287807287063, "learning_rate": 7.764351235890416e-06, "loss": 0.0386, "step": 1209 }, { "epoch": 1.1474632527264106, "grad_norm": 0.6431359997140053, "learning_rate": 7.759748871987838e-06, "loss": 0.0759, "step": 1210 }, { "epoch": 1.148411569464201, "grad_norm": 0.42470773245514043, "learning_rate": 7.75514314293296e-06, "loss": 0.0393, "step": 1211 }, { "epoch": 1.1493598862019914, "grad_norm": 0.4346642115827222, "learning_rate": 7.750534054341866e-06, "loss": 0.0381, "step": 1212 }, { "epoch": 1.150308202939782, "grad_norm": 0.5312065161872603, "learning_rate": 7.745921611834734e-06, "loss": 0.0397, "step": 1213 }, { "epoch": 1.1512565196775724, "grad_norm": 0.5385091421622926, "learning_rate": 7.741305821035836e-06, "loss": 0.0582, "step": 1214 }, { "epoch": 1.1522048364153628, "grad_norm": 0.4778078646025971, "learning_rate": 7.736686687573523e-06, "loss": 0.0437, "step": 1215 }, { "epoch": 1.1531531531531531, "grad_norm": 0.709371140831048, "learning_rate": 7.732064217080224e-06, "loss": 0.0492, "step": 1216 }, { "epoch": 1.1541014698909435, "grad_norm": 0.4183114831368168, "learning_rate": 7.727438415192434e-06, "loss": 0.0393, "step": 1217 }, { "epoch": 1.155049786628734, "grad_norm": 0.541221858338076, "learning_rate": 7.722809287550716e-06, "loss": 0.0459, "step": 1218 }, { "epoch": 1.1559981033665245, "grad_norm": 0.4067212457533803, "learning_rate": 7.718176839799682e-06, "loss": 0.0362, "step": 1219 }, { "epoch": 1.156946420104315, "grad_norm": 0.4383704109007449, "learning_rate": 7.713541077587996e-06, "loss": 0.0339, "step": 1220 }, { "epoch": 1.1578947368421053, "grad_norm": 0.4650740040984162, "learning_rate": 7.708902006568365e-06, "loss": 0.036, "step": 1221 }, { "epoch": 1.1588430535798957, "grad_norm": 0.3918848259293643, "learning_rate": 7.704259632397525e-06, "loss": 0.0307, "step": 1222 }, { "epoch": 1.159791370317686, "grad_norm": 0.49823361822736345, "learning_rate": 7.699613960736247e-06, "loss": 0.0474, "step": 1223 }, { "epoch": 1.1607396870554765, "grad_norm": 0.4868259723121106, "learning_rate": 7.694964997249317e-06, "loss": 0.0463, "step": 1224 }, { "epoch": 1.1616880037932669, "grad_norm": 0.48069332380581936, "learning_rate": 7.690312747605536e-06, "loss": 0.0426, "step": 1225 }, { "epoch": 1.1626363205310573, "grad_norm": 0.491998777057983, "learning_rate": 7.685657217477716e-06, "loss": 0.0389, "step": 1226 }, { "epoch": 1.1635846372688479, "grad_norm": 0.35813001299038266, "learning_rate": 7.680998412542665e-06, "loss": 0.0331, "step": 1227 }, { "epoch": 1.1645329540066383, "grad_norm": 0.5968215449565825, "learning_rate": 7.676336338481182e-06, "loss": 0.0406, "step": 1228 }, { "epoch": 1.1654812707444286, "grad_norm": 0.5559719660043877, "learning_rate": 7.671671000978063e-06, "loss": 0.0712, "step": 1229 }, { "epoch": 1.166429587482219, "grad_norm": 0.42335427343139226, "learning_rate": 7.66700240572207e-06, "loss": 0.0376, "step": 1230 }, { "epoch": 1.1673779042200094, "grad_norm": 0.49655937772975667, "learning_rate": 7.662330558405943e-06, "loss": 0.0414, "step": 1231 }, { "epoch": 1.1683262209577998, "grad_norm": 0.3994453444748161, "learning_rate": 7.657655464726395e-06, "loss": 0.0404, "step": 1232 }, { "epoch": 1.1692745376955904, "grad_norm": 0.41285430890496555, "learning_rate": 7.652977130384083e-06, "loss": 0.0486, "step": 1233 }, { "epoch": 1.1702228544333808, "grad_norm": 0.6071610144149513, "learning_rate": 7.648295561083627e-06, "loss": 0.0633, "step": 1234 }, { "epoch": 1.1711711711711712, "grad_norm": 0.5259863664992493, "learning_rate": 7.643610762533584e-06, "loss": 0.0585, "step": 1235 }, { "epoch": 1.1721194879089616, "grad_norm": 0.3946045435612188, "learning_rate": 7.638922740446457e-06, "loss": 0.0355, "step": 1236 }, { "epoch": 1.173067804646752, "grad_norm": 0.3568622458812664, "learning_rate": 7.63423150053867e-06, "loss": 0.0295, "step": 1237 }, { "epoch": 1.1740161213845424, "grad_norm": 0.5469312105534027, "learning_rate": 7.629537048530583e-06, "loss": 0.0403, "step": 1238 }, { "epoch": 1.1749644381223328, "grad_norm": 0.5387702508086157, "learning_rate": 7.6248393901464564e-06, "loss": 0.0483, "step": 1239 }, { "epoch": 1.1759127548601234, "grad_norm": 0.5406781757294887, "learning_rate": 7.620138531114476e-06, "loss": 0.0419, "step": 1240 }, { "epoch": 1.1768610715979138, "grad_norm": 0.48066173819586494, "learning_rate": 7.61543447716672e-06, "loss": 0.0471, "step": 1241 }, { "epoch": 1.1778093883357041, "grad_norm": 0.5132766758180038, "learning_rate": 7.610727234039168e-06, "loss": 0.0373, "step": 1242 }, { "epoch": 1.1787577050734945, "grad_norm": 0.4440200211246783, "learning_rate": 7.606016807471686e-06, "loss": 0.0303, "step": 1243 }, { "epoch": 1.179706021811285, "grad_norm": 0.5120784198135685, "learning_rate": 7.601303203208021e-06, "loss": 0.0415, "step": 1244 }, { "epoch": 1.1806543385490753, "grad_norm": 0.5119968299555113, "learning_rate": 7.596586426995798e-06, "loss": 0.033, "step": 1245 }, { "epoch": 1.181602655286866, "grad_norm": 0.649285479277398, "learning_rate": 7.591866484586505e-06, "loss": 0.0412, "step": 1246 }, { "epoch": 1.1825509720246563, "grad_norm": 0.8899853131901044, "learning_rate": 7.587143381735498e-06, "loss": 0.0495, "step": 1247 }, { "epoch": 1.1834992887624467, "grad_norm": 0.6010967444971024, "learning_rate": 7.5824171242019796e-06, "loss": 0.0412, "step": 1248 }, { "epoch": 1.184447605500237, "grad_norm": 0.5257812163804602, "learning_rate": 7.5776877177490004e-06, "loss": 0.041, "step": 1249 }, { "epoch": 1.1853959222380275, "grad_norm": 0.5059417156750543, "learning_rate": 7.572955168143456e-06, "loss": 0.0564, "step": 1250 }, { "epoch": 1.1863442389758179, "grad_norm": 0.5287163894503856, "learning_rate": 7.568219481156067e-06, "loss": 0.0443, "step": 1251 }, { "epoch": 1.1872925557136083, "grad_norm": 0.5764185328336728, "learning_rate": 7.563480662561386e-06, "loss": 0.0408, "step": 1252 }, { "epoch": 1.1882408724513986, "grad_norm": 0.4606910371892103, "learning_rate": 7.55873871813778e-06, "loss": 0.0486, "step": 1253 }, { "epoch": 1.1891891891891893, "grad_norm": 0.4307521432754428, "learning_rate": 7.553993653667432e-06, "loss": 0.0311, "step": 1254 }, { "epoch": 1.1901375059269796, "grad_norm": 0.9134386020304202, "learning_rate": 7.549245474936324e-06, "loss": 0.0461, "step": 1255 }, { "epoch": 1.19108582266477, "grad_norm": 0.4558303355166927, "learning_rate": 7.544494187734237e-06, "loss": 0.0499, "step": 1256 }, { "epoch": 1.1920341394025604, "grad_norm": 0.3989843552269527, "learning_rate": 7.539739797854746e-06, "loss": 0.0349, "step": 1257 }, { "epoch": 1.1929824561403508, "grad_norm": 0.3906291841756068, "learning_rate": 7.5349823110952046e-06, "loss": 0.0281, "step": 1258 }, { "epoch": 1.1939307728781414, "grad_norm": 0.41315135278835374, "learning_rate": 7.530221733256749e-06, "loss": 0.0436, "step": 1259 }, { "epoch": 1.1948790896159318, "grad_norm": 1.0487429774583699, "learning_rate": 7.525458070144276e-06, "loss": 0.0456, "step": 1260 }, { "epoch": 1.1958274063537222, "grad_norm": 0.7492477244429876, "learning_rate": 7.520691327566449e-06, "loss": 0.061, "step": 1261 }, { "epoch": 1.1967757230915126, "grad_norm": 0.6635915757614768, "learning_rate": 7.515921511335689e-06, "loss": 0.0686, "step": 1262 }, { "epoch": 1.197724039829303, "grad_norm": 0.5712864519563702, "learning_rate": 7.511148627268161e-06, "loss": 0.0487, "step": 1263 }, { "epoch": 1.1986723565670934, "grad_norm": 0.603343671959781, "learning_rate": 7.5063726811837716e-06, "loss": 0.052, "step": 1264 }, { "epoch": 1.1996206733048838, "grad_norm": 0.4406684429170475, "learning_rate": 7.501593678906161e-06, "loss": 0.0414, "step": 1265 }, { "epoch": 1.2005689900426741, "grad_norm": 0.6442697808186765, "learning_rate": 7.496811626262699e-06, "loss": 0.0423, "step": 1266 }, { "epoch": 1.2015173067804648, "grad_norm": 0.5559558578993309, "learning_rate": 7.492026529084468e-06, "loss": 0.0402, "step": 1267 }, { "epoch": 1.2024656235182551, "grad_norm": 0.5454144102950605, "learning_rate": 7.487238393206271e-06, "loss": 0.0367, "step": 1268 }, { "epoch": 1.2034139402560455, "grad_norm": 0.5693008724632117, "learning_rate": 7.4824472244666134e-06, "loss": 0.043, "step": 1269 }, { "epoch": 1.204362256993836, "grad_norm": 0.6776730947602304, "learning_rate": 7.477653028707694e-06, "loss": 0.0455, "step": 1270 }, { "epoch": 1.2053105737316263, "grad_norm": 0.5055727203867033, "learning_rate": 7.472855811775411e-06, "loss": 0.0398, "step": 1271 }, { "epoch": 1.2062588904694167, "grad_norm": 0.6863655340816167, "learning_rate": 7.468055579519338e-06, "loss": 0.0338, "step": 1272 }, { "epoch": 1.2072072072072073, "grad_norm": 0.5081101945117912, "learning_rate": 7.4632523377927335e-06, "loss": 0.0495, "step": 1273 }, { "epoch": 1.2081555239449977, "grad_norm": 0.2936628736656289, "learning_rate": 7.458446092452518e-06, "loss": 0.0271, "step": 1274 }, { "epoch": 1.209103840682788, "grad_norm": 0.36318292005655967, "learning_rate": 7.453636849359281e-06, "loss": 0.0291, "step": 1275 }, { "epoch": 1.2100521574205785, "grad_norm": 0.6126900566455881, "learning_rate": 7.448824614377264e-06, "loss": 0.0486, "step": 1276 }, { "epoch": 1.2110004741583689, "grad_norm": 0.5964527538723606, "learning_rate": 7.444009393374356e-06, "loss": 0.0459, "step": 1277 }, { "epoch": 1.2119487908961593, "grad_norm": 0.473196708998212, "learning_rate": 7.43919119222209e-06, "loss": 0.0323, "step": 1278 }, { "epoch": 1.2128971076339496, "grad_norm": 0.4144321881388711, "learning_rate": 7.434370016795629e-06, "loss": 0.0393, "step": 1279 }, { "epoch": 1.2138454243717403, "grad_norm": 0.5122361225802895, "learning_rate": 7.429545872973765e-06, "loss": 0.0405, "step": 1280 }, { "epoch": 1.2147937411095306, "grad_norm": 0.5224599737121133, "learning_rate": 7.424718766638915e-06, "loss": 0.0468, "step": 1281 }, { "epoch": 1.215742057847321, "grad_norm": 0.3874552596536454, "learning_rate": 7.419888703677097e-06, "loss": 0.0352, "step": 1282 }, { "epoch": 1.2166903745851114, "grad_norm": 0.5431714072384897, "learning_rate": 7.415055689977943e-06, "loss": 0.0573, "step": 1283 }, { "epoch": 1.2176386913229018, "grad_norm": 0.6074067354337574, "learning_rate": 7.4102197314346765e-06, "loss": 0.0569, "step": 1284 }, { "epoch": 1.2185870080606922, "grad_norm": 0.5384133252576224, "learning_rate": 7.40538083394412e-06, "loss": 0.0386, "step": 1285 }, { "epoch": 1.2195353247984828, "grad_norm": 0.6817357798912372, "learning_rate": 7.400539003406675e-06, "loss": 0.0514, "step": 1286 }, { "epoch": 1.2204836415362732, "grad_norm": 0.576076706598001, "learning_rate": 7.3956942457263184e-06, "loss": 0.0475, "step": 1287 }, { "epoch": 1.2214319582740636, "grad_norm": 0.3761588683293121, "learning_rate": 7.3908465668105955e-06, "loss": 0.0342, "step": 1288 }, { "epoch": 1.222380275011854, "grad_norm": 0.5622394777117021, "learning_rate": 7.3859959725706185e-06, "loss": 0.0513, "step": 1289 }, { "epoch": 1.2233285917496444, "grad_norm": 0.4516184849695626, "learning_rate": 7.381142468921052e-06, "loss": 0.0388, "step": 1290 }, { "epoch": 1.2242769084874348, "grad_norm": 0.3617313806173479, "learning_rate": 7.376286061780108e-06, "loss": 0.0279, "step": 1291 }, { "epoch": 1.2252252252252251, "grad_norm": 0.6044099547624433, "learning_rate": 7.371426757069538e-06, "loss": 0.0772, "step": 1292 }, { "epoch": 1.2261735419630155, "grad_norm": 0.46194460498572665, "learning_rate": 7.3665645607146266e-06, "loss": 0.0332, "step": 1293 }, { "epoch": 1.2271218587008061, "grad_norm": 0.6412897652910509, "learning_rate": 7.3616994786441865e-06, "loss": 0.0655, "step": 1294 }, { "epoch": 1.2280701754385965, "grad_norm": 0.4834273536592659, "learning_rate": 7.356831516790549e-06, "loss": 0.0424, "step": 1295 }, { "epoch": 1.229018492176387, "grad_norm": 0.4916473080153912, "learning_rate": 7.351960681089555e-06, "loss": 0.0374, "step": 1296 }, { "epoch": 1.2299668089141773, "grad_norm": 0.5082915641168106, "learning_rate": 7.347086977480552e-06, "loss": 0.0545, "step": 1297 }, { "epoch": 1.2309151256519677, "grad_norm": 0.47685406428324695, "learning_rate": 7.3422104119063815e-06, "loss": 0.0647, "step": 1298 }, { "epoch": 1.231863442389758, "grad_norm": 0.40154465693360425, "learning_rate": 7.337330990313377e-06, "loss": 0.0404, "step": 1299 }, { "epoch": 1.2328117591275487, "grad_norm": 0.535708958789697, "learning_rate": 7.332448718651355e-06, "loss": 0.0365, "step": 1300 }, { "epoch": 1.233760075865339, "grad_norm": 0.7669741889240554, "learning_rate": 7.327563602873604e-06, "loss": 0.0563, "step": 1301 }, { "epoch": 1.2347083926031295, "grad_norm": 0.4935695147587443, "learning_rate": 7.322675648936887e-06, "loss": 0.042, "step": 1302 }, { "epoch": 1.2356567093409199, "grad_norm": 0.5649346481989781, "learning_rate": 7.31778486280142e-06, "loss": 0.0596, "step": 1303 }, { "epoch": 1.2366050260787103, "grad_norm": 0.5681237194541117, "learning_rate": 7.312891250430878e-06, "loss": 0.0582, "step": 1304 }, { "epoch": 1.2375533428165006, "grad_norm": 0.3890664055979468, "learning_rate": 7.3079948177923785e-06, "loss": 0.0363, "step": 1305 }, { "epoch": 1.238501659554291, "grad_norm": 0.7279319600890861, "learning_rate": 7.303095570856482e-06, "loss": 0.0403, "step": 1306 }, { "epoch": 1.2394499762920816, "grad_norm": 0.6266894196950298, "learning_rate": 7.298193515597177e-06, "loss": 0.0481, "step": 1307 }, { "epoch": 1.240398293029872, "grad_norm": 0.5020074442444163, "learning_rate": 7.29328865799188e-06, "loss": 0.0325, "step": 1308 }, { "epoch": 1.2413466097676624, "grad_norm": 0.4375223150765817, "learning_rate": 7.288381004021419e-06, "loss": 0.0512, "step": 1309 }, { "epoch": 1.2422949265054528, "grad_norm": 0.38682876731404114, "learning_rate": 7.283470559670037e-06, "loss": 0.0458, "step": 1310 }, { "epoch": 1.2432432432432432, "grad_norm": 0.821002820077065, "learning_rate": 7.278557330925378e-06, "loss": 0.0441, "step": 1311 }, { "epoch": 1.2441915599810336, "grad_norm": 0.36531044461502105, "learning_rate": 7.273641323778482e-06, "loss": 0.0321, "step": 1312 }, { "epoch": 1.2451398767188242, "grad_norm": 0.8754259648216565, "learning_rate": 7.268722544223773e-06, "loss": 0.0551, "step": 1313 }, { "epoch": 1.2460881934566146, "grad_norm": 0.7875132084608685, "learning_rate": 7.263800998259061e-06, "loss": 0.0576, "step": 1314 }, { "epoch": 1.247036510194405, "grad_norm": 0.7147209757326294, "learning_rate": 7.258876691885526e-06, "loss": 0.0596, "step": 1315 }, { "epoch": 1.2479848269321954, "grad_norm": 0.6384875688620932, "learning_rate": 7.253949631107713e-06, "loss": 0.0539, "step": 1316 }, { "epoch": 1.2489331436699858, "grad_norm": 0.4048643826220915, "learning_rate": 7.24901982193353e-06, "loss": 0.0353, "step": 1317 }, { "epoch": 1.2498814604077761, "grad_norm": 0.5060955434368241, "learning_rate": 7.2440872703742314e-06, "loss": 0.0375, "step": 1318 }, { "epoch": 1.2508297771455665, "grad_norm": 0.3576520845581809, "learning_rate": 7.239151982444421e-06, "loss": 0.0311, "step": 1319 }, { "epoch": 1.251778093883357, "grad_norm": 0.6201923777708439, "learning_rate": 7.234213964162033e-06, "loss": 0.0461, "step": 1320 }, { "epoch": 1.2527264106211475, "grad_norm": 0.48992618453543096, "learning_rate": 7.2292732215483316e-06, "loss": 0.0425, "step": 1321 }, { "epoch": 1.253674727358938, "grad_norm": 0.5956731057701343, "learning_rate": 7.2243297606279114e-06, "loss": 0.0525, "step": 1322 }, { "epoch": 1.2546230440967283, "grad_norm": 0.5525693608729175, "learning_rate": 7.219383587428673e-06, "loss": 0.0423, "step": 1323 }, { "epoch": 1.2555713608345187, "grad_norm": 0.4690246094368384, "learning_rate": 7.214434707981825e-06, "loss": 0.0454, "step": 1324 }, { "epoch": 1.256519677572309, "grad_norm": 0.44673878692961855, "learning_rate": 7.209483128321881e-06, "loss": 0.0249, "step": 1325 }, { "epoch": 1.2574679943100997, "grad_norm": 0.43586039667176457, "learning_rate": 7.204528854486641e-06, "loss": 0.0479, "step": 1326 }, { "epoch": 1.25841631104789, "grad_norm": 0.5118218171714337, "learning_rate": 7.199571892517194e-06, "loss": 0.0567, "step": 1327 }, { "epoch": 1.2593646277856805, "grad_norm": 0.5965669555284678, "learning_rate": 7.194612248457907e-06, "loss": 0.044, "step": 1328 }, { "epoch": 1.2603129445234709, "grad_norm": 0.8010455651119536, "learning_rate": 7.189649928356413e-06, "loss": 0.0685, "step": 1329 }, { "epoch": 1.2612612612612613, "grad_norm": 0.4947586841353111, "learning_rate": 7.184684938263617e-06, "loss": 0.048, "step": 1330 }, { "epoch": 1.2622095779990516, "grad_norm": 0.43519037882369493, "learning_rate": 7.179717284233671e-06, "loss": 0.04, "step": 1331 }, { "epoch": 1.263157894736842, "grad_norm": 0.7723051121174525, "learning_rate": 7.174746972323976e-06, "loss": 0.0613, "step": 1332 }, { "epoch": 1.2641062114746324, "grad_norm": 0.670904737479652, "learning_rate": 7.169774008595178e-06, "loss": 0.042, "step": 1333 }, { "epoch": 1.2650545282124228, "grad_norm": 0.43695736935248614, "learning_rate": 7.164798399111159e-06, "loss": 0.0448, "step": 1334 }, { "epoch": 1.2660028449502134, "grad_norm": 0.45307761051009254, "learning_rate": 7.159820149939019e-06, "loss": 0.0448, "step": 1335 }, { "epoch": 1.2669511616880038, "grad_norm": 0.41666909675783154, "learning_rate": 7.154839267149082e-06, "loss": 0.042, "step": 1336 }, { "epoch": 1.2678994784257942, "grad_norm": 0.3919081120060574, "learning_rate": 7.1498557568148795e-06, "loss": 0.0452, "step": 1337 }, { "epoch": 1.2688477951635846, "grad_norm": 0.7267472981372689, "learning_rate": 7.144869625013155e-06, "loss": 0.0409, "step": 1338 }, { "epoch": 1.269796111901375, "grad_norm": 0.40985430024156805, "learning_rate": 7.1398808778238395e-06, "loss": 0.0418, "step": 1339 }, { "epoch": 1.2707444286391656, "grad_norm": 0.47624354455999457, "learning_rate": 7.134889521330056e-06, "loss": 0.0452, "step": 1340 }, { "epoch": 1.271692745376956, "grad_norm": 0.45691791490421313, "learning_rate": 7.129895561618113e-06, "loss": 0.0421, "step": 1341 }, { "epoch": 1.2726410621147464, "grad_norm": 0.997741074324212, "learning_rate": 7.124899004777489e-06, "loss": 0.0586, "step": 1342 }, { "epoch": 1.2735893788525368, "grad_norm": 0.4702689373485785, "learning_rate": 7.119899856900831e-06, "loss": 0.0381, "step": 1343 }, { "epoch": 1.2745376955903271, "grad_norm": 0.454291173319732, "learning_rate": 7.114898124083944e-06, "loss": 0.0289, "step": 1344 }, { "epoch": 1.2754860123281175, "grad_norm": 0.4439776694870836, "learning_rate": 7.109893812425785e-06, "loss": 0.0357, "step": 1345 }, { "epoch": 1.276434329065908, "grad_norm": 0.747360067085319, "learning_rate": 7.104886928028462e-06, "loss": 0.0537, "step": 1346 }, { "epoch": 1.2773826458036983, "grad_norm": 0.5685529445018764, "learning_rate": 7.09987747699721e-06, "loss": 0.0527, "step": 1347 }, { "epoch": 1.278330962541489, "grad_norm": 0.5663642475796429, "learning_rate": 7.0948654654403996e-06, "loss": 0.043, "step": 1348 }, { "epoch": 1.2792792792792793, "grad_norm": 0.4203561592323753, "learning_rate": 7.08985089946952e-06, "loss": 0.0442, "step": 1349 }, { "epoch": 1.2802275960170697, "grad_norm": 0.8806133526242714, "learning_rate": 7.0848337851991836e-06, "loss": 0.0683, "step": 1350 }, { "epoch": 1.28117591275486, "grad_norm": 0.4621171462388856, "learning_rate": 7.0798141287470974e-06, "loss": 0.0441, "step": 1351 }, { "epoch": 1.2821242294926505, "grad_norm": 0.4303832781558267, "learning_rate": 7.074791936234083e-06, "loss": 0.0432, "step": 1352 }, { "epoch": 1.283072546230441, "grad_norm": 0.593851630131963, "learning_rate": 7.069767213784037e-06, "loss": 0.0583, "step": 1353 }, { "epoch": 1.2840208629682315, "grad_norm": 0.45030313612782086, "learning_rate": 7.064739967523957e-06, "loss": 0.0367, "step": 1354 }, { "epoch": 1.2849691797060219, "grad_norm": 0.5237382802642779, "learning_rate": 7.059710203583908e-06, "loss": 0.0335, "step": 1355 }, { "epoch": 1.2859174964438123, "grad_norm": 0.46802621728751537, "learning_rate": 7.054677928097031e-06, "loss": 0.0362, "step": 1356 }, { "epoch": 1.2868658131816026, "grad_norm": 0.7397145007928059, "learning_rate": 7.0496431471995255e-06, "loss": 0.0352, "step": 1357 }, { "epoch": 1.287814129919393, "grad_norm": 0.44236361469981716, "learning_rate": 7.044605867030647e-06, "loss": 0.0367, "step": 1358 }, { "epoch": 1.2887624466571834, "grad_norm": 0.45828570870759805, "learning_rate": 7.039566093732701e-06, "loss": 0.0323, "step": 1359 }, { "epoch": 1.2897107633949738, "grad_norm": 0.5487427414966448, "learning_rate": 7.034523833451028e-06, "loss": 0.0399, "step": 1360 }, { "epoch": 1.2906590801327644, "grad_norm": 0.4529932412914496, "learning_rate": 7.029479092334005e-06, "loss": 0.0366, "step": 1361 }, { "epoch": 1.2916073968705548, "grad_norm": 0.49013485919505534, "learning_rate": 7.024431876533035e-06, "loss": 0.0296, "step": 1362 }, { "epoch": 1.2925557136083452, "grad_norm": 0.37179582929309557, "learning_rate": 7.019382192202535e-06, "loss": 0.033, "step": 1363 }, { "epoch": 1.2935040303461356, "grad_norm": 0.743064802050604, "learning_rate": 7.014330045499933e-06, "loss": 0.0559, "step": 1364 }, { "epoch": 1.294452347083926, "grad_norm": 0.7804393891223528, "learning_rate": 7.009275442585661e-06, "loss": 0.0695, "step": 1365 }, { "epoch": 1.2954006638217166, "grad_norm": 0.6357982901839727, "learning_rate": 7.004218389623144e-06, "loss": 0.0534, "step": 1366 }, { "epoch": 1.296348980559507, "grad_norm": 0.7619430095143129, "learning_rate": 6.999158892778799e-06, "loss": 0.0879, "step": 1367 }, { "epoch": 1.2972972972972974, "grad_norm": 0.45770470276857766, "learning_rate": 6.994096958222018e-06, "loss": 0.0367, "step": 1368 }, { "epoch": 1.2982456140350878, "grad_norm": 0.410119514751568, "learning_rate": 6.989032592125167e-06, "loss": 0.0365, "step": 1369 }, { "epoch": 1.2991939307728781, "grad_norm": 0.6850761718031776, "learning_rate": 6.983965800663575e-06, "loss": 0.0463, "step": 1370 }, { "epoch": 1.3001422475106685, "grad_norm": 0.46513153341224145, "learning_rate": 6.978896590015534e-06, "loss": 0.0337, "step": 1371 }, { "epoch": 1.301090564248459, "grad_norm": 0.34172001512071726, "learning_rate": 6.973824966362281e-06, "loss": 0.0335, "step": 1372 }, { "epoch": 1.3020388809862493, "grad_norm": 0.6484578028683804, "learning_rate": 6.968750935887998e-06, "loss": 0.051, "step": 1373 }, { "epoch": 1.3029871977240397, "grad_norm": 0.5873634371396844, "learning_rate": 6.963674504779799e-06, "loss": 0.0549, "step": 1374 }, { "epoch": 1.3039355144618303, "grad_norm": 0.3675784223470725, "learning_rate": 6.958595679227726e-06, "loss": 0.0391, "step": 1375 }, { "epoch": 1.3048838311996207, "grad_norm": 0.7235773354505487, "learning_rate": 6.9535144654247445e-06, "loss": 0.0413, "step": 1376 }, { "epoch": 1.305832147937411, "grad_norm": 0.39024326330660314, "learning_rate": 6.948430869566728e-06, "loss": 0.0327, "step": 1377 }, { "epoch": 1.3067804646752015, "grad_norm": 0.516621026121203, "learning_rate": 6.943344897852455e-06, "loss": 0.0573, "step": 1378 }, { "epoch": 1.3077287814129919, "grad_norm": 0.6477258197173674, "learning_rate": 6.938256556483604e-06, "loss": 0.0471, "step": 1379 }, { "epoch": 1.3086770981507825, "grad_norm": 0.6102880930185496, "learning_rate": 6.933165851664739e-06, "loss": 0.0632, "step": 1380 }, { "epoch": 1.3096254148885729, "grad_norm": 0.5470569354066884, "learning_rate": 6.928072789603306e-06, "loss": 0.0555, "step": 1381 }, { "epoch": 1.3105737316263633, "grad_norm": 0.3806001449449141, "learning_rate": 6.92297737650963e-06, "loss": 0.0376, "step": 1382 }, { "epoch": 1.3115220483641536, "grad_norm": 0.3967286117945718, "learning_rate": 6.9178796185969e-06, "loss": 0.032, "step": 1383 }, { "epoch": 1.312470365101944, "grad_norm": 0.4801789565687751, "learning_rate": 6.912779522081164e-06, "loss": 0.0419, "step": 1384 }, { "epoch": 1.3134186818397344, "grad_norm": 0.574663042478777, "learning_rate": 6.90767709318132e-06, "loss": 0.0407, "step": 1385 }, { "epoch": 1.3143669985775248, "grad_norm": 0.5147733905582667, "learning_rate": 6.902572338119112e-06, "loss": 0.0426, "step": 1386 }, { "epoch": 1.3153153153153152, "grad_norm": 0.5809011274647721, "learning_rate": 6.897465263119123e-06, "loss": 0.0324, "step": 1387 }, { "epoch": 1.3162636320531058, "grad_norm": 0.5966890883725477, "learning_rate": 6.8923558744087596e-06, "loss": 0.0382, "step": 1388 }, { "epoch": 1.3172119487908962, "grad_norm": 0.3584098045942277, "learning_rate": 6.887244178218252e-06, "loss": 0.0312, "step": 1389 }, { "epoch": 1.3181602655286866, "grad_norm": 0.42233674326286247, "learning_rate": 6.882130180780645e-06, "loss": 0.0316, "step": 1390 }, { "epoch": 1.319108582266477, "grad_norm": 0.6092760045208329, "learning_rate": 6.8770138883317895e-06, "loss": 0.0424, "step": 1391 }, { "epoch": 1.3200568990042674, "grad_norm": 0.4390762035044672, "learning_rate": 6.871895307110332e-06, "loss": 0.0498, "step": 1392 }, { "epoch": 1.321005215742058, "grad_norm": 0.45130718481148246, "learning_rate": 6.866774443357714e-06, "loss": 0.0408, "step": 1393 }, { "epoch": 1.3219535324798484, "grad_norm": 0.8863760397108307, "learning_rate": 6.861651303318156e-06, "loss": 0.0308, "step": 1394 }, { "epoch": 1.3229018492176388, "grad_norm": 0.37381232566509554, "learning_rate": 6.856525893238659e-06, "loss": 0.0335, "step": 1395 }, { "epoch": 1.3238501659554291, "grad_norm": 2.173476279498807, "learning_rate": 6.851398219368987e-06, "loss": 0.0652, "step": 1396 }, { "epoch": 1.3247984826932195, "grad_norm": 0.5112102465327201, "learning_rate": 6.846268287961667e-06, "loss": 0.0518, "step": 1397 }, { "epoch": 1.32574679943101, "grad_norm": 0.44499703462617834, "learning_rate": 6.841136105271979e-06, "loss": 0.0359, "step": 1398 }, { "epoch": 1.3266951161688003, "grad_norm": 0.4541899725717781, "learning_rate": 6.8360016775579486e-06, "loss": 0.0433, "step": 1399 }, { "epoch": 1.3276434329065907, "grad_norm": 0.7733390965492225, "learning_rate": 6.830865011080336e-06, "loss": 0.0683, "step": 1400 }, { "epoch": 1.328591749644381, "grad_norm": 0.4772059867844145, "learning_rate": 6.825726112102635e-06, "loss": 0.0459, "step": 1401 }, { "epoch": 1.3295400663821717, "grad_norm": 0.6433805256237654, "learning_rate": 6.820584986891058e-06, "loss": 0.0303, "step": 1402 }, { "epoch": 1.330488383119962, "grad_norm": 0.34738184879986195, "learning_rate": 6.8154416417145356e-06, "loss": 0.0283, "step": 1403 }, { "epoch": 1.3314366998577525, "grad_norm": 0.8994234854688081, "learning_rate": 6.8102960828447026e-06, "loss": 0.0495, "step": 1404 }, { "epoch": 1.3323850165955429, "grad_norm": 0.745413740002341, "learning_rate": 6.805148316555894e-06, "loss": 0.0532, "step": 1405 }, { "epoch": 1.3333333333333333, "grad_norm": 0.3870360377631428, "learning_rate": 6.799998349125137e-06, "loss": 0.0398, "step": 1406 }, { "epoch": 1.3342816500711239, "grad_norm": 0.5424025684088033, "learning_rate": 6.794846186832144e-06, "loss": 0.0464, "step": 1407 }, { "epoch": 1.3352299668089143, "grad_norm": 0.39099563297098944, "learning_rate": 6.789691835959299e-06, "loss": 0.0319, "step": 1408 }, { "epoch": 1.3361782835467046, "grad_norm": 0.5899481823910665, "learning_rate": 6.784535302791659e-06, "loss": 0.0488, "step": 1409 }, { "epoch": 1.337126600284495, "grad_norm": 0.504351897768241, "learning_rate": 6.779376593616941e-06, "loss": 0.0333, "step": 1410 }, { "epoch": 1.3380749170222854, "grad_norm": 0.7237802044557335, "learning_rate": 6.774215714725516e-06, "loss": 0.0592, "step": 1411 }, { "epoch": 1.3390232337600758, "grad_norm": 0.4534589627135591, "learning_rate": 6.769052672410399e-06, "loss": 0.0452, "step": 1412 }, { "epoch": 1.3399715504978662, "grad_norm": 0.6267058247818036, "learning_rate": 6.763887472967245e-06, "loss": 0.0711, "step": 1413 }, { "epoch": 1.3409198672356566, "grad_norm": 0.4858351172760667, "learning_rate": 6.758720122694336e-06, "loss": 0.0425, "step": 1414 }, { "epoch": 1.3418681839734472, "grad_norm": 0.432602033472308, "learning_rate": 6.75355062789258e-06, "loss": 0.0496, "step": 1415 }, { "epoch": 1.3428165007112376, "grad_norm": 0.6727456282074611, "learning_rate": 6.7483789948654986e-06, "loss": 0.074, "step": 1416 }, { "epoch": 1.343764817449028, "grad_norm": 0.39807817067818035, "learning_rate": 6.743205229919224e-06, "loss": 0.0318, "step": 1417 }, { "epoch": 1.3447131341868184, "grad_norm": 0.7372478449414445, "learning_rate": 6.73802933936248e-06, "loss": 0.0593, "step": 1418 }, { "epoch": 1.3456614509246088, "grad_norm": 0.3985144055506702, "learning_rate": 6.73285132950659e-06, "loss": 0.0358, "step": 1419 }, { "epoch": 1.3466097676623994, "grad_norm": 0.5951692184417887, "learning_rate": 6.727671206665458e-06, "loss": 0.0464, "step": 1420 }, { "epoch": 1.3475580844001898, "grad_norm": 0.38420866497778394, "learning_rate": 6.722488977155567e-06, "loss": 0.0284, "step": 1421 }, { "epoch": 1.3485064011379801, "grad_norm": 0.7464916944500526, "learning_rate": 6.717304647295965e-06, "loss": 0.0574, "step": 1422 }, { "epoch": 1.3494547178757705, "grad_norm": 0.5693062216202691, "learning_rate": 6.712118223408264e-06, "loss": 0.0392, "step": 1423 }, { "epoch": 1.350403034613561, "grad_norm": 0.4592788558938253, "learning_rate": 6.7069297118166295e-06, "loss": 0.0347, "step": 1424 }, { "epoch": 1.3513513513513513, "grad_norm": 0.5175822786697956, "learning_rate": 6.701739118847771e-06, "loss": 0.0536, "step": 1425 }, { "epoch": 1.3522996680891417, "grad_norm": 0.6912802037999823, "learning_rate": 6.696546450830937e-06, "loss": 0.0536, "step": 1426 }, { "epoch": 1.353247984826932, "grad_norm": 0.40537336423409387, "learning_rate": 6.691351714097906e-06, "loss": 0.0365, "step": 1427 }, { "epoch": 1.3541963015647225, "grad_norm": 0.532506588444211, "learning_rate": 6.6861549149829785e-06, "loss": 0.0463, "step": 1428 }, { "epoch": 1.355144618302513, "grad_norm": 0.5449727239357885, "learning_rate": 6.680956059822971e-06, "loss": 0.0445, "step": 1429 }, { "epoch": 1.3560929350403035, "grad_norm": 0.35162216761507775, "learning_rate": 6.675755154957208e-06, "loss": 0.03, "step": 1430 }, { "epoch": 1.3570412517780939, "grad_norm": 0.5723964216096473, "learning_rate": 6.670552206727507e-06, "loss": 0.0458, "step": 1431 }, { "epoch": 1.3579895685158843, "grad_norm": 0.6308418866530405, "learning_rate": 6.665347221478188e-06, "loss": 0.057, "step": 1432 }, { "epoch": 1.3589378852536746, "grad_norm": 0.498253296219615, "learning_rate": 6.660140205556046e-06, "loss": 0.0383, "step": 1433 }, { "epoch": 1.3598862019914653, "grad_norm": 0.40110848050156933, "learning_rate": 6.6549311653103544e-06, "loss": 0.0462, "step": 1434 }, { "epoch": 1.3608345187292556, "grad_norm": 0.3808306857794104, "learning_rate": 6.649720107092854e-06, "loss": 0.0368, "step": 1435 }, { "epoch": 1.361782835467046, "grad_norm": 0.5026216712918046, "learning_rate": 6.644507037257753e-06, "loss": 0.0394, "step": 1436 }, { "epoch": 1.3627311522048364, "grad_norm": 0.5063922606607929, "learning_rate": 6.639291962161702e-06, "loss": 0.0354, "step": 1437 }, { "epoch": 1.3636794689426268, "grad_norm": 0.34810512153075385, "learning_rate": 6.634074888163805e-06, "loss": 0.0253, "step": 1438 }, { "epoch": 1.3646277856804172, "grad_norm": 0.8525521272225376, "learning_rate": 6.628855821625601e-06, "loss": 0.0662, "step": 1439 }, { "epoch": 1.3655761024182076, "grad_norm": 0.4145939665119975, "learning_rate": 6.6236347689110546e-06, "loss": 0.0296, "step": 1440 }, { "epoch": 1.366524419155998, "grad_norm": 0.4775660065145518, "learning_rate": 6.618411736386558e-06, "loss": 0.0523, "step": 1441 }, { "epoch": 1.3674727358937886, "grad_norm": 0.37369507508155264, "learning_rate": 6.613186730420917e-06, "loss": 0.0365, "step": 1442 }, { "epoch": 1.368421052631579, "grad_norm": 0.40495701137032075, "learning_rate": 6.607959757385338e-06, "loss": 0.0362, "step": 1443 }, { "epoch": 1.3693693693693694, "grad_norm": 0.36575125377415557, "learning_rate": 6.602730823653436e-06, "loss": 0.0338, "step": 1444 }, { "epoch": 1.3703176861071598, "grad_norm": 0.6249163504364997, "learning_rate": 6.597499935601204e-06, "loss": 0.051, "step": 1445 }, { "epoch": 1.3712660028449501, "grad_norm": 0.582172833981279, "learning_rate": 6.592267099607029e-06, "loss": 0.0438, "step": 1446 }, { "epoch": 1.3722143195827408, "grad_norm": 0.3237159800921359, "learning_rate": 6.587032322051667e-06, "loss": 0.0241, "step": 1447 }, { "epoch": 1.3731626363205311, "grad_norm": 0.39835957752966755, "learning_rate": 6.581795609318247e-06, "loss": 0.036, "step": 1448 }, { "epoch": 1.3741109530583215, "grad_norm": 0.5547572067644376, "learning_rate": 6.57655696779225e-06, "loss": 0.0371, "step": 1449 }, { "epoch": 1.375059269796112, "grad_norm": 0.5290408595276358, "learning_rate": 6.571316403861518e-06, "loss": 0.066, "step": 1450 }, { "epoch": 1.3760075865339023, "grad_norm": 0.5333270731702193, "learning_rate": 6.566073923916226e-06, "loss": 0.0483, "step": 1451 }, { "epoch": 1.3769559032716927, "grad_norm": 0.47040112700666353, "learning_rate": 6.5608295343488985e-06, "loss": 0.0424, "step": 1452 }, { "epoch": 1.377904220009483, "grad_norm": 0.38901166465860154, "learning_rate": 6.555583241554376e-06, "loss": 0.0432, "step": 1453 }, { "epoch": 1.3788525367472735, "grad_norm": 0.49414149390465306, "learning_rate": 6.55033505192983e-06, "loss": 0.0407, "step": 1454 }, { "epoch": 1.379800853485064, "grad_norm": 0.45253883891283475, "learning_rate": 6.545084971874738e-06, "loss": 0.0388, "step": 1455 }, { "epoch": 1.3807491702228545, "grad_norm": 0.5302487098048316, "learning_rate": 6.539833007790885e-06, "loss": 0.0295, "step": 1456 }, { "epoch": 1.3816974869606449, "grad_norm": 0.37787261318074306, "learning_rate": 6.534579166082355e-06, "loss": 0.0286, "step": 1457 }, { "epoch": 1.3826458036984353, "grad_norm": 0.4747311054007047, "learning_rate": 6.529323453155516e-06, "loss": 0.0338, "step": 1458 }, { "epoch": 1.3835941204362256, "grad_norm": 1.0261775992168287, "learning_rate": 6.524065875419025e-06, "loss": 0.043, "step": 1459 }, { "epoch": 1.384542437174016, "grad_norm": 0.5317475686310853, "learning_rate": 6.5188064392838114e-06, "loss": 0.0533, "step": 1460 }, { "epoch": 1.3854907539118066, "grad_norm": 0.5479284515608536, "learning_rate": 6.5135451511630636e-06, "loss": 0.058, "step": 1461 }, { "epoch": 1.386439070649597, "grad_norm": 0.6932046763335258, "learning_rate": 6.5082820174722364e-06, "loss": 0.0506, "step": 1462 }, { "epoch": 1.3873873873873874, "grad_norm": 0.3763860463333079, "learning_rate": 6.503017044629029e-06, "loss": 0.0291, "step": 1463 }, { "epoch": 1.3883357041251778, "grad_norm": 0.5382489774529615, "learning_rate": 6.4977502390533905e-06, "loss": 0.0394, "step": 1464 }, { "epoch": 1.3892840208629682, "grad_norm": 0.6889518317604814, "learning_rate": 6.4924816071675e-06, "loss": 0.0698, "step": 1465 }, { "epoch": 1.3902323376007586, "grad_norm": 0.6495562092445605, "learning_rate": 6.487211155395762e-06, "loss": 0.0516, "step": 1466 }, { "epoch": 1.391180654338549, "grad_norm": 0.6308551102529311, "learning_rate": 6.4819388901648e-06, "loss": 0.0491, "step": 1467 }, { "epoch": 1.3921289710763394, "grad_norm": 0.5361583528081098, "learning_rate": 6.476664817903457e-06, "loss": 0.0467, "step": 1468 }, { "epoch": 1.39307728781413, "grad_norm": 0.36452515652974693, "learning_rate": 6.471388945042769e-06, "loss": 0.0337, "step": 1469 }, { "epoch": 1.3940256045519204, "grad_norm": 0.5709516666140196, "learning_rate": 6.466111278015973e-06, "loss": 0.0356, "step": 1470 }, { "epoch": 1.3949739212897108, "grad_norm": 0.6635702342274311, "learning_rate": 6.460831823258492e-06, "loss": 0.052, "step": 1471 }, { "epoch": 1.3959222380275011, "grad_norm": 0.6089384298241021, "learning_rate": 6.455550587207931e-06, "loss": 0.0464, "step": 1472 }, { "epoch": 1.3968705547652915, "grad_norm": 0.5865814694576227, "learning_rate": 6.450267576304065e-06, "loss": 0.0493, "step": 1473 }, { "epoch": 1.3978188715030821, "grad_norm": 0.4169326925237183, "learning_rate": 6.444982796988835e-06, "loss": 0.0309, "step": 1474 }, { "epoch": 1.3987671882408725, "grad_norm": 0.4907017898205256, "learning_rate": 6.439696255706334e-06, "loss": 0.0267, "step": 1475 }, { "epoch": 1.399715504978663, "grad_norm": 0.6149659771951538, "learning_rate": 6.434407958902809e-06, "loss": 0.0461, "step": 1476 }, { "epoch": 1.4006638217164533, "grad_norm": 0.4695061046804246, "learning_rate": 6.429117913026647e-06, "loss": 0.0511, "step": 1477 }, { "epoch": 1.4016121384542437, "grad_norm": 0.5455301875944718, "learning_rate": 6.423826124528363e-06, "loss": 0.0413, "step": 1478 }, { "epoch": 1.402560455192034, "grad_norm": 0.40051594288158987, "learning_rate": 6.418532599860602e-06, "loss": 0.0382, "step": 1479 }, { "epoch": 1.4035087719298245, "grad_norm": 0.3913732454226365, "learning_rate": 6.413237345478121e-06, "loss": 0.0424, "step": 1480 }, { "epoch": 1.4044570886676149, "grad_norm": 0.42177204727179535, "learning_rate": 6.407940367837794e-06, "loss": 0.0341, "step": 1481 }, { "epoch": 1.4054054054054055, "grad_norm": 0.5927648854025513, "learning_rate": 6.402641673398589e-06, "loss": 0.0457, "step": 1482 }, { "epoch": 1.4063537221431959, "grad_norm": 0.6649752510649648, "learning_rate": 6.39734126862157e-06, "loss": 0.0399, "step": 1483 }, { "epoch": 1.4073020388809863, "grad_norm": 0.6602312989482694, "learning_rate": 6.392039159969884e-06, "loss": 0.0389, "step": 1484 }, { "epoch": 1.4082503556187767, "grad_norm": 0.4538930406178064, "learning_rate": 6.386735353908762e-06, "loss": 0.0295, "step": 1485 }, { "epoch": 1.409198672356567, "grad_norm": 0.5026530505058707, "learning_rate": 6.381429856905499e-06, "loss": 0.0391, "step": 1486 }, { "epoch": 1.4101469890943577, "grad_norm": 0.5396295314825587, "learning_rate": 6.3761226754294525e-06, "loss": 0.0524, "step": 1487 }, { "epoch": 1.411095305832148, "grad_norm": 0.700524696737488, "learning_rate": 6.370813815952039e-06, "loss": 0.0456, "step": 1488 }, { "epoch": 1.4120436225699384, "grad_norm": 0.5570830545575999, "learning_rate": 6.365503284946713e-06, "loss": 0.0376, "step": 1489 }, { "epoch": 1.4129919393077288, "grad_norm": 0.5029568178955325, "learning_rate": 6.360191088888975e-06, "loss": 0.0441, "step": 1490 }, { "epoch": 1.4139402560455192, "grad_norm": 0.39143841524057965, "learning_rate": 6.35487723425635e-06, "loss": 0.0322, "step": 1491 }, { "epoch": 1.4148885727833096, "grad_norm": 0.5179251130538431, "learning_rate": 6.349561727528388e-06, "loss": 0.039, "step": 1492 }, { "epoch": 1.4158368895211, "grad_norm": 0.4971363492885207, "learning_rate": 6.344244575186655e-06, "loss": 0.0389, "step": 1493 }, { "epoch": 1.4167852062588904, "grad_norm": 0.4650371165745209, "learning_rate": 6.338925783714721e-06, "loss": 0.0354, "step": 1494 }, { "epoch": 1.4177335229966808, "grad_norm": 0.5910877479719807, "learning_rate": 6.333605359598154e-06, "loss": 0.0458, "step": 1495 }, { "epoch": 1.4186818397344714, "grad_norm": 0.4704211557886303, "learning_rate": 6.328283309324516e-06, "loss": 0.0375, "step": 1496 }, { "epoch": 1.4196301564722618, "grad_norm": 0.5026935559926589, "learning_rate": 6.32295963938335e-06, "loss": 0.0368, "step": 1497 }, { "epoch": 1.4205784732100522, "grad_norm": 0.7197271409887576, "learning_rate": 6.317634356266175e-06, "loss": 0.0625, "step": 1498 }, { "epoch": 1.4215267899478425, "grad_norm": 0.5369735581060212, "learning_rate": 6.312307466466477e-06, "loss": 0.035, "step": 1499 }, { "epoch": 1.422475106685633, "grad_norm": 0.396321033510949, "learning_rate": 6.306978976479695e-06, "loss": 0.0294, "step": 1500 }, { "epoch": 1.4234234234234235, "grad_norm": 0.3409142303173711, "learning_rate": 6.30164889280323e-06, "loss": 0.035, "step": 1501 }, { "epoch": 1.424371740161214, "grad_norm": 0.49779800032590527, "learning_rate": 6.296317221936421e-06, "loss": 0.0344, "step": 1502 }, { "epoch": 1.4253200568990043, "grad_norm": 0.398334192892684, "learning_rate": 6.290983970380539e-06, "loss": 0.0298, "step": 1503 }, { "epoch": 1.4262683736367947, "grad_norm": 2.7922441337396253, "learning_rate": 6.28564914463879e-06, "loss": 0.0517, "step": 1504 }, { "epoch": 1.427216690374585, "grad_norm": 0.4840208519427491, "learning_rate": 6.280312751216291e-06, "loss": 0.0337, "step": 1505 }, { "epoch": 1.4281650071123755, "grad_norm": 0.5619043194162205, "learning_rate": 6.274974796620078e-06, "loss": 0.0537, "step": 1506 }, { "epoch": 1.4291133238501659, "grad_norm": 0.5122598193956247, "learning_rate": 6.269635287359086e-06, "loss": 0.0424, "step": 1507 }, { "epoch": 1.4300616405879563, "grad_norm": 0.9600984611912299, "learning_rate": 6.264294229944146e-06, "loss": 0.0589, "step": 1508 }, { "epoch": 1.4310099573257469, "grad_norm": 0.8407809045144361, "learning_rate": 6.258951630887982e-06, "loss": 0.0372, "step": 1509 }, { "epoch": 1.4319582740635373, "grad_norm": 0.7744939127361691, "learning_rate": 6.253607496705191e-06, "loss": 0.0444, "step": 1510 }, { "epoch": 1.4329065908013277, "grad_norm": 0.7703533886302798, "learning_rate": 6.248261833912243e-06, "loss": 0.0491, "step": 1511 }, { "epoch": 1.433854907539118, "grad_norm": 0.5182968502177208, "learning_rate": 6.242914649027476e-06, "loss": 0.0349, "step": 1512 }, { "epoch": 1.4348032242769084, "grad_norm": 0.5575121095897854, "learning_rate": 6.237565948571082e-06, "loss": 0.0436, "step": 1513 }, { "epoch": 1.435751541014699, "grad_norm": 0.5187081289232499, "learning_rate": 6.232215739065102e-06, "loss": 0.0414, "step": 1514 }, { "epoch": 1.4366998577524894, "grad_norm": 0.43186737340559983, "learning_rate": 6.226864027033413e-06, "loss": 0.0278, "step": 1515 }, { "epoch": 1.4376481744902798, "grad_norm": 0.44892769483591366, "learning_rate": 6.221510819001725e-06, "loss": 0.0297, "step": 1516 }, { "epoch": 1.4385964912280702, "grad_norm": 0.5269062893506762, "learning_rate": 6.216156121497579e-06, "loss": 0.0514, "step": 1517 }, { "epoch": 1.4395448079658606, "grad_norm": 0.6625936787851716, "learning_rate": 6.2107999410503235e-06, "loss": 0.0419, "step": 1518 }, { "epoch": 1.440493124703651, "grad_norm": 0.5592530362043953, "learning_rate": 6.205442284191122e-06, "loss": 0.0432, "step": 1519 }, { "epoch": 1.4414414414414414, "grad_norm": 0.47852425989622077, "learning_rate": 6.200083157452934e-06, "loss": 0.031, "step": 1520 }, { "epoch": 1.4423897581792318, "grad_norm": 0.5811912176295393, "learning_rate": 6.194722567370511e-06, "loss": 0.057, "step": 1521 }, { "epoch": 1.4433380749170222, "grad_norm": 0.564889130546722, "learning_rate": 6.189360520480394e-06, "loss": 0.0423, "step": 1522 }, { "epoch": 1.4442863916548128, "grad_norm": 0.7421168415388308, "learning_rate": 6.183997023320894e-06, "loss": 0.058, "step": 1523 }, { "epoch": 1.4452347083926032, "grad_norm": 0.4925903110099146, "learning_rate": 6.178632082432093e-06, "loss": 0.043, "step": 1524 }, { "epoch": 1.4461830251303935, "grad_norm": 0.4022649394814297, "learning_rate": 6.173265704355838e-06, "loss": 0.0319, "step": 1525 }, { "epoch": 1.447131341868184, "grad_norm": 0.46419553742343383, "learning_rate": 6.16789789563572e-06, "loss": 0.0511, "step": 1526 }, { "epoch": 1.4480796586059743, "grad_norm": 0.686004067534123, "learning_rate": 6.16252866281708e-06, "loss": 0.0353, "step": 1527 }, { "epoch": 1.449027975343765, "grad_norm": 0.5485413747656448, "learning_rate": 6.1571580124469935e-06, "loss": 0.0387, "step": 1528 }, { "epoch": 1.4499762920815553, "grad_norm": 1.0798253002470137, "learning_rate": 6.151785951074266e-06, "loss": 0.0378, "step": 1529 }, { "epoch": 1.4509246088193457, "grad_norm": 0.6205350073393234, "learning_rate": 6.146412485249424e-06, "loss": 0.0441, "step": 1530 }, { "epoch": 1.451872925557136, "grad_norm": 0.5359792529386427, "learning_rate": 6.141037621524704e-06, "loss": 0.0457, "step": 1531 }, { "epoch": 1.4528212422949265, "grad_norm": 0.6496519633454682, "learning_rate": 6.1356613664540455e-06, "loss": 0.0322, "step": 1532 }, { "epoch": 1.4537695590327169, "grad_norm": 0.500829293631121, "learning_rate": 6.1302837265930925e-06, "loss": 0.0318, "step": 1533 }, { "epoch": 1.4547178757705073, "grad_norm": 0.6387522771798803, "learning_rate": 6.124904708499169e-06, "loss": 0.0375, "step": 1534 }, { "epoch": 1.4556661925082977, "grad_norm": 0.6065572240886973, "learning_rate": 6.119524318731286e-06, "loss": 0.0578, "step": 1535 }, { "epoch": 1.4566145092460883, "grad_norm": 0.4804845250123355, "learning_rate": 6.114142563850122e-06, "loss": 0.0464, "step": 1536 }, { "epoch": 1.4575628259838787, "grad_norm": 0.5386846320310618, "learning_rate": 6.1087594504180226e-06, "loss": 0.0354, "step": 1537 }, { "epoch": 1.458511142721669, "grad_norm": 0.5814430131886619, "learning_rate": 6.1033749849989896e-06, "loss": 0.0528, "step": 1538 }, { "epoch": 1.4594594594594594, "grad_norm": 0.41608042979081394, "learning_rate": 6.097989174158674e-06, "loss": 0.0371, "step": 1539 }, { "epoch": 1.4604077761972498, "grad_norm": 0.42647526096873056, "learning_rate": 6.092602024464364e-06, "loss": 0.041, "step": 1540 }, { "epoch": 1.4613560929350404, "grad_norm": 0.7880765483408534, "learning_rate": 6.087213542484987e-06, "loss": 0.0285, "step": 1541 }, { "epoch": 1.4623044096728308, "grad_norm": 0.3794551189159985, "learning_rate": 6.0818237347910905e-06, "loss": 0.0348, "step": 1542 }, { "epoch": 1.4632527264106212, "grad_norm": 0.5824672410897602, "learning_rate": 6.076432607954837e-06, "loss": 0.0473, "step": 1543 }, { "epoch": 1.4642010431484116, "grad_norm": 0.7642250674769905, "learning_rate": 6.071040168549999e-06, "loss": 0.055, "step": 1544 }, { "epoch": 1.465149359886202, "grad_norm": 0.4383923165951326, "learning_rate": 6.0656464231519505e-06, "loss": 0.0435, "step": 1545 }, { "epoch": 1.4660976766239924, "grad_norm": 0.5766848636422996, "learning_rate": 6.060251378337662e-06, "loss": 0.0472, "step": 1546 }, { "epoch": 1.4670459933617828, "grad_norm": 0.4669671289773233, "learning_rate": 6.05485504068568e-06, "loss": 0.0414, "step": 1547 }, { "epoch": 1.4679943100995732, "grad_norm": 0.5655404897789605, "learning_rate": 6.049457416776131e-06, "loss": 0.04, "step": 1548 }, { "epoch": 1.4689426268373635, "grad_norm": 0.5365501006997421, "learning_rate": 6.044058513190711e-06, "loss": 0.0383, "step": 1549 }, { "epoch": 1.4698909435751542, "grad_norm": 0.48431123525495323, "learning_rate": 6.038658336512677e-06, "loss": 0.0548, "step": 1550 }, { "epoch": 1.4708392603129445, "grad_norm": 0.5554437642357153, "learning_rate": 6.0332568933268375e-06, "loss": 0.0417, "step": 1551 }, { "epoch": 1.471787577050735, "grad_norm": 1.7635981147318227, "learning_rate": 6.0278541902195446e-06, "loss": 0.0515, "step": 1552 }, { "epoch": 1.4727358937885253, "grad_norm": 0.7232935188306358, "learning_rate": 6.022450233778685e-06, "loss": 0.0657, "step": 1553 }, { "epoch": 1.4736842105263157, "grad_norm": 0.5072524556149093, "learning_rate": 6.017045030593679e-06, "loss": 0.0438, "step": 1554 }, { "epoch": 1.4746325272641063, "grad_norm": 0.4631383186501888, "learning_rate": 6.011638587255461e-06, "loss": 0.0329, "step": 1555 }, { "epoch": 1.4755808440018967, "grad_norm": 0.45366075659990895, "learning_rate": 6.006230910356483e-06, "loss": 0.0352, "step": 1556 }, { "epoch": 1.476529160739687, "grad_norm": 0.5767111918678757, "learning_rate": 6.000822006490696e-06, "loss": 0.0375, "step": 1557 }, { "epoch": 1.4774774774774775, "grad_norm": 0.4071796252251615, "learning_rate": 5.995411882253554e-06, "loss": 0.0331, "step": 1558 }, { "epoch": 1.4784257942152679, "grad_norm": 0.43238641694345853, "learning_rate": 5.990000544241993e-06, "loss": 0.0463, "step": 1559 }, { "epoch": 1.4793741109530583, "grad_norm": 0.6798403742092483, "learning_rate": 5.984587999054428e-06, "loss": 0.0365, "step": 1560 }, { "epoch": 1.4803224276908487, "grad_norm": 0.5374744532934378, "learning_rate": 5.979174253290752e-06, "loss": 0.0445, "step": 1561 }, { "epoch": 1.481270744428639, "grad_norm": 0.36196300041262275, "learning_rate": 5.973759313552318e-06, "loss": 0.0303, "step": 1562 }, { "epoch": 1.4822190611664297, "grad_norm": 0.5642959313665694, "learning_rate": 5.9683431864419384e-06, "loss": 0.0449, "step": 1563 }, { "epoch": 1.48316737790422, "grad_norm": 0.5061601710793207, "learning_rate": 5.9629258785638675e-06, "loss": 0.0327, "step": 1564 }, { "epoch": 1.4841156946420104, "grad_norm": 0.44567373677114336, "learning_rate": 5.957507396523803e-06, "loss": 0.0299, "step": 1565 }, { "epoch": 1.4850640113798008, "grad_norm": 1.7557947841187413, "learning_rate": 5.952087746928875e-06, "loss": 0.041, "step": 1566 }, { "epoch": 1.4860123281175912, "grad_norm": 0.4668457994449059, "learning_rate": 5.9466669363876375e-06, "loss": 0.0344, "step": 1567 }, { "epoch": 1.4869606448553818, "grad_norm": 0.4261254948879091, "learning_rate": 5.9412449715100576e-06, "loss": 0.0303, "step": 1568 }, { "epoch": 1.4879089615931722, "grad_norm": 0.42754498865715357, "learning_rate": 5.93582185890751e-06, "loss": 0.0261, "step": 1569 }, { "epoch": 1.4888572783309626, "grad_norm": 0.4906574698227752, "learning_rate": 5.930397605192773e-06, "loss": 0.0444, "step": 1570 }, { "epoch": 1.489805595068753, "grad_norm": 0.7067845216764639, "learning_rate": 5.924972216980013e-06, "loss": 0.042, "step": 1571 }, { "epoch": 1.4907539118065434, "grad_norm": 0.48961667245950796, "learning_rate": 5.919545700884779e-06, "loss": 0.0425, "step": 1572 }, { "epoch": 1.4917022285443338, "grad_norm": 1.0609050385233834, "learning_rate": 5.914118063523996e-06, "loss": 0.0449, "step": 1573 }, { "epoch": 1.4926505452821242, "grad_norm": 1.0807634420097412, "learning_rate": 5.9086893115159605e-06, "loss": 0.0502, "step": 1574 }, { "epoch": 1.4935988620199145, "grad_norm": 0.6416643508613592, "learning_rate": 5.903259451480321e-06, "loss": 0.0441, "step": 1575 }, { "epoch": 1.4945471787577052, "grad_norm": 0.6277434629874132, "learning_rate": 5.897828490038082e-06, "loss": 0.0713, "step": 1576 }, { "epoch": 1.4954954954954955, "grad_norm": 0.33767932825829644, "learning_rate": 5.8923964338115895e-06, "loss": 0.0347, "step": 1577 }, { "epoch": 1.496443812233286, "grad_norm": 0.4477567855475462, "learning_rate": 5.886963289424524e-06, "loss": 0.0364, "step": 1578 }, { "epoch": 1.4973921289710763, "grad_norm": 0.41594278967554993, "learning_rate": 5.881529063501896e-06, "loss": 0.0338, "step": 1579 }, { "epoch": 1.4983404457088667, "grad_norm": 0.35754135982690183, "learning_rate": 5.8760937626700306e-06, "loss": 0.0287, "step": 1580 }, { "epoch": 1.4992887624466573, "grad_norm": 0.4123079201310612, "learning_rate": 5.8706573935565645e-06, "loss": 0.0347, "step": 1581 }, { "epoch": 1.5002370791844477, "grad_norm": 0.6029428312058318, "learning_rate": 5.865219962790438e-06, "loss": 0.051, "step": 1582 }, { "epoch": 1.501185395922238, "grad_norm": 0.5215535313309333, "learning_rate": 5.859781477001887e-06, "loss": 0.0502, "step": 1583 }, { "epoch": 1.5021337126600285, "grad_norm": 0.5200577257819287, "learning_rate": 5.8543419428224325e-06, "loss": 0.04, "step": 1584 }, { "epoch": 1.5030820293978189, "grad_norm": 0.3586196172148145, "learning_rate": 5.8489013668848726e-06, "loss": 0.0217, "step": 1585 }, { "epoch": 1.5040303461356093, "grad_norm": 0.6761618716488011, "learning_rate": 5.843459755823279e-06, "loss": 0.0628, "step": 1586 }, { "epoch": 1.5049786628733997, "grad_norm": 0.49941615403305933, "learning_rate": 5.8380171162729814e-06, "loss": 0.0376, "step": 1587 }, { "epoch": 1.50592697961119, "grad_norm": 0.7013305412374657, "learning_rate": 5.832573454870567e-06, "loss": 0.0519, "step": 1588 }, { "epoch": 1.5068752963489804, "grad_norm": 0.36446558452178207, "learning_rate": 5.827128778253867e-06, "loss": 0.0315, "step": 1589 }, { "epoch": 1.5078236130867708, "grad_norm": 0.6059271506224598, "learning_rate": 5.821683093061956e-06, "loss": 0.0588, "step": 1590 }, { "epoch": 1.5087719298245614, "grad_norm": 0.5681480190393359, "learning_rate": 5.81623640593513e-06, "loss": 0.0471, "step": 1591 }, { "epoch": 1.5097202465623518, "grad_norm": 0.5550547812099469, "learning_rate": 5.810788723514908e-06, "loss": 0.0535, "step": 1592 }, { "epoch": 1.5106685633001422, "grad_norm": 0.49253764363091057, "learning_rate": 5.805340052444028e-06, "loss": 0.0428, "step": 1593 }, { "epoch": 1.5116168800379328, "grad_norm": 0.5766442929322906, "learning_rate": 5.799890399366434e-06, "loss": 0.0399, "step": 1594 }, { "epoch": 1.5125651967757232, "grad_norm": 0.34949487700262954, "learning_rate": 5.79443977092726e-06, "loss": 0.0278, "step": 1595 }, { "epoch": 1.5135135135135136, "grad_norm": 0.659958933278198, "learning_rate": 5.788988173772835e-06, "loss": 0.047, "step": 1596 }, { "epoch": 1.514461830251304, "grad_norm": 0.5597846667352361, "learning_rate": 5.783535614550666e-06, "loss": 0.0404, "step": 1597 }, { "epoch": 1.5154101469890944, "grad_norm": 0.7866842188486388, "learning_rate": 5.778082099909436e-06, "loss": 0.0541, "step": 1598 }, { "epoch": 1.5163584637268848, "grad_norm": 0.3147662026332043, "learning_rate": 5.772627636498992e-06, "loss": 0.0308, "step": 1599 }, { "epoch": 1.5173067804646752, "grad_norm": 0.5584398566971551, "learning_rate": 5.76717223097034e-06, "loss": 0.0395, "step": 1600 }, { "epoch": 1.5182550972024655, "grad_norm": 0.4132888410051328, "learning_rate": 5.7617158899756276e-06, "loss": 0.034, "step": 1601 }, { "epoch": 1.519203413940256, "grad_norm": 0.37539181864065124, "learning_rate": 5.756258620168152e-06, "loss": 0.0416, "step": 1602 }, { "epoch": 1.5201517306780463, "grad_norm": 0.7346309696726602, "learning_rate": 5.750800428202338e-06, "loss": 0.0373, "step": 1603 }, { "epoch": 1.521100047415837, "grad_norm": 0.620382314193856, "learning_rate": 5.745341320733735e-06, "loss": 0.0331, "step": 1604 }, { "epoch": 1.5220483641536273, "grad_norm": 0.5684312369471517, "learning_rate": 5.739881304419009e-06, "loss": 0.0331, "step": 1605 }, { "epoch": 1.5229966808914177, "grad_norm": 0.9180566528310432, "learning_rate": 5.734420385915937e-06, "loss": 0.0396, "step": 1606 }, { "epoch": 1.5239449976292083, "grad_norm": 0.4119447176442743, "learning_rate": 5.728958571883393e-06, "loss": 0.0347, "step": 1607 }, { "epoch": 1.5248933143669987, "grad_norm": 0.46293074284568964, "learning_rate": 5.723495868981344e-06, "loss": 0.0297, "step": 1608 }, { "epoch": 1.525841631104789, "grad_norm": 0.4707389836290285, "learning_rate": 5.718032283870839e-06, "loss": 0.0354, "step": 1609 }, { "epoch": 1.5267899478425795, "grad_norm": 0.4178418025887583, "learning_rate": 5.7125678232140054e-06, "loss": 0.0286, "step": 1610 }, { "epoch": 1.5277382645803699, "grad_norm": 0.43658626940138434, "learning_rate": 5.70710249367404e-06, "loss": 0.0387, "step": 1611 }, { "epoch": 1.5286865813181603, "grad_norm": 0.45394473647054495, "learning_rate": 5.701636301915192e-06, "loss": 0.0306, "step": 1612 }, { "epoch": 1.5296348980559507, "grad_norm": 1.333103451681259, "learning_rate": 5.69616925460277e-06, "loss": 0.0481, "step": 1613 }, { "epoch": 1.530583214793741, "grad_norm": 0.6256616011256059, "learning_rate": 5.690701358403119e-06, "loss": 0.0442, "step": 1614 }, { "epoch": 1.5315315315315314, "grad_norm": 0.32612870761505114, "learning_rate": 5.685232619983625e-06, "loss": 0.026, "step": 1615 }, { "epoch": 1.5324798482693218, "grad_norm": 0.3891832320687818, "learning_rate": 5.679763046012697e-06, "loss": 0.0255, "step": 1616 }, { "epoch": 1.5334281650071122, "grad_norm": 0.46453499563730116, "learning_rate": 5.6742926431597645e-06, "loss": 0.0427, "step": 1617 }, { "epoch": 1.5343764817449028, "grad_norm": 0.47783783736140073, "learning_rate": 5.668821418095266e-06, "loss": 0.0421, "step": 1618 }, { "epoch": 1.5353247984826932, "grad_norm": 0.6743992447584107, "learning_rate": 5.663349377490646e-06, "loss": 0.0358, "step": 1619 }, { "epoch": 1.5362731152204836, "grad_norm": 0.4107463403974195, "learning_rate": 5.65787652801834e-06, "loss": 0.0357, "step": 1620 }, { "epoch": 1.5372214319582742, "grad_norm": 0.5248494282764801, "learning_rate": 5.65240287635177e-06, "loss": 0.0344, "step": 1621 }, { "epoch": 1.5381697486960646, "grad_norm": 0.606570796875963, "learning_rate": 5.646928429165339e-06, "loss": 0.0281, "step": 1622 }, { "epoch": 1.539118065433855, "grad_norm": 0.42829215552732597, "learning_rate": 5.641453193134421e-06, "loss": 0.0447, "step": 1623 }, { "epoch": 1.5400663821716454, "grad_norm": 0.5164772119258303, "learning_rate": 5.635977174935343e-06, "loss": 0.029, "step": 1624 }, { "epoch": 1.5410146989094358, "grad_norm": 0.37297442039349055, "learning_rate": 5.630500381245397e-06, "loss": 0.0358, "step": 1625 }, { "epoch": 1.5419630156472262, "grad_norm": 0.3902979566469329, "learning_rate": 5.625022818742812e-06, "loss": 0.0463, "step": 1626 }, { "epoch": 1.5429113323850165, "grad_norm": 0.6496769657487748, "learning_rate": 5.619544494106761e-06, "loss": 0.0474, "step": 1627 }, { "epoch": 1.543859649122807, "grad_norm": 0.4223958965983336, "learning_rate": 5.6140654140173424e-06, "loss": 0.0364, "step": 1628 }, { "epoch": 1.5448079658605973, "grad_norm": 0.43884925035729194, "learning_rate": 5.6085855851555784e-06, "loss": 0.033, "step": 1629 }, { "epoch": 1.5457562825983877, "grad_norm": 0.530824449652907, "learning_rate": 5.603105014203398e-06, "loss": 0.0483, "step": 1630 }, { "epoch": 1.5467045993361783, "grad_norm": 0.4284263908585556, "learning_rate": 5.597623707843643e-06, "loss": 0.0508, "step": 1631 }, { "epoch": 1.5476529160739687, "grad_norm": 0.4535017203711632, "learning_rate": 5.59214167276005e-06, "loss": 0.0377, "step": 1632 }, { "epoch": 1.548601232811759, "grad_norm": 0.45970117181525466, "learning_rate": 5.586658915637239e-06, "loss": 0.0271, "step": 1633 }, { "epoch": 1.5495495495495497, "grad_norm": 0.48042151558541607, "learning_rate": 5.581175443160718e-06, "loss": 0.053, "step": 1634 }, { "epoch": 1.55049786628734, "grad_norm": 0.38033052193121797, "learning_rate": 5.57569126201686e-06, "loss": 0.0413, "step": 1635 }, { "epoch": 1.5514461830251305, "grad_norm": 0.3695377907477157, "learning_rate": 5.570206378892909e-06, "loss": 0.0291, "step": 1636 }, { "epoch": 1.5523944997629209, "grad_norm": 0.354032936247863, "learning_rate": 5.564720800476958e-06, "loss": 0.0422, "step": 1637 }, { "epoch": 1.5533428165007113, "grad_norm": 1.0102411364289114, "learning_rate": 5.559234533457953e-06, "loss": 0.042, "step": 1638 }, { "epoch": 1.5542911332385017, "grad_norm": 0.738120928557875, "learning_rate": 5.553747584525682e-06, "loss": 0.0389, "step": 1639 }, { "epoch": 1.555239449976292, "grad_norm": 0.799209724109593, "learning_rate": 5.548259960370754e-06, "loss": 0.0395, "step": 1640 }, { "epoch": 1.5561877667140824, "grad_norm": 0.4062247593100686, "learning_rate": 5.542771667684612e-06, "loss": 0.0315, "step": 1641 }, { "epoch": 1.5571360834518728, "grad_norm": 0.3308586428988418, "learning_rate": 5.537282713159507e-06, "loss": 0.031, "step": 1642 }, { "epoch": 1.5580844001896632, "grad_norm": 0.6086645010113465, "learning_rate": 5.5317931034885044e-06, "loss": 0.0463, "step": 1643 }, { "epoch": 1.5590327169274538, "grad_norm": 0.4762141993084212, "learning_rate": 5.526302845365461e-06, "loss": 0.0351, "step": 1644 }, { "epoch": 1.5599810336652442, "grad_norm": 0.7153367461924497, "learning_rate": 5.520811945485031e-06, "loss": 0.0489, "step": 1645 }, { "epoch": 1.5609293504030346, "grad_norm": 0.47448305807931646, "learning_rate": 5.515320410542642e-06, "loss": 0.0568, "step": 1646 }, { "epoch": 1.561877667140825, "grad_norm": 0.5897241369310743, "learning_rate": 5.509828247234505e-06, "loss": 0.0489, "step": 1647 }, { "epoch": 1.5628259838786156, "grad_norm": 0.43836735991299125, "learning_rate": 5.5043354622575955e-06, "loss": 0.0276, "step": 1648 }, { "epoch": 1.563774300616406, "grad_norm": 0.37950941420274353, "learning_rate": 5.498842062309643e-06, "loss": 0.0265, "step": 1649 }, { "epoch": 1.5647226173541964, "grad_norm": 0.5616670816349663, "learning_rate": 5.4933480540891295e-06, "loss": 0.0577, "step": 1650 }, { "epoch": 1.5656709340919868, "grad_norm": 0.6568262974602079, "learning_rate": 5.487853444295278e-06, "loss": 0.0466, "step": 1651 }, { "epoch": 1.5666192508297772, "grad_norm": 0.7263693663581391, "learning_rate": 5.482358239628047e-06, "loss": 0.0411, "step": 1652 }, { "epoch": 1.5675675675675675, "grad_norm": 0.5207396419110628, "learning_rate": 5.476862446788118e-06, "loss": 0.0517, "step": 1653 }, { "epoch": 1.568515884305358, "grad_norm": 0.5188917964277431, "learning_rate": 5.471366072476891e-06, "loss": 0.0493, "step": 1654 }, { "epoch": 1.5694642010431483, "grad_norm": 0.4000030591629918, "learning_rate": 5.465869123396474e-06, "loss": 0.0304, "step": 1655 }, { "epoch": 1.5704125177809387, "grad_norm": 0.42697872862878716, "learning_rate": 5.460371606249677e-06, "loss": 0.0338, "step": 1656 }, { "epoch": 1.571360834518729, "grad_norm": 0.4896290919012838, "learning_rate": 5.454873527740002e-06, "loss": 0.0489, "step": 1657 }, { "epoch": 1.5723091512565197, "grad_norm": 0.7554443088523393, "learning_rate": 5.449374894571635e-06, "loss": 0.0391, "step": 1658 }, { "epoch": 1.57325746799431, "grad_norm": 0.5802368408383383, "learning_rate": 5.443875713449439e-06, "loss": 0.0411, "step": 1659 }, { "epoch": 1.5742057847321005, "grad_norm": 0.5018529796661364, "learning_rate": 5.438375991078946e-06, "loss": 0.0416, "step": 1660 }, { "epoch": 1.575154101469891, "grad_norm": 0.3183205147770903, "learning_rate": 5.4328757341663475e-06, "loss": 0.0349, "step": 1661 }, { "epoch": 1.5761024182076815, "grad_norm": 1.011391019460894, "learning_rate": 5.427374949418487e-06, "loss": 0.0337, "step": 1662 }, { "epoch": 1.5770507349454719, "grad_norm": 0.3191949723530378, "learning_rate": 5.421873643542846e-06, "loss": 0.0305, "step": 1663 }, { "epoch": 1.5779990516832623, "grad_norm": 0.634640230734434, "learning_rate": 5.4163718232475525e-06, "loss": 0.0508, "step": 1664 }, { "epoch": 1.5789473684210527, "grad_norm": 0.5852720760242924, "learning_rate": 5.4108694952413546e-06, "loss": 0.0443, "step": 1665 }, { "epoch": 1.579895685158843, "grad_norm": 0.5855319152782336, "learning_rate": 5.4053666662336176e-06, "loss": 0.0327, "step": 1666 }, { "epoch": 1.5808440018966334, "grad_norm": 0.3586298744998486, "learning_rate": 5.399863342934324e-06, "loss": 0.0406, "step": 1667 }, { "epoch": 1.5817923186344238, "grad_norm": 0.49399058191728706, "learning_rate": 5.394359532054054e-06, "loss": 0.0301, "step": 1668 }, { "epoch": 1.5827406353722142, "grad_norm": 1.2375067802560742, "learning_rate": 5.388855240303985e-06, "loss": 0.0701, "step": 1669 }, { "epoch": 1.5836889521100046, "grad_norm": 0.6195169804808017, "learning_rate": 5.38335047439588e-06, "loss": 0.0536, "step": 1670 }, { "epoch": 1.5846372688477952, "grad_norm": 0.4760457301339429, "learning_rate": 5.377845241042077e-06, "loss": 0.0459, "step": 1671 }, { "epoch": 1.5855855855855856, "grad_norm": 0.42237056822248414, "learning_rate": 5.372339546955493e-06, "loss": 0.041, "step": 1672 }, { "epoch": 1.586533902323376, "grad_norm": 0.7588679568473088, "learning_rate": 5.3668333988495955e-06, "loss": 0.0617, "step": 1673 }, { "epoch": 1.5874822190611664, "grad_norm": 0.5129085018035644, "learning_rate": 5.361326803438414e-06, "loss": 0.0466, "step": 1674 }, { "epoch": 1.588430535798957, "grad_norm": 0.35504034518206484, "learning_rate": 5.3558197674365174e-06, "loss": 0.0319, "step": 1675 }, { "epoch": 1.5893788525367474, "grad_norm": 0.4719261955609522, "learning_rate": 5.350312297559018e-06, "loss": 0.0434, "step": 1676 }, { "epoch": 1.5903271692745378, "grad_norm": 0.3948218019688561, "learning_rate": 5.344804400521554e-06, "loss": 0.034, "step": 1677 }, { "epoch": 1.5912754860123282, "grad_norm": 0.3895494909416238, "learning_rate": 5.3392960830402825e-06, "loss": 0.0297, "step": 1678 }, { "epoch": 1.5922238027501185, "grad_norm": 0.4137215707959619, "learning_rate": 5.333787351831875e-06, "loss": 0.0362, "step": 1679 }, { "epoch": 1.593172119487909, "grad_norm": 0.44517521813349287, "learning_rate": 5.328278213613509e-06, "loss": 0.0419, "step": 1680 }, { "epoch": 1.5941204362256993, "grad_norm": 0.46221054971476766, "learning_rate": 5.322768675102857e-06, "loss": 0.0426, "step": 1681 }, { "epoch": 1.5950687529634897, "grad_norm": 0.49217226245635887, "learning_rate": 5.31725874301808e-06, "loss": 0.044, "step": 1682 }, { "epoch": 1.59601706970128, "grad_norm": 0.5613029052207704, "learning_rate": 5.3117484240778176e-06, "loss": 0.0452, "step": 1683 }, { "epoch": 1.5969653864390705, "grad_norm": 0.38744508130227856, "learning_rate": 5.306237725001181e-06, "loss": 0.0346, "step": 1684 }, { "epoch": 1.597913703176861, "grad_norm": 0.6033140558546168, "learning_rate": 5.3007266525077484e-06, "loss": 0.0519, "step": 1685 }, { "epoch": 1.5988620199146515, "grad_norm": 0.3844932046652491, "learning_rate": 5.295215213317549e-06, "loss": 0.0339, "step": 1686 }, { "epoch": 1.5998103366524419, "grad_norm": 0.35948437819579393, "learning_rate": 5.289703414151062e-06, "loss": 0.0324, "step": 1687 }, { "epoch": 1.6007586533902325, "grad_norm": 0.49403545880259037, "learning_rate": 5.284191261729206e-06, "loss": 0.0377, "step": 1688 }, { "epoch": 1.6017069701280229, "grad_norm": 0.3950827081822717, "learning_rate": 5.278678762773326e-06, "loss": 0.0348, "step": 1689 }, { "epoch": 1.6026552868658133, "grad_norm": 0.46679652631135615, "learning_rate": 5.273165924005195e-06, "loss": 0.0354, "step": 1690 }, { "epoch": 1.6036036036036037, "grad_norm": 0.4187949082045406, "learning_rate": 5.267652752146994e-06, "loss": 0.0358, "step": 1691 }, { "epoch": 1.604551920341394, "grad_norm": 0.5390502663154565, "learning_rate": 5.2621392539213185e-06, "loss": 0.0415, "step": 1692 }, { "epoch": 1.6055002370791844, "grad_norm": 0.3749414556008541, "learning_rate": 5.256625436051156e-06, "loss": 0.0297, "step": 1693 }, { "epoch": 1.6064485538169748, "grad_norm": 0.43791392697589476, "learning_rate": 5.251111305259886e-06, "loss": 0.0427, "step": 1694 }, { "epoch": 1.6073968705547652, "grad_norm": 0.4147327877599569, "learning_rate": 5.245596868271265e-06, "loss": 0.0339, "step": 1695 }, { "epoch": 1.6083451872925556, "grad_norm": 0.4367673755602324, "learning_rate": 5.240082131809431e-06, "loss": 0.041, "step": 1696 }, { "epoch": 1.609293504030346, "grad_norm": 0.3920759685283346, "learning_rate": 5.234567102598881e-06, "loss": 0.0393, "step": 1697 }, { "epoch": 1.6102418207681366, "grad_norm": 0.38612730122526456, "learning_rate": 5.229051787364471e-06, "loss": 0.0385, "step": 1698 }, { "epoch": 1.611190137505927, "grad_norm": 0.3810633326641417, "learning_rate": 5.223536192831405e-06, "loss": 0.0413, "step": 1699 }, { "epoch": 1.6121384542437174, "grad_norm": 0.4008625923351243, "learning_rate": 5.21802032572523e-06, "loss": 0.0359, "step": 1700 }, { "epoch": 1.613086770981508, "grad_norm": 0.4980860436464716, "learning_rate": 5.212504192771822e-06, "loss": 0.0443, "step": 1701 }, { "epoch": 1.6140350877192984, "grad_norm": 0.41197267376046887, "learning_rate": 5.206987800697383e-06, "loss": 0.0337, "step": 1702 }, { "epoch": 1.6149834044570888, "grad_norm": 0.4783120794734666, "learning_rate": 5.20147115622843e-06, "loss": 0.0533, "step": 1703 }, { "epoch": 1.6159317211948792, "grad_norm": 0.4038354701384072, "learning_rate": 5.19595426609179e-06, "loss": 0.0414, "step": 1704 }, { "epoch": 1.6168800379326695, "grad_norm": 0.5116662661624387, "learning_rate": 5.1904371370145866e-06, "loss": 0.0433, "step": 1705 }, { "epoch": 1.61782835467046, "grad_norm": 0.3616855724287448, "learning_rate": 5.184919775724236e-06, "loss": 0.0344, "step": 1706 }, { "epoch": 1.6187766714082503, "grad_norm": 0.498168738572277, "learning_rate": 5.179402188948438e-06, "loss": 0.044, "step": 1707 }, { "epoch": 1.6197249881460407, "grad_norm": 0.5241305541833553, "learning_rate": 5.173884383415168e-06, "loss": 0.0496, "step": 1708 }, { "epoch": 1.620673304883831, "grad_norm": 0.6574907730267259, "learning_rate": 5.168366365852666e-06, "loss": 0.045, "step": 1709 }, { "epoch": 1.6216216216216215, "grad_norm": 0.40685958969794783, "learning_rate": 5.162848142989434e-06, "loss": 0.0298, "step": 1710 }, { "epoch": 1.6225699383594119, "grad_norm": 0.3934908581350256, "learning_rate": 5.157329721554218e-06, "loss": 0.0385, "step": 1711 }, { "epoch": 1.6235182550972025, "grad_norm": 0.5479655834153282, "learning_rate": 5.151811108276011e-06, "loss": 0.0414, "step": 1712 }, { "epoch": 1.6244665718349929, "grad_norm": 0.4485734151651903, "learning_rate": 5.146292309884043e-06, "loss": 0.034, "step": 1713 }, { "epoch": 1.6254148885727833, "grad_norm": 0.38005314977033383, "learning_rate": 5.140773333107763e-06, "loss": 0.0319, "step": 1714 }, { "epoch": 1.6263632053105739, "grad_norm": 0.49033527821168377, "learning_rate": 5.13525418467684e-06, "loss": 0.0347, "step": 1715 }, { "epoch": 1.6273115220483643, "grad_norm": 0.4376750362085485, "learning_rate": 5.129734871321153e-06, "loss": 0.0277, "step": 1716 }, { "epoch": 1.6282598387861547, "grad_norm": 0.4508191180466739, "learning_rate": 5.124215399770782e-06, "loss": 0.0298, "step": 1717 }, { "epoch": 1.629208155523945, "grad_norm": 0.5458986876368422, "learning_rate": 5.118695776756001e-06, "loss": 0.045, "step": 1718 }, { "epoch": 1.6301564722617354, "grad_norm": 0.48550938492428214, "learning_rate": 5.113176009007264e-06, "loss": 0.0353, "step": 1719 }, { "epoch": 1.6311047889995258, "grad_norm": 0.5214878006549084, "learning_rate": 5.1076561032552076e-06, "loss": 0.0492, "step": 1720 }, { "epoch": 1.6320531057373162, "grad_norm": 0.3575931662216187, "learning_rate": 5.102136066230634e-06, "loss": 0.0296, "step": 1721 }, { "epoch": 1.6330014224751066, "grad_norm": 0.38118575881201, "learning_rate": 5.096615904664505e-06, "loss": 0.0301, "step": 1722 }, { "epoch": 1.633949739212897, "grad_norm": 0.5525837485507148, "learning_rate": 5.091095625287933e-06, "loss": 0.0476, "step": 1723 }, { "epoch": 1.6348980559506874, "grad_norm": 0.45798268615313587, "learning_rate": 5.085575234832177e-06, "loss": 0.0375, "step": 1724 }, { "epoch": 1.635846372688478, "grad_norm": 0.32084838714691044, "learning_rate": 5.08005474002863e-06, "loss": 0.0288, "step": 1725 }, { "epoch": 1.6367946894262684, "grad_norm": 0.48894734808026186, "learning_rate": 5.074534147608813e-06, "loss": 0.037, "step": 1726 }, { "epoch": 1.6377430061640588, "grad_norm": 0.5863056845718708, "learning_rate": 5.069013464304365e-06, "loss": 0.0467, "step": 1727 }, { "epoch": 1.6386913229018494, "grad_norm": 0.5276793927642295, "learning_rate": 5.063492696847035e-06, "loss": 0.048, "step": 1728 }, { "epoch": 1.6396396396396398, "grad_norm": 0.4500869018064269, "learning_rate": 5.057971851968678e-06, "loss": 0.0346, "step": 1729 }, { "epoch": 1.6405879563774302, "grad_norm": 0.4974245711828883, "learning_rate": 5.05245093640124e-06, "loss": 0.0401, "step": 1730 }, { "epoch": 1.6415362731152205, "grad_norm": 0.5070745888081949, "learning_rate": 5.046929956876755e-06, "loss": 0.0376, "step": 1731 }, { "epoch": 1.642484589853011, "grad_norm": 0.6592711837920248, "learning_rate": 5.0414089201273335e-06, "loss": 0.0337, "step": 1732 }, { "epoch": 1.6434329065908013, "grad_norm": 0.6797373207354449, "learning_rate": 5.035887832885158e-06, "loss": 0.0403, "step": 1733 }, { "epoch": 1.6443812233285917, "grad_norm": 0.4710691221262725, "learning_rate": 5.03036670188247e-06, "loss": 0.0411, "step": 1734 }, { "epoch": 1.645329540066382, "grad_norm": 0.5376307614124892, "learning_rate": 5.024845533851567e-06, "loss": 0.0505, "step": 1735 }, { "epoch": 1.6462778568041725, "grad_norm": 0.5486265059532447, "learning_rate": 5.019324335524787e-06, "loss": 0.0447, "step": 1736 }, { "epoch": 1.6472261735419629, "grad_norm": 0.40758845237104635, "learning_rate": 5.013803113634513e-06, "loss": 0.034, "step": 1737 }, { "epoch": 1.6481744902797535, "grad_norm": 0.6104444776515453, "learning_rate": 5.00828187491315e-06, "loss": 0.0398, "step": 1738 }, { "epoch": 1.6491228070175439, "grad_norm": 0.4307141085331387, "learning_rate": 5.002760626093125e-06, "loss": 0.0366, "step": 1739 }, { "epoch": 1.6500711237553343, "grad_norm": 0.6440031916908964, "learning_rate": 4.997239373906877e-06, "loss": 0.0425, "step": 1740 }, { "epoch": 1.6510194404931247, "grad_norm": 0.5807419766923566, "learning_rate": 4.991718125086851e-06, "loss": 0.0434, "step": 1741 }, { "epoch": 1.6519677572309153, "grad_norm": 0.3942692224681114, "learning_rate": 4.986196886365488e-06, "loss": 0.031, "step": 1742 }, { "epoch": 1.6529160739687057, "grad_norm": 0.4128003706084824, "learning_rate": 4.9806756644752145e-06, "loss": 0.0383, "step": 1743 }, { "epoch": 1.653864390706496, "grad_norm": 0.4085578606989102, "learning_rate": 4.975154466148435e-06, "loss": 0.0261, "step": 1744 }, { "epoch": 1.6548127074442864, "grad_norm": 0.4837103927049282, "learning_rate": 4.969633298117533e-06, "loss": 0.0351, "step": 1745 }, { "epoch": 1.6557610241820768, "grad_norm": 0.7395875455109805, "learning_rate": 4.964112167114844e-06, "loss": 0.039, "step": 1746 }, { "epoch": 1.6567093409198672, "grad_norm": 0.4854107983390489, "learning_rate": 4.958591079872667e-06, "loss": 0.0357, "step": 1747 }, { "epoch": 1.6576576576576576, "grad_norm": 0.3483061645891692, "learning_rate": 4.953070043123247e-06, "loss": 0.035, "step": 1748 }, { "epoch": 1.658605974395448, "grad_norm": 0.37143969392815496, "learning_rate": 4.947549063598761e-06, "loss": 0.0299, "step": 1749 }, { "epoch": 1.6595542911332384, "grad_norm": 0.3749415284322426, "learning_rate": 4.942028148031322e-06, "loss": 0.0389, "step": 1750 }, { "epoch": 1.6605026078710288, "grad_norm": 0.4991282731845152, "learning_rate": 4.936507303152966e-06, "loss": 0.0262, "step": 1751 }, { "epoch": 1.6614509246088194, "grad_norm": 0.3863346846624415, "learning_rate": 4.930986535695636e-06, "loss": 0.0338, "step": 1752 }, { "epoch": 1.6623992413466098, "grad_norm": 0.3863586588308062, "learning_rate": 4.9254658523911885e-06, "loss": 0.0288, "step": 1753 }, { "epoch": 1.6633475580844002, "grad_norm": 0.5587025668116927, "learning_rate": 4.9199452599713715e-06, "loss": 0.0461, "step": 1754 }, { "epoch": 1.6642958748221908, "grad_norm": 0.4497105307702213, "learning_rate": 4.914424765167824e-06, "loss": 0.0385, "step": 1755 }, { "epoch": 1.6652441915599812, "grad_norm": 0.42629532853182683, "learning_rate": 4.908904374712069e-06, "loss": 0.0357, "step": 1756 }, { "epoch": 1.6661925082977715, "grad_norm": 0.3850936464294085, "learning_rate": 4.903384095335497e-06, "loss": 0.0261, "step": 1757 }, { "epoch": 1.667140825035562, "grad_norm": 0.5328143876026115, "learning_rate": 4.8978639337693665e-06, "loss": 0.0318, "step": 1758 }, { "epoch": 1.6680891417733523, "grad_norm": 0.3369899929014371, "learning_rate": 4.892343896744794e-06, "loss": 0.0246, "step": 1759 }, { "epoch": 1.6690374585111427, "grad_norm": 0.6423782785289791, "learning_rate": 4.8868239909927365e-06, "loss": 0.034, "step": 1760 }, { "epoch": 1.669985775248933, "grad_norm": 0.3956672279589752, "learning_rate": 4.881304223244002e-06, "loss": 0.0363, "step": 1761 }, { "epoch": 1.6709340919867235, "grad_norm": 0.3549262283376078, "learning_rate": 4.875784600229219e-06, "loss": 0.0274, "step": 1762 }, { "epoch": 1.6718824087245139, "grad_norm": 0.44033629831242393, "learning_rate": 4.870265128678848e-06, "loss": 0.036, "step": 1763 }, { "epoch": 1.6728307254623043, "grad_norm": 0.38102544990314435, "learning_rate": 4.864745815323162e-06, "loss": 0.0354, "step": 1764 }, { "epoch": 1.6737790422000949, "grad_norm": 0.7336679820913383, "learning_rate": 4.859226666892239e-06, "loss": 0.0372, "step": 1765 }, { "epoch": 1.6747273589378853, "grad_norm": 0.4783768456867206, "learning_rate": 4.8537076901159575e-06, "loss": 0.0319, "step": 1766 }, { "epoch": 1.6756756756756757, "grad_norm": 0.535219503295994, "learning_rate": 4.84818889172399e-06, "loss": 0.0452, "step": 1767 }, { "epoch": 1.676623992413466, "grad_norm": 0.2982755405751966, "learning_rate": 4.8426702784457835e-06, "loss": 0.0235, "step": 1768 }, { "epoch": 1.6775723091512567, "grad_norm": 0.45187225376000945, "learning_rate": 4.8371518570105695e-06, "loss": 0.0332, "step": 1769 }, { "epoch": 1.678520625889047, "grad_norm": 0.5137536616757206, "learning_rate": 4.831633634147335e-06, "loss": 0.0516, "step": 1770 }, { "epoch": 1.6794689426268374, "grad_norm": 0.45838875310802574, "learning_rate": 4.826115616584832e-06, "loss": 0.0401, "step": 1771 }, { "epoch": 1.6804172593646278, "grad_norm": 0.430174715041985, "learning_rate": 4.820597811051563e-06, "loss": 0.0297, "step": 1772 }, { "epoch": 1.6813655761024182, "grad_norm": 0.2962896754783431, "learning_rate": 4.815080224275765e-06, "loss": 0.0252, "step": 1773 }, { "epoch": 1.6823138928402086, "grad_norm": 0.37585418826664413, "learning_rate": 4.809562862985416e-06, "loss": 0.0321, "step": 1774 }, { "epoch": 1.683262209577999, "grad_norm": 0.6008863761423583, "learning_rate": 4.8040457339082115e-06, "loss": 0.0313, "step": 1775 }, { "epoch": 1.6842105263157894, "grad_norm": 0.6656061062299986, "learning_rate": 4.79852884377157e-06, "loss": 0.0642, "step": 1776 }, { "epoch": 1.6851588430535798, "grad_norm": 0.3957049880883251, "learning_rate": 4.793012199302619e-06, "loss": 0.0352, "step": 1777 }, { "epoch": 1.6861071597913702, "grad_norm": 0.5470753034512885, "learning_rate": 4.787495807228179e-06, "loss": 0.0417, "step": 1778 }, { "epoch": 1.6870554765291608, "grad_norm": 0.2999392604451325, "learning_rate": 4.78197967427477e-06, "loss": 0.0246, "step": 1779 }, { "epoch": 1.6880037932669512, "grad_norm": 0.39413640160909796, "learning_rate": 4.776463807168596e-06, "loss": 0.0348, "step": 1780 }, { "epoch": 1.6889521100047415, "grad_norm": 0.4398686088172201, "learning_rate": 4.770948212635531e-06, "loss": 0.0316, "step": 1781 }, { "epoch": 1.6899004267425322, "grad_norm": 0.43305122714845934, "learning_rate": 4.765432897401121e-06, "loss": 0.0317, "step": 1782 }, { "epoch": 1.6908487434803225, "grad_norm": 0.4517409472016299, "learning_rate": 4.7599178681905705e-06, "loss": 0.0413, "step": 1783 }, { "epoch": 1.691797060218113, "grad_norm": 0.5616680977844734, "learning_rate": 4.754403131728736e-06, "loss": 0.043, "step": 1784 }, { "epoch": 1.6927453769559033, "grad_norm": 0.37478799993443473, "learning_rate": 4.748888694740117e-06, "loss": 0.033, "step": 1785 }, { "epoch": 1.6936936936936937, "grad_norm": 0.5182765249765543, "learning_rate": 4.743374563948846e-06, "loss": 0.0412, "step": 1786 }, { "epoch": 1.694642010431484, "grad_norm": 0.5349561090586026, "learning_rate": 4.737860746078682e-06, "loss": 0.036, "step": 1787 }, { "epoch": 1.6955903271692745, "grad_norm": 0.8610126674967102, "learning_rate": 4.7323472478530075e-06, "loss": 0.0615, "step": 1788 }, { "epoch": 1.6965386439070649, "grad_norm": 0.5757179501499256, "learning_rate": 4.726834075994807e-06, "loss": 0.0471, "step": 1789 }, { "epoch": 1.6974869606448553, "grad_norm": 0.5279814169362466, "learning_rate": 4.721321237226676e-06, "loss": 0.0457, "step": 1790 }, { "epoch": 1.6984352773826457, "grad_norm": 0.3926629636024604, "learning_rate": 4.7158087382707955e-06, "loss": 0.045, "step": 1791 }, { "epoch": 1.6993835941204363, "grad_norm": 0.453224341946254, "learning_rate": 4.710296585848938e-06, "loss": 0.037, "step": 1792 }, { "epoch": 1.7003319108582267, "grad_norm": 0.44388438579404543, "learning_rate": 4.704784786682453e-06, "loss": 0.0343, "step": 1793 }, { "epoch": 1.701280227596017, "grad_norm": 0.38919604350005726, "learning_rate": 4.699273347492253e-06, "loss": 0.0235, "step": 1794 }, { "epoch": 1.7022285443338074, "grad_norm": 0.4603473957874724, "learning_rate": 4.693762274998819e-06, "loss": 0.0338, "step": 1795 }, { "epoch": 1.703176861071598, "grad_norm": 0.5169626767035945, "learning_rate": 4.688251575922185e-06, "loss": 0.0414, "step": 1796 }, { "epoch": 1.7041251778093884, "grad_norm": 0.3372120183155561, "learning_rate": 4.682741256981922e-06, "loss": 0.0263, "step": 1797 }, { "epoch": 1.7050734945471788, "grad_norm": 0.42042614991062877, "learning_rate": 4.6772313248971455e-06, "loss": 0.0381, "step": 1798 }, { "epoch": 1.7060218112849692, "grad_norm": 0.3793277130075472, "learning_rate": 4.671721786386492e-06, "loss": 0.0364, "step": 1799 }, { "epoch": 1.7069701280227596, "grad_norm": 0.36173441445946797, "learning_rate": 4.6662126481681255e-06, "loss": 0.027, "step": 1800 }, { "epoch": 1.70791844476055, "grad_norm": 0.42820923235128877, "learning_rate": 4.660703916959719e-06, "loss": 0.0318, "step": 1801 }, { "epoch": 1.7088667614983404, "grad_norm": 0.4465434244504414, "learning_rate": 4.655195599478448e-06, "loss": 0.0313, "step": 1802 }, { "epoch": 1.7098150782361308, "grad_norm": 0.46894391958023623, "learning_rate": 4.649687702440982e-06, "loss": 0.0446, "step": 1803 }, { "epoch": 1.7107633949739212, "grad_norm": 0.37867650726134006, "learning_rate": 4.644180232563484e-06, "loss": 0.0311, "step": 1804 }, { "epoch": 1.7117117117117115, "grad_norm": 0.3702374369015596, "learning_rate": 4.638673196561587e-06, "loss": 0.0289, "step": 1805 }, { "epoch": 1.7126600284495022, "grad_norm": 0.36563753129172316, "learning_rate": 4.633166601150407e-06, "loss": 0.033, "step": 1806 }, { "epoch": 1.7136083451872925, "grad_norm": 0.4200759775460671, "learning_rate": 4.627660453044508e-06, "loss": 0.0315, "step": 1807 }, { "epoch": 1.714556661925083, "grad_norm": 0.4364188122631288, "learning_rate": 4.622154758957923e-06, "loss": 0.0344, "step": 1808 }, { "epoch": 1.7155049786628735, "grad_norm": 0.36606066056529396, "learning_rate": 4.616649525604123e-06, "loss": 0.0285, "step": 1809 }, { "epoch": 1.716453295400664, "grad_norm": 0.5148294395120504, "learning_rate": 4.611144759696016e-06, "loss": 0.0587, "step": 1810 }, { "epoch": 1.7174016121384543, "grad_norm": 0.6881685438316215, "learning_rate": 4.605640467945946e-06, "loss": 0.0598, "step": 1811 }, { "epoch": 1.7183499288762447, "grad_norm": 0.5735474647239276, "learning_rate": 4.600136657065678e-06, "loss": 0.0399, "step": 1812 }, { "epoch": 1.719298245614035, "grad_norm": 0.36219250240066375, "learning_rate": 4.594633333766383e-06, "loss": 0.0343, "step": 1813 }, { "epoch": 1.7202465623518255, "grad_norm": 0.40670988595909385, "learning_rate": 4.589130504758648e-06, "loss": 0.0281, "step": 1814 }, { "epoch": 1.7211948790896159, "grad_norm": 0.43024128435525394, "learning_rate": 4.583628176752448e-06, "loss": 0.0342, "step": 1815 }, { "epoch": 1.7221431958274063, "grad_norm": 0.35903986981532177, "learning_rate": 4.578126356457154e-06, "loss": 0.0267, "step": 1816 }, { "epoch": 1.7230915125651967, "grad_norm": 0.3783152391951721, "learning_rate": 4.572625050581516e-06, "loss": 0.0268, "step": 1817 }, { "epoch": 1.724039829302987, "grad_norm": 0.4612759243512315, "learning_rate": 4.567124265833654e-06, "loss": 0.0314, "step": 1818 }, { "epoch": 1.7249881460407777, "grad_norm": 0.36167166767777, "learning_rate": 4.561624008921054e-06, "loss": 0.0313, "step": 1819 }, { "epoch": 1.725936462778568, "grad_norm": 0.5603215159296359, "learning_rate": 4.556124286550563e-06, "loss": 0.0536, "step": 1820 }, { "epoch": 1.7268847795163584, "grad_norm": 0.3181070277595216, "learning_rate": 4.550625105428367e-06, "loss": 0.0227, "step": 1821 }, { "epoch": 1.727833096254149, "grad_norm": 0.4081175989973651, "learning_rate": 4.545126472260001e-06, "loss": 0.0347, "step": 1822 }, { "epoch": 1.7287814129919394, "grad_norm": 0.4338531761499665, "learning_rate": 4.539628393750324e-06, "loss": 0.0349, "step": 1823 }, { "epoch": 1.7297297297297298, "grad_norm": 0.5070890405906837, "learning_rate": 4.534130876603527e-06, "loss": 0.0352, "step": 1824 }, { "epoch": 1.7306780464675202, "grad_norm": 0.3871814529225249, "learning_rate": 4.5286339275231115e-06, "loss": 0.0285, "step": 1825 }, { "epoch": 1.7316263632053106, "grad_norm": 0.40989108008019104, "learning_rate": 4.523137553211883e-06, "loss": 0.0325, "step": 1826 }, { "epoch": 1.732574679943101, "grad_norm": 0.4366308345846694, "learning_rate": 4.5176417603719555e-06, "loss": 0.0404, "step": 1827 }, { "epoch": 1.7335229966808914, "grad_norm": 0.3764252415741553, "learning_rate": 4.512146555704723e-06, "loss": 0.0333, "step": 1828 }, { "epoch": 1.7344713134186818, "grad_norm": 0.3711934688107721, "learning_rate": 4.506651945910872e-06, "loss": 0.0253, "step": 1829 }, { "epoch": 1.7354196301564722, "grad_norm": 1.0453131397548512, "learning_rate": 4.501157937690359e-06, "loss": 0.02, "step": 1830 }, { "epoch": 1.7363679468942625, "grad_norm": 0.4301932604454958, "learning_rate": 4.495664537742405e-06, "loss": 0.0383, "step": 1831 }, { "epoch": 1.7373162636320532, "grad_norm": 0.35893295448394297, "learning_rate": 4.490171752765494e-06, "loss": 0.0327, "step": 1832 }, { "epoch": 1.7382645803698435, "grad_norm": 0.5133813007167333, "learning_rate": 4.48467958945736e-06, "loss": 0.0296, "step": 1833 }, { "epoch": 1.739212897107634, "grad_norm": 0.43884167593440265, "learning_rate": 4.479188054514971e-06, "loss": 0.031, "step": 1834 }, { "epoch": 1.7401612138454243, "grad_norm": 0.3979319557061671, "learning_rate": 4.47369715463454e-06, "loss": 0.0278, "step": 1835 }, { "epoch": 1.741109530583215, "grad_norm": 0.3781187564948932, "learning_rate": 4.468206896511497e-06, "loss": 0.0273, "step": 1836 }, { "epoch": 1.7420578473210053, "grad_norm": 0.5240765391753767, "learning_rate": 4.462717286840493e-06, "loss": 0.0352, "step": 1837 }, { "epoch": 1.7430061640587957, "grad_norm": 0.3344508789434431, "learning_rate": 4.457228332315391e-06, "loss": 0.0295, "step": 1838 }, { "epoch": 1.743954480796586, "grad_norm": 0.3765289513807636, "learning_rate": 4.451740039629247e-06, "loss": 0.0321, "step": 1839 }, { "epoch": 1.7449027975343765, "grad_norm": 0.605396755278652, "learning_rate": 4.446252415474321e-06, "loss": 0.0618, "step": 1840 }, { "epoch": 1.7458511142721669, "grad_norm": 0.37145283802650364, "learning_rate": 4.440765466542048e-06, "loss": 0.0324, "step": 1841 }, { "epoch": 1.7467994310099573, "grad_norm": 0.34847655213343387, "learning_rate": 4.435279199523043e-06, "loss": 0.0331, "step": 1842 }, { "epoch": 1.7477477477477477, "grad_norm": 0.550675362545679, "learning_rate": 4.429793621107094e-06, "loss": 0.0381, "step": 1843 }, { "epoch": 1.748696064485538, "grad_norm": 0.43671841183507826, "learning_rate": 4.424308737983141e-06, "loss": 0.0427, "step": 1844 }, { "epoch": 1.7496443812233284, "grad_norm": 0.399738947003064, "learning_rate": 4.418824556839284e-06, "loss": 0.0259, "step": 1845 }, { "epoch": 1.750592697961119, "grad_norm": 0.4045286330954736, "learning_rate": 4.413341084362762e-06, "loss": 0.0283, "step": 1846 }, { "epoch": 1.7515410146989094, "grad_norm": 0.596321484570831, "learning_rate": 4.407858327239952e-06, "loss": 0.0416, "step": 1847 }, { "epoch": 1.7524893314366998, "grad_norm": 0.3962890329473834, "learning_rate": 4.402376292156357e-06, "loss": 0.0225, "step": 1848 }, { "epoch": 1.7534376481744904, "grad_norm": 0.6108314208398574, "learning_rate": 4.396894985796603e-06, "loss": 0.0549, "step": 1849 }, { "epoch": 1.7543859649122808, "grad_norm": 0.4926118481779225, "learning_rate": 4.391414414844423e-06, "loss": 0.0352, "step": 1850 }, { "epoch": 1.7553342816500712, "grad_norm": 0.44216328985669257, "learning_rate": 4.385934585982658e-06, "loss": 0.0359, "step": 1851 }, { "epoch": 1.7562825983878616, "grad_norm": 0.5108809743875607, "learning_rate": 4.38045550589324e-06, "loss": 0.0409, "step": 1852 }, { "epoch": 1.757230915125652, "grad_norm": 0.3407129811866031, "learning_rate": 4.374977181257189e-06, "loss": 0.0281, "step": 1853 }, { "epoch": 1.7581792318634424, "grad_norm": 0.5348515455628596, "learning_rate": 4.369499618754606e-06, "loss": 0.0342, "step": 1854 }, { "epoch": 1.7591275486012328, "grad_norm": 0.5279301635319789, "learning_rate": 4.364022825064658e-06, "loss": 0.045, "step": 1855 }, { "epoch": 1.7600758653390232, "grad_norm": 0.5050771561563465, "learning_rate": 4.358546806865581e-06, "loss": 0.0416, "step": 1856 }, { "epoch": 1.7610241820768135, "grad_norm": 0.43174863952207065, "learning_rate": 4.353071570834662e-06, "loss": 0.0317, "step": 1857 }, { "epoch": 1.761972498814604, "grad_norm": 0.41940789267051237, "learning_rate": 4.3475971236482304e-06, "loss": 0.0277, "step": 1858 }, { "epoch": 1.7629208155523945, "grad_norm": 0.44670846590238744, "learning_rate": 4.342123471981663e-06, "loss": 0.0496, "step": 1859 }, { "epoch": 1.763869132290185, "grad_norm": 0.4642364288838703, "learning_rate": 4.336650622509356e-06, "loss": 0.0402, "step": 1860 }, { "epoch": 1.7648174490279753, "grad_norm": 0.37073060927337753, "learning_rate": 4.331178581904735e-06, "loss": 0.0308, "step": 1861 }, { "epoch": 1.7657657657657657, "grad_norm": 0.4435398096512773, "learning_rate": 4.325707356840237e-06, "loss": 0.034, "step": 1862 }, { "epoch": 1.7667140825035563, "grad_norm": 0.4369429326386228, "learning_rate": 4.320236953987304e-06, "loss": 0.0339, "step": 1863 }, { "epoch": 1.7676623992413467, "grad_norm": 0.37836536445740354, "learning_rate": 4.3147673800163744e-06, "loss": 0.0281, "step": 1864 }, { "epoch": 1.768610715979137, "grad_norm": 0.3265793848210341, "learning_rate": 4.309298641596882e-06, "loss": 0.0316, "step": 1865 }, { "epoch": 1.7695590327169275, "grad_norm": 0.32105094423844494, "learning_rate": 4.303830745397231e-06, "loss": 0.0297, "step": 1866 }, { "epoch": 1.7705073494547179, "grad_norm": 0.9731314448776828, "learning_rate": 4.29836369808481e-06, "loss": 0.0496, "step": 1867 }, { "epoch": 1.7714556661925083, "grad_norm": 0.41064791082286095, "learning_rate": 4.292897506325962e-06, "loss": 0.0314, "step": 1868 }, { "epoch": 1.7724039829302987, "grad_norm": 0.41521808764811713, "learning_rate": 4.2874321767859945e-06, "loss": 0.0394, "step": 1869 }, { "epoch": 1.773352299668089, "grad_norm": 0.5429748722960639, "learning_rate": 4.281967716129163e-06, "loss": 0.0409, "step": 1870 }, { "epoch": 1.7743006164058794, "grad_norm": 0.4896073053023119, "learning_rate": 4.276504131018658e-06, "loss": 0.0434, "step": 1871 }, { "epoch": 1.7752489331436698, "grad_norm": 0.45935466151081317, "learning_rate": 4.271041428116608e-06, "loss": 0.0429, "step": 1872 }, { "epoch": 1.7761972498814604, "grad_norm": 0.4608816010569723, "learning_rate": 4.265579614084065e-06, "loss": 0.0329, "step": 1873 }, { "epoch": 1.7771455666192508, "grad_norm": 0.4675738747142554, "learning_rate": 4.260118695580992e-06, "loss": 0.0412, "step": 1874 }, { "epoch": 1.7780938833570412, "grad_norm": 0.5351511581707835, "learning_rate": 4.254658679266268e-06, "loss": 0.0342, "step": 1875 }, { "epoch": 1.7790422000948318, "grad_norm": 0.47923179366593766, "learning_rate": 4.249199571797664e-06, "loss": 0.0354, "step": 1876 }, { "epoch": 1.7799905168326222, "grad_norm": 0.4751740142576013, "learning_rate": 4.243741379831848e-06, "loss": 0.048, "step": 1877 }, { "epoch": 1.7809388335704126, "grad_norm": 0.43079624768664576, "learning_rate": 4.238284110024374e-06, "loss": 0.0383, "step": 1878 }, { "epoch": 1.781887150308203, "grad_norm": 0.4308505818689027, "learning_rate": 4.232827769029663e-06, "loss": 0.0306, "step": 1879 }, { "epoch": 1.7828354670459934, "grad_norm": 0.34173757368848234, "learning_rate": 4.2273723635010075e-06, "loss": 0.0279, "step": 1880 }, { "epoch": 1.7837837837837838, "grad_norm": 0.2974816565902399, "learning_rate": 4.221917900090566e-06, "loss": 0.0245, "step": 1881 }, { "epoch": 1.7847321005215742, "grad_norm": 0.429383794979146, "learning_rate": 4.216464385449335e-06, "loss": 0.0452, "step": 1882 }, { "epoch": 1.7856804172593645, "grad_norm": 0.37510369228807144, "learning_rate": 4.211011826227168e-06, "loss": 0.0313, "step": 1883 }, { "epoch": 1.786628733997155, "grad_norm": 0.5164577689856673, "learning_rate": 4.205560229072742e-06, "loss": 0.0406, "step": 1884 }, { "epoch": 1.7875770507349453, "grad_norm": 0.396145933769621, "learning_rate": 4.200109600633567e-06, "loss": 0.0442, "step": 1885 }, { "epoch": 1.788525367472736, "grad_norm": 0.4024500110736181, "learning_rate": 4.1946599475559724e-06, "loss": 0.0395, "step": 1886 }, { "epoch": 1.7894736842105263, "grad_norm": 0.4541562531386443, "learning_rate": 4.189211276485093e-06, "loss": 0.0343, "step": 1887 }, { "epoch": 1.7904220009483167, "grad_norm": 0.33885170624784317, "learning_rate": 4.183763594064874e-06, "loss": 0.027, "step": 1888 }, { "epoch": 1.791370317686107, "grad_norm": 0.7885135242857828, "learning_rate": 4.1783169069380445e-06, "loss": 0.0529, "step": 1889 }, { "epoch": 1.7923186344238977, "grad_norm": 1.5910177901330982, "learning_rate": 4.172871221746132e-06, "loss": 0.0693, "step": 1890 }, { "epoch": 1.793266951161688, "grad_norm": 0.3613904128403713, "learning_rate": 4.167426545129435e-06, "loss": 0.0317, "step": 1891 }, { "epoch": 1.7942152678994785, "grad_norm": 0.4949823076890155, "learning_rate": 4.16198288372702e-06, "loss": 0.0385, "step": 1892 }, { "epoch": 1.7951635846372689, "grad_norm": 0.5873559373413337, "learning_rate": 4.156540244176722e-06, "loss": 0.0434, "step": 1893 }, { "epoch": 1.7961119013750593, "grad_norm": 0.7461574029871659, "learning_rate": 4.151098633115129e-06, "loss": 0.0556, "step": 1894 }, { "epoch": 1.7970602181128497, "grad_norm": 0.4822043358893843, "learning_rate": 4.145658057177569e-06, "loss": 0.0323, "step": 1895 }, { "epoch": 1.79800853485064, "grad_norm": 0.8738249467892109, "learning_rate": 4.1402185229981155e-06, "loss": 0.0428, "step": 1896 }, { "epoch": 1.7989568515884304, "grad_norm": 0.5198052374533522, "learning_rate": 4.134780037209563e-06, "loss": 0.0399, "step": 1897 }, { "epoch": 1.7999051683262208, "grad_norm": 0.6244484104917516, "learning_rate": 4.129342606443436e-06, "loss": 0.0396, "step": 1898 }, { "epoch": 1.8008534850640112, "grad_norm": 0.6966474793382613, "learning_rate": 4.123906237329971e-06, "loss": 0.0383, "step": 1899 }, { "epoch": 1.8018018018018018, "grad_norm": 0.8286265100115936, "learning_rate": 4.118470936498105e-06, "loss": 0.0503, "step": 1900 }, { "epoch": 1.8027501185395922, "grad_norm": 0.6974574681974006, "learning_rate": 4.113036710575476e-06, "loss": 0.0395, "step": 1901 }, { "epoch": 1.8036984352773826, "grad_norm": 0.448930049036162, "learning_rate": 4.107603566188412e-06, "loss": 0.0377, "step": 1902 }, { "epoch": 1.8046467520151732, "grad_norm": 0.4293294488172685, "learning_rate": 4.102171509961919e-06, "loss": 0.0366, "step": 1903 }, { "epoch": 1.8055950687529636, "grad_norm": 0.5127412423653616, "learning_rate": 4.096740548519681e-06, "loss": 0.0486, "step": 1904 }, { "epoch": 1.806543385490754, "grad_norm": 0.5189142966976821, "learning_rate": 4.091310688484041e-06, "loss": 0.0385, "step": 1905 }, { "epoch": 1.8074917022285444, "grad_norm": 0.41364900033387003, "learning_rate": 4.085881936476005e-06, "loss": 0.0288, "step": 1906 }, { "epoch": 1.8084400189663348, "grad_norm": 0.687884255688505, "learning_rate": 4.080454299115224e-06, "loss": 0.0319, "step": 1907 }, { "epoch": 1.8093883357041252, "grad_norm": 0.5197620472106413, "learning_rate": 4.075027783019989e-06, "loss": 0.0352, "step": 1908 }, { "epoch": 1.8103366524419156, "grad_norm": 0.42525429976878526, "learning_rate": 4.0696023948072274e-06, "loss": 0.0408, "step": 1909 }, { "epoch": 1.811284969179706, "grad_norm": 0.5828270165546796, "learning_rate": 4.064178141092491e-06, "loss": 0.0335, "step": 1910 }, { "epoch": 1.8122332859174963, "grad_norm": 0.4343714216988695, "learning_rate": 4.058755028489945e-06, "loss": 0.0257, "step": 1911 }, { "epoch": 1.8131816026552867, "grad_norm": 0.5099005671458844, "learning_rate": 4.053333063612365e-06, "loss": 0.0382, "step": 1912 }, { "epoch": 1.8141299193930773, "grad_norm": 0.47310374354497153, "learning_rate": 4.0479122530711255e-06, "loss": 0.0386, "step": 1913 }, { "epoch": 1.8150782361308677, "grad_norm": 0.3880372722401674, "learning_rate": 4.042492603476197e-06, "loss": 0.0318, "step": 1914 }, { "epoch": 1.816026552868658, "grad_norm": 0.5630916853522246, "learning_rate": 4.037074121436135e-06, "loss": 0.0421, "step": 1915 }, { "epoch": 1.8169748696064487, "grad_norm": 0.4050061779858133, "learning_rate": 4.031656813558063e-06, "loss": 0.0218, "step": 1916 }, { "epoch": 1.817923186344239, "grad_norm": 0.5487831061447199, "learning_rate": 4.0262406864476816e-06, "loss": 0.0414, "step": 1917 }, { "epoch": 1.8188715030820295, "grad_norm": 0.4118953389358272, "learning_rate": 4.020825746709249e-06, "loss": 0.0275, "step": 1918 }, { "epoch": 1.8198198198198199, "grad_norm": 0.3646881365149475, "learning_rate": 4.015412000945573e-06, "loss": 0.0306, "step": 1919 }, { "epoch": 1.8207681365576103, "grad_norm": 0.36407786342534326, "learning_rate": 4.009999455758011e-06, "loss": 0.032, "step": 1920 }, { "epoch": 1.8217164532954007, "grad_norm": 0.6322503860720039, "learning_rate": 4.004588117746447e-06, "loss": 0.0305, "step": 1921 }, { "epoch": 1.822664770033191, "grad_norm": 0.35424208882329783, "learning_rate": 3.999177993509303e-06, "loss": 0.0296, "step": 1922 }, { "epoch": 1.8236130867709814, "grad_norm": 0.5037021770720644, "learning_rate": 3.9937690896435195e-06, "loss": 0.0456, "step": 1923 }, { "epoch": 1.8245614035087718, "grad_norm": 0.28275182943356875, "learning_rate": 3.98836141274454e-06, "loss": 0.0254, "step": 1924 }, { "epoch": 1.8255097202465622, "grad_norm": 1.0874197920409632, "learning_rate": 3.982954969406322e-06, "loss": 0.0378, "step": 1925 }, { "epoch": 1.8264580369843528, "grad_norm": 0.39797960479767225, "learning_rate": 3.977549766221316e-06, "loss": 0.0329, "step": 1926 }, { "epoch": 1.8274063537221432, "grad_norm": 0.4589297830996474, "learning_rate": 3.972145809780457e-06, "loss": 0.0341, "step": 1927 }, { "epoch": 1.8283546704599336, "grad_norm": 0.6342219517636322, "learning_rate": 3.966743106673165e-06, "loss": 0.0287, "step": 1928 }, { "epoch": 1.829302987197724, "grad_norm": 0.5386132100170457, "learning_rate": 3.961341663487324e-06, "loss": 0.0427, "step": 1929 }, { "epoch": 1.8302513039355146, "grad_norm": 0.5080057913810255, "learning_rate": 3.95594148680929e-06, "loss": 0.0355, "step": 1930 }, { "epoch": 1.831199620673305, "grad_norm": 0.3669532593099998, "learning_rate": 3.950542583223871e-06, "loss": 0.0336, "step": 1931 }, { "epoch": 1.8321479374110954, "grad_norm": 0.4557500825827911, "learning_rate": 3.945144959314322e-06, "loss": 0.0449, "step": 1932 }, { "epoch": 1.8330962541488858, "grad_norm": 0.41433118107831046, "learning_rate": 3.939748621662339e-06, "loss": 0.0394, "step": 1933 }, { "epoch": 1.8340445708866762, "grad_norm": 0.36284629747376757, "learning_rate": 3.93435357684805e-06, "loss": 0.0324, "step": 1934 }, { "epoch": 1.8349928876244666, "grad_norm": 0.6488793981008372, "learning_rate": 3.928959831450003e-06, "loss": 0.036, "step": 1935 }, { "epoch": 1.835941204362257, "grad_norm": 0.4072871196865869, "learning_rate": 3.923567392045167e-06, "loss": 0.0361, "step": 1936 }, { "epoch": 1.8368895211000473, "grad_norm": 0.5995330166879924, "learning_rate": 3.918176265208912e-06, "loss": 0.047, "step": 1937 }, { "epoch": 1.8378378378378377, "grad_norm": 0.5742511704396418, "learning_rate": 3.912786457515013e-06, "loss": 0.0368, "step": 1938 }, { "epoch": 1.838786154575628, "grad_norm": 0.37173427951758653, "learning_rate": 3.907397975535637e-06, "loss": 0.0295, "step": 1939 }, { "epoch": 1.8397344713134187, "grad_norm": 0.4283387112035294, "learning_rate": 3.902010825841328e-06, "loss": 0.0418, "step": 1940 }, { "epoch": 1.840682788051209, "grad_norm": 0.42658758128230867, "learning_rate": 3.896625015001011e-06, "loss": 0.0307, "step": 1941 }, { "epoch": 1.8416311047889995, "grad_norm": 0.4103737401087204, "learning_rate": 3.891240549581979e-06, "loss": 0.0466, "step": 1942 }, { "epoch": 1.84257942152679, "grad_norm": 0.5179813798652112, "learning_rate": 3.885857436149879e-06, "loss": 0.032, "step": 1943 }, { "epoch": 1.8435277382645805, "grad_norm": 0.36142149152719033, "learning_rate": 3.880475681268716e-06, "loss": 0.025, "step": 1944 }, { "epoch": 1.8444760550023709, "grad_norm": 0.38988766477546516, "learning_rate": 3.8750952915008315e-06, "loss": 0.0266, "step": 1945 }, { "epoch": 1.8454243717401613, "grad_norm": 0.4047589561085592, "learning_rate": 3.869716273406908e-06, "loss": 0.0291, "step": 1946 }, { "epoch": 1.8463726884779517, "grad_norm": 0.5591559055271785, "learning_rate": 3.864338633545956e-06, "loss": 0.0281, "step": 1947 }, { "epoch": 1.847321005215742, "grad_norm": 0.44542403886367465, "learning_rate": 3.8589623784752985e-06, "loss": 0.0407, "step": 1948 }, { "epoch": 1.8482693219535324, "grad_norm": 0.45876612433553804, "learning_rate": 3.853587514750579e-06, "loss": 0.0315, "step": 1949 }, { "epoch": 1.8492176386913228, "grad_norm": 0.3111868531988744, "learning_rate": 3.848214048925736e-06, "loss": 0.0277, "step": 1950 }, { "epoch": 1.8501659554291132, "grad_norm": 0.4353007357835014, "learning_rate": 3.842841987553007e-06, "loss": 0.0319, "step": 1951 }, { "epoch": 1.8511142721669036, "grad_norm": 0.5289123449785142, "learning_rate": 3.837471337182923e-06, "loss": 0.0318, "step": 1952 }, { "epoch": 1.8520625889046942, "grad_norm": 0.5868521892114408, "learning_rate": 3.832102104364281e-06, "loss": 0.0344, "step": 1953 }, { "epoch": 1.8530109056424846, "grad_norm": 0.5617353335767225, "learning_rate": 3.826734295644163e-06, "loss": 0.0352, "step": 1954 }, { "epoch": 1.853959222380275, "grad_norm": 0.5789507471320139, "learning_rate": 3.821367917567908e-06, "loss": 0.0529, "step": 1955 }, { "epoch": 1.8549075391180654, "grad_norm": 0.4921075321136551, "learning_rate": 3.816002976679107e-06, "loss": 0.0279, "step": 1956 }, { "epoch": 1.855855855855856, "grad_norm": 0.4600403890394494, "learning_rate": 3.8106394795196087e-06, "loss": 0.0317, "step": 1957 }, { "epoch": 1.8568041725936464, "grad_norm": 0.6494450216853208, "learning_rate": 3.80527743262949e-06, "loss": 0.0501, "step": 1958 }, { "epoch": 1.8577524893314368, "grad_norm": 0.3991594068949773, "learning_rate": 3.799916842547068e-06, "loss": 0.0337, "step": 1959 }, { "epoch": 1.8587008060692272, "grad_norm": 0.6181340333450788, "learning_rate": 3.7945577158088793e-06, "loss": 0.0364, "step": 1960 }, { "epoch": 1.8596491228070176, "grad_norm": 0.37075530743846147, "learning_rate": 3.7892000589496773e-06, "loss": 0.0246, "step": 1961 }, { "epoch": 1.860597439544808, "grad_norm": 0.5054091368468414, "learning_rate": 3.7838438785024216e-06, "loss": 0.0379, "step": 1962 }, { "epoch": 1.8615457562825983, "grad_norm": 0.44895407969120477, "learning_rate": 3.7784891809982767e-06, "loss": 0.0352, "step": 1963 }, { "epoch": 1.8624940730203887, "grad_norm": 0.35070271043852363, "learning_rate": 3.773135972966589e-06, "loss": 0.032, "step": 1964 }, { "epoch": 1.863442389758179, "grad_norm": 0.9903843261764405, "learning_rate": 3.7677842609349e-06, "loss": 0.0365, "step": 1965 }, { "epoch": 1.8643907064959695, "grad_norm": 0.48477176290481, "learning_rate": 3.762434051428918e-06, "loss": 0.0323, "step": 1966 }, { "epoch": 1.86533902323376, "grad_norm": 0.39374562548602104, "learning_rate": 3.7570853509725234e-06, "loss": 0.0302, "step": 1967 }, { "epoch": 1.8662873399715505, "grad_norm": 0.47578607994744204, "learning_rate": 3.7517381660877587e-06, "loss": 0.0431, "step": 1968 }, { "epoch": 1.8672356567093409, "grad_norm": 0.5424376735264428, "learning_rate": 3.7463925032948114e-06, "loss": 0.0487, "step": 1969 }, { "epoch": 1.8681839734471315, "grad_norm": 0.44176826832585336, "learning_rate": 3.741048369112019e-06, "loss": 0.0389, "step": 1970 }, { "epoch": 1.8691322901849219, "grad_norm": 0.4390592738444192, "learning_rate": 3.735705770055855e-06, "loss": 0.0412, "step": 1971 }, { "epoch": 1.8700806069227123, "grad_norm": 0.4183210896360201, "learning_rate": 3.7303647126409153e-06, "loss": 0.0366, "step": 1972 }, { "epoch": 1.8710289236605027, "grad_norm": 0.6065333984388325, "learning_rate": 3.7250252033799243e-06, "loss": 0.0467, "step": 1973 }, { "epoch": 1.871977240398293, "grad_norm": 0.518349410639526, "learning_rate": 3.71968724878371e-06, "loss": 0.041, "step": 1974 }, { "epoch": 1.8729255571360834, "grad_norm": 0.43597774812646534, "learning_rate": 3.714350855361212e-06, "loss": 0.0381, "step": 1975 }, { "epoch": 1.8738738738738738, "grad_norm": 0.5067159246242335, "learning_rate": 3.709016029619461e-06, "loss": 0.0482, "step": 1976 }, { "epoch": 1.8748221906116642, "grad_norm": 0.34210931197001854, "learning_rate": 3.703682778063581e-06, "loss": 0.0303, "step": 1977 }, { "epoch": 1.8757705073494546, "grad_norm": 0.42520698759519004, "learning_rate": 3.69835110719677e-06, "loss": 0.0445, "step": 1978 }, { "epoch": 1.876718824087245, "grad_norm": 0.4230721150319016, "learning_rate": 3.6930210235203067e-06, "loss": 0.0283, "step": 1979 }, { "epoch": 1.8776671408250356, "grad_norm": 0.42411343718392436, "learning_rate": 3.6876925335335255e-06, "loss": 0.03, "step": 1980 }, { "epoch": 1.878615457562826, "grad_norm": 0.616820313190297, "learning_rate": 3.6823656437338267e-06, "loss": 0.0374, "step": 1981 }, { "epoch": 1.8795637743006164, "grad_norm": 0.41781910861393473, "learning_rate": 3.6770403606166514e-06, "loss": 0.0304, "step": 1982 }, { "epoch": 1.8805120910384068, "grad_norm": 0.5860766950716109, "learning_rate": 3.6717166906754843e-06, "loss": 0.0404, "step": 1983 }, { "epoch": 1.8814604077761974, "grad_norm": 0.7252531827073903, "learning_rate": 3.666394640401848e-06, "loss": 0.0312, "step": 1984 }, { "epoch": 1.8824087245139878, "grad_norm": 0.553908681165879, "learning_rate": 3.6610742162852807e-06, "loss": 0.0452, "step": 1985 }, { "epoch": 1.8833570412517782, "grad_norm": 0.30665698348068665, "learning_rate": 3.655755424813346e-06, "loss": 0.0229, "step": 1986 }, { "epoch": 1.8843053579895686, "grad_norm": 0.6570412463785198, "learning_rate": 3.6504382724716137e-06, "loss": 0.0316, "step": 1987 }, { "epoch": 1.885253674727359, "grad_norm": 0.5144282052416823, "learning_rate": 3.6451227657436517e-06, "loss": 0.0349, "step": 1988 }, { "epoch": 1.8862019914651493, "grad_norm": 0.4071279928061887, "learning_rate": 3.639808911111028e-06, "loss": 0.0285, "step": 1989 }, { "epoch": 1.8871503082029397, "grad_norm": 0.5358728007249505, "learning_rate": 3.634496715053288e-06, "loss": 0.0351, "step": 1990 }, { "epoch": 1.88809862494073, "grad_norm": 0.368200617097554, "learning_rate": 3.6291861840479613e-06, "loss": 0.0298, "step": 1991 }, { "epoch": 1.8890469416785205, "grad_norm": 0.4507922059461737, "learning_rate": 3.6238773245705483e-06, "loss": 0.0375, "step": 1992 }, { "epoch": 1.8899952584163109, "grad_norm": 0.4081128435721486, "learning_rate": 3.6185701430945032e-06, "loss": 0.0383, "step": 1993 }, { "epoch": 1.8909435751541015, "grad_norm": 0.33297226361322074, "learning_rate": 3.613264646091239e-06, "loss": 0.0286, "step": 1994 }, { "epoch": 1.8918918918918919, "grad_norm": 0.53979762982992, "learning_rate": 3.6079608400301182e-06, "loss": 0.0358, "step": 1995 }, { "epoch": 1.8928402086296823, "grad_norm": 0.4090634897905571, "learning_rate": 3.6026587313784325e-06, "loss": 0.0319, "step": 1996 }, { "epoch": 1.8937885253674729, "grad_norm": 0.46149770151838015, "learning_rate": 3.597358326601413e-06, "loss": 0.0451, "step": 1997 }, { "epoch": 1.8947368421052633, "grad_norm": 0.3639527520811153, "learning_rate": 3.592059632162207e-06, "loss": 0.0334, "step": 1998 }, { "epoch": 1.8956851588430537, "grad_norm": 0.3891264328147345, "learning_rate": 3.5867626545218786e-06, "loss": 0.0352, "step": 1999 }, { "epoch": 1.896633475580844, "grad_norm": 0.5597673879815559, "learning_rate": 3.5814674001394007e-06, "loss": 0.0363, "step": 2000 }, { "epoch": 1.8975817923186344, "grad_norm": 0.3685996313252056, "learning_rate": 3.576173875471638e-06, "loss": 0.0326, "step": 2001 }, { "epoch": 1.8985301090564248, "grad_norm": 0.787312091428569, "learning_rate": 3.5708820869733552e-06, "loss": 0.0419, "step": 2002 }, { "epoch": 1.8994784257942152, "grad_norm": 0.3830970434240292, "learning_rate": 3.565592041097191e-06, "loss": 0.0284, "step": 2003 }, { "epoch": 1.9004267425320056, "grad_norm": 0.33426646878917393, "learning_rate": 3.5603037442936672e-06, "loss": 0.0297, "step": 2004 }, { "epoch": 1.901375059269796, "grad_norm": 0.33746177626737933, "learning_rate": 3.5550172030111687e-06, "loss": 0.0223, "step": 2005 }, { "epoch": 1.9023233760075864, "grad_norm": 0.44722672220400045, "learning_rate": 3.5497324236959363e-06, "loss": 0.0321, "step": 2006 }, { "epoch": 1.903271692745377, "grad_norm": 0.4834586259648445, "learning_rate": 3.5444494127920694e-06, "loss": 0.0335, "step": 2007 }, { "epoch": 1.9042200094831674, "grad_norm": 0.38059120675515146, "learning_rate": 3.5391681767415093e-06, "loss": 0.0287, "step": 2008 }, { "epoch": 1.9051683262209578, "grad_norm": 0.3075997620618822, "learning_rate": 3.533888721984029e-06, "loss": 0.0278, "step": 2009 }, { "epoch": 1.9061166429587484, "grad_norm": 0.40917618286352053, "learning_rate": 3.5286110549572337e-06, "loss": 0.0357, "step": 2010 }, { "epoch": 1.9070649596965388, "grad_norm": 0.42430230207581826, "learning_rate": 3.5233351820965445e-06, "loss": 0.0502, "step": 2011 }, { "epoch": 1.9080132764343292, "grad_norm": 0.3515137768052769, "learning_rate": 3.518061109835199e-06, "loss": 0.0315, "step": 2012 }, { "epoch": 1.9089615931721196, "grad_norm": 0.5095722782417215, "learning_rate": 3.51278884460424e-06, "loss": 0.0354, "step": 2013 }, { "epoch": 1.90990990990991, "grad_norm": 0.4270657425513164, "learning_rate": 3.5075183928325018e-06, "loss": 0.0368, "step": 2014 }, { "epoch": 1.9108582266477003, "grad_norm": 0.3808378401797508, "learning_rate": 3.5022497609466086e-06, "loss": 0.0291, "step": 2015 }, { "epoch": 1.9118065433854907, "grad_norm": 0.4100158806633352, "learning_rate": 3.4969829553709715e-06, "loss": 0.0349, "step": 2016 }, { "epoch": 1.912754860123281, "grad_norm": 0.5809676625893341, "learning_rate": 3.4917179825277652e-06, "loss": 0.04, "step": 2017 }, { "epoch": 1.9137031768610715, "grad_norm": 0.3906675617466994, "learning_rate": 3.4864548488369385e-06, "loss": 0.0355, "step": 2018 }, { "epoch": 1.9146514935988619, "grad_norm": 0.4223285103973374, "learning_rate": 3.4811935607161907e-06, "loss": 0.0271, "step": 2019 }, { "epoch": 1.9155998103366523, "grad_norm": 0.37651968514215806, "learning_rate": 3.4759341245809754e-06, "loss": 0.0261, "step": 2020 }, { "epoch": 1.9165481270744429, "grad_norm": 0.35511518960123745, "learning_rate": 3.470676546844486e-06, "loss": 0.0268, "step": 2021 }, { "epoch": 1.9174964438122333, "grad_norm": 0.6409970088958556, "learning_rate": 3.4654208339176475e-06, "loss": 0.04, "step": 2022 }, { "epoch": 1.9184447605500237, "grad_norm": 0.4635735551631786, "learning_rate": 3.460166992209115e-06, "loss": 0.0292, "step": 2023 }, { "epoch": 1.9193930772878143, "grad_norm": 0.38062200204638524, "learning_rate": 3.4549150281252635e-06, "loss": 0.0295, "step": 2024 }, { "epoch": 1.9203413940256047, "grad_norm": 0.6007637897320102, "learning_rate": 3.4496649480701717e-06, "loss": 0.0416, "step": 2025 }, { "epoch": 1.921289710763395, "grad_norm": 0.36191314067497893, "learning_rate": 3.4444167584456257e-06, "loss": 0.0315, "step": 2026 }, { "epoch": 1.9222380275011854, "grad_norm": 0.3231541173926272, "learning_rate": 3.439170465651104e-06, "loss": 0.0312, "step": 2027 }, { "epoch": 1.9231863442389758, "grad_norm": 0.38583478675562105, "learning_rate": 3.433926076083774e-06, "loss": 0.0385, "step": 2028 }, { "epoch": 1.9241346609767662, "grad_norm": 0.36409102702107604, "learning_rate": 3.4286835961384853e-06, "loss": 0.0315, "step": 2029 }, { "epoch": 1.9250829777145566, "grad_norm": 0.36312525560443826, "learning_rate": 3.4234430322077517e-06, "loss": 0.0253, "step": 2030 }, { "epoch": 1.926031294452347, "grad_norm": 0.33289279097003616, "learning_rate": 3.418204390681754e-06, "loss": 0.0256, "step": 2031 }, { "epoch": 1.9269796111901374, "grad_norm": 0.5320939559829481, "learning_rate": 3.4129676779483344e-06, "loss": 0.0281, "step": 2032 }, { "epoch": 1.9279279279279278, "grad_norm": 0.45005546438627286, "learning_rate": 3.407732900392973e-06, "loss": 0.0416, "step": 2033 }, { "epoch": 1.9288762446657184, "grad_norm": 0.409851104412246, "learning_rate": 3.402500064398798e-06, "loss": 0.0315, "step": 2034 }, { "epoch": 1.9298245614035088, "grad_norm": 0.3697105553298173, "learning_rate": 3.397269176346566e-06, "loss": 0.0235, "step": 2035 }, { "epoch": 1.9307728781412992, "grad_norm": 0.3938602684566997, "learning_rate": 3.3920402426146613e-06, "loss": 0.0307, "step": 2036 }, { "epoch": 1.9317211948790898, "grad_norm": 0.45578802286334, "learning_rate": 3.3868132695790856e-06, "loss": 0.0374, "step": 2037 }, { "epoch": 1.9326695116168802, "grad_norm": 0.36256121198136504, "learning_rate": 3.3815882636134423e-06, "loss": 0.033, "step": 2038 }, { "epoch": 1.9336178283546706, "grad_norm": 0.9568492163330367, "learning_rate": 3.3763652310889454e-06, "loss": 0.0425, "step": 2039 }, { "epoch": 1.934566145092461, "grad_norm": 0.3815862735223731, "learning_rate": 3.3711441783744014e-06, "loss": 0.0258, "step": 2040 }, { "epoch": 1.9355144618302513, "grad_norm": 0.5667383697238997, "learning_rate": 3.3659251118361955e-06, "loss": 0.0303, "step": 2041 }, { "epoch": 1.9364627785680417, "grad_norm": 0.6867359992131764, "learning_rate": 3.3607080378383006e-06, "loss": 0.0478, "step": 2042 }, { "epoch": 1.937411095305832, "grad_norm": 0.4296690617166379, "learning_rate": 3.3554929627422494e-06, "loss": 0.0333, "step": 2043 }, { "epoch": 1.9383594120436225, "grad_norm": 0.5162976443262678, "learning_rate": 3.3502798929071454e-06, "loss": 0.0554, "step": 2044 }, { "epoch": 1.9393077287814129, "grad_norm": 0.3368621100557327, "learning_rate": 3.345068834689649e-06, "loss": 0.0324, "step": 2045 }, { "epoch": 1.9402560455192033, "grad_norm": 0.37365269979182164, "learning_rate": 3.3398597944439554e-06, "loss": 0.0338, "step": 2046 }, { "epoch": 1.9412043622569939, "grad_norm": 0.5681446998189141, "learning_rate": 3.334652778521813e-06, "loss": 0.0411, "step": 2047 }, { "epoch": 1.9421526789947843, "grad_norm": 0.4740696492249626, "learning_rate": 3.3294477932724946e-06, "loss": 0.045, "step": 2048 }, { "epoch": 1.9431009957325747, "grad_norm": 0.3608937786548174, "learning_rate": 3.3242448450427945e-06, "loss": 0.0332, "step": 2049 }, { "epoch": 1.944049312470365, "grad_norm": 0.38506053572775895, "learning_rate": 3.319043940177031e-06, "loss": 0.028, "step": 2050 }, { "epoch": 1.9449976292081557, "grad_norm": 0.5698067483625857, "learning_rate": 3.3138450850170227e-06, "loss": 0.041, "step": 2051 }, { "epoch": 1.945945945945946, "grad_norm": 0.4420338241665105, "learning_rate": 3.3086482859020957e-06, "loss": 0.0373, "step": 2052 }, { "epoch": 1.9468942626837364, "grad_norm": 0.4953262443173467, "learning_rate": 3.3034535491690654e-06, "loss": 0.051, "step": 2053 }, { "epoch": 1.9478425794215268, "grad_norm": 0.4852410535068405, "learning_rate": 3.2982608811522306e-06, "loss": 0.0274, "step": 2054 }, { "epoch": 1.9487908961593172, "grad_norm": 0.6029040593040903, "learning_rate": 3.293070288183371e-06, "loss": 0.0532, "step": 2055 }, { "epoch": 1.9497392128971076, "grad_norm": 0.45973934841106084, "learning_rate": 3.287881776591737e-06, "loss": 0.0405, "step": 2056 }, { "epoch": 1.950687529634898, "grad_norm": 0.7309917431565905, "learning_rate": 3.282695352704036e-06, "loss": 0.0426, "step": 2057 }, { "epoch": 1.9516358463726884, "grad_norm": 0.33625680044079603, "learning_rate": 3.277511022844436e-06, "loss": 0.028, "step": 2058 }, { "epoch": 1.9525841631104788, "grad_norm": 0.8671444099283062, "learning_rate": 3.2723287933345426e-06, "loss": 0.0417, "step": 2059 }, { "epoch": 1.9535324798482692, "grad_norm": 0.4342735760080419, "learning_rate": 3.267148670493411e-06, "loss": 0.0348, "step": 2060 }, { "epoch": 1.9544807965860598, "grad_norm": 0.35268469905572575, "learning_rate": 3.2619706606375216e-06, "loss": 0.027, "step": 2061 }, { "epoch": 1.9554291133238502, "grad_norm": 0.3845496094181825, "learning_rate": 3.256794770080778e-06, "loss": 0.0318, "step": 2062 }, { "epoch": 1.9563774300616406, "grad_norm": 0.38361703791786117, "learning_rate": 3.2516210051345023e-06, "loss": 0.0371, "step": 2063 }, { "epoch": 1.9573257467994312, "grad_norm": 0.563671322887721, "learning_rate": 3.246449372107422e-06, "loss": 0.0362, "step": 2064 }, { "epoch": 1.9582740635372216, "grad_norm": 0.42306475584040243, "learning_rate": 3.2412798773056653e-06, "loss": 0.0339, "step": 2065 }, { "epoch": 1.959222380275012, "grad_norm": 0.40482383621032814, "learning_rate": 3.2361125270327578e-06, "loss": 0.0347, "step": 2066 }, { "epoch": 1.9601706970128023, "grad_norm": 0.44088167927957383, "learning_rate": 3.230947327589602e-06, "loss": 0.0384, "step": 2067 }, { "epoch": 1.9611190137505927, "grad_norm": 0.37961080268463626, "learning_rate": 3.2257842852744846e-06, "loss": 0.0228, "step": 2068 }, { "epoch": 1.962067330488383, "grad_norm": 0.4444789473749266, "learning_rate": 3.220623406383061e-06, "loss": 0.0449, "step": 2069 }, { "epoch": 1.9630156472261735, "grad_norm": 0.6015703335784637, "learning_rate": 3.2154646972083427e-06, "loss": 0.0595, "step": 2070 }, { "epoch": 1.9639639639639639, "grad_norm": 0.4963402770212196, "learning_rate": 3.210308164040704e-06, "loss": 0.0436, "step": 2071 }, { "epoch": 1.9649122807017543, "grad_norm": 0.3403181961401529, "learning_rate": 3.2051538131678585e-06, "loss": 0.036, "step": 2072 }, { "epoch": 1.9658605974395447, "grad_norm": 0.5178795816060555, "learning_rate": 3.200001650874863e-06, "loss": 0.0287, "step": 2073 }, { "epoch": 1.9668089141773353, "grad_norm": 0.2657038221760043, "learning_rate": 3.194851683444107e-06, "loss": 0.023, "step": 2074 }, { "epoch": 1.9677572309151257, "grad_norm": 0.36414575441011054, "learning_rate": 3.189703917155299e-06, "loss": 0.0256, "step": 2075 }, { "epoch": 1.968705547652916, "grad_norm": 0.399014676654186, "learning_rate": 3.184558358285465e-06, "loss": 0.0341, "step": 2076 }, { "epoch": 1.9696538643907064, "grad_norm": 0.48424044481722756, "learning_rate": 3.1794150131089434e-06, "loss": 0.0322, "step": 2077 }, { "epoch": 1.970602181128497, "grad_norm": 0.36430947108225803, "learning_rate": 3.1742738878973654e-06, "loss": 0.0355, "step": 2078 }, { "epoch": 1.9715504978662874, "grad_norm": 0.44863405179866295, "learning_rate": 3.1691349889196653e-06, "loss": 0.0438, "step": 2079 }, { "epoch": 1.9724988146040778, "grad_norm": 0.7130335206259649, "learning_rate": 3.1639983224420535e-06, "loss": 0.0378, "step": 2080 }, { "epoch": 1.9734471313418682, "grad_norm": 0.4430950820041455, "learning_rate": 3.1588638947280215e-06, "loss": 0.0365, "step": 2081 }, { "epoch": 1.9743954480796586, "grad_norm": 0.32869156151115164, "learning_rate": 3.153731712038335e-06, "loss": 0.0211, "step": 2082 }, { "epoch": 1.975343764817449, "grad_norm": 0.4257317909820218, "learning_rate": 3.1486017806310145e-06, "loss": 0.0386, "step": 2083 }, { "epoch": 1.9762920815552394, "grad_norm": 1.440703206043199, "learning_rate": 3.143474106761343e-06, "loss": 0.028, "step": 2084 }, { "epoch": 1.9772403982930298, "grad_norm": 0.39647997595721773, "learning_rate": 3.138348696681846e-06, "loss": 0.031, "step": 2085 }, { "epoch": 1.9781887150308202, "grad_norm": 0.394821466886043, "learning_rate": 3.1332255566422876e-06, "loss": 0.0324, "step": 2086 }, { "epoch": 1.9791370317686106, "grad_norm": 0.3544381757947608, "learning_rate": 3.1281046928896703e-06, "loss": 0.0265, "step": 2087 }, { "epoch": 1.9800853485064012, "grad_norm": 0.4768128542585291, "learning_rate": 3.1229861116682126e-06, "loss": 0.0366, "step": 2088 }, { "epoch": 1.9810336652441916, "grad_norm": 0.7243948999698084, "learning_rate": 3.1178698192193563e-06, "loss": 0.0479, "step": 2089 }, { "epoch": 1.981981981981982, "grad_norm": 0.4524011909213368, "learning_rate": 3.11275582178175e-06, "loss": 0.0313, "step": 2090 }, { "epoch": 1.9829302987197726, "grad_norm": 0.5179468508049, "learning_rate": 3.1076441255912425e-06, "loss": 0.0332, "step": 2091 }, { "epoch": 1.983878615457563, "grad_norm": 0.6765627732053985, "learning_rate": 3.102534736880878e-06, "loss": 0.0425, "step": 2092 }, { "epoch": 1.9848269321953533, "grad_norm": 1.035454737611202, "learning_rate": 3.097427661880889e-06, "loss": 0.0288, "step": 2093 }, { "epoch": 1.9857752489331437, "grad_norm": 0.4638694830728162, "learning_rate": 3.092322906818681e-06, "loss": 0.0379, "step": 2094 }, { "epoch": 1.986723565670934, "grad_norm": 0.4542609073123639, "learning_rate": 3.087220477918839e-06, "loss": 0.0433, "step": 2095 }, { "epoch": 1.9876718824087245, "grad_norm": 0.3059593385789022, "learning_rate": 3.082120381403102e-06, "loss": 0.0263, "step": 2096 }, { "epoch": 1.9886201991465149, "grad_norm": 0.8343736689898676, "learning_rate": 3.077022623490371e-06, "loss": 0.0445, "step": 2097 }, { "epoch": 1.9895685158843053, "grad_norm": 0.3930889133833989, "learning_rate": 3.0719272103966964e-06, "loss": 0.0316, "step": 2098 }, { "epoch": 1.9905168326220957, "grad_norm": 0.6283499649646375, "learning_rate": 3.066834148335264e-06, "loss": 0.043, "step": 2099 }, { "epoch": 1.991465149359886, "grad_norm": 0.3334337011671591, "learning_rate": 3.0617434435163983e-06, "loss": 0.0221, "step": 2100 }, { "epoch": 1.9924134660976767, "grad_norm": 0.44480723283000395, "learning_rate": 3.0566551021475467e-06, "loss": 0.0247, "step": 2101 }, { "epoch": 1.993361782835467, "grad_norm": 0.42335568280839386, "learning_rate": 3.0515691304332728e-06, "loss": 0.0342, "step": 2102 }, { "epoch": 1.9943100995732574, "grad_norm": 0.39275472992791943, "learning_rate": 3.0464855345752576e-06, "loss": 0.0258, "step": 2103 }, { "epoch": 1.995258416311048, "grad_norm": 0.4252459704357132, "learning_rate": 3.041404320772275e-06, "loss": 0.0395, "step": 2104 }, { "epoch": 1.9962067330488384, "grad_norm": 0.44204282810063367, "learning_rate": 3.036325495220202e-06, "loss": 0.03, "step": 2105 }, { "epoch": 1.9971550497866288, "grad_norm": 0.5267895944658703, "learning_rate": 3.031249064112004e-06, "loss": 0.0336, "step": 2106 }, { "epoch": 1.9981033665244192, "grad_norm": 0.7817180674735154, "learning_rate": 3.0261750336377204e-06, "loss": 0.0433, "step": 2107 }, { "epoch": 1.9990516832622096, "grad_norm": 0.39050551405758693, "learning_rate": 3.0211034099844667e-06, "loss": 0.0286, "step": 2108 }, { "epoch": 2.0, "grad_norm": 0.411241525627785, "learning_rate": 3.0160341993364267e-06, "loss": 0.0272, "step": 2109 }, { "epoch": 2.0, "eval_loss": 0.03899507224559784, "eval_runtime": 200.9629, "eval_samples_per_second": 35.345, "eval_steps_per_second": 1.105, "step": 2109 }, { "epoch": 2.0009483167377904, "grad_norm": 0.42867573482523097, "learning_rate": 3.010967407874835e-06, "loss": 0.0305, "step": 2110 }, { "epoch": 2.0018966334755808, "grad_norm": 0.7670970988426332, "learning_rate": 3.0059030417779843e-06, "loss": 0.0431, "step": 2111 }, { "epoch": 2.002844950213371, "grad_norm": 0.2967349239255996, "learning_rate": 3.000841107221203e-06, "loss": 0.0174, "step": 2112 }, { "epoch": 2.0037932669511616, "grad_norm": 0.35929335388522343, "learning_rate": 2.995781610376856e-06, "loss": 0.0271, "step": 2113 }, { "epoch": 2.004741583688952, "grad_norm": 0.31468404701781494, "learning_rate": 2.990724557414342e-06, "loss": 0.0239, "step": 2114 }, { "epoch": 2.0056899004267423, "grad_norm": 0.4799589378363981, "learning_rate": 2.985669954500069e-06, "loss": 0.0253, "step": 2115 }, { "epoch": 2.006638217164533, "grad_norm": 0.3826498293765311, "learning_rate": 2.980617807797467e-06, "loss": 0.0258, "step": 2116 }, { "epoch": 2.0075865339023236, "grad_norm": 0.40695416013123065, "learning_rate": 2.9755681234669663e-06, "loss": 0.0317, "step": 2117 }, { "epoch": 2.008534850640114, "grad_norm": 0.7967953274247074, "learning_rate": 2.9705209076659957e-06, "loss": 0.0221, "step": 2118 }, { "epoch": 2.0094831673779043, "grad_norm": 0.4552358236810522, "learning_rate": 2.965476166548974e-06, "loss": 0.0358, "step": 2119 }, { "epoch": 2.0104314841156947, "grad_norm": 0.28631994213003603, "learning_rate": 2.9604339062673003e-06, "loss": 0.0175, "step": 2120 }, { "epoch": 2.011379800853485, "grad_norm": 0.32057860078911904, "learning_rate": 2.9553941329693526e-06, "loss": 0.0178, "step": 2121 }, { "epoch": 2.0123281175912755, "grad_norm": 0.3624106623635382, "learning_rate": 2.9503568528004757e-06, "loss": 0.0201, "step": 2122 }, { "epoch": 2.013276434329066, "grad_norm": 0.4636739823079543, "learning_rate": 2.9453220719029694e-06, "loss": 0.0387, "step": 2123 }, { "epoch": 2.0142247510668563, "grad_norm": 0.48994564694002685, "learning_rate": 2.940289796416094e-06, "loss": 0.0295, "step": 2124 }, { "epoch": 2.0151730678046467, "grad_norm": 0.4386644278170872, "learning_rate": 2.935260032476045e-06, "loss": 0.0268, "step": 2125 }, { "epoch": 2.016121384542437, "grad_norm": 0.3758526320713401, "learning_rate": 2.9302327862159634e-06, "loss": 0.0175, "step": 2126 }, { "epoch": 2.0170697012802274, "grad_norm": 0.6491230889815814, "learning_rate": 2.9252080637659203e-06, "loss": 0.0248, "step": 2127 }, { "epoch": 2.018018018018018, "grad_norm": 0.39148598510057125, "learning_rate": 2.920185871252903e-06, "loss": 0.0191, "step": 2128 }, { "epoch": 2.0189663347558082, "grad_norm": 0.38730619737964056, "learning_rate": 2.9151662148008177e-06, "loss": 0.0206, "step": 2129 }, { "epoch": 2.019914651493599, "grad_norm": 0.2739978249686544, "learning_rate": 2.9101491005304803e-06, "loss": 0.0208, "step": 2130 }, { "epoch": 2.0208629682313894, "grad_norm": 0.2492924713448619, "learning_rate": 2.905134534559602e-06, "loss": 0.0182, "step": 2131 }, { "epoch": 2.02181128496918, "grad_norm": 0.3670158300028282, "learning_rate": 2.9001225230027923e-06, "loss": 0.0258, "step": 2132 }, { "epoch": 2.0227596017069702, "grad_norm": 0.5687471196705789, "learning_rate": 2.8951130719715393e-06, "loss": 0.0283, "step": 2133 }, { "epoch": 2.0237079184447606, "grad_norm": 0.4512446574071027, "learning_rate": 2.890106187574213e-06, "loss": 0.0369, "step": 2134 }, { "epoch": 2.024656235182551, "grad_norm": 0.302480559027787, "learning_rate": 2.8851018759160575e-06, "loss": 0.0188, "step": 2135 }, { "epoch": 2.0256045519203414, "grad_norm": 0.37270010872170284, "learning_rate": 2.8801001430991714e-06, "loss": 0.0231, "step": 2136 }, { "epoch": 2.026552868658132, "grad_norm": 0.37835817327163923, "learning_rate": 2.875100995222512e-06, "loss": 0.024, "step": 2137 }, { "epoch": 2.027501185395922, "grad_norm": 0.33368312776065706, "learning_rate": 2.8701044383818887e-06, "loss": 0.0151, "step": 2138 }, { "epoch": 2.0284495021337126, "grad_norm": 0.3635689040002093, "learning_rate": 2.8651104786699446e-06, "loss": 0.0207, "step": 2139 }, { "epoch": 2.029397818871503, "grad_norm": 0.33610953009892625, "learning_rate": 2.860119122176164e-06, "loss": 0.0165, "step": 2140 }, { "epoch": 2.0303461356092933, "grad_norm": 0.4536480112083883, "learning_rate": 2.855130374986847e-06, "loss": 0.0301, "step": 2141 }, { "epoch": 2.0312944523470837, "grad_norm": 0.4862579959409953, "learning_rate": 2.85014424318512e-06, "loss": 0.025, "step": 2142 }, { "epoch": 2.0322427690848746, "grad_norm": 0.37646189979934114, "learning_rate": 2.8451607328509206e-06, "loss": 0.019, "step": 2143 }, { "epoch": 2.033191085822665, "grad_norm": 0.4658791421599927, "learning_rate": 2.8401798500609825e-06, "loss": 0.0242, "step": 2144 }, { "epoch": 2.0341394025604553, "grad_norm": 0.3898784281727516, "learning_rate": 2.8352016008888407e-06, "loss": 0.0257, "step": 2145 }, { "epoch": 2.0350877192982457, "grad_norm": 0.3643245441073441, "learning_rate": 2.830225991404821e-06, "loss": 0.0187, "step": 2146 }, { "epoch": 2.036036036036036, "grad_norm": 0.5491866993620294, "learning_rate": 2.825253027676026e-06, "loss": 0.0337, "step": 2147 }, { "epoch": 2.0369843527738265, "grad_norm": 0.35627319428709453, "learning_rate": 2.8202827157663336e-06, "loss": 0.0238, "step": 2148 }, { "epoch": 2.037932669511617, "grad_norm": 0.40610530442283266, "learning_rate": 2.815315061736385e-06, "loss": 0.0294, "step": 2149 }, { "epoch": 2.0388809862494073, "grad_norm": 0.7297441686133572, "learning_rate": 2.8103500716435867e-06, "loss": 0.0519, "step": 2150 }, { "epoch": 2.0398293029871977, "grad_norm": 0.4458875962553178, "learning_rate": 2.805387751542095e-06, "loss": 0.0315, "step": 2151 }, { "epoch": 2.040777619724988, "grad_norm": 0.41848406140264266, "learning_rate": 2.8004281074828066e-06, "loss": 0.0215, "step": 2152 }, { "epoch": 2.0417259364627784, "grad_norm": 0.5998930027439111, "learning_rate": 2.7954711455133587e-06, "loss": 0.0261, "step": 2153 }, { "epoch": 2.042674253200569, "grad_norm": 0.3643955841072442, "learning_rate": 2.7905168716781207e-06, "loss": 0.0189, "step": 2154 }, { "epoch": 2.0436225699383592, "grad_norm": 0.3851753580513905, "learning_rate": 2.7855652920181743e-06, "loss": 0.0263, "step": 2155 }, { "epoch": 2.0445708866761496, "grad_norm": 0.35889957914574705, "learning_rate": 2.7806164125713287e-06, "loss": 0.0249, "step": 2156 }, { "epoch": 2.0455192034139404, "grad_norm": 0.45717380785394435, "learning_rate": 2.775670239372088e-06, "loss": 0.0231, "step": 2157 }, { "epoch": 2.046467520151731, "grad_norm": 0.45034784091600044, "learning_rate": 2.770726778451669e-06, "loss": 0.0238, "step": 2158 }, { "epoch": 2.0474158368895212, "grad_norm": 0.40079641133062344, "learning_rate": 2.765786035837971e-06, "loss": 0.0222, "step": 2159 }, { "epoch": 2.0483641536273116, "grad_norm": 0.35961652919739295, "learning_rate": 2.760848017555582e-06, "loss": 0.0248, "step": 2160 }, { "epoch": 2.049312470365102, "grad_norm": 0.29591792109816933, "learning_rate": 2.7559127296257694e-06, "loss": 0.0151, "step": 2161 }, { "epoch": 2.0502607871028924, "grad_norm": 0.34278967001786625, "learning_rate": 2.7509801780664725e-06, "loss": 0.017, "step": 2162 }, { "epoch": 2.051209103840683, "grad_norm": 0.6265169417466546, "learning_rate": 2.746050368892288e-06, "loss": 0.037, "step": 2163 }, { "epoch": 2.052157420578473, "grad_norm": 0.4978111695622906, "learning_rate": 2.741123308114477e-06, "loss": 0.0269, "step": 2164 }, { "epoch": 2.0531057373162636, "grad_norm": 0.3482001539800016, "learning_rate": 2.7361990017409406e-06, "loss": 0.0193, "step": 2165 }, { "epoch": 2.054054054054054, "grad_norm": 0.5690545161609503, "learning_rate": 2.7312774557762274e-06, "loss": 0.0371, "step": 2166 }, { "epoch": 2.0550023707918443, "grad_norm": 0.40646429616659996, "learning_rate": 2.7263586762215197e-06, "loss": 0.0205, "step": 2167 }, { "epoch": 2.0559506875296347, "grad_norm": 0.33861620939002895, "learning_rate": 2.721442669074622e-06, "loss": 0.0178, "step": 2168 }, { "epoch": 2.056899004267425, "grad_norm": 0.4037345819140745, "learning_rate": 2.716529440329965e-06, "loss": 0.0289, "step": 2169 }, { "epoch": 2.057847321005216, "grad_norm": 0.482627797833161, "learning_rate": 2.711618995978584e-06, "loss": 0.0352, "step": 2170 }, { "epoch": 2.0587956377430063, "grad_norm": 0.5082804569426425, "learning_rate": 2.7067113420081227e-06, "loss": 0.0349, "step": 2171 }, { "epoch": 2.0597439544807967, "grad_norm": 0.38145913486228483, "learning_rate": 2.7018064844028247e-06, "loss": 0.0251, "step": 2172 }, { "epoch": 2.060692271218587, "grad_norm": 0.5841097247038757, "learning_rate": 2.6969044291435194e-06, "loss": 0.0331, "step": 2173 }, { "epoch": 2.0616405879563775, "grad_norm": 0.41296435259903375, "learning_rate": 2.6920051822076215e-06, "loss": 0.0244, "step": 2174 }, { "epoch": 2.062588904694168, "grad_norm": 0.4727942470216596, "learning_rate": 2.687108749569124e-06, "loss": 0.0251, "step": 2175 }, { "epoch": 2.0635372214319583, "grad_norm": 0.43281565858200033, "learning_rate": 2.6822151371985806e-06, "loss": 0.0362, "step": 2176 }, { "epoch": 2.0644855381697487, "grad_norm": 0.7073861164110138, "learning_rate": 2.6773243510631147e-06, "loss": 0.0283, "step": 2177 }, { "epoch": 2.065433854907539, "grad_norm": 0.3892763301096195, "learning_rate": 2.6724363971263958e-06, "loss": 0.0277, "step": 2178 }, { "epoch": 2.0663821716453294, "grad_norm": 0.6146711615193001, "learning_rate": 2.667551281348647e-06, "loss": 0.0278, "step": 2179 }, { "epoch": 2.06733048838312, "grad_norm": 0.6424481373001304, "learning_rate": 2.6626690096866236e-06, "loss": 0.0226, "step": 2180 }, { "epoch": 2.0682788051209102, "grad_norm": 0.5062917952323074, "learning_rate": 2.657789588093621e-06, "loss": 0.0277, "step": 2181 }, { "epoch": 2.0692271218587006, "grad_norm": 0.4031910900078797, "learning_rate": 2.6529130225194494e-06, "loss": 0.0236, "step": 2182 }, { "epoch": 2.0701754385964914, "grad_norm": 0.3696551665775792, "learning_rate": 2.648039318910447e-06, "loss": 0.0241, "step": 2183 }, { "epoch": 2.071123755334282, "grad_norm": 0.4994149517651512, "learning_rate": 2.643168483209452e-06, "loss": 0.0373, "step": 2184 }, { "epoch": 2.0720720720720722, "grad_norm": 0.3435127049249243, "learning_rate": 2.6383005213558156e-06, "loss": 0.0202, "step": 2185 }, { "epoch": 2.0730203888098626, "grad_norm": 0.3550421787961135, "learning_rate": 2.6334354392853755e-06, "loss": 0.0307, "step": 2186 }, { "epoch": 2.073968705547653, "grad_norm": 0.5367065919243836, "learning_rate": 2.6285732429304634e-06, "loss": 0.0325, "step": 2187 }, { "epoch": 2.0749170222854434, "grad_norm": 0.41551554257616574, "learning_rate": 2.623713938219894e-06, "loss": 0.0194, "step": 2188 }, { "epoch": 2.075865339023234, "grad_norm": 0.42065530424138786, "learning_rate": 2.6188575310789475e-06, "loss": 0.0268, "step": 2189 }, { "epoch": 2.076813655761024, "grad_norm": 0.6462822911603555, "learning_rate": 2.614004027429382e-06, "loss": 0.0385, "step": 2190 }, { "epoch": 2.0777619724988146, "grad_norm": 0.36529894157157955, "learning_rate": 2.609153433189406e-06, "loss": 0.0169, "step": 2191 }, { "epoch": 2.078710289236605, "grad_norm": 0.4317237150788793, "learning_rate": 2.6043057542736837e-06, "loss": 0.0268, "step": 2192 }, { "epoch": 2.0796586059743953, "grad_norm": 0.42998002758472326, "learning_rate": 2.599460996593327e-06, "loss": 0.0308, "step": 2193 }, { "epoch": 2.0806069227121857, "grad_norm": 0.3558311679324967, "learning_rate": 2.5946191660558804e-06, "loss": 0.0214, "step": 2194 }, { "epoch": 2.081555239449976, "grad_norm": 0.38571064268038, "learning_rate": 2.5897802685653235e-06, "loss": 0.0226, "step": 2195 }, { "epoch": 2.0825035561877665, "grad_norm": 0.38651538906208016, "learning_rate": 2.58494431002206e-06, "loss": 0.0212, "step": 2196 }, { "epoch": 2.0834518729255573, "grad_norm": 0.4181252743146899, "learning_rate": 2.580111296322904e-06, "loss": 0.0226, "step": 2197 }, { "epoch": 2.0844001896633477, "grad_norm": 0.5094856971487342, "learning_rate": 2.5752812333610856e-06, "loss": 0.0296, "step": 2198 }, { "epoch": 2.085348506401138, "grad_norm": 0.40859568824515613, "learning_rate": 2.5704541270262347e-06, "loss": 0.0241, "step": 2199 }, { "epoch": 2.0862968231389285, "grad_norm": 0.4517293982946693, "learning_rate": 2.5656299832043718e-06, "loss": 0.0251, "step": 2200 }, { "epoch": 2.087245139876719, "grad_norm": 0.4318042476611322, "learning_rate": 2.5608088077779125e-06, "loss": 0.0227, "step": 2201 }, { "epoch": 2.0881934566145093, "grad_norm": 0.5689691059283708, "learning_rate": 2.5559906066256467e-06, "loss": 0.0311, "step": 2202 }, { "epoch": 2.0891417733522997, "grad_norm": 0.31693729502341195, "learning_rate": 2.551175385622737e-06, "loss": 0.0163, "step": 2203 }, { "epoch": 2.09009009009009, "grad_norm": 0.28795871774991333, "learning_rate": 2.546363150640721e-06, "loss": 0.0139, "step": 2204 }, { "epoch": 2.0910384068278804, "grad_norm": 0.6960867511827779, "learning_rate": 2.541553907547483e-06, "loss": 0.0435, "step": 2205 }, { "epoch": 2.091986723565671, "grad_norm": 0.31057178211753483, "learning_rate": 2.5367476622072674e-06, "loss": 0.0184, "step": 2206 }, { "epoch": 2.0929350403034612, "grad_norm": 0.39246559073720605, "learning_rate": 2.5319444204806624e-06, "loss": 0.0239, "step": 2207 }, { "epoch": 2.0938833570412516, "grad_norm": 0.3606580233864144, "learning_rate": 2.5271441882245896e-06, "loss": 0.0211, "step": 2208 }, { "epoch": 2.094831673779042, "grad_norm": 0.5082639846967826, "learning_rate": 2.5223469712923066e-06, "loss": 0.0301, "step": 2209 }, { "epoch": 2.095779990516833, "grad_norm": 0.3855278523669711, "learning_rate": 2.5175527755333874e-06, "loss": 0.0197, "step": 2210 }, { "epoch": 2.0967283072546232, "grad_norm": 0.433480401949784, "learning_rate": 2.5127616067937276e-06, "loss": 0.0243, "step": 2211 }, { "epoch": 2.0976766239924136, "grad_norm": 0.32045799775532713, "learning_rate": 2.507973470915532e-06, "loss": 0.0172, "step": 2212 }, { "epoch": 2.098624940730204, "grad_norm": 0.5026090192485358, "learning_rate": 2.503188373737304e-06, "loss": 0.0313, "step": 2213 }, { "epoch": 2.0995732574679944, "grad_norm": 0.5640464541480521, "learning_rate": 2.49840632109384e-06, "loss": 0.0303, "step": 2214 }, { "epoch": 2.100521574205785, "grad_norm": 0.41866789277310873, "learning_rate": 2.493627318816231e-06, "loss": 0.0188, "step": 2215 }, { "epoch": 2.101469890943575, "grad_norm": 0.6027998069884065, "learning_rate": 2.4888513727318405e-06, "loss": 0.0241, "step": 2216 }, { "epoch": 2.1024182076813656, "grad_norm": 0.4611166482064652, "learning_rate": 2.484078488664313e-06, "loss": 0.0218, "step": 2217 }, { "epoch": 2.103366524419156, "grad_norm": 0.3931582590198883, "learning_rate": 2.479308672433552e-06, "loss": 0.0203, "step": 2218 }, { "epoch": 2.1043148411569463, "grad_norm": 0.37466971883437067, "learning_rate": 2.474541929855725e-06, "loss": 0.0174, "step": 2219 }, { "epoch": 2.1052631578947367, "grad_norm": 0.45633595162427804, "learning_rate": 2.469778266743253e-06, "loss": 0.0314, "step": 2220 }, { "epoch": 2.106211474632527, "grad_norm": 0.3903053469088167, "learning_rate": 2.465017688904795e-06, "loss": 0.0176, "step": 2221 }, { "epoch": 2.1071597913703175, "grad_norm": 0.8643363208938867, "learning_rate": 2.460260202145256e-06, "loss": 0.0431, "step": 2222 }, { "epoch": 2.108108108108108, "grad_norm": 0.3287931874943448, "learning_rate": 2.4555058122657637e-06, "loss": 0.0174, "step": 2223 }, { "epoch": 2.1090564248458987, "grad_norm": 0.36189985329961605, "learning_rate": 2.4507545250636787e-06, "loss": 0.0203, "step": 2224 }, { "epoch": 2.110004741583689, "grad_norm": 0.37032833947211646, "learning_rate": 2.4460063463325713e-06, "loss": 0.0257, "step": 2225 }, { "epoch": 2.1109530583214795, "grad_norm": 0.3687022423175376, "learning_rate": 2.441261281862221e-06, "loss": 0.0195, "step": 2226 }, { "epoch": 2.11190137505927, "grad_norm": 0.43305034123331726, "learning_rate": 2.436519337438615e-06, "loss": 0.0336, "step": 2227 }, { "epoch": 2.1128496917970603, "grad_norm": 0.39966610064341257, "learning_rate": 2.431780518843935e-06, "loss": 0.0243, "step": 2228 }, { "epoch": 2.1137980085348507, "grad_norm": 0.31938860162562066, "learning_rate": 2.4270448318565455e-06, "loss": 0.0178, "step": 2229 }, { "epoch": 2.114746325272641, "grad_norm": 0.3541847991387582, "learning_rate": 2.422312282250999e-06, "loss": 0.0204, "step": 2230 }, { "epoch": 2.1156946420104314, "grad_norm": 0.629139027756549, "learning_rate": 2.4175828757980225e-06, "loss": 0.0329, "step": 2231 }, { "epoch": 2.116642958748222, "grad_norm": 0.44488333984325906, "learning_rate": 2.4128566182645023e-06, "loss": 0.0218, "step": 2232 }, { "epoch": 2.1175912754860122, "grad_norm": 0.44422645256345245, "learning_rate": 2.4081335154134956e-06, "loss": 0.023, "step": 2233 }, { "epoch": 2.1185395922238026, "grad_norm": 0.3902960872474805, "learning_rate": 2.4034135730042052e-06, "loss": 0.0275, "step": 2234 }, { "epoch": 2.119487908961593, "grad_norm": 0.5253829874658488, "learning_rate": 2.3986967967919807e-06, "loss": 0.0317, "step": 2235 }, { "epoch": 2.1204362256993834, "grad_norm": 0.43919978311796937, "learning_rate": 2.3939831925283176e-06, "loss": 0.0255, "step": 2236 }, { "epoch": 2.1213845424371742, "grad_norm": 0.42626314322476566, "learning_rate": 2.389272765960834e-06, "loss": 0.0248, "step": 2237 }, { "epoch": 2.1223328591749646, "grad_norm": 0.43952254077722236, "learning_rate": 2.384565522833282e-06, "loss": 0.0272, "step": 2238 }, { "epoch": 2.123281175912755, "grad_norm": 0.4951260444039173, "learning_rate": 2.3798614688855254e-06, "loss": 0.03, "step": 2239 }, { "epoch": 2.1242294926505454, "grad_norm": 0.3756698841381475, "learning_rate": 2.3751606098535435e-06, "loss": 0.024, "step": 2240 }, { "epoch": 2.125177809388336, "grad_norm": 0.2861348407988811, "learning_rate": 2.370462951469419e-06, "loss": 0.0155, "step": 2241 }, { "epoch": 2.126126126126126, "grad_norm": 0.4876169224219029, "learning_rate": 2.3657684994613285e-06, "loss": 0.0275, "step": 2242 }, { "epoch": 2.1270744428639166, "grad_norm": 0.4319771129151901, "learning_rate": 2.3610772595535423e-06, "loss": 0.0193, "step": 2243 }, { "epoch": 2.128022759601707, "grad_norm": 0.40162005659288424, "learning_rate": 2.356389237466416e-06, "loss": 0.0184, "step": 2244 }, { "epoch": 2.1289710763394973, "grad_norm": 0.5033492435726082, "learning_rate": 2.351704438916376e-06, "loss": 0.0256, "step": 2245 }, { "epoch": 2.1299193930772877, "grad_norm": 0.39876138394841576, "learning_rate": 2.3470228696159196e-06, "loss": 0.0209, "step": 2246 }, { "epoch": 2.130867709815078, "grad_norm": 0.5019158990595434, "learning_rate": 2.342344535273608e-06, "loss": 0.0274, "step": 2247 }, { "epoch": 2.1318160265528685, "grad_norm": 0.35799980319498204, "learning_rate": 2.3376694415940565e-06, "loss": 0.0226, "step": 2248 }, { "epoch": 2.132764343290659, "grad_norm": 0.45079273257538477, "learning_rate": 2.332997594277933e-06, "loss": 0.0301, "step": 2249 }, { "epoch": 2.1337126600284497, "grad_norm": 0.32674403307718536, "learning_rate": 2.3283289990219395e-06, "loss": 0.0212, "step": 2250 }, { "epoch": 2.13466097676624, "grad_norm": 0.5296184962800978, "learning_rate": 2.3236636615188175e-06, "loss": 0.0317, "step": 2251 }, { "epoch": 2.1356092935040305, "grad_norm": 0.3754232671858479, "learning_rate": 2.3190015874573373e-06, "loss": 0.0229, "step": 2252 }, { "epoch": 2.136557610241821, "grad_norm": 0.3656558614584108, "learning_rate": 2.3143427825222847e-06, "loss": 0.0219, "step": 2253 }, { "epoch": 2.1375059269796113, "grad_norm": 0.5213276424505731, "learning_rate": 2.3096872523944654e-06, "loss": 0.0343, "step": 2254 }, { "epoch": 2.1384542437174017, "grad_norm": 0.34243403378994786, "learning_rate": 2.305035002750684e-06, "loss": 0.0179, "step": 2255 }, { "epoch": 2.139402560455192, "grad_norm": 0.3976411991318559, "learning_rate": 2.3003860392637545e-06, "loss": 0.0184, "step": 2256 }, { "epoch": 2.1403508771929824, "grad_norm": 0.4602750365294984, "learning_rate": 2.2957403676024766e-06, "loss": 0.0224, "step": 2257 }, { "epoch": 2.141299193930773, "grad_norm": 0.9716168085000075, "learning_rate": 2.2910979934316368e-06, "loss": 0.0309, "step": 2258 }, { "epoch": 2.1422475106685632, "grad_norm": 0.3944634875673138, "learning_rate": 2.286458922412004e-06, "loss": 0.0248, "step": 2259 }, { "epoch": 2.1431958274063536, "grad_norm": 0.5135149676803258, "learning_rate": 2.28182316020032e-06, "loss": 0.0252, "step": 2260 }, { "epoch": 2.144144144144144, "grad_norm": 0.5864406162589035, "learning_rate": 2.2771907124492855e-06, "loss": 0.031, "step": 2261 }, { "epoch": 2.1450924608819344, "grad_norm": 0.3595593256803766, "learning_rate": 2.272561584807567e-06, "loss": 0.0211, "step": 2262 }, { "epoch": 2.146040777619725, "grad_norm": 0.5644326980878621, "learning_rate": 2.2679357829197773e-06, "loss": 0.025, "step": 2263 }, { "epoch": 2.1469890943575156, "grad_norm": 0.40039567554067224, "learning_rate": 2.263313312426477e-06, "loss": 0.0251, "step": 2264 }, { "epoch": 2.147937411095306, "grad_norm": 0.3494192171240132, "learning_rate": 2.2586941789641644e-06, "loss": 0.0279, "step": 2265 }, { "epoch": 2.1488857278330964, "grad_norm": 0.543967374973067, "learning_rate": 2.2540783881652672e-06, "loss": 0.0466, "step": 2266 }, { "epoch": 2.149834044570887, "grad_norm": 0.3348355579550652, "learning_rate": 2.249465945658135e-06, "loss": 0.0157, "step": 2267 }, { "epoch": 2.150782361308677, "grad_norm": 0.5992344414940606, "learning_rate": 2.2448568570670422e-06, "loss": 0.0201, "step": 2268 }, { "epoch": 2.1517306780464676, "grad_norm": 0.27638109684371753, "learning_rate": 2.2402511280121635e-06, "loss": 0.0139, "step": 2269 }, { "epoch": 2.152678994784258, "grad_norm": 0.4864039946795088, "learning_rate": 2.235648764109587e-06, "loss": 0.0232, "step": 2270 }, { "epoch": 2.1536273115220483, "grad_norm": 0.4375793326410557, "learning_rate": 2.2310497709712883e-06, "loss": 0.0239, "step": 2271 }, { "epoch": 2.1545756282598387, "grad_norm": 0.4049296761357532, "learning_rate": 2.22645415420514e-06, "loss": 0.0198, "step": 2272 }, { "epoch": 2.155523944997629, "grad_norm": 0.3080411672189438, "learning_rate": 2.2218619194148957e-06, "loss": 0.0192, "step": 2273 }, { "epoch": 2.1564722617354195, "grad_norm": 0.4256264536412931, "learning_rate": 2.2172730722001806e-06, "loss": 0.0223, "step": 2274 }, { "epoch": 2.15742057847321, "grad_norm": 0.27136420910261355, "learning_rate": 2.2126876181564955e-06, "loss": 0.0109, "step": 2275 }, { "epoch": 2.1583688952110003, "grad_norm": 0.4115704050998557, "learning_rate": 2.208105562875203e-06, "loss": 0.0293, "step": 2276 }, { "epoch": 2.1593172119487907, "grad_norm": 0.3998489255300917, "learning_rate": 2.2035269119435164e-06, "loss": 0.0213, "step": 2277 }, { "epoch": 2.1602655286865815, "grad_norm": 0.44243900795322916, "learning_rate": 2.1989516709445013e-06, "loss": 0.0224, "step": 2278 }, { "epoch": 2.161213845424372, "grad_norm": 0.44256484344324315, "learning_rate": 2.194379845457064e-06, "loss": 0.0297, "step": 2279 }, { "epoch": 2.1621621621621623, "grad_norm": 0.3483421838923968, "learning_rate": 2.1898114410559475e-06, "loss": 0.0228, "step": 2280 }, { "epoch": 2.1631104788999527, "grad_norm": 0.49993876242502255, "learning_rate": 2.185246463311725e-06, "loss": 0.0332, "step": 2281 }, { "epoch": 2.164058795637743, "grad_norm": 0.42001008552894303, "learning_rate": 2.1806849177907844e-06, "loss": 0.0264, "step": 2282 }, { "epoch": 2.1650071123755334, "grad_norm": 0.41695424744520754, "learning_rate": 2.176126810055337e-06, "loss": 0.0303, "step": 2283 }, { "epoch": 2.165955429113324, "grad_norm": 0.3240800567288141, "learning_rate": 2.171572145663398e-06, "loss": 0.0183, "step": 2284 }, { "epoch": 2.1669037458511142, "grad_norm": 0.39669442654415515, "learning_rate": 2.167020930168781e-06, "loss": 0.0229, "step": 2285 }, { "epoch": 2.1678520625889046, "grad_norm": 0.39546239248455106, "learning_rate": 2.1624731691211016e-06, "loss": 0.0231, "step": 2286 }, { "epoch": 2.168800379326695, "grad_norm": 0.4087436587688708, "learning_rate": 2.157928868065754e-06, "loss": 0.0277, "step": 2287 }, { "epoch": 2.1697486960644854, "grad_norm": 0.4530786653379887, "learning_rate": 2.153388032543923e-06, "loss": 0.0194, "step": 2288 }, { "epoch": 2.170697012802276, "grad_norm": 0.43939543462713626, "learning_rate": 2.1488506680925596e-06, "loss": 0.0258, "step": 2289 }, { "epoch": 2.171645329540066, "grad_norm": 0.4522241596008493, "learning_rate": 2.144316780244385e-06, "loss": 0.0184, "step": 2290 }, { "epoch": 2.172593646277857, "grad_norm": 1.1516269930596483, "learning_rate": 2.1397863745278825e-06, "loss": 0.0224, "step": 2291 }, { "epoch": 2.1735419630156474, "grad_norm": 0.3736163797314226, "learning_rate": 2.1352594564672907e-06, "loss": 0.0252, "step": 2292 }, { "epoch": 2.174490279753438, "grad_norm": 0.390543627117664, "learning_rate": 2.1307360315825894e-06, "loss": 0.0189, "step": 2293 }, { "epoch": 2.175438596491228, "grad_norm": 0.41425912587622055, "learning_rate": 2.1262161053895068e-06, "loss": 0.0214, "step": 2294 }, { "epoch": 2.1763869132290186, "grad_norm": 0.4604953079593103, "learning_rate": 2.121699683399497e-06, "loss": 0.0215, "step": 2295 }, { "epoch": 2.177335229966809, "grad_norm": 0.4114732763858242, "learning_rate": 2.1171867711197474e-06, "loss": 0.0214, "step": 2296 }, { "epoch": 2.1782835467045993, "grad_norm": 0.4056111357292128, "learning_rate": 2.112677374053164e-06, "loss": 0.0233, "step": 2297 }, { "epoch": 2.1792318634423897, "grad_norm": 0.32625279399687396, "learning_rate": 2.108171497698364e-06, "loss": 0.017, "step": 2298 }, { "epoch": 2.18018018018018, "grad_norm": 0.39967265560405546, "learning_rate": 2.1036691475496764e-06, "loss": 0.0182, "step": 2299 }, { "epoch": 2.1811284969179705, "grad_norm": 0.3725151692726603, "learning_rate": 2.0991703290971255e-06, "loss": 0.0287, "step": 2300 }, { "epoch": 2.182076813655761, "grad_norm": 0.3970538993097859, "learning_rate": 2.0946750478264287e-06, "loss": 0.0255, "step": 2301 }, { "epoch": 2.1830251303935513, "grad_norm": 0.3311384010233447, "learning_rate": 2.090183309218998e-06, "loss": 0.0156, "step": 2302 }, { "epoch": 2.1839734471313417, "grad_norm": 0.7722545177624061, "learning_rate": 2.085695118751916e-06, "loss": 0.0276, "step": 2303 }, { "epoch": 2.1849217638691325, "grad_norm": 0.6058568443970096, "learning_rate": 2.0812104818979437e-06, "loss": 0.022, "step": 2304 }, { "epoch": 2.185870080606923, "grad_norm": 0.37429742353382056, "learning_rate": 2.076729404125513e-06, "loss": 0.019, "step": 2305 }, { "epoch": 2.1868183973447133, "grad_norm": 0.5299104777253576, "learning_rate": 2.0722518908987043e-06, "loss": 0.0257, "step": 2306 }, { "epoch": 2.1877667140825037, "grad_norm": 0.3915958586603079, "learning_rate": 2.067777947677265e-06, "loss": 0.024, "step": 2307 }, { "epoch": 2.188715030820294, "grad_norm": 0.397731251360786, "learning_rate": 2.063307579916578e-06, "loss": 0.0195, "step": 2308 }, { "epoch": 2.1896633475580844, "grad_norm": 0.507467765523117, "learning_rate": 2.058840793067675e-06, "loss": 0.0318, "step": 2309 }, { "epoch": 2.190611664295875, "grad_norm": 0.38413895508990836, "learning_rate": 2.054377592577214e-06, "loss": 0.0202, "step": 2310 }, { "epoch": 2.1915599810336652, "grad_norm": 0.45976607540693787, "learning_rate": 2.049917983887487e-06, "loss": 0.0229, "step": 2311 }, { "epoch": 2.1925082977714556, "grad_norm": 0.361658805053007, "learning_rate": 2.0454619724363993e-06, "loss": 0.0227, "step": 2312 }, { "epoch": 2.193456614509246, "grad_norm": 0.3619090798648459, "learning_rate": 2.041009563657477e-06, "loss": 0.0199, "step": 2313 }, { "epoch": 2.1944049312470364, "grad_norm": 0.3771128928199703, "learning_rate": 2.036560762979845e-06, "loss": 0.0189, "step": 2314 }, { "epoch": 2.195353247984827, "grad_norm": 0.3715021611413161, "learning_rate": 2.032115575828238e-06, "loss": 0.0259, "step": 2315 }, { "epoch": 2.196301564722617, "grad_norm": 0.43350725720240935, "learning_rate": 2.027674007622975e-06, "loss": 0.0232, "step": 2316 }, { "epoch": 2.197249881460408, "grad_norm": 0.35582869608226325, "learning_rate": 2.0232360637799687e-06, "loss": 0.0189, "step": 2317 }, { "epoch": 2.1981981981981984, "grad_norm": 0.46958694231370757, "learning_rate": 2.0188017497107114e-06, "loss": 0.0221, "step": 2318 }, { "epoch": 2.199146514935989, "grad_norm": 0.3646835051350837, "learning_rate": 2.014371070822266e-06, "loss": 0.0243, "step": 2319 }, { "epoch": 2.200094831673779, "grad_norm": 0.3901856309198319, "learning_rate": 2.009944032517269e-06, "loss": 0.0191, "step": 2320 }, { "epoch": 2.2010431484115696, "grad_norm": 0.47290921322501533, "learning_rate": 2.005520640193911e-06, "loss": 0.0245, "step": 2321 }, { "epoch": 2.20199146514936, "grad_norm": 0.38202716048927027, "learning_rate": 2.0011008992459375e-06, "loss": 0.015, "step": 2322 }, { "epoch": 2.2029397818871503, "grad_norm": 0.40684472495286483, "learning_rate": 1.9966848150626478e-06, "loss": 0.0187, "step": 2323 }, { "epoch": 2.2038880986249407, "grad_norm": 0.41394820108088604, "learning_rate": 1.992272393028875e-06, "loss": 0.021, "step": 2324 }, { "epoch": 2.204836415362731, "grad_norm": 0.35052141592281283, "learning_rate": 1.987863638524991e-06, "loss": 0.0184, "step": 2325 }, { "epoch": 2.2057847321005215, "grad_norm": 0.5224256513759739, "learning_rate": 1.9834585569268973e-06, "loss": 0.031, "step": 2326 }, { "epoch": 2.206733048838312, "grad_norm": 0.4306504185840522, "learning_rate": 1.979057153606009e-06, "loss": 0.0252, "step": 2327 }, { "epoch": 2.2076813655761023, "grad_norm": 0.34500352219686414, "learning_rate": 1.9746594339292634e-06, "loss": 0.0139, "step": 2328 }, { "epoch": 2.2086296823138927, "grad_norm": 0.47428095884091725, "learning_rate": 1.9702654032591057e-06, "loss": 0.0299, "step": 2329 }, { "epoch": 2.209577999051683, "grad_norm": 0.31715247322701795, "learning_rate": 1.965875066953477e-06, "loss": 0.0187, "step": 2330 }, { "epoch": 2.2105263157894735, "grad_norm": 0.35097104725381645, "learning_rate": 1.961488430365821e-06, "loss": 0.018, "step": 2331 }, { "epoch": 2.2114746325272643, "grad_norm": 0.43904077797503993, "learning_rate": 1.957105498845065e-06, "loss": 0.0273, "step": 2332 }, { "epoch": 2.2124229492650547, "grad_norm": 0.37433258153809074, "learning_rate": 1.9527262777356175e-06, "loss": 0.0189, "step": 2333 }, { "epoch": 2.213371266002845, "grad_norm": 0.452382015943325, "learning_rate": 1.9483507723773693e-06, "loss": 0.02, "step": 2334 }, { "epoch": 2.2143195827406355, "grad_norm": 0.3844273691974591, "learning_rate": 1.9439789881056724e-06, "loss": 0.0296, "step": 2335 }, { "epoch": 2.215267899478426, "grad_norm": 0.3251514659777261, "learning_rate": 1.9396109302513465e-06, "loss": 0.0241, "step": 2336 }, { "epoch": 2.2162162162162162, "grad_norm": 0.3675273639039549, "learning_rate": 1.9352466041406687e-06, "loss": 0.0134, "step": 2337 }, { "epoch": 2.2171645329540066, "grad_norm": 0.3983214976925876, "learning_rate": 1.9308860150953583e-06, "loss": 0.0232, "step": 2338 }, { "epoch": 2.218112849691797, "grad_norm": 0.7144984519278534, "learning_rate": 1.926529168432587e-06, "loss": 0.036, "step": 2339 }, { "epoch": 2.2190611664295874, "grad_norm": 0.48637705204645265, "learning_rate": 1.9221760694649553e-06, "loss": 0.0339, "step": 2340 }, { "epoch": 2.220009483167378, "grad_norm": 0.7318106591805768, "learning_rate": 1.9178267235004984e-06, "loss": 0.0206, "step": 2341 }, { "epoch": 2.220957799905168, "grad_norm": 0.4343031634314887, "learning_rate": 1.9134811358426756e-06, "loss": 0.0293, "step": 2342 }, { "epoch": 2.2219061166429586, "grad_norm": 0.5411094163596777, "learning_rate": 1.909139311790362e-06, "loss": 0.0513, "step": 2343 }, { "epoch": 2.222854433380749, "grad_norm": 0.4262502588360842, "learning_rate": 1.9048012566378387e-06, "loss": 0.0273, "step": 2344 }, { "epoch": 2.22380275011854, "grad_norm": 0.38390892408447685, "learning_rate": 1.9004669756748017e-06, "loss": 0.0237, "step": 2345 }, { "epoch": 2.22475106685633, "grad_norm": 0.3107308166605013, "learning_rate": 1.8961364741863342e-06, "loss": 0.0184, "step": 2346 }, { "epoch": 2.2256993835941206, "grad_norm": 0.32935486433027755, "learning_rate": 1.8918097574529193e-06, "loss": 0.0168, "step": 2347 }, { "epoch": 2.226647700331911, "grad_norm": 0.3322923262299374, "learning_rate": 1.8874868307504185e-06, "loss": 0.0179, "step": 2348 }, { "epoch": 2.2275960170697013, "grad_norm": 0.36089333207950935, "learning_rate": 1.8831676993500758e-06, "loss": 0.0224, "step": 2349 }, { "epoch": 2.2285443338074917, "grad_norm": 0.41395718666883813, "learning_rate": 1.8788523685185084e-06, "loss": 0.021, "step": 2350 }, { "epoch": 2.229492650545282, "grad_norm": 0.4439323647263712, "learning_rate": 1.8745408435176932e-06, "loss": 0.0268, "step": 2351 }, { "epoch": 2.2304409672830725, "grad_norm": 0.427895772910043, "learning_rate": 1.8702331296049741e-06, "loss": 0.021, "step": 2352 }, { "epoch": 2.231389284020863, "grad_norm": 0.37486703243376646, "learning_rate": 1.8659292320330408e-06, "loss": 0.0177, "step": 2353 }, { "epoch": 2.2323376007586533, "grad_norm": 0.3841975396897219, "learning_rate": 1.861629156049935e-06, "loss": 0.0237, "step": 2354 }, { "epoch": 2.2332859174964437, "grad_norm": 0.3907334477612354, "learning_rate": 1.8573329068990358e-06, "loss": 0.0209, "step": 2355 }, { "epoch": 2.234234234234234, "grad_norm": 0.43207097544527573, "learning_rate": 1.8530404898190534e-06, "loss": 0.015, "step": 2356 }, { "epoch": 2.2351825509720245, "grad_norm": 0.40641958306313414, "learning_rate": 1.8487519100440316e-06, "loss": 0.0212, "step": 2357 }, { "epoch": 2.2361308677098153, "grad_norm": 0.45601425060223455, "learning_rate": 1.8444671728033314e-06, "loss": 0.025, "step": 2358 }, { "epoch": 2.2370791844476057, "grad_norm": 0.40452585916751155, "learning_rate": 1.8401862833216272e-06, "loss": 0.0221, "step": 2359 }, { "epoch": 2.238027501185396, "grad_norm": 0.32206562407761236, "learning_rate": 1.8359092468189048e-06, "loss": 0.0166, "step": 2360 }, { "epoch": 2.2389758179231865, "grad_norm": 0.4665259635354582, "learning_rate": 1.8316360685104478e-06, "loss": 0.0276, "step": 2361 }, { "epoch": 2.239924134660977, "grad_norm": 0.6336444720182508, "learning_rate": 1.827366753606839e-06, "loss": 0.0308, "step": 2362 }, { "epoch": 2.2408724513987672, "grad_norm": 0.4423139744797903, "learning_rate": 1.8231013073139504e-06, "loss": 0.0279, "step": 2363 }, { "epoch": 2.2418207681365576, "grad_norm": 0.4376868624329782, "learning_rate": 1.8188397348329328e-06, "loss": 0.0203, "step": 2364 }, { "epoch": 2.242769084874348, "grad_norm": 0.36625273715371953, "learning_rate": 1.814582041360215e-06, "loss": 0.018, "step": 2365 }, { "epoch": 2.2437174016121384, "grad_norm": 0.5777502114143614, "learning_rate": 1.8103282320874987e-06, "loss": 0.0245, "step": 2366 }, { "epoch": 2.244665718349929, "grad_norm": 0.45081701594898027, "learning_rate": 1.806078312201745e-06, "loss": 0.0302, "step": 2367 }, { "epoch": 2.245614035087719, "grad_norm": 0.4140190228187624, "learning_rate": 1.8018322868851779e-06, "loss": 0.0184, "step": 2368 }, { "epoch": 2.2465623518255096, "grad_norm": 0.457871291883172, "learning_rate": 1.7975901613152645e-06, "loss": 0.0181, "step": 2369 }, { "epoch": 2.2475106685633, "grad_norm": 0.36063582182993614, "learning_rate": 1.7933519406647243e-06, "loss": 0.0188, "step": 2370 }, { "epoch": 2.248458985301091, "grad_norm": 0.4098690379894618, "learning_rate": 1.7891176301015135e-06, "loss": 0.0301, "step": 2371 }, { "epoch": 2.249407302038881, "grad_norm": 0.39708742138503444, "learning_rate": 1.7848872347888163e-06, "loss": 0.0247, "step": 2372 }, { "epoch": 2.2503556187766716, "grad_norm": 0.41417175900835923, "learning_rate": 1.7806607598850467e-06, "loss": 0.018, "step": 2373 }, { "epoch": 2.251303935514462, "grad_norm": 0.4754670959345918, "learning_rate": 1.7764382105438394e-06, "loss": 0.0253, "step": 2374 }, { "epoch": 2.2522522522522523, "grad_norm": 0.8568347200994995, "learning_rate": 1.7722195919140383e-06, "loss": 0.0263, "step": 2375 }, { "epoch": 2.2532005689900427, "grad_norm": 0.38552968326653064, "learning_rate": 1.7680049091396967e-06, "loss": 0.022, "step": 2376 }, { "epoch": 2.254148885727833, "grad_norm": 0.5324795698459304, "learning_rate": 1.7637941673600668e-06, "loss": 0.0248, "step": 2377 }, { "epoch": 2.2550972024656235, "grad_norm": 0.5170883292235453, "learning_rate": 1.7595873717095973e-06, "loss": 0.0277, "step": 2378 }, { "epoch": 2.256045519203414, "grad_norm": 0.5100513984737847, "learning_rate": 1.7553845273179277e-06, "loss": 0.0274, "step": 2379 }, { "epoch": 2.2569938359412043, "grad_norm": 0.3704538407484803, "learning_rate": 1.7511856393098715e-06, "loss": 0.0213, "step": 2380 }, { "epoch": 2.2579421526789947, "grad_norm": 0.48491216247522145, "learning_rate": 1.7469907128054247e-06, "loss": 0.026, "step": 2381 }, { "epoch": 2.258890469416785, "grad_norm": 0.649876149364862, "learning_rate": 1.7427997529197533e-06, "loss": 0.0173, "step": 2382 }, { "epoch": 2.2598387861545755, "grad_norm": 0.48045437554087517, "learning_rate": 1.7386127647631802e-06, "loss": 0.0402, "step": 2383 }, { "epoch": 2.2607871028923663, "grad_norm": 0.38709200279463996, "learning_rate": 1.7344297534411918e-06, "loss": 0.0228, "step": 2384 }, { "epoch": 2.2617354196301562, "grad_norm": 0.39911202838197224, "learning_rate": 1.7302507240544197e-06, "loss": 0.0263, "step": 2385 }, { "epoch": 2.262683736367947, "grad_norm": 0.402703895487539, "learning_rate": 1.7260756816986468e-06, "loss": 0.0199, "step": 2386 }, { "epoch": 2.2636320531057375, "grad_norm": 0.4327109923756727, "learning_rate": 1.7219046314647875e-06, "loss": 0.0293, "step": 2387 }, { "epoch": 2.264580369843528, "grad_norm": 0.46823736554030093, "learning_rate": 1.7177375784388906e-06, "loss": 0.0202, "step": 2388 }, { "epoch": 2.2655286865813182, "grad_norm": 0.37427697435461244, "learning_rate": 1.7135745277021332e-06, "loss": 0.0188, "step": 2389 }, { "epoch": 2.2664770033191086, "grad_norm": 0.3760762544416307, "learning_rate": 1.7094154843308114e-06, "loss": 0.018, "step": 2390 }, { "epoch": 2.267425320056899, "grad_norm": 0.3185328153435644, "learning_rate": 1.7052604533963308e-06, "loss": 0.0168, "step": 2391 }, { "epoch": 2.2683736367946894, "grad_norm": 0.39366485747456337, "learning_rate": 1.7011094399652107e-06, "loss": 0.0179, "step": 2392 }, { "epoch": 2.26932195353248, "grad_norm": 0.4374421147215151, "learning_rate": 1.6969624490990654e-06, "loss": 0.0195, "step": 2393 }, { "epoch": 2.27027027027027, "grad_norm": 0.40108389408149603, "learning_rate": 1.6928194858546088e-06, "loss": 0.0175, "step": 2394 }, { "epoch": 2.2712185870080606, "grad_norm": 0.30247705105026723, "learning_rate": 1.6886805552836433e-06, "loss": 0.0159, "step": 2395 }, { "epoch": 2.272166903745851, "grad_norm": 0.4862095152615184, "learning_rate": 1.6845456624330492e-06, "loss": 0.0227, "step": 2396 }, { "epoch": 2.273115220483642, "grad_norm": 0.3745563544160728, "learning_rate": 1.68041481234479e-06, "loss": 0.018, "step": 2397 }, { "epoch": 2.2740635372214317, "grad_norm": 0.36704972477579006, "learning_rate": 1.6762880100558954e-06, "loss": 0.0236, "step": 2398 }, { "epoch": 2.2750118539592226, "grad_norm": 0.4747742150229455, "learning_rate": 1.6721652605984585e-06, "loss": 0.0219, "step": 2399 }, { "epoch": 2.275960170697013, "grad_norm": 0.3953571044047917, "learning_rate": 1.6680465689996345e-06, "loss": 0.0198, "step": 2400 }, { "epoch": 2.2769084874348033, "grad_norm": 0.43525927498232264, "learning_rate": 1.6639319402816263e-06, "loss": 0.0312, "step": 2401 }, { "epoch": 2.2778568041725937, "grad_norm": 0.3436816679628124, "learning_rate": 1.6598213794616862e-06, "loss": 0.0147, "step": 2402 }, { "epoch": 2.278805120910384, "grad_norm": 0.44877364797530483, "learning_rate": 1.655714891552106e-06, "loss": 0.0201, "step": 2403 }, { "epoch": 2.2797534376481745, "grad_norm": 0.3834821514632032, "learning_rate": 1.651612481560207e-06, "loss": 0.0209, "step": 2404 }, { "epoch": 2.280701754385965, "grad_norm": 0.472080313698594, "learning_rate": 1.6475141544883416e-06, "loss": 0.0251, "step": 2405 }, { "epoch": 2.2816500711237553, "grad_norm": 0.4625744377816762, "learning_rate": 1.643419915333886e-06, "loss": 0.0344, "step": 2406 }, { "epoch": 2.2825983878615457, "grad_norm": 0.3367193925166736, "learning_rate": 1.6393297690892256e-06, "loss": 0.0166, "step": 2407 }, { "epoch": 2.283546704599336, "grad_norm": 0.40850259795694177, "learning_rate": 1.6352437207417571e-06, "loss": 0.0196, "step": 2408 }, { "epoch": 2.2844950213371265, "grad_norm": 0.38565301084020687, "learning_rate": 1.631161775273885e-06, "loss": 0.0162, "step": 2409 }, { "epoch": 2.285443338074917, "grad_norm": 0.3851049737678369, "learning_rate": 1.6270839376630033e-06, "loss": 0.0194, "step": 2410 }, { "epoch": 2.2863916548127072, "grad_norm": 0.5577488269441532, "learning_rate": 1.6230102128815033e-06, "loss": 0.0367, "step": 2411 }, { "epoch": 2.287339971550498, "grad_norm": 0.5016112831857718, "learning_rate": 1.6189406058967577e-06, "loss": 0.0189, "step": 2412 }, { "epoch": 2.2882882882882885, "grad_norm": 0.5585880833977839, "learning_rate": 1.6148751216711206e-06, "loss": 0.026, "step": 2413 }, { "epoch": 2.289236605026079, "grad_norm": 0.4195620847627191, "learning_rate": 1.6108137651619154e-06, "loss": 0.0267, "step": 2414 }, { "epoch": 2.2901849217638692, "grad_norm": 0.5490900492941025, "learning_rate": 1.6067565413214353e-06, "loss": 0.0221, "step": 2415 }, { "epoch": 2.2911332385016596, "grad_norm": 0.5073374779777672, "learning_rate": 1.6027034550969356e-06, "loss": 0.0232, "step": 2416 }, { "epoch": 2.29208155523945, "grad_norm": 0.3707053895150809, "learning_rate": 1.5986545114306202e-06, "loss": 0.0216, "step": 2417 }, { "epoch": 2.2930298719772404, "grad_norm": 0.4036642305925845, "learning_rate": 1.5946097152596496e-06, "loss": 0.0198, "step": 2418 }, { "epoch": 2.293978188715031, "grad_norm": 0.2837666309597713, "learning_rate": 1.5905690715161209e-06, "loss": 0.0164, "step": 2419 }, { "epoch": 2.294926505452821, "grad_norm": 0.5566663426849048, "learning_rate": 1.586532585127069e-06, "loss": 0.0257, "step": 2420 }, { "epoch": 2.2958748221906116, "grad_norm": 0.6345053044837282, "learning_rate": 1.5825002610144623e-06, "loss": 0.0257, "step": 2421 }, { "epoch": 2.296823138928402, "grad_norm": 0.35480991236789217, "learning_rate": 1.57847210409519e-06, "loss": 0.0182, "step": 2422 }, { "epoch": 2.2977714556661923, "grad_norm": 0.5404075847950341, "learning_rate": 1.574448119281063e-06, "loss": 0.0213, "step": 2423 }, { "epoch": 2.2987197724039827, "grad_norm": 0.3390560549791458, "learning_rate": 1.570428311478805e-06, "loss": 0.0162, "step": 2424 }, { "epoch": 2.2996680891417736, "grad_norm": 0.7426039658923921, "learning_rate": 1.5664126855900424e-06, "loss": 0.0238, "step": 2425 }, { "epoch": 2.300616405879564, "grad_norm": 0.4266296618803969, "learning_rate": 1.5624012465113065e-06, "loss": 0.02, "step": 2426 }, { "epoch": 2.3015647226173543, "grad_norm": 0.3065546446187386, "learning_rate": 1.558393999134023e-06, "loss": 0.0156, "step": 2427 }, { "epoch": 2.3025130393551447, "grad_norm": 0.4512682712027585, "learning_rate": 1.554390948344503e-06, "loss": 0.0304, "step": 2428 }, { "epoch": 2.303461356092935, "grad_norm": 0.3422412628357766, "learning_rate": 1.5503920990239452e-06, "loss": 0.0226, "step": 2429 }, { "epoch": 2.3044096728307255, "grad_norm": 0.31396493988976687, "learning_rate": 1.5463974560484213e-06, "loss": 0.0205, "step": 2430 }, { "epoch": 2.305357989568516, "grad_norm": 0.35208787261922914, "learning_rate": 1.5424070242888733e-06, "loss": 0.0208, "step": 2431 }, { "epoch": 2.3063063063063063, "grad_norm": 0.6959424315073545, "learning_rate": 1.538420808611114e-06, "loss": 0.0144, "step": 2432 }, { "epoch": 2.3072546230440967, "grad_norm": 0.35449272063606063, "learning_rate": 1.534438813875807e-06, "loss": 0.0242, "step": 2433 }, { "epoch": 2.308202939781887, "grad_norm": 0.3970212613465219, "learning_rate": 1.530461044938476e-06, "loss": 0.0282, "step": 2434 }, { "epoch": 2.3091512565196775, "grad_norm": 0.3486460962177107, "learning_rate": 1.52648750664949e-06, "loss": 0.0168, "step": 2435 }, { "epoch": 2.310099573257468, "grad_norm": 0.33438665576098, "learning_rate": 1.522518203854056e-06, "loss": 0.0167, "step": 2436 }, { "epoch": 2.3110478899952582, "grad_norm": 0.4974236473530219, "learning_rate": 1.5185531413922217e-06, "loss": 0.0238, "step": 2437 }, { "epoch": 2.311996206733049, "grad_norm": 0.4821267448581767, "learning_rate": 1.5145923240988587e-06, "loss": 0.0212, "step": 2438 }, { "epoch": 2.3129445234708395, "grad_norm": 1.245911380396821, "learning_rate": 1.5106357568036662e-06, "loss": 0.0258, "step": 2439 }, { "epoch": 2.31389284020863, "grad_norm": 0.4281277216603392, "learning_rate": 1.5066834443311613e-06, "loss": 0.0278, "step": 2440 }, { "epoch": 2.3148411569464202, "grad_norm": 0.43810859496573806, "learning_rate": 1.50273539150067e-06, "loss": 0.0229, "step": 2441 }, { "epoch": 2.3157894736842106, "grad_norm": 0.33235104756376627, "learning_rate": 1.4987916031263234e-06, "loss": 0.0136, "step": 2442 }, { "epoch": 2.316737790422001, "grad_norm": 0.3297892350058022, "learning_rate": 1.4948520840170594e-06, "loss": 0.0261, "step": 2443 }, { "epoch": 2.3176861071597914, "grad_norm": 0.42008924032009215, "learning_rate": 1.4909168389766015e-06, "loss": 0.0312, "step": 2444 }, { "epoch": 2.318634423897582, "grad_norm": 0.3389579383262707, "learning_rate": 1.486985872803469e-06, "loss": 0.0176, "step": 2445 }, { "epoch": 2.319582740635372, "grad_norm": 0.5487799107230558, "learning_rate": 1.483059190290957e-06, "loss": 0.026, "step": 2446 }, { "epoch": 2.3205310573731626, "grad_norm": 0.4890771192858219, "learning_rate": 1.4791367962271425e-06, "loss": 0.0266, "step": 2447 }, { "epoch": 2.321479374110953, "grad_norm": 0.4438394146588021, "learning_rate": 1.4752186953948728e-06, "loss": 0.0196, "step": 2448 }, { "epoch": 2.3224276908487433, "grad_norm": 0.3662367949751032, "learning_rate": 1.4713048925717548e-06, "loss": 0.0175, "step": 2449 }, { "epoch": 2.3233760075865337, "grad_norm": 0.43993413507805795, "learning_rate": 1.4673953925301625e-06, "loss": 0.0219, "step": 2450 }, { "epoch": 2.3243243243243246, "grad_norm": 0.32060744843760514, "learning_rate": 1.463490200037216e-06, "loss": 0.0183, "step": 2451 }, { "epoch": 2.3252726410621145, "grad_norm": 0.3084422018115681, "learning_rate": 1.4595893198547889e-06, "loss": 0.0165, "step": 2452 }, { "epoch": 2.3262209577999053, "grad_norm": 0.512124514376596, "learning_rate": 1.4556927567394918e-06, "loss": 0.0256, "step": 2453 }, { "epoch": 2.3271692745376957, "grad_norm": 0.37175509555253117, "learning_rate": 1.4518005154426722e-06, "loss": 0.019, "step": 2454 }, { "epoch": 2.328117591275486, "grad_norm": 0.3864056129285522, "learning_rate": 1.44791260071041e-06, "loss": 0.0138, "step": 2455 }, { "epoch": 2.3290659080132765, "grad_norm": 1.0059816314469427, "learning_rate": 1.4440290172835087e-06, "loss": 0.0268, "step": 2456 }, { "epoch": 2.330014224751067, "grad_norm": 0.41813610021235836, "learning_rate": 1.4401497698974875e-06, "loss": 0.028, "step": 2457 }, { "epoch": 2.3309625414888573, "grad_norm": 0.5405977205603913, "learning_rate": 1.4362748632825824e-06, "loss": 0.0432, "step": 2458 }, { "epoch": 2.3319108582266477, "grad_norm": 0.4101040764363063, "learning_rate": 1.4324043021637346e-06, "loss": 0.0215, "step": 2459 }, { "epoch": 2.332859174964438, "grad_norm": 0.4178607760421023, "learning_rate": 1.4285380912605846e-06, "loss": 0.0238, "step": 2460 }, { "epoch": 2.3338074917022285, "grad_norm": 0.34664363111136054, "learning_rate": 1.4246762352874732e-06, "loss": 0.014, "step": 2461 }, { "epoch": 2.334755808440019, "grad_norm": 0.35675426926357756, "learning_rate": 1.4208187389534256e-06, "loss": 0.0236, "step": 2462 }, { "epoch": 2.3357041251778092, "grad_norm": 0.5405249144315799, "learning_rate": 1.4169656069621529e-06, "loss": 0.0204, "step": 2463 }, { "epoch": 2.3366524419155996, "grad_norm": 0.4288922042641505, "learning_rate": 1.4131168440120473e-06, "loss": 0.0252, "step": 2464 }, { "epoch": 2.33760075865339, "grad_norm": 0.3140019976226754, "learning_rate": 1.4092724547961678e-06, "loss": 0.0131, "step": 2465 }, { "epoch": 2.338549075391181, "grad_norm": 0.29742347585209866, "learning_rate": 1.4054324440022477e-06, "loss": 0.0157, "step": 2466 }, { "epoch": 2.3394973921289712, "grad_norm": 0.4063964441515196, "learning_rate": 1.401596816312673e-06, "loss": 0.028, "step": 2467 }, { "epoch": 2.3404457088667616, "grad_norm": 0.3493753945743093, "learning_rate": 1.3977655764044917e-06, "loss": 0.0183, "step": 2468 }, { "epoch": 2.341394025604552, "grad_norm": 0.3833665376816688, "learning_rate": 1.3939387289494e-06, "loss": 0.0197, "step": 2469 }, { "epoch": 2.3423423423423424, "grad_norm": 0.4778883575723333, "learning_rate": 1.3901162786137345e-06, "loss": 0.0215, "step": 2470 }, { "epoch": 2.343290659080133, "grad_norm": 0.4491332932865572, "learning_rate": 1.3862982300584738e-06, "loss": 0.0298, "step": 2471 }, { "epoch": 2.344238975817923, "grad_norm": 0.4417496299388204, "learning_rate": 1.3824845879392301e-06, "loss": 0.019, "step": 2472 }, { "epoch": 2.3451872925557136, "grad_norm": 0.6247482118776613, "learning_rate": 1.3786753569062389e-06, "loss": 0.0177, "step": 2473 }, { "epoch": 2.346135609293504, "grad_norm": 0.33182737994484557, "learning_rate": 1.3748705416043584e-06, "loss": 0.0196, "step": 2474 }, { "epoch": 2.3470839260312943, "grad_norm": 0.33528750198268, "learning_rate": 1.3710701466730613e-06, "loss": 0.0208, "step": 2475 }, { "epoch": 2.3480322427690847, "grad_norm": 0.3574549420742016, "learning_rate": 1.3672741767464327e-06, "loss": 0.0194, "step": 2476 }, { "epoch": 2.348980559506875, "grad_norm": 0.27124601377608953, "learning_rate": 1.3634826364531616e-06, "loss": 0.0104, "step": 2477 }, { "epoch": 2.3499288762446655, "grad_norm": 0.30245014567972933, "learning_rate": 1.3596955304165333e-06, "loss": 0.0172, "step": 2478 }, { "epoch": 2.3508771929824563, "grad_norm": 0.3867837433390609, "learning_rate": 1.3559128632544277e-06, "loss": 0.0352, "step": 2479 }, { "epoch": 2.3518255097202467, "grad_norm": 0.3628249738004892, "learning_rate": 1.3521346395793145e-06, "loss": 0.0194, "step": 2480 }, { "epoch": 2.352773826458037, "grad_norm": 0.4847287370973444, "learning_rate": 1.3483608639982388e-06, "loss": 0.0304, "step": 2481 }, { "epoch": 2.3537221431958275, "grad_norm": 0.386773109078386, "learning_rate": 1.3445915411128296e-06, "loss": 0.0244, "step": 2482 }, { "epoch": 2.354670459933618, "grad_norm": 0.40717680421180374, "learning_rate": 1.3408266755192785e-06, "loss": 0.0152, "step": 2483 }, { "epoch": 2.3556187766714083, "grad_norm": 0.45152313917690334, "learning_rate": 1.3370662718083498e-06, "loss": 0.0278, "step": 2484 }, { "epoch": 2.3565670934091987, "grad_norm": 0.381547289327534, "learning_rate": 1.3333103345653615e-06, "loss": 0.0214, "step": 2485 }, { "epoch": 2.357515410146989, "grad_norm": 0.394442820160816, "learning_rate": 1.3295588683701854e-06, "loss": 0.022, "step": 2486 }, { "epoch": 2.3584637268847795, "grad_norm": 0.3030069689069135, "learning_rate": 1.325811877797245e-06, "loss": 0.0174, "step": 2487 }, { "epoch": 2.35941204362257, "grad_norm": 0.4007388320851803, "learning_rate": 1.3220693674155054e-06, "loss": 0.0284, "step": 2488 }, { "epoch": 2.3603603603603602, "grad_norm": 0.299749300305697, "learning_rate": 1.3183313417884664e-06, "loss": 0.0144, "step": 2489 }, { "epoch": 2.3613086770981506, "grad_norm": 0.36912145130254453, "learning_rate": 1.3145978054741621e-06, "loss": 0.0243, "step": 2490 }, { "epoch": 2.362256993835941, "grad_norm": 0.3261697534340822, "learning_rate": 1.3108687630251487e-06, "loss": 0.0164, "step": 2491 }, { "epoch": 2.363205310573732, "grad_norm": 0.4265509212089778, "learning_rate": 1.307144218988507e-06, "loss": 0.0359, "step": 2492 }, { "epoch": 2.3641536273115222, "grad_norm": 0.3568021639376206, "learning_rate": 1.303424177905831e-06, "loss": 0.0258, "step": 2493 }, { "epoch": 2.3651019440493126, "grad_norm": 0.42325489445166964, "learning_rate": 1.2997086443132218e-06, "loss": 0.0202, "step": 2494 }, { "epoch": 2.366050260787103, "grad_norm": 0.3616531140656992, "learning_rate": 1.2959976227412879e-06, "loss": 0.0162, "step": 2495 }, { "epoch": 2.3669985775248934, "grad_norm": 0.46009969632898884, "learning_rate": 1.2922911177151332e-06, "loss": 0.0269, "step": 2496 }, { "epoch": 2.367946894262684, "grad_norm": 0.45119455082431575, "learning_rate": 1.2885891337543539e-06, "loss": 0.0254, "step": 2497 }, { "epoch": 2.368895211000474, "grad_norm": 0.42641075850683124, "learning_rate": 1.2848916753730366e-06, "loss": 0.0202, "step": 2498 }, { "epoch": 2.3698435277382646, "grad_norm": 0.43843407405722246, "learning_rate": 1.2811987470797455e-06, "loss": 0.0228, "step": 2499 }, { "epoch": 2.370791844476055, "grad_norm": 0.3639166296679119, "learning_rate": 1.277510353377524e-06, "loss": 0.0235, "step": 2500 }, { "epoch": 2.3717401612138453, "grad_norm": 0.6231047022819872, "learning_rate": 1.2738264987638865e-06, "loss": 0.0219, "step": 2501 }, { "epoch": 2.3726884779516357, "grad_norm": 0.39227134362244037, "learning_rate": 1.2701471877308091e-06, "loss": 0.0223, "step": 2502 }, { "epoch": 2.373636794689426, "grad_norm": 0.33779641698396323, "learning_rate": 1.2664724247647303e-06, "loss": 0.0187, "step": 2503 }, { "epoch": 2.3745851114272165, "grad_norm": 0.3165810683945163, "learning_rate": 1.262802214346544e-06, "loss": 0.0156, "step": 2504 }, { "epoch": 2.3755334281650073, "grad_norm": 0.3233124265456635, "learning_rate": 1.2591365609515892e-06, "loss": 0.0181, "step": 2505 }, { "epoch": 2.3764817449027973, "grad_norm": 0.4607946489288941, "learning_rate": 1.2554754690496496e-06, "loss": 0.0176, "step": 2506 }, { "epoch": 2.377430061640588, "grad_norm": 0.5043713488293808, "learning_rate": 1.25181894310495e-06, "loss": 0.0324, "step": 2507 }, { "epoch": 2.3783783783783785, "grad_norm": 0.31629505288698206, "learning_rate": 1.248166987576141e-06, "loss": 0.0157, "step": 2508 }, { "epoch": 2.379326695116169, "grad_norm": 0.3803164685028412, "learning_rate": 1.2445196069163078e-06, "loss": 0.0218, "step": 2509 }, { "epoch": 2.3802750118539593, "grad_norm": 0.43944861644470423, "learning_rate": 1.240876805572951e-06, "loss": 0.0192, "step": 2510 }, { "epoch": 2.3812233285917497, "grad_norm": 0.36603580264615937, "learning_rate": 1.23723858798799e-06, "loss": 0.0211, "step": 2511 }, { "epoch": 2.38217164532954, "grad_norm": 0.39441769552699, "learning_rate": 1.2336049585977566e-06, "loss": 0.0206, "step": 2512 }, { "epoch": 2.3831199620673305, "grad_norm": 0.5104756079026775, "learning_rate": 1.2299759218329833e-06, "loss": 0.0182, "step": 2513 }, { "epoch": 2.384068278805121, "grad_norm": 0.4036452102453263, "learning_rate": 1.2263514821188078e-06, "loss": 0.0298, "step": 2514 }, { "epoch": 2.3850165955429112, "grad_norm": 0.34363638723562545, "learning_rate": 1.2227316438747566e-06, "loss": 0.0193, "step": 2515 }, { "epoch": 2.3859649122807016, "grad_norm": 0.36067365522749045, "learning_rate": 1.2191164115147513e-06, "loss": 0.016, "step": 2516 }, { "epoch": 2.386913229018492, "grad_norm": 0.33927962718163435, "learning_rate": 1.2155057894470928e-06, "loss": 0.0136, "step": 2517 }, { "epoch": 2.387861545756283, "grad_norm": 0.42725471109065416, "learning_rate": 1.211899782074461e-06, "loss": 0.0251, "step": 2518 }, { "epoch": 2.388809862494073, "grad_norm": 0.33443915124320717, "learning_rate": 1.2082983937939101e-06, "loss": 0.0137, "step": 2519 }, { "epoch": 2.3897581792318636, "grad_norm": 0.48954597652416665, "learning_rate": 1.2047016289968632e-06, "loss": 0.034, "step": 2520 }, { "epoch": 2.390706495969654, "grad_norm": 0.3683504845385266, "learning_rate": 1.201109492069102e-06, "loss": 0.0137, "step": 2521 }, { "epoch": 2.3916548127074444, "grad_norm": 0.4613644539571518, "learning_rate": 1.1975219873907679e-06, "loss": 0.0227, "step": 2522 }, { "epoch": 2.392603129445235, "grad_norm": 0.3541642000129723, "learning_rate": 1.193939119336352e-06, "loss": 0.0304, "step": 2523 }, { "epoch": 2.393551446183025, "grad_norm": 0.4363097081176452, "learning_rate": 1.1903608922746929e-06, "loss": 0.019, "step": 2524 }, { "epoch": 2.3944997629208156, "grad_norm": 0.48307067062916964, "learning_rate": 1.1867873105689714e-06, "loss": 0.0246, "step": 2525 }, { "epoch": 2.395448079658606, "grad_norm": 0.3539294327572431, "learning_rate": 1.1832183785767004e-06, "loss": 0.0198, "step": 2526 }, { "epoch": 2.3963963963963963, "grad_norm": 0.3538424483292842, "learning_rate": 1.179654100649727e-06, "loss": 0.017, "step": 2527 }, { "epoch": 2.3973447131341867, "grad_norm": 0.4009227484286087, "learning_rate": 1.17609448113422e-06, "loss": 0.0226, "step": 2528 }, { "epoch": 2.398293029871977, "grad_norm": 0.383841849833669, "learning_rate": 1.1725395243706678e-06, "loss": 0.0171, "step": 2529 }, { "epoch": 2.3992413466097675, "grad_norm": 0.4068400535839265, "learning_rate": 1.168989234693878e-06, "loss": 0.02, "step": 2530 }, { "epoch": 2.400189663347558, "grad_norm": 0.35983771900013944, "learning_rate": 1.1654436164329602e-06, "loss": 0.0166, "step": 2531 }, { "epoch": 2.4011379800853483, "grad_norm": 0.2995912937081424, "learning_rate": 1.1619026739113332e-06, "loss": 0.0143, "step": 2532 }, { "epoch": 2.402086296823139, "grad_norm": 0.3237682151824729, "learning_rate": 1.1583664114467146e-06, "loss": 0.0196, "step": 2533 }, { "epoch": 2.4030346135609295, "grad_norm": 0.3277525209186715, "learning_rate": 1.1548348333511095e-06, "loss": 0.0247, "step": 2534 }, { "epoch": 2.40398293029872, "grad_norm": 0.3855701948617413, "learning_rate": 1.1513079439308183e-06, "loss": 0.0187, "step": 2535 }, { "epoch": 2.4049312470365103, "grad_norm": 0.4217863375770685, "learning_rate": 1.147785747486418e-06, "loss": 0.0254, "step": 2536 }, { "epoch": 2.4058795637743007, "grad_norm": 0.40959180694037955, "learning_rate": 1.1442682483127686e-06, "loss": 0.0251, "step": 2537 }, { "epoch": 2.406827880512091, "grad_norm": 0.4370468363653208, "learning_rate": 1.1407554506989965e-06, "loss": 0.0223, "step": 2538 }, { "epoch": 2.4077761972498815, "grad_norm": 0.48429407932806134, "learning_rate": 1.1372473589285016e-06, "loss": 0.025, "step": 2539 }, { "epoch": 2.408724513987672, "grad_norm": 0.33025993789585734, "learning_rate": 1.1337439772789388e-06, "loss": 0.0157, "step": 2540 }, { "epoch": 2.4096728307254622, "grad_norm": 0.3487731113190881, "learning_rate": 1.1302453100222272e-06, "loss": 0.0202, "step": 2541 }, { "epoch": 2.4106211474632526, "grad_norm": 0.31353805106431143, "learning_rate": 1.1267513614245291e-06, "loss": 0.0154, "step": 2542 }, { "epoch": 2.411569464201043, "grad_norm": 0.3389412149212434, "learning_rate": 1.1232621357462609e-06, "loss": 0.0205, "step": 2543 }, { "epoch": 2.4125177809388334, "grad_norm": 0.38136622760182587, "learning_rate": 1.1197776372420733e-06, "loss": 0.0219, "step": 2544 }, { "epoch": 2.413466097676624, "grad_norm": 0.27921173877988736, "learning_rate": 1.1162978701608574e-06, "loss": 0.0153, "step": 2545 }, { "epoch": 2.4144144144144146, "grad_norm": 0.40968982802283244, "learning_rate": 1.112822838745734e-06, "loss": 0.0285, "step": 2546 }, { "epoch": 2.415362731152205, "grad_norm": 0.4062864100779731, "learning_rate": 1.1093525472340471e-06, "loss": 0.0239, "step": 2547 }, { "epoch": 2.4163110478899954, "grad_norm": 0.3736524597587387, "learning_rate": 1.105886999857365e-06, "loss": 0.0225, "step": 2548 }, { "epoch": 2.417259364627786, "grad_norm": 0.3515641886656497, "learning_rate": 1.1024262008414677e-06, "loss": 0.028, "step": 2549 }, { "epoch": 2.418207681365576, "grad_norm": 0.6471429652164479, "learning_rate": 1.098970154406344e-06, "loss": 0.0262, "step": 2550 }, { "epoch": 2.4191559981033666, "grad_norm": 0.3650914464417128, "learning_rate": 1.095518864766194e-06, "loss": 0.0182, "step": 2551 }, { "epoch": 2.420104314841157, "grad_norm": 0.4376020269032313, "learning_rate": 1.09207233612941e-06, "loss": 0.0153, "step": 2552 }, { "epoch": 2.4210526315789473, "grad_norm": 0.34862166575759534, "learning_rate": 1.088630572698583e-06, "loss": 0.0162, "step": 2553 }, { "epoch": 2.4220009483167377, "grad_norm": 0.5082871266947292, "learning_rate": 1.0851935786704954e-06, "loss": 0.0272, "step": 2554 }, { "epoch": 2.422949265054528, "grad_norm": 0.41158101422665216, "learning_rate": 1.0817613582361074e-06, "loss": 0.0229, "step": 2555 }, { "epoch": 2.4238975817923185, "grad_norm": 0.43603252981364155, "learning_rate": 1.0783339155805644e-06, "loss": 0.027, "step": 2556 }, { "epoch": 2.424845898530109, "grad_norm": 0.49024524037096656, "learning_rate": 1.0749112548831848e-06, "loss": 0.0254, "step": 2557 }, { "epoch": 2.4257942152678993, "grad_norm": 0.3814270330954172, "learning_rate": 1.0714933803174526e-06, "loss": 0.0213, "step": 2558 }, { "epoch": 2.42674253200569, "grad_norm": 0.3976340849529511, "learning_rate": 1.0680802960510207e-06, "loss": 0.0227, "step": 2559 }, { "epoch": 2.4276908487434805, "grad_norm": 0.35166175212564016, "learning_rate": 1.0646720062456968e-06, "loss": 0.0201, "step": 2560 }, { "epoch": 2.428639165481271, "grad_norm": 0.5056479105854602, "learning_rate": 1.0612685150574432e-06, "loss": 0.0251, "step": 2561 }, { "epoch": 2.4295874822190613, "grad_norm": 0.49161815028338257, "learning_rate": 1.0578698266363734e-06, "loss": 0.0198, "step": 2562 }, { "epoch": 2.4305357989568517, "grad_norm": 0.41629650229173676, "learning_rate": 1.054475945126741e-06, "loss": 0.0216, "step": 2563 }, { "epoch": 2.431484115694642, "grad_norm": 0.4745068679798503, "learning_rate": 1.051086874666941e-06, "loss": 0.027, "step": 2564 }, { "epoch": 2.4324324324324325, "grad_norm": 0.5231051341052511, "learning_rate": 1.0477026193895024e-06, "loss": 0.0225, "step": 2565 }, { "epoch": 2.433380749170223, "grad_norm": 0.5996082247716173, "learning_rate": 1.0443231834210781e-06, "loss": 0.0343, "step": 2566 }, { "epoch": 2.4343290659080132, "grad_norm": 0.3254194676503433, "learning_rate": 1.0409485708824506e-06, "loss": 0.0172, "step": 2567 }, { "epoch": 2.4352773826458036, "grad_norm": 0.4487239432643308, "learning_rate": 1.0375787858885162e-06, "loss": 0.0294, "step": 2568 }, { "epoch": 2.436225699383594, "grad_norm": 0.46358117621037215, "learning_rate": 1.0342138325482859e-06, "loss": 0.023, "step": 2569 }, { "epoch": 2.4371740161213844, "grad_norm": 0.36382789199564586, "learning_rate": 1.0308537149648823e-06, "loss": 0.0166, "step": 2570 }, { "epoch": 2.438122332859175, "grad_norm": 0.5348352187185549, "learning_rate": 1.0274984372355273e-06, "loss": 0.0288, "step": 2571 }, { "epoch": 2.4390706495969656, "grad_norm": 0.3585580345192679, "learning_rate": 1.0241480034515406e-06, "loss": 0.023, "step": 2572 }, { "epoch": 2.4400189663347556, "grad_norm": 0.3805010713917909, "learning_rate": 1.02080241769834e-06, "loss": 0.0205, "step": 2573 }, { "epoch": 2.4409672830725464, "grad_norm": 0.3113821678519327, "learning_rate": 1.0174616840554274e-06, "loss": 0.0182, "step": 2574 }, { "epoch": 2.441915599810337, "grad_norm": 0.3576257333753975, "learning_rate": 1.0141258065963916e-06, "loss": 0.0182, "step": 2575 }, { "epoch": 2.442863916548127, "grad_norm": 0.4104979653412175, "learning_rate": 1.0107947893888965e-06, "loss": 0.0207, "step": 2576 }, { "epoch": 2.4438122332859176, "grad_norm": 0.521831837691463, "learning_rate": 1.0074686364946823e-06, "loss": 0.0288, "step": 2577 }, { "epoch": 2.444760550023708, "grad_norm": 0.44124587907058904, "learning_rate": 1.0041473519695577e-06, "loss": 0.0206, "step": 2578 }, { "epoch": 2.4457088667614983, "grad_norm": 0.435513491122848, "learning_rate": 1.000830939863392e-06, "loss": 0.0222, "step": 2579 }, { "epoch": 2.4466571834992887, "grad_norm": 0.5135158628816288, "learning_rate": 9.97519404220118e-07, "loss": 0.0534, "step": 2580 }, { "epoch": 2.447605500237079, "grad_norm": 0.40646237085276543, "learning_rate": 9.94212749077717e-07, "loss": 0.027, "step": 2581 }, { "epoch": 2.4485538169748695, "grad_norm": 0.7325890802201012, "learning_rate": 9.90910978468224e-07, "loss": 0.0369, "step": 2582 }, { "epoch": 2.44950213371266, "grad_norm": 0.4343579637391866, "learning_rate": 9.876140964177162e-07, "loss": 0.0218, "step": 2583 }, { "epoch": 2.4504504504504503, "grad_norm": 0.5766706870308571, "learning_rate": 9.84322106946306e-07, "loss": 0.0237, "step": 2584 }, { "epoch": 2.4513987671882407, "grad_norm": 0.39849795851452613, "learning_rate": 9.810350140681457e-07, "loss": 0.0182, "step": 2585 }, { "epoch": 2.452347083926031, "grad_norm": 0.4783557947198164, "learning_rate": 9.777528217914162e-07, "loss": 0.0294, "step": 2586 }, { "epoch": 2.453295400663822, "grad_norm": 0.42456242270260697, "learning_rate": 9.744755341183181e-07, "loss": 0.0168, "step": 2587 }, { "epoch": 2.4542437174016123, "grad_norm": 0.39678351340169365, "learning_rate": 9.712031550450774e-07, "loss": 0.0174, "step": 2588 }, { "epoch": 2.4551920341394027, "grad_norm": 0.47949218889264555, "learning_rate": 9.679356885619284e-07, "loss": 0.0262, "step": 2589 }, { "epoch": 2.456140350877193, "grad_norm": 0.3236340258110289, "learning_rate": 9.646731386531204e-07, "loss": 0.0166, "step": 2590 }, { "epoch": 2.4570886676149835, "grad_norm": 0.4597376524730793, "learning_rate": 9.61415509296907e-07, "loss": 0.0259, "step": 2591 }, { "epoch": 2.458036984352774, "grad_norm": 0.37074395951240846, "learning_rate": 9.581628044655394e-07, "loss": 0.0224, "step": 2592 }, { "epoch": 2.4589853010905642, "grad_norm": 0.37957880087331625, "learning_rate": 9.549150281252633e-07, "loss": 0.0251, "step": 2593 }, { "epoch": 2.4599336178283546, "grad_norm": 0.29271615580129845, "learning_rate": 9.516721842363197e-07, "loss": 0.0146, "step": 2594 }, { "epoch": 2.460881934566145, "grad_norm": 0.35582409975735413, "learning_rate": 9.484342767529292e-07, "loss": 0.0187, "step": 2595 }, { "epoch": 2.4618302513039354, "grad_norm": 0.3500315779102406, "learning_rate": 9.452013096232981e-07, "loss": 0.0208, "step": 2596 }, { "epoch": 2.462778568041726, "grad_norm": 0.3382420903851306, "learning_rate": 9.419732867896048e-07, "loss": 0.0255, "step": 2597 }, { "epoch": 2.463726884779516, "grad_norm": 0.578123966367445, "learning_rate": 9.38750212188001e-07, "loss": 0.0407, "step": 2598 }, { "epoch": 2.4646752015173066, "grad_norm": 0.43085532284231304, "learning_rate": 9.355320897486053e-07, "loss": 0.02, "step": 2599 }, { "epoch": 2.4656235182550974, "grad_norm": 0.35880891064401854, "learning_rate": 9.323189233954937e-07, "loss": 0.0205, "step": 2600 }, { "epoch": 2.466571834992888, "grad_norm": 0.40312804249813705, "learning_rate": 9.291107170467034e-07, "loss": 0.0221, "step": 2601 }, { "epoch": 2.467520151730678, "grad_norm": 0.6304307314634919, "learning_rate": 9.259074746142238e-07, "loss": 0.0199, "step": 2602 }, { "epoch": 2.4684684684684686, "grad_norm": 0.40697982149629996, "learning_rate": 9.227092000039867e-07, "loss": 0.0338, "step": 2603 }, { "epoch": 2.469416785206259, "grad_norm": 0.36786978507872714, "learning_rate": 9.195158971158702e-07, "loss": 0.0184, "step": 2604 }, { "epoch": 2.4703651019440493, "grad_norm": 0.6005718767374743, "learning_rate": 9.16327569843688e-07, "loss": 0.0295, "step": 2605 }, { "epoch": 2.4713134186818397, "grad_norm": 0.565930456400567, "learning_rate": 9.131442220751874e-07, "loss": 0.031, "step": 2606 }, { "epoch": 2.47226173541963, "grad_norm": 0.42111140772170896, "learning_rate": 9.099658576920467e-07, "loss": 0.0233, "step": 2607 }, { "epoch": 2.4732100521574205, "grad_norm": 0.47019365608336894, "learning_rate": 9.067924805698619e-07, "loss": 0.0263, "step": 2608 }, { "epoch": 2.474158368895211, "grad_norm": 0.4078998491617388, "learning_rate": 9.036240945781527e-07, "loss": 0.0185, "step": 2609 }, { "epoch": 2.4751066856330013, "grad_norm": 0.40269682929382367, "learning_rate": 9.004607035803508e-07, "loss": 0.0216, "step": 2610 }, { "epoch": 2.4760550023707917, "grad_norm": 0.2937728488786148, "learning_rate": 8.973023114337958e-07, "loss": 0.0126, "step": 2611 }, { "epoch": 2.477003319108582, "grad_norm": 0.33063640858727344, "learning_rate": 8.941489219897354e-07, "loss": 0.017, "step": 2612 }, { "epoch": 2.477951635846373, "grad_norm": 0.38049725788517497, "learning_rate": 8.910005390933124e-07, "loss": 0.0194, "step": 2613 }, { "epoch": 2.4788999525841633, "grad_norm": 0.5046033844115824, "learning_rate": 8.8785716658357e-07, "loss": 0.0261, "step": 2614 }, { "epoch": 2.4798482693219537, "grad_norm": 0.5299027510329987, "learning_rate": 8.847188082934383e-07, "loss": 0.0282, "step": 2615 }, { "epoch": 2.480796586059744, "grad_norm": 0.39419036803568774, "learning_rate": 8.815854680497327e-07, "loss": 0.0218, "step": 2616 }, { "epoch": 2.4817449027975345, "grad_norm": 0.3955758545159766, "learning_rate": 8.78457149673152e-07, "loss": 0.0228, "step": 2617 }, { "epoch": 2.482693219535325, "grad_norm": 0.47313816441818424, "learning_rate": 8.753338569782732e-07, "loss": 0.0221, "step": 2618 }, { "epoch": 2.4836415362731152, "grad_norm": 0.271371398644424, "learning_rate": 8.722155937735394e-07, "loss": 0.01, "step": 2619 }, { "epoch": 2.4845898530109056, "grad_norm": 0.3398216687335157, "learning_rate": 8.691023638612661e-07, "loss": 0.0163, "step": 2620 }, { "epoch": 2.485538169748696, "grad_norm": 0.6048741244957424, "learning_rate": 8.659941710376279e-07, "loss": 0.0258, "step": 2621 }, { "epoch": 2.4864864864864864, "grad_norm": 0.3436798347323189, "learning_rate": 8.628910190926599e-07, "loss": 0.0181, "step": 2622 }, { "epoch": 2.487434803224277, "grad_norm": 0.4156429457098121, "learning_rate": 8.59792911810251e-07, "loss": 0.0237, "step": 2623 }, { "epoch": 2.488383119962067, "grad_norm": 0.2954143857319232, "learning_rate": 8.566998529681337e-07, "loss": 0.0132, "step": 2624 }, { "epoch": 2.4893314366998576, "grad_norm": 0.39413478905762644, "learning_rate": 8.536118463378918e-07, "loss": 0.0162, "step": 2625 }, { "epoch": 2.4902797534376484, "grad_norm": 0.3892273346253101, "learning_rate": 8.505288956849428e-07, "loss": 0.0153, "step": 2626 }, { "epoch": 2.4912280701754383, "grad_norm": 0.6174085428601906, "learning_rate": 8.474510047685408e-07, "loss": 0.0311, "step": 2627 }, { "epoch": 2.492176386913229, "grad_norm": 0.4357016874336171, "learning_rate": 8.443781773417726e-07, "loss": 0.029, "step": 2628 }, { "epoch": 2.4931247036510196, "grad_norm": 0.34487830360142213, "learning_rate": 8.413104171515462e-07, "loss": 0.0146, "step": 2629 }, { "epoch": 2.49407302038881, "grad_norm": 0.36125275627714193, "learning_rate": 8.382477279385953e-07, "loss": 0.0226, "step": 2630 }, { "epoch": 2.4950213371266003, "grad_norm": 0.42552335088399396, "learning_rate": 8.35190113437469e-07, "loss": 0.0264, "step": 2631 }, { "epoch": 2.4959696538643907, "grad_norm": 0.34791091511753236, "learning_rate": 8.321375773765256e-07, "loss": 0.0175, "step": 2632 }, { "epoch": 2.496917970602181, "grad_norm": 0.3954500689347747, "learning_rate": 8.290901234779336e-07, "loss": 0.0188, "step": 2633 }, { "epoch": 2.4978662873399715, "grad_norm": 0.3663693629694859, "learning_rate": 8.260477554576657e-07, "loss": 0.0171, "step": 2634 }, { "epoch": 2.498814604077762, "grad_norm": 0.5494442209985945, "learning_rate": 8.230104770254899e-07, "loss": 0.0367, "step": 2635 }, { "epoch": 2.4997629208155523, "grad_norm": 0.5528576688557691, "learning_rate": 8.199782918849681e-07, "loss": 0.0285, "step": 2636 }, { "epoch": 2.5007112375533427, "grad_norm": 0.6404166113112415, "learning_rate": 8.169512037334553e-07, "loss": 0.0261, "step": 2637 }, { "epoch": 2.501659554291133, "grad_norm": 0.4286300123661097, "learning_rate": 8.139292162620866e-07, "loss": 0.0217, "step": 2638 }, { "epoch": 2.502607871028924, "grad_norm": 0.42741045341288486, "learning_rate": 8.109123331557828e-07, "loss": 0.021, "step": 2639 }, { "epoch": 2.503556187766714, "grad_norm": 0.4501781665468482, "learning_rate": 8.079005580932353e-07, "loss": 0.0245, "step": 2640 }, { "epoch": 2.5045045045045047, "grad_norm": 0.5105968099593348, "learning_rate": 8.048938947469109e-07, "loss": 0.0274, "step": 2641 }, { "epoch": 2.505452821242295, "grad_norm": 0.41552893159895754, "learning_rate": 8.018923467830403e-07, "loss": 0.0211, "step": 2642 }, { "epoch": 2.5064011379800855, "grad_norm": 0.4791684882671791, "learning_rate": 7.988959178616184e-07, "loss": 0.022, "step": 2643 }, { "epoch": 2.507349454717876, "grad_norm": 0.3439927281902292, "learning_rate": 7.959046116364e-07, "loss": 0.0147, "step": 2644 }, { "epoch": 2.5082977714556662, "grad_norm": 0.3146069088165673, "learning_rate": 7.929184317548888e-07, "loss": 0.014, "step": 2645 }, { "epoch": 2.5092460881934566, "grad_norm": 0.3109625989087509, "learning_rate": 7.899373818583417e-07, "loss": 0.0146, "step": 2646 }, { "epoch": 2.510194404931247, "grad_norm": 0.5903370862543935, "learning_rate": 7.869614655817576e-07, "loss": 0.0266, "step": 2647 }, { "epoch": 2.5111427216690374, "grad_norm": 0.42346350090954193, "learning_rate": 7.839906865538754e-07, "loss": 0.0228, "step": 2648 }, { "epoch": 2.512091038406828, "grad_norm": 0.5644352189518651, "learning_rate": 7.810250483971743e-07, "loss": 0.0177, "step": 2649 }, { "epoch": 2.513039355144618, "grad_norm": 0.6800732019791883, "learning_rate": 7.780645547278587e-07, "loss": 0.0388, "step": 2650 }, { "epoch": 2.5139876718824086, "grad_norm": 0.3672619635328786, "learning_rate": 7.751092091558632e-07, "loss": 0.0175, "step": 2651 }, { "epoch": 2.5149359886201994, "grad_norm": 0.36697647870892186, "learning_rate": 7.721590152848474e-07, "loss": 0.0186, "step": 2652 }, { "epoch": 2.5158843053579893, "grad_norm": 0.3949457428777839, "learning_rate": 7.692139767121826e-07, "loss": 0.013, "step": 2653 }, { "epoch": 2.51683262209578, "grad_norm": 0.36894744299049603, "learning_rate": 7.662740970289595e-07, "loss": 0.0206, "step": 2654 }, { "epoch": 2.5177809388335706, "grad_norm": 0.3004729811076744, "learning_rate": 7.633393798199778e-07, "loss": 0.0145, "step": 2655 }, { "epoch": 2.518729255571361, "grad_norm": 0.30582539821490057, "learning_rate": 7.604098286637379e-07, "loss": 0.0151, "step": 2656 }, { "epoch": 2.5196775723091513, "grad_norm": 0.41815777077524247, "learning_rate": 7.574854471324461e-07, "loss": 0.0253, "step": 2657 }, { "epoch": 2.5206258890469417, "grad_norm": 0.31145601353840013, "learning_rate": 7.545662387920016e-07, "loss": 0.0165, "step": 2658 }, { "epoch": 2.521574205784732, "grad_norm": 0.3562269510719666, "learning_rate": 7.516522072019955e-07, "loss": 0.0158, "step": 2659 }, { "epoch": 2.5225225225225225, "grad_norm": 0.4008350356129895, "learning_rate": 7.487433559157098e-07, "loss": 0.0171, "step": 2660 }, { "epoch": 2.523470839260313, "grad_norm": 0.6137367883176392, "learning_rate": 7.458396884801056e-07, "loss": 0.0244, "step": 2661 }, { "epoch": 2.5244191559981033, "grad_norm": 0.4813680993375203, "learning_rate": 7.429412084358262e-07, "loss": 0.0269, "step": 2662 }, { "epoch": 2.5253674727358937, "grad_norm": 0.46215577248037415, "learning_rate": 7.400479193171895e-07, "loss": 0.0266, "step": 2663 }, { "epoch": 2.526315789473684, "grad_norm": 0.37793669989539486, "learning_rate": 7.371598246521805e-07, "loss": 0.0158, "step": 2664 }, { "epoch": 2.527264106211475, "grad_norm": 0.41136067655717384, "learning_rate": 7.34276927962454e-07, "loss": 0.0238, "step": 2665 }, { "epoch": 2.528212422949265, "grad_norm": 0.33259267222332123, "learning_rate": 7.313992327633235e-07, "loss": 0.0209, "step": 2666 }, { "epoch": 2.5291607396870557, "grad_norm": 0.5819528717109627, "learning_rate": 7.285267425637621e-07, "loss": 0.0172, "step": 2667 }, { "epoch": 2.5301090564248456, "grad_norm": 0.30059814565316323, "learning_rate": 7.256594608663963e-07, "loss": 0.0185, "step": 2668 }, { "epoch": 2.5310573731626365, "grad_norm": 0.3712777470405421, "learning_rate": 7.227973911675002e-07, "loss": 0.0179, "step": 2669 }, { "epoch": 2.532005689900427, "grad_norm": 0.29855322296258097, "learning_rate": 7.199405369569911e-07, "loss": 0.0149, "step": 2670 }, { "epoch": 2.5329540066382172, "grad_norm": 0.40829074659712467, "learning_rate": 7.170889017184313e-07, "loss": 0.0187, "step": 2671 }, { "epoch": 2.5339023233760076, "grad_norm": 0.4407705399218079, "learning_rate": 7.142424889290139e-07, "loss": 0.0197, "step": 2672 }, { "epoch": 2.534850640113798, "grad_norm": 0.4866101217898875, "learning_rate": 7.114013020595684e-07, "loss": 0.0251, "step": 2673 }, { "epoch": 2.5357989568515884, "grad_norm": 0.40018962733942437, "learning_rate": 7.085653445745488e-07, "loss": 0.0233, "step": 2674 }, { "epoch": 2.536747273589379, "grad_norm": 0.383701601440475, "learning_rate": 7.057346199320342e-07, "loss": 0.0275, "step": 2675 }, { "epoch": 2.537695590327169, "grad_norm": 0.5349755091825953, "learning_rate": 7.029091315837245e-07, "loss": 0.0217, "step": 2676 }, { "epoch": 2.5386439070649596, "grad_norm": 0.3679483002865713, "learning_rate": 7.000888829749292e-07, "loss": 0.0158, "step": 2677 }, { "epoch": 2.53959222380275, "grad_norm": 0.35330307265879174, "learning_rate": 6.972738775445747e-07, "loss": 0.0191, "step": 2678 }, { "epoch": 2.5405405405405403, "grad_norm": 0.38659927985192755, "learning_rate": 6.9446411872519e-07, "loss": 0.0211, "step": 2679 }, { "epoch": 2.541488857278331, "grad_norm": 0.4064147988484457, "learning_rate": 6.916596099429096e-07, "loss": 0.0233, "step": 2680 }, { "epoch": 2.542437174016121, "grad_norm": 0.6580118862505251, "learning_rate": 6.888603546174638e-07, "loss": 0.0245, "step": 2681 }, { "epoch": 2.543385490753912, "grad_norm": 0.3944629105173342, "learning_rate": 6.860663561621767e-07, "loss": 0.0258, "step": 2682 }, { "epoch": 2.5443338074917023, "grad_norm": 0.33272900655389953, "learning_rate": 6.83277617983964e-07, "loss": 0.0179, "step": 2683 }, { "epoch": 2.5452821242294927, "grad_norm": 0.4167482558841912, "learning_rate": 6.804941434833285e-07, "loss": 0.0177, "step": 2684 }, { "epoch": 2.546230440967283, "grad_norm": 0.3992502938558958, "learning_rate": 6.777159360543501e-07, "loss": 0.0274, "step": 2685 }, { "epoch": 2.5471787577050735, "grad_norm": 0.40094207085691724, "learning_rate": 6.749429990846901e-07, "loss": 0.0274, "step": 2686 }, { "epoch": 2.548127074442864, "grad_norm": 0.3792531400747548, "learning_rate": 6.721753359555833e-07, "loss": 0.0241, "step": 2687 }, { "epoch": 2.5490753911806543, "grad_norm": 0.37741471606051397, "learning_rate": 6.6941295004183e-07, "loss": 0.0195, "step": 2688 }, { "epoch": 2.5500237079184447, "grad_norm": 0.32470565833329973, "learning_rate": 6.666558447118005e-07, "loss": 0.015, "step": 2689 }, { "epoch": 2.550972024656235, "grad_norm": 0.4293192559707278, "learning_rate": 6.639040233274225e-07, "loss": 0.0217, "step": 2690 }, { "epoch": 2.5519203413940255, "grad_norm": 0.3929905923840855, "learning_rate": 6.611574892441802e-07, "loss": 0.0196, "step": 2691 }, { "epoch": 2.552868658131816, "grad_norm": 0.3561960449808376, "learning_rate": 6.584162458111148e-07, "loss": 0.0226, "step": 2692 }, { "epoch": 2.5538169748696067, "grad_norm": 0.37662449677307064, "learning_rate": 6.556802963708115e-07, "loss": 0.0276, "step": 2693 }, { "epoch": 2.5547652916073966, "grad_norm": 0.5129824661775971, "learning_rate": 6.52949644259403e-07, "loss": 0.0362, "step": 2694 }, { "epoch": 2.5557136083451875, "grad_norm": 0.6692404460687522, "learning_rate": 6.502242928065633e-07, "loss": 0.0332, "step": 2695 }, { "epoch": 2.556661925082978, "grad_norm": 0.393927786278907, "learning_rate": 6.475042453354996e-07, "loss": 0.0206, "step": 2696 }, { "epoch": 2.5576102418207682, "grad_norm": 0.3577618677497919, "learning_rate": 6.44789505162955e-07, "loss": 0.0175, "step": 2697 }, { "epoch": 2.5585585585585586, "grad_norm": 0.3661222623781511, "learning_rate": 6.42080075599198e-07, "loss": 0.0117, "step": 2698 }, { "epoch": 2.559506875296349, "grad_norm": 0.30688738276260336, "learning_rate": 6.393759599480243e-07, "loss": 0.0161, "step": 2699 }, { "epoch": 2.5604551920341394, "grad_norm": 0.41630163331920195, "learning_rate": 6.366771615067497e-07, "loss": 0.0189, "step": 2700 }, { "epoch": 2.56140350877193, "grad_norm": 0.5340246040237304, "learning_rate": 6.339836835662039e-07, "loss": 0.0275, "step": 2701 }, { "epoch": 2.56235182550972, "grad_norm": 0.35423728423688666, "learning_rate": 6.312955294107304e-07, "loss": 0.0158, "step": 2702 }, { "epoch": 2.5633001422475106, "grad_norm": 0.47028395732329115, "learning_rate": 6.28612702318181e-07, "loss": 0.0289, "step": 2703 }, { "epoch": 2.564248458985301, "grad_norm": 0.4005227388788351, "learning_rate": 6.259352055599127e-07, "loss": 0.032, "step": 2704 }, { "epoch": 2.5651967757230913, "grad_norm": 0.467516363616176, "learning_rate": 6.23263042400783e-07, "loss": 0.0212, "step": 2705 }, { "epoch": 2.566145092460882, "grad_norm": 0.31564569013460975, "learning_rate": 6.205962160991424e-07, "loss": 0.0175, "step": 2706 }, { "epoch": 2.567093409198672, "grad_norm": 0.3373546761437316, "learning_rate": 6.179347299068377e-07, "loss": 0.0142, "step": 2707 }, { "epoch": 2.568041725936463, "grad_norm": 0.47280981766734664, "learning_rate": 6.152785870692041e-07, "loss": 0.0223, "step": 2708 }, { "epoch": 2.5689900426742533, "grad_norm": 0.36467322171181704, "learning_rate": 6.126277908250572e-07, "loss": 0.0187, "step": 2709 }, { "epoch": 2.5699383594120437, "grad_norm": 0.4297930835831616, "learning_rate": 6.099823444066982e-07, "loss": 0.0292, "step": 2710 }, { "epoch": 2.570886676149834, "grad_norm": 0.41906846780451784, "learning_rate": 6.073422510399007e-07, "loss": 0.0252, "step": 2711 }, { "epoch": 2.5718349928876245, "grad_norm": 0.38650910584228265, "learning_rate": 6.047075139439151e-07, "loss": 0.0226, "step": 2712 }, { "epoch": 2.572783309625415, "grad_norm": 0.5084741884777527, "learning_rate": 6.02078136331456e-07, "loss": 0.0195, "step": 2713 }, { "epoch": 2.5737316263632053, "grad_norm": 0.39278830005678883, "learning_rate": 5.994541214087052e-07, "loss": 0.021, "step": 2714 }, { "epoch": 2.5746799431009957, "grad_norm": 0.2952909468393027, "learning_rate": 5.968354723753056e-07, "loss": 0.0151, "step": 2715 }, { "epoch": 2.575628259838786, "grad_norm": 0.37045235908101837, "learning_rate": 5.94222192424358e-07, "loss": 0.0161, "step": 2716 }, { "epoch": 2.5765765765765765, "grad_norm": 0.3281172799358503, "learning_rate": 5.916142847424127e-07, "loss": 0.0148, "step": 2717 }, { "epoch": 2.577524893314367, "grad_norm": 0.4924919864359691, "learning_rate": 5.890117525094741e-07, "loss": 0.0285, "step": 2718 }, { "epoch": 2.5784732100521577, "grad_norm": 0.4481794894764316, "learning_rate": 5.864145988989867e-07, "loss": 0.0194, "step": 2719 }, { "epoch": 2.5794215267899476, "grad_norm": 0.5321347244112973, "learning_rate": 5.838228270778407e-07, "loss": 0.0204, "step": 2720 }, { "epoch": 2.5803698435277385, "grad_norm": 0.31872189522321803, "learning_rate": 5.812364402063636e-07, "loss": 0.0169, "step": 2721 }, { "epoch": 2.581318160265529, "grad_norm": 0.41681981250054423, "learning_rate": 5.786554414383128e-07, "loss": 0.0201, "step": 2722 }, { "epoch": 2.5822664770033192, "grad_norm": 0.5067386260406679, "learning_rate": 5.760798339208807e-07, "loss": 0.0234, "step": 2723 }, { "epoch": 2.5832147937411096, "grad_norm": 0.33542737410974843, "learning_rate": 5.735096207946822e-07, "loss": 0.0151, "step": 2724 }, { "epoch": 2.5841631104789, "grad_norm": 0.6467429650160105, "learning_rate": 5.709448051937544e-07, "loss": 0.0204, "step": 2725 }, { "epoch": 2.5851114272166904, "grad_norm": 0.38155909066198, "learning_rate": 5.683853902455561e-07, "loss": 0.0217, "step": 2726 }, { "epoch": 2.586059743954481, "grad_norm": 0.28662641689056056, "learning_rate": 5.658313790709569e-07, "loss": 0.0119, "step": 2727 }, { "epoch": 2.587008060692271, "grad_norm": 0.3990322895513413, "learning_rate": 5.632827747842395e-07, "loss": 0.023, "step": 2728 }, { "epoch": 2.5879563774300616, "grad_norm": 0.40716429826152295, "learning_rate": 5.607395804930943e-07, "loss": 0.0207, "step": 2729 }, { "epoch": 2.588904694167852, "grad_norm": 0.35974137294779707, "learning_rate": 5.582017992986122e-07, "loss": 0.0249, "step": 2730 }, { "epoch": 2.5898530109056423, "grad_norm": 0.38646462128999254, "learning_rate": 5.556694342952845e-07, "loss": 0.0186, "step": 2731 }, { "epoch": 2.590801327643433, "grad_norm": 0.37653004258963946, "learning_rate": 5.531424885710012e-07, "loss": 0.0268, "step": 2732 }, { "epoch": 2.591749644381223, "grad_norm": 0.4299208034382125, "learning_rate": 5.506209652070399e-07, "loss": 0.0349, "step": 2733 }, { "epoch": 2.592697961119014, "grad_norm": 0.38861976736094617, "learning_rate": 5.48104867278067e-07, "loss": 0.018, "step": 2734 }, { "epoch": 2.593646277856804, "grad_norm": 0.4175379738102784, "learning_rate": 5.455941978521368e-07, "loss": 0.0168, "step": 2735 }, { "epoch": 2.5945945945945947, "grad_norm": 0.33966670471790067, "learning_rate": 5.430889599906792e-07, "loss": 0.0151, "step": 2736 }, { "epoch": 2.595542911332385, "grad_norm": 0.3835190152377659, "learning_rate": 5.405891567485066e-07, "loss": 0.0213, "step": 2737 }, { "epoch": 2.5964912280701755, "grad_norm": 0.4113531869569931, "learning_rate": 5.380947911737988e-07, "loss": 0.0314, "step": 2738 }, { "epoch": 2.597439544807966, "grad_norm": 0.40520964068776627, "learning_rate": 5.356058663081093e-07, "loss": 0.0264, "step": 2739 }, { "epoch": 2.5983878615457563, "grad_norm": 0.7837946356297472, "learning_rate": 5.33122385186357e-07, "loss": 0.0217, "step": 2740 }, { "epoch": 2.5993361782835467, "grad_norm": 0.8806203038130418, "learning_rate": 5.306443508368197e-07, "loss": 0.0387, "step": 2741 }, { "epoch": 2.600284495021337, "grad_norm": 0.34074270045547633, "learning_rate": 5.281717662811381e-07, "loss": 0.0149, "step": 2742 }, { "epoch": 2.6012328117591275, "grad_norm": 0.4477584975020037, "learning_rate": 5.257046345343031e-07, "loss": 0.0246, "step": 2743 }, { "epoch": 2.602181128496918, "grad_norm": 0.5551990298380998, "learning_rate": 5.2324295860466e-07, "loss": 0.0254, "step": 2744 }, { "epoch": 2.6031294452347082, "grad_norm": 0.45009786762895765, "learning_rate": 5.207867414939005e-07, "loss": 0.024, "step": 2745 }, { "epoch": 2.6040777619724986, "grad_norm": 0.4832226736950542, "learning_rate": 5.183359861970572e-07, "loss": 0.0297, "step": 2746 }, { "epoch": 2.6050260787102895, "grad_norm": 0.3779678120598059, "learning_rate": 5.158906957025079e-07, "loss": 0.0137, "step": 2747 }, { "epoch": 2.6059743954480794, "grad_norm": 0.35142073959413767, "learning_rate": 5.134508729919635e-07, "loss": 0.0179, "step": 2748 }, { "epoch": 2.6069227121858702, "grad_norm": 0.40042406085470267, "learning_rate": 5.110165210404672e-07, "loss": 0.0225, "step": 2749 }, { "epoch": 2.6078710289236606, "grad_norm": 0.33663608338691814, "learning_rate": 5.085876428163938e-07, "loss": 0.0294, "step": 2750 }, { "epoch": 2.608819345661451, "grad_norm": 0.433579962152764, "learning_rate": 5.061642412814405e-07, "loss": 0.0188, "step": 2751 }, { "epoch": 2.6097676623992414, "grad_norm": 0.31497843130176856, "learning_rate": 5.037463193906295e-07, "loss": 0.018, "step": 2752 }, { "epoch": 2.610715979137032, "grad_norm": 0.4392939445838396, "learning_rate": 5.013338800923001e-07, "loss": 0.0217, "step": 2753 }, { "epoch": 2.611664295874822, "grad_norm": 0.38382293150188085, "learning_rate": 4.989269263281044e-07, "loss": 0.0216, "step": 2754 }, { "epoch": 2.6126126126126126, "grad_norm": 0.4649233732841508, "learning_rate": 4.965254610330089e-07, "loss": 0.032, "step": 2755 }, { "epoch": 2.613560929350403, "grad_norm": 0.37494406277381903, "learning_rate": 4.941294871352859e-07, "loss": 0.0223, "step": 2756 }, { "epoch": 2.6145092460881934, "grad_norm": 0.42108122053404773, "learning_rate": 4.917390075565098e-07, "loss": 0.0231, "step": 2757 }, { "epoch": 2.6154575628259837, "grad_norm": 0.47265684925500273, "learning_rate": 4.893540252115597e-07, "loss": 0.0209, "step": 2758 }, { "epoch": 2.616405879563774, "grad_norm": 0.3344680664465188, "learning_rate": 4.869745430086076e-07, "loss": 0.0143, "step": 2759 }, { "epoch": 2.617354196301565, "grad_norm": 0.38459266477883736, "learning_rate": 4.846005638491213e-07, "loss": 0.0203, "step": 2760 }, { "epoch": 2.618302513039355, "grad_norm": 0.3412043515396964, "learning_rate": 4.822320906278577e-07, "loss": 0.0139, "step": 2761 }, { "epoch": 2.6192508297771457, "grad_norm": 0.5208903375959671, "learning_rate": 4.798691262328586e-07, "loss": 0.0243, "step": 2762 }, { "epoch": 2.620199146514936, "grad_norm": 0.39653893800416085, "learning_rate": 4.775116735454511e-07, "loss": 0.0253, "step": 2763 }, { "epoch": 2.6211474632527265, "grad_norm": 0.2892258222331325, "learning_rate": 4.751597354402382e-07, "loss": 0.0145, "step": 2764 }, { "epoch": 2.622095779990517, "grad_norm": 0.40845586230105796, "learning_rate": 4.728133147851011e-07, "loss": 0.0234, "step": 2765 }, { "epoch": 2.6230440967283073, "grad_norm": 0.30239125683950585, "learning_rate": 4.7047241444119376e-07, "loss": 0.0143, "step": 2766 }, { "epoch": 2.6239924134660977, "grad_norm": 0.4211431383939641, "learning_rate": 4.681370372629368e-07, "loss": 0.027, "step": 2767 }, { "epoch": 2.624940730203888, "grad_norm": 0.5032346373585926, "learning_rate": 4.6580718609801513e-07, "loss": 0.0185, "step": 2768 }, { "epoch": 2.6258890469416785, "grad_norm": 0.3655746633014726, "learning_rate": 4.634828637873795e-07, "loss": 0.0223, "step": 2769 }, { "epoch": 2.626837363679469, "grad_norm": 0.3074549708482703, "learning_rate": 4.6116407316523446e-07, "loss": 0.018, "step": 2770 }, { "epoch": 2.6277856804172592, "grad_norm": 0.46834675787128, "learning_rate": 4.5885081705904334e-07, "loss": 0.0255, "step": 2771 }, { "epoch": 2.6287339971550496, "grad_norm": 0.37778189298548, "learning_rate": 4.565430982895175e-07, "loss": 0.0229, "step": 2772 }, { "epoch": 2.6296823138928405, "grad_norm": 0.46883682144212735, "learning_rate": 4.5424091967061877e-07, "loss": 0.0157, "step": 2773 }, { "epoch": 2.6306306306306304, "grad_norm": 0.40055616739928074, "learning_rate": 4.5194428400955224e-07, "loss": 0.0208, "step": 2774 }, { "epoch": 2.6315789473684212, "grad_norm": 0.5249183820066241, "learning_rate": 4.496531941067639e-07, "loss": 0.0247, "step": 2775 }, { "epoch": 2.6325272641062116, "grad_norm": 0.34712677696632804, "learning_rate": 4.4736765275593897e-07, "loss": 0.0216, "step": 2776 }, { "epoch": 2.633475580844002, "grad_norm": 0.3929916373129789, "learning_rate": 4.4508766274399426e-07, "loss": 0.0142, "step": 2777 }, { "epoch": 2.6344238975817924, "grad_norm": 0.4849560252243947, "learning_rate": 4.428132268510793e-07, "loss": 0.0219, "step": 2778 }, { "epoch": 2.635372214319583, "grad_norm": 0.36600196074929786, "learning_rate": 4.4054434785057163e-07, "loss": 0.0164, "step": 2779 }, { "epoch": 2.636320531057373, "grad_norm": 0.5093185802679411, "learning_rate": 4.3828102850907104e-07, "loss": 0.0236, "step": 2780 }, { "epoch": 2.6372688477951636, "grad_norm": 0.3862188682640724, "learning_rate": 4.360232715863988e-07, "loss": 0.0264, "step": 2781 }, { "epoch": 2.638217164532954, "grad_norm": 0.3574239021740862, "learning_rate": 4.33771079835596e-07, "loss": 0.0162, "step": 2782 }, { "epoch": 2.6391654812707444, "grad_norm": 0.4706427605900885, "learning_rate": 4.3152445600291306e-07, "loss": 0.0289, "step": 2783 }, { "epoch": 2.6401137980085347, "grad_norm": 0.3667202133626864, "learning_rate": 4.2928340282781366e-07, "loss": 0.0204, "step": 2784 }, { "epoch": 2.641062114746325, "grad_norm": 0.413719793835401, "learning_rate": 4.270479230429708e-07, "loss": 0.0235, "step": 2785 }, { "epoch": 2.642010431484116, "grad_norm": 0.44950116717007177, "learning_rate": 4.2481801937425735e-07, "loss": 0.0223, "step": 2786 }, { "epoch": 2.642958748221906, "grad_norm": 0.35663470393335955, "learning_rate": 4.225936945407494e-07, "loss": 0.014, "step": 2787 }, { "epoch": 2.6439070649596967, "grad_norm": 0.39202001192001895, "learning_rate": 4.203749512547201e-07, "loss": 0.0196, "step": 2788 }, { "epoch": 2.6448553816974867, "grad_norm": 0.32486780874552906, "learning_rate": 4.1816179222163525e-07, "loss": 0.0215, "step": 2789 }, { "epoch": 2.6458036984352775, "grad_norm": 0.3723048747718762, "learning_rate": 4.1595422014015343e-07, "loss": 0.0224, "step": 2790 }, { "epoch": 2.646752015173068, "grad_norm": 0.357232800265432, "learning_rate": 4.137522377021186e-07, "loss": 0.0171, "step": 2791 }, { "epoch": 2.6477003319108583, "grad_norm": 0.4698743666062284, "learning_rate": 4.115558475925602e-07, "loss": 0.0254, "step": 2792 }, { "epoch": 2.6486486486486487, "grad_norm": 0.4382212268806192, "learning_rate": 4.093650524896897e-07, "loss": 0.0275, "step": 2793 }, { "epoch": 2.649596965386439, "grad_norm": 0.4043616799679715, "learning_rate": 4.0717985506489366e-07, "loss": 0.0235, "step": 2794 }, { "epoch": 2.6505452821242295, "grad_norm": 0.3392698836350075, "learning_rate": 4.050002579827345e-07, "loss": 0.0146, "step": 2795 }, { "epoch": 2.65149359886202, "grad_norm": 0.3722272428398198, "learning_rate": 4.028262639009445e-07, "loss": 0.0164, "step": 2796 }, { "epoch": 2.6524419155998102, "grad_norm": 0.47666670751210216, "learning_rate": 4.0065787547042543e-07, "loss": 0.0247, "step": 2797 }, { "epoch": 2.6533902323376006, "grad_norm": 0.35724907861541844, "learning_rate": 3.9849509533524343e-07, "loss": 0.0162, "step": 2798 }, { "epoch": 2.654338549075391, "grad_norm": 0.6380750662875044, "learning_rate": 3.9633792613262557e-07, "loss": 0.0328, "step": 2799 }, { "epoch": 2.6552868658131814, "grad_norm": 0.4876904102585979, "learning_rate": 3.941863704929555e-07, "loss": 0.0269, "step": 2800 }, { "epoch": 2.6562351825509722, "grad_norm": 0.3549869940847376, "learning_rate": 3.920404310397763e-07, "loss": 0.0202, "step": 2801 }, { "epoch": 2.657183499288762, "grad_norm": 0.5769104527453145, "learning_rate": 3.89900110389777e-07, "loss": 0.023, "step": 2802 }, { "epoch": 2.658131816026553, "grad_norm": 0.5050863547101014, "learning_rate": 3.877654111528012e-07, "loss": 0.027, "step": 2803 }, { "epoch": 2.6590801327643434, "grad_norm": 0.35737691730875915, "learning_rate": 3.8563633593183217e-07, "loss": 0.0162, "step": 2804 }, { "epoch": 2.660028449502134, "grad_norm": 0.37236396191380444, "learning_rate": 3.835128873230004e-07, "loss": 0.0209, "step": 2805 }, { "epoch": 2.660976766239924, "grad_norm": 0.32787817974246386, "learning_rate": 3.813950679155731e-07, "loss": 0.0174, "step": 2806 }, { "epoch": 2.6619250829777146, "grad_norm": 0.3052178572544167, "learning_rate": 3.7928288029195294e-07, "loss": 0.0141, "step": 2807 }, { "epoch": 2.662873399715505, "grad_norm": 0.5978262864524307, "learning_rate": 3.771763270276768e-07, "loss": 0.0313, "step": 2808 }, { "epoch": 2.6638217164532954, "grad_norm": 0.5112733834365627, "learning_rate": 3.750754106914101e-07, "loss": 0.0221, "step": 2809 }, { "epoch": 2.6647700331910857, "grad_norm": 0.39109717270185107, "learning_rate": 3.729801338449451e-07, "loss": 0.0211, "step": 2810 }, { "epoch": 2.665718349928876, "grad_norm": 0.34630995301406425, "learning_rate": 3.708904990431983e-07, "loss": 0.0245, "step": 2811 }, { "epoch": 2.6666666666666665, "grad_norm": 0.3894835061084856, "learning_rate": 3.688065088342041e-07, "loss": 0.0135, "step": 2812 }, { "epoch": 2.667614983404457, "grad_norm": 0.413966885464809, "learning_rate": 3.6672816575911597e-07, "loss": 0.0286, "step": 2813 }, { "epoch": 2.6685633001422477, "grad_norm": 0.3333453955002542, "learning_rate": 3.646554723522028e-07, "loss": 0.0156, "step": 2814 }, { "epoch": 2.6695116168800377, "grad_norm": 0.5143420713686682, "learning_rate": 3.6258843114084066e-07, "loss": 0.0197, "step": 2815 }, { "epoch": 2.6704599336178285, "grad_norm": 0.3937058493387821, "learning_rate": 3.6052704464551723e-07, "loss": 0.0269, "step": 2816 }, { "epoch": 2.671408250355619, "grad_norm": 0.34107596225123676, "learning_rate": 3.584713153798214e-07, "loss": 0.0163, "step": 2817 }, { "epoch": 2.6723565670934093, "grad_norm": 0.4433497311363595, "learning_rate": 3.5642124585044736e-07, "loss": 0.0264, "step": 2818 }, { "epoch": 2.6733048838311997, "grad_norm": 0.47719708455229654, "learning_rate": 3.5437683855718663e-07, "loss": 0.0249, "step": 2819 }, { "epoch": 2.67425320056899, "grad_norm": 0.37082517315713176, "learning_rate": 3.5233809599292565e-07, "loss": 0.0216, "step": 2820 }, { "epoch": 2.6752015173067805, "grad_norm": 0.4449107865815135, "learning_rate": 3.5030502064364314e-07, "loss": 0.0242, "step": 2821 }, { "epoch": 2.676149834044571, "grad_norm": 0.7583947129307697, "learning_rate": 3.482776149884093e-07, "loss": 0.0206, "step": 2822 }, { "epoch": 2.6770981507823612, "grad_norm": 0.47130851405721447, "learning_rate": 3.4625588149937883e-07, "loss": 0.0246, "step": 2823 }, { "epoch": 2.6780464675201516, "grad_norm": 0.40574178102286834, "learning_rate": 3.44239822641792e-07, "loss": 0.0211, "step": 2824 }, { "epoch": 2.678994784257942, "grad_norm": 0.39016069764421896, "learning_rate": 3.4222944087396737e-07, "loss": 0.02, "step": 2825 }, { "epoch": 2.6799431009957324, "grad_norm": 0.4193631351806872, "learning_rate": 3.402247386473029e-07, "loss": 0.0262, "step": 2826 }, { "epoch": 2.6808914177335232, "grad_norm": 0.46922685051431373, "learning_rate": 3.3822571840627095e-07, "loss": 0.0356, "step": 2827 }, { "epoch": 2.681839734471313, "grad_norm": 0.3653043259783921, "learning_rate": 3.362323825884134e-07, "loss": 0.0208, "step": 2828 }, { "epoch": 2.682788051209104, "grad_norm": 0.3426904459665249, "learning_rate": 3.342447336243437e-07, "loss": 0.0153, "step": 2829 }, { "epoch": 2.6837363679468944, "grad_norm": 0.36215857651916467, "learning_rate": 3.322627739377388e-07, "loss": 0.018, "step": 2830 }, { "epoch": 2.684684684684685, "grad_norm": 0.3802416370751486, "learning_rate": 3.3028650594533975e-07, "loss": 0.0268, "step": 2831 }, { "epoch": 2.685633001422475, "grad_norm": 0.36281885778655093, "learning_rate": 3.283159320569451e-07, "loss": 0.0195, "step": 2832 }, { "epoch": 2.6865813181602656, "grad_norm": 0.39875935883241936, "learning_rate": 3.263510546754117e-07, "loss": 0.0175, "step": 2833 }, { "epoch": 2.687529634898056, "grad_norm": 0.4168338240621692, "learning_rate": 3.243918761966508e-07, "loss": 0.0244, "step": 2834 }, { "epoch": 2.6884779516358464, "grad_norm": 0.47057878935135905, "learning_rate": 3.224383990096247e-07, "loss": 0.0229, "step": 2835 }, { "epoch": 2.6894262683736367, "grad_norm": 0.38441852189692455, "learning_rate": 3.2049062549634137e-07, "loss": 0.0249, "step": 2836 }, { "epoch": 2.690374585111427, "grad_norm": 0.5637198135695414, "learning_rate": 3.1854855803185645e-07, "loss": 0.0212, "step": 2837 }, { "epoch": 2.6913229018492175, "grad_norm": 0.37781951972907957, "learning_rate": 3.1661219898426743e-07, "loss": 0.0145, "step": 2838 }, { "epoch": 2.692271218587008, "grad_norm": 0.35734390206818134, "learning_rate": 3.1468155071470885e-07, "loss": 0.0192, "step": 2839 }, { "epoch": 2.6932195353247987, "grad_norm": 0.4077467536996209, "learning_rate": 3.127566155773554e-07, "loss": 0.0201, "step": 2840 }, { "epoch": 2.6941678520625887, "grad_norm": 0.5028083771887946, "learning_rate": 3.1083739591941167e-07, "loss": 0.0383, "step": 2841 }, { "epoch": 2.6951161688003795, "grad_norm": 0.501841795908295, "learning_rate": 3.0892389408111625e-07, "loss": 0.0235, "step": 2842 }, { "epoch": 2.69606448553817, "grad_norm": 0.38475074034060736, "learning_rate": 3.070161123957327e-07, "loss": 0.0221, "step": 2843 }, { "epoch": 2.6970128022759603, "grad_norm": 0.4020329048390891, "learning_rate": 3.0511405318955136e-07, "loss": 0.0259, "step": 2844 }, { "epoch": 2.6979611190137507, "grad_norm": 0.48369634537926337, "learning_rate": 3.0321771878188353e-07, "loss": 0.019, "step": 2845 }, { "epoch": 2.698909435751541, "grad_norm": 1.0740629753507862, "learning_rate": 3.0132711148506243e-07, "loss": 0.02, "step": 2846 }, { "epoch": 2.6998577524893315, "grad_norm": 0.3695371754345027, "learning_rate": 2.994422336044345e-07, "loss": 0.0185, "step": 2847 }, { "epoch": 2.700806069227122, "grad_norm": 0.5038388629520099, "learning_rate": 2.9756308743836284e-07, "loss": 0.0208, "step": 2848 }, { "epoch": 2.7017543859649122, "grad_norm": 0.7640080852891192, "learning_rate": 2.9568967527821846e-07, "loss": 0.0249, "step": 2849 }, { "epoch": 2.7027027027027026, "grad_norm": 0.46768948100137725, "learning_rate": 2.93821999408383e-07, "loss": 0.0204, "step": 2850 }, { "epoch": 2.703651019440493, "grad_norm": 0.4319992020883265, "learning_rate": 2.9196006210624296e-07, "loss": 0.0328, "step": 2851 }, { "epoch": 2.7045993361782834, "grad_norm": 0.41315615466424443, "learning_rate": 2.9010386564218616e-07, "loss": 0.0153, "step": 2852 }, { "epoch": 2.7055476529160742, "grad_norm": 0.3744469122321566, "learning_rate": 2.882534122796016e-07, "loss": 0.0209, "step": 2853 }, { "epoch": 2.706495969653864, "grad_norm": 0.42369368397512175, "learning_rate": 2.864087042748753e-07, "loss": 0.0218, "step": 2854 }, { "epoch": 2.707444286391655, "grad_norm": 0.5051296882817911, "learning_rate": 2.8456974387738535e-07, "loss": 0.0215, "step": 2855 }, { "epoch": 2.708392603129445, "grad_norm": 0.41590105364038504, "learning_rate": 2.827365333295051e-07, "loss": 0.0213, "step": 2856 }, { "epoch": 2.709340919867236, "grad_norm": 0.587547148927632, "learning_rate": 2.8090907486659324e-07, "loss": 0.0334, "step": 2857 }, { "epoch": 2.710289236605026, "grad_norm": 0.34887434178067056, "learning_rate": 2.790873707169961e-07, "loss": 0.017, "step": 2858 }, { "epoch": 2.7112375533428166, "grad_norm": 0.39187346235873055, "learning_rate": 2.7727142310204515e-07, "loss": 0.0203, "step": 2859 }, { "epoch": 2.712185870080607, "grad_norm": 0.41720690104657004, "learning_rate": 2.75461234236048e-07, "loss": 0.0167, "step": 2860 }, { "epoch": 2.7131341868183974, "grad_norm": 0.31518371371061915, "learning_rate": 2.7365680632629455e-07, "loss": 0.0144, "step": 2861 }, { "epoch": 2.7140825035561877, "grad_norm": 0.2855027969557929, "learning_rate": 2.718581415730481e-07, "loss": 0.0137, "step": 2862 }, { "epoch": 2.715030820293978, "grad_norm": 0.3870774282480667, "learning_rate": 2.7006524216954543e-07, "loss": 0.0253, "step": 2863 }, { "epoch": 2.7159791370317685, "grad_norm": 0.38164386495554165, "learning_rate": 2.682781103019905e-07, "loss": 0.0179, "step": 2864 }, { "epoch": 2.716927453769559, "grad_norm": 0.4248661479785991, "learning_rate": 2.664967481495584e-07, "loss": 0.0326, "step": 2865 }, { "epoch": 2.7178757705073493, "grad_norm": 0.5433988511920199, "learning_rate": 2.647211578843856e-07, "loss": 0.0247, "step": 2866 }, { "epoch": 2.7188240872451397, "grad_norm": 0.39724442593767356, "learning_rate": 2.629513416715734e-07, "loss": 0.0188, "step": 2867 }, { "epoch": 2.7197724039829305, "grad_norm": 0.33528171893218883, "learning_rate": 2.611873016691796e-07, "loss": 0.0202, "step": 2868 }, { "epoch": 2.7207207207207205, "grad_norm": 0.3965938814255718, "learning_rate": 2.594290400282201e-07, "loss": 0.0295, "step": 2869 }, { "epoch": 2.7216690374585113, "grad_norm": 0.30564880056172694, "learning_rate": 2.576765588926661e-07, "loss": 0.0153, "step": 2870 }, { "epoch": 2.7226173541963017, "grad_norm": 0.3137314915113477, "learning_rate": 2.5592986039943703e-07, "loss": 0.0141, "step": 2871 }, { "epoch": 2.723565670934092, "grad_norm": 0.49092832578220685, "learning_rate": 2.5418894667840363e-07, "loss": 0.0277, "step": 2872 }, { "epoch": 2.7245139876718825, "grad_norm": 0.4015470991385358, "learning_rate": 2.524538198523824e-07, "loss": 0.0188, "step": 2873 }, { "epoch": 2.725462304409673, "grad_norm": 0.41058168996354416, "learning_rate": 2.5072448203713294e-07, "loss": 0.0191, "step": 2874 }, { "epoch": 2.7264106211474632, "grad_norm": 0.5285881611083112, "learning_rate": 2.4900093534135695e-07, "loss": 0.0299, "step": 2875 }, { "epoch": 2.7273589378852536, "grad_norm": 0.40022195777865227, "learning_rate": 2.472831818666921e-07, "loss": 0.0209, "step": 2876 }, { "epoch": 2.728307254623044, "grad_norm": 0.46750753879370927, "learning_rate": 2.455712237077157e-07, "loss": 0.0266, "step": 2877 }, { "epoch": 2.7292555713608344, "grad_norm": 0.3959499198022272, "learning_rate": 2.438650629519351e-07, "loss": 0.0284, "step": 2878 }, { "epoch": 2.730203888098625, "grad_norm": 0.43008633569157745, "learning_rate": 2.421647016797907e-07, "loss": 0.0176, "step": 2879 }, { "epoch": 2.731152204836415, "grad_norm": 0.37556553750404836, "learning_rate": 2.404701419646505e-07, "loss": 0.019, "step": 2880 }, { "epoch": 2.732100521574206, "grad_norm": 0.5032204383672553, "learning_rate": 2.3878138587280717e-07, "loss": 0.0323, "step": 2881 }, { "epoch": 2.733048838311996, "grad_norm": 0.3505093748403086, "learning_rate": 2.3709843546347865e-07, "loss": 0.0236, "step": 2882 }, { "epoch": 2.733997155049787, "grad_norm": 0.2970624200348729, "learning_rate": 2.354212927888022e-07, "loss": 0.0135, "step": 2883 }, { "epoch": 2.734945471787577, "grad_norm": 0.465166372609731, "learning_rate": 2.3374995989383366e-07, "loss": 0.0181, "step": 2884 }, { "epoch": 2.7358937885253676, "grad_norm": 0.44064948151774963, "learning_rate": 2.3208443881654442e-07, "loss": 0.0105, "step": 2885 }, { "epoch": 2.736842105263158, "grad_norm": 0.41355456746628383, "learning_rate": 2.3042473158781998e-07, "loss": 0.0198, "step": 2886 }, { "epoch": 2.7377904220009484, "grad_norm": 0.35065718913864835, "learning_rate": 2.2877084023145425e-07, "loss": 0.0167, "step": 2887 }, { "epoch": 2.7387387387387387, "grad_norm": 0.39535153479567114, "learning_rate": 2.2712276676415346e-07, "loss": 0.0266, "step": 2888 }, { "epoch": 2.739687055476529, "grad_norm": 0.4217230635090741, "learning_rate": 2.254805131955251e-07, "loss": 0.0202, "step": 2889 }, { "epoch": 2.7406353722143195, "grad_norm": 0.384980173712967, "learning_rate": 2.238440815280829e-07, "loss": 0.0196, "step": 2890 }, { "epoch": 2.74158368895211, "grad_norm": 0.4678666261730461, "learning_rate": 2.222134737572429e-07, "loss": 0.041, "step": 2891 }, { "epoch": 2.7425320056899003, "grad_norm": 0.3519750639388043, "learning_rate": 2.2058869187131514e-07, "loss": 0.0211, "step": 2892 }, { "epoch": 2.7434803224276907, "grad_norm": 0.3512574692443541, "learning_rate": 2.1896973785150987e-07, "loss": 0.0153, "step": 2893 }, { "epoch": 2.7444286391654815, "grad_norm": 0.6136655914796003, "learning_rate": 2.173566136719285e-07, "loss": 0.0256, "step": 2894 }, { "epoch": 2.7453769559032715, "grad_norm": 0.40007142163498555, "learning_rate": 2.1574932129956483e-07, "loss": 0.0149, "step": 2895 }, { "epoch": 2.7463252726410623, "grad_norm": 0.42316670484695895, "learning_rate": 2.1414786269430276e-07, "loss": 0.0254, "step": 2896 }, { "epoch": 2.7472735893788527, "grad_norm": 0.42725270284330796, "learning_rate": 2.1255223980891027e-07, "loss": 0.023, "step": 2897 }, { "epoch": 2.748221906116643, "grad_norm": 0.515462917264128, "learning_rate": 2.1096245458903985e-07, "loss": 0.0315, "step": 2898 }, { "epoch": 2.7491702228544335, "grad_norm": 0.3452667601112083, "learning_rate": 2.0937850897322754e-07, "loss": 0.0181, "step": 2899 }, { "epoch": 2.750118539592224, "grad_norm": 0.44912762956346985, "learning_rate": 2.0780040489288666e-07, "loss": 0.0233, "step": 2900 }, { "epoch": 2.7510668563300142, "grad_norm": 0.3191155443482015, "learning_rate": 2.0622814427230908e-07, "loss": 0.0128, "step": 2901 }, { "epoch": 2.7520151730678046, "grad_norm": 0.37094981215906553, "learning_rate": 2.046617290286601e-07, "loss": 0.0167, "step": 2902 }, { "epoch": 2.752963489805595, "grad_norm": 0.4763003959675575, "learning_rate": 2.0310116107197852e-07, "loss": 0.0158, "step": 2903 }, { "epoch": 2.7539118065433854, "grad_norm": 0.4329212628954346, "learning_rate": 2.0154644230517272e-07, "loss": 0.0325, "step": 2904 }, { "epoch": 2.754860123281176, "grad_norm": 0.4514024321208407, "learning_rate": 1.999975746240179e-07, "loss": 0.0256, "step": 2905 }, { "epoch": 2.755808440018966, "grad_norm": 0.376259827428853, "learning_rate": 1.9845455991715613e-07, "loss": 0.0263, "step": 2906 }, { "epoch": 2.756756756756757, "grad_norm": 0.33740568444255076, "learning_rate": 1.969174000660906e-07, "loss": 0.0176, "step": 2907 }, { "epoch": 2.757705073494547, "grad_norm": 0.41875612207034385, "learning_rate": 1.9538609694518763e-07, "loss": 0.0218, "step": 2908 }, { "epoch": 2.758653390232338, "grad_norm": 0.41857690137005205, "learning_rate": 1.9386065242167074e-07, "loss": 0.0245, "step": 2909 }, { "epoch": 2.759601706970128, "grad_norm": 0.4052596751702686, "learning_rate": 1.9234106835561873e-07, "loss": 0.0196, "step": 2910 }, { "epoch": 2.7605500237079186, "grad_norm": 0.5686641821576979, "learning_rate": 1.9082734659996548e-07, "loss": 0.0182, "step": 2911 }, { "epoch": 2.761498340445709, "grad_norm": 0.3255702279516522, "learning_rate": 1.893194890004979e-07, "loss": 0.0143, "step": 2912 }, { "epoch": 2.7624466571834994, "grad_norm": 0.36251640680639674, "learning_rate": 1.878174973958491e-07, "loss": 0.0289, "step": 2913 }, { "epoch": 2.7633949739212897, "grad_norm": 0.4490481069822958, "learning_rate": 1.863213736175018e-07, "loss": 0.0226, "step": 2914 }, { "epoch": 2.76434329065908, "grad_norm": 0.4277937689582375, "learning_rate": 1.8483111948978394e-07, "loss": 0.0167, "step": 2915 }, { "epoch": 2.7652916073968705, "grad_norm": 0.42002641898344123, "learning_rate": 1.833467368298636e-07, "loss": 0.0243, "step": 2916 }, { "epoch": 2.766239924134661, "grad_norm": 0.4499658388831021, "learning_rate": 1.8186822744775234e-07, "loss": 0.0268, "step": 2917 }, { "epoch": 2.7671882408724513, "grad_norm": 0.4274669486577644, "learning_rate": 1.8039559314629808e-07, "loss": 0.0163, "step": 2918 }, { "epoch": 2.7681365576102417, "grad_norm": 0.4400359676580664, "learning_rate": 1.789288357211849e-07, "loss": 0.0277, "step": 2919 }, { "epoch": 2.769084874348032, "grad_norm": 0.3837018238158169, "learning_rate": 1.7746795696093276e-07, "loss": 0.019, "step": 2920 }, { "epoch": 2.7700331910858225, "grad_norm": 0.350819880840381, "learning_rate": 1.760129586468906e-07, "loss": 0.0222, "step": 2921 }, { "epoch": 2.7709815078236133, "grad_norm": 0.4005678605328111, "learning_rate": 1.745638425532392e-07, "loss": 0.0151, "step": 2922 }, { "epoch": 2.7719298245614032, "grad_norm": 0.3792448928276872, "learning_rate": 1.7312061044698569e-07, "loss": 0.0287, "step": 2923 }, { "epoch": 2.772878141299194, "grad_norm": 0.6223382836502508, "learning_rate": 1.7168326408796233e-07, "loss": 0.0196, "step": 2924 }, { "epoch": 2.7738264580369845, "grad_norm": 0.5267243071815664, "learning_rate": 1.7025180522882546e-07, "loss": 0.0189, "step": 2925 }, { "epoch": 2.774774774774775, "grad_norm": 0.3763913747813696, "learning_rate": 1.688262356150505e-07, "loss": 0.0198, "step": 2926 }, { "epoch": 2.7757230915125652, "grad_norm": 0.6370991606268415, "learning_rate": 1.6740655698493313e-07, "loss": 0.0309, "step": 2927 }, { "epoch": 2.7766714082503556, "grad_norm": 0.42771643414825106, "learning_rate": 1.659927710695869e-07, "loss": 0.0336, "step": 2928 }, { "epoch": 2.777619724988146, "grad_norm": 0.4352455322886255, "learning_rate": 1.6458487959293722e-07, "loss": 0.0232, "step": 2929 }, { "epoch": 2.7785680417259364, "grad_norm": 0.46779414418617427, "learning_rate": 1.6318288427172424e-07, "loss": 0.0183, "step": 2930 }, { "epoch": 2.779516358463727, "grad_norm": 0.37840061172320577, "learning_rate": 1.61786786815496e-07, "loss": 0.0208, "step": 2931 }, { "epoch": 2.780464675201517, "grad_norm": 0.3181428862321126, "learning_rate": 1.6039658892661182e-07, "loss": 0.0177, "step": 2932 }, { "epoch": 2.7814129919393076, "grad_norm": 0.4433106803305223, "learning_rate": 1.590122923002363e-07, "loss": 0.0223, "step": 2933 }, { "epoch": 2.782361308677098, "grad_norm": 0.2888850895455348, "learning_rate": 1.576338986243364e-07, "loss": 0.0136, "step": 2934 }, { "epoch": 2.783309625414889, "grad_norm": 0.4252472222588191, "learning_rate": 1.5626140957968383e-07, "loss": 0.0266, "step": 2935 }, { "epoch": 2.7842579421526787, "grad_norm": 0.39718864763140277, "learning_rate": 1.5489482683984925e-07, "loss": 0.0191, "step": 2936 }, { "epoch": 2.7852062588904696, "grad_norm": 0.4490343074595942, "learning_rate": 1.5353415207120082e-07, "loss": 0.0263, "step": 2937 }, { "epoch": 2.78615457562826, "grad_norm": 0.5490222154334174, "learning_rate": 1.5217938693290359e-07, "loss": 0.0316, "step": 2938 }, { "epoch": 2.7871028923660504, "grad_norm": 0.4469751880236888, "learning_rate": 1.5083053307691608e-07, "loss": 0.0284, "step": 2939 }, { "epoch": 2.7880512091038407, "grad_norm": 0.4212606273209142, "learning_rate": 1.494875921479888e-07, "loss": 0.0225, "step": 2940 }, { "epoch": 2.788999525841631, "grad_norm": 0.572352413209743, "learning_rate": 1.4815056578366294e-07, "loss": 0.029, "step": 2941 }, { "epoch": 2.7899478425794215, "grad_norm": 0.5479657138204983, "learning_rate": 1.4681945561426548e-07, "loss": 0.0275, "step": 2942 }, { "epoch": 2.790896159317212, "grad_norm": 0.3807981714509639, "learning_rate": 1.4549426326291193e-07, "loss": 0.0204, "step": 2943 }, { "epoch": 2.7918444760550023, "grad_norm": 0.4491797751181635, "learning_rate": 1.4417499034550143e-07, "loss": 0.0251, "step": 2944 }, { "epoch": 2.7927927927927927, "grad_norm": 0.3987956661004981, "learning_rate": 1.4286163847071377e-07, "loss": 0.0274, "step": 2945 }, { "epoch": 2.793741109530583, "grad_norm": 0.47220214159729007, "learning_rate": 1.4155420924000963e-07, "loss": 0.0328, "step": 2946 }, { "epoch": 2.7946894262683735, "grad_norm": 0.6378932543205542, "learning_rate": 1.402527042476276e-07, "loss": 0.0231, "step": 2947 }, { "epoch": 2.7956377430061643, "grad_norm": 0.3691335230452688, "learning_rate": 1.3895712508058269e-07, "loss": 0.0217, "step": 2948 }, { "epoch": 2.7965860597439542, "grad_norm": 0.4517905770163704, "learning_rate": 1.3766747331866447e-07, "loss": 0.0281, "step": 2949 }, { "epoch": 2.797534376481745, "grad_norm": 0.3558619798708121, "learning_rate": 1.3638375053443343e-07, "loss": 0.0172, "step": 2950 }, { "epoch": 2.7984826932195355, "grad_norm": 0.3057791004881369, "learning_rate": 1.3510595829322237e-07, "loss": 0.0132, "step": 2951 }, { "epoch": 2.799431009957326, "grad_norm": 0.32216168713007337, "learning_rate": 1.3383409815313108e-07, "loss": 0.0181, "step": 2952 }, { "epoch": 2.8003793266951162, "grad_norm": 0.4740758300640553, "learning_rate": 1.3256817166502567e-07, "loss": 0.0311, "step": 2953 }, { "epoch": 2.8013276434329066, "grad_norm": 0.3702293823546692, "learning_rate": 1.3130818037253967e-07, "loss": 0.0239, "step": 2954 }, { "epoch": 2.802275960170697, "grad_norm": 0.6478107471426148, "learning_rate": 1.3005412581206521e-07, "loss": 0.0306, "step": 2955 }, { "epoch": 2.8032242769084874, "grad_norm": 0.3593783860531484, "learning_rate": 1.2880600951275857e-07, "loss": 0.0214, "step": 2956 }, { "epoch": 2.804172593646278, "grad_norm": 0.38273343270548155, "learning_rate": 1.2756383299653452e-07, "loss": 0.0231, "step": 2957 }, { "epoch": 2.805120910384068, "grad_norm": 0.35039846503549316, "learning_rate": 1.263275977780637e-07, "loss": 0.0245, "step": 2958 }, { "epoch": 2.8060692271218586, "grad_norm": 0.4375014938283871, "learning_rate": 1.250973053647736e-07, "loss": 0.0227, "step": 2959 }, { "epoch": 2.807017543859649, "grad_norm": 0.42091207467959957, "learning_rate": 1.2387295725684534e-07, "loss": 0.0237, "step": 2960 }, { "epoch": 2.80796586059744, "grad_norm": 0.46361943079914314, "learning_rate": 1.2265455494721024e-07, "loss": 0.0269, "step": 2961 }, { "epoch": 2.8089141773352297, "grad_norm": 0.4287848018482948, "learning_rate": 1.2144209992155043e-07, "loss": 0.0171, "step": 2962 }, { "epoch": 2.8098624940730206, "grad_norm": 0.4400952401156122, "learning_rate": 1.2023559365829606e-07, "loss": 0.0215, "step": 2963 }, { "epoch": 2.810810810810811, "grad_norm": 0.3500703922509506, "learning_rate": 1.1903503762862311e-07, "loss": 0.0172, "step": 2964 }, { "epoch": 2.8117591275486014, "grad_norm": 0.31141988161587025, "learning_rate": 1.1784043329645389e-07, "loss": 0.0111, "step": 2965 }, { "epoch": 2.8127074442863917, "grad_norm": 0.3598348161337118, "learning_rate": 1.1665178211844985e-07, "loss": 0.0186, "step": 2966 }, { "epoch": 2.813655761024182, "grad_norm": 0.4603355314327043, "learning_rate": 1.1546908554401659e-07, "loss": 0.032, "step": 2967 }, { "epoch": 2.8146040777619725, "grad_norm": 0.3641648570177723, "learning_rate": 1.1429234501529773e-07, "loss": 0.0182, "step": 2968 }, { "epoch": 2.815552394499763, "grad_norm": 0.5217885429345293, "learning_rate": 1.1312156196717383e-07, "loss": 0.0264, "step": 2969 }, { "epoch": 2.8165007112375533, "grad_norm": 0.48416094699379036, "learning_rate": 1.1195673782726235e-07, "loss": 0.0213, "step": 2970 }, { "epoch": 2.8174490279753437, "grad_norm": 0.45587516725174587, "learning_rate": 1.1079787401591213e-07, "loss": 0.0255, "step": 2971 }, { "epoch": 2.818397344713134, "grad_norm": 0.3105673525304809, "learning_rate": 1.0964497194620727e-07, "loss": 0.0199, "step": 2972 }, { "epoch": 2.8193456614509245, "grad_norm": 0.33311835087454994, "learning_rate": 1.08498033023961e-07, "loss": 0.0177, "step": 2973 }, { "epoch": 2.8202939781887153, "grad_norm": 0.36202184705860024, "learning_rate": 1.0735705864771351e-07, "loss": 0.0233, "step": 2974 }, { "epoch": 2.8212422949265052, "grad_norm": 0.3721122938385976, "learning_rate": 1.0622205020873467e-07, "loss": 0.0192, "step": 2975 }, { "epoch": 2.822190611664296, "grad_norm": 0.41898867839074483, "learning_rate": 1.0509300909101904e-07, "loss": 0.0327, "step": 2976 }, { "epoch": 2.823138928402086, "grad_norm": 0.3496114095941287, "learning_rate": 1.0396993667128318e-07, "loss": 0.0197, "step": 2977 }, { "epoch": 2.824087245139877, "grad_norm": 0.3870480111236857, "learning_rate": 1.028528343189683e-07, "loss": 0.0191, "step": 2978 }, { "epoch": 2.8250355618776672, "grad_norm": 0.44439329160244306, "learning_rate": 1.0174170339623313e-07, "loss": 0.0209, "step": 2979 }, { "epoch": 2.8259838786154576, "grad_norm": 0.2985339684820892, "learning_rate": 1.0063654525795663e-07, "loss": 0.016, "step": 2980 }, { "epoch": 2.826932195353248, "grad_norm": 0.5609480662641474, "learning_rate": 9.953736125173474e-08, "loss": 0.0216, "step": 2981 }, { "epoch": 2.8278805120910384, "grad_norm": 0.3391707432744039, "learning_rate": 9.84441527178781e-08, "loss": 0.0175, "step": 2982 }, { "epoch": 2.828828828828829, "grad_norm": 0.34240861985112864, "learning_rate": 9.735692098941207e-08, "loss": 0.0159, "step": 2983 }, { "epoch": 2.829777145566619, "grad_norm": 0.7564810284696015, "learning_rate": 9.627566739207229e-08, "loss": 0.0238, "step": 2984 }, { "epoch": 2.8307254623044096, "grad_norm": 0.35861289080670644, "learning_rate": 9.52003932443063e-08, "loss": 0.0184, "step": 2985 }, { "epoch": 2.8316737790422, "grad_norm": 0.3168867052467738, "learning_rate": 9.413109985727032e-08, "loss": 0.0147, "step": 2986 }, { "epoch": 2.8326220957799904, "grad_norm": 0.30328581611262106, "learning_rate": 9.306778853482745e-08, "loss": 0.0135, "step": 2987 }, { "epoch": 2.8335704125177807, "grad_norm": 0.39852807663419376, "learning_rate": 9.201046057354613e-08, "loss": 0.02, "step": 2988 }, { "epoch": 2.8345187292555716, "grad_norm": 0.42062156640088805, "learning_rate": 9.095911726270057e-08, "loss": 0.0214, "step": 2989 }, { "epoch": 2.8354670459933615, "grad_norm": 0.40669783911572394, "learning_rate": 8.991375988426421e-08, "loss": 0.0191, "step": 2990 }, { "epoch": 2.8364153627311524, "grad_norm": 0.38716601394773986, "learning_rate": 8.887438971291574e-08, "loss": 0.0217, "step": 2991 }, { "epoch": 2.8373636794689427, "grad_norm": 0.47316775873816014, "learning_rate": 8.784100801602913e-08, "loss": 0.0269, "step": 2992 }, { "epoch": 2.838311996206733, "grad_norm": 0.3893105038874972, "learning_rate": 8.681361605367921e-08, "loss": 0.0246, "step": 2993 }, { "epoch": 2.8392603129445235, "grad_norm": 0.44212071533465297, "learning_rate": 8.579221507863555e-08, "loss": 0.0162, "step": 2994 }, { "epoch": 2.840208629682314, "grad_norm": 0.295444897835934, "learning_rate": 8.477680633636298e-08, "loss": 0.0141, "step": 2995 }, { "epoch": 2.8411569464201043, "grad_norm": 0.5950424381452033, "learning_rate": 8.376739106501886e-08, "loss": 0.0347, "step": 2996 }, { "epoch": 2.8421052631578947, "grad_norm": 0.4003901243922404, "learning_rate": 8.276397049545359e-08, "loss": 0.0226, "step": 2997 }, { "epoch": 2.843053579895685, "grad_norm": 0.3217709626825981, "learning_rate": 8.176654585120625e-08, "loss": 0.0146, "step": 2998 }, { "epoch": 2.8440018966334755, "grad_norm": 0.32999877056945826, "learning_rate": 8.077511834850727e-08, "loss": 0.0191, "step": 2999 }, { "epoch": 2.844950213371266, "grad_norm": 0.41667581680322996, "learning_rate": 7.978968919627073e-08, "loss": 0.0292, "step": 3000 }, { "epoch": 2.8458985301090562, "grad_norm": 0.4081492608814963, "learning_rate": 7.881025959609933e-08, "loss": 0.0203, "step": 3001 }, { "epoch": 2.846846846846847, "grad_norm": 0.391926194225634, "learning_rate": 7.783683074227943e-08, "loss": 0.0176, "step": 3002 }, { "epoch": 2.847795163584637, "grad_norm": 0.3969469040185239, "learning_rate": 7.686940382177933e-08, "loss": 0.0224, "step": 3003 }, { "epoch": 2.848743480322428, "grad_norm": 0.458127293518202, "learning_rate": 7.590798001425038e-08, "loss": 0.0266, "step": 3004 }, { "epoch": 2.8496917970602182, "grad_norm": 0.33047498969579775, "learning_rate": 7.495256049202148e-08, "loss": 0.014, "step": 3005 }, { "epoch": 2.8506401137980086, "grad_norm": 0.3768004564754747, "learning_rate": 7.400314642010353e-08, "loss": 0.0188, "step": 3006 }, { "epoch": 2.851588430535799, "grad_norm": 0.4032452837194265, "learning_rate": 7.305973895618157e-08, "loss": 0.0194, "step": 3007 }, { "epoch": 2.8525367472735894, "grad_norm": 0.2862916882723256, "learning_rate": 7.212233925061707e-08, "loss": 0.0146, "step": 3008 }, { "epoch": 2.85348506401138, "grad_norm": 0.45519088532977087, "learning_rate": 7.119094844644681e-08, "loss": 0.028, "step": 3009 }, { "epoch": 2.85443338074917, "grad_norm": 0.5934720177918184, "learning_rate": 7.026556767938009e-08, "loss": 0.0313, "step": 3010 }, { "epoch": 2.8553816974869606, "grad_norm": 0.32292834230201356, "learning_rate": 6.934619807779652e-08, "loss": 0.0176, "step": 3011 }, { "epoch": 2.856330014224751, "grad_norm": 0.357283774394047, "learning_rate": 6.843284076274769e-08, "loss": 0.0254, "step": 3012 }, { "epoch": 2.8572783309625414, "grad_norm": 0.3461413190649742, "learning_rate": 6.752549684795385e-08, "loss": 0.0158, "step": 3013 }, { "epoch": 2.8582266477003317, "grad_norm": 0.33139564022160245, "learning_rate": 6.662416743980105e-08, "loss": 0.0245, "step": 3014 }, { "epoch": 2.8591749644381226, "grad_norm": 0.5110634604202572, "learning_rate": 6.572885363734294e-08, "loss": 0.0222, "step": 3015 }, { "epoch": 2.8601232811759125, "grad_norm": 0.4720638779321911, "learning_rate": 6.483955653229735e-08, "loss": 0.0205, "step": 3016 }, { "epoch": 2.8610715979137034, "grad_norm": 0.3523152512602496, "learning_rate": 6.395627720904518e-08, "loss": 0.0172, "step": 3017 }, { "epoch": 2.8620199146514937, "grad_norm": 0.4762763264950715, "learning_rate": 6.307901674463046e-08, "loss": 0.0157, "step": 3018 }, { "epoch": 2.862968231389284, "grad_norm": 0.38026270453910344, "learning_rate": 6.220777620875695e-08, "loss": 0.0186, "step": 3019 }, { "epoch": 2.8639165481270745, "grad_norm": 0.42457200354403224, "learning_rate": 6.134255666378874e-08, "loss": 0.0197, "step": 3020 }, { "epoch": 2.864864864864865, "grad_norm": 0.39614457102266576, "learning_rate": 6.0483359164748e-08, "loss": 0.0257, "step": 3021 }, { "epoch": 2.8658131816026553, "grad_norm": 0.3617668641258794, "learning_rate": 5.963018475931282e-08, "loss": 0.0228, "step": 3022 }, { "epoch": 2.8667614983404457, "grad_norm": 0.43808543708423153, "learning_rate": 5.8783034487818194e-08, "loss": 0.0245, "step": 3023 }, { "epoch": 2.867709815078236, "grad_norm": 0.43175991173965117, "learning_rate": 5.794190938325284e-08, "loss": 0.0264, "step": 3024 }, { "epoch": 2.8686581318160265, "grad_norm": 0.4848482461129806, "learning_rate": 5.710681047125799e-08, "loss": 0.0225, "step": 3025 }, { "epoch": 2.869606448553817, "grad_norm": 0.41841626879899474, "learning_rate": 5.627773877012854e-08, "loss": 0.0227, "step": 3026 }, { "epoch": 2.8705547652916072, "grad_norm": 0.37759148273500365, "learning_rate": 5.54546952908086e-08, "loss": 0.0169, "step": 3027 }, { "epoch": 2.871503082029398, "grad_norm": 0.37983258288815297, "learning_rate": 5.4637681036890935e-08, "loss": 0.0243, "step": 3028 }, { "epoch": 2.872451398767188, "grad_norm": 0.5681184040431467, "learning_rate": 5.382669700461862e-08, "loss": 0.0202, "step": 3029 }, { "epoch": 2.873399715504979, "grad_norm": 0.5891881317634622, "learning_rate": 5.302174418287953e-08, "loss": 0.0195, "step": 3030 }, { "epoch": 2.8743480322427692, "grad_norm": 0.4508271499743202, "learning_rate": 5.222282355320851e-08, "loss": 0.0164, "step": 3031 }, { "epoch": 2.8752963489805596, "grad_norm": 0.4435781831406531, "learning_rate": 5.142993608978519e-08, "loss": 0.0316, "step": 3032 }, { "epoch": 2.87624466571835, "grad_norm": 0.34441181405039545, "learning_rate": 5.064308275943064e-08, "loss": 0.0176, "step": 3033 }, { "epoch": 2.8771929824561404, "grad_norm": 0.3795476447951702, "learning_rate": 4.9862264521611245e-08, "loss": 0.0265, "step": 3034 }, { "epoch": 2.878141299193931, "grad_norm": 0.3827141067101136, "learning_rate": 4.9087482328430967e-08, "loss": 0.0202, "step": 3035 }, { "epoch": 2.879089615931721, "grad_norm": 0.43056600166627135, "learning_rate": 4.831873712463631e-08, "loss": 0.023, "step": 3036 }, { "epoch": 2.8800379326695116, "grad_norm": 0.3400748622374083, "learning_rate": 4.755602984761132e-08, "loss": 0.0203, "step": 3037 }, { "epoch": 2.880986249407302, "grad_norm": 0.3358342329948678, "learning_rate": 4.679936142737651e-08, "loss": 0.0182, "step": 3038 }, { "epoch": 2.8819345661450924, "grad_norm": 0.3518260649894719, "learning_rate": 4.6048732786591057e-08, "loss": 0.0216, "step": 3039 }, { "epoch": 2.8828828828828827, "grad_norm": 0.373396098412495, "learning_rate": 4.530414484054779e-08, "loss": 0.0181, "step": 3040 }, { "epoch": 2.8838311996206736, "grad_norm": 0.45648381228955215, "learning_rate": 4.456559849717435e-08, "loss": 0.0248, "step": 3041 }, { "epoch": 2.8847795163584635, "grad_norm": 0.5723077765956877, "learning_rate": 4.383309465703145e-08, "loss": 0.0292, "step": 3042 }, { "epoch": 2.8857278330962544, "grad_norm": 0.3588197970323949, "learning_rate": 4.310663421331074e-08, "loss": 0.0153, "step": 3043 }, { "epoch": 2.8866761498340443, "grad_norm": 0.3503812732911283, "learning_rate": 4.23862180518364e-08, "loss": 0.02, "step": 3044 }, { "epoch": 2.887624466571835, "grad_norm": 0.3506207303029663, "learning_rate": 4.167184705106131e-08, "loss": 0.0164, "step": 3045 }, { "epoch": 2.8885727833096255, "grad_norm": 0.42034903702104165, "learning_rate": 4.0963522082067555e-08, "loss": 0.0237, "step": 3046 }, { "epoch": 2.889521100047416, "grad_norm": 0.4494851053535652, "learning_rate": 4.026124400856479e-08, "loss": 0.035, "step": 3047 }, { "epoch": 2.8904694167852063, "grad_norm": 0.3197898192752823, "learning_rate": 3.956501368688859e-08, "loss": 0.0158, "step": 3048 }, { "epoch": 2.8914177335229967, "grad_norm": 0.4563666271619231, "learning_rate": 3.8874831966000946e-08, "loss": 0.0208, "step": 3049 }, { "epoch": 2.892366050260787, "grad_norm": 0.40272960757734055, "learning_rate": 3.819069968748812e-08, "loss": 0.0195, "step": 3050 }, { "epoch": 2.8933143669985775, "grad_norm": 0.46058356280405144, "learning_rate": 3.751261768555947e-08, "loss": 0.0191, "step": 3051 }, { "epoch": 2.894262683736368, "grad_norm": 0.42553257610569817, "learning_rate": 3.684058678704805e-08, "loss": 0.0209, "step": 3052 }, { "epoch": 2.8952110004741582, "grad_norm": 0.33487787758500354, "learning_rate": 3.6174607811406134e-08, "loss": 0.0163, "step": 3053 }, { "epoch": 2.8961593172119486, "grad_norm": 0.3625318971935175, "learning_rate": 3.551468157070914e-08, "loss": 0.0173, "step": 3054 }, { "epoch": 2.897107633949739, "grad_norm": 0.5370407138568671, "learning_rate": 3.4860808869650595e-08, "loss": 0.0313, "step": 3055 }, { "epoch": 2.89805595068753, "grad_norm": 0.44545436437243463, "learning_rate": 3.421299050554161e-08, "loss": 0.0292, "step": 3056 }, { "epoch": 2.89900426742532, "grad_norm": 0.34087790333913004, "learning_rate": 3.357122726831252e-08, "loss": 0.0157, "step": 3057 }, { "epoch": 2.8999525841631106, "grad_norm": 0.3832172647070527, "learning_rate": 3.293551994051014e-08, "loss": 0.0226, "step": 3058 }, { "epoch": 2.900900900900901, "grad_norm": 0.5081414251987051, "learning_rate": 3.230586929729496e-08, "loss": 0.0324, "step": 3059 }, { "epoch": 2.9018492176386914, "grad_norm": 0.4636198191740265, "learning_rate": 3.1682276106444474e-08, "loss": 0.0189, "step": 3060 }, { "epoch": 2.902797534376482, "grad_norm": 0.37250398138400564, "learning_rate": 3.1064741128348786e-08, "loss": 0.0159, "step": 3061 }, { "epoch": 2.903745851114272, "grad_norm": 0.38602339870373964, "learning_rate": 3.0453265116009986e-08, "loss": 0.0133, "step": 3062 }, { "epoch": 2.9046941678520626, "grad_norm": 0.46169156998385363, "learning_rate": 2.984784881504443e-08, "loss": 0.0181, "step": 3063 }, { "epoch": 2.905642484589853, "grad_norm": 0.4028285252992143, "learning_rate": 2.9248492963677156e-08, "loss": 0.0244, "step": 3064 }, { "epoch": 2.9065908013276434, "grad_norm": 0.3732446124156518, "learning_rate": 2.8655198292744104e-08, "loss": 0.02, "step": 3065 }, { "epoch": 2.9075391180654337, "grad_norm": 0.7546073925694232, "learning_rate": 2.8067965525690467e-08, "loss": 0.0329, "step": 3066 }, { "epoch": 2.908487434803224, "grad_norm": 0.3667730204837699, "learning_rate": 2.748679537857013e-08, "loss": 0.0158, "step": 3067 }, { "epoch": 2.9094357515410145, "grad_norm": 0.3803667665108587, "learning_rate": 2.6911688560043447e-08, "loss": 0.0225, "step": 3068 }, { "epoch": 2.9103840682788054, "grad_norm": 0.4424178102463165, "learning_rate": 2.634264577137835e-08, "loss": 0.0209, "step": 3069 }, { "epoch": 2.9113323850165953, "grad_norm": 0.36197907514861327, "learning_rate": 2.5779667706447577e-08, "loss": 0.0199, "step": 3070 }, { "epoch": 2.912280701754386, "grad_norm": 0.354290871645171, "learning_rate": 2.522275505172922e-08, "loss": 0.0208, "step": 3071 }, { "epoch": 2.9132290184921765, "grad_norm": 0.47534916673431526, "learning_rate": 2.4671908486305075e-08, "loss": 0.0276, "step": 3072 }, { "epoch": 2.914177335229967, "grad_norm": 0.4430709376730269, "learning_rate": 2.4127128681861178e-08, "loss": 0.025, "step": 3073 }, { "epoch": 2.9151256519677573, "grad_norm": 0.35749048896901775, "learning_rate": 2.358841630268449e-08, "loss": 0.0198, "step": 3074 }, { "epoch": 2.9160739687055477, "grad_norm": 0.3016708451573105, "learning_rate": 2.3055772005664558e-08, "loss": 0.0182, "step": 3075 }, { "epoch": 2.917022285443338, "grad_norm": 0.476466900640434, "learning_rate": 2.2529196440290723e-08, "loss": 0.0329, "step": 3076 }, { "epoch": 2.9179706021811285, "grad_norm": 0.34737568586667705, "learning_rate": 2.2008690248653264e-08, "loss": 0.018, "step": 3077 }, { "epoch": 2.918918918918919, "grad_norm": 0.45746335820706724, "learning_rate": 2.14942540654417e-08, "loss": 0.0282, "step": 3078 }, { "epoch": 2.9198672356567092, "grad_norm": 0.44008704548623256, "learning_rate": 2.0985888517943697e-08, "loss": 0.0191, "step": 3079 }, { "epoch": 2.9208155523944996, "grad_norm": 0.4198768929738493, "learning_rate": 2.04835942260434e-08, "loss": 0.0149, "step": 3080 }, { "epoch": 2.92176386913229, "grad_norm": 0.46200695328290603, "learning_rate": 1.9987371802223655e-08, "loss": 0.042, "step": 3081 }, { "epoch": 2.922712185870081, "grad_norm": 0.30343363208397217, "learning_rate": 1.9497221851562665e-08, "loss": 0.0173, "step": 3082 }, { "epoch": 2.923660502607871, "grad_norm": 0.558923165292205, "learning_rate": 1.901314497173401e-08, "loss": 0.0345, "step": 3083 }, { "epoch": 2.9246088193456616, "grad_norm": 0.5277396892934476, "learning_rate": 1.8535141753006634e-08, "loss": 0.0244, "step": 3084 }, { "epoch": 2.925557136083452, "grad_norm": 0.3301572904249269, "learning_rate": 1.8063212778241523e-08, "loss": 0.0161, "step": 3085 }, { "epoch": 2.9265054528212424, "grad_norm": 0.39537910641276297, "learning_rate": 1.7597358622895023e-08, "loss": 0.0252, "step": 3086 }, { "epoch": 2.927453769559033, "grad_norm": 0.5693156857346456, "learning_rate": 1.7137579855016073e-08, "loss": 0.0695, "step": 3087 }, { "epoch": 2.928402086296823, "grad_norm": 0.3875441600495483, "learning_rate": 1.668387703524288e-08, "loss": 0.0192, "step": 3088 }, { "epoch": 2.9293504030346136, "grad_norm": 0.38815951485681266, "learning_rate": 1.62362507168079e-08, "loss": 0.0194, "step": 3089 }, { "epoch": 2.930298719772404, "grad_norm": 0.3849521596062896, "learning_rate": 1.5794701445532856e-08, "loss": 0.0205, "step": 3090 }, { "epoch": 2.9312470365101944, "grad_norm": 0.371593598213233, "learning_rate": 1.535922975982873e-08, "loss": 0.0201, "step": 3091 }, { "epoch": 2.9321953532479847, "grad_norm": 0.32985046282671887, "learning_rate": 1.4929836190696322e-08, "loss": 0.0259, "step": 3092 }, { "epoch": 2.933143669985775, "grad_norm": 0.5164324189253016, "learning_rate": 1.4506521261725137e-08, "loss": 0.0267, "step": 3093 }, { "epoch": 2.9340919867235655, "grad_norm": 0.4789431073781785, "learning_rate": 1.4089285489091719e-08, "loss": 0.0201, "step": 3094 }, { "epoch": 2.9350403034613564, "grad_norm": 0.8320517563012525, "learning_rate": 1.3678129381560768e-08, "loss": 0.0215, "step": 3095 }, { "epoch": 2.9359886201991463, "grad_norm": 0.3177776879644222, "learning_rate": 1.3273053440483463e-08, "loss": 0.0145, "step": 3096 }, { "epoch": 2.936936936936937, "grad_norm": 0.4448838819863309, "learning_rate": 1.2874058159796366e-08, "loss": 0.0253, "step": 3097 }, { "epoch": 2.937885253674727, "grad_norm": 0.4731371445018261, "learning_rate": 1.2481144026022519e-08, "loss": 0.0206, "step": 3098 }, { "epoch": 2.938833570412518, "grad_norm": 0.3975126226697693, "learning_rate": 1.209431151826923e-08, "loss": 0.023, "step": 3099 }, { "epoch": 2.9397818871503083, "grad_norm": 0.3839730934184733, "learning_rate": 1.1713561108228077e-08, "loss": 0.0173, "step": 3100 }, { "epoch": 2.9407302038880987, "grad_norm": 0.3128709085052025, "learning_rate": 1.1338893260173234e-08, "loss": 0.0173, "step": 3101 }, { "epoch": 2.941678520625889, "grad_norm": 0.43719703769605983, "learning_rate": 1.0970308430964804e-08, "loss": 0.0222, "step": 3102 }, { "epoch": 2.9426268373636795, "grad_norm": 0.3316575913593028, "learning_rate": 1.0607807070042719e-08, "loss": 0.0135, "step": 3103 }, { "epoch": 2.94357515410147, "grad_norm": 0.4809128170286833, "learning_rate": 1.0251389619430063e-08, "loss": 0.0227, "step": 3104 }, { "epoch": 2.9445234708392602, "grad_norm": 0.3149863969572906, "learning_rate": 9.901056513730856e-09, "loss": 0.0157, "step": 3105 }, { "epoch": 2.9454717875770506, "grad_norm": 0.3504971113027487, "learning_rate": 9.55680818013116e-09, "loss": 0.0137, "step": 3106 }, { "epoch": 2.946420104314841, "grad_norm": 0.3871851442267693, "learning_rate": 9.218645038396312e-09, "loss": 0.0247, "step": 3107 }, { "epoch": 2.9473684210526314, "grad_norm": 0.3216947869349185, "learning_rate": 8.886567500872024e-09, "loss": 0.0159, "step": 3108 }, { "epoch": 2.948316737790422, "grad_norm": 0.4069675609461263, "learning_rate": 8.560575972483276e-09, "loss": 0.0216, "step": 3109 }, { "epoch": 2.9492650545282126, "grad_norm": 0.46983515261566355, "learning_rate": 8.240670850734322e-09, "loss": 0.027, "step": 3110 }, { "epoch": 2.9502133712660026, "grad_norm": 0.3483044312681159, "learning_rate": 7.926852525707574e-09, "loss": 0.0214, "step": 3111 }, { "epoch": 2.9511616880037934, "grad_norm": 0.38354468067188363, "learning_rate": 7.619121380063044e-09, "loss": 0.0211, "step": 3112 }, { "epoch": 2.952110004741584, "grad_norm": 0.3452999961426753, "learning_rate": 7.317477789038352e-09, "loss": 0.0126, "step": 3113 }, { "epoch": 2.953058321479374, "grad_norm": 0.9052240783328028, "learning_rate": 7.021922120449276e-09, "loss": 0.0355, "step": 3114 }, { "epoch": 2.9540066382171646, "grad_norm": 0.376692331880163, "learning_rate": 6.732454734686422e-09, "loss": 0.0194, "step": 3115 }, { "epoch": 2.954954954954955, "grad_norm": 0.2888073338536502, "learning_rate": 6.449075984717446e-09, "loss": 0.0151, "step": 3116 }, { "epoch": 2.9559032716927454, "grad_norm": 0.6205803084744425, "learning_rate": 6.17178621608594e-09, "loss": 0.0387, "step": 3117 }, { "epoch": 2.9568515884305357, "grad_norm": 0.6830407506925207, "learning_rate": 5.9005857669103276e-09, "loss": 0.0204, "step": 3118 }, { "epoch": 2.957799905168326, "grad_norm": 0.3239139372515465, "learning_rate": 5.635474967883858e-09, "loss": 0.0172, "step": 3119 }, { "epoch": 2.9587482219061165, "grad_norm": 0.4204983758314779, "learning_rate": 5.376454142274612e-09, "loss": 0.0194, "step": 3120 }, { "epoch": 2.959696538643907, "grad_norm": 0.33796542007720826, "learning_rate": 5.1235236059249405e-09, "loss": 0.015, "step": 3121 }, { "epoch": 2.9606448553816973, "grad_norm": 0.3853177543515329, "learning_rate": 4.876683667249804e-09, "loss": 0.0175, "step": 3122 }, { "epoch": 2.961593172119488, "grad_norm": 0.3750767973380027, "learning_rate": 4.635934627238436e-09, "loss": 0.0224, "step": 3123 }, { "epoch": 2.962541488857278, "grad_norm": 0.3595739902347792, "learning_rate": 4.401276779452679e-09, "loss": 0.0185, "step": 3124 }, { "epoch": 2.963489805595069, "grad_norm": 0.34064999802722745, "learning_rate": 4.1727104100275366e-09, "loss": 0.0142, "step": 3125 }, { "epoch": 2.9644381223328593, "grad_norm": 0.4298659095790116, "learning_rate": 3.950235797668955e-09, "loss": 0.0183, "step": 3126 }, { "epoch": 2.9653864390706497, "grad_norm": 0.3154960617212208, "learning_rate": 3.733853213656046e-09, "loss": 0.016, "step": 3127 }, { "epoch": 2.96633475580844, "grad_norm": 0.43428561564507495, "learning_rate": 3.5235629218394184e-09, "loss": 0.0245, "step": 3128 }, { "epoch": 2.9672830725462305, "grad_norm": 0.5129124909596331, "learning_rate": 3.3193651786395107e-09, "loss": 0.0298, "step": 3129 }, { "epoch": 2.968231389284021, "grad_norm": 0.3653304436659778, "learning_rate": 3.1212602330499274e-09, "loss": 0.0188, "step": 3130 }, { "epoch": 2.9691797060218112, "grad_norm": 0.38747167610148703, "learning_rate": 2.929248326632994e-09, "loss": 0.0328, "step": 3131 }, { "epoch": 2.9701280227596016, "grad_norm": 0.37075949158686394, "learning_rate": 2.743329693521979e-09, "loss": 0.0162, "step": 3132 }, { "epoch": 2.971076339497392, "grad_norm": 0.32816513761541277, "learning_rate": 2.563504560420538e-09, "loss": 0.0175, "step": 3133 }, { "epoch": 2.9720246562351824, "grad_norm": 0.3176724510277183, "learning_rate": 2.389773146602159e-09, "loss": 0.0141, "step": 3134 }, { "epoch": 2.972972972972973, "grad_norm": 0.3777928889224251, "learning_rate": 2.222135663909053e-09, "loss": 0.0206, "step": 3135 }, { "epoch": 2.9739212897107636, "grad_norm": 0.34874205105798695, "learning_rate": 2.0605923167532625e-09, "loss": 0.0195, "step": 3136 }, { "epoch": 2.9748696064485536, "grad_norm": 0.37751192563561503, "learning_rate": 1.905143302116108e-09, "loss": 0.0186, "step": 3137 }, { "epoch": 2.9758179231863444, "grad_norm": 0.413833078919697, "learning_rate": 1.7557888095465215e-09, "loss": 0.0285, "step": 3138 }, { "epoch": 2.976766239924135, "grad_norm": 0.3495742580635406, "learning_rate": 1.6125290211638222e-09, "loss": 0.0184, "step": 3139 }, { "epoch": 2.977714556661925, "grad_norm": 0.3947867296797152, "learning_rate": 1.475364111653832e-09, "loss": 0.0275, "step": 3140 }, { "epoch": 2.9786628733997156, "grad_norm": 0.30521605231534926, "learning_rate": 1.3442942482710941e-09, "loss": 0.0176, "step": 3141 }, { "epoch": 2.979611190137506, "grad_norm": 0.3974193523935978, "learning_rate": 1.2193195908388744e-09, "loss": 0.0174, "step": 3142 }, { "epoch": 2.9805595068752964, "grad_norm": 0.3321585434506631, "learning_rate": 1.1004402917469403e-09, "loss": 0.0156, "step": 3143 }, { "epoch": 2.9815078236130867, "grad_norm": 0.4374292249333356, "learning_rate": 9.876564959526712e-10, "loss": 0.0265, "step": 3144 }, { "epoch": 2.982456140350877, "grad_norm": 0.36532880710618687, "learning_rate": 8.809683409816138e-10, "loss": 0.0178, "step": 3145 }, { "epoch": 2.9834044570886675, "grad_norm": 0.39027337727641087, "learning_rate": 7.803759569258163e-10, "loss": 0.0243, "step": 3146 }, { "epoch": 2.984352773826458, "grad_norm": 0.37419423922853146, "learning_rate": 6.858794664449386e-10, "loss": 0.0176, "step": 3147 }, { "epoch": 2.9853010905642483, "grad_norm": 0.6397596966792142, "learning_rate": 5.974789847640328e-10, "loss": 0.0235, "step": 3148 }, { "epoch": 2.986249407302039, "grad_norm": 0.3484579240241185, "learning_rate": 5.151746196774277e-10, "loss": 0.0203, "step": 3149 }, { "epoch": 2.987197724039829, "grad_norm": 0.417063201267999, "learning_rate": 4.389664715431785e-10, "loss": 0.02, "step": 3150 }, { "epoch": 2.98814604077762, "grad_norm": 0.2971137707448454, "learning_rate": 3.688546332875076e-10, "loss": 0.0154, "step": 3151 }, { "epoch": 2.9890943575154103, "grad_norm": 0.4175389094327218, "learning_rate": 3.048391904031389e-10, "loss": 0.0171, "step": 3152 }, { "epoch": 2.9900426742532007, "grad_norm": 0.3809367324704737, "learning_rate": 2.469202209476329e-10, "loss": 0.0218, "step": 3153 }, { "epoch": 2.990990990990991, "grad_norm": 0.4043099039616555, "learning_rate": 1.9509779554671704e-10, "loss": 0.0252, "step": 3154 }, { "epoch": 2.9919393077287815, "grad_norm": 0.3951864633474192, "learning_rate": 1.49371977389845e-10, "loss": 0.0149, "step": 3155 }, { "epoch": 2.992887624466572, "grad_norm": 0.4273760381840025, "learning_rate": 1.097428222346375e-10, "loss": 0.0196, "step": 3156 }, { "epoch": 2.9938359412043622, "grad_norm": 0.40712842132273613, "learning_rate": 7.62103784029966e-11, "loss": 0.026, "step": 3157 }, { "epoch": 2.9947842579421526, "grad_norm": 0.49552854470553426, "learning_rate": 4.87746867838812e-11, "loss": 0.022, "step": 3158 }, { "epoch": 2.995732574679943, "grad_norm": 0.3752732337103295, "learning_rate": 2.7435780831086557e-11, "loss": 0.0185, "step": 3159 }, { "epoch": 2.9966808914177334, "grad_norm": 0.46297687373006957, "learning_rate": 1.21936865643546e-11, "loss": 0.0211, "step": 3160 }, { "epoch": 2.997629208155524, "grad_norm": 0.3278922229087332, "learning_rate": 3.0484225704841e-12, "loss": 0.0157, "step": 3161 }, { "epoch": 2.9985775248933146, "grad_norm": 0.4130595870331912, "learning_rate": 0.0, "loss": 0.0283, "step": 3162 }, { "epoch": 2.9985775248933146, "eval_loss": 0.03720796853303909, "eval_runtime": 200.101, "eval_samples_per_second": 35.497, "eval_steps_per_second": 1.109, "step": 3162 }, { "epoch": 2.9985775248933146, "step": 3162, "total_flos": 1.1417019656175288e+18, "train_loss": 0.048461434403982234, "train_runtime": 39585.3646, "train_samples_per_second": 10.226, "train_steps_per_second": 0.08 } ], "logging_steps": 1, "max_steps": 3162, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1417019656175288e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }